Files
aniscrap/main.py
Aira Catapang ed26a0172b Update main.py
2026-03-17 04:59:55 +00:00

670 lines
22 KiB
Python

import asyncio
import json
import os
import re
from contextlib import asynccontextmanager
from typing import Optional
from urllib.parse import quote_plus, urlsplit

import httpx
from fastapi import FastAPI
from playwright.async_api import async_playwright, BrowserContext
BASE_URL = "https://animepahe.si"
ANILIST_API = "https://graphql.anilist.co"
JIKAN_API = "https://api.jikan.moe/v4"
IS_HEADLESS = os.environ.get("HEADLESS", "true").lower() == "true"
# In-memory caches
_info_cache: dict = {}
_mal_synopsis_cache: dict = {}
class AnimePahe:
    """Playwright-backed scraper for AnimePahe plus AniList/Jikan metadata.

    One persistent Chromium context is shared by every request. Each
    operation opens its own short-lived page inside that context and closes
    it again when it is done.
    """

    # Static-asset suffixes that are aborted while scraping.
    _BLOCKED_SUFFIXES = (".png", ".jpg", ".jpeg", ".webp", ".woff")

    # Identifier keys exposed to API consumers, in output order.
    _ID_KEYS = ("animepahe", "anilist", "mal", "anidb", "ann", "animePlanet")

    _ANILIST_QUERY = """
    query ($id: Int) {
      Media(id: $id, type: ANIME) {
        id
        idMal
        title {
          romaji
          english
          native
        }
        synonyms
        description(asHtml: false)
        format
        status
        episodes
        duration
        source
        countryOfOrigin
        isAdult
        startDate { year month day }
        endDate { year month day }
        season
        seasonYear
        averageScore
        meanScore
        popularity
        favourites
        trending
        genres
        coverImage {
          extraLarge
          large
          medium
          color
        }
        bannerImage
        trailer {
          id
          site
        }
        studios(isMain: true) {
          nodes { name siteUrl }
        }
        relations {
          edges {
            relationType(version: 2)
            node {
              id
              idMal
              title { romaji english }
              format
              status
              episodes
              averageScore
              coverImage { medium }
              siteUrl
            }
          }
        }
        recommendations(perPage: 20, sort: RATING_DESC) {
          nodes {
            rating
            mediaRecommendation {
              id
              idMal
              title { romaji english }
              format
              status
              episodes
              averageScore
              coverImage { medium }
              siteUrl
            }
          }
        }
        externalLinks {
          site
          url
          type
        }
        nextAiringEpisode {
          airingAt
          episode
        }
      }
    }
    """

    def __init__(self):
        self.playwright = None
        self.context: Optional[BrowserContext] = None
        # Substrings that mark a request URL as advertising traffic.
        self.ad_domains = [
            "doubleclick.net",
            "adservice.google",
            "popads.net",
            "propellerads",
            "exoclick",
            "bebi.com",
        ]

    async def start(self):
        """Launch Playwright and the persistent Chromium context."""
        self.playwright = await async_playwright().start()
        self.context = await self.playwright.chromium.launch_persistent_context(
            user_data_dir="./browser_data",
            headless=IS_HEADLESS,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122 Safari/537.36",
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
            ],
        )
        await self.context.route("**/*", self._intercept_assets)

    async def stop(self):
        """Close the browser context and Playwright; safe to call repeatedly."""
        if self.context:
            await self.context.close()
            self.context = None
        if self.playwright:
            await self.playwright.stop()
            self.playwright = None

    async def _intercept_assets(self, route):
        """Abort ad requests and blocked static assets; let the rest through."""
        url = route.request.url.lower()
        try:
            # Check the path too, so "poster.jpg?v=2" is still recognised.
            path = urlsplit(url).path
        except ValueError:
            path = url
        is_ad = any(ad in url for ad in self.ad_domains)
        is_asset = url.endswith(self._BLOCKED_SUFFIXES) or path.endswith(
            self._BLOCKED_SUFFIXES
        )
        if is_ad or is_asset:
            await route.abort()
        else:
            await route.continue_()

    async def _fetch_json(self, url: str):
        """Load ``url`` in a browser page and parse the body text as JSON.

        Returns the decoded object, or ``None`` when navigation or parsing
        fails.
        """
        page = await self.context.new_page()
        try:
            await page.goto(url, wait_until="domcontentloaded")
            txt = await page.evaluate("document.body.innerText")
            return json.loads(txt)
        except Exception as e:
            # ``Exception`` (not a bare except) so task cancellation propagates.
            print(f"[fetch_json] failed for {url}: {e}")
            return None
        finally:
            await page.close()

    def _generate_mp4(
        self, m3u8_url: Optional[str], anime_name: str, episode: str, res: str
    ) -> Optional[str]:
        """Derive a kwik.cx MP4 download URL from a stream playlist URL.

        Returns ``None`` when ``m3u8_url`` is empty or does not look like
        ``<scheme>://<host>/stream/<path>/<file>.m3u8``.
        """
        if not m3u8_url:
            return None
        match = re.search(r"(https?://[^.]+)[^/]*/stream/(.*?)/[^/]+\.m3u8", m3u8_url)
        if not match:
            return None
        clean_name = re.sub(r"[^\w\s]", "", anime_name).strip().replace(" ", "_")
        filename = f"{clean_name}_EP{episode}_{res}P.mp4"
        return f"{match.group(1)}.kwik.cx/mp4/{match.group(2)}?file={filename}"

    async def _scrape_play_meta(self, page) -> tuple:
        """Read the anime title and episode number from an open play page.

        Returns ``(title, episode)``; falls back to ``"Unknown"`` / ``"00"``
        when the page does not expose them.
        """
        meta = await page.evaluate("""() => {
            const titleEl = document.querySelector('.theatre-info h2 a, .anime-title, h2 a[href*="/anime/"]')
            const epEl = document.querySelector('.theatre-info h2, .episode-title, h2')
            let title = titleEl ? titleEl.innerText.trim() : ''
            let episode = ''
            if (epEl) {
                const m = epEl.innerText.match(/episode\\s*(\\d+)/i)
                if (m) episode = m[1]
            }
            if (!title || !episode) {
                const t = document.title || ''
                const m = t.match(/^(.+?)\\s*[-\\u2013]\\s*Episode\\s*(\\d+)/i)
                if (m) {
                    if (!title) title = m[1].trim()
                    if (!episode) episode = m[2].trim()
                }
            }
            return { title, episode }
        }""")
        title = (meta.get("title") or "").strip() or "Unknown"
        episode = (meta.get("episode") or "").strip() or "00"
        return title, episode

    # ---------------- SCRAPE IDs ONLY ----------------
    async def _scrape_ids(self, session: str) -> dict:
        """Collect external database IDs linked from an anime's page.

        Always returns a dict containing at least ``"animepahe"``; on any
        failure that is the only key.
        """
        page = await self.context.new_page()
        try:
            await page.goto(
                f"{BASE_URL}/anime/{session}",
                wait_until="domcontentloaded",
                timeout=30000,
            )
            # Wait for the anime info block to render
            try:
                await page.wait_for_selector(
                    "div.anime-info, div.anime-summary, aside, main", timeout=10000
                )
            except Exception:
                # Best effort: continue with whatever has rendered so far.
                pass
            # Extra wait for JS-rendered content
            await asyncio.sleep(2)
            # Debug: log all hrefs found on page
            all_links = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a[href]')).map(a => a.href)
            }""")
            print(f"[scrape_ids] All links found: {all_links}")
            ids = await page.evaluate("""() => {
                let ids = {}
                document.querySelectorAll("a[href]").forEach(a => {
                    const url = a.href || ""
                    if (url.includes("myanimelist.net/anime"))
                        ids["mal"] = url.split("/").filter(Boolean).pop()
                    if (url.includes("anilist.co/anime"))
                        ids["anilist"] = url.split("/").filter(Boolean).pop()
                    if (url.includes("anidb.net"))
                        ids["anidb"] = url.split("/").filter(Boolean).pop()
                    if (url.includes("animenewsnetwork.com")) {
                        const m = url.match(/id=(\\d+)/)
                        if (m) ids["ann"] = m[1]
                    }
                    if (url.includes("anime-planet.com/anime"))
                        ids["animePlanet"] = url.split("/").filter(Boolean).pop()
                })
                return ids
            }""")
            print(f"[scrape_ids] Extracted ids: {ids}")
            ids["animepahe"] = session
            return ids
        except Exception as e:
            print(f"[scrape_ids] ERROR: {e}")
            return {"animepahe": session}
        finally:
            await page.close()

    # ---------------- MAL SYNOPSIS ----------------
    async def _fetch_mal_synopsis(self, mal_id: str) -> Optional[str]:
        """Fetch the synopsis for ``mal_id`` from Jikan, caching successes.

        Returns ``None`` on failure. Failures are not cached, so a timeout
        or rate-limit response does not hide the synopsis for the rest of
        the process lifetime.
        """
        if mal_id in _mal_synopsis_cache:
            return _mal_synopsis_cache[mal_id]
        try:
            async with httpx.AsyncClient(timeout=10) as client:
                resp = await client.get(
                    f"{JIKAN_API}/anime/{mal_id}",
                    headers={"Accept": "application/json"},
                )
                resp.raise_for_status()
                synopsis = (resp.json().get("data") or {}).get("synopsis")
        except Exception as e:
            print(f"[mal_synopsis] failed for mal_id={mal_id}: {e}")
            return None
        _mal_synopsis_cache[mal_id] = synopsis
        return synopsis

    # ---------------- SHARED RESOLVE HELPERS ----------------
    async def _collect_buttons(self, page) -> list:
        """Describe every entry of the play page's resolution menu.

        Each item holds the embed URL, numeric resolution (720 when the
        label has no digits), fansub name, and audio type/language.
        """
        buttons = await page.locator("#resolutionMenu button").all()
        res_data = []
        for btn in buttons:
            text = (await btn.inner_text()).strip()
            res_match = re.search(r"(\d+)", text)
            audio_lang = (await btn.get_attribute("data-audio") or "jpn").lower()
            audio_type = "dub" if audio_lang == "eng" else "sub"
            res_data.append(
                {
                    "embed": await btn.get_attribute("data-src"),
                    "res": int(res_match.group(1)) if res_match else 720,
                    "fansub": text.split("·")[0].strip() if "·" in text else "Unknown",
                    "audio": audio_type,
                    "audio_lang": audio_lang,
                }
            )
        return res_data

    async def _embed_to_m3u8(self, embed_url: str) -> Optional[str]:
        """Open an embed page and return the first ``.m3u8`` URL it requests.

        Returns ``None`` if no playlist request is seen within five seconds
        of triggering playback.
        """
        p = await self.context.new_page()
        m3u8 = None
        found = asyncio.Event()

        def capture(req):
            nonlocal m3u8
            if ".m3u8" in req.url and not found.is_set():
                m3u8 = req.url
                found.set()

        p.on("request", capture)
        try:
            await p.set_extra_http_headers({"Referer": BASE_URL})
            await p.goto(embed_url, wait_until="domcontentloaded")
            # Click anything that might start playback so the player fetches
            # its playlist.
            await p.evaluate(
                "document.querySelectorAll('button, video, [class*=play]').forEach(el => el.click())"
            )
            try:
                await asyncio.wait_for(found.wait(), timeout=5.0)
            except asyncio.TimeoutError:
                pass
        finally:
            await p.close()
        return m3u8

    # ---------------- ANILIST ----------------
    @staticmethod
    def _fmt_date(d) -> Optional[str]:
        """Format an AniList fuzzy date as ``YYYY[-MM[-DD]]``.

        Stops at the first missing component so a date with a day but no
        month is not rendered as a misleading ``YYYY-DD``.
        """
        if not d or not d.get("year"):
            return None
        parts = []
        for key in ("year", "month", "day"):
            value = d.get(key)
            if not value:
                break
            parts.append(str(value).zfill(2))
        return "-".join(parts)

    @staticmethod
    def _summarize_media(node: dict) -> dict:
        """Return the compact summary used for related/recommended titles."""
        title = node.get("title") or {}
        return {
            "id": node.get("id"),
            "mal_id": node.get("idMal"),
            "title": title.get("english") or title.get("romaji"),
            "format": node.get("format"),
            "status": node.get("status"),
            "episodes": node.get("episodes"),
            "score": node.get("averageScore"),
            "image": (node.get("coverImage") or {}).get("medium"),
            "url": node.get("siteUrl"),
        }

    @staticmethod
    def _shape_media(media: dict, synopsis: Optional[str]) -> dict:
        """Convert a raw AniList ``Media`` object into the API response shape.

        Tolerates JSON ``null`` for any nested object or list; such fields
        come back as empty containers or ``None``.
        """
        trailer = None
        t = media.get("trailer")
        if t:
            if t.get("site") == "youtube":
                trailer = f"https://www.youtube.com/watch?v={t.get('id')}"
            elif t.get("site") == "dailymotion":
                trailer = f"https://www.dailymotion.com/video/{t.get('id')}"

        # ---------- Relations from AniList ----------
        relations: dict[str, list] = {}
        for edge in (media.get("relations") or {}).get("edges") or []:
            node = (edge or {}).get("node")
            if not node:
                continue
            relation_type = edge.get("relationType", "OTHER")
            entry = AnimePahe._summarize_media(node)
            entry["relation_type"] = relation_type
            relations.setdefault(relation_type, []).append(entry)

        # ---------- Recommendations ----------
        recommendations = []
        for node in (media.get("recommendations") or {}).get("nodes") or []:
            rec = (node or {}).get("mediaRecommendation")
            if not rec:
                continue
            entry = AnimePahe._summarize_media(rec)
            entry["rating"] = node.get("rating")
            recommendations.append(entry)

        title = media.get("title") or {}
        studio_nodes = (media.get("studios") or {}).get("nodes") or []
        return {
            "id": media.get("id"),
            "mal_id": media.get("idMal"),
            "title": {
                "romaji": title.get("romaji"),
                "english": title.get("english"),
                "native": title.get("native"),
            },
            "synonyms": media.get("synonyms") or [],
            "synopsis": synopsis,
            "format": media.get("format"),
            "status": media.get("status"),
            "episodes": media.get("episodes"),
            "duration": media.get("duration"),
            "source": media.get("source"),
            "country": media.get("countryOfOrigin"),
            "is_adult": media.get("isAdult"),
            "start_date": AnimePahe._fmt_date(media.get("startDate")),
            "end_date": AnimePahe._fmt_date(media.get("endDate")),
            "season": media.get("season"),
            "season_year": media.get("seasonYear"),
            "average_score": media.get("averageScore"),
            "mean_score": media.get("meanScore"),
            "popularity": media.get("popularity"),
            "favourites": media.get("favourites"),
            "trending": media.get("trending"),
            "genres": media.get("genres") or [],
            "cover_image": media.get("coverImage") or {},
            "banner_image": media.get("bannerImage"),
            "trailer": trailer,
            "studios": [s.get("name") for s in studio_nodes if s],
            "next_airing": media.get("nextAiringEpisode"),
            "external_links": [
                {"site": l.get("site"), "url": l.get("url"), "type": l.get("type")}
                for l in media.get("externalLinks") or []
                if l
            ],
            "relations": relations,
            "recommendations": recommendations,
        }

    async def _fetch_anilist(self, anilist_id: str) -> dict:
        """Query AniList for ``anilist_id`` and return the shaped metadata.

        The MyAnimeList synopsis is preferred when available, otherwise the
        AniList description is used. On failure returns ``{"error": ...}``.
        """
        try:
            async with httpx.AsyncClient(timeout=15) as client:
                resp = await client.post(
                    ANILIST_API,
                    json={
                        "query": self._ANILIST_QUERY,
                        "variables": {"id": int(anilist_id)},
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Accept": "application/json",
                    },
                )
                resp.raise_for_status()
                result = resp.json()
        except Exception as e:
            print(f"[anilist] failed for id={anilist_id}: {e}")
            return {"error": f"AniList fetch failed: {str(e)}"}
        media = (result.get("data") or {}).get("Media")
        if not media:
            return {"error": "AniList returned no data"}
        mal_id = str(media.get("idMal") or "")
        mal_synopsis = await self._fetch_mal_synopsis(mal_id) if mal_id else None
        synopsis = mal_synopsis or media.get("description")
        return self._shape_media(media, synopsis)

    # ---------------- SEARCH ----------------
    async def search(self, q: str):
        """Search AnimePahe for ``q``; returns a list of matches (maybe empty)."""
        # Encode the term so "&", "#", spaces etc. cannot alter the query string.
        data = await self._fetch_json(f"{BASE_URL}/api?m=search&q={quote_plus(q)}")
        return data.get("data", []) if data else []

    # ---------------- LATEST ----------------
    async def get_latest(self, p: int = 1):
        """Return page ``p`` of the currently airing releases, or ``None``."""
        return await self._fetch_json(f"{BASE_URL}/api?m=airing&page={p}")

    # ---------------- EPISODES ----------------
    async def get_episodes(self, anime_id: str, p: int = 1, resolve: bool = False):
        """Return page ``p`` of an anime's episode list.

        With ``resolve=True`` every episode additionally gets ``"sub"`` and
        ``"dub"`` stream entries. NOTE: this resolves all episodes on the
        page concurrently, opening several browser pages per episode.
        """
        data = await self._fetch_json(
            f"{BASE_URL}/api?m=release&id={quote_plus(anime_id)}"
            f"&sort=episode_desc&page={p}"
        )
        if not data or not resolve:
            return data
        episodes = data.get("data") or []

        async def enrich(ep):
            ep_session = ep.get("session")
            if not ep_session:
                return ep
            stream = await self.resolve(anime_id, ep_session)
            ep["sub"] = stream.get("sub")
            ep["dub"] = stream.get("dub")
            return ep

        data["data"] = list(await asyncio.gather(*[enrich(ep) for ep in episodes]))
        return data

    def _public_ids(self, ids: dict) -> dict:
        """Project scraped IDs onto the fixed set of keys the API exposes."""
        return {key: ids.get(key) for key in self._ID_KEYS}

    # ---------------- IDS ONLY ----------------
    async def get_ids(self, session: str):
        """Return the external database IDs for an AnimePahe session."""
        try:
            ids = await self._scrape_ids(session)
            return self._public_ids(ids)
        except Exception as e:
            print(f"[get_ids] ERROR: {e}")
            return {"error": f"Failed: {str(e)}"}

    # ---------------- INFO ----------------
    async def get_info(self, session: str):
        """Return full AniList metadata for an AnimePahe session.

        Results are cached in memory by AniList ID. On failure returns a
        dict with an ``"error"`` key (and ``"ids"`` when they were scraped).
        """
        try:
            ids = await self._scrape_ids(session)
            anilist_id = ids.get("anilist")
            if not anilist_id:
                return {
                    "error": "Could not find AniList ID on AnimePahe page",
                    "ids": ids,
                }
            if anilist_id in _info_cache:
                return _info_cache[anilist_id]
            data = await self._fetch_anilist(anilist_id)
            if "error" in data:
                return {"error": data["error"], "ids": ids}
            data["ids"] = self._public_ids(ids)
            _info_cache[anilist_id] = data
            return data
        except Exception as e:
            print(f"[get_info] ERROR: {e}")
            return {"error": f"Failed: {str(e)}"}

    # ---------------- RESOLVE ----------------
    async def resolve(self, anime_session: str, episode_session: str):
        """Resolve the best sub and dub streams for one episode.

        Picks the highest resolution per audio type and returns the playlist
        and derived download URLs. On failure returns ``{"error": ...}``.
        """
        play_url = f"{BASE_URL}/play/{anime_session}/{episode_session}"
        page = await self.context.new_page()
        try:
            await page.goto(play_url, wait_until="domcontentloaded")
            await page.wait_for_selector(
                "#resolutionMenu button",
                state="attached",
                timeout=15000,
            )
            anime_name, episode_num = await self._scrape_play_meta(page)
            res_data = await self._collect_buttons(page)
            # Release the play page before opening the embed pages.
            await page.close()
            page = None

            # Highest-resolution candidate per audio type, when one exists.
            best = {}
            for audio in ("sub", "dub"):
                candidates = [r for r in res_data if r["audio"] == audio]
                if candidates:
                    best[audio] = max(candidates, key=lambda x: x["res"])

            async def resolve_one(item):
                try:
                    m3u8 = await self._embed_to_m3u8(item["embed"])
                    res_str = str(item["res"])
                    return {
                        "resolution": res_str,
                        "fansub": item["fansub"],
                        "audio": item["audio"],
                        "audio_lang": item["audio_lang"],
                        "url": m3u8,
                        "download": self._generate_mp4(
                            m3u8, anime_name, episode_num, res_str
                        ),
                    }
                except Exception as e:
                    return {
                        "resolution": str(item["res"]),
                        "fansub": item["fansub"],
                        "audio": item["audio"],
                        "audio_lang": item["audio_lang"],
                        "url": None,
                        "download": None,
                        "error": str(e),
                    }

            resolved = await asyncio.gather(
                *(resolve_one(item) for item in best.values())
            )
            by_audio = dict(zip(best, resolved))
            return {
                "anime": anime_session,
                "episode": episode_session,
                "anime_name": anime_name,
                "episode_num": episode_num,
                "sub": by_audio.get("sub"),
                "dub": by_audio.get("dub"),
            }
        except Exception as e:
            return {"error": str(e)}
        finally:
            if page:
                await page.close()
# Single scraper instance shared by every route.
pahe = AnimePahe()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Start the scraper's browser for the app's lifetime and always stop it.

    ``stop()`` runs in ``finally`` so the browser is released even when
    startup fails part-way or the generator is cancelled at ``yield``.
    """
    try:
        await pahe.start()
        yield
    finally:
        await pahe.stop()


app = FastAPI(lifespan=lifespan)
# GET /search?q=... -> list of matching anime.
@app.get("/search")
async def api_search(q: str):
    matches = await pahe.search(q)
    return matches
# GET /latest?p=N -> one page of currently airing releases.
@app.get("/latest")
async def api_latest(p: int = 1):
    latest_page = await pahe.get_latest(p)
    return latest_page
# GET /info/{session} -> AniList metadata for the given AnimePahe session.
@app.get("/info/{session}")
async def api_info(session: str):
    info = await pahe.get_info(session)
    return info
# GET /ids/{session} -> external database IDs for the given session.
@app.get("/ids/{session}")
async def api_ids(session: str):
    external_ids = await pahe.get_ids(session)
    return external_ids
# GET /episodes/{session}?p=N&resolve=bool -> one page of the episode list,
# optionally with stream URLs resolved for each episode.
@app.get("/episodes/{session}")
async def api_episodes(session: str, p: int = 1, resolve: bool = False):
    episode_page = await pahe.get_episodes(session, p, resolve)
    return episode_page
# GET /resolve/{anime}/{episode} -> best sub/dub stream for one episode.
@app.get("/resolve/{anime}/{episode}")
async def api_resolve(anime: str, episode: str):
    streams = await pahe.resolve(anime, episode)
    return streams
# Script entry point: serve the API on all interfaces, port 7860.
if __name__ == "__main__":
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)