mirror of
https://huggingface.co/spaces/Catapang1989/aniscrap
synced 2026-04-17 15:51:46 +00:00
Update main.py
This commit is contained in:
387
main.py
387
main.py
@@ -11,20 +11,11 @@ from playwright.async_api import async_playwright, BrowserContext
|
||||
BASE_URL = "https://animepahe.si"
|
||||
ANILIST_API = "https://graphql.anilist.co"
|
||||
JIKAN_API = "https://api.jikan.moe/v4"
|
||||
KITSU_API = "https://kitsu.io/api/edge"
|
||||
IS_HEADLESS = os.environ.get("HEADLESS", "true").lower() == "true"
|
||||
|
||||
# In-memory caches
|
||||
_info_cache: dict = {}
|
||||
_mal_synopsis_cache: dict = {}
|
||||
_kitsu_relations_cache: dict = {}
|
||||
|
||||
KITSU_HEADERS = {
|
||||
"Accept": "application/vnd.api+json",
|
||||
"Content-Type": "application/vnd.api+json",
|
||||
}
|
||||
|
||||
DIRECT_RELATION_TYPES = {"sequel", "prequel", "parent", "full_story", "side_story"}
|
||||
|
||||
|
||||
class AnimePahe:
|
||||
@@ -80,14 +71,47 @@ class AnimePahe:
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
def _generate_mp4(self, m3u8_url: Optional[str], anime_id: str, res: str):
|
||||
def _generate_mp4(
|
||||
self, m3u8_url: Optional[str], anime_name: str, episode: str, res: str
|
||||
) -> Optional[str]:
|
||||
if not m3u8_url:
|
||||
return None
|
||||
match = re.search(r"(https?://[^.]+)[^/]*/stream/(.*?)/[^/]+\.m3u8", m3u8_url)
|
||||
if match:
|
||||
return f"{match.group(1)}.kwik.cx/mp4/{match.group(2)}?file=AnimePahe_{anime_id}_{res}p.mp4"
|
||||
clean_name = re.sub(r"[^\w\s]", "", anime_name).strip().replace(" ", "_")
|
||||
filename = f"{clean_name}_EP{episode}_{res}P.mp4"
|
||||
return f"{match.group(1)}.kwik.cx/mp4/{match.group(2)}?file={filename}"
|
||||
return None
|
||||
|
||||
async def _scrape_play_meta(self, page) -> tuple:
|
||||
meta = await page.evaluate("""() => {
|
||||
const titleEl = document.querySelector('.theatre-info h2 a, .anime-title, h2 a[href*="/anime/"]')
|
||||
const epEl = document.querySelector('.theatre-info h2, .episode-title, h2')
|
||||
|
||||
let title = titleEl ? titleEl.innerText.trim() : ''
|
||||
let episode = ''
|
||||
|
||||
if (epEl) {
|
||||
const m = epEl.innerText.match(/episode\\s*(\\d+)/i)
|
||||
if (m) episode = m[1]
|
||||
}
|
||||
|
||||
if (!title || !episode) {
|
||||
const t = document.title || ''
|
||||
const m = t.match(/^(.+?)\\s*[-\\u2013]\\s*Episode\\s*(\\d+)/i)
|
||||
if (m) {
|
||||
if (!title) title = m[1].trim()
|
||||
if (!episode) episode = m[2].trim()
|
||||
}
|
||||
}
|
||||
|
||||
return { title, episode }
|
||||
}""")
|
||||
|
||||
title = (meta.get("title") or "").strip() or "Unknown"
|
||||
episode = (meta.get("episode") or "").strip() or "00"
|
||||
return title, episode
|
||||
|
||||
# ---------------- SCRAPE IDs ONLY ----------------
|
||||
|
||||
async def _scrape_ids(self, session: str) -> dict:
|
||||
@@ -95,11 +119,26 @@ class AnimePahe:
|
||||
try:
|
||||
await page.goto(
|
||||
f"{BASE_URL}/anime/{session}",
|
||||
wait_until="networkidle",
|
||||
wait_until="domcontentloaded",
|
||||
timeout=30000,
|
||||
)
|
||||
await page.wait_for_selector(".anime-info", timeout=10000)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
# Wait for the anime info block to render
|
||||
try:
|
||||
await page.wait_for_selector(
|
||||
"div.anime-info, div.anime-summary, aside, main", timeout=10000
|
||||
)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Extra wait for JS-rendered content
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Debug: log all hrefs found on page
|
||||
all_links = await page.evaluate("""() => {
|
||||
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href)
|
||||
}""")
|
||||
print(f"[scrape_ids] All links found: {all_links}")
|
||||
|
||||
ids = await page.evaluate("""() => {
|
||||
let ids = {}
|
||||
@@ -111,8 +150,6 @@ class AnimePahe:
|
||||
ids["anilist"] = url.split("/").filter(Boolean).pop()
|
||||
if (url.includes("anidb.net"))
|
||||
ids["anidb"] = url.split("/").filter(Boolean).pop()
|
||||
if (url.includes("kitsu.io/anime"))
|
||||
ids["kitsu"] = url.split("/").filter(Boolean).pop()
|
||||
if (url.includes("animenewsnetwork.com")) {
|
||||
const m = url.match(/id=(\\d+)/)
|
||||
if (m) ids["ann"] = m[1]
|
||||
@@ -123,6 +160,7 @@ class AnimePahe:
|
||||
return ids
|
||||
}""")
|
||||
|
||||
print(f"[scrape_ids] Extracted ids: {ids}")
|
||||
ids["animepahe"] = session
|
||||
return ids
|
||||
|
||||
@@ -152,84 +190,9 @@ class AnimePahe:
|
||||
_mal_synopsis_cache[mal_id] = None
|
||||
return None
|
||||
|
||||
# ---------------- KITSU RELATIONS ----------------
|
||||
|
||||
async def _fetch_kitsu_relations(self, kitsu_id: str) -> list:
    """Fetch anime related to ``kitsu_id`` from Kitsu's media-relationships.

    Results (including the empty list stored after a failed request) are
    memoised in ``_kitsu_relations_cache``.

    Args:
        kitsu_id: Kitsu anime id whose relationships are requested.

    Returns:
        A list of relation dicts, entries whose role is in
        ``DIRECT_RELATION_TYPES`` first, followed by all other roles.
        Non-anime destinations are skipped. Returns ``[]`` on any request or
        decoding error.
    """
    if kitsu_id in _kitsu_relations_cache:
        return _kitsu_relations_cache[kitsu_id]

    url = (
        f"{KITSU_API}/anime/{kitsu_id}/media-relationships"
        f"?include=destination"
        f"&fields[anime]=canonicalTitle,posterImage,episodeCount,status,subtype,startDate"
        f"&page[limit]=20"
    )

    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.get(url, headers=KITSU_HEADERS)
            resp.raise_for_status()
            data = resp.json()
    except Exception as e:
        print(f"[kitsu_relations] failed for kitsu_id={kitsu_id}: {e}")
        _kitsu_relations_cache[kitsu_id] = []
        return []

    if not isinstance(data, dict):
        data = {}

    # Index only the included *anime* resources. Resource ids are unique per
    # type only, so indexing every included item by bare id would let a
    # non-anime destination with the same id overwrite the anime entry.
    included = {
        item["id"]: item
        for item in (data.get("included") or [])
        if isinstance(item, dict)
        and item.get("type") == "anime"
        and "id" in item
    }

    direct = []
    indirect = []

    for rel in data.get("data") or []:
        # Any of these levels may be JSON null, so fall back to {} at each
        # step instead of relying on .get() defaults (which only cover
        # *missing* keys).
        attrs = rel.get("attributes") or {}
        role = (attrs.get("role") or "").lower()
        relationships = rel.get("relationships") or {}
        destination = relationships.get("destination") or {}
        dest_data = destination.get("data") or {}

        if dest_data.get("type", "") != "anime":
            continue

        dest_id = dest_data.get("id", "")
        dest = included.get(dest_id) or {}
        dest_attrs = dest.get("attributes") or {}
        poster = dest_attrs.get("posterImage") or {}

        entry = {
            "kitsu_id": dest_id,
            "title": dest_attrs.get("canonicalTitle"),
            "format": dest_attrs.get("subtype"),
            "status": dest_attrs.get("status"),
            "episodes": dest_attrs.get("episodeCount"),
            "start_date": dest_attrs.get("startDate"),
            "image": (
                poster.get("small")
                or poster.get("medium")
                or poster.get("original")
            ),
            "url": f"https://kitsu.io/anime/{dest_id}",
            "relation_type": role,
        }

        if role in DIRECT_RELATION_TYPES:
            direct.append(entry)
        else:
            indirect.append(entry)

    combined = direct + indirect
    _kitsu_relations_cache[kitsu_id] = combined
    return combined
|
||||
|
||||
# ---------------- SHARED RESOLVE HELPERS ----------------
|
||||
|
||||
async def _collect_buttons(self, page) -> list:
|
||||
"""
|
||||
Read all #resolutionMenu buttons.
|
||||
Returns list with embed URL, resolution (int), fansub, audio type.
|
||||
data-audio="jpn" → sub, data-audio="eng" → dub
|
||||
"""
|
||||
buttons = await page.locator("#resolutionMenu button").all()
|
||||
res_data = []
|
||||
for btn in buttons:
|
||||
@@ -306,12 +269,6 @@ class AnimePahe:
|
||||
favourites
|
||||
trending
|
||||
genres
|
||||
tags {
|
||||
name
|
||||
category
|
||||
rank
|
||||
isMediaSpoiler
|
||||
}
|
||||
coverImage {
|
||||
extraLarge
|
||||
large
|
||||
@@ -326,27 +283,18 @@ class AnimePahe:
|
||||
studios(isMain: true) {
|
||||
nodes { name siteUrl }
|
||||
}
|
||||
staff(perPage: 10) {
|
||||
relations {
|
||||
edges {
|
||||
role
|
||||
relationType(version: 2)
|
||||
node {
|
||||
name { full }
|
||||
image { medium }
|
||||
siteUrl
|
||||
}
|
||||
}
|
||||
}
|
||||
characters(perPage: 10, sort: [ROLE, RELEVANCE]) {
|
||||
edges {
|
||||
role
|
||||
node {
|
||||
name { full }
|
||||
image { medium }
|
||||
siteUrl
|
||||
}
|
||||
voiceActors(language: JAPANESE) {
|
||||
name { full }
|
||||
image { medium }
|
||||
id
|
||||
idMal
|
||||
title { romaji english }
|
||||
format
|
||||
status
|
||||
episodes
|
||||
averageScore
|
||||
coverImage { medium }
|
||||
siteUrl
|
||||
}
|
||||
}
|
||||
@@ -418,6 +366,31 @@ class AnimePahe:
|
||||
elif t.get("site") == "dailymotion":
|
||||
trailer = f"https://www.dailymotion.com/video/{t['id']}"
|
||||
|
||||
# ---------- Relations from AniList ----------
|
||||
relations: dict[str, list] = {}
|
||||
for edge in media.get("relations", {}).get("edges", []):
|
||||
node = edge.get("node", {})
|
||||
if not node:
|
||||
continue
|
||||
relation_type = edge.get("relationType", "OTHER")
|
||||
entry = {
|
||||
"id": node.get("id"),
|
||||
"mal_id": node.get("idMal"),
|
||||
"title": (
|
||||
node.get("title", {}).get("english")
|
||||
or node.get("title", {}).get("romaji")
|
||||
),
|
||||
"format": node.get("format"),
|
||||
"status": node.get("status"),
|
||||
"episodes": node.get("episodes"),
|
||||
"score": node.get("averageScore"),
|
||||
"image": node.get("coverImage", {}).get("medium"),
|
||||
"url": node.get("siteUrl"),
|
||||
"relation_type": relation_type,
|
||||
}
|
||||
relations.setdefault(relation_type, []).append(entry)
|
||||
|
||||
# ---------- Recommendations ----------
|
||||
recommendations = []
|
||||
for node in media.get("recommendations", {}).get("nodes", []):
|
||||
rec = node.get("mediaRecommendation")
|
||||
@@ -438,38 +411,6 @@ class AnimePahe:
|
||||
}
|
||||
)
|
||||
|
||||
characters = []
|
||||
for edge in media.get("characters", {}).get("edges", []):
|
||||
node = edge.get("node", {})
|
||||
vas = edge.get("voiceActors", [])
|
||||
characters.append(
|
||||
{
|
||||
"name": node.get("name", {}).get("full"),
|
||||
"image": node.get("image", {}).get("medium"),
|
||||
"role": edge.get("role"),
|
||||
"url": node.get("siteUrl"),
|
||||
"voice_actor": {
|
||||
"name": vas[0]["name"]["full"],
|
||||
"image": vas[0].get("image", {}).get("medium"),
|
||||
"url": vas[0].get("siteUrl"),
|
||||
}
|
||||
if vas
|
||||
else None,
|
||||
}
|
||||
)
|
||||
|
||||
staff = []
|
||||
for edge in media.get("staff", {}).get("edges", []):
|
||||
node = edge.get("node", {})
|
||||
staff.append(
|
||||
{
|
||||
"name": node.get("name", {}).get("full"),
|
||||
"image": node.get("image", {}).get("medium"),
|
||||
"role": edge.get("role"),
|
||||
"url": node.get("siteUrl"),
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"id": media.get("id"),
|
||||
"mal_id": media.get("idMal"),
|
||||
@@ -497,15 +438,6 @@ class AnimePahe:
|
||||
"favourites": media.get("favourites"),
|
||||
"trending": media.get("trending"),
|
||||
"genres": media.get("genres", []),
|
||||
"tags": [
|
||||
{
|
||||
"name": t["name"],
|
||||
"category": t["category"],
|
||||
"rank": t["rank"],
|
||||
"spoiler": t["isMediaSpoiler"],
|
||||
}
|
||||
for t in media.get("tags", [])
|
||||
],
|
||||
"cover_image": media.get("coverImage", {}),
|
||||
"banner_image": media.get("bannerImage"),
|
||||
"trailer": trailer,
|
||||
@@ -515,9 +447,7 @@ class AnimePahe:
|
||||
{"site": l["site"], "url": l["url"], "type": l["type"]}
|
||||
for l in media.get("externalLinks", [])
|
||||
],
|
||||
"characters": characters,
|
||||
"staff": staff,
|
||||
"relations": {},
|
||||
"relations": relations,
|
||||
"recommendations": recommendations,
|
||||
}
|
||||
|
||||
@@ -548,7 +478,7 @@ class AnimePahe:
|
||||
ep_session = ep.get("session")
|
||||
if not ep_session:
|
||||
return ep
|
||||
stream = await self._resolve_episode(anime_id, ep_session)
|
||||
stream = await self.resolve(anime_id, ep_session)
|
||||
ep["sub"] = stream.get("sub")
|
||||
ep["dub"] = stream.get("dub")
|
||||
return ep
|
||||
@@ -556,6 +486,23 @@ class AnimePahe:
|
||||
data["data"] = list(await asyncio.gather(*[enrich(ep) for ep in episodes]))
|
||||
return data
|
||||
|
||||
# ---------------- IDS ONLY ----------------
|
||||
|
||||
async def get_ids(self, session: str):
    """Return the external-site ids scraped for an AnimePahe session.

    Delegates to ``_scrape_ids`` and exposes a fixed set of keys (missing
    ones are ``None``). Any exception is logged and reported as an
    ``{"error": ...}`` dict instead of propagating.
    """
    exposed_keys = ("animepahe", "anilist", "mal", "anidb", "ann", "animePlanet")
    try:
        scraped = await self._scrape_ids(session)
        return {key: scraped.get(key) for key in exposed_keys}
    except Exception as e:
        print(f"[get_ids] ERROR: {e}")
        return {"error": f"Failed: {str(e)}"}
|
||||
|
||||
# ---------------- INFO ----------------
|
||||
|
||||
async def get_info(self, session: str):
|
||||
@@ -572,28 +519,16 @@ class AnimePahe:
|
||||
if anilist_id in _info_cache:
|
||||
return _info_cache[anilist_id]
|
||||
|
||||
kitsu_id = ids.get("kitsu")
|
||||
|
||||
async def empty_relations():
|
||||
return []
|
||||
|
||||
anilist_task = self._fetch_anilist(anilist_id)
|
||||
kitsu_task = (
|
||||
self._fetch_kitsu_relations(kitsu_id) if kitsu_id else empty_relations()
|
||||
)
|
||||
|
||||
data, kitsu_relations = await asyncio.gather(anilist_task, kitsu_task)
|
||||
data = await self._fetch_anilist(anilist_id)
|
||||
|
||||
if "error" in data:
|
||||
return {"error": data["error"], "ids": ids}
|
||||
|
||||
data["relations"] = {"Related": kitsu_relations} if kitsu_relations else {}
|
||||
data["ids"] = {
|
||||
"animepahe": ids.get("animepahe"),
|
||||
"anilist": anilist_id,
|
||||
"mal": ids.get("mal"),
|
||||
"anidb": ids.get("anidb"),
|
||||
"kitsu": kitsu_id,
|
||||
"ann": ids.get("ann"),
|
||||
"animePlanet": ids.get("animePlanet"),
|
||||
}
|
||||
@@ -605,9 +540,9 @@ class AnimePahe:
|
||||
print(f"[get_info] ERROR: {e}")
|
||||
return {"error": f"Failed: {str(e)}"}
|
||||
|
||||
# ---------------- _resolve_episode (used by get_episodes) ----------------
|
||||
# ---------------- RESOLVE ----------------
|
||||
|
||||
async def _resolve_episode(self, anime_session: str, episode_session: str) -> dict:
|
||||
async def resolve(self, anime_session: str, episode_session: str):
|
||||
play_url = f"{BASE_URL}/play/{anime_session}/{episode_session}"
|
||||
page = await self.context.new_page()
|
||||
|
||||
@@ -618,70 +553,18 @@ class AnimePahe:
|
||||
state="attached",
|
||||
timeout=15000,
|
||||
)
|
||||
|
||||
anime_name, episode_num = await self._scrape_play_meta(page)
|
||||
res_data = await self._collect_buttons(page)
|
||||
await page.close()
|
||||
page = None
|
||||
|
||||
if not res_data:
|
||||
return {"sub": None, "dub": None}
|
||||
|
||||
subs = [r for r in res_data if r["audio"] == "sub"]
|
||||
dubs = [r for r in res_data if r["audio"] == "dub"]
|
||||
best_sub = max(subs, key=lambda x: x["res"]) if subs else None
|
||||
best_dub = max(dubs, key=lambda x: x["res"]) if dubs else None
|
||||
|
||||
result = {"sub": None, "dub": None}
|
||||
|
||||
async def resolve_one(item, key):
|
||||
m3u8 = await self._embed_to_m3u8(item["embed"])
|
||||
res_str = str(item["res"])
|
||||
result[key] = {
|
||||
"url": m3u8,
|
||||
"download": self._generate_mp4(m3u8, anime_session, res_str),
|
||||
"resolution": res_str,
|
||||
"fansub": item["fansub"],
|
||||
}
|
||||
|
||||
tasks = []
|
||||
if best_sub:
|
||||
tasks.append(resolve_one(best_sub, "sub"))
|
||||
if best_dub:
|
||||
tasks.append(resolve_one(best_dub, "dub"))
|
||||
|
||||
await asyncio.gather(*tasks)
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
print(f"[_resolve_episode] ERROR: {e}")
|
||||
return {"sub": None, "dub": None, "error": str(e)}
|
||||
finally:
|
||||
if page:
|
||||
await page.close()
|
||||
|
||||
# ---------------- RESOLVE ----------------
|
||||
|
||||
async def resolve(self, anime_session: str, episode_session: str):
|
||||
"""
|
||||
Resolve highest-res sub and dub for a single episode.
|
||||
Returns:
|
||||
sub: { resolution, fansub, audio, url, download }
|
||||
dub: { resolution, fansub, audio, url, download } or null if no dub
|
||||
"""
|
||||
play_url = f"{BASE_URL}/play/{anime_session}/{episode_session}"
|
||||
page = await self.context.new_page()
|
||||
|
||||
try:
|
||||
await page.goto(play_url, wait_until="domcontentloaded")
|
||||
await page.wait_for_selector(
|
||||
"#resolutionMenu button",
|
||||
state="attached",
|
||||
timeout=15000,
|
||||
)
|
||||
res_data = await self._collect_buttons(page)
|
||||
await page.close()
|
||||
page = None
|
||||
|
||||
async def resolve_source(item):
|
||||
async def resolve_one(item):
|
||||
try:
|
||||
m3u8 = await self._embed_to_m3u8(item["embed"])
|
||||
res_str = str(item["res"])
|
||||
@@ -691,7 +574,9 @@ class AnimePahe:
|
||||
"audio": item["audio"],
|
||||
"audio_lang": item["audio_lang"],
|
||||
"url": m3u8,
|
||||
"download": self._generate_mp4(m3u8, anime_session, res_str),
|
||||
"download": self._generate_mp4(
|
||||
m3u8, anime_name, episode_num, res_str
|
||||
),
|
||||
}
|
||||
except Exception as e:
|
||||
return {
|
||||
@@ -704,27 +589,28 @@ class AnimePahe:
|
||||
"error": str(e),
|
||||
}
|
||||
|
||||
all_sources = list(
|
||||
await asyncio.gather(*[resolve_source(i) for i in res_data])
|
||||
tasks = []
|
||||
if best_sub:
|
||||
tasks.append(resolve_one(best_sub))
|
||||
if best_dub:
|
||||
tasks.append(resolve_one(best_dub))
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
sub_result = results[0] if best_sub else None
|
||||
dub_result = (
|
||||
results[1]
|
||||
if best_sub and best_dub
|
||||
else (results[0] if best_dub else None)
|
||||
)
|
||||
|
||||
sub_sources = [s for s in all_sources if s["audio"] == "sub"]
|
||||
dub_sources = [s for s in all_sources if s["audio"] == "dub"]
|
||||
|
||||
def best(sources):
|
||||
if not sources:
|
||||
return None
|
||||
return max(
|
||||
[s for s in sources if s["url"]],
|
||||
key=lambda x: int(x["resolution"]) if x["resolution"] else 0,
|
||||
default=None,
|
||||
)
|
||||
|
||||
return {
|
||||
"anime": anime_session,
|
||||
"episode": episode_session,
|
||||
"sub": best(sub_sources),
|
||||
"dub": best(dub_sources),
|
||||
"anime_name": anime_name,
|
||||
"episode_num": episode_num,
|
||||
"sub": sub_result,
|
||||
"dub": dub_result,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
@@ -762,6 +648,11 @@ async def api_info(session: str):
|
||||
return await pahe.get_info(session)
|
||||
|
||||
|
||||
@app.get("/ids/{session}")
async def api_ids(session: str):
    # Thin route: hand the session to the scraper and return its payload as-is.
    ids_payload = await pahe.get_ids(session)
    return ids_payload
|
||||
|
||||
|
||||
@app.get("/episodes/{session}")
async def api_episodes(session: str, p: int = 1, resolve: bool = False):
    # Thin route: `p` and `resolve` are forwarded positionally to get_episodes.
    episodes_payload = await pahe.get_episodes(session, p, resolve)
    return episodes_payload
|
||||
|
||||
Reference in New Issue
Block a user