mirror of
https://huggingface.co/spaces/Catapang1989/aniscrap
synced 2026-04-17 15:51:46 +00:00
Update main.py
This commit is contained in:
202
main.py
202
main.py
@@ -55,12 +55,13 @@ class AnimePahe:
|
|||||||
|
|
||||||
async def _intercept_assets(self, route):
|
async def _intercept_assets(self, route):
|
||||||
url = route.request.url.lower()
|
url = route.request.url.lower()
|
||||||
# Allow all requests from aniwatchtv so season posters can load
|
# Allow requests from aniwatchtv & kwik (video host) so players/posters load correctly
|
||||||
if "aniwatchtv.to" in url:
|
if "aniwatchtv.to" in url or "kwik" in url:
|
||||||
await route.continue_()
|
await route.continue_()
|
||||||
return
|
return
|
||||||
|
|
||||||
if any(ad in url for ad in self.ad_domains) or url.endswith(
|
if any(ad in url for ad in self.ad_domains) or url.endswith(
|
||||||
(".png", ".jpg", ".jpeg", ".webp", ".woff")
|
(".png", ".jpg", ".jpeg", ".webp", ".woff", ".gif")
|
||||||
):
|
):
|
||||||
await route.abort()
|
await route.abort()
|
||||||
else:
|
else:
|
||||||
@@ -82,33 +83,31 @@ class AnimePahe:
|
|||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
if not m3u8_url:
|
if not m3u8_url:
|
||||||
return None
|
return None
|
||||||
match = re.search(r"(https?://[^.]+)[^/]*/stream/(.*?)/[^/]+\.m3u8", m3u8_url)
|
# Example: https://na-02.kwik.cx/stream/abc123def/index.m3u8
|
||||||
|
match = re.search(r"(https?://[^/]+)/stream/([^/]+)/", m3u8_url)
|
||||||
if match:
|
if match:
|
||||||
clean_name = re.sub(r"[^\w\s]", "", anime_name).strip().replace(" ", "_")
|
clean_name = re.sub(r"[^\w\s]", "", anime_name).strip().replace(" ", "_")
|
||||||
|
if not clean_name:
|
||||||
|
clean_name = "Anime"
|
||||||
filename = f"{clean_name}_EP{episode}_{res}P.mp4"
|
filename = f"{clean_name}_EP{episode}_{res}P.mp4"
|
||||||
return f"{match.group(1)}.kwik.cx/mp4/{match.group(2)}?file={filename}"
|
domain = match.group(1) # e.g. https://na-02.kwik.cx
|
||||||
|
token = match.group(2) # e.g. abc123def
|
||||||
|
return f"{domain}/mp4/{token}?file={filename}"
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _scrape_play_meta(self, page) -> tuple:
|
async def _scrape_play_meta(self, page) -> tuple:
|
||||||
meta = await page.evaluate("""() => {
|
meta = await page.evaluate("""() => {
|
||||||
const titleEl = document.querySelector('.theatre-info h2 a, .anime-title, h2 a[href*="/anime/"]')
|
const titleEl = document.querySelector('.theatre-info h1 a, .theatre-info h2 a, .anime-title, h1, h2');
|
||||||
const epEl = document.querySelector('.theatre-info h2, .episode-title, h2')
|
let title = titleEl ? titleEl.innerText.trim() : '';
|
||||||
|
let episode = '';
|
||||||
|
|
||||||
let title = titleEl ? titleEl.innerText.trim() : ''
|
const t = document.title || '';
|
||||||
let episode = ''
|
// Match exactly: "Anime Name - 01 - AnimePahe" OR "Anime Name - Episode 01 - AnimePahe"
|
||||||
|
const m = t.match(/^(.+?)\\s*[-\\u2013]\\s*(?:Episode\\s*)?(\\d+(?:\\.\\d+)?)/i);
|
||||||
if (epEl) {
|
|
||||||
const m = epEl.innerText.match(/episode\\s*(\\d+)/i)
|
if (m) {
|
||||||
if (m) episode = m[1]
|
if (!title || title.length < 2) title = m[1].trim();
|
||||||
}
|
if (!episode) episode = m[2].trim();
|
||||||
|
|
||||||
if (!title || !episode) {
|
|
||||||
const t = document.title || ''
|
|
||||||
const m = t.match(/^(.+?)\\s*[-\\u2013]\\s*Episode\\s*(\\d+)/i)
|
|
||||||
if (m) {
|
|
||||||
if (!title) title = m[1].trim()
|
|
||||||
if (!episode) episode = m[2].trim()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return { title, episode }
|
return { title, episode }
|
||||||
@@ -129,7 +128,6 @@ class AnimePahe:
|
|||||||
timeout=30000,
|
timeout=30000,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Wait for the anime info block to render
|
|
||||||
try:
|
try:
|
||||||
await page.wait_for_selector(
|
await page.wait_for_selector(
|
||||||
"div.anime-info, div.anime-summary, aside, main", timeout=10000
|
"div.anime-info, div.anime-summary, aside, main", timeout=10000
|
||||||
@@ -137,15 +135,8 @@ class AnimePahe:
|
|||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extra wait for JS-rendered content
|
|
||||||
await asyncio.sleep(2)
|
await asyncio.sleep(2)
|
||||||
|
|
||||||
# Debug: log all hrefs found on page
|
|
||||||
all_links = await page.evaluate("""() => {
|
|
||||||
return Array.from(document.querySelectorAll('a[href]')).map(a => a.href)
|
|
||||||
}""")
|
|
||||||
print(f"[scrape_ids] All links found: {all_links}")
|
|
||||||
|
|
||||||
ids = await page.evaluate("""() => {
|
ids = await page.evaluate("""() => {
|
||||||
let ids = {}
|
let ids = {}
|
||||||
document.querySelectorAll("a[href]").forEach(a => {
|
document.querySelectorAll("a[href]").forEach(a => {
|
||||||
@@ -166,7 +157,6 @@ class AnimePahe:
|
|||||||
return ids
|
return ids
|
||||||
}""")
|
}""")
|
||||||
|
|
||||||
print(f"[scrape_ids] Extracted ids: {ids}")
|
|
||||||
ids["animepahe"] = session
|
ids["animepahe"] = session
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
@@ -230,17 +220,35 @@ class AnimePahe:
|
|||||||
|
|
||||||
p.on("request", capture)
|
p.on("request", capture)
|
||||||
try:
|
try:
|
||||||
await p.set_extra_http_headers({"Referer": BASE_URL})
|
# Set the exact referer Kwik expects to prevent token rejections
|
||||||
await p.goto(embed_url, wait_until="domcontentloaded")
|
await p.set_extra_http_headers({"Referer": "https://animepahe.si/"})
|
||||||
await p.evaluate(
|
await p.goto(embed_url, wait_until="domcontentloaded", timeout=15000)
|
||||||
"document.querySelectorAll('button, video, [class*=play]').forEach(el => el.click())"
|
|
||||||
)
|
# Click loop: Muting allows browsers to bypass autoplay restrictions safely
|
||||||
|
for _ in range(6):
|
||||||
|
if found.is_set():
|
||||||
|
break
|
||||||
|
await p.evaluate("""() => {
|
||||||
|
document.querySelectorAll('video').forEach(v => {
|
||||||
|
v.muted = true;
|
||||||
|
const p = v.play();
|
||||||
|
if (p !== undefined) p.catch(() => {});
|
||||||
|
});
|
||||||
|
document.querySelectorAll('button, .vjs-big-play-button').forEach(b => {
|
||||||
|
try { b.click() } catch(e) {}
|
||||||
|
});
|
||||||
|
}""")
|
||||||
|
await asyncio.sleep(1.5)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await asyncio.wait_for(found.wait(), timeout=5.0)
|
await asyncio.wait_for(found.wait(), timeout=5.0)
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
pass
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[_embed_to_m3u8] ERROR: {e}")
|
||||||
finally:
|
finally:
|
||||||
await p.close()
|
await p.close()
|
||||||
|
|
||||||
return m3u8
|
return m3u8
|
||||||
|
|
||||||
# ---------------- ANILIST ----------------
|
# ---------------- ANILIST ----------------
|
||||||
@@ -251,85 +259,21 @@ class AnimePahe:
|
|||||||
Media(id: $id, type: ANIME) {
|
Media(id: $id, type: ANIME) {
|
||||||
id
|
id
|
||||||
idMal
|
idMal
|
||||||
title {
|
title { romaji english native }
|
||||||
romaji
|
|
||||||
english
|
|
||||||
native
|
|
||||||
}
|
|
||||||
synonyms
|
synonyms
|
||||||
description(asHtml: false)
|
description(asHtml: false)
|
||||||
format
|
format status episodes duration source countryOfOrigin isAdult
|
||||||
status
|
|
||||||
episodes
|
|
||||||
duration
|
|
||||||
source
|
|
||||||
countryOfOrigin
|
|
||||||
isAdult
|
|
||||||
startDate { year month day }
|
startDate { year month day }
|
||||||
endDate { year month day }
|
endDate { year month day }
|
||||||
season
|
season seasonYear averageScore meanScore popularity favourites trending genres
|
||||||
seasonYear
|
coverImage { extraLarge large medium color }
|
||||||
averageScore
|
|
||||||
meanScore
|
|
||||||
popularity
|
|
||||||
favourites
|
|
||||||
trending
|
|
||||||
genres
|
|
||||||
coverImage {
|
|
||||||
extraLarge
|
|
||||||
large
|
|
||||||
medium
|
|
||||||
color
|
|
||||||
}
|
|
||||||
bannerImage
|
bannerImage
|
||||||
trailer {
|
trailer { id site }
|
||||||
id
|
studios(isMain: true) { nodes { name siteUrl } }
|
||||||
site
|
relations { edges { relationType(version: 2) node { id idMal title { romaji english } format status episodes averageScore coverImage { medium } siteUrl } } }
|
||||||
}
|
recommendations(perPage: 20, sort: RATING_DESC) { nodes { rating mediaRecommendation { id idMal title { romaji english } format status episodes averageScore coverImage { medium } siteUrl } } }
|
||||||
studios(isMain: true) {
|
externalLinks { site url type }
|
||||||
nodes { name siteUrl }
|
nextAiringEpisode { airingAt episode }
|
||||||
}
|
|
||||||
relations {
|
|
||||||
edges {
|
|
||||||
relationType(version: 2)
|
|
||||||
node {
|
|
||||||
id
|
|
||||||
idMal
|
|
||||||
title { romaji english }
|
|
||||||
format
|
|
||||||
status
|
|
||||||
episodes
|
|
||||||
averageScore
|
|
||||||
coverImage { medium }
|
|
||||||
siteUrl
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
recommendations(perPage: 20, sort: RATING_DESC) {
|
|
||||||
nodes {
|
|
||||||
rating
|
|
||||||
mediaRecommendation {
|
|
||||||
id
|
|
||||||
idMal
|
|
||||||
title { romaji english }
|
|
||||||
format
|
|
||||||
status
|
|
||||||
episodes
|
|
||||||
averageScore
|
|
||||||
coverImage { medium }
|
|
||||||
siteUrl
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
externalLinks {
|
|
||||||
site
|
|
||||||
url
|
|
||||||
type
|
|
||||||
}
|
|
||||||
nextAiringEpisode {
|
|
||||||
airingAt
|
|
||||||
episode
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
@@ -628,21 +572,13 @@ class AnimePahe:
|
|||||||
# ---------------- SEASONS ----------------
|
# ---------------- SEASONS ----------------
|
||||||
|
|
||||||
async def get_seasons(self, anime_id: str) -> dict:
|
async def get_seasons(self, anime_id: str) -> dict:
|
||||||
"""
|
|
||||||
Scrape the 'More Seasons' section from aniwatchtv.to using the
|
|
||||||
existing Playwright browser context.
|
|
||||||
anime_id is the full slug, e.g. jujutsu-kaisen-the-culling-game-part-1-20401
|
|
||||||
"""
|
|
||||||
url = f"{ANIWATCHTV_BASE}/{anime_id}"
|
url = f"{ANIWATCHTV_BASE}/{anime_id}"
|
||||||
page = await self.context.new_page()
|
page = await self.context.new_page()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000)
|
||||||
|
|
||||||
# Short wait for lazy-loaded images and JS rendering
|
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
# Wait for season block — try common selectors gracefully
|
|
||||||
for selector in [".os-list", ".seasons-block", "[class*='season']", "main"]:
|
for selector in [".os-list", ".seasons-block", "[class*='season']", "main"]:
|
||||||
try:
|
try:
|
||||||
await page.wait_for_selector(selector, timeout=5000)
|
await page.wait_for_selector(selector, timeout=5000)
|
||||||
@@ -653,10 +589,9 @@ class AnimePahe:
|
|||||||
seasons = await page.evaluate(f"""() => {{
|
seasons = await page.evaluate(f"""() => {{
|
||||||
const BASE = "{ANIWATCHTV_BASE}";
|
const BASE = "{ANIWATCHTV_BASE}";
|
||||||
const currentId = "{anime_id}";
|
const currentId = "{anime_id}";
|
||||||
const results = [];
|
const results =[];
|
||||||
const seen = new Set();
|
const seen = new Set();
|
||||||
|
|
||||||
// Strategy 1: dedicated season list block (.os-list or similar)
|
|
||||||
const block = (
|
const block = (
|
||||||
document.querySelector('.os-list') ||
|
document.querySelector('.os-list') ||
|
||||||
document.querySelector('.seasons-block') ||
|
document.querySelector('.seasons-block') ||
|
||||||
@@ -664,7 +599,6 @@ class AnimePahe:
|
|||||||
document.querySelector('[class*="season-list"]')
|
document.querySelector('[class*="season-list"]')
|
||||||
);
|
);
|
||||||
|
|
||||||
// Strategy 2: find a heading that says "More Seasons" and walk up
|
|
||||||
const fallbackContainer = (() => {{
|
const fallbackContainer = (() => {{
|
||||||
for (const el of document.querySelectorAll('*')) {{
|
for (const el of document.querySelectorAll('*')) {{
|
||||||
if (/more\\s+seasons?/i.test(el.innerText?.trim() || '')) {{
|
if (/more\\s+seasons?/i.test(el.innerText?.trim() || '')) {{
|
||||||
@@ -690,7 +624,6 @@ class AnimePahe:
|
|||||||
if (!fullUrl) continue;
|
if (!fullUrl) continue;
|
||||||
|
|
||||||
const slug = fullUrl.replace(/\\/$/, '').split('/').pop();
|
const slug = fullUrl.replace(/\\/$/, '').split('/').pop();
|
||||||
// Include ALL slugs — current page included — dedupe only
|
|
||||||
if (!slug || seen.has(slug)) continue;
|
if (!slug || seen.has(slug)) continue;
|
||||||
seen.add(slug);
|
seen.add(slug);
|
||||||
|
|
||||||
@@ -700,7 +633,6 @@ class AnimePahe:
|
|||||||
const titleEl = a.querySelector('span, [class*="title"], [class*="name"]');
|
const titleEl = a.querySelector('span, [class*="title"], [class*="name"]');
|
||||||
const title = (titleEl?.innerText?.trim() || a.innerText?.trim() || slug);
|
const title = (titleEl?.innerText?.trim() || a.innerText?.trim() || slug);
|
||||||
|
|
||||||
// Poster is in a sibling/child div.season-poster as a CSS background-image
|
|
||||||
const posterEl = a.querySelector('.season-poster') || a.closest('li, div')?.querySelector('.season-poster');
|
const posterEl = a.querySelector('.season-poster') || a.closest('li, div')?.querySelector('.season-poster');
|
||||||
let poster = null;
|
let poster = null;
|
||||||
if (posterEl) {{
|
if (posterEl) {{
|
||||||
@@ -752,8 +684,8 @@ async def root():
|
|||||||
"/ids/:session",
|
"/ids/:session",
|
||||||
"/episodes/:session?p=:page&resolve=false|true",
|
"/episodes/:session?p=:page&resolve=false|true",
|
||||||
"/resolve/:animeSession/:episodeSession",
|
"/resolve/:animeSession/:episodeSession",
|
||||||
"/seasons/:animeId - e.g. /seasons/jujutsu-kaisen-the-culling-game-part-1-20401",
|
"/seasons/:animeId",
|
||||||
"/poster?url=:cdnImageUrl - proxy hotlink-protected poster images",
|
"/poster?url=:cdnImageUrl",
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -790,17 +722,6 @@ async def api_resolve(anime: str, episode: str):
|
|||||||
|
|
||||||
@app.get("/seasons/{anime_id:path}")
|
@app.get("/seasons/{anime_id:path}")
|
||||||
async def api_seasons(anime_id: str, request: Request):
|
async def api_seasons(anime_id: str, request: Request):
|
||||||
"""
|
|
||||||
Scrape the More Seasons section from aniwatchtv.to.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
GET /seasons/jujutsu-kaisen-the-culling-game-part-1-20401
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
id - the slug passed in
|
|
||||||
total - number of other seasons found
|
|
||||||
seasons[] - list of { title, id, numericId, url, poster }
|
|
||||||
"""
|
|
||||||
data = await pahe.get_seasons(anime_id)
|
data = await pahe.get_seasons(anime_id)
|
||||||
base_url = str(request.base_url).rstrip("/")
|
base_url = str(request.base_url).rstrip("/")
|
||||||
for season in data.get("seasons", []):
|
for season in data.get("seasons", []):
|
||||||
@@ -813,20 +734,13 @@ async def api_seasons(anime_id: str, request: Request):
|
|||||||
|
|
||||||
@app.get("/poster")
|
@app.get("/poster")
|
||||||
async def api_poster(url: str = Query(..., description="CDN image URL to proxy")):
|
async def api_poster(url: str = Query(..., description="CDN image URL to proxy")):
|
||||||
"""
|
|
||||||
Proxy a hotlink-protected poster image with the correct Referer header.
|
|
||||||
Use this to display season/anime posters in the browser.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
GET /poster?url=https://cdn.noitatnemucod.net/thumbnail/100x200/100/abc123.jpg
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
|
async with httpx.AsyncClient(timeout=10, follow_redirects=True) as client:
|
||||||
resp = await client.get(
|
resp = await client.get(
|
||||||
url,
|
url,
|
||||||
headers={
|
headers={
|
||||||
"Referer": "https://aniwatchtv.to/",
|
"Referer": "https://aniwatchtv.to/",
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122 Safari/537.36",
|
"User-Agent": "Mozilla/5.0",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|||||||
Reference in New Issue
Block a user