Python script for parsing book data from Yandex Books

import asyncio, csv, re, random, json
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Set
from playwright.async_api import async_playwright
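# Requirements (one-time setup, assuming a standard Playwright install):
#   pip install playwright
#   python -m playwright install chromium   # or "chrome" when USE_SYSTEM_CHROME is enabled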
 
# ========= CONFIG =========
ENTRY_POINTS = [
    # Add shelves/searches you want:
    "https://books.yandex.ru/books/t-samorazvitie-ru/all",
]
OUTPUT_CSV = "yandex_nonfiction.csv"
 
# Browser
USE_SYSTEM_CHROME = False          # set True if Chromium has issues; then run: python -m playwright install chrome
USE_PERSISTENT_PROFILE = True      # keeps cookies (log in once)
USER_DATA_DIR = "ydx_profile"
 
# Loader
SCROLL_MAX_STEPS = 420             # max scroll/click iterations per listing
STOP_AFTER = 300                   # stop once this many book links have loaded
STAGNATION_LIMIT = 8               # bail out after this many iterations with no new cards
 
# Enrichment
ENRICH_CONCURRENCY = 3             # book pages opened in parallel during enrichment
# ==========================
 
BOOK_LINK_SELECTOR = "a[href^='/books/']"
 
# Heuristics for listing cards (overwritten later with values from the book page)
READERS_PAT = re.compile(r"(читал[аи]?|читает|readers?)\D*([0-9\u00A0\u202F\s.,]+)", re.I)
QUOTES_PAT  = re.compile(r"(цитат[аы]?|quotes?)\D*([0-9\u00A0\u202F\s.,]+)", re.I)
 
# Robust patterns for book tabs
BOOK_READERS_PAT = re.compile(r"(читали|читают|читает|прочитал[аи]?|readers?)\s*[:\s]*([0-9\u00A0\u202F\s.,]+(?:\s*(?:тыс\.?|млн|k|к))?)", re.I)
BOOK_QUOTES_PAT  = re.compile(r"(цитат[аы]?|quotes?)\s*[:\s]*([0-9\u00A0\u202F\s.,]+(?:\s*(?:тыс\.?|млн|k|к))?)", re.I)
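# e.g. "читали 3,9 тыс." / "readers: 4.3K" for readers, "цитаты: 1,2 млн" for quotes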
 
BOOK_HREF_RE = re.compile(r'\/books\/[A-Za-z0-9_-]+')
 
def normalize_count_phrase(raw: str) -> int:
    """
    Handles '848', '3,9 тыс.', '4.3K', '1 499', '1,2 млн', etc.
    """
    t = raw.replace("\u00A0", " ").replace("\u202F", " ").strip()
    # Join digit groups split by spaces so '1 499' is read as 1499, not 1.
    t = re.sub(r"(?<=\d)\s+(?=\d)", "", t)
    m = re.search(r"(\d+(?:[.,]\d+)?)", t)
    if not m:
        return 0
    val = float(m.group(1).replace(",", "."))
    s = t.lower()
    if "млрд" in s or " billion" in s:
        val *= 1_000_000_000
    elif "млн" in s or " million" in s:
        val *= 1_000_000
    elif "тыс" in s or "тысяч" in s or "k" in s or "к" in s:
        val *= 1_000
    return int(val)
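
# Illustrative expected conversions (not executed during the run):
#   normalize_count_phrase("848")       -> 848
#   normalize_count_phrase("3,9 тыс.")  -> 3900
#   normalize_count_phrase("4.3K")      -> 4300
#   normalize_count_phrase("1 499")     -> 1499
#   normalize_count_phrase("1,2 млн")   -> 1200000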
 
async def safe_text(locator) -> str:
    try:
        return (await locator.inner_text()) or ""
    except:
        return ""
 
# ---------- Loader: virtualized list + show-more ----------
async def load_all_cards(page, max_steps=SCROLL_MAX_STEPS, target_count=STOP_AFTER, stagnation_limit=STAGNATION_LIMIT):
    stagnation = 0
    prev_count = -1
    for _ in range(max_steps):
        count = await page.locator(BOOK_LINK_SELECTOR).count()
 
        if count >= target_count:
            break
 
        stagnation = stagnation + 1 if count == prev_count else 0
        prev_count = count
 
        if stagnation >= stagnation_limit:
            # last nudge
            try:
                last = page.locator(BOOK_LINK_SELECTOR).last
                if await last.count():
                    await last.scroll_into_view_if_needed()
                    await page.wait_for_timeout(800)
                    try:
                        await page.wait_for_load_state("networkidle", timeout=3000)
                    except:
                        pass
            except:
                pass
            again = await page.locator(BOOK_LINK_SELECTOR).count()
            if again <= count:
                break
            stagnation = 0
            prev_count = again
            continue
 
        # click "show more"
        try:
            more = page.locator(
                "button:has-text('Показать ещё'), button:has-text('Ещё'), "
                "a:has-text('Показать ещё'), a:has-text('Ещё'), "
                "button:has-text('Show more'), a:has-text('Show more')"
            )
            if await more.count():
                await more.first.click(timeout=800)
                await page.wait_for_timeout(900)
        except:
            pass
 
        # scroll containers + window + last card
        await page.evaluate("""
        () => {
          const els = [document.scrollingElement, ...Array.from(document.querySelectorAll('*'))];
          for (const el of els) {
            if (!el) continue;
            if (el.scrollHeight - el.clientHeight > 50) el.scrollTop = el.scrollHeight;
          }
          window.scrollTo(0, document.body.scrollHeight);
        }""")
        try:
            last = page.locator(BOOK_LINK_SELECTOR).last
            if await last.count():
                await last.scroll_into_view_if_needed()
        except:
            pass
 
        await page.wait_for_timeout(500 + int(random.uniform(150, 450)))
        try:
            await page.wait_for_load_state("networkidle", timeout=3000)
        except:
            pass
 
# ---------- XHR sniffer ----------
def looks_like_json(resp) -> bool:
    try:
        ct = resp.headers.get("content-type", "")
        return "application/json" in ct or resp.url.endswith(".json")
    except:
        return False
 
def attach_sniffer(page, bucket: Set[str]):
    async def handle_resp(resp):
        try:
            if resp.request.resource_type in ("xhr", "fetch") and looks_like_json(resp):
                txt = await resp.text()
                for m in BOOK_HREF_RE.findall(txt):
                    bucket.add("https://books.yandex.ru" + m)
                # Optionally parse the JSON payload itself if you map its structure (see the sketch after attach_sniffer).
        except:
            pass
    page.on("response", lambda resp: asyncio.create_task(handle_resp(resp)))
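
# A minimal, schema-agnostic sketch for the "optionally parse JSON" note above:
# it walks whatever json.loads() returns and collects href-like strings, without
# assuming anything about the actual response layout of books.yandex.ru.
# The helper name and its usage are illustrative only.
def collect_book_hrefs(node, bucket: Set[str]) -> None:
    if isinstance(node, dict):
        for v in node.values():
            collect_book_hrefs(v, bucket)
    elif isinstance(node, list):
        for v in node:
            collect_book_hrefs(v, bucket)
    elif isinstance(node, str):
        for m in BOOK_HREF_RE.findall(node):
            bucket.add("https://books.yandex.ru" + m)
# Inside handle_resp this could be called as (illustrative):
#     collect_book_hrefs(json.loads(txt), bucket)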
 
# ---------- Card parsing on listing ----------
async def extract_card_data(card) -> Optional[Dict]:
    a = card.locator(BOOK_LINK_SELECTOR)
    if not await a.count():
        return None
    href = await a.first.get_attribute("href")
    title = (await a.first.inner_text() or "").strip()
    if not (href and title):
        return None
 
    authors = ""
    for sel in ["[class*='author']", "[data-testid*='author']", "a[href^='/authors/']", "a[href*='/persons/']"]:
        loc = card.locator(sel)
        if await loc.count():
            names = []
            for i in range(min(await loc.count(), 6)):
                nm = (await loc.nth(i).inner_text() or "").strip()
                if nm and nm.lower() != title.lower():
                    names.append(nm)
            if names:
                authors = "; ".join(dict.fromkeys(names))
                break
    if not authors:
        blob = (await card.inner_text() or "")
        lines = [ln.strip() for ln in blob.splitlines() if ln.strip()]
        if title in lines:
            i = lines.index(title)
            if i + 1 < len(lines):
                authors = lines[i + 1]
        elif len(lines) >= 2:
            authors = lines[1]
 
    readers = quotes = 0
    chips_blob = await safe_text(card)
    m = READERS_PAT.search(chips_blob)
    readers = normalize_count_phrase(m.group(0)) if m else 0
    m = QUOTES_PAT.search(chips_blob)
    quotes = normalize_count_phrase(m.group(0)) if m else 0
 
    # Crude fiction filter: drop titles that look like novels unless they also mention "non".
    low = title.lower()
    if any(k in low for k in ["роман", "novel"]) and "non" not in low:
        return None
 
    return {
        "url": "https://books.yandex.ru" + href,
        "title": title,
        "authors": authors or "",
        "readers": readers,
        "quotes": quotes,
    }
 
async def scrape_listing(page) -> List[Dict]:
    results: List[Dict] = []
    seen = set()
    cards = await page.locator("article, div").all()
    for card in cards:
        try:
            data = await extract_card_data(card)
            if not data:
                continue
            key = (data["title"].strip().lower(), data["authors"].strip().lower())
            if key in seen:
                continue
            seen.add(key)
            results.append(data)
        except:
            continue
    return results
 
# ---------- Robust tab counter (returns value + raw) ----------
async def read_counter_from_tab(page, suffix: str) -> Tuple[int, str]:
    a = page.locator(f"a[href$='/{suffix}']").first
    if not await a.count():
        return 0, ""
    parts = []
 
    # keep split spans and plain text
    try:
        html = await a.evaluate("el => el.innerHTML")
        if html:
            parts.append(html)
    except:
        pass
    try:
        txt = await a.evaluate("el => el.textContent")
        if txt:
            parts.append(txt)
    except:
        pass
    for attr in ("aria-label", "title"):
        try:
            v = await a.get_attribute(attr)
            if v:
                parts.append(v)
        except:
            pass
 
    blob = " ".join(p.strip() for p in parts if p and p.strip())
    raw = blob
 
    # Strict label-anchored match first
    if suffix == "readers":
        m = BOOK_READERS_PAT.search(blob)
    else:
        m = BOOK_QUOTES_PAT.search(blob)
    if m:
        return normalize_count_phrase(m.group(0)), raw
 
    # Fallback: first clean number (with optional suffix)
    m = re.search(r"([0-9\u00A0\u202F\s.,]+(?:\s*(?:тыс\.?|млн|k|к))?)", blob, re.I)
    if m:
        return normalize_count_phrase(m.group(1)), raw
 
    return 0, raw
 
async def ensure_title_authors(page, b: Dict):
    if b.get("title") and b.get("authors"):
        return
    tloc = page.locator("h1, [data-testid*='title']")
    if await tloc.count():
        t = (await tloc.first.inner_text() or "").strip()
        if t:
            b["title"] = t
    aloc = page.locator("a[href^='/authors/'], a[href*='/persons/'], [data-testid*='author']")
    if await aloc.count():
        names = []
        for i in range(min(await aloc.count(), 6)):
            nm = (await aloc.nth(i).inner_text() or "").strip()
            if nm:
                names.append(nm)
        if names:
            b["authors"] = "; ".join(dict.fromkeys(names))
 
async def parse_counts_from_book(ctx, b: Dict) -> Tuple[int, int, str, str]:
    p = await ctx.new_page()
    try:
        await p.goto(b["url"], wait_until="domcontentloaded")
        await p.wait_for_timeout(800)
 
        await ensure_title_authors(p, b)
 
        readers, raw_r = await read_counter_from_tab(p, "readers")
        quotes,  raw_q = await read_counter_from_tab(p, "quotes")
 
        if readers == 0:
            await p.goto(b["url"].rstrip("/") + "/readers", wait_until="domcontentloaded")
            await p.wait_for_timeout(800)
            r2, rr2 = await read_counter_from_tab(p, "readers")
            if r2 > readers:
                readers, raw_r = r2, rr2
 
        if quotes == 0:
            await p.goto(b["url"].rstrip("/") + "/quotes", wait_until="domcontentloaded")
            await p.wait_for_timeout(800)
            q2, rq2 = await read_counter_from_tab(p, "quotes")
            if q2 > quotes:
                quotes, raw_q = q2, rq2
 
        return readers or 0, quotes or 0, raw_r, raw_q
    finally:
        await p.close()
 
# ---------- Main ----------
async def main():
    out = Path(OUTPUT_CSV)
    rows: List[Dict] = []
    seen_urls: Set[str] = set()
    seen_from_api: Set[str] = set()
 
    async with async_playwright() as pw:
        launch_args = ["--disable-blink-features=AutomationControlled"]
        if USE_PERSISTENT_PROFILE:
            if USE_SYSTEM_CHROME:
                ctx = await pw.chromium.launch_persistent_context(
                    user_data_dir=USER_DATA_DIR, channel="chrome",
                    headless=False, args=launch_args
                )
            else:
                ctx = await pw.chromium.launch_persistent_context(
                    user_data_dir=USER_DATA_DIR, headless=False, args=launch_args
                )
            page = await ctx.new_page()
        else:
            if USE_SYSTEM_CHROME:
                browser = await pw.chromium.launch(channel="chrome", headless=False, args=launch_args)
            else:
                browser = await pw.chromium.launch(headless=False, args=launch_args)
            ctx = await browser.new_context()
            page = await ctx.new_page()
 
        # open the home page (a captcha page may appear first)
        await page.goto("https://books.yandex.ru", wait_until="domcontentloaded")
        if "showcaptcha" in page.url:
            print("⚠️ Captcha detected. Solve it in the browser, then press ENTER here...")
            input()
 
        attach_sniffer(page, seen_from_api)
 
        for url in ENTRY_POINTS:
            print(f"▶️ Visiting: {url}")
            await page.goto(url, wait_until="domcontentloaded")
            if "showcaptcha" in page.url:
                print("⚠️ Captcha detected again. Solve it, then press ENTER...")
                input()
 
            await load_all_cards(page)
            scraped = await scrape_listing(page)
            for b in scraped:
                if b["url"] in seen_urls:
                    continue
                seen_urls.add(b["url"])
                rows.append(b)
                print(f" + {b['title']}{b['authors']} (R:{b['readers']} Q:{b['quotes']})")
 
        # merge URLs from XHR sniffer
        added = 0
        for url in list(seen_from_api):
            if url not in seen_urls:
                rows.append({"url": url, "title": "", "authors": "", "readers": 0, "quotes": 0})
                seen_urls.add(url); added += 1
        if added:
            print(f"📦 Added {added} book URLs from API sniffing.")
 
        # Enrich: accurate counts + fill metadata; also keep raw strings
        print("⏳ Fetching accurate readers/quotes from book pages...")
        sem = asyncio.Semaphore(ENRICH_CONCURRENCY)
 
        async def enrich(b):
            async with sem:
                try:
                    r, q, raw_r, raw_q = await parse_counts_from_book(ctx, b)
                    if r: b["readers"] = max(b.get("readers", 0), r)
                    if q: b["quotes"]  = max(b.get("quotes", 0),  q)
                    b["readers_raw"] = raw_r
                    b["quotes_raw"]  = raw_q
                    print(f"   ↳ counts: {b['title'] or '[no title]'} (R:{b['readers']} Q:{b['quotes']})")
                except Exception as e:
                    print(f"   ! enrich failed: {b.get('title','[no title]')}{e}")
 
        await asyncio.gather(*(enrich(b) for b in rows))
 
        # Save CSV (with raw columns for auditing)
        with out.open("w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=["title","authors","readers","quotes","url","readers_raw","quotes_raw"])
            w.writeheader()
            for r in rows:
                w.writerow(r)
 
        print(f"✅ Saved {len(rows)} records to {out.resolve()}")
 
        # close
        if USE_PERSISTENT_PROFILE:
            await ctx.close()
        else:
            await ctx.browser.close()
 
if __name__ == "__main__":
    asyncio.run(main())

BIO

🧠 theBrain mapping

ID: 202508110721 Source:: Friend:: Child:: Next::

Keywords:

Reference: