Skip to content

usenix_scrape

src.scrapers.usenix_scrape

Scrape USENIX conference technical-sessions pages (e.g. FAST, OSDI, ATC) for paper titles and artifact evaluation badges.

Each USENIX conference publishes a program at

https://www.usenix.org/conference/<conference><yy>/technical-sessions

Individual presentation pages may include artifact badge images in a `field-artifact-evaluated` div. The badge type is inferred from the image filename (available, functional, reproduced/replicated).

Usage examples

Scrape FAST 2025

python usenix_scrape.py --conference fast --years 2025

Scrape FAST 2024 and 2025

python usenix_scrape.py --conference fast --years 2024,2025

Scrape multiple conferences

python usenix_scrape.py --conference fast,osdi --years 2024,2025

Output as YAML suitable for the pipeline

python usenix_scrape.py --conference fast --years 2025 --format yaml

get_session(session=None)

Return a requests.Session, optionally reusing an existing one.

Source code in src/scrapers/usenix_scrape.py
51
52
53
54
55
56
57
def get_session(session=None):
    """Return *session* if one was supplied, otherwise create a fresh requests.Session."""
    if session is None:
        # Imported lazily so that merely importing this module does not pull
        # in the HTTP utilities.
        from src.utils.http import create_session

        session = create_session()
    return session

Scrape the technical-sessions page for a USENIX conference and return a list of unique presentation paths (e.g. /conference/fast25/presentation/satija).

Source code in src/scrapers/usenix_scrape.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def scrape_presentation_links(conference: str, year: int, session: requests.Session | None = None) -> list[str]:
    """
    Scrape the technical-sessions page for a USENIX conference and return
    a list of unique presentation paths (e.g. /conference/fast25/presentation/satija).
    """
    sess = get_session(session)
    suffix = _year_suffix(year)
    url = f"{BASE_URL}/conference/{conference}{suffix}/technical-sessions"

    logger.info(f"  Fetching program: {url}")

    # A cache hit is already the final, sorted list of presentation paths.
    cached = _read_cache(CACHE_DIR, url, ttl=CACHE_TTL, namespace="usenix")
    if cached is not _MISSING:
        logger.info(f"  Found {len(cached)} unique presentation pages (cached)")
        return cached

    response = sess.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Keep only anchors pointing at this conference's presentation pages;
    # the set drops duplicates (the same paper is linked from several places).
    prefix = f"/conference/{conference}{suffix}/presentation/"
    unique_paths = {a["href"] for a in soup.find_all("a", href=True) if a["href"].startswith(prefix)}

    result = sorted(unique_paths)
    _write_cache(CACHE_DIR, url, result, namespace="usenix")
    logger.info(f"  Found {len(result)} unique presentation pages")
    return result

scrape_paper_page(path: str, session: requests.Session | None = None) -> dict | None

Scrape a single USENIX presentation page and extract
  • title
  • authors (text)
  • artifact evaluation badges (from badge images)
  • paper PDF URL

Returns a dict or None if the page is not a research paper.

Source code in src/scrapers/usenix_scrape.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def scrape_paper_page(path: str, session: requests.Session | None = None) -> dict | None:
    """
    Scrape a single USENIX presentation page and extract:
      - title
      - authors (text)
      - artifact evaluation badges (from badge images)
      - paper PDF URL

    Returns a dict or None if the page is not a research paper.
    """
    sess = get_session(session)
    url = f"{BASE_URL}{path}"

    # Serve from cache when possible; a cached None is a valid
    # "this page is not a paper" answer.
    cached = _read_cache(CACHE_DIR, url, ttl=CACHE_TTL, namespace="usenix_paper")
    if cached is not _MISSING:
        return cached

    resp = sess.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # The title lives in <h1 id="page-title"> or <h1 class="page__title">,
    # depending on the site theme in use.
    heading = soup.find("h1", id="page-title") or soup.find("h1", class_="page__title")
    if not heading:
        logger.warning(f"Could not find title element in {url} — page structure may have changed")
        _write_cache(CACHE_DIR, url, None, namespace="usenix_paper")
        return None
    title = heading.get_text(strip=True)

    # Non-paper program entries (keynotes, breaks, ...) are recognized by
    # their title prefix; str.startswith accepts the whole tuple at once.
    non_paper_prefixes = (
        "keynote",
        "panel",
        "workshop",
        "tutorial",
        "honoring",
        "break",
        "lunch",
        "closing",
        "opening",
        "reception",
        "poster session",
        "work-in-progress",
    )
    if title.lower().startswith(non_paper_prefixes):
        _write_cache(CACHE_DIR, url, None, namespace="usenix_paper")
        return None

    # Authors as plain text; empty string when the field is absent.
    authors = ""
    authors_div = soup.find("div", class_=re.compile(r"field-name-field-paper-people-text"))
    if authors_div:
        authors = authors_div.get_text(strip=True)

    # Badge type is inferred from each badge image's filename.
    badges = []
    artifact_div = soup.find("div", class_=re.compile(r"field-name-field-artifact-evaluated"))
    if artifact_div:
        for img in artifact_div.find_all("img"):
            src = img.get("src", "").lower()
            if "available" in src:
                badges.append("available")
            elif "functional" in src:
                badges.append("functional")
            elif "reproduced" in src or "replicated" in src:
                badges.append("reproduced")

    # Final-paper PDF link; relative hrefs are made absolute.
    paper_url = ""
    pdf_div = soup.find("div", class_=re.compile(r"field-name-field-final-paper-pdf"))
    if pdf_div:
        anchor = pdf_div.find("a", href=True)
        if anchor:
            paper_url = anchor["href"]
            if paper_url.startswith("/"):
                paper_url = BASE_URL + paper_url

    result = {
        "title": title,
        "authors": authors,
        "badges": badges,
        "paper_url": paper_url,
        "presentation_url": url,
    }
    _write_cache(CACHE_DIR, url, result, namespace="usenix_paper")
    return result

scrape_conference_year(conference: str, year: int, session: requests.Session | None = None, max_workers: int = 4, delay: float = 0.5) -> list[dict]

Scrape all papers and badges for a conference/year combination.

Parameters:

Name Type Description Default
conference str

Conference short name (e.g. 'fast')

required
year int

Full year (e.g. 2025)

required
session Session | None

Optional requests.Session

None
max_workers int

Number of parallel requests for paper pages

4
delay float

Delay between batches of requests (be polite)

0.5

Returns:

Type Description
list[dict]

List of artifact dicts with badges

Source code in src/scrapers/usenix_scrape.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def scrape_conference_year(
    conference: str, year: int, session: requests.Session | None = None, max_workers: int = 4, delay: float = 0.5
) -> list[dict]:
    """
    Scrape all papers and badges for a conference/year combination.

    Args:
        conference: Conference short name (e.g. 'fast')
        year: Full year (e.g. 2025)
        session: Optional requests.Session
        max_workers: Number of parallel requests for paper pages
        delay: Delay between batches of requests (be polite)

    Returns:
        List of artifact dicts with badges
    """
    sess = get_session(session)
    paths = scrape_presentation_links(conference, year, sess)

    if not paths:
        logger.info(f"  No presentation pages found for {conference.upper()} {year}")
        return []

    artifacts = []
    papers_with_badges = 0
    total = len(paths)

    # Scrape paper pages with controlled parallelism
    def _fetch(path):
        time.sleep(delay)  # be polite — throttle each worker before its request
        return scrape_paper_page(path, sess)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_fetch, p): p for p in paths}
        for i, future in enumerate(as_completed(futures), 1):
            path = futures[future]
            try:
                result = future.result()
                if result is not None:
                    artifacts.append(result)
                    if result["badges"]:
                        papers_with_badges += 1
            except Exception as e:
                logger.warning(f"  Error scraping {path}: {e}")
            # Bug fix: progress logging used to be nested inside the
            # `result is not None` branch, so progress lines — including the
            # final "Scraped N/N" — were skipped whenever the completed page
            # was a non-paper entry or raised. Log for every completed future.
            if i % 10 == 0 or i == total:
                logger.info(f"  Scraped {i}/{total} pages...")

    logger.info(
        f"  {conference.upper()} {year}: {len(artifacts)} papers, {papers_with_badges} with artifact badges",
    )
    return artifacts

scrape_organizers(conference: str, year: int, session: requests.Session | None = None) -> dict | None

Scrape the AE committee chairs and members from the USENIX call-for-artifacts page.

USENIX pages use two HTML structures for the AEC list:

  1. Structured `<h3 class="grouping-field-heading">` sections (e.g. FAST 25, OSDI 25)

  2. Inline `<h2>`/`<p>` with `<br>`-separated entries (e.g. FAST 24)

Parameters:

Name Type Description Default
conference str

Conference short name (e.g. 'fast', 'osdi')

required
year int

Conference year (int)

required
session Session | None

Optional requests.Session

None

Returns:

Type Description
dict | None

dict with 'chairs' (list of {'name': ..., 'affiliation': ...})

dict | None

and 'members' (list of {'name': ..., 'affiliation': ...}),

dict | None

or None if the page cannot be fetched or parsed.

Source code in src/scrapers/usenix_scrape.py
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
def scrape_organizers(conference: str, year: int, session: requests.Session | None = None) -> dict | None:
    """
    Scrape the AE committee chairs and members from the USENIX call-for-artifacts page.

    USENIX pages use two HTML structures for the AEC list:
      1. Structured <h3 class="grouping-field-heading"> sections (e.g. FAST 25, OSDI 25)
      2. Inline <h2>/<p> with <br>-separated entries (e.g. FAST 24)

    Args:
        conference: Conference short name (e.g. 'fast', 'osdi')
        year: Conference year (int)
        session: Optional requests.Session

    Returns:
        dict with 'chairs' (list of {'name': ..., 'affiliation': ...})
        and 'members' (list of {'name': ..., 'affiliation': ...}),
        or None if the page cannot be fetched or parsed.
    """
    sess = get_session(session)
    suffix = _year_suffix(year)
    url = f"{BASE_URL}/conference/{conference}{suffix}/call-for-artifacts"

    cache_key = f"{url}#organizers"
    cached = _read_cache(CACHE_DIR, cache_key, ttl=CACHE_TTL, namespace="usenix_organizers")
    if cached is not _MISSING:
        return cached

    logger.info(f"  Fetching organizers: {url}")
    try:
        resp = sess.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        # Deliberately NOT cached: a transient fetch failure should be
        # retried on the next run rather than pinned for the cache TTL.
        logger.warning(f"  Warning: Could not fetch organizers page: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    def _clean_text(html_str):
        """Remove HTML tags and clean up whitespace/entities."""
        text = re.sub(r"&nbsp;", " ", html_str)
        text = re.sub(r"&quot;", '"', text)
        text = re.sub(r"<em>", "", text)
        text = re.sub(r"</em>", "", text)
        text = re.sub(r"<[^>]+>", "", text)
        return text.strip()

    def _parse_name_affiliation(text):
        """Parse 'Name, Affiliation' into a dict; affiliation may be empty."""
        text = text.strip()
        if not text:
            return None
        # Split on the first ", " only — affiliations themselves often contain commas.
        if ", " in text:
            name, affiliation = text.split(", ", 1)
        else:
            name, affiliation = text, ""
        return {"name": name.strip(), "affiliation": affiliation.strip()}

    def _extract_from_structured(heading_text, exact=False):
        """Extract entries from structured <h3 class='grouping-field-heading'> sections."""
        entries = []
        heading = None

        # Search via <span> child for reliable matching
        for h3 in soup.find_all("h3", class_="grouping-field-heading"):
            span = h3.find("span")
            if span:
                span_text = span.get_text().strip()
                # Parenthesized for readability; `exact=True` prevents the
                # plain "Committee" lookup from matching the "Co-Chairs" heading.
                if (exact and span_text == heading_text) or (not exact and heading_text in span_text):
                    heading = h3
                    break

        if heading:
            # Collect all sibling divs until next h3
            for sibling in heading.find_next_siblings():
                if sibling.name == "h3":
                    break
                for div in sibling.find_all("div", class_=re.compile(r"field-content")):
                    text = _clean_text(str(div))
                    entry = _parse_name_affiliation(text)
                    if entry:
                        entries.append(entry)
        return entries

    def _extract_from_inline(section_html):
        """Extract entries from inline <p> with <br>-separated text."""
        entries = []
        # Split on <br> tags
        lines = re.split(r"<br\s*/?>", section_html)
        for line in lines:
            text = _clean_text(line)
            entry = _parse_name_affiliation(text)
            if entry:
                entries.append(entry)
        return entries

    # Try structured format first (FAST 25, OSDI 25 style).
    # (Previously these were dead-initialized to [] and immediately overwritten.)
    chairs = _extract_from_structured("Artifact Evaluation Committee Co-Chairs")
    members = _extract_from_structured("Artifact Evaluation Committee", exact=True)

    # Fall back to inline format (FAST 24 style: <h2> then <p> with <br>)
    if not chairs:
        h2_chairs = soup.find("h2", string=re.compile(r"Artifact Evaluation Committee Co-Chairs"))
        if h2_chairs:
            p_tag = h2_chairs.find_next("p")
            if p_tag:
                chairs = _extract_from_inline(str(p_tag))

    if not members:
        # Find the <h2>Artifact Evaluation Committee</h2> (not Co-Chairs)
        for h2 in soup.find_all("h2"):
            h2_text = h2.get_text(strip=True)
            if h2_text == "Artifact Evaluation Committee":
                # Skip comment blocks, find the actual <p> with members
                for sibling in h2.find_next_siblings():
                    if sibling.name in ("h2", "h3"):
                        break
                    if sibling.name == "p" and "<br" in str(sibling):
                        members = _extract_from_inline(str(sibling))
                        if members:
                            break
                break

    if not chairs and not members:
        # The page was fetched but holds no AEC data — cache the negative
        # result so we do not refetch within the TTL.
        logger.warning(f"  Warning: No organizer data found for {conference.upper()} {year}")
        _write_cache(CACHE_DIR, cache_key, None, namespace="usenix_organizers")
        return None

    result = {"chairs": chairs, "members": members}
    logger.info(f"  Found {len(chairs)} chairs and {len(members)} committee members")
    _write_cache(CACHE_DIR, cache_key, result, namespace="usenix_organizers")
    return result

to_pipeline_format(artifacts)

Convert scraped artifacts to the format used by the existing pipeline (matching sys_sec_artifacts_results_scrape.py output format).

Source code in src/scrapers/usenix_scrape.py
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
def to_pipeline_format(artifacts):
    """
    Convert scraped artifacts to the format used by the existing pipeline
    (matching sys_sec_artifacts_results_scrape.py output format).

    Papers without any artifact badge are dropped: only papers that went
    through artifact evaluation are forwarded.
    """
    converted = []
    for artifact in artifacts:
        badges = artifact["badges"]
        if not badges:
            continue
        record = {"title": artifact["title"], "badges": ",".join(badges)}
        pdf = artifact.get("paper_url")
        if pdf:
            record["paper_url"] = pdf
        converted.append(record)
    return converted