Skip to content

usenix_scrape

src.scrapers.usenix_scrape

Scrape USENIX conference technical-sessions pages (e.g. FAST, OSDI, ATC) for paper titles and artifact evaluation badges.

Each USENIX conference publishes a program at

https://www.usenix.org/conference/<conference><yy>/technical-sessions

Individual presentation pages may include artifact badge images in a `field-artifact-evaluated` div. The badge type is inferred from the image filename (available, functional, reproduced/replicated).

Usage examples

Scrape FAST 2025

python usenix_scrape.py --conference fast --years 2025

Scrape FAST 2024 and 2025

python usenix_scrape.py --conference fast --years 2024,2025

Scrape multiple conferences

python usenix_scrape.py --conference fast,osdi --years 2024,2025

Output as YAML suitable for the pipeline

python usenix_scrape.py --conference fast --years 2025 --format yaml

get_session(session=None)

Return a requests.Session, optionally reusing an existing one.

Source code in src/scrapers/usenix_scrape.py
51
52
53
54
55
56
57
def get_session(session=None):
    """Return *session* if one was supplied, otherwise create a fresh requests.Session."""
    if session is None:
        # Imported lazily so that merely importing this module does not pull
        # in the HTTP utilities.
        from src.utils.http import create_session

        session = create_session()
    return session

Scrape the technical-sessions page for a USENIX conference and return a list of unique presentation paths (e.g. /conference/fast25/presentation/satija).

Source code in src/scrapers/usenix_scrape.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def scrape_presentation_links(conference: str, year: int, session: requests.Session | None = None) -> list[str]:
    """
    Scrape the technical-sessions page for a USENIX conference and return
    a list of unique presentation paths (e.g. /conference/fast25/presentation/satija).
    """
    sess = get_session(session)
    suffix = _year_suffix(year)
    url = f"{BASE_URL}/conference/{conference}{suffix}/technical-sessions"

    logger.info(f"  Fetching program: {url}")

    # A cache hit is already the final, sorted list of presentation paths.
    cached = _read_cache(CACHE_DIR, url, ttl=CACHE_TTL, namespace="usenix")
    if cached is not _MISSING:
        logger.info(f"  Found {len(cached)} unique presentation pages (cached)")
        return cached

    response = sess.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Keep only anchors pointing at this conference's presentation pages;
    # the set drops duplicates (the same paper is linked from several places).
    prefix = f"/conference/{conference}{suffix}/presentation/"
    unique_paths = {a["href"] for a in soup.find_all("a", href=True) if a["href"].startswith(prefix)}

    result = sorted(unique_paths)
    _write_cache(CACHE_DIR, url, result, namespace="usenix")
    logger.info(f"  Found {len(result)} unique presentation pages")
    return result

scrape_paper_page(path: str, session: requests.Session | None = None) -> dict | None

Scrape a single USENIX presentation page and extract
  • title
  • authors (text)
  • artifact evaluation badges (from badge images)
  • paper PDF URL

Returns a dict or None if the page is not a research paper.

Source code in src/scrapers/usenix_scrape.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
def scrape_paper_page(path: str, session: requests.Session | None = None) -> dict | None:
    """
    Scrape a single USENIX presentation page and extract:
      - title
      - authors (text)
      - artifact evaluation badges (from badge images)
      - paper PDF URL

    Returns a dict or None if the page is not a research paper.
    """
    sess = get_session(session)
    url = f"{BASE_URL}{path}"

    # Serve from cache when possible; a cached None is a valid
    # "this page is not a paper" answer.
    cached = _read_cache(CACHE_DIR, url, ttl=CACHE_TTL, namespace="usenix_paper")
    if cached is not _MISSING:
        return cached

    resp = sess.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # The title lives in <h1 id="page-title"> or <h1 class="page__title">,
    # depending on the site theme in use.
    heading = soup.find("h1", id="page-title") or soup.find("h1", class_="page__title")
    if not heading:
        logger.warning(f"Could not find title element in {url} — page structure may have changed")
        _write_cache(CACHE_DIR, url, None, namespace="usenix_paper")
        return None
    title = heading.get_text(strip=True)

    # Non-paper program entries (keynotes, breaks, ...) are recognized by
    # their title prefix; str.startswith accepts the whole tuple at once.
    non_paper_prefixes = (
        "keynote",
        "panel",
        "workshop",
        "tutorial",
        "honoring",
        "break",
        "lunch",
        "closing",
        "opening",
        "reception",
        "poster session",
        "work-in-progress",
    )
    if title.lower().startswith(non_paper_prefixes):
        _write_cache(CACHE_DIR, url, None, namespace="usenix_paper")
        return None

    # Authors as plain text; empty string when the field is absent.
    authors = ""
    authors_div = soup.find("div", class_=re.compile(r"field-name-field-paper-people-text"))
    if authors_div:
        authors = authors_div.get_text(strip=True)

    # Badge type is inferred from each badge image's filename.
    badges = []
    artifact_div = soup.find("div", class_=re.compile(r"field-name-field-artifact-evaluated"))
    if artifact_div:
        for img in artifact_div.find_all("img"):
            src = img.get("src", "").lower()
            if "available" in src:
                badges.append("available")
            elif "functional" in src:
                badges.append("functional")
            elif "reproduced" in src or "replicated" in src:
                badges.append("reproduced")

    # Final-paper PDF link; relative hrefs are made absolute.
    paper_url = ""
    pdf_div = soup.find("div", class_=re.compile(r"field-name-field-final-paper-pdf"))
    if pdf_div:
        anchor = pdf_div.find("a", href=True)
        if anchor:
            paper_url = anchor["href"]
            if paper_url.startswith("/"):
                paper_url = BASE_URL + paper_url

    result = {
        "title": title,
        "authors": authors,
        "badges": badges,
        "paper_url": paper_url,
        "presentation_url": url,
    }
    _write_cache(CACHE_DIR, url, result, namespace="usenix_paper")
    return result

scrape_conference_year(conference: str, year: int, session: requests.Session | None = None, max_workers: int = 4, delay: float = 0.5) -> list[dict]

Scrape all papers and badges for a conference/year combination.

Parameters:

Name Type Description Default
conference str

Conference short name (e.g. 'fast')

required
year int

Full year (e.g. 2025)

required
session Session | None

Optional requests.Session

None
max_workers int

Number of parallel requests for paper pages

4
delay float

Delay between batches of requests (be polite)

0.5

Returns:

Type Description
list[dict]

List of artifact dicts with badges

Source code in src/scrapers/usenix_scrape.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
def scrape_conference_year(
    conference: str, year: int, session: requests.Session | None = None, max_workers: int = 4, delay: float = 0.5
) -> list[dict]:
    """
    Scrape all papers and badges for a conference/year combination.

    Args:
        conference: Conference short name (e.g. 'fast')
        year: Full year (e.g. 2025)
        session: Optional requests.Session
        max_workers: Number of parallel requests for paper pages
        delay: Delay between batches of requests (be polite)

    Returns:
        List of artifact dicts with badges
    """
    sess = get_session(session)
    paths = scrape_presentation_links(conference, year, sess)

    if not paths:
        logger.info(f"  No presentation pages found for {conference.upper()} {year}")
        return []

    artifacts = []
    papers_with_badges = 0
    total = len(paths)

    # Scrape paper pages with controlled parallelism
    def _fetch(path):
        time.sleep(delay)  # be polite — throttle each worker before its request
        return scrape_paper_page(path, sess)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_fetch, p): p for p in paths}
        for i, future in enumerate(as_completed(futures), 1):
            path = futures[future]
            try:
                result = future.result()
                if result is not None:
                    artifacts.append(result)
                    if result["badges"]:
                        papers_with_badges += 1
            except Exception as e:
                logger.warning(f"  Error scraping {path}: {e}")
            # Bug fix: progress logging used to be nested inside the
            # `result is not None` branch, so progress lines — including the
            # final "Scraped N/N" — were skipped whenever the completed page
            # was a non-paper entry or raised. Log for every completed future.
            if i % 10 == 0 or i == total:
                logger.info(f"  Scraped {i}/{total} pages...")

    logger.info(
        f"  {conference.upper()} {year}: {len(artifacts)} papers, {papers_with_badges} with artifact badges",
    )
    return artifacts

scrape_organizers(conference: str, year: int, session: requests.Session | None = None) -> dict | None

Scrape the AE committee chairs and members from the USENIX call-for-artifacts page.

USENIX pages use two HTML structures for the AEC list:

  1. Structured `<h3 class="grouping-field-heading">` sections (e.g. FAST 25, OSDI 25)

  2. Inline `<h2>`/`<p>` with `<br>`-separated entries (e.g. FAST 24)

Parameters:

Name Type Description Default
conference str

Conference short name (e.g. 'fast', 'osdi')

required
year int

Conference year (int)

required
session Session | None

Optional requests.Session

None

Returns:

Type Description
dict | None

dict with 'chairs' (list of {'name': ..., 'affiliation': ...})

dict | None

and 'members' (list of {'name': ..., 'affiliation': ...}),

dict | None

or None if the page cannot be fetched or parsed.

Source code in src/scrapers/usenix_scrape.py
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
def scrape_organizers(conference: str, year: int, session: requests.Session | None = None) -> dict | None:
    """
    Scrape the AE committee chairs and members from the USENIX call-for-artifacts page.

    USENIX pages use two HTML structures for the AEC list:
      1. Structured <h3 class="grouping-field-heading"> sections (e.g. FAST 25, OSDI 25)
      2. Inline <h2>/<p> with <br>-separated entries (e.g. FAST 24)

    Args:
        conference: Conference short name (e.g. 'fast', 'osdi')
        year: Conference year (int)
        session: Optional requests.Session

    Returns:
        dict with 'chairs' (list of {'name': ..., 'affiliation': ...})
        and 'members' (list of {'name': ..., 'affiliation': ...}),
        or None if the page cannot be fetched or parsed.
    """
    sess = get_session(session)
    suffix = _year_suffix(year)
    url = f"{BASE_URL}/conference/{conference}{suffix}/call-for-artifacts"

    cache_key = f"{url}#organizers"
    cached = _read_cache(CACHE_DIR, cache_key, ttl=CACHE_TTL, namespace="usenix_organizers")
    if cached is not _MISSING:
        return cached

    logger.info(f"  Fetching organizers: {url}")
    try:
        resp = sess.get(url, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        # Deliberately NOT cached: a transient fetch failure should be
        # retried on the next run rather than pinned for the cache TTL.
        logger.warning(f"  Warning: Could not fetch organizers page: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    def _clean_text(html_str):
        """Remove HTML tags and clean up whitespace/entities."""
        text = re.sub(r"&nbsp;", " ", html_str)
        text = re.sub(r"&quot;", '"', text)
        text = re.sub(r"<em>", "", text)
        text = re.sub(r"</em>", "", text)
        text = re.sub(r"<[^>]+>", "", text)
        return text.strip()

    def _parse_name_affiliation(text):
        """Parse 'Name, Affiliation' into a dict; affiliation may be empty."""
        text = text.strip()
        if not text:
            return None
        # Split on the first ", " only — affiliations themselves often contain commas.
        if ", " in text:
            name, affiliation = text.split(", ", 1)
        else:
            name, affiliation = text, ""
        return {"name": name.strip(), "affiliation": affiliation.strip()}

    def _extract_from_structured(heading_text, exact=False):
        """Extract entries from structured <h3 class='grouping-field-heading'> sections."""
        entries = []
        heading = None

        # Search via <span> child for reliable matching
        for h3 in soup.find_all("h3", class_="grouping-field-heading"):
            span = h3.find("span")
            if span:
                span_text = span.get_text().strip()
                # Parenthesized for readability; `exact=True` prevents the
                # plain "Committee" lookup from matching the "Co-Chairs" heading.
                if (exact and span_text == heading_text) or (not exact and heading_text in span_text):
                    heading = h3
                    break

        if heading:
            # Collect all sibling divs until next h3
            for sibling in heading.find_next_siblings():
                if sibling.name == "h3":
                    break
                for div in sibling.find_all("div", class_=re.compile(r"field-content")):
                    text = _clean_text(str(div))
                    entry = _parse_name_affiliation(text)
                    if entry:
                        entries.append(entry)
        return entries

    def _extract_from_inline(section_html):
        """Extract entries from inline <p> with <br>-separated text."""
        entries = []
        # Split on <br> tags
        lines = re.split(r"<br\s*/?>", section_html)
        for line in lines:
            text = _clean_text(line)
            entry = _parse_name_affiliation(text)
            if entry:
                entries.append(entry)
        return entries

    # Try structured format first (FAST 25, OSDI 25 style).
    # (Previously these were dead-initialized to [] and immediately overwritten.)
    chairs = _extract_from_structured("Artifact Evaluation Committee Co-Chairs")
    members = _extract_from_structured("Artifact Evaluation Committee", exact=True)

    # Fall back to inline format (FAST 24 style: <h2> then <p> with <br>)
    if not chairs:
        h2_chairs = soup.find("h2", string=re.compile(r"Artifact Evaluation Committee Co-Chairs"))
        if h2_chairs:
            p_tag = h2_chairs.find_next("p")
            if p_tag:
                chairs = _extract_from_inline(str(p_tag))

    if not members:
        # Find the <h2>Artifact Evaluation Committee</h2> (not Co-Chairs)
        for h2 in soup.find_all("h2"):
            h2_text = h2.get_text(strip=True)
            if h2_text == "Artifact Evaluation Committee":
                # Skip comment blocks, find the actual <p> with members
                for sibling in h2.find_next_siblings():
                    if sibling.name in ("h2", "h3"):
                        break
                    if sibling.name == "p" and "<br" in str(sibling):
                        members = _extract_from_inline(str(sibling))
                        if members:
                            break
                break

    if not chairs and not members:
        # The page was fetched but holds no AEC data — cache the negative
        # result so we do not refetch within the TTL.
        logger.warning(f"  Warning: No organizer data found for {conference.upper()} {year}")
        _write_cache(CACHE_DIR, cache_key, None, namespace="usenix_organizers")
        return None

    result = {"chairs": chairs, "members": members}
    logger.info(f"  Found {len(chairs)} chairs and {len(members)} committee members")
    _write_cache(CACHE_DIR, cache_key, result, namespace="usenix_organizers")
    return result

to_pipeline_format(artifacts)

Convert scraped artifacts to the format used by the existing pipeline (matching sys_sec_artifacts_results_scrape.py output format).

Source code in src/scrapers/usenix_scrape.py
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
def to_pipeline_format(artifacts):
    """
    Convert scraped artifacts to the format used by the existing pipeline
    (matching sys_sec_artifacts_results_scrape.py output format).

    Papers without any artifact badge are dropped: only papers that went
    through artifact evaluation are forwarded.
    """
    converted = []
    for artifact in artifacts:
        badges = artifact["badges"]
        if not badges:
            continue
        record = {"title": artifact["title"], "badges": ",".join(badges)}
        pdf = artifact.get("paper_url")
        if pdf:
            record["paper_url"] = pdf
        converted.append(record)
    return converted