Skip to content

alternative_committee_scrape

src.scrapers.alternative_committee_scrape

Scrape AE committee data from alternative sources when sysartifacts/secartifacts GitHub repos don't have the information.

Supported sources:

- USENIX website (FAST, OSDI, ATC, USENIX Security, WOOT)
- CHES website (ches.iacr.org)
- PETS website (petsymposium.org)

scrape_usenix_committee(conference, year, session=None)

Scrape AE committee from a USENIX conference call-for-artifacts page.

Parameters

conference : str Conference name (e.g. 'fast', 'osdi', 'usenixsec', 'woot') year : int 4-digit year session : requests.Session, optional

Returns

list of {name, affiliation, role} dicts, or None if page not found

Source code in src/scrapers/alternative_committee_scrape.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
def scrape_usenix_committee(conference, year, session=None):
    """Scrape AE committee from a USENIX conference call-for-artifacts page.

    Parameters
    ----------
    conference : str
        Conference name (e.g. 'fast', 'osdi', 'usenixsec', 'woot')
    year : int
        4-digit year
    session : requests.Session, optional
        Reused HTTP session; a shared one is created if omitted.

    Returns
    -------
    list of {name, affiliation, role} dicts, or None if page not found
    """
    slug = USENIX_CONF_SLUGS.get(conference.lower())
    if slug is None:
        return None

    # USENIX URLs use a 2-digit year suffix, e.g. 2024 -> "24".
    yy = str(year)[2:]
    url = f"{BASE_USENIX}/conference/{slug}{yy}/call-for-artifacts"

    sess = session or _get_session()
    try:
        resp = sess.get(url, timeout=30)
        if resp.status_code == 404:
            # No call-for-artifacts page for this conference/year.
            return None
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.warning(f"  Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")

    # Parse co-chairs and regular committee members from the page.
    chairs = _parse_usenix_cochairs_html(soup)
    members = _parse_usenix_committee_html(soup)

    # Mark roles
    for m in chairs:
        m["role"] = "chair"
    for m in members:
        m["role"] = "member"

    # Combine chairs first so anyone listed in both keeps the chair role;
    # dedup by case-insensitive name while preserving insertion order.
    by_name = {}
    for m in chairs + members:
        by_name.setdefault(m["name"].lower(), m)
    deduped = list(by_name.values())

    if deduped:
        logger.info(f"  USENIX: Found {len(deduped)} members for {conference}{year}")
    return deduped if deduped else None

scrape_ches_committee(year, session=None)

Scrape AE committee from the CHES website.

Members are fetched from the JSON API (ches.iacr.org/{year}/json/artifact.json). If the JSON endpoint is unavailable (e.g. CHES 2022), members are parsed from the static HTML. Chairs are always scraped from the HTML page (ches.iacr.org/{year}/artifacts.php) because the JSON data does not include chair information.

Returns list of {name, affiliation, role} dicts, or None if not found.

Source code in src/scrapers/alternative_committee_scrape.py
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
def scrape_ches_committee(year, session=None):
    """Scrape AE committee from the CHES website.

    Members come from the JSON API
    (``ches.iacr.org/{year}/json/artifact.json``); when that endpoint is
    unavailable (e.g. CHES 2022) they are parsed from the static HTML
    instead.  Chairs are always taken from the HTML page
    (``ches.iacr.org/{year}/artifacts.php``) because the JSON feed carries
    no chair information.

    Returns list of {name, affiliation, role} dicts, or None if not found.
    """
    sess = session or _get_session()

    def squash(text):
        # Collapse internal whitespace runs and trim the ends.
        return re.sub(r"\s+", " ", text).strip()

    members = []

    # Step 1: committee members via the JSON API.
    json_url = f"https://ches.iacr.org/{year}/json/artifact.json"
    try:
        resp = sess.get(json_url, timeout=30)
        if resp.status_code == 200:
            for entry in resp.json().get("committee", []):
                name = squash(entry.get("name", ""))
                if len(name) > 1:
                    members.append({
                        "name": name,
                        "affiliation": squash(entry.get("affiliation", "")),
                        "role": "member",
                    })
    except (requests.RequestException, ValueError, KeyError):
        logger.warning("Failed to fetch/parse CHES committee JSON, skipping JSON source")

    # Step 2: chairs (and fallback members) from the HTML page.
    html_url = f"https://ches.iacr.org/{year}/artifacts.php"
    try:
        resp = sess.get(html_url, timeout=30)
    except requests.RequestException as e:
        logger.warning(f"  Failed to fetch {html_url}: {e}")
    else:
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")

            chairs = _scrape_ches_chairs_html(soup)

            # JSON gave nothing (e.g. CHES 2022): parse members from HTML.
            if not members:
                members = _scrape_ches_members_html(soup)

            # Chairs first, then members, dropping duplicate names
            # (case-insensitive) while keeping order.
            deduped, seen = [], set()
            for person in chairs + members:
                key = person["name"].lower()
                if key in seen:
                    continue
                seen.add(key)
                deduped.append(person)

            if deduped:
                chair_count = sum(1 for m in deduped if m["role"] == "chair")
                member_count = len(deduped) - chair_count
                logger.info(f"  CHES: Found {member_count} members + {chair_count} chair(s) for ches{year}")
            return deduped if deduped else None

    # HTML page unavailable: fall back to JSON-only members, if any.
    if members:
        logger.info(f"  CHES: Found {len(members)} members for ches{year} (JSON only)")
        return members

    return None

scrape_pets_committee(year, session=None)

Scrape artifact review committee from PETS/PoPETs website.

PETS publishes ARC on: petsymposium.org/cfp{YY}.php Format:

Artifact Review Committee:
Name, Affiliation
Name, Affiliation
...

Returns list of {name, affiliation, role} dicts, or None if not found.

Source code in src/scrapers/alternative_committee_scrape.py
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
def scrape_pets_committee(year, session=None):
    """Scrape artifact review committee from PETS/PoPETs website.

    PETS publishes ARC on: petsymposium.org/cfp{YY}.php
    Format: <dt><font><b>Artifact Review Committee:</b></font></dt>
            <dd>Name, <i>Affiliation</i></dd>
            <dd>Name, <i>Affiliation</i></dd>
            ...

    Parameters
    ----------
    year : int
        4-digit year
    session : requests.Session, optional
        Reused HTTP session; a shared one is created if omitted.

    Returns list of {name, affiliation, role} dicts, or None if not found.
    """
    yy = str(year)[2:]  # e.g. 2024 -> "24"
    url = f"https://petsymposium.org/cfp{yy}.php"
    sess = session or _get_session()

    try:
        resp = sess.get(url, timeout=30)
        if resp.status_code == 404:
            return None
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.warning(f"  Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    members = []

    # Find the <dt> element containing "Artifact Review Committee"
    arc_dt = None
    for dt in soup.find_all("dt"):
        txt = dt.get_text().lower()
        if "artifact" in txt and "committee" in txt:
            arc_dt = dt
            break

    if arc_dt is None:
        return None

    # Collect all <dd> siblings following the <dt> until the next <dt>.
    # NavigableString text nodes have .name == None, so getattr-based
    # checks skip them cleanly (the old hasattr check was always true).
    for sib in arc_dt.next_siblings:
        if getattr(sib, "name", None) == "dt":
            break  # reached the next definition term
        if getattr(sib, "name", None) != "dd":
            continue
        text = sib.get_text().strip()
        if len(text) < 3:
            continue
        # Parse "Name, Affiliation"; affiliation may be absent.
        raw_name, _, raw_affil = text.partition(",")
        name = re.sub(r"\s+", " ", raw_name).strip().strip("*_").strip()
        affiliation = re.sub(r"\s+", " ", raw_affil).strip().strip("*_").strip()
        if len(name) > 2:
            members.append({"name": name, "affiliation": affiliation, "role": "member"})

    if members:
        logger.info(f"  PETS: Found {len(members)} members for pets{year}")
    return members if members else None

get_alternative_committees(conferences_needed)

Fetch committees from alternative sources for conferences not in sysartifacts/secartifacts.

Parameters

conferences_needed : dict {conf_year_str: 'systems'|'security'} — conferences that need data. e.g. {'fast2024': 'systems', 'usenixsec2022': 'security'}

Returns

dict of {conf_year_str: [{name, affiliation}, ...]}

Source code in src/scrapers/alternative_committee_scrape.py
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
def get_alternative_committees(conferences_needed):
    """Fetch committees from alternative sources for conferences not in
    sysartifacts/secartifacts.

    Parameters
    ----------
    conferences_needed : dict
        {conf_year_str: 'systems'|'security'} — conferences that need data.
        e.g. {'fast2024': 'systems', 'usenixsec2022': 'security'}

    Returns
    -------
    dict of {conf_year_str: [{name, affiliation}, ...]}
    """
    sess = _get_session()
    found = {}

    for conf_year in conferences_needed:
        # Expect keys shaped like "<letters><4-digit year>", e.g. "fast2024".
        match = re.match(r"^([a-zA-Z]+)(\d{4})$", conf_year)
        if match is None:
            continue
        name = match.group(1).lower()
        yr = int(match.group(2))

        # Dispatch to the source that covers this venue.
        if name in USENIX_CONF_SLUGS:
            committee = scrape_usenix_committee(name, yr, session=sess)
        elif name == "ches":
            committee = scrape_ches_committee(yr, session=sess)
        elif name == "pets":
            committee = scrape_pets_committee(yr, session=sess)
        else:
            committee = None

        if committee:
            found[conf_year] = committee

    return found

get_all_usenix_committees(conf_regex=None)

Scrape all available USENIX conference committees.

Parameters

conf_regex : str, optional Regex to filter conference/year strings (e.g. '.20[2][0-5]')

Returns

dict of {conf_year_str: [{name, affiliation}, ...]}

Source code in src/scrapers/alternative_committee_scrape.py
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
def get_all_usenix_committees(conf_regex=None):
    """Scrape all available USENIX conference committees.

    Parameters
    ----------
    conf_regex : str, optional
        Regex to filter conference/year strings (e.g. '.20[2][0-5]')

    Returns
    -------
    dict of {conf_year_str: [{name, affiliation}, ...]}
    """
    sess = _get_session()
    out = {}

    # With no filter, accept every conference/year key.
    if conf_regex:
        wanted = lambda key: re.search(conf_regex, key)
    else:
        wanted = lambda key: True

    for conf, years in USENIX_KNOWN_YEARS.items():
        for yr in years:
            conf_year = f"{conf}{yr}"
            if not wanted(conf_year):
                continue
            committee = scrape_usenix_committee(conf, yr, session=sess)
            if committee:
                out[conf_year] = committee

    return out