Skip to content

acm_scrape

src.scrapers.acm_scrape

Scrape ACM Digital Library conference proceedings for paper artifact badges.

ACM conferences (e.g., CCS, SOSP) display artifact evaluation badges on individual paper pages in the ACM DL. This module:

  1. Uses the DBLP API to discover all papers for a given proceedings volume.
  2. Attempts to scrape badge information directly from ACM DL paper pages.
  3. Gracefully degrades when ACM DL access is blocked (Cloudflare 403), returning the DBLP paper list without badge data.
Usage examples

Scrape CCS 2024 (attempts ACM DL, falls back to YAML)

python acm_scrape.py --conference ccs --years 2024

Scrape CCS 2023 and 2024

python acm_scrape.py --conference ccs --years 2023,2024

Output as YAML suitable for the pipeline

python acm_scrape.py --conference ccs --years 2024 --format yaml

scrape_acm_proceedings(conference, year, session=None, max_workers=4, delay=0.5)

Scrape an ACM DL proceedings volume for paper titles and artifact badges.

  1. Gets papers from DBLP (always works).
  2. For each paper, tries to scrape badge info from ACM DL.
  3. If ACM DL is blocked (403), the function stops attempting further papers and returns papers with empty badge lists (partial data).

Returns:

Type Description

(papers_list, acm_dl_accessible) where acm_dl_accessible is a bool

indicating whether ACM DL scraping succeeded.

Source code in src/scrapers/acm_scrape.py
(lines 212–277)
def scrape_acm_proceedings(conference, year, session=None, max_workers=4, delay=0.5):
    """
    Scrape an ACM DL proceedings volume for paper titles and artifact badges.

    1. Gets papers from DBLP (always works).
    2. For each paper, tries to scrape badge info from ACM DL.
    3. If ACM DL is blocked (403), the function stops attempting further
       papers and returns papers with empty badge lists (partial data).

    Args:
        conference: Key into ACM_CONFERENCES (e.g. "ccs").
        year: Proceedings year to fetch.
        session: Optional HTTP session; a retrying one is built when omitted.
        max_workers: Thread-pool size for concurrent ACM DL page fetches.
        delay: Seconds slept after each successful badge fetch (politeness
            throttle toward ACM DL).

    Returns:
        (papers_list, acm_dl_accessible)  where acm_dl_accessible is a bool
        indicating whether ACM DL scraping succeeded.
    """
    import threading  # local import: only needed for the counter lock below

    conf_meta = ACM_CONFERENCES.get(conference)
    if not conf_meta:
        logger.info(f"  Unknown ACM conference: {conference}")
        return [], False

    dblp_key = conf_meta["dblp_key"]
    papers = _dblp_papers(dblp_key, year, session)
    if not papers:
        return [], False

    sess = session or _session_with_retries()
    acm_dl_accessible = True  # optimistic
    blocked_count = 0
    # Workers run in a thread pool; guard the two shared counters so the
    # check/increment/reset sequences are not interleaved across threads.
    state_lock = threading.Lock()

    def _fetch_badges(paper):
        nonlocal acm_dl_accessible, blocked_count
        with state_lock:
            if not acm_dl_accessible:
                return paper, []
        badges = _scrape_acm_paper_badges(paper["doi"], sess)
        if badges is None:
            with state_lock:
                blocked_count += 1
                if blocked_count >= 3:
                    acm_dl_accessible = False
                    # BUG FIX: logging methods do not accept print()'s
                    # file= keyword; the original file=sys.stderr raised
                    # TypeError the first time this branch executed.
                    logger.warning(
                        "  ACM DL appears blocked (3 consecutive failures), skipping further scraping"
                    )
            return paper, []
        with state_lock:
            # A success resets the *consecutive* failure count, matching
            # the wording of the warning above (previously cumulative).
            blocked_count = 0
        time.sleep(delay)
        return paper, badges

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_fetch_badges, p): p for p in papers}
        for i, future in enumerate(as_completed(futures), 1):
            paper, badges = future.result()
            paper_out = {
                "title": paper["title"],
                "doi": paper["doi"],
                "authors": paper["authors"],
                "badges": badges,
            }
            results.append(paper_out)
            if i % 20 == 0 or i == len(papers):
                logger.info(f"  Processed {i}/{len(papers)} papers …")

    with_badges = sum(1 for r in results if r["badges"])
    # BUG FIX: removed the same invalid file=sys.stderr keyword here.
    logger.info(
        f"  {conference.upper()} {year}: {len(results)} papers, "
        f"{with_badges} with badges"
        f" (ACM DL {'accessible' if acm_dl_accessible else 'BLOCKED'})"
    )
    return results, acm_dl_accessible

scrape_conference_year(conference, year, session=None, max_workers=4, delay=0.5)

Get artifact data for an ACM conference/year via DBLP + ACM DL scraping.

Returns a list of dicts ready for to_pipeline_format().

Source code in src/scrapers/acm_scrape.py
(lines 285–292)
def scrape_conference_year(conference, year, session=None, max_workers=4, delay=0.5):
    """
    Get artifact data for an ACM conference/year via DBLP + ACM DL scraping.

    Thin wrapper around ``scrape_acm_proceedings()`` that keeps only the
    paper list; the ACM DL accessibility flag is intentionally discarded.

    Returns a list of dicts ready for ``to_pipeline_format()``.
    """
    # Unused ``acm_ok`` renamed to ``_`` to make the discard explicit.
    papers, _ = scrape_acm_proceedings(conference, year, session, max_workers, delay)
    return papers

to_pipeline_format(artifacts)

Convert scraped/merged artifacts to the format used by generate_statistics.py. Only includes papers that have at least one badge.

Source code in src/scrapers/acm_scrape.py
(lines 295–316)
def to_pipeline_format(artifacts):
    """
    Convert scraped/merged artifacts to the format used by generate_statistics.py.

    Papers without any badge are dropped; badge lists are flattened to a
    comma-separated string, and optional link fields are copied when present.
    """
    converted = []
    for record in artifacts:
        badge_list = record.get("badges", [])
        if not badge_list:
            continue  # pipeline only tracks badge-holding papers
        row = {
            "title": record.get("title", "Unknown"),
            "badges": ",".join(badge_list),
        }
        # Copy optional fields only when they are present and truthy.
        for field in ("doi", "repository_url", "artifact_url"):
            if record.get(field):
                row[field] = record[field]
        converted.append(row)
    return converted

get_acm_conferences()

Return the ACM_CONFERENCES dict for use by the pipeline.

Source code in src/scrapers/acm_scrape.py
(lines 319–321)
def get_acm_conferences():
    """Expose the module-level ACM_CONFERENCES mapping to the pipeline."""
    conferences = ACM_CONFERENCES
    return conferences