generate_participation_stats¶

`src.generators.generate_participation_stats` ¶

Generate AE participation statistics using the local DBLP XML file.

For each conference/year in the pipeline's artifacts data, this script: 1. Parses the DBLP XML to count ALL papers published at that venue/year. 2. Computes AE participation rate (ae_papers / total_papers). 3. Computes badge rates as % of all papers (not just AE papers).

Requires: data/dblp/dblp.xml.gz (downloaded by scripts/download_dblp.sh)

Writes

_data/participation_stats.yml — per-conference/year participation data assets/data/participation_stats.json — same data in JSON

Usage

python -m src.generators.generate_participation_stats --dblp_file data/dblp/dblp.xml.gz --output_dir ../reprodb.github.io

`generate_participation_stats(dblp_file, output_dir)` ¶

Generate AE participation and badge-rate statistics.

Source code in src/generators/generate_participation_stats.py

def generate_participation_stats(dblp_file, output_dir):
    """Generate AE participation and badge-rate statistics."""

    if not os.path.exists(dblp_file):
        logger.error(f"Error: DBLP file not found: {dblp_file}")
        logger.info("  Run scripts/download_dblp.sh first.")
        return None

    # Load artifacts_by_conference for per-conference badge counts
    abc_path = os.path.join(output_dir, "_data/artifacts_by_conference.yml")
    if not os.path.exists(abc_path):
        logger.error(f"Error: {abc_path} not found — run generate_statistics first")
        return None

    with open(abc_path) as f:
        by_conference = yaml.safe_load(f)

    # Build per-conference/year AE counts and badge counts from pipeline data
    ae_data = {}  # (conf, year) -> {ae_papers, available, functional, ...}
    for conf in by_conference:
        name = conf["name"]
        for yd in conf["years"]:
            key = (name, yd["year"])
            ae_data[key] = {
                "ae_papers": yd["total"],
                "available": yd.get("available", 0),
                "functional": yd.get("functional", 0),
                "reproduced": yd.get("reproducible", 0),
                "category": conf["category"],
                "venue_type": conf.get("venue_type", "conference"),
            }

    # Parse DBLP XML for total paper counts
    dblp_counts = _count_papers_from_dblp(dblp_file, set(ae_data.keys()))

    # Build stats entries
    stats = []
    for (conf, year), info in sorted(ae_data.items()):
        total_papers = dblp_counts.get((conf, year))
        if not total_papers:
            logger.warning(f"  ⚠ {conf} {year}: no DBLP data — skipping")
            continue

        ae_papers = info["ae_papers"]
        if ae_papers > total_papers:
            logger.info(
                f"  ⚠ {conf} {year}: DBLP data incomplete ({total_papers} papers in DBLP, {ae_papers} AE papers) — skipping"
            )
            continue
        participation_rate = round(ae_papers / total_papers * 100, 1)
        area = conf_area(conf)

        entry = {
            "conference": conf,
            "year": year,
            "category": area,
            "venue_type": info["venue_type"],
            "total_papers": total_papers,
            "ae_papers": ae_papers,
            "participation_pct": participation_rate,
            "available": info["available"],
            "functional": info["functional"],
            "reproduced": info["reproduced"],
            "available_pct": round(info["available"] / total_papers * 100, 1),
            "functional_pct": round(info["functional"] / total_papers * 100, 1),
            "reproduced_pct": round(info["reproduced"] / total_papers * 100, 1),
        }
        stats.append(entry)
        logger.info(f"  {conf} {year}: {ae_papers}/{total_papers} = {participation_rate}% AE participation")

    if not stats:
        logger.info("No participation data generated")
        return None

    # Sort by conference then year
    stats.sort(key=lambda x: (x["conference"], x["year"]))

    # Compute area-level summaries
    area_year = defaultdict(lambda: defaultdict(lambda: {"ae": 0, "total": 0, "avail": 0, "func": 0, "repro": 0}))
    for s in stats:
        ay = area_year[s["category"]][s["year"]]
        ay["ae"] += s["ae_papers"]
        ay["total"] += s["total_papers"]
        ay["avail"] += s["available"]
        ay["func"] += s["functional"]
        ay["repro"] += s["reproduced"]

    area_summaries = {}
    for area in sorted(area_year):
        years = sorted(area_year[area])
        area_summaries[area] = {
            "years": years,
            "participation_pct": [
                round(area_year[area][y]["ae"] / area_year[area][y]["total"] * 100, 1) for y in years
            ],
            "available_pct": [round(area_year[area][y]["avail"] / area_year[area][y]["total"] * 100, 1) for y in years],
            "functional_pct": [round(area_year[area][y]["func"] / area_year[area][y]["total"] * 100, 1) for y in years],
            "reproduced_pct": [
                round(area_year[area][y]["repro"] / area_year[area][y]["total"] * 100, 1) for y in years
            ],
        }

    output = {
        "by_conference_year": stats,
        "by_area": area_summaries,
    }

    # Write outputs
    os.makedirs(os.path.join(output_dir, "_data"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "assets/data"), exist_ok=True)

    yml_path = os.path.join(output_dir, "_data/participation_stats.yml")
    with open(yml_path, "w") as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False)

    json_path = os.path.join(output_dir, "assets/data/participation_stats.json")
    with open(json_path, "w") as f:
        json.dump(output, f, indent=2)

    logger.info(f"\n✅ Participation stats: {len(stats)} conference/year entries")
    logger.info(f"   → {yml_path}")
    logger.info(f"   → {json_path}")

    return output

generate_participation_stats¶

src.generators.generate_participation_stats ¶

generate_participation_stats(dblp_file, output_dir) ¶

`src.generators.generate_participation_stats` ¶

`generate_participation_stats(dblp_file, output_dir)` ¶