Skip to content

generate_committee_stats

src.generators.generate_committee_stats

Generate committee statistics for the ReproDB website.

Scrapes AE committee member data from sysartifacts and secartifacts, classifies members by country, continent, and institution, and outputs structured YAML/JSON for Jekyll rendering + chart generation.

classify_member(affiliation, prefix_tree, name_index)

Classify a single member's affiliation to a country.

Returns (country, institution_name) or (None, None) on failure.

Source code in src/generators/generate_committee_stats.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def classify_member(affiliation, prefix_tree, name_index):
    """Classify a single member's affiliation to a country.

    Returns (country, institution_name) or (None, None) on failure.
    """
    normalized = affiliation.lower().strip()
    if not normalized:
        return None, None

    # Prefix-tree lookup is cheap and precise, so try it first.
    hits = prefix_tree.values(prefix=normalized)
    if hits:
        record = hits[0]
        return record["country"], record.get("name", affiliation)

    # No prefix hit: scan the whole index for the closest fuzzy match.
    top_score = 0
    top_record = None
    for candidate, record in name_index.items():
        score = fuzz.ratio(candidate, normalized)
        if score > top_score:
            top_score = score
            top_record = record

    # Only accept confident matches (similarity above 80).
    if top_score > 80 and top_record:
        return top_record["country"], top_record.get("name", affiliation)

    return None, None

classify_committees(all_results)

Classify all committee members by country, continent, and institution.

Parameters

all_results : dict {conf_year: [{name, affiliation}, ...]}

Returns

dict with keys: by_country, by_continent, by_institution, failed

Source code in src/generators/generate_committee_stats.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def classify_committees(all_results):
    """Classify all committee members by country, continent, and institution.

    Parameters
    ----------
    all_results : dict
        {conf_year: [{name, affiliation}, ...]}

    Returns
    -------
    dict
        Keys: ``by_country``, ``by_continent``, ``by_institution`` — each a
        {conf_year: {label: count}} mapping — and ``failed``, a list of
        {conference, name, affiliation} dicts for members whose affiliation
        could not be resolved to a country.
    """
    # Build the university lookup structures once, then reuse for every member.
    name_index = _build_university_index()
    prefix_tree = Trie(**name_index)

    # Per-conference-year breakdown
    by_conf_country = {}  # conf_year → {country: count}
    by_conf_continent = {}  # conf_year → {continent: count}
    by_conf_institution = {}  # conf_year → {institution: count}
    failed = []

    for conf_year, members in all_results.items():
        by_conf_country[conf_year] = defaultdict(int)
        by_conf_continent[conf_year] = defaultdict(int)
        by_conf_institution[conf_year] = defaultdict(int)

        for member in members:
            affiliation = _clean_affiliation(member["affiliation"])
            country, inst_name = classify_member(affiliation, prefix_tree, name_index)
            if country:
                by_conf_country[conf_year][country] += 1
                # Countries missing from the mapping are bucketed as "Unknown".
                continent = COUNTRY_TO_CONTINENT.get(country, "Unknown")
                by_conf_continent[conf_year][continent] += 1
                # Fall back to the raw affiliation when no canonical
                # institution name was matched.
                by_conf_institution[conf_year][inst_name or member["affiliation"]] += 1
            else:
                failed.append(
                    {
                        "conference": conf_year,
                        "name": member["name"],
                        "affiliation": affiliation,
                    }
                )

    return {
        "by_country": by_conf_country,
        "by_continent": by_conf_continent,
        "by_institution": by_conf_institution,
        "failed": failed,
    }

generate_committee_data(conf_regex, output_dir)

Main entry point: scrape committees, classify, and write output files.

Source code in src/generators/generate_committee_stats.py
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
def generate_committee_data(conf_regex, output_dir):
    """Main entry point: scrape committees, classify, and write output files.

    Parameters
    ----------
    conf_regex : str
        Regex matched against conference-year keys (e.g. ``"osdi2023"``) to
        select which conferences to include.
    output_dir : str or None
        Jekyll site root to write ``_data``/``assets`` files into. When
        falsy, nothing is written and no charts are generated.

    Returns
    -------
    dict or None
        The detailed statistics JSON structure, or None when no committee
        data was found.
    """

    # ── 1. Scrape committees from both prefixes ──────────────────────────────
    logger.info("  Scraping systems committee data from sysartifacts...")
    sys_results = get_committees(conf_regex, "sys")
    logger.info(f"    Found {len(sys_results)} systems conference-years")

    logger.info("  Scraping security committee data from secartifacts...")
    sec_results = get_committees(conf_regex, "sec")
    logger.info(f"    Found {len(sec_results)} security conference-years")

    all_results = {}
    all_results.update(sys_results)
    all_results.update(sec_results)

    # Clean all results (remove placeholders, fix markdown links)
    for cy in list(all_results.keys()):
        all_results[cy] = _clean_committee(all_results[cy])

    # ── 1b. Supplement with alternative sources ──────────────────────────────
    #  Identify conferences that are missing or have low-quality data
    logger.info("  Checking for conferences needing alternative sources...")
    conferences_needed = {}

    # Determine which conference-years we expect based on USENIX/CHES/PETS
    for conf, _slug in USENIX_CONF_SLUGS.items():
        for year in USENIX_KNOWN_YEARS.get(conf, []):
            cy = f"{conf}{year}"
            if re.search(conf_regex, cy):
                area = "systems" if conf in SYSTEMS_CONFS else "security"
                if cy not in all_results or not _is_valid_committee(all_results.get(cy)):
                    conferences_needed[cy] = area

    for year in CHES_KNOWN_YEARS:
        cy = f"ches{year}"
        if re.search(conf_regex, cy) and (cy not in all_results or not _is_valid_committee(all_results.get(cy))):
            conferences_needed[cy] = "security"

    for year in PETS_KNOWN_YEARS:
        cy = f"pets{year}"
        if re.search(conf_regex, cy) and (cy not in all_results or not _is_valid_committee(all_results.get(cy))):
            conferences_needed[cy] = "security"

    if conferences_needed:
        logger.info(f"    Need alternative sources for {len(conferences_needed)} conference-years:")
        for cy in sorted(conferences_needed.keys()):
            existing = len(all_results.get(cy, []))
            logger.info(f"      {cy} (currently {existing} members)")

        alt_results = get_alternative_committees(conferences_needed)
        for cy, members in alt_results.items():
            cleaned = _clean_committee(members)
            if cleaned:
                existing_count = len(all_results.get(cy, []))
                all_results[cy] = cleaned
                # BUGFIX: the old message ran the two counts together
                # ("replaced 1245 members"); separate them with an arrow.
                logger.info(f"    ✓ {cy}: replaced {existing_count} → {len(cleaned)} members (alternative source)")
    else:
        logger.info("    All conference-years have valid committee data.")

    if not all_results:
        logger.warning("  No committee data found — skipping committee stats.")
        return None

    # ── 2. Classify by country / continent / institution ─────────────────────
    logger.info("  Classifying committee members...")
    classified = classify_committees(all_results)

    if classified["failed"]:
        logger.error(f"  ⚠️  Could not classify {len(classified['failed'])} members")

    # Build area map
    conf_to_area = {cy: _conf_area(cy) for cy in all_results}

    # ── 3. Aggregate statistics ──────────────────────────────────────────────
    country_all, country_sys, country_sec = _aggregate_across_conferences(classified["by_country"], conf_to_area)
    continent_all, continent_sys, continent_sec = _aggregate_across_conferences(
        classified["by_continent"], conf_to_area
    )
    inst_all, inst_sys, inst_sec = _aggregate_across_conferences(classified["by_institution"], conf_to_area)

    # Yearly time-series
    country_years_all, country_years_sys, country_years_sec = _build_yearly_series(
        classified["by_country"], conf_to_area
    )
    continent_years_all, continent_years_sys, continent_years_sec = _build_yearly_series(
        classified["by_continent"], conf_to_area
    )

    # Committee sizes per conference-year
    committee_sizes = []
    for conf_year in sorted(all_results.keys()):
        conf_name, year = _extract_conf_year(conf_year)
        area = conf_to_area.get(conf_year, "unknown")
        committee_sizes.append(
            {
                "conference": conf_name,
                "year": year,
                "conf_year": conf_year,
                "area": area,
                "size": len(all_results[conf_year]),
            }
        )

    # Total members
    total_members = sum(len(m) for m in all_results.values())
    total_systems = sum(len(m) for cy, m in all_results.items() if conf_to_area.get(cy) == "systems")
    total_security = sum(len(m) for cy, m in all_results.items() if conf_to_area.get(cy) == "security")

    # ── 3b. Compute recurring AE member statistics ───────────────────────────
    logger.info("  Computing recurring AE member rankings...")
    all_members, sys_members, sec_members, recurring_summary = _compute_recurring_members(
        all_results, conf_to_area, classified
    )
    logger.info(
        f"    Found {recurring_summary['total_recurring']} recurring members "
        f"({recurring_summary['total_chairs']} include chair roles)"
    )

    # ── 3c. Compute institution timeline ─────────────────────────────────────
    logger.info("  Computing institution timeline...")
    inst_timeline = _compute_institution_timeline(classified, conf_to_area)
    logger.info(f"    Tracked {len(inst_timeline['unique_by_year'])} years of institution data")

    # ── 4. Build output structures ───────────────────────────────────────────

    # Summary for _data/committee_stats.yml
    committee_summary = {
        "last_updated": datetime.now().strftime("%Y-%m-%d"),
        "total_members": total_members,
        "total_systems": total_systems,
        "total_security": total_security,
        "total_conferences": len(all_results),
        "total_countries": len(country_all),
        "total_continents": len(continent_all),
        "total_institutions": len(inst_all),
        "recurring_members": recurring_summary["total_recurring"],
        "recurring_members_systems": recurring_summary["total_recurring_systems"],
        "recurring_members_security": recurring_summary["total_recurring_security"],
        "recurring_chairs": recurring_summary["total_chairs"],
        "top_countries": [{"name": k, "count": v} for k, v in _top_n(country_all, 15)],
        "top_countries_systems": [{"name": k, "count": v} for k, v in _top_n(country_sys, 15)],
        "top_countries_security": [{"name": k, "count": v} for k, v in _top_n(country_sec, 15)],
        "top_continents": [{"name": k, "count": v} for k, v in _top_n(continent_all, 10)],
        "top_continents_systems": [{"name": k, "count": v} for k, v in _top_n(continent_sys, 10)],
        "top_continents_security": [{"name": k, "count": v} for k, v in _top_n(continent_sec, 10)],
        "top_institutions": [{"name": k, "count": v} for k, v in _top_n(inst_all, 20)],
        "top_institutions_systems": [{"name": k, "count": v} for k, v in _top_n(inst_sys, 20)],
        "top_institutions_security": [{"name": k, "count": v} for k, v in _top_n(inst_sec, 20)],
        "institution_timeline": inst_timeline["unique_by_year"],
        "committee_sizes": committee_sizes,
    }

    # Detailed JSON for charting / download
    detail_json = {
        "summary": {
            "total_members": total_members,
            "total_systems": total_systems,
            "total_security": total_security,
            "total_countries": len(country_all),
            "total_continents": len(continent_all),
            "total_institutions": len(inst_all),
        },
        "by_country": {
            "overall": [{"name": k, "count": v} for k, v in sorted(country_all.items(), key=lambda x: -x[1])],
            "systems": [{"name": k, "count": v} for k, v in sorted(country_sys.items(), key=lambda x: -x[1])],
            "security": [{"name": k, "count": v} for k, v in sorted(country_sec.items(), key=lambda x: -x[1])],
        },
        "by_continent": {
            "overall": [{"name": k, "count": v} for k, v in sorted(continent_all.items(), key=lambda x: -x[1])],
            "systems": [{"name": k, "count": v} for k, v in sorted(continent_sys.items(), key=lambda x: -x[1])],
            "security": [{"name": k, "count": v} for k, v in sorted(continent_sec.items(), key=lambda x: -x[1])],
        },
        "by_institution": {
            "overall": [{"name": k, "count": v} for k, v in sorted(inst_all.items(), key=lambda x: -x[1])],
            "systems": [{"name": k, "count": v} for k, v in sorted(inst_sys.items(), key=lambda x: -x[1])],
            "security": [{"name": k, "count": v} for k, v in sorted(inst_sec.items(), key=lambda x: -x[1])],
        },
        "by_year": {
            "country": {str(y): dict(c) for y, c in sorted(country_years_all.items())},
            "country_systems": {str(y): dict(c) for y, c in sorted(country_years_sys.items())},
            "country_security": {str(y): dict(c) for y, c in sorted(country_years_sec.items())},
            "continent": {str(y): dict(c) for y, c in sorted(continent_years_all.items())},
            "continent_systems": {str(y): dict(c) for y, c in sorted(continent_years_sys.items())},
            "continent_security": {str(y): dict(c) for y, c in sorted(continent_years_sec.items())},
        },
        "committee_sizes": committee_sizes,
        "failed_classifications": classified["failed"],
    }

    # ── 5. Write output files ────────────────────────────────────────────────
    if output_dir:
        os.makedirs(os.path.join(output_dir, "_data"), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "assets/data"), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "assets/charts"), exist_ok=True)

        yml_path = os.path.join(output_dir, "_data/committee_stats.yml")
        with open(yml_path, "w") as f:
            yaml.dump(committee_summary, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        logger.info(f"  Wrote {yml_path}")

        json_path = os.path.join(output_dir, "assets/data/committee_stats.json")
        with open(json_path, "w") as f:
            json.dump(detail_json, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {json_path}")

        # Write recurring AE member JSON files
        ae_all_path = os.path.join(output_dir, "assets/data/ae_members.json")
        with open(ae_all_path, "w") as f:
            json.dump(all_members, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {ae_all_path} ({len(all_members)} members)")

        ae_sys_path = os.path.join(output_dir, "assets/data/systems_ae_members.json")
        with open(ae_sys_path, "w") as f:
            json.dump(sys_members, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {ae_sys_path} ({len(sys_members)} members)")

        ae_sec_path = os.path.join(output_dir, "assets/data/security_ae_members.json")
        with open(ae_sec_path, "w") as f:
            json.dump(sec_members, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {ae_sec_path} ({len(sec_members)} members)")

        # Write institution timeline JSON
        inst_timeline_path = os.path.join(output_dir, "assets/data/institution_timeline.json")
        with open(inst_timeline_path, "w") as f:
            json.dump(inst_timeline, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {inst_timeline_path}")

    # ── 6. Generate charts ───────────────────────────────────────────────────
    if output_dir:
        _generate_committee_charts(committee_summary, detail_json, output_dir, inst_timeline=inst_timeline)

    logger.info(
        f"  Committee stats: {total_members} members from "
        f"{len(country_all)} countries, {len(continent_all)} continents, "
        f"{len(inst_all)} institutions"
    )
    logger.info(
        f"  Recurring members: {recurring_summary['total_recurring']} "
        f"(sys: {recurring_summary['total_recurring_systems']}, "
        f"sec: {recurring_summary['total_recurring_security']}, "
        f"chairs: {recurring_summary['total_chairs']})"
    )

    return detail_json