Skip to content

generate_committee_stats

src.generators.generate_committee_stats

Generate committee statistics for the ReproDB website.

Scrapes AE committee member data from sysartifacts and secartifacts, classifies members by country, continent, and institution, and outputs structured YAML/JSON for Jekyll rendering + chart generation.

classify_member(affiliation, prefix_tree, name_index)

Classify a single member's affiliation to a country.

Returns (country, institution_name) or (None, None) on failure.

Source code in src/generators/generate_committee_stats.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def classify_member(affiliation, prefix_tree, name_index):
    """Classify a single member's affiliation to a country.

    Returns (country, institution_name) or (None, None) on failure.
    """
    normalized = affiliation.lower().strip()
    if not normalized:
        return None, None

    # Prefix-tree lookup is cheap and precise, so try it first.
    hits = prefix_tree.values(prefix=normalized)
    if hits:
        record = hits[0]
        return record["country"], record.get("name", affiliation)

    # No prefix hit: scan the whole index for the closest fuzzy match.
    top_score = 0
    top_record = None
    for candidate, record in name_index.items():
        score = fuzz.ratio(candidate, normalized)
        if score > top_score:
            top_score = score
            top_record = record

    # Only accept confident matches (similarity above 80).
    if top_score > 80 and top_record:
        return top_record["country"], top_record.get("name", affiliation)

    return None, None

classify_committees(all_results)

Classify all committee members by country, continent, and institution.

Parameters

all_results : dict {conf_year: [{name, affiliation}, ...]}

Returns

dict with keys: by_country, by_continent, by_institution, failed

Source code in src/generators/generate_committee_stats.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
def classify_committees(all_results):
    """Classify all committee members by country, continent, and institution.

    Parameters
    ----------
    all_results : dict
        {conf_year: [{name, affiliation}, ...]}

    Returns
    -------
    dict
        Keys: ``by_country``, ``by_continent``, ``by_institution`` — each a
        {conf_year: {label: count}} mapping — and ``failed``, a list of
        {conference, name, affiliation} dicts for members whose affiliation
        could not be resolved to a country.
    """
    # Build the university lookup structures once, then reuse for every member.
    name_index = _build_university_index()
    prefix_tree = Trie(**name_index)

    # Per-conference-year breakdown
    by_conf_country = {}  # conf_year → {country: count}
    by_conf_continent = {}  # conf_year → {continent: count}
    by_conf_institution = {}  # conf_year → {institution: count}
    failed = []

    for conf_year, members in all_results.items():
        by_conf_country[conf_year] = defaultdict(int)
        by_conf_continent[conf_year] = defaultdict(int)
        by_conf_institution[conf_year] = defaultdict(int)

        for member in members:
            affiliation = _clean_affiliation(member["affiliation"])
            country, inst_name = classify_member(affiliation, prefix_tree, name_index)
            if country:
                by_conf_country[conf_year][country] += 1
                # Countries missing from the mapping are bucketed as "Unknown".
                continent = COUNTRY_TO_CONTINENT.get(country, "Unknown")
                by_conf_continent[conf_year][continent] += 1
                # Fall back to the raw affiliation when no canonical
                # institution name was matched.
                by_conf_institution[conf_year][inst_name or member["affiliation"]] += 1
            else:
                failed.append(
                    {
                        "conference": conf_year,
                        "name": member["name"],
                        "affiliation": affiliation,
                    }
                )

    return {
        "by_country": by_conf_country,
        "by_continent": by_conf_continent,
        "by_institution": by_conf_institution,
        "failed": failed,
    }

generate_committee_data(conf_regex, output_dir)

Main entry point: scrape committees, classify, and write output files.

Source code in src/generators/generate_committee_stats.py
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
def generate_committee_data(conf_regex, output_dir):
    """Main entry point: scrape committees, classify, and write output files.

    Parameters
    ----------
    conf_regex : str
        Regex matched against conference-year keys (e.g. ``"osdi2023"``) to
        select which conferences to include.
    output_dir : str or None
        Jekyll site root to write ``_data``/``assets`` files into. When
        falsy, nothing is written and no charts are generated.

    Returns
    -------
    dict or None
        The detailed statistics JSON structure, or None when no committee
        data was found.
    """

    # ── 1. Scrape committees from both prefixes ──────────────────────────────
    logger.info("  Scraping systems committee data from sysartifacts...")
    sys_results = get_committees(conf_regex, "sys")
    logger.info(f"    Found {len(sys_results)} systems conference-years")

    logger.info("  Scraping security committee data from secartifacts...")
    sec_results = get_committees(conf_regex, "sec")
    logger.info(f"    Found {len(sec_results)} security conference-years")

    all_results = {}
    all_results.update(sys_results)
    all_results.update(sec_results)

    # Clean all results (remove placeholders, fix markdown links)
    for cy in list(all_results.keys()):
        all_results[cy] = _clean_committee(all_results[cy])

    # ── 1b. Supplement with alternative sources ──────────────────────────────
    #  Identify conferences that are missing or have low-quality data
    logger.info("  Checking for conferences needing alternative sources...")
    conferences_needed = {}

    # Determine which conference-years we expect based on USENIX/CHES/PETS
    for conf, _slug in USENIX_CONF_SLUGS.items():
        for year in USENIX_KNOWN_YEARS.get(conf, []):
            cy = f"{conf}{year}"
            if re.search(conf_regex, cy):
                area = "systems" if conf in SYSTEMS_CONFS else "security"
                if cy not in all_results or not _is_valid_committee(all_results.get(cy)):
                    conferences_needed[cy] = area

    for year in CHES_KNOWN_YEARS:
        cy = f"ches{year}"
        if re.search(conf_regex, cy) and (cy not in all_results or not _is_valid_committee(all_results.get(cy))):
            conferences_needed[cy] = "security"

    for year in PETS_KNOWN_YEARS:
        cy = f"pets{year}"
        if re.search(conf_regex, cy) and (cy not in all_results or not _is_valid_committee(all_results.get(cy))):
            conferences_needed[cy] = "security"

    if conferences_needed:
        logger.info(f"    Need alternative sources for {len(conferences_needed)} conference-years:")
        for cy in sorted(conferences_needed.keys()):
            existing = len(all_results.get(cy, []))
            logger.info(f"      {cy} (currently {existing} members)")

        alt_results = get_alternative_committees(conferences_needed)
        for cy, members in alt_results.items():
            cleaned = _clean_committee(members)
            if cleaned:
                existing_count = len(all_results.get(cy, []))
                all_results[cy] = cleaned
                # BUGFIX: the old message ran the two counts together
                # ("replaced 1245 members"); separate them with an arrow.
                logger.info(f"    ✓ {cy}: replaced {existing_count} → {len(cleaned)} members (alternative source)")
    else:
        logger.info("    All conference-years have valid committee data.")

    if not all_results:
        logger.warning("  No committee data found — skipping committee stats.")
        return None

    # ── 2. Classify by country / continent / institution ─────────────────────
    logger.info("  Classifying committee members...")
    classified = classify_committees(all_results)

    if classified["failed"]:
        logger.error(f"  ⚠️  Could not classify {len(classified['failed'])} members")

    # Build area map
    conf_to_area = {cy: _conf_area(cy) for cy in all_results}

    # ── 3. Aggregate statistics ──────────────────────────────────────────────
    country_all, country_sys, country_sec = _aggregate_across_conferences(classified["by_country"], conf_to_area)
    continent_all, continent_sys, continent_sec = _aggregate_across_conferences(
        classified["by_continent"], conf_to_area
    )
    inst_all, inst_sys, inst_sec = _aggregate_across_conferences(classified["by_institution"], conf_to_area)

    # Yearly time-series
    country_years_all, country_years_sys, country_years_sec = _build_yearly_series(
        classified["by_country"], conf_to_area
    )
    continent_years_all, continent_years_sys, continent_years_sec = _build_yearly_series(
        classified["by_continent"], conf_to_area
    )

    # Committee sizes per conference-year
    committee_sizes = []
    for conf_year in sorted(all_results.keys()):
        conf_name, year = _extract_conf_year(conf_year)
        area = conf_to_area.get(conf_year, "unknown")
        committee_sizes.append(
            {
                "conference": conf_name,
                "year": year,
                "conf_year": conf_year,
                "area": area,
                "size": len(all_results[conf_year]),
            }
        )

    # Total members
    total_members = sum(len(m) for m in all_results.values())
    total_systems = sum(len(m) for cy, m in all_results.items() if conf_to_area.get(cy) == "systems")
    total_security = sum(len(m) for cy, m in all_results.items() if conf_to_area.get(cy) == "security")

    # ── 3b. Compute recurring AE member statistics ───────────────────────────
    logger.info("  Computing recurring AE member rankings...")
    all_members, sys_members, sec_members, recurring_summary = _compute_recurring_members(
        all_results, conf_to_area, classified
    )
    logger.info(
        f"    Found {recurring_summary['total_recurring']} recurring members "
        f"({recurring_summary['total_chairs']} include chair roles)"
    )

    # ── 3c. Compute institution timeline ─────────────────────────────────────
    logger.info("  Computing institution timeline...")
    inst_timeline = _compute_institution_timeline(classified, conf_to_area)
    logger.info(f"    Tracked {len(inst_timeline['unique_by_year'])} years of institution data")

    # ── 4. Build output structures ───────────────────────────────────────────

    # Summary for _data/committee_stats.yml
    committee_summary = {
        "last_updated": datetime.now().strftime("%Y-%m-%d"),
        "total_members": total_members,
        "total_systems": total_systems,
        "total_security": total_security,
        "total_conferences": len(all_results),
        "total_countries": len(country_all),
        "total_continents": len(continent_all),
        "total_institutions": len(inst_all),
        "recurring_members": recurring_summary["total_recurring"],
        "recurring_members_systems": recurring_summary["total_recurring_systems"],
        "recurring_members_security": recurring_summary["total_recurring_security"],
        "recurring_chairs": recurring_summary["total_chairs"],
        "top_countries": [{"name": k, "count": v} for k, v in _top_n(country_all, 15)],
        "top_countries_systems": [{"name": k, "count": v} for k, v in _top_n(country_sys, 15)],
        "top_countries_security": [{"name": k, "count": v} for k, v in _top_n(country_sec, 15)],
        "top_continents": [{"name": k, "count": v} for k, v in _top_n(continent_all, 10)],
        "top_continents_systems": [{"name": k, "count": v} for k, v in _top_n(continent_sys, 10)],
        "top_continents_security": [{"name": k, "count": v} for k, v in _top_n(continent_sec, 10)],
        "top_institutions": [{"name": k, "count": v} for k, v in _top_n(inst_all, 20)],
        "top_institutions_systems": [{"name": k, "count": v} for k, v in _top_n(inst_sys, 20)],
        "top_institutions_security": [{"name": k, "count": v} for k, v in _top_n(inst_sec, 20)],
        "institution_timeline": inst_timeline["unique_by_year"],
        "committee_sizes": committee_sizes,
    }

    # Detailed JSON for charting / download
    detail_json = {
        "summary": {
            "total_members": total_members,
            "total_systems": total_systems,
            "total_security": total_security,
            "total_countries": len(country_all),
            "total_continents": len(continent_all),
            "total_institutions": len(inst_all),
        },
        "by_country": {
            "overall": [{"name": k, "count": v} for k, v in sorted(country_all.items(), key=lambda x: -x[1])],
            "systems": [{"name": k, "count": v} for k, v in sorted(country_sys.items(), key=lambda x: -x[1])],
            "security": [{"name": k, "count": v} for k, v in sorted(country_sec.items(), key=lambda x: -x[1])],
        },
        "by_continent": {
            "overall": [{"name": k, "count": v} for k, v in sorted(continent_all.items(), key=lambda x: -x[1])],
            "systems": [{"name": k, "count": v} for k, v in sorted(continent_sys.items(), key=lambda x: -x[1])],
            "security": [{"name": k, "count": v} for k, v in sorted(continent_sec.items(), key=lambda x: -x[1])],
        },
        "by_institution": {
            "overall": [{"name": k, "count": v} for k, v in sorted(inst_all.items(), key=lambda x: -x[1])],
            "systems": [{"name": k, "count": v} for k, v in sorted(inst_sys.items(), key=lambda x: -x[1])],
            "security": [{"name": k, "count": v} for k, v in sorted(inst_sec.items(), key=lambda x: -x[1])],
        },
        "by_year": {
            "country": {str(y): dict(c) for y, c in sorted(country_years_all.items())},
            "country_systems": {str(y): dict(c) for y, c in sorted(country_years_sys.items())},
            "country_security": {str(y): dict(c) for y, c in sorted(country_years_sec.items())},
            "continent": {str(y): dict(c) for y, c in sorted(continent_years_all.items())},
            "continent_systems": {str(y): dict(c) for y, c in sorted(continent_years_sys.items())},
            "continent_security": {str(y): dict(c) for y, c in sorted(continent_years_sec.items())},
        },
        "committee_sizes": committee_sizes,
        "failed_classifications": classified["failed"],
    }

    # ── 5. Write output files ────────────────────────────────────────────────
    if output_dir:
        os.makedirs(os.path.join(output_dir, "_data"), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "assets/data"), exist_ok=True)
        os.makedirs(os.path.join(output_dir, "assets/charts"), exist_ok=True)

        yml_path = os.path.join(output_dir, "_data/committee_stats.yml")
        with open(yml_path, "w") as f:
            yaml.dump(committee_summary, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        logger.info(f"  Wrote {yml_path}")

        json_path = os.path.join(output_dir, "assets/data/committee_stats.json")
        with open(json_path, "w") as f:
            json.dump(detail_json, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {json_path}")

        # Write recurring AE member JSON files
        ae_all_path = os.path.join(output_dir, "assets/data/ae_members.json")
        with open(ae_all_path, "w") as f:
            json.dump(all_members, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {ae_all_path} ({len(all_members)} members)")

        ae_sys_path = os.path.join(output_dir, "assets/data/systems_ae_members.json")
        with open(ae_sys_path, "w") as f:
            json.dump(sys_members, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {ae_sys_path} ({len(sys_members)} members)")

        ae_sec_path = os.path.join(output_dir, "assets/data/security_ae_members.json")
        with open(ae_sec_path, "w") as f:
            json.dump(sec_members, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {ae_sec_path} ({len(sec_members)} members)")

        # Write institution timeline JSON
        inst_timeline_path = os.path.join(output_dir, "assets/data/institution_timeline.json")
        with open(inst_timeline_path, "w") as f:
            json.dump(inst_timeline, f, indent=2, ensure_ascii=False)
        logger.info(f"  Wrote {inst_timeline_path}")

    # ── 6. Generate charts ───────────────────────────────────────────────────
    if output_dir:
        _generate_committee_charts(committee_summary, detail_json, output_dir, inst_timeline=inst_timeline)

    logger.info(
        f"  Committee stats: {total_members} members from "
        f"{len(country_all)} countries, {len(continent_all)} continents, "
        f"{len(inst_all)} institutions"
    )
    logger.info(
        f"  Recurring members: {recurring_summary['total_recurring']} "
        f"(sys: {recurring_summary['total_recurring_systems']}, "
        f"sec: {recurring_summary['total_recurring_security']}, "
        f"chairs: {recurring_summary['total_chairs']})"
    )

    return detail_json