Skip to content

enrich_affiliations_ae_members

src.enrichers.enrich_affiliations_ae_members

Enrich author affiliations using AE (Artifact Evaluation) committee member data.

AE members are scraped from conference websites and have ~99.5% affiliation coverage. This enricher is fully offline — it reads the existing ae_members.json file and matches author names to fill in missing affiliations.

load_ae_members(data_dir: str) -> dict[str, str]

Load AE member data and build a name → affiliation mapping.

Returns a dict mapping each AE member's name (exactly as it appears in ae_members.json, not normalized) → affiliation string. When a name appears more than once, the first occurrence wins (the AE members list is sorted by frequency, so the first occurrence is the most active member).

Source code in src/enrichers/enrich_affiliations_ae_members.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def load_ae_members(data_dir: str) -> dict[str, str]:
    """Read ``ae_members.json`` and return a name → affiliation lookup.

    The file is expected at ``<data_dir>/assets/data/ae_members.json``. If it
    does not exist, a warning is logged and an empty dict is returned.

    Keys are member names exactly as stored in the file (no normalization is
    applied here). Entries with an empty name or empty affiliation are
    skipped. When the same name occurs several times, the first occurrence
    is kept and later ones are ignored.
    """
    ae_path = os.path.join(data_dir, "assets", "data", "ae_members.json")
    if not os.path.exists(ae_path):
        logger.warning(f"AE members file not found: {ae_path}")
        return {}

    with open(ae_path, encoding="utf-8") as handle:
        records = json.load(handle)

    lookup: dict[str, str] = {}
    for record in records:
        member_name = record.get("name", "")
        member_affil = record.get("affiliation", "")
        # Both fields must be non-empty for the entry to be usable.
        if not (member_name and member_affil):
            continue
        # setdefault leaves an existing key untouched, so the first entry wins.
        lookup.setdefault(member_name, member_affil)

    logger.info(f"Loaded {len(lookup)} AE members with affiliations")
    return lookup

enrich_affiliations(authors_file: Path, output_file: Path, data_dir: str, max_authors: Optional[int] = None, dry_run: bool = False, verbose: bool = False) -> dict[str, int]

Enrich author affiliations from AE committee member data.

Only fills in missing affiliations — does NOT overwrite existing ones (unlike CSRankings which takes precedence as a curated source).

Source code in src/enrichers/enrich_affiliations_ae_members.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def enrich_affiliations(
    authors_file: Path,
    output_file: Path,
    data_dir: str,
    max_authors: Optional[int] = None,
    dry_run: bool = False,
    verbose: bool = False,
) -> dict[str, int]:
    """Enrich author affiliations from AE committee member data.

    Only fills in missing affiliations — does NOT overwrite existing ones
    (unlike CSRankings which takes precedence as a curated source). An
    affiliation counts as missing when it is empty or the literal "Unknown".

    Args:
        authors_file: JSON file containing a list of author dicts.
        output_file: Destination for the enriched author list (not written
            when ``dry_run`` is true).
        data_dir: Directory handed to ``load_ae_members`` and to the author
            index helpers.
        max_authors: If truthy, only the first ``max_authors`` authors are
            considered for enrichment. The complete author list is still
            written to ``output_file`` and used for the coverage stats.
        dry_run: When true, nothing is written to disk.
        verbose: When true, log every author that receives an affiliation.

    Returns:
        Stats dict with the keys ``total``, ``already_has_affiliation``,
        ``enriched``, ``no_match``, ``remaining`` and ``final_coverage``
        (a percentage, which may be a float).
    """

    def _is_missing(value) -> bool:
        # Empty/None and the "Unknown" placeholder both mean "no affiliation".
        return not value or value == "Unknown"

    with open(authors_file, encoding="utf-8") as f:
        authors = json.load(f)

    total = len(authors)

    ae_affils = load_ae_members(data_dir)
    if not ae_affils:
        # Return the same keys as the normal path so callers can read
        # "remaining" / "final_coverage" without a KeyError.
        remaining = sum(1 for a in authors if _is_missing(a.get("affiliation")))
        return {
            "total": total,
            "enriched": 0,
            "already_has_affiliation": 0,
            "no_match": 0,
            "remaining": remaining,
            "final_coverage": 100 * (total - remaining) / total if total else 0,
        }

    # Also build a normalized lookup for fuzzy matching (first entry wins).
    norm_to_affil: dict[str, str] = {}
    for ae_name, affil in ae_affils.items():
        nk = normalize_name(ae_name)
        if nk and nk not in norm_to_affil:
            norm_to_affil[nk] = affil

    # Load the author index if the helper module is importable.
    index_by_name: dict = {}
    _update_index_fn = None
    _save_index_fn = None
    try:
        from src.utils.author_index import load_author_index, save_author_index, update_author_affiliation

        _, index_by_name = load_author_index(data_dir)
        _update_index_fn = update_author_affiliation

        def _save_index_fn():
            return save_author_index(data_dir, sorted(index_by_name.values(), key=lambda e: e["id"]))

        if index_by_name:
            logger.info(f"Loaded author index ({len(index_by_name)} entries)")
    except ImportError:
        logger.debug("Author index not available, skipping")

    stats = {"total": total, "already_has_affiliation": 0, "enriched": 0, "no_match": 0}

    # Limit which authors are *processed*, but keep `authors` intact: the
    # slice shares the same dict objects, so in-place updates still land in
    # the full list that is written out and measured for coverage below.
    candidates = authors[:max_authors] if max_authors else authors

    logger.info(f"Processing {len(candidates)} authors for AE member matches...")

    for i, author in enumerate(candidates, 1):
        name = author.get("name", "")

        if not _is_missing(author.get("affiliation", "")):
            stats["already_has_affiliation"] += 1
            continue

        # Try exact match first, then normalized
        affiliation = ae_affils.get(name)
        if not affiliation:
            affiliation = norm_to_affil.get(normalize_name(name))

        if not affiliation:
            stats["no_match"] += 1
            continue

        author["affiliation"] = affiliation
        stats["enriched"] += 1
        if name in index_by_name and _update_index_fn:
            _update_index_fn(index_by_name[name], affiliation, "ae_committee")
        if verbose:
            logger.info(f"  [{i}] {name} → {affiliation}")

    # Coverage is measured over every author so it is consistent with "total".
    stats["remaining"] = sum(1 for a in authors if _is_missing(a.get("affiliation")))
    stats["final_coverage"] = 100 * (total - stats["remaining"]) / total if total else 0

    if not dry_run:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(authors, f, indent=2, ensure_ascii=False)
        logger.info(f"Enriched authors saved to: {output_file}")
        if _save_index_fn and index_by_name:
            _save_index_fn()
            logger.info("Author index updated")
    else:
        logger.info(f"[DRY RUN] Would save to: {output_file}")

    logger.info(f"AE-member enrichment: {stats['enriched']} new affiliations, coverage {stats['final_coverage']:.1f}%")
    return stats