Skip to content

enrich_affiliations_openalex

src.enrichers.enrich_affiliations_openalex

Paper-based affiliation enrichment using OpenAlex, Semantic Scholar, CrossRef, and DBLP.

Disambiguation strategy: instead of searching by author name alone (which returns wrong matches for common names), we look up the author's known papers by title, then extract the affiliation from the matching author entry on that specific paper.

Sources queried (in priority order):
1. CrossRef – DOI lookup → author → affiliation (highest precision)
2. OpenAlex – works search by title → authorships → institutions
3. Semantic Scholar – paper title match → authors → affiliations
4. CrossRef – works search by title → author → affiliation

Usage

python -m src.enrichers.enrich_affiliations_openalex --authors_file ../_data/authors.yml --papers_file ../assets/data/paper_authors_map.json --output_file ../_data/authors.yml [--max_authors 100] [--verbose]

find_affiliation_for_author(session: requests.Session, author_name: str, papers: list[dict], verbose: bool = False) -> tuple[Optional[str], str]

Try to find affiliation using paper-based disambiguation.

Strategy (two passes): Pass 1 – papers with real DOIs (highest confidence): CrossRef DOI lookup for exact author match. Pass 2 – title-based search across papers (up to 5): OpenAlex, Semantic Scholar, then CrossRef title search. Papers are tried newest-first for the most current affiliation. Note: DBLP affiliations are already applied in step 0 (generate_author_stats).

Source code in src/enrichers/enrich_affiliations_openalex.py
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
def find_affiliation_for_author(
    session: requests.Session,
    author_name: str,
    papers: list[dict],
    verbose: bool = False,
) -> tuple[Optional[str], str]:
    """
    Resolve an author's affiliation via paper-based disambiguation.

    Two passes over the author's papers, newest first so the most recent
    affiliation wins:
      Pass 1 – every paper carrying a real DOI: CrossRef DOI lookup
        (highest precision, no name ambiguity).
      Pass 2 – title-based search on up to 5 titled papers, querying
        OpenAlex, then Semantic Scholar, then CrossRef.
    DBLP affiliations are applied earlier, in step 0 (generate_author_stats).

    Returns (affiliation, source_tag), or (None, "") when nothing matched.
    """
    by_recency = sorted(papers, key=lambda p: p.get("year", 0), reverse=True)

    # Pass 1: DOI lookups — unambiguous, so try every paper that has one.
    for paper in by_recency:
        doi_url = paper.get("doi_url", "")
        if not doi_url or not _is_real_doi(doi_url):
            continue
        if verbose:
            logger.info(f"    DOI: {paper.get('title', '')[:55]}... ({paper.get('year', '?')})")
        result = _crossref_affiliation_by_doi(session, author_name, doi_url, verbose)
        if result:
            return result, "crossref_doi"

    # Pass 2: title-based lookups, capped at 5 papers that have a title.
    attempted = 0
    for paper in by_recency:
        if attempted >= 5:
            break
        title = paper.get("title", "")
        if not title:
            continue
        attempted += 1

        if verbose:
            logger.info(f"    Title: {title[:55]}... ({paper.get('year', '?')})")

        # Sources tried in decreasing order of expected coverage.
        for lookup, tag in (
            (_openalex_affiliation_by_title, "openalex_title"),
            (_s2_affiliation_by_title, "s2_title"),
            (_crossref_affiliation_by_title, "crossref_title"),
        ):
            result = lookup(session, author_name, title, verbose)
            if result:
                return result, tag

    return None, ""

enrich(authors_file: str, papers_file: str, output_file: Optional[str] = None, max_authors: Optional[int] = None, verbose: bool = False, dry_run: bool = False, recheck: bool = False, data_dir: Optional[str] = None) -> dict

Main entry point. Reads authors.yml and paper_authors_map.json, enriches missing affiliations, writes back.

Returns stats dict.

Source code in src/enrichers/enrich_affiliations_openalex.py
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
def enrich(
    authors_file: str,
    papers_file: str,
    output_file: Optional[str] = None,
    max_authors: Optional[int] = None,
    verbose: bool = False,
    dry_run: bool = False,
    recheck: bool = False,
    data_dir: Optional[str] = None,
) -> dict:
    """
    Main entry point.  Reads authors.yml and paper_authors_map.json,
    enriches missing affiliations, writes back.

    Args:
        authors_file: Path to the authors YAML file to read.
        papers_file: Path to the paper→authors map JSON used to build the
            per-author paper index.
        output_file: Destination for the updated YAML; defaults to
            ``authors_file`` (in-place update).
        max_authors: If set, only the first N candidate authors are processed.
        verbose: Enable per-author / per-paper progress logging.
        dry_run: Collect and report updates but write nothing.
        recheck: Re-query every author, not just those missing an affiliation.
        data_dir: Optional directory holding the author index; when given and
            the index module is importable, found affiliations are also
            written into that index.

    Returns stats dict.
    """
    # In-place update by default.
    output_file = output_file or authors_file

    # Build paper index
    logger.info("Loading paper-authors map...")
    author_papers = _build_author_papers_index(papers_file)
    logger.info(f"  {len(author_papers)} unique author names across papers")

    # Parse authors.yml (fast)
    logger.info("Parsing authors.yml (fast line scan)...")
    authors = _parse_authors_yml_fast(authors_file)
    total = len(authors)
    if recheck:
        # Recheck mode: process every author, even those with an affiliation.
        candidates = list(authors)
        logger.info(f"  {total} total authors, rechecking ALL affiliations")
    else:
        # Normal mode: only authors whose affiliation is missing/empty.
        candidates = [a for a in authors if not a.get("affiliation")]
        logger.info(f"  {total} total authors, {len(candidates)} missing affiliations")

    if max_authors:
        candidates = candidates[:max_authors]
        logger.info(f"  Processing first {len(candidates)} (--max_authors)")

    # HTTP session
    session = create_session()
    # Honor both lowercase and uppercase proxy environment variables.
    http_proxy = os.environ.get("http_proxy") or os.environ.get("HTTP_PROXY", "")
    https_proxy = os.environ.get("https_proxy") or os.environ.get("HTTPS_PROXY", "")
    if https_proxy or http_proxy:
        session.proxies = {"http": http_proxy, "https": https_proxy}
        logger.info(f"  Using proxy: {https_proxy or http_proxy}")

    # Load author index (optional side channel, keyed by author name).
    index_by_name = {}
    _update_index_fn = None
    _save_index_fn = None
    if data_dir:
        try:
            from src.utils.author_index import load_author_index, save_author_index, update_author_affiliation

            _, index_by_name = load_author_index(data_dir)
            _update_index_fn = update_author_affiliation

            # Closure captures data_dir / index_by_name for the save at the end.
            def _save_index_fn():
                return save_author_index(data_dir, sorted(index_by_name.values(), key=lambda e: e["id"]))

            if index_by_name:
                logger.info(f"  Loaded author index ({len(index_by_name)} entries)")
        except ImportError:
            # NOTE(review): enrichment itself still runs here — only the
            # author-index update is skipped; the message reads as broader.
            logger.debug("Optional module not available, skipping enrichment")

    # Running counters; "by_source" maps source tag -> hit count.
    stats = {
        "total": total,
        "candidates": len(candidates),
        "found": 0,
        "not_found": 0,
        "by_source": {},
        "errors": 0,
    }

    # name -> affiliation, applied to the YAML in one pass at the end.
    updates: dict[str, str] = {}

    logger.info(f"\nEnriching {len(candidates)} authors...")
    logger.info("=" * 70)

    for idx, author in enumerate(candidates, 1):
        name = author.get("name", "")
        if not name:
            continue

        try:
            papers = author_papers.get(name, [])
            paper_count = len(papers)

            if verbose:
                logger.info(f"[{idx}/{len(candidates)}] {name}  ({paper_count} papers)")

            affiliation, source = find_affiliation_for_author(session, name, papers, verbose=verbose)

            if affiliation:
                stats["found"] += 1
                stats["by_source"][source] = stats["by_source"].get(source, 0) + 1
                updates[name] = affiliation
                # Update author index
                if name in index_by_name and _update_index_fn:
                    _update_index_fn(index_by_name[name], affiliation, source)
                if not verbose:
                    logger.info(f"[{idx}/{len(candidates)}] {name:40s}  +  {affiliation[:50]}  ({source})")
            else:
                stats["not_found"] += 1
                if not verbose:
                    logger.info(f"[{idx}/{len(candidates)}] {name:40s}  -")
        except Exception:
            # One failing author must not abort the whole run; count and log.
            stats["errors"] = stats.get("errors", 0) + 1
            logger.warning(f"[{idx}/{len(candidates)}] {name}: error during enrichment", exc_info=True)

    logger.info("=" * 70)
    logger.info(f"\nResults:  found {stats['found']}, not found {stats['not_found']}")
    # Per-source breakdown, most productive source first.
    for src, cnt in sorted(stats["by_source"].items(), key=lambda x: -x[1]):
        logger.info(f"  {src:20s}: {cnt}")

    if not dry_run and updates:
        logger.info(f"\nWriting {len(updates)} updates to {output_file} ...")
        replaced = _update_authors_yml(output_file, updates)
        logger.info(f"  {replaced} lines updated in YAML.")
        # Save updated author index
        if _save_index_fn and index_by_name:
            _save_index_fn()
            logger.info("  Author index updated")
    elif dry_run:
        logger.info(f"\n[DRY RUN] Would update {len(updates)} authors.")

    stats["updates_written"] = len(updates) if not dry_run else 0
    return stats