Skip to content

generate_author_stats

src.generators.generate_author_stats

Generate prolific artifact author statistics by matching artifact papers with DBLP. This script requires downloading the DBLP XML file first (~3GB compressed). Download from: https://dblp.org/xml/dblp.xml.gz

load_artifacts(data_dir: str) -> list[dict] | None

Load artifacts from generated data file

Source code in src/generators/generate_author_stats.py
29
30
31
32
33
34
35
36
37
38
39
40
def load_artifacts(data_dir: str) -> list[dict] | None:
    """Load artifacts from generated data file"""
    artifacts_path = os.path.join(data_dir, "assets/data/artifacts.json")
    if not os.path.exists(artifacts_path):
        logger.error(f"Error: {artifacts_path} not found")
        logger.info("Please run generate_statistics.py first")
        return None

    with open(artifacts_path, "r") as f:
        artifacts = json.load(f)

    return artifacts

load_conference_active_years(data_dir)

Load artifacts_by_conference data and extract active years per conference.

Returns dict: conference_name -> set of years when that conference had artifact evaluation.

Source code in src/generators/generate_author_stats.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def load_conference_active_years(data_dir):
    """Load artifacts_by_conference data and extract active years per conference.

    Args:
        data_dir: Root of the generated-data directory tree.

    Returns dict: conference_name -> set of years when that conference had artifact evaluation.
    Returns an empty dict when the YAML file is missing or empty; callers then
    count all years.
    """
    conf_path = os.path.join(data_dir, "_data/artifacts_by_conference.yml")
    if not os.path.exists(conf_path):
        logger.warning(f"Warning: {conf_path} not found, will count all years")
        return {}

    with open(conf_path, "r") as f:
        # yaml.safe_load returns None for an empty document; treat that the
        # same as a missing file instead of crashing on iteration below.
        conf_data = yaml.safe_load(f) or []

    active_years = {}
    for conf in conf_data:
        conf_name = conf.get("name", "").upper()
        if not conf_name:
            continue

        years = conf.get("years", [])
        # Include any year that had at least one artifact
        active_years[conf_name] = set(year_entry["year"] for year_entry in years if year_entry.get("total", 0) > 0)

    logger.info(f"Loaded active years for {len(active_years)} conferences")
    for conf, years in sorted(active_years.items()):
        if years:
            year_list = sorted(years)
            logger.info(f"  {conf}: {min(year_list)}-{max(year_list)} ({len(year_list)} years)")

    return active_years

load_artifact_citations(data_dir: str) -> dict[str, int]

Load artifact citation data if available.

Returns dict: normalized_title -> cited_by_count (max across duplicates).

Source code in src/generators/generate_author_stats.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def load_artifact_citations(data_dir: str) -> dict[str, int]:
    """Load artifact citation data if available.

    Returns:
        dict mapping normalized_title -> cited_by_count, keeping the maximum
        count across duplicate titles. Empty when the data file is absent.
    """
    citations_path = os.path.join(data_dir, "assets", "data", "artifact_citations.json")
    if not os.path.exists(citations_path):
        logger.info("Artifact citation data not available (collection disabled — see generate_artifact_citations.py)")
        return {}

    with open(citations_path, "r") as fh:
        records = json.load(fh)

    citations: dict[str, int] = {}
    for record in records:
        key = record.get("normalized_title") or normalize_title(record.get("title", ""))
        if not key:
            continue
        count = record.get("cited_by_count")
        if not isinstance(count, int):
            continue
        # Keep the largest count seen for this title (floor of 0 on first sight).
        if count > citations.setdefault(key, 0):
            citations[key] = count

    return citations

extract_paper_titles(artifacts: list[dict]) -> tuple[set[str], dict[str, dict]]

Extract unique paper titles from artifacts

Source code in src/generators/generate_author_stats.py
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def extract_paper_titles(artifacts: list[dict]) -> tuple[set[str], dict[str, dict]]:
    """Extract the set of unique (normalized) paper titles from artifacts.

    Returns:
        Tuple of (normalized titles, mapping of normalized title -> the first
        artifact seen with that title, kept for metadata lookups).
    """
    unique_titles: set[str] = set()
    first_artifact_for_title: dict[str, dict] = {}

    for record in artifacts:
        raw_title = record.get("title", "")
        # Skip missing titles and the "Unknown" placeholder.
        if not raw_title or raw_title == "Unknown":
            continue
        key = normalize_title(raw_title)
        unique_titles.add(key)
        # First artifact wins for the metadata mapping.
        first_artifact_for_title.setdefault(key, record)

    return unique_titles, first_artifact_for_title

parse_dblp_for_authors(dblp_file: str, paper_titles: set[str], title_to_artifact: dict[str, dict]) -> tuple[list[dict], dict, dict]

Find authors for artifact papers using pre-extracted DBLP cache.

Calls extract_dblp() to ensure the cache exists, then reads from the JSON lookup files rather than re-parsing the 3 GB XML.

Parameters:

Name Type Description Default
dblp_file str

Path to dblp.xml.gz file (passed to extract_dblp)

required
paper_titles set[str]

Set of normalized paper titles to find

required
title_to_artifact dict[str, dict]

Mapping from title to artifact metadata

required

Returns:

Type Description
tuple[list[dict], dict, dict]

Tuple of: - List of papers with author information (artifact papers) - Dict mapping (author, conference) -> {year: set of normalized titles} for ALL papers at tracked venues (venue_papers) - Dict mapping author_name -> affiliation string (from DBLP entries)

Source code in src/generators/generate_author_stats.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def parse_dblp_for_authors(
    dblp_file: str, paper_titles: set[str], title_to_artifact: dict[str, dict]
) -> tuple[list[dict], dict, dict]:
    """
    Find authors for artifact papers using pre-extracted DBLP cache.

    Calls ``extract_dblp()`` to ensure the cache exists, then reads from
    the JSON lookup files rather than re-parsing the 3 GB XML.

    Args:
        dblp_file: Path to dblp.xml.gz file (passed to extract_dblp)
        paper_titles: Set of normalized paper titles to find
        title_to_artifact: Mapping from title to artifact metadata

    Returns:
        Tuple of:
          - List of papers with author information (artifact papers)
          - Dict mapping (author, conference) -> {year: set of normalized titles}
            for ALL papers at tracked venues (venue_papers)
          - Dict mapping author_name -> affiliation string (from DBLP <www> entries)

    Note:
        Each title is matched at most once: the first matching DBLP record
        wins, so ties are resolved by the cache's iteration order.
    """
    if not os.path.exists(dblp_file):
        logger.error(f"Error: DBLP file not found: {dblp_file}")
        logger.info("Please download from: https://dblp.org/xml/dblp.xml.gz")
        return [], {}, {}

    # Deferred import: the extraction helpers are only needed once a DBLP
    # file is actually present.
    from ..utils.dblp_extract import extract_dblp, load_affiliations, load_papers_by_venue

    logger.info("Extracting DBLP data (cached if unchanged)...")
    # Ensure the JSON lookup cache exists before loading from it.
    extract_dblp(dblp_file)

    affiliations = load_affiliations()
    papers_by_venue = load_papers_by_venue()

    logger.info(f"Loaded {len(affiliations)} affiliations, {len(papers_by_venue)} venue groups from cache")

    papers_found: list[dict] = []
    # Working copy: matched titles are removed below so each is used once.
    titles_to_find = paper_titles.copy()
    # (author_name, conference) -> {year: set of normalized_title}
    venue_papers = defaultdict(lambda: defaultdict(set))

    for conf, years in papers_by_venue.items():
        for year_str, paper_list in years.items():
            # Year keys are strings in the cache; 0 marks an unknown year.
            year = int(year_str) if year_str else 0
            for paper in paper_list:
                title = paper.get("title", "")
                # Strip the trailing period before normalizing the title.
                normalized = normalize_title(title.rstrip("."))
                authors = paper.get("authors", [])

                # Track all venue papers (for total-papers denominator)
                for author in authors:
                    if author:
                        venue_papers[(author, conf)][year].add(normalized)

                # Match artifact titles
                if normalized in titles_to_find:
                    artifact_meta = title_to_artifact.get(normalized, {})

                    # Build DOI URL from extracted DOI
                    doi = paper.get("doi", "")
                    doi_url = f"https://doi.org/{doi}" if doi else ""

                    paper_info = {
                        "title": title,
                        "normalized_title": normalized,
                        "authors": authors,
                        # Prefer the DBLP year; fall back to the artifact's
                        # own year when DBLP has none recorded (year == 0).
                        "year": year if year else artifact_meta.get("year"),
                        "artifact_year": artifact_meta.get("year"),
                        "venue": conf,
                        "conference": artifact_meta.get("conference", ""),
                        "category": artifact_meta.get("category", "unknown"),
                        "badges": artifact_meta.get("badges", []),
                        "doi_url": doi_url,
                    }

                    papers_found.append(paper_info)
                    titles_to_find.remove(normalized)

    if titles_to_find:
        logger.warning(f"Warning: {len(titles_to_find)} papers not found in DBLP")

    # Counts author-paper pairs, not unique papers: a paper appears once per
    # (author, conference) key it was recorded under.
    total_venue = sum(len(t) for ydict in venue_papers.values() for t in ydict.values())
    logger.info(f"Total artifact papers matched: {len(papers_found)}")
    logger.info(f"Total papers tracked at conference venues: {total_venue} (author-paper pairs)")
    logger.info(f"Total DBLP affiliations extracted: {len(affiliations)}")

    return papers_found, venue_papers, affiliations

aggregate_author_statistics(papers, venue_papers=None, affiliations=None, conference_active_years=None, citations_by_title=None)

Calculate statistics per author.

Parameters:

Name Type Description Default
papers

list of artifact papers with author info

required
venue_papers

optional dict (author, conference)->year_dict->set(titles) of ALL papers at tracked conferences

None
affiliations

optional dict author_name -> affiliation string

None
conference_active_years

optional dict conference_name -> set of active years. Only papers from these years will be counted in total_papers

None
citations_by_title

optional dict normalized_title -> cited_by_count, used to attribute artifact citations to each author

None
Source code in src/generators/generate_author_stats.py
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
def aggregate_author_statistics(
    papers, venue_papers=None, affiliations=None, conference_active_years=None, citations_by_title=None
):
    """Calculate statistics per author.

    Args:
        papers: list of artifact papers with author info
        venue_papers: optional dict (author, conference)->year_dict->set(titles)
                      of ALL papers at tracked conferences
        affiliations: optional dict author_name -> affiliation string
        conference_active_years: optional dict conference_name -> set of active years
                      Only papers from these years will be counted in total_papers
        citations_by_title: optional dict normalized_title -> cited_by_count,
                      used to attribute artifact citations to each author

    Returns:
        Tuple of (authors_list, category_breakdown). authors_list is sorted by
        artifact_count descending; category_breakdown holds systems, security
        and cross-domain author counts.

    Raises:
        ValueError: when a per-author invariant is violated (artifact count
            exceeds total papers, or a badge count exceeds the artifact count).
    """
    if venue_papers is None:
        venue_papers = {}
    if affiliations is None:
        affiliations = {}
    if conference_active_years is None:
        conference_active_years = {}
    if citations_by_title is None:
        citations_by_title = {}

    author_stats = defaultdict(
        lambda: {
            "name": "",
            "artifact_count": 0,
            "papers": [],
            "papers_without_artifacts": [],
            "artifact_titles": set(),  # Track which titles have artifacts
            "conferences": set(),
            "years": set(),
            "artifact_citations": 0,
            "badges": {"available": 0, "functional": 0, "reproducible": 0},
        }
    )

    # Pre-populate venue_papers with ALL artifact papers.
    # This guarantees artifacts <= total_papers by construction:
    # every artifact paper is counted in the denominator even when the
    # DBLP venue mapping misses a journal/booktitle alias.
    for paper in papers:
        conf = paper.get("conference", "")
        if not conf:
            continue
        # Prefer the artifact's declared year over the DBLP year.
        # DBLP may have matched a preprint version with a different year.
        yr = paper.get("artifact_year") or paper.get("year")
        if yr is None:
            yr = 0
        active_years = conference_active_years.get(conf, set())
        if active_years and yr not in active_years:
            continue
        title_norm = paper.get("normalized_title", "")
        if not title_norm:
            continue
        for author in paper.get("authors", []):
            if not author:
                continue
            if (author, conf) not in venue_papers:
                venue_papers[(author, conf)] = defaultdict(set)
            venue_papers[(author, conf)][yr].add(title_norm)

    for paper in papers:
        for author in paper["authors"]:
            stats = author_stats[author]
            stats["name"] = author

            # Each (author, title) pair is counted once, even if the paper
            # appears multiple times in the input list.
            title_key = paper.get("normalized_title")
            if title_key in stats["artifact_titles"]:
                continue

            stats["artifact_count"] += 1
            stats["papers"].append(
                {
                    "title": paper["title"],
                    "conference": paper["conference"],
                    "year": paper["year"],
                    "badges": paper["badges"],
                    "category": paper.get("category", "unknown"),
                    "artifact_citations": citations_by_title.get(title_key, 0),
                }
            )
            stats["artifact_citations"] += citations_by_title.get(title_key, 0)
            # Track normalized title to identify papers WITHOUT artifacts later
            stats["artifact_titles"].add(title_key)
            stats["conferences"].add(paper["conference"])
            stats["years"].add(paper["year"])

            badge_list = paper["badges"]
            if isinstance(badge_list, str):
                badge_list = [b.strip() for b in badge_list.split(",")]

            # If artifact was evaluated but has no formal badges recorded, treat as "available"
            has_available = False
            has_functional = False
            has_repro = False
            if not badge_list:
                has_available = True
            else:
                for badge in badge_list:
                    badge_lower = badge.lower()
                    if "reproduc" in badge_lower or "reusable" in badge_lower:
                        has_repro = True
                    elif "functional" in badge_lower:
                        has_functional = True
                    elif "available" in badge_lower:
                        has_available = True

            if has_available:
                stats["badges"]["available"] += 1
            if has_functional:
                stats["badges"]["functional"] += 1
            if has_repro:
                stats["badges"]["reproducible"] += 1

    # Convert to list and add computed fields
    authors_list = []
    current_year = datetime.now().year

    # Track category-specific authors
    systems_authors = set()
    security_authors = set()
    cross_domain_authors = set()

    for author, stats in author_stats.items():
        years_sorted = sorted(stats["years"])
        # "Recent" means an artifact year within the last three years.
        recent_count = sum(1 for y in stats["years"] if y >= current_year - 3)

        # Determine author category based on paper categories
        paper_categories = set(p.get("category", "unknown") for p in stats["papers"])
        has_systems = "systems" in paper_categories
        has_security = "security" in paper_categories

        if has_systems and has_security:
            category = "both"
            cross_domain_authors.add(author)
            systems_authors.add(author)
            security_authors.add(author)
        elif has_systems:
            category = "systems"
            systems_authors.add(author)
        elif has_security:
            category = "security"
            security_authors.add(author)
        else:
            category = "unknown"

        # --- Compute total papers at tracked conferences (per-conf per-year) ---
        # Only count papers from years when the conference was actively doing AE
        total_papers_set = set()
        conf_title_sets = {}
        total_papers_by_conf = {}
        total_papers_by_conf_year = {}
        for conf in stats["conferences"]:
            year_dict = venue_papers.get((author, conf), {})
            conf_titles = set()
            conf_year_counts = {}
            active_years = conference_active_years.get(conf, set())

            for yr, titles in year_dict.items():
                # Only count papers from years when this conference had AE
                # If no active_years data available, count all years (backward compat)
                if not active_years or yr in active_years:
                    conf_titles |= titles
                    conf_year_counts[yr] = len(titles)

            total_papers_set |= conf_titles
            conf_title_sets[conf] = conf_titles
            total_papers_by_conf[conf] = len(conf_titles)
            total_papers_by_conf_year[conf] = conf_year_counts

        # Also check conferences the author didn't have artifacts at but
        # did publish at (from DBLP venue scan)
        for (a, c), year_dict in venue_papers.items():
            if a == author and c not in total_papers_by_conf:
                conf_titles = set()
                conf_year_counts = {}
                active_years = conference_active_years.get(c, set())

                for yr, titles in year_dict.items():
                    # Only count papers from years when this conference had AE
                    if not active_years or yr in active_years:
                        conf_titles |= titles
                        conf_year_counts[yr] = len(titles)

                total_papers_set |= conf_titles
                conf_title_sets[c] = conf_titles
                total_papers_by_conf[c] = len(conf_titles)
                total_papers_by_conf_year[c] = conf_year_counts

        # Recompute totals (artifact papers are already in venue_papers
        # thanks to the pre-population step above)
        total_papers_set = set()
        total_papers_by_conf = {}
        for conf, conf_titles in conf_title_sets.items():
            total_papers_set |= conf_titles
            total_papers_by_conf[conf] = len(conf_titles)
        total_papers = len(total_papers_set) if total_papers_set else 0

        # --- Compute papers WITHOUT artifacts ---
        # Collect all papers from venue_papers, then subtract artifact papers
        all_venue_papers = []  # List of (conf, year, title) for papers at venues
        for (a, c), year_dict in venue_papers.items():
            if a == author:
                active_years = conference_active_years.get(c, set())
                for yr, titles in year_dict.items():
                    if not active_years or yr in active_years:
                        for title in titles:
                            all_venue_papers.append((c, yr, title))

        # Find papers without artifacts
        papers_without = []
        for conf, yr, title in all_venue_papers:
            if title not in stats["artifact_titles"]:  # Not an artifact paper
                papers_without.append({"title": title, "conference": conf, "year": yr})

        # Remove duplicates and sort by year desc, then conference
        papers_without_dedup = {}
        for p in papers_without:
            # Use title+year+conf as key to deduplicate
            key = (p["title"], p["year"], p["conference"])
            if key not in papers_without_dedup:
                papers_without_dedup[key] = p

        papers_without_list = list(papers_without_dedup.values())
        papers_without_list.sort(key=lambda x: (-x["year"], x["conference"]))

        art_count = stats["artifact_count"]
        avail = stats["badges"]["available"]
        func = stats["badges"]["functional"]
        repro = stats["badges"]["reproducible"]

        # Sanity invariants: fail loudly rather than publish impossible stats.
        if art_count > total_papers:
            raise ValueError(
                f"Invariant violation for '{stats['name']}': artifacts ({art_count}) > total_papers ({total_papers})"
            )
        if repro > art_count:
            raise ValueError(
                f"Invariant violation for '{stats['name']}': reproduced_badges ({repro}) > artifacts ({art_count})"
            )
        if func > art_count:
            raise ValueError(
                f"Invariant violation for '{stats['name']}': functional_badges ({func}) > artifacts ({art_count})"
            )

        # Artifact rate: % of tracked-conference papers that have an artifact.
        artifact_rate = round(art_count / total_papers * 100, 1) if total_papers > 0 else 0.0
        # Reproducibility rate: % of artifact papers with a "reproduced" badge
        repro_rate = round(repro / art_count * 100, 1) if art_count > 0 else 0.0
        # Functional rate: % of artifact papers with a "functional" badge
        functional_rate = round(func / art_count * 100, 1) if art_count > 0 else 0.0

        # Look up affiliation from DBLP
        affiliation_raw = affiliations.get(stats["name"], "")
        affiliation = _normalize_affiliation(affiliation_raw)

        author_entry = {
            "name": stats["name"],
            "display_name": clean_display_name(stats["name"]),
            "affiliation": affiliation,
            "artifact_count": art_count,
            "total_papers": total_papers,
            "total_papers_by_conf": total_papers_by_conf,
            "total_papers_by_conf_year": total_papers_by_conf_year,
            "artifact_rate": artifact_rate,
            "repro_rate": repro_rate,
            "functional_rate": functional_rate,
            "category": category,
            "conferences": sorted(list(stats["conferences"])),
            "years": years_sorted,
            "year_range": f"{min(years_sorted)}-{max(years_sorted)}" if years_sorted else "",
            "recent_count": recent_count,
            "artifact_citations": stats["artifact_citations"],
            "badges_available": avail,
            "badges_functional": func,
            "badges_reproducible": repro,
            "papers": stats["papers"],
            "papers_without_artifacts": papers_without_list,
        }
        authors_list.append(author_entry)

    # Sort by artifact count
    authors_list.sort(key=lambda x: x["artifact_count"], reverse=True)

    # Add category breakdown to return
    category_breakdown = {
        "systems_count": len(systems_authors),
        "security_count": len(security_authors),
        "cross_domain_count": len(cross_domain_authors),
    }

    return authors_list, category_breakdown

generate_author_stats(dblp_file: str, data_dir: str, output_dir: str) -> dict | None

Main function to generate author statistics

Source code in src/generators/generate_author_stats.py
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
def generate_author_stats(dblp_file: str, data_dir: str, output_dir: str) -> dict | None:
    """Generate author statistics and write them to the output directory.

    Args:
        dblp_file: Path to the dblp.xml.gz dump (forwarded to parse_dblp_for_authors).
        data_dir: Directory holding previously generated artifact data.
        output_dir: Site output root; files are written under _data/ and assets/data/.

    Returns:
        Dict with "authors" (aggregated author list) and "summary", or None
        when artifacts are missing or no paper matched in DBLP.
    """
    logger.info("Generating author statistics...")

    # Load artifacts
    artifacts = load_artifacts(data_dir)
    if not artifacts:
        return None

    # Load conference active years (years when each conference had AE)
    conference_active_years = load_conference_active_years(data_dir)

    # Total artifacts: {len(artifacts)}

    # Extract titles
    paper_titles, title_to_artifact = extract_paper_titles(artifacts)

    # Parse DBLP
    papers_with_authors, venue_papers, affiliations = parse_dblp_for_authors(dblp_file, paper_titles, title_to_artifact)

    if not papers_with_authors:
        logger.info("No papers matched in DBLP")
        return None

    # Load artifact citations (optional)
    citations_by_title = load_artifact_citations(data_dir)

    # Aggregate statistics (pass venue_papers and conference_active_years for total-paper counts)
    authors_list, category_breakdown = aggregate_author_statistics(
        papers_with_authors, venue_papers, affiliations, conference_active_years, citations_by_title
    )

    # Load author index for IDs and canonical affiliations
    try:
        from src.utils.author_index import load_author_index

        index_entries, index_by_name = load_author_index(data_dir)
        if index_by_name:
            patched_aff = 0
            for author in authors_list:
                idx_entry = index_by_name.get(author["name"])
                if idx_entry is None:
                    continue
                if idx_entry.get("id") is not None:
                    author["author_id"] = idx_entry["id"]
                # Override affiliation with canonical index value (enricher-sourced)
                idx_aff = idx_entry.get("affiliation", "")
                if idx_aff and idx_aff != author.get("affiliation", ""):
                    author["affiliation"] = idx_aff
                    patched_aff += 1
            assigned = sum(1 for a in authors_list if "author_id" in a)
            logger.info(f"Author IDs assigned: {assigned}/{len(authors_list)}")
            logger.info(f"Affiliations overridden from author index: {patched_aff}")
    except ImportError:
        # The author index is optional enrichment; missing module is fine.
        logger.debug("Optional module not available, skipping enrichment")

    # Count affiliation coverage
    with_affil = sum(1 for a in authors_list if a.get("affiliation"))
    logger.info(
        f"Authors with DBLP affiliation: {with_affil}/{len(authors_list)} ({round(with_affil / len(authors_list) * 100, 1) if authors_list else 0}%)"
    )

    # Generate summary
    author_summary = {
        "total_authors": len(authors_list),
        "total_papers_matched": len(papers_with_authors),
        "active_last_year": sum(1 for a in authors_list if a["recent_count"] > 0),
        "multi_conference": sum(1 for a in authors_list if len(a["conferences"]) > 1),
        "systems_authors": category_breakdown["systems_count"],
        "security_authors": category_breakdown["security_count"],
        "cross_domain_authors": category_breakdown["cross_domain_count"],
        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC"),
    }

    # Write output files
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "_data"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "assets/data"), exist_ok=True)

    # --- Build paper index and replace embedded papers with IDs ---
    from .generate_paper_index import build_paper_index, load_existing_index, normalize_title

    index_path = os.path.join(output_dir, "_data", "papers.json")
    existing_papers, existing_by_title = load_existing_index(index_path)
    # Continue paper-ID numbering after the highest existing ID.
    max_paper_id = max((e["id"] for e in existing_papers), default=0)
    papers_list, norm_to_id = build_paper_index(authors_list, existing_by_title, max_paper_id)

    # Write paper index
    with open(index_path, "w") as f:
        json.dump(papers_list, f, indent=2, ensure_ascii=False)
    assets_papers = os.path.join(output_dir, "assets/data/papers.json")
    with open(assets_papers, "w") as f:
        json.dump(papers_list, f, ensure_ascii=False)
    artifact_count = sum(1 for p in papers_list if p.get("has_artifact", True))
    logger.info(f"Paper index: {len(papers_list)} papers ({artifact_count} with artifacts)")

    # Replace embedded papers with paper_ids in authors_list
    for author in authors_list:
        paper_ids = []
        for p in author.get("papers", []):
            norm = normalize_title(p.get("title", ""))
            pid = norm_to_id.get(norm)
            if pid is not None:
                paper_ids.append(pid)
        author["paper_ids"] = paper_ids

        without_ids = []
        for p in author.get("papers_without_artifacts", []):
            norm = normalize_title(p.get("title", ""))
            pid = norm_to_id.get(norm)
            if pid is not None:
                without_ids.append(pid)
        author["papers_without_artifact_ids"] = without_ids

        # Keep 'papers' in the full JSON for backward compatibility,
        # but remove from YAML to cut file size

    # YAML for Jekyll — without embedded papers (use paper_ids instead)
    authors_for_yaml = []
    for author in authors_list:
        entry = {
            k: v
            for k, v in author.items()
            if k not in ("papers", "papers_without_artifacts", "total_papers_by_conf", "total_papers_by_conf_year")
        }
        authors_for_yaml.append(entry)
    with open(os.path.join(output_dir, "_data/authors.yml"), "w") as f:
        yaml.dump(authors_for_yaml, f, default_flow_style=False, allow_unicode=True)

    with open(os.path.join(output_dir, "_data/author_summary.yml"), "w") as f:
        yaml.dump(author_summary, f, default_flow_style=False)

    # JSON for download (full data including embedded papers for backward compat)
    with open(os.path.join(output_dir, "assets/data/authors.json"), "w") as f:
        json.dump(authors_list, f, indent=2, ensure_ascii=False)

    # Paper -> authors mapping for citation attribution
    with open(os.path.join(output_dir, "assets/data/paper_authors_map.json"), "w") as f:
        json.dump(papers_with_authors, f, indent=2, ensure_ascii=False)

    logger.info(f"Author data written to {output_dir} ({len(authors_list)} authors, {len(papers_with_authors)} papers)")

    return {"authors": authors_list, "summary": author_summary}