Export artifact citations in a simple format showing artifact DOI and citing DOIs.
Usage
python3 export_artifact_citations.py --data_dir ../reprodb.github.io
python3 export_artifact_citations.py --data_dir ../reprodb.github.io --output citations_export.txt
export_citations(data_dir: str, output_file: str = None) -> None
Export artifact citations to a simple DOI mapping format.
Source code in src/generators/export_artifact_citations.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def export_citations(data_dir: str, output_file: str = None) -> None:
    """Export artifact citations to a simple DOI mapping format.

    Reads ``<data_dir>/assets/data/artifact_citations.json`` and writes one
    line for every artifact that has a DOI and at least one citing DOI::

        artifact_doi: ["citing_doi_1", "citing_doi_2", ...]

    Citing DOIs are merged from the ``citing_dois_openalex`` and
    ``citing_dois_semantic_scholar`` fields (OpenAlex first). Exact duplicate
    strings are dropped while first-seen order is kept. A summary is emitted
    through the module logger rather than the export stream.

    Args:
        data_dir: Directory that contains ``assets/data/artifact_citations.json``.
        output_file: Path of the file to write. When falsy (the default), the
            export is written to ``sys.stdout``.

    Raises:
        SystemExit: With status 1 when the citations JSON file does not exist.
    """
    citations_path = os.path.join(data_dir, "assets", "data", "artifact_citations.json")
    if not os.path.exists(citations_path):
        logger.error(f"Error: {citations_path} not found.")
        logger.info("Run generate_artifact_citations.py first.")
        sys.exit(1)

    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's locale encoding.
    with open(citations_path, "r", encoding="utf-8") as f:
        artifacts = json.load(f)

    # A real output file is closed when the with-block exits; nullcontext
    # hands back sys.stdout without closing it.
    out_cm = (
        open(output_file, "w", encoding="utf-8")  # noqa: SIM115
        if output_file
        else contextlib.nullcontext(sys.stdout)
    )

    artifacts_with_doi = 0
    artifacts_with_citations = 0
    total_citing_dois = 0

    with out_cm as out:
        for artifact in artifacts:
            doi = artifact.get("doi")
            if not doi:
                continue
            artifacts_with_doi += 1

            # A missing or null/empty field contributes nothing.
            openalex_citing = artifact.get("citing_dois_openalex") or ()
            semantic_citing = artifact.get("citing_dois_semantic_scholar") or ()

            # dict.fromkeys de-duplicates while preserving insertion order.
            # NOTE(review): comparison is exact-string; DOIs differing only in
            # letter case or URL prefix are kept as distinct entries.
            unique_citing_dois = list(dict.fromkeys([*openalex_citing, *semantic_citing]))

            # Only output artifacts that actually have citations.
            if not unique_citing_dois:
                continue

            artifacts_with_citations += 1
            total_citing_dois += len(unique_citing_dois)
            out.write(f"{doi}: {json.dumps(unique_citing_dois)}\n")

    # The summary goes through the logger so it never mixes into the export.
    logger.info("\n# Summary:")
    logger.info(f"# Total artifacts with DOIs: {artifacts_with_doi}")
    logger.info(f"# Artifacts with citing DOIs: {artifacts_with_citations}")
    logger.info(f"# Total citing DOIs collected: {total_citing_dois}")
    if output_file:
        logger.info(f"\nWrote citations export to: {output_file}")
|