Skip to content

verify_artifact_citations

src.generators.verify_artifact_citations

Verify that citing papers actually cite the artifact (Zenodo/Figshare DOI), not the published paper that happens to share the same title.

Strategy (Crossref-based): For each citing DOI, fetch its reference list from Crossref and check if any reference contains the artifact DOI (e.g., 10.5281/zenodo. or 10.6084/m9.figshare.). If yes → GENUINE. If no → FALSE_POSITIVE.

This is more reliable than OpenAlex title matching for two reasons: Crossref holds the publisher-submitted bibliography with actual DOIs, whereas OpenAlex sometimes conflates paper and artifact records that share the same title.

Usage

HTTP_PROXY=http://proxy-dmz.intel.com:912 HTTPS_PROXY=http://proxy-dmz.intel.com:912 python3 verify_artifact_citations.py --data_dir ../reprodb.github.io

is_artifact_doi(doi: str) -> bool

Check if a DOI is a known artifact repository DOI.

Source code in src/generators/verify_artifact_citations.py
48
49
50
51
def is_artifact_doi(doi: str) -> bool:
    """Report whether *doi* begins with one of ``ARTIFACT_DOI_PREFIXES``.

    The DOI is lower-cased before the comparison; the prefixes themselves are
    used as-is (presumably they are stored lower-case -- verify against the
    constant's definition).
    """
    normalized = doi.lower()
    for prefix in ARTIFACT_DOI_PREFIXES:
        if normalized.startswith(prefix):
            return True
    return False

fetch_crossref_references(doi: str) -> list[dict] | None

Fetch the reference list for a DOI from Crossref.

Source code in src/generators/verify_artifact_citations.py
54
55
56
57
58
59
60
61
62
def fetch_crossref_references(doi: str) -> list[dict] | None:
    """Return the ``reference`` entries Crossref holds for *doi*.

    Returns:
        The list found under ``message.reference`` in the Crossref response
        (an empty list when that key is absent), or ``None`` when the lookup
        raised; in that case a warning is logged.
    """
    endpoint = f"https://api.crossref.org/works/{urllib.parse.quote(doi, safe='')}"
    try:
        payload = fetch_json(endpoint, timeout=30)
        message = payload.get("message", {})
        return message.get("reference", [])
    except Exception as exc:
        log(f"      [WARN] Crossref lookup failed for {doi}: {exc}")
        return None

references_contain_artifact_doi(refs: list[dict], artifact_doi: str) -> bool

Check if any Crossref reference entry contains the artifact DOI.

Source code in src/generators/verify_artifact_citations.py
65
66
67
68
69
70
71
72
def references_contain_artifact_doi(refs: list[dict], artifact_doi: str) -> bool:
    """Check if any Crossref reference entry contains the artifact DOI.

    Each reference dict is serialised to JSON and searched case-insensitively,
    so the DOI is found whether it sits in the ``DOI`` field, in the
    unstructured citation text, or inside a URL.

    Args:
        refs: Reference entries as returned by Crossref.
        artifact_doi: DOI of the artifact to look for.

    Returns:
        ``True`` if at least one entry mentions the DOI, else ``False``.
        An empty or whitespace-only *artifact_doi* never matches.
    """
    needle = artifact_doi.strip().lower()
    if not needle:
        # An empty needle is a substring of everything; treat it as "not found".
        return False
    # The DOI must not be immediately followed by another digit, otherwise
    # "10.5281/zenodo.12345" would wrongly match "10.5281/zenodo.1234567".
    # Non-digit continuations (e.g. a figshare ".v2" version suffix) still match.
    pattern = re.compile(re.escape(needle) + r"(?![0-9])")
    return any(pattern.search(json.dumps(ref).lower()) for ref in refs)

references_contain_any_artifact_doi(refs: list[dict]) -> list[str]

Return any artifact-repository DOIs found in the reference list.

Source code in src/generators/verify_artifact_citations.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def references_contain_any_artifact_doi(refs: list[dict]) -> list[str]:
    """Collect evidence of artifact-repository DOIs in a reference list.

    For every reference entry two things are inspected:

    * the structured ``DOI`` field, which is appended verbatim when
      ``is_artifact_doi`` accepts it;
    * the ``unstructured`` citation text, for which a placeholder of the form
      ``[in unstructured: <prefix>...]`` is appended for the first entry of
      ``ARTIFACT_DOI_PREFIXES`` found in the lower-cased text.

    One entry can therefore contribute up to two items to the result.
    """
    hits: list[str] = []
    for entry in refs:
        structured_doi = entry.get("DOI") or ""
        if is_artifact_doi(structured_doi):
            hits.append(structured_doi)

        # Lower-case the free text once, then report only the first matching prefix.
        free_text = (entry.get("unstructured") or "").lower()
        first_prefix = next((p for p in ARTIFACT_DOI_PREFIXES if p in free_text), None)
        if first_prefix is not None:
            hits.append(f"[in unstructured: {first_prefix}...]")
    return hits

normalize_author(name: str) -> str

Normalize an author name for comparison (lowercase, stripped).

Source code in src/generators/verify_artifact_citations.py
92
93
94
def normalize_author(name: str) -> str:
    """Normalize an author name for comparison.

    Diacritics are folded to their base letters (``"Müller"`` -> ``"muller"``)
    so that the same person matches whether or not a metadata source kept the
    accents. The result is lower-cased, every run of characters outside
    ``a-z`` becomes a single space, and leading/trailing spaces are removed.

    Letters without a Unicode decomposition (e.g. ``ø``, ``ł``) and non-Latin
    scripts are still replaced by spaces.
    """
    # Local import: the module's top-level import block is not modified here.
    import unicodedata

    # NFKD splits accented letters into base letter + combining mark; dropping
    # the marks leaves the plain ASCII letter for the regex below to keep.
    decomposed = unicodedata.normalize("NFKD", name)
    without_marks = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z]+", " ", without_marks.lower()).strip()

get_author_surnames(name: str) -> set[str]

Extract likely surnames from an author name string.

Source code in src/generators/verify_artifact_citations.py
 97
 98
 99
100
101
102
103
def get_author_surnames(name: str) -> set[str]:
    """Return the name tokens that could be a surname.

    The name is passed through ``normalize_author`` and split on whitespace.
    Every token longer than one character is kept -- not only the last one --
    so given names are included too; single-letter tokens (initials) are
    dropped. A name with no usable tokens yields an empty set.
    """
    tokens = normalize_author(name).split()
    return {token for token in tokens if len(token) >= 2}

fetch_zenodo_authors(record_id: str) -> list[str]

Fetch author names from Zenodo API.

Source code in src/generators/verify_artifact_citations.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def fetch_zenodo_authors(record_id: str) -> list[str]:
    """Fetch creator names for a Zenodo record from the Zenodo REST API.

    Args:
        record_id: Record identifier, i.e. the part of the Zenodo DOI that
            follows ``zenodo.``.

    Returns:
        The non-empty ``name`` values found under ``metadata.creators``.
        Any failure (network, HTTP status, malformed JSON) is logged as a
        warning and results in an empty list.
    """
    # Quote the id so an unexpected character cannot alter the request path.
    url = "https://zenodo.org/api/records/" + urllib.parse.quote(str(record_id), safe="")
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "reprodb-verify/0.1"})
        # urlopen follows HTTP redirects on its own (some records redirect to
        # a newer version); the context manager guarantees the connection is
        # closed even if reading or decoding fails.
        with urllib.request.urlopen(req, timeout=20) as resp:
            data = json.loads(resp.read())
        creators = data.get("metadata", {}).get("creators", [])
        return [c.get("name", "") for c in creators if c.get("name")]
    except Exception as e:
        log(f"      [WARN] Zenodo author lookup failed: {e}")
        return []

fetch_figshare_authors(doi: str) -> list[str]

Fetch author names from Figshare via Crossref (Figshare API is complex).

Source code in src/generators/verify_artifact_citations.py
122
123
124
125
126
127
128
129
130
def fetch_figshare_authors(doi: str) -> list[str]:
    """Fetch author names for a Figshare DOI via Crossref.

    Crossref is queried instead of the Figshare API (which is more complex).

    Args:
        doi: The Figshare DOI to look up.

    Returns:
        One ``"<family> <given>"`` string per author entry that has at least
        one of the two parts. Any failure is logged as a warning and results
        in an empty list.
    """
    url = "https://api.crossref.org/works/" + urllib.parse.quote(doi, safe="")
    try:
        data = fetch_json(url, timeout=20)
        authors = data.get("message", {}).get("author", [])
        names = (f"{a.get('family', '')} {a.get('given', '')}".strip() for a in authors)
        # Entries with neither family nor given name carry no usable information.
        return [n for n in names if n]
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it, but
        # keep returning an empty list so the caller can carry on.
        log(f"      [WARN] Figshare author lookup (via Crossref) failed for {doi}: {e}")
        return []

fetch_crossref_authors(doi: str) -> list[str]

Fetch author names from Crossref for a citing paper.

Source code in src/generators/verify_artifact_citations.py
133
134
135
136
137
138
139
140
141
def fetch_crossref_authors(doi: str) -> list[str]:
    """Fetch author names from Crossref for a citing paper.

    Args:
        doi: DOI of the citing paper.

    Returns:
        One ``"<family> <given>"`` string per author entry that has at least
        one of the two parts. Any failure is logged as a warning and results
        in an empty list.
    """
    url = "https://api.crossref.org/works/" + urllib.parse.quote(doi, safe="")
    try:
        data = fetch_json(url, timeout=20)
        authors = data.get("message", {}).get("author", [])
        names = (f"{a.get('family', '')} {a.get('given', '')}".strip() for a in authors)
        # Entries with neither family nor given name carry no usable information.
        return [n for n in names if n]
    except Exception as e:
        # An empty result makes author-overlap checks find nothing, so make
        # the failure visible rather than swallowing it silently.
        log(f"      [WARN] Crossref author lookup failed for {doi}: {e}")
        return []

authors_overlap(artifact_authors: list[str], citing_authors: list[str]) -> tuple[bool, list[str]]

Check if any authors overlap between artifact and citing paper. Returns (has_overlap, list_of_matching_surnames).

Source code in src/generators/verify_artifact_citations.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def authors_overlap(artifact_authors: list[str], citing_authors: list[str]) -> tuple[bool, list[str]]:
    """Detect shared name tokens between artifact authors and citing authors.

    Both author lists are reduced to the union of ``get_author_surnames`` over
    their entries; tokens present on both sides and at least three characters
    long count as a match.

    Returns:
        ``(has_overlap, matches)`` where ``matches`` is the alphabetically
        sorted list of shared tokens and ``has_overlap`` is ``True`` when that
        list is non-empty.
    """

    def _tokens(names: list[str]) -> set[str]:
        collected: set[str] = set()
        for full_name in names:
            collected |= get_author_surnames(full_name)
        return collected

    artifact_tokens = _tokens(artifact_authors)
    citing_tokens = _tokens(citing_authors)

    # Tokens shorter than three characters are ignored to avoid matching initials.
    shared = sorted(tok for tok in artifact_tokens & citing_tokens if len(tok) >= 3)
    return len(shared) > 0, shared

verify_citations(data_dir: str, output_file: str = None) -> None

Verify each citing DOI actually references the artifact, not the paper.

Source code in src/generators/verify_artifact_citations.py
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
def _sibling_output_path(output_file: str, suffix: str) -> str:
    """Build a companion path for *output_file* by swapping its ``.json`` extension for *suffix*.

    When *output_file* does not end in ``.json`` the suffix is appended to the
    full name, so the companion path can never equal *output_file* itself.
    """
    base, ext = os.path.splitext(output_file)
    if ext.lower() != ".json":
        base = output_file
    return base + suffix


def _group_citing_dois(results: list[dict], verdicts: tuple[str, ...]) -> dict[str, list[str]]:
    """Map each artifact DOI to the citing DOIs whose verdict is one of *verdicts*."""
    grouped: dict[str, list[str]] = {}
    for r in results:
        if r["verdict"] in verdicts:
            grouped.setdefault(r["artifact_doi"], []).append(r["citing_doi"])
    return grouped


def _write_doi_map(path: str, doi_map: dict[str, list[str]]) -> None:
    """Write one ``<artifact doi>: <JSON list of citing DOIs>`` line per artifact, sorted."""
    with open(path, "w", encoding="utf-8") as f:
        for adoi, cdois in sorted(doi_map.items()):
            f.write(f"{adoi}: {json.dumps(sorted(cdois))}\n")


def verify_citations(data_dir: str, output_file: str | None = None) -> None:
    """Verify each citing DOI actually references the artifact, not the paper.

    Reads ``<data_dir>/assets/data/artifact_citations.json``, and for every
    artifact with citing DOIs fetches each citing paper's Crossref reference
    list and assigns one verdict per (artifact, citing DOI) pair:

    * ``SELF_CITATION``   - the citing DOI is the artifact DOI itself, or the
      references contain the artifact DOI and the author lists overlap.
    * ``GENUINE``         - the references contain the exact artifact DOI.
    * ``GENUINE_SIMILAR`` - the references contain some artifact-repository
      DOI, but not the exact one.
    * ``FALSE_POSITIVE``  - no artifact-repository DOI in the references.
    * ``UNKNOWN``         - the Crossref lookup failed.

    A summary is logged. When *output_file* is given, the detailed results are
    written there as JSON, the genuine citations to ``*_genuine.txt`` and the
    false positives (if any) to ``*_false_positives.txt`` next to it.

    Args:
        data_dir: Root directory that contains ``assets/data``.
        output_file: Optional path of the JSON results file.

    Raises:
        SystemExit: If the citations file is missing or Crossref is unreachable.
    """
    zenodo_prefix = "10.5281/zenodo."
    figshare_prefix = "10.6084/m9.figshare."

    citations_path = os.path.join(data_dir, "assets", "data", "artifact_citations.json")
    if not os.path.exists(citations_path):
        log(f"Error: {citations_path} not found.")
        sys.exit(1)

    with open(citations_path, "r", encoding="utf-8") as f:
        artifacts = json.load(f)

    # Collect artifacts that have citing DOIs
    cited_artifacts = []
    for art in artifacts:
        doi = art.get("doi")
        if not doi:
            continue
        # "or []" also covers keys that are present but null.
        citing_oa = art.get("citing_dois_openalex") or []
        citing_s2 = art.get("citing_dois_semantic_scholar") or []
        # De-duplicate, drop empty entries, and sort so runs are reproducible.
        all_citing = sorted({d for d in citing_oa + citing_s2 if d})
        if all_citing:
            cited_artifacts.append(
                {
                    "title": art.get("title", ""),
                    "doi": doi,
                    "cited_by_count": art.get("cited_by_count") or 0,
                    "citing_dois": all_citing,
                }
            )

    if not cited_artifacts:
        log("No artifacts with citing DOIs found.")
        return

    total_citing = sum(len(a["citing_dois"]) for a in cited_artifacts)
    log(f"Found {len(cited_artifacts)} artifacts with {total_citing} citing DOIs to verify")
    log("Strategy: check Crossref reference lists for artifact DOIs")
    log("=" * 70)

    # Test Crossref connectivity up front so a proxy problem fails fast
    # instead of producing one UNKNOWN verdict per citing DOI.
    log("Testing Crossref connectivity...")
    try:
        test_url = "https://api.crossref.org/works?rows=0"
        fetch_json(test_url, timeout=10)
        log("  Crossref API reachable ✓")
    except Exception as e:
        log(f"  [ERROR] Crossref API unreachable: {e}")
        log("  Make sure HTTP_PROXY/HTTPS_PROXY are set if behind a proxy.")
        sys.exit(1)

    results = []
    total_genuine = 0
    total_false_positive = 0
    total_self_citation = 0
    total_unknown = 0

    for art in cited_artifacts:
        artifact_doi = art["doi"]
        artifact_title = art["title"]
        citing_dois = art["citing_dois"]

        log(f"\n{'─' * 70}")
        log(f"Artifact: {artifact_title[:70]}")
        log(f"  DOI: {artifact_doi}")
        log(f"  Citing DOIs to check: {len(citing_dois)}")

        # Pre-fetch artifact authors (Zenodo or Figshare)
        artifact_authors = []
        artifact_doi_lower = artifact_doi.lower()
        if artifact_doi_lower.startswith(zenodo_prefix):
            # Slice by prefix length: the prefix test is case-insensitive, so a
            # case-sensitive split("zenodo.") could fail on "ZENODO." DOIs.
            record_id = artifact_doi[len(zenodo_prefix):]
            artifact_authors = fetch_zenodo_authors(record_id)
            time.sleep(0.3)
        elif artifact_doi_lower.startswith(figshare_prefix):
            artifact_authors = fetch_figshare_authors(artifact_doi)
            time.sleep(0.3)
        if artifact_authors:
            shown = ", ".join(a[:30] for a in artifact_authors[:5])
            more = "..." if len(artifact_authors) > 5 else ""
            log(f"  Artifact authors: {shown}{more}")
        else:
            log("  Artifact authors: (could not fetch)")

        for cdoi in citing_dois:
            log(f"\n  Checking: {cdoi}")

            # Special case: citing DOI is the artifact DOI itself
            if cdoi.lower() == artifact_doi_lower:
                log("    → SELF_CITATION: citing DOI is the artifact itself")
                results.append(
                    {
                        "artifact_doi": artifact_doi,
                        "artifact_title": artifact_title,
                        "citing_doi": cdoi,
                        "verdict": "SELF_CITATION",
                        "reason": "citing DOI matches artifact DOI",
                    }
                )
                total_self_citation += 1
                continue

            # Fetch Crossref reference list and metadata
            refs = fetch_crossref_references(cdoi)
            time.sleep(0.5)  # Rate limit courtesy

            if refs is None:
                log("    → UNKNOWN: Crossref lookup failed")
                results.append(
                    {
                        "artifact_doi": artifact_doi,
                        "artifact_title": artifact_title,
                        "citing_doi": cdoi,
                        "verdict": "UNKNOWN",
                        "reason": "Crossref lookup failed",
                    }
                )
                total_unknown += 1
                continue

            log(f"    Crossref refs: {len(refs)} total")

            # Check if the exact artifact DOI appears in references
            has_exact = references_contain_artifact_doi(refs, artifact_doi)

            # Also check if ANY artifact-repo DOI appears (zenodo/figshare)
            any_artifact_dois = references_contain_any_artifact_doi(refs)

            if has_exact:
                # Artifact DOI is in the reference list — now check for self-citation
                # by comparing authors of artifact and citing paper
                citing_authors = fetch_crossref_authors(cdoi)
                time.sleep(0.3)
                if artifact_authors:
                    has_overlap, common = authors_overlap(artifact_authors, citing_authors)
                else:
                    has_overlap, common = False, []

                if has_overlap:
                    log(f"    → SELF_CITATION: refs contain artifact DOI, but authors overlap: {common}")
                    verdict = "SELF_CITATION"
                    reason = f"paper cites its own artifact (shared authors: {', '.join(common)})"
                    total_self_citation += 1
                else:
                    log(f"    → GENUINE: reference list contains artifact DOI {artifact_doi}")
                    verdict = "GENUINE"
                    reason = f"Crossref references contain artifact DOI {artifact_doi}"
                    total_genuine += 1
            elif any_artifact_dois:
                log(f"    → GENUINE_SIMILAR: has artifact DOIs {any_artifact_dois} but not exact match")
                verdict = "GENUINE_SIMILAR"
                reason = f"references contain other artifact DOIs: {any_artifact_dois}"
                total_genuine += 1
            else:
                # NOTE(review): an empty reference list (Crossref returned no
                # "reference" entries for this DOI) also ends up here as
                # FALSE_POSITIVE; consider whether that case should be UNKNOWN.
                log(f"    → FALSE_POSITIVE: no artifact DOIs found in {len(refs)} references")
                verdict = "FALSE_POSITIVE"
                reason = f"no artifact-repository DOIs in {len(refs)} Crossref references"
                total_false_positive += 1

            results.append(
                {
                    "artifact_doi": artifact_doi,
                    "artifact_title": artifact_title,
                    "citing_doi": cdoi,
                    "crossref_ref_count": len(refs),
                    "has_exact_artifact_doi": has_exact,
                    "artifact_dois_found": any_artifact_dois,
                    "verdict": verdict,
                    "reason": reason,
                }
            )

    # Summary
    log("\n" + "=" * 70)
    log("VERIFICATION SUMMARY")
    log("=" * 70)
    log(f"Total citing DOIs verified: {len(results)}")
    log(f"  GENUINE (artifact DOI in refs):  {total_genuine}")
    log(f"  FALSE_POSITIVE (no artifact DOI):{total_false_positive}")
    log(f"  SELF_CITATION:                   {total_self_citation}")
    log(f"  UNKNOWN (lookup failed):         {total_unknown}")

    genuine_verdicts = ("GENUINE", "GENUINE_SIMILAR")

    # Group by verdict; each line reads "<artifact DOI> ← <citing DOI>".
    log("\n--- GENUINE artifact citations ---")
    for r in results:
        if r["verdict"] in genuine_verdicts:
            log(f"  {r['artifact_doi']} ← {r['citing_doi']}")

    log("\n--- FALSE POSITIVES (citing paper, not artifact) ---")
    for r in results:
        if r["verdict"] == "FALSE_POSITIVE":
            log(f"  {r['artifact_doi']} ← {r['citing_doi']}")
            log(f"    ({r['reason']})")

    log("\n--- SELF CITATIONS ---")
    for r in results:
        if r["verdict"] == "SELF_CITATION":
            log(f"  {r['artifact_doi']} ← {r['citing_doi']}")

    log("\n--- UNKNOWN ---")
    for r in results:
        if r["verdict"] == "UNKNOWN":
            log(f"  {r['artifact_doi']} ← {r['citing_doi']}")
            log(f"    ({r['reason']})")

    if not output_file:
        return

    # Write results to file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    log(f"\nDetailed results written to: {output_file}")

    # Write verified citing dois file (genuine only); written even when empty.
    verified_output = _sibling_output_path(output_file, "_genuine.txt")
    _write_doi_map(verified_output, _group_citing_dois(results, genuine_verdicts))
    log(f"Verified genuine citations written to: {verified_output}")

    # Write false positives list; skipped entirely when there are none.
    fp_map = _group_citing_dois(results, ("FALSE_POSITIVE",))
    if fp_map:
        fp_output = _sibling_output_path(output_file, "_false_positives.txt")
        _write_doi_map(fp_output, fp_map)
        log(f"False positives written to: {fp_output}")