Skip to content

dblp_extract

src.utils.dblp_extract

Pre-extract structured data from the local DBLP XML dump.

Parses dblp.xml.gz in a single pass and writes JSON lookup files that every downstream pipeline step can load instead of hitting the DBLP API.

Outputs (under output_dir): .cache/dblp_extracted/papers_by_venue.json {conf: {year_str: [{title, authors, doi, dblp_key}]}}

.cache/dblp_extracted/affiliations.json {author_name: affiliation}

The extraction is cached: if the DBLP file has not changed (same mtime) the previous JSON files are reused.

Usage

python -m src.utils.dblp_extract --dblp_file data/dblp/dblp.xml.gz

NOTE — DBLP API policy: We deliberately avoid the DBLP web API (https://dblp.org/search/…). The local XML dump contains the same data and avoids rate-limiting issues that grow worse as the number of tracked conferences increases. All new code should use the extracted JSON files produced by this module. Do NOT add new DBLP API calls.

extract_dblp(dblp_file: str) -> tuple[str, str]

Parse dblp.xml.gz and write JSON lookup files.

Returns (papers_path, affiliations_path).

Source code in src/utils/dblp_extract.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def extract_dblp(dblp_file: str) -> tuple[str, str]:
    """Parse dblp.xml.gz and write JSON lookup files.

    Performs a single streaming pass over the gzipped DBLP dump with
    lxml's ``iterparse``, collecting conference papers (keyed by venue
    and year) and author affiliations (from ``<www>`` person records).

    The extraction is cached: if the recorded mtime of *dblp_file* has
    not changed since the last run, the existing JSON files are reused.

    Args:
        dblp_file: Path to the gzipped DBLP XML dump (``dblp.xml.gz``).

    Returns (papers_path, affiliations_path).
    """
    extract_dir = _extract_dir()
    os.makedirs(extract_dir, exist_ok=True)

    papers_path = os.path.join(extract_dir, "papers_by_venue.json")
    affiliations_path = os.path.join(extract_dir, "affiliations.json")

    # Re-use cached files if the DBLP dump hasn't changed
    # NOTE(review): warning level for a normal cache hit seems loud —
    # confirm whether info level was intended.
    if _is_fresh(dblp_file, extract_dir):
        logger.warning("DBLP extraction cache is fresh — skipping parse")
        return papers_path, affiliations_path

    logger.info(f"Parsing DBLP XML ({dblp_file}) …")

    # {conf -> {year_str -> [paper_dict]}}
    papers: defaultdict[str, defaultdict[str, list[dict]]] = defaultdict(lambda: defaultdict(list))
    # {author_name -> affiliation}
    affiliations: dict[str, str] = {}

    # _PatchedDTDStream presumably rewrites the DTD reference so lxml can
    # resolve entities from the local dump — TODO confirm its contract.
    dblp_stream = _PatchedDTDStream(GzipFile(filename=dblp_file))
    iteration = 0

    # lxml-specific iterparse options: `tag=` filters to the three record
    # types we care about; `load_dtd` is required to expand DBLP's many
    # character entities; `recover`/`huge_tree` tolerate the dump's size.
    for _, elem in ET.iterparse(
        dblp_stream,
        events=("end",),
        tag=("inproceedings", "article", "www"),
        load_dtd=True,
        recover=True,
        huge_tree=True,
    ):
        # --- Person records: extract affiliations ---
        if elem.tag == "www":
            # A person record may list several name variants as <author>.
            authors = [a.text for a in elem.findall("author") if a.text]
            affil = None
            for note in elem.findall("note"):
                if note.get("type") == "affiliation" and note.text:
                    affil = note.text.strip()
                    break  # take the first (most recent) affiliation
            if affil:
                # First-writer-wins: never overwrite an existing mapping.
                for name in authors:
                    if name not in affiliations:
                        affiliations[name] = affil
            elem.clear()
            continue

        # --- Papers ---
        # Conference papers carry <booktitle>; journal articles <journal>.
        booktitle = elem.findtext("booktitle") or elem.findtext("journal") or ""
        conf = venue_to_conference(booktitle)
        if conf:
            year_str = elem.findtext("year")
            if year_str:
                title = elem.findtext("title") or ""
                # Strip trailing period (DBLP convention)
                title = title.rstrip(".")

                # Extract DOI from <ee> elements (first doi.org link wins)
                doi = ""
                for ee in elem.findall("ee"):
                    if ee.text and "doi.org/" in ee.text:
                        doi = ee.text.split("doi.org/")[-1]
                        break

                authors = [a.text for a in elem.findall("author") if a.text]
                dblp_key = elem.get("key", "")

                papers[conf][year_str].append(
                    {
                        "title": title,
                        "authors": authors,
                        "doi": doi,
                        "dblp_key": dblp_key,
                    }
                )

        iteration += 1
        if iteration % 2_000_000 == 0:
            logger.info(f"  … {iteration // 1_000_000}M elements")
        # Free the element's children to bound memory.
        # NOTE(review): preceding siblings of the root are not deleted, so
        # lxml may still retain cleared stubs for the whole dump — confirm
        # peak memory is acceptable.
        elem.clear()

    # The patched stream wraps the GzipFile as `_raw`; close it explicitly
    # since iterparse does not own the underlying file object.
    dblp_stream._raw.close()

    total_papers = sum(len(plist) for conf_years in papers.values() for plist in conf_years.values())
    logger.info(
        f"  Done — {iteration} elements, {total_papers} conference papers, {len(affiliations)} author affiliations"
    )

    # Write JSON files (ensure_ascii=False preserves Unicode characters)
    with open(papers_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, separators=(",", ":"), ensure_ascii=False)
    with open(affiliations_path, "w", encoding="utf-8") as f:
        json.dump(affiliations, f, separators=(",", ":"), ensure_ascii=False)

    # Record the DBLP file mtime for freshness checks
    with open(_mtime_file(extract_dir), "w") as f:
        f.write(str(os.path.getmtime(dblp_file)))

    sz_p = os.path.getsize(papers_path) // 1024 // 1024
    sz_a = os.path.getsize(affiliations_path) // 1024 // 1024
    logger.info(f"  → {papers_path} ({sz_p} MB)")
    logger.info(f"  → {affiliations_path} ({sz_a} MB)")

    return papers_path, affiliations_path

load_papers_by_venue(repo_root: str | None = None) -> dict[str, dict[str, list[dict]]]

Load the pre-extracted papers index.

Returns dict: conf (str) → year_str (str) → list of paper dicts. Each paper dict has keys: title, authors, doi, dblp_key.

Source code in src/utils/dblp_extract.py
239
240
241
242
243
244
245
246
247
248
249
250
def load_papers_by_venue(repo_root: str | None = None) -> dict[str, dict[str, list[dict]]]:
    """Load the pre-extracted papers index.

    Returns dict: conf (str) → year_str (str) → list of paper dicts.
    Each paper dict has keys: title, authors, doi, dblp_key.
    Returns an empty dict when the extraction has not been run yet.
    """
    index_path = os.path.join(_extract_dir(repo_root), "papers_by_venue.json")
    if not os.path.exists(index_path):
        return {}
    with open(index_path) as fh:
        index: dict[str, dict[str, list[dict]]] = json.load(fh)
    return index

load_affiliations(repo_root: str | None = None) -> dict[str, str]

Load the pre-extracted author → affiliation mapping.

Returns dict: author_name (str) → affiliation (str).

Source code in src/utils/dblp_extract.py
253
254
255
256
257
258
259
260
261
262
263
def load_affiliations(repo_root: str | None = None) -> dict[str, str]:
    """Load the pre-extracted author → affiliation mapping.

    Returns dict: author_name (str) → affiliation (str); empty dict
    when the extraction file does not exist yet.
    """
    mapping_path = os.path.join(_extract_dir(repo_root), "affiliations.json")
    if not os.path.exists(mapping_path):
        return {}
    with open(mapping_path) as fh:
        mapping: dict[str, str] = json.load(fh)
    return mapping

find_affiliation(name, repo_root=None)

Look up an author's affiliation from the pre-extracted DBLP data.

Tries exact match, then case-insensitive. Returns the affiliation string or None.

Source code in src/utils/dblp_extract.py
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
def find_affiliation(name, repo_root=None):
    """Look up an author's affiliation from the pre-extracted DBLP data.

    Tries exact match, then case-insensitive.  Returns the affiliation
    string or *None*.
    """
    global _affiliations_cache, _affiliations_lower_cache
    # Lazily build the module-level caches on first use.
    if _affiliations_cache is None:
        _affiliations_cache = load_affiliations(repo_root)
        _affiliations_lower_cache = {k.lower(): v for k, v in _affiliations_cache.items()}
    if not _affiliations_cache:
        return None
    # Exact match first (EAFP: single lookup).
    try:
        return _affiliations_cache[name]
    except KeyError:
        pass
    # Case-insensitive fallback
    assert _affiliations_lower_cache is not None
    return _affiliations_lower_cache.get(name.lower())

papers_for_venue_year(conf: str, year: int, repo_root: str | None = None) -> list[dict]

Convenience: return list of paper dicts for a conference/year.

Falls back to empty list if data is not available.

Source code in src/utils/dblp_extract.py
291
292
293
294
295
296
297
def papers_for_venue_year(conf: str, year: int, repo_root: str | None = None) -> list[dict]:
    """Convenience: return list of paper dicts for a conference/year.

    Falls back to empty list if data is not available.
    """
    years_for_conf = load_papers_by_venue(repo_root).get(conf, {})
    return years_for_conf.get(str(year), [])

paper_count_by_venue_year(repo_root=None)

Return dict: (conf, year_int) → paper_count.

Source code in src/utils/dblp_extract.py
300
301
302
303
304
305
306
307
def paper_count_by_venue_year(repo_root=None):
    """Return dict: (conf, year_int) → paper_count."""
    index = load_papers_by_venue(repo_root)
    return {
        (conf, int(year_str)): len(paper_list)
        for conf, by_year in index.items()
        for year_str, paper_list in by_year.items()
    }