Skip to content

generate_artifact_sources_timeline

src.generators.generate_artifact_sources_timeline

Generate artifact storage source statistics over time.

Tracks how GitHub, Zenodo, and other platforms have changed over the years for artifact evaluation repositories.

Usage

python generate_artifact_sources_timeline.py --output_dir ../acm-rep-2026-paper/reproducibility

extract_source(url)

Determine the source of an artifact from its URL.

Source code in src/generators/generate_artifact_sources_timeline.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def extract_source(url):
    """Determine the hosting platform of an artifact from its URL.

    Args:
        url: Artifact URL string; may be None or empty.

    Returns:
        A platform label such as "GitHub" or "Zenodo"; "DOI" for a
        doi.org link whose prefix cannot be resolved; "Other" for an
        unrecognised host; or None when no URL was given.
    """
    if not url:
        return None

    url_lower = url.lower()

    # Ordered substring patterns: earlier entries take precedence when a
    # URL happens to match more than one (e.g. github.com wins over any
    # later pattern).  The original duplicate "zenodo.org" check was
    # redundant — "zenodo" already covers it.
    patterns = [
        (("github.com", "github.io"), "GitHub"),
        (("zenodo",), "Zenodo"),
        (("figshare",), "Figshare"),
        (("osf.io",), "OSF"),
        (("gitlab",), "GitLab"),
        (("bitbucket",), "Bitbucket"),
        (("archive.org", "arxiv"), "Archive"),
        (("dataverse",), "Dataverse"),
    ]
    for needles, label in patterns:
        if any(needle in url_lower for needle in needles):
            return label

    if "doi.org" in url_lower:
        # Try to resolve the DOI prefix to the actual repository;
        # fall back to the generic "DOI" bucket when unknown.
        resolved = _resolve_doi_prefix(url_lower)
        return resolved if resolved else "DOI"
    return "Other"

get_artifact_url(artifact)

Extract the first valid URL from an artifact.

Source code in src/generators/generate_artifact_sources_timeline.py
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def get_artifact_url(artifact):
    """Return the first usable URL found in an artifact record, or None.

    Prefers the canonical ``artifact_urls`` list (new format) and falls
    back to a set of legacy single-URL fields for older records.
    """
    # New format: the canonical list of URLs takes priority.
    canonical = artifact.get("artifact_urls", [])
    if isinstance(canonical, list):
        for candidate in canonical:
            normalised = _normalise_url(candidate)
            if normalised:
                return normalised

    # Older records stored one URL under one of several legacy keys.
    legacy_keys = ["repository_url", "artifact_url", "github_url", "second_repository_url", "bitbucket_url"]
    for key in legacy_keys:
        raw = artifact.get(key, "")
        if isinstance(raw, list):
            raw = raw[0] if raw else ""
        normalised = _normalise_url(raw)
        if normalised:
            return normalised

    return None

extract_year_from_confname(conf_year_str)

Extract year from conference name like 'osdi2024' -> 2024. Returns None if no year found.

Source code in src/generators/generate_artifact_sources_timeline.py
 99
100
101
102
103
104
105
106
107
def extract_year_from_confname(conf_year_str):
    """
    Extract year from conference name like 'osdi2024' -> 2024.
    Returns None if no year found.
    """
    # A "year" is the trailing run of exactly four digits, if present.
    m = re.search(r"\d{4}$", conf_year_str)
    return int(m.group(0)) if m else None

count_sources_by_year(all_results: dict[str, list[dict]]) -> dict[int, dict[str, int]]

Count artifacts by source for each year.

Returns dict: year -> {source: count}

Source code in src/generators/generate_artifact_sources_timeline.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def count_sources_by_year(all_results: dict[str, list[dict]]) -> dict[int, dict[str, int]]:
    """
    Count artifacts by source for each year.

    Args:
        all_results: Mapping of conference-year keys (e.g. "osdi2024")
            to lists of artifact records.

    Returns dict: year -> {source: count}

    Conference keys without a trailing four-digit year are skipped, as
    are artifacts whose source cannot be determined (no URL at all).
    """
    # year -> source -> count; defaultdicts avoid explicit key setup.
    stats = defaultdict(lambda: defaultdict(int))

    for conf_year, artifacts in all_results.items():
        year = extract_year_from_confname(conf_year)
        if year is None:  # key carries no recognisable year
            continue

        for artifact in artifacts:
            url = get_artifact_url(artifact)
            source = extract_source(url)
            if source:
                stats[year][source] += 1

    return dict(stats)

generate_csv(output_dir)

Generate CSV file with artifact sources by year.

Source code in src/generators/generate_artifact_sources_timeline.py
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def generate_csv(output_dir):
    """Write fig_sources_over_time.csv summarising artifact sources per year.

    Fetches AE results for both communities, tallies sources per year,
    writes the CSV into *output_dir*, logs a fixed-width summary table,
    and returns the path of the generated file.
    """

    # Pull results for both the systems and the security communities and
    # merge them into a single conference-year -> artifacts mapping.
    logger.info("Fetching artifact evaluation results...")
    merged = dict(get_ae_results(r".*20[12][0-9]", "sys"))
    merged.update(get_ae_results(r".*20[12][0-9]", "sec"))

    per_year = count_sources_by_year(merged)
    ordered_years = sorted(per_year)

    # Union of every source label observed in any year.
    seen_sources = set()
    for counts in per_year.values():
        seen_sources |= set(counts)

    # Column order: GitHub, Zenodo, then Other, then the rest alphabetically.
    precedence = {"GitHub": 0, "Zenodo": 1, "Other": 2}
    sources = sorted(seen_sources, key=lambda label: (precedence.get(label, 3), label))

    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, "fig_sources_over_time.csv")

    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["Year"] + sources)
        writer.writeheader()
        for year in ordered_years:
            row = {source: per_year[year].get(source, 0) for source in sources}
            row["Year"] = year
            writer.writerow(row)

    logger.info(f"✓ Generated {csv_path}")

    # Echo the table to the log, one right-aligned column per source.
    logger.info("\nArtifact sources over time:")
    logger.info(f"{'Year':<6} {' '.join(f'{s:>10}' for s in sources)}")
    for year in ordered_years:
        counts = [str(per_year[year].get(s, 0)) for s in sources]
        logger.info(f"{year:<6} {' '.join(f'{c:>10}' for c in counts)}")

    return csv_path