Skip to content

generate_repo_stats

src.generators.generate_repo_stats

Generate repository statistics (stars, forks, etc.) for the website.

Collects stats from GitHub/Zenodo/Figshare for all scraped artifacts and writes:

- `_data/repo_stats.yml` — per-conference/year aggregates (for the website)
- `assets/data/repo_stats_detail.json` — per-repo detail (for analysis/figures)

Usage

python generate_repo_stats.py --conf_regex '.*20[12][0-9]' --output_dir ../reprodb.github.io

collect_stats_for_results(results, url_keys=None)

Collect repository stats for all artifacts.

Source code in src/generators/generate_repo_stats.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def collect_stats_for_results(results, url_keys=None):
    """Collect repository stats for all artifacts.

    Expands list-valued URL fields into one artifact entry per URL, checks
    which URLs still exist, then fetches GitHub/Zenodo/Figshare stats for
    every unique URL using a small thread pool.

    Args:
        results: Mapping of conference-year key -> list of artifact dicts.
        url_keys: Single-valued URL field names to scan. Defaults to the
            standard set of repository/artifact URL keys.

    Returns:
        List of per-repo stat dicts, each tagged with ``conference``,
        ``year``, ``title``, ``url`` and ``source``. Empty list when no
        URL fields are present in the data.
    """
    if url_keys is None:
        url_keys = ["repository_url", "artifact_url", "github_url", "second_repository_url", "bitbucket_url"]

    # First pass: expand ALL URLs from list-valued fields into separate
    # artifact entries, so stats are collected for every artifact location,
    # not just the first.
    expanded_artifacts = {}
    for conf_year, artifacts in results.items():
        expanded_artifacts[conf_year] = []
        for artifact in artifacts:
            # All URLs for this artifact, grouped by the flat key they map to.
            all_urls_by_key = {}

            # Single-valued URL fields.
            for url_key in url_keys:
                if artifact.get(url_key):
                    all_urls_by_key[url_key] = [artifact[url_key]]

            # List-valued URL fields (artifact_urls, additional_urls, ...).
            for list_key in ["artifact_urls", "additional_urls"]:
                if isinstance(artifact.get(list_key), list):
                    # Map back to the singular key: artifact_urls -> artifact_url.
                    # removesuffix (not rstrip) so exactly one trailing "s" is
                    # dropped rather than every trailing "s" character.
                    flat_key = list_key.removesuffix("s")
                    for url in artifact[list_key]:
                        if isinstance(url, str) and url:
                            bucket = all_urls_by_key.setdefault(flat_key, [])
                            if url not in bucket:
                                bucket.append(url)

            if all_urls_by_key:
                # One expanded entry per (key, url) pair; the list-valued
                # fields are dropped so downstream code only sees flat keys.
                for url_key, urls in all_urls_by_key.items():
                    for url in urls:
                        artifact_copy = {
                            k: v for k, v in artifact.items() if k not in ["artifact_urls", "additional_urls"]
                        }
                        artifact_copy[url_key] = url
                        expanded_artifacts[conf_year].append(artifact_copy)
            else:
                # No URLs found: keep the original artifact unchanged.
                expanded_artifacts[conf_year].append(artifact)

    results = expanded_artifacts

    # Filter url_keys to only those that actually appear in the data.
    present_keys = set()
    for artifacts in results.values():
        for artifact in artifacts:
            for key in url_keys:
                if artifact.get(key):
                    present_keys.add(key)
    url_keys = [k for k in url_keys if k in present_keys]
    if not url_keys:
        logger.warning("  Warning: No URL keys found in artifact data. No repository stats to collect.")
        return []
    logger.info(f"  Scanning URL fields: {', '.join(url_keys)}")

    # Check which URLs actually exist (sets the "<key>_exists" flags).
    results, _, _ = check_artifact_exists(results, url_keys)

    # Build a deduplicated list of (url, conf_name, year, title) fetch jobs.
    # Dedup keys are trailing-slash-normalized, so a URL shared by several
    # artifacts is fetched once, attributed to the first occurrence.
    jobs = []
    seen_urls = set()
    for conf_year, artifacts in results.items():
        conf_name, year = extract_conference_name(conf_year)
        if year is None:
            continue
        for artifact in artifacts:
            for url_key in url_keys:
                url = artifact.get(url_key, "")
                exists_key = f"{url_key}_exists"
                if not artifact.get(exists_key, False) or not url:
                    continue
                url_normalized = url.rstrip("/")
                if url_normalized in seen_urls:
                    continue
                seen_urls.add(url_normalized)
                jobs.append((url, conf_name, year, artifact.get("title", "Unknown")))

    total_jobs = len(jobs)
    logger.info(f"  Collecting stats for {total_jobs} unique URLs (4 workers)")

    def _fetch_stats(url):
        """Fetch stats for a single URL (thread-safe via disk cache)."""
        try:
            if "github" in url:
                return github_stats(url), "github"
            if "zenodo" in url:
                return zenodo_stats(url), "zenodo"
            if "figshare" in url:
                return figshare_stats(url), "figshare"
        except Exception as e:
            logger.error(f"  Error collecting stats for {url}: {e}")
        return None, "unknown"

    all_stats = []
    collected = 0
    with ThreadPoolExecutor(max_workers=4) as pool:
        future_to_job = {pool.submit(_fetch_stats, url): (url, conf, yr, title) for url, conf, yr, title in jobs}
        for i, future in enumerate(as_completed(future_to_job), 1):
            url, conf_name, year, title = future_to_job[future]
            stats, source = future.result()
            if stats:
                collected += 1
                entry = {
                    "conference": conf_name,
                    "year": year,
                    "title": title,
                    "url": url,
                    "source": source,
                }
                entry.update(stats)
                all_stats.append(entry)
            # Periodic progress log, plus a final line at the last job.
            if i % 100 == 0 or i == total_jobs:
                logger.info(f"  Progress: {i}/{total_jobs} URLs fetched, {collected} stats collected")

    return all_stats

aggregate_stats(all_stats)

Aggregate per-conference and per-year statistics.

Source code in src/generators/generate_repo_stats.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
def aggregate_stats(all_stats):
    """Aggregate per-conference and per-year statistics.

    Args:
        all_stats: List of per-repo stat dicts as produced by
            ``collect_stats_for_results`` (each tagged with ``conference``,
            ``year``, ``source`` plus source-specific counters).

    Returns:
        Dict with keys ``overall`` (global totals/averages plus a
        ``last_updated`` UTC timestamp), ``by_conference`` and ``by_year``
        (sorted lists of aggregate dicts), and ``all_github_repos``
        (flat per-repo detail for every GitHub entry).
    """
    from datetime import timezone  # local import: only needed for the timestamp

    # Per-conference aggregates
    by_conf = defaultdict(
        lambda: {
            "github_repos": 0,
            "total_stars": 0,
            "total_forks": 0,
            "max_stars": 0,
            "max_forks": 0,
            "zenodo_repos": 0,
            "total_views": 0,
            "total_downloads": 0,
            "years": defaultdict(lambda: {"github_repos": 0, "stars": 0, "forks": 0}),
            "all_github_entries": [],
        }
    )

    by_year = defaultdict(
        lambda: {
            "github_repos": 0,
            "total_stars": 0,
            "total_forks": 0,
            "max_stars": 0,
            "max_forks": 0,
            "zenodo_repos": 0,
            "total_views": 0,
            "total_downloads": 0,
        }
    )

    overall = {
        "github_repos": 0,
        "total_stars": 0,
        "total_forks": 0,
        "max_stars": 0,
        "max_forks": 0,
        "zenodo_repos": 0,
        "total_views": 0,
        "total_downloads": 0,
        "avg_stars": 0,
        "avg_forks": 0,
    }

    for s in all_stats:
        conf = s["conference"]
        year = s["year"]

        if s["source"] == "github":
            # "or 0" guards against explicit None values from the API.
            stars = s.get("github_stars", 0) or 0
            forks = s.get("github_forks", 0) or 0

            by_conf[conf]["github_repos"] += 1
            by_conf[conf]["total_stars"] += stars
            by_conf[conf]["total_forks"] += forks
            by_conf[conf]["max_stars"] = max(by_conf[conf]["max_stars"], stars)
            by_conf[conf]["max_forks"] = max(by_conf[conf]["max_forks"], forks)
            by_conf[conf]["years"][year]["github_repos"] += 1
            by_conf[conf]["years"][year]["stars"] += stars
            by_conf[conf]["years"][year]["forks"] += forks
            by_conf[conf]["all_github_entries"].append(
                {
                    "title": s.get("title", "Unknown"),
                    "url": s.get("url", ""),
                    "conference": conf,
                    "year": year,
                    "area": _conf_area(conf),
                    "stars": stars,
                    "forks": forks,
                    # Truncate description to keep the detail JSON compact.
                    "description": (s.get("description", "") or "")[:120],
                    "language": s.get("language", "") or "",
                    "name": s.get("name", ""),
                    "pushed_at": s.get("pushed_at", ""),
                }
            )

            by_year[year]["github_repos"] += 1
            by_year[year]["total_stars"] += stars
            by_year[year]["total_forks"] += forks
            by_year[year]["max_stars"] = max(by_year[year]["max_stars"], stars)
            by_year[year]["max_forks"] = max(by_year[year]["max_forks"], forks)

            overall["github_repos"] += 1
            overall["total_stars"] += stars
            overall["total_forks"] += forks
            overall["max_stars"] = max(overall["max_stars"], stars)
            overall["max_forks"] = max(overall["max_forks"], forks)

        elif s["source"] == "zenodo":
            views = s.get("zenodo_views", 0) or 0
            downloads = s.get("zenodo_downloads", 0) or 0

            by_conf[conf]["zenodo_repos"] += 1
            by_conf[conf]["total_views"] += views
            by_conf[conf]["total_downloads"] += downloads

            by_year[year]["zenodo_repos"] += 1
            by_year[year]["total_views"] += views
            by_year[year]["total_downloads"] += downloads

            overall["zenodo_repos"] += 1
            overall["total_views"] += views
            overall["total_downloads"] += downloads

    if overall["github_repos"] > 0:
        overall["avg_stars"] = round(overall["total_stars"] / overall["github_repos"], 1)
        overall["avg_forks"] = round(overall["total_forks"] / overall["github_repos"], 1)

    # Convert to serializable format
    conf_stats = []
    for conf_name in sorted(by_conf.keys()):
        d = by_conf[conf_name]
        avg_stars = round(d["total_stars"] / d["github_repos"], 1) if d["github_repos"] > 0 else 0
        avg_forks = round(d["total_forks"] / d["github_repos"], 1) if d["github_repos"] > 0 else 0
        year_list = []
        for yr in sorted(d["years"].keys()):
            yd = d["years"][yr]
            year_list.append(
                {
                    "year": yr,
                    "github_repos": yd["github_repos"],
                    "stars": yd["stars"],
                    "forks": yd["forks"],
                    "avg_stars": round(yd["stars"] / yd["github_repos"], 1) if yd["github_repos"] > 0 else 0,
                    "avg_forks": round(yd["forks"] / yd["github_repos"], 1) if yd["github_repos"] > 0 else 0,
                }
            )
        # Top 5 repos by stars
        top_repos = sorted(d["all_github_entries"], key=lambda x: x["stars"], reverse=True)[:5]
        conf_stats.append(
            {
                "name": conf_name,
                "github_repos": d["github_repos"],
                "total_stars": d["total_stars"],
                "total_forks": d["total_forks"],
                "avg_stars": avg_stars,
                "avg_forks": avg_forks,
                "max_stars": d["max_stars"],
                "max_forks": d["max_forks"],
                # Previously aggregated but never serialized — now emitted so
                # the website can show Zenodo activity per conference too.
                "zenodo_repos": d["zenodo_repos"],
                "total_views": d["total_views"],
                "total_downloads": d["total_downloads"],
                "years": year_list,
                "top_repos": top_repos,
            }
        )

    year_stats = []
    for yr in sorted(by_year.keys()):
        d = by_year[yr]
        avg_stars = round(d["total_stars"] / d["github_repos"], 1) if d["github_repos"] > 0 else 0
        avg_forks = round(d["total_forks"] / d["github_repos"], 1) if d["github_repos"] > 0 else 0
        year_stats.append(
            {
                "year": yr,
                "github_repos": d["github_repos"],
                "total_stars": d["total_stars"],
                "total_forks": d["total_forks"],
                "avg_stars": avg_stars,
                "avg_forks": avg_forks,
                "max_stars": d["max_stars"],
                "max_forks": d["max_forks"],
                # Previously aggregated but never serialized (see above).
                "zenodo_repos": d["zenodo_repos"],
                "total_views": d["total_views"],
                "total_downloads": d["total_downloads"],
            }
        )

    # Use timezone-aware UTC: the label says "UTC", so the time must be UTC
    # (naive datetime.now() is local time and mislabels the timestamp).
    overall["last_updated"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    # Per-repo detail: all GitHub entries with individual star/fork counts
    all_github_detail = []
    for conf_name in sorted(by_conf.keys()):
        all_github_detail.extend(by_conf[conf_name]["all_github_entries"])

    return {
        "overall": overall,
        "by_conference": conf_stats,
        "by_year": year_stats,
        "all_github_repos": all_github_detail,
    }