ci: enforce mappable co-author credit

Add AUTHOR_MAP plus a lightweight co-author trailer checker so harvested commits use numeric GitHub noreply identities, reject bot/tool trailers, and require machine-readable credit when a commit says it was harvested from a PR.

Also normalize the local unpushed v0.9 harvest range so existing contributor authors/trailers for HUQIANTAO, Implementist, jrcjrcc, xyuai, cyq1017, idling11, and shenjackyuanjie use GitHub-mappable identities before the branch is published.

Validation: python3 scripts/check-coauthor-trailers.py --author-map .github/AUTHOR_MAP --range origin/main..HEAD --check-authors; python3 -m py_compile scripts/check-coauthor-trailers.py; ruby -e 'require "yaml"; YAML.load_file(".github/workflows/ci.yml")'; git diff --check; negative in-process validation for raw email, missing harvested credit, and bot author cases.
This commit is contained in:
Hunter B
2026-06-03 21:07:33 -07:00
parent fb86737a8c
commit 002f8f0ba1
8 changed files with 369 additions and 1 deletions
+245
View File
@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""Validate that harvested contributor credit is GitHub-mappable.
The check is intentionally scoped to new commits. Historical commits may carry
raw or local emails, but new harvested commits should use GitHub's numeric
`id+login@users.noreply.github.com` address so co-author credit lands in the
contributor graph.
"""
from __future__ import annotations
import argparse
import re
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
# Parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]
DEFAULT_AUTHOR_MAP = ROOT / ".github" / "AUTHOR_MAP"

# A complete "Name <email>" identity occupying the whole string.
IDENTITY_RE = re.compile(r"^\s*(?P<name>.+?)\s*<(?P<email>[^<>]+)>\s*$")

# "<digits>+<login>@users.noreply.github.com", matched case-insensitively.
CANONICAL_NOREPLY_RE = re.compile(
    r"^[0-9]+\+[^@\s]+@users\.noreply\.github\.com$", re.IGNORECASE
)

# One "Co-authored-by: Name <email>" trailer line inside a commit message.
# Whitespace is matched with `[^\S\n]` (any whitespace except a newline) and
# the email excludes newlines, so a match can never spill onto the next line.
# With plain `\s*`, an empty "Co-authored-by:" line followed by e.g.
# "Signed-off-by: X <x@y>" was wrongly read as a co-author trailer.
COAUTHOR_RE = re.compile(
    r"^Co-authored-by:[^\S\n]*(?P<name>.*?)[^\S\n]*<(?P<email>[^<>\n]+)>[^\S\n]*$",
    re.IGNORECASE | re.MULTILINE,
)

# Free-text harvest marker; the single capture group is the GitHub login.
HARVEST_RE = re.compile(r"Harvested from PR #[0-9]+ by @([A-Za-z0-9-]+)")

# Exact (lower-cased) email addresses treated as bot/tool identities.
BOT_EMAILS = {
    "codex@local",
    "codex@example.com",
    "cursoragent@cursor.com",
    "noreply@anthropic.com",
}

# Lower-cased names treated as bot/tool identities, either as the whole name
# or as its first space-separated word (see is_bot_identity).
BOT_NAMES = ("claude", "codex", "cursor")
@dataclass(frozen=True)
class Identity:
    """An immutable name/email pair for a contributor."""

    name: str
    email: str

    def author(self) -> str:
        """Return the identity formatted as ``Name <email>``."""
        return "{} <{}>".format(self.name, self.email)

    def trailer(self) -> str:
        """Return the identity formatted as a ``Co-authored-by:`` trailer line."""
        return "Co-authored-by: " + self.author()
@dataclass(frozen=True)
class Commit:
    """Immutable record of one commit as read from ``git log``.

    The field order matters: ``git_log`` builds instances positionally from
    the NUL-separated fields of each log record.
    """

    sha: str  # commit hash (git `%H`)
    author_name: str  # author name (git `%an`)
    author_email: str  # author email (git `%ae`)
    subject: str  # subject line (git `%s`)
    body: str  # raw message, subject included (git `%B`)
def norm_key(value: str) -> str:
    """Return *value* trimmed of surrounding whitespace and lower-cased.

    Used to build case-insensitive lookup keys for aliases, names and emails.
    """
    trimmed = value.strip()
    return trimmed.lower()
def github_login_from_noreply(email: str) -> str | None:
    """Extract the login from a numeric GitHub noreply address.

    Returns the text after the first ``+`` in the local part of *email*, or
    ``None`` when *email* does not match ``CANONICAL_NOREPLY_RE``.
    """
    if CANONICAL_NOREPLY_RE.match(email) is None:
        return None
    # The pattern guarantees "<digits>+<login>@..."; keep what follows the
    # first "+" of the part before the first "@".
    local_part, _, _ = email.partition("@")
    _, _, login = local_part.partition("+")
    return login
def parse_identity(raw: str, context: str) -> Identity:
    """Parse ``Name <email>`` text into an ``Identity``.

    Args:
        raw: The text to parse.
        context: Location label (for example ``file:line``) that prefixes
            every error message.

    Raises:
        ValueError: If *raw* is not of the form ``Name <email>``, or the
            email is not a numeric GitHub noreply address.
    """
    parsed = IDENTITY_RE.match(raw)
    if parsed is None:
        raise ValueError(f"{context}: expected 'Name <id+login@users.noreply.github.com>'")

    name = parsed.group("name").strip()
    email = parsed.group("email").strip()
    if CANONICAL_NOREPLY_RE.match(email) is None:
        raise ValueError(
            f"{context}: right-hand email must be numeric GitHub noreply, got {email}"
        )
    return Identity(name, email)
def load_author_map(path: Path) -> dict[str, Identity]:
    """Load the alias -> canonical identity table from an AUTHOR_MAP file.

    Each meaningful line has the form ``alias = Name <email>``. Everything
    from the first ``#`` on a line is discarded, and lines that are empty
    after that are skipped. Keys are normalised with ``norm_key``. Besides
    the explicit alias, each identity is also registered under its email,
    its name, and the login embedded in its noreply address, but only when
    that key is not already taken.

    Args:
        path: Location of the AUTHOR_MAP file (read as UTF-8).

    Returns:
        Mapping from normalised key to ``Identity``.

    Raises:
        ValueError: If a line has no ``=``, has an empty alias, carries an
            identity rejected by ``parse_identity``, or uses a key that is
            already bound to a different identity.
    """
    aliases: dict[str, Identity] = {}
    for lineno, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
        line = raw_line.split("#", 1)[0].strip()
        if not line:
            continue
        if "=" not in line:
            raise ValueError(f"{path}:{lineno}: expected 'alias = Name <email>'")
        alias, raw_identity = [part.strip() for part in line.split("=", 1)]
        if not alias:
            # An empty alias would register the key "" and then match any
            # author or co-author whose name is empty.
            raise ValueError(
                f"{path}:{lineno}: alias must not be empty; expected 'alias = Name <email>'"
            )
        identity = parse_identity(raw_identity, f"{path}:{lineno}")
        key = norm_key(alias)
        if key in aliases and aliases[key] != identity:
            raise ValueError(f"{path}:{lineno}: duplicate alias {alias!r}")
        aliases[key] = identity
        # Implicit keys never override an entry that is already present.
        aliases.setdefault(norm_key(identity.email), identity)
        aliases.setdefault(norm_key(identity.name), identity)
        if login := github_login_from_noreply(identity.email):
            aliases.setdefault(norm_key(login), identity)
    return aliases
def git_log(commit_range: str) -> list[Commit]:
    """Read the commits in *commit_range* by running ``git log`` in ``ROOT``.

    Each record is emitted as ``hash, author name, author email, subject,
    raw body`` separated by NUL bytes and terminated by the ASCII record
    separator (0x1E).

    Args:
        commit_range: Revision range handed to ``git log``.

    Returns:
        One ``Commit`` per record, in the order git printed them.

    Raises:
        RuntimeError: If git exits with an error or a record does not have
            exactly five fields.
    """
    try:
        raw = subprocess.check_output(
            [
                "git",
                "log",
                "--format=%H%x00%an%x00%ae%x00%s%x00%B%x1e",
                commit_range,
            ],
            cwd=ROOT,
            text=True,
            # Decode explicitly instead of relying on the locale, so
            # non-ASCII author names decode the same way on every machine.
            encoding="utf-8",
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(f"failed to read git range {commit_range!r}: {exc}") from exc

    commits: list[Commit] = []
    for record in raw.split("\x1e"):
        # `--format=` has tformat semantics: git writes a newline after every
        # record, so each chunk after the first starts with "\n". Drop it, or
        # it ends up at the front of the commit hash.
        record = record.lstrip("\n")
        if not record.strip():
            continue
        parts = record.split("\x00", 4)
        if len(parts) != 5:
            raise RuntimeError("failed to parse git log output")
        commits.append(Commit(*parts))
    return commits
def is_bot_identity(name: str, email: str) -> bool:
    """Report whether a name/email pair looks like a bot or tool identity.

    The pair counts as a bot when the trimmed, lower-cased email is listed in
    ``BOT_EMAILS``, or when the trimmed, lower-cased name equals an entry of
    ``BOT_NAMES`` or starts with that entry followed by a space.
    """
    if email.strip().lower() in BOT_EMAILS:
        return True
    candidate = name.strip().lower()
    for bot in BOT_NAMES:
        if candidate == bot or candidate.startswith(f"{bot} "):
            return True
    return False
def lookup_identity(aliases: dict[str, Identity], *values: str) -> Identity | None:
    """Return the identity of the first of *values* that is a known alias.

    Each value is normalised with ``norm_key`` before the lookup. Values are
    tried in the order given. ``None`` is returned when none of them match.
    """
    hits = (aliases.get(norm_key(candidate)) for candidate in values)
    return next((hit for hit in hits if hit is not None), None)
def validate(commits: list[Commit], aliases: dict[str, Identity], check_authors: bool) -> list[str]:
    """Check commits for contributor credit that GitHub can map.

    Three checks run per commit, in this order:

    1. Only when *check_authors* is true: the author must not be a bot/tool
       identity, and an author known to *aliases* must use its canonical
       email.
    2. Every ``Co-authored-by:`` trailer whose email is not a numeric GitHub
       noreply address is reported, as a bot trailer, as a known contributor
       with a suggested replacement, or as an unknown contributor.
    3. Every ``Harvested from PR #N by @login`` marker needs *login* in
       *aliases*, and that identity must be the commit author or one of the
       co-authors.

    Args:
        commits: Commits to inspect.
        aliases: Normalised alias table, as built by ``load_author_map``.
        check_authors: Whether to run the author check.

    Returns:
        Error messages, each prefixed with the short hash and subject. The
        list is empty when every commit passes.
    """
    problems: list[str] = []
    for entry in commits:
        label = f"{entry.sha[:10]} {entry.subject}"
        trailers = [
            Identity(found.group("name").strip(), found.group("email").strip())
            for found in COAUTHOR_RE.finditer(entry.body)
        ]
        author_key = norm_key(entry.author_email)

        # 1. Author identity (opt-in).
        if check_authors:
            if is_bot_identity(entry.author_name, entry.author_email):
                problems.append(
                    f"{label}: author {entry.author_name} <{entry.author_email}> is a "
                    "bot/tool identity. Human harvested work should preserve the contributor "
                    "as author or use a human co-author trailer."
                )
            else:
                canonical = lookup_identity(aliases, entry.author_email, entry.author_name)
                if canonical and author_key != norm_key(canonical.email):
                    problems.append(
                        f"{label}: author {entry.author_name} <{entry.author_email}> "
                        f"matches AUTHOR_MAP but is not canonical. Use author {canonical.author()}."
                    )

        # 2. Co-author trailers. Canonical noreply addresses are accepted
        #    before the bot check is consulted.
        for trailer in trailers:
            if CANONICAL_NOREPLY_RE.match(trailer.email):
                continue
            if is_bot_identity(trailer.name, trailer.email):
                problems.append(
                    f"{label}: remove bot/tool co-author trailer "
                    f"{trailer.name} <{trailer.email}>; contributor trailers are for humans."
                )
                continue
            canonical = lookup_identity(aliases, trailer.email, trailer.name)
            if canonical:
                problems.append(
                    f"{label}: co-author {trailer.name} <{trailer.email}> is not "
                    f"GitHub-mappable. Use `{canonical.trailer()}`."
                )
            else:
                problems.append(
                    f"{label}: co-author {trailer.name} <{trailer.email}> is not "
                    "numeric GitHub noreply and has no AUTHOR_MAP entry. Add an alias "
                    "or use `gh api users/<login> --jq '\"\\(.id)+\\(.login)@users.noreply.github.com\"'`."
                )

        # 3. Harvest markers need machine-readable credit.
        credited = {norm_key(trailer.email) for trailer in trailers}
        for login in HARVEST_RE.findall(entry.body):
            canonical = lookup_identity(aliases, login)
            if canonical is None:
                problems.append(
                    f"{label}: harvested contributor @{login} is missing from .github/AUTHOR_MAP."
                )
                continue
            canonical_key = norm_key(canonical.email)
            if author_key != canonical_key and canonical_key not in credited:
                problems.append(
                    f"{label}: `Harvested from PR ... by @{login}` needs machine-readable "
                    f"credit. Add `{canonical.trailer()}` or preserve the contributor as author."
                )
    return problems
def main(argv: list[str]) -> int:
    """Run the co-author credit check from the command line.

    Args:
        argv: Command-line arguments, without the program name.

    Returns:
        ``0`` when every commit passes, ``1`` when validation errors were
        found, and ``2`` when the check itself could not run.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--author-map", type=Path, default=DEFAULT_AUTHOR_MAP)
    cli.add_argument("--range", default="origin/main..HEAD", help="git commit range to check")
    cli.add_argument(
        "--check-authors",
        action="store_true",
        help="also reject commit author emails that match known AUTHOR_MAP aliases",
    )
    options = cli.parse_args(argv)

    # Top-level boundary: any failure to load or inspect becomes exit code 2.
    try:
        aliases = load_author_map(options.author_map)
        commits = git_log(options.range)
        errors = validate(commits, aliases, options.check_authors)
    except Exception as exc:
        print(f"co-author credit check failed to run: {exc}", file=sys.stderr)
        return 2

    if not errors:
        print(f"Co-author credit check passed for {len(commits)} commit(s).")
        return 0

    print("Co-author credit check failed:", file=sys.stderr)
    for message in errors:
        print(f"- {message}", file=sys.stderr)
    return 1
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))