002f8f0ba1
Add AUTHOR_MAP plus a lightweight co-author trailer checker so harvested commits use numeric GitHub noreply identities, reject bot/tool trailers, and require machine-readable credit when a commit says it was harvested from a PR.
Also normalize the local unpushed v0.9 harvest range so existing contributor authors/trailers for HUQIANTAO, Implementist, jrcjrcc, xyuai, cyq1017, idling11, and shenjackyuanjie use GitHub-mappable identities before the branch is published.
Validation: python3 scripts/check-coauthor-trailers.py --author-map .github/AUTHOR_MAP --range origin/main..HEAD --check-authors; python3 -m py_compile scripts/check-coauthor-trailers.py; ruby -e 'require "yaml"; YAML.load_file(".github/workflows/ci.yml")'; git diff --check; negative in-process validation for raw email, missing harvested credit, and bot author cases.
246 lines
8.6 KiB
Python
246 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Validate that harvested contributor credit is GitHub-mappable.
|
|
|
|
The check is intentionally scoped to new commits. Historical commits may carry
|
|
raw or local emails, but new harvested commits should use GitHub's numeric
|
|
`id+login@users.noreply.github.com` address so co-author credit lands in the
|
|
contributor graph.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
|
|
# Directory one level above the folder that contains this script (the
# repository root when the script lives in a first-level folder such as scripts/).
ROOT = Path(__file__).resolve().parents[1]
DEFAULT_AUTHOR_MAP = ROOT.joinpath(".github", "AUTHOR_MAP")

# "Name <email>" with optional surrounding whitespace.
IDENTITY_RE = re.compile(r"^\s*(?P<name>.+?)\s*<(?P<email>[^<>]+)>\s*$")

# Numeric GitHub noreply address: "<digits>+<login>@users.noreply.github.com".
CANONICAL_NOREPLY_RE = re.compile(
    r"^[0-9]+\+[^@\s]+@users\.noreply\.github\.com$",
    re.IGNORECASE,
)

# One "Co-authored-by: Name <email>" trailer per line of a commit message.
COAUTHOR_RE = re.compile(
    r"^Co-authored-by:\s*(?P<name>.*?)\s*<(?P<email>[^<>]+)>\s*$",
    re.IGNORECASE | re.MULTILINE,
)

# Free-text harvest marker; the single capture group is the contributor login.
HARVEST_RE = re.compile(r"Harvested from PR #[0-9]+ by @([A-Za-z0-9-]+)")

# Lowercase email addresses that always denote a bot/tool identity.
BOT_EMAILS = {
    "codex@local",
    "codex@example.com",
    "cursoragent@cursor.com",
    "noreply@anthropic.com",
}

# Lowercase tool names used to recognise bot/tool identities by display name.
BOT_NAMES = ("claude", "codex", "cursor")
|
|
|
|
|
|
@dataclass(frozen=True)
class Identity:
    """A contributor display name paired with an email address."""

    name: str
    email: str

    def author(self) -> str:
        """Return the identity formatted as ``Name <email>``."""
        return f"{self.name} <{self.email}>"

    def trailer(self) -> str:
        """Return the identity formatted as a ``Co-authored-by:`` trailer line."""
        return "Co-authored-by: " + self.author()
|
|
|
|
|
|
@dataclass(frozen=True)
class Commit:
    """Immutable record of the commit fields this checker inspects.

    Field order matters: ``git_log`` builds instances positionally from the
    five fields it requests from ``git log``.
    """

    # Commit hash.
    sha: str
    # Author display name.
    author_name: str
    # Author email address.
    author_email: str
    # Subject line of the commit message.
    subject: str
    # Commit message text that is searched for trailers and harvest markers.
    body: str
|
|
|
|
|
|
def norm_key(value: str) -> str:
    """Return *value* with surrounding whitespace removed and lowercased.

    Used so alias, name and email lookups are case- and padding-insensitive.
    """
    trimmed = value.strip()
    return trimmed.lower()
|
|
|
|
|
|
def github_login_from_noreply(email: str) -> str | None:
    """Extract the login from a numeric GitHub noreply address.

    Returns the text between the first ``+`` and the ``@`` of
    ``<digits>+<login>@users.noreply.github.com``, or ``None`` when *email*
    does not match ``CANONICAL_NOREPLY_RE``.
    """
    if CANONICAL_NOREPLY_RE.match(email) is None:
        return None
    # The pattern guarantees both separators exist, so partition always finds them.
    local_part, _, _domain = email.partition("@")
    _numeric_id, _, login = local_part.partition("+")
    return login
|
|
|
|
|
|
def parse_identity(raw: str, context: str) -> Identity:
    """Parse ``Name <email>`` into an ``Identity`` with a canonical email.

    Args:
        raw: Text expected to look like ``Name <id+login@users.noreply.github.com>``.
        context: Location prefix (for example ``path:lineno``) used in error messages.

    Raises:
        ValueError: If *raw* is not in ``Name <email>`` form, or the email is
            not a numeric GitHub noreply address.
    """
    parsed = IDENTITY_RE.match(raw)
    if parsed is None:
        raise ValueError(f"{context}: expected 'Name <id+login@users.noreply.github.com>'")

    name = parsed.group("name").strip()
    email = parsed.group("email").strip()
    if CANONICAL_NOREPLY_RE.match(email) is None:
        raise ValueError(
            f"{context}: right-hand email must be numeric GitHub noreply, got {email}"
        )
    return Identity(name, email)
|
|
|
|
|
|
def load_author_map(path: Path) -> dict[str, Identity]:
    """Load the alias table that maps known names/emails to canonical identities.

    Each non-blank line has the form ``alias = Name <id+login@users.noreply.github.com>``;
    anything after the first ``#`` on a line is a comment. Besides the explicit
    alias, each identity is also reachable (first definition wins) by its
    email, its display name and the login embedded in its noreply address.
    All keys are normalised with ``norm_key``.

    Args:
        path: Location of the AUTHOR_MAP file (read as UTF-8).

    Returns:
        Mapping from normalised key to the canonical ``Identity``.

    Raises:
        ValueError: If a line has no ``=``, has an empty alias, has an invalid
            right-hand identity, or redefines a known key with a different
            identity.
    """
    aliases: dict[str, Identity] = {}
    for lineno, raw_line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1):
        line = raw_line.split("#", 1)[0].strip()
        if not line:
            continue

        alias, separator, raw_identity = line.partition("=")
        if not separator:
            raise ValueError(f"{path}:{lineno}: expected 'alias = Name <email>'")
        alias = alias.strip()
        key = norm_key(alias)
        if not key:
            # An empty alias would register "" as a key, so every commit or
            # trailer with a blank name would resolve to this contributor.
            raise ValueError(f"{path}:{lineno}: empty alias; expected 'alias = Name <email>'")

        identity = parse_identity(raw_identity.strip(), f"{path}:{lineno}")
        if key in aliases and aliases[key] != identity:
            raise ValueError(f"{path}:{lineno}: duplicate alias {alias!r}")
        aliases[key] = identity

        # Implicit keys never override an earlier entry.
        aliases.setdefault(norm_key(identity.email), identity)
        aliases.setdefault(norm_key(identity.name), identity)
        if login := github_login_from_noreply(identity.email):
            aliases.setdefault(norm_key(login), identity)
    return aliases
|
|
|
|
|
|
def git_log(commit_range: str) -> list[Commit]:
    """Read the commits in *commit_range* from the repository at ``ROOT``.

    Runs ``git log`` with a custom format: hash, author name, author email,
    subject and raw message (``%H %an %ae %s %B``) separated by NUL bytes,
    with each commit terminated by an ASCII record separator (0x1E).

    Args:
        commit_range: Revision range passed to ``git log`` (e.g. ``origin/main..HEAD``).

    Returns:
        One ``Commit`` per log entry, in the order ``git log`` printed them.

    Raises:
        RuntimeError: If ``git`` cannot be run, exits non-zero, or prints
            output that does not split into the five expected fields.
    """
    try:
        raw = subprocess.check_output(
            [
                "git",
                "log",
                "--format=%H%x00%an%x00%ae%x00%s%x00%B%x1e",
                commit_range,
            ],
            cwd=ROOT,
            text=True,
            # Decode as UTF-8 instead of the locale encoding so non-ASCII
            # contributor names do not fail on non-UTF-8 locales.
            encoding="utf-8",
            errors="replace",
        )
    except (subprocess.CalledProcessError, OSError) as exc:
        raise RuntimeError(f"failed to read git range {commit_range!r}: {exc}") from exc

    commits: list[Commit] = []
    for record in raw.split("\x1e"):
        # git ends every formatted entry with a newline, so each record after
        # the first starts with a line break that would otherwise land in
        # Commit.sha and corrupt the abbreviated hash in error messages.
        record = record.lstrip("\r\n")
        if not record.strip():
            continue
        parts = record.split("\x00", 4)
        if len(parts) != 5:
            raise RuntimeError("failed to parse git log output")
        commits.append(Commit(*parts))
    return commits
|
|
|
|
|
|
def is_bot_identity(name: str, email: str) -> bool:
    """Report whether a name/email pair belongs to a known bot or tool.

    The pair is a bot when the lowercased, stripped email is in ``BOT_EMAILS``,
    or when the lowercased, stripped name equals an entry of ``BOT_NAMES`` or
    starts with that entry followed by a space.
    """
    candidate_name = name.strip().lower()
    candidate_email = email.strip().lower()
    if candidate_email in BOT_EMAILS:
        return True
    for bot in BOT_NAMES:
        if candidate_name == bot or candidate_name.startswith(f"{bot} "):
            return True
    return False
|
|
|
|
|
|
def lookup_identity(aliases: dict[str, Identity], *values: str) -> Identity | None:
    """Return the identity for the first of *values* found in *aliases*.

    Each value is normalised with ``norm_key`` before the lookup; values are
    tried in the order given. Returns ``None`` when none of them is known.
    """
    hits = (aliases.get(norm_key(candidate)) for candidate in values)
    return next((hit for hit in hits if hit is not None), None)
|
|
|
|
|
|
def validate(commits: list[Commit], aliases: dict[str, Identity], check_authors: bool) -> list[str]:
    """Check contributor credit on each commit and collect human-readable errors.

    Per commit, in this order:

    1. When *check_authors* is true, the author must not be a bot/tool
       identity, and an author known to *aliases* must use the canonical email.
    2. Every ``Co-authored-by:`` trailer whose email is not a numeric GitHub
       noreply address is reported (as a bot trailer, as a known contributor
       with the wrong email, or as an unknown contributor).
    3. Every ``Harvested from PR #N by @login`` marker must name a login known
       to *aliases*, and that contributor must be the commit author or appear
       in a co-author trailer.

    Args:
        commits: Commits to inspect.
        aliases: Alias table as produced by ``load_author_map``.
        check_authors: Whether to run the author checks in step 1.

    Returns:
        Error messages, each prefixed with the abbreviated hash and subject.
        Empty when every commit passes.
    """
    problems: list[str] = []
    for commit in commits:
        prefix = f"{commit.sha[:10]} {commit.subject}"
        author_label = f"{commit.author_name} <{commit.author_email}>"
        author_key = norm_key(commit.author_email)

        trailers = []
        for found in COAUTHOR_RE.finditer(commit.body):
            trailers.append(
                Identity(found.group("name").strip(), found.group("email").strip())
            )

        # 1. Author identity (optional).
        if check_authors:
            if is_bot_identity(commit.author_name, commit.author_email):
                problems.append(
                    f"{prefix}: author {author_label} is a "
                    "bot/tool identity. Human harvested work should preserve the contributor "
                    "as author or use a human co-author trailer."
                )
            else:
                canonical = lookup_identity(aliases, commit.author_email, commit.author_name)
                if canonical and author_key != norm_key(canonical.email):
                    problems.append(
                        f"{prefix}: author {author_label} "
                        f"matches AUTHOR_MAP but is not canonical. Use author {canonical.author()}."
                    )

        # 2. Co-author trailers; canonical noreply addresses need no further checks.
        for coauthor in trailers:
            if CANONICAL_NOREPLY_RE.match(coauthor.email):
                continue
            coauthor_label = f"{coauthor.name} <{coauthor.email}>"
            if is_bot_identity(coauthor.name, coauthor.email):
                problems.append(
                    f"{prefix}: remove bot/tool co-author trailer "
                    f"{coauthor_label}; contributor trailers are for humans."
                )
                continue
            canonical = lookup_identity(aliases, coauthor.email, coauthor.name)
            if canonical is None:
                problems.append(
                    f"{prefix}: co-author {coauthor_label} is not "
                    "numeric GitHub noreply and has no AUTHOR_MAP entry. Add an alias "
                    "or use `gh api users/<login> --jq '\"\\(.id)+\\(.login)@users.noreply.github.com\"'`."
                )
            else:
                problems.append(
                    f"{prefix}: co-author {coauthor_label} is not "
                    f"GitHub-mappable. Use `{canonical.trailer()}`."
                )

        # 3. Harvest markers must be backed by author or trailer credit.
        credited_emails = {norm_key(entry.email) for entry in trailers}
        for login in HARVEST_RE.findall(commit.body):
            contributor = lookup_identity(aliases, login)
            if contributor is None:
                problems.append(
                    f"{prefix}: harvested contributor @{login} is missing from .github/AUTHOR_MAP."
                )
                continue
            contributor_key = norm_key(contributor.email)
            if author_key == contributor_key or contributor_key in credited_emails:
                continue
            problems.append(
                f"{prefix}: `Harvested from PR ... by @{login}` needs machine-readable "
                f"credit. Add `{contributor.trailer()}` or preserve the contributor as author."
            )
    return problems
|
|
|
|
|
|
def main(argv: list[str]) -> int:
    """Run the co-author credit check from the command line.

    Args:
        argv: Command-line arguments, excluding the program name.

    Returns:
        ``0`` when every commit passes, ``1`` when validation errors were
        found, ``2`` when the check itself could not run (unreadable author
        map, failing ``git log``, ...).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--author-map", type=Path, default=DEFAULT_AUTHOR_MAP)
    parser.add_argument("--range", default="origin/main..HEAD", help="git commit range to check")
    parser.add_argument(
        "--check-authors",
        action="store_true",
        help="also reject commit author emails that match known AUTHOR_MAP aliases",
    )
    options = parser.parse_args(argv)

    # Top-level boundary: any failure to run the check is reported, not raised.
    try:
        aliases = load_author_map(options.author_map)
        commits = git_log(options.range)
        errors = validate(commits, aliases, options.check_authors)
    except Exception as exc:
        print(f"co-author credit check failed to run: {exc}", file=sys.stderr)
        return 2

    if not errors:
        print(f"Co-author credit check passed for {len(commits)} commit(s).")
        return 0

    report = ["Co-author credit check failed:"]
    report.extend(f"- {error}" for error in errors)
    print("\n".join(report), file=sys.stderr)
    return 1
|
|
|
|
|
|
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
|