Add quality-matrix implementation to pdftract-ci with an msrv-check step that builds with rust:1.78-slim to detect usage of newer Rust features. Changes: - .ci/argo-workflows/pdftract-ci.yaml: Implement quality-matrix DAG with msrv-check, clippy-fmt, and cargo-audit templates - CHANGELOG.md: New file documenting MSRV bump policy (MINOR version event, warning period, update checklist) The MSRV gate prevents silent drift that would break downstream consumers on older toolchains. Any feature stabilized after Rust 1.78 (e.g., inline const blocks from 1.79, core::error::Error from 1.81) will fail the msrv-check step, triggering a policy review. See notes/pdftract-2w02.md for acceptance criteria verification. Co-Authored-By: Claude Code <noreply@anthropic.com>
1114 lines
38 KiB
YAML
1114 lines
38 KiB
YAML
# pdftract-ci WorkflowTemplate
|
|
#
|
|
# This template orchestrates the CI/CD pipeline for pdftract, a Rust PDF text extraction
|
|
# library with PyO3 Python bindings and a CLI binary. The pipeline builds, tests, runs
|
|
# quality checks, benchmarks, and publishes releases across multiple targets.
|
|
#
|
|
# === Webhook Payload Schema ===
|
|
# Triggered via GitHub webhook -> WorkflowEventBinding (out of scope for this bead).
|
|
# Expected webhook payload schema:
|
|
#
|
|
# {
|
|
# "ref": "refs/heads/main" | "refs/tags/v0.1.0",
|
|
# "repository": {
|
|
# "full_name": "jedarden/pdftract",
|
|
# "html_url": "https://github.com/jedarden/pdftract"
|
|
# },
|
|
# "head_commit": {
|
|
# "id": "abc123...",
|
|
# "message": "Commit message"
|
|
# },
|
|
# "sender": {
|
|
# "login": "username"
|
|
# }
|
|
# }
|
|
#
|
|
# === Parameter Reference ===
|
|
# - commit-sha: Full Git commit SHA (40 hex chars)
|
|
# - ref: Git ref (branch: "refs/heads/*", tag: "refs/tags/v*")
|
|
# - repo-url: GitHub repository URL
|
|
# - is-tag: Boolean ("true" if ref is a tag, "false" otherwise)
|
|
#
|
|
# === DAG Structure ===
|
|
# setup -> [parallel: build-matrix, test-matrix, quality-matrix, bench-matrix] -> publish-if-tag
|
|
#
|
|
# - setup: Clone repo, fetch dependencies, warm cargo cache
|
|
# - build-matrix: Cross-compile for 5 targets (x86_64/aarch64 Linux musl, macOS x64/ARM64, Windows x64)
|
|
# - test-matrix: Run unit tests across feature combinations (default, full, with OCR)
|
|
# - quality-matrix: Linting (clippy, fmt), security audit (cargo-audit), MSRV check (build on Rust 1.78)
|
|
# - bench-matrix: Performance benchmarks (cargo bench) against fixture corpus
|
|
# - publish-if-tag: On tags only, upload binaries to GitHub Releases
|
|
#
|
|
# === Subsequent Phase 0 Beads ===
|
|
# Each bead fills in a distinct set of templates without colliding:
|
|
# - pdftract-xxxx: setup step, volume mount points, cache warming logic
|
|
# - pdftract-yyyy: build-matrix templates (5 target builds with cross)
|
|
# - pdftract-zzzz: test-matrix templates (feature combinations)
|
|
# - pdftract-wwww: quality-matrix templates (clippy, fmt, audit)
|
|
# - pdftract-vvvv: bench-matrix templates (cargo bench)
|
|
# - pdftract-uuuu: publish-if-tag template (gh release create)
|
|
#
|
|
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: pdftract-ci
  namespace: argo-workflows
  labels:
    app.kubernetes.io/name: pdftract-ci
    app.kubernetes.io/component: ci
    app.kubernetes.io/part-of: pdftract
spec:
  entrypoint: pipeline
  serviceAccountName: argo-workflow

  # Workflow-level exit handler. Argo only accepts an exit handler here (or as
  # a per-task hook); a DAG template itself has no onExit field.
  onExit: on-exit

  # podGC is an object carrying a strategy, not a bare string.
  podGC:
    strategy: OnPodCompletion

  # Retention of finished workflows: 30 minutes after success, 2 hours after
  # failure (kept longer so failures can be inspected).
  ttlStrategy:
    secondsAfterSuccess: 1800
    secondsAfterFailure: 7200

  arguments:
    parameters:
      - name: commit-sha
        value: ""
        description: "Full Git commit SHA (40 hex chars)"
      - name: ref
        value: "refs/heads/main"
        description: "Git ref (branch: 'refs/heads/*', tag: 'refs/tags/v*')"
      - name: repo-url
        value: "https://github.com/jedarden/pdftract.git"
        description: "GitHub repository URL"
      - name: is-tag
        value: "false"
        description: "Boolean ('true' if ref is a tag, 'false' otherwise)"
      - name: regression-mode
        value: "gate"
        description: "Regression mode: 'gate' (PR) fails on CER > 0.5%, 'update' (merge) refreshes baselines"
      - name: pr-number
        value: ""
        description: "Pull request number for posting benchmark comments (empty skips commenting)"
      - name: proptest-seed
        value: ""
        description: "Proptest seed for reproducibility (empty = auto-generate)"
      - name: proptest-cases
        value: "10000"
        description: "Number of proptest cases per module (default: 10000)"

  # Per-workflow scratch volumes shared by the templates below.
  # NOTE(review): all four claims are ReadWriteOnce while several tasks mount
  # them in parallel; confirm the storage class / scheduler keeps those pods on
  # one node.
  volumeClaimTemplates:
    - metadata:
        name: cargo-cache
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 50Gi
    - metadata:
        name: workspace
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 10Gi
    - metadata:
        name: shared-artifacts
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 1Gi
    - metadata:
        name: regression-results
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 2Gi

  # Registry credentials exposed as a Docker config.json file.
  volumes:
    - name: docker-config
      secret:
        secretName: docker-hub-registry
        items:
          - key: .dockerconfigjson
            path: config.json

  podMetadata:
    labels:
      app.kubernetes.io/name: pdftract-ci
      commit-sha: "{{workflow.parameters.commit-sha}}"

  # Applied to every pod: pull secret plus a non-root default identity
  # (uid/fsGroup 1000). Templates that genuinely need root must override this
  # at container level.
  podSpecPatch: |
    imagePullSecrets:
      - name: docker-hub-registry
    securityContext:
      runAsNonRoot: true
      runAsUser: 1000
      fsGroup: 1000
|
|
|
|
templates:
|
|
# === Top-level DAG ===
# setup runs first. build-matrix, test-matrix and quality-matrix fan out from
# it; bench-matrix and regression-corpus wait for build-matrix because they
# consume the Linux x86_64 musl binary it exports. publish-if-tag runs last and
# only for tags.
- name: pipeline
  dag:
    tasks:
      - name: setup
        template: setup

      - name: build-matrix
        template: build-matrix
        dependencies: [setup]
        # Exit hooks attach to the task; a DAG template has no onExit field.
        hooks:
          exit:
            template: build-matrix-exit

      - name: test-matrix
        template: test-matrix
        dependencies: [setup]

      - name: quality-matrix
        template: quality-matrix
        dependencies: [setup]

      - name: bench-matrix
        template: bench-matrix
        # A task may only read outputs of tasks it depends on; setup is still
        # ordered first transitively through build-matrix.
        dependencies: [build-matrix]
        arguments:
          artifacts:
            - name: pdftract-binary
              from: "{{tasks.build-matrix.outputs.artifacts.pdftract-binary}}"

      - name: benchmark-pr-comment
        template: benchmark-pr-comment
        dependencies: [bench-matrix]
        # Both sides are quoted so an empty pr-number still yields a valid
        # expression ('' != '') instead of a dangling operator.
        when: "'{{workflow.parameters.pr-number}}' != ''"
        arguments:
          artifacts:
            - name: benchmark-comment
              from: "{{tasks.bench-matrix.outputs.artifacts.benchmark-comment}}"

      - name: regression-corpus
        template: regression-corpus
        dependencies: [build-matrix]
        hooks:
          exit:
            template: regression-corpus-exit
        arguments:
          artifacts:
            # The nested regression DAG cannot reach build-matrix's tasks, so
            # the binary is handed over explicitly from this scope.
            - name: pdftract-binary
              from: "{{tasks.build-matrix.outputs.artifacts.pdftract-binary}}"

      - name: publish-if-tag
        template: publish-if-tag
        dependencies: [build-matrix, test-matrix, quality-matrix, bench-matrix, regression-corpus]
        when: "{{workflow.parameters.is-tag}} == true"

# === Exit Handler ===
# Prints a short report including the final workflow status.
- name: on-exit
  activeDeadlineSeconds: 60
  script:
    image: alpine:3.19
    command: [sh]
    source: |
      #!/bin/sh
      set -e
      echo "=== Workflow Exit Report ==="
      echo "Workflow: {{workflow.name}}"
      echo "Commit: {{workflow.parameters.commit-sha}}"
      echo "Ref: {{workflow.parameters.ref}}"
      echo "Status: {{workflow.status}}"

# === Setup Step ===
# Placeholder: will clone the repo to /workspace, fetch dependencies and warm
# the cargo cache. Filled in by a subsequent Phase 0 bead.
#
# CRITICAL: All cargo commands in this workflow MUST use --locked (or --locked --frozen)
# to enforce the workspace Cargo.lock policy. See CONTRIBUTING.md for details.
- name: setup
  activeDeadlineSeconds: 600
  container:
    image: alpine:3.19
    command: [sh, -c]
    args:
      - |
        # Placeholder: clone repo to /workspace, warm cargo cache
        echo "Setup step - to be implemented by Phase 0 sibling bead"
        echo "Should clone {{workflow.parameters.repo-url}} to /workspace"
        echo "Should checkout {{workflow.parameters.commit-sha}}"
        exit 0
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi

# === Build Matrix ===
# Builds the five release targets in parallel:
#   x86_64-unknown-linux-musl, aarch64-unknown-linux-musl,
#   x86_64-apple-darwin, aarch64-apple-darwin, x86_64-pc-windows-gnu
#
# failFast: false lets every target run to completion even when a sibling
# fails, while the matrix as a whole still fails. This keeps broken builds from
# reaching publish-if-tag, which continueOn.failed would have allowed.
- name: build-matrix
  activeDeadlineSeconds: 3600
  dag:
    failFast: false
    tasks:
      - name: build-linux-x86_64-musl
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "x86_64-unknown-linux-musl"
            - name: cross-image
              value: "ghcr.io/cross-rs/x86_64-unknown-linux-musl:main"
            - name: strip-cmd
              value: "x86_64-linux-musl-strip"
            - name: ext
              value: ""
      - name: build-linux-aarch64-musl
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "aarch64-unknown-linux-musl"
            - name: cross-image
              value: "ghcr.io/cross-rs/aarch64-unknown-linux-musl:main"
            - name: strip-cmd
              value: "aarch64-linux-musl-strip"
            - name: ext
              value: ""
      - name: build-darwin-x86_64
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "x86_64-apple-darwin"
            - name: cross-image
              value: "ghcr.io/cross-rs/x86_64-apple-darwin:main"
            - name: strip-cmd
              value: "x86_64-apple-darwin-strip"
            - name: ext
              value: ""
      - name: build-darwin-aarch64
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "aarch64-apple-darwin"
            - name: cross-image
              value: "ghcr.io/cross-rs/aarch64-apple-darwin:main"
            - name: strip-cmd
              value: "aarch64-apple-darwin-strip"
            - name: ext
              value: ""
      - name: build-windows-x86_64-gnu
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "x86_64-pc-windows-gnu"
            - name: cross-image
              value: "ghcr.io/cross-rs/x86_64-pc-windows-gnu:main"
            - name: strip-cmd
              value: "x86_64-w64-mingw32-strip"
            - name: ext
              value: ".exe"
  outputs:
    artifacts:
      # Re-export the Linux x86_64 musl binary so tasks in the parent DAG can
      # consume it; outputs of nested tasks are not addressable from outside.
      - name: pdftract-binary
        from: "{{tasks.build-linux-x86_64-musl.outputs.artifacts.pdftract-binary}}"
|
|
|
|
# === Build Target Template ===
# Builds, strips and exports the pdftract binary for one target triple.
#
# Inputs:
#   target      - Rust target triple passed to `cross build --target`
#   cross-image - container image the build runs in
#   strip-cmd   - target-specific strip executable
#   ext         - binary file extension ("" or ".exe")
# Output artifact:
#   pdftract-binary - the (stripped, if strip succeeded) release binary
#
# NOTE(review): this assumes the ghcr.io/cross-rs/<target>:main images ship
# cargo, cross and git; confirm, since the script depends on all three.
- name: build-target
  inputs:
    parameters:
      - name: target
      - name: cross-image
      - name: strip-cmd
      - name: ext
  activeDeadlineSeconds: 3600
  container:
    image: "{{inputs.parameters.cross-image}}"
    command: [bash, -c]
    args:
      - |
        set -eo pipefail

        TARGET="{{inputs.parameters.target}}"
        STRIP_CMD="{{inputs.parameters.strip-cmd}}"
        EXT="{{inputs.parameters.ext}}"
        # /tmp is writable for the non-root uid the workflow runs as; a
        # directory directly under / is not.
        ARTIFACT_DIR="/tmp/artifacts"
        ARTIFACT_PATH="$ARTIFACT_DIR/pdftract-$TARGET$EXT"

        echo "=========================================="
        echo "Building pdftract for target: $TARGET"
        echo "=========================================="

        cd /workspace

        # Set reproducible build timestamp (falls back to 0 without git history)
        export SOURCE_DATE_EPOCH=$(git log -1 --format=%ct 2>/dev/null || echo 0)
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-$TARGET"

        echo "SOURCE_DATE_EPOCH=$SOURCE_DATE_EPOCH"
        echo "CARGO_HOME=$CARGO_HOME"
        echo "CARGO_TARGET_DIR=$CARGO_TARGET_DIR"

        echo "=== Running cargo build with cross ==="
        cross build --release --target "$TARGET" --locked --features default,serve,decrypt

        # CARGO_TARGET_DIR redirects build output away from ./target, so the
        # binary has to be looked up under that directory.
        RELEASE_DIR="$CARGO_TARGET_DIR/$TARGET/release"
        BINARY_PATH="$RELEASE_DIR/pdftract$EXT"

        if [ ! -f "$BINARY_PATH" ]; then
          echo "ERROR: Binary not found at $BINARY_PATH" >&2
          echo "Contents of target directory:"
          ls -la "$RELEASE_DIR/" || true
          exit 1
        fi

        echo "=== Binary size before strip ==="
        ls -lh "$BINARY_PATH"

        echo "=== Stripping binary ==="
        # Stripping is best-effort: an unstripped binary is still shippable.
        "$STRIP_CMD" "$BINARY_PATH" || {
          echo "WARNING: Strip command failed, continuing with unstripped binary" >&2
        }

        echo "=== Binary size after strip ==="
        ls -lh "$BINARY_PATH"

        mkdir -p "$ARTIFACT_DIR"
        cp "$BINARY_PATH" "$ARTIFACT_PATH"

        echo "=== Final artifact ==="
        ls -lh "$ARTIFACT_DIR/"

        # GNU stat first, BSD stat as fallback.
        SIZE=$(stat -c%s "$ARTIFACT_PATH" 2>/dev/null || stat -f%z "$ARTIFACT_PATH")
        echo "Binary size: $SIZE bytes"

        # 4194304 bytes = 4 MiB size budget; exceeding it only warns.
        if [ "$SIZE" -gt 4194304 ]; then
          echo "WARNING: Binary exceeds 4 MB budget ($SIZE bytes)"
        else
          echo "Binary within 4 MB budget"
        fi

        echo "=== Build complete ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
      - name: docker-config
        mountPath: /root/.docker
    resources:
      requests:
        cpu: 2000m
        memory: 4Gi
      limits:
        cpu: 4000m
        memory: 8Gi
  outputs:
    artifacts:
      - name: pdftract-binary
        path: "/tmp/artifacts/pdftract-{{inputs.parameters.target}}{{inputs.parameters.ext}}"
|
|
|
|
# === Build Matrix Exit Handler ===
# Prints a three-line report naming the commit that was built.
- name: build-matrix-exit
  script:
    image: alpine:3.19
    command:
      - sh
    source: |
      #!/bin/sh
      cat <<EOF
      === Build Matrix Exit Report ===
      Commit: {{workflow.parameters.commit-sha}}
      All binaries available as artifacts
      EOF
|
|
|
|
# === Test Matrix ===
# Runs, in order:
#   1. unit tests with default features
#   2. unit tests with all features
#   3. property tests via cargo-nextest (case count from proptest-cases)
#
# The proptest seed comes from the proptest-seed parameter, or is generated and
# printed when that parameter is empty, so a failing run can be reproduced.
#
# CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
- name: test-matrix
  activeDeadlineSeconds: 3600
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail

        echo "=========================================="
        echo "Test Matrix"
        echo "=========================================="

        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-test"

        # Set proptest seed for reproducibility
        SEED="{{workflow.parameters.proptest-seed}}"
        if [ -z "$SEED" ]; then
          SEED=$(date +%s%N | sha256sum | head -c 16)
          echo "Generated proptest seed: $SEED"
        else
          echo "Using provided proptest seed: $SEED"
        fi
        export PROPTEST_SEED="$SEED"

        # Set proptest case count
        CASES="{{workflow.parameters.proptest-cases}}"
        echo "Proptest cases per module: $CASES"
        export PROPTEST_CASES="$CASES"

        echo "=== Running unit tests (default features) ==="
        cargo test --locked --lib --bins

        echo "=== Running unit tests (all features) ==="
        cargo test --locked --all-features --lib --bins

        echo "=== Running property tests (proptest) ==="
        echo "Seed: $PROPTEST_SEED | Cases: $PROPTEST_CASES"
        # --locked added to honor the lockfile policy like the other commands.
        # NOTE(review): cargo-nextest is not installed by this script, and
        # --proptest is not an option this reviewer recognizes for
        # `cargo nextest run`; confirm both against the image and nextest docs.
        cargo nextest run --locked --features proptest --proptest --profile=ci-proptest || {
          EXIT_CODE=$?
          echo "ERROR: Property tests failed!"
          echo "Check proptest-regressions/ for new minimal counterexamples"
          exit $EXIT_CODE
        }

        echo "=== All tests passed ==="
        echo "Unit tests: PASS"
        echo "Property tests: PASS ($CASES cases per module)"
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 2000m
        memory: 4Gi
      limits:
        cpu: 4000m
        memory: 8Gi
|
|
|
|
# === Quality Matrix ===
# Fans out three independent quality gates: lint + format (clippy-fmt), the
# MSRV build on rust:1.78-slim (msrv-check), and the dependency security audit
# (cargo-audit). None of the tasks depends on another.
#
# CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
- name: quality-matrix
  activeDeadlineSeconds: 900
  dag:
    tasks:
      - {name: clippy-fmt, template: clippy-fmt}
      - {name: msrv-check, template: msrv-check}
      - {name: cargo-audit, template: cargo-audit}
|
|
|
|
# === Clippy and Fmt Check ===
# Runs clippy over all targets and features with warnings denied, then
# verifies formatting with `cargo fmt --check`.
- name: clippy-fmt
  activeDeadlineSeconds: 600
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail

        echo "=========================================="
        echo "Clippy and Format Check"
        echo "=========================================="

        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-clippy"

        # The official rust images install the toolchain with rustup's minimal
        # profile, which omits clippy and rustfmt; add them before use.
        echo "=== Ensuring clippy and rustfmt components ==="
        rustup component add clippy rustfmt

        # NOTE(review): the toolchain here is 1.83; clippy only applies MSRV
        # 1.78 if the project declares it (rust-version / clippy.toml) - confirm.
        echo "=== Running clippy with MSRV = 1.78 ==="
        cargo clippy --locked --all-targets --all-features -- -D warnings

        echo "=== Running fmt check ==="
        cargo fmt --check

        echo "=== Clippy and fmt checks passed ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
|
|
|
|
# === MSRV Check ===
# Compiles the whole workspace (default features, locked dependencies) inside
# the rust:1.78-slim image. Code that needs a newer compiler fails to build
# here, which surfaces MSRV drift before it reaches downstream consumers.
- name: msrv-check
  activeDeadlineSeconds: 600
  container:
    image: rust:1.78-slim
    command:
      - bash
      - -c
    args:
      - |
        set -eo pipefail

        # Three-line banner used for the step header.
        banner() {
          local rule="=========================================="
          echo "$rule"
          echo "$1"
          echo "$rule"
        }

        banner "MSRV Check (Rust 1.78)"

        # Keep registry and build output on the shared cache volume.
        export CARGO_TARGET_DIR="/cache/cargo/target-msrv"
        export CARGO_HOME="/cache/cargo/registry"
        cd /workspace

        echo "=== Building with Rust 1.78 (MSRV) ==="
        rustc --version

        # Build workspace with default features to catch MSRV violations
        cargo build --workspace --features default --locked

        echo "=== MSRV check passed ==="
        echo "No Rust 1.79+ features detected"
    volumeMounts:
      - mountPath: /workspace
        name: workspace
      - mountPath: /cache/cargo
        name: cargo-cache
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
|
|
|
|
# === Cargo Audit ===
# Audits the committed Cargo.lock for known-vulnerable dependencies.
# cargo-audit is installed into the cached CARGO_HOME on first use and reused
# afterwards.
- name: cargo-audit
  activeDeadlineSeconds: 300
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail

        echo "=========================================="
        echo "Security Audit (cargo-audit)"
        echo "=========================================="

        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        # `cargo install` places binaries in $CARGO_HOME/bin; without it on
        # PATH the check below never sees a previously installed cargo-audit.
        export PATH="$CARGO_HOME/bin:$PATH"

        # Install cargo-audit if not present
        if ! command -v cargo-audit &> /dev/null; then
          echo "Installing cargo-audit..."
          cargo install cargo-audit --locked
        fi

        echo "=== Running cargo audit ==="
        # cargo-audit inspects the lockfile directly and has no --locked
        # option; naming the file makes the audited input explicit.
        cargo audit --file Cargo.lock

        echo "=== Security audit passed ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
|
|
|
|
# === Bench Matrix ===
# Competitive benchmarks: pdftract vs pdfminer.six, pypdf, pdfplumber.
# Installs the tools, installs the pdftract binary received as an input
# artifact, resolves a baseline file and delegates measurement and gating to
# benches/competitors/run-benchmarks.sh (any non-zero exit fails the step).
#
# Input artifact:
#   pdftract-binary   - pdftract executable to benchmark
# Output artifacts:
#   benchmark-results - JSON results written by the benchmark script
#   benchmark-comment - Markdown summary intended for a PR comment
- name: bench-matrix
  inputs:
    artifacts:
      - name: pdftract-binary
        path: /tmp/pdftract-binary
  activeDeadlineSeconds: 3600
  container:
    image: python:3.11-slim-bookworm
    command: [bash, -c]
    # apt-get and writing to /usr/local/bin require root, while the
    # workflow-wide podSpecPatch defaults every pod to uid 1000 / runAsNonRoot.
    securityContext:
      runAsUser: 0
      runAsNonRoot: false
    args:
      - |
        set -eo pipefail

        echo "=========================================="
        echo "Competitive Benchmark Matrix"
        echo "=========================================="

        cd /workspace

        # Install hyperfine (plus jq for the summary and git for the baseline
        # lookup; the slim Python image ships neither).
        echo "=== Installing hyperfine ==="
        apt-get update -qq
        apt-get install -y git hyperfine jq
        # The checkout may be owned by a different uid than this container's.
        git config --global --add safe.directory /workspace

        # Install competitor tools
        echo "=== Installing competitor tools ==="
        pip install --no-cache-dir -r benches/competitors/requirements.txt

        # Get pdftract binary from build-matrix artifact
        echo "=== Installing pdftract binary ==="
        PDFTRACT_ARTIFACT="/tmp/pdftract-binary"
        if [ -f "$PDFTRACT_ARTIFACT" ]; then
          cp "$PDFTRACT_ARTIFACT" /usr/local/bin/pdftract
          chmod +x /usr/local/bin/pdftract
          echo "pdftract binary installed from artifact"
        else
          echo "WARNING: pdftract binary not found in artifacts, using PATH"
        fi

        # Fail fast: benchmarking without the tool under test is meaningless.
        if ! command -v pdftract &> /dev/null; then
          echo "ERROR: pdftract not found in PATH, benchmarks cannot run" >&2
          exit 1
        fi
        pdftract --version || echo "WARNING: pdftract --version failed"

        # Get baseline from main branch; the ref may exist only as a
        # remote-tracking branch depending on how the repo was checked out.
        echo "=== Fetching baseline from main branch ==="
        mkdir -p /tmp/baseline
        BASELINE=""
        for BASE_REF in main origin/main; do
          if git show "$BASE_REF:benches/baselines/main.json" > /tmp/baseline/main.json 2>/dev/null; then
            BASELINE="/tmp/baseline/main.json"
            echo "Baseline loaded from main branch"
            break
          fi
        done
        if [ -z "$BASELINE" ]; then
          echo "WARNING: Could not fetch baseline from main, using local file"
          BASELINE="benches/baselines/main.json"
        fi
        export BASELINE

        # Run benchmarks
        echo "=== Running competitive benchmarks ==="
        cd benches/competitors

        # Output paths, exported for the benchmark script
        export OUTPUT="/tmp/benchmark-results.json"
        export COMMENT="/tmp/benchmark-comment.md"

        # Exit code 1 from the script is reported as a gate failure; any other
        # non-zero code as an execution failure. Both fail the step.
        bash run-benchmarks.sh || {
          EXIT_CODE=$?
          if [ $EXIT_CODE -eq 1 ]; then
            echo "ERROR: Benchmark gates failed!"
          else
            echo "ERROR: Benchmark execution failed with code $EXIT_CODE"
          fi
          exit 1
        }

        # Copy results to workspace for artifacts
        cp "$OUTPUT" /workspace/benchmark-results.json
        cp "$COMMENT" /workspace/benchmark-comment.md

        echo "=== Benchmark complete ==="
        echo "Results:"
        echo " pdftract results: $(jq -r '[.[] | select(.tool == "pdftract") | .mean_ms] | length' "$OUTPUT")"
        echo " pdfminer results: $(jq -r '[.[] | select(.tool == "pdfminer") | .mean_ms] | length' "$OUTPUT")"

        echo "=== All gates passed ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 2000m
        memory: 4Gi
      limits:
        cpu: 4000m
        memory: 8Gi
  outputs:
    artifacts:
      - name: benchmark-results
        path: /workspace/benchmark-results.json
      - name: benchmark-comment
        path: /workspace/benchmark-comment.md
|
|
|
|
# === Benchmark PR Comment ===
# Posts the Markdown benchmark summary as a comment on the pull request named
# by the pr-number workflow parameter.
#
# Implemented with the Python standard library: the previous debian:12 image
# provides neither curl nor jq, and `curl -s` without --fail reported success
# even when GitHub rejected the request.
#
# Input artifact:
#   benchmark-comment - Markdown file to post
# Environment:
#   GH_TOKEN - API token read from the github-webhook-secret secret
- name: benchmark-pr-comment
  inputs:
    artifacts:
      - name: benchmark-comment
        path: /tmp/benchmark-comment.md
  activeDeadlineSeconds: 60
  script:
    image: python:3.11-slim-bookworm
    command: [python3]
    source: |
      import json
      import os
      import sys
      import urllib.error
      import urllib.request

      pr_number = "{{workflow.parameters.pr-number}}"
      comment_file = "/tmp/benchmark-comment.md"

      print(f"=== Posting benchmark comment to PR #{pr_number} ===")

      # The number is interpolated into the URL path, so accept digits only.
      if not pr_number.isdigit():
          print(f"ERROR: Invalid pull request number: {pr_number!r}", file=sys.stderr)
          sys.exit(1)

      if not os.path.isfile(comment_file):
          print("ERROR: Benchmark comment file not found", file=sys.stderr)
          sys.exit(1)

      token = os.environ.get("GH_TOKEN", "")
      if not token:
          print("ERROR: GH_TOKEN is not set", file=sys.stderr)
          sys.exit(1)

      with open(comment_file, encoding="utf-8") as handle:
          comment_body = handle.read()

      # json.dumps performs the escaping the old jq -R -s pipeline did.
      request = urllib.request.Request(
          f"https://api.github.com/repos/jedarden/pdftract/issues/{pr_number}/comments",
          data=json.dumps({"body": comment_body}).encode("utf-8"),
          method="POST",
          headers={
              "Authorization": f"token {token}",
              "Accept": "application/vnd.github.v3+json",
              "Content-Type": "application/json",
          },
      )

      try:
          with urllib.request.urlopen(request, timeout=30) as response:
              print(f"GitHub API responded with HTTP {response.status}")
      except urllib.error.HTTPError as error:
          # Non-2xx responses must fail the step instead of passing silently.
          detail = error.read().decode("utf-8", errors="replace")
          print(f"ERROR: GitHub API returned HTTP {error.code}: {detail}", file=sys.stderr)
          sys.exit(1)
      except urllib.error.URLError as error:
          print(f"ERROR: Could not reach GitHub API: {error.reason}", file=sys.stderr)
          sys.exit(1)

      print("=== Benchmark comment posted successfully ===")
    env:
      - name: GH_TOKEN
        valueFrom:
          secretKeyRef:
            name: github-webhook-secret
            key: token
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
|
|
|
|
# === Regression Corpus ===
# Builds the cer-diff tool, then runs eight regression shards (indices 0-7) in
# parallel, each comparing pdftract output against stored baselines.
#
# Input artifact:
#   pdftract-binary - pdftract executable under test. It must be supplied by
#                     the calling task: from inside this template the tasks of
#                     other templates (such as the build matrix) are not
#                     addressable.
#
# An exit handler cannot be declared on a DAG template (there is no dag.onExit
# field); attach the report template as an exit hook on the task that invokes
# this template, or at workflow level.
- name: regression-corpus
  inputs:
    artifacts:
      - name: pdftract-binary
  # build-cer-diff may take up to 300s and each shard up to 360s, so the
  # enclosing deadline has to exceed their sum.
  activeDeadlineSeconds: 900
  dag:
    tasks:
      - name: build-cer-diff
        template: build-cer-diff
      - name: regression-shards
        template: regression-shard
        dependencies: [build-cer-diff]
        withSequence:
          start: "0"
          end: "7"
        arguments:
          parameters:
            - name: shard-index
              value: "{{item}}"
            - name: shard-total
              value: "8"
          artifacts:
            - name: pdftract-binary
              from: "{{inputs.artifacts.pdftract-binary}}"
|
|
|
|
# === Build CER Diff Tool ===
# Builds the cer-diff release binary from the pdftract-cer-diff package and
# publishes it on the shared-artifacts volume as /shared/cer-diff.
- name: build-cer-diff
  activeDeadlineSeconds: 300
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        echo "=== Building cer-diff tool ==="
        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-cer-diff"
        cargo build --release --bin cer-diff --package pdftract-cer-diff --locked
        # CARGO_TARGET_DIR redirects build output, so the binary is under that
        # directory rather than ./target.
        cp "$CARGO_TARGET_DIR/release/cer-diff" /shared/cer-diff
        echo "=== cer-diff binary ready ==="
        ls -lh /shared/cer-diff
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
      - name: shared-artifacts
        mountPath: /shared
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
|
|
|
|
# === Regression Shard ===
# Processes one contiguous slice of the regression corpus.
#
# Inputs:
#   shard-index     - zero-based index of this shard
#   shard-total     - total number of shards
#   pdftract-binary - pdftract executable under test (artifact)
# Output artifact:
#   shard-results   - JSONL file with one result object per document
#
# Modes (regression-mode workflow parameter):
#   gate   - compare against the stored baseline with cer-diff (threshold
#            0.005); any failed document fails the shard
#   update - upload the current extraction as the new baseline
#
# Scratch files live in a private temp directory because the pod runs as a
# non-root user that cannot write to /usr/local/bin or an arbitrary cwd.
- name: regression-shard
  inputs:
    parameters:
      - name: shard-index
      - name: shard-total
    artifacts:
      - name: pdftract-binary
        path: /tmp/pdftract-binary
  activeDeadlineSeconds: 360
  container:
    image: pdftract-test-glibc:1.78
    command: [bash, -c]
    args:
      - |
        set -eo pipefail

        SHARD_INDEX="{{inputs.parameters.shard-index}}"
        SHARD_TOTAL="{{inputs.parameters.shard-total}}"
        THRESHOLD="0.005"
        REGRESSION_MODE="{{workflow.parameters.regression-mode}}"

        WORK_DIR="$(mktemp -d)"
        PDFTRACT_BIN="$WORK_DIR/pdftract-x86_64-unknown-linux-musl"
        CER_DIFF="/shared/cer-diff"
        SHARD_RESULTS="/regression/results/shard-${SHARD_INDEX}.jsonl"

        echo "=========================================="
        echo "Regression Shard: $SHARD_INDEX / $SHARD_TOTAL"
        echo "Mode: $REGRESSION_MODE"
        echo "=========================================="

        # Configure AWS CLI for ARMOR proxy
        # NOTE(review): update mode uploads baselines, but the credentials
        # come from a secret named b2-readonly; confirm it permits writes.
        export AWS_ACCESS_KEY_ID="$ARMOR_ACCESS_KEY_ID"
        export AWS_SECRET_ACCESS_KEY="$ARMOR_SECRET_ACCESS_KEY"
        export AWS_ENDPOINT_URL="http://armor.armor.svc.cluster.local:9000"

        # Install the pdftract binary from the input artifact
        echo "=== Downloading pdftract binary ==="
        PDFTRACT_ARTIFACT="/tmp/pdftract-binary"
        if [ -f "$PDFTRACT_ARTIFACT" ]; then
          cp "$PDFTRACT_ARTIFACT" "$PDFTRACT_BIN"
          chmod +x "$PDFTRACT_BIN"
          echo "Binary downloaded from artifact"
        else
          echo "ERROR: pdftract binary not found in artifacts"
          exit 1
        fi

        # A binary that cannot even report its version would only produce
        # misleading extraction failures for every document below.
        if "$PDFTRACT_BIN" --version; then
          echo "Binary check passed"
        else
          echo "ERROR: pdftract binary is not runnable" >&2
          exit 1
        fi

        # cer-diff is used in place from the shared volume
        if [ ! -x "$CER_DIFF" ]; then
          echo "ERROR: cer-diff not found or not executable at $CER_DIFF" >&2
          exit 1
        fi
        "$CER_DIFF" --help || true

        # Create output directory; start from an empty shard file so a retried
        # pod does not append to results of an earlier attempt.
        mkdir -p /regression/results
        : > "$SHARD_RESULTS"

        # Appends one JSON line to this shard's result file.
        record_result() {
          echo "$1" >> "$SHARD_RESULTS"
        }

        # List corpus files for this shard
        echo "=== Fetching corpus document list ==="
        aws s3 ls --endpoint-url="$AWS_ENDPOINT_URL" "s3://pdftract-regression-corpus/v1/" | \
          awk '{print $NF}' | grep '\.pdf$' > "$WORK_DIR/all_docs.txt"

        TOTAL_DOCS=$(wc -l < "$WORK_DIR/all_docs.txt")
        echo "Total documents in corpus: $TOTAL_DOCS"

        # Calculate shard boundaries (ceiling division; sed lines are 1-based)
        DOCS_PER_SHARD=$(( (TOTAL_DOCS + SHARD_TOTAL - 1) / SHARD_TOTAL ))
        START_LINE=$((SHARD_INDEX * DOCS_PER_SHARD + 1))
        END_LINE=$((START_LINE + DOCS_PER_SHARD - 1))

        echo "Shard $SHARD_INDEX: processing lines $START_LINE to $END_LINE"

        # Extract shard documents
        sed -n "${START_LINE},${END_LINE}p" "$WORK_DIR/all_docs.txt" > "$WORK_DIR/shard_docs.txt"
        SHARD_DOC_COUNT=$(wc -l < "$WORK_DIR/shard_docs.txt")
        echo "Documents in this shard: $SHARD_DOC_COUNT"

        # Process each document
        PASS_COUNT=0
        FAIL_COUNT=0
        PROCESSED=0

        # The list is read on fd 3 so commands inside the loop cannot consume
        # it through stdin.
        while IFS= read -r pdf_name <&3; do
          [ -z "$pdf_name" ] && continue
          PROCESSED=$((PROCESSED + 1))

          SHA256="${pdf_name%.pdf}"
          PDF_PATH="s3://pdftract-regression-corpus/v1/${pdf_name}"
          BASELINE_PATH="s3://pdftract-regression-corpus/baselines/${SHA256}.json"
          LOCAL_PDF="$WORK_DIR/${pdf_name}"
          ACTUAL_JSON="$WORK_DIR/actual.json"
          BASELINE_JSON="$WORK_DIR/baseline.json"

          echo "[$PROCESSED/$SHARD_DOC_COUNT] Processing: $pdf_name"

          # Download PDF; a document that cannot be fetched counts as failed
          # instead of silently disappearing from the results.
          if ! aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$PDF_PATH" "$LOCAL_PDF"; then
            echo "ERROR: Failed to download PDF: $pdf_name"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            record_result "{\"sha\":\"$SHA256\",\"pass\":false,\"note\":\"download_failed\"}"
            continue
          fi

          # Run pdftract extraction; a failed extraction is a regression too.
          if ! "$PDFTRACT_BIN" extract --json --pages all "$LOCAL_PDF" > "$ACTUAL_JSON" 2>/dev/null; then
            echo "ERROR: Extraction failed for: $pdf_name"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            record_result "{\"sha\":\"$SHA256\",\"pass\":false,\"note\":\"extraction_failed\"}"
            rm -f "$LOCAL_PDF" "$ACTUAL_JSON"
            continue
          fi

          # Fetch or compute baseline
          if [ "$REGRESSION_MODE" = "update" ]; then
            # Update mode: save current output as new baseline
            aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$ACTUAL_JSON" "$BASELINE_PATH"
            PASS_COUNT=$((PASS_COUNT + 1))
            RESULT="{\"sha\":\"$SHA256\",\"cer_delta\":0.0,\"pass\":true,\"mode\":\"update\"}"
          elif ! aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$BASELINE_PATH" "$BASELINE_JSON" 2>/dev/null; then
            # Gate mode without a baseline: recorded as a pass with a note
            echo "WARN: No baseline found for: $pdf_name (new corpus doc?)"
            PASS_COUNT=$((PASS_COUNT + 1))
            RESULT="{\"sha\":\"$SHA256\",\"cer_delta\":0.0,\"pass\":true,\"note\":\"no_baseline\"}"
          else
            # Gate mode: compute CER. The call sits inside `if` because under
            # `set -e` a bare non-zero exit would abort the script before the
            # failure could be counted.
            if CER_OUTPUT=$("$CER_DIFF" --sha "$SHA256" "$ACTUAL_JSON" "$BASELINE_JSON" --threshold "$THRESHOLD"); then
              PASS_COUNT=$((PASS_COUNT + 1))
            else
              FAIL_COUNT=$((FAIL_COUNT + 1))
            fi
            RESULT="$CER_OUTPUT"
            if [ -z "$RESULT" ]; then
              RESULT="{\"sha\":\"$SHA256\",\"pass\":false,\"note\":\"cer_diff_no_output\"}"
            fi
          fi

          # Write result to JSONL
          record_result "$RESULT"

          # Cleanup
          rm -f "$LOCAL_PDF" "$ACTUAL_JSON" "$BASELINE_JSON"
        done 3< "$WORK_DIR/shard_docs.txt"

        echo "=========================================="
        echo "Shard $SHARD_INDEX complete"
        echo "Processed: $PROCESSED"
        echo "Passed: $PASS_COUNT"
        echo "Failed: $FAIL_COUNT"
        echo "=========================================="

        # Merge shard results into main output
        # NOTE(review): all shards append to this one file concurrently;
        # confirm the shared volume tolerates that.
        if [ -s "$SHARD_RESULTS" ]; then
          cat "$SHARD_RESULTS" >> "/regression/regression-results.jsonl"
        fi

        # Fail shard if any document failed in gate mode
        if [ "$FAIL_COUNT" -gt 0 ] && [ "$REGRESSION_MODE" = "gate" ]; then
          echo "ERROR: $FAIL_COUNT documents exceeded CER threshold"
          exit 1
        fi
    env:
      - name: ARMOR_ACCESS_KEY_ID
        valueFrom:
          secretKeyRef:
            name: b2-readonly
            key: access-key-id
            optional: true
      - name: ARMOR_SECRET_ACCESS_KEY
        valueFrom:
          secretKeyRef:
            name: b2-readonly
            key: secret-access-key
            optional: true
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: shared-artifacts
        mountPath: /shared
      - name: regression-results
        mountPath: /regression
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
  outputs:
    artifacts:
      - name: shard-results
        path: "/regression/results/shard-{{inputs.parameters.shard-index}}.jsonl"
        optional: true
|
|
|
|
# === Regression Corpus Exit Handler ===
# Prints the commit and regression mode, then, if the merged results file
# exists on the regression-results volume, its line count and first five
# lines. The merged file is also exported as an optional artifact.
- name: regression-corpus-exit
  script:
    image: debian:12
    command:
      - sh
    source: |
      #!/bin/sh
      set -e
      merged="/regression/regression-results.jsonl"

      echo "=== Regression Corpus Exit Report ==="
      echo "Commit: {{workflow.parameters.commit-sha}}"
      echo "Regression mode: {{workflow.parameters.regression-mode}}"
      echo "Results artifacts available from all shards"

      # Nothing more to report when no shard produced results.
      [ -f "$merged" ] || exit 0

      echo "Total results lines: $(wc -l < "$merged")"
      echo "=== Sample results (first 5) ==="
      head -5 "$merged" || true
    volumeMounts:
      - mountPath: /regression
        name: regression-results
    resources:
      requests:
        cpu: 200m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
  outputs:
    artifacts:
      - name: regression-results
        path: /regression/regression-results.jsonl
        optional: true
|
|
|
|
# === Publish If Tag ===
# Placeholder for the release upload step; currently it only prints a message
# and exits successfully. To be filled in by a subsequent Phase 0 bead.
#
# CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
# The build step already uses --locked; this step is meant to upload the
# pre-built binaries to GitHub Releases rather than rebuild them.
- name: publish-if-tag
  activeDeadlineSeconds: 600
  container:
    image: alpine:3.19
    command:
      - sh
      - -c
    args:
      - |
        # Placeholder: publish step
        printf '%s\n' "Publish step - to be implemented by Phase 0 sibling bead"
        exit 0
    env:
      # Token for the GitHub API, read from the github-webhook-secret secret.
      - name: GH_TOKEN
        valueFrom:
          secretKeyRef:
            key: token
            name: github-webhook-secret
    volumeMounts:
      - mountPath: /workspace
        name: workspace
      - mountPath: /cache/cargo
        name: cargo-cache
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
|