pdftract/.ci/argo-workflows/pdftract-ci.yaml
jedarden 3c8ac46a3c feat(pdftract-2w02): implement MSRV gate with CI check
Add quality-matrix implementation to pdftract-ci with msrv-check step
using rust:1.78-slim to detect usage of newer Rust features.

Changes:
- .ci/argo-workflows/pdftract-ci.yaml: Implement quality-matrix DAG with
  msrv-check, clippy-fmt, and cargo-audit templates
- CHANGELOG.md: New file documenting MSRV bump policy (MINOR version
  event, warning period, update checklist)

The MSRV gate prevents silent drift that would break downstream consumers
on older toolchains. Any Rust 1.79+ feature (e.g., inline `const` blocks, core::error::Error)
will fail the msrv-check step, triggering a policy review.

See notes/pdftract-2w02.md for acceptance criteria verification.

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-20 19:03:53 -04:00

1114 lines
38 KiB
YAML

# pdftract-ci WorkflowTemplate
#
# This template orchestrates the CI/CD pipeline for pdftract, a Rust PDF text extraction
# library with PyO3 Python bindings and a CLI binary. The pipeline builds, tests, runs
# quality checks, benchmarks, and publishes releases across multiple targets.
#
# === Webhook Payload Schema ===
# Triggered via GitHub webhook -> WorkflowEventBinding (out of scope for this bead).
# Expected webhook payload schema:
#
# {
# "ref": "refs/heads/main" | "refs/tags/v0.1.0",
# "repository": {
# "full_name": "jedarden/pdftract",
# "html_url": "https://github.com/jedarden/pdftract"
# },
# "head_commit": {
# "id": "abc123...",
# "message": "Commit message"
# },
# "sender": {
# "login": "username"
# }
# }
#
# === Parameter Reference ===
# - commit-sha: Full Git commit SHA (40 hex chars)
# - ref: Git ref (branch: "refs/heads/*", tag: "refs/tags/v*")
# - repo-url: GitHub repository URL
# - is-tag: Boolean ("true" if ref is a tag, "false" otherwise)
#
# === DAG Structure ===
# setup -> [parallel: build-matrix, test-matrix, quality-matrix, bench-matrix] -> publish-if-tag
#
# - setup: Clone repo, fetch dependencies, warm cargo cache
# - build-matrix: Cross-compile for 5 targets (x86_64/aarch64 Linux musl, macOS x64/ARM64, Windows x64)
# - test-matrix: Run unit tests across feature combinations (default, full, with OCR)
# - quality-matrix: Linting (clippy, fmt), security audit (cargo-audit), dependency review
# - bench-matrix: Performance benchmarks (cargo bench) against fixture corpus
# - publish-if-tag: On tags only, upload binaries to GitHub Releases
#
# === Subsequent Phase 0 Beads ===
# Each bead fills in a distinct set of templates without colliding:
# - pdftract-xxxx: setup step, volume mount points, cache warming logic
# - pdftract-yyyy: build-matrix templates (5 target builds with cross)
# - pdftract-zzzz: test-matrix templates (feature combinations)
# - pdftract-wwww: quality-matrix templates (clippy, fmt, audit)
# - pdftract-vvvv: bench-matrix templates (cargo bench)
# - pdftract-uuuu: publish-if-tag template (gh release create)
#
apiVersion: argoproj.io/v1alpha1
kind: WorkflowTemplate
metadata:
  name: pdftract-ci
  namespace: argo-workflows
  labels:
    app.kubernetes.io/name: pdftract-ci
    app.kubernetes.io/component: ci
    app.kubernetes.io/part-of: pdftract
spec:
  entrypoint: pipeline
  # Workflow-level exit handler. `onExit` is a WorkflowSpec field, so it is
  # declared here; it names the `on-exit` template.
  onExit: on-exit
  serviceAccountName: argo-workflow
  # podGC is an object carrying a `strategy`, not a bare string.
  podGC:
    strategy: OnPodCompletion
  # Retention of finished workflows, in seconds (1800 on success, 7200 on failure).
  ttlStrategy:
    secondsAfterSuccess: 1800
    secondsAfterFailure: 7200
  arguments:
    parameters:
      - name: commit-sha
        value: ""
        description: "Full Git commit SHA (40 hex chars)"
      - name: ref
        value: "refs/heads/main"
        description: "Git ref (branch: 'refs/heads/*', tag: 'refs/tags/v*')"
      - name: repo-url
        value: "https://github.com/jedarden/pdftract.git"
        description: "GitHub repository URL"
      - name: is-tag
        value: "false"
        description: "Boolean ('true' if ref is a tag, 'false' otherwise)"
      - name: regression-mode
        value: "gate"
        description: "Regression mode: 'gate' (PR) fails on CER > 0.5%, 'update' (merge) refreshes baselines"
      - name: pr-number
        value: ""
        description: "Pull request number for posting benchmark comments (empty skips commenting)"
      - name: proptest-seed
        value: ""
        description: "Proptest seed for reproducibility (empty = auto-generate)"
      - name: proptest-cases
        value: "10000"
        description: "Number of proptest cases per module (default: 10000)"
  # Per-workflow PVCs shared by the templates below.
  volumeClaimTemplates:
    # Cargo registry and per-job target directories (mounted at /cache/cargo).
    - metadata:
        name: cargo-cache
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 50Gi
    # Source checkout (mounted at /workspace).
    - metadata:
        name: workspace
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 10Gi
    # Hand-off area for the cer-diff binary (mounted at /shared).
    - metadata:
        name: shared-artifacts
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 1Gi
    # Regression shard results (mounted at /regression).
    - metadata:
        name: regression-results
      spec:
        accessModes: [ReadWriteOnce]
        storageClassName: sata-large
        resources:
          requests:
            storage: 2Gi
  volumes:
    # Registry credentials exposed as a docker config.json file.
    - name: docker-config
      secret:
        secretName: docker-hub-registry
        items:
          - key: .dockerconfigjson
            path: config.json
  podMetadata:
    labels:
      app.kubernetes.io/name: pdftract-ci
      commit-sha: "{{workflow.parameters.commit-sha}}"
  # NOTE(review): pods are forced to run as UID 1000, yet some templates in this
  # file call apt-get, copy into /usr/local/bin and mount the docker config under
  # /root/.docker, which normally need root — confirm this combination works.
  podSpecPatch: |
    imagePullSecrets:
      - name: docker-hub-registry
    securityContext:
      runAsNonRoot: true
      runAsUser: 1000
      fsGroup: 1000
templates:
# === Top-level DAG ===
# Entry point of the workflow. `setup` runs first, the matrices fan out from it,
# and `publish-if-tag` runs last when the workflow was triggered by a tag.
# A DAG template has no `onExit` key of its own, so exit reporting for the
# sub-DAGs is attached per task through `hooks.exit`.
- name: pipeline
  dag:
    tasks:
      - name: setup
        template: setup
      - name: build-matrix
        template: build-matrix
        dependencies: [setup]
        hooks:
          exit:
            template: build-matrix-exit
      - name: test-matrix
        template: test-matrix
        dependencies: [setup]
      - name: quality-matrix
        template: quality-matrix
        dependencies: [setup]
      - name: bench-matrix
        template: bench-matrix
        # Consumes an artifact produced by build-matrix, so it must run after it.
        dependencies: [setup, build-matrix]
        arguments:
          artifacts:
            - name: pdftract-binary
              # NOTE(review): this path reaches into a task nested inside the
              # build-matrix DAG. Argo resolves `tasks.<name>.outputs...` for tasks
              # of the current DAG, so build-matrix probably has to re-export the
              # artifact as a template-level output — TODO confirm and rewire.
              from: "{{tasks.build-matrix.tasks.build-linux-x86_64-musl.outputs.artifacts.pdftract-binary}}"
      - name: benchmark-pr-comment
        template: benchmark-pr-comment
        dependencies: [bench-matrix]
        # Both sides are quoted so that an empty pr-number still yields a
        # well-formed expression ("" != "") instead of a dangling operator.
        when: '"{{workflow.parameters.pr-number}}" != ""'
        arguments:
          artifacts:
            - name: benchmark-comment
              from: "{{tasks.bench-matrix.outputs.artifacts.benchmark-comment}}"
      - name: regression-corpus
        template: regression-corpus
        dependencies: [build-matrix]
        hooks:
          exit:
            template: regression-corpus-exit
      - name: publish-if-tag
        template: publish-if-tag
        dependencies: [build-matrix, test-matrix, quality-matrix, bench-matrix, regression-corpus]
        when: "{{workflow.parameters.is-tag}} == true"
# === Exit Handler ===
# Prints a short report naming the workflow, the commit and the ref.
# The script itself does not evaluate the workflow status; it only points the
# reader at the workflow metadata.
- name: on-exit
  # The report is a handful of echo calls, so one minute is ample.
  activeDeadlineSeconds: 60
  script:
    image: alpine:3.19
    command:
      - sh
    source: |
      #!/bin/sh
      set -e
      echo "=== Workflow Exit Report ==="
      echo "Workflow: {{workflow.name}}"
      echo "Commit: {{workflow.parameters.commit-sha}}"
      echo "Ref: {{workflow.parameters.ref}}"
      echo "Status available in workflow metadata"
# === Setup Step ===
# Placeholder for the step that will clone the repository into /workspace and
# warm the cargo cache. For now it only prints what it is expected to do and
# exits successfully; the real implementation comes from a later Phase 0 bead.
#
# CRITICAL: All cargo commands in this workflow MUST use --locked (or --locked --frozen)
# to enforce the workspace Cargo.lock policy. See CONTRIBUTING.md for details.
- name: setup
  activeDeadlineSeconds: 600
  container:
    image: alpine:3.19
    command:
      - sh
      - -c
    args:
      - |
        # Placeholder: clone repo to /workspace, warm cargo cache
        echo "Setup step - to be implemented by Phase 0 sibling bead"
        echo "Should clone {{workflow.parameters.repo-url}} to /workspace"
        echo "Should checkout {{workflow.parameters.commit-sha}}"
        exit 0
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
    # Both volumes are mounted already so the future implementation can use them.
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
# === Build Matrix ===
# Fans out one `build-target` task per release target:
#   x86_64-unknown-linux-musl, aarch64-unknown-linux-musl,
#   x86_64-apple-darwin, aarch64-apple-darwin, x86_64-pc-windows-gnu
# Every task sets `continueOn.failed`, so one failing target does not stop the
# remaining builds or the tasks scheduled after this matrix.
#
# `onExit` is not a field of a DAG template and has been removed from `dag:`;
# an exit report for this matrix has to be attached by whoever invokes the
# template (task-level exit hook).
# NOTE(review): the per-target binaries are outputs of the nested tasks only;
# callers that need one will probably require a template-level `outputs`
# export here — TODO confirm.
- name: build-matrix
  activeDeadlineSeconds: 3600
  dag:
    tasks:
      - name: build-linux-x86_64-musl
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "x86_64-unknown-linux-musl"
            - name: cross-image
              value: "ghcr.io/cross-rs/x86_64-unknown-linux-musl:main"
            - name: strip-cmd
              value: "x86_64-linux-musl-strip"
            - name: ext
              value: ""
        continueOn:
          failed: true
      - name: build-linux-aarch64-musl
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "aarch64-unknown-linux-musl"
            - name: cross-image
              value: "ghcr.io/cross-rs/aarch64-unknown-linux-musl:main"
            - name: strip-cmd
              value: "aarch64-linux-musl-strip"
            - name: ext
              value: ""
        continueOn:
          failed: true
      # NOTE(review): confirm that the two apple-darwin images below are
      # actually published under ghcr.io/cross-rs.
      - name: build-darwin-x86_64
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "x86_64-apple-darwin"
            - name: cross-image
              value: "ghcr.io/cross-rs/x86_64-apple-darwin:main"
            - name: strip-cmd
              value: "x86_64-apple-darwin-strip"
            - name: ext
              value: ""
        continueOn:
          failed: true
      - name: build-darwin-aarch64
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "aarch64-apple-darwin"
            - name: cross-image
              value: "ghcr.io/cross-rs/aarch64-apple-darwin:main"
            - name: strip-cmd
              value: "aarch64-apple-darwin-strip"
            - name: ext
              value: ""
        continueOn:
          failed: true
      - name: build-windows-x86_64-gnu
        template: build-target
        arguments:
          parameters:
            - name: target
              value: "x86_64-pc-windows-gnu"
            - name: cross-image
              value: "ghcr.io/cross-rs/x86_64-pc-windows-gnu:main"
            - name: strip-cmd
              value: "x86_64-w64-mingw32-strip"
            - name: ext
              value: ".exe"
        continueOn:
          failed: true
# === Build Target Template ===
# Builds the release binary for one target, strips it, and publishes it as the
# `pdftract-binary` output artifact.
#
# Inputs:
#   target      - Rust target triple passed to `cross build --target`
#   cross-image - container image the build runs in
#   strip-cmd   - strip executable for the target (a strip failure is only a warning)
#   ext         - binary file extension ("" or ".exe")
#
# NOTE(review): the script invokes `cross` inside the cross-rs target image;
# confirm those images really provide `cross` and a Rust toolchain.
- name: build-target
  inputs:
    parameters:
      - name: target
      - name: cross-image
      - name: strip-cmd
      - name: ext
  activeDeadlineSeconds: 3600
  container:
    image: "{{inputs.parameters.cross-image}}"
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        TARGET="{{inputs.parameters.target}}"
        STRIP_CMD="{{inputs.parameters.strip-cmd}}"
        EXT="{{inputs.parameters.ext}}"
        echo "=========================================="
        echo "Building pdftract for target: $TARGET"
        echo "=========================================="
        cd /workspace
        # Set reproducible build timestamp (falls back to 0 when git is unavailable)
        export SOURCE_DATE_EPOCH=$(git log -1 --format=%ct 2>/dev/null || echo 0)
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-$TARGET"
        echo "SOURCE_DATE_EPOCH=$SOURCE_DATE_EPOCH"
        echo "CARGO_HOME=$CARGO_HOME"
        echo "CARGO_TARGET_DIR=$CARGO_TARGET_DIR"
        echo "=== Running cargo build with cross ==="
        cross build --release --target "$TARGET" --locked --features default,serve,decrypt
        # CARGO_TARGET_DIR redirects build output away from ./target, so the
        # binary has to be looked up under that directory.
        RELEASE_DIR="$CARGO_TARGET_DIR/$TARGET/release"
        BINARY_PATH="$RELEASE_DIR/pdftract$EXT"
        if [ ! -f "$BINARY_PATH" ]; then
          echo "ERROR: Binary not found at $BINARY_PATH" >&2
          echo "Contents of target directory:"
          ls -la "$RELEASE_DIR/" || true
          exit 1
        fi
        echo "=== Binary size before strip ==="
        ls -lh "$BINARY_PATH"
        echo "=== Stripping binary ==="
        # Best effort: an unstripped binary is still a usable artifact.
        "$STRIP_CMD" "$BINARY_PATH" || {
          echo "WARNING: Strip command failed, continuing with unstripped binary" >&2
        }
        echo "=== Binary size after strip ==="
        ls -lh "$BINARY_PATH"
        mkdir -p /artifacts
        cp "$BINARY_PATH" "/artifacts/pdftract-$TARGET$EXT"
        echo "=== Final artifact ==="
        ls -lh /artifacts/
        # GNU stat first, BSD stat as fallback.
        SIZE=$(stat -c%s "/artifacts/pdftract-$TARGET$EXT" 2>/dev/null || stat -f%z "/artifacts/pdftract-$TARGET$EXT")
        echo "Binary size: $SIZE bytes"
        # 4194304 bytes = 4 MiB; exceeding the budget only warns, it does not fail.
        if [ "$SIZE" -gt 4194304 ]; then
          echo "WARNING: Binary exceeds 4 MB budget ($SIZE bytes)"
        else
          echo "Binary within 4 MB budget"
        fi
        echo "=== Build complete ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
      - name: docker-config
        mountPath: /root/.docker
    resources:
      requests:
        cpu: 2000m
        memory: 4Gi
      limits:
        cpu: 4000m
        memory: 8Gi
  outputs:
    artifacts:
      - name: pdftract-binary
        path: /artifacts/pdftract-{{inputs.parameters.target}}{{inputs.parameters.ext}}
# === Build Matrix Exit Handler ===
# Prints a fixed closing report for the build matrix together with the commit SHA.
# The script does not inspect the build results.
- name: build-matrix-exit
  script:
    image: alpine:3.19
    command:
      - sh
    source: |
      #!/bin/sh
      echo "=== Build Matrix Exit Report ==="
      echo "Commit: {{workflow.parameters.commit-sha}}"
      echo "All binaries available as artifacts"
# === Test Matrix ===
# Runs, in order:
#   1. unit tests with default features
#   2. unit tests with all features
#   3. property tests through cargo-nextest (feature `proptest`, nextest
#      profile `ci-proptest`)
# PROPTEST_SEED / PROPTEST_CASES are exported from the workflow parameters
# `proptest-seed` (generated when empty) and `proptest-cases`.
#
# CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
- name: test-matrix
  activeDeadlineSeconds: 3600
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        echo "=========================================="
        echo "Test Matrix"
        echo "=========================================="
        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-test"
        # Set proptest seed for reproducibility
        SEED="{{workflow.parameters.proptest-seed}}"
        if [ -z "$SEED" ]; then
          SEED=$(date +%s%N | sha256sum | head -c 16)
          echo "Generated proptest seed: $SEED"
        else
          echo "Using provided proptest seed: $SEED"
        fi
        export PROPTEST_SEED="$SEED"
        # Set proptest case count
        CASES="{{workflow.parameters.proptest-cases}}"
        echo "Proptest cases per module: $CASES"
        export PROPTEST_CASES="$CASES"
        echo "=== Running unit tests (default features) ==="
        cargo test --locked --lib --bins
        echo "=== Running unit tests (all features) ==="
        cargo test --locked --all-features --lib --bins
        echo "=== Running property tests (proptest) ==="
        echo "Seed: $PROPTEST_SEED | Cases: $PROPTEST_CASES"
        # --locked added to follow the Cargo.lock policy stated above.
        # The former `--proptest` argument was dropped: it is not a cargo-nextest
        # option; the property tests are selected through the `proptest` feature.
        # NOTE(review): cargo-nextest is not installed by this script — confirm
        # it is available in the image or the cargo cache.
        cargo nextest run --locked --features proptest --profile=ci-proptest || {
          EXIT_CODE=$?
          echo "ERROR: Property tests failed!"
          echo "Check proptest-regressions/ for new minimal counterexamples"
          exit "$EXIT_CODE"
        }
        echo "=== All tests passed ==="
        echo "Unit tests: PASS"
        echo "Property tests: PASS ($CASES cases per module)"
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 2000m
        memory: 4Gi
      limits:
        cpu: 4000m
        memory: 8Gi
# === Quality Matrix ===
# Groups the three quality gates. The tasks declare no dependencies on each
# other, so the DAG is free to run them concurrently:
#   - msrv-check:  build with rust:1.78-slim to detect new-Rust feature usage
#   - clippy-fmt:  linting (clippy) and formatting (fmt)
#   - cargo-audit: security audit of the dependencies
#
# CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
- name: quality-matrix
  activeDeadlineSeconds: 900
  dag:
    tasks:
      - name: msrv-check
        template: msrv-check
      - name: clippy-fmt
        template: clippy-fmt
      - name: cargo-audit
        template: cargo-audit
# === Clippy and Fmt Check ===
# Lints every target with all features (warnings are errors) and verifies that
# the sources are formatted.
- name: clippy-fmt
  activeDeadlineSeconds: 600
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        echo "=========================================="
        echo "Clippy and Format Check"
        echo "=========================================="
        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-clippy"
        # The official rust image installs the toolchain with rustup's minimal
        # profile, which ships neither clippy nor rustfmt. Adding a component
        # that is already present is a no-op.
        echo "=== Ensuring clippy and rustfmt components ==="
        rustup component add clippy rustfmt
        echo "=== Running clippy with MSRV = 1.78 ==="
        cargo clippy --locked --all-targets --all-features -- -D warnings
        echo "=== Running fmt check ==="
        cargo fmt --check
        echo "=== Clippy and fmt checks passed ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
# === MSRV Check ===
# Compiles the whole workspace (default features, locked dependencies) inside
# the rust:1.78-slim image. Code that needs a newer compiler fails to build
# here, which is how drift past the minimum supported Rust version is caught.
- name: msrv-check
  activeDeadlineSeconds: 600
  container:
    image: rust:1.78-slim
    command:
      - bash
      - -c
    args:
      - |
        set -eo pipefail
        echo "=========================================="
        echo "MSRV Check (Rust 1.78)"
        echo "=========================================="
        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-msrv"
        echo "=== Building with Rust 1.78 (MSRV) ==="
        rustc --version
        # Build workspace with default features to catch MSRV violations
        cargo build --workspace --features default --locked
        echo "=== MSRV check passed ==="
        echo "No Rust 1.79+ features detected"
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
# === Cargo Audit ===
# Runs cargo-audit against the workspace to check the dependencies for known
# security vulnerabilities, installing the tool into the cargo cache when it is
# not there yet.
- name: cargo-audit
  activeDeadlineSeconds: 300
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        echo "=========================================="
        echo "Security Audit (cargo-audit)"
        echo "=========================================="
        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        # `cargo install` places binaries in $CARGO_HOME/bin. That directory is
        # put on PATH so the presence check below finds a previously cached
        # cargo-audit instead of rebuilding it on every run.
        export PATH="$CARGO_HOME/bin:$PATH"
        # Install cargo-audit if not present
        if ! command -v cargo-audit &> /dev/null; then
          echo "Installing cargo-audit..."
          cargo install cargo-audit --locked
        fi
        echo "=== Running cargo audit ==="
        # cargo-audit reads Cargo.lock directly rather than resolving
        # dependencies. NOTE(review): `--locked` was dropped because it does not
        # appear to be a cargo-audit option — confirm against `cargo audit --help`.
        cargo audit
        echo "=== Security audit passed ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
# === Bench Matrix ===
# Competitive benchmarks: pdftract vs pdfminer.six, pypdf, pdfplumber
# Runs hyperfine against 50-PDF corpus (25 vector + 25 raster)
# Enforces regression gate (>10%) and 10x-faster gate (vs pdfminer)
#
# Input artifact:
#   pdftract-binary - binary under test, installed as /usr/local/bin/pdftract
# Output artifacts:
#   benchmark-results - JSON written by run-benchmarks.sh ($OUTPUT)
#   benchmark-comment - Markdown summary written by run-benchmarks.sh ($COMMENT)
# The gates themselves live in benches/competitors/run-benchmarks.sh; this
# template fails whenever that script exits non-zero.
- name: bench-matrix
  inputs:
    artifacts:
      - name: pdftract-binary
        path: /tmp/pdftract-binary
  activeDeadlineSeconds: 3600
  container:
    image: python:3.11-slim-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        echo "=========================================="
        echo "Competitive Benchmark Matrix"
        echo "=========================================="
        cd /workspace
        # Install hyperfine, jq and git (git is needed for the baseline lookup
        # below and is not part of the slim image)
        echo "=== Installing hyperfine ==="
        apt-get update -qq
        apt-get install -y hyperfine jq git
        # The checkout may be owned by a different user than the one running git.
        git config --global --add safe.directory /workspace || true
        # Install competitor tools
        echo "=== Installing competitor tools ==="
        pip install --no-cache-dir -r benches/competitors/requirements.txt
        # Get pdftract binary from build-matrix artifact
        echo "=== Installing pdftract binary ==="
        PDFTRACT_ARTIFACT="/tmp/pdftract-binary"
        if [ -f "$PDFTRACT_ARTIFACT" ]; then
          cp "$PDFTRACT_ARTIFACT" /usr/local/bin/pdftract
          chmod +x /usr/local/bin/pdftract
          echo "pdftract binary installed from artifact"
        else
          echo "WARNING: pdftract binary not found in artifacts, using PATH"
        fi
        # Verify pdftract is available
        if ! command -v pdftract &> /dev/null; then
          echo "WARNING: pdftract not found in PATH, benchmarks will fail"
        else
          pdftract --version || echo "WARNING: pdftract --version failed"
        fi
        # Get baseline from main branch
        echo "=== Fetching baseline from main branch ==="
        mkdir -p /tmp/baseline
        if git show main:benches/baselines/main.json > /tmp/baseline/main.json 2>/dev/null; then
          export BASELINE="/tmp/baseline/main.json"
          echo "Baseline loaded from main branch"
        else
          echo "WARNING: Could not fetch baseline from main, using local file"
          # Absolute path: the working directory changes before the benchmark
          # script reads $BASELINE.
          export BASELINE="/workspace/benches/baselines/main.json"
        fi
        # Run benchmarks
        echo "=== Running competitive benchmarks ==="
        cd benches/competitors
        # Set output paths
        export OUTPUT="/tmp/benchmark-results.json"
        export COMMENT="/tmp/benchmark-comment.md"
        # Run the benchmark script; any non-zero exit fails the step with code 1.
        bash run-benchmarks.sh || {
          EXIT_CODE=$?
          if [ $EXIT_CODE -eq 1 ]; then
            echo "ERROR: Benchmark gates failed!"
          else
            echo "ERROR: Benchmark execution failed with code $EXIT_CODE"
          fi
          exit 1
        }
        # Copy results to workspace for artifacts
        cp "$OUTPUT" /workspace/benchmark-results.json
        cp "$COMMENT" /workspace/benchmark-comment.md
        echo "=== Benchmark complete ==="
        echo "Results:"
        jq -r '[.[] | select(.tool == "pdftract") | .mean_ms] | length' "$OUTPUT" | xargs -I {} echo " pdftract results: {}"
        jq -r '[.[] | select(.tool == "pdfminer") | .mean_ms] | length' "$OUTPUT" | xargs -I {} echo " pdfminer results: {}"
        echo "=== All gates passed ==="
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
    resources:
      requests:
        cpu: 2000m
        memory: 4Gi
      limits:
        cpu: 4000m
        memory: 8Gi
  outputs:
    artifacts:
      - name: benchmark-results
        path: /workspace/benchmark-results.json
      - name: benchmark-comment
        path: /workspace/benchmark-comment.md
# === Benchmark PR Comment ===
# Posts the benchmark summary (input artifact `benchmark-comment`) as a comment
# on pull request `pr-number` of jedarden/pdftract through the GitHub REST API,
# authenticating with GH_TOKEN.
#
# Implemented with the Python standard library only: the previous debian:12
# shell version relied on curl and jq, which that base image does not ship, and
# its `curl -s` call reported success even when the API rejected the request.
# Here any HTTP or network error fails the step.
- name: benchmark-pr-comment
  inputs:
    artifacts:
      - name: benchmark-comment
        path: /tmp/benchmark-comment.md
  activeDeadlineSeconds: 60
  script:
    image: python:3.11-slim-bookworm
    command: [python]
    source: |
      import json
      import os
      import sys
      import urllib.error
      import urllib.request

      pr_number = "{{workflow.parameters.pr-number}}"
      comment_file = "/tmp/benchmark-comment.md"

      print(f"=== Posting benchmark comment to PR #{pr_number} ===")

      # Read comment content
      if not os.path.isfile(comment_file):
          print("ERROR: Benchmark comment file not found")
          sys.exit(1)
      with open(comment_file, encoding="utf-8") as handle:
          comment_body = handle.read()

      # Post comment via GitHub API; json.dumps takes care of all escaping.
      request = urllib.request.Request(
          f"https://api.github.com/repos/jedarden/pdftract/issues/{pr_number}/comments",
          data=json.dumps({"body": comment_body}).encode("utf-8"),
          headers={
              "Authorization": f"token {os.environ['GH_TOKEN']}",
              "Accept": "application/vnd.github.v3+json",
              "Content-Type": "application/json",
              "User-Agent": "pdftract-ci",
          },
          method="POST",
      )
      try:
          with urllib.request.urlopen(request, timeout=30) as response:
              print(f"GitHub API responded with HTTP {response.status}")
      except urllib.error.HTTPError as error:
          detail = error.read().decode("utf-8", "replace")
          print(f"ERROR: GitHub API returned HTTP {error.code}: {detail}")
          sys.exit(1)
      except urllib.error.URLError as error:
          print(f"ERROR: Could not reach GitHub API: {error.reason}")
          sys.exit(1)

      print("=== Benchmark comment posted successfully ===")
    env:
      - name: GH_TOKEN
        valueFrom:
          secretKeyRef:
            name: github-webhook-secret
            key: token
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
# === Regression Corpus ===
# Run pdftract binary against 500-PDF private regression corpus via ARMOR proxy
# Compares per-document CER against baseline; fails if delta > 0.5%
#
# Flow: build the cer-diff tool once, then fan out eight `regression-shard`
# tasks (shard-index 0..7, shard-total 8).
#
# `onExit` is not a field of a DAG template and has been removed from `dag:`;
# the exit report has to be attached by whoever invokes this template
# (task-level exit hook).
- name: regression-corpus
  activeDeadlineSeconds: 600
  dag:
    tasks:
      - name: build-cer-diff
        template: build-cer-diff
      - name: regression-shards
        template: regression-shard
        dependencies: [build-cer-diff]
        withSequence:
          start: "0"
          end: "7"
        arguments:
          parameters:
            - name: shard-index
              value: "{{item}}"
            - name: shard-total
              value: "8"
          artifacts:
            - name: pdftract-binary
              # NOTE(review): `build-matrix` is not a task of this DAG, so this
              # reference is out of scope here. The binary most likely has to
              # arrive as an input artifact of this template and be forwarded
              # via `{{inputs.artifacts.pdftract-binary}}` — TODO confirm and rewire.
              from: "{{tasks.build-matrix.tasks.build-linux-x86_64-musl.outputs.artifacts.pdftract-binary}}"
# === Build CER Diff Tool ===
# Builds the `cer-diff` binary (package pdftract-cer-diff) in release mode and
# copies it to /shared/cer-diff on the shared-artifacts volume.
- name: build-cer-diff
  activeDeadlineSeconds: 300
  container:
    image: rust:1.83-bookworm
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        echo "=== Building cer-diff tool ==="
        cd /workspace
        export CARGO_HOME="/cache/cargo/registry"
        export CARGO_TARGET_DIR="/cache/cargo/target-cer-diff"
        cargo build --release --bin cer-diff --package pdftract-cer-diff --locked
        # CARGO_TARGET_DIR redirects build output away from ./target, so the
        # binary is copied from that directory.
        cp "$CARGO_TARGET_DIR/release/cer-diff" /shared/cer-diff
        echo "=== cer-diff binary ready ==="
        ls -lh /shared/cer-diff
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo
      - name: shared-artifacts
        mountPath: /shared
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
# === Regression Shard ===
# Processes one contiguous slice of the regression corpus.
#
# Inputs:
#   shard-index     - zero-based index of this shard
#   shard-total     - number of shards the corpus is split into
#   pdftract-binary - binary under test (artifact)
#
# Per document the script downloads the PDF, runs `pdftract extract`, and then
#   - update mode: uploads the output as the new baseline
#   - gate mode:   compares it with the stored baseline through cer-diff
#                  (threshold 0.005); a missing baseline counts as a pass
# One JSON line per document is appended to
# /regression/results/shard-<index>.jsonl. In gate mode the shard fails when at
# least one document exceeded the threshold. Documents whose download or
# extraction fails are skipped; they are counted and reported but do not change
# the exit status.
- name: regression-shard
  inputs:
    parameters:
      - name: shard-index
      - name: shard-total
    artifacts:
      - name: pdftract-binary
        path: /tmp/pdftract-binary
  activeDeadlineSeconds: 360
  container:
    image: pdftract-test-glibc:1.78
    command: [bash, -c]
    args:
      - |
        set -eo pipefail
        SHARD_INDEX="{{inputs.parameters.shard-index}}"
        SHARD_TOTAL="{{inputs.parameters.shard-total}}"
        THRESHOLD="0.005"
        REGRESSION_MODE="{{workflow.parameters.regression-mode}}"
        echo "=========================================="
        echo "Regression Shard: $SHARD_INDEX / $SHARD_TOTAL"
        echo "Mode: $REGRESSION_MODE"
        echo "=========================================="
        # Configure AWS CLI for ARMOR proxy
        export AWS_ACCESS_KEY_ID="$ARMOR_ACCESS_KEY_ID"
        export AWS_SECRET_ACCESS_KEY="$ARMOR_SECRET_ACCESS_KEY"
        export AWS_ENDPOINT_URL="http://armor.armor.svc.cluster.local:9000"
        # Download pdftract binary
        echo "=== Downloading pdftract binary ==="
        PDFTRACT_ARTIFACT="/tmp/pdftract-binary"
        if [ -f "$PDFTRACT_ARTIFACT" ]; then
          cp "$PDFTRACT_ARTIFACT" ./pdftract-x86_64-unknown-linux-musl
          chmod +x pdftract-x86_64-unknown-linux-musl
          echo "Binary downloaded from artifact"
        else
          echo "ERROR: pdftract binary not found in artifacts"
          exit 1
        fi
        # A failing version probe is only a warning; it must not be reported as a pass.
        ./pdftract-x86_64-unknown-linux-musl --version || echo "WARNING: pdftract --version failed"
        # Copy cer-diff to PATH
        cp /shared/cer-diff /usr/local/bin/cer-diff
        chmod +x /usr/local/bin/cer-diff
        cer-diff --help || true
        # Create output directory
        mkdir -p /regression/results
        # List corpus files for this shard
        echo "=== Fetching corpus document list ==="
        aws s3 ls --endpoint-url="$AWS_ENDPOINT_URL" "s3://pdftract-regression-corpus/v1/" | \
          awk '{print $NF}' | grep '\.pdf$' > /tmp/all_docs.txt
        TOTAL_DOCS=$(wc -l < /tmp/all_docs.txt)
        echo "Total documents in corpus: $TOTAL_DOCS"
        # Calculate shard boundaries (ceiling division, 1-based line numbers)
        DOCS_PER_SHARD=$(( (TOTAL_DOCS + SHARD_TOTAL - 1) / SHARD_TOTAL ))
        START_LINE=$((SHARD_INDEX * DOCS_PER_SHARD + 1))
        END_LINE=$((START_LINE + DOCS_PER_SHARD - 1))
        echo "Shard $SHARD_INDEX: processing lines $START_LINE to $END_LINE"
        # Extract shard documents
        sed -n "${START_LINE},${END_LINE}p" /tmp/all_docs.txt > /tmp/shard_docs.txt
        SHARD_DOC_COUNT=$(wc -l < /tmp/shard_docs.txt)
        echo "Documents in this shard: $SHARD_DOC_COUNT"
        # Process each document
        PASS_COUNT=0
        FAIL_COUNT=0
        ERROR_COUNT=0
        PROCESSED=0
        while IFS= read -r pdf_name; do
          [ -z "$pdf_name" ] && continue
          PROCESSED=$((PROCESSED + 1))
          # The file name without its .pdf suffix is used as the document key.
          SHA256="${pdf_name%.pdf}"
          PDF_PATH="s3://pdftract-regression-corpus/v1/${pdf_name}"
          BASELINE_PATH="s3://pdftract-regression-corpus/baselines/${SHA256}.json"
          echo "[$PROCESSED/$SHARD_DOC_COUNT] Processing: $pdf_name"
          # Download PDF
          aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$PDF_PATH" "/tmp/${pdf_name}" || {
            echo "ERROR: Failed to download PDF: $pdf_name"
            ERROR_COUNT=$((ERROR_COUNT + 1))
            continue
          }
          # Run pdftract extraction
          if ! ./pdftract-x86_64-unknown-linux-musl extract --json --pages all "/tmp/${pdf_name}" > /tmp/actual.json 2>/dev/null; then
            echo "ERROR: Extraction failed for: $pdf_name"
            ERROR_COUNT=$((ERROR_COUNT + 1))
            continue
          fi
          # Fetch or compute baseline
          if [ "$REGRESSION_MODE" = "update" ]; then
            # Update mode: save current output as new baseline
            # NOTE(review): the credentials come from a secret named
            # `b2-readonly`; confirm they are allowed to write baselines.
            aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" /tmp/actual.json "$BASELINE_PATH"
            RESULT="{\"sha\":\"$SHA256\",\"cer_delta\":0.0,\"pass\":true,\"mode\":\"update\"}"
          else
            # Gate mode: compare against baseline
            if ! aws s3 cp --endpoint-url="$AWS_ENDPOINT_URL" "$BASELINE_PATH" /tmp/baseline.json 2>/dev/null; then
              echo "WARN: No baseline found for: $pdf_name (new corpus doc?)"
              RESULT="{\"sha\":\"$SHA256\",\"cer_delta\":0.0,\"pass\":true,\"note\":\"no_baseline\"}"
            else
              # Compute CER. The status is captured with `||` on the same
              # command: under `set -e` a plain failing assignment would abort
              # the script before `$?` could be read, so failures would never
              # be counted or recorded.
              CER_STATUS=0
              CER_OUTPUT=$(cer-diff --sha "$SHA256" /tmp/actual.json /tmp/baseline.json --threshold "$THRESHOLD") || CER_STATUS=$?
              if [ "$CER_STATUS" -eq 0 ]; then
                PASS_COUNT=$((PASS_COUNT + 1))
              else
                FAIL_COUNT=$((FAIL_COUNT + 1))
              fi
              RESULT="$CER_OUTPUT"
            fi
          fi
          # Write result to JSONL
          echo "$RESULT" >> "/regression/results/shard-${SHARD_INDEX}.jsonl"
          # Cleanup
          rm -f "/tmp/${pdf_name}" /tmp/actual.json /tmp/baseline.json
        done < /tmp/shard_docs.txt
        echo "=========================================="
        echo "Shard $SHARD_INDEX complete"
        echo "Processed: $PROCESSED"
        echo "Passed: $PASS_COUNT"
        echo "Failed: $FAIL_COUNT"
        echo "Skipped (download/extraction errors): $ERROR_COUNT"
        echo "=========================================="
        # Merge shard results into main output
        if [ -f "/regression/results/shard-${SHARD_INDEX}.jsonl" ]; then
          cat "/regression/results/shard-${SHARD_INDEX}.jsonl" >> "/regression/regression-results.jsonl"
        fi
        # Fail shard if any document exceeded threshold
        if [ "$FAIL_COUNT" -gt 0 ] && [ "$REGRESSION_MODE" = "gate" ]; then
          echo "ERROR: $FAIL_COUNT documents exceeded CER threshold"
          exit 1
        fi
    env:
      - name: ARMOR_ACCESS_KEY_ID
        valueFrom:
          secretKeyRef:
            name: b2-readonly
            key: access-key-id
            optional: true
      - name: ARMOR_SECRET_ACCESS_KEY
        valueFrom:
          secretKeyRef:
            name: b2-readonly
            key: secret-access-key
            optional: true
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: shared-artifacts
        mountPath: /shared
      - name: regression-results
        mountPath: /regression
    resources:
      requests:
        cpu: 1000m
        memory: 2Gi
      limits:
        cpu: 2000m
        memory: 4Gi
  outputs:
    artifacts:
      - name: shard-results
        path: /regression/results/shard-{{inputs.parameters.shard-index}}.jsonl
        optional: true
# === Regression Corpus Exit Handler ===
# Prints the commit and the regression mode and, when the merged results file
# exists on the regression-results volume, its line count and first five lines.
# The merged file is also exposed as an optional output artifact.
- name: regression-corpus-exit
  script:
    image: debian:12
    command:
      - sh
    source: |
      #!/bin/sh
      set -e
      echo "=== Regression Corpus Exit Report ==="
      echo "Commit: {{workflow.parameters.commit-sha}}"
      echo "Regression mode: {{workflow.parameters.regression-mode}}"
      echo "Results artifacts available from all shards"
      if [ -f "/regression/regression-results.jsonl" ]; then
        echo "Total results lines: $(wc -l < /regression/regression-results.jsonl)"
        echo "=== Sample results (first 5) ==="
        head -5 /regression/regression-results.jsonl || true
      fi
    resources:
      requests:
        cpu: 200m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
    volumeMounts:
      - name: regression-results
        mountPath: /regression
  outputs:
    artifacts:
      - name: regression-results
        path: /regression/regression-results.jsonl
        # The file only exists when at least one shard merged its results.
        optional: true
# === Publish If Tag ===
# Placeholder for the release step that will upload the pre-built binaries to
# GitHub Releases on milestone tags. For now it prints a notice and exits
# successfully; the real implementation comes from a later Phase 0 bead.
#
# CRITICAL: All cargo commands MUST use --locked (or --locked --frozen)
# The build step already uses --locked, so artifacts are reproducible.
# This step only uploads pre-built binaries to GitHub Releases.
- name: publish-if-tag
  activeDeadlineSeconds: 600
  container:
    image: alpine:3.19
    command:
      - sh
      - -c
    args:
      - |
        # Placeholder: publish step
        echo "Publish step - to be implemented by Phase 0 sibling bead"
        exit 0
    # Token for the GitHub API, already wired for the future upload logic.
    env:
      - name: GH_TOKEN
        valueFrom:
          secretKeyRef:
            name: github-webhook-secret
            key: token
    resources:
      requests:
        cpu: 500m
        memory: 1Gi
      limits:
        cpu: 1000m
        memory: 2Gi
    volumeMounts:
      - name: workspace
        mountPath: /workspace
      - name: cargo-cache
        mountPath: /cache/cargo