pdftract/scripts/check-provenance.sh
jedarden 3af009440e fix(pdftract-5z5d8): fix provenance validation script
Fixed scripts/check-provenance.sh to properly validate PROVENANCE.md
against actual fixture files. The script was failing silently due to
subshell EXIT trap removing temp files before parent could read them,
and arithmetic expansion returning exit code 1 on zero value.

Changes:
- Replaced subshell pipes with process substitution
- Moved temp file cleanup to after reading
- Added validated variable initialization
- Added || true to prevent exit on zero arithmetic

All 200 classifier corpus fixtures have valid provenance entries
with matching SHA256 hashes. PROVENANCE.md already existed with
complete documentation.

Refs: pdftract-5z5d8
Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-17 23:43:37 -04:00

120 lines
3.5 KiB
Bash
Executable file

#!/usr/bin/env bash
# Validate PROVENANCE.md against actual fixture files.
# Ensures every fixture has a provenance entry with matching SHA256.
#
# Usage: check-provenance.sh [FIXTURES_DIR]
#   FIXTURES_DIR  Fixture root to check. Defaults to tests/fixtures, which is
#                 resolved against the current working directory.

# -e: stop on unhandled command failures; -u: reject unset variables.
set -eu

# Optional first argument overrides the fixture root; a trailing slash is
# dropped so paths printed by `find` keep a single separator after the root.
FIXTURES_DIR="${1:-tests/fixtures}"
FIXTURES_DIR="${FIXTURES_DIR%/}"
PROVENANCE_FILE="$FIXTURES_DIR/profiles/PROVENANCE.md"

# Colors for output (backslash escapes, expanded by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Scratch-file paths start out empty so the exit handler can run at any time.
ERROR_FILE=""
WARN_FILE=""

# Delete whichever scratch files have been created so far.
# Installed as an EXIT trap in the main shell so the files are also removed
# when the script stops early (for example on a `set -e` abort). Bash resets
# traps inside subshells, so process substitutions do not fire it.
cleanup_temp_files() {
  local scratch
  for scratch in "$ERROR_FILE" "$WARN_FILE"; do
    if [[ -n "$scratch" ]]; then
      rm -f -- "$scratch"
    fi
  done
}
trap cleanup_temp_files EXIT

echo "Checking fixture provenance..."

# Check if PROVENANCE.md exists
if [[ ! -f "$PROVENANCE_FILE" ]]; then
  echo -e "${RED}ERROR: $PROVENANCE_FILE not found${NC}"
  exit 1
fi

# Find all fixture files
FIXTURE_COUNT=$(find "$FIXTURES_DIR" -type f \( -name "*.pdf" -o -name "*.yml" -o -name "*.yaml" \) ! -name "PROVENANCE.md" | wc -l)
echo "Found $FIXTURE_COUNT fixture files"

# Track errors and warnings in temp files; messages are appended as they are
# found and read back once all checks have run.
ERROR_FILE=$(mktemp)
WARN_FILE=$(mktemp)

echo "Validating provenance entries..."
validated=0
# Parse PROVENANCE.md table and validate each entry

# Licenses that do not trigger a warning, as a |-separated list.
APPROVED_LICENSES="public-domain|CC0-1.0|CC-BY-3.0|CC-BY-4.0|CC-BY-SA-3.0|CC-BY-SA-4.0|US-government|Apache-2.0|MIT|MIT-0"

# Print $1 with leading and trailing whitespace removed; inner text, quotes
# and backslashes are left untouched (unlike piping the cell through xargs,
# which rewrites them and fails on an unmatched quote).
_prov_trim() {
  local text=$1
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "$text"
}

# Print the lowercase hex SHA-256 digest of the file at $1.
# The file is fed on stdin so the tool never sees (or escapes) its name.
# Falls back to `shasum -a 256` where `sha256sum` is not installed.
_prov_sha256() {
  if command -v sha256sum >/dev/null 2>&1; then
    sha256sum < "$1" | cut -d' ' -f1
  else
    shasum -a 256 < "$1" | cut -d' ' -f1
  fi
}

#######################################
# Validate one table row of PROVENANCE.md.
# Globals:   FIXTURES_DIR, APPROVED_LICENSES, GREEN, NC (read);
#            ERROR_FILE, WARN_FILE (appended to); validated (incremented)
# Arguments: $1 - raw table line, starting with "|"
# Returns:   always 0, so a rejected row never trips `set -e`
#######################################
_prov_check_row() {
  local line=$1
  local row path license sha256 full_path actual_sha256
  local separator_cell='^:?-+:?$'

  # Skip separator row
  if [[ "$line" == '|-'* ]]; then
    return 0
  fi

  # Remove leading/trailing | and parse fields
  row="${line#\|}"
  row="${row%\|}"

  # Split by | and trim whitespace. Cells 1, 3 and 5 are the path, license
  # and SHA256; the remaining cells are not inspected here.
  IFS='|' read -r path _ license _ sha256 _ <<< "$row"
  path=$(_prov_trim "$path")
  license=$(_prov_trim "$license")
  sha256=$(_prov_trim "$sha256")

  # Skip header row, empty paths, and separator rows written with padding or
  # alignment colons (e.g. "| --- |" or "|:--|").
  if [[ "$path" == "Path" || -z "$path" || "$path" =~ $separator_cell ]]; then
    return 0
  fi

  full_path="$FIXTURES_DIR/$path"

  # Check if file exists
  if [[ ! -f "$full_path" ]]; then
    printf '%s\n' "ERROR: Provenance entry references non-existent file: $path" >> "$ERROR_FILE"
    return 0
  fi

  # Compute actual SHA256. A failed read leaves the digest empty, which is
  # then reported as a mismatch below instead of aborting the run.
  actual_sha256=$(_prov_sha256 "$full_path") || true
  if [[ "$actual_sha256" != "$sha256" ]]; then
    {
      printf '%s\n' "ERROR: SHA256 mismatch for $path"
      printf '%s\n' " Expected: $sha256"
      printf '%s\n' " Actual: $actual_sha256"
    } >> "$ERROR_FILE"
  fi

  # Plain assignment: `((validated++))` would return status 1 at zero.
  validated=$((validated + 1))
  if (( validated % 50 == 0 )); then
    echo -e "${GREEN}✓${NC} Validated $validated entries..."
  fi

  # Validate license is from approved list. Wrapping both sides in "|" makes
  # this an exact, literal match against one list element.
  if [[ "|$APPROVED_LICENSES|" != *"|$license|"* ]]; then
    printf '%s\n' "WARN: Unapproved license '$license' for $path" >> "$WARN_FILE"
  fi

  return 0
}

# Only lines that start with "|" are table rows; the process substitution
# keeps the loop in the current shell so `validated` survives it.
while IFS= read -r line; do
  _prov_check_row "$line"
done < <(grep -E '^\|' -- "$PROVENANCE_FILE")
# Check for orphaned files (files without provenance entries)
echo "Checking for orphaned fixture files..."

# Print the trimmed first cell of every table row (line starting with "|")
# in the file $1, one per line. Header and separator cells are included;
# they are harmless because they never equal a fixture path.
# The file is opened by the shell so an unreadable path fails immediately.
_prov_listed_paths() {
  awk -F'|' '
    /^\|/ {
      cell = $2
      sub(/^[ \t]+/, "", cell)
      sub(/[ \t]+$/, "", cell)
      if (cell != "") print cell
    }
  ' < "$1"
}

#######################################
# List fixture files that have no row in the provenance table.
# Arguments: $1 - fixture root directory
#            $2 - path to PROVENANCE.md
# Outputs:   one fixture path per line, relative to $1, for every *.pdf,
#            *.yml or *.yaml file whose path is not the first cell of a row
# Returns:   1 if the provenance file cannot be read, otherwise 0
#######################################
_prov_find_orphans() {
  local fixtures_dir=$1 provenance_file=$2
  local listed fixture rel

  # Read the table once instead of re-scanning it for every fixture.
  listed=$(_prov_listed_paths "$provenance_file") || return 1

  while IFS= read -r fixture; do
    # Quoted so the directory name is stripped literally, not as a glob.
    rel="${fixture#"$fixtures_dir"/}"
    # -F: the path is a fixed string, not a regex; -x: it must equal a whole
    # first-column cell, not merely appear somewhere in the file.
    if ! grep -qxF -- "$rel" <<< "$listed"; then
      printf '%s\n' "$rel"
    fi
  done < <(find "$fixtures_dir" -type f \( -name "*.pdf" -o -name "*.yml" -o -name "*.yaml" \) ! -name "PROVENANCE.md")

  return 0
}

while IFS= read -r orphan; do
  echo "ERROR: Fixture file missing from PROVENANCE.md: $orphan" >> "$ERROR_FILE"
done < <(_prov_find_orphans "$FIXTURES_DIR" "$PROVENANCE_FILE")
# Count errors and warnings

# Print how many lines of file $2 start with the tag $1.
# Counting tagged lines (rather than all lines) keeps the indented detail
# lines of a multi-line report from being counted as separate findings.
# Prints 0 when nothing matches or the file cannot be read.
_prov_count_tagged() {
  local count
  # grep -c exits 1 on a zero count; that is not a failure here.
  count=$(grep -c -- "^$1" "$2" 2>/dev/null) || true
  printf '%s\n' "${count:-0}"
}

ERRORS=$(_prov_count_tagged 'ERROR:' "$ERROR_FILE")
WARNINGS=$(_prov_count_tagged 'WARN:' "$WARN_FILE")

# Display any errors
if (( ERRORS > 0 )); then
  cat "$ERROR_FILE"
fi

# Display any warnings
if (( WARNINGS > 0 )); then
  cat "$WARN_FILE"
fi

# Clean up temp files
rm -f "$ERROR_FILE" "$WARN_FILE"

# Summary: warnings alone do not fail the run; any error does.
echo ""
if (( ERRORS == 0 )); then
  echo -e "${GREEN}✓ All fixtures have valid provenance entries${NC}"
  if (( WARNINGS > 0 )); then
    echo -e "${YELLOW}⚠ $WARNINGS warning(s)${NC}"
  fi
  exit 0
else
  echo -e "${RED}✗ Found $ERRORS error(s) in provenance validation${NC}"
  exit 1
fi