- Add extraction of pdftract_geomean from the tool_geomeans array for the regression gate.
- Fix the vector geomean calculation so that bash array values are passed to Python correctly.

The benchmark infrastructure was complete but had two bugs:
1. $pdftract_geomean was used but never set (line 308).
2. The vector geomean calculation had broken Python code for array expansion.

These fixes ensure the regression and 10x-faster gates will work correctly once the pdftract binary with extract/grep subcommands is available.

Refs pdftract-60h
457 lines
15 KiB
Bash
Executable file
457 lines
15 KiB
Bash
Executable file
#!/bin/bash
# Competitive benchmark runner for pdftract
# Usage: run-benchmarks.sh [--baseline <path>] [--output <path>]
#
# The defaults below can be overridden through the environment:
#   OUTPUT                - JSON file the per-run results are written to
#   BASELINE              - JSON file holding the reference numbers
#   REGRESSION_THRESHOLD  - max allowed relative slowdown vs. the baseline
#   TENX_THRESHOLD        - max allowed pdftract/pdfminer time ratio
set -euo pipefail

# Directory containing this script; the corpus and the run-<tool>.sh wrappers
# are resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CORPUS_DIR="$SCRIPT_DIR/corpus"
WRAPPERS_DIR="$SCRIPT_DIR"

OUTPUT="${OUTPUT:-benchmark-results.json}"
BASELINE="${BASELINE:-$SCRIPT_DIR/../baselines/main.json}"
REGRESSION_THRESHOLD="${REGRESSION_THRESHOLD:-0.10}"
TENX_THRESHOLD="${TENX_THRESHOLD:-0.10}"

# Colors for output (kept as literal escapes; expanded by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Tools to benchmark
TOOLS=("pdftract" "pdfminer" "pypdf" "pdfplumber")
# Print an informational message to stdout with a green "[INFO]" prefix.
# Globals:   GREEN, NC (read)
# Arguments: message words; they are joined with single spaces
log_info() {
  echo -e "${GREEN}[INFO]${NC} $*"
}
# Print a warning with a yellow "[WARN]" prefix.
# The message goes to stderr so that functions whose stdout is captured
# (e.g. the JSON emitted by run_benchmark) are not polluted by diagnostics.
# Globals:   YELLOW, NC (read)
# Arguments: message words; they are joined with single spaces
log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $*" >&2
}
# Print an error with a red "[ERROR]" prefix.
# The message goes to stderr so that functions whose stdout is captured
# (e.g. the JSON emitted by run_benchmark) are not polluted by diagnostics.
# Globals:   RED, NC (read)
# Arguments: message words; they are joined with single spaces
log_error() {
  echo -e "${RED}[ERROR]${NC} $*" >&2
}
# Check if hyperfine is installed.
# Logs an error and exits the script with status 1 when the `hyperfine`
# command cannot be found on PATH; returns normally otherwise.
check_hyperfine() {
  if ! command -v hyperfine > /dev/null 2>&1; then
    log_error "hyperfine is not installed. Install it with: apt-get install hyperfine"
    exit 1
  fi
}
# Get all PDF files in corpus.
# Globals: CORPUS_DIR (read)
# Outputs: one path per line, for every regular file named *.pdf found
#          (recursively) below $CORPUS_DIR, in sorted order
get_corpus_files() {
  find "$CORPUS_DIR" -name "*.pdf" -type f | sort
}
# Run hyperfine for a single tool/document pair.
# Globals:   WRAPPERS_DIR (read)
# Arguments: $1 - tool name; its wrapper is $WRAPPERS_DIR/run-<tool>.sh
#            $2 - path of the PDF document to process
# Outputs:   exactly one JSON object on stdout. Diagnostics are forced to
#            stderr so that callers capturing stdout always get parseable JSON.
# Returns:   1 when the wrapper script is missing, 0 otherwise
run_benchmark() {
  local tool="$1"
  local doc="$2"
  local doc_name
  doc_name="$(basename "$doc")"
  local crash_json="{\"tool\": \"$tool\", \"doc\": \"$doc_name\", \"crash\": true}"

  local wrapper="$WRAPPERS_DIR/run-${tool}.sh"
  if [ ! -f "$wrapper" ]; then
    log_error "Wrapper not found: $wrapper" >&2
    echo "$crash_json"
    return 1
  fi

  # Unique temp file: a predictable /tmp name can collide between parallel runs.
  local result_file
  if ! result_file="$(mktemp "${TMPDIR:-/tmp}/hyperfine-${tool}-XXXXXX")"; then
    log_warn "hyperfine failed for $tool on $doc_name" >&2
    echo "$crash_json"
    return 0
  fi

  # hyperfine hands the command string to a shell, so quote both words.
  local bench_cmd
  printf -v bench_cmd '%q %q' "$wrapper" "$doc"

  # Run hyperfine with warmup and 5 runs
  if hyperfine --warmup 2 --runs 5 --export-json "$result_file" \
    -- "$bench_cmd" &> /dev/null; then

    # Extract mean/stddev/min/max (seconds -> milliseconds) in one jq pass;
    # any field that is not a number is reported as "null".
    local mean_ms="null" stddev_ms="null" min_ms="null" max_ms="null"
    local stats
    if stats="$(jq -r '
        .results[0]
        | [.mean, .stddev, .min, .max]
        | map(if type == "number" then . * 1000 else null end | tostring)
        | join(" ")' "$result_file" 2>/dev/null)" && [ -n "$stats" ]; then
      read -r mean_ms stddev_ms min_ms max_ms <<< "$stats"
    fi

    if [ "$mean_ms" != "null" ]; then
      echo "{\"tool\": \"$tool\", \"doc\": \"$doc_name\", \"mean_ms\": $mean_ms, \"stddev_ms\": $stddev_ms, \"min_ms\": $min_ms, \"max_ms\": $max_ms, \"crash\": false}"
    else
      echo "$crash_json"
    fi
  else
    log_warn "hyperfine failed for $tool on $doc_name" >&2
    echo "$crash_json"
  fi

  rm -f "$result_file"
}
# Compute geometric mean.
# Arguments: any number of numeric strings. Values that are not strictly
#            positive numbers ("null", "0", "0.0", negatives, empty) are
#            skipped, because the logarithm is undefined for them.
# Outputs:   the geometric mean with 6 decimals, or "null" when no usable
#            value was supplied
compute_geomean() {
  # Summing logarithms avoids building a huge product, and awk parses
  # exponent notation (e.g. 1.5e-05), which bc does not accept.
  printf '%s\n' "$@" \
    | awk '
        { value = $1 + 0; if (value > 0) { log_sum += log(value); n++ } }
        END {
          if (n == 0) print "null"
          else printf "%.6f\n", exp(log_sum / n)
        }'
}
# Run special pdftract-grep-1000 benchmark.
# Times `pdftract grep "the"` on $CORPUS_DIR/wikipedia-1000.pdf with hyperfine
# and stores the mean (in ms, or "null" on failure) in
# /tmp/grep-1000-result.txt, which later steps of this script read.
# Globals: CORPUS_DIR (read)
# Returns: 0 in all cases; a missing document only skips the benchmark
run_grep_1000_benchmark() {
  log_info "Running pdftract-grep-1000 special benchmark..."

  local grep_result_file="/tmp/grep-1000-result.txt"
  # Drop a result left behind by an earlier run so that a skipped benchmark
  # is never compared against the baseline using stale numbers.
  rm -f "$grep_result_file"

  local grep_doc="$CORPUS_DIR/wikipedia-1000.pdf"
  if [ ! -f "$grep_doc" ]; then
    log_warn "wikipedia-1000.pdf not found, skipping grep-1000 benchmark"
    return 0
  fi

  local result_file
  if ! result_file="$(mktemp "${TMPDIR:-/tmp}/hyperfine-grep-1000-XXXXXX")"; then
    log_warn "hyperfine failed for grep-1000 benchmark"
    echo "null" > "$grep_result_file"
    return 0
  fi

  # hyperfine hands the command string to a shell, so quote the path.
  local bench_cmd
  printf -v bench_cmd 'pdftract grep "the" %q' "$grep_doc"

  # Run hyperfine with warmup and 5 runs
  if hyperfine --warmup 2 --runs 5 --export-json "$result_file" \
    -- "$bench_cmd" &> /dev/null; then

    # Extract mean from hyperfine output (seconds -> milliseconds)
    local mean_ms
    mean_ms="$(jq -r '.results[0].mean * 1000' "$result_file" 2>/dev/null)" || mean_ms="null"

    if [ -n "$mean_ms" ] && [ "$mean_ms" != "null" ]; then
      log_info "pdftract-grep-1000: ${mean_ms}ms"
      echo "$mean_ms" > "$grep_result_file"
    else
      log_warn "Failed to parse grep-1000 result"
      echo "null" > "$grep_result_file"
    fi
  else
    log_warn "hyperfine failed for grep-1000 benchmark"
    echo "null" > "$grep_result_file"
  fi

  rm -f "$result_file"
}
# Run all benchmarks.
# Benchmarks every tool in TOOLS against every corpus file, writes the
# collected per-run JSON objects as one JSON array to $OUTPUT, then runs the
# grep-1000 special benchmark.
# Globals: TOOLS, OUTPUT (read)
run_all_benchmarks() {
  log_info "Starting competitive benchmarks..."

  # Read one path per line so file names containing spaces stay intact.
  local corpus_files=()
  mapfile -t corpus_files < <(get_corpus_files)
  local total_files=${#corpus_files[@]}
  local total_runs=$((total_files * ${#TOOLS[@]}))
  local current_run=0

  # Initialize results array
  local results=()
  local tool doc doc_name result

  if ((total_files > 0)); then
    for tool in "${TOOLS[@]}"; do
      log_info "Benchmarking $tool..."

      for doc in "${corpus_files[@]}"; do
        # Not ((current_run++)): that returns status 1 when the old value is
        # 0 and would abort the script under `set -e`.
        current_run=$((current_run + 1))
        doc_name="$(basename "$doc")"
        log_info "[$current_run/$total_runs] Running $tool on $doc_name..."

        # run_benchmark reports failures inside the JSON it prints, so its
        # exit status is deliberately ignored here.
        result="$(run_benchmark "$tool" "$doc")" || true
        results+=("$result")
      done
    done
  fi

  # Write results to JSON file
  log_info "Writing results to $OUTPUT..."
  local i
  {
    echo "["
    for ((i = 0; i < ${#results[@]}; i++)); do
      if ((i > 0)); then
        echo ","
      fi
      echo -n "  ${results[i]}"
    done
    echo ""
    echo "]"
  } > "$OUTPUT"

  # Run grep-1000 special benchmark
  run_grep_1000_benchmark

  log_info "Benchmarking complete!"
}
# Print the positive mean_ms samples (one per line) of a tool's successful
# runs recorded in $OUTPUT.
# Arguments: $1 - tool name; $2 - optional prefix the doc name must start with
_results_samples() {
  jq -r --arg tool "$1" --arg prefix "${2:-}" '
    .[]
    | select(.tool == $tool and .crash == false)
    | select((.doc // "") | startswith($prefix))
    | .mean_ms
    | select(type == "number" and . > 0)
  ' "$OUTPUT" 2>/dev/null
}

# Print the geometric mean (6 decimals) of the positive arguments, or "null"
# when there is none.
_samples_geomean() {
  printf '%s\n' "$@" \
    | awk '
        { value = $1 + 0; if (value > 0) { log_sum += log(value); n++ } }
        END {
          if (n == 0) print "null"
          else printf "%.6f\n", exp(log_sum / n)
        }'
}

# Succeed when $1 > $2, compared as floating-point numbers.
_float_gt() {
  awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 > b + 0) }'
}

# Print the relative change ($1 - $2) / $2; $2 must be non-zero.
_relative_change() {
  awk -v cur="$1" -v base="$2" 'BEGIN { printf "%.6f\n", (cur - base) / base }'
}

# Analyze results and check gates.
# Prints a per-tool summary table to stdout, then evaluates:
#   1. the 10x-faster gate: pdftract vs pdfminer geomean on vector PDFs
#      (documents named misc-*.pdf), ratio must be <= TENX_THRESHOLD
#   2. if $BASELINE exists: the geomean regression gate and the grep-1000
#      regression gate, both limited by REGRESSION_THRESHOLD
# Globals: TOOLS, OUTPUT, BASELINE, TENX_THRESHOLD, REGRESSION_THRESHOLD (read)
# Returns: 1 as soon as a gate fails, 0 otherwise (gates that cannot be
#          evaluated only produce a warning)
analyze_results() {
  log_info "Analyzing results..."

  # Compute per-tool geomeans
  local -A tool_geomeans=()
  local -A tool_success_counts=()
  local -A tool_total_counts=()
  local tool total
  local samples=()

  for tool in "${TOOLS[@]}"; do
    samples=()
    mapfile -t samples < <(_results_samples "$tool")

    tool_geomeans[$tool]="null"
    if ((${#samples[@]} > 0)); then
      tool_geomeans[$tool]="$(_samples_geomean "${samples[@]}")"
    fi
    tool_success_counts[$tool]=${#samples[@]}

    # Denominator of the success rate: every run recorded for this tool.
    total="$(jq -r --arg tool "$tool" \
      '[.[] | select(.tool == $tool)] | length' "$OUTPUT" 2>/dev/null)" || total=0
    tool_total_counts[$tool]="${total:-0}"
  done

  # Print summary table
  log_info "=== Benchmark Results Summary ==="
  printf "%-15s %10s %10s\n" "Tool" "GeoMean(ms)" "Success Rate"
  printf "%-15s %10s %10s\n" "---" "----------" "------------"

  local geomean
  for tool in "${TOOLS[@]}"; do
    geomean="${tool_geomeans[$tool]}"
    if [ "$geomean" != "null" ]; then
      printf "%-15s %10.2f %10d/%d\n" "$tool" "$geomean" \
        "${tool_success_counts[$tool]}" "${tool_total_counts[$tool]}"
    else
      printf "%-15s %10s %10d/%d\n" "$tool" "N/A" \
        "${tool_success_counts[$tool]}" "${tool_total_counts[$tool]}"
    fi
  done

  # Extract pdftract geomean for regression gate
  local pdftract_geomean="${tool_geomeans[pdftract]:-null}"

  # Check 10x-faster gate (pdftract vs pdfminer on vector PDFs only)
  # The gate applies only to vector PDFs where pdftract should excel
  log_info "Computing 10x-faster gate on vector PDFs only..."

  # In the actual corpus, vector PDFs are named misc-*.pdf
  local pdftract_vector_values=()
  local pdfminer_vector_values=()
  mapfile -t pdftract_vector_values < <(_results_samples "pdftract" "misc-")
  mapfile -t pdfminer_vector_values < <(_results_samples "pdfminer" "misc-")

  # Compute vector-only geomeans
  local pdftract_vector_geomean="null"
  local pdfminer_vector_geomean="null"
  if ((${#pdftract_vector_values[@]} > 0)); then
    pdftract_vector_geomean="$(_samples_geomean "${pdftract_vector_values[@]}")"
  fi
  if ((${#pdfminer_vector_values[@]} > 0)); then
    pdfminer_vector_geomean="$(_samples_geomean "${pdfminer_vector_values[@]}")"
  fi

  if [ "$pdftract_vector_geomean" != "null" ] && [ "$pdfminer_vector_geomean" != "null" ]; then
    local ratio
    ratio="$(awk -v a="$pdftract_vector_geomean" -v b="$pdfminer_vector_geomean" \
      'BEGIN { printf "%.6f\n", a / b }')"
    log_info "10x-faster gate (vector PDFs): pdftract/pdfminer = $ratio (threshold: <= $TENX_THRESHOLD)"
    log_info "  pdftract vector geomean: ${pdftract_vector_geomean}ms"
    log_info "  pdfminer vector geomean: ${pdfminer_vector_geomean}ms"

    # 10x faster means ratio should be <= 0.1 (pdftract takes 10ms, pdfminer takes 100ms)
    if _float_gt "$ratio" "$TENX_THRESHOLD"; then
      log_error "FAIL: pdftract is not >= 10x faster than pdfminer on vector PDFs (ratio: $ratio, threshold: <= $TENX_THRESHOLD)"
      return 1
    else
      log_info "PASS: pdftract is >= 10x faster than pdfminer on vector PDFs (ratio: $ratio)"
    fi
  else
    log_warn "Cannot check 10x-faster gate: missing vector PDF data (pdftract: ${#pdftract_vector_values[@]} results, pdfminer: ${#pdfminer_vector_values[@]} results)"
  fi

  # Check regression gate if baseline is provided
  if [ -f "$BASELINE" ]; then
    log_info "Checking regression against baseline..."

    local baseline_geomean
    baseline_geomean="$(jq -r '.pdftract_geomean // empty' "$BASELINE" 2>/dev/null)" \
      || baseline_geomean=""

    # The baseline must be a positive number: it is the divisor below.
    if [ "$pdftract_geomean" != "null" ] && [ -n "$baseline_geomean" ] \
      && _float_gt "$baseline_geomean" 0; then
      local regression
      regression="$(_relative_change "$pdftract_geomean" "$baseline_geomean")"
      log_info "Regression: $(awk -v r="$regression" 'BEGIN { printf "%.2f%%", r * 100 }')"

      if _float_gt "$regression" "$REGRESSION_THRESHOLD"; then
        log_error "FAIL: Regression > ${REGRESSION_THRESHOLD} detected!"
        return 1
      else
        log_info "PASS: No significant regression"
      fi
    else
      log_warn "Cannot check regression: missing baseline data"
    fi

    # Check grep-1000 regression gate
    if [ -f "/tmp/grep-1000-result.txt" ]; then
      local grep_result
      grep_result="$(< /tmp/grep-1000-result.txt)"
      local baseline_grep_1000
      baseline_grep_1000="$(jq -r '.grep_1000_mean_ms // empty' "$BASELINE" 2>/dev/null)" \
        || baseline_grep_1000=""

      if [ -n "$grep_result" ] && [ "$grep_result" != "null" ] \
        && [ -n "$baseline_grep_1000" ] && _float_gt "$baseline_grep_1000" 0; then
        local grep_regression
        grep_regression="$(_relative_change "$grep_result" "$baseline_grep_1000")"
        log_info "grep-1000 regression: $(awk -v r="$grep_regression" 'BEGIN { printf "%.2f%%", r * 100 }') (current: ${grep_result}ms, baseline: ${baseline_grep_1000}ms)"

        if _float_gt "$grep_regression" "$REGRESSION_THRESHOLD"; then
          log_error "FAIL: grep-1000 regression > ${REGRESSION_THRESHOLD} detected!"
          return 1
        else
          log_info "PASS: No significant grep-1000 regression"
        fi
      else
        log_warn "Cannot check grep-1000 regression: missing baseline data (current: ${grep_result}, baseline: ${baseline_grep_1000})"
      fi
    else
      log_warn "grep-1000 result file not found, skipping regression check"
    fi
  fi

  return 0
}
# Generate PR comment markdown.
# Writes benchmark-comment.md in the current directory (a per-tool table with
# geomean, approximate 95% CI and success rate, the optional grep-1000
# section, and fixed notes) and prints the file to stdout.
# Globals: TOOLS, OUTPUT, BASELINE (read)
generate_pr_comment() {
  local comment_file="benchmark-comment.md"

  log_info "Generating PR comment..."

  cat > "$comment_file" << 'EOF'
## Competitive Benchmark Results

### Performance Summary (Geometric Mean)

| Tool | GeoMean (ms) | 95% CI | Success Rate |
|------|-------------|--------|--------------|
EOF

  # Add rows for each tool with actual data
  local tool counts count total stats geomean ci
  for tool in "${TOOLS[@]}"; do
    # Successful and total run counts for this tool, in one jq pass
    counts="$(jq -r --arg tool "$tool" '
      [.[] | select(.tool == $tool)]
      | "\(map(select(.crash == false)) | length) \(length)"
    ' "$OUTPUT" 2>/dev/null)" || counts=""
    read -r count total <<< "${counts:-0 0}"

    # Geomean and CI are computed from "mean stddev" pairs fed to awk as data,
    # rather than by pasting shell values into generated program text.
    # CI approximation: mean coefficient of variation * 1.96, in percent.
    stats="$(jq -r --arg tool "$tool" '
      .[]
      | select(.tool == $tool and .crash == false)
      | "\(.mean_ms) \(.stddev_ms)"
    ' "$OUTPUT" 2>/dev/null \
      | awk '
          $1 + 0 > 0 { log_sum += log($1); cv_sum += $2 / $1; n++ }
          END {
            if (n > 0)
              printf "%.6f ±%.1f%%\n", exp(log_sum / n), cv_sum / n * 1.96 * 100
          }')" || stats=""

    if [ "$count" -gt 0 ] && [ -n "$stats" ]; then
      read -r geomean ci <<< "$stats"
      printf "| %-15s | %10.2f | %6s | %4d/%d |\n" "$tool" "$geomean" "$ci" "$count" "$total" >> "$comment_file"
    else
      printf "| %-15s | %10s | %6s | %4d/%d |\n" "$tool" "N/A" "N/A" "$count" "$total" >> "$comment_file"
    fi
  done

  # Add grep-1000 benchmark result if available
  if [ -f "/tmp/grep-1000-result.txt" ]; then
    local grep_result
    grep_result="$(< /tmp/grep-1000-result.txt)"
    if [ "$grep_result" != "null" ]; then
      # Only claim a baseline comparison when the baseline really has one.
      local grep_status="No baseline available"
      if [ -f "${BASELINE:-}" ] \
        && jq -e '.grep_1000_mean_ms' "$BASELINE" > /dev/null 2>&1; then
        grep_status="Baseline comparison available"
      fi

      cat >> "$comment_file" << EOF

### Special Benchmark: pdftract-grep-1000

- **Mean time:** ${grep_result}ms
- **Test:** \`pdftract grep "the" wikipedia-1000.pdf\`
- **Status:** ${grep_status}
EOF
    fi
  fi

  cat >> "$comment_file" << 'EOF'

### Notes

- Run with `hyperfine --warmup 2 --runs 5`
- Corpus: 50 PDFs (25 vector + 25 raster)
- Crashes are excluded from geomean calculation
- 95% CI shown as percentage of geomean
- Full results available in artifacts
EOF

  log_info "PR comment written to $comment_file"
  cat "$comment_file"
}
# Entry point: parse the command line, run every benchmark, evaluate the
# gates and emit the PR comment.
# Arguments: [--baseline <path>] [--output <path>] (also --name=<path>);
#            they override the BASELINE / OUTPUT globals. Unrecognized
#            arguments are ignored with a warning.
# Exits:     1 when a benchmark gate fails, 2 when an option lacks its value
main() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --baseline | --output)
        if [ $# -lt 2 ]; then
          log_error "Option $1 requires a path argument"
          exit 2
        fi
        if [ "$1" = "--baseline" ]; then
          BASELINE="$2"
        else
          OUTPUT="$2"
        fi
        shift 2
        ;;
      --baseline=*)
        BASELINE="${1#*=}"
        shift
        ;;
      --output=*)
        OUTPUT="${1#*=}"
        shift
        ;;
      *)
        log_warn "Ignoring unrecognized argument: $1"
        shift
        ;;
    esac
  done

  check_hyperfine
  run_all_benchmarks

  if ! analyze_results; then
    log_error "Benchmark gates failed!"
    exit 1
  fi

  generate_pr_comment

  log_info "All benchmarks passed!"
}
# Script entry point: forward all command-line arguments to main.
main "$@"