pdftract/tools/generate_stress_pdf.py
jedarden c621947686 feat(bf-1g1fd): implement CI memory-ceiling gate with cgroup MemoryMax enforcement
Implements Tier-1 memory ceiling gate that enforces RSS budgets for PDF
extraction, analogous to cargo-bloat for binary size.

Changes:
- CI: Add memory-ceiling template with cgroup MemoryMax (1.5 GB)
- CI: Add cgroup MemoryMax enforcement to test-glibc (6 GB) and test-musl (4 GB)
- CI: Add cgroup MemoryMax + libfuzzer rss/malloc limits to fuzz workflow
- xtask: Implement memory-ceiling command with peak RSS sampling
- Add perf fixtures (100-page, 10k-page) for memory testing
- Add run-fuzz-with-limits.sh for local fuzz testing with memory caps
- Register perf fixtures in PROVENANCE.md

Memory budgets enforced:
- Buffered 100-page PDF: < 512 MB
- Streaming mode: < 256 MB (constant in page count)
- Adversarial fixtures: < 1 GB hard ceiling

Closes bf-1g1fd

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 13:22:55 -04:00

150 lines
5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Generate synthetic stress-test PDFs for memory ceiling testing.
Creates large-page-count PDFs to validate memory targets:
- 100-page vector PDF for buffered mode testing (target: < 512 MB)
- 10,000-page stress test for streaming mode validation (target: < 256 MB)
Usage:
python tools/generate_stress_pdf.py --pages 100 -o tests/fixtures/perf/100-page-vector.pdf
python tools/generate_stress_pdf.py --pages 10000 -o tests/fixtures/perf/10k-page.pdf
"""
import argparse
from pathlib import Path
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch
def _wrap_text(c, text: str, font_name: str, font_size: float, max_width: float) -> list:
    """
    Greedily split *text* into lines narrower than *max_width*.

    Args:
        c: Object exposing ``stringWidth(text, font_name, font_size)``
            (the reportlab canvas the lines will later be drawn on).
        text: Text to wrap; it is split on whitespace.
        font_name: Font used to measure each candidate line.
        font_size: Font size, in points, used to measure each candidate line.
        max_width: Width limit in points; a line is extended only while its
            measured width stays strictly below this value.

    Returns:
        The wrapped lines, in order. A single word wider than *max_width*
        is kept on a line of its own rather than being split.
    """
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if c.stringWidth(candidate, font_name, font_size) < max_width:
            current = candidate
        else:
            # Skip the flush when nothing has accumulated yet (an over-wide
            # first word), so no empty line is emitted.
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


def generate_stress_pdf(output_path: Path, num_pages: int) -> None:
    """
    Generate a multi-page PDF with synthetic content.

    Each page contains:
    - Header with page number
    - Multiple paragraphs of word-wrapped text
    - A table with structured data
    - A word-wrapped filler paragraph
    - Footer with page count

    Args:
        output_path: Path where the PDF will be written. Missing parent
            directories are created.
        num_pages: Number of pages to generate.
    """
    left_margin = 1 * inch
    text_width = 6.5 * inch
    line_step = 0.15 * inch
    section_gap = 0.2 * inch
    column_xs = (1 * inch, 2.5 * inch, 4 * inch, 5.5 * inch)
    column_labels = ("A", "B", "C", "D")

    paragraphs = (
        "This is a synthetic stress test PDF designed for memory ceiling validation. "
        "Each page contains structured text and table data to simulate real-world document "
        "extraction workloads.",
        "The memory targets are: Peak RSS for 100-page vector PDF (buffered mode) < 512 MB, "
        "Peak RSS for streaming/NDJSON mode < 256 MB (must stay constant as page count grows), "
        "Peak RSS for adversarial fixtures < 1 GB hard ceiling.",
        "This paragraph contains additional text to increase page content size. "
        "Memory ceiling tests use these documents to verify that pdftract maintains "
        "reasonable memory usage regardless of document size or complexity.",
    )
    filler_text = (
        "Additional content ensures each page has consistent data density. "
        "Memory profiling during extraction exercises RSS measurement code paths "
        "and validates that streaming mode maintains constant memory footprint."
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    c = canvas.Canvas(str(output_path), pagesize=letter)

    # The body and filler text are identical on every page, so wrap them once
    # up front instead of re-measuring every word on each page.
    body_lines = []
    for para in paragraphs:
        body_lines.extend(_wrap_text(c, para, "Helvetica", 10, text_width))
    # The filler sentence is far wider than the page at 9 pt; wrap it like the
    # body paragraphs so it stays inside the right margin.
    filler_lines = _wrap_text(c, filler_text, "Helvetica", 9, text_width)

    for page_num in range(1, num_pages + 1):
        # Header
        c.setFont("Helvetica-Bold", 12)
        c.drawString(
            left_margin,
            10.5 * inch,
            f"Stress Test Document - Page {page_num} of {num_pages}",
        )

        # Content - paragraphs of text
        c.setFont("Helvetica", 10)
        y = 10 * inch
        for text_line in body_lines:
            c.drawString(left_margin, y, text_line)
            y -= line_step

        # Table section
        y -= section_gap
        c.setFont("Helvetica-Bold", 10)
        c.drawString(left_margin, y, "Sample Data Table")
        y -= section_gap

        # Table header
        c.setFont("Helvetica-Bold", 9)
        for x, label in zip(column_xs, column_labels):
            c.drawString(x, y, f"Column {label}")
        y -= line_step
        c.line(1 * inch, y, 7 * inch, y)
        y -= line_step

        # Table rows
        c.setFont("Helvetica", 9)
        for row in range(5):
            for x, label in zip(column_xs, column_labels):
                c.drawString(x, y, f"Data {page_num}-{row}-{label}")
            y -= line_step

        # More text to fill page
        y -= section_gap
        c.setFont("Helvetica", 9)
        for text_line in filler_lines:
            c.drawString(left_margin, y, text_line)
            y -= line_step

        # Footer
        c.setFont("Helvetica", 8)
        c.drawString(
            left_margin,
            0.5 * inch,
            f"Page {page_num} of {num_pages} | Memory Ceiling Test Fixture",
        )
        c.drawRightString(7.5 * inch, 0.5 * inch, "Generated by generate_stress_pdf.py")

        c.showPage()

    c.save()
    print(f"Generated: {output_path} ({num_pages} pages, {output_path.stat().st_size / 1024 / 1024:.1f} MB)")
def main():
    """
    Command-line entry point.

    Parses ``--pages`` and ``-o/--output`` (both required), rejects
    non-positive page counts with an argparse usage error, and then
    generates the PDF at the requested path.
    """
    cli = argparse.ArgumentParser(
        description="Generate stress-test PDFs for memory ceiling testing"
    )

    # (flags, keyword options) for each required command-line argument.
    argument_specs = (
        (
            ("--pages",),
            {
                "type": int,
                "required": True,
                "help": "Number of pages to generate (e.g., 100, 10000)",
            },
        ),
        (
            ("-o", "--output"),
            {
                "type": Path,
                "required": True,
                "help": "Output PDF path",
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    parsed = cli.parse_args()

    # parser.error() prints the usage message and exits with status 2.
    if parsed.pages < 1:
        cli.error("--pages must be positive")

    generate_stress_pdf(parsed.output, parsed.pages)


if __name__ == "__main__":
    main()