Implements a Tier-1 memory ceiling gate that enforces RSS budgets for PDF extraction, analogous to cargo-bloat for binary size.

Changes:
- CI: Add memory-ceiling template with cgroup MemoryMax (1.5 GB)
- CI: Add cgroup MemoryMax enforcement to test-glibc (6 GB) and test-musl (4 GB)
- CI: Add cgroup MemoryMax + libfuzzer rss/malloc limits to fuzz workflow
- xtask: Implement memory-ceiling command with peak RSS sampling
- Add perf fixtures (100-page, 10k-page) for memory testing
- Add run-fuzz-with-limits.sh for local fuzz testing with memory caps
- Register perf fixtures in PROVENANCE.md

Memory budgets enforced:
- Buffered 100-page PDF: < 512 MB
- Streaming mode: < 256 MB (constant in page count)
- Adversarial fixtures: < 1 GB hard ceiling

Closes bf-1g1fd

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
150 lines
5 KiB
Python
Executable file
150 lines
5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Generate synthetic stress-test PDFs for memory ceiling testing.
|
|
|
|
Creates large-page-count PDFs to validate memory targets:
|
|
- 100-page vector PDF for buffered mode testing (target: < 512 MB)
|
|
- 10,000-page stress test for streaming mode validation (target: < 256 MB)
|
|
|
|
Usage:
|
|
python tools/generate_stress_pdf.py --pages 100 -o tests/fixtures/perf/100-page-vector.pdf
|
|
python tools/generate_stress_pdf.py --pages 10000 -o tests/fixtures/perf/10k-page.pdf
|
|
"""
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
from reportlab.pdfgen import canvas
|
|
from reportlab.lib.pagesizes import letter
|
|
from reportlab.lib.units import inch
|
|
|
|
|
|
def _wrap_text(c, text: str, font_name: str, font_size: float, max_width: float) -> list:
    """Greedily split *text* into lines narrower than *max_width*.

    Words are appended to the current line while the rendered width, as
    reported by ``c.stringWidth`` for the given font, stays strictly below
    *max_width*; otherwise the current line is closed and the word starts a
    new one. Empty lines are never emitted, so a single word wider than
    *max_width* ends up alone on its own (over-long) line.

    Args:
        c: Canvas used only for its ``stringWidth`` measurement.
        text: Text to wrap; split on whitespace.
        font_name: Font name passed to ``stringWidth``.
        font_size: Font size passed to ``stringWidth``.
        max_width: Exclusive upper bound on line width, in points.

    Returns:
        The wrapped lines, in order.
    """
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if c.stringWidth(candidate, font_name, font_size) < max_width:
            current = candidate
        else:
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


def generate_stress_pdf(output_path: Path, num_pages: int) -> None:
    """
    Generate a multi-page PDF with synthetic content.

    Each page contains:
    - Header with page number
    - Multiple paragraphs of text
    - A table with structured data
    - A wrapped block of filler text
    - Footer with page count

    The parent directory of ``output_path`` is created if missing, and a
    one-line summary (path, page count, file size in MB) is printed once the
    file has been saved.

    Args:
        output_path: Path where the PDF will be written
        num_pages: Number of pages to generate
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    c = canvas.Canvas(str(output_path), pagesize=letter)

    left_margin = 1 * inch
    line_step = 0.15 * inch
    text_width = 6.5 * inch

    paragraphs = (
        "This is a synthetic stress test PDF designed for memory ceiling validation. "
        "Each page contains structured text and table data to simulate real-world document "
        "extraction workloads.",

        "The memory targets are: Peak RSS for 100-page vector PDF (buffered mode) < 512 MB, "
        "Peak RSS for streaming/NDJSON mode < 256 MB (must stay constant as page count grows), "
        "Peak RSS for adversarial fixtures < 1 GB hard ceiling.",

        "This paragraph contains additional text to increase page content size. "
        "Memory ceiling tests use these documents to verify that pdftract maintains "
        "reasonable memory usage regardless of document size or complexity.",
    )
    filler_text = (
        "Additional content ensures each page has consistent data density. "
        "Memory profiling during extraction exercises RSS measurement code paths "
        "and validates that streaming mode maintains constant memory footprint."
    )

    # The body and filler text do not depend on the page number, so wrap them
    # once up front instead of re-measuring every word on every page.
    body_lines = [
        line
        for para in paragraphs
        for line in _wrap_text(c, para, "Helvetica", 10, text_width)
    ]
    # The filler is wrapped as well: drawn as a single string it would run
    # past the right edge of the page.
    filler_lines = _wrap_text(c, filler_text, "Helvetica", 9, text_width)

    # (x position, column label) for the four table columns.
    columns = (
        (1 * inch, "A"),
        (2.5 * inch, "B"),
        (4 * inch, "C"),
        (5.5 * inch, "D"),
    )

    for page_num in range(1, num_pages + 1):
        # Header
        c.setFont("Helvetica-Bold", 12)
        c.drawString(left_margin, 10.5 * inch, f"Stress Test Document - Page {page_num} of {num_pages}")

        # Content - paragraphs of text
        c.setFont("Helvetica", 10)
        y = 10 * inch
        for line in body_lines:
            c.drawString(left_margin, y, line)
            y -= line_step

        # Table section
        y -= 0.2 * inch
        c.setFont("Helvetica-Bold", 10)
        c.drawString(left_margin, y, "Sample Data Table")
        y -= 0.2 * inch

        # Table header
        c.setFont("Helvetica-Bold", 9)
        for x, label in columns:
            c.drawString(x, y, f"Column {label}")
        y -= line_step
        c.line(1 * inch, y, 7 * inch, y)
        y -= line_step

        # Table rows
        c.setFont("Helvetica", 9)
        for row in range(5):
            for x, label in columns:
                c.drawString(x, y, f"Data {page_num}-{row}-{label}")
            y -= line_step

        # More text to fill page
        y -= 0.2 * inch
        c.setFont("Helvetica", 9)
        for line in filler_lines:
            c.drawString(left_margin, y, line)
            y -= line_step

        # Footer
        c.setFont("Helvetica", 8)
        c.drawString(left_margin, 0.5 * inch, f"Page {page_num} of {num_pages} | Memory Ceiling Test Fixture")
        c.drawRightString(7.5 * inch, 0.5 * inch, "Generated by generate_stress_pdf.py")

        c.showPage()

    c.save()

    size_mb = output_path.stat().st_size / 1024 / 1024
    print(f"Generated: {output_path} ({num_pages} pages, {size_mb:.1f} MB)")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Reads the required ``--pages`` and ``-o/--output`` options, rejects a
    non-positive page count through the parser's usage error, and otherwise
    hands both values to ``generate_stress_pdf``.
    """
    cli = argparse.ArgumentParser(description="Generate stress-test PDFs for memory ceiling testing")
    cli.add_argument("--pages", type=int, required=True, help="Number of pages to generate (e.g., 100, 10000)")
    cli.add_argument("-o", "--output", type=Path, required=True, help="Output PDF path")
    opts = cli.parse_args()

    # argparse only guarantees an integer; the sign is checked here so the
    # failure is reported as a normal usage error.
    if opts.pages < 1:
        cli.error("--pages must be positive")

    generate_stress_pdf(opts.output, opts.pages)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, so importing the
# module has no side effects.
if __name__ == "__main__":
    main()
|