#!/usr/bin/env python3
"""
Generate synthetic stress-test PDFs for memory ceiling testing.

Creates large-page-count PDFs to validate memory targets:
- 100-page vector PDF for buffered mode testing (target: < 512 MB)
- 10,000-page stress test for streaming mode validation (target: < 256 MB)

Usage:
    python tools/generate_stress_pdf.py --pages 100 -o tests/fixtures/perf/100-page-vector.pdf
    python tools/generate_stress_pdf.py --pages 10000 -o tests/fixtures/perf/10k-page.pdf
"""

import argparse
from pathlib import Path

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch

# Page layout shared by every generated page (values in PDF points).
_LEFT_MARGIN = 1 * inch
_TEXT_WIDTH = 6.5 * inch
_LINE_HEIGHT = 0.15 * inch
_SECTION_GAP = 0.2 * inch

# (x position, column label) for the four table columns.
_TABLE_COLUMNS = (
    (1 * inch, "A"),
    (2.5 * inch, "B"),
    (4 * inch, "C"),
    (5.5 * inch, "D"),
)
_TABLE_ROWS = 5

# Body paragraphs; identical on every page.
_PARAGRAPHS = (
    "This is a synthetic stress test PDF designed for memory ceiling validation. "
    "Each page contains structured text and table data to simulate real-world document "
    "extraction workloads.",
    "The memory targets are: Peak RSS for 100-page vector PDF (buffered mode) < 512 MB, "
    "Peak RSS for streaming/NDJSON mode < 256 MB (must stay constant as page count grows), "
    "Peak RSS for adversarial fixtures < 1 GB hard ceiling.",
    "This paragraph contains additional text to increase page content size. "
    "Memory ceiling tests use these documents to verify that pdftract maintains "
    "reasonable memory usage regardless of document size or complexity.",
)

_FILLER_TEXT = (
    "Additional content ensures each page has consistent data density. "
    "Memory profiling during extraction exercises RSS measurement code paths "
    "and validates that streaming mode maintains constant memory footprint."
)


def _wrap_text(c, text: str, font_name: str, font_size: float, max_width: float) -> list:
    """Greedily split *text* into lines whose rendered width stays below *max_width*.

    Args:
        c: Canvas used only for its ``stringWidth`` measurement.
        text: Text to wrap; split on whitespace.
        font_name: Font the text will be drawn with.
        font_size: Font size in points.
        max_width: Width limit in points.

    Returns:
        The wrapped lines, in order. A single word wider than *max_width* is
        kept on a line of its own rather than being split.
    """
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if c.stringWidth(candidate, font_name, font_size) < max_width:
            current = candidate
            continue
        # Flush only a non-empty line, so an over-wide first word does not
        # produce a blank line.
        if current:
            lines.append(current)
        current = word
    if current:
        lines.append(current)
    return lines


def _draw_lines(c, lines, y: float) -> float:
    """Draw *lines* top-down from *y* at the left margin and return the next free y."""
    for text_line in lines:
        c.drawString(_LEFT_MARGIN, y, text_line)
        y -= _LINE_HEIGHT
    return y


def generate_stress_pdf(output_path: Path, num_pages: int) -> None:
    """
    Generate a multi-page PDF with synthetic content.

    Each page contains:
    - Header with page number
    - Multiple paragraphs of text
    - A table with structured data
    - A wrapped block of filler text
    - Footer with page count

    The parent directory of ``output_path`` is created if it does not exist,
    and a one-line summary (path, page count, size in MB) is printed once the
    file has been written.

    Args:
        output_path: Path where the PDF will be written
        num_pages: Number of pages to generate (the CLI rejects values <= 0)
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    c = canvas.Canvas(str(output_path), pagesize=letter)

    # The static text is the same on every page, so wrap it once here instead
    # of re-measuring every word on each page.
    body_lines = [
        wrapped
        for para in _PARAGRAPHS
        for wrapped in _wrap_text(c, para, "Helvetica", 10, _TEXT_WIDTH)
    ]
    # drawString does not wrap, so the long filler sentence is wrapped to the
    # text column to keep it from running past the right margin.
    filler_lines = _wrap_text(c, _FILLER_TEXT, "Helvetica", 9, _TEXT_WIDTH)

    for page_num in range(1, num_pages + 1):
        # Header
        c.setFont("Helvetica-Bold", 12)
        c.drawString(
            _LEFT_MARGIN,
            10.5 * inch,
            f"Stress Test Document - Page {page_num} of {num_pages}",
        )

        # Content - paragraphs of text
        c.setFont("Helvetica", 10)
        y = _draw_lines(c, body_lines, 10 * inch)

        # Table section
        y -= _SECTION_GAP
        c.setFont("Helvetica-Bold", 10)
        c.drawString(_LEFT_MARGIN, y, "Sample Data Table")
        y -= _SECTION_GAP

        # Table header
        c.setFont("Helvetica-Bold", 9)
        for x, label in _TABLE_COLUMNS:
            c.drawString(x, y, f"Column {label}")
        y -= _LINE_HEIGHT
        c.line(1 * inch, y, 7 * inch, y)
        y -= _LINE_HEIGHT

        # Table rows (cell text is unique per page/row/column)
        c.setFont("Helvetica", 9)
        for row in range(_TABLE_ROWS):
            for x, label in _TABLE_COLUMNS:
                c.drawString(x, y, f"Data {page_num}-{row}-{label}")
            y -= _LINE_HEIGHT

        # More text to fill page
        y -= _SECTION_GAP
        c.setFont("Helvetica", 9)
        _draw_lines(c, filler_lines, y)

        # Footer
        c.setFont("Helvetica", 8)
        c.drawString(
            _LEFT_MARGIN,
            0.5 * inch,
            f"Page {page_num} of {num_pages} | Memory Ceiling Test Fixture",
        )
        c.drawRightString(7.5 * inch, 0.5 * inch, "Generated by generate_stress_pdf.py")

        c.showPage()

    c.save()

    size_mb = output_path.stat().st_size / 1024 / 1024
    print(f"Generated: {output_path} ({num_pages} pages, {size_mb:.1f} MB)")


def main() -> None:
    """Parse ``--pages`` and ``-o/--output`` from the command line and generate the PDF.

    Exits through ``parser.error`` when ``--pages`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        description="Generate stress-test PDFs for memory ceiling testing"
    )
    parser.add_argument(
        "--pages",
        type=int,
        required=True,
        help="Number of pages to generate (e.g., 100, 10000)"
    )
    parser.add_argument(
        "-o", "--output",
        type=Path,
        required=True,
        help="Output PDF path"
    )

    args = parser.parse_args()

    if args.pages <= 0:
        parser.error("--pages must be positive")

    generate_stress_pdf(args.output, args.pages)


if __name__ == "__main__":
    main()