# pdftract/tools/generate_stress_pdf.py

#!/usr/bin/env python3
"""
Generate synthetic stress-test PDFs for memory ceiling testing.

Creates large-page-count PDFs to validate memory targets:
- 100-page vector PDF for buffered mode testing (target: < 512 MB)
- 10,000-page stress test for streaming mode validation (target: < 256 MB)

Usage:
    python tools/generate_stress_pdf.py --pages 100 -o tests/fixtures/perf/100-page-vector.pdf
    python tools/generate_stress_pdf.py --pages 10000 -o tests/fixtures/perf/10k-page.pdf
"""

import argparse
from pathlib import Path
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch


# Body paragraphs repeated verbatim on every page. Defined once at module
# level so they are not rebuilt for each of (potentially) thousands of pages.
_BODY_PARAGRAPHS = (
    "This is a synthetic stress test PDF designed for memory ceiling validation. "
    "Each page contains structured text and table data to simulate real-world document "
    "extraction workloads.",

    "The memory targets are: Peak RSS for 100-page vector PDF (buffered mode) < 512 MB, "
    "Peak RSS for streaming/NDJSON mode < 256 MB (must stay constant as page count grows), "
    "Peak RSS for adversarial fixtures < 1 GB hard ceiling.",

    "This paragraph contains additional text to increase page content size. "
    "Memory ceiling tests use these documents to verify that pdftract maintains "
    "reasonable memory usage regardless of document size or complexity.",
)

# Trailing filler sentence drawn below the table on every page.
_FILLER_TEXT = (
    "Additional content ensures each page has consistent data density. "
    "Memory profiling during extraction exercises RSS measurement code paths "
    "and validates that streaming mode maintains constant memory footprint."
)


def _draw_wrapped(c, text: str, x: float, y: float, max_width: float,
                  font_name: str, font_size: float, leading: float) -> float:
    """
    Draw *text* left-aligned at *x*, greedily word-wrapped to *max_width*.

    The caller is responsible for selecting the font on the canvas;
    *font_name* and *font_size* are used only to measure candidate lines.

    Args:
        c: Canvas providing ``stringWidth`` and ``drawString``
        text: Text to draw; split on whitespace into words
        x: Left edge of every line
        y: Baseline of the first line
        max_width: A line is kept only while its width is strictly below this
        font_name: Font name used for width measurement
        font_size: Font size used for width measurement
        leading: Vertical distance subtracted from y after each drawn line

    Returns:
        The y coordinate one *leading* below the last line drawn
        (unchanged if *text* contains no words).
    """
    line = ""
    for word in text.split():
        candidate = f"{line} {word}" if line else word
        # A word that is too wide on its own is still placed on an otherwise
        # empty line rather than emitting a blank line first.
        if not line or c.stringWidth(candidate, font_name, font_size) < max_width:
            line = candidate
            continue
        c.drawString(x, y, line)
        y -= leading
        line = word
    if line:
        c.drawString(x, y, line)
        y -= leading
    return y


def generate_stress_pdf(output_path: Path, num_pages: int) -> None:
    """
    Generate a multi-page PDF with synthetic content.

    Each page contains:
    - Header with page number
    - Multiple paragraphs of text
    - A table with structured data
    - A word-wrapped filler sentence
    - Footer with page count

    The parent directory of *output_path* is created if it does not exist,
    and a one-line summary (path, page count, file size) is printed once the
    file has been saved.

    Args:
        output_path: Path where the PDF will be written
        num_pages: Number of pages to generate; the page loop runs over
            ``range(1, num_pages + 1)``, so it draws nothing for values
            below 1
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)

    left = 1*inch
    text_width = 6.5*inch
    leading = 0.15*inch
    # Left edge of table columns A-D.
    columns = (("A", 1*inch), ("B", 2.5*inch), ("C", 4*inch), ("D", 5.5*inch))

    c = canvas.Canvas(str(output_path), pagesize=letter)

    for page_num in range(1, num_pages + 1):
        # Header
        c.setFont("Helvetica-Bold", 12)
        c.drawString(left, 10.5*inch, f"Stress Test Document - Page {page_num} of {num_pages}")

        # Content - paragraphs of text
        c.setFont("Helvetica", 10)
        y = 10*inch
        for para in _BODY_PARAGRAPHS:
            y = _draw_wrapped(c, para, left, y, text_width, "Helvetica", 10, leading)

        # Table section
        y -= 0.2*inch
        c.setFont("Helvetica-Bold", 10)
        c.drawString(left, y, "Sample Data Table")
        y -= 0.2*inch

        # Table header, underlined by a horizontal rule
        c.setFont("Helvetica-Bold", 9)
        for label, col_x in columns:
            c.drawString(col_x, y, f"Column {label}")
        y -= leading
        c.line(left, y, 7*inch, y)
        y -= leading

        # Table rows
        c.setFont("Helvetica", 9)
        for row in range(5):
            for label, col_x in columns:
                c.drawString(col_x, y, f"Data {page_num}-{row}-{label}")
            y -= leading

        # More text to fill page. drawString does not wrap, so the long
        # filler sentence must be wrapped explicitly or it runs off the
        # right edge of the page.
        y -= 0.2*inch
        c.setFont("Helvetica", 9)
        y = _draw_wrapped(c, _FILLER_TEXT, left, y, text_width, "Helvetica", 9, leading)

        # Footer
        c.setFont("Helvetica", 8)
        c.drawString(left, 0.5*inch, f"Page {page_num} of {num_pages} | Memory Ceiling Test Fixture")
        c.drawRightString(7.5*inch, 0.5*inch, "Generated by generate_stress_pdf.py")

        c.showPage()

    c.save()
    print(f"Generated: {output_path} ({num_pages} pages, {output_path.stat().st_size / 1024 / 1024:.1f} MB)")


def main():
    """
    Command-line entry point.

    Reads the required ``--pages`` (int) and ``-o/--output`` (Path) options,
    rejects non-positive page counts through the parser's error reporting,
    and otherwise hands both values to ``generate_stress_pdf``.
    """
    cli = argparse.ArgumentParser(description="Generate stress-test PDFs for memory ceiling testing")
    cli.add_argument("--pages", type=int, required=True,
                     help="Number of pages to generate (e.g., 100, 10000)")
    cli.add_argument("-o", "--output", type=Path, required=True,
                     help="Output PDF path")

    options = cli.parse_args()
    page_count = options.pages

    # Guard: only strictly positive page counts are accepted.
    if page_count < 1:
        cli.error("--pages must be positive")

    generate_stress_pdf(options.output, page_count)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()