pdftract/tests/stream_decoder/fixtures/gen_bomb_zlib.py
jedarden 225f96c241 fix(pyo3): correct extract_text_fn call in extract_markdown stub
The extract_markdown stub was calling extract_text instead of
extract_text_fn, causing a compilation error. This fixes the
function name to match the exported function from extract_text.rs.

This completes the extract_text PyO3 entry point implementation,
which was already present in extract_text.rs and lib.rs.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 20:28:25 -04:00

109 lines
3.9 KiB
Python

#!/usr/bin/env python3
"""Generate a 3GB zlib bomb for testing stream decoder bomb limit.
Uses zlib format (not raw DEFLATE) to match pdftract's FlateDecoder (ZlibDecoder).
Creates a ~3MB input (DEFLATE's ratio tops out near 1032:1) that expands to ~3GB when decompressed.
"""
import zlib
import os
def create_zlib_bomb(target_size_gb=3, byte_to_repeat=b'\x00'):
    """Build a zlib stream that inflates to ``target_size_gb`` GiB of a repeated pattern.

    The payload is ``byte_to_repeat`` repeated (and truncated) to exactly the
    target length. It is fed to the compressor in bounded pieces so the full
    payload never has to exist in memory. DEFLATE's ratio is capped at roughly
    1032:1, so the result is on the order of 1 MB per GiB of output, not a
    few KB.

    Args:
        target_size_gb: Decompressed size in GiB (1024**3 bytes). Fractional
            values are accepted and rounded down to whole bytes; values <= 0
            produce an empty payload.
        byte_to_repeat: Non-empty bytes pattern to repeat.

    Returns:
        A ``(bomb_data, total_input)`` tuple: the compressed bytes and the
        number of uncompressed bytes that were fed to the compressor.

    Raises:
        ValueError: If ``byte_to_repeat`` is empty.
    """
    pattern_len = len(byte_to_repeat)
    if pattern_len == 0:
        raise ValueError("byte_to_repeat must not be empty")

    # int() lets callers pass fractional sizes (e.g. 0.5) without breaking range().
    target_size = max(0, int(target_size_gb * 1024 * 1024 * 1024))

    # Feed at most ~100MB at a time. The chunk is a whole number of patterns so
    # every chunk starts in phase and the output is one continuous repetition.
    max_chunk_size = 100 * 1024 * 1024
    patterns_per_chunk = max(1, min(max_chunk_size, target_size) // pattern_len)
    chunk_size = patterns_per_chunk * pattern_len
    num_chunks = (target_size + chunk_size - 1) // chunk_size

    # The chunk content never changes, so build it once and reuse it; the
    # memoryview gives a zero-copy slice for the final, shorter piece.
    chunk = byte_to_repeat * patterns_per_chunk
    chunk_view = memoryview(chunk)

    # Default wbits (15) yields a zlib-wrapped stream rather than raw DEFLATE.
    compressor = zlib.compressobj(level=9, memLevel=9)
    compressed_chunks = []
    total_input = 0

    print(f"Creating bomb that expands to {target_size_gb}GB...")
    print(f"Using {num_chunks} chunks of {chunk_size // (1024*1024)}MB each...")

    for i in range(num_chunks):
        this_chunk_size = min(chunk_size, target_size - total_input)
        if this_chunk_size == chunk_size:
            piece = chunk
        else:
            piece = chunk_view[:this_chunk_size]
        compressed_chunk = compressor.compress(piece)
        if compressed_chunk:
            compressed_chunks.append(compressed_chunk)
        total_input += this_chunk_size
        if i % 10 == 0:
            print(f" Processed {total_input / (1024**3):.1f}GB / {target_size_gb}GB...")

    # Flush any data still buffered inside the compressor.
    compressed_chunks.append(compressor.flush())
    bomb_data = b''.join(compressed_chunks)

    print(f"Input: {total_input} bytes ({total_input / (1024**3):.2f} GB)")
    print(f"Compressed to: {len(bomb_data)} bytes ({len(bomb_data) / 1024:.2f} KB)")
    print(f"Compression ratio: {total_input / len(bomb_data):.1f}x")
    return bomb_data, total_input
def _measure_decompressed(data, prefix_len=1024, max_chunk=64 * 1024 * 1024):
    """Inflate ``data`` in bounded pieces; return ``(total_size, first prefix_len bytes)``."""
    decompressor = zlib.decompressobj()
    total_size = 0
    prefix = bytearray()

    pending = data
    while pending:
        # max_length caps each output piece; input not yet consumed is handed
        # back in unconsumed_tail and fed in again on the next pass.
        piece = decompressor.decompress(pending, max_chunk)
        pending = decompressor.unconsumed_tail
        total_size += len(piece)
        if len(prefix) < prefix_len:
            prefix += piece[:prefix_len - len(prefix)]

    # Collect whatever output is still buffered once all input is consumed.
    piece = decompressor.flush()
    total_size += len(piece)
    if len(prefix) < prefix_len:
        prefix += piece[:prefix_len - len(prefix)]

    return total_size, bytes(prefix)


def main():
    """Generate the 3GB bomb fixture and its companion files next to this script.

    Writes ``flate_bomb_3gb.bin`` (the compressed stream),
    ``flate_bomb_3gb.expected`` (the first 1KB of the decompressed output) and
    ``flate_bomb_3gb.meta`` (a short text description) into the directory
    containing this file.

    Raises:
        RuntimeError: If the decompressed size differs from the number of
            bytes that were compressed.
    """
    fixtures_dir = os.path.dirname(os.path.abspath(__file__))

    # Generate the bomb
    bomb_data, actual_input_size = create_zlib_bomb(target_size_gb=3)

    # Save the bomb fixture
    bomb_path = os.path.join(fixtures_dir, 'flate_bomb_3gb.bin')
    with open(bomb_path, 'wb') as f:
        f.write(bomb_data)
    print(f"Bomb fixture saved: {bomb_path}")

    # Verify decompression by streaming, so the ~3GB payload is never held in
    # memory at once; only its size and first 1KB are kept.
    decompressed_size, head = _measure_decompressed(bomb_data, prefix_len=1024)
    if decompressed_size != actual_input_size:
        raise RuntimeError(
            f"Decompressed size {decompressed_size} does not match "
            f"compressed input size {actual_input_size}"
        )
    print(f"Verified decompression: {decompressed_size} bytes ({decompressed_size / (1024**3):.2f} GB)")

    # Save expected file (first 1KB of decompressed data)
    expected_path = os.path.join(fixtures_dir, 'flate_bomb_3gb.expected')
    with open(expected_path, 'wb') as f:
        f.write(head)
    print(f"Expected file saved: {expected_path}")

    # Save meta file
    meta_path = os.path.join(fixtures_dir, 'flate_bomb_3gb.meta')
    with open(meta_path, 'w', encoding='utf-8') as f:
        f.write(f"FlateDecode: {len(bomb_data)} bytes input -> {decompressed_size} bytes output\n")
        f.write("Tests bomb limit of 2GB (should truncate)\n")
    print(f"Meta file saved: {meta_path}")
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()