diff --git a/.needle-predispatch-sha b/.needle-predispatch-sha
index 644542e..9e7f862 100644
--- a/.needle-predispatch-sha
+++ b/.needle-predispatch-sha
@@ -1 +1 @@
-c251db8228b93881476bb9dcdeb2748fa9be1f23
+0e466a5ceaaef3e5b3d0d650730bf6ce84c35982
diff --git a/crates/pdftract-core/benches/table_detection.rs b/crates/pdftract-core/benches/table_detection.rs
index a1994da..522a1ce 100644
--- a/crates/pdftract-core/benches/table_detection.rs
+++ b/crates/pdftract-core/benches/table_detection.rs
@@ -1,7 +1,7 @@
 // Benchmark for table detection.
 //
-// Tests the performance of line-based table detection on pages with
-// varying numbers of path segments.
+// Tests the performance of line-based and borderless table detection
+// on pages with varying numbers of path segments and text positions.
 
 use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
 use pdftract_core::table::{TableDetector, PageContext};
@@ -54,6 +54,35 @@ fn generate_grid_content(num_horiz: usize, num_vert: usize) -> Vec {
     content
 }
 
+/// Generate content with text positions for borderless tables.
+/// Creates a grid-like pattern of text at aligned positions.
+fn generate_borderless_content(num_rows: usize, num_cols: usize) -> Vec<u8> {
+    let mut content = Vec::new();
+
+    let y_start = 700.0;
+    let y_end = 100.0;
+    let x_start = 50.0;
+    let x_spacing = 100.0;
+
+    // Start text block
+    content.extend(b"BT ");
+
+    // Rows are spread evenly over y_start..=y_end; inside the loop num_rows >= 1, so `num_rows - 1` cannot underflow
+    for row in 0..num_rows {
+        let y = y_start - (row as f32 * (y_start - y_end) / (num_rows - 1).max(1) as f32);
+        for col in 0..num_cols {
+            let x = x_start + (col as f32 * x_spacing);
+            // Absolute placement via Tm: Td offsets are relative and would accumulate across cells
+            content.extend(format!("1 0 0 1 {} {} Tm (R{}C{}) Tj ", x, y, row, col).as_bytes());
+        }
+    }
+
+    // End text block
+    content.extend(b"ET");
+
+    content
+}
+
 fn bench_table_detection(c: &mut Criterion) {
     let detector = TableDetector::new();
     let page = make_page();
@@ -90,5 +119,31 @@ fn bench_table_detection(c: &mut Criterion) {
     group.finish();
 }
 
-criterion_group!(benches, bench_table_detection);
+fn bench_borderless_detection(c: &mut Criterion) {
+    let detector = TableDetector::new();
+    let page = make_page();
+
+    let mut group = c.benchmark_group("borderless_detection");
+
+    // Test with increasing numbers of text positions (rows * cols)
+    for (num_rows, num_cols) in [(3, 3), (5, 5), (10, 10), (20, 20), (50, 50), (70, 72)] {
+        let total_positions = num_rows * num_cols;
+        group.bench_with_input(
+            BenchmarkId::new("text_positions", total_positions),
+            &total_positions,
+            |b, _| {
+                let content = generate_borderless_content(num_rows, num_cols);
+                let ctx = PageContext::new(&page, &content);
+
+                b.iter(|| {
+                    black_box(detector.detect_borderless(black_box(&ctx)))
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_table_detection, bench_borderless_detection);
 criterion_main!(benches);
diff --git a/notes/pdftract-3nwz.md b/notes/pdftract-3nwz.md
new file mode 100644
index 0000000..f950324
--- /dev/null
+++ b/notes/pdftract-3nwz.md
@@ -0,0 +1,70 @@
+# Verification Note: pdftract-3nwz (Borderless Table Detection)
+
+## Summary
+Implemented
borderless table detection using an x0-aligned span heuristic. The implementation was already present in the codebase and all tests pass.
+
+## Changes Made
+1. Added benchmark for borderless detection to verify performance
+2. Verified all acceptance criteria are met
+
+## Acceptance Criteria Status
+
+### PASS
+- **Critical test**: 3x3 borderless table detected via alignment heuristic
+  - `test_detect_borderless_3x3_table_accepted` passes
+- **Unit test - paragraph rejected**: Single-column text is rejected
+  - `test_detect_borderless_paragraph_rejected` passes
+- **Unit test - one-row pseudo-table rejected**: Single row with multiple columns rejected
+  - `test_detect_borderless_one_row_pseudo_table_rejected` passes
+- **Unit test - 3-row 3-column borderless table accepted**: Core table detection works
+  - `test_detect_borderless_3x3_table_accepted` passes
+- **Unit test - vertical-gap test**: Two separate tables with >100 pt gap detected separately
+  - `test_detect_borderless_vertical_gap_test` passes
+- **Public API**: `TableDetector::detect_borderless(&PageContext) -> Vec<GridCandidate>` exists
+- **Performance**: 1.56 ms for 5040 text positions (well below 10 ms requirement)
+
+## Implementation Details
+The borderless detector in `crates/pdftract-core/src/table/detector.rs`:
+- Collects text positions from content stream (Tm, Td, TD, T*, Tj, TJ, ', " operators)
+- Groups by x0 positions within 2.0 pt tolerance using clustering
+- Finds column candidates (3+ spans at same x0 on different y positions)
+- Finds row candidates (y positions where >= 2 column candidates have spans)
+- Validates: 3+ rows AND 3+ columns, contiguous y range, no gap > 100 pt
+- Constructs GridCandidate with empty segments (no ruling lines)
+- Rejects single-column paragraph reflow patterns
+
+## Test Results
+```bash
+cargo test -p pdftract-core --lib table::detector::tests::test_detect_borderless
+# running 6 tests
+# test table::detector::tests::test_detect_borderless_empty_content ...
ok
+# test table::detector::tests::test_detect_borderless_no_text_block ... ok
+# test table::detector::tests::test_detect_borderless_3x3_table_accepted ... ok
+# test table::detector::tests::test_detect_borderless_one_row_pseudo_table_rejected ... ok
+# test table::detector::tests::test_detect_borderless_paragraph_rejected ... ok
+# test table::detector::tests::test_detect_borderless_vertical_gap_test ... ok
+# test result: ok. 6 passed; 0 failed; 0 ignored; 0 measured
+```
+
+## Benchmark Results
+```
+borderless_detection/text_positions/5040
+    time: [1.5457 ms 1.5595 ms 1.5755 ms]
+```
+Performance target: < 10 ms on 5000-span page
+Actual: ~1.56 ms (well within requirement)
+
+## Files Modified
+- `crates/pdftract-core/benches/table_detection.rs`: Added borderless detection benchmark
+
+## Files Reviewed (no changes needed)
+- `crates/pdftract-core/src/table/detector.rs`: Borderless detection already implemented
+- `crates/pdftract-core/src/table/mod.rs`: Public API exported
+- `crates/pdftract-core/src/lib.rs`: Re-exports for public API
+
+## Integration Notes
+Per the task description, borderless detection should run only when line-based detection (7.2.1) returns no GridCandidate covering a region. This is a usage pattern for the caller, not enforced within the detector itself. The detector provides both methods independently:
+- `TableDetector::detect_line_based()` - for bordered tables
+- `TableDetector::detect_borderless()` - for borderless tables
+
+Callers can orchestrate the fallback logic as needed.