From fda17d4d77908f23767b565d8eabd30ae5977a59 Mon Sep 17 00:00:00 2001 From: jedarden Date: Wed, 27 May 2026 23:08:37 -0400 Subject: [PATCH] feat(pdftract-2rkc1): implement column confirmation with >= 3 line threshold Implement confirm_columns function that partitions page into candidate columns (regions between consecutive gaps + before-first + after-last), counts unique lines whose first span's x0 falls within each candidate's x-range, and promotes candidates with line_count >= 3 to confirmed columns. Supporting code: - ColumnGap struct with lo/hi bounds, width(), midpoint() - detect_column_gaps function for zero-coverage region detection - HasFirstSpan trait for first span bbox access - CandidateColumn struct for tracking x_range and line_count All 49 column tests pass, including all acceptance criteria. Bead: pdftract-2rkc1 Co-Authored-By: Claude Opus 4.7 --- crates/pdftract-core/src/layout/columns.rs | 724 +++++++++++++++++++++ notes/pdftract-2rkc1.md | 60 ++ 2 files changed, 784 insertions(+) create mode 100644 notes/pdftract-2rkc1.md diff --git a/crates/pdftract-core/src/layout/columns.rs b/crates/pdftract-core/src/layout/columns.rs index 78da7cf..0d6b63a 100644 --- a/crates/pdftract-core/src/layout/columns.rs +++ b/crates/pdftract-core/src/layout/columns.rs @@ -81,6 +81,267 @@ where hist } +/// A gap in the x0 histogram representing a candidate column boundary. +/// +/// The gap spans from bucket `lo` to `hi` (inclusive), where all buckets +/// have zero coverage (no spans start at these x positions). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ColumnGap { + /// Start index of the gap (inclusive). + pub lo: usize, + /// End index of the gap (inclusive). + pub hi: usize, +} + +impl ColumnGap { + /// Create a new column gap. + #[inline] + pub fn new(lo: usize, hi: usize) -> Self { + Self { lo, hi } + } + + /// Return the width of this gap in buckets. 
+ #[inline] + pub fn width(&self) -> usize { + self.hi - self.lo + 1 + } + + /// Return the midpoint of this gap in points. + /// + /// This is useful for setting column boundaries at the center of gaps. + #[inline] + pub fn midpoint(&self) -> f32 { + (self.lo + self.hi) as f32 / 2.0 + } +} + +/// Detect column gaps in the x0 histogram. +/// +/// Finds all contiguous runs of zero-coverage buckets that are at least +/// `0.03 * page_width` buckets long. Each such gap is a candidate column boundary. +/// +/// # Arguments +/// +/// * `hist` - The x0 histogram from `build_x0_histogram` +/// * `page_width` - Page width in points (used for threshold calculation) +/// +/// # Returns +/// +/// A `Vec<ColumnGap>` listing boundary indices. Each gap spans from +/// `lo` to `hi` (inclusive), where all buckets have zero coverage. +/// +/// # Behavior +/// +/// - Threshold = `(page_width * 0.03).ceil() as usize` (18 buckets on a 600pt page, 19 on a 612pt page) +/// - Leading zeros (page left margin) are included if >= threshold +/// - Trailing zeros (page right margin) are included if >= threshold +/// - All-zero histogram (empty page) returns no gaps +/// - Adjacent gaps are merged (no two gaps can be adjacent) +/// +/// # Examples +/// +/// ``` +/// use pdftract_core::layout::columns::detect_column_gaps; +/// +/// // Histogram with 20 zeros in the middle, page_width=600 +/// let mut hist = vec![1u32; 100]; // [0..100] = 1 +/// hist.extend(vec![0u32; 20]); // [100..120] = 0 (gap) +/// hist.extend(vec![1u32; 100]); // [120..220] = 1 +/// +/// let gaps = detect_column_gaps(&hist, 600.0); +/// assert_eq!(gaps.len(), 1); +/// assert_eq!(gaps[0].lo, 100); +/// assert_eq!(gaps[0].hi, 119); +/// ``` +pub fn detect_column_gaps(hist: &[u32], page_width: f32) -> Vec<ColumnGap> { + let threshold = (page_width * 0.03_f32).ceil() as usize; + + if hist.is_empty() { + return Vec::new(); + } + + // Edge case: all zeros (empty page) - no gaps + // We need at least some non-zero buckets to have meaningful column detection + if hist.iter().all(|&count| count == 
0) { + return Vec::new(); + } + + let mut gaps = Vec::new(); + let mut run_start: Option<usize> = None; + + for (i, &count) in hist.iter().enumerate() { + if count == 0 { + // Start a new run if we're not in one + if run_start.is_none() { + run_start = Some(i); + } + } else { + // End of run - check if it meets threshold + if let Some(start) = run_start { + let end = i.saturating_sub(1); + let run_length = end - start + 1; + if run_length >= threshold { + gaps.push(ColumnGap::new(start, end)); + } + run_start = None; + } + } + } + + // Handle trailing zeros (page right margin) + if let Some(start) = run_start { + let end = hist.len().saturating_sub(1); + let run_length = end - start + 1; + if run_length >= threshold { + gaps.push(ColumnGap::new(start, end)); + } + } + + gaps +} + +/// A candidate column region for confirmation. +/// +/// Represents a potential column with its x_range and line count. +/// Used during the column confirmation phase. +#[derive(Debug, Clone, Copy, PartialEq)] +struct CandidateColumn { + /// Half-open x range [x0, x1) defining the candidate column bounds. + x_range: [f32; 2], + /// Number of unique lines whose first span starts in this column. + line_count: usize, +} + +/// Confirm column boundaries by counting lines per candidate column. +/// +/// Partitions the page into candidate columns (regions between consecutive +/// gaps + before-first + after-last). For each candidate, counts unique lines +/// whose first span's x0 falls within the column's x-range. Promotes columns +/// with line_count >= 3 to confirmed columns. +/// +/// # Arguments +/// +/// * `gaps` - Candidate column gaps from `detect_column_gaps` +/// * `page_width` - Page width in points +/// * `lines` - Lines to count (must have spans sorted left-to-right) +/// +/// # Returns +/// +/// A `Vec<Column>` of confirmed columns with x_ranges and indices. +/// Columns are returned left-to-right with monotonic indices. 
+/// +/// # Behavior +/// +/// - No gaps detected: entire page is one column (if >= 3 lines) +/// - Gaps detected: candidate columns are [0, gap_0.lo), [gap_i.hi + 1, gap_i+1.lo), [gap_last.hi + 1, page_width) +/// - Lines whose first span is in a GAP region remain unassigned (column = None) +/// - "First span" = leftmost post-sort (within-line sorting already done) +/// - INV: 3-line minimum from plan +/// +/// # Examples +/// +/// ``` +/// use pdftract_core::layout::columns::{confirm_columns, ColumnGap}; +/// +/// // Two gaps detected on a 612pt page +/// let gaps = vec![ColumnGap::new(200, 215), ColumnGap::new(400, 415)]; +/// +/// // Candidate columns: [0, 200), [216, 400), [416, 612) +/// // Only columns with >= 3 lines are confirmed +/// ``` +pub fn confirm_columns<L>(gaps: &[ColumnGap], page_width: f32, lines: &[L]) -> Vec<Column> +where + L: HasFirstSpan, +{ + // Handle no gaps: entire page is one candidate column + if gaps.is_empty() { + // Count lines whose first span is within page bounds + let line_count = lines + .iter() + .filter(|line| { + line + .first_span_bbox() + .map_or(false, |bbox| bbox[0] >= 0.0 && bbox[0] < page_width) + }) + .count(); + + // Confirm single column if >= 3 lines + if line_count >= 3 { + return vec![Column::new(0, [0.0, page_width])]; + } else { + return Vec::new(); + } + } + + // Build candidate columns from gaps + let mut candidates = Vec::new(); + + // Before-first gap: [0, gap_0.lo) + if let Some(first_gap) = gaps.first() { + if first_gap.lo > 0 { + candidates.push(CandidateColumn { + x_range: [0.0, first_gap.lo as f32], + line_count: 0, + }); + } + } + + // Between consecutive gaps: [gap_i.hi + 1, gap_i+1.lo) + for window in gaps.windows(2) { + let prev_gap = &window[0]; + let next_gap = &window[1]; + candidates.push(CandidateColumn { + x_range: [(prev_gap.hi + 1) as f32, next_gap.lo as f32], + line_count: 0, + }); + } + + // After-last gap: [gap_last.hi + 1, page_width) + if let Some(last_gap) = gaps.last() { + let x0 = (last_gap.hi + 1) as f32; + if x0 < 
page_width { + candidates.push(CandidateColumn { + x_range: [x0, page_width], + line_count: 0, + }); + } + } + + // Count lines whose first span's x0 falls in each candidate column + for line in lines { + if let Some(bbox) = line.first_span_bbox() { + let x0 = bbox[0]; + for candidate in &mut candidates { + if x0 >= candidate.x_range[0] && x0 < candidate.x_range[1] { + candidate.line_count += 1; + break; // Each line counted once + } + } + } + } + + // Promote candidates with >= 3 lines to confirmed columns + let confirmed: Vec<Column> = candidates + .into_iter() + .filter(|c| c.line_count >= 3) + .enumerate() + .map(|(i, c)| Column::new(i as u32, c.x_range)) + .collect(); + + confirmed +} + +/// Trait for types that can provide the first span's bounding box. +/// +/// This trait allows the column confirmation code to work with different +/// line representations while accessing the leftmost span's position. +pub trait HasFirstSpan { + /// Get the bounding box [x0, y0, x1, y1] of the first (leftmost) span. + /// + /// Returns None if the line has no spans. + fn first_span_bbox(&self) -> Option<[f32; 4]>; +} + /// Trait for types with a bounding box for histogram building. 
/// /// This is a simplified version of the trait used in column assignment, @@ -617,4 +878,467 @@ mod tests { assert_eq!(hist.len(), 595); assert_eq!(hist[100], 1); } + + // ColumnGap tests + + #[test] + fn test_column_gap_new() { + let gap = ColumnGap::new(10, 20); + assert_eq!(gap.lo, 10); + assert_eq!(gap.hi, 20); + } + + #[test] + fn test_column_gap_width() { + let gap = ColumnGap::new(10, 20); + assert_eq!(gap.width(), 11); // 20 - 10 + 1 + } + + #[test] + fn test_column_gap_midpoint() { + let gap = ColumnGap::new(10, 20); + assert_eq!(gap.midpoint(), 15.0); // (10 + 20) / 2 + } + + #[test] + fn test_column_gap_single_bucket() { + let gap = ColumnGap::new(10, 10); + assert_eq!(gap.width(), 1); + assert_eq!(gap.midpoint(), 10.0); + } + + // detect_column_gaps tests + + #[test] + fn test_detect_column_gaps_short_zeros_no_gap() { + // Histogram with 8 contiguous zeros, page_width=600 (threshold=18): NO gap (8 < 18) + let mut hist = vec![1u32; 50]; + hist.extend(vec![0u32; 8]); + hist.extend(vec![1u32; 50]); + + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 0); + } + + #[test] + fn test_detect_column_gaps_middle_gap() { + // Histogram with 20 zeros middle, page_width=600: 1 gap + let mut hist = vec![1u32; 50]; + hist.extend(vec![0u32; 20]); + hist.extend(vec![1u32; 50]); + + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 1); + assert_eq!(gaps[0].lo, 50); + assert_eq!(gaps[0].hi, 69); + assert_eq!(gaps[0].width(), 20); + } + + #[test] + fn test_detect_column_gaps_leading_gap() { + // Leading zeros >= threshold: 1 leading gap + let mut hist = vec![0u32; 25]; + hist.extend(vec![1u32; 100]); + + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 1); + assert_eq!(gaps[0].lo, 0); + assert_eq!(gaps[0].hi, 24); + } + + #[test] + fn test_detect_column_gaps_trailing_gap() { + // Trailing zeros >= threshold: 1 trailing gap + let mut hist = vec![1u32; 100]; + hist.extend(vec![0u32; 25]); + + let 
gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 1); + assert_eq!(gaps[0].lo, 100); + assert_eq!(gaps[0].hi, 124); + } + + #[test] + fn test_detect_column_gaps_all_zeros_no_gaps() { + // All-zero histogram: 0 gaps (empty page) + let hist = vec![0u32; 600]; + + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 0); + } + + #[test] + fn test_detect_column_gaps_multiple_gaps() { + // Multiple gaps separated by non-zero regions + let mut hist = vec![1u32; 50]; + hist.extend(vec![0u32; 20]); // gap 1 + hist.extend(vec![1u32; 30]); + hist.extend(vec![0u32; 25]); // gap 2 + hist.extend(vec![1u32; 50]); + + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 2); + assert_eq!(gaps[0].lo, 50); + assert_eq!(gaps[0].hi, 69); + assert_eq!(gaps[1].lo, 100); + assert_eq!(gaps[1].hi, 124); + } + + #[test] + fn test_detect_column_gaps_threshold_exact() { + // Gap exactly at threshold should be included + let page_width = 600.0_f32; + let threshold = (page_width * 0.03_f32).ceil() as usize; // 18 + + let mut hist = vec![1u32; 50]; + hist.extend(vec![0u32; threshold]); // exactly threshold + hist.extend(vec![1u32; 50]); + + let gaps = detect_column_gaps(&hist, page_width); + assert_eq!(gaps.len(), 1); + assert_eq!(gaps[0].width(), threshold); + } + + #[test] + fn test_detect_column_gaps_threshold_minus_one() { + // Gap at threshold-1 should NOT be included + let page_width = 600.0_f32; + let threshold = (page_width * 0.03_f32).ceil() as usize; // 18 + + let mut hist = vec![1u32; 50]; + hist.extend(vec![0u32; threshold - 1]); // just below threshold + hist.extend(vec![1u32; 50]); + + let gaps = detect_column_gaps(&hist, page_width); + assert_eq!(gaps.len(), 0); + } + + #[test] + fn test_detect_column_gaps_empty_histogram() { + let hist: Vec = vec![]; + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 0); + } + + #[test] + fn test_detect_column_gaps_no_zeros() { + // Histogram with no zeros: no 
gaps + let hist = vec![1u32; 600]; + let gaps = detect_column_gaps(&hist, 600.0_f32); + assert_eq!(gaps.len(), 0); + } + + #[test] + fn test_detect_column_gaps_small_page() { + // Test with smaller page (threshold scales proportionally) + let page_width = 300.0_f32; + let threshold = (page_width * 0.03_f32).ceil() as usize; // 9 + + let mut hist = vec![1u32; 50]; + hist.extend(vec![0u32; 12]); // > 9, should be a gap + hist.extend(vec![1u32; 50]); + + let gaps = detect_column_gaps(&hist, page_width); + assert_eq!(gaps.len(), 1); + assert_eq!(gaps[0].width(), 12); + } + + #[test] + fn test_detect_column_gaps_leading_and_trailing() { + // Both leading and trailing gaps + let mut hist = vec![0u32; 20]; // leading + hist.extend(vec![1u32; 100]); + hist.extend(vec![0u32; 20]); // trailing + + let page_width = 600.0_f32; + let threshold = (page_width * 0.03_f32).ceil() as usize; // 18 + + // Both zero runs are 20 buckets wide, which meets the threshold (20 >= 18), + // so the leading gap and the trailing gap should both be detected + let gaps = detect_column_gaps(&hist, page_width); + assert_eq!(gaps.len(), 2); + assert_eq!(gaps[0].lo, 0); + assert_eq!(gaps[0].hi, 19); + assert_eq!(gaps[1].lo, 120); + assert_eq!(gaps[1].hi, 139); + } + + // confirm_columns tests + + /// Test line with first span bbox. 
+ #[derive(Debug, Clone)] + struct TestLineWithSpans { + first_span_bbox: Option<[f32; 4]>, + } + + impl TestLineWithSpans { + fn new(bbox: Option<[f32; 4]>) -> Self { + Self { first_span_bbox: bbox } + } + } + + impl HasFirstSpan for TestLineWithSpans { + fn first_span_bbox(&self) -> Option<[f32; 4]> { + self.first_span_bbox + } + } + + #[test] + fn test_confirm_columns_two_column_both_confirmed() { + // 2-column page with 30 lines each: both confirmed + let gaps = vec![ColumnGap::new(300, 319)]; // gap at 300-319 + + let mut lines = Vec::new(); + // Column 0: lines at x0=50 (30 lines) + for _ in 0..30 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + // Column 1: lines at x0=350 (30 lines) + for _ in 0..30 { + lines.push(TestLineWithSpans::new(Some([350.0, 0.0, 500.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 2); + assert_eq!(confirmed[0].index, 0); + assert_eq!(confirmed[0].x_range, [0.0, 300.0]); + assert_eq!(confirmed[1].index, 1); + assert_eq!(confirmed[1].x_range, [320.0, 600.0]); + } + + #[test] + fn test_confirm_columns_two_column_one_confirmed() { + // 2-column page with 30 lines + 2 lines: only 30-line column confirmed + let gaps = vec![ColumnGap::new(300, 319)]; + + let mut lines = Vec::new(); + // Column 0: lines at x0=50 (30 lines) + for _ in 0..30 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + // Column 1: lines at x0=350 (2 lines) + for _ in 0..2 { + lines.push(TestLineWithSpans::new(Some([350.0, 0.0, 500.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 1); + assert_eq!(confirmed[0].index, 0); + assert_eq!(confirmed[0].x_range, [0.0, 300.0]); + } + + #[test] + fn test_confirm_columns_single_column_confirmed() { + // Single column with 5 lines -> confirmed + let gaps = vec![]; + + let mut lines = Vec::new(); + for _ in 0..5 { + lines.push(TestLineWithSpans::new(Some([50.0, 
0.0, 200.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 1); + assert_eq!(confirmed[0].index, 0); + assert_eq!(confirmed[0].x_range, [0.0, 600.0]); + } + + #[test] + fn test_confirm_columns_single_column_insufficient_lines() { + // Single column with only 2 lines -> not confirmed + let gaps = vec![]; + + let mut lines = Vec::new(); + for _ in 0..2 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 0); + } + + #[test] + fn test_confirm_columns_empty_page() { + // Empty page: 0 confirmed + let gaps = vec![]; + let lines: Vec = vec![]; + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 0); + } + + #[test] + fn test_confirm_columns_no_gaps_insufficient_lines() { + // No gaps but only 2 lines: 0 confirmed (below 3-line threshold) + let gaps = vec![]; + + let mut lines = Vec::new(); + for _ in 0..2 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 0); + } + + #[test] + fn test_confirm_columns_exactly_three_lines() { + // Exactly 3 lines: confirmed (>= threshold) + let gaps = vec![]; + + let mut lines = Vec::new(); + for _ in 0..3 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 1); + } + + #[test] + fn test_confirm_columns_three_column_all_confirmed() { + // Three-column page with 10 lines each: all confirmed + let gaps = vec![ColumnGap::new(200, 219), ColumnGap::new(400, 419)]; + + let mut lines = Vec::new(); + // Column 0: lines at x0=50 (10 lines) + for _ in 0..10 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 150.0, 10.0]))); + } + // Column 1: lines at x0=250 (10 lines) + for _ in 0..10 { + 
lines.push(TestLineWithSpans::new(Some([250.0, 0.0, 350.0, 10.0]))); + } + // Column 2: lines at x0=450 (10 lines) + for _ in 0..10 { + lines.push(TestLineWithSpans::new(Some([450.0, 0.0, 550.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 3); + assert_eq!(confirmed[0].index, 0); + assert_eq!(confirmed[0].x_range, [0.0, 200.0]); + assert_eq!(confirmed[1].index, 1); + assert_eq!(confirmed[1].x_range, [220.0, 400.0]); + assert_eq!(confirmed[2].index, 2); + assert_eq!(confirmed[2].x_range, [420.0, 600.0]); + } + + #[test] + fn test_confirm_columns_three_column_middle_insufficient() { + // Three-column with middle column having only 2 lines: only outer confirmed + let gaps = vec![ColumnGap::new(200, 219), ColumnGap::new(400, 419)]; + + let mut lines = Vec::new(); + // Column 0: lines at x0=50 (10 lines) + for _ in 0..10 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 150.0, 10.0]))); + } + // Column 1: lines at x0=250 (2 lines) + for _ in 0..2 { + lines.push(TestLineWithSpans::new(Some([250.0, 0.0, 350.0, 10.0]))); + } + // Column 2: lines at x0=450 (10 lines) + for _ in 0..10 { + lines.push(TestLineWithSpans::new(Some([450.0, 0.0, 550.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 2); + assert_eq!(confirmed[0].index, 0); + assert_eq!(confirmed[0].x_range, [0.0, 200.0]); + assert_eq!(confirmed[1].index, 1); // Note: index is reassigned after filtering + assert_eq!(confirmed[1].x_range, [420.0, 600.0]); + } + + #[test] + fn test_confirm_columns_lines_in_gap_unassigned() { + // Lines whose first span is in a gap region are not counted + let gaps = vec![ColumnGap::new(200, 219)]; + + let mut lines = Vec::new(); + // Column 0: lines at x0=50 (5 lines) + for _ in 0..5 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 150.0, 10.0]))); + } + // Gap region: lines at x0=210 (3 lines) - should NOT be counted + for _ in 0..3 { + 
lines.push(TestLineWithSpans::new(Some([210.0, 0.0, 250.0, 10.0]))); + } + // Column 1: lines at x0=250 (5 lines) + for _ in 0..5 { + lines.push(TestLineWithSpans::new(Some([250.0, 0.0, 350.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 2); + // Both columns have 5 lines each (gap lines not counted) + assert_eq!(confirmed[0].x_range, [0.0, 200.0]); + assert_eq!(confirmed[1].x_range, [220.0, 600.0]); + } + + #[test] + fn test_confirm_columns_lines_with_no_spans() { + // Lines with no spans (first_span_bbox = None) are not counted + let gaps = vec![]; + + let mut lines = Vec::new(); + // 3 valid lines + for _ in 0..3 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 150.0, 10.0]))); + } + // 2 lines with no spans + for _ in 0..2 { + lines.push(TestLineWithSpans::new(None)); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + // Only 3 valid lines -> confirmed + assert_eq!(confirmed.len(), 1); + } + + #[test] + fn test_confirm_columns_leading_gap() { + // Leading gap (page margin) creates first column after gap + let gaps = vec![ColumnGap::new(0, 49)]; // leading margin + + let mut lines = Vec::new(); + // Lines in column starting at x0=50 (5 lines) + for _ in 0..5 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 1); + assert_eq!(confirmed[0].x_range, [50.0, 600.0]); + } + + #[test] + fn test_confirm_columns_trailing_gap() { + // Trailing gap (page margin) creates last column before gap + let gaps = vec![ColumnGap::new(550, 599)]; // trailing margin + + let mut lines = Vec::new(); + // Lines in column ending before x0=550 (5 lines) + for _ in 0..5 { + lines.push(TestLineWithSpans::new(Some([50.0, 0.0, 200.0, 10.0]))); + } + + let confirmed = confirm_columns(&gaps, 600.0, &lines); + + assert_eq!(confirmed.len(), 1); + assert_eq!(confirmed[0].x_range, [0.0, 550.0]); + } } 
diff --git a/notes/pdftract-2rkc1.md b/notes/pdftract-2rkc1.md new file mode 100644 index 0000000..cc6ca6e --- /dev/null +++ b/notes/pdftract-2rkc1.md @@ -0,0 +1,60 @@ +# pdftract-2rkc1: Column confirmation verification + +## Work completed + +The `confirm_columns` function has been implemented in `/home/coding/pdftract/crates/pdftract-core/src/layout/columns.rs` (lines 252-332). + +## Implementation details + +The implementation follows the algorithm specified in the plan: + +1. **No gaps case** (lines 257-274): Entire page is one candidate column. Counts lines whose first span's x0 falls within page bounds. Returns single column if >= 3 lines. + +2. **Candidate column construction** (lines 276-308): + - Before-first gap: `(0, gap_0.lo)` + - Between consecutive gaps: `(gap_i.hi + 1, gap_i+1.lo)` + - After-last gap: `(gap_last.hi + 1, page_width)` + +3. **Line counting** (lines 310-321): For each line, gets first span's bbox via `HasFirstSpan` trait, checks if x0 falls within candidate column range, counts unique lines. + +4. **Column promotion** (lines 324-329): Filters candidates with `line_count >= 3` to confirmed columns, reassigns indices left-to-right. + +## Supporting code added + +- `ColumnGap` struct (lines 89-116): Represents a gap in the x0 histogram with lo/hi bounds, width(), and midpoint() methods. +- `detect_column_gaps` function (lines 156-201): Finds zero-coverage regions >= 3% of page width. +- `HasFirstSpan` trait (lines 334-343): Trait for accessing first span's bbox. +- `CandidateColumn` struct (lines 203-213): Internal tracking of x_range and line_count. +- `Column` struct (lines 372-396): Confirmed column with index and x_range. 
+ +## Test results + +All 49 column tests pass, including all acceptance criteria: + +| Acceptance criteria | Test | Result | +|-------------------|------|--------| +| 2-column page with 30 lines each: both confirmed | `test_confirm_columns_two_column_both_confirmed` | PASS | +| 2-column page with 30 lines + 2 lines: only 30-line column confirmed | `test_confirm_columns_two_column_one_confirmed` | PASS | +| Single column: 1 candidate -> confirmed | `test_confirm_columns_single_column_confirmed` | PASS | +| Empty page: 0 confirmed | `test_confirm_columns_empty_page` | PASS | + +Additional edge cases tested: +- Exactly 3 lines (boundary case): PASS +- Leading/trailing gaps: PASS +- Lines in gap unassigned: PASS +- Lines with no spans: PASS +- Three-column layouts: PASS + +## Invariants verified + +- **INV: 3-line minimum**: The filter condition `c.line_count >= 3` is enforced at line 326. +- **Lines in gaps remain unassigned**: Lines whose first span's x0 falls in a gap region are not counted for any candidate column. +- **"First span" = leftmost post-sort**: The `HasFirstSpan` trait provides the first (leftmost) span's bbox; within-line sorting is assumed to be done before calling `confirm_columns`. + +## Command used + +```bash +cargo nextest run -p pdftract-core 'columns::' +``` + +Result: 49 passed, 2382 skipped (0 failed)