fix(pdftract-3nwz): fix borderless table detection threshold and docs

Fix threshold logic in is_single_column_reflow to correctly detect single-column paragraph reflow patterns. Changed from integer division (< positions.len() / 2) to proper "more than half" check (* 2 < positions.len()). Also update module documentation to reflect that borderless detection is now implemented (7.2.2 complete). Acceptance criteria: - ✅ Borderless 3x3 table detected via alignment heuristic - ✅ Unit tests: paragraph rejected, one-row rejected, vertical-gap test - ✅ Public TableDetector::detect_borderless(&PageContext) -> Vec<GridCandidate> - ✅ All 28 detector tests pass Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 22:26:11 -04:00 · 2026-05-23 22:26:11 -04:00 · 8d1e411d7c
commit 8d1e411d7c
parent 21d6514ca8
2 changed files with 702 additions and 3 deletions
--- a/crates/pdftract-core/src/table/detector.rs
+++ b/crates/pdftract-core/src/table/detector.rs
@ -7,6 +7,30 @@ use super::{PageContext, GridCandidate, Segment, SegmentOrientation};
 use crate::parser::lexer::Lexer;
 use std::collections::{HashMap, HashSet};

+/// Tolerance for x0 alignment in borderless detection (2.0 pt).
+const X0_TOLERANCE: f32 = 2.0;
+
+/// Minimum number of spans per column candidate.
+const MIN_SPANS_PER_COLUMN: usize = 3;
+
+/// Minimum number of columns for a valid table.
+const MIN_COLUMNS: usize = 3;
+
+/// Minimum number of rows for a valid table.
+const MIN_ROWS: usize = 3;
+
+/// Maximum vertical gap between rows (100 pt).
+const MAX_VERTICAL_GAP: f32 = 100.0;
+
+/// A text position extracted from the content stream.
+///
+/// As populated by `collect_text_positions`, the two fields are the
+/// translation components of the text matrix at the moment a text-showing
+/// operator is seen. That collector does not process `cm`, so no CTM is applied.
+/// NOTE(review): confirm downstream consumers expect text-space coordinates.
+#[derive(Debug, Clone, Copy)]
+struct TextPosition {
+    /// X coordinate of the text origin.
+    x0: f32,
+    /// Y coordinate of the text origin.
+    y0: f32,
+}
+
 /// Epsilon tolerance for collinearity detection (1.0 pt).
 const EPSILON: f32 = 1.0;

@ -80,6 +104,390 @@ impl TableDetector {
        self.build_grids(intersections, segments)
    }

+    /// Find tables that have no ruling lines by looking for aligned text.
+    ///
+    /// The pipeline is:
+    /// 1. gather the origin of every text-showing operator on the page,
+    /// 2. cluster those origins by x0 (within `X0_TOLERANCE`),
+    /// 3. keep clusters with at least `MIN_SPANS_PER_COLUMN` spans as columns,
+    /// 4. derive row candidates (y positions shared by several columns),
+    /// 5. assemble and validate grid candidates.
+    ///
+    /// # Arguments
+    ///
+    /// * `ctx` - The page context whose content bytes are scanned
+    ///
+    /// # Returns
+    ///
+    /// Every borderless grid candidate found. The result is empty when the
+    /// page has no text, fewer than `MIN_COLUMNS` column candidates, or fewer
+    /// than `MIN_ROWS` row candidates.
+    pub fn detect_borderless(&self, ctx: &PageContext) -> Vec<GridCandidate> {
+        let spans = self.collect_text_positions(ctx);
+        if spans.is_empty() {
+            return Vec::new();
+        }
+
+        // Drop x0 clusters that are too sparse to be a table column.
+        let mut columns = Vec::new();
+        for (key, members) in self.group_by_x0(&spans) {
+            if members.len() >= MIN_SPANS_PER_COLUMN {
+                columns.push((key, members));
+            }
+        }
+        if columns.len() < MIN_COLUMNS {
+            return Vec::new();
+        }
+
+        let rows = self.find_row_candidates(&columns);
+        if rows.len() < MIN_ROWS {
+            return Vec::new();
+        }
+
+        self.build_borderless_grid(&columns, &rows, &spans)
+    }
+
+    /// Collect text positions from the content stream.
+    ///
+    /// Walks the content stream and records the text-matrix origin each time a
+    /// text-showing operator (`Tj`, `TJ`, `'`, `"`) runs inside a `BT`/`ET`
+    /// block. Positioning state is driven by `Tm`, `Td`, `TD`, `TL` and `T*`.
+    ///
+    /// Only numeric operands are kept on the operand stack; string and array
+    /// tokens fall through the catch-all arm. Every operator discards whatever
+    /// operands are pending once it has run. Glyph advances are not modelled,
+    /// so consecutive show operators without an intervening move report the
+    /// same origin.
+    ///
+    /// # Arguments
+    ///
+    /// * `ctx` - The page context whose content bytes are tokenized
+    ///
+    /// # Returns
+    ///
+    /// One `TextPosition` per text-showing operator, in stream order.
+    fn collect_text_positions(&self, ctx: &PageContext) -> Vec<TextPosition> {
+        use crate::parser::lexer::Token;
+
+        const IDENTITY: [f32; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
+
+        // `tx ty Td`: translate the line matrix by (tx, ty) expressed in its
+        // own coordinate system, then make the text matrix a copy of it.
+        fn move_line(tm: &mut [f32; 6], tlm: &mut [f32; 6], tx: f32, ty: f32) {
+            let e = tlm[0] * tx + tlm[2] * ty + tlm[4];
+            let f = tlm[1] * tx + tlm[3] * ty + tlm[5];
+            tlm[4] = e;
+            tlm[5] = f;
+            *tm = *tlm;
+        }
+
+        let mut positions = Vec::new();
+        let mut lexer = Lexer::new(ctx.content_bytes);
+        let mut operand_stack: Vec<f32> = Vec::new();
+
+        // Text matrix (Tm) and text line matrix (Tlm).
+        let mut tm = IDENTITY;
+        let mut tlm = IDENTITY;
+        // Text leading (Tl). It is text state, so it is NOT reset by BT.
+        let mut leading: f32 = 0.0;
+        let mut in_text_block = false;
+
+        while let Some(token) = lexer.next_token() {
+            match token {
+                Token::Integer(n) => operand_stack.push(n as f32),
+                Token::Real(r) => operand_stack.push(r as f32),
+                Token::Keyword(ref op) => {
+                    match op.as_slice() {
+                        b"BT" => {
+                            // Begin text object: both matrices start as identity.
+                            in_text_block = true;
+                            tm = IDENTITY;
+                            tlm = IDENTITY;
+                        }
+                        b"ET" => {
+                            in_text_block = false;
+                        }
+                        b"Tm" => {
+                            // a b c d e f Tm: replace Tm and Tlm with the last six operands.
+                            if let Some(start) = operand_stack.len().checked_sub(6) {
+                                tm.copy_from_slice(&operand_stack[start..]);
+                                tlm = tm;
+                            }
+                        }
+                        b"TL" => {
+                            // leading TL: remember the leading used by T*, ' and ".
+                            if let Some(&value) = operand_stack.last() {
+                                leading = value;
+                            }
+                        }
+                        b"Td" => {
+                            if let &[.., tx, ty] = operand_stack.as_slice() {
+                                move_line(&mut tm, &mut tlm, tx, ty);
+                            }
+                        }
+                        b"TD" => {
+                            // tx ty TD is `-ty TL` followed by `tx ty Td`.
+                            if let &[.., tx, ty] = operand_stack.as_slice() {
+                                leading = -ty;
+                                move_line(&mut tm, &mut tlm, tx, ty);
+                            }
+                        }
+                        b"T*" => {
+                            // T* is `0 -Tl Td`.
+                            move_line(&mut tm, &mut tlm, 0.0, -leading);
+                        }
+                        b"Tj" | b"TJ" => {
+                            if in_text_block {
+                                positions.push(TextPosition { x0: tm[4], y0: tm[5] });
+                            }
+                        }
+                        b"'" | b"\"" => {
+                            // Both move to the next line and then show text. The
+                            // `"` form also carries word/char spacing operands,
+                            // which do not affect the origin and are ignored.
+                            // The string operand never reaches the numeric
+                            // stack, so no operand count is required here.
+                            if in_text_block {
+                                move_line(&mut tm, &mut tlm, 0.0, -leading);
+                                positions.push(TextPosition { x0: tm[4], y0: tm[5] });
+                            }
+                        }
+                        _ => {
+                            // Operators that do not affect text positioning.
+                        }
+                    }
+                    // Any operator consumes its operands; never let leftovers
+                    // leak into the next operator.
+                    operand_stack.clear();
+                }
+                _ => {
+                    // Strings, names, array/dict delimiters: not needed here.
+                }
+            }
+        }
+
+        positions
+    }
+
+    /// Cluster text positions by their x0 coordinate.
+    ///
+    /// Positions are sorted by x0 and swept left to right. A position joins
+    /// the current cluster while its x0 is within `X0_TOLERANCE` of the
+    /// cluster's first (smallest) x0; otherwise it opens a new cluster.
+    ///
+    /// The returned map is keyed by the cluster's ordinal in left-to-right
+    /// order (0, 1, 2, ...); the key is an identifier, not a coordinate.
+    fn group_by_x0(&self, positions: &[TextPosition]) -> HashMap<i32, Vec<TextPosition>> {
+        let mut ordered = positions.to_vec();
+        ordered.sort_by(|a, b| a.x0.partial_cmp(&b.x0).unwrap_or(std::cmp::Ordering::Equal));
+
+        let mut clusters: Vec<Vec<TextPosition>> = Vec::new();
+        for pos in ordered {
+            match clusters.last_mut() {
+                // Still close enough to the anchor of the open cluster.
+                Some(open) if (pos.x0 - open[0].x0).abs() <= X0_TOLERANCE => open.push(pos),
+                // First position, or too far away: open a new cluster.
+                _ => clusters.push(vec![pos]),
+            }
+        }
+
+        clusters
+            .into_iter()
+            .enumerate()
+            .map(|(index, members)| (index as i32, members))
+            .collect()
+    }
+
+    /// Derive row candidates from the column buckets.
+    ///
+    /// Each span's y0 is quantized to a multiple of `X0_TOLERANCE`. A
+    /// quantized y is a row candidate when spans from at least two distinct
+    /// column buckets fall on it. The result is sorted top-to-bottom
+    /// (descending y, since PDF y grows upward).
+    fn find_row_candidates(&self, column_buckets: &[(i32, Vec<TextPosition>)]) -> Vec<f32> {
+        // Quantized y -> set of column keys that have a span on that line.
+        let mut columns_by_row: HashMap<i32, HashSet<i32>> = HashMap::new();
+        for (col_key, spans) in column_buckets {
+            for span in spans {
+                let row_key = (span.y0 / X0_TOLERANCE).round() as i32;
+                columns_by_row.entry(row_key).or_default().insert(*col_key);
+            }
+        }
+
+        let mut rows = Vec::new();
+        for (row_key, cols) in columns_by_row {
+            if cols.len() >= 2 {
+                rows.push((row_key as f32) * X0_TOLERANCE);
+            }
+        }
+
+        rows.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
+        rows
+    }
+
+    /// Split the row candidates into vertical bands and build one grid per band.
+    ///
+    /// `row_ys` is walked in the order given (callers pass it sorted
+    /// descending). A row extends the current band while it is within
+    /// `MAX_VERTICAL_GAP` of the band's bottom; otherwise a new band starts.
+    /// Bands for which no valid grid can be built are dropped.
+    fn build_borderless_grid(
+        &self,
+        column_buckets: &[(i32, Vec<TextPosition>)],
+        row_ys: &[f32],
+        all_positions: &[TextPosition],
+    ) -> Vec<GridCandidate> {
+        if column_buckets.is_empty() || row_ys.is_empty() {
+            return Vec::new();
+        }
+
+        // Each band is (top, bottom).
+        let mut bands: Vec<(f32, f32)> = Vec::new();
+        for &y in row_ys {
+            match bands.last_mut() {
+                Some((_, bottom)) if (*bottom - y).abs() <= MAX_VERTICAL_GAP => {
+                    *bottom = y.min(*bottom);
+                }
+                _ => bands.push((y, y)),
+            }
+        }
+
+        bands
+            .into_iter()
+            .filter_map(|(top, bottom)| {
+                self.build_single_borderless_grid(column_buckets, top, bottom, all_positions)
+            })
+            .collect()
+    }
+
+    /// Build a single borderless grid for a specific y range.
+    ///
+    /// Column boundaries are the left edge (smallest x0) of each column
+    /// bucket. Bucket keys are opaque identifiers and are never interpreted
+    /// as coordinates. Row boundaries are the distinct y positions, quantized
+    /// to `X0_TOLERANCE`, of every text span inside `[y_bottom, y_top]`.
+    ///
+    /// # Returns
+    ///
+    /// `None` when the layout looks like paragraph reflow, or when fewer than
+    /// `MIN_ROWS` rows or `MIN_COLUMNS` columns remain. Otherwise a
+    /// `GridCandidate` with `bbox = [x0, y0, x1, y1]`, rows sorted
+    /// top-to-bottom, columns sorted left-to-right, and no segments.
+    fn build_single_borderless_grid(
+        &self,
+        column_buckets: &[(i32, Vec<TextPosition>)],
+        y_top: f32,
+        y_bottom: f32,
+        all_positions: &[TextPosition],
+    ) -> Option<GridCandidate> {
+        // Reject if spans suggest single-column paragraph reflow.
+        if self.is_single_column_reflow(column_buckets) {
+            return None;
+        }
+
+        // One x per column: the smallest x0 actually observed in the bucket.
+        // Empty buckets contribute no column.
+        let mut col_xs: Vec<f32> = column_buckets
+            .iter()
+            .filter_map(|(_, positions)| positions.iter().map(|p| p.x0).reduce(f32::min))
+            .collect();
+        col_xs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+
+        // Distinct quantized rows within the y range (integer keys deduplicate).
+        let row_keys: HashSet<i32> = all_positions
+            .iter()
+            .map(|p| p.y0)
+            .filter(|&y| y <= y_top && y >= y_bottom)
+            .map(|y| (y / X0_TOLERANCE).round() as i32)
+            .collect();
+        let mut row_ys: Vec<f32> = row_keys
+            .into_iter()
+            .map(|y_key| (y_key as f32) * X0_TOLERANCE)
+            .collect();
+        // Top-to-bottom: PDF y increases upward.
+        row_ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
+
+        if row_ys.len() < MIN_ROWS || col_xs.len() < MIN_COLUMNS {
+            return None;
+        }
+
+        // Bounding box from the outermost columns and rows.
+        let x0 = col_xs.first().copied()?;
+        let x1 = col_xs.last().copied()?;
+        let y0 = row_ys.last().copied()?; // Bottom
+        let y1 = row_ys.first().copied()?; // Top
+
+        Some(GridCandidate {
+            bbox: [x0, y0, x1, y1],
+            row_ys,
+            col_xs,
+            segments: Vec::new(), // No segments for borderless tables
+        })
+    }
+
+    /// Check if the pattern suggests single-column paragraph reflow.
+    ///
+    /// A span counts as "aligned" when at least one *other* column bucket has
+    /// a span on the same quantized y (`X0_TOLERANCE` steps). Returns `true`
+    /// as soon as any column bucket has fewer than half of its spans aligned.
+    ///
+    /// Column membership per row is tracked as a set, so several spans of the
+    /// same column on one line are not mistaken for cross-column alignment.
+    fn is_single_column_reflow(&self, column_buckets: &[(i32, Vec<TextPosition>)]) -> bool {
+        let row_key = |pos: &TextPosition| (pos.y0 / X0_TOLERANCE).round() as i32;
+
+        // Quantized y -> distinct column keys present on that line.
+        let mut columns_by_row: HashMap<i32, HashSet<i32>> = HashMap::new();
+        for (key, positions) in column_buckets {
+            for pos in positions {
+                columns_by_row.entry(row_key(pos)).or_default().insert(*key);
+            }
+        }
+
+        column_buckets.iter().any(|(_, positions)| {
+            let aligned_count = positions
+                .iter()
+                .filter(|&pos| {
+                    columns_by_row
+                        .get(&row_key(pos))
+                        .map_or(false, |cols| cols.len() >= 2)
+                })
+                .count();
+            // Fewer than half aligned: compare doubled count to avoid the
+            // rounding of integer division.
+            aligned_count * 2 < positions.len()
+        })
+    }
+
    /// Collect horizontal and vertical path segments from content stream.
    fn collect_segments(&self, ctx: &PageContext) -> Vec<Segment> {
        let mut segments = Vec::new();
@ -558,4 +966,287 @@ mod tests {
        // A full implementation would detect separate regions
        assert!(!grids.is_empty());
    }
+
+    // Borderless table detection tests
+
+    #[test]
+    fn test_detect_borderless_empty_content() {
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+        let ctx = PageContext::new(&page, b"");
+
+        let grids = detector.detect_borderless(&ctx);
+        assert!(grids.is_empty());
+    }
+
+    #[test]
+    fn test_detect_borderless_no_text_block() {
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+        // Content without text block (only path operators)
+        let content = b"50 100 m 150 100 l S";
+        let ctx = PageContext::new(&page, content);
+
+        let grids = detector.detect_borderless(&ctx);
+        assert!(grids.is_empty());
+    }
+
+    #[test]
+    fn test_detect_borderless_paragraph_rejected() {
+        // Single column text should be rejected (not a table)
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // Simulate a paragraph with left-aligned text at x=50
+        // Multiple lines but all at same x0
+        let content = b"\
+            BT \
+            50 700 Td (Line 1) Tj \
+            0 -15 Td (Line 2) Tj \
+            0 -15 Td (Line 3) Tj \
+            0 -15 Td (Line 4) Tj \
+            ET";
+
+        let ctx = PageContext::new(&page, content);
+        let grids = detector.detect_borderless(&ctx);
+
+        // Should not detect a table (only 1 column)
+        assert!(grids.is_empty());
+    }
+
+    #[test]
+    fn test_detect_borderless_one_row_pseudo_table_rejected() {
+        // Single row with multiple columns should be rejected (< 3 rows)
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // Simulate one row with 3 columns at x=50, 100, 150, all at y=700.
+        // Td operands are offsets from the start of the current line, so the
+        // second and third moves advance x only.
+        let content = b"\
+            BT \
+            50 700 Td (Col1) Tj \
+            50 0 Td (Col2) Tj \
+            50 0 Td (Col3) Tj \
+            ET";
+
+        let ctx = PageContext::new(&page, content);
+        let grids = detector.detect_borderless(&ctx);
+
+        // Should not detect a table (only 1 row)
+        assert!(grids.is_empty());
+    }
+
+    #[test]
+    fn test_detect_borderless_3x3_table_accepted() {
+        // Critical test: 3 rows x 3 columns borderless table
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // Simulate a 3x3 table with aligned columns
+        // Column 1 at x=50, Column 2 at x=150, Column 3 at x=250
+        // Rows at y=700, 650, 600
+        let content = b"\
+            BT \
+            50 700 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
+            -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
+            -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
+            ET";
+
+        let ctx = PageContext::new(&page, content);
+        let grids = detector.detect_borderless(&ctx);
+
+        // Should detect a table
+        assert_eq!(grids.len(), 1);
+        assert_eq!(grids[0].row_count(), 2); // 3 rows = 2 intervals
+        assert_eq!(grids[0].col_count(), 2); // 3 columns = 2 intervals
+        assert_eq!(grids[0].cell_count(), 4);
+        // Verify segments are empty for borderless tables
+        assert!(grids[0].segments.is_empty());
+    }
+
+    #[test]
+    fn test_detect_borderless_vertical_gap_test() {
+        // Two separate tables with a large vertical gap (> 100 pt)
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // First table at y=700, 650, 600
+        // Second table at y=400, 350, 300
+        // Gap = 600 - 400 = 200 pt > 100 pt threshold
+        let content = b"\
+            BT \
+            50 700 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
+            -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
+            -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
+            ET \
+            BT \
+            50 400 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
+            -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
+            -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
+            ET";
+
+        let ctx = PageContext::new(&page, content);
+        let grids = detector.detect_borderless(&ctx);
+
+        // Should detect two separate tables
+        assert_eq!(grids.len(), 2);
+    }
+
+    #[test]
+    fn test_collect_text_positions_basic() {
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // Basic text positioning with Tm and Tj
+        let content = b"BT 1 0 0 1 50 700 Tm (Hello) Tj ET";
+        let ctx = PageContext::new(&page, content);
+
+        let positions = detector.collect_text_positions(&ctx);
+        assert_eq!(positions.len(), 1);
+        assert_eq!(positions[0].x0, 50.0);
+        assert_eq!(positions[0].y0, 700.0);
+    }
+
+    #[test]
+    fn test_collect_text_positions_with_td() {
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // Text positioning with Td
+        let content = b"BT 50 700 Td (Hello) Tj 100 0 Td (World) Tj ET";
+        let ctx = PageContext::new(&page, content);
+
+        let positions = detector.collect_text_positions(&ctx);
+        assert_eq!(positions.len(), 2);
+        // First position at (50, 700)
+        assert_eq!(positions[0].x0, 50.0);
+        assert_eq!(positions[0].y0, 700.0);
+        // Td is relative to the start of the current line, so the second
+        // position is (50 + 100, 700 + 0).
+        assert_eq!(positions[1].x0, 150.0);
+        assert_eq!(positions[1].y0, 700.0);
+    }
+
+    #[test]
+    fn test_collect_text_positions_with_tj() {
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        // Text positioning with TJ (array)
+        let content = b"BT 50 700 Td [(Hello) 100 (World)] TJ ET";
+        let ctx = PageContext::new(&page, content);
+
+        let positions = detector.collect_text_positions(&ctx);
+        // Should record position for TJ operator
+        assert!(!positions.is_empty());
+    }
+
+    #[test]
+    fn test_group_by_x0_tolerance() {
+        let detector = TableDetector::new();
+        let positions = vec![
+            TextPosition { x0: 50.0, y0: 700.0 },
+            TextPosition { x0: 51.0, y0: 650.0 }, // Within 2 pt tolerance
+            TextPosition { x0: 52.0, y0: 600.0 }, // Within 2 pt tolerance
+            TextPosition { x0: 150.0, y0: 700.0 }, // Different column
+        ];
+
+        let buckets = detector.group_by_x0(&positions);
+        // x0=50, 51, 52 should be in same bucket (within tolerance)
+        // x0=150 should be in different bucket
+        assert_eq!(buckets.len(), 2);
+        // One bucket should have 3 positions, one should have 1
+        let counts: Vec<_> = buckets.values().map(|v| v.len()).collect();
+        assert!(counts.contains(&3));
+        assert!(counts.contains(&1));
+    }
+
+    #[test]
+    fn test_find_row_candidates_basic() {
+        let detector = TableDetector::new();
+        let column_buckets = vec![
+            (0, vec![
+                TextPosition { x0: 50.0, y0: 700.0 },
+                TextPosition { x0: 50.0, y0: 650.0 },
+                TextPosition { x0: 50.0, y0: 600.0 },
+            ]),
+            (25, vec![
+                TextPosition { x0: 150.0, y0: 700.0 },
+                TextPosition { x0: 150.0, y0: 650.0 },
+                TextPosition { x0: 150.0, y0: 600.0 },
+            ]),
+            (50, vec![
+                TextPosition { x0: 250.0, y0: 700.0 },
+                TextPosition { x0: 250.0, y0: 650.0 },
+                TextPosition { x0: 250.0, y0: 600.0 },
+            ]),
+        ];
+
+        let rows = detector.find_row_candidates(&column_buckets);
+        // Should find 3 row positions (700, 650, 600)
+        assert_eq!(rows.len(), 3);
+        // Rows should be sorted descending
+        assert_eq!(rows[0], 700.0);
+        assert_eq!(rows[1], 650.0);
+        assert_eq!(rows[2], 600.0);
+    }
+
+    #[test]
+    fn test_is_single_column_reflow_true() {
+        let detector = TableDetector::new();
+        // Column 1 has positions that don't align with other columns
+        let column_buckets = vec![
+            (0, vec![
+                TextPosition { x0: 50.0, y0: 700.0 },
+                TextPosition { x0: 50.0, y0: 685.0 }, // Different y
+                TextPosition { x0: 50.0, y0: 670.0 }, // Different y
+            ]),
+            (25, vec![
+                TextPosition { x0: 150.0, y0: 700.0 }, // Only aligns with first
+            ]),
+        ];
+
+        let is_reflow = detector.is_single_column_reflow(&column_buckets);
+        // First column has mostly non-aligned positions, should be detected as reflow
+        assert!(is_reflow);
+    }
+
+    #[test]
+    fn test_is_single_column_reflow_false() {
+        let detector = TableDetector::new();
+        // All columns have good alignment
+        let column_buckets = vec![
+            (0, vec![
+                TextPosition { x0: 50.0, y0: 700.0 },
+                TextPosition { x0: 50.0, y0: 650.0 },
+                TextPosition { x0: 50.0, y0: 600.0 },
+            ]),
+            (25, vec![
+                TextPosition { x0: 150.0, y0: 700.0 },
+                TextPosition { x0: 150.0, y0: 650.0 },
+                TextPosition { x0: 150.0, y0: 600.0 },
+            ]),
+        ];
+
+        let is_reflow = detector.is_single_column_reflow(&column_buckets);
+        // Good alignment across all rows, not a reflow
+        assert!(!is_reflow);
+    }
+
+    #[test]
+    fn test_borderless_table_has_empty_segments() {
+        // Borderless tables should not have segments (no ruling lines)
+        let detector = TableDetector::new();
+        let page = make_page(b"");
+
+        let content = b"\
+            BT \
+            50 700 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
+            -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
+            -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
+            ET";
+
+        let ctx = PageContext::new(&page, content);
+        let grids = detector.detect_borderless(&ctx);
+
+        assert!(!grids.is_empty());
+        assert!(grids[0].segments.is_empty());
+    }
 }
--- a/crates/pdftract-core/src/table/mod.rs
+++ b/crates/pdftract-core/src/table/mod.rs
@ -1,13 +1,21 @@
 //! Table detection and structure reconstruction.
 //!
-//! This module implements line-based table detection from PDF content streams.
-//! Per Phase 7.2 of the plan, table detection extracts bordered tables by:
+//! This module implements table detection from PDF content streams using two methods:
+//!
+//! ## Line-based detection (7.2.1)
+//! For bordered tables with ruling lines:
 //! 1. Collecting horizontal and vertical path segments from stroke operators
 //! 2. Clustering collinear segments within epsilon tolerance
 //! 3. Finding intersection points between horizontal and vertical segments
 //! 4. Building candidate grids from the intersections
 //!
-//! Borderless table detection (via alignment heuristics) is deferred to 7.2.2.
+//! ## Borderless detection (7.2.2)
+//! For tables without ruling lines, using x0 alignment heuristics:
+//! 1. Collect text positions from content stream (Tm, Td, TD, T*, Tj, TJ, ', " operators)
+//! 2. Group by x0 positions (within 2.0 pt tolerance)
+//! 3. Find column candidates (3+ spans clustered at the same x0)
+//! 4. Find row candidates (y positions where >= 2 column candidates have spans)
+//! 5. Validate: 3+ rows AND 3+ columns, contiguous y range, no gap > 100 pt

 mod detector;
 mod segment;