fix(pdftract-3nwz): fix borderless table detection threshold and docs

Fix threshold logic in is_single_column_reflow to correctly detect
single-column paragraph reflow patterns. Changed from integer division
(aligned_count < positions.len() / 2), which truncates for odd span counts,
to an exact "fewer than half aligned" check (aligned_count * 2 < positions.len()).

Also update module documentation to reflect that borderless detection
is now implemented (7.2.2 complete).

Acceptance criteria:
-  Borderless 3x3 table detected via alignment heuristic
-  Unit tests: paragraph rejected, one-row rejected, vertical-gap test
-  Public TableDetector::detect_borderless(&PageContext) -> Vec<GridCandidate>
-  All 28 detector tests pass

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-23 22:26:11 -04:00
parent 21d6514ca8
commit 8d1e411d7c
2 changed files with 702 additions and 3 deletions

View file

@ -7,6 +7,30 @@ use super::{PageContext, GridCandidate, Segment, SegmentOrientation};
use crate::parser::lexer::Lexer;
use std::collections::{HashMap, HashSet};
/// Tolerance for x0 alignment in borderless detection (2.0 pt).
///
/// Spans whose x0 lies within this distance of a cluster's first member are
/// grouped into the same column. The row helpers in this file also reuse the
/// value as the quantization step for y coordinates.
const X0_TOLERANCE: f32 = 2.0;
/// Minimum number of spans per column candidate.
///
/// x0 clusters with fewer spans are discarded before rows are searched.
const MIN_SPANS_PER_COLUMN: usize = 3;
/// Minimum number of columns for a valid table.
const MIN_COLUMNS: usize = 3;
/// Minimum number of rows for a valid table.
const MIN_ROWS: usize = 3;
/// Maximum vertical gap between rows (100 pt).
///
/// Consecutive row candidates further apart than this are split into
/// separate grid candidates.
const MAX_VERTICAL_GAP: f32 = 100.0;
/// A text position extracted from the content stream.
///
/// One value is recorded each time a text-showing operator is met inside a
/// `BT`/`ET` block. The coordinates are copied from the translation
/// components of the tracked text matrix.
// NOTE(review): no `cm` handling is visible in the collector, so these look
// like text-space rather than page-space coordinates; confirm against callers.
#[derive(Debug, Clone, Copy)]
struct TextPosition {
/// X coordinate of the text origin.
x0: f32,
/// Y coordinate of the text origin.
y0: f32,
}
/// Epsilon tolerance for collinearity detection (1.0 pt).
const EPSILON: f32 = 1.0;
@ -80,6 +104,390 @@ impl TableDetector {
self.build_grids(intersections, segments)
}
/// Detect tables that have no ruling lines by looking for aligned text.
///
/// The pipeline is:
/// 1. Extract the origin of every shown text span from the content stream.
/// 2. Cluster those origins by x0 (within `X0_TOLERANCE`).
/// 3. Keep clusters holding at least `MIN_SPANS_PER_COLUMN` spans as column
///    candidates.
/// 4. Derive row candidates from y positions shared by several columns.
/// 5. Assemble grid candidates from the surviving columns and rows.
///
/// # Arguments
///
/// * `ctx` - The page context whose content bytes are scanned for text operators
///
/// # Returns
///
/// A vector of grid candidates, one per detected borderless table. The vector
/// is empty when the page shows no text, yields fewer than `MIN_COLUMNS`
/// column candidates, or yields fewer than `MIN_ROWS` row candidates.
pub fn detect_borderless(&self, ctx: &PageContext) -> Vec<GridCandidate> {
    let spans = self.collect_text_positions(ctx);
    if spans.is_empty() {
        return Vec::new();
    }

    // Only x0 clusters with enough spans can be table columns.
    let mut columns = Vec::new();
    for (key, members) in self.group_by_x0(&spans) {
        if members.len() >= MIN_SPANS_PER_COLUMN {
            columns.push((key, members));
        }
    }
    if columns.len() < MIN_COLUMNS {
        return Vec::new();
    }

    let rows = self.find_row_candidates(&columns);
    if rows.len() < MIN_ROWS {
        return Vec::new();
    }

    self.build_borderless_grid(&columns, &rows, &spans)
}
/// Collect text positions from the content stream.
///
/// Walks the token stream and tracks the text matrix (`Tm`) and text line
/// matrix (`Tlm`) through the positioning operators `Tm`, `Td`, `TD`, `TL`
/// and `T*`. Every text-showing operator (`Tj`, `TJ`, `'`, `"`) met inside a
/// `BT`/`ET` block records the current text origin.
///
/// Text leading is tracked so that `T*`, `'` and `"` really move to the next
/// line: `TL` sets it directly and `TD` sets it to `-ty`. Without this, lines
/// laid out with `T*` would all report the same y coordinate.
///
/// Only numeric operands are kept on the operand stack; strings and arrays
/// are not. The `"` operator therefore sees just its two spacing numbers and
/// must not require a third operand. The stack is cleared after every
/// operator because an operator consumes all of its operands.
///
/// Limitations: glyph advances are not tracked, so several show operators
/// without an intervening positioning operator record the same origin.
///
/// # Arguments
///
/// * `ctx` - The page context providing the content bytes to scan
///
/// # Returns
///
/// The recorded text origins, in content-stream order.
fn collect_text_positions(&self, ctx: &PageContext) -> Vec<TextPosition> {
    const IDENTITY: [f32; 6] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];

    // Computes [1 0 0 1 tx ty] x line_matrix, i.e. the line matrix moved by
    // (tx, ty) expressed in its own coordinate system.
    fn translate(line_matrix: &[f32; 6], tx: f32, ty: f32) -> [f32; 6] {
        let [a, b, c, d, e, f] = *line_matrix;
        [a, b, c, d, a * tx + c * ty + e, b * tx + d * ty + f]
    }

    let mut positions = Vec::new();
    let mut lexer = Lexer::new(ctx.content_bytes);
    let mut operand_stack: Vec<f32> = Vec::new();

    // Current text matrix (Tm), line matrix (Tlm) and leading (Tl).
    let mut tm = IDENTITY;
    let mut tlm = IDENTITY;
    let mut leading: f32 = 0.0;
    let mut in_text_block = false;

    while let Some(token) = lexer.next_token() {
        match token {
            crate::parser::lexer::Token::Integer(n) => {
                operand_stack.push(n as f32);
            }
            crate::parser::lexer::Token::Real(r) => {
                operand_stack.push(r as f32);
            }
            crate::parser::lexer::Token::Keyword(ref op) => {
                match op.as_slice() {
                    b"BT" => {
                        // A text object starts with identity Tm and Tlm.
                        in_text_block = true;
                        tm = IDENTITY;
                        tlm = IDENTITY;
                    }
                    b"ET" => {
                        in_text_block = false;
                    }
                    b"Tm" => {
                        // a b c d e f Tm: replaces both Tm and Tlm.
                        if let &[.., a, b, c, d, e, f] = operand_stack.as_slice() {
                            tm = [a, b, c, d, e, f];
                            tlm = tm;
                        }
                    }
                    b"TL" => {
                        // leading TL: vertical distance used by T*, ' and ".
                        if let Some(&value) = operand_stack.last() {
                            leading = value;
                        }
                    }
                    b"Td" => {
                        // tx ty Td: move to the start of the next line.
                        if let &[.., tx, ty] = operand_stack.as_slice() {
                            tm = translate(&tlm, tx, ty);
                            tlm = tm;
                        }
                    }
                    b"TD" => {
                        // tx ty TD: same as `-ty TL` followed by `tx ty Td`.
                        if let &[.., tx, ty] = operand_stack.as_slice() {
                            leading = -ty;
                            tm = translate(&tlm, tx, ty);
                            tlm = tm;
                        }
                    }
                    b"T*" => {
                        // T*: same as `0 -Tl Td`.
                        tm = translate(&tlm, 0.0, -leading);
                        tlm = tm;
                    }
                    b"Tj" | b"TJ" => {
                        // Show text: record the current text origin.
                        if in_text_block {
                            positions.push(TextPosition { x0: tm[4], y0: tm[5] });
                        }
                    }
                    b"'" | b"\"" => {
                        // Both move to the next line (like T*) and then show
                        // text. The spacing operands of `"` do not affect the
                        // origin and are simply discarded.
                        if in_text_block {
                            tm = translate(&tlm, 0.0, -leading);
                            tlm = tm;
                            positions.push(TextPosition { x0: tm[4], y0: tm[5] });
                        }
                    }
                    _ => {
                        // Operators that do not affect text positioning.
                    }
                }
                // Every operator consumes its operands; never let them leak
                // into the next operator.
                operand_stack.clear();
            }
            _ => {
                // Strings, names, array delimiters, ... carry no position data.
            }
        }
    }

    positions
}
/// Group text positions by x0 coordinate within tolerance.
///
/// The positions are sorted by x0 and swept left to right. A position joins
/// the open cluster while its x0 is within `X0_TOLERANCE` of that cluster's
/// first (leftmost) member; otherwise it opens a new cluster. Comparing
/// against the first member keeps a cluster from drifting wider than the
/// tolerance.
///
/// # Returns
///
/// A map from sequential cluster index (0 = leftmost cluster) to the
/// positions in that cluster. The keys are indices, not coordinates.
fn group_by_x0(&self, positions: &[TextPosition]) -> HashMap<i32, Vec<TextPosition>> {
    let mut by_x = positions.to_vec();
    by_x.sort_by(|lhs, rhs| lhs.x0.partial_cmp(&rhs.x0).unwrap_or(std::cmp::Ordering::Equal));

    let mut clusters: Vec<Vec<TextPosition>> = Vec::new();
    for pos in by_x {
        let joins_open_cluster = clusters
            .last()
            .map_or(false, |open| (pos.x0 - open[0].x0).abs() <= X0_TOLERANCE);
        if joins_open_cluster {
            if let Some(open) = clusters.last_mut() {
                open.push(pos);
            }
        } else {
            clusters.push(vec![pos]);
        }
    }

    clusters
        .into_iter()
        .enumerate()
        .map(|(index, members)| (index as i32, members))
        .collect()
}
/// Find row candidates from column buckets.
///
/// Each span's y0 is snapped to a multiple of `X0_TOLERANCE`. A snapped y
/// becomes a row candidate when spans from at least two distinct column
/// candidates land on it.
///
/// # Returns
///
/// The snapped y values of all row candidates, sorted in descending order
/// (top of the page first, since PDF y grows upward).
fn find_row_candidates(&self, column_buckets: &[(i32, Vec<TextPosition>)]) -> Vec<f32> {
    // Snapped y key -> set of column keys that have a span on that line.
    let mut columns_at_y: HashMap<i32, HashSet<i32>> = HashMap::new();
    for (column_key, spans) in column_buckets {
        for span in spans {
            let y_key = (span.y0 / X0_TOLERANCE).round() as i32;
            columns_at_y.entry(y_key).or_default().insert(*column_key);
        }
    }

    let mut shared_rows: Vec<f32> = Vec::new();
    for (y_key, columns) in columns_at_y {
        if columns.len() >= 2 {
            shared_rows.push((y_key as f32) * X0_TOLERANCE);
        }
    }

    shared_rows.sort_by(|upper, lower| lower.partial_cmp(upper).unwrap_or(std::cmp::Ordering::Equal));
    shared_rows
}
/// Build borderless grids from column and row candidates.
///
/// The row candidates are split into vertical runs wherever two consecutive
/// rows are more than `MAX_VERTICAL_GAP` apart. Each run is then handed to
/// `build_single_borderless_grid`, and runs that do not form a valid grid
/// are dropped.
///
/// `row_ys` is walked in the given order; `detect_borderless` passes it
/// sorted descending, as produced by `find_row_candidates`.
fn build_borderless_grid(
    &self,
    column_buckets: &[(i32, Vec<TextPosition>)],
    row_ys: &[f32],
    all_positions: &[TextPosition],
) -> Vec<GridCandidate> {
    if row_ys.is_empty() || column_buckets.is_empty() {
        return Vec::new();
    }

    // Each entry is (top, bottom) of one contiguous run of rows.
    let mut y_ranges: Vec<(f32, f32)> = vec![(row_ys[0], row_ys[0])];
    for &y in &row_ys[1..] {
        let open = y_ranges.len() - 1;
        let open_bottom = y_ranges[open].1;
        if (open_bottom - y).abs() <= MAX_VERTICAL_GAP {
            y_ranges[open].1 = y.min(open_bottom);
        } else {
            y_ranges.push((y, y));
        }
    }

    y_ranges
        .into_iter()
        .filter_map(|(y_top, y_bottom)| {
            self.build_single_borderless_grid(column_buckets, y_top, y_bottom, all_positions)
        })
        .collect()
}
/// Build a single borderless grid for a specific y range.
fn build_single_borderless_grid(
&self,
column_buckets: &[(i32, Vec<TextPosition>)],
y_top: f32,
y_bottom: f32,
all_positions: &[TextPosition],
) -> Option<GridCandidate> {
// Get sorted column x positions
let mut col_xs: Vec<f32> = column_buckets
.iter()
.map(|(key, _)| (*key as f32) * X0_TOLERANCE)
.collect();
col_xs.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
// Filter rows to within y range (use integer keys for deduplication)
let row_ys: Vec<f32> = all_positions
.iter()
.map(|p| p.y0)
.filter(|&y| y <= y_top && y >= y_bottom)
.map(|y| (y / X0_TOLERANCE).round() as i32)
.collect::<std::collections::HashSet<_>>()
.into_iter()
.map(|y_key| (y_key as f32) * X0_TOLERANCE)
.collect::<Vec<_>>();
let mut row_ys_sorted = row_ys.clone();
row_ys_sorted.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
if row_ys_sorted.len() < MIN_ROWS || col_xs.len() < MIN_COLUMNS {
return None;
}
// Compute bounding box
let x0 = col_xs.first().copied()?;
let x1 = col_xs.last().copied()?;
let y0 = row_ys_sorted.last().copied()?; // Bottom
let y1 = row_ys_sorted.first().copied()?; // Top
// Reject if spans suggest single-column paragraph reflow
if self.is_single_column_reflow(column_buckets) {
return None;
}
let bbox = [x0, y0, x1, y1];
Some(GridCandidate {
bbox,
row_ys: row_ys_sorted,
col_xs,
segments: Vec::new(), // No segments for borderless tables
})
}
/// Check if the pattern suggests single-column paragraph reflow.
///
/// Returns true if any column candidate's spans are all on consecutive
/// lines without aligned neighbors in any other column candidate.
fn is_single_column_reflow(&self, column_buckets: &[(i32, Vec<TextPosition>)]) -> bool {
// Build a map of y positions to column keys
let mut y_to_columns: HashMap<i32, Vec<i32>> = HashMap::new();
for &(key, ref positions) in column_buckets {
for pos in positions {
let y_key = (pos.y0 / X0_TOLERANCE).round() as i32;
y_to_columns
.entry(y_key)
.or_insert_with(Vec::new)
.push(key);
}
}
// For each column, check if its y positions lack multi-column alignment
for &(_key, ref positions) in column_buckets {
let mut aligned_count = 0;
for pos in positions {
let y_key = (pos.y0 / X0_TOLERANCE).round() as i32;
if let Some(cols) = y_to_columns.get(&y_key) {
if cols.len() >= 2 {
aligned_count += 1;
}
}
}
// If most spans in this column are not aligned with other columns, reject
// "Most" means more than half, so we check if aligned_count * 2 < positions.len()
if aligned_count * 2 < positions.len() {
return true;
}
}
false
}
/// Collect horizontal and vertical path segments from content stream.
fn collect_segments(&self, ctx: &PageContext) -> Vec<Segment> {
let mut segments = Vec::new();
@ -558,4 +966,287 @@ mod tests {
// A full implementation would detect separate regions
assert!(!grids.is_empty());
}
// Borderless table detection tests
#[test]
fn test_detect_borderless_empty_content() {
    // An empty content stream has no text spans, so nothing can be detected.
    let page = make_page(b"");
    let ctx = PageContext::new(&page, b"");
    let detector = TableDetector::new();

    let tables = detector.detect_borderless(&ctx);

    assert!(tables.is_empty());
}
#[test]
fn test_detect_borderless_no_text_block() {
    // Path operators only: there is no BT/ET block, hence no text spans.
    let stream = b"50 100 m 150 100 l S";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let tables = detector.detect_borderless(&ctx);

    assert!(tables.is_empty());
}
#[test]
fn test_detect_borderless_paragraph_rejected() {
    // A left-aligned paragraph: four lines that all start at x = 50.
    // It forms a single x0 cluster, which is not enough columns for a table.
    let stream = b"\
        BT \
        50 700 Td (Line 1) Tj \
        0 -15 Td (Line 2) Tj \
        0 -15 Td (Line 3) Tj \
        0 -15 Td (Line 4) Tj \
        ET";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let tables = detector.detect_borderless(&ctx);

    assert!(tables.is_empty());
}
#[test]
fn test_detect_borderless_one_row_pseudo_table_rejected() {
    // A single line with three horizontally separated spans must not be
    // reported as a table.
    let detector = TableDetector::new();
    let page = make_page(b"");
    // `Td` offsets are relative to the previous line origin, so the spans
    // land at (50, 700), (100, 700) and (150, 700): one row, three columns.
    let content = b"\
        BT \
        50 700 Td (Col1) Tj \
        50 0 Td (Col2) Tj \
        50 0 Td (Col3) Tj \
        ET";
    let ctx = PageContext::new(&page, content);
    let grids = detector.detect_borderless(&ctx);
    // Should not detect a table (only 1 row)
    assert!(grids.is_empty());
}
#[test]
fn test_detect_borderless_3x3_table_accepted() {
    // Critical case: a borderless table with 3 rows and 3 columns.
    // Columns start at x = 50, 150, 250; rows sit at y = 700, 650, 600.
    let stream = b"\
        BT \
        50 700 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
        -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
        -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
        ET";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let tables = detector.detect_borderless(&ctx);

    assert_eq!(tables.len(), 1);
    let table = &tables[0];
    assert_eq!(table.row_count(), 2); // 3 rows = 2 intervals
    assert_eq!(table.col_count(), 2); // 3 columns = 2 intervals
    assert_eq!(table.cell_count(), 4);
    // Borderless tables carry no ruling-line segments.
    assert!(table.segments.is_empty());
}
#[test]
fn test_detect_borderless_vertical_gap_test() {
    // Two 3x3 tables sharing the same columns:
    //   first  at y = 700, 650, 600
    //   second at y = 400, 350, 300
    // The 200 pt gap between them exceeds the 100 pt threshold, so they must
    // be reported as separate grids.
    let stream = b"\
        BT \
        50 700 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
        -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
        -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
        ET \
        BT \
        50 400 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
        -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
        -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
        ET";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let tables = detector.detect_borderless(&ctx);

    assert_eq!(tables.len(), 2);
}
#[test]
fn test_collect_text_positions_basic() {
    // `Tm` places the text origin at (50, 700); `Tj` records it.
    let stream = b"BT 1 0 0 1 50 700 Tm (Hello) Tj ET";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let spans = detector.collect_text_positions(&ctx);

    assert_eq!(spans.len(), 1);
    let origin = spans[0];
    assert_eq!(origin.x0, 50.0);
    assert_eq!(origin.y0, 700.0);
}
#[test]
fn test_collect_text_positions_with_td() {
    let detector = TableDetector::new();
    let page = make_page(b"");
    // Text positioning with Td
    let content = b"BT 50 700 Td (Hello) Tj 100 0 Td (World) Tj ET";
    let ctx = PageContext::new(&page, content);
    let positions = detector.collect_text_positions(&ctx);
    assert_eq!(positions.len(), 2);
    // First position at (50, 700)
    assert_eq!(positions[0].x0, 50.0);
    assert_eq!(positions[0].y0, 700.0);
    // Td is relative to the previous line origin, so the second span lands
    // at (50 + 100, 700 + 0).
    assert_eq!(positions[1].x0, 150.0);
    assert_eq!(positions[1].y0, 700.0);
}
#[test]
fn test_collect_text_positions_with_tj() {
    // `TJ` takes an array operand; a position must still be recorded for it.
    let stream = b"BT 50 700 Td [(Hello) 100 (World)] TJ ET";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let spans = detector.collect_text_positions(&ctx);

    assert!(!spans.is_empty());
}
#[test]
fn test_group_by_x0_tolerance() {
    // x0 = 50, 51, 52 are within the 2 pt tolerance of each other and must
    // share a bucket; x0 = 150 belongs to a different column.
    let spans = vec![
        TextPosition { x0: 50.0, y0: 700.0 },
        TextPosition { x0: 51.0, y0: 650.0 },
        TextPosition { x0: 52.0, y0: 600.0 },
        TextPosition { x0: 150.0, y0: 700.0 },
    ];
    let detector = TableDetector::new();

    let buckets = detector.group_by_x0(&spans);

    assert_eq!(buckets.len(), 2);
    // One bucket holds three spans, the other holds one.
    let mut sizes: Vec<usize> = buckets.values().map(|members| members.len()).collect();
    sizes.sort_unstable();
    assert_eq!(sizes, vec![1, 3]);
}
#[test]
fn test_find_row_candidates_basic() {
    // Three columns that each have a span at y = 700, 650 and 600.
    let column = |key: i32, x0: f32| {
        let spans: Vec<TextPosition> = [700.0, 650.0, 600.0]
            .iter()
            .map(|&y0| TextPosition { x0, y0 })
            .collect();
        (key, spans)
    };
    let column_buckets = vec![column(0, 50.0), column(25, 150.0), column(50, 250.0)];
    let detector = TableDetector::new();

    let rows = detector.find_row_candidates(&column_buckets);

    // Exactly three rows, ordered from the top of the page downward.
    assert_eq!(rows, vec![700.0, 650.0, 600.0]);
}
#[test]
fn test_is_single_column_reflow_true() {
    // The first column has three lines, but only the one at y = 700 shares
    // its row with the second column. Fewer than half of its spans are
    // aligned, which is the reflow signature.
    let paragraph_like = vec![
        TextPosition { x0: 50.0, y0: 700.0 },
        TextPosition { x0: 50.0, y0: 685.0 },
        TextPosition { x0: 50.0, y0: 670.0 },
    ];
    let lone_neighbour = vec![TextPosition { x0: 150.0, y0: 700.0 }];
    let column_buckets = vec![(0, paragraph_like), (25, lone_neighbour)];
    let detector = TableDetector::new();

    assert!(detector.is_single_column_reflow(&column_buckets));
}
#[test]
fn test_is_single_column_reflow_false() {
    // Both columns have a span on every one of the three rows, so every span
    // is aligned with the other column and this is not reflow.
    let column = |key: i32, x0: f32| {
        let spans: Vec<TextPosition> = [700.0, 650.0, 600.0]
            .iter()
            .map(|&y0| TextPosition { x0, y0 })
            .collect();
        (key, spans)
    };
    let column_buckets = vec![column(0, 50.0), column(25, 150.0)];
    let detector = TableDetector::new();

    assert!(!detector.is_single_column_reflow(&column_buckets));
}
#[test]
fn test_borderless_table_has_empty_segments() {
    // A borderless table has no ruling lines, so the detected grid must not
    // carry any segments.
    let stream = b"\
        BT \
        50 700 Td (R1C1) Tj 100 0 Td (R1C2) Tj 100 0 Td (R1C3) Tj \
        -200 -50 Td (R2C1) Tj 100 0 Td (R2C2) Tj 100 0 Td (R2C3) Tj \
        -200 -50 Td (R3C1) Tj 100 0 Td (R3C2) Tj 100 0 Td (R3C3) Tj \
        ET";
    let page = make_page(b"");
    let ctx = PageContext::new(&page, stream);
    let detector = TableDetector::new();

    let tables = detector.detect_borderless(&ctx);

    assert!(!tables.is_empty());
    assert!(tables[0].segments.is_empty());
}
}

View file

@ -1,13 +1,21 @@
//! Table detection and structure reconstruction.
//!
//! This module implements line-based table detection from PDF content streams.
//! Per Phase 7.2 of the plan, table detection extracts bordered tables by:
//! This module implements table detection from PDF content streams using two methods:
//!
//! ## Line-based detection (7.2.1)
//! For bordered tables with ruling lines:
//! 1. Collecting horizontal and vertical path segments from stroke operators
//! 2. Clustering collinear segments within epsilon tolerance
//! 3. Finding intersection points between horizontal and vertical segments
//! 4. Building candidate grids from the intersections
//!
//! Borderless table detection (via alignment heuristics) is deferred to 7.2.2.
//! ## Borderless detection (7.2.2)
//! For tables without ruling lines, using x0 alignment heuristics:
//! 1. Collect text positions from content stream (Tm, Td, TD, T*, Tj, TJ, ' and " operators)
//! 2. Group by x0 positions (within 2.0 pt tolerance)
//! 3. Find column candidates (3+ spans at same x0 on different y positions)
//! 4. Find row candidates (y positions where >= 2 column candidates have spans)
//! 5. Validate: 3+ rows AND 3+ columns, contiguous y range, no gap > 100 pt
mod detector;
mod segment;