pdftract/crates/pdftract-core/src/layout/figure.rs
jedarden 1c6f26ecaa fix(bf-4mkhv): clean up unused imports in hash.rs
The bead description mentioned compile errors in hash.rs from API drift,
but those errors were either already fixed or misattributed. The API usage
was already correct:
- compute_fingerprint already takes 3 arguments with source
- len() already propagates Result with ?
- read_at method already used correctly
- Catalog fields accessed via trailer correctly

Only cleanup: removed unused std::fs::File and std::io imports.

Verification: notes/bf-4mkhv.md
2026-06-01 09:43:48 -04:00

516 lines
17 KiB
Rust

//! Figure block classifier (Phase 4 figure detection).
//!
//! This module implements classification of image regions as figure blocks
//! based on text overlap analysis. Image XObjects with < 50% text overlap
//! are classified as figures.
//!
//! # Algorithm
//!
//! For each image XObject on a page:
//! 1. Compute `text_overlap_area`: the area of the union of all text glyph bboxes
//!    intersecting the image bbox, clipped to the image bbox
//! 2. Calculate overlap ratio = (text_overlap_area / image_bbox_area)
//! 3. If overlap < 0.5 (50%), create a Figure block with the image's bbox
//!
//! This distinction separates:
//! - **Figures:** Images that are primarily visual content (charts, photos, diagrams)
//! - **Text-on-image:** Screenshots of text, scanned text images, document thumbnails
//!
//! # References
//!
//! - Plan section: Phase 4.4 Block Formation → Figure block kind assignment
//! - Phase 3.3: Do operator (XObject image placement)
//! - Phase 3.5: Inline images (BI/ID/EI sequence)
use crate::content_stream::ImageXObject;
use std::sync::Arc;
/// Block with layout properties for figure classification.
///
/// This reuses the caption.rs Block structure for consistency
/// across layout classifiers.
pub use crate::layout::caption::Block;
/// Page context for figure classification.
///
/// Carries the two per-page inputs read by [`classify_figure`]: the list of
/// images placed on the page and the glyph bounding boxes used to measure how
/// much of each image is covered by text.
#[derive(Debug, Clone)]
pub struct FigurePageContext {
/// Image XObjects on this page (from Phase 3.3 Do + Phase 3.5 inline images).
pub images: Vec<ImageXObject>,
/// Glyph bounding boxes on this page (for text overlap computation).
///
/// Each entry is laid out as `[x0, y0, x1, y1]`.
pub glyph_bboxes: Vec<[f32; 4]>,
}
impl FigurePageContext {
/// Create a new empty page context.
pub fn new() -> Self {
Self {
images: Vec::new(),
glyph_bboxes: Vec::new(),
}
}
/// Create a new page context with images and glyph bboxes.
pub fn with_data(images: Vec<ImageXObject>, glyph_bboxes: Vec<[f32; 4]>) -> Self {
Self {
images,
glyph_bboxes,
}
}
}
impl Default for FigurePageContext {
fn default() -> Self {
Self::new()
}
}
/// Turn the images on a page into figure blocks, dropping text-covered ones.
///
/// Each image in `ctx.images` is measured against `ctx.glyph_bboxes`. An image
/// becomes a `"figure"` block when the text-covered fraction of its bounding
/// box is strictly below 0.5. Images whose bounding box has no positive area
/// are skipped.
///
/// # Arguments
///
/// * `ctx` - Page context with images and glyph bboxes
///
/// # Returns
///
/// One block per qualifying image, carrying the image's bbox, an empty text,
/// a median font size of `0.0` and column `0`. Blocks are ordered by bbox top
/// y, descending (highest on the page first).
///
/// # Examples
///
/// ```
/// use pdftract_core::layout::figure::{classify_figure, FigurePageContext};
/// use pdftract_core::content_stream::ImageXObject;
/// use std::sync::Arc;
///
/// // Page with two images: one mostly visual, one covered by text
/// let ctx = FigurePageContext::with_data(
///     vec![
///         // Pure figure (no text overlap)
///         ImageXObject {
///             bbox: [100.0, 400.0, 300.0, 600.0],
///             xobject_ref: pdftract_core::parser::object::ObjRef { object_number: 1, generation_number: 0 },
///             name: Arc::from("figure1"),
///         },
///         // Text-on-image (screenshot with text annotation)
///         ImageXObject {
///             bbox: [100.0, 100.0, 300.0, 300.0],
///             xobject_ref: pdftract_core::parser::object::ObjRef { object_number: 2, generation_number: 0 },
///             name: Arc::from("figure2"),
///         },
///     ],
///     vec![
///         // Text overlaps the second image
///         [120.0, 120.0, 280.0, 280.0],
///     ],
/// );
///
/// let figures = classify_figure(&ctx);
/// assert_eq!(figures.len(), 1); // Only the first image is a figure
/// ```
pub fn classify_figure(ctx: &FigurePageContext) -> Vec<Block> {
    let mut blocks: Vec<Block> = ctx
        .images
        .iter()
        .filter_map(|img| {
            let bbox = img.bbox;
            let area = bbox_area(bbox);
            // Degenerate placement (e.g. a collapsed CTM): nothing to classify.
            if area <= 0.0 {
                return None;
            }

            let covered = compute_text_overlap_area(&bbox, &ctx.glyph_bboxes);
            // Half or more of the image under text means it is text-on-image.
            if covered / area < 0.5 {
                Some(Block {
                    kind: "figure".to_string(),
                    text: String::new(),
                    median_font_size: 0.0,
                    bbox,
                    column: 0, // TODO: assign column based on image center x position
                })
            } else {
                None
            }
        })
        .collect();

    // Highest figure first; incomparable tops are treated as equal.
    blocks.sort_by(|lhs, rhs| {
        rhs.top()
            .partial_cmp(&lhs.top())
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    blocks
}
/// Compute the area of a bounding box laid out as `[x0, y0, x1, y1]`.
///
/// A box whose width or height is not positive is treated as empty and yields
/// `0.0`. Without the clamp, a box inverted on *both* axes would multiply two
/// negative extents into a positive "area" and slip past callers that only
/// reject non-positive areas, while a box inverted on one axis would be
/// rejected — the clamp makes both cases behave the same.
fn bbox_area(bbox: [f32; 4]) -> f32 {
    let [x0, y0, x1, y1] = bbox;
    // `f32::max` returns the non-NaN operand, so a NaN extent also clamps to 0.
    let width = (x1 - x0).max(0.0);
    let height = (y1 - y0).max(0.0);
    width * height
}
/// Compute the area of the image that is covered by text glyphs.
///
/// Every glyph box that overlaps the image is clipped to the image, and the
/// area of the *union* of the clipped boxes is returned, so a region covered
/// by several glyphs is counted once.
///
/// The union is measured strip by strip: the distinct x edges of the clipped
/// boxes cut the covered region into vertical strips, and because every box
/// edge is a strip boundary, each box either spans a strip completely or not
/// at all. For each strip the y-intervals of the spanning boxes are merged and
/// `strip_width * merged_height` is added to the total.
///
/// Glyph boxes with a NaN coordinate never count as overlapping (see
/// [`bboxes_intersect`]) and therefore contribute nothing.
///
/// # Arguments
///
/// * `image_bbox` - The image's bounding box [x0, y0, x1, y1]
/// * `glyph_bboxes` - All glyph bboxes on the page
///
/// # Returns
///
/// The area of the union of all intersecting glyph bboxes, clipped to the
/// image bbox; `0.0` when no glyph overlaps the image.
fn compute_text_overlap_area(image_bbox: &[f32; 4], glyph_bboxes: &[[f32; 4]]) -> f32 {
    // Clip each overlapping glyph box to the image and drop empty results
    // (which can still occur when the image box itself is inverted).
    let mut rects: Vec<[f32; 4]> = glyph_bboxes
        .iter()
        .filter(|glyph| bboxes_intersect(image_bbox, glyph))
        .map(|glyph| {
            [
                image_bbox[0].max(glyph[0]),
                image_bbox[1].max(glyph[1]),
                image_bbox[2].min(glyph[2]),
                image_bbox[3].min(glyph[3]),
            ]
        })
        .filter(|r| r[0] < r[2] && r[1] < r[3])
        .collect();

    if rects.is_empty() {
        return 0.0;
    }

    // Order the boxes by lower y once, so the y-intervals gathered for any
    // strip are already sorted and can be merged in a single pass.
    rects.sort_by(|a, b| a[1].total_cmp(&b[1]));

    // Strip boundaries: every distinct left/right edge. The dedup must be
    // exact — collapsing "nearly equal" edges would drop a real boundary and
    // let a box that ends there be counted across the whole following strip.
    let mut xs: Vec<f32> = rects.iter().flat_map(|r| [r[0], r[2]]).collect();
    xs.sort_by(f32::total_cmp);
    xs.dedup();

    let mut total_area = 0.0_f32;
    for strip in xs.windows(2) {
        let (x_left, x_right) = (strip[0], strip[1]);

        // Merge the (already y-sorted) intervals of the boxes spanning this strip.
        let mut y_coverage = 0.0_f32;
        let mut open: Option<(f32, f32)> = None;
        for r in rects.iter().filter(|r| r[0] <= x_left && r[2] >= x_right) {
            open = match open {
                // Overlapping or touching the open interval: extend it.
                Some((lo, hi)) if r[1] <= hi => Some((lo, hi.max(r[3]))),
                // Gap: close the open interval and start a new one.
                Some((lo, hi)) => {
                    y_coverage += hi - lo;
                    Some((r[1], r[3]))
                }
                None => Some((r[1], r[3])),
            };
        }
        if let Some((lo, hi)) = open {
            y_coverage += hi - lo;
        }

        total_area += (x_right - x_left) * y_coverage;
    }
    total_area
}

/// Check if two bounding boxes (`[x0, y0, x1, y1]`) overlap with positive area.
///
/// Boxes that merely touch along an edge or at a corner do not intersect.
/// The test is written as a conjunction of strict comparisons so that a NaN
/// coordinate in either box makes the result `false`; the negated form
/// "not separated" would instead report NaN boxes as intersecting everything.
fn bboxes_intersect(a: &[f32; 4], b: &[f32; 4]) -> bool {
    a[2] > b[0] && b[2] > a[0] && a[3] > b[1] && b[3] > a[1]
}
// Unit tests for the figure classifier and its geometry helpers.
#[cfg(test)]
mod tests {
use super::*;
use crate::parser::object::ObjRef;
// Build a test image with the given bbox; the object reference and name are placeholders.
//
// NOTE(review): this helper builds `ObjRef` with fields `object` / `generation`, while the
// doc example on `classify_figure` uses `object_number` / `generation_number`. Only one
// spelling can match `parser::object::ObjRef` — confirm against its definition.
fn make_image(x0: f32, y0: f32, x1: f32, y1: f32) -> ImageXObject {
ImageXObject {
bbox: [x0, y0, x1, y1],
xobject_ref: ObjRef { object: 1, generation: 0 },
name: Arc::from("test"),
}
}
#[test]
fn test_bbox_area() {
assert_eq!(bbox_area([0.0, 0.0, 100.0, 50.0]), 5000.0);
assert_eq!(bbox_area([10.0, 20.0, 30.0, 40.0]), 400.0);
}
#[test]
fn test_bboxes_intersect() {
// Overlapping
assert!(bboxes_intersect(&[0.0, 0.0, 10.0, 10.0], &[5.0, 5.0, 15.0, 15.0]));
// Touching at edge (no actual overlap)
assert!(!bboxes_intersect(&[0.0, 0.0, 10.0, 10.0], &[10.0, 0.0, 20.0, 10.0]));
// Disjoint
assert!(!bboxes_intersect(&[0.0, 0.0, 10.0, 10.0], &[20.0, 20.0, 30.0, 30.0]));
// One inside the other
assert!(bboxes_intersect(&[0.0, 0.0, 100.0, 100.0], &[10.0, 10.0, 20.0, 20.0]));
}
#[test]
fn test_classify_figure_pure_visual_image() {
// Image with no text overlap → classified as figure
let ctx = FigurePageContext::with_data(
vec![make_image(100.0, 400.0, 300.0, 600.0)],
vec![
[400.0, 400.0, 500.0, 500.0], // Text far to the right
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 1);
assert_eq!(figures[0].kind, "figure");
assert_eq!(figures[0].bbox, [100.0, 400.0, 300.0, 600.0]);
}
#[test]
fn test_classify_figure_text_on_image() {
// Image fully covered by text → NOT classified as figure
let ctx = FigurePageContext::with_data(
vec![make_image(100.0, 100.0, 300.0, 300.0)],
vec![
[90.0, 90.0, 310.0, 310.0], // Text fully covers image
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 0); // Too much text overlap
}
#[test]
fn test_classify_figure_partial_text_below_threshold() {
// Image with 36% text overlap (below the 50% threshold) → classified as figure
let ctx = FigurePageContext::with_data(
vec![make_image(0.0, 0.0, 100.0, 100.0)],
vec![
[0.0, 0.0, 60.0, 60.0], // 36% coverage (60*60 / 100*100 = 0.36)
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 1);
}
#[test]
fn test_classify_figure_partial_text_above_threshold() {
// Image with 64% text overlap (above the 50% threshold) → NOT classified as figure
let ctx = FigurePageContext::with_data(
vec![make_image(0.0, 0.0, 100.0, 100.0)],
vec![
[0.0, 0.0, 80.0, 80.0], // 64% coverage (80*80 / 100*100 = 0.64)
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 0);
}
#[test]
fn test_classify_figure_exactly_at_threshold() {
// Brackets the 50% threshold from both sides. The rule is "overlap < 0.5 → figure",
// so anything at or above 50% is NOT a figure.
// The exact 50% boundary is not exercised here: it would need a square glyph box of
// side sqrt(5000) ≈ 70.71 on the 100x100 image, so the nearest whole sides are used.
// Just below: 70*70 = 4900, which is 49% of 10000.
let ctx = FigurePageContext::with_data(
vec![make_image(0.0, 0.0, 100.0, 100.0)],
vec![
[0.0, 0.0, 70.0, 70.0], // 49% coverage
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 1, "49% overlap should be classified as figure");
// Just above: 71*71 = 5041, which is 50.41% of 10000.
let ctx = FigurePageContext::with_data(
vec![make_image(0.0, 0.0, 100.0, 100.0)],
vec![
[0.0, 0.0, 71.0, 71.0], // ~50.4% coverage (>50%)
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 0, ">=50% overlap should NOT be classified as figure");
}
#[test]
fn test_classify_figure_sort_order() {
// Multiple figures should be sorted by top y (highest first)
let ctx = FigurePageContext::with_data(
vec![
make_image(0.0, 100.0, 100.0, 200.0), // Lower
make_image(0.0, 300.0, 100.0, 400.0), // Higher
make_image(0.0, 200.0, 100.0, 300.0), // Middle
],
vec![], // No text
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 3);
assert_eq!(figures[0].bbox[3], 400.0); // Highest
assert_eq!(figures[1].bbox[3], 300.0); // Middle
assert_eq!(figures[2].bbox[3], 200.0); // Lowest
}
#[test]
fn test_classify_figure_empty_context() {
// No images and no glyphs → no figures
let ctx = FigurePageContext::new();
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 0);
}
#[test]
fn test_classify_figure_no_images() {
// Glyphs alone never produce a figure
let ctx = FigurePageContext::with_data(
vec![],
vec![[0.0, 0.0, 100.0, 100.0]],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 0);
}
#[test]
fn test_classify_figure_no_glyphs() {
// Images with no glyphs at all should all be figures
let ctx = FigurePageContext::with_data(
vec![
make_image(0.0, 0.0, 100.0, 100.0),
make_image(200.0, 200.0, 300.0, 300.0),
],
vec![],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 2);
}
#[test]
fn test_compute_text_overlap_area_multiple_glyphs() {
// Multiple disjoint glyphs: the union area is the plain sum of their areas
let image_bbox = [0.0, 0.0, 100.0, 100.0];
let glyph_bboxes = vec![
[0.0, 0.0, 40.0, 40.0], // Bottom-left
[60.0, 0.0, 100.0, 40.0], // Bottom-right
[0.0, 60.0, 40.0, 100.0], // Top-left
[60.0, 60.0, 100.0, 100.0], // Top-right
];
let overlap = compute_text_overlap_area(&image_bbox, &glyph_bboxes);
// All 4 corners, disjoint, so area = 4 * 40*40 = 6400
assert!((overlap - 6400.0).abs() < 1.0);
}
#[test]
fn test_compute_text_overlap_area_union() {
// Overlapping glyphs should produce union (not sum)
let image_bbox = [0.0, 0.0, 100.0, 100.0];
let glyph_bboxes = vec![
[0.0, 0.0, 60.0, 60.0], // Large area
[40.0, 40.0, 100.0, 100.0], // Overlaps with first
];
let overlap = compute_text_overlap_area(&image_bbox, &glyph_bboxes);
// Union of [0,0,60,60] and [40,40,100,100] = 3600 + 3600 - 400 = 6800 (not the 7200 sum)
// The overlapping region [40,40,60,60] is counted only once
let expected = 6800.0;
assert!((overlap - expected).abs() < 1.0, "Union area should be {}, got {}", expected, overlap);
assert!(overlap < 10000.0, "Union should not exceed image bounds");
}
#[test]
fn test_figure_block_properties() {
// A figure block carries fixed placeholder values besides its bbox
let ctx = FigurePageContext::with_data(
vec![make_image(100.0, 400.0, 300.0, 600.0)],
vec![],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 1);
let figure = &figures[0];
assert_eq!(figure.kind, "figure");
assert_eq!(figure.text, "");
assert_eq!(figure.median_font_size, 0.0);
assert_eq!(figure.column, 0);
}
#[test]
fn test_five_figures_no_text() {
// Test case from acceptance criteria:
// PDF with 5 figures (images, no text overlay) → 5 Figure blocks in output
let ctx = FigurePageContext::with_data(
vec![
make_image(0.0, 100.0, 100.0, 200.0),
make_image(100.0, 100.0, 200.0, 200.0),
make_image(200.0, 100.0, 300.0, 200.0),
make_image(300.0, 100.0, 400.0, 200.0),
make_image(400.0, 100.0, 500.0, 200.0),
],
vec![], // No text overlay
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 5);
}
#[test]
fn test_text_covered_image_not_figure() {
// Test case from acceptance criteria:
// PDF with 1 image fully covered by text (screenshot with text annotation)
// → no Figure block (text overlap >= 50%)
let ctx = FigurePageContext::with_data(
vec![make_image(0.0, 0.0, 100.0, 100.0)],
vec![
[0.0, 0.0, 100.0, 100.0], // Text fully covers image (100% overlap)
],
);
let figures = classify_figure(&ctx);
assert_eq!(figures.len(), 0);
}
}