The bead description mentioned compile errors in hash.rs caused by API drift, but those errors were either already fixed or misattributed. The API usage was already correct: compute_fingerprint already takes 3 arguments (including source); len() already propagates its Result with `?`; the read_at method is already used correctly; and Catalog fields are already accessed via the trailer. The only cleanup was removing the unused std::fs::File and std::io imports. Verification: notes/bf-4mkhv.md
516 lines
17 KiB
Rust
516 lines
17 KiB
Rust
//! Figure block classifier (Phase 4 figure detection).
//!
//! This module implements classification of image regions as figure blocks
//! based on text overlap analysis. Image XObjects with < 50% text overlap
//! are classified as figures.
//!
//! # Algorithm
//!
//! For each image XObject on a page:
//! 1. Compute the union of all text glyph bboxes intersecting the image bbox
//! 2. Calculate overlap ratio = (text_overlap_area / image_bbox_area)
//! 3. If overlap < 0.5 (50%), create a Figure block with the image's bbox
//!
//! This distinction separates:
//! - **Figures:** Images that are primarily visual content (charts, photos, diagrams)
//! - **Text-on-image:** Screenshots of text, scanned text images, document thumbnails
//!
//! # References
//!
//! - Plan section: Phase 4.4 Block Formation → Figure block kind assignment
//! - Phase 3.3: Do operator (XObject image placement)
//! - Phase 3.5: Inline images (BI/ID/EI sequence)
|
|
|
|
use crate::content_stream::ImageXObject;
|
|
use std::sync::Arc;
|
|
|
|
/// Block with layout properties for figure classification.
|
|
///
|
|
/// This reuses the caption.rs Block structure for consistency
|
|
/// across layout classifiers.
|
|
pub use crate::layout::caption::Block;
|
|
|
|
/// Page context for figure classification.
|
|
///
|
|
/// Contains the image list and glyph bboxes needed for figure detection.
|
|
#[derive(Debug, Clone)]
|
|
pub struct FigurePageContext {
|
|
/// Image XObjects on this page (from Phase 3.3 Do + Phase 3.5 inline images).
|
|
pub images: Vec<ImageXObject>,
|
|
/// Glyph bounding boxes on this page (for text overlap computation).
|
|
pub glyph_bboxes: Vec<[f32; 4]>,
|
|
}
|
|
|
|
impl FigurePageContext {
|
|
/// Create a new empty page context.
|
|
pub fn new() -> Self {
|
|
Self {
|
|
images: Vec::new(),
|
|
glyph_bboxes: Vec::new(),
|
|
}
|
|
}
|
|
|
|
/// Create a new page context with images and glyph bboxes.
|
|
pub fn with_data(images: Vec<ImageXObject>, glyph_bboxes: Vec<[f32; 4]>) -> Self {
|
|
Self {
|
|
images,
|
|
glyph_bboxes,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Default for FigurePageContext {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
/// Classify figures on a page based on image XObjects and text overlap.
|
|
///
|
|
/// This function analyzes each image XObject and classifies it as a Figure
|
|
/// block if less than 50% of its area is covered by text glyphs.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `ctx` - Page context with images and glyph bboxes
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// A vector of Figure blocks, one per qualifying image, sorted by bbox top y
|
|
/// (descending, i.e., highest on the page first).
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use pdftract_core::layout::figure::{classify_figure, FigurePageContext};
|
|
/// use pdftract_core::content_stream::ImageXObject;
|
|
/// use std::sync::Arc;
|
|
///
|
|
/// // Page with two images: one mostly visual, one covered by text
|
|
/// let ctx = FigurePageContext::with_data(
|
|
/// vec![
|
|
/// // Pure figure (no text overlap)
|
|
/// ImageXObject {
|
|
/// bbox: [100.0, 400.0, 300.0, 600.0],
|
|
/// xobject_ref: pdftract_core::parser::object::ObjRef { object_number: 1, generation_number: 0 },
|
|
/// name: Arc::from("figure1"),
|
|
/// },
|
|
/// // Text-on-image (screenshot with text annotation)
|
|
/// ImageXObject {
|
|
/// bbox: [100.0, 100.0, 300.0, 300.0],
|
|
/// xobject_ref: pdftract_core::parser::object::ObjRef { object_number: 2, generation_number: 0 },
|
|
/// name: Arc::from("figure2"),
|
|
/// },
|
|
/// ],
|
|
/// vec![
|
|
/// // Text overlaps the second image
|
|
/// [120.0, 120.0, 280.0, 280.0],
|
|
/// ],
|
|
/// );
|
|
///
|
|
/// let figures = classify_figure(&ctx);
|
|
/// assert_eq!(figures.len(), 1); // Only the first image is a figure
|
|
/// ```
|
|
pub fn classify_figure(ctx: &FigurePageContext) -> Vec<Block> {
|
|
let mut figures = Vec::new();
|
|
|
|
for image in &ctx.images {
|
|
let image_bbox = image.bbox;
|
|
let image_area = bbox_area(image_bbox);
|
|
|
|
// Skip zero-area images (degenerate CTM)
|
|
if image_area <= 0.0 {
|
|
continue;
|
|
}
|
|
|
|
// Compute text overlap area
|
|
let text_overlap_area = compute_text_overlap_area(&image_bbox, &ctx.glyph_bboxes);
|
|
|
|
// Classify as figure if < 50% text overlap
|
|
if text_overlap_area / image_area < 0.5 {
|
|
figures.push(Block {
|
|
kind: "figure".to_string(),
|
|
text: String::new(),
|
|
median_font_size: 0.0,
|
|
bbox: image_bbox,
|
|
column: 0, // TODO: assign column based on image center x position
|
|
});
|
|
}
|
|
}
|
|
|
|
// Sort by bbox top y (descending) so highest figures appear first
|
|
figures.sort_by(|a, b| b.top().partial_cmp(&a.top()).unwrap_or(std::cmp::Ordering::Equal));
|
|
|
|
figures
|
|
}
|
|
|
|
/// Compute the area of a bounding box given as `[x0, y0, x1, y1]`.
///
/// Each extent is clamped to zero before multiplying, so a box that is
/// inverted on either axis (`x1 < x0` or `y1 < y0`) has area 0.0. Without the
/// clamp, a box inverted on *both* axes would yield a positive area
/// (negative × negative) and be mistaken for a valid region.
///
/// # Returns
///
/// `max(x1 - x0, 0) * max(y1 - y0, 0)`.
fn bbox_area(bbox: [f32; 4]) -> f32 {
    let width = (bbox[2] - bbox[0]).max(0.0);
    let height = (bbox[3] - bbox[1]).max(0.0);
    width * height
}
|
|
|
|
/// Compute the union area of all glyph bboxes intersecting the image bbox.
|
|
///
|
|
/// This function:
|
|
/// 1. Filters to glyphs that intersect the image bbox
|
|
/// 2. Computes the union of all intersecting glyph bboxes
|
|
/// 3. Returns the area of the union (clipped to the image bbox)
|
|
///
|
|
/// Uses a sweep line algorithm: for each vertical strip between unique x coordinates,
|
|
/// compute the total y coverage and sum (strip_width * y_coverage).
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `image_bbox` - The image's bounding box [x0, y0, x1, y1]
|
|
/// * `glyph_bboxes` - All glyph bboxes on the page
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// The area of the union of all intersecting glyph bboxes, clipped to the image bbox.
|
|
fn compute_text_overlap_area(image_bbox: &[f32; 4], glyph_bboxes: &[[f32; 4]]) -> f32 {
|
|
// Collect all intersecting rectangles (clipped to image bbox)
|
|
let mut rects: Vec<[f32; 4]> = Vec::new();
|
|
|
|
for glyph_bbox in glyph_bboxes {
|
|
if bboxes_intersect(image_bbox, glyph_bbox) {
|
|
let intersection = [
|
|
image_bbox[0].max(glyph_bbox[0]),
|
|
image_bbox[1].max(glyph_bbox[1]),
|
|
image_bbox[2].min(glyph_bbox[2]),
|
|
image_bbox[3].min(glyph_bbox[3]),
|
|
];
|
|
|
|
// Skip empty intersections
|
|
if intersection[0] < intersection[2] && intersection[1] < intersection[3] {
|
|
rects.push(intersection);
|
|
}
|
|
}
|
|
}
|
|
|
|
if rects.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
// Sweep line algorithm: compute union area
|
|
// 1. Collect all unique x coordinates
|
|
let mut xs: Vec<f32> = rects.iter().flat_map(|r| [r[0], r[2]]).collect();
|
|
xs.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
|
xs.dedup_by(|a, b| (*a - *b).abs() < 1e-6);
|
|
|
|
let mut total_area = 0.0;
|
|
|
|
// 2. For each vertical strip between consecutive x coordinates
|
|
for i in 0..xs.len() - 1 {
|
|
let x_left = xs[i];
|
|
let x_right = xs[i + 1];
|
|
|
|
// Skip zero-width strips
|
|
if x_right <= x_left {
|
|
continue;
|
|
}
|
|
|
|
// 3. Collect all y-intervals that cover this x-strip
|
|
let mut intervals: Vec<[f32; 2]> = Vec::new();
|
|
for rect in &rects {
|
|
// Check if rectangle overlaps this x-strip (not fully contained)
|
|
if rect[2] > x_left && rect[0] < x_right {
|
|
intervals.push([rect[1], rect[3]]);
|
|
}
|
|
}
|
|
|
|
if intervals.is_empty() {
|
|
continue;
|
|
}
|
|
|
|
// 4. Merge overlapping y-intervals
|
|
intervals.sort_by(|a, b| a[0].partial_cmp(&b[0]).unwrap());
|
|
let mut merged: Vec<[f32; 2]> = Vec::new();
|
|
|
|
for interval in intervals {
|
|
if let Some(last) = merged.last_mut() {
|
|
if interval[0] <= last[1] {
|
|
// Overlapping or adjacent - merge
|
|
last[1] = last[1].max(interval[1]);
|
|
} else {
|
|
merged.push(interval);
|
|
}
|
|
} else {
|
|
merged.push(interval);
|
|
}
|
|
}
|
|
|
|
// 5. Sum up y coverage for this strip
|
|
let y_coverage: f32 = merged.iter().map(|i| i[1] - i[0]).sum();
|
|
let strip_width = x_right - x_left;
|
|
total_area += strip_width * y_coverage;
|
|
}
|
|
|
|
total_area
|
|
}
|
|
|
|
/// Report whether two boxes share a region of positive area.
///
/// Both boxes are `[x0, y0, x1, y1]`. Boxes that only touch along an edge or
/// at a corner are not considered intersecting.
fn bboxes_intersect(a: &[f32; 4], b: &[f32; 4]) -> bool {
    let [a_x0, a_y0, a_x1, a_y1] = *a;
    let [b_x0, b_y0, b_x1, b_y1] = *b;

    // One box lies entirely to the side of / above or below the other.
    let apart_in_x = a_x1 <= b_x0 || b_x1 <= a_x0;
    let apart_in_y = a_y1 <= b_y0 || b_y1 <= a_y0;

    // Deliberately expressed as "not separated" rather than with `>`
    // comparisons: the two forms differ when a coordinate is NaN.
    !(apart_in_x || apart_in_y)
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for the figure classifier: the geometry helpers, the 50%
    //! text-overlap threshold, output ordering and the acceptance scenarios.

    use super::*;
    use crate::parser::object::ObjRef;

    /// Build an image XObject with the given bbox and placeholder ref/name.
    fn make_image(x0: f32, y0: f32, x1: f32, y1: f32) -> ImageXObject {
        ImageXObject {
            bbox: [x0, y0, x1, y1],
            // NOTE(review): the doc example on `classify_figure` spells these
            // fields `object_number` / `generation_number`; one of the two
            // spellings is stale — confirm against `ObjRef`'s definition.
            xobject_ref: ObjRef { object: 1, generation: 0 },
            name: Arc::from("test"),
        }
    }

    #[test]
    fn test_bbox_area() {
        assert_eq!(bbox_area([0.0, 0.0, 100.0, 50.0]), 5000.0);
        assert_eq!(bbox_area([10.0, 20.0, 30.0, 40.0]), 400.0);
    }

    #[test]
    fn test_bboxes_intersect() {
        // Overlapping
        assert!(bboxes_intersect(&[0.0, 0.0, 10.0, 10.0], &[5.0, 5.0, 15.0, 15.0]));

        // Touching at edge (no actual overlap)
        assert!(!bboxes_intersect(&[0.0, 0.0, 10.0, 10.0], &[10.0, 0.0, 20.0, 10.0]));

        // Disjoint
        assert!(!bboxes_intersect(&[0.0, 0.0, 10.0, 10.0], &[20.0, 20.0, 30.0, 30.0]));

        // One inside the other
        assert!(bboxes_intersect(&[0.0, 0.0, 100.0, 100.0], &[10.0, 10.0, 20.0, 20.0]));
    }

    #[test]
    fn test_classify_figure_pure_visual_image() {
        // Image with no text overlap → classified as figure
        let ctx = FigurePageContext::with_data(
            vec![make_image(100.0, 400.0, 300.0, 600.0)],
            vec![
                [400.0, 400.0, 500.0, 500.0], // Text far to the right
            ],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 1);
        assert_eq!(figures[0].kind, "figure");
        assert_eq!(figures[0].bbox, [100.0, 400.0, 300.0, 600.0]);
    }

    #[test]
    fn test_classify_figure_text_on_image() {
        // Image fully covered by text → NOT classified as figure
        let ctx = FigurePageContext::with_data(
            vec![make_image(100.0, 100.0, 300.0, 300.0)],
            vec![
                [90.0, 90.0, 310.0, 310.0], // Text fully covers image
            ],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0); // Too much text overlap
    }

    #[test]
    fn test_classify_figure_partial_text_below_threshold() {
        // Image with 36% text overlap → classified as figure
        let ctx = FigurePageContext::with_data(
            vec![make_image(0.0, 0.0, 100.0, 100.0)],
            vec![
                [0.0, 0.0, 60.0, 60.0], // 36% coverage (60*60 / 100*100 = 0.36)
            ],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 1);
    }

    #[test]
    fn test_classify_figure_partial_text_above_threshold() {
        // Image with 64% text overlap → NOT classified as figure
        let ctx = FigurePageContext::with_data(
            vec![make_image(0.0, 0.0, 100.0, 100.0)],
            vec![
                [0.0, 0.0, 80.0, 80.0], // 64% coverage (80*80 / 100*100 = 0.64)
            ],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0);
    }

    #[test]
    fn test_classify_figure_exactly_at_threshold() {
        // The rule is strict: overlap < 0.5 → figure, overlap >= 0.5 → NOT a figure.

        // Just below: 70 * 70 = 4900, i.e. 49% of 10000.
        let ctx = FigurePageContext::with_data(
            vec![make_image(0.0, 0.0, 100.0, 100.0)],
            vec![
                [0.0, 0.0, 70.0, 70.0], // 49% coverage
            ],
        );
        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 1, "49% overlap should be classified as figure");

        // Exactly at: 50 * 100 = 5000, and 5000 / 10000 is exactly 0.5 in f32.
        let ctx = FigurePageContext::with_data(
            vec![make_image(0.0, 0.0, 100.0, 100.0)],
            vec![
                [0.0, 0.0, 50.0, 100.0], // exactly 50% coverage
            ],
        );
        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0, "exactly 50% overlap should NOT be classified as figure");

        // Just above: 71 * 71 = 5041, i.e. ~50.4% of 10000.
        let ctx = FigurePageContext::with_data(
            vec![make_image(0.0, 0.0, 100.0, 100.0)],
            vec![
                [0.0, 0.0, 71.0, 71.0], // ~50.4% coverage (>50%)
            ],
        );
        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0, ">=50% overlap should NOT be classified as figure");
    }

    #[test]
    fn test_classify_figure_sort_order() {
        // Multiple figures should be sorted by top y (highest first)
        let ctx = FigurePageContext::with_data(
            vec![
                make_image(0.0, 100.0, 100.0, 200.0), // Lower
                make_image(0.0, 300.0, 100.0, 400.0), // Higher
                make_image(0.0, 200.0, 100.0, 300.0), // Middle
            ],
            vec![], // No text
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 3);
        assert_eq!(figures[0].bbox[3], 400.0); // Highest
        assert_eq!(figures[1].bbox[3], 300.0); // Middle
        assert_eq!(figures[2].bbox[3], 200.0); // Lowest
    }

    #[test]
    fn test_classify_figure_empty_context() {
        let ctx = FigurePageContext::new();
        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0);
    }

    #[test]
    fn test_classify_figure_no_images() {
        let ctx = FigurePageContext::with_data(
            vec![],
            vec![[0.0, 0.0, 100.0, 100.0]],
        );
        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0);
    }

    #[test]
    fn test_classify_figure_no_glyphs() {
        // Images with no glyphs at all should all be figures
        let ctx = FigurePageContext::with_data(
            vec![
                make_image(0.0, 0.0, 100.0, 100.0),
                make_image(200.0, 200.0, 300.0, 300.0),
            ],
            vec![],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 2);
    }

    #[test]
    fn test_compute_text_overlap_area_multiple_glyphs() {
        // Multiple disjoint glyphs: the union area is the plain sum
        let image_bbox = [0.0, 0.0, 100.0, 100.0];
        let glyph_bboxes = vec![
            [0.0, 0.0, 40.0, 40.0],     // Bottom-left
            [60.0, 0.0, 100.0, 40.0],   // Bottom-right
            [0.0, 60.0, 40.0, 100.0],   // Top-left
            [60.0, 60.0, 100.0, 100.0], // Top-right
        ];

        let overlap = compute_text_overlap_area(&image_bbox, &glyph_bboxes);
        // All 4 corners, disjoint, so area = 4 * 40*40 = 6400
        assert!((overlap - 6400.0).abs() < 1.0);
    }

    #[test]
    fn test_compute_text_overlap_area_union() {
        // Overlapping glyphs should produce union (not sum)
        let image_bbox = [0.0, 0.0, 100.0, 100.0];
        let glyph_bboxes = vec![
            [0.0, 0.0, 60.0, 60.0],     // Large area
            [40.0, 40.0, 100.0, 100.0], // Overlaps with first
        ];

        let overlap = compute_text_overlap_area(&image_bbox, &glyph_bboxes);
        // Union of [0,0,60,60] and [40,40,100,100] = 6800 (not 7200 sum due to overlap)
        // The overlapping region [40,40,60,60] is counted only once
        let expected = 6800.0;
        assert!((overlap - expected).abs() < 1.0, "Union area should be {}, got {}", expected, overlap);
        assert!(overlap < 10000.0, "Union should not exceed image bounds");
    }

    #[test]
    fn test_figure_block_properties() {
        let ctx = FigurePageContext::with_data(
            vec![make_image(100.0, 400.0, 300.0, 600.0)],
            vec![],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 1);

        let figure = &figures[0];
        assert_eq!(figure.kind, "figure");
        assert_eq!(figure.text, "");
        assert_eq!(figure.median_font_size, 0.0);
        assert_eq!(figure.column, 0);
    }

    #[test]
    fn test_five_figures_no_text() {
        // Test case from acceptance criteria:
        // PDF with 5 figures (images, no text overlay) → 5 Figure blocks in output
        let ctx = FigurePageContext::with_data(
            vec![
                make_image(0.0, 100.0, 100.0, 200.0),
                make_image(100.0, 100.0, 200.0, 200.0),
                make_image(200.0, 100.0, 300.0, 200.0),
                make_image(300.0, 100.0, 400.0, 200.0),
                make_image(400.0, 100.0, 500.0, 200.0),
            ],
            vec![], // No text overlay
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 5);
    }

    #[test]
    fn test_text_covered_image_not_figure() {
        // Test case from acceptance criteria:
        // PDF with 1 image fully covered by text (screenshot with text annotation)
        // → no Figure block (text overlap >= 50%)
        let ctx = FigurePageContext::with_data(
            vec![make_image(0.0, 0.0, 100.0, 100.0)],
            vec![
                [0.0, 0.0, 100.0, 100.0], // Text fully covers image (100% overlap)
            ],
        );

        let figures = classify_figure(&ctx);
        assert_eq!(figures.len(), 0);
    }
}
|