pdftract/crates/pdftract-core/src/page_class.rs
jedarden 6a05f7e247 fix(pdftract-tuky): fix color clamping test and verify Phase 3.1 coordinator
Fixes:
- Corrected test_color_device_rgb_clamped expected value from "#ff8080" to "#ff0080"
  (G value -0.5 should clamp to 0.0, not 0.5)
- Fixed lifetime annotation in readability.rs (Cow<str> -> Cow<'_, str>)
- Fixed unused_must_use warning in page_class.rs test

Verification (notes/pdftract-tuky.md):
- All 8 children of Phase 3.1 coordinator are closed
- q/Q 64-level depth limit verified (test_64_nested_q_calls_succeed)
- Td chain accumulation verified (test_td_chain)
- Tm/Td ordering correct per ISO 72-bit spec
- /Rotate normalization implemented in child pdftract-1jlpy
- All 6 color operators tracked (72 graphics_state tests pass)

Closes: pdftract-tuky
2026-05-26 16:36:01 -04:00

634 lines
23 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Page classification enum.
//!
//! This module defines the four canonical page classes used throughout pdftract's
//! extraction pipeline. Per **INV-9 (stable taxonomy)**, these four variants are the
//! complete and stable set; adding new variants requires a schema_version bump and
//! an ADR.
//!
//! The `PageClass` enum drives routing decisions in Phase 5:
//! - `Vector`: Clean text PDF, extract via content-stream parsing
//! - `Scanned`: Image-only pages, require OCR
//! - `Hybrid`: Mixed text and image regions, require hybrid extraction
//! - `BrokenVector`: Text with encoding issues (e.g., invisible text layer over scan),
//! may escalate to OCR
//!
//! # Serde representation
//!
//! The enum serializes to the variant name verbatim (`Vector`, `Scanned`, `Hybrid`,
//! `BrokenVector`). This internal representation is distinct from the `page_type`
//! strings emitted in JSON output (see Phase 5.1.1 page_type mapping table).
use serde::{Deserialize, Serialize};
use std::collections::BTreeSet;
/// Outcome of classifying one page: the class itself, the classifier's
/// confidence, and (where applicable) the image-heavy grid cells.
///
/// Fields:
/// - `class`: one of the four canonical [`PageClass`] variants.
/// - `confidence`: classifier confidence, expected to lie in `[0.0, 1.0]`
///   (used for Phase 5.5 escalation thresholds).
/// - `hybrid_cells`: for Hybrid pages, the image-heavy cells of the 8×8 grid.
///
/// Per INV-8, the constructor checks the confidence range with `debug_assert!`
/// in dev builds only; production code holding an out-of-range confidence
/// should clamp it silently.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct PageClassification {
    /// Canonical class assigned to the page.
    pub class: PageClass,
    /// Confidence of the classifier, nominally within `[0.0, 1.0]`.
    pub confidence: f32,
    /// Image-heavy `(row, col)` cells on the 8×8 grid for Hybrid pages.
    /// Expected to be `None` for every other class; the key is left out of
    /// serialized output when the value is `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub hybrid_cells: Option<BTreeSet<(u8, u8)>>,
}
impl PageClassification {
    /// Build a `PageClassification` from its three parts.
    ///
    /// # Invariants
    ///
    /// - `confidence` must lie in `[0.0, 1.0]`. Only debug builds check this
    ///   (through `debug_assert!`); release builds store the value untouched,
    ///   so callers must clamp out-of-range values themselves (per INV-8).
    ///   `NaN` is not inside the range and therefore trips the assertion too.
    /// - `hybrid_cells` should be `Some` only when `class == PageClass::Hybrid`.
    ///   Nothing here enforces that; any other combination is a caller bug.
    ///
    /// # Panics
    ///
    /// In debug builds, panics if `confidence` is outside `[0.0, 1.0]`.
    #[must_use]
    pub fn new(
        class: PageClass,
        confidence: f32,
        hybrid_cells: Option<BTreeSet<(u8, u8)>>,
    ) -> Self {
        // `RangeInclusive::contains` is the idiomatic range test; like the
        // chained comparison it replaces, it evaluates to false for NaN.
        debug_assert!(
            (0.0..=1.0).contains(&confidence),
            "confidence must be in [0.0, 1.0], got {confidence}"
        );
        Self {
            class,
            confidence,
            hybrid_cells,
        }
    }
}
/// Canonical page classes — exactly four of them.
///
/// INV-9 (stable taxonomy) freezes this set: introducing another variant
/// requires a schema_version bump and an ADR.
///
/// # Hash
///
/// `Hash` is derived so values can serve as `HashMap` / `HashSet` keys, which
/// is required for Phase 6.9 cache keying and Phase 5 routing tables.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum PageClass {
    /// Vector PDF page whose text encoding is readable.
    Vector,
    /// Page made up only of images; needs OCR.
    Scanned,
    /// Page mixing vector text with image regions.
    Hybrid,
    /// Page that carries text whose encoding is broken (e.g., invisible text
    /// over a scanned image).
    BrokenVector,
}
impl PageClass {
/// Returns the JSON output string for this page type.
///
/// Maps internal enum values to the schema's `page_type` field.
pub fn as_type_str(&self) -> &'static str {
match self {
PageClass::Vector => "text",
PageClass::Scanned => "scanned",
PageClass::Hybrid => "mixed",
PageClass::BrokenVector => "broken_vector",
}
}
/// Check if this page class is eligible for BrokenVector escalation.
///
/// Only Vector pages can be escalated to BrokenVector based on readability.
/// Scanned and Hybrid pages are already handled by other paths.
pub fn can_escalate_to_broken_vector(&self) -> bool {
matches!(self, PageClass::Vector)
}
}
/// Derive the canonical `page_type` string emitted in the JSON schema output.
///
/// The result is a pure function of the page class, whether OCR recovered text,
/// and whether the page carries text and/or images. Per INV-9 this mapping is
/// frozen.
///
/// # Mapping Table
///
/// Rows are listed in precedence order; `-` means the input is ignored.
///
/// | has_text | has_images | class          | ocr_succeeded | page_type         |
/// |----------|------------|----------------|---------------|-------------------|
/// | false    | false      | (any)          | -             | `"blank"`         |
/// | false    | true       | (any)          | -             | `"figure_only"`   |
/// | true     | -          | `Vector`       | -             | `"text"`          |
/// | true     | -          | `Scanned`      | -             | `"scanned"`       |
/// | true     | -          | `Hybrid`       | -             | `"mixed"`         |
/// | true     | -          | `BrokenVector` | false         | `"broken_vector"` |
/// | true     | -          | `BrokenVector` | true          | `"scanned"`       |
///
/// # Precedence Rules
///
/// 1. **Content overrides come first.** A page without text is `"blank"` when
///    it also has no images and `"figure_only"` when it has images, whatever
///    its class.
/// 2. **Otherwise the class decides.** `Vector` → `"text"`, `Scanned` →
///    `"scanned"`, `Hybrid` → `"mixed"`; `BrokenVector` yields `"scanned"`
///    after a successful OCR recovery and `"broken_vector"` otherwise.
///
/// NOTE(review): because of rule 1, a `Scanned` page reported with
/// `has_text == false` and `has_images == true` maps to `"figure_only"`;
/// presumably callers derive `has_text` from post-OCR output — confirm.
///
/// # Arguments
///
/// * `class` - The `PageClass` from Phase 5.1 classification
/// * `ocr_succeeded` - Whether OCR recovered text (only consulted for `BrokenVector`)
/// * `has_text` - Whether the page contains any text glyphs
/// * `has_images` - Whether the page contains any images
///
/// # Returns
///
/// One of the six static strings of the 6.1 JSON schema enum: `"text"`,
/// `"scanned"`, `"mixed"`, `"broken_vector"`, `"blank"`, or `"figure_only"`.
///
/// # INV-9 Stable Taxonomy
///
/// The page_type strings are FROZEN by the 6.1 schema version. Changing any of
/// them requires a schema_version bump and a downstream migration plan; do not
/// edit this function without updating the JSON schema and plan.md.
pub fn page_type_string(
    class: PageClass,
    ocr_succeeded: bool,
    has_text: bool,
    has_images: bool,
) -> &'static str {
    match (has_text, has_images, class) {
        // Content-presence overrides: decided without looking at the class.
        (false, false, _) => "blank",
        (false, true, _) => "figure_only",
        // Text is present, so the class decides.
        (true, _, PageClass::Vector) => "text",
        (true, _, PageClass::Scanned) => "scanned",
        (true, _, PageClass::Hybrid) => "mixed",
        // Post-OCR recovery: a recovered BrokenVector page is reported as scanned.
        (true, _, PageClass::BrokenVector) if ocr_succeeded => "scanned",
        (true, _, PageClass::BrokenVector) => "broken_vector",
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Each variant serializes to its bare name as a JSON string and
    /// deserializes back to the same variant.
    #[test]
    fn test_serialize_deserialize_roundtrip() {
        let variants = [
            PageClass::Vector,
            PageClass::Scanned,
            PageClass::Hybrid,
            PageClass::BrokenVector,
        ];
        for variant in variants {
            // Exhaustive match: a newly added variant fails to compile here
            // until it is given an expected encoding.
            let expected = match variant {
                PageClass::Vector => "\"Vector\"",
                PageClass::Scanned => "\"Scanned\"",
                PageClass::Hybrid => "\"Hybrid\"",
                PageClass::BrokenVector => "\"BrokenVector\"",
            };

            // Serialize to JSON
            let json = serde_json::to_string(&variant).expect("serialize failed");
            assert_eq!(json, expected);

            // Deserialize roundtrip
            let deserialized: PageClass =
                serde_json::from_str(&json).expect("deserialize failed");
            assert_eq!(deserialized, variant);
        }
    }

    /// `PageClass` works as a `HashMap` key and hashes deterministically.
    #[test]
    fn test_pageclass_hashable() {
        use std::collections::hash_map::DefaultHasher;
        use std::collections::HashMap;
        use std::hash::{Hash, Hasher};

        // Verify Hash trait is implemented and usable as a map key
        let mut map: HashMap<PageClass, String> = HashMap::new();
        map.insert(PageClass::Vector, "text".to_string());
        map.insert(PageClass::Scanned, "scanned".to_string());
        map.insert(PageClass::Hybrid, "mixed".to_string());
        map.insert(PageClass::BrokenVector, "broken_vector".to_string());
        assert_eq!(map.len(), 4);
        assert_eq!(
            map.get(&PageClass::Vector).map(String::as_str),
            Some("text")
        );

        // Drive `Hash::hash` directly and read the digest back, so the check
        // asserts something: equal values must hash to equal digests.
        let digest = |class: PageClass| {
            let mut hasher = DefaultHasher::new();
            class.hash(&mut hasher);
            hasher.finish()
        };
        for variant in [
            PageClass::Vector,
            PageClass::Scanned,
            PageClass::Hybrid,
            PageClass::BrokenVector,
        ] {
            assert_eq!(digest(variant), digest(variant));
        }
    }
}
#[cfg(test)]
mod page_classification_tests {
    use super::*;

    /// `PageClassification::new(Vector, 0.85, None)` constructs successfully
    /// and stores its arguments unchanged.
    #[test]
    fn test_page_classification_new_vector() {
        let classification = PageClassification::new(PageClass::Vector, 0.85, None);
        assert_eq!(classification.class, PageClass::Vector);
        assert_eq!(classification.confidence, 0.85);
        assert!(classification.hybrid_cells.is_none());
    }

    /// A Hybrid classification serializes its cells as a sorted array of
    /// `[row, col]` pairs and survives a JSON roundtrip.
    #[test]
    fn test_page_classification_serialize_hybrid_with_cells() {
        let mut cells = BTreeSet::new();
        cells.insert((0, 0));
        cells.insert((1, 2));
        cells.insert((7, 7));
        // `cells` is compared against the deserialized value below, so this
        // clone is required.
        let classification = PageClassification::new(PageClass::Hybrid, 0.9, Some(cells.clone()));
        let json = serde_json::to_string(&classification).expect("serialize failed");

        // Verify JSON contains hybrid_cells array
        assert!(json.contains("\"hybrid_cells\""));
        assert!(json.contains("[[0,0],[1,2],[7,7]]"));

        // Deserialize roundtrip → equal
        let deserialized: PageClassification =
            serde_json::from_str(&json).expect("deserialize failed");
        assert_eq!(deserialized.class, PageClass::Hybrid);
        assert_eq!(deserialized.confidence, 0.9);
        assert_eq!(deserialized.hybrid_cells, Some(cells));
    }

    /// `hybrid_cells: None` is omitted from JSON output via
    /// `skip_serializing_if`, and the missing key deserializes back to `None`.
    #[test]
    fn test_page_classification_hybrid_cells_none_omitted_from_json() {
        let classification = PageClassification::new(PageClass::Vector, 0.85, None);
        let json = serde_json::to_string(&classification).expect("serialize failed");

        // Verify hybrid_cells key is NOT present in JSON
        assert!(!json.contains("hybrid_cells"));

        // Deserialize roundtrip still works (Option defaults to None)
        let deserialized: PageClassification =
            serde_json::from_str(&json).expect("deserialize failed");
        assert_eq!(deserialized, classification);
    }

    /// `debug_assert!` fires on confidence = 1.5. Compiled only in debug
    /// builds, where `debug_assert!` is active.
    #[test]
    #[should_panic(expected = "confidence must be in [0.0, 1.0]")]
    #[cfg(debug_assertions)]
    fn test_page_classification_debug_assert_fires_on_invalid_confidence() {
        // The constructor is `#[must_use]`; discard the value explicitly.
        let _ = PageClassification::new(PageClass::Vector, 1.5, None);
    }

    /// `BTreeSet` gives a deterministic (sorted) order in the serialized output
    /// regardless of insertion order.
    #[test]
    fn test_page_classification_btree_set_deterministic_order() {
        let mut cells = BTreeSet::new();
        cells.insert((7, 7));
        cells.insert((0, 0));
        cells.insert((3, 2));
        cells.insert((1, 5));
        let classification = PageClassification::new(PageClass::Hybrid, 0.9, Some(cells));
        let json = serde_json::to_string(&classification).expect("serialize failed");

        // BTreeSet iterates in sorted order, so JSON should have sorted cells.
        // Extract the cells array from JSON.
        let parsed: serde_json::Value = serde_json::from_str(&json).expect("parse failed");
        let cells_array = parsed["hybrid_cells"]
            .as_array()
            .expect("hybrid_cells should be array");

        // Verify sorted order: (0,0), (1,5), (3,2), (7,7)
        assert_eq!(cells_array[0], serde_json::json!([0, 0]));
        assert_eq!(cells_array[1], serde_json::json!([1, 5]));
        assert_eq!(cells_array[2], serde_json::json!([3, 2]));
        assert_eq!(cells_array[3], serde_json::json!([7, 7]));
    }

    /// Roundtrip: serialize → deserialize yields the original for every class,
    /// including a Hybrid page with an empty cell set.
    #[test]
    fn test_page_classification_roundtrip_all_variants() {
        let test_cases = [
            (PageClass::Vector, 0.85, None),
            (PageClass::Scanned, 0.72, None),
            (PageClass::BrokenVector, 0.60, None),
            (
                PageClass::Hybrid,
                0.90,
                Some(BTreeSet::from([(0, 0), (3, 3)])),
            ),
            (PageClass::Hybrid, 0.75, Some(BTreeSet::new())), // Empty cells
        ];
        for (class, confidence, hybrid_cells) in test_cases {
            // Each case is owned by this iteration and not used again, so the
            // cell set is moved into the constructor instead of cloned.
            let original = PageClassification::new(class, confidence, hybrid_cells);
            let json = serde_json::to_string(&original).expect("serialize failed");
            let deserialized: PageClassification =
                serde_json::from_str(&json).expect("deserialize failed");
            assert_eq!(deserialized.class, original.class);
            assert_eq!(deserialized.confidence, original.confidence);
            assert_eq!(deserialized.hybrid_cells, original.hybrid_cells);
        }
    }

    /// Documents the invariant that `hybrid_cells` should only be `Some` for
    /// the Hybrid class: the type system allows violations, but they
    /// represent bugs.
    #[test]
    fn test_page_classification_invariant_hybrid_cells_only_for_hybrid() {
        let vector_with_cells =
            PageClassification::new(PageClass::Vector, 0.8, Some(BTreeSet::from([(0, 0)])));

        // This is technically allowed by the type system but violates the invariant
        assert_eq!(vector_with_cells.class, PageClass::Vector);
        assert!(vector_with_cells.hybrid_cells.is_some());
        // In production code, callers should enforce: hybrid_cells.is_some() ⇔ class == Hybrid
    }
}
#[cfg(test)]
mod page_type_string_tests {
    use super::*;
    use std::collections::HashSet;

    /// All four classes, in declaration order.
    const ALL_CLASSES: [PageClass; 4] = [
        PageClass::Vector,
        PageClass::Scanned,
        PageClass::Hybrid,
        PageClass::BrokenVector,
    ];

    /// Both values of a boolean input, `false` first.
    const FLAGS: [bool; 2] = [false, true];

    /// Every `(class, ocr_succeeded, has_text, has_images)` combination:
    /// 4 classes × 2 × 2 × 2 = 32 cases.
    fn all_inputs() -> Vec<(PageClass, bool, bool, bool)> {
        let mut inputs = Vec::with_capacity(32);
        for class in ALL_CLASSES {
            for ocr_succeeded in FLAGS {
                for has_text in FLAGS {
                    for has_images in FLAGS {
                        inputs.push((class, ocr_succeeded, has_text, has_images));
                    }
                }
            }
        }
        inputs
    }

    /// AC: Vector → "text"
    #[test]
    fn test_page_type_string_vector() {
        for (ocr_succeeded, has_images) in [(false, false), (true, false), (false, true)] {
            assert_eq!(
                page_type_string(PageClass::Vector, ocr_succeeded, true, has_images),
                "text"
            );
        }
    }

    /// AC: Scanned → "scanned"
    #[test]
    fn test_page_type_string_scanned() {
        for ocr_succeeded in FLAGS {
            assert_eq!(
                page_type_string(PageClass::Scanned, ocr_succeeded, true, false),
                "scanned"
            );
        }
    }

    /// AC: Hybrid → "mixed"
    #[test]
    fn test_page_type_string_hybrid() {
        for ocr_succeeded in FLAGS {
            assert_eq!(
                page_type_string(PageClass::Hybrid, ocr_succeeded, true, true),
                "mixed"
            );
        }
    }

    /// AC: BrokenVector + ocr_succeeded=false → "broken_vector"
    #[test]
    fn test_page_type_string_broken_vector_ocr_failed() {
        for has_images in FLAGS {
            assert_eq!(
                page_type_string(PageClass::BrokenVector, false, true, has_images),
                "broken_vector"
            );
        }
    }

    /// AC: BrokenVector + ocr_succeeded=true → "scanned" (post-OCR recovery)
    #[test]
    fn test_page_type_string_broken_vector_ocr_succeeded() {
        for has_images in FLAGS {
            assert_eq!(
                page_type_string(PageClass::BrokenVector, true, true, has_images),
                "scanned"
            );
        }
    }

    /// AC: has_text=false + has_images=false → "blank" (overrides class)
    #[test]
    fn test_page_type_string_blank_override() {
        let cases = [
            (PageClass::Vector, false),
            (PageClass::Scanned, false),
            (PageClass::Hybrid, false),
            (PageClass::BrokenVector, false),
            (PageClass::BrokenVector, true),
        ];
        for (class, ocr_succeeded) in cases {
            assert_eq!(page_type_string(class, ocr_succeeded, false, false), "blank");
        }
    }

    /// AC: has_text=false + has_images=true → "figure_only" (overrides class)
    #[test]
    fn test_page_type_string_figure_only_override() {
        let cases = [
            (PageClass::Vector, false),
            (PageClass::Scanned, false),
            (PageClass::Hybrid, false),
            (PageClass::BrokenVector, false),
            (PageClass::BrokenVector, true),
        ];
        for (class, ocr_succeeded) in cases {
            assert_eq!(
                page_type_string(class, ocr_succeeded, false, true),
                "figure_only"
            );
        }
    }

    /// AC: Every combination from the mapping table produces the documented string.
    #[test]
    fn test_page_type_string_exhaustive_combinations() {
        for (class, ocr_succeeded, has_text, has_images) in all_inputs() {
            let result = page_type_string(class, ocr_succeeded, has_text, has_images);

            // Verify result is one of the six valid enum values
            assert!(
                matches!(
                    result,
                    "text" | "scanned" | "mixed" | "broken_vector" | "blank" | "figure_only"
                ),
                "Invalid page_type: '{}' for class={:?}, ocr={}, has_text={}, has_images={}",
                result,
                class,
                ocr_succeeded,
                has_text,
                has_images
            );

            // Expected value: override rules first, then the class-based mapping.
            let expected = match (has_text, has_images) {
                (false, false) => "blank",
                (false, true) => "figure_only",
                (true, _) => match class {
                    PageClass::Vector => "text",
                    PageClass::Scanned => "scanned",
                    PageClass::Hybrid => "mixed",
                    PageClass::BrokenVector if ocr_succeeded => "scanned",
                    PageClass::BrokenVector => "broken_vector",
                },
            };
            assert_eq!(result, expected);
        }
    }

    /// Schema list test asserting the page_type enum exactly equals
    /// { text, scanned, mixed, broken_vector, blank, figure_only }.
    #[test]
    fn test_page_type_enum_schema_set() {
        let expected = [
            "text",
            "scanned",
            "mixed",
            "broken_vector",
            "blank",
            "figure_only",
        ];

        // Collect every string page_type_string can produce.
        let found: HashSet<&'static str> = all_inputs()
            .into_iter()
            .map(|(class, ocr_succeeded, has_text, has_images)| {
                page_type_string(class, ocr_succeeded, has_text, has_images)
            })
            .collect();

        // Verify all expected values are present
        for expected_value in expected {
            assert!(
                found.contains(expected_value),
                "Expected page_type '{}' not found in output set",
                expected_value
            );
        }

        // Verify no unexpected values are present
        assert_eq!(
            found.len(),
            expected.len(),
            "page_type set has unexpected values: {:?}",
            found
        );
    }

    /// `as_type_str` maps each class to its schema string.
    #[test]
    fn test_page_class_as_type_str() {
        let cases = [
            (PageClass::Vector, "text"),
            (PageClass::Scanned, "scanned"),
            (PageClass::Hybrid, "mixed"),
            (PageClass::BrokenVector, "broken_vector"),
        ];
        for (class, expected) in cases {
            assert_eq!(class.as_type_str(), expected);
        }
    }

    /// AC: only Vector pages can escalate to BrokenVector; Scanned and Hybrid
    /// cannot, and BrokenVector is already there.
    #[test]
    fn test_page_class_can_escalate_to_broken_vector() {
        let cases = [
            (PageClass::Vector, true),
            (PageClass::Scanned, false),
            (PageClass::Hybrid, false),
            (PageClass::BrokenVector, false),
        ];
        for (class, expected) in cases {
            assert_eq!(class.can_escalate_to_broken_vector(), expected);
        }
    }
}