feat(pdftract-1ob): implement page_type_string in page_class module
Per bead pdftract-1ob acceptance criteria:

- Add page_type_string function to page_class.rs that implements the stable
  mapping from (PageClass, ocr_succeeded, has_text, has_images) to page_type
  JSON enum values per Phase 5.1.1 spec
- Add PageClass impl with as_type_str() and can_escalate_to_broken_vector()
  methods
- Re-export PageClassification and page_type_string from lib.rs
- Add comprehensive unit tests:
  * test_page_type_string_*: tests for each PageClass variant and override cases
  * test_page_type_string_exhaustive_combinations: validates all 32 combinations
  * test_page_type_enum_schema_set: verifies output equals the 6 schema values
  * test_page_class_as_type_str: tests as_type_str method
  * test_page_class_can_escalate_to_broken_vector: tests escalation eligibility

Closes: pdftract-1ob
This commit is contained in:
parent
fce3a75526
commit
d0ea4a7085
2 changed files with 354 additions and 1 deletions
|
|
@ -62,7 +62,7 @@ pub use markdown::{
|
|||
block_to_markdown, form_fields_to_markdown, page_to_markdown, parse_anchors, Anchor,
|
||||
};
|
||||
pub use options::{ExtractionOptions, ReceiptsMode};
|
||||
pub use page_class::PageClass;
|
||||
pub use page_class::{page_type_string, PageClass, PageClassification};
|
||||
pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX};
|
||||
pub use schema::{BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson};
|
||||
pub use table::{GridCandidate, PageContext as TablePageContext, TableDetector};
|
||||
|
|
|
|||
|
|
@ -99,6 +99,106 @@ pub enum PageClass {
|
|||
BrokenVector,
|
||||
}
|
||||
|
||||
impl PageClass {
|
||||
/// Returns the JSON output string for this page type.
|
||||
///
|
||||
/// Maps internal enum values to the schema's `page_type` field.
|
||||
pub fn as_type_str(&self) -> &'static str {
|
||||
match self {
|
||||
PageClass::Vector => "text",
|
||||
PageClass::Scanned => "scanned",
|
||||
PageClass::Hybrid => "mixed",
|
||||
PageClass::BrokenVector => "broken_vector",
|
||||
}
|
||||
}
|
||||
|
||||
/// Check if this page class is eligible for BrokenVector escalation.
|
||||
///
|
||||
/// Only Vector pages can be escalated to BrokenVector based on readability.
|
||||
/// Scanned and Hybrid pages are already handled by other paths.
|
||||
pub fn can_escalate_to_broken_vector(&self) -> bool {
|
||||
matches!(self, PageClass::Vector)
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the canonical page_type string for the JSON schema output.
|
||||
///
|
||||
/// This function implements the stable mapping from (PageClass, ocr_succeeded, has_text, has_images)
|
||||
/// to the page_type string emitted in the 6.1 JSON schema. The mapping is frozen per INV-9.
|
||||
///
|
||||
/// # Mapping Table
|
||||
///
|
||||
/// | class | ocr_succeeded | has_text | has_images | page_type |
|
||||
/// |-----------------|---------------|----------|------------|------------------|
|
||||
/// | Vector | - | - | - | "text" |
|
||||
/// | Scanned | - | - | - | "scanned" |
|
||||
/// | Hybrid | - | - | - | "mixed" |
|
||||
/// | BrokenVector | false | - | - | "broken_vector" |
|
||||
/// | BrokenVector | true | - | - | "scanned" | // post-OCR recovery
|
||||
/// | (any) | - | false | false | "blank" | // overrides class
|
||||
/// | (any) | - | false | true | "figure_only" | // overrides class
|
||||
///
|
||||
/// # Precedence Rules
|
||||
///
|
||||
/// 1. **Override checks first**: If `has_text == false` and `has_images == false`, return "blank".
|
||||
/// If `has_text == false` and `has_images == true`, return "figure_only".
|
||||
/// These overrides apply regardless of the PageClass value.
|
||||
/// 2. **Class-based mapping**: If no override applies, map based on PageClass:
|
||||
/// - Vector → "text"
|
||||
/// - Scanned → "scanned"
|
||||
/// - Hybrid → "mixed"
|
||||
/// - BrokenVector with `ocr_succeeded == true` → "scanned" (post-OCR recovery)
|
||||
/// - BrokenVector with `ocr_succeeded == false` → "broken_vector"
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `class` - The PageClass from Phase 5.1 classification
|
||||
/// * `ocr_succeeded` - Whether OCR successfully recovered text (only relevant for BrokenVector)
|
||||
/// * `has_text` - Whether the page contains any text glyphs
|
||||
/// * `has_images` - Whether the page contains any images
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// The canonical page_type string as a static str. This string is guaranteed to be
|
||||
/// one of the six values in the 6.1 JSON schema enum: "text", "scanned", "mixed",
|
||||
/// "broken_vector", "blank", or "figure_only".
|
||||
///
|
||||
/// # INV-9 Stable Taxonomy
|
||||
///
|
||||
/// The page_type strings are FROZEN by the 6.1 schema version. Any change requires
|
||||
/// a schema_version bump and a downstream migration plan. Do not modify this function
|
||||
/// without updating the JSON schema and plan.md.
|
||||
pub fn page_type_string(
|
||||
class: PageClass,
|
||||
ocr_succeeded: bool,
|
||||
has_text: bool,
|
||||
has_images: bool,
|
||||
) -> &'static str {
|
||||
// Override checks take precedence over class-based mapping.
|
||||
// These represent the "blank" and "figure_only" page types which are
|
||||
// determined solely by content presence, not by classification.
|
||||
if !has_text && !has_images {
|
||||
return "blank";
|
||||
}
|
||||
if !has_text && has_images {
|
||||
return "figure_only";
|
||||
}
|
||||
|
||||
// Class-based mapping (applies when has_text == true or the override didn't match).
|
||||
match class {
|
||||
PageClass::Vector => "text",
|
||||
PageClass::Scanned => "scanned",
|
||||
PageClass::Hybrid => "mixed",
|
||||
PageClass::BrokenVector => {
|
||||
if ocr_succeeded {
|
||||
"scanned" // Post-OCR recovery: treated as scanned
|
||||
} else {
|
||||
"broken_vector"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
|
@ -279,3 +379,256 @@ mod page_classification_tests {
|
|||
// In production code, callers should enforce: hybrid_cells.is_some() ⇔ class == Hybrid
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod page_type_string_tests {
    use super::*;

    /// The four `PageClass` variants exercised by the combination tests.
    const ALL_CLASSES: [PageClass; 4] = [
        PageClass::Vector,
        PageClass::Scanned,
        PageClass::Hybrid,
        PageClass::BrokenVector,
    ];

    /// Invokes `visit` once for every input combination:
    /// 4 classes × 2 ocr_succeeded × 2 has_text × 2 has_images = 32 cases.
    ///
    /// Shared by the exhaustive-mapping test and the schema-set test so the
    /// enumeration is written (and kept in sync) in exactly one place.
    fn for_each_combination(mut visit: impl FnMut(PageClass, bool, bool, bool)) {
        for &class in &ALL_CLASSES {
            for &ocr_succeeded in &[false, true] {
                for &has_text in &[false, true] {
                    for &has_images in &[false, true] {
                        visit(class, ocr_succeeded, has_text, has_images);
                    }
                }
            }
        }
    }

    /// Asserts that every `(class, ocr_succeeded, has_text, has_images)` case
    /// maps to `expected`, naming the offending inputs on failure.
    fn assert_page_type(expected: &str, cases: &[(PageClass, bool, bool, bool)]) {
        for &(class, ocr_succeeded, has_text, has_images) in cases {
            assert_eq!(
                page_type_string(class, ocr_succeeded, has_text, has_images),
                expected,
                "class={:?}, ocr={}, has_text={}, has_images={}",
                class,
                ocr_succeeded,
                has_text,
                has_images
            );
        }
    }

    #[test]
    fn test_page_type_string_vector() {
        // AC: Vector → "text"
        assert_page_type(
            "text",
            &[
                (PageClass::Vector, false, true, false),
                (PageClass::Vector, true, true, false),
                (PageClass::Vector, false, true, true),
            ],
        );
    }

    #[test]
    fn test_page_type_string_scanned() {
        // AC: Scanned → "scanned"
        assert_page_type(
            "scanned",
            &[
                (PageClass::Scanned, false, true, false),
                (PageClass::Scanned, true, true, false),
            ],
        );
    }

    #[test]
    fn test_page_type_string_hybrid() {
        // AC: Hybrid → "mixed"
        assert_page_type(
            "mixed",
            &[
                (PageClass::Hybrid, false, true, true),
                (PageClass::Hybrid, true, true, true),
            ],
        );
    }

    #[test]
    fn test_page_type_string_broken_vector_ocr_failed() {
        // AC: BrokenVector + ocr_succeeded=false → "broken_vector"
        assert_page_type(
            "broken_vector",
            &[
                (PageClass::BrokenVector, false, true, false),
                (PageClass::BrokenVector, false, true, true),
            ],
        );
    }

    #[test]
    fn test_page_type_string_broken_vector_ocr_succeeded() {
        // AC: BrokenVector + ocr_succeeded=true → "scanned" (post-OCR recovery)
        assert_page_type(
            "scanned",
            &[
                (PageClass::BrokenVector, true, true, false),
                (PageClass::BrokenVector, true, true, true),
            ],
        );
    }

    #[test]
    fn test_page_type_string_blank_override() {
        // AC: has_text=false + has_images=false → "blank" (overrides class)
        assert_page_type(
            "blank",
            &[
                (PageClass::Vector, false, false, false),
                (PageClass::Scanned, false, false, false),
                (PageClass::Hybrid, false, false, false),
                (PageClass::BrokenVector, false, false, false),
                (PageClass::BrokenVector, true, false, false),
            ],
        );
    }

    #[test]
    fn test_page_type_string_figure_only_override() {
        // AC: has_text=false + has_images=true → "figure_only" (overrides class)
        assert_page_type(
            "figure_only",
            &[
                (PageClass::Vector, false, false, true),
                (PageClass::Scanned, false, false, true),
                (PageClass::Hybrid, false, false, true),
                (PageClass::BrokenVector, false, false, true),
                (PageClass::BrokenVector, true, false, true),
            ],
        );
    }

    #[test]
    fn test_page_type_string_exhaustive_combinations() {
        // AC: Every combination from the mapping table produces the documented string
        for_each_combination(|class, ocr_succeeded, has_text, has_images| {
            let result = page_type_string(class, ocr_succeeded, has_text, has_images);

            // Verify result is one of the six valid enum values
            assert!(
                matches!(
                    result,
                    "text" | "scanned" | "mixed" | "broken_vector" | "blank" | "figure_only"
                ),
                "Invalid page_type: '{}' for class={:?}, ocr={}, has_text={}, has_images={}",
                result,
                class,
                ocr_succeeded,
                has_text,
                has_images
            );

            // Expected value per the documented precedence: content overrides
            // first, then the class-based mapping.
            let expected = match (has_text, has_images, class, ocr_succeeded) {
                (false, false, _, _) => "blank",
                (false, true, _, _) => "figure_only",
                (true, _, PageClass::Vector, _) => "text",
                (true, _, PageClass::Scanned, _) => "scanned",
                (true, _, PageClass::Hybrid, _) => "mixed",
                (true, _, PageClass::BrokenVector, true) => "scanned",
                (true, _, PageClass::BrokenVector, false) => "broken_vector",
            };
            assert_eq!(
                result, expected,
                "class={:?}, ocr={}, has_text={}, has_images={}",
                class, ocr_succeeded, has_text, has_images
            );
        });
    }

    #[test]
    fn test_page_type_enum_schema_set() {
        // Schema list test asserting page_type enum exactly equals
        // { text, scanned, mixed, broken_vector, blank, figure_only }
        let expected = [
            "text",
            "scanned",
            "mixed",
            "broken_vector",
            "blank",
            "figure_only",
        ];

        // Collect every distinct value page_type_string can produce
        let mut found = std::collections::HashSet::new();
        for_each_combination(|class, ocr_succeeded, has_text, has_images| {
            found.insert(page_type_string(class, ocr_succeeded, has_text, has_images));
        });

        // Verify all expected values are present
        for &expected_value in &expected {
            assert!(
                found.contains(expected_value),
                "Expected page_type '{}' not found in output set",
                expected_value
            );
        }

        // Verify no unexpected values are present: with every expected value
        // found, equal sizes imply the two sets are identical.
        assert_eq!(
            found.len(),
            expected.len(),
            "page_type set has unexpected values: {:?}",
            found
        );
    }

    #[test]
    fn test_page_class_as_type_str() {
        assert_eq!(PageClass::Vector.as_type_str(), "text");
        assert_eq!(PageClass::Scanned.as_type_str(), "scanned");
        assert_eq!(PageClass::Hybrid.as_type_str(), "mixed");
        assert_eq!(PageClass::BrokenVector.as_type_str(), "broken_vector");
    }

    #[test]
    fn test_page_class_can_escalate_to_broken_vector() {
        // AC: Vector pages can escalate to BrokenVector
        assert!(PageClass::Vector.can_escalate_to_broken_vector());
        // AC: Scanned pages cannot escalate
        assert!(!PageClass::Scanned.can_escalate_to_broken_vector());
        // AC: Hybrid pages cannot escalate
        assert!(!PageClass::Hybrid.can_escalate_to_broken_vector());
        // AC: BrokenVector pages cannot escalate (already there)
        assert!(!PageClass::BrokenVector.can_escalate_to_broken_vector());
    }
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue