feat(pdftract-2ix9u): implement PageClass enum

Add the four canonical page classification variants (Vector, Scanned,
Hybrid, BrokenVector) with full serde support and Hash derive for use
in cache keying and routing tables.

Per INV-9 (stable taxonomy), these four variants are the complete set;
adding new variants requires a schema_version bump and an ADR.

Acceptance criteria:
- PASS: pdftract-core compiles with the new module
- PASS: Unit test serialize/deserialize roundtrip for each variant
- PASS: Unit test verifies PageClass is hashable and usable in HashMap
- PASS: Module docstring cites INV-9

Closes: pdftract-2ix9u
This commit is contained in:
jedarden 2026-05-25 01:07:08 -04:00
parent 616661295c
commit 4f39a9b46c
2 changed files with 100 additions and 0 deletions

View file

@ -26,6 +26,7 @@ pub mod markdown;
#[cfg(feature = "ocr")]
pub mod ocr;
pub mod options;
pub mod page_class;
pub mod parser;
#[cfg(feature = "ocr")]
pub mod preprocess;
@ -61,6 +62,7 @@ pub use markdown::{
block_to_markdown, form_fields_to_markdown, page_to_markdown, parse_anchors, Anchor,
};
pub use options::{ExtractionOptions, ReceiptsMode};
pub use page_class::PageClass;
pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX};
pub use schema::{BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson};
pub use table::{GridCandidate, PageContext as TablePageContext, TableDetector};

View file

@ -0,0 +1,98 @@
//! Page classification enum.
//!
//! This module defines the four canonical page classes used throughout pdftract's
//! extraction pipeline. Per **INV-9 (stable taxonomy)**, these four variants are the
//! complete and stable set; adding new variants requires a schema_version bump and
//! an ADR.
//!
//! The `PageClass` enum drives routing decisions in Phase 5:
//! - `Vector`: Clean text PDF page; extracted via content-stream parsing
//! - `Scanned`: Image-only page; requires OCR
//! - `Hybrid`: Page with mixed text and image regions; requires hybrid extraction
//! - `BrokenVector`: Page whose text has encoding issues (e.g., invisible text layer
//!   over a scan); may escalate to OCR
//!
//! # Serde representation
//!
//! The enum serializes to the variant name verbatim (`Vector`, `Scanned`, `Hybrid`,
//! `BrokenVector`). This internal representation is distinct from the `page_type`
//! strings emitted in JSON output (see Phase 5.1.1 page_type mapping table).
use serde::{Deserialize, Serialize};
/// The four canonical page classes.
///
/// Per INV-9 (stable taxonomy), this enum is fixed at these four variants.
/// Adding new variants requires a schema_version bump and an ADR.
///
/// # Serialization
///
/// `Serialize` and `Deserialize` are derived with no `#[serde(...)]` attributes,
/// so each variant is represented by its Rust identifier. The unit tests in this
/// module pin the JSON form of every variant (for example `"BrokenVector"`).
///
/// # Hash
///
/// This type derives `Hash` (alongside `PartialEq`/`Eq`) so it can be used as a
/// key in `HashMap` and `HashSet`, which is required for Phase 6.9 cache keying
/// and Phase 5 routing tables.
///
/// # Copy
///
/// The enum has only unit variants and derives `Copy`, so values can be passed
/// and stored by value without cloning.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum PageClass {
/// Clean vector PDF with readable text encoding.
Vector,
/// Image-only page requiring OCR.
Scanned,
/// Mixed page with both vector text and image regions.
Hybrid,
/// Text present but encoding is broken (e.g., invisible text over scanned image).
BrokenVector,
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::hash_map::DefaultHasher;
    use std::collections::{HashMap, HashSet};
    use std::hash::{Hash, Hasher};

    /// Every `PageClass` variant, listed exactly once.
    ///
    /// Shared by the tests below so that each of them covers the full taxonomy.
    const ALL_VARIANTS: [PageClass; 4] = [
        PageClass::Vector,
        PageClass::Scanned,
        PageClass::Hybrid,
        PageClass::BrokenVector,
    ];

    /// Feeds `class` into a freshly constructed `DefaultHasher` and returns the
    /// finished 64-bit digest.
    fn digest(class: PageClass) -> u64 {
        let mut hasher = DefaultHasher::new();
        class.hash(&mut hasher);
        hasher.finish()
    }

    /// Each variant must serialize to its quoted variant name and deserialize
    /// back to the same value.
    #[test]
    fn test_serialize_deserialize_roundtrip() {
        for variant in ALL_VARIANTS {
            // Exhaustive match (no `_` arm): adding a variant without pinning its
            // wire name here becomes a compile error.
            let expected = match variant {
                PageClass::Vector => "\"Vector\"",
                PageClass::Scanned => "\"Scanned\"",
                PageClass::Hybrid => "\"Hybrid\"",
                PageClass::BrokenVector => "\"BrokenVector\"",
            };

            // Serialize to JSON
            let json = serde_json::to_string(&variant).expect("serialize failed");
            assert_eq!(json, expected);

            // Deserialize roundtrip
            let deserialized: PageClass = serde_json::from_str(&json).expect("deserialize failed");
            assert_eq!(deserialized, variant);
        }
    }

    /// `PageClass` must be usable as a `HashMap`/`HashSet` key, and its `Hash`
    /// implementation must agree with `Eq`.
    #[test]
    fn test_pageclass_hashable() {
        // Verify Hash trait is implemented and usable as a map key
        let mut map: HashMap<PageClass, String> = HashMap::new();
        map.insert(PageClass::Vector, "text".to_string());
        map.insert(PageClass::Scanned, "scanned".to_string());
        map.insert(PageClass::Hybrid, "mixed".to_string());
        map.insert(PageClass::BrokenVector, "broken_vector".to_string());
        assert_eq!(map.len(), 4);
        assert_eq!(map.get(&PageClass::Vector), Some(&"text".to_string()));

        // Inserting under an existing key replaces the value instead of growing
        // the map, which only holds if equal keys hash identically.
        let previous = map.insert(PageClass::Vector, "replaced".to_string());
        assert_eq!(previous.as_deref(), Some("text"));
        assert_eq!(map.len(), 4);

        // Equal values must produce equal digests (the `Hash`/`Eq` contract).
        for variant in ALL_VARIANTS {
            assert_eq!(digest(variant), digest(variant));
        }

        // Feeding every variant twice into a set must leave exactly four entries.
        let unique: HashSet<PageClass> = ALL_VARIANTS
            .iter()
            .chain(ALL_VARIANTS.iter())
            .copied()
            .collect();
        assert_eq!(unique.len(), ALL_VARIANTS.len());
    }
}