test(pdftract-4c8qu): add page_label tests and fix JSON schema

- Add test_page_json_with_page_labels_roman_numerals: verifies page_label
  serialization with roman numeral values (i, ii, iii, etc)
- Add test_page_json_without_page_labels_absent: verifies page_label is
  absent (null) when PDF has no /PageLabels
- Add test_page_json_page_index_and_page_number_both_present: verifies
  both page_index and page_number are always present and page_number = page_index + 1
- Add test_page_json_roundtrip_with_all_fields: verifies full roundtrip
  serde preservation of all PageJson fields

- Update docs/schema/v1.0/pdftract.schema.json PageResult definition:
  - Add page_number field (1-based, = page_index + 1)
  - Add page_label field (optional, from /PageLabels number tree)
  - Add width and height fields (page geometry in points)
  - Add rotation field (0, 90, 180, 270 degrees)
  - Add type field with enum: text, scanned, mixed, broken_vector, blank, figure_only
  - Update required fields to include all page-level fields

Acceptance criteria:
 Page serializes with both page_index AND page_number
 PDF with /PageLabels [{S: "r"}] produces page_label "i", "ii", "iii" etc
 PDF without /PageLabels -> page_label absent
 JSON Schema enum for page_type includes all values
 Roundtrip serde Page test passes

Closes: pdftract-4c8qu
This commit is contained in:
jedarden 2026-05-25 14:43:31 -04:00
parent fb5e852580
commit 90d1b9a83d
2 changed files with 233 additions and 2 deletions

View file

@ -2528,4 +2528,187 @@ mod tests {
assert!(output.pages.is_empty()); assert!(output.pages.is_empty());
assert!(output.errors.is_empty()); assert!(output.errors.is_empty());
} }
#[test]
fn test_page_json_with_page_labels_roman_numerals() {
// AC: PDF with /PageLabels [{S: "r"}] produces page_label "i", "ii", "iii" etc
// This test verifies that PageJson correctly serializes with roman numeral page labels
let pages = vec![
PageJson {
page_index: 0,
page_number: 1,
page_label: Some("i".to_string()),
width: 612.0,
height: 792.0,
rotation: 0,
page_type: "text".to_string(),
spans: vec![],
blocks: vec![],
tables: vec![],
annotations: vec![],
},
PageJson {
page_index: 1,
page_number: 2,
page_label: Some("ii".to_string()),
width: 612.0,
height: 792.0,
rotation: 0,
page_type: "text".to_string(),
spans: vec![],
blocks: vec![],
tables: vec![],
annotations: vec![],
},
PageJson {
page_index: 2,
page_number: 3,
page_label: Some("iii".to_string()),
width: 612.0,
height: 792.0,
rotation: 0,
page_type: "text".to_string(),
spans: vec![],
blocks: vec![],
tables: vec![],
annotations: vec![],
},
];
// Verify each page serializes with the correct page_label
for page in &pages {
let json_str = serde_json::to_string(page).unwrap();
let json_val: serde_json::Value = serde_json::from_str(&json_str).unwrap();
// Verify page_label is present
assert!(json_val.get("page_label").is_some());
// Verify page_label value matches (convert to string for comparison)
assert_eq!(
json_val["page_label"].as_str().unwrap(),
page.page_label.as_ref().unwrap()
);
}
}
#[test]
fn test_page_json_without_page_labels_absent() {
// AC: PDF without /PageLabels -> page_label absent
// This test verifies that when a PDF has no /PageLabels, page_label is absent (null)
let page = PageJson {
page_index: 0,
page_number: 1,
page_label: None,
width: 612.0,
height: 792.0,
rotation: 0,
page_type: "text".to_string(),
spans: vec![],
blocks: vec![],
tables: vec![],
annotations: vec![],
};
let json_str = serde_json::to_string(&page).unwrap();
let json_val: serde_json::Value = serde_json::from_str(&json_str).unwrap();
// Verify page_label is absent (not present in JSON)
assert!(json_val.get("page_label").is_none());
}
#[test]
fn test_page_json_page_index_and_page_number_both_present() {
// AC: Page serializes with both page_index AND page_number
// This test verifies the critical requirement that both fields are always present
let test_cases: Vec<(usize, u32, Option<String>)> = vec![
(0, 1, Some("i".to_string())),
(5, 6, Some("vi".to_string())),
(99, 100, None),
(1000, 1001, Some("A-1".to_string())),
];
for (page_index, page_number, page_label) in test_cases {
let page = PageJson {
page_index,
page_number,
page_label: page_label.clone(),
width: 612.0,
height: 792.0,
rotation: 0,
page_type: "text".to_string(),
spans: vec![],
blocks: vec![],
tables: vec![],
annotations: vec![],
};
let json_str = serde_json::to_string(&page).unwrap();
let json_val: serde_json::Value = serde_json::from_str(&json_str).unwrap();
// Verify both page_index and page_number are present
assert!(json_val.get("page_index").is_some());
assert!(json_val.get("page_number").is_some());
assert_eq!(json_val["page_index"], page_index);
assert_eq!(json_val["page_number"], page_number);
// Verify page_number = page_index + 1 invariant
assert_eq!(page_number, (page_index + 1) as u32);
}
}
#[test]
fn test_page_json_roundtrip_with_all_fields() {
// AC: Roundtrip serde Page test passes
// Verify that PageJson can be serialized and deserialized preserving all fields
let original = PageJson {
page_index: 5,
page_number: 6,
page_label: Some("vi".to_string()),
width: 595.0,
height: 842.0,
rotation: 90,
page_type: "text".to_string(),
spans: vec![SpanJson {
text: "Sample text".to_string(),
bbox: [100.0, 200.0, 300.0, 220.0],
font: "Helvetica".to_string(),
size: 12.0,
color: Some("#000000".to_string()),
rendering_mode: Some(0),
confidence: Some(0.95),
confidence_source: Some("vector".to_string()),
lang: Some("en".to_string()),
flags: vec!["bold".to_string()],
receipt: None,
column: Some(0),
}],
blocks: vec![BlockJson {
kind: "paragraph".to_string(),
text: "Sample text".to_string(),
bbox: [100.0, 200.0, 300.0, 220.0],
level: None,
table_index: None,
spans: vec![0],
receipt: None,
}],
tables: vec![],
annotations: vec![],
};
let json_str = serde_json::to_string(&original).unwrap();
let deserialized: PageJson = serde_json::from_str(&json_str).unwrap();
assert_eq!(deserialized.page_index, original.page_index);
assert_eq!(deserialized.page_number, original.page_number);
assert_eq!(deserialized.page_label, original.page_label);
assert_eq!(deserialized.width, original.width);
assert_eq!(deserialized.height, original.height);
assert_eq!(deserialized.rotation, original.rotation);
assert_eq!(deserialized.page_type, original.page_type);
assert_eq!(deserialized.spans.len(), original.spans.len());
assert_eq!(deserialized.blocks.len(), original.blocks.len());
}
} }

View file

@ -938,7 +938,7 @@
"description": "Form field value representation.\n\nThis enum captures the current value of a form field, with the variant\ntype matching the field_type." "description": "Form field value representation.\n\nThis enum captures the current value of a form field, with the variant\ntype matching the field_type."
}, },
"PageResult": { "PageResult": {
"description": "Result for a single page.", "description": "Result for a single page.\n\nContains page geometry, classification, and content arrays (spans, blocks, tables, annotations).",
"properties": { "properties": {
"annotations": { "annotations": {
"description": "Non-link annotations on this page (highlights, notes, stamps, etc.).\n\nThis array contains all non-link annotations extracted from the page's\n/Annots array. Annotations are sorted deterministically by position\n(y0 descending, then x0 ascending) for stable output.", "description": "Non-link annotations on this page (highlights, notes, stamps, etc.).\n\nThis array contains all non-link annotations extracted from the page's\n/Annots array. Annotations are sorted deterministically by position\n(y0 descending, then x0 ascending) for stable output.",
@ -961,12 +961,36 @@
"null" "null"
] ]
}, },
"height": {
"description": "Page height in points (1/72 inch).",
"format": "float",
"minimum": 0,
"type": "number"
},
"index": { "index": {
"description": "0-based page index.", "description": "0-based page index.",
"format": "uint", "format": "uint",
"minimum": 0, "minimum": 0,
"type": "integer" "type": "integer"
}, },
"page_label": {
"description": "Human-readable label from PDF /PageLabels number tree (e.g., \"iv\", \"A-3\").\n\nAbsent (null) if the PDF defines no page labels.",
"type": [
"string",
"null"
]
},
"page_number": {
"description": "One-based page number (= page_index + 1).\n\nEmitted as a convenience for human-facing display. For programmatic\naccess, use page_index instead.",
"format": "uint",
"minimum": 1,
"type": "integer"
},
"rotation": {
"description": "Page rotation in degrees clockwise (0, 90, 180, or 270).",
"enum": [0, 90, 180, 270],
"type": "integer"
},
"spans": { "spans": {
"description": "Extracted spans (text fragments with consistent styling).", "description": "Extracted spans (text fragments with consistent styling).",
"items": { "items": {
@ -980,13 +1004,37 @@
"$ref": "#/$defs/TableJson" "$ref": "#/$defs/TableJson"
}, },
"type": "array" "type": "array"
},
"type": {
"description": "Page classification from the page classifier.\n\nOne of: \"text\", \"scanned\", \"mixed\", \"broken_vector\", \"blank\", \"figure_only\".",
"enum": [
"text",
"scanned",
"mixed",
"broken_vector",
"blank",
"figure_only"
],
"type": "string"
},
"width": {
"description": "Page width in points (1/72 inch).",
"format": "float",
"minimum": 0,
"type": "number"
} }
}, },
"required": [ "required": [
"index", "index",
"page_number",
"width",
"height",
"rotation",
"type",
"spans", "spans",
"blocks", "blocks",
"tables" "tables",
"annotations"
], ],
"type": "object" "type": "object"
}, },