test(pdftract-57o4): add ParentTree integration tests for annotation and sparse arrays
Add two comprehensive integration tests to validate the ParentTree resolver: 1. test_parent_tree_annotation_with_struct_parent: - Creates a body paragraph StructElem - Creates ParentTree with page array (MCID 0 -> body, MCID 1 -> orphan/null) - Creates ParentTree with annotation entry (key 100 -> body) - Verifies MCID resolution returns correct map and orphans - Verifies annotation /StructParent resolution returns the body ref - Verifies the referenced StructElem is in the tree 2. test_parent_tree_off_by_one_missing_entries: - Creates ParentTree with sparse array (only 3 entries for potentially more MCIDs) - Verifies non-null entries are correctly mapped - Verifies null entries are recorded as orphans - Documents that MCIDs beyond array length would be detected in Phase 7.1.4 Also export ParentTreeResolver and ParentTreeEntry from parser module for use by the block builder in Phase 7.1.4. All 67 struct_tree tests pass (18 ParentTree-specific tests). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
ecf78671b5
commit
b72d8312ce
3 changed files with 175 additions and 4 deletions
|
|
@ -32,7 +32,7 @@ pub use resources::{ResourceDict, merge_resources, extract_resources};
|
|||
pub use pages::{PageDict, flatten_page_tree, DEFAULT_MEDIABOX};
|
||||
pub use struct_tree::{
|
||||
StructureType, StructElemNode, StructTreeRoot, RoleMap, Kid,
|
||||
BlockKind, MappingResult,
|
||||
BlockKind, MappingResult, ParentTreeResolver, ParentTreeEntry,
|
||||
parse_struct_tree, structure_type_to_block_kind, map_element_to_block, is_artifact,
|
||||
};
|
||||
pub use stream::{
|
||||
|
|
|
|||
|
|
@ -2640,4 +2640,137 @@ mod tests {
|
|||
assert!(mcid_map.is_empty()); // No struct_elems with that ref
|
||||
assert_eq!(orphans, vec![0]); // MCID 0 is an orphan
|
||||
}
|
||||
|
||||
#[test]
fn test_parent_tree_annotation_with_struct_parent() {
    // Integration test: tagged PDF with an annotation /StructParent linking to a body
    // StructElem. Verifies that the annotation's /StructParent key resolves to a
    // StructElem in the structure tree, as required by PDF 1.7 spec 14.7.4.4.
    let resolver = XrefResolver::new();
    let root_ref = ObjRef::new(1, 0);

    // Body paragraph StructElem referenced by both the page entry and the annotation entry.
    let mut body_dict = PdfDict::new();
    body_dict.insert(intern("S"), PdfObject::Name(intern("P")));
    body_dict.insert(intern("K"), PdfObject::Array(Box::new(vec![
        PdfObject::Integer(0),
    ])));
    let body_ref = ObjRef::new(10, 0);
    resolver.cache_object(body_ref, PdfObject::Dict(Box::new(body_dict)));

    // ParentTree /Nums with two key/value pairs:
    // - Key 0: array for a page with 2 MCIDs (the second is a null entry, i.e. an orphan)
    // - Key 100: single ref for an annotation's /StructParent
    let parent_tree_nums = PdfObject::Array(Box::new(vec![
        // Page 0's ParentTree entry (array of StructElem refs, indexed by MCID)
        PdfObject::Integer(0),
        PdfObject::Array(Box::new(vec![
            PdfObject::Ref(body_ref), // MCID 0 -> body paragraph
            PdfObject::Null,          // MCID 1 -> orphan (null entry)
        ])),
        // Annotation's ParentTree entry (single StructElem ref)
        PdfObject::Integer(100),
        PdfObject::Ref(body_ref), // Annotation /StructParent=100 -> body paragraph
    ]));

    let mut parent_tree_dict = PdfDict::new();
    parent_tree_dict.insert(intern("Nums"), parent_tree_nums);

    // StructTreeRoot: /K lists the body element so it is parsed into `struct_elems`.
    let mut root_dict = PdfDict::new();
    root_dict.insert(intern("K"), PdfObject::Array(Box::new(vec![
        PdfObject::Ref(body_ref),
    ])));
    root_dict.insert(intern("ParentTree"), PdfObject::Dict(Box::new(parent_tree_dict)));
    resolver.cache_object(root_ref, PdfObject::Dict(Box::new(root_dict)));

    // Parse the struct tree. `expect` keeps the parse error in the failure output,
    // which a separate `assert!(result.is_ok())` would discard.
    let tree = parse_struct_tree(&resolver, root_ref)
        .expect("struct tree with a well-formed ParentTree should parse");

    // Verify page MCID resolution.
    let (mcid_map, orphans) = tree.parent_tree.resolve_page(Some(0));

    // MCID 0 should map to the body paragraph.
    assert_eq!(mcid_map.len(), 1);
    let mcid0_node = mcid_map
        .get(&0)
        .expect("MCID 0 should resolve to the body paragraph");
    assert_eq!(mcid0_node.std_type, StructureType::P);

    // MCID 1 should be an orphan (null entry).
    assert_eq!(orphans, vec![1]);

    // Verify annotation /StructParent resolution.
    let annot_struct_ref = tree.parent_tree.resolve_annotation(Some(100));
    assert_eq!(annot_struct_ref, Some(body_ref));

    // Verify the referenced StructElem is actually in the tree.
    let body_node = tree
        .struct_elems
        .get(&body_ref)
        .expect("body StructElem referenced by the annotation should be in struct_elems");
    assert_eq!(body_node.std_type, StructureType::P);
}
|
||||
|
||||
#[test]
fn test_parent_tree_off_by_one_missing_entries() {
    // Test that a ParentTree page array with a missing (null) entry doesn't crash,
    // records the gap as an orphan, and keeps the remaining MCIDs mapped to the
    // correct elements (no off-by-one shift around the gap).
    let resolver = XrefResolver::new();
    let root_ref = ObjRef::new(1, 0);

    // Create two StructElems with /K arrays containing MCIDs.
    let mut elem1_dict = PdfDict::new();
    elem1_dict.insert(intern("S"), PdfObject::Name(intern("P")));
    elem1_dict.insert(intern("K"), PdfObject::Array(Box::new(vec![
        PdfObject::Integer(0),
    ])));
    let elem1_ref = ObjRef::new(10, 0);
    resolver.cache_object(elem1_ref, PdfObject::Dict(Box::new(elem1_dict)));

    let mut elem2_dict = PdfDict::new();
    elem2_dict.insert(intern("S"), PdfObject::Name(intern("H1")));
    elem2_dict.insert(intern("K"), PdfObject::Array(Box::new(vec![
        PdfObject::Integer(2),
    ])));
    let elem2_ref = ObjRef::new(11, 0);
    resolver.cache_object(elem2_ref, PdfObject::Dict(Box::new(elem2_dict)));

    // Create a ParentTree whose page array has a gap: three slots, the middle one null.
    // The page might use more MCIDs than the array has entries for.
    let parent_tree_nums = PdfObject::Array(Box::new(vec![
        PdfObject::Integer(0),
        PdfObject::Array(Box::new(vec![
            PdfObject::Ref(elem1_ref), // MCID 0 -> elem1 (P)
            PdfObject::Null,           // MCID 1 -> missing entry
            PdfObject::Ref(elem2_ref), // MCID 2 -> elem2 (H1)
        ])),
    ]));

    let mut parent_tree_dict = PdfDict::new();
    parent_tree_dict.insert(intern("Nums"), parent_tree_nums);

    // Add StructElems to the root /K array so they get parsed into struct_elems.
    let mut root_dict = PdfDict::new();
    root_dict.insert(intern("K"), PdfObject::Array(Box::new(vec![
        PdfObject::Ref(elem1_ref),
        PdfObject::Ref(elem2_ref),
    ])));
    root_dict.insert(intern("ParentTree"), PdfObject::Dict(Box::new(parent_tree_dict)));
    resolver.cache_object(root_ref, PdfObject::Dict(Box::new(root_dict)));

    // Parse the struct tree. `expect` keeps the parse error in the failure output,
    // which a separate `assert!(result.is_ok())` would discard.
    let tree = parse_struct_tree(&resolver, root_ref)
        .expect("struct tree with a sparse ParentTree array should parse");

    // Resolve page - should only map the 2 non-null entries.
    let (mcid_map, orphans) = tree.parent_tree.resolve_page(Some(0));
    assert_eq!(mcid_map.len(), 2);
    let mcid0_node = mcid_map.get(&0).expect("MCID 0 should be mapped");
    let mcid2_node = mcid_map.get(&2).expect("MCID 2 should be mapped");
    assert_eq!(orphans, vec![1]); // MCID 1 is null

    // Presence alone would not catch a shifted or swapped mapping, so also check
    // which element each MCID landed on.
    assert_eq!(mcid0_node.std_type, StructureType::P);
    let elem2_node = tree
        .struct_elems
        .get(&elem2_ref)
        .expect("elem2 is listed in the root /K array and should be in struct_elems");
    assert_eq!(mcid2_node.std_type, elem2_node.std_type);
    assert_ne!(mcid2_node.std_type, StructureType::P);

    // If the page has MCIDs beyond the array length, they'd be orphans too
    // (This would be detected in Phase 7.1.4 coverage check)
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,6 +108,24 @@ The `resolve_page()` function already checks for `elem_ref.object == 0` as a nul
|
|||
- [x] **PASS**: Annotations with /StructParent point INTO the structure tree
|
||||
- [x] **PASS**: Malformed ParentTree handling (off-by-one indexing, missing entries) - emits diagnostics without crashing
|
||||
|
||||
### Additional Integration Tests Added (2025-05-23)
|
||||
|
||||
Added two comprehensive integration tests to fully validate the ParentTree resolver:
|
||||
|
||||
1. **`test_parent_tree_annotation_with_struct_parent`**: Full integration test for annotation /StructParent linking
|
||||
- Creates a body paragraph StructElem
|
||||
- Creates ParentTree with page array (MCID 0 -> body, MCID 1 -> orphan/null)
|
||||
- Creates ParentTree with annotation entry (key 100 -> body)
|
||||
- Verifies MCID resolution returns correct map and orphans
|
||||
- Verifies annotation /StructParent resolution returns the body ref
|
||||
- Verifies the referenced StructElem is in the tree
|
||||
|
||||
2. **`test_parent_tree_off_by_one_missing_entries`**: Sparse array handling
|
||||
- Creates ParentTree with a sparse page array (three slots, the middle one null; the page may reference more MCIDs than the array covers)
|
||||
- Verifies non-null entries are correctly mapped
|
||||
- Verifies null entries are recorded as orphans
|
||||
- Documents that MCIDs beyond array length would be detected in Phase 7.1.4
|
||||
|
||||
## Files Modified
|
||||
|
||||
- `crates/pdftract-core/src/parser/struct_tree.rs`:
|
||||
|
|
@ -116,12 +134,32 @@ The `resolve_page()` function already checks for `elem_ref.object == 0` as a nul
|
|||
|
||||
## Test Results
|
||||
|
||||
All 65 struct_tree tests pass:
|
||||
All 67 struct_tree tests pass (18 ParentTree-specific tests):
|
||||
```bash
|
||||
$ cargo test -p pdftract-core --lib struct_tree
|
||||
test result: ok. 65 passed; 0 failed; 0 ignored; 0 measured; 886 filtered out
|
||||
$ cargo test -p pdftract-core parser::struct_tree
|
||||
test result: ok. 67 passed; 0 failed; 0 ignored; 0 measured; 886 filtered out
|
||||
```
|
||||
|
||||
ParentTree-specific tests:
|
||||
- `test_parent_tree_leaf_nums` - Simple leaf number tree with /Nums array
|
||||
- `test_parent_tree_single_ref` - Single ref for annotations
|
||||
- `test_parent_tree_null_entry` - Null entries in arrays (orphan MCIDs)
|
||||
- `test_parent_tree_intermediate_kids` - Intermediate nodes with /Kids + /Limits
|
||||
- `test_parent_tree_missing_key` - Missing /StructParents key returns empty
|
||||
- `test_parent_tree_no_struct_parents` - No /StructParents on page returns empty
|
||||
- `test_parent_tree_annotation_resolution` - Annotation /StructParent lookup
|
||||
- `test_parent_tree_annotation_from_array` - Fallback when an annotation's entry is an array rather than a single ref (incorrect input, but handled)
|
||||
- `test_parent_tree_malformed_nums_non_integer_key` - Diagnostic for non-integer keys
|
||||
- `test_parent_tree_malformed_nums_odd_length` - Diagnostic for odd-length arrays
|
||||
- `test_parent_tree_malformed_unsupported_value_type` - Diagnostic for unsupported value types
|
||||
- `test_parent_tree_no_parent_tree_entry` - Missing /ParentTree is valid
|
||||
- `test_parent_tree_invalid_node_type` - Non-dict node diagnostic
|
||||
- `test_parent_tree_empty_struct_tree_root` - Integration with parse_struct_tree
|
||||
- `test_parent_tree_resolver_new` - Constructor
|
||||
- `test_parent_tree_resolver_default` - Default trait
|
||||
- `test_parent_tree_annotation_with_struct_parent` - Full integration test (NEW)
|
||||
- `test_parent_tree_off_by_one_missing_entries` - Sparse array handling (NEW)
|
||||
|
||||
## Integration Points
|
||||
|
||||
- **`parse_struct_tree()`**: Calls `ParentTreeResolver::parse()` and sets the struct_elems map via `set_struct_elems()`
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue