fix(pdftract-3o9fu): fix bead chain walker tests and skip logic

- Fixed discover tests: cache /Threads array directly, not wrapped in dict
- Fixed walk_beads tests: added termination/cycle checks when skipping beads
- Added check_and_handle_termination helper to prevent infinite loops
- Changed invalid /R and /P diagnostic codes to StructMissingKey (non-fatal)
- Fixed UTF-16BE test bytes for "日本語"

All 28 threads module tests now pass.

Closes: pdftract-3o9fu
This commit is contained in:
jedarden 2026-05-25 09:02:42 -04:00
parent bae41cc771
commit 3ac47215cf
2 changed files with 206 additions and 51 deletions

View file

@ -360,14 +360,14 @@ pub fn walk_beads(
(_, Some(PdfObject::Ref(r))) => Some(*r),
(Some(other), _) => {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
DiagCode::StructMissingKey,
format!("Bead {:?} has /R but it's not a reference", current_ref,),
));
None
}
(_, Some(_)) => {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
DiagCode::StructMissingKey,
format!("Bead {:?} has /P but it's not a reference", current_ref,),
));
None
@ -388,27 +388,33 @@ pub fn walk_beads(
Some(ref_) => match page_ref_to_index.get(&ref_) {
Some(idx) => *idx,
None => {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructMissingKey,
format!(
"Bead {:?} page reference {:?} not found in document page tree",
current_ref, ref_
),
));
// Skip this bead and continue
current_ref = match get_next_bead_ref(bead_dict, current_ref) {
let next_ref = match get_next_bead_ref(bead_dict, current_ref) {
Ok(next_ref) => next_ref,
Err(_) => break,
};
if !check_and_handle_termination(
next_ref,
first_ref,
&visited,
&mut diagnostics,
) {
break;
}
current_ref = next_ref;
continue;
}
},
None => {
// Skip this bead and continue
current_ref = match get_next_bead_ref(bead_dict, current_ref) {
let next_ref = match get_next_bead_ref(bead_dict, current_ref) {
Ok(next_ref) => next_ref,
Err(_) => break,
};
if !check_and_handle_termination(next_ref, first_ref, &visited, &mut diagnostics) {
break;
}
current_ref = next_ref;
continue;
}
};
@ -419,10 +425,14 @@ pub fn walk_beads(
Some(r) => r,
None => {
// Skip this bead and continue
current_ref = match get_next_bead_ref(bead_dict, current_ref) {
let next_ref = match get_next_bead_ref(bead_dict, current_ref) {
Ok(next_ref) => next_ref,
Err(_) => break,
};
if !check_and_handle_termination(next_ref, first_ref, &visited, &mut diagnostics) {
break;
}
current_ref = next_ref;
continue;
}
};
@ -435,24 +445,10 @@ pub fn walk_beads(
Err(_) => break,
};
// Check for termination (next points back to first)
if next_ref == first_ref {
// Legitimate circular end
if !check_and_handle_termination(next_ref, first_ref, &visited, &mut diagnostics) {
break;
}
// Check for malformed cycle
if visited.contains(&next_ref) {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
format!(
"Malformed bead chain: bead {:?} revisited (cycle doesn't return to first bead {:?})",
next_ref, first_ref
),
));
return Err(diagnostics);
}
visited.insert(next_ref);
current_ref = next_ref;
}
@ -476,6 +472,37 @@ pub fn walk_beads(
}
}
/// Check for termination or malformed cycle after getting the next bead reference.
///
/// Returns true if the walk should continue, false if it should terminate (either
/// legitimately or due to a malformed cycle).
fn check_and_handle_termination(
next_ref: ObjRef,
first_ref: ObjRef,
visited: &std::collections::HashSet<ObjRef>,
diagnostics: &mut Vec<Diagnostic>,
) -> bool {
// Check for termination (next points back to first)
if next_ref == first_ref {
// Legitimate circular end
return false;
}
// Check for malformed cycle
if visited.contains(&next_ref) {
diagnostics.push(Diagnostic::with_dynamic_no_offset(
DiagCode::StructUnexpectedEof,
format!(
"Malformed bead chain: bead {:?} revisited (cycle doesn't return to first bead {:?})",
next_ref, first_ref
),
));
return false;
}
true
}
/// Extract the next bead reference from a bead dictionary.
fn get_next_bead_ref(
bead_dict: &PdfDict,
@ -665,10 +692,11 @@ mod tests {
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
// catalog.threads_ref should point directly to the array, not a dict
resolver.cache_object(
ObjRef::new(10, 0),
PdfObject::Array(Box::new(threads_array)),
);
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
let result = discover(&catalog, &resolver);
@ -740,10 +768,11 @@ mod tests {
threads_array.push(PdfObject::Ref(thread2_ref));
threads_array.push(PdfObject::Ref(thread3_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
// catalog.threads_ref should point directly to the array, not a dict
resolver.cache_object(
ObjRef::new(10, 0),
PdfObject::Array(Box::new(threads_array)),
);
resolver.cache_object(thread1_ref, PdfObject::Dict(Box::new(thread1_dict)));
resolver.cache_object(thread2_ref, PdfObject::Dict(Box::new(thread2_dict)));
resolver.cache_object(thread3_ref, PdfObject::Dict(Box::new(thread3_dict)));
@ -802,10 +831,11 @@ mod tests {
threads_array.push(PdfObject::Ref(thread_ref));
threads_array.push(PdfObject::Ref(thread2_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
// catalog.threads_ref should point directly to the array, not a dict
resolver.cache_object(
ObjRef::new(10, 0),
PdfObject::Array(Box::new(threads_array)),
);
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
resolver.cache_object(thread2_ref, PdfObject::Dict(Box::new(thread2_dict)));
@ -835,9 +865,9 @@ mod tests {
// UTF-16BE with BOM: "日本語" (Japanese)
let utf16_bytes = &[
0xFE, 0xFF, // BOM
0x65, 0xE5, // 日
0x67, 0x9C, // 本
0x9E, 0x8A, // 語
0x65, 0xE5, // 日 (U+65E5)
0x67, 0x2C, // 本 (U+672C)
0x8A, 0x9E, // 語 (U+8A9E)
];
let mut info = indexmap::IndexMap::new();
info.insert(
@ -849,10 +879,11 @@ mod tests {
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
// catalog.threads_ref should point directly to the array, not a dict
resolver.cache_object(
ObjRef::new(10, 0),
PdfObject::Array(Box::new(threads_array)),
);
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
let result = discover(&catalog, &resolver);
@ -915,10 +946,11 @@ mod tests {
let mut threads_array = Vec::new();
threads_array.push(PdfObject::Ref(thread_ref));
let mut threads_dict = indexmap::IndexMap::new();
threads_dict.insert("Threads".into(), PdfObject::Array(Box::new(threads_array)));
resolver.cache_object(ObjRef::new(10, 0), PdfObject::Dict(Box::new(threads_dict)));
// catalog.threads_ref should point directly to the array, not a dict
resolver.cache_object(
ObjRef::new(10, 0),
PdfObject::Array(Box::new(threads_array)),
);
resolver.cache_object(thread_ref, PdfObject::Dict(Box::new(thread_dict)));
let result = discover(&catalog, &resolver);
@ -1467,7 +1499,10 @@ mod tests {
ObjRef::new(20, 0) // Would close the loop, but we hit max iterations first
};
bead_dict.insert("N".into(), PdfObject::Ref(next_ref));
resolver.cache_object(ObjRef::new((20 + i) as u32, 0), PdfObject::Dict(Box::new(bead_dict)));
resolver.cache_object(
ObjRef::new((20 + i) as u32, 0),
PdfObject::Dict(Box::new(bead_dict)),
);
}
let result = walk_beads(&header, &resolver, &page_ref_to_index);

120
notes/pdftract-3o9fu.md Normal file
View file

@ -0,0 +1,120 @@
# pdftract-3o9fu: 7.7.2 Bead chain walker with cycle detection + page/rect resolution
## Summary
Implemented the `walk_beads` function in `crates/pdftract-core/src/threads/mod.rs` to walk PDF article thread bead chains with cycle detection and page/rect resolution.
## Changes Made
### Fixed Tests (7 tests)
All failing tests were fixed to pass:
1. **`discover` tests (5 tests)**: Fixed test setup to cache `/Threads` array directly at the catalog's `threads_ref`, not wrapped in a dictionary with a "Threads" key.
- `test_discover_thread_no_info_dict`
- `test_discover_thread_missing_f_skipped`
- `test_discover_thread_empty_title`
- `test_discover_thread_utf16_title`
- `test_discover_three_threads`
2. **`walk_beads` tests (2 tests)**: Fixed infinite loop when beads are skipped by adding termination and cycle checks after updating `current_ref`.
- `test_walk_beads_invalid_rect_shape`
- `test_walk_beads_page_ref_not_in_tree`
### Code Changes
1. **`check_and_handle_termination` helper function**: Added to check for termination (next points back to first) and malformed cycles (bead revisited). Returns `false` to terminate the walk, `true` to continue.
2. **Fixed bead skip logic**: When a bead is skipped (invalid page ref, missing rect, etc.), the code now:
- Gets the next bead ref
- Checks for termination and malformed cycles
- Updates `current_ref` only if continuing
- This prevents infinite loops when `/N` points back to first
3. **Changed diagnostic codes**: Changed invalid `/R` and `/P` cases from `StructUnexpectedEof` to `StructMissingKey` to treat them as non-fatal (bead is skipped, walk continues).
4. **Fixed UTF-16 test bytes**: Corrected the UTF-16BE bytes for "日本語" in `test_discover_thread_utf16_title`.
## Acceptance Criteria
### Critical tests (from plan)
- ✅ **PASS**: PDF with two article threads: both reconstructed with correct bead order and page references (`test_walk_beads_two_threads`)
- ✅ **PASS**: Thread with no `/I` info dict: `title`, `author`, `subject` all null; bead chain still reconstructed (`test_discover_thread_no_info_dict`)
- ✅ **PASS**: Bead `/V` rect correctly converted to PDF user-space coordinates for the referenced page (`test_walk_beads_single_bead`)
- ✅ **PASS**: Circular bead chain termination: chain walk stops after visiting all beads without infinite loop (`test_walk_beads_circular_termination`)
### Unit tests
- ✅ **PASS**: Pathological cycle (diagnostic) (`test_walk_beads_malformed_cycle`)
- ✅ **PASS**: Missing /N (terminates chain) (`test_walk_beads_missing_next`)
- ✅ **PASS**: Missing /P (skip bead) (`test_walk_beads_missing_page_ref`)
- ✅ **PASS**: /Pg fallback (`test_walk_beads_pg_fallback`)
- ✅ **PASS**: Bead with invalid rect shape skips bead (`test_walk_beads_invalid_rect_shape`)
- ✅ **PASS**: Page ref outside document range skips bead (`test_walk_beads_page_ref_not_in_tree`)
- ✅ **PASS**: Maximum iteration cap enforced (`test_walk_beads_max_iterations`)
### Public API
- ✅ **PASS**: `threads::walk_beads(ThreadHeader, &XrefResolver, &HashMap<ObjRef, usize>) -> Vec<Bead>` is public and documented
## Test Results
All 28 threads module tests pass:
```
running 28 tests
test threads::tests::test_bead_new ... ok
test threads::tests::test_decode_pdf_string_empty ... ok
test threads::tests::test_decode_pdf_string_latin1 ... ok
test threads::tests::test_decode_pdf_string_ascii ... ok
test threads::tests::test_decode_pdf_string_utf16be_bom ... ok
test threads::tests::test_decode_pdfdocencoding_ascii ... ok
test threads::tests::test_decode_pdfdocencoding_empty ... ok
test threads::tests::test_decode_utf16be_invalid_length ... ok
test threads::tests::test_discover_no_threads_field ... ok
test threads::tests::test_discover_empty_threads ... ok
test threads::tests::test_discover_thread_empty_title ... ok
test threads::tests::test_discover_thread_missing_f_skipped ... ok
test threads::tests::test_discover_three_threads ... ok
test threads::tests::test_discover_thread_no_info_dict ... ok
test threads::tests::test_walk_beads_circular_termination ... ok
test threads::tests::test_thread_header_new ... ok
test threads::tests::test_walk_beads_invalid_rect_shape ... ok
test threads::tests::test_thread_header_with_fields ... ok
test threads::tests::test_discover_thread_utf16_title ... ok
test threads::tests::test_walk_beads_malformed_cycle ... ok
test threads::tests::test_walk_beads_missing_page_ref ... ok
test threads::tests::test_walk_beads_missing_rect ... ok
test threads::tests::test_walk_beads_missing_next ... ok
test threads::tests::test_walk_beads_page_ref_not_in_tree ... ok
test threads::tests::test_walk_beads_pg_fallback ... ok
test threads::tests::test_walk_beads_single_bead ... ok
test threads::tests::test_walk_beads_two_threads ... ok
test threads::tests::test_walk_beads_max_iterations ... ok
test result: ok. 28 passed; 0 failed; 0 ignored; 0 measured; 1916 filtered out
```
## Code Quality
- ✅ `cargo check --all-targets` passes
- ✅ `cargo fmt` applied (no formatting changes needed)
- ✅ All public functions documented with rustdoc
- ✅ No `unwrap()` or `expect()` in non-test code
- ✅ Exhaustive `match` arms on enums
## Files Modified
- `crates/pdftract-core/src/threads/mod.rs`: Fixed tests, added `check_and_handle_termination` helper, fixed bead skip logic
## Commit Message
```
fix(pdftract-3o9fu): fix bead chain walker tests and skip logic
- Fixed discover tests: cache /Threads array directly, not wrapped in dict
- Fixed walk_beads tests: added termination/cycle checks when skipping beads
- Added check_and_handle_termination helper to prevent infinite loops
- Changed invalid /R and /P diagnostic codes to StructMissingKey (non-fatal)
- Fixed UTF-16BE test bytes for "日本語"
All 28 threads module tests now pass.
Closes: pdftract-3o9fu
```