docs(pdftract-35byi): update verification note with current fixture count

- Update fixture count from 1 to 5
- Add EC-04-rc4-encrypted.pdf, EC-05-aes128-encrypted.pdf, sample.pdf, valid-minimal.pdf
- All tests pass (6 passed, 1 ignored)
This commit is contained in:
jedarden 2026-06-01 02:38:31 -04:00
parent 69b8a776f0
commit 91e17d5029
9 changed files with 541 additions and 3 deletions

View file

@ -0,0 +1,324 @@
//! Markdown footnote emission.
//!
//! This module implements footnote emission for the Markdown sink.
//! Each footnote reference span gets a unique numeric ID assigned in
//! document order. A `[^N]` marker is emitted in the body where the
//! reference appears, and the matching `[^N]: <footnote text>` definitions
//! are emitted at the end of the page (per v1.0 decision).
//!
//! # Footnote emission format
//!
//! This module uses GitHub Flavored Markdown (GFM) footnote syntax:
//! - Footnote reference in body: `[^N]` where N is a numeric ID
//! - Footnote definition at page end: `[^N]: <text>`
//!
//! # Phase 7 integration
//!
//! Footnote detection is implemented in Phase 7. This module provides
//! the emission infrastructure that will be used by Phase 7 when
//! footnote data is available. For documents without footnotes (current
//! state, as Phase 7 is not yet implemented), this code path is a no-op.
//!
//! # Future: end-of-document option
//!
//! Per v1.0 decision, footnote definitions are emitted at the end of
//! each page. A future option may allow emitting all footnotes at the
//! end of the document instead (tradeoff: proximity vs flow).
use std::collections::HashMap;
/// Footnote data for a single page.
///
/// This is the container the Markdown footnote emitters read from. It is
/// meant to be filled in by Phase 7 footnote detection; until that exists it
/// normally stays empty, which makes footnote emission a no-op.
///
/// # Fields
///
/// * `refs` - Map from span index to footnote ID
/// * `definitions` - Map from footnote ID to footnote text
#[derive(Debug, Clone, Default)]
pub struct PageFootnotes {
    /// Map from span index (within the page's spans array) to footnote ID.
    ///
    /// Each key is the index of a span that contains a footnote reference;
    /// the value is the footnote ID that span refers to. IDs are expected to
    /// be assigned in document order by whoever populates this map.
    pub refs: HashMap<usize, u32>,
    /// Map from footnote ID to footnote text.
    ///
    /// The text may be empty; the emitter substitutes a placeholder in that
    /// case.
    pub definitions: HashMap<u32, String>,
}

impl PageFootnotes {
    /// Create a new, empty `PageFootnotes`.
    ///
    /// Equivalent to [`Default::default`]: no references and no definitions.
    /// This is the state for pages without footnotes.
    pub fn new() -> Self {
        Self::default()
    }

    /// Check whether this page carries no footnote data at all.
    ///
    /// Returns `true` only when there are neither footnote references nor
    /// footnote definitions. Returns `false` as soon as either map contains
    /// an entry (for example a reference whose definition is still missing).
    pub fn is_empty(&self) -> bool {
        self.refs.is_empty() && self.definitions.is_empty()
    }

    /// Add a footnote reference.
    ///
    /// If `span_index` was already registered, its footnote ID is replaced.
    ///
    /// # Arguments
    ///
    /// * `span_index` - The span index (within the page's spans array)
    /// * `footnote_id` - The footnote ID (numeric, assigned in document order)
    pub fn add_ref(&mut self, span_index: usize, footnote_id: u32) {
        self.refs.insert(span_index, footnote_id);
    }

    /// Add a footnote definition.
    ///
    /// If `footnote_id` already has a definition, its text is replaced.
    ///
    /// # Arguments
    ///
    /// * `footnote_id` - The footnote ID
    /// * `text` - The footnote text
    pub fn add_definition(&mut self, footnote_id: u32, text: String) {
        self.definitions.insert(footnote_id, text);
    }

    /// Get the footnote ID for a given span index.
    ///
    /// Returns `None` if the span is not a footnote reference.
    pub fn get_footnote_id(&self, span_index: usize) -> Option<u32> {
        self.refs.get(&span_index).copied()
    }

    /// Get the footnote text for a given footnote ID.
    ///
    /// Returns `None` if the footnote ID has no definition.
    pub fn get_definition(&self, footnote_id: u32) -> Option<&str> {
        self.definitions.get(&footnote_id).map(String::as_str)
    }
}
/// Render the in-body marker for a footnote reference.
///
/// The marker uses GFM footnote syntax, `[^N]`, with `N` being the decimal
/// form of the footnote ID.
///
/// # Arguments
///
/// * `footnote_id` - The footnote ID
///
/// # Returns
///
/// A markdown string containing the footnote reference.
///
/// # Example
///
/// ```
/// use pdftract_core::output::markdown::footnotes::emit_footnote_ref;
///
/// let md = emit_footnote_ref(1);
/// assert_eq!(md, "[^1]");
/// ```
pub fn emit_footnote_ref(footnote_id: u32) -> String {
    let mut marker = String::from("[^");
    marker.push_str(&footnote_id.to_string());
    marker.push(']');
    marker
}
/// Render a single footnote definition line.
///
/// The output uses GFM footnote syntax, `[^N]: <text>`, followed by a
/// trailing newline. The text is inserted verbatim: embedded newlines are
/// passed through unchanged and are not re-indented.
///
/// Per the acceptance criteria, empty footnote text emits `[^N]: (empty)`
/// as a placeholder so the reference is at least visible.
///
/// # Arguments
///
/// * `footnote_id` - The footnote ID
/// * `text` - The footnote text (may be empty)
///
/// # Returns
///
/// A markdown string containing the footnote definition.
///
/// # Example
///
/// ```
/// use pdftract_core::output::markdown::footnotes::emit_footnote_def;
///
/// let md = emit_footnote_def(1, "Footnote text");
/// assert_eq!(md, "[^1]: Footnote text\n");
///
/// let md_empty = emit_footnote_def(2, "");
/// assert_eq!(md_empty, "[^2]: (empty)\n");
/// ```
pub fn emit_footnote_def(footnote_id: u32, text: &str) -> String {
    // Only a truly empty string gets the placeholder; whitespace-only text
    // is emitted as given.
    let body = match text {
        "" => "(empty)",
        provided => provided,
    };
    format!("[^{}]: {}\n", footnote_id, body)
}
/// Emit all footnote definitions for a page.
///
/// This function collects all footnote definitions for the page and
/// emits them at the end of the page content, per the v1.0 decision.
/// Definitions are emitted in ascending footnote-ID order so the output is
/// deterministic regardless of hash-map iteration order.
///
/// The output includes a blank line before the definitions block for
/// pretty formatting.
///
/// # Arguments
///
/// * `footnotes` - The page footnotes data
///
/// # Returns
///
/// A markdown string containing all footnote definitions, or an empty
/// string if the page has no footnote definitions. A page that only has
/// references (and no definition text yet) also yields an empty string, so
/// no stray blank line is appended to the page.
///
/// # Example
///
/// ```
/// use pdftract_core::output::markdown::footnotes::{emit_footnote_defs, PageFootnotes};
///
/// let mut footnotes = PageFootnotes::new();
/// footnotes.add_definition(1, "First footnote".to_string());
/// footnotes.add_definition(2, "Second footnote".to_string());
///
/// let md = emit_footnote_defs(&footnotes);
/// assert!(md.contains("\n[^1]: First footnote\n"));
/// assert!(md.contains("[^2]: Second footnote\n"));
/// ```
pub fn emit_footnote_defs(footnotes: &PageFootnotes) -> String {
    // Gate on the definitions map itself: `PageFootnotes::is_empty` is also
    // false when only references exist, which would otherwise produce a lone
    // "\n" with no definitions following it.
    if footnotes.definitions.is_empty() {
        return String::new();
    }

    // Sort by footnote ID for deterministic output; keys are unique, so an
    // unstable sort is sufficient.
    let mut entries: Vec<(u32, &str)> = footnotes
        .definitions
        .iter()
        .map(|(id, text)| (*id, text.as_str()))
        .collect();
    entries.sort_unstable_by_key(|&(id, _)| id);

    // The leading newline yields the blank line before the definitions block.
    let mut result = String::from("\n");
    for (id, text) in entries {
        result.push_str(&emit_footnote_def(id, text));
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed container has no references and no definitions.
    #[test]
    fn test_page_footnotes_new() {
        let footnotes = PageFootnotes::new();
        assert!(footnotes.is_empty());
        assert!(footnotes.refs.is_empty());
        assert!(footnotes.definitions.is_empty());
    }

    /// References are looked up by span index; unknown spans yield `None`.
    #[test]
    fn test_page_footnotes_add_ref() {
        let mut footnotes = PageFootnotes::new();
        footnotes.add_ref(0, 1);
        footnotes.add_ref(5, 2);
        assert_eq!(footnotes.get_footnote_id(0), Some(1));
        assert_eq!(footnotes.get_footnote_id(5), Some(2));
        assert_eq!(footnotes.get_footnote_id(3), None);
    }

    /// Definitions are looked up by footnote ID; unknown IDs yield `None`.
    #[test]
    fn test_page_footnotes_add_definition() {
        let mut footnotes = PageFootnotes::new();
        footnotes.add_definition(1, "First footnote".to_string());
        footnotes.add_definition(2, "Second footnote".to_string());
        assert_eq!(footnotes.get_definition(1), Some("First footnote"));
        assert_eq!(footnotes.get_definition(2), Some("Second footnote"));
        assert_eq!(footnotes.get_definition(3), None);
    }

    /// `is_empty` turns false as soon as either map gains an entry.
    #[test]
    fn test_page_footnotes_is_empty() {
        let footnotes = PageFootnotes::new();
        assert!(footnotes.is_empty());

        let mut with_ref = PageFootnotes::new();
        with_ref.add_ref(0, 1);
        assert!(!with_ref.is_empty());

        let mut with_definition = PageFootnotes::new();
        with_definition.add_definition(1, "Only a definition".to_string());
        assert!(!with_definition.is_empty());
    }

    #[test]
    fn test_emit_footnote_ref() {
        assert_eq!(emit_footnote_ref(1), "[^1]");
        assert_eq!(emit_footnote_ref(5), "[^5]");
        assert_eq!(emit_footnote_ref(100), "[^100]");
    }

    /// Text is inserted verbatim, including embedded newlines.
    #[test]
    fn test_emit_footnote_def_with_text() {
        let md = emit_footnote_def(1, "Footnote text");
        assert_eq!(md, "[^1]: Footnote text\n");

        let md = emit_footnote_def(2, "Multi-line\ntext");
        assert_eq!(md, "[^2]: Multi-line\ntext\n");
    }

    #[test]
    fn test_emit_footnote_def_empty_text() {
        let md = emit_footnote_def(1, "");
        assert_eq!(md, "[^1]: (empty)\n");
    }

    #[test]
    fn test_emit_footnote_defs_empty() {
        let footnotes = PageFootnotes::new();
        let md = emit_footnote_defs(&footnotes);
        assert_eq!(md, "");
    }

    #[test]
    fn test_emit_footnote_defs_single() {
        let mut footnotes = PageFootnotes::new();
        footnotes.add_definition(1, "First footnote".to_string());
        let md = emit_footnote_defs(&footnotes);
        assert_eq!(md, "\n[^1]: First footnote\n");
    }

    /// Definitions must come out in ascending ID order regardless of
    /// insertion order; exact equality pins the position of every entry.
    #[test]
    fn test_emit_footnote_defs_multiple_sorted() {
        let mut footnotes = PageFootnotes::new();
        footnotes.add_definition(3, "Third footnote".to_string());
        footnotes.add_definition(1, "First footnote".to_string());
        footnotes.add_definition(2, "Second footnote".to_string());
        let md = emit_footnote_defs(&footnotes);
        assert_eq!(
            md,
            "\n[^1]: First footnote\n[^2]: Second footnote\n[^3]: Third footnote\n"
        );
    }

    /// Empty definition text is replaced by the `(empty)` placeholder.
    #[test]
    fn test_emit_footnote_defs_with_empty_text() {
        let mut footnotes = PageFootnotes::new();
        footnotes.add_definition(1, "Has text".to_string());
        footnotes.add_definition(2, "".to_string());
        let md = emit_footnote_defs(&footnotes);
        assert_eq!(md, "\n[^1]: Has text\n[^2]: (empty)\n");
    }
}

View file

@ -0,0 +1,9 @@
//! Markdown output module.
//!
//! This module provides Markdown emission functionality for pdftract.
//! It includes support for block-level Markdown emission, inline span styling,
//! and footnote emission (when Phase 7 footnote detection is implemented).
pub mod footnotes;
pub use footnotes::{emit_footnote_def, emit_footnote_defs, emit_footnote_ref, PageFootnotes};

View file

@ -3,4 +3,5 @@
//! This module provides the output serialization layer for pdftract,
//! supporting both full JSON documents and streaming NDJSON frames.
pub mod markdown;
pub mod ndjson;

View file

@ -31,7 +31,12 @@ The `jsonschema = "0.26"` crate is already in dev-dependencies (line 84).
### 3. Fixtures
**Directory:** `tests/fixtures/json_schema/`
Currently contains one fixture: `simple_invoice.pdf`
Currently contains 5 fixtures covering diverse PDF types:
- `EC-04-rc4-encrypted.pdf` - RC4 encrypted PDF
- `EC-05-aes128-encrypted.pdf` - AES-128 encrypted PDF
- `sample.pdf` - Sample document
- `simple_invoice.pdf` - Simple invoice
- `valid-minimal.pdf` - Minimal valid PDF
The test auto-discovers all `*.pdf` files in this directory and validates their extraction output against the schema. Adding new fixtures automatically includes them in the next test run.
@ -65,12 +70,12 @@ test test_schema_span_json_structure ... ok
test test_synthetic_output_validates ... ok
test test_schema_itself_is_valid ... ok
test result: ok. 6 passed; 0 failed; 1 ignored; 0 measured; 0 filtered out; finished in 0.15s
test result: ok. 6 passed; 0 failed; 1 ignored; 0 measured; 0 filtered out; finished in 0.16s
```
## Performance
Schema validation is fast: 6 tests completed in 0.15 seconds. The jsonschema crate is efficient and meets the <100ms per validation target.
Schema validation is fast: 6 tests completed in 0.16 seconds. The jsonschema crate is efficient and meets the <100ms per validation target.
## References
- Plan section: Phase 6.1.4

View file

@ -47,3 +47,22 @@ Generated: 2026-05-31
# json_schema/simple_invoice.pdf
Simple invoice PDF for JSON schema validation tests
Generated: 2026-06-01
# json_schema/EC-04-rc4-encrypted.pdf
Copied from fixtures/EC-04-rc4-encrypted.pdf for JSON schema validation
PDF 1.4, RC4 encryption (V=1, R=2), 40-bit key, user password: "user40"
Generated: 2026-06-01
# json_schema/EC-05-aes128-encrypted.pdf
Copied from fixtures/EC-05-aes128-encrypted.pdf for JSON schema validation
PDF 1.6, AES-128 encryption (V=4, R=4, AESV2 crypt filter), 128-bit key, user password: "user128"
Generated: 2026-06-01
# json_schema/valid-minimal.pdf
Minimal valid PDF v1.4 fixture for JSON schema validation tests
Generated: 2026-05-28
# json_schema/sample.pdf
Copied from valid-minimal.pdf for SDK examples default path
Minimal valid PDF v1.4 fixture for contract method examples
Generated: 2026-05-31

View file

@ -0,0 +1,32 @@
%PDF-1.4
%¿÷¢þ
1 0 obj
<< /Pages 2 0 R /Type /Catalog >>
endobj
2 0 obj
<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
endobj
3 0 obj
<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 << /BaseFont /Helvetica /Subtype /Type1 /Type /Font >> >> >> /Type /Page >>
endobj
4 0 obj
<< /Length 110 /Filter /FlateDecode >>
stream
.!ÆW7¶•9qÌ ´­<k#•Hƒ‰¨M ¥åDŒ¿Zùå¶Ðy*¥¢Š`¥6Ÿð²0&C€þò"%¿é «XõÜ¥'†kãOP¹šKÆû<u;zÂÙ]¦ºxssä$7€%ê ‰¥
endstream
endobj
5 0 obj
<< /Filter /Standard /Length 40 /O <7303809eaf677bdb5ca64b9d8cb0ccdd47d09a7b28ad5aa522c62685c6d9e499> /P -12 /R 2 /U <748c1f874e35dfb683c55f843f0df43c717e8c51fd2cfe510a5fb5553e957eb9> /V 1 >>
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000064 00000 n
0000000123 00000 n
0000000300 00000 n
0000000482 00000 n
trailer << /Root 1 0 R /Size 6 /ID [<0bacc6b9933ead86b7dca33b3a436cea><0bacc6b9933ead86b7dca33b3a436cea>] /Encrypt 5 0 R >>
startxref
689
%%EOF

View file

@ -0,0 +1,32 @@
%PDF-1.6
%¿÷¢þ
1 0 obj
<< /Pages 2 0 R /Type /Catalog >>
endobj
2 0 obj
<< /Count 1 /Kids [ 3 0 R ] /Type /Pages >>
endobj
3 0 obj
<< /Contents 4 0 R /MediaBox [ 0 0 612 792 ] /Parent 2 0 R /Resources << /Font << /F1 << /BaseFont /Helvetica /Subtype /Type1 /Type /Font >> >> >> /Type /Page >>
endobj
4 0 obj
<< /Length 128 /Filter /FlateDecode >>
stream
±ù¥œÄ\ê¶Æ0j/R9¨Ø <0B>îˆÌó·©s®Vºf~<7E>P95²À@¤ÀÙëÄÙžœ+¬¼j a¿«Öçð2iäÓB×-}:M2œÖ ½«qᒓžx•ÛAà'f=Ðû<C390>}?f<>@áH®7šÝe"N
endstream
endobj
5 0 obj
<< /CF << /StdCF << /AuthEvent /DocOpen /CFM /AESV2 /Length 16 >> >> /Filter /Standard /Length 128 /O <badad1e86442699427116d3e5d5271bc80a27814fc5e80f815efeef839354c5f> /P -1028 /R 4 /StmF /StdCF /StrF /StdCF /U <e7514dced4772b04eeb8f49d7a8a5f650122456a91bae5134273a6db134c87c4> /V 4 >>
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000064 00000 n
0000000123 00000 n
0000000300 00000 n
0000000500 00000 n
trailer << /Root 1 0 R /Size 6 /ID [<0bacc6b9933ead86b7dca33b3a436cea><0bacc6b9933ead86b7dca33b3a436cea>] /Encrypt 5 0 R >>
startxref
802
%%EOF

58
tests/fixtures/json_schema/sample.pdf vendored Normal file
View file

@ -0,0 +1,58 @@
%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
>>
endobj
4 0 obj
<<
/Length 44
>>
stream
BT
/F1 12 Tf
100 700 Td
(Test) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000298 00000 n
trailer
<<
/Size 5
/Root 1 0 R
>>
startxref
403
%%EOF

View file

@ -0,0 +1,58 @@
%PDF-1.4
1 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
2 0 obj
<<
/Type /Pages
/Kids [3 0 R]
/Count 1
>>
endobj
3 0 obj
<<
/Type /Page
/Parent 2 0 R
/MediaBox [0 0 612 792]
/Contents 4 0 R
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
>>
endobj
4 0 obj
<<
/Length 44
>>
stream
BT
/F1 12 Tf
100 700 Td
(Test) Tj
ET
endstream
endobj
xref
0 5
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000298 00000 n
trailer
<<
/Size 5
/Root 1 0 R
>>
startxref
403
%%EOF