feat(pdftract-2kpm0): implement NdjsonFrame enum with internal-tag discriminator and write_frame helper

- Add unified NdjsonFrame enum with serde internal tagging (tag = "frame")
- Remove frame_type field from individual frame structs (HeaderFrame, PageFrame, FooterFrame)
- Add write_frame<W: Write>() helper that serializes, adds newline, and flushes
- Add #[serde(default)] to optional fields for proper deserialization
- Add roundtrip tests for all frame types
- Add test verifying frame discriminator appears first in JSON output
- Update module exports to include NdjsonFrame and write_frame

Per plan 6.2.1: frame sequence (lines 2038-2042)
Closes: pdftract-2kpm0
This commit is contained in:
jedarden 2026-05-25 11:24:08 -04:00
parent 3ac47215cf
commit fa57ab3e90
2 changed files with 184 additions and 36 deletions

View file

@ -8,6 +8,23 @@
use crate::schema::{BlockJson, ExtractionQuality, SpanJson, TableJson};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::io::Write;
/// One line of NDJSON output: a header, a page, or a footer frame.
///
/// The enum uses serde's internally tagged representation with `"frame"` as
/// the tag field. The variant name is lowercased (`rename_all = "lowercase"`),
/// so the tag takes the values `"header"`, `"page"`, or `"footer"`, and the
/// fields of the wrapped struct are emitted alongside it in the same JSON
/// object. When serialized, the `"frame"` field appears first, so a consumer
/// can read it to decide which handler should process the rest of the line.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(tag = "frame", rename_all = "lowercase")]
pub enum NdjsonFrame {
/// Header frame containing document metadata (serialized with `"frame":"header"`).
Header(HeaderFrame),
/// Page frame containing a single page's extraction result (serialized with `"frame":"page"`).
Page(PageFrame),
/// Footer frame containing aggregated metrics and diagnostics (serialized with `"frame":"footer"`).
Footer(FooterFrame),
}
/// Frame discriminator field.
///
@ -29,10 +46,6 @@ pub enum FrameType {
/// Contains document-level metadata that is known before page processing begins.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct HeaderFrame {
/// Frame discriminator (always "header").
#[serde(rename = "frame")]
pub frame_type: FrameType,
/// Schema version identifier.
///
/// Consumers should check this field to ensure compatibility.
@ -64,7 +77,6 @@ impl HeaderFrame {
total_pages: usize,
) -> Self {
Self {
frame_type: FrameType::Header,
schema_version,
metadata,
outline,
@ -86,10 +98,6 @@ impl HeaderFrame {
/// and output in page_index order by the streaming pipeline.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PageFrame {
/// Frame discriminator (always "page").
#[serde(rename = "frame")]
pub frame_type: FrameType,
/// Zero-based page index.
///
/// Consumers use this to reorder pages if processing concurrently.
@ -118,13 +126,13 @@ pub struct PageFrame {
/// Annotations (highlights, stamps, notes, links).
///
/// Empty in Phase 6; populated in Phase 7.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub annotations: Vec<Value>,
/// Optional page-level diagnostics.
///
/// Present only if there were errors or warnings during extraction.
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub errors: Option<Vec<Value>>,
}
@ -138,7 +146,6 @@ impl PageFrame {
tables: Vec<TableJson>,
) -> Self {
Self {
frame_type: FrameType::Page,
page_index,
page_type,
spans,
@ -169,10 +176,6 @@ impl PageFrame {
/// known after all pages have been processed.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct FooterFrame {
/// Frame discriminator (always "footer").
#[serde(rename = "frame")]
pub frame_type: FrameType,
/// Aggregate extraction quality metrics.
///
/// Includes overall quality, confidence statistics, OCR fraction, etc.
@ -186,31 +189,31 @@ pub struct FooterFrame {
/// Thread information (for debugging and profiling).
///
/// Empty in the initial implementation.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub threads: Vec<Value>,
/// Attachments extracted from the document.
///
/// Empty in Phase 6; populated in Phase 7.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub attachments: Vec<Value>,
/// Digital signatures extracted from the document.
///
/// Empty in Phase 6; populated in Phase 7.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub signatures: Vec<Value>,
/// Form fields extracted from the document.
///
/// Empty in Phase 6; populated in Phase 7.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub form_fields: Vec<Value>,
/// Links extracted from the document.
///
/// Empty in Phase 6; populated in Phase 7.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub links: Vec<Value>,
}
@ -218,7 +221,6 @@ impl FooterFrame {
/// Create a new footer frame.
pub fn new(extraction_quality: ExtractionQuality, errors: Vec<Value>) -> Self {
Self {
frame_type: FrameType::Footer,
extraction_quality,
errors,
threads: Vec::new(),
@ -237,28 +239,71 @@ impl FooterFrame {
}
}
/// Write a single frame to a writer as one JSON line, then flush.
///
/// This is the primary function for emitting NDJSON frames during streaming extraction.
/// The frame is serialized into an in-memory buffer, the `\n` terminator is appended
/// to that same buffer, and the complete line is handed to the writer in a single
/// `write_all` call before flushing. Building the whole line first means that nothing
/// is written when serialization fails, and that the payload and its terminator are
/// never issued as two separate writes.
///
/// # Arguments
///
/// * `writer` - Any writer implementing `Write` (e.g., `File`, `BufWriter`, `Stdout`)
/// * `frame` - The frame to write (wrapped in `NdjsonFrame` enum)
///
/// # Errors
///
/// * Returns an `io::Error` of kind `Other` wrapping the `serde_json` error if the
///   frame cannot be serialized.
/// * Returns the writer's own `io::Error` if writing the line or flushing fails.
///
/// # Example
///
/// ```ignore
/// use std::io::BufWriter;
/// use pdftract_core::output::ndjson::frames::{write_frame, NdjsonFrame, HeaderFrame};
///
/// let mut writer = BufWriter::new(file);
/// let header = HeaderFrame::new(...);
/// write_frame(&mut writer, &NdjsonFrame::Header(header))?;
/// ```
pub fn write_frame<W: Write>(writer: &mut W, frame: &NdjsonFrame) -> std::io::Result<()> {
    // Serialize straight to bytes; the error kind stays `Other` so callers that
    // inspect it see the same kind as before.
    let mut line = serde_json::to_vec(frame)
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
    // Terminate the line inside the buffer so the frame goes out in one write.
    line.push(b'\n');
    writer.write_all(&line)?;
    // Flush to ensure immediate delivery to streaming consumers.
    writer.flush()
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_header_frame_serialization() {
fn test_ndjson_frame_header_discriminator() {
let header = HeaderFrame::new(
"1.0".to_string(),
serde_json::json!({"title": "Test", "author": "Test Author"}),
Some(serde_json::json!([{"title": "Chapter 1", "level": 1}])),
10,
);
let frame = NdjsonFrame::Header(header);
let json = header.to_json_line().unwrap();
assert!(json.contains("\"frame\":\"header\""));
let json = serde_json::to_string(&frame).unwrap();
// The "frame" key should appear first (serde internal tag)
assert!(json.starts_with("{\"frame\":\"header\""));
assert!(json.contains("\"schema_version\":\"1.0\""));
assert!(json.contains("\"total_pages\":10"));
assert!(json.ends_with('\n'));
}
#[test]
fn test_page_frame_serialization() {
fn test_ndjson_frame_page_discriminator() {
let page = PageFrame::new(
0,
"content".to_string(),
@ -279,29 +324,132 @@ mod tests {
vec![],
vec![],
);
let frame = NdjsonFrame::Page(page);
let json = page.to_json_line().unwrap();
assert!(json.contains("\"frame\":\"page\""));
let json = serde_json::to_string(&frame).unwrap();
// The "frame" key should appear first
assert!(json.starts_with("{\"frame\":\"page\""));
assert!(json.contains("\"page_index\":0"));
assert!(json.contains("\"page_type\":\"content\""));
assert!(json.ends_with('\n'));
}
#[test]
fn test_footer_frame_serialization() {
fn test_ndjson_frame_footer_discriminator() {
let footer = FooterFrame::new(ExtractionQuality::new().with_quality("high"), vec![]);
let frame = NdjsonFrame::Footer(footer);
let json = footer.to_json_line().unwrap();
assert!(json.contains("\"frame\":\"footer\""));
let json = serde_json::to_string(&frame).unwrap();
// The "frame" key should appear first
assert!(json.starts_with("{\"frame\":\"footer\""));
assert!(json.contains("\"overall_quality\":\"high\""));
assert!(json.ends_with('\n'));
}
#[test]
fn test_write_frame_includes_newline_and_flush() {
    /// In-memory writer that records every byte written and counts `flush` calls,
    /// so the test can observe flushing (a plain `Vec<u8>` flush is a no-op).
    struct FlushCounter {
        bytes: Vec<u8>,
        flushes: usize,
    }

    impl std::io::Write for FlushCounter {
        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
            self.bytes.extend_from_slice(buf);
            Ok(buf.len())
        }

        fn flush(&mut self) -> std::io::Result<()> {
            self.flushes += 1;
            Ok(())
        }
    }

    let header = HeaderFrame::new(
        "1.0".to_string(),
        serde_json::json!({"title": "Test"}),
        None,
        1,
    );
    let frame = NdjsonFrame::Header(header);
    let mut sink = FlushCounter {
        bytes: Vec::new(),
        flushes: 0,
    };
    write_frame(&mut sink, &frame).unwrap();

    // The writer must have been flushed after the frame was written.
    assert!(sink.flushes >= 1);

    let output = String::from_utf8(sink.bytes).unwrap();
    // Should end with newline
    assert!(output.ends_with('\n'));
    // Exactly one line per frame: the only newline is the terminator.
    assert_eq!(output.matches('\n').count(), 1);
    // Should contain the frame discriminator
    assert!(output.contains("\"frame\":\"header\""));
}
#[test]
fn test_roundtrip_header_frame() {
    // Build a header with every field populated, including the optional outline.
    let expected = HeaderFrame::new(
        "1.0".to_string(),
        serde_json::json!({"title": "Test", "author": "Test Author"}),
        Some(serde_json::json!([{"title": "Chapter 1", "level": 1}])),
        10,
    );
    let sent = NdjsonFrame::Header(expected.clone());

    // Encode to JSON and decode back through the tagged enum.
    let encoded = serde_json::to_string(&sent).unwrap();
    let received: NdjsonFrame = serde_json::from_str(&encoded).unwrap();

    // The decoded frame must equal the one that was encoded.
    assert_eq!(sent, received);

    // The decoded frame must be the Header variant; check each field individually.
    let header = match received {
        NdjsonFrame::Header(inner) => inner,
        _ => panic!("Expected Header frame"),
    };
    assert_eq!(header.schema_version, expected.schema_version);
    assert_eq!(header.metadata, expected.metadata);
    assert_eq!(header.outline, expected.outline);
    assert_eq!(header.total_pages, expected.total_pages);
}
#[test]
fn test_roundtrip_page_frame() {
    // A single span with only the required fields set; all optional fields are empty.
    let span = SpanJson {
        text: "Hello".to_string(),
        bbox: [0.0, 0.0, 100.0, 20.0],
        font: "Helvetica".to_string(),
        size: 12.0,
        color: None,
        rendering_mode: None,
        confidence: None,
        confidence_source: None,
        lang: None,
        flags: vec![],
        receipt: None,
        column: None,
    };
    let page = PageFrame::new(0, "content".to_string(), vec![span], vec![], vec![]);
    let sent = NdjsonFrame::Page(page);

    // Encode to JSON and decode back through the tagged enum.
    let encoded = serde_json::to_string(&sent).unwrap();
    let received: NdjsonFrame = serde_json::from_str(&encoded).unwrap();

    // The decoded frame must equal the one that was encoded.
    assert_eq!(sent, received);
}
#[test]
fn test_roundtrip_footer_frame() {
    // Footer with a "high" quality marker and no document-level errors.
    let quality = ExtractionQuality::new().with_quality("high");
    let sent = NdjsonFrame::Footer(FooterFrame::new(quality, vec![]));

    // Encode to JSON and decode back through the tagged enum.
    let encoded = serde_json::to_string(&sent).unwrap();
    let received: NdjsonFrame = serde_json::from_str(&encoded).unwrap();

    // The decoded frame must equal the one that was encoded.
    assert_eq!(sent, received);
}
#[test]
fn test_page_frame_with_empty_collections() {
let page = PageFrame::new(5, "blank".to_string(), vec![], vec![], vec![]);
let frame = NdjsonFrame::Page(page);
let json = page.to_json_line().unwrap();
let json = serde_json::to_string(&frame).unwrap();
// Empty spans/blocks/tables should still be present
assert!(json.contains("\"spans\":[]"));
assert!(json.contains("\"blocks\":[]"));

View file

@ -16,5 +16,5 @@ pub mod frames;
pub mod pipeline;
pub use buffer::OutOfOrderBuffer;
pub use frames::{FooterFrame, HeaderFrame, PageFrame};
pub use frames::{write_frame, FooterFrame, HeaderFrame, NdjsonFrame, PageFrame};
pub use pipeline::extract_streaming;