From fa57ab3e9024448af6249e93fa7c2dd841eda50a Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 25 May 2026 11:24:08 -0400 Subject: [PATCH] feat(pdftract-2kpm0): implement NdjsonFrame enum with internal-tag discriminator and write_frame helper - Add unified NdjsonFrame enum with serde internal tagging (tag = "frame") - Remove frame_type field from individual frame structs (HeaderFrame, PageFrame, FooterFrame) - Add write_frame() helper that serializes, adds newline, and flushes - Add #[serde(default)] to optional fields for proper deserialization - Add roundtrip tests for all frame types - Add test verifying frame discriminator appears first in JSON output - Update module exports to include NdjsonFrame and write_frame Per plan 6.2.1: frame sequence (lines 2038-2042) Closes: pdftract-2kpm0 --- .../pdftract-core/src/output/ndjson/frames.rs | 218 +++++++++++++++--- crates/pdftract-core/src/output/ndjson/mod.rs | 2 +- 2 files changed, 184 insertions(+), 36 deletions(-) diff --git a/crates/pdftract-core/src/output/ndjson/frames.rs b/crates/pdftract-core/src/output/ndjson/frames.rs index 5d3ea1b..9e9207c 100644 --- a/crates/pdftract-core/src/output/ndjson/frames.rs +++ b/crates/pdftract-core/src/output/ndjson/frames.rs @@ -8,6 +8,23 @@ use crate::schema::{BlockJson, ExtractionQuality, SpanJson, TableJson}; use serde::{Deserialize, Serialize}; use serde_json::Value; +use std::io::Write; + +/// Unified NDJSON frame enum with internal-tag discriminator. +/// +/// This enum uses serde's internal tagging with the "frame" field as the tag. +/// When serialized, the "frame" field appears first with values "header", "page", +/// or "footer", allowing consumers to dispatch to the appropriate handler. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "frame", rename_all = "lowercase")] +pub enum NdjsonFrame { + /// Header frame containing document metadata. + Header(HeaderFrame), + /// Page frame containing a single page's extraction result. 
+ Page(PageFrame), + /// Footer frame containing aggregated metrics and diagnostics. + Footer(FooterFrame), +} /// Frame discriminator field. /// @@ -29,10 +46,6 @@ pub enum FrameType { /// Contains document-level metadata that is known before page processing begins. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct HeaderFrame { - /// Frame discriminator (always "header"). - #[serde(rename = "frame")] - pub frame_type: FrameType, - /// Schema version identifier. /// /// Consumers should check this field to ensure compatibility. @@ -64,7 +77,6 @@ impl HeaderFrame { total_pages: usize, ) -> Self { Self { - frame_type: FrameType::Header, schema_version, metadata, outline, @@ -86,10 +98,6 @@ impl HeaderFrame { /// and output in page_index order by the streaming pipeline. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct PageFrame { - /// Frame discriminator (always "page"). - #[serde(rename = "frame")] - pub frame_type: FrameType, - /// Zero-based page index. /// /// Consumers use this to reorder pages if processing concurrently. @@ -118,13 +126,13 @@ pub struct PageFrame { /// Annotations (highlights, stamps, notes, links). /// /// Empty in Phase 6; populated in Phase 7. - #[serde(skip_serializing_if = "Vec::is_empty")] + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub annotations: Vec, /// Optional page-level diagnostics. /// /// Present only if there were errors or warnings during extraction. - #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub errors: Option>, } @@ -138,7 +146,6 @@ impl PageFrame { tables: Vec, ) -> Self { Self { - frame_type: FrameType::Page, page_index, page_type, spans, @@ -169,10 +176,6 @@ impl PageFrame { /// known after all pages have been processed. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct FooterFrame { - /// Frame discriminator (always "footer"). 
- #[serde(rename = "frame")] - pub frame_type: FrameType, - /// Aggregate extraction quality metrics. /// /// Includes overall quality, confidence statistics, OCR fraction, etc. @@ -186,31 +189,31 @@ pub struct FooterFrame { /// Thread information (for debugging and profiling). /// /// Empty in the initial implementation. - #[serde(skip_serializing_if = "Vec::is_empty")] + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub threads: Vec, /// Attachments extracted from the document. /// /// Empty in Phase 6; populated in Phase 7. - #[serde(skip_serializing_if = "Vec::is_empty")] + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub attachments: Vec, /// Digital signatures extracted from the document. /// /// Empty in Phase 6; populated in Phase 7. - #[serde(skip_serializing_if = "Vec::is_empty")] + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub signatures: Vec, /// Form fields extracted from the document. /// /// Empty in Phase 6; populated in Phase 7. - #[serde(skip_serializing_if = "Vec::is_empty")] + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub form_fields: Vec, /// Links extracted from the document. /// /// Empty in Phase 6; populated in Phase 7. - #[serde(skip_serializing_if = "Vec::is_empty")] + #[serde(default, skip_serializing_if = "Vec::is_empty")] pub links: Vec, } @@ -218,7 +221,6 @@ impl FooterFrame { /// Create a new footer frame. pub fn new(extraction_quality: ExtractionQuality, errors: Vec) -> Self { Self { - frame_type: FrameType::Footer, extraction_quality, errors, threads: Vec::new(), @@ -237,28 +239,71 @@ impl FooterFrame { } } +/// Write a single frame to a writer as a JSON line with trailing newline and flush. +/// +/// This is the primary function for emitting NDJSON frames during streaming extraction. +/// It serializes the frame, appends a newline, writes it to the writer, and flushes +/// to ensure immediate delivery to streaming consumers. 
+/// +/// # Arguments +/// +/// * `writer` - Any writer implementing `Write` (e.g., `File`, `BufWriter`, `Stdout`) +/// * `frame` - The frame to write (wrapped in `NdjsonFrame` enum) +/// +/// # Returns +/// +/// * `Ok(())` if the frame was written and flushed successfully +/// * `Err(io::Error)` if serialization or writing failed +/// +/// # Example +/// +/// ```ignore +/// use std::io::BufWriter; +/// use pdftract_core::output::ndjson::frames::{write_frame, NdjsonFrame, HeaderFrame}; +/// +/// let mut writer = BufWriter::new(file); +/// let header = HeaderFrame::new(...); +/// write_frame(&mut writer, &NdjsonFrame::Header(header))?; +/// ``` +pub fn write_frame<W: Write>(writer: &mut W, frame: &NdjsonFrame) -> std::io::Result<()> { + // Serialize the frame to JSON + let json_string = serde_json::to_string(frame) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + + // Write the JSON line with trailing newline + writer.write_all(json_string.as_bytes())?; + writer.write_all(b"\n")?; + + // Flush to ensure immediate delivery to streaming consumers + writer.flush()?; + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; + use std::io::Cursor; #[test] - fn test_header_frame_serialization() { + fn test_ndjson_frame_header_discriminator() { let header = HeaderFrame::new( "1.0".to_string(), serde_json::json!({"title": "Test", "author": "Test Author"}), Some(serde_json::json!([{"title": "Chapter 1", "level": 1}])), 10, ); + let frame = NdjsonFrame::Header(header); - let json = header.to_json_line().unwrap(); - assert!(json.contains("\"frame\":\"header\"")); + let json = serde_json::to_string(&frame).unwrap(); + // The "frame" key should appear first (serde internal tag) + assert!(json.starts_with("{\"frame\":\"header\"")); assert!(json.contains("\"schema_version\":\"1.0\"")); assert!(json.contains("\"total_pages\":10")); - assert!(json.ends_with('\n')); } #[test] - fn test_page_frame_serialization() { + fn test_ndjson_frame_page_discriminator() { let page =
PageFrame::new( 0, "content".to_string(), @@ -279,29 +324,132 @@ mod tests { vec![], vec![], ); + let frame = NdjsonFrame::Page(page); - let json = page.to_json_line().unwrap(); - assert!(json.contains("\"frame\":\"page\"")); + let json = serde_json::to_string(&frame).unwrap(); + // The "frame" key should appear first + assert!(json.starts_with("{\"frame\":\"page\"")); assert!(json.contains("\"page_index\":0")); assert!(json.contains("\"page_type\":\"content\"")); - assert!(json.ends_with('\n')); } #[test] - fn test_footer_frame_serialization() { + fn test_ndjson_frame_footer_discriminator() { let footer = FooterFrame::new(ExtractionQuality::new().with_quality("high"), vec![]); + let frame = NdjsonFrame::Footer(footer); - let json = footer.to_json_line().unwrap(); - assert!(json.contains("\"frame\":\"footer\"")); + let json = serde_json::to_string(&frame).unwrap(); + // The "frame" key should appear first + assert!(json.starts_with("{\"frame\":\"footer\"")); assert!(json.contains("\"overall_quality\":\"high\"")); - assert!(json.ends_with('\n')); + } + + #[test] + fn test_write_frame_includes_newline_and_flush() { + let header = HeaderFrame::new( + "1.0".to_string(), + serde_json::json!({"title": "Test"}), + None, + 1, + ); + let frame = NdjsonFrame::Header(header); + + let mut buffer = Vec::new(); + write_frame(&mut buffer, &frame).unwrap(); + + let output = String::from_utf8(buffer).unwrap(); + // Should end with newline + assert!(output.ends_with('\n')); + // Should contain the frame discriminator + assert!(output.contains("\"frame\":\"header\"")); + } + + #[test] + fn test_roundtrip_header_frame() { + let original = HeaderFrame::new( + "1.0".to_string(), + serde_json::json!({"title": "Test", "author": "Test Author"}), + Some(serde_json::json!([{"title": "Chapter 1", "level": 1}])), + 10, + ); + let frame = NdjsonFrame::Header(original.clone()); + + // Serialize + let json = serde_json::to_string(&frame).unwrap(); + + // Deserialize + let deserialized: 
NdjsonFrame = serde_json::from_str(&json).unwrap(); + + // Verify equality + assert_eq!(frame, deserialized); + + // Extract and verify the inner HeaderFrame + match deserialized { + NdjsonFrame::Header(header) => { + assert_eq!(header.schema_version, original.schema_version); + assert_eq!(header.metadata, original.metadata); + assert_eq!(header.outline, original.outline); + assert_eq!(header.total_pages, original.total_pages); + } + _ => panic!("Expected Header frame"), + } + } + + #[test] + fn test_roundtrip_page_frame() { + let original = PageFrame::new( + 0, + "content".to_string(), + vec![SpanJson { + text: "Hello".to_string(), + bbox: [0.0, 0.0, 100.0, 20.0], + font: "Helvetica".to_string(), + size: 12.0, + color: None, + rendering_mode: None, + confidence: None, + confidence_source: None, + lang: None, + flags: vec![], + receipt: None, + column: None, + }], + vec![], + vec![], + ); + let frame = NdjsonFrame::Page(original.clone()); + + // Serialize + let json = serde_json::to_string(&frame).unwrap(); + + // Deserialize + let deserialized: NdjsonFrame = serde_json::from_str(&json).unwrap(); + + // Verify equality + assert_eq!(frame, deserialized); + } + + #[test] + fn test_roundtrip_footer_frame() { + let original = FooterFrame::new(ExtractionQuality::new().with_quality("high"), vec![]); + let frame = NdjsonFrame::Footer(original.clone()); + + // Serialize + let json = serde_json::to_string(&frame).unwrap(); + + // Deserialize + let deserialized: NdjsonFrame = serde_json::from_str(&json).unwrap(); + + // Verify equality + assert_eq!(frame, deserialized); } #[test] fn test_page_frame_with_empty_collections() { let page = PageFrame::new(5, "blank".to_string(), vec![], vec![], vec![]); + let frame = NdjsonFrame::Page(page); - let json = page.to_json_line().unwrap(); + let json = serde_json::to_string(&frame).unwrap(); // Empty spans/blocks/tables should still be present assert!(json.contains("\"spans\":[]")); assert!(json.contains("\"blocks\":[]")); diff --git 
a/crates/pdftract-core/src/output/ndjson/mod.rs b/crates/pdftract-core/src/output/ndjson/mod.rs index 5840f12..51c0f88 100644 --- a/crates/pdftract-core/src/output/ndjson/mod.rs +++ b/crates/pdftract-core/src/output/ndjson/mod.rs @@ -16,5 +16,5 @@ pub mod frames; pub mod pipeline; pub use buffer::OutOfOrderBuffer; -pub use frames::{FooterFrame, HeaderFrame, PageFrame}; +pub use frames::{write_frame, FooterFrame, HeaderFrame, NdjsonFrame, PageFrame}; pub use pipeline::extract_streaming;