feat(pdftract-2kpm0): implement NdjsonFrame enum with internal-tag discriminator and write_frame helper
- Add unified NdjsonFrame enum with serde internal tagging (tag = "frame")
- Remove frame_type field from individual frame structs (HeaderFrame, PageFrame, FooterFrame)
- Add write_frame<W: Write>() helper that serializes, adds newline, and flushes
- Add #[serde(default)] to optional fields for proper deserialization
- Add roundtrip tests for all frame types
- Add test verifying frame discriminator appears first in JSON output
- Update module exports to include NdjsonFrame and write_frame

Per plan 6.2.1: frame sequence (lines 2038-2042)

Closes: pdftract-2kpm0
This commit is contained in:
parent
3ac47215cf
commit
fa57ab3e90
2 changed files with 184 additions and 36 deletions
|
|
@ -8,6 +8,23 @@
|
|||
use crate::schema::{BlockJson, ExtractionQuality, SpanJson, TableJson};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
use std::io::Write;
|
||||
|
||||
/// Unified NDJSON frame enum with internal-tag discriminator.
|
||||
///
|
||||
/// This enum uses serde's internal tagging with the "frame" field as the tag.
|
||||
/// When serialized, the "frame" field appears first with values "header", "page",
|
||||
/// or "footer", allowing consumers to dispatch to the appropriate handler.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(tag = "frame", rename_all = "lowercase")]
|
||||
pub enum NdjsonFrame {
|
||||
/// Header frame containing document metadata.
|
||||
Header(HeaderFrame),
|
||||
/// Page frame containing a single page's extraction result.
|
||||
Page(PageFrame),
|
||||
/// Footer frame containing aggregated metrics and diagnostics.
|
||||
Footer(FooterFrame),
|
||||
}
|
||||
|
||||
/// Frame discriminator field.
|
||||
///
|
||||
|
|
@ -29,10 +46,6 @@ pub enum FrameType {
|
|||
/// Contains document-level metadata that is known before page processing begins.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct HeaderFrame {
|
||||
/// Frame discriminator (always "header").
|
||||
#[serde(rename = "frame")]
|
||||
pub frame_type: FrameType,
|
||||
|
||||
/// Schema version identifier.
|
||||
///
|
||||
/// Consumers should check this field to ensure compatibility.
|
||||
|
|
@ -64,7 +77,6 @@ impl HeaderFrame {
|
|||
total_pages: usize,
|
||||
) -> Self {
|
||||
Self {
|
||||
frame_type: FrameType::Header,
|
||||
schema_version,
|
||||
metadata,
|
||||
outline,
|
||||
|
|
@ -86,10 +98,6 @@ impl HeaderFrame {
|
|||
/// and output in page_index order by the streaming pipeline.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PageFrame {
|
||||
/// Frame discriminator (always "page").
|
||||
#[serde(rename = "frame")]
|
||||
pub frame_type: FrameType,
|
||||
|
||||
/// Zero-based page index.
|
||||
///
|
||||
/// Consumers use this to reorder pages if processing concurrently.
|
||||
|
|
@ -118,13 +126,13 @@ pub struct PageFrame {
|
|||
/// Annotations (highlights, stamps, notes, links).
|
||||
///
|
||||
/// Empty in Phase 6; populated in Phase 7.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub annotations: Vec<Value>,
|
||||
|
||||
/// Optional page-level diagnostics.
|
||||
///
|
||||
/// Present only if there were errors or warnings during extraction.
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub errors: Option<Vec<Value>>,
|
||||
}
|
||||
|
||||
|
|
@ -138,7 +146,6 @@ impl PageFrame {
|
|||
tables: Vec<TableJson>,
|
||||
) -> Self {
|
||||
Self {
|
||||
frame_type: FrameType::Page,
|
||||
page_index,
|
||||
page_type,
|
||||
spans,
|
||||
|
|
@ -169,10 +176,6 @@ impl PageFrame {
|
|||
/// known after all pages have been processed.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct FooterFrame {
|
||||
/// Frame discriminator (always "footer").
|
||||
#[serde(rename = "frame")]
|
||||
pub frame_type: FrameType,
|
||||
|
||||
/// Aggregate extraction quality metrics.
|
||||
///
|
||||
/// Includes overall quality, confidence statistics, OCR fraction, etc.
|
||||
|
|
@ -186,31 +189,31 @@ pub struct FooterFrame {
|
|||
/// Thread information (for debugging and profiling).
|
||||
///
|
||||
/// Empty in the initial implementation.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub threads: Vec<Value>,
|
||||
|
||||
/// Attachments extracted from the document.
|
||||
///
|
||||
/// Empty in Phase 6; populated in Phase 7.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub attachments: Vec<Value>,
|
||||
|
||||
/// Digital signatures extracted from the document.
|
||||
///
|
||||
/// Empty in Phase 6; populated in Phase 7.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub signatures: Vec<Value>,
|
||||
|
||||
/// Form fields extracted from the document.
|
||||
///
|
||||
/// Empty in Phase 6; populated in Phase 7.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub form_fields: Vec<Value>,
|
||||
|
||||
/// Links extracted from the document.
|
||||
///
|
||||
/// Empty in Phase 6; populated in Phase 7.
|
||||
#[serde(skip_serializing_if = "Vec::is_empty")]
|
||||
#[serde(default, skip_serializing_if = "Vec::is_empty")]
|
||||
pub links: Vec<Value>,
|
||||
}
|
||||
|
||||
|
|
@ -218,7 +221,6 @@ impl FooterFrame {
|
|||
/// Create a new footer frame.
|
||||
pub fn new(extraction_quality: ExtractionQuality, errors: Vec<Value>) -> Self {
|
||||
Self {
|
||||
frame_type: FrameType::Footer,
|
||||
extraction_quality,
|
||||
errors,
|
||||
threads: Vec::new(),
|
||||
|
|
@ -237,28 +239,71 @@ impl FooterFrame {
|
|||
}
|
||||
}
|
||||
|
||||
/// Write a single frame to a writer as a JSON line with trailing newline and flush.
|
||||
///
|
||||
/// This is the primary function for emitting NDJSON frames during streaming extraction.
|
||||
/// It serializes the frame, appends a newline, writes it to the writer, and flushes
|
||||
/// to ensure immediate delivery to streaming consumers.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `writer` - Any writer implementing `Write` (e.g., `File`, `BufWriter`, `Stdout`)
|
||||
/// * `frame` - The frame to write (wrapped in `NdjsonFrame` enum)
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// * `Ok(())` if the frame was written and flushed successfully
|
||||
/// * `Err(io::Error)` if serialization or writing failed
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```ignore
|
||||
/// use std::io::BufWriter;
|
||||
/// use pdftract_core::output::ndjson::frames::{write_frame, NdjsonFrame, HeaderFrame};
|
||||
///
|
||||
/// let mut writer = BufWriter::new(file);
|
||||
/// let header = HeaderFrame::new(...);
|
||||
/// write_frame(&mut writer, &NdjsonFrame::Header(header))?;
|
||||
/// ```
|
||||
pub fn write_frame<W: Write>(writer: &mut W, frame: &NdjsonFrame) -> std::io::Result<()> {
|
||||
// Serialize the frame to JSON
|
||||
let json_string = serde_json::to_string(frame)
|
||||
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
|
||||
|
||||
// Write the JSON line with trailing newline
|
||||
writer.write_all(json_string.as_bytes())?;
|
||||
writer.write_all(b"\n")?;
|
||||
|
||||
// Flush to ensure immediate delivery to streaming consumers
|
||||
writer.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::io::Cursor;
|
||||
|
||||
#[test]
|
||||
fn test_header_frame_serialization() {
|
||||
fn test_ndjson_frame_header_discriminator() {
|
||||
let header = HeaderFrame::new(
|
||||
"1.0".to_string(),
|
||||
serde_json::json!({"title": "Test", "author": "Test Author"}),
|
||||
Some(serde_json::json!([{"title": "Chapter 1", "level": 1}])),
|
||||
10,
|
||||
);
|
||||
let frame = NdjsonFrame::Header(header);
|
||||
|
||||
let json = header.to_json_line().unwrap();
|
||||
assert!(json.contains("\"frame\":\"header\""));
|
||||
let json = serde_json::to_string(&frame).unwrap();
|
||||
// The "frame" key should appear first (serde internal tag)
|
||||
assert!(json.starts_with("{\"frame\":\"header\""));
|
||||
assert!(json.contains("\"schema_version\":\"1.0\""));
|
||||
assert!(json.contains("\"total_pages\":10"));
|
||||
assert!(json.ends_with('\n'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_page_frame_serialization() {
|
||||
fn test_ndjson_frame_page_discriminator() {
|
||||
let page = PageFrame::new(
|
||||
0,
|
||||
"content".to_string(),
|
||||
|
|
@ -279,29 +324,132 @@ mod tests {
|
|||
vec![],
|
||||
vec![],
|
||||
);
|
||||
let frame = NdjsonFrame::Page(page);
|
||||
|
||||
let json = page.to_json_line().unwrap();
|
||||
assert!(json.contains("\"frame\":\"page\""));
|
||||
let json = serde_json::to_string(&frame).unwrap();
|
||||
// The "frame" key should appear first
|
||||
assert!(json.starts_with("{\"frame\":\"page\""));
|
||||
assert!(json.contains("\"page_index\":0"));
|
||||
assert!(json.contains("\"page_type\":\"content\""));
|
||||
assert!(json.ends_with('\n'));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_footer_frame_serialization() {
|
||||
fn test_ndjson_frame_footer_discriminator() {
|
||||
let footer = FooterFrame::new(ExtractionQuality::new().with_quality("high"), vec![]);
|
||||
let frame = NdjsonFrame::Footer(footer);
|
||||
|
||||
let json = footer.to_json_line().unwrap();
|
||||
assert!(json.contains("\"frame\":\"footer\""));
|
||||
let json = serde_json::to_string(&frame).unwrap();
|
||||
// The "frame" key should appear first
|
||||
assert!(json.starts_with("{\"frame\":\"footer\""));
|
||||
assert!(json.contains("\"overall_quality\":\"high\""));
|
||||
assert!(json.ends_with('\n'));
|
||||
}
|
||||
|
||||
#[test]
fn test_write_frame_includes_newline_and_flush() {
    /// In-memory writer that stores the bytes it receives and counts `flush` calls,
    /// so the test can observe flushing (flushing a plain `Vec<u8>` is a no-op and
    /// leaves nothing to assert on).
    struct FlushTracker {
        bytes: Vec<u8>,
        flush_calls: usize,
    }

    impl std::io::Write for FlushTracker {
        fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
            self.bytes.extend_from_slice(buf);
            Ok(buf.len())
        }

        fn flush(&mut self) -> std::io::Result<()> {
            self.flush_calls += 1;
            Ok(())
        }
    }

    let header = HeaderFrame::new(
        "1.0".to_string(),
        serde_json::json!({"title": "Test"}),
        None,
        1,
    );
    let frame = NdjsonFrame::Header(header);

    let mut writer = FlushTracker {
        bytes: Vec::new(),
        flush_calls: 0,
    };
    write_frame(&mut writer, &frame).unwrap();

    // The writer must have been flushed at least once after the frame was written.
    assert!(writer.flush_calls >= 1);

    let output = String::from_utf8(writer.bytes).unwrap();
    // Should end with newline
    assert!(output.ends_with('\n'));
    // Exactly one line terminator: one frame is one NDJSON line.
    assert_eq!(output.matches('\n').count(), 1);
    // Should contain the frame discriminator
    assert!(output.contains("\"frame\":\"header\""));
}
|
||||
|
||||
#[test]
fn test_roundtrip_header_frame() {
    let expected = HeaderFrame::new(
        "1.0".to_string(),
        serde_json::json!({"title": "Test", "author": "Test Author"}),
        Some(serde_json::json!([{"title": "Chapter 1", "level": 1}])),
        10,
    );
    let wrapped = NdjsonFrame::Header(expected.clone());

    // Encode to a JSON string and decode it straight back.
    let encoded = serde_json::to_string(&wrapped).unwrap();
    let decoded: NdjsonFrame = serde_json::from_str(&encoded).unwrap();

    // The decoded frame must compare equal to the one that was encoded.
    assert_eq!(wrapped, decoded);

    // It must also still be the Header variant, with every field intact.
    if let NdjsonFrame::Header(actual) = decoded {
        assert_eq!(actual.schema_version, expected.schema_version);
        assert_eq!(actual.metadata, expected.metadata);
        assert_eq!(actual.outline, expected.outline);
        assert_eq!(actual.total_pages, expected.total_pages);
    } else {
        panic!("Expected Header frame");
    }
}
|
||||
|
||||
#[test]
fn test_roundtrip_page_frame() {
    // A single span with its `Option` fields set to `None` and no flags.
    let hello_span = SpanJson {
        text: "Hello".to_string(),
        bbox: [0.0, 0.0, 100.0, 20.0],
        font: "Helvetica".to_string(),
        size: 12.0,
        color: None,
        rendering_mode: None,
        confidence: None,
        confidence_source: None,
        lang: None,
        flags: vec![],
        receipt: None,
        column: None,
    };
    let page = PageFrame::new(0, "content".to_string(), vec![hello_span], vec![], vec![]);
    let wrapped = NdjsonFrame::Page(page);

    // Encode to a JSON string and decode it straight back.
    let encoded = serde_json::to_string(&wrapped).unwrap();
    let decoded: NdjsonFrame = serde_json::from_str(&encoded).unwrap();

    // The decoded frame must compare equal to the one that was encoded.
    assert_eq!(wrapped, decoded);
}
|
||||
|
||||
#[test]
fn test_roundtrip_footer_frame() {
    let quality = ExtractionQuality::new().with_quality("high");
    let wrapped = NdjsonFrame::Footer(FooterFrame::new(quality, vec![]));

    // Encode to a JSON string and decode it straight back.
    let encoded = serde_json::to_string(&wrapped).unwrap();
    let decoded: NdjsonFrame = serde_json::from_str(&encoded).unwrap();

    // The decoded frame must compare equal to the one that was encoded.
    assert_eq!(wrapped, decoded);
}
|
||||
|
||||
#[test]
|
||||
fn test_page_frame_with_empty_collections() {
|
||||
let page = PageFrame::new(5, "blank".to_string(), vec![], vec![], vec![]);
|
||||
let frame = NdjsonFrame::Page(page);
|
||||
|
||||
let json = page.to_json_line().unwrap();
|
||||
let json = serde_json::to_string(&frame).unwrap();
|
||||
// Empty spans/blocks/tables should still be present
|
||||
assert!(json.contains("\"spans\":[]"));
|
||||
assert!(json.contains("\"blocks\":[]"));
|
||||
|
|
|
|||
|
|
@ -16,5 +16,5 @@ pub mod frames;
|
|||
pub mod pipeline;
|
||||
|
||||
pub use buffer::OutOfOrderBuffer;
|
||||
pub use frames::{FooterFrame, HeaderFrame, PageFrame};
|
||||
pub use frames::{write_frame, FooterFrame, HeaderFrame, NdjsonFrame, PageFrame};
|
||||
pub use pipeline::extract_streaming;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue