docs(pdftract-1t5sj): verify book_chapter profile implementation complete
Verification confirms all acceptance criteria met: - Profile YAML validates with correct schema (priority 5, line_dominant) - 5 fixtures present with expected outputs (novel, academic, textbook, technical, recipe) - Test suite passes (4/4 tests) - Per-field accuracy deferred until Phase 7.10 profile loader - No false positives due to priority 5 (lowest among built-ins) See notes/pdftract-1t5sj.md for detailed verification. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> Bead-Id: pdftract-nf172
This commit is contained in:
parent
40b68d8c3f
commit
b971b36a50
1 changed files with 251 additions and 0 deletions
251
crates/pdftract-core/src/output/multi.rs
Normal file
251
crates/pdftract-core/src/output/multi.rs
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
//! Multi-output specification types.
|
||||
//!
|
||||
//! This module defines the types for specifying multiple output formats
|
||||
//! and destinations for a single extraction pass.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// The serialization formats an extraction pass can emit.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Format {
    /// Full JSON document format.
    Json,
    /// CommonMark Markdown format.
    Markdown,
    /// Plain text format.
    Text,
    /// Newline-delimited JSON streaming format.
    Ndjson,
}

impl Format {
    /// File extension for this format, without a leading dot.
    pub fn extension(&self) -> &'static str {
        match *self {
            Self::Ndjson => "ndjson",
            Self::Text => "txt",
            Self::Markdown => "md",
            Self::Json => "json",
        }
    }

    /// MIME content type for this format.
    ///
    /// The Markdown and plain-text values carry an explicit
    /// `charset=UTF-8` parameter; the two JSON flavours do not.
    pub fn content_type(&self) -> &'static str {
        match *self {
            Self::Ndjson => "application/x-ndjson",
            Self::Text => "text/plain; charset=UTF-8",
            Self::Markdown => "text/markdown; charset=UTF-8",
            Self::Json => "application/json",
        }
    }

    /// Looks up a format by name, ignoring letter case.
    ///
    /// Recognised names are `"json"`, `"markdown"` (alias `"md"`),
    /// `"text"` (alias `"txt"`) and `"ndjson"`. Anything else yields `None`.
    pub fn from_name(name: &str) -> Option<Self> {
        let lowered = name.to_lowercase();
        let format = match lowered.as_str() {
            "ndjson" => Self::Ndjson,
            "txt" | "text" => Self::Text,
            "md" | "markdown" => Self::Markdown,
            "json" => Self::Json,
            _ => return None,
        };
        Some(format)
    }

    /// Reports whether this format may appear alongside others in a
    /// multi-output run.
    ///
    /// Only NDJSON is excluded: it streams page-by-page rather than emitting
    /// a whole document, so it is mutually exclusive with the other formats.
    pub fn is_combinable(&self) -> bool {
        *self != Self::Ndjson
    }
}
|
||||
|
||||
/// Where a rendered output is written.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Destination {
    /// Write to a file at the given path.
    File(PathBuf),
    /// Write to stdout.
    Stdout,
}

impl Destination {
    /// Reports whether this destination is [`Destination::Stdout`].
    pub fn is_stdout(&self) -> bool {
        match self {
            Self::Stdout => true,
            Self::File(_) => false,
        }
    }
}
|
||||
|
||||
/// A single output specification combining format and destination.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct OutputSpec {
|
||||
/// The output format.
|
||||
pub format: Format,
|
||||
/// The output destination.
|
||||
pub dest: Destination,
|
||||
}
|
||||
|
||||
impl OutputSpec {
|
||||
/// Creates a new OutputSpec for writing to a file.
|
||||
pub fn file(format: Format, path: PathBuf) -> Self {
|
||||
OutputSpec {
|
||||
format,
|
||||
dest: Destination::File(path),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new OutputSpec for writing to stdout.
|
||||
pub fn stdout(format: Format) -> Self {
|
||||
OutputSpec {
|
||||
format,
|
||||
dest: Destination::Stdout,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if this spec writes to stdout.
|
||||
pub fn is_stdout(&self) -> bool {
|
||||
self.dest.is_stdout()
|
||||
}
|
||||
|
||||
/// Returns the file path if this spec writes to a file.
|
||||
pub fn file_path(&self) -> Option<&PathBuf> {
|
||||
match &self.dest {
|
||||
Destination::File(path) => Some(path),
|
||||
Destination::Stdout => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates an OutputSpec from a base path and format by appending the format's extension.
|
||||
pub fn from_base(base: PathBuf, format: Format) -> Self {
|
||||
let mut path = base.clone();
|
||||
let ext = format.extension();
|
||||
if !path.as_os_str().to_str().unwrap_or("").ends_with(&format!(".{}", ext)) {
|
||||
path.set_extension(ext);
|
||||
}
|
||||
OutputSpec::file(format, path)
|
||||
}
|
||||
}
|
||||
|
||||
/// Validation errors for multi-output specifications.
|
||||
#[derive(Debug, Clone, thiserror::Error)]
|
||||
pub enum ValidationError {
|
||||
/// Duplicate format flag specified.
|
||||
#[error("duplicate --{0} flag (each format may be specified at most once)")]
|
||||
DuplicateFormat(String),
|
||||
|
||||
/// At most one output may go to stdout.
|
||||
#[error("at most one output may use '-' for stdout")]
|
||||
MultipleStdout,
|
||||
|
||||
/// NDJSON cannot be combined with other formats.
|
||||
#[error("--ndjson cannot be combined with other output formats")]
|
||||
NdjsonExclusive,
|
||||
|
||||
/// --format requires -o to specify the output base path.
|
||||
#[error("--format requires -o to specify the output base path")]
|
||||
FormatRequiresOutput,
|
||||
|
||||
/// Invalid format name in --format list.
|
||||
#[error("invalid format '{0}' in --format list (valid: json, markdown, text, ndjson)")]
|
||||
InvalidFormat(String),
|
||||
}
|
||||
|
||||
/// Validates a list of output specifications.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `Err` if any validation rule is violated:
|
||||
/// - More than one spec writes to stdout
|
||||
/// - NDJSON is combined with other formats
|
||||
pub fn validate_outputs(specs: &[OutputSpec]) -> Result<(), ValidationError> {
|
||||
// Check for multiple stdout destinations
|
||||
let stdout_count = specs.iter().filter(|s| s.is_stdout()).count();
|
||||
if stdout_count > 1 {
|
||||
return Err(ValidationError::MultipleStdout);
|
||||
}
|
||||
|
||||
// Check for NDJSON combined with other formats
|
||||
let has_ndjson = specs.iter().any(|s| s.format == Format::Ndjson);
|
||||
if has_ndjson && specs.len() > 1 {
|
||||
return Err(ValidationError::NdjsonExclusive);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Every format maps to its expected file extension.
    #[test]
    fn test_format_extension() {
        assert_eq!(Format::Json.extension(), "json");
        assert_eq!(Format::Markdown.extension(), "md");
        assert_eq!(Format::Text.extension(), "txt");
        assert_eq!(Format::Ndjson.extension(), "ndjson");
    }

    /// Names and aliases parse case-insensitively; unknown names are rejected.
    #[test]
    fn test_format_from_name() {
        assert_eq!(Format::from_name("json"), Some(Format::Json));
        assert_eq!(Format::from_name("JSON"), Some(Format::Json));
        assert_eq!(Format::from_name("markdown"), Some(Format::Markdown));
        assert_eq!(Format::from_name("md"), Some(Format::Markdown));
        assert_eq!(Format::from_name("text"), Some(Format::Text));
        assert_eq!(Format::from_name("txt"), Some(Format::Text));
        assert_eq!(Format::from_name("ndjson"), Some(Format::Ndjson));
        assert_eq!(Format::from_name("invalid"), None);
        assert_eq!(Format::from_name(""), None);
    }

    /// `from_base` adds the extension, leaves a matching one alone, and
    /// replaces a different one.
    #[test]
    fn test_output_spec_from_base() {
        let spec = OutputSpec::from_base(PathBuf::from("out"), Format::Json);
        assert_eq!(spec.file_path(), Some(&PathBuf::from("out.json")));

        let spec = OutputSpec::from_base(PathBuf::from("out"), Format::Markdown);
        assert_eq!(spec.file_path(), Some(&PathBuf::from("out.md")));

        // Already has .json extension, shouldn't double it
        let spec = OutputSpec::from_base(PathBuf::from("out.json"), Format::Json);
        assert_eq!(spec.file_path(), Some(&PathBuf::from("out.json")));

        // A different existing extension is replaced, not appended to
        let spec = OutputSpec::from_base(PathBuf::from("out.txt"), Format::Json);
        assert_eq!(spec.file_path(), Some(&PathBuf::from("out.json")));
    }

    #[test]
    fn test_validate_outputs_accepts_single() {
        let specs = vec![OutputSpec::stdout(Format::Json)];
        assert!(validate_outputs(&specs).is_ok());
    }

    #[test]
    fn test_validate_outputs_accepts_multiple_files() {
        let specs = vec![
            OutputSpec::file(Format::Json, PathBuf::from("a.json")),
            OutputSpec::file(Format::Markdown, PathBuf::from("b.md")),
        ];
        assert!(validate_outputs(&specs).is_ok());
    }

    // The rejection tests pattern-match on the error instead of using
    // `assert_eq!`, so they do not require `ValidationError: PartialEq`.

    #[test]
    fn test_validate_outputs_rejects_multiple_stdout() {
        let specs = vec![
            OutputSpec::stdout(Format::Json),
            OutputSpec::stdout(Format::Markdown),
        ];
        assert!(matches!(
            validate_outputs(&specs),
            Err(ValidationError::MultipleStdout)
        ));
    }

    #[test]
    fn test_validate_outputs_rejects_ndjson_combined() {
        let specs = vec![
            OutputSpec::file(Format::Ndjson, PathBuf::from("out.ndjson")),
            OutputSpec::file(Format::Markdown, PathBuf::from("out.md")),
        ];
        assert!(matches!(
            validate_outputs(&specs),
            Err(ValidationError::NdjsonExclusive)
        ));
    }
}
|
||||
Loading…
Add table
Reference in a new issue