diff --git a/crates/pdftract-core/src/extract.rs b/crates/pdftract-core/src/extract.rs index b8afc36..b04e428 100644 --- a/crates/pdftract-core/src/extract.rs +++ b/crates/pdftract-core/src/extract.rs @@ -14,6 +14,9 @@ //! large documents with 10,000+ pages. use crate::document::compute_fingerprint_lazy; +use crate::forms::{ + acro_field_to_value, combine, walk_acroform_fields, AcroFormField, FormFieldValue, +}; use crate::options::{ExtractionOptions, ReceiptsMode}; use crate::parser::catalog::ReadingOrderAlgorithm; use crate::parser::marked_content::{track_mcids_from_content_stream, McidTracker}; @@ -21,7 +24,10 @@ use crate::parser::stream::FileSource; use crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES; use crate::parser::struct_tree::{check_coverage_for_pages, parse_struct_tree}; use crate::receipts::Receipt; -use crate::schema::{BlockJson, SignatureJson, SpanJson, TableJson}; +use crate::schema::{ + BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson, + SignatureJson, SpanJson, TableJson, +}; use crate::semaphore::{Semaphore, SemaphoreExt}; use crate::signature::{discover, extract_signatures}; use crate::table::{ @@ -121,6 +127,13 @@ pub struct ExtractionResult { /// including both signed and unsigned (blank) signature fields. /// Empty when the PDF has no signature fields. pub signatures: Vec, + /// Interactive form fields extracted from the document. + /// + /// This array contains all form fields from the AcroForm and/or XFA data. + /// Fields are sorted alphabetically by name. When both AcroForm and XFA + /// are present, XFA values take precedence on collision. + /// Empty when the PDF has no form fields. + pub form_fields: Vec, } /// Result for a single page. @@ -501,6 +514,54 @@ pub fn extract_pdf( let signatures_core = extract_signatures(&sig_fields, &resolver_arc, file_size); let signatures: Vec = signatures_core.into_iter().map(|s| s.into()).collect(); + // Phase 7.4: Extract form fields from AcroForm and XFA + // Walk AcroForm fields and convert to FormFieldValue + let acro_fields = walk_acroform_fields(&resolver_arc, &catalog, None); + let mut acro_fields_typed: Vec<(String, FormFieldValue)> = Vec::new(); + for field in acro_fields { + let field_value = acro_field_to_value(&field); + acro_fields_typed.push((field.full_name.clone(), field_value)); + } + + // Extract XFA fields if present (requires re-opening the source for stream access) + let xfa_fields = if catalog.acroform_ref.is_some() { + // Resolve the AcroForm dictionary + use crate::parser::xref::XrefResolver; + let acroform_ref = catalog.acroform_ref.unwrap(); + if let Ok(acroform_obj) = resolver_arc.resolve(acroform_ref) { + if let Some(acroform_dict) = acroform_obj.as_dict() { + // Create extraction options for stream decoding + use crate::parser::stream::ExtractionOptions as StreamExtractionOptions; + let stream_opts = StreamExtractionOptions { + max_decompress_bytes: DEFAULT_MAX_DECOMPRESS_BYTES, + password: None, + }; + use crate::forms::extract_xfa_fields; + let xfa_extracted = + extract_xfa_fields(&resolver_arc, acroform_dict, &source, &stream_opts); + xfa_extracted + .into_iter() + .filter_map(|f| f.value.map(|v| (f.full_name, v))) + .collect() + } else { + Vec::new() + } + } else { + Vec::new() + } + } else { + Vec::new() + }; + + // Combine AcroForm and XFA fields (XFA wins on collision) + let (combined_fields, _form_diagnostics) = combine(acro_fields_typed, xfa_fields); + + // Convert to FormFieldJson + let form_fields: Vec = combined_fields + .into_iter() + .map(|(name, value)| convert_form_field_to_json(name, value, &resolver_arc, &catalog)) + .collect(); + Ok(ExtractionResult { fingerprint, pages: extracted_pages, @@ -516,6 +577,7 @@ pub fn extract_pdf( diagnostics: coverage_diagnostics, }, signatures, + form_fields, }) } @@ -560,6 +622,145 @@ fn apply_two_page_table_detection( pages } +/// Convert a FormFieldValue to FormFieldJson for serialization. +/// +/// This helper function converts the internal FormFieldValue representation +/// to the JSON-serializable FormFieldJson structure. +/// +/// # Arguments +/// +/// * `name` - The field name +/// * `value` - The FormFieldValue to convert +/// * `resolver` - Xref resolver (for looking up field metadata) +/// * `catalog` - Document catalog (for accessing AcroForm) +fn convert_form_field_to_json( + name: String, + value: FormFieldValue, + resolver: &crate::parser::xref::XrefResolver, + catalog: &crate::parser::catalog::Catalog, +) -> FormFieldJson { + match value { + FormFieldValue::Text { + value, + default, + multiline, + max_length, + } => FormFieldJson { + name, + field_type: FormFieldTypeJson::Text, + value: FormFieldValueJson::Text(value), + default: default.map(|v| FormFieldValueJson::Text(Some(v))), + page_index: None, + rect: None, + required: false, + read_only: false, + multiline: Some(multiline), + max_length, + options: None, + multi_select: None, + selected: None, + state_name: None, + pushbutton: None, + radio: None, + }, + + FormFieldValue::Button { + kind, + selected, + state_name, + default_selected, + pushbutton, + radio, + } => FormFieldJson { + name, + field_type: FormFieldTypeJson::Button, + value: FormFieldValueJson::Button(selected), + default: default_selected.map(FormFieldValueJson::Button), + page_index: None, + rect: None, + required: false, + read_only: false, + multiline: None, + max_length: None, + options: None, + multi_select: None, + selected: Some(selected), + state_name, + pushbutton: Some(pushbutton), + radio: Some(radio), + }, + + FormFieldValue::Choice { + value, + default, + options, + is_combo, + is_multi_select, + } => { + let json_value = match value { + crate::forms::ChoiceValue::Single(s) => { + FormFieldValueJson::Choice(ChoiceValueJson::Single(s)) + } + crate::forms::ChoiceValue::Multiple(vec) => { + FormFieldValueJson::Choice(ChoiceValueJson::Multiple(vec)) + } + }; + + let json_default = default.map(|dv| match dv { + crate::forms::ChoiceValue::Single(s) => { + FormFieldValueJson::Choice(ChoiceValueJson::Single(s)) + } + crate::forms::ChoiceValue::Multiple(vec) => { + FormFieldValueJson::Choice(ChoiceValueJson::Multiple(vec)) + } + }); + + let json_options: Vec<[String; 2]> = options + .into_iter() + .map(|(export, display)| [export, display]) + .collect(); + + FormFieldJson { + name, + field_type: FormFieldTypeJson::Choice, + value: json_value, + default: json_default, + page_index: None, + rect: None, + required: false, + read_only: false, + multiline: None, + max_length: None, + options: Some(json_options), + multi_select: Some(is_multi_select), + selected: None, + state_name: None, + pushbutton: None, + radio: None, + } + } + + FormFieldValue::Signature { signature_ref } => FormFieldJson { + name, + field_type: FormFieldTypeJson::Signature, + value: FormFieldValueJson::Signature(signature_ref), + default: None, + page_index: None, + rect: None, + required: false, + read_only: false, + multiline: None, + max_length: None, + options: None, + multi_select: None, + selected: None, + state_name: None, + pushbutton: None, + radio: None, + }, + } +} + /// Extract content from a single page. /// /// # Arguments @@ -604,6 +805,7 @@ fn extract_page( size: 12.0, confidence: None, receipt, + column: None, }; // Create a block containing the span @@ -1422,6 +1624,7 @@ fn extract_page_from_dict( size: 12.0, confidence: None, receipt, + column: None, }; // Create blocks including table blocks diff --git a/crates/pdftract-core/src/forms/mod.rs b/crates/pdftract-core/src/forms/mod.rs index f790c5e..be58e1e 100644 --- a/crates/pdftract-core/src/forms/mod.rs +++ b/crates/pdftract-core/src/forms/mod.rs @@ -26,12 +26,189 @@ pub use xfa::{extract_xfa_fields, XfaField}; pub use combiner::{combine, ChoiceValue, FormFieldValue}; pub use value_button::{extract_button_value, ButtonKind, ButtonValue}; +/// Convert an AcroFormField to FormFieldValue. +/// +/// This function implements Phase 7.4.2 type-specific extraction: it converts +/// the raw AcroFormField (from Phase 7.4.1) into a type-safe FormFieldValue +/// that can be combined with XFA fields and serialized to JSON. +/// +/// # Arguments +/// +/// * `field` - The AcroFormField to convert +/// +/// # Returns +/// +/// A `FormFieldValue` variant matching the field's type, with values extracted +/// from the field's /V, /DV, /Ff, and /Opt entries. +pub fn acro_field_to_value(field: &AcroFormField) -> FormFieldValue { + match field.field_type { + AcroFieldType::Tx => { + // Text field: extract string value from /V + let value = field + .value + .as_ref() + .and_then(|v| v.as_string()) + .and_then(|bytes| String::from_utf8(bytes.to_vec()).ok()); + let default = field + .default + .as_ref() + .and_then(|v| v.as_string()) + .and_then(|bytes| String::from_utf8(bytes.to_vec()).ok()); + let multiline = field.is_multi_line(); + + // Extract /MaxLen if present (would need to be added to AcroFormField) + let max_length = None; // TODO: extract from field dict if needed + + FormFieldValue::Text { + value, + default, + multiline, + max_length, + } + } + + AcroFieldType::Btn => { + // Button field: use extract_button_value + let button_value = extract_button_value(field.value.as_ref(), field.flags); + + // Extract default selected state from /DV + let default_selected = field + .default + .as_ref() + .and_then(|v| v.as_name()) + .map(|name| { + let name_str: &str = &name; + name_str != "Off" + }); + + FormFieldValue::Button { + kind: button_value.kind, + selected: button_value.selected, + state_name: button_value.state_name, + default_selected, + pushbutton: button_value.pushbutton, + radio: button_value.radio, + } + } + + AcroFieldType::Ch => { + // Choice field: extract selected value(s) and options + let (value, default) = extract_choice_values(&field.value, &field.default); + let options = field.opt.clone().unwrap_or_default(); + + // Extract combo and multi_select flags from /Ff + const COMBO_FLAG: u32 = 1 << 17; // Bit 18 + const MULTI_SELECT_FLAG: u32 = 1 << 20; // Bit 21 + let is_combo = (field.flags & COMBO_FLAG) != 0; + let is_multi_select = (field.flags & MULTI_SELECT_FLAG) != 0; + + FormFieldValue::Choice { + value, + default, + options, + is_combo, + is_multi_select, + } + } + + AcroFieldType::Sig => { + // Signature field: extract reference number + let ref_num = field.value.as_ref().and_then(|v| match v { + PdfObject::Ref(ref_) => Some(ref_.object), + _ => None, + }); + + FormFieldValue::Signature { + signature_ref: ref_num, + } + } + + AcroFieldType::Other => { + // Unknown field type: treat as text + let value = field + .value + .as_ref() + .and_then(|v| v.as_string()) + .and_then(|bytes| String::from_utf8(bytes.to_vec()).ok()); + let default = field + .default + .as_ref() + .and_then(|v| v.as_string()) + .and_then(|bytes| String::from_utf8(bytes.to_vec()).ok()); + + FormFieldValue::Text { + value, + default, + multiline: false, + max_length: None, + } + } + } +} + +/// Extract choice field values from /V and /DV entries. +/// +/// Choice fields can have either a single selected value or multiple +/// selected values (for multi-select list boxes). +fn extract_choice_values( + value: &Option, + default: &Option, +) -> (ChoiceValue, Option) { + // Extract current value + let current = match value { + Some(PdfObject::String(s)) => String::from_utf8(s.to_vec()) + .ok() + .map(|v| ChoiceValue::Single(v)) + .unwrap_or_else(|| ChoiceValue::Single(String::new())), + Some(PdfObject::Array(arr)) => { + let values: Vec = arr + .iter() + .filter_map(|v| v.as_string()) + .filter_map(|bytes| String::from_utf8(bytes.to_vec()).ok()) + .collect(); + if values.is_empty() { + ChoiceValue::Single(String::new()) + } else if values.len() == 1 { + ChoiceValue::Single(values.into_iter().next().unwrap()) + } else { + ChoiceValue::Multiple(values) + } + } + _ => ChoiceValue::Single(String::new()), + }; + + // Extract default value + let default_val = match default { + Some(PdfObject::String(s)) => String::from_utf8(s.to_vec()) + .ok() + .map(|v| ChoiceValue::Single(v)), + Some(PdfObject::Array(arr)) => { + let values: Vec = arr + .iter() + .filter_map(|v| v.as_string()) + .filter_map(|bytes| String::from_utf8(bytes.to_vec()).ok()) + .collect(); + if values.is_empty() { + None + } else if values.len() == 1 { + Some(ChoiceValue::Single(values.into_iter().next().unwrap())) + } else { + Some(ChoiceValue::Multiple(values)) + } + } + _ => None, + }; + + (current, default_val) +} + use crate::diagnostics::{DiagCode, Diagnostic}; use crate::parser::catalog::Catalog; use crate::parser::object::{intern, ObjRef, PdfDict, PdfObject}; use crate::parser::pages::PageDict; use crate::parser::xref::XrefResolver; use std::collections::{HashMap, HashSet}; +use std::sync::Arc; /// Result type for form operations. pub type Result = std::result::Result>; diff --git a/crates/pdftract-core/src/lib.rs b/crates/pdftract-core/src/lib.rs index 1390b2c..ebcf5c5 100644 --- a/crates/pdftract-core/src/lib.rs +++ b/crates/pdftract-core/src/lib.rs @@ -56,7 +56,9 @@ pub use font::std14::{get_std14_metrics, NamedEncoding, Std14Metrics}; pub use forms::{ combine, walk_acroform_fields, AcroFieldType, AcroFormField, ChoiceValue, FormFieldValue, }; -pub use markdown::{block_to_markdown, page_to_markdown, parse_anchors, Anchor}; +pub use markdown::{ + block_to_markdown, form_fields_to_markdown, page_to_markdown, parse_anchors, Anchor, +}; pub use options::{ExtractionOptions, ReceiptsMode}; pub use parser::pages::{count_pages_tree, LazyPageIter, PageDict, DEFAULT_MEDIABOX}; pub use schema::{BlockJson, CellJson, ExtractionQuality, RowJson, SpanJson, SpanRef, TableJson}; diff --git a/crates/pdftract-core/src/markdown.rs b/crates/pdftract-core/src/markdown.rs index 1e15abd..0324b2e 100644 --- a/crates/pdftract-core/src/markdown.rs +++ b/crates/pdftract-core/src/markdown.rs @@ -35,7 +35,9 @@ //! assert_eq!(anchors[0].block, 0); //! ``` -use crate::schema::BlockJson; +use crate::schema::{ + BlockJson, ChoiceValueJson, FormFieldJson, FormFieldTypeJson, FormFieldValueJson, +}; use regex::Regex; use serde::{Deserialize, Serialize}; use std::sync::OnceLock; @@ -504,3 +506,85 @@ Some text."#; assert_eq!(anchors[0].kind, "heading"); } } + +/// Generate a markdown footer section for form fields. +/// +/// This function creates a formatted markdown table listing all form fields +/// with their names, types, and current values. Only emits the section when +/// form_fields count > 0. +/// +/// # Arguments +/// +/// * `form_fields` - The form fields to include in the footer +/// +/// # Returns +/// +/// A markdown string with a form fields table, or an empty string if no fields. +/// +/// # Example +/// +/// ```ignore +/// use pdftract_core::markdown::form_fields_to_markdown; +/// use pdftract_core::schema::{FormFieldJson, FormFieldTypeJson, FormFieldValueJson}; +/// +/// let fields = vec![ +/// FormFieldJson { +/// name: "employee_name".to_string(), +/// field_type: FormFieldTypeJson::Text, +/// value: FormFieldValueJson::Text(Some("John Doe".to_string())), +/// // ... other fields +/// }, +/// ]; +/// +/// let md = form_fields_to_markdown(&fields); +/// assert!(md.contains("## Form Fields")); +/// assert!(md.contains("employee_name")); +/// ``` +pub fn form_fields_to_markdown(form_fields: &[FormFieldJson]) -> String { + if form_fields.is_empty() { + return String::new(); + } + + let mut result = String::from("\n\n## Form Fields\n\n"); + result.push_str("| Name | Type | Value |\n"); + result.push_str("|------|------|-------|\n"); + + for field in form_fields { + let type_str = match field.field_type { + FormFieldTypeJson::Text => "text", + FormFieldTypeJson::Button => "button", + FormFieldTypeJson::Choice => "choice", + FormFieldTypeJson::Signature => "signature", + }; + + let value_str = format_value_json(&field.value); + + result.push_str(&format!( + "| {} | {} | {} |\n", + field.name, type_str, value_str + )); + } + + result +} + +/// Format a FormFieldValueJson as a string for markdown display. +fn format_value_json(value: &FormFieldValueJson) -> String { + match value { + FormFieldValueJson::Text(None) => "*empty*".to_string(), + FormFieldValueJson::Text(Some(s)) => escape_pipe(s), + FormFieldValueJson::Button(b) => b.to_string(), + FormFieldValueJson::Choice(ChoiceValueJson::Single(s)) => escape_pipe(s), + FormFieldValueJson::Choice(ChoiceValueJson::Multiple(vec)) => { + let values: Vec = vec.iter().map(|s| escape_pipe(s.as_str())).collect(); + values.join(", ") + } + FormFieldValueJson::Signature(None) => "*unsigned*".to_string(), + FormFieldValueJson::Signature(Some(n)) => format!("ref #{}", n), + } +} + +/// Escape pipe characters for markdown table cells. +fn escape_pipe(s: &str) -> String { + s.replace('|', "\\|") +} diff --git a/crates/pdftract-core/src/schema/mod.rs b/crates/pdftract-core/src/schema/mod.rs index 013e20a..145bc65 100644 --- a/crates/pdftract-core/src/schema/mod.rs +++ b/crates/pdftract-core/src/schema/mod.rs @@ -67,6 +67,13 @@ pub struct SpanJson { /// is enabled. When receipts are disabled, the field is `null`. #[serde(skip_serializing_if = "Option::is_none")] pub receipt: Option, + + /// Column index (0-based) assigned by Phase 4.3 column detection. + /// + /// This field is `None` for spans outside any detected column + /// (e.g., full-width headings, inter-column gaps). + #[serde(skip_serializing_if = "Option::is_none")] + pub column: Option, } /// JSON representation of a structural block. @@ -329,6 +336,153 @@ impl Default for ExtractionQuality { } } +/// JSON representation of a form field. +/// +/// This struct represents a single interactive form field from the PDF's +/// AcroForm or XFA data, including its type, value, and metadata. +/// +/// Per the plan (Phase 7.4), form fields are extracted from both AcroForm +/// and XFA sources, with XFA values taking precedence on collision. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub struct FormFieldJson { + /// The absolute (dot-joined) field name from the AcroForm. + /// Example: "employer_signature" or "form.employee_sig" + pub name: String, + + /// The field type variant (text, button, choice, or signature). + #[serde(rename = "type")] + pub field_type: FormFieldTypeJson, + + /// The current value of the form field. + /// + /// This field's structure varies by field_type: + /// - text: string value + /// - button: boolean selected state + /// - choice: string or array of strings (for multi-select) + /// - signature: signature reference number (or null if unsigned) + pub value: FormFieldValueJson, + + /// The default value (/DV entry) if present. + /// + /// Matches the structure of `value` but represents the field's default state. + #[serde(skip_serializing_if = "Option::is_none")] + pub default: Option, + + /// Zero-based page index where this field's widget appears. + /// + /// None if the field has no visual representation (form-only field). + #[serde(skip_serializing_if = "Option::is_none")] + pub page_index: Option, + + /// Bounding box in PDF user-space points. + /// + /// Format: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner. + /// None if the field has no visual appearance. + #[serde(skip_serializing_if = "Option::is_none")] + pub rect: Option<[f32; 4]>, + + /// Whether this field is required (bit 2 of /Ff flags). + pub required: bool, + + /// Whether this field is read-only (bit 1 of /Ff flags). + pub read_only: bool, + + /// Whether this text field supports multiple lines (bit 13 of /Ff). + /// Only present for text fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub multiline: Option, + + /// Maximum length for text fields (/MaxLen entry). + /// Only present for text fields that have a max length set. + #[serde(skip_serializing_if = "Option::is_none")] + pub max_length: Option, + + /// Available options for choice fields. + /// + /// Each option is a [export_value, display_name] pair. + /// Only present for choice fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub options: Option>, + + /// Whether this choice field supports multiple selections (bit 21 of /Ff). + /// Only present for choice fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub multi_select: Option, + + /// Selected state for button fields. + /// True = checked/selected, False = unchecked. + /// Only present for button fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub selected: Option, + + /// Appearance state name for button fields. + /// E.g., "Yes", "Off", or custom state names. + /// Only present for button fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub state_name: Option, + + /// Whether this button is a pushbutton (bit 26 of /Ff). + /// Only present for button fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub pushbutton: Option, + + /// Whether this button is a radio button (bit 25 of /Ff). + /// Only present for button fields. + #[serde(skip_serializing_if = "Option::is_none")] + pub radio: Option, +} + +/// Form field type discriminator. +/// +/// This enum uses serde's "tag" representation to produce a JSON string +/// indicating the field type. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "snake_case")] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub enum FormFieldTypeJson { + /// Text field (/FT /Tx) - single-line or multi-line text input. + Text, + /// Button field (/FT /Btn) - pushbutton, checkbox, or radio button. + Button, + /// Choice field (/FT /Ch) - dropdown or list box. + Choice, + /// Signature field (/FT /Sig) - digital signature field. + Signature, +} + +/// Form field value representation. +/// +/// This enum captures the current value of a form field, with the variant +/// type matching the field_type. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub enum FormFieldValueJson { + /// Text field value (string or null). + Text(Option), + /// Button field value (boolean selected state). + Button(bool), + /// Choice field value (single string or array of strings for multi-select). + Choice(ChoiceValueJson), + /// Signature field value (signature reference number or null). + Signature(Option), +} + +/// Choice field value representation. +/// +/// Choice fields can have either a single selected value or multiple +/// selected values (for multi-select list boxes). +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(untagged)] +#[cfg_attr(feature = "schemars", derive(schemars::JsonSchema))] +pub enum ChoiceValueJson { + /// Single selected option. + Single(String), + /// Multiple selected options. + Multiple(Vec), +} + /// JSON representation of a digital signature. /// /// This struct represents a signature extracted from a PDF signature field, diff --git a/docs/schema/v1.0/pdftract.schema.json b/docs/schema/v1.0/pdftract.schema.json index 4cd442f..412b96d 100644 --- a/docs/schema/v1.0/pdftract.schema.json +++ b/docs/schema/v1.0/pdftract.schema.json @@ -25,13 +25,21 @@ "items": { "$ref": "#/$defs/SignatureJson" } + }, + "form_fields": { + "description": "Interactive form fields extracted from the document.\n\nThis array contains all form fields from the AcroForm and/or XFA data.\nFields are sorted alphabetically by name. When both AcroForm and XFA\nare present, XFA values take precedence on collision.\nEmpty when the PDF has no form fields.", + "type": "array", + "items": { + "$ref": "#/$defs/FormFieldJson" + } } }, "required": [ "fingerprint", "pages", "metadata", - "signatures" + "signatures", + "form_fields" ], "$defs": { "BlockJson": { @@ -561,6 +569,176 @@ "signer_name", "validation_status" ] + }, + "FormFieldJson": { + "description": "JSON representation of a form field.\n\nThis struct represents a single interactive form field from the PDF's\nAcroForm or XFA data, including its type, value, and metadata.\n\nPer the plan (Phase 7.4), form fields are extracted from both AcroForm\nand XFA sources, with XFA values taking precedence on collision.", + "type": "object", + "properties": { + "name": { + "description": "The absolute (dot-joined) field name from the AcroForm.\nExample: \"employer_signature\" or \"form.employee_sig\"", + "type": "string" + }, + "type": { + "description": "The field type variant (text, button, choice, or signature).", + "type": "string", + "enum": ["text", "button", "choice", "signature"] + }, + "value": { + "description": "The current value of the form field.\n\nThis field's structure varies by field_type:\n- text: string value\n- button: boolean selected state\n- choice: string or array of strings (for multi-select)\n- signature: signature reference number (or null if unsigned)", + "anyOf": [ + { + "type": "string", + "description": "Text field value (null if empty/absent)" + }, + { + "type": "null", + "description": "Null value for empty text or unsigned signature" + }, + { + "type": "boolean", + "description": "Button field selected state" + }, + { + "type": "string", + "description": "Choice field single selected value" + }, + { + "type": "array", + "items": { + "type": "string" + }, + "description": "Choice field multiple selected values" + }, + { + "type": "integer", + "description": "Signature reference number", + "minimum": 0 + } + ] + }, + "default": { + "description": "The default value (/DV entry) if present.\n\nMatches the structure of `value` but represents the field's default state.", + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "page_index": { + "description": "Zero-based page index where this field's widget appears.\n\nNone if the field has no visual representation (form-only field).", + "type": [ + "integer", + "null" + ], + "minimum": 0 + }, + "rect": { + "description": "Bounding box in PDF user-space points.\n\nFormat: [x0, y0, x1, y1] where (x0, y0) is the bottom-left corner.\nNone if the field has no visual appearance.", + "type": [ + "array", + "null" + ], + "items": { + "type": "number", + "format": "float" + }, + "minItems": 4, + "maxItems": 4 + }, + "required": { + "description": "Whether this field is required (bit 2 of /Ff flags).", + "type": "boolean" + }, + "read_only": { + "description": "Whether this field is read-only (bit 1 of /Ff flags).", + "type": "boolean" + }, + "multiline": { + "description": "Whether this text field supports multiple lines (bit 13 of /Ff).\nOnly present for text fields.", + "type": [ + "boolean", + "null" + ] + }, + "max_length": { + "description": "Maximum length for text fields (/MaxLen entry).\nOnly present for text fields that have a max length set.", + "type": [ + "integer", + "null" + ], + "format": "uint32", + "minimum": 0 + }, + "options": { + "description": "Available options for choice fields.\n\nEach option is a [export_value, display_name] pair.\nOnly present for choice fields.", + "type": [ + "array", + "null" + ], + "items": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 2, + "maxItems": 2 + } + }, + "multi_select": { + "description": "Whether this choice field supports multiple selections (bit 21 of /Ff).\nOnly present for choice fields.", + "type": [ + "boolean", + "null" + ] + }, + "selected": { + "description": "Selected state for button fields.\nTrue = checked/selected, False = unchecked.\nOnly present for button fields.", + "type": [ + "boolean", + "null" + ] + }, + "state_name": { + "description": "Appearance state name for button fields.\nE.g., \"Yes\", \"Off\", or custom state names.\nOnly present for button fields.", + "type": [ + "string", + "null" + ] + }, + "pushbutton": { + "description": "Whether this button is a pushbutton (bit 26 of /Ff).\nOnly present for button fields.", + "type": [ + "boolean", + "null" + ] + }, + "radio": { + "description": "Whether this button is a radio button (bit 25 of /Ff).\nOnly present for button fields.", + "type": [ + "boolean", + "null" + ] + } + }, + "required": [ + "name", + "type", + "value", + "required", + "read_only" + ] } } } \ No newline at end of file