- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval - Add profiles CLI subcommand (profiles_cmd.rs) - Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter) - Add 50 invoice fixture PDFs - Add 2 receipt fixture PDFs Part of: pdftract-3a310 (Phase 7.10 coordinator)
34 lines
882 B
YAML
34 lines
882 B
YAML
# Extraction profile: form
# Targets fillable forms with fields; text is read in line_dominant order.
name: form
description: >-
  Fillable form with fields; uses line_dominant reading order and
  form_fields from Phase 7.4
priority: 30
|
|
|
|
# Match rules for the form profile.
# NOTE(review): the nesting below was rebuilt from a listing that had lost its
# indentation -- confirm that the page-count predicate is a direct child of
# `all` (a sibling of `any`) and not a third alternative inside `any`.
match:
  all:
    # Alternatives: form-like wording in the text, or a structural predicate
    # with has_form_field set to true.
    - any:
        - text_contains:
            patterns:
              - 'form'
              - 'application form'
              - 'questionnaire'
              - 'please fill out'
              - 'required fields'
        - structural:
            has_table: false
            has_form_field: true
            has_math: false
            page_count: null
    # Structural predicate carrying the page-count bounds (min 1, max 10).
    - structural:
        has_table: false
        has_form_field: false
        has_math: false
        page_count:
          min: 1
          max: 10
|
|
|
|
# Extraction settings for the form profile.
extraction:
  reading_order: line_dominant
  # Quoted on purpose: a bare `off` is a boolean literal under YAML 1.1
  # resolution rules, so some loaders would yield `false` instead of the
  # string. Quoting keeps the value a string for every loader.
  table_detection: 'off'
  readability_threshold: 0.5
  include_invisible: false
  include_headers_footers: true
  force_ocr: false
  min_block_chars: 0
|
|
|
|
# Explicit empty mapping: this profile declares no entries under `fields`.
fields: {}
|