pdftract/profiles/builtin/invoice/profile.yaml
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

115 lines
2.6 KiB
YAML

# Built-in profile: invoice
# Targets commercial invoices (line items, vendor/customer details, totals).
name: invoice
description: 'Commercial invoice with line items, vendor/customer, and totals'
priority: 50
# Match rules for recognising a document as an invoice.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost. In particular, placing has_form_field,
# has_math and page_count under `structural` is an assumption -- confirm
# against the profile schema.
match:
  all:
    # Invoice wording, either in the text or in a heading.
    - any:
        - text_contains:
            patterns: ["invoice", "bill to", "invoice #", "invoice number", "tax invoice"]
        - heading_matches:
            pattern: "^Invoice\\b"
    # Supporting evidence: a currency pattern or the structural checks.
    - any:
        - has_currency_pattern:
            has_currency_pattern: true
        - structural:
            has_table: true
            has_form_field: false
            has_math: false
            page_count:
              min: 1
              max: 5
  # Presumably predicates that must NOT match; the listed terms are
  # academic-paper vocabulary.
  none:
    - text_contains:
        patterns: ["abstract", "bibliography", "scientific paper"]
# Document-level extraction settings for this profile.
# Indentation restored: every setting below is a child of `extraction`.
extraction:
  reading_order: line_dominant
  table_detection: strict_borders
  readability_threshold: 0.4
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0
# Per-field extraction rules, keyed by output field name.
# NOTE(review): the nesting was reconstructed from a copy of this file whose
# indentation had been lost. Assumed layout: `parse` belongs to each field's
# `extraction` mapping, while `schema` and `fallback` are siblings of `type`
# on `line_items` -- confirm against the profile schema.
fields:
  invoice_number:
    type: string
    extraction:
      # NOTE(review): this regex requires a literal "#", so text such as
      # "Invoice Number 123" (an anchor listed in `near`) will not match it.
      # Confirm whether `near` acts as an independent fallback.
      regex: "Invoice\\s*#\\s*([\\w-]+)"
      near: ["Invoice", "Invoice Number", "Invoice #"]
      max_distance_pt: 200
      parse: string

  vendor:
    type: string
    extraction:
      region: top_quarter
      pick: largest_font

  customer:
    type: string
    extraction:
      near: ["Bill To", "Customer", "Sold To"]
      max_distance_pt: 150
      pick: nearest_below
      parse: string

  invoice_date:
    type: date
    extraction:
      near: ["Date", "Invoice Date"]
      max_distance_pt: 100
      parse: date

  due_date:
    type: date
    extraction:
      near: ["Due Date", "Payment Due", "Due"]
      max_distance_pt: 100
      parse: date

  # total, subtotal and tax share one amount regex: digits and commas
  # followed by exactly two decimal places.
  # NOTE(review): some anchors overlap ("Total" is contained in "Sub-Total",
  # "Tax" in "Sales Tax") -- confirm how anchor matching disambiguates them.
  total:
    type: decimal
    extraction:
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Total", "Amount Due", "Balance Due", "Grand Total"]
      max_distance_pt: 80
      parse: decimal

  subtotal:
    type: decimal
    extraction:
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Subtotal", "Sub-Total"]
      max_distance_pt: 80
      parse: decimal

  tax:
    type: decimal
    extraction:
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Tax", "VAT", "GST", "Sales Tax"]
      max_distance_pt: 80
      parse: decimal

  # Line items are read from a table; `schema` lists the columns, of which
  # only `description` is required.
  line_items:
    type: array
    extraction:
      table_region: largest_table
    schema:
      - name: description
        type: string
        required: true
      - name: quantity
        type: decimal
        required: false
      - name: unit_price
        type: decimal
        required: false
      - name: amount
        type: decimal
        required: false
    fallback: []