# - Add profile source modules: apply_profile, extraction, extraction_loader,
#   field_extractor, match_eval
# - Add profiles CLI subcommand (profiles_cmd.rs)
# - Update all 9 built-in profile YAMLs (invoice, receipt, contract,
#   scientific_paper, slide_deck, form, bank_statement, legal_filing,
#   book_chapter)
# - Add 50 invoice fixture PDFs
# - Add 2 receipt fixture PDFs
# Part of: pdftract-3a310 (Phase 7.10 coordinator)
# 115 lines, 2.6 KiB, YAML
# Invoice extraction profile
# Matches commercial invoices with line items, vendor/customer, and totals
name: invoice
description: Commercial invoice with line items, vendor/customer, and totals
# NOTE(review): how `priority` ranks this profile against the other built-in
# profiles is not visible in this file - confirm against the profile loader.
priority: 50

# Document-matching rules for this profile.
# NOTE(review): nesting below was reconstructed from key order and `- ` markers
# after indentation was lost; confirm against the profile schema. In
# particular, `none` is assumed to be a sibling of `all`, and `page_count` is
# assumed to belong to `structural`.
match:
  all:
    # Textual evidence: an invoice keyword in the text or an "Invoice" heading.
    - any:
        - text_contains:
            patterns: ["invoice", "bill to", "invoice #", "invoice number", "tax invoice"]
        - heading_matches:
            # Double-quoted YAML: `\\b` decodes to the regex token `\b`.
            pattern: "^Invoice\\b"
    # Layout evidence: a currency pattern or an invoice-like structure.
    - any:
        # NOTE(review): the predicate name is repeated as its own option key;
        # kept exactly as found - confirm this is the intended shape.
        - has_currency_pattern:
            has_currency_pattern: true
        - structural:
            has_table: true
            has_form_field: false
            has_math: false
            page_count:
              min: 1
              max: 5
  # Text that is listed as excluded for this profile.
  none:
    - text_contains:
        patterns: ["abstract", "bibliography", "scientific paper"]

# Document-level extraction options applied when this profile is selected.
# NOTE(review): the meaning and units of each option (for example the scale of
# `readability_threshold`) are defined by the consumer, not by this file.
extraction:
  reading_order: line_dominant
  table_detection: strict_borders
  readability_threshold: 0.4
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0

# Per-field extraction rules. Each field declares a `type` and an `extraction`
# mapping describing where and how to find its value.
# NOTE(review): nesting was reconstructed from key order after indentation was
# lost. `parse` and `pick` are assumed to live inside `extraction`; confirm
# against the profile schema.
fields:
  invoice_number:
    type: string
    extraction:
      # Decoded regex: Invoice\s*#\s*([\w-]+) - captures the token after
      # "Invoice #".
      # NOTE(review): the regex requires a literal `#`, so a label such as
      # "Invoice Number" (listed in `near`) is not matched by it - confirm how
      # `regex` and `near` combine.
      regex: "Invoice\\s*#\\s*([\\w-]+)"
      near: ["Invoice", "Invoice Number", "Invoice #"]
      max_distance_pt: 200
      parse: string

  vendor:
    type: string
    extraction:
      region: top_quarter
      pick: largest_font

  customer:
    type: string
    extraction:
      near: ["Bill To", "Customer", "Sold To"]
      max_distance_pt: 150
      pick: nearest_below
      parse: string

  invoice_date:
    type: date
    extraction:
      near: ["Date", "Invoice Date"]
      max_distance_pt: 100
      parse: date

  due_date:
    type: date
    extraction:
      near: ["Due Date", "Payment Due", "Due"]
      max_distance_pt: 100
      parse: date

  total:
    type: decimal
    extraction:
      # Decoded regex: ([\d,]+\.\d{2}) - digits/commas, a dot, exactly two
      # digits.
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Total", "Amount Due", "Balance Due", "Grand Total"]
      max_distance_pt: 80
      parse: decimal

  subtotal:
    type: decimal
    extraction:
      # Same amount pattern as `total`.
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Subtotal", "Sub-Total"]
      max_distance_pt: 80
      parse: decimal

  tax:
    type: decimal
    extraction:
      # Same amount pattern as `total`.
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Tax", "VAT", "GST", "Sales Tax"]
      max_distance_pt: 80
      parse: decimal

  line_items:
    type: array
    extraction:
      table_region: largest_table
      # Columns of each line-item row; only `description` is marked required.
      schema:
        - name: description
          type: string
          required: true
        - name: quantity
          type: decimal
          required: false
        - name: unit_price
          type: decimal
          required: false
        - name: amount
          type: decimal
          required: false
    # NOTE(review): `fallback` is placed at field level (sibling of `type` and
    # `extraction`); it may instead belong inside `extraction` - confirm.
    fallback: []