This commit creates user-facing documentation for each built-in profile:

- Profile YAML files defining match criteria, priority, and extracted fields
- Per-profile READMEs with match criteria summary, extracted fields table,
  known limitations, sample input pointers, and configuration tips
- xtask skeleton generator for automated README generation

Profiles documented:

- invoice: Commercial invoices with line items, vendor/customer, totals
- receipt: POS receipts with items, payment method
- contract: Legal contracts with parties, effective date, term, signatures
- scientific_paper: Academic papers with title, authors, abstract, DOI, references
- slide_deck: Presentation slides with title, presenter, date, slide titles
- form: Fillable forms (degenerate case: uses Phase 7.4 form_fields)
- bank_statement: Bank statements with account info, period, balances, transactions
- legal_filing: Court filings with case number, court, parties, filing date, docket
- book_chapter: Book chapters with title, chapter number, author, section headings

Closes: pdftract-4iier

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
94 lines
2.7 KiB
YAML
94 lines
2.7 KiB
YAML
---
# Document profile: commercial invoice.
#
# Defines the match criteria, priority, and extracted fields for invoices.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file. The placement of `page_count_hint` (under `match`), of each
# `fallback` key (at field level, beside `extraction`), and of `schema`
# (under `extraction`) is a best guess — confirm against the profile schema.
description: Commercial invoice with line items, vendor/customer, and totals
priority: 50

# Match criteria. Presumably the profile applies when any one of the groups
# listed under `any` matches — verify against the matcher implementation.
match:
  any:
    # Invoice-identifying phrases (case-insensitive).
    - text_patterns:
        - "(?i)invoice"
        - "(?i)bill to"
        - "(?i)invoice #"
        - "(?i)invoice number"
        - "(?i)tax invoice"
    # Payment / purchase-order phrases (case-insensitive).
    - text_patterns:
        - "(?i)due date"
        - "(?i)payment terms"
        - "(?i)purchase order"
        - "(?i)po #"
    # Layout-based signal.
    - structural:
        - has_line_item_table: true
  # Plain scalar `1-5` loads as a string; presumably an inclusive page range.
  page_count_hint: 1-5

# Fields extracted from a matched document. Each regex exposes the value in
# capture group 1. Patterns are double-quoted, so `\\` is a single regex
# backslash after YAML unescaping.
profile_fields:
  invoice_number:
    type: string
    extraction:
      patterns:
        # NOTE(review): nothing consumes a "number"/"no." label, so on
        # "Invoice Number: 123" group 1 captures "Number" — confirm intended.
        - "(?i)invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
        - "(?i)bill\\s*invoice\\s*[#:]?\\s*([A-Z0-9-]+)"
    fallback: null

  vendor:
    type: string
    extraction:
      patterns:
        # NOTE(review): uses a look-ahead `(?=...)`. Engines without
        # look-around (e.g. Rust `regex`, RE2) reject this — confirm the
        # consumer's regex engine supports it.
        # With `(?i)` active, `[A-Z]` also matches lowercase letters.
        - "(?i)(?:from|vendor|supplier|company)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
        - "(?i)^([A-Z][A-Za-z0-9\\s&]+)\\s+(?:Inc|LLC|Ltd|Corp|GmbH)"
    fallback: null

  customer:
    type: string
    extraction:
      patterns:
        # NOTE(review): same look-ahead caveat as `vendor`.
        - "(?i)(?:bill\\s*to|customer|client)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&]+?)(?=\\n|\\r|$)"
    fallback: null

  invoice_date:
    type: date
    extraction:
      patterns:
        - "(?i)invoice\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
        # NOTE(review): unanchored `date` also matches the tail of
        # "Due Date: ..." — presumably patterns are tried in order; verify.
        - "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
    fallback: null

  due_date:
    type: date
    extraction:
      patterns:
        - "(?i)due\\s*date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
        - "(?i)payment\\s*due\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
    fallback: null

  total:
    type: decimal
    extraction:
      patterns:
        # NOTE(review): `total` is unanchored, so it can also match inside
        # "Subtotal" / "Sub total" — confirm how the winning match is chosen.
        - "(?i)total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
        - "(?i)amount\\s*due\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
    fallback: null

  subtotal:
    type: decimal
    extraction:
      patterns:
        - "(?i)sub\\s*total\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
    fallback: null

  tax:
    type: decimal
    extraction:
      patterns:
        - "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
        - "(?i)vat\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
        - "(?i)gst\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
    fallback: null

  # Table-based field: rows are read from a table region rather than matched
  # by regex. Only `description` is marked required in the row schema.
  line_items:
    type: array
    extraction:
      table_region: "largest_table_or_bottom_half"
      schema:
        - name: description
          type: string
          required: true
        - name: quantity
          type: decimal
          required: false
        - name: unit_price
          type: decimal
          required: false
        - name: amount
          type: decimal
          required: false
    fallback: []

# Text-layout settings for this profile.
reading_order: line_dominant
zone_filtering: exclude_headers_footers