pdftract/profiles/builtin/receipt/profile.yaml
jedarden 8b5dd4febb docs(pdftract-4iier): add per-profile README documentation for all 9 built-in profiles
This commit creates user-facing documentation for each built-in profile:

- Profile YAML files defining match criteria, priority, and extracted fields
- Per-profile READMEs with match criteria summary, extracted fields table,
  known limitations, sample input pointers, and configuration tips
- xtask skeleton generator for automated README generation

Profiles documented:
- invoice: Commercial invoices with line items, vendor/customer, totals
- receipt: POS receipts with items, payment method
- contract: Legal contracts with parties, effective date, term, signatures
- scientific_paper: Academic papers with title, authors, abstract, DOI, references
- slide_deck: Presentation slides with title, presenter, date, slide titles
- form: Fillable forms (degenerate case: uses Phase 7.4 form_fields)
- bank_statement: Bank statements with account info, period, balances, transactions
- legal_filing: Court filings with case number, court, parties, filing date, docket
- book_chapter: Book chapters with title, chapter number, author, section headings

Closes: pdftract-4iier
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 23:19:00 -04:00

68 lines
1.8 KiB
YAML

# Identification metadata for the receipt profile.
# Human-readable summary of the document class this profile targets.
description: "Point-of-sale or purchase receipt with items, payment method"
# NOTE(review): presumably ranks this profile against others when several
# match; confirm whether a higher or a lower number wins.
priority: 45
# Match criteria for the receipt profile.
# Presumably the profile is a candidate when any one matcher listed under
# `any` applies; confirm against the profile loader.
match:
  any:
    # Explicit "receipt" wording.
    # NOTE(review): every pattern below contains the literal "receipt", so if
    # patterns are searched unanchored the last three are subsumed by the
    # first. Kept unchanged.
    - text_patterns:
        - "(?i)receipt"
        - "(?i)store receipt"
        - "(?i)register receipt"
        - "(?i)transaction receipt"
    # Checkout and payment phrasing; each pattern needs both words in order.
    - text_patterns:
        - "(?i)total.*sold"
        - "(?i)change.*due"
        - "(?i)cash.*credit"
        - "(?i)card.*payment"
    # Layout-based signals.
    - structural:
        - has_monetary_columnar_layout: true
        - page_aspect_ratio: "narrow_or_square"
  # NOTE(review): nesting under `match` (sibling of `any`) is inferred from
  # the key order; confirm it is not meant to be a top-level key.
  page_count_hint: 1
# Fields extracted from a document matched by this profile.
# Each field declares a `type`, an `extraction` strategy and a `fallback`
# (presumably the value used when extraction yields nothing; confirm).
# Regexes are YAML double-quoted strings, so every regex backslash is written
# doubled (`\\s`, `\\b`); a single `\b` would be a YAML backspace escape.
profile_fields:
  merchant:
    type: string
    extraction:
      patterns:
        # NOTE(review): `(?i)` also lets `[A-Z]` accept lowercase letters, so
        # the leading-capital requirement is not enforced; confirm intent.
        - "(?i)^([A-Z][A-Za-z0-9\\s&']+)$"
        - "(?i)(?:store|merchant|retailer)\\s*:?\\s*([A-Z][A-Za-z0-9\\s&']+)"
    fallback: null
  date:
    type: date
    extraction:
      patterns:
        # Labelled date, e.g. "Date: 05/17/2026".
        - "(?i)date\\s*:?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
        # Unlabelled date followed by a time; group 1 is the date and
        # group 2 the HH:MM time.
        - "([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})\\s+([0-9]{1,2}:[0-9]{2})"
    fallback: null
  total:
    type: decimal
    extraction:
      patterns:
        # The leading `\b` stops the pattern from firing inside "Subtotal",
        # which usually precedes the real total on a receipt. Spaced or
        # hyphenated forms ("Sub Total", "Sub-total") still match.
        - "(?i)\\btotal\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
    fallback: null
  tax:
    type: decimal
    extraction:
      patterns:
        - "(?i)tax\\s*[:=]?\\s*[\\$€£¥]?\\s*([0-9,]+\\.?[0-9]*)"
    fallback: null
  items:
    type: array
    extraction:
      columnar_regions: "monetary_columns"
      # NOTE(review): `schema` is placed under `extraction` by inference from
      # the key order; confirm it is not a sibling of `extraction`.
      schema:
        - name: name
          type: string
          required: true
        - name: quantity
          type: decimal
          required: false
        - name: price
          type: decimal
          required: false
    fallback: []
  payment_method:
    type: string
    extraction:
      patterns:
        # Word boundaries keep the keywords from matching inside longer
        # words such as "Cashier", "checkout" or "discovery".
        - "(?i)\\b(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)\\b"
    fallback: null
# Text-assembly settings for this profile.
# NOTE(review): the meaning of both values is defined by the consumer;
# confirm the accepted values against the profile loader.
reading_order: "line_dominant"
zone_filtering: "exclude_headers_footers"