pdftract/profiles/builtin/receipt/profile.yaml
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

81 lines
1.8 KiB
YAML

# Built-in profile: receipt
# Targets point-of-sale or purchase receipts that list items and a payment method.
name: "receipt"
description: "Point-of-sale or purchase receipt with items, payment method"
# NOTE(review): presumably ranks this profile against other matching profiles;
# confirm whether a higher or a lower number wins.
priority: 45
# Rule that decides whether a document is treated as a receipt.
# NOTE(review): the nesting below was reconstructed from a listing that had
# lost its indentation. Confirm against the profile schema that `structural`
# is the second member of `all` and that `page_count` belongs under
# `structural`.
match:
  all:
    # Text signal: two alternative pattern lists grouped under `any`.
    - any:
        - text_contains:
            patterns:
              - "receipt"
              - "store receipt"
              - "register receipt"
              - "transaction receipt"
        - text_contains:
            patterns:
              - "total sold"
              - "change due"
              - "cash credit"
              - "card payment"
    # Layout signal: a table, no form fields, no math, one or two pages.
    - structural:
        has_table: true
        has_form_field: false
        has_math: false
        page_count:
          min: 1
          max: 2
# Document-level extraction options applied when this profile is selected.
# NOTE(review): indentation was reconstructed from a flattened listing; the
# keys below are assumed to be direct children of `extraction` — confirm
# against the profile schema.
extraction:
  reading_order: line_dominant
  table_detection: default
  readability_threshold: 0.5
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0
# Structured fields pulled out of a matched receipt.
# NOTE(review): indentation was reconstructed from a flattened listing. Each
# field is assumed to be `type` plus an `extraction` mapping holding every
# strategy key that follows it (including `schema` and `fallback` for
# `items`) — confirm against the profile schema.
fields:
  # Merchant name: picked by font size from the top quarter of the page.
  merchant:
    type: string
    extraction:
      region: top_quarter
      pick: largest_font
      parse: string

  # Transaction date: D/M/Y-style numeric date separated by "/" or "-".
  date:
    type: date
    extraction:
      regex: "\\d{1,2}[/-]\\d{1,2}[/-]\\d{2,4}"
      parse: date

  # Grand total: amount with two decimals (commas allowed) near a total label.
  total:
    type: decimal
    extraction:
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Total", "Amount Due", "Balance"]
      # NOTE(review): presumably a distance in PDF points from the label.
      max_distance_pt: 80
      parse: decimal

  # Tax amount: same amount pattern, anchored to a tax label instead.
  tax:
    type: decimal
    extraction:
      regex: "([\\d,]+\\.\\d{2})"
      near: ["Tax", "VAT"]
      max_distance_pt: 80
      parse: decimal

  # Line items: rows read from the largest table, one entry per row.
  items:
    type: array
    extraction:
      table_region: largest_table
      schema:
        - name: name
          type: string
          required: true
        - name: quantity
          type: decimal
          required: false
        - name: price
          type: decimal
          required: false
      fallback: []

  # Payment method keyword.
  # `(?i)` makes the match case-insensitive so "VISA" / "CASH" are recognised;
  # the `\b` word boundaries stop hits inside longer words such as "cashier"
  # or "checkout". Capture group 1 still holds the matched keyword.
  payment_method:
    type: string
    extraction:
      regex: "(?i)\\b(cash|credit|debit|visa|mastercard|amex|discover|check|cheque)\\b"
      parse: string