This commit creates user-facing documentation for each built-in profile:

- Profile YAML files defining match criteria, priority, and extracted fields
- Per-profile READMEs with match criteria summary, extracted fields table, known limitations, sample input pointers, and configuration tips
- xtask skeleton generator for automated README generation

Profiles documented:

- invoice: Commercial invoices with line items, vendor/customer, totals
- receipt: POS receipts with items, payment method
- contract: Legal contracts with parties, effective date, term, signatures
- scientific_paper: Academic papers with title, authors, abstract, DOI, references
- slide_deck: Presentation slides with title, presenter, date, slide titles
- form: Fillable forms (degenerate case: uses Phase 7.4 form_fields)
- bank_statement: Bank statements with account info, period, balances, transactions
- legal_filing: Court filings with case number, court, parties, filing date, docket
- book_chapter: Book chapters with title, chapter number, author, section headings

Closes: pdftract-4iier

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
57 lines
1.8 KiB
YAML
57 lines
1.8 KiB
YAML
# Profile header: human-readable summary of the documents this profile covers.
description: Legal contract with parties, effective date, term, signatures
# NOTE(review): how `priority` is weighed against other profiles is decided by
# the profile loader, which is not visible in this file - confirm there.
priority: 40
# Criteria used to recognise a document as a contract.
# NOTE(review): nesting was rebuilt from a flattened listing. `any` presumably
# means "at least one entry must hold" - confirm against the matcher.
match:
  any:
    # Opening phrases and document titles.
    - text_patterns:
        - "(?i)agreement\\s+is\\s+made"
        - "(?i)contract\\s+agreement"
        - "(?i)this\\s+agreement"
        - "(?i)terms\\s+and\\s+conditions"
        - "(?i)memorandum\\s+of\\s+understanding"
    # Clause headings.
    - text_patterns:
        - "(?i)effective\\s+date"
        - "(?i)governing\\s+law"
        - "(?i)termination\\s+notice"
        - "(?i)indemnification"
    # Layout-based signals rather than text.
    - structural:
        - has_signature_blocks: true
        - page_count_gte: 2
# NOTE(review): the flattened source does not show whether this key belongs
# under `match` or at the top level - confirm placement against the schema.
# Quoted so the range is read unambiguously as a string.
page_count_hint: "2-50"
# Fields extracted from a matched contract. Each field declares its value
# `type` and an `extraction` recipe made of regex `patterns`.
# NOTE(review): indentation was rebuilt from a flattened listing. `fallback`
# is assumed to sit beside `patterns` under `extraction` - confirm against
# the profile schema.
# NOTE(review): several patterns combine a global `(?i)` with `[A-Z]`, so the
# "starts with a capital" class also accepts lower case. Left unchanged to
# keep current results; scope the flag (e.g. `(?i:between)`) if a capital
# letter is really required.
profile_fields:
  parties:
    type: array
    extraction:
      patterns:
        # "between <party> and <party>": two capture groups, one per party.
        # Both captures are greedy and allow whitespace, so they can run long.
        - "(?i)between\\s+([A-Z][A-Za-z0-9\\s&]+)\\s+and\\s+([A-Z][A-Za-z0-9\\s&]+)"
        # "Party A: <name>" style labels.
        - "(?i)party\\s+[A-Z]\\s*:.*?([A-Z][A-Za-z0-9\\s&]+)"
      fallback: []
  effective_date:
    type: date
    extraction:
      patterns:
        # Month-name form, e.g. "January 5, 2024".
        - "(?i)effective\\s+date\\s*(?:as\\s+of|:)?\\s*([A-Za-z]+\\s+[0-9]{1,2},?\\s+[0-9]{4})"
        # Numeric form with "/" or "-" separators, e.g. "01/05/2024".
        - "(?i)effective\\s+date\\s*(?:as\\s+of|:)?\\s*([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
      fallback: null
  term:
    type: string
    extraction:
      patterns:
        # Captures "<number> month(s)|year(s)" following a "term" label.
        - "(?i)term\\s*(?:of\\s*this\\s+agreement)?\\s*:?.*?([0-9]+\\s+(?:months?|years?))"
        - "(?i)shall\\s+continue\\s+for.*?([0-9]+\\s+(?:months?|years?))"
      fallback: null
  governing_law:
    type: string
    extraction:
      patterns:
        # The terminator (newline, carriage return or full stop) is consumed
        # by a non-capturing group instead of a look-ahead. Look-around is
        # rejected by linear-time engines such as Rust's `regex` crate and
        # RE2. Capture group 1 is unchanged.
        - "(?i)governing\\s+law\\s*(?:of|:)?\\s*([A-Za-z\\s]+?)(?:\\n|\\r|\\.)"
      fallback: null
  signatures:
    type: array
    extraction:
      # NOTE(review): presumably limits the search to the lower part of the
      # page; the exact meaning is defined by the consumer - confirm.
      region_hint: "bottom_20_percent"
      patterns:
        - "(?i)signature\\s*:.*?([A-Z][A-Za-z\\s]+)"
        - "(?i)signed\\s*:.*?([A-Z][A-Za-z\\s]+)"
      fallback: []
# Text-ordering and zone options applied to documents using this profile.
# NOTE(review): the accepted values for both keys are defined by the consumer
# and are not visible in this file - confirm against the schema.
reading_order: line_dominant
zone_filtering: exclude_headers_footers