This commit creates user-facing documentation for each built-in profile:

- Profile YAML files defining match criteria, priority, and extracted fields
- Per-profile READMEs with match criteria summary, extracted fields table, known limitations, sample input pointers, and configuration tips
- xtask skeleton generator for automated README generation

Profiles documented:

- invoice: Commercial invoices with line items, vendor/customer, totals
- receipt: POS receipts with items, payment method
- contract: Legal contracts with parties, effective date, term, signatures
- scientific_paper: Academic papers with title, authors, abstract, DOI, references
- slide_deck: Presentation slides with title, presenter, date, slide titles
- form: Fillable forms (degenerate case: uses Phase 7.4 form_fields)
- bank_statement: Bank statements with account info, period, balances, transactions
- legal_filing: Court filings with case number, court, parties, filing date, docket
- book_chapter: Book chapters with title, chapter number, author, section headings

Closes: pdftract-4iier

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
46 lines
1.2 KiB
YAML
46 lines
1.2 KiB
YAML
# Built-in document profile: book chapter.
#
# NOTE(review): this file was recovered from a flattened copy in which all
# indentation had been lost. The nesting below is a reconstruction; confirm it
# against the profile schema, in particular the placement of `page_count_hint`
# (assumed to belong to `match`) and of each `fallback` (assumed to belong to
# `extraction`).
#
# Regex strings are YAML double-quoted, so `\\` denotes a single regex backslash.
description: Book chapter with title, chapter number, author, section headings
priority: 32
match:
  any:
    - text_patterns:
        # The trailing \b stops the case-insensitive numeral class from
        # matching the first letters of an ordinary word ("chapter discusses").
        - "(?i)chapter\\s+[IVXLCDM0-9]+\\b"
        - "(?i)section\\s+[0-9]+\\.?[0-9]*"
        # Deliberately case-sensitive: under (?i) the [A-Z] check would accept
        # lowercase text. Kept in line with the numbered-heading pattern used
        # for chapter_number below.
        - "^\\d+\\.\\s+[A-Z]"
    - structural:
        - has_running_headers: true
        - has_chapter_headings: true
        - page_count_gte: 5
  page_count_hint: 5-50
profile_fields:
  title:
    type: string
    extraction:
      region_hint: "first_page_top"
      patterns:
        - "^(.+)$"
      fallback: null
  chapter_number:
    type: string
    extraction:
      region_hint: "first_page_top"
      patterns:
        # Captures a Roman or Arabic numeral after "chapter"; \b rejects
        # partial-word captures such as the "d" in "chapter discusses".
        - "(?i)chapter\\s+([IVXLCDM0-9]+)\\b"
        - "^([0-9]+)\\.\\s+[A-Z]"
      fallback: null
  author:
    type: string
    extraction:
      patterns:
        # Only the keyword is case-insensitive (scoped `(?i:...)` group); the
        # captured name must be two capitalized words. Word boundaries keep
        # "by" from matching inside words such as "nearby".
        # NOTE(review): scoped inline flags are assumed to be supported by the
        # regex engine that consumes this file - confirm.
        - "\\b(?i:by|authors?)\\b\\s*:?.*?([A-Z][a-z]+\\s+[A-Z][a-z]+)"
        - "([A-Z][a-z]+\\s+[A-Z][a-z]+)\\s+(?:is\\s+the\\s+author)"
      fallback: null
  sections:
    type: array
    extraction:
      per_page: false
      region_hint: "headings"
      patterns:
        # Optional "N." prefix followed by a line that starts with a capital
        # letter and contains only letters, digits, whitespace, '-' and ':'.
        - "^(?:[0-9]+\\.\\s*)?[A-Z][A-Za-z0-9\\s\\-:]+$"
      fallback: []
reading_order: line_dominant
zone_filtering: exclude_headers_footers_page_numbers