pdftract/profiles/builtin/scientific_paper/profile.yaml
jedarden 8b5dd4febb docs(pdftract-4iier): add per-profile README documentation for all 9 built-in profiles
This commit creates user-facing documentation for each built-in profile:

- Profile YAML files defining match criteria, priority, and extracted fields
- Per-profile READMEs with match criteria summary, extracted fields table,
  known limitations, sample input pointers, and configuration tips
- xtask skeleton generator for automated README generation

Profiles documented:
- invoice: Commercial invoices with line items, vendor/customer, totals
- receipt: POS receipts with items, payment method
- contract: Legal contracts with parties, effective date, term, signatures
- scientific_paper: Academic papers with title, authors, abstract, DOI, references
- slide_deck: Presentation slides with title, presenter, date, slide titles
- form: Fillable forms (degenerate case: uses Phase 7.4 form_fields)
- bank_statement: Bank statements with account info, period, balances, transactions
- legal_filing: Court filings with case number, court, parties, filing date, docket
- book_chapter: Book chapters with title, chapter number, author, section headings

Closes: pdftract-4iier
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 23:19:00 -04:00

68 lines
1.8 KiB
YAML

# Human-readable summary of what this profile targets.
description: "Academic paper with title, authors, abstract, DOI, references"
# Profile priority value.
# NOTE(review): ordering semantics (higher vs. lower wins) are not visible
# in this file — confirm against the profile loader.
priority: 55
# Signals used to decide whether a document is a scientific paper.
# Entries under `any` are separate signal groups; presumably one matching
# group is sufficient — confirm against the profile loader.
# NOTE(review): nesting was rebuilt from a copy whose indentation had been
# stripped; verify it against the loader's schema.
match:
  any:
    # Front-matter markers.
    - text_patterns:
        - "(?i)abstract"
        - "(?i)introduction"
        - "(?i)keywords\\s*[:\\.]"
        # Optional colon so the common "DOI: 10.…" form is recognised, in
        # line with the `doi` field's extraction pattern.
        - "(?i)doi\\s*:?\\s*10\\."
    # Back-matter markers.
    - text_patterns:
        # Optional "[" so bracket-numbered lists ("References [1] …") are
        # recognised as well as bare-numbered ones.
        - "(?i)references\\s*\\[?[1-9]"
        - "(?i)bibliography"
        - "(?i)acknowledgments?"
    # Layout signals.
    - structural:
        - has_two_column_layout: true
        - has_bibliography_section: true
# Page-count range hint, quoted so it is unambiguously a string.
# NOTE(review): the original nesting is not recoverable from this copy;
# confirm whether this key belongs at top level or under `match`.
page_count_hint: "4-30"
# Fields extracted from a document handled by this profile. Each field
# declares a value `type` and an `extraction` recipe: an optional
# `region_hint`, a list of regex `patterns`, and a `fallback` value.
# NOTE(review): nesting was rebuilt from a copy whose indentation had been
# stripped. `fallback` is placed inside `extraction` here; confirm against
# the loader's schema whether it belongs at field level instead.
profile_fields:
  # Paper title: a full line taken from the top of the first page.
  title:
    type: string
    extraction:
      region_hint: "first_page_top"
      patterns:
        - "^(.+)$"
      fallback: null

  # Author names: capitalised "First Last" pairs, optionally followed by
  # "et al." or "and Name".
  authors:
    type: array
    extraction:
      region_hint: "first_page_top_below_title"
      patterns:
        - "([A-Z][a-z]+\\s+[A-Z][a-z]+,?\\s+(?:et\\s+al\\.?|and\\s+[A-Z][a-z]+)*)"
      fallback: []

  # Abstract body: text after the "Abstract" heading up to the first blank
  # line, "keywords", "introduction", or end of input.
  abstract:
    type: string
    extraction:
      region_hint: "after_abstract_heading"
      patterns:
        # The `s` flag lets `.` span line breaks so hard-wrapped,
        # multi-line abstracts are captured instead of failing to match.
        - "(?is)abstract\\s*[:\\.]?\\s*(.+?)(?=\\n\\n|keywords|introduction|$)"
      fallback: null

  # DOI: either a "doi:"-labelled identifier or a doi.org URL.
  doi:
    type: string
    extraction:
      patterns:
        - "(?i)doi\\s*:?\\s*(10\\.[0-9]{4,9}/[-._;()/:A-Z0-9]+)"
        - "(?i)https?://doi\\.org/(10\\.[0-9]{4,9}/[-._;()/:A-Z0-9]+)"
      fallback: null

  # Journal or venue name following a "published in" / "journal" /
  # "proceedings" label and a colon.
  journal:
    type: string
    extraction:
      patterns:
        - "(?i)(?:published\\s+in|journal|proceedings)\\s*:.*?([A-Z][A-Za-z\\s]+?)(?=,|\\.|\\n)"
      fallback: null

  # Publication date: "<Month> <YYYY>" after a received / accepted /
  # published / revised label, else the year of a copyright notice.
  publication_date:
    type: date
    extraction:
      patterns:
        - "(?i)(?:received|accepted|published|revised)\\s*:.*?([A-Za-z]+\\s+[0-9]{4})"
        - "©\\s*([0-9]{4})"
      fallback: null

  # Reference entries: lines starting with a bracketed number, e.g. "[12] …".
  references:
    type: array
    extraction:
      region_hint: "after_references_heading"
      patterns:
        - "\\[\\d+\\]\\s+.+"
      fallback: []
# Reading-order and zone-filtering settings for this profile.
# NOTE(review): the exact meaning of these values is defined by the
# consumer and is not visible in this file — confirm before changing.
reading_order: column_aware
zone_filtering: exclude_headers_footers_page_numbers