pdftract/profiles/builtin/contract/profile.yaml
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

66 lines
1.6 KiB
YAML

# Built-in profile: contract
# Targets legal contracts and agreements. The fields declared in this file
# cover parties, effective date, term, governing law, and signatures.
name: contract
# Folded scalar: the lines below load as one single-line string.
description: >-
  Legal contracts and agreements with parties, effective date, term,
  governing law, and signatures
# NOTE(review): how priority orders competing profiles is decided by the
# profile loader, which is not visible from this file.
priority: 20
# Rules that decide whether a document is handled by this profile.
# NOTE(review): confirm that `page_count` nests under `structural` and that
# `none` is a sibling of `all`.
match:
  all:
    # Wording cues: body phrases or a leading heading.
    - any:
        - text_contains:
            patterns:
              - "AGREEMENT"
              - "CONTRACT"
              - "WHEREAS"
              - "NOW THEREFORE"
              - "In witness whereof"
        - heading_matches:
            pattern: "^(Agreement|Contract|Memorandum of Understanding)"
    # Layout cues: no tables, form fields, or math; 2 to 200 pages.
    - structural:
        has_table: false
        has_form_field: false
        has_math: false
        page_count:
          min: 2
          max: 200
  # Phrases listed here are associated with invoice and receipt documents.
  # NOTE(review): contracts often contain the word "receipt" (for example
  # "upon receipt of notice"); confirm how text_contains treats case and
  # substrings before relying on this exclusion.
  none:
    - text_contains:
        patterns:
          - "Invoice #"
          - "Receipt"
# Document-level extraction settings applied when this profile is selected.
extraction:
  reading_order: xy_cut
  # Quoted on purpose: a bare `off` is a boolean under YAML 1.1 loaders and
  # would load as false instead of the string "off".
  # NOTE(review): assumes table_detection is a string-valued mode; confirm
  # against the loader's schema.
  table_detection: "off"
  readability_threshold: 0.5
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0
# Structured fields to pull out of a matched contract. Each entry declares a
# result type and an `extraction` recipe.
# NOTE(review): the meaning of `near`, `pick`, `parse`, and `region` is
# defined by the field extractor, which is not visible from this file.
fields:
  # Contracting parties, located via the introductory "between" wording.
  parties:
    type: string
    extraction:
      near:
        - "between"
        - "party of the first part"
        - "BY AND BETWEEN"
      pick: nearest_below
      parse: string

  # Date associated with an effective-date style label.
  effective_date:
    type: date
    extraction:
      near:
        - "Effective Date"
        - "Date of Agreement"
        - "as of"
      parse: date

  # Contract duration. The regex accepts "<n> year(s)/month(s)" or
  # "expire(s) <4 digits>".
  term:
    type: string
    extraction:
      near:
        - "Term"
        - "Initial Term"
        - "expires on"
        - "shall remain in effect"
      regex: "\\d+\\s+(years?|months?)|expires?\\s+\\d{4}"
      parse: string

  # Governing-law clause.
  governing_law:
    type: string
    extraction:
      near:
        - "Governing Law"
        - "governed by the laws of"
      pick: nearest_right
      parse: string

  # Signature entries taken from the bottom_quarter region.
  # NOTE(review): confirm that `fallback` belongs under `extraction` and not
  # directly under `signatures`.
  signatures:
    type: array
    extraction:
      region: bottom_quarter
      fallback: []