# Commit summary (part of pdftract-3a310, Phase 7.10 coordinator):
#   - Add profile source modules: apply_profile, extraction,
#     extraction_loader, field_extractor, match_eval
#   - Add profiles CLI subcommand (profiles_cmd.rs)
#   - Update all 9 built-in profile YAMLs (invoice, receipt, contract,
#     scientific_paper, slide_deck, form, bank_statement, legal_filing,
#     book_chapter)
#   - Add 50 invoice fixture PDFs
#   - Add 2 receipt fixture PDFs
# File listing: 66 lines, 1.6 KiB, YAML
# Profile: contract
# Targets legal contracts and agreements, i.e. documents that carry parties,
# an effective date, a term, a governing-law clause, and signatures.
name: contract
# Folded scalar: the lines below join with single spaces into one string,
# with no trailing newline.
description: >-
  Legal contracts and agreements with parties, effective date, term,
  governing law, and signatures
# NOTE(review): how priority orders competing profiles is decided by the
# loader, which is not visible in this file.
priority: 20

# Rules deciding whether a document belongs to this profile.
# NOTE(review): the nesting below was rebuilt from a copy whose indentation
# had been stripped; confirm it against the profile schema.
match:
  all:
    # Textual evidence: contract keywords in the body, or a contract-style
    # heading.
    - any:
        - text_contains:
            patterns:
              - 'AGREEMENT'
              - 'CONTRACT'
              - 'WHEREAS'
              - 'NOW THEREFORE'
              - 'In witness whereof'
        - heading_matches:
            pattern: '^(Agreement|Contract|Memorandum of Understanding)'
    # Layout constraints: no tables, form fields, or math, and a page count
    # between 2 and 200.
    - structural:
        has_table: false
        has_form_field: false
        has_math: false
        page_count:
          min: 2
          max: 200
  # Text that marks the document as an invoice or receipt instead.
  none:
    - text_contains:
        patterns:
          - 'Invoice #'
          - 'Receipt'

# Extraction settings applied when this profile is selected.
extraction:
  reading_order: xy_cut
  # Quoted on purpose: a bare `off` is a boolean literal under YAML 1.1
  # loaders and would arrive as `false` instead of the string "off".
  # NOTE(review): assumes table_detection is an enum-style mode string (the
  # sibling flags use true/false); confirm against the loader.
  table_detection: 'off'
  readability_threshold: 0.5
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0

# Per-field extraction rules.
# NOTE(review): the nesting below was rebuilt from a copy whose indentation
# had been stripped; confirm it against the profile schema.
fields:
  parties:
    type: string
    extraction:
      near:
        - 'between'
        - 'party of the first part'
        - 'BY AND BETWEEN'
      pick: nearest_below
      parse: string

  effective_date:
    type: date
    extraction:
      near:
        - 'Effective Date'
        - 'Date of Agreement'
        - 'as of'
      parse: date

  term:
    type: string
    extraction:
      near:
        - 'Term'
        - 'Initial Term'
        - 'expires on'
        - 'shall remain in effect'
      # Single-quoted so the backslashes are literal; the parsed pattern is
      # the same as the double-quoted, backslash-escaped form.
      regex: '\d+\s+(years?|months?)|expires?\s+\d{4}'
      parse: string

  governing_law:
    type: string
    extraction:
      near:
        - 'Governing Law'
        - 'governed by the laws of'
      pick: nearest_right
      parse: string

  signatures:
    type: array
    extraction:
      region: bottom_quarter
      # NOTE(review): `fallback` is placed under `extraction` by assumption;
      # it may instead belong at the field level next to `type`.
      fallback: []