pdftract/profiles/builtin/contract/profile.yaml
jedarden 702306125f feat(pdftract-dtpwa): implement contract profile per Phase 7.10 schema
- Rewrite profiles/builtin/contract/profile.yaml following Phase 7.10 schema
  with match predicates, extraction tuning, and field extractors
- Create tests/fixtures/profiles/contract/ directory with 5 expected outputs
- Add comprehensive regression tests in tests/profiles/test_contract.rs
- Profile extracts: parties, effective_date, term, governing_law, signatures

Fixtures cover: NDA, employment agreement, MSA, service agreement, real estate purchase

Closes: pdftract-dtpwa

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 07:10:32 -04:00

38 lines
1.3 KiB
YAML

# Built-in profile for legal agreements (contracts).
# Pulls out the parties, effective date, term, governing law, and signatures.
name: contract
description: >-
  Legal contracts and agreements with parties, effective date, term,
  governing law, and signatures
# NOTE(review): how `priority` orders this profile against others is not
# visible here - confirm against the profile loader.
priority: 20
# Matching predicates: identify documents as contracts.
# NOTE(review): nesting is reconstructed as all -> [any, structural] with
# `none` as a sibling of `all`; confirm against the Phase 7.10 schema.
match:
  all:
    # Contract markers: boilerplate phrases or a contract-style heading.
    - any:
        - text_contains:
            - "AGREEMENT"
            - "CONTRACT"
            - "WHEREAS"
            - "NOW THEREFORE"
            - "In witness whereof"
        - heading_matches: '^(Agreement|Contract|Memorandum of Understanding)'
    # Page-count bounds: 2 to 200.
    - structural:
        page_count:
          min: 2
          max: 200
  none:
    # NOTE(review): "Receipt" also appears in common contract boilerplate
    # ("receipt of which is hereby acknowledged") - confirm whether matching
    # is case-sensitive before relying on this exclusion.
    - text_contains: ["Invoice #", "Receipt"]
# Extraction tuning for contracts.
extraction:
  reading_order: xy_cut
  readability_threshold: 0.5
  # Header/footer content is excluded from extraction.
  include_headers_footers: false
# Field extractors for contract-specific metadata.
# Each entry names one output field; `near` lists the anchor phrases for it.
fields:
  parties:
    near: ["between", "party of the first part", "BY AND BETWEEN"]
    pick: nearest_below
  effective_date:
    near: ["Effective Date", "Date of Agreement", "as of"]
    parse: date
  term:
    near: ["Term", "Initial Term", "expires on", "shall remain in effect"]
    # Matches a duration ("3 years", "12 months") or "expire(s)" followed
    # directly by a 4-digit number.
    # NOTE(review): the second branch cannot match "expires on <month> <day>,
    # <year>" even though "expires on" is an anchor - confirm this is intended.
    regex: '\d+\s+(years?|months?)|expires?\s+\d{4}'
  governing_law:
    near: ["Governing Law", "governed by the laws of"]
    pick: nearest_right
  signatures:
    region: bottom_quarter