pdftract/profiles/builtin/legal_filing/profile.yaml
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

62 lines
1.5 KiB
YAML

# Legal Filing extraction profile.
# Matches court filings: motions, briefs, orders, and docket entries.
name: legal_filing
# Quoted on purpose: the text contains ": ", which is not allowed inside a
# plain (unquoted) scalar and makes the document fail to parse.
description: "Court filings: motions, briefs, orders, docket entries"
priority: 40
# Match rules for recognising a legal filing.
# NOTE(review): nesting is reconstructed from the key order — it assumes
# `heading_matches` is an alternative inside `any`, while `structural` and
# `page_count` are further members of `all`. Confirm against the profile schema.
match:
  all:
    # Text cues: one of the listed phrases, or a heading that starts with a
    # filing keyword.
    - any:
        - text_contains:
            patterns:
              - "UNITED STATES DISTRICT COURT"
              - "IN THE COURT OF"
              - "IN THE MATTER OF"
              - "Case No."
              - "Civil Action No."
              - "Plaintiff"
              - "Defendant"
              - "Petitioner"
              - "Respondent"
              - "COMPLAINT"
              - "MOTION TO"
              - "ORDER GRANTING"
              - "OPINION"
        - heading_matches:
            pattern: "^(COMPLAINT|MOTION|ORDER|OPINION|BRIEF)"
    # Structural flags: tables, form fields and math are all set to false.
    - structural:
        has_table: false
        has_form_field: false
        has_math: false
    # Page-count bounds.
    - page_count:
        min: 1
        max: 500
# Document-level extraction settings for this profile (distinct from the
# per-field `extraction` rules under `fields`).
# NOTE(review): the exact meaning of each option is defined by the consuming
# tool; values are reproduced unchanged.
extraction:
  # Layout analysis.
  reading_order: xy_cut
  table_detection: default
  readability_threshold: 0.5

  # Content inclusion toggles.
  include_invisible: false
  include_headers_footers: true

  # OCR and block filtering.
  force_ocr: false
  min_block_chars: 0
# Named fields this profile extracts: case_number, court, parties,
# filing_date, docket_entries.
# NOTE(review): leading indentation is missing in this view, so the nesting
# cannot be read off the text. Presumably each field name maps to a `type`,
# an `extraction` rule set and, for array fields, a `fallback` — confirm
# against the profile schema, in particular whether `parse` and `fallback`
# sit inside `extraction` or beside it.
fields:
# Case identifier (string).
case_number:
type: string
extraction:
# Label phrases; presumably the value is searched for near these — confirm.
near: ["Case No.", "Civil Action No.", "Docket No.", "Cause No."]
# After YAML unescaping the pattern is: [\w-]+:?\s*\d+[\w-]*
# i.e. word chars/hyphens, an optional colon, optional whitespace, then
# digits followed by any further word chars/hyphens.
regex: "[\\w-]+:?\\s*\\d+[\\w-]*"
parse: string
# Court name (string).
court:
type: string
extraction:
# Presumably selects the largest-font text in the top quarter of the
# page — confirm how `region` and `pick` are interpreted.
region: top_quarter
pick: largest_font
parse: string
# Parties to the case (array).
parties:
type: array
extraction:
near: ["Plaintiff", "Defendant", "Petitioner", "Respondent", "v."]
# Empty list; presumably the value used when nothing is extracted — confirm.
fallback: []
# Filing date (date).
filing_date:
type: date
extraction:
near: ["Filed", "Date Filed", "Dated"]
parse: date
# Docket entries (array).
docket_entries:
type: array
extraction:
region: bottom_half
# Empty list; presumably the value used when nothing is extracted — confirm.
fallback: []