- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
63 lines
1.4 KiB
YAML
63 lines
1.4 KiB
YAML
# Extraction profile: book_chapter
# Targets book chapters, monographs, and other long-form narrative documents.
name: book_chapter
description: Book chapters, monographs, long-form narrative documents
priority: 5

# Document matching rules.
# NOTE(review): `none` is placed as a sibling of `all` directly under `match`;
# confirm this nesting against the profile schema.
match:
  all:
    # Structural constraints: no tables, form fields or math, and a page
    # count between 5 and 1000.
    - structural:
        has_table: false
        has_form_field: false
        has_math: false
        page_count:
          min: 5
          max: 1000
    # Chapter-style openers: a line starting with "Chapter <n>", a heading
    # starting with "Chapter|Part|Section <n>", or a numbered line such as
    # "1. Title" (digits, a dot, whitespace, then an uppercase letter).
    - any:
        - text_matches:
            pattern: "^Chapter \\d+"
        - heading_matches:
            pattern: "^(Chapter|Part|Section) \\d+"
        - text_matches:
            pattern: "^\\d+\\.\\s+[A-Z]"
  # Exclusion terms — presumably here to keep papers, contracts, invoices and
  # bank statements from being classified as book chapters; verify intent.
  none:
    - text_contains:
        patterns:
          - "Abstract"
          - "WHEREAS"
          - "Invoice"
          - "Account Statement"
          - "References"

# Document-level extraction settings.
extraction:
  reading_order: line_dominant
  table_detection: default
  readability_threshold: 0.6
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0

# Per-field extraction rules.
fields:
  # Title: largest-font text in the top third of the page region.
  title:
    type: string
    extraction:
      region: top_third
      pick: largest_font
      parse: string

  # Chapter number: digits found within 100 pt of a "Chapter" or "Part" label.
  chapter_number:
    type: string
    extraction:
      near:
        - "Chapter"
        - "Part"
      regex: "\\d+"
      max_distance_pt: 100
      parse: string

  # Author: smallest-font text in the top quarter.
  # NOTE(review): smallest_font may also select footnote or running-header
  # text — confirm this rule picks the byline in practice.
  author:
    type: string
    extraction:
      region: top_quarter
      pick: smallest_font
      parse: string

  # Sections: array-valued field with an empty-list fallback.
  # NOTE(review): `pick: largest_font` with no region is the same selector the
  # title uses — confirm it yields section headings. `fallback` is nested
  # under `extraction` here; verify against the schema.
  sections:
    type: array
    extraction:
      pick: largest_font
      fallback: []