pdftract/profiles/builtin/book_chapter/profile.yaml
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

63 lines
1.4 KiB
YAML

# Built-in extraction profile: book_chapter
# Targets book chapters, monographs, and other long-form narrative documents.
name: book_chapter
description: 'Book chapters, monographs, long-form narrative documents'
# NOTE(review): how `priority` ranks this profile against the other built-in
# profiles is not visible in this file — confirm before changing the value.
priority: 5
# Rules that decide whether a document is handled by this profile.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been lost. `page_count` is assumed to belong to
# `structural`, and `none` is assumed to be a sibling of `all` — confirm
# both against the profile schema.
match:
  # Presumably every entry under `all` has to hold — confirm in the matcher.
  all:
    # Layout constraints: no tables, form fields or math; 5 to 1000 pages.
    - structural:
        has_table: false
        has_form_field: false
        has_math: false
        page_count:
          min: 5
          max: 1000
    # Chapter-style numbering cues (presumably one match is enough).
    - any:
        # "Chapter" followed by a space and digits, anchored at the start.
        - text_matches:
            pattern: '^Chapter \d+'
        # "Chapter", "Part" or "Section" followed by a space and digits.
        - heading_matches:
            pattern: '^(Chapter|Part|Section) \d+'
        # Digits, a dot, whitespace, then a capital letter ("1. Intro").
        - text_matches:
            pattern: '^\d+\.\s+[A-Z]'
  # Phrases checked by `text_contains` under `none`; presumably any hit
  # rejects the document — confirm in the matcher.
  none:
    - text_contains:
        patterns:
          - Abstract
          - WHEREAS
          - Invoice
          - Account Statement
          - References
# Document-level extraction settings for this profile.
# NOTE(review): the settings are nested under `extraction` here; the copy
# this was restored from had no indentation, which left `extraction` empty
# and these keys at the top level. Confirm against the profile schema.
extraction:
  reading_order: line_dominant
  table_detection: default
  readability_threshold: 0.6
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0
# Structured fields defined for this profile, keyed by field name.
# NOTE(review): the nesting was reconstructed from a copy whose indentation
# had been lost. `parse` and `fallback` are assumed to be part of each
# field's `extraction` rule rather than siblings of `type` — confirm
# against the profile schema.
fields:
  title:
    type: string
    extraction:
      region: top_third
      pick: largest_font
      parse: string
  chapter_number:
    type: string
    extraction:
      # Digits located near the words "Chapter" or "Part".
      near: [Chapter, Part]
      regex: '\d+'
      max_distance_pt: 100
      parse: string
  author:
    type: string
    extraction:
      region: top_quarter
      pick: smallest_font
      parse: string
  sections:
    type: array
    extraction:
      pick: largest_font
      # Explicit empty list; a bare `fallback:` would parse as null.
      fallback: []