pdftract/profiles/builtin/scientific_paper/profile.yaml
jedarden 80dbf0f703 feat(profiles): add profile infrastructure and initial fixtures
- Add profile source modules: apply_profile, extraction, extraction_loader, field_extractor, match_eval
- Add profiles CLI subcommand (profiles_cmd.rs)
- Update all 9 built-in profile YAMLs (invoice, receipt, contract, scientific_paper, slide_deck, form, bank_statement, legal_filing, book_chapter)
- Add 50 invoice fixture PDFs
- Add 2 receipt fixture PDFs

Part of: pdftract-3a310 (Phase 7.10 coordinator)
2026-05-31 15:10:51 -04:00

87 lines
1.9 KiB
YAML

# Built-in extraction profile: scientific paper.
# Targets academic papers from arXiv, journals, and conference proceedings.
name: scientific_paper
description: 'Academic papers from arXiv, journals, conference proceedings'
# NOTE(review): presumably orders this profile relative to other profiles
# when several match; confirm direction (higher vs. lower wins) in the loader.
priority: 30
# Match rules deciding whether this profile applies to a document.
# NOTE(review): the nesting here was rebuilt from a listing whose indentation
# had been stripped. `none` is placed as a sibling of `all` under `match`;
# confirm against the profile schema.
match:
  # Presumably every entry in `all` must hold (combinator semantics are
  # defined by the matcher, not visible here).
  all:
    # Textual evidence: typical paper keywords anywhere in the text, or a
    # heading that starts with one of the usual section names.
    - any:
        - text_contains:
            patterns: ["Abstract", "References", "doi:", "arXiv:", "Bibliography"]
        - heading_matches:
            pattern: "^(Abstract|Introduction|References|Bibliography)"
    # Structural evidence: either math is present (no page-count bound), or
    # there is no math and the document has 4 to 50 pages.
    # NOTE(review): both branches set has_table and has_form_field to false.
    # If these are hard requirements, papers containing tables would never
    # match; confirm the intended semantics of `false` here.
    - any:
        - structural:
            has_table: false
            has_form_field: false
            has_math: true
            page_count: null
        - structural:
            has_table: false
            has_form_field: false
            has_math: false
            page_count:
              min: 4
              max: 50
  # Presumably excludes documents containing invoice, receipt, or contract
  # vocabulary.
  none:
    - text_contains:
        patterns: ["Invoice", "Receipt", "WHEREAS", "NOW THEREFORE"]
# Document-level extraction settings for this profile.
# The keys below are nested under `extraction`; left flat at column 0 they
# would parse as unrelated top-level keys.
# NOTE(review): the meaning and units of each option are defined by the
# profile loader, which is not visible here.
extraction:
  reading_order: xy_cut
  table_detection: default
  readability_threshold: 0.5
  include_invisible: false
  include_headers_footers: false
  force_ocr: false
  min_block_chars: 0
# Per-field extraction rules. Each field declares a `type` and an
# `extraction` rule describing where to look and how to parse the result.
# NOTE(review): the nesting here was rebuilt from a listing whose indentation
# had been stripped. `parse` and `fallback` are kept inside `extraction`
# because nothing in the flat text marks a dedent; confirm against the
# profile schema whether they belong on the field itself instead.
fields:
  title:
    type: string
    extraction:
      region: top_quarter
      pick: largest_font
      parse: string

  authors:
    type: array
    extraction:
      region: top_quarter
      pick: nearest_below
      # Presumably refers to the `title` field defined in this mapping.
      after_heading: title
      fallback: []

  abstract:
    type: string
    extraction:
      near: ["Abstract"]
      region: top_half
      parse: string

  doi:
    type: string
    extraction:
      # After YAML double-quote unescaping the pattern reads:
      #   doi[:\.]\s*(10\.\d{4,9}/[\w\-\._;()/:]+)
      # It carries no case-insensitivity flag and requires "doi:" or "doi."
      # directly before the identifier (optional whitespace in between).
      # NOTE(review): confirm whether "DOI:" and "https://doi.org/10..."
      # forms are expected to match; as written they would not unless the
      # engine applies case folding on its own.
      regex: "doi[:\\.]\\s*(10\\.\\d{4,9}/[\\w\\-\\._;()/:]+)"
      parse: string

  journal:
    type: string
    extraction:
      region: top_eighth
      pick: first
      parse: string

  publication_date:
    type: date
    extraction:
      near: ["Published", "Received", "Accepted"]
      parse: date

  references:
    type: array
    extraction:
      region: bottom_half
      after_heading: References
      fallback: []