Implements the slide_deck document profile for PowerPoint/Keynote/Google Slides exports as PDF. Includes 5 fixtures, expected outputs, and regression tests.

Components:
- profiles/builtin/slide_deck/profile.yaml - Profile configuration
- tests/fixtures/profiles/slide_deck/ - 5 PDF fixtures with expected outputs
- crates/pdftract-cli/tests/test_slide_deck.rs - Regression tests (12 PASS)

Fixtures cover:
1. pitch_deck - Sales pitch (10 slides)
2. academic_lecture - Academic lecture (40 slides)
3. corporate_kickoff - Corporate kickoff (15 slides)
4. bilingual_deck - Bilingual EN/ES (12 slides)
5. googleslides_handout - Google Slides handout mode (4 pages, 3 slides/page)

Extracted fields: title, presenter, date, slide_titles

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
64 lines
1.6 KiB
YAML
64 lines
1.6 KiB
YAML
# Slide Deck Profile
#
# PowerPoint / Keynote / Google Slides exports as PDF.
# Extracts title, presenter, date, slide_titles.

# Profile identity: the name, a human-readable description, and a numeric
# priority.
# NOTE(review): how priority orders this profile against others (higher
# wins vs. lower wins) is not visible in this file; confirm with the loader.
name: slide_deck
description: PowerPoint / Keynote / Google Slides exports as PDF
priority: 15

# Matching predicates for slide deck classification.
# Structure: every entry under `all` must hold, and no entry under `none`
# may hold.
match:
  all:
    # Page count in typical slide deck range
    - structural:
        page_count: {min: 3, max: 200}
    # And at least one of the following:
    #   - no form fields together with limited font diversity
    #     (i.e. not a dense academic paper)
    #   - a line matching the "Slide N" pattern
    #   - slide deck keywords
    # NOTE(review): keys inside one `structural` mapping are presumably
    # combined (both must hold); confirm against the predicate evaluator.
    - any:
        - structural:
            has_form_field: false
            font_diversity: {min: 2, max: 10}
        - text_matches: '^Slide \d+$'
        - text_contains: ["slides", "presentation"]
  none:
    # Exclude documents carrying academic-paper keywords ("Abstract",
    # "References"; these have their own profile) as well as the
    # "WHEREAS" and "Invoice" keywords.
    - text_contains: ["Abstract", "References", "WHEREAS", "Invoice"]

# Extraction tuning for slide decks
extraction:
  # Use xy_cut reading order for proper layout handling
  reading_order: xy_cut
  # Default table detection
  table_detection: default
  # Lower readability threshold for slides (less text density)
  readability_threshold: 0.6
  # Don't include invisible text
  include_invisible: false
  # Minimum block characters
  min_block_chars: 5

# Field extraction specifications.
# Each key under `fields` names one extracted field; its mapping gives the
# value type plus the selectors used to locate it.
fields:
  # Largest-font text in the middle half of the first page.
  title:
    type: string
    region: middle_half
    pick: largest_font
    page: first

  # Largest-font text in the bottom half of the first page.
  presenter:
    type: string
    region: bottom_half
    pick: largest_font
    page: first

  # Value located near a "Date" label and parsed as a date.
  date:
    type: date
    near: ["Date"]
    parse: date

  # Largest-font text collected per page, returned as an array.
  slide_titles:
    type: array
    pick: largest_font
    per_page: true