# pdftract/profiles/builtin/slide_deck/profile.yaml

# Slide Deck Profile
#
# PowerPoint / Keynote / Google Slides exports as PDF.
# Extracts title, presenter, date, slide_titles.

# Profile identity: `name` is the profile identifier and `description` is its
# human-readable summary.
name: slide_deck
description: PowerPoint / Keynote / Google Slides exports as PDF
# NOTE(review): the ordering semantics of `priority` (higher-wins vs.
# lower-wins) are not visible in this file — confirm against the profile loader.
priority: 15

# Classification rules for recognising a slide deck.
# Every entry under `all` has to hold; entries under `none` are exclusions.
match:
  all:
    # The page count must lie in the range typical of a slide deck.
    - structural:
        page_count:
          min: 3
          max: 200
    # In addition, at least ONE of the following slide-like signals is needed:
    - any:
        # (a) no form fields plus limited font diversity (i.e. not a dense
        #     academic paper).
        #     NOTE(review): presumably both keys of this predicate must hold
        #     together — confirm against the predicate evaluator.
        - structural:
            has_form_field: false
            font_diversity:
              min: 2
              max: 10
        # (b) text matching a "Slide N" label
        - text_matches: '^Slide \d+$'
        # (c) a generic slide deck keyword
        #     NOTE(review): presumably any one keyword suffices — confirm.
        - text_contains:
            - 'slides'
            - 'presentation'
  none:
    # Exclusion keywords. "Abstract" and "References" target academic papers
    # (these have their own profile). "WHEREAS" and "Invoice" look like
    # contract and invoice markers — presumably also covered by other
    # profiles; confirm.
    - text_contains:
        - 'Abstract'
        - 'References'
        - 'WHEREAS'
        - 'Invoice'

# Extraction tuning applied to documents handled by this profile
extraction:
  # Reading-order strategy; xy_cut is chosen here for slide layouts.
  # NOTE(review): the other accepted strategies are not visible in this file.
  reading_order: xy_cut
  # Table detection is left at the `default` setting (no slide-specific tuning)
  table_detection: default
  # Readability threshold, set lower for slides because they carry less text
  # per page. NOTE(review): the baseline this is "lower" than is defined
  # outside this file — confirm.
  readability_threshold: 0.6
  # Leave invisible text out of the extracted output
  include_invisible: false
  # Minimum character count for a text block.
  # NOTE(review): presumably shorter blocks are discarded — confirm.
  min_block_chars: 5

# Fields this profile extracts: title, presenter, date, slide_titles.
# NOTE(review): the exact geometry of the named regions (`middle_half`,
# `bottom_half`) is defined by the consumer, not in this file.
fields:
  # Deck title: largest-font text in the middle-half region of the first page.
  title:
    type: string
    region: middle_half
    pick: largest_font
    page: first

  # Presenter: largest-font text in the bottom-half region of the first page.
  presenter:
    type: string
    region: bottom_half
    pick: largest_font
    page: first

  # Date: looked up near the label "Date" and parsed as a date.
  # No `page` key is set here, so the page scope falls to the consumer's default.
  date:
    type: date
    near:
      - 'Date'
    parse: date

  # Slide titles: an array built from the largest-font text, with
  # `per_page` enabled.
  # NOTE(review): presumably this yields one entry per page — confirm.
  slide_titles:
    type: array
    pick: largest_font
    per_page: true