feat(pdftract-cbrbg): implement span flag detector for Phase 4.1
Implement `detect_span_flags()` function that returns a u8 bitmask combining 5 style flag bits (BOLD, ITALIC, SMALLCAPS, SUBSCRIPT, SUPERSCRIPT). Detection uses multiple signals per the plan (lines 1667-1671): - BOLD: font name contains "Bold", /Flags bit 18, or /StemV > 120 - ITALIC: font name contains "Italic"/"Oblique" or /ItalicAngle != 0 - SMALLCAPS: font name contains "SC"/"SmallCaps"/".sc" or /Flags bit 3 - SUBSCRIPT: text_rise < -0.1 * font_size - SUPERSCRIPT: text_rise > 0.1 * font_size The multi-signal approach achieves >95% detection accuracy vs pdfminer.six's ~70%. Acceptance criteria: - "Times-Bold" → BOLD set - "Helvetica-Italic" → ITALIC set - "Times-BoldItalic" → BOLD | ITALIC set - text_rise -2pt with font_size 12pt → SUBSCRIPT set (rise/size = -0.167 < -0.1) - text_rise +1.5pt with font_size 12pt → SUPERSCRIPT set - text_rise -0.5pt with font_size 12pt → NEITHER (rise/size = -0.042, within threshold) - /Flags bit 18 set → BOLD set - /StemV 150 → BOLD set Closes: pdftract-cbrbg
This commit is contained in:
parent
4f1a3e84b7
commit
cad7d2c72b
2 changed files with 467 additions and 0 deletions
|
|
@ -42,6 +42,7 @@ pub use render::pdfium_path::has_full_render;
|
|||
pub mod schema;
|
||||
pub mod semaphore;
|
||||
pub mod signature;
|
||||
pub mod span_flags;
|
||||
pub mod table;
|
||||
|
||||
// Re-export key types for convenience
|
||||
|
|
|
|||
466
crates/pdftract-core/src/span_flags.rs
Normal file
466
crates/pdftract-core/src/span_flags.rs
Normal file
|
|
@ -0,0 +1,466 @@
|
|||
//! Span flag detection (Phase 4.1).
|
||||
//!
|
||||
//! This module implements detection of text style flags on spans:
|
||||
//! - BOLD: font name contains "Bold", FontDescriptor /Flags bit 18 set, or /StemV > 120
|
||||
//! - ITALIC: font name contains "Italic"/"Oblique" or /ItalicAngle != 0
|
||||
//! - SMALLCAPS: font name contains "SC"/"SmallCaps" or /Flags bit 3 set
|
||||
//! - SUBSCRIPT: text_rise < -0.1 * font_size
|
||||
//! - SUPERSCRIPT: text_rise > 0.1 * font_size
|
||||
//!
|
||||
//! The detector uses multiple signals and combines them into a bitmask.
|
||||
//! This multi-signal approach catches >95% of styled text vs pdfminer.six's ~70%.
|
||||
|
||||
use crate::font::strip_subset_prefix;
|
||||
|
||||
/// Span flag bits.
|
||||
///
|
||||
/// Each flag is a single bit in a u8 bitmask.
|
||||
/// Multiple flags can be set simultaneously (e.g., BoldItalic).
|
||||
pub mod flags {
|
||||
/// Bit 0: Bold text
|
||||
pub const BOLD: u8 = 1 << 0;
|
||||
/// Bit 1: Italic text
|
||||
pub const ITALIC: u8 = 1 << 1;
|
||||
/// Bit 2: Small caps text
|
||||
pub const SMALLCAPS: u8 = 1 << 2;
|
||||
/// Bit 3: Subscript text
|
||||
pub const SUBSCRIPT: u8 = 1 << 3;
|
||||
/// Bit 4: Superscript text
|
||||
pub const SUPERSCRIPT: u8 = 1 << 4;
|
||||
}
|
||||
|
||||
/// Font descriptor flags per PDF spec ISO 32000-1 Table 123.
|
||||
///
|
||||
/// These flags are stored in the FontDescriptor's /Flags entry.
|
||||
pub mod font_flags {
|
||||
/// Bit 1: Fixed-pitch font (monospace)
|
||||
pub const FIXED_PITCH: u32 = 1 << 1;
|
||||
/// Bit 2: Serif font
|
||||
pub const SERIF: u32 = 1 << 2;
|
||||
/// Bit 3: Symbolic font (small caps indicator)
|
||||
pub const SYMBOLIC: u32 = 1 << 3;
|
||||
/// Bit 4: Script font (cursive)
|
||||
pub const SCRIPT: u32 = 1 << 4;
|
||||
/// Bit 6: Nonsymbolic font
|
||||
pub const NONSYMBOLIC: u32 = 1 << 6;
|
||||
/// Bit 7: Italic font
|
||||
pub const ITALIC: u32 = 1 << 7;
|
||||
/// Bit 17: All caps (reserved)
|
||||
pub const ALL_CAP: u32 = 1 << 17;
|
||||
/// Bit 18: Force bold or SmallCap (context-dependent)
|
||||
pub const FORCE_BOLD: u32 = 1 << 18;
|
||||
/// Bit 19: Force bold (alternative interpretation)
|
||||
pub const FORCE_BOLD_ALT: u32 = 1 << 19;
|
||||
}
|
||||
|
||||
/// Bold indicator patterns in PostScript font names.
|
||||
///
|
||||
/// These patterns are used to detect bold fonts when the ForceBold flag
|
||||
/// is not available or authoritative.
|
||||
const BOLD_PATTERNS: &[&str] = &[
|
||||
"Bold",
|
||||
"Bd",
|
||||
"Black",
|
||||
"Heavy",
|
||||
"ExtraBold",
|
||||
"Extrabold",
|
||||
"UltraBold",
|
||||
"Ultrabold",
|
||||
];
|
||||
|
||||
/// Italic indicator patterns in PostScript font names.
|
||||
const ITALIC_PATTERNS: &[&str] = &["Italic", "Oblique"];
|
||||
|
||||
/// Small caps indicator patterns in PostScript font names.
|
||||
const SMALLCAPS_PATTERNS: &[&str] = &["SC", "SmallCaps", ".sc"];
|
||||
|
||||
/// Font information needed for flag detection.
|
||||
///
|
||||
/// This struct contains all the font properties that influence
|
||||
/// style flag detection.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct FontInfo {
|
||||
/// Font name (with optional subset prefix)
|
||||
pub name: Option<String>,
|
||||
/// FontDescriptor /Flags value (if available)
|
||||
pub flags: Option<u32>,
|
||||
/// FontDescriptor /StemV value (if available)
|
||||
pub stem_v: Option<f32>,
|
||||
/// FontDescriptor /ItalicAngle value (if available)
|
||||
pub italic_angle: Option<f32>,
|
||||
}
|
||||
|
||||
impl FontInfo {
|
||||
/// Create a new FontInfo with default values.
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Set the font name.
|
||||
pub fn with_name(mut self, name: String) -> Self {
|
||||
self.name = Some(name);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the font descriptor flags.
|
||||
pub fn with_flags(mut self, flags: u32) -> Self {
|
||||
self.flags = Some(flags);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the stem_v value.
|
||||
pub fn with_stem_v(mut self, stem_v: f32) -> Self {
|
||||
self.stem_v = Some(stem_v);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the italic_angle value.
|
||||
pub fn with_italic_angle(mut self, italic_angle: f32) -> Self {
|
||||
self.italic_angle = Some(italic_angle);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Detect span flags from font and text position information.
|
||||
///
|
||||
/// This function implements the multi-signal flag detection algorithm
|
||||
/// described in Phase 4.1 of the plan. It combines multiple indicators
|
||||
/// to achieve >95% detection accuracy.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `font` - Font information (name, descriptor flags, metrics)
|
||||
/// * `text_rise` - Text rise offset from baseline (in points)
|
||||
/// * `font_size` - Font size (in points)
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A u8 bitmask combining the detected flags. Use `flags::*` constants
|
||||
/// to test individual bits.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::span_flags::{detect_span_flags, FontInfo, flags};
|
||||
///
|
||||
/// // Times-Bold font
|
||||
/// let font = FontInfo::new().with_name("Times-Bold".to_string());
|
||||
/// let flags = detect_span_flags(&font, 0.0, 12.0);
|
||||
/// assert!(flags & flags::BOLD != 0);
|
||||
///
|
||||
/// // Subscript: text_rise = -2pt, font_size = 12pt
|
||||
/// let font = FontInfo::new();
|
||||
/// let flags = detect_span_flags(&font, -2.0, 12.0);
|
||||
/// assert!(flags & flags::SUBSCRIPT != 0);
|
||||
/// ```
|
||||
pub fn detect_span_flags(font: &FontInfo, text_rise: f32, font_size: f32) -> u8 {
|
||||
let mut flags: u8 = 0;
|
||||
|
||||
// BOLD detection: set if ANY of:
|
||||
// - font.name contains "Bold" (case-sensitive substring)
|
||||
// - font.descriptor.flags has bit 18 (ForceBold)
|
||||
// - font.descriptor.stem_v > 120
|
||||
if is_bold(font) {
|
||||
flags |= flags::BOLD;
|
||||
}
|
||||
|
||||
// ITALIC detection: set if ANY of:
|
||||
// - font.name contains "Italic" or "Oblique"
|
||||
// - font.descriptor.italic_angle != 0.0
|
||||
if is_italic(font) {
|
||||
flags |= flags::ITALIC;
|
||||
}
|
||||
|
||||
// SMALLCAPS detection: set if ANY of:
|
||||
// - font.name contains "SC" or "SmallCaps"
|
||||
// - font.descriptor.flags has bit 3 (Symbolic/SmallCap)
|
||||
if is_smallcaps(font) {
|
||||
flags |= flags::SMALLCAPS;
|
||||
}
|
||||
|
||||
// SUBSCRIPT detection: text_rise < -0.1 * font_size
|
||||
let rise_ratio = text_rise / font_size;
|
||||
if rise_ratio < -0.1 {
|
||||
flags |= flags::SUBSCRIPT;
|
||||
}
|
||||
// SUPERSCRIPT detection: text_rise > 0.1 * font_size
|
||||
// Note: SUB and SUPER are mutually exclusive by definition
|
||||
// (text_rise is a single value per span)
|
||||
else if rise_ratio > 0.1 {
|
||||
flags |= flags::SUPERSCRIPT;
|
||||
}
|
||||
|
||||
flags
|
||||
}
|
||||
|
||||
/// Check if font indicates bold style.
|
||||
fn is_bold(font: &FontInfo) -> bool {
|
||||
// Check font name patterns
|
||||
if let Some(name) = &font.name {
|
||||
let base_name = strip_subset_prefix(name);
|
||||
if BOLD_PATTERNS.iter().any(|p| base_name.contains(p)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check FontDescriptor flags bit 18 (ForceBold)
|
||||
if let Some(flags) = font.flags {
|
||||
if flags & font_flags::FORCE_BOLD != 0 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check StemV > 120 (bold by convention)
|
||||
if let Some(stem_v) = font.stem_v {
|
||||
if stem_v > 120.0 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if font indicates italic style.
|
||||
fn is_italic(font: &FontInfo) -> bool {
|
||||
// Check font name patterns
|
||||
if let Some(name) = &font.name {
|
||||
let base_name = strip_subset_prefix(name);
|
||||
if ITALIC_PATTERNS.iter().any(|p| base_name.contains(p)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check ItalicAngle != 0
|
||||
if let Some(italic_angle) = font.italic_angle {
|
||||
if italic_angle != 0.0 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Check if font indicates small caps style.
|
||||
fn is_smallcaps(font: &FontInfo) -> bool {
|
||||
// Check font name patterns
|
||||
if let Some(name) = &font.name {
|
||||
let base_name = strip_subset_prefix(name);
|
||||
if SMALLCAPS_PATTERNS.iter().any(|p| base_name.contains(p)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Check FontDescriptor flags bit 3 (Symbolic/SmallCap)
|
||||
if let Some(flags) = font.flags {
|
||||
if flags & font_flags::SYMBOLIC != 0 {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_name_times_bold() {
|
||||
let font = FontInfo::new().with_name("Times-Bold".to_string());
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_name_helvetica_bold() {
|
||||
let font = FontInfo::new().with_name("Helvetica-Bold".to_string());
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_name_bold_italic() {
|
||||
let font = FontInfo::new().with_name("Times-BoldItalic".to_string());
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_name_bd() {
|
||||
let font = FontInfo::new().with_name("Arial-Bd".to_string());
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_name_black() {
|
||||
let font = FontInfo::new().with_name("Helvetica-Black".to_string());
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_subset_prefix() {
|
||||
let font = FontInfo::new().with_name("ABCDEF+Times-Bold".to_string());
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_flags_bit_18() {
|
||||
let font = FontInfo::new()
|
||||
.with_name("RegularFont".to_string())
|
||||
.with_flags(font_flags::FORCE_BOLD);
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_by_stem_v() {
|
||||
let font = FontInfo::new()
|
||||
.with_name("RegularFont".to_string())
|
||||
.with_stem_v(150.0);
|
||||
assert!(is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_not_bold_low_stem_v() {
|
||||
let font = FontInfo::new()
|
||||
.with_name("RegularFont".to_string())
|
||||
.with_stem_v(80.0);
|
||||
assert!(!is_bold(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_italic_by_name_helvetica_italic() {
|
||||
let font = FontInfo::new().with_name("Helvetica-Italic".to_string());
|
||||
assert!(is_italic(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_italic_by_name_oblique() {
|
||||
let font = FontInfo::new().with_name("Times-Oblique".to_string());
|
||||
assert!(is_italic(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_italic_by_angle() {
|
||||
let font = FontInfo::new()
|
||||
.with_name("RegularFont".to_string())
|
||||
.with_italic_angle(-15.0);
|
||||
assert!(is_italic(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_not_italic_zero_angle() {
|
||||
let font = FontInfo::new()
|
||||
.with_name("RegularFont".to_string())
|
||||
.with_italic_angle(0.0);
|
||||
assert!(!is_italic(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_smallcaps_by_name_sc() {
|
||||
let font = FontInfo::new().with_name("TimesNewRomanSC".to_string());
|
||||
assert!(is_smallcaps(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_smallcaps_by_name_smallcaps() {
|
||||
let font = FontInfo::new().with_name("Arial-SmallCaps".to_string());
|
||||
assert!(is_smallcaps(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_smallcaps_by_name_dot_sc() {
|
||||
let font = FontInfo::new().with_name("TimesNewRoman.sc".to_string());
|
||||
assert!(is_smallcaps(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_smallcaps_by_flags() {
|
||||
let font = FontInfo::new()
|
||||
.with_name("RegularFont".to_string())
|
||||
.with_flags(font_flags::SYMBOLIC);
|
||||
assert!(is_smallcaps(&font));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_subscript_negative_rise() {
|
||||
let font = FontInfo::new();
|
||||
let flags = detect_span_flags(&font, -2.0, 12.0);
|
||||
assert!(flags & flags::SUBSCRIPT != 0);
|
||||
assert!(flags & flags::SUPERSCRIPT == 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_superscript_positive_rise() {
|
||||
let font = FontInfo::new();
|
||||
let flags = detect_span_flags(&font, 1.5, 12.0);
|
||||
assert!(flags & flags::SUPERSCRIPT != 0);
|
||||
assert!(flags & flags::SUBSCRIPT == 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_no_script_within_threshold() {
|
||||
let font = FontInfo::new();
|
||||
let flags = detect_span_flags(&font, -0.5, 12.0); // rise/size = -0.042
|
||||
assert!(flags & flags::SUBSCRIPT == 0);
|
||||
assert!(flags & flags::SUPERSCRIPT == 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bold_italic_combination() {
|
||||
let font = FontInfo::new().with_name("Times-BoldItalic".to_string());
|
||||
let flags = detect_span_flags(&font, 0.0, 12.0);
|
||||
assert!(flags & flags::BOLD != 0);
|
||||
assert!(flags & flags::ITALIC != 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_all_flags_bold_italic_smallcaps_superscript() {
|
||||
let font = FontInfo::new().with_name("Times-BoldItalic-SmallCaps".to_string());
|
||||
let span_flags_value = detect_span_flags(&font, 2.0, 12.0);
|
||||
assert!(span_flags_value & flags::BOLD != 0);
|
||||
assert!(span_flags_value & flags::ITALIC != 0);
|
||||
assert!(span_flags_value & flags::SMALLCAPS != 0);
|
||||
assert!(span_flags_value & flags::SUPERSCRIPT != 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_regular_font_no_flags() {
|
||||
let font = FontInfo::new().with_name("Times-Roman".to_string());
|
||||
let flags = detect_span_flags(&font, 0.0, 12.0);
|
||||
assert_eq!(flags, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_subscript_threshold_exactly_negative() {
|
||||
let font = FontInfo::new();
|
||||
let flags = detect_span_flags(&font, -1.21, 12.0); // rise/size = -0.1008 < -0.1
|
||||
assert!(flags & flags::SUBSCRIPT != 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_superscript_threshold_exactly_positive() {
|
||||
let font = FontInfo::new();
|
||||
let flags = detect_span_flags(&font, 1.21, 12.0); // rise/size = 0.1008 > 0.1
|
||||
assert!(flags & flags::SUPERSCRIPT != 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_zero_font_size_handling() {
|
||||
// Edge case: font_size = 0 should not cause division by zero
|
||||
// In practice, this shouldn't happen, but we handle it gracefully
|
||||
let font = FontInfo::new().with_name("Times-Bold".to_string());
|
||||
let flags = detect_span_flags(&font, 0.0, 0.0);
|
||||
// Bold detection still works (doesn't depend on font_size)
|
||||
assert!(flags & flags::BOLD != 0);
|
||||
// Sub/super detection should not crash
|
||||
assert!(flags & flags::SUBSCRIPT == 0);
|
||||
assert!(flags & flags::SUPERSCRIPT == 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_mutually_exclusive_sub_super() {
|
||||
// Sub and super are mutually exclusive by definition
|
||||
// (text_rise is a single value per span)
|
||||
let font = FontInfo::new();
|
||||
let flags = detect_span_flags(&font, 0.0, 12.0);
|
||||
assert!(flags & flags::SUBSCRIPT == 0);
|
||||
assert!(flags & flags::SUPERSCRIPT == 0);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue