Fixed compilation errors in Span constructors by adding missing `column: None` field. Verified that the existing multi-output CLI parsing implementation meets all acceptance criteria for bead pdftract-37qim. Changes: - crates/pdftract-core/src/span/mod.rs: Add column field to new() and empty() constructors Verification: - All 23 output::tests pass - CLI parsing validated for duplicate format detection, ndjson exclusivity, stdout uniqueness - Format auto-naming (--format with -o) works correctly - Default behavior (no flags -> JSON to stdout) confirmed See notes/pdftract-37qim.md for detailed verification results. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1544 lines
59 KiB
Rust
1544 lines
59 KiB
Rust
//! Span struct definition (Phase 4.1).
|
|
//!
|
|
//! This module implements the Span struct, which is the primary output
|
|
//! of Phase 4 glyph-to-span merging. Span is the second-most-important
|
|
//! struct in the output schema (after Glyph).
|
|
//!
|
|
//! # Span Struct
|
|
//!
|
|
//! Per plan section Phase 4.1 (lines 1640-1653):
|
|
//! ```rust,ignore
|
|
//! struct Span {
|
|
//! text: String,
|
|
//! bbox: [f32; 4], // union of member glyph bboxes
|
|
//! font: Arc<str>,
|
|
//! size: f32,
|
|
//! color: Option<CssHexColor>,
|
|
//! rendering_mode: u8,
|
|
//! confidence: f32, // minimum glyph confidence
|
|
//! confidence_source: ConfidenceSource,
|
|
//! lang: Option<Arc<str>>, // filled in Phase 7 normalization
|
|
//! flags: u8, // SpanFlags bitmask: bit 0=bold, 1=italic, 2=smallcaps, 3=subscript, 4=superscript
|
|
//! }
|
|
//! ```
|
|
|
|
use crate::confidence::ConfidenceSource;
|
|
use crate::font::UnicodeSource;
|
|
use crate::glyph::Glyph;
|
|
use crate::graphics_state::Color;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::sync::Arc;
|
|
|
|
/// CSS hex color newtype (#rrggbb format).
|
|
///
|
|
/// This newtype enforces the #rrggbb format at construction time.
|
|
/// It is used to represent fill colors that can be serialized to CSS.
|
|
/// Spot colors and other non-DeviceRGB/DeviceGray colors serialize as None.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```
|
|
/// use pdftract_core::span::CssHexColor;
|
|
///
|
|
/// let red = CssHexColor::new("#ff0000").unwrap();
|
|
/// assert_eq!(red.as_str(), "#ff0000");
|
|
///
|
|
/// let invalid = CssHexColor::new("red");
|
|
/// assert!(invalid.is_err());
|
|
/// ```
|
|
#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
|
pub struct CssHexColor(pub String);
|
|
|
|
impl CssHexColor {
|
|
/// Create a new CssHexColor from a string.
|
|
///
|
|
/// The string must be in #rrggbb format (7 characters: # + 6 hex digits).
|
|
/// Hex digits may be uppercase or lowercase.
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// Returns an error if the string is not exactly 7 characters or does
|
|
/// not start with '#' or contains non-hex characters after '#'.
|
|
pub fn new(s: &str) -> Result<Self, String> {
|
|
if s.len() != 7 {
|
|
return Err(format!(
|
|
"CssHexColor must be exactly 7 characters (#rrggbb), got {}",
|
|
s.len()
|
|
));
|
|
}
|
|
if !s.starts_with('#') {
|
|
return Err("CssHexColor must start with '#'".to_string());
|
|
}
|
|
let hex = &s[1..];
|
|
if !hex.chars().all(|c| c.is_ascii_hexdigit()) {
|
|
return Err(format!(
|
|
"CssHexColor must contain only hex digits after '#', got {}",
|
|
hex
|
|
));
|
|
}
|
|
Ok(CssHexColor(s.to_lowercase()))
|
|
}
|
|
|
|
/// Get the underlying string slice.
|
|
pub fn as_str(&self) -> &str {
|
|
&self.0
|
|
}
|
|
|
|
/// Convert from an RGB tuple.
|
|
pub fn from_rgb(r: u8, g: u8, b: u8) -> Self {
|
|
CssHexColor(format!("#{:02x}{:02x}{:02x}", r, g, b))
|
|
}
|
|
}
|
|
|
|
/// Bit masks for the `flags` field of a span.
///
/// Every constant occupies its own bit, so styles can be combined with
/// bitwise OR and tested individually with bitwise AND.
pub mod span_flags {
    /// Bold text (bit 0).
    pub const BOLD: u8 = 0b0000_0001;
    /// Italic text (bit 1).
    pub const ITALIC: u8 = 0b0000_0010;
    /// Small caps text (bit 2).
    pub const SMALLCAPS: u8 = 0b0000_0100;
    /// Subscript text (bit 3).
    pub const SUBSCRIPT: u8 = 0b0000_1000;
    /// Superscript text (bit 4).
    pub const SUPERSCRIPT: u8 = 0b0001_0000;
}
|
|
|
|
/// A span of text extracted from a PDF (Phase 4 output).
|
|
///
|
|
/// This struct represents a contiguous run of glyphs that share the same
|
|
/// font, size, color, and rendering mode. It is the primary output of
|
|
/// Phase 4 glyph-to-span merging and is used throughout Phase 5 (layout)
|
|
/// and Phase 6 (output).
|
|
///
|
|
/// # Field Descriptions
|
|
///
|
|
/// - **text**: The concatenated text content of all glyphs in the span.
|
|
/// Valid UTF-8, never contains U+FFFD unless a glyph was U+FFFD and
|
|
/// readability correction did not repair it.
|
|
///
|
|
/// - **bbox**: Union of member glyph bounding boxes in PDF user space
|
|
/// [x0, y0, x1, y1] with lower-left origin, AFTER /Rotate normalization.
|
|
///
|
|
/// - **font**: Font name shared via Arc across all spans using the same font.
|
|
///
|
|
/// - **size**: Font size in points.
|
|
///
|
|
/// - **color**: Fill color as CSS hex string, or None for Spot/Other colorspaces.
|
|
///
|
|
/// - **rendering_mode**: Text rendering mode (0-7 per PDF spec).
|
|
///
|
|
/// - **confidence**: Minimum confidence of all glyphs in the span [0.0, 1.0].
|
|
///
|
|
/// - **confidence_source**: Source of confidence (Native, Heuristic, Ocr).
|
|
///
|
|
/// - **lang**: Language tag (BCP 47), None until Phase 7 fills it from /Lang
|
|
/// or detected script.
|
|
///
|
|
/// - **flags**: SpanFlags bitmask (bold, italic, smallcaps, subscript, superscript).
|
|
///
|
|
/// - **column**: Column index (0-based) assigned by Phase 4.3 column detection.
|
|
/// None for spans outside any detected column (e.g., full-width headings).
|
|
///
|
|
/// # Invariants
|
|
///
|
|
/// - INV: text is VALID UTF-8 (Rust String); no U+FFFD unless the underlying
|
|
/// glyph was U+FFFD AND the readability correction did not repair it.
|
|
/// - INV: bbox is [x0, y0, x1, y1] PDF user space, lower-left origin, AFTER
|
|
/// /Rotate normalization.
|
|
/// - INV: color may be None when the source colorspace was Spot or Other;
|
|
/// JSON serializes as null.
|
|
/// - INV: lang is None until Phase 7 fills it from /Lang or detected script.
|
|
/// - INV: flags is initially 0; Phase 4.1 flag detector sets bits.
|
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
|
pub struct Span {
|
|
/// Concatenated text content of the span.
|
|
pub text: String,
|
|
/// Union of member glyph bboxes [x0, y0, x1, y1] in PDF user space.
|
|
pub bbox: [f32; 4],
|
|
/// Font name (shared via Arc).
|
|
pub font: Arc<str>,
|
|
/// Font size in points.
|
|
pub size: f32,
|
|
/// Fill color as CSS hex (#rrggbb), or None for Spot/Other colorspaces.
|
|
pub color: Option<CssHexColor>,
|
|
/// Text rendering mode (0-7 per PDF spec).
|
|
pub rendering_mode: u8,
|
|
/// Minimum confidence of all glyphs in the span [0.0, 1.0].
|
|
pub confidence: f32,
|
|
/// Source of confidence (Native, Heuristic, Ocr).
|
|
pub confidence_source: ConfidenceSource,
|
|
/// Language tag (BCP 47), None until Phase 7.
|
|
pub lang: Option<Arc<str>>,
|
|
/// SpanFlags bitmask (bold, italic, smallcaps, subscript, superscript).
|
|
pub flags: u8,
|
|
/// Column index (0-based) assigned by Phase 4.3 column detection.
|
|
pub column: Option<u32>,
|
|
}
|
|
|
|
impl Span {
|
|
/// Create a new Span with the given fields.
|
|
///
|
|
/// This is the primary constructor used by Phase 4 glyph-to-span merging.
|
|
pub fn new(
|
|
text: String,
|
|
bbox: [f32; 4],
|
|
font: Arc<str>,
|
|
size: f32,
|
|
color: Option<CssHexColor>,
|
|
rendering_mode: u8,
|
|
confidence: f32,
|
|
confidence_source: ConfidenceSource,
|
|
lang: Option<Arc<str>>,
|
|
flags: u8,
|
|
) -> Self {
|
|
Self {
|
|
text,
|
|
bbox,
|
|
font,
|
|
size,
|
|
color,
|
|
rendering_mode,
|
|
confidence,
|
|
confidence_source,
|
|
lang,
|
|
flags,
|
|
column: None,
|
|
}
|
|
}
|
|
|
|
/// Create an empty span with default values.
|
|
///
|
|
/// Used as a starting point for span accumulation.
|
|
pub fn empty() -> Self {
|
|
Self {
|
|
text: String::new(),
|
|
bbox: [0.0, 0.0, 0.0, 0.0],
|
|
font: Arc::from(""),
|
|
size: 0.0,
|
|
color: None,
|
|
rendering_mode: 0,
|
|
confidence: 1.0,
|
|
confidence_source: ConfidenceSource::Native,
|
|
lang: None,
|
|
flags: 0,
|
|
column: None,
|
|
}
|
|
}
|
|
|
|
/// Check if the bold flag is set.
|
|
pub fn is_bold(&self) -> bool {
|
|
self.flags & span_flags::BOLD != 0
|
|
}
|
|
|
|
/// Check if the italic flag is set.
|
|
pub fn is_italic(&self) -> bool {
|
|
self.flags & span_flags::ITALIC != 0
|
|
}
|
|
|
|
/// Check if the smallcaps flag is set.
|
|
pub fn is_smallcaps(&self) -> bool {
|
|
self.flags & span_flags::SMALLCAPS != 0
|
|
}
|
|
|
|
/// Check if the subscript flag is set.
|
|
pub fn is_subscript(&self) -> bool {
|
|
self.flags & span_flags::SUBSCRIPT != 0
|
|
}
|
|
|
|
/// Check if the superscript flag is set.
|
|
pub fn is_superscript(&self) -> bool {
|
|
self.flags & span_flags::SUPERSCRIPT != 0
|
|
}
|
|
}
|
|
|
|
/// Map UnicodeSource to ConfidenceSource per plan Phase 4.1.
|
|
///
|
|
/// | UnicodeSource | ConfidenceSource |
|
|
/// |------------------|-------------------|
|
|
/// | ToUnicode | Native |
|
|
/// | Agl | Native |
|
|
/// | Fingerprint | Native |
|
|
/// | ShapeMatch | Heuristic |
|
|
/// | Unknown (U+FFFD) | Heuristic |
|
|
/// | Ocr | Ocr |
|
|
fn map_unicode_source_to_confidence(source: UnicodeSource) -> ConfidenceSource {
|
|
match source {
|
|
UnicodeSource::ToUnicode | UnicodeSource::Agl | UnicodeSource::Fingerprint => {
|
|
ConfidenceSource::Native
|
|
}
|
|
UnicodeSource::ShapeMatch | UnicodeSource::Unknown => ConfidenceSource::Heuristic,
|
|
UnicodeSource::Ocr => ConfidenceSource::Ocr,
|
|
}
|
|
}
|
|
|
|
/// Normalize a Color to RGB tuple for comparison.
|
|
///
|
|
/// Returns `Some((r, g, b))` for DeviceGray, DeviceRGB, and DeviceCMYK.
|
|
/// Returns `None` for Spot and Other colors (compared by variant equality).
|
|
fn normalize_color_for_comparison(color: &Color) -> Option<(u8, u8, u8)> {
|
|
match color {
|
|
Color::DeviceGray(v) => {
|
|
let v = (v.clamp(0.0, 1.0) * 255.0).round() as u8;
|
|
Some((v, v, v))
|
|
}
|
|
Color::DeviceRGB(rgb) => {
|
|
let r = (rgb[0].clamp(0.0, 1.0) * 255.0).round() as u8;
|
|
let g = (rgb[1].clamp(0.0, 1.0) * 255.0).round() as u8;
|
|
let b = (rgb[2].clamp(0.0, 1.0) * 255.0).round() as u8;
|
|
Some((r, g, b))
|
|
}
|
|
Color::DeviceCMYK(cmyk) => {
|
|
// CMYK → RGB conversion: R = (1-C)*(1-K)
|
|
let c = cmyk[0].clamp(0.0, 1.0);
|
|
let m = cmyk[1].clamp(0.0, 1.0);
|
|
let y = cmyk[2].clamp(0.0, 1.0);
|
|
let k = cmyk[3].clamp(0.0, 1.0);
|
|
let r = ((1.0 - c) * (1.0 - k) * 255.0).round() as u8;
|
|
let g = ((1.0 - m) * (1.0 - k) * 255.0).round() as u8;
|
|
let b = ((1.0 - y) * (1.0 - k) * 255.0).round() as u8;
|
|
Some((r, g, b))
|
|
}
|
|
Color::Spot(_, _) | Color::Other => None,
|
|
}
|
|
}
|
|
|
|
/// Check if two colors are equal using RGB-normalized comparison.
|
|
///
|
|
/// For DeviceGray, DeviceRGB, and DeviceCMYK, compares using normalized RGB values.
|
|
/// For Spot and Other, compares by variant equality (Spot colors compared by name AND tint exactly).
|
|
fn colors_equal(a: &Color, b: &Color) -> bool {
|
|
match (normalize_color_for_comparison(a), normalize_color_for_comparison(b)) {
|
|
(Some(rgb_a), Some(rgb_b)) => rgb_a == rgb_b,
|
|
(None, None) => a == b, // Both Spot/Other: compare by variant (Spot by name+tint)
|
|
_ => false, // One normalizable, one not: different
|
|
}
|
|
}
|
|
|
|
/// Append a glyph's codepoint to a span's text.
|
|
///
|
|
/// This function implements the per-glyph text assembly logic for Phase 4.1.
|
|
/// It appends the glyph's codepoint to the span's text field.
|
|
///
|
|
/// Per the bead pdftract-2c5sx acceptance criteria:
|
|
/// - Single codepoint glyphs: append the char directly
|
|
/// - Multi-codepoint glyphs (ligatures): Phase 2 already expands these into
|
|
/// separate Glyph structs, so per-glyph append works correctly
|
|
/// - RTL text: preserved in visual order; bidi reordering happens in Phase 4.2
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `span` - Mutable reference to the span to append to
|
|
/// * `glyph` - The glyph whose codepoint should be appended
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use pdftract_core::span::assemble_text;
|
|
/// use pdftract_core::span::Span;
|
|
///
|
|
/// let mut span = Span::empty();
|
|
/// let glyph = Glyph::new('A', ...);
|
|
/// assemble_text(&mut span, &glyph);
|
|
/// assert_eq!(span.text, "A");
|
|
/// ```
|
|
fn assemble_text(span: &mut Span, glyph: &Glyph) {
|
|
span.text.push(glyph.codepoint);
|
|
}
|
|
|
|
/// Merge consecutive glyphs into spans using the 5-trigger break detector.
|
|
///
|
|
/// This function implements Phase 4.1 glyph-to-span merging. It walks the
|
|
/// per-page glyph list and groups consecutive glyphs into spans. A new span
|
|
/// begins when any of the 5 triggers fires on the current glyph:
|
|
///
|
|
/// 1. `font_name != prev font_name`
|
|
/// 2. `(font_size - prev_font_size).abs() > 0.5`
|
|
/// 3. `rendering_mode != prev rendering_mode`
|
|
/// 4. RGB-normalized `fill_color != prev color`
|
|
/// 5. `is_word_boundary == true`
|
|
///
|
|
/// # Word boundary handling
|
|
///
|
|
/// When triggered by `is_word_boundary == true`, we append a space to the
|
|
/// PREVIOUS span's text (option a from the plan). This produces cleaner JSON
|
|
/// output and easier round-trip than emitting a 1-char " " span.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `glyphs` - The per-page glyph list to merge
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// A vector of spans, where each span represents a maximal run of glyphs
|
|
/// sharing the same font, size, color, and rendering mode.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use pdftract_core::span::merge_glyphs_to_spans;
|
|
/// use pdftract_core::glyph::Glyph;
|
|
/// use std::sync::Arc;
|
|
///
|
|
/// let glyphs = vec![
|
|
/// // "Hello" (5 glyphs)
|
|
/// Glyph::new('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0],
|
|
/// Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
/// // ... more glyphs for "ello World"
|
|
/// ];
|
|
///
|
|
/// let spans = merge_glyphs_to_spans(&glyphs);
|
|
/// // spans[0].text == "Hello "
|
|
/// // spans[1].text == "World"
|
|
/// ```
|
|
pub fn merge_glyphs_to_spans(glyphs: &[Glyph]) -> Vec<Span> {
|
|
if glyphs.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
let mut result = Vec::new();
|
|
let mut current_span: Option<Span> = None;
|
|
let mut prev_fill_color: Option<&Color> = None;
|
|
|
|
for glyph in glyphs {
|
|
// Special case: word boundary marker - append space to current span, finalize it, and skip
|
|
if glyph.is_word_boundary {
|
|
if let Some(mut span) = current_span.take() {
|
|
span.text.push(' ');
|
|
result.push(span);
|
|
}
|
|
prev_fill_color = None; // Reset on word boundary
|
|
// Skip the boundary marker glyph itself (it's synthetic, not a real glyph)
|
|
continue;
|
|
}
|
|
|
|
// Check if we need to start a new span (no current span OR any trigger fires)
|
|
let should_start_new_span = if let Some(ref span) = current_span {
|
|
// Trigger 1: font_name changed
|
|
let font_changed = &glyph.font_name != &span.font;
|
|
|
|
// Trigger 2: font_size delta > 0.5pt
|
|
let size_changed = (glyph.font_size - span.size).abs() > 0.5;
|
|
|
|
// Trigger 3: rendering_mode changed
|
|
let mode_changed = glyph.rendering_mode != span.rendering_mode;
|
|
|
|
// Trigger 4: fill_color changed (RGB-normalized)
|
|
let color_changed = if let Some(prev_color) = prev_fill_color {
|
|
!colors_equal(&glyph.fill_color, prev_color)
|
|
} else {
|
|
false // No previous color, don't trigger
|
|
};
|
|
|
|
font_changed || size_changed || mode_changed || color_changed
|
|
} else {
|
|
true // No current span, must start new one
|
|
};
|
|
|
|
if should_start_new_span {
|
|
// Finalize current span (if any)
|
|
if let Some(span) = current_span.take() {
|
|
result.push(span);
|
|
}
|
|
|
|
// Start new span from current glyph
|
|
let confidence_source = map_unicode_source_to_confidence(glyph.unicode_source);
|
|
let color = glyph.fill_color.to_css_hex().map(|s| CssHexColor(s));
|
|
|
|
current_span = Some(Span::new(
|
|
glyph.codepoint.encode_utf8(&mut [0; 4]).to_string(), // Start with this glyph's char
|
|
glyph.bbox,
|
|
glyph.font_name.clone(),
|
|
glyph.font_size,
|
|
color,
|
|
glyph.rendering_mode,
|
|
glyph.confidence,
|
|
confidence_source,
|
|
None, // lang: filled in Phase 7
|
|
0, // flags: filled in Phase 4.1 flag detector
|
|
));
|
|
prev_fill_color = Some(&glyph.fill_color);
|
|
} else {
|
|
// Append to current span
|
|
if let Some(ref mut span) = current_span {
|
|
// Append glyph codepoint to span text via assemble_text
|
|
assemble_text(span, glyph);
|
|
|
|
// Extend bbox to union
|
|
span.bbox[0] = span.bbox[0].min(glyph.bbox[0]);
|
|
span.bbox[1] = span.bbox[1].min(glyph.bbox[1]);
|
|
span.bbox[2] = span.bbox[2].max(glyph.bbox[2]);
|
|
span.bbox[3] = span.bbox[3].max(glyph.bbox[3]);
|
|
|
|
// Update confidence_source to worst (lowest confidence) source
|
|
// Must compare OLD confidence before updating span.confidence
|
|
let glyph_source = map_unicode_source_to_confidence(glyph.unicode_source);
|
|
if glyph.confidence < span.confidence {
|
|
span.confidence_source = glyph_source;
|
|
}
|
|
// Update confidence to minimum
|
|
span.confidence = span.confidence.min(glyph.confidence);
|
|
}
|
|
// Update prev_fill_color to current glyph's color
|
|
prev_fill_color = Some(&glyph.fill_color);
|
|
}
|
|
}
|
|
|
|
// Push final span
|
|
if let Some(span) = current_span {
|
|
result.push(span);
|
|
}
|
|
|
|
result
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use crate::confidence::map_confidence_source;
|
|
|
|
// CssHexColor tests
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_valid_lowercase() {
|
|
let color = CssHexColor::new("#ff0000").unwrap();
|
|
assert_eq!(color.as_str(), "#ff0000");
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_valid_uppercase() {
|
|
let color = CssHexColor::new("#FF0000").unwrap();
|
|
assert_eq!(color.as_str(), "#ff0000");
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_valid_mixed_case() {
|
|
let color = CssHexColor::new("#Ff00Aa").unwrap();
|
|
assert_eq!(color.as_str(), "#ff00aa");
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_invalid_too_short() {
|
|
let result = CssHexColor::new("#f00");
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_invalid_too_long() {
|
|
let result = CssHexColor::new("#ff0000ff");
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_invalid_no_hash() {
|
|
let result = CssHexColor::new("ff0000");
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_new_invalid_non_hex() {
|
|
let result = CssHexColor::new("#fg0000");
|
|
assert!(result.is_err());
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_from_rgb() {
|
|
let color = CssHexColor::from_rgb(255, 0, 0);
|
|
assert_eq!(color.as_str(), "#ff0000");
|
|
}
|
|
|
|
#[test]
|
|
fn test_css_hex_color_clone_is_cheap() {
|
|
let color = CssHexColor::new("#00ff00").unwrap();
|
|
let cloned = color.clone();
|
|
assert_eq!(color, cloned);
|
|
}
|
|
|
|
// SpanFlags tests
|
|
|
|
#[test]
|
|
fn test_span_flags_bold_bit() {
|
|
assert_eq!(span_flags::BOLD, 1);
|
|
assert_eq!(span_flags::ITALIC, 2);
|
|
assert_eq!(span_flags::SMALLCAPS, 4);
|
|
assert_eq!(span_flags::SUBSCRIPT, 8);
|
|
assert_eq!(span_flags::SUPERSCRIPT, 16);
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_flags_combinable() {
|
|
let bold_italic = span_flags::BOLD | span_flags::ITALIC;
|
|
assert_eq!(bold_italic, 3);
|
|
}
|
|
|
|
// Span struct tests
|
|
|
|
#[test]
|
|
fn test_span_constructible_with_all_fields() {
|
|
let span = Span::new(
|
|
"Hello".to_string(),
|
|
[0.0, 0.0, 100.0, 12.0],
|
|
Arc::from("Helvetica"),
|
|
12.0,
|
|
Some(CssHexColor::new("#000000").unwrap()),
|
|
0,
|
|
1.0,
|
|
ConfidenceSource::Native,
|
|
None,
|
|
0,
|
|
);
|
|
assert_eq!(span.text, "Hello");
|
|
assert_eq!(&*span.font, "Helvetica");
|
|
assert_eq!(span.size, 12.0);
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_empty() {
|
|
let span = Span::empty();
|
|
assert!(span.text.is_empty());
|
|
assert_eq!(span.bbox, [0.0, 0.0, 0.0, 0.0]);
|
|
assert_eq!(span.flags, 0);
|
|
}
|
|
|
|
#[test]
fn test_span_clone_is_cheap() {
    let span = Span::new(
        "Hello".to_string(),
        [0.0, 0.0, 100.0, 12.0],
        Arc::from("Helvetica"),
        12.0,
        Some(CssHexColor::new("#000000").unwrap()),
        0,
        1.0,
        ConfidenceSource::Native,
        Some(Arc::from("en")),
        span_flags::BOLD,
    );
    let cloned = span.clone();
    assert_eq!(span, cloned);

    // `font` and `lang` are `Arc<str>`, so a clone must point at the same
    // allocation instead of copying the string.
    assert!(Arc::ptr_eq(&span.font, &cloned.font));

    // Fail loudly if either side lost its language tag; a silent skip would
    // let the sharing check below go untested.
    let original_lang = span.lang.as_ref().expect("lang was set to Some above");
    let cloned_lang = cloned.lang.as_ref().expect("clone must keep lang");
    assert!(Arc::ptr_eq(original_lang, cloned_lang));
}
|
|
|
|
#[test]
fn test_span_serde_json_roundtrip() {
    let span = Span::new(
        "Hello".to_string(),
        [0.0, 0.0, 100.0, 12.0],
        Arc::from("Helvetica"),
        12.0,
        Some(CssHexColor::new("#ff0000").unwrap()),
        0,
        1.0,
        ConfidenceSource::Native,
        None,
        span_flags::BOLD | span_flags::ITALIC,
    );

    let json = serde_json::to_string(&span).unwrap();
    let deserialized: Span = serde_json::from_str(&json).unwrap();

    // Field-by-field checks first, so a failure names the offending field.
    assert_eq!(deserialized.text, span.text);
    assert_eq!(deserialized.bbox, span.bbox);
    assert_eq!(deserialized.font, span.font);
    assert_eq!(deserialized.size, span.size);
    assert_eq!(deserialized.color, span.color);
    assert_eq!(deserialized.rendering_mode, span.rendering_mode);
    assert_eq!(deserialized.confidence, span.confidence);
    assert_eq!(deserialized.confidence_source, span.confidence_source);
    assert_eq!(deserialized.lang, span.lang);
    assert_eq!(deserialized.flags, span.flags);
    assert_eq!(deserialized.column, span.column);

    // Whole-struct comparison guards any field added to `Span` later.
    assert_eq!(deserialized, span);
}
|
|
|
|
#[test]
|
|
fn test_span_with_none_color_serializes() {
|
|
let span = Span::new(
|
|
"Hello".to_string(),
|
|
[0.0, 0.0, 100.0, 12.0],
|
|
Arc::from("Helvetica"),
|
|
12.0,
|
|
None,
|
|
0,
|
|
1.0,
|
|
ConfidenceSource::Native,
|
|
None,
|
|
0,
|
|
);
|
|
|
|
let json = serde_json::to_string(&span).unwrap();
|
|
assert!(json.contains(r#""color":null"#));
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_is_bold() {
|
|
let mut span = Span::empty();
|
|
span.flags = span_flags::BOLD;
|
|
assert!(span.is_bold());
|
|
assert!(!span.is_italic());
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_is_italic() {
|
|
let mut span = Span::empty();
|
|
span.flags = span_flags::ITALIC;
|
|
assert!(span.is_italic());
|
|
assert!(!span.is_bold());
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_is_smallcaps() {
|
|
let mut span = Span::empty();
|
|
span.flags = span_flags::SMALLCAPS;
|
|
assert!(span.is_smallcaps());
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_is_subscript() {
|
|
let mut span = Span::empty();
|
|
span.flags = span_flags::SUBSCRIPT;
|
|
assert!(span.is_subscript());
|
|
assert!(!span.is_superscript());
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_is_superscript() {
|
|
let mut span = Span::empty();
|
|
span.flags = span_flags::SUPERSCRIPT;
|
|
assert!(span.is_superscript());
|
|
assert!(!span.is_subscript());
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_combined_flags() {
|
|
let mut span = Span::empty();
|
|
span.flags = span_flags::BOLD | span_flags::ITALIC;
|
|
assert!(span.is_bold());
|
|
assert!(span.is_italic());
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_size_within_budget() {
|
|
// AC: Span struct size ~80 bytes (Arc str = 16 bytes shared, String avg 32, bbox 16, scalars 16)
|
|
let size = std::mem::size_of::<Span>();
|
|
// Check that we're within reasonable bounds
|
|
assert!(size <= 120, "Span struct size {} exceeds 120 bytes", size);
|
|
eprintln!("Span struct size: {} bytes", size);
|
|
}
|
|
|
|
#[test]
|
|
fn test_span_confidence_source_variants() {
|
|
// Test all three ConfidenceSource variants
|
|
let native = Span::new(
|
|
"text".to_string(),
|
|
[0.0, 0.0, 100.0, 12.0],
|
|
Arc::from("Helvetica"),
|
|
12.0,
|
|
None,
|
|
0,
|
|
1.0,
|
|
ConfidenceSource::Native,
|
|
None,
|
|
0,
|
|
);
|
|
assert_eq!(native.confidence_source, ConfidenceSource::Native);
|
|
|
|
let heuristic = Span::new(
|
|
"text".to_string(),
|
|
[0.0, 0.0, 100.0, 12.0],
|
|
Arc::from("Helvetica"),
|
|
12.0,
|
|
None,
|
|
0,
|
|
0.5,
|
|
ConfidenceSource::Heuristic,
|
|
None,
|
|
0,
|
|
);
|
|
assert_eq!(heuristic.confidence_source, ConfidenceSource::Heuristic);
|
|
|
|
let ocr = Span::new(
|
|
"text".to_string(),
|
|
[0.0, 0.0, 100.0, 12.0],
|
|
Arc::from("Helvetica"),
|
|
12.0,
|
|
None,
|
|
0,
|
|
0.8,
|
|
ConfidenceSource::Ocr,
|
|
None,
|
|
0,
|
|
);
|
|
assert_eq!(ocr.confidence_source, ConfidenceSource::Ocr);
|
|
}
|
|
|
|
// Acceptance criteria tests for pdftract-3zz9n (merge_glyphs_to_spans)
|
|
|
|
#[test]
|
|
fn test_merge_glyphs_to_spans_hello_world_with_word_boundary() {
|
|
// AC: Input "Hello World" (5 glyphs, space-boundary, 5 glyphs): output 2 spans "Hello " and "World"
|
|
use crate::font::UnicodeSource;
|
|
use crate::graphics_state::Color;
|
|
|
|
let glyphs = vec![
|
|
// "Hello" - 5 glyphs with same font/size/color
|
|
Glyph::new('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('e', UnicodeSource::ToUnicode, 1.0, [10.0, 10.0, 20.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('l', UnicodeSource::ToUnicode, 1.0, [20.0, 10.0, 30.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('l', UnicodeSource::ToUnicode, 1.0, [30.0, 10.0, 40.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('o', UnicodeSource::ToUnicode, 1.0, [40.0, 10.0, 50.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
// Word boundary marker (is_word_boundary = true)
|
|
Glyph::new(' ', UnicodeSource::ToUnicode, 1.0, [50.0, 10.0, 60.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), true, None, false),
|
|
// "World" - 5 glyphs with same font/size/color
|
|
Glyph::new('W', UnicodeSource::ToUnicode, 1.0, [60.0, 10.0, 70.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('o', UnicodeSource::ToUnicode, 1.0, [70.0, 10.0, 80.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('r', UnicodeSource::ToUnicode, 1.0, [80.0, 10.0, 90.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('l', UnicodeSource::ToUnicode, 1.0, [90.0, 10.0, 100.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('d', UnicodeSource::ToUnicode, 1.0, [100.0, 10.0, 110.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
];
|
|
|
|
let spans = merge_glyphs_to_spans(&glyphs);
|
|
|
|
assert_eq!(spans.len(), 2, "Expected 2 spans, got {}", spans.len());
|
|
assert_eq!(spans[0].text, "Hello ", "First span should be 'Hello '");
|
|
assert_eq!(spans[1].text, "World", "Second span should be 'World'");
|
|
}
|
|
|
|
#[test]
|
|
fn test_merge_glyphs_to_spans_font_name_change_triggers_break() {
|
|
// AC: Input "He" (regular) + "lo" (bold) at same font/color: 2 spans, font_name changes
|
|
use crate::font::UnicodeSource;
|
|
use crate::graphics_state::Color;
|
|
|
|
let glyphs = vec![
|
|
// "He" - regular Helvetica
|
|
Glyph::new('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('e', UnicodeSource::ToUnicode, 1.0, [10.0, 10.0, 20.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
// "lo" - Helvetica-Bold (font name change)
|
|
Glyph::new('l', UnicodeSource::ToUnicode, 1.0, [20.0, 10.0, 30.0, 20.0],
|
|
Arc::from("Helvetica-Bold"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('o', UnicodeSource::ToUnicode, 1.0, [30.0, 10.0, 40.0, 20.0],
|
|
Arc::from("Helvetica-Bold"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
];
|
|
|
|
let spans = merge_glyphs_to_spans(&glyphs);
|
|
|
|
assert_eq!(spans.len(), 2, "Expected 2 spans for font change");
|
|
assert_eq!(spans[0].text, "He");
|
|
assert_eq!(spans[0].font, Arc::from("Helvetica"));
|
|
assert_eq!(spans[1].text, "lo");
|
|
assert_eq!(spans[1].font, Arc::from("Helvetica-Bold"));
|
|
}
|
|
|
|
#[test]
|
|
fn test_merge_glyphs_to_spans_font_size_within_threshold_no_break() {
|
|
// AC: Input with font_size 12pt vs 12.2pt: 1 span (delta < 0.5pt)
|
|
use crate::font::UnicodeSource;
|
|
use crate::graphics_state::Color;
|
|
|
|
let glyphs = vec![
|
|
Glyph::new('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('e', UnicodeSource::ToUnicode, 1.0, [10.0, 10.0, 20.0, 20.0],
|
|
Arc::from("Helvetica"), 12.2, 0, Color::DeviceGray(0.0), false, None, false), // delta = 0.2pt < 0.5
|
|
Glyph::new('l', UnicodeSource::ToUnicode, 1.0, [20.0, 10.0, 30.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
];
|
|
|
|
let spans = merge_glyphs_to_spans(&glyphs);
|
|
|
|
assert_eq!(spans.len(), 1, "Expected 1 span for size delta < 0.5pt");
|
|
assert_eq!(spans[0].text, "Hel");
|
|
}
|
|
|
|
#[test]
|
|
fn test_merge_glyphs_to_spans_font_size_exceeds_threshold_breaks() {
|
|
// Verify that size delta > 0.5pt triggers a break
|
|
use crate::font::UnicodeSource;
|
|
use crate::graphics_state::Color;
|
|
|
|
let glyphs = vec![
|
|
Glyph::new('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.0), false, None, false),
|
|
Glyph::new('e', UnicodeSource::ToUnicode, 1.0, [10.0, 10.0, 20.0, 20.0],
|
|
Arc::from("Helvetica"), 12.6, 0, Color::DeviceGray(0.0), false, None, false), // delta = 0.6pt > 0.5
|
|
];
|
|
|
|
let spans = merge_glyphs_to_spans(&glyphs);
|
|
|
|
assert_eq!(spans.len(), 2, "Expected 2 spans for size delta > 0.5pt");
|
|
assert_eq!(spans[0].text, "H");
|
|
assert_eq!(spans[1].text, "e");
|
|
}
|
|
|
|
#[test]
|
|
fn test_merge_glyphs_to_spans_device_gray_and_rgb_normalized_same_color() {
|
|
// AC: Input with DeviceGray(0.5) then DeviceRGB([0.5,0.5,0.5]): 1 span (RGB-normalized same)
|
|
use crate::font::UnicodeSource;
|
|
use crate::graphics_state::Color;
|
|
|
|
let glyphs = vec![
|
|
Glyph::new('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.5), false, None, false),
|
|
Glyph::new('e', UnicodeSource::ToUnicode, 1.0, [10.0, 10.0, 20.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceRGB([0.5, 0.5, 0.5]), false, None, false),
|
|
Glyph::new('l', UnicodeSource::ToUnicode, 1.0, [20.0, 10.0, 30.0, 20.0],
|
|
Arc::from("Helvetica"), 12.0, 0, Color::DeviceGray(0.5), false, None, false),
|
|
];
|
|
|
|
let spans = merge_glyphs_to_spans(&glyphs);
|
|
|
|
assert_eq!(spans.len(), 1, "Expected 1 span for RGB-normalized same colors");
|
|
assert_eq!(spans[0].text, "Hel");
|
|
// DeviceGray(0.5) -> (0.5 * 255).round() = 128 -> #808080
|
|
assert_eq!(spans[0].color.as_ref().unwrap().as_str(), "#808080");
|
|
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_spot_vs_device_rgb_different_colors() {
    // AC: a Spot("PANTONE-123", 1.0) glyph followed by a DeviceRGB([1, 0, 0]) glyph
    // yields 2 spans (Spot != Device).
    use crate::{font::UnicodeSource, graphics_state::Color};

    // Identical glyph attributes apart from character, position and fill color.
    let glyph = |ch, bbox, fill| {
        Glyph::new(
            ch,
            UnicodeSource::ToUnicode,
            1.0,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            fill,
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph(
            'H',
            [0.0, 10.0, 10.0, 20.0],
            Color::Spot(Arc::from("PANTONE-123"), 1.0),
        ),
        glyph('e', [10.0, 10.0, 20.0, 20.0], Color::DeviceRGB([1.0, 0.0, 0.0])),
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 2, "Expected 2 spans: Spot color != DeviceRGB even if visual appearance is similar");

    let (spot_span, rgb_span) = (&spans[0], &spans[1]);
    assert_eq!(spot_span.text, "H");
    assert_eq!(spot_span.color, None, "Spot color serializes as None");
    assert_eq!(rgb_span.text, "e");
    assert_eq!(rgb_span.color.as_ref().unwrap().as_str(), "#ff0000");
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_empty_glyph_list() {
    // AC: Empty glyph list: returns empty Vec<Span> (no error)
    //
    // No glyph is constructed here, so no `UnicodeSource` import is needed.
    let glyphs: Vec<Glyph> = Vec::new();

    let spans = merge_glyphs_to_spans(&glyphs);

    assert!(
        spans.is_empty(),
        "Expected no spans for empty input, got {}",
        spans.len()
    );
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_rendering_mode_change() {
    // A change of rendering_mode between adjacent glyphs must start a new span.
    use crate::{font::UnicodeSource, graphics_state::Color};

    // Glyphs differ only in character, position and text rendering mode.
    let glyph = |ch, bbox, mode| {
        Glyph::new(
            ch,
            UnicodeSource::ToUnicode,
            1.0,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            mode,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph('H', [0.0, 10.0, 10.0, 20.0], 0),
        glyph('e', [10.0, 10.0, 20.0, 20.0], 2), // mode 2
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 2, "Expected 2 spans for rendering_mode change");
    let (before, after) = (&spans[0], &spans[1]);
    assert_eq!(before.rendering_mode, 0);
    assert_eq!(after.rendering_mode, 2);
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_confidence_minimum() {
    // INV: a span's confidence is the MINIMUM confidence over its member glyphs.
    use crate::{font::UnicodeSource, graphics_state::Color};

    // Glyphs differ in character, unicode source, confidence and position.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0]),
        glyph('e', UnicodeSource::ShapeMatch, 0.7, [10.0, 10.0, 20.0, 20.0]),
        glyph('l', UnicodeSource::Agl, 0.9, [20.0, 10.0, 30.0, 20.0]),
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    // min(1.0, 0.7, 0.9) = 0.7
    let merged = &spans[0];
    assert_eq!(merged.confidence, 0.7);
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_confidence_source_worst_glyph() {
    // INV: confidence_source is mapped from the source of the WORST
    // (lowest-confidence) glyph in the span.
    use crate::{font::UnicodeSource, graphics_state::Color};

    // Glyphs differ in character, unicode source, confidence and position.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph('H', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0]),
        glyph('e', UnicodeSource::ShapeMatch, 0.7, [10.0, 10.0, 20.0, 20.0]),
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    // ShapeMatch (0.7) is worse than ToUnicode (1.0), so the span reports Heuristic.
    let merged = &spans[0];
    assert_eq!(merged.confidence_source, ConfidenceSource::Heuristic);
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_bbox_union() {
    // The span bbox must be the union of every member glyph's bbox.
    use crate::{font::UnicodeSource, graphics_state::Color};

    // Only the character and the bounding box vary between glyphs.
    let glyph = |ch, bbox| {
        Glyph::new(
            ch,
            UnicodeSource::ToUnicode,
            1.0,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph('H', [10.0, 20.0, 20.0, 30.0]),
        glyph('e', [25.0, 15.0, 35.0, 25.0]),
        glyph('l', [40.0, 18.0, 50.0, 28.0]),
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    // Union: x0 = min(10, 25, 40) = 10, y0 = min(20, 15, 18) = 15,
    //        x1 = max(20, 35, 50) = 50, y1 = max(30, 25, 28) = 30
    let merged = &spans[0];
    assert_eq!(merged.bbox, [10.0, 15.0, 50.0, 30.0]);
}
|
|
|
|
#[test]
fn test_merge_glyphs_to_spans_unicode_source_to_confidence_source_mapping() {
    // Verify the UnicodeSource → ConfidenceSource mapping per plan, one
    // single-glyph merge per source.
    use crate::{font::UnicodeSource, graphics_state::Color};

    // Merges a lone 'A' glyph carrying the given source/confidence and returns the spans.
    let spans_for = |source, confidence| {
        let glyphs = vec![Glyph::new(
            'A',
            source,
            confidence,
            [0.0, 10.0, 10.0, 20.0],
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )];
        merge_glyphs_to_spans(&glyphs)
    };

    // Test ToUnicode → Native
    assert_eq!(
        spans_for(UnicodeSource::ToUnicode, 1.0)[0].confidence_source,
        ConfidenceSource::Native
    );

    // Test Agl → Native
    assert_eq!(
        spans_for(UnicodeSource::Agl, 0.9)[0].confidence_source,
        ConfidenceSource::Native
    );

    // Test Fingerprint → Native
    assert_eq!(
        spans_for(UnicodeSource::Fingerprint, 0.85)[0].confidence_source,
        ConfidenceSource::Native
    );

    // Test ShapeMatch → Heuristic
    assert_eq!(
        spans_for(UnicodeSource::ShapeMatch, 0.7)[0].confidence_source,
        ConfidenceSource::Heuristic
    );

    // Test Unknown → Heuristic
    assert_eq!(
        spans_for(UnicodeSource::Unknown, 0.0)[0].confidence_source,
        ConfidenceSource::Heuristic
    );
}
|
|
|
|
#[test]
fn test_normalize_color_for_comparison_device_gray() {
    // DeviceGray normalization: the gray level is replicated across R, G and B.
    use crate::graphics_state::Color;

    let normalized = normalize_color_for_comparison(&Color::DeviceGray(0.5));

    // 0.5 * 255.0 = 127.5, rounds to 128
    let expected = Some((128, 128, 128));
    assert_eq!(normalized, expected);
}
|
|
|
|
#[test]
fn test_normalize_color_for_comparison_device_rgb() {
    // DeviceRGB normalization: each channel is scaled to 0..=255 independently.
    use crate::graphics_state::Color;

    let normalized = normalize_color_for_comparison(&Color::DeviceRGB([1.0, 0.5, 0.0]));

    // 0.5 * 255.0 = 127.5, rounds to 128
    let expected = Some((255, 128, 0));
    assert_eq!(normalized, expected);
}
|
|
|
|
#[test]
fn test_normalize_color_for_comparison_device_cmyk() {
    // DeviceCMYK normalization.
    use crate::graphics_state::Color;

    // Pure cyan (C=1, M=0, Y=0, K=0) should map to RGB (0, 255, 255).
    let cyan = Color::DeviceCMYK([1.0, 0.0, 0.0, 0.0]);

    let expected = Some((0, 255, 255));
    assert_eq!(normalize_color_for_comparison(&cyan), expected);
}
|
|
|
|
#[test]
fn test_normalize_color_for_comparison_spot() {
    // A Spot color has no RGB normalization: the result is None.
    use crate::graphics_state::Color;

    let spot = Color::Spot(Arc::from("PANTONE-123"), 1.0);

    let normalized = normalize_color_for_comparison(&spot);
    assert_eq!(normalized, None);
}
|
|
|
|
#[test]
fn test_normalize_color_for_comparison_other() {
    // Color::Other has no RGB normalization: the result is None.
    use crate::graphics_state::Color;

    let normalized = normalize_color_for_comparison(&Color::Other);

    assert_eq!(normalized, None);
}
|
|
|
|
#[test]
fn test_colors_equal_device_gray_and_rgb_same() {
    // DeviceGray(0.5) compares equal to DeviceRGB([0.5, 0.5, 0.5]).
    use crate::graphics_state::Color;

    let same = colors_equal(
        &Color::DeviceGray(0.5),
        &Color::DeviceRGB([0.5, 0.5, 0.5]),
    );

    assert!(same);
}
|
|
|
|
#[test]
fn test_colors_equal_device_gray_and_rgb_different() {
    // DeviceGray(0.5) does not compare equal to DeviceRGB([1.0, 0.5, 0.5]).
    use crate::graphics_state::Color;

    let same = colors_equal(
        &Color::DeviceGray(0.5),
        &Color::DeviceRGB([1.0, 0.5, 0.5]),
    );

    assert!(!same);
}
|
|
|
|
#[test]
fn test_colors_equal_spot_different_names() {
    // Spot colors whose names differ are not equal, even at the same tint.
    use crate::graphics_state::Color;

    let first = Color::Spot(Arc::from("PANTONE-123"), 1.0);
    let second = Color::Spot(Arc::from("PANTONE-456"), 1.0);

    let same = colors_equal(&first, &second);
    assert!(!same);
}
|
|
|
|
#[test]
fn test_colors_equal_spot_same_name_different_tint() {
    // Spot colors sharing a name but not a tint are not equal.
    use crate::graphics_state::Color;

    let full_tint = Color::Spot(Arc::from("PANTONE-123"), 1.0);
    let half_tint = Color::Spot(Arc::from("PANTONE-123"), 0.5);

    let same = colors_equal(&full_tint, &half_tint);
    assert!(!same);
}
|
|
|
|
#[test]
fn test_colors_equal_spot_same_name_same_tint() {
    // Spot colors with matching name and tint are equal.
    use crate::graphics_state::Color;

    let first = Color::Spot(Arc::from("PANTONE-123"), 1.0);
    let second = Color::Spot(Arc::from("PANTONE-123"), 1.0);

    let same = colors_equal(&first, &second);
    assert!(same);
}
|
|
|
|
#[test]
fn test_colors_equal_spot_vs_device_rgb() {
    // A Spot color is never equal to a DeviceRGB color, even if the two
    // might look alike when rendered.
    use crate::graphics_state::Color;

    let spot = Color::Spot(Arc::from("PANTONE-RED"), 1.0);
    let red = Color::DeviceRGB([1.0, 0.0, 0.0]);

    let same = colors_equal(&spot, &red);
    assert!(!same);
}
|
|
|
|
// Acceptance criteria tests for pdftract-2c5sx (span text assembly)
|
|
|
|
#[test]
fn test_assemble_text_five_glyphs_hello() {
    // AC: 5 glyphs "Hello" -> span.text == "Hello"
    use crate::font::UnicodeSource;

    // One glyph per character, laid out left to right in 10-unit-wide boxes
    // on the same baseline band (y from 10 to 20).
    let glyphs: Vec<Glyph> = "Hello"
        .chars()
        .zip([0.0, 10.0, 20.0, 30.0, 40.0])
        .map(|(ch, left)| {
            Glyph::new(
                ch,
                UnicodeSource::ToUnicode,
                1.0,
                [left, 10.0, left + 10.0, 20.0],
                Arc::from("Helvetica"),
                12.0,
                0,
                Color::DeviceGray(0.0),
                false,
                None,
                false,
            )
        })
        .collect();

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    let merged = &spans[0];
    assert_eq!(merged.text, "Hello");
}
|
|
|
|
#[test]
fn test_assemble_text_hello_world_with_boundary() {
    // AC: 5 glyphs "Hello" + boundary + 5 glyphs "World"
    //     -> span1.text == "Hello ", span2.text == "World"
    use crate::font::UnicodeSource;

    // One glyph per character in 10-unit-wide boxes; the space glyph is the
    // only one flagged as a word boundary.
    let lefts = [
        0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0,
    ];
    let glyphs: Vec<Glyph> = "Hello World"
        .chars()
        .zip(lefts)
        .map(|(ch, left)| {
            Glyph::new(
                ch,
                UnicodeSource::ToUnicode,
                1.0,
                [left, 10.0, left + 10.0, 20.0],
                Arc::from("Helvetica"),
                12.0,
                0,
                Color::DeviceGray(0.0),
                ch == ' ', // word boundary
                None,
                false,
            )
        })
        .collect();

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 2);
    let (hello, world) = (&spans[0], &spans[1]);
    assert_eq!(hello.text, "Hello ", "First span should have trailing space");
    assert_eq!(world.text, "World", "Second span should not have leading space");
}
|
|
|
|
#[test]
fn test_assemble_text_ligature_fi_as_two_glyphs() {
    // AC: Ligature glyph emitting (f, i) as 2 glyphs with shared bbox: span.text == "fi"
    // Per the plan, Phase 2 already expands ligatures into separate glyphs, so this
    // only checks that per-glyph appending works.
    use crate::font::UnicodeSource;

    // Both halves of the expanded ligature occupy the same bounding box.
    let shared_bbox = [0.0, 10.0, 12.0, 20.0];
    let glyphs: Vec<Glyph> = ['f', 'i']
        .into_iter()
        .map(|ch| {
            Glyph::new(
                ch,
                UnicodeSource::ToUnicode,
                1.0,
                shared_bbox,
                Arc::from("Helvetica"),
                12.0,
                0,
                Color::DeviceGray(0.0),
                false,
                None,
                false,
            )
        })
        .collect();

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    let merged = &spans[0];
    assert_eq!(merged.text, "fi", "Ligature expansion should concatenate both codepoints");
}
|
|
|
|
#[test]
fn test_assemble_text_rtl_arabic_preserved_in_source_order() {
    // AC: RTL Arabic span: text stays in source order (Phase 4.2 reorders at line level).
    // The input spells the Arabic word "kitab" (book): kaf, teh, alef, beh.
    // This test only checks that glyphs are appended in the order they are supplied.
    use crate::font::UnicodeSource;

    // Only the character and the bounding box vary between glyphs.
    let glyph = |ch, bbox| {
        Glyph::new(
            ch,
            UnicodeSource::ToUnicode,
            1.0,
            bbox,
            Arc::from("Arial"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    // Arabic letters in the order they are fed to the merger.
    let glyphs = vec![
        glyph('\u{0643}', [0.0, 10.0, 10.0, 20.0]),  // kaf (k)
        glyph('\u{062A}', [10.0, 10.0, 20.0, 20.0]), // teh (t)
        glyph('\u{0627}', [20.0, 10.0, 30.0, 20.0]), // alef (a)
        glyph('\u{0628}', [30.0, 10.0, 40.0, 20.0]), // beh (b)
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    // Text must follow the glyph input order; bidi reordering is left to
    // Phase 4.2 at the line level.
    let merged = &spans[0];
    assert_eq!(merged.text, "\u{0643}\u{062A}\u{0627}\u{0628}");
}
|
|
|
|
#[test]
fn test_assemble_text_boundary_at_start_of_page_no_space_injection() {
    // AC: Boundary at start of page: no space injection; first span starts cleanly
    use crate::font::UnicodeSource;

    // Glyphs differ in character, position and the word-boundary flag.
    let glyph = |ch, bbox, is_boundary| {
        Glyph::new(
            ch,
            UnicodeSource::ToUnicode,
            1.0,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            is_boundary,
            None,
            false,
        )
    };
    // The very first glyph is a word boundary (odd but possible).
    let glyphs = vec![
        glyph(' ', [0.0, 10.0, 10.0, 20.0], true),
        glyph('H', [10.0, 10.0, 20.0, 20.0], false),
        glyph('e', [20.0, 10.0, 30.0, 20.0], false),
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    // Expect a single span "He" with no leading space.
    assert_eq!(spans.len(), 1);
    let merged = &spans[0];
    assert_eq!(merged.text, "He", "No leading space when boundary is first glyph");
}
|
|
|
|
#[test]
fn test_assemble_text_direct_call() {
    // Calls assemble_text directly and checks the span text after each append.
    use crate::font::UnicodeSource;

    let mut span = Span::empty();

    // Only the character and the bounding box vary between glyphs.
    let letter = |ch, bbox| {
        Glyph::new(
            ch,
            UnicodeSource::ToUnicode,
            1.0,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyph_a = letter('A', [0.0, 10.0, 10.0, 20.0]);
    let glyph_b = letter('B', [10.0, 10.0, 20.0, 20.0]);

    // Each step appends one glyph and checks the accumulated text.
    for (glyph, expected_text) in [(&glyph_a, "A"), (&glyph_b, "AB")] {
        assemble_text(&mut span, glyph);
        assert_eq!(span.text, expected_text);
    }
}
|
|
|
|
#[test]
fn test_assemble_text_preserves_special_unicode_chars() {
    // Soft hyphen, ZWJ, ZWNJ and U+FFFD must all survive into the span text.
    use crate::font::UnicodeSource;

    // Glyphs differ in character, unicode source, confidence and position.
    let glyph = |ch, source, confidence, bbox| {
        Glyph::new(
            ch,
            source,
            confidence,
            bbox,
            Arc::from("Helvetica"),
            12.0,
            0,
            Color::DeviceGray(0.0),
            false,
            None,
            false,
        )
    };
    let glyphs = vec![
        glyph('a', UnicodeSource::ToUnicode, 1.0, [0.0, 10.0, 10.0, 20.0]),
        // soft hyphen
        glyph('\u{00AD}', UnicodeSource::ToUnicode, 1.0, [10.0, 10.0, 20.0, 20.0]),
        // ZWJ
        glyph('\u{200D}', UnicodeSource::ToUnicode, 1.0, [20.0, 10.0, 30.0, 20.0]),
        // ZWNJ
        glyph('\u{200C}', UnicodeSource::ToUnicode, 1.0, [30.0, 10.0, 40.0, 20.0]),
        // replacement char
        glyph('\u{FFFD}', UnicodeSource::Unknown, 0.0, [40.0, 10.0, 50.0, 20.0]),
    ];

    let spans = merge_glyphs_to_spans(&glyphs);

    assert_eq!(spans.len(), 1);
    let merged = &spans[0];
    assert_eq!(merged.text, "a\u{00AD}\u{200D}\u{200C}\u{FFFD}");
}
|
|
|
|
// Acceptance criteria tests for pdftract-2etcd (map_confidence_source)
|
|
|
|
#[test]
fn test_map_confidence_source_to_unicode_without_correction() {
    // AC: ToUnicode + corrected=false → Native
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::ToUnicode, false);

    assert_eq!(mapped, ConfidenceSource::Native);
}
|
|
|
|
#[test]
fn test_map_confidence_source_to_unicode_with_correction() {
    // AC: ToUnicode + corrected=true → Heuristic (override applies)
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::ToUnicode, true);

    assert_eq!(mapped, ConfidenceSource::Heuristic);
}
|
|
|
|
#[test]
fn test_map_confidence_source_agl_without_correction() {
    // AC: Agl + corrected=false → Native
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::Agl, false);

    assert_eq!(mapped, ConfidenceSource::Native);
}
|
|
|
|
#[test]
fn test_map_confidence_source_agl_with_correction() {
    // AC: Agl + corrected=true → Heuristic (override applies)
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::Agl, true);

    assert_eq!(mapped, ConfidenceSource::Heuristic);
}
|
|
|
|
#[test]
fn test_map_confidence_source_fingerprint_without_correction() {
    // AC: Fingerprint + corrected=false → Native
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::Fingerprint, false);

    assert_eq!(mapped, ConfidenceSource::Native);
}
|
|
|
|
#[test]
fn test_map_confidence_source_fingerprint_with_correction() {
    // AC: Fingerprint + corrected=true → Heuristic (override applies)
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::Fingerprint, true);

    assert_eq!(mapped, ConfidenceSource::Heuristic);
}
|
|
|
|
#[test]
fn test_map_confidence_source_shape_match_any_correction() {
    // AC: ShapeMatch + (any) → Heuristic (correction flag doesn't matter)
    use crate::font::UnicodeSource;

    // Same expectation whether or not a correction was applied.
    for corrected in [false, true] {
        assert_eq!(
            map_confidence_source(UnicodeSource::ShapeMatch, corrected),
            ConfidenceSource::Heuristic
        );
    }
}
|
|
|
|
#[test]
fn test_map_confidence_source_unknown_any_correction() {
    // AC: Unknown + (any) → Heuristic (correction flag doesn't matter)
    use crate::font::UnicodeSource;

    // Same expectation whether or not a correction was applied.
    for corrected in [false, true] {
        assert_eq!(
            map_confidence_source(UnicodeSource::Unknown, corrected),
            ConfidenceSource::Heuristic
        );
    }
}
|
|
|
|
#[test]
fn test_map_confidence_source_ocr_without_correction() {
    // AC: Ocr + corrected=false → Ocr (override does NOT apply to OCR)
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::Ocr, false);

    assert_eq!(mapped, ConfidenceSource::Ocr);
}
|
|
|
|
#[test]
fn test_map_confidence_source_ocr_with_correction() {
    // AC: Ocr + corrected=true → Ocr (override does NOT apply to OCR)
    use crate::font::UnicodeSource;

    let mapped = map_confidence_source(UnicodeSource::Ocr, true);

    assert_eq!(mapped, ConfidenceSource::Ocr);
}
|
|
|
|
#[test]
fn test_map_confidence_source_exhaustive_match() {
    // AC: Exhaustive match: adding a hypothetical UnicodeSource::Fallback
    // would cause a compiler error until a match arm is added.
    // NOTE(review): that compile-time guarantee presumably lives in the match inside
    // `map_confidence_source`; the table below only pins the variants it lists.
    use crate::font::UnicodeSource;

    // (source, expected with corrected=false, expected with corrected=true)
    let cases = [
        (UnicodeSource::ToUnicode, ConfidenceSource::Native, ConfidenceSource::Heuristic),
        (UnicodeSource::Agl, ConfidenceSource::Native, ConfidenceSource::Heuristic),
        (UnicodeSource::Fingerprint, ConfidenceSource::Native, ConfidenceSource::Heuristic),
        (UnicodeSource::ShapeMatch, ConfidenceSource::Heuristic, ConfidenceSource::Heuristic),
        (UnicodeSource::Unknown, ConfidenceSource::Heuristic, ConfidenceSource::Heuristic),
        (UnicodeSource::Ocr, ConfidenceSource::Ocr, ConfidenceSource::Ocr),
    ];

    for (source, uncorrected, corrected) in cases {
        assert_eq!(
            map_confidence_source(source, false),
            uncorrected,
            "Without correction: {:?}",
            source
        );
        assert_eq!(
            map_confidence_source(source, true),
            corrected,
            "With correction: {:?}",
            source
        );
    }
}
|
|
|
|
#[test]
fn test_map_confidence_source_correction_downgrades_native_to_heuristic() {
    // INV: a Phase 4.7 correction only ever downgrades Native -> Heuristic;
    // it never rewrites an Ocr source to Heuristic.
    use crate::font::UnicodeSource;

    // First half: every Native-mapped source downgrades to Heuristic when corrected=true.
    let native_sources = [
        UnicodeSource::ToUnicode,
        UnicodeSource::Agl,
        UnicodeSource::Fingerprint,
    ];

    for source in native_sources {
        assert_eq!(
            map_confidence_source(source, false),
            ConfidenceSource::Native,
            "{:?} should be Native without correction",
            source
        );
        assert_eq!(
            map_confidence_source(source, true),
            ConfidenceSource::Heuristic,
            "{:?} should downgrade to Heuristic with correction",
            source
        );
    }

    // Second half: the correction flag must leave an Ocr source untouched.
    assert_eq!(
        map_confidence_source(UnicodeSource::Ocr, true),
        ConfidenceSource::Ocr,
        "Ocr should stay Ocr with correction"
    );
}
|
|
}
|