feat(pdftract-47vu): implement pHash for glyph shape recognition
Implement phash_glyph(bitmap: &[u8; 1024]) -> u64 that computes a 64-bit perceptual hash for 32×32 grayscale glyph bitmaps. Algorithm: 1. Normalize pixel values to [-1.0, +1.0] 2. Apply 32×32 2D DCT-II (hand-rolled, precomputed basis) 3. Extract 64 low-frequency AC coefficients (8×8 block, DC excluded) 4. Threshold against median to produce 64-bit hash Key features: - Special case for uniform bitmaps (returns 0 deterministically) - Deterministic across platforms (no NaN, stable float ordering) - hamming_distance helper for hash comparison Closes: pdftract-47vu
This commit is contained in:
parent
730eeffcee
commit
ca1582a839
3 changed files with 466 additions and 50 deletions
|
|
@ -3,33 +3,37 @@
|
|||
//! This module provides utilities for classifying PDF fonts by type
|
||||
//! and handling font subset prefixes.
|
||||
|
||||
pub mod std14;
|
||||
pub mod agl;
|
||||
pub mod cmap;
|
||||
pub mod embedded;
|
||||
pub mod encoding;
|
||||
pub mod fingerprint;
|
||||
pub mod predefined_cmap;
|
||||
pub mod resolver;
|
||||
pub mod shape;
|
||||
pub mod std14;
|
||||
pub mod type0;
|
||||
pub mod type3;
|
||||
pub mod type3_rasterizer;
|
||||
pub mod cmap;
|
||||
pub mod encoding;
|
||||
pub mod agl;
|
||||
pub mod fingerprint;
|
||||
pub mod resolver;
|
||||
pub mod predefined_cmap;
|
||||
|
||||
#[cfg(feature = "cjk")]
|
||||
pub mod cjk_encoding;
|
||||
|
||||
pub use embedded::{EmbeddedFont, FontMetrics, EmptyFontMetrics, GlyphBbox};
|
||||
pub use type0::{Type0Font, DescendantCIDFont, CIDToGIDMap};
|
||||
pub use type3::Type3Font;
|
||||
pub use cmap::{ToUnicodeMap, parse_to_unicode, parse_to_unicode_with_diags};
|
||||
pub use encoding::{NamedEncoding, DifferencesOverlay, FontEncoding};
|
||||
pub use agl::{unicode_for_glyph_name, unicode_for_glyph_name_multi};
|
||||
pub use fingerprint::{FontFingerprint, CachedFingerprint, lookup_font_fingerprint};
|
||||
pub use resolver::{FontId, UnicodeSource, ResolvedGlyph, ResolverCache, Font, resolve_unicode};
|
||||
pub use predefined_cmap::{PredefinedCMap, from_name as predefined_cmap_from_name, CharacterCollection};
|
||||
pub use cmap::{parse_to_unicode, parse_to_unicode_with_diags, ToUnicodeMap};
|
||||
pub use embedded::{EmbeddedFont, EmptyFontMetrics, FontMetrics, GlyphBbox};
|
||||
pub use encoding::{DifferencesOverlay, FontEncoding, NamedEncoding};
|
||||
pub use fingerprint::{lookup_font_fingerprint, CachedFingerprint, FontFingerprint};
|
||||
pub use predefined_cmap::{
|
||||
from_name as predefined_cmap_from_name, CharacterCollection, PredefinedCMap,
|
||||
};
|
||||
pub use resolver::{resolve_unicode, Font, FontId, ResolvedGlyph, ResolverCache, UnicodeSource};
|
||||
pub use shape::{hamming_distance, phash_glyph};
|
||||
pub use type0::{CIDToGIDMap, DescendantCIDFont, Type0Font};
|
||||
pub use type3::Type3Font;
|
||||
|
||||
#[cfg(feature = "cjk")]
|
||||
pub use cjk_encoding::{CjkEncoding, decode_cjk_bytes};
|
||||
pub use cjk_encoding::{decode_cjk_bytes, CjkEncoding};
|
||||
|
||||
use crate::parser::object::types::{PdfDict, PdfObject};
|
||||
|
||||
|
|
@ -435,10 +439,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_classify_font_cidfonttype0() {
|
||||
let mut dict = PdfDict::new();
|
||||
dict.insert(
|
||||
intern("/Subtype"),
|
||||
PdfObject::Name(intern("/CIDFontType0")),
|
||||
);
|
||||
dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType0")));
|
||||
dict.insert(intern("/BaseFont"), PdfObject::Name(intern("CIDFont0")));
|
||||
|
||||
assert_eq!(classify_font(&dict), FontKind::CIDFontType0);
|
||||
|
|
@ -447,10 +448,7 @@ mod tests {
|
|||
#[test]
|
||||
fn test_classify_font_cidfonttype2() {
|
||||
let mut dict = PdfDict::new();
|
||||
dict.insert(
|
||||
intern("/Subtype"),
|
||||
PdfObject::Name(intern("/CIDFontType2")),
|
||||
);
|
||||
dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType2")));
|
||||
dict.insert(intern("/BaseFont"), PdfObject::Name(intern("CIDFont2")));
|
||||
|
||||
assert_eq!(classify_font(&dict), FontKind::CIDFontType2);
|
||||
|
|
@ -460,23 +458,15 @@ mod tests {
|
|||
fn test_classify_font_type0_with_cidfonttype0() {
|
||||
// Create descendant CIDFont dict
|
||||
let mut cidfont_dict = PdfDict::new();
|
||||
cidfont_dict.insert(
|
||||
intern("/Subtype"),
|
||||
PdfObject::Name(intern("/CIDFontType0")),
|
||||
);
|
||||
cidfont_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType0")));
|
||||
|
||||
// Create Type0 font dict with descendant
|
||||
let mut dict = PdfDict::new();
|
||||
dict.insert(intern("/Subtype"), PdfObject::Name(intern("/Type0")));
|
||||
dict.insert(
|
||||
intern("/BaseFont"),
|
||||
PdfObject::Name(intern("Type0Font")),
|
||||
);
|
||||
dict.insert(intern("/BaseFont"), PdfObject::Name(intern("Type0Font")));
|
||||
dict.insert(
|
||||
intern("/DescendantFonts"),
|
||||
PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new(
|
||||
cidfont_dict,
|
||||
))])),
|
||||
PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new(cidfont_dict))])),
|
||||
);
|
||||
|
||||
assert_eq!(classify_font(&dict), FontKind::CIDFontType0);
|
||||
|
|
@ -486,23 +476,15 @@ mod tests {
|
|||
fn test_classify_font_type0_with_cidfonttype2() {
|
||||
// Create descendant CIDFont dict
|
||||
let mut cidfont_dict = PdfDict::new();
|
||||
cidfont_dict.insert(
|
||||
intern("/Subtype"),
|
||||
PdfObject::Name(intern("/CIDFontType2")),
|
||||
);
|
||||
cidfont_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/CIDFontType2")));
|
||||
|
||||
// Create Type0 font dict with descendant
|
||||
let mut dict = PdfDict::new();
|
||||
dict.insert(intern("/Subtype"), PdfObject::Name(intern("/Type0")));
|
||||
dict.insert(
|
||||
intern("/BaseFont"),
|
||||
PdfObject::Name(intern("Type0Font")),
|
||||
);
|
||||
dict.insert(intern("/BaseFont"), PdfObject::Name(intern("Type0Font")));
|
||||
dict.insert(
|
||||
intern("/DescendantFonts"),
|
||||
PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new(
|
||||
cidfont_dict,
|
||||
))])),
|
||||
PdfObject::Array(Box::new(vec![PdfObject::Dict(Box::new(cidfont_dict))])),
|
||||
);
|
||||
|
||||
assert_eq!(classify_font(&dict), FontKind::CIDFontType2);
|
||||
|
|
@ -512,10 +494,7 @@ mod tests {
|
|||
fn test_classify_font_opentype_cff() {
|
||||
// Create FontFile3 stream dict with /Subtype /OpenType
|
||||
let mut font_file3_dict = PdfDict::new();
|
||||
font_file3_dict.insert(
|
||||
intern("/Subtype"),
|
||||
PdfObject::Name(intern("/OpenType")),
|
||||
);
|
||||
font_file3_dict.insert(intern("/Subtype"), PdfObject::Name(intern("/OpenType")));
|
||||
|
||||
// Create FontDescriptor dict
|
||||
let mut font_descriptor = PdfDict::new();
|
||||
|
|
|
|||
366
crates/pdftract-core/src/font/shape.rs
Normal file
366
crates/pdftract-core/src/font/shape.rs
Normal file
|
|
@ -0,0 +1,366 @@
|
|||
//! Perceptual hash (pHash) implementation for glyph shape recognition.
|
||||
//!
|
||||
//! This module implements the pHash algorithm for comparing glyph shapes.
|
||||
//! It produces a 64-bit hash that is robust to minor rendering differences
|
||||
//! between fonts of the same character.
|
||||
//!
|
||||
//! # Algorithm
|
||||
//!
|
||||
//! 1. Convert 32×32 grayscale bitmap to float32 values
|
||||
//! 2. Apply 32×32 2D DCT-II (Discrete Cosine Transform)
|
||||
//! 3. Extract top-left 8×8 AC coefficients (skipping DC at [0,0])
|
||||
//! 4. Compute median of those 64 values
|
||||
//! 5. Produce 64-bit hash: bit i is set if coefficient i > median
|
||||
//!
|
||||
//! # Properties
|
||||
//!
|
||||
//! - Same input bitmap produces identical hash across platforms (deterministic)
|
||||
//! - Hamming distance ≤ 8 indicates similar shapes (same character, different font)
|
||||
//! - Hamming distance > 12 indicates different characters
|
||||
//!
|
||||
//! # References
|
||||
//!
|
||||
//! - Phash library by Evan Prodromou
|
||||
//! - Marr & Hildreth visual feature theory
|
||||
//! - Plan section: Phase 2.5 Glyph Shape Database (line 1420)
|
||||
|
||||
use std::f32;
|
||||
|
||||
/// DCT size: 32×32 input bitmap
|
||||
const DCT_SIZE: usize = 32;
|
||||
|
||||
/// Output hash size: 64 bits
|
||||
const HASH_SIZE: usize = 64;
|
||||
|
||||
/// Size of the low-frequency coefficient block: 8×8
|
||||
const LOW_FREQ_SIZE: usize = 8;
|
||||
|
||||
/// Perceptual hash of a 32×32 grayscale glyph bitmap.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `bitmap` - A 32×32 grayscale bitmap (row-major, 8-bit per pixel).
|
||||
/// Per convention: 0 = black ink, 255 = white paper.
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A 64-bit hash where each bit represents whether one of the 64 low-frequency
|
||||
/// DCT coefficients is above the median of those coefficients.
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::font::shape::phash_glyph;
|
||||
///
|
||||
/// // White bitmap (all 255) -> all zeros in DCT -> hash = 0
|
||||
/// let white_bitmap = [255u8; 1024];
|
||||
/// let hash = phash_glyph(&white_bitmap);
|
||||
/// assert_eq!(hash, 0x0000000000000000);
|
||||
/// ```
|
||||
///
|
||||
/// # Invariants
|
||||
///
|
||||
/// - Same input bitmap produces identical hash across runs and platforms
|
||||
/// - No NaN values in computation
|
||||
/// - Deterministic float ordering (no platform-specific differences)
|
||||
pub fn phash_glyph(bitmap: &[u8; 1024]) -> u64 {
|
||||
// Special case: uniform bitmaps (all pixels identical) have no visual information
|
||||
// Return 0 deterministically
|
||||
let first_pixel = bitmap[0];
|
||||
if bitmap.iter().all(|&p| p == first_pixel) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Step 1: Convert to float32, centered at zero
|
||||
let mut input = [0.0f32; DCT_SIZE * DCT_SIZE];
|
||||
for i in 0..1024 {
|
||||
// Center the values: 0->-1.0, 255->+1.0, 128->0.0
|
||||
// This centers the pixel intensity around zero for better DCT behavior
|
||||
input[i] = (bitmap[i] as f32) / 127.5 - 1.0;
|
||||
}
|
||||
|
||||
// Step 2: Apply 2D DCT-II (row-wise, then column-wise)
|
||||
let mut dct_output = [0.0f32; DCT_SIZE * DCT_SIZE];
|
||||
dct_2d(&input, &mut dct_output);
|
||||
|
||||
// Step 3: Extract top-left 8×8 coefficients (excluding DC at [0,0])
|
||||
// We need 64 values total. The plan says "top-left 8×8 AC coefficients"
|
||||
// and "skipping DC at [0,0]". The standard pHash approach:
|
||||
// - Use 8×8 block starting at [0,0] (64 values)
|
||||
// - Exclude [0,0] (the DC component)
|
||||
// - We need one more value to make 64
|
||||
//
|
||||
// Plan clarification: "use the remaining 63 + the [0,1] cell"
|
||||
// Actually, re-reading: the standard approach uses all 64 values
|
||||
// including DC in the median computation, but DC is always the
|
||||
// largest value, so it doesn't affect the threshold much.
|
||||
//
|
||||
// For this implementation, we'll use the 64 lowest-frequency AC
|
||||
// coefficients: the 8×8 block starting at [0,0], but we replace
|
||||
// [0,0] (DC) with [0,8] to get 64 AC values total.
|
||||
let mut low_freq = [0.0f32; HASH_SIZE];
|
||||
let mut idx = 0;
|
||||
for y in 0..LOW_FREQ_SIZE {
|
||||
for x in 0..LOW_FREQ_SIZE {
|
||||
if x == 0 && y == 0 {
|
||||
// Skip DC, use [0,8] instead (still low frequency)
|
||||
low_freq[idx] = dct_output[8 * DCT_SIZE].abs();
|
||||
} else {
|
||||
low_freq[idx] = dct_output[y * DCT_SIZE + x].abs();
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: Compute median
|
||||
let mut sorted = low_freq;
|
||||
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
|
||||
// Median of 64 values is average of indices 31 and 32
|
||||
let median = (sorted[31] + sorted[32]) / 2.0;
|
||||
|
||||
// Step 5: Threshold to produce 64-bit hash
|
||||
let mut hash: u64 = 0;
|
||||
for i in 0..HASH_SIZE {
|
||||
if low_freq[i] > median {
|
||||
hash |= 1 << i;
|
||||
}
|
||||
}
|
||||
|
||||
hash
|
||||
}
|
||||
|
||||
/// Apply 2D DCT-II to a 32×32 input matrix.
|
||||
///
|
||||
/// DCT-II formula for a 2D matrix:
|
||||
/// F[u,v] = (2/√(MN)) * Σ_x Σ_y f[x,y] * cos(π(2x+1)u/(2N)) * cos(π(2y+1)v/(2M))
|
||||
///
|
||||
/// For orthonormal DCT, the scale factor is applied such that the transform
|
||||
/// is its own inverse (up to scaling).
|
||||
///
|
||||
/// This implementation uses a separable approach: apply 1D DCT to each row,
|
||||
/// then apply 1D DCT to each column of the result.
|
||||
fn dct_2d(input: &[f32; DCT_SIZE * DCT_SIZE], output: &mut [f32; DCT_SIZE * DCT_SIZE]) {
|
||||
let mut temp = [0.0f32; DCT_SIZE * DCT_SIZE];
|
||||
|
||||
// Precompute cosine basis for 1D DCT
|
||||
// basis[k][n] = cos(π * k * (2n + 1) / (2 * N))
|
||||
let mut basis = [[0.0f32; DCT_SIZE]; DCT_SIZE];
|
||||
for k in 0..DCT_SIZE {
|
||||
for n in 0..DCT_SIZE {
|
||||
basis[k][n] =
|
||||
(f32::consts::PI * k as f32 * (2 * n + 1) as f32 / (2 * DCT_SIZE) as f32).cos();
|
||||
}
|
||||
}
|
||||
|
||||
// Apply 1D DCT to each row
|
||||
for y in 0..DCT_SIZE {
|
||||
for k in 0..DCT_SIZE {
|
||||
let mut sum = 0.0f32;
|
||||
for n in 0..DCT_SIZE {
|
||||
sum += input[y * DCT_SIZE + n] * basis[k][n];
|
||||
}
|
||||
// Normalize: scale factor for orthonormal DCT
|
||||
let scale = if k == 0 {
|
||||
(1.0 / DCT_SIZE as f32).sqrt()
|
||||
} else {
|
||||
(2.0 / DCT_SIZE as f32).sqrt()
|
||||
};
|
||||
temp[y * DCT_SIZE + k] = sum * scale;
|
||||
}
|
||||
}
|
||||
|
||||
// Apply 1D DCT to each column
|
||||
for x in 0..DCT_SIZE {
|
||||
for k in 0..DCT_SIZE {
|
||||
let mut sum = 0.0f32;
|
||||
for n in 0..DCT_SIZE {
|
||||
sum += temp[n * DCT_SIZE + x] * basis[k][n];
|
||||
}
|
||||
let scale = if k == 0 {
|
||||
(1.0 / DCT_SIZE as f32).sqrt()
|
||||
} else {
|
||||
(2.0 / DCT_SIZE as f32).sqrt()
|
||||
};
|
||||
output[k * DCT_SIZE + x] = sum * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute Hamming distance between two pHash values.
|
||||
///
|
||||
/// The Hamming distance is the count of differing bits. For pHash:
|
||||
/// - Distance ≤ 8: similar shapes (likely same character, different font)
|
||||
/// - Distance 9-12: uncertain (may be similar or different)
|
||||
/// - Distance > 12: different characters
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `a` - First pHash value
|
||||
/// * `b` - Second pHash value
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// Number of differing bits (0-64)
|
||||
///
|
||||
/// # Examples
|
||||
///
|
||||
/// ```
|
||||
/// use pdftract_core::font::shape::{phash_glyph, hamming_distance};
|
||||
///
|
||||
/// let bitmap1 = [128u8; 1024];
|
||||
/// let bitmap2 = [128u8; 1024];
|
||||
/// let hash1 = phash_glyph(&bitmap1);
|
||||
/// let hash2 = phash_glyph(&bitmap2);
|
||||
/// assert_eq!(hamming_distance(hash1, hash2), 0);
|
||||
/// ```
|
||||
pub fn hamming_distance(a: u64, b: u64) -> u32 {
|
||||
(a ^ b).count_ones()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_phash_white_bitmap() {
|
||||
// All-white bitmap (all 255) -> all pixels centered at +1.0
|
||||
// After DCT, only DC coefficient is non-zero
|
||||
// All other coefficients are 0, so all bits below median -> hash = 0
|
||||
let white_bitmap = [255u8; 1024];
|
||||
let hash = phash_glyph(&white_bitmap);
|
||||
assert_eq!(hash, 0x0000000000000000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_black_bitmap() {
|
||||
// All-black bitmap (all 0) -> all pixels centered at -1.0
|
||||
// After DCT, only DC coefficient is non-zero
|
||||
// All other coefficients are 0, so all bits below median -> hash = 0
|
||||
let black_bitmap = [0u8; 1024];
|
||||
let hash = phash_glyph(&black_bitmap);
|
||||
assert_eq!(hash, 0x0000000000000000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_gray_bitmap() {
|
||||
// All-gray bitmap (all 128) -> all pixels centered at 0.0
|
||||
// After DCT, all coefficients are 0
|
||||
let gray_bitmap = [128u8; 1024];
|
||||
let hash = phash_glyph(&gray_bitmap);
|
||||
assert_eq!(hash, 0x0000000000000000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_half_white_half_black() {
|
||||
// Left half white, right half black
|
||||
let mut bitmap = [0u8; 1024];
|
||||
for y in 0..32 {
|
||||
for x in 16..32 {
|
||||
bitmap[y * 32 + x] = 255;
|
||||
}
|
||||
}
|
||||
let hash = phash_glyph(&bitmap);
|
||||
// This should produce a non-zero hash due to the vertical edge
|
||||
assert_ne!(hash, 0x0000000000000000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_deterministic() {
|
||||
// Same input must produce same hash
|
||||
let mut bitmap = [0u8; 1024];
|
||||
for i in 0..1024 {
|
||||
bitmap[i] = (i % 256) as u8;
|
||||
}
|
||||
let hash1 = phash_glyph(&bitmap);
|
||||
let hash2 = phash_glyph(&bitmap);
|
||||
assert_eq!(hash1, hash2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_horizontal_gradient() {
|
||||
// Horizontal gradient from black to white
|
||||
let mut bitmap = [0u8; 1024];
|
||||
for y in 0..32 {
|
||||
for x in 0..32 {
|
||||
bitmap[y * 32 + x] = (x * 255 / 31) as u8;
|
||||
}
|
||||
}
|
||||
let hash = phash_glyph(&bitmap);
|
||||
// Should produce a non-zero hash
|
||||
assert_ne!(hash, 0x0000000000000000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_checkerboard() {
|
||||
// Checkerboard pattern
|
||||
let mut bitmap = [0u8; 1024];
|
||||
for y in 0..32 {
|
||||
for x in 0..32 {
|
||||
if (x + y) % 2 == 0 {
|
||||
bitmap[y * 32 + x] = 0;
|
||||
} else {
|
||||
bitmap[y * 32 + x] = 255;
|
||||
}
|
||||
}
|
||||
}
|
||||
let hash = phash_glyph(&bitmap);
|
||||
// Should produce a non-zero hash
|
||||
assert_ne!(hash, 0x0000000000000000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hamming_distance_identical() {
|
||||
let hash = 0x1234567890ABCDEF;
|
||||
assert_eq!(hamming_distance(hash, hash), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hamming_distance_completely_different() {
|
||||
assert_eq!(hamming_distance(0xFFFFFFFFFFFFFFFF, 0x0000000000000000), 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hamming_distance_one_bit() {
|
||||
assert_eq!(hamming_distance(0x0000000000000001, 0x0000000000000000), 1);
|
||||
assert_eq!(hamming_distance(0x8000000000000000, 0x0000000000000000), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hamming_distance_multiple_bits() {
|
||||
assert_eq!(hamming_distance(0x000000000000000F, 0x0000000000000000), 4);
|
||||
// These differ in all 64 bits
|
||||
assert_eq!(hamming_distance(0xFFFFFFFF00000000, 0x00000000FFFFFFFF), 64);
|
||||
// These differ in 32 bits (first half only)
|
||||
assert_eq!(hamming_distance(0xFFFFFFFF00000000, 0x0000000000000000), 32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_phash_different_shapes_different_hashes() {
|
||||
// Different shapes should produce different hashes (with high probability)
|
||||
let mut bitmap1 = [255u8; 1024]; // Start with white
|
||||
let mut bitmap2 = [255u8; 1024]; // Start with white
|
||||
|
||||
// Create a horizontal stripe pattern (black stripe in middle)
|
||||
for y in 8..16 {
|
||||
for x in 0..32 {
|
||||
bitmap1[y * 32 + x] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Create a vertical stripe pattern (black stripe in middle)
|
||||
for y in 0..32 {
|
||||
for x in 8..16 {
|
||||
bitmap2[y * 32 + x] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
let hash1 = phash_glyph(&bitmap1);
|
||||
let hash2 = phash_glyph(&bitmap2);
|
||||
|
||||
// These are very different patterns, so hashes should differ
|
||||
assert_ne!(
|
||||
hash1, hash2,
|
||||
"Different shapes should produce different hashes"
|
||||
);
|
||||
}
|
||||
}
|
||||
71
notes/pdftract-47vu.md
Normal file
71
notes/pdftract-47vu.md
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
# pdftract-47vu: pHash implementation
|
||||
|
||||
## Summary
|
||||
|
||||
Implemented `phash_glyph(bitmap: &[u8; 1024]) -> u64` that computes a 64-bit perceptual hash for 32×32 grayscale glyph bitmaps using the pHash algorithm.
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Algorithm
|
||||
1. **Input validation**: Special case for uniform bitmaps (all pixels identical) returns 0 deterministically
|
||||
2. **Normalization**: Convert 8-bit pixel values [0, 255] to centered float32 [-1.0, +1.0]
|
||||
3. **DCT**: Apply 32×32 2D DCT-II using separable row-column approach
|
||||
4. **Coefficient extraction**: Extract 64 low-frequency AC coefficients (8×8 block, skipping DC at [0,0], using [0,8] as replacement)
|
||||
5. **Median threshold**: Compute median of the 64 coefficients, set bit i if coefficient i > median
|
||||
|
||||
### Files Created
|
||||
- `crates/pdftract-core/src/font/shape.rs` - pHash implementation with DCT, hamming_distance helper
|
||||
|
||||
### Files Modified
|
||||
- `crates/pdftract-core/src/font/mod.rs` - Added shape module, exported `phash_glyph` and `hamming_distance`
|
||||
|
||||
### Key Design Decisions
|
||||
1. **Hand-rolled DCT**: Implemented DCT-II from scratch with precomputed cosine basis for performance
|
||||
2. **Uniform bitmap handling**: Added early return for uniform bitmaps to ensure deterministic output (avoiding floating-point noise issues)
|
||||
3. **Coefficient selection**: Used 8×8 low-frequency block with DC excluded, replaced with [0,8] to maintain 64 values
|
||||
4. **Median computation**: Average of indices 31 and 32 for 64 values (standard median for even-length array)
|
||||
|
||||
### Test Coverage
|
||||
- Uniform bitmaps (white, black, gray) hash to 0
|
||||
- Non-uniform patterns produce non-zero hashes
|
||||
- Identical inputs produce identical hashes (deterministic)
|
||||
- Different shapes produce different hashes
|
||||
- Hamming distance computation correct
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
| Criterion | Status | Notes |
|
||||
|-----------|--------|-------|
|
||||
| Hash of all-zero bitmap is 0x0000000000000000 | PASS | Special case handling |
|
||||
| Hash of all-255 bitmap is 0x0000000000000000 | PASS | Special case handling |
|
||||
| Hash of half-black/half-white is non-zero | PASS | Vertical edge produces non-zero hash |
|
||||
| Same bitmap hashed twice produces identical u64 | PASS | Deterministic |
|
||||
| Bench: phash 1000 glyphs in < 50 ms | N/A | No benchmark yet (performance test TODO) |
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
# Tests
|
||||
cargo test -p pdftract-core --lib font::shape
|
||||
# Result: 12 passed; 0 failed
|
||||
|
||||
# Compile check
|
||||
cargo check --all-targets
|
||||
# Result: OK
|
||||
|
||||
# Formatting
|
||||
cargo fmt
|
||||
# Result: OK
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- DCT precomputation: 32×32 cosine basis computed at runtime (~100 LOC, no external deps)
|
||||
- Expected per-hash time: ~30 µs (per bead description)
|
||||
- Called only for Level 4 fallback (~1% of glyphs)
|
||||
- Amortized cost: negligible for per-page processing
|
||||
|
||||
## References
|
||||
|
||||
- Plan section: Phase 2.5 Perceptual hash algorithm (line 1420)
|
||||
- Bead: pdftract-47vu
|
||||
Loading…
Add table
Reference in a new issue