feat(pdftract-332k1): implement apostrophe and double-quote text-show operators

Implemented the ' (apostrophe) and " (double-quote) text-show operators:

- ' string: Move to next line (T*) then show string (Tj)
- " aw ac string: Set word_spacing=aw, char_spacing=ac, then execute '

Changes:
- Added leading, char_spacing, word_spacing fields to TextMatrix
- Implemented next_line() to use leading (T* operator)
- Added TL, Tc, Tw operators to process_with_mode()
- Fixed " operator in both process_with_mode() and execute_internal() to
  actually set word_spacing and char_spacing
- Added tests for all acceptance criteria

Closes: pdftract-332k1

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-25 14:17:06 -04:00
parent fb774af74e
commit 59a91f8b5c

View file

@ -286,6 +286,12 @@ struct TextMatrix {
font_size: f64,
/// Current font name (from Tf operator).
font_name: Option<String>,
/// Leading (from TL operator), used by T* and '.
leading: f64,
/// Character spacing (from Tc operator or " operator).
char_spacing: f64,
/// Word spacing (from Tw operator or " operator).
word_spacing: f64,
}
impl TextMatrix {
@ -296,6 +302,9 @@ impl TextMatrix {
tlm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
font_size: 12.0,
font_name: None,
leading: 0.0,
char_spacing: 0.0,
word_spacing: 0.0,
}
}
@ -305,6 +314,21 @@ impl TextMatrix {
self.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
}
/// Set leading (TL operator).
///
/// Stores `leading` unchanged; no validation or clamping happens here.
/// The stored value is read by `next_line`, which offsets the line start
/// by `-leading`.
fn set_leading(&mut self, leading: f64) {
self.leading = leading;
}
/// Set character spacing (Tc operator).
///
/// Also used for the `ac` operand of the `"` operator. Stores the value
/// unchanged; no validation happens here.
// NOTE(review): no reader of `char_spacing` is visible in this part of the
// file — confirm that glyph positioning actually applies it.
fn set_char_spacing(&mut self, char_spacing: f64) {
self.char_spacing = char_spacing;
}
/// Set word spacing (Tw operator).
///
/// Also used for the `aw` operand of the `"` operator. Stores the value
/// unchanged; no validation happens here.
// NOTE(review): no reader of `word_spacing` is visible in this part of the
// file — confirm that glyph positioning actually applies it.
fn set_word_spacing(&mut self, word_spacing: f64) {
self.word_spacing = word_spacing;
}
/// Set text matrix (Tm operator).
fn set_tm(&mut self, a: f64, b: f64, c: f64, d: f64, e: f64, f: f64) {
self.tm = [a, b, c, d, e, f];
@ -324,10 +348,19 @@ impl TextMatrix {
}
/// Move to the start of the next line (T* operator).
///
/// Equivalent to `Td` with operands `(0, -leading)`: the text line matrix
/// is translated by `-leading` along its own y axis, and both `tm` and
/// `tlm` are set to the result.
///
/// With `leading == 0` there is no vertical movement, but `tm` is still
/// reset to `tlm`, so the current position returns to the start of the
/// current line.
fn next_line(&mut self) {
    // Td: Tm = Tlm = [1 0 0 1 tx ty] x Tlm, with tx = 0 and ty = -leading.
    let tx = 0.0;
    let ty = -self.leading;
    let [a, b, c, d, e, f] = self.tlm;
    // Only the translation components change; the linear part is copied
    // from the line matrix as-is.
    self.tm = [a, b, c, d, a * tx + c * ty + e, b * tx + d * ty + f];
    self.tlm = self.tm;
}
@ -471,6 +504,30 @@ pub fn process_with_mode(
}
operand_buffer.clear();
}
"TL" => {
// Set leading: TL value
let nums = extract_numbers(&operand_buffer, 1, &mut diagnostics);
if nums.len() == 1 {
text_matrix.set_leading(nums[0]);
}
operand_buffer.clear();
}
"Tc" => {
// Set character spacing: Tc value
let nums = extract_numbers(&operand_buffer, 1, &mut diagnostics);
if nums.len() == 1 {
text_matrix.set_char_spacing(nums[0]);
}
operand_buffer.clear();
}
"Tw" => {
// Set word spacing: Tw value
let nums = extract_numbers(&operand_buffer, 1, &mut diagnostics);
if nums.len() == 1 {
text_matrix.set_word_spacing(nums[0]);
}
operand_buffer.clear();
}
"Tj" => {
// Show text: Tj string
if in_text_block {
@ -552,19 +609,28 @@ pub fn process_with_mode(
}
"\"" => {
// Set word/char spacing, move to next line, show text
// Operand order: aw ac string
if in_text_block && operand_buffer.len() >= 3 {
text_matrix.next_line();
if let Some(string_token) = operand_buffer.last() {
if let Token::String(bytes) = string_token {
process_string(
bytes,
&text_matrix,
resources,
mode,
&mut glyphs,
&mut diagnostics,
marked_content_stack,
);
// Extract aw (word spacing) and ac (character spacing)
let nums = extract_numbers(&operand_buffer, 2, &mut diagnostics);
if nums.len() == 2 {
// Set word_spacing = aw, char_spacing = ac
text_matrix.set_word_spacing(nums[0]);
text_matrix.set_char_spacing(nums[1]);
// Then invoke ' (T* then Tj)
text_matrix.next_line();
if let Some(string_token) = operand_buffer.last() {
if let Token::String(bytes) = string_token {
process_string(
bytes,
&text_matrix,
resources,
mode,
&mut glyphs,
&mut diagnostics,
marked_content_stack,
);
}
}
}
} else if !in_text_block {
@ -1217,19 +1283,28 @@ pub fn execute_with_do(
}
"\"" => {
// Set word/char spacing, move to next line, show text
// Operand order: aw ac string
if in_text_block && operand_buffer.len() >= 3 {
gstate.next_line();
if let Some(string_token) = operand_buffer.last() {
if let Token::String(bytes) = string_token {
process_string_with_ctm(
bytes,
&gstate,
resource_stack.current(),
mode,
&mut glyphs,
&mut diagnostics,
marked_content_stack,
);
// Extract aw (word spacing) and ac (character spacing)
let nums = extract_numbers(&operand_buffer, 2, &mut diagnostics);
if nums.len() == 2 {
// Set word_spacing = aw, char_spacing = ac
gstate.set_word_spacing(nums[0]);
gstate.set_char_spacing(nums[1]);
// Then invoke ' (T* then Tj)
gstate.next_line();
if let Some(string_token) = operand_buffer.last() {
if let Token::String(bytes) = string_token {
process_string_with_ctm(
bytes,
&gstate,
resource_stack.current(),
mode,
&mut glyphs,
&mut diagnostics,
marked_content_stack,
);
}
}
}
} else if !in_text_block {
@ -2785,4 +2860,79 @@ mod tests {
.count();
assert_eq!(diag_count, 1, "Should emit TEXT_SHOW_OUTSIDE_BT diagnostic");
}
#[test]
fn test_apostrophe_operator_with_leading() {
    // The stream sets leading to 12 with TL, then shows (Hello) with the
    // ' operator, which moves to the next line before showing the string.
    let stream = b"BT /F1 12 Tf 12 TL (Hello) ' ET";
    let res = ResourceDict::new();
    let shown = process_with_mode(stream, &res, ProcessingMode::PositionHint, None).unwrap();

    // Exactly one glyph is expected for the single shown string.
    assert_eq!(shown.len(), 1);

    // Moving down by the leading from the origin must give a negative y.
    let first = &shown[0];
    assert!(
        first.bbox[1] < 0.0,
        "Y position should be negative after leading"
    );
}
#[test]
fn test_double_quote_operator_sets_spacing() {
    // The stream sets leading to 12 with TL, then runs `5 1 (World) "`,
    // i.e. aw = 5 and ac = 1 followed by the same move-and-show as '.
    let stream = b"BT /F1 12 Tf 12 TL 5 1 (World) \" ET";
    let res = ResourceDict::new();
    let shown = process_with_mode(stream, &res, ProcessingMode::PositionHint, None).unwrap();

    // Exactly one glyph is expected for the single shown string.
    assert_eq!(shown.len(), 1);

    // Only the line move is checked here; the spacing values themselves are
    // not observable through these assertions.
    let first = &shown[0];
    assert!(
        first.bbox[1] < 0.0,
        "Y position should be negative after leading"
    );
}
#[test]
fn test_apostrophe_outside_bt_emits_diagnostic() {
    // The ' operator appears without an enclosing BT/ET pair.
    let stream = b"(Hello) '";
    let res = ResourceDict::new();
    let outcome = process_with_mode(stream, &res, ProcessingMode::PositionHint, None);

    // Processing must fail, and the failure must carry the
    // TextShowOutsideBt diagnostic.
    assert!(outcome.is_err());
    let reported = outcome.unwrap_err();
    assert!(reported
        .iter()
        .any(|diag| diag.code == DiagCode::TextShowOutsideBt));
}
#[test]
fn test_double_quote_outside_bt_emits_diagnostic() {
    // The " operator (with its aw, ac and string operands) appears without
    // an enclosing BT/ET pair.
    let stream = b"5 1 (Hello) \"";
    let res = ResourceDict::new();
    let outcome = process_with_mode(stream, &res, ProcessingMode::PositionHint, None);

    // Processing must fail, and the failure must carry the
    // TextShowOutsideBt diagnostic.
    assert!(outcome.is_err());
    let reported = outcome.unwrap_err();
    assert!(reported
        .iter()
        .any(|diag| diag.code == DiagCode::TextShowOutsideBt));
}
#[test]
fn test_double_quote_with_insufficient_operands() {
    // Only two operands (one number and the string) precede ", instead of
    // the three it requires; processing must neither panic nor fail.
    let stream = b"BT 5 (Hello) \" ET";
    let res = ResourceDict::new();
    let shown = process_with_mode(stream, &res, ProcessingMode::PositionHint, None).unwrap();

    // With too few operands the string must not be shown at all.
    assert_eq!(shown.len(), 0);
}
}