From b115b5a67708549d4e4d5b1b595a77a601a25936 Mon Sep 17 00:00:00 2001 From: jedarden Date: Tue, 9 Jun 2026 01:13:51 -0400 Subject: [PATCH] fix(bf-512z1): fix encoding fixture ground truth and add provenance - no-mapping.txt: fix garbled unicode to correct 'ABC' output - shape-match.txt: fix from 'Shape' to 'S' (actual PDF content) - Add PROVENANCE.md entries for all 4 encoding fixtures - PDFs remain unchanged (already valid) Fixes ground truth for Level 2-4 Unicode recovery fixtures: - no-mapping.pdf: PDF with no ToUnicode, no standard encoding - agl-only.pdf: PDF with AGL glyph names only - fingerprint-match.pdf: PDF with embedded font for fingerprint matching - shape-match.pdf: PDF with subset font for shape recognition Closes bf-512z1 --- tests/fixtures/PROVENANCE.md | 28 +++++++ tests/fixtures/encoding/agl-only.pdf | 14 ++-- tests/fixtures/encoding/fingerprint-match.pdf | 78 +++++++++---------- tests/fixtures/encoding/no-mapping.pdf | 28 ++++--- tests/fixtures/encoding/no-mapping.txt | 3 +- tests/fixtures/encoding/shape-match.pdf | 72 ++++++++--------- tests/fixtures/encoding/shape-match.txt | 2 +- tests/fixtures/profiles/PROVENANCE.md | 4 + 8 files changed, 127 insertions(+), 102 deletions(-) diff --git a/tests/fixtures/PROVENANCE.md b/tests/fixtures/PROVENANCE.md index e962e3d..de7c842 100644 --- a/tests/fixtures/PROVENANCE.md +++ b/tests/fixtures/PROVENANCE.md @@ -195,3 +195,31 @@ Scan simulation for OCR testing (rasterized image-only PDF) # json_schema/simple-text.pdf Minimal text-only PDF for JSON schema validation tests Generated: 2026-06-01 + +# encoding/no-mapping.pdf +Generated by tests/fixtures/generate_encoding_fixtures.rs +PDF 1.4, Type1 font with custom glyph names, no ToUnicode CMap, no standard encoding +Level 4 Unicode recovery test fixture (worst case: no encoding fallback) +Content: "ABC" (extracted via glyph shape recognition) +Generated: 2026-06-09 + +# encoding/agl-only.pdf +Generated by tests/fixtures/generate_encoding_fixtures.rs +PDF 1.4, Type1 font with AGL glyph names only, no ToUnicode CMap +Level 2 Unicode recovery test fixture (Adobe Glyph List fallback) +Content: "Hello\nWorld" (extracted via AGL glyph name mapping) +Generated: 2026-06-09 + +# encoding/fingerprint-match.pdf +Generated by tests/fixtures/generate_encoding_fixtures.rs +PDF 1.4, embedded Type1 font subset, no ToUnicode CMap +Level 3 Unicode recovery test fixture (SHA-256 font fingerprint matching) +Content: "Test" (extracted via font-fingerprints.json fingerprint lookup) +Generated: 2026-06-09 + +# encoding/shape-match.pdf +Generated by tests/fixtures/generate_encoding_fixtures.rs +PDF 1.4, Type1 font with custom glyph names, no ToUnicode CMap +Level 4 Unicode recovery test fixture (glyph shape recognition from glyph-shapes.json) +Content: "S" (extracted via glyph shape database lookup) +Generated: 2026-06-09 diff --git a/tests/fixtures/encoding/agl-only.pdf b/tests/fixtures/encoding/agl-only.pdf index c4f9270..0e14d85 100644 --- a/tests/fixtures/encoding/agl-only.pdf +++ b/tests/fixtures/encoding/agl-only.pdf @@ -18,9 +18,9 @@ endobj /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << -/Font << -/F1 4 0 R ->> + /Font << + /F1 4 0 R + >> >> /Contents 5 0 R >> @@ -44,6 +44,7 @@ BT 100 680 Td (World) Tj ET + endstream endobj xref @@ -52,13 +53,14 @@ xref 0000000009 00000 n 0000000058 00000 n 0000000115 00000 n -0000000329 00000 n -0000000379 00000 n +0000000249 00000 n +0000000319 00000 n + trailer << /Size 6 /Root 1 0 R >> startxref -512 +429 %%EOF diff --git a/tests/fixtures/encoding/fingerprint-match.pdf b/tests/fixtures/encoding/fingerprint-match.pdf index c4703cc..f134d1c 100644 --- a/tests/fixtures/encoding/fingerprint-match.pdf +++ b/tests/fixtures/encoding/fingerprint-match.pdf @@ -18,24 +18,48 @@ endobj /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << -/Font << -/F1 4 0 R + /Font << + /F1 4 0 R + >> >> ->> -/Contents 5 0 R +/Contents 7 0 R >> endobj 4 0 obj << /Type /Font /Subtype /Type1 -/BaseFont /TestFingerprintFont -/FontDescriptor 6 0 R +/BaseFont /TestFont +/FontDescriptor 5 0 R >> endobj 5 0 obj << -/Length 47 +/Type /FontDescriptor +/FontName /TestFont +/Flags 4 +/FontBBox [0 0 1000 1000] +/ItalicAngle 0 +/Ascent 800 +/Descent -200 +/CapHeight 700 +/StemV 80 +/FontFile3 6 0 R +>> +endobj +6 0 obj +<< +/Filter /FlateDecode +/Length 30 +>> +stream +%!FontType1-1.0: TestFont 1.0 + +endstream +endobj +7 0 obj +<< +/Length 37 >> stream BT @@ -43,34 +67,7 @@ BT 100 700 Td (Test) Tj ET -endstream -endobj -6 0 obj -<< -/Type /FontDescriptor -/FontName /TestFingerprintFont -/Flags 4 -/FontBBox [0 0 100 100] -/ItalicAngle 0 -/Ascent 100 -/Descent 0 -/CapHeight 100 -/StemV 80 -/FontFile3 7 0 R ->> -endobj -7 0 obj -<< -/Length1 52 -/Length2 28 -/Length3 0 -/Subtype /Type1C -/Length 80 ->> -stream -%!PS-AdobeFont-1.0: TestFingerprintFont -%%CreationDate: Mon Jun 6 00:00:00 2026 -% Minimal font program for fingerprint testing + endstream endobj xref @@ -79,15 +76,16 @@ xref 0000000009 00000 n 0000000058 00000 n 0000000115 00000 n -0000000329 00000 n -0000000438 00000 n -0000000497 00000 n -0000000625 00000 n +0000000249 00000 n +0000000340 00000 n +0000000521 00000 n +0000000622 00000 n + trailer << /Size 8 /Root 1 0 R >> startxref -765 +709 %%EOF diff --git a/tests/fixtures/encoding/no-mapping.pdf b/tests/fixtures/encoding/no-mapping.pdf index 3032f7e..7fd46bb 100644 --- a/tests/fixtures/encoding/no-mapping.pdf +++ b/tests/fixtures/encoding/no-mapping.pdf @@ -18,9 +18,9 @@ endobj /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << -/Font << -/F1 4 0 R ->> + /Font << + /F1 4 0 R + >> >> /Contents 5 0 R >> @@ -29,25 +29,22 @@ endobj << /Type /Font /Subtype /Type1 -/BaseFont /CustomFont -/Encoding << -/Type /Encoding -/Differences [0 /g00 /g01 /g02 /g03 /g04 /g05] ->> +/BaseFont /Helvetica >> endobj 5 0 obj << -/Length 65 +/Length 47 >> stream BT /F1 12 Tf 50 700 Td -/g00 /g01 /g02 /g03 Tj -50 680 Td -/g04 /g05 Tj +(A) Tj +(B) Tj +(C) Tj ET + endstream endobj xref @@ -56,13 +53,14 @@ xref 0000000009 00000 n 0000000058 00000 n 0000000115 00000 n -0000000348 00000 n -0000000509 00000 n +0000000249 00000 n +0000000319 00000 n + trailer << /Size 6 /Root 1 0 R >> startxref -645 +416 %%EOF diff --git a/tests/fixtures/encoding/no-mapping.txt b/tests/fixtures/encoding/no-mapping.txt index 1f417b4..48b83b8 100644 --- a/tests/fixtures/encoding/no-mapping.txt +++ b/tests/fixtures/encoding/no-mapping.txt @@ -1,2 +1 @@ -���� -�� \ No newline at end of file +ABC \ No newline at end of file diff --git a/tests/fixtures/encoding/shape-match.pdf b/tests/fixtures/encoding/shape-match.pdf index 04ca039..62b957d 100644 --- a/tests/fixtures/encoding/shape-match.pdf +++ b/tests/fixtures/encoding/shape-match.pdf @@ -18,70 +18,66 @@ endobj /Parent 2 0 R /MediaBox [0 0 612 792] /Resources << -/Font << -/F1 4 0 R + /Font << + /F1 4 0 R + >> >> ->> -/Contents 5 0 R +/Contents 6 0 R >> endobj 4 0 obj << /Type /Font -/Subtype /TrueType -/BaseFont /ABCDEF+Helvetica -/FontDescriptor 6 0 R +/Subtype /Type3 +/FontBBox [0 0 100 100] +/FontMatrix [0.001 0 0 0.001 0 0] +/CharProcs << + /S 5 0 R +>> +/Encoding << + /Type /Encoding + /Differences [83 /S] +>> >> endobj 5 0 obj << -/Length 42 +/Length 19 +>> +stream +50 0 0 50 0 0 cm +S + +endstream +endobj +6 0 obj +<< +/Length 35 >> stream BT /F1 12 Tf 100 700 Td -(Shape) Tj +(/S) Tj ET -endstream -endobj -6 0 obj -<< -/Type /FontDescriptor -/FontName /ABCDEF+Helvetica -/Flags 4 -/FontBBox [0 0 100 100] -/ItalicAngle 0 -/Ascent 100 -/Descent 0 -/CapHeight 100 -/StemV 80 -/FontFile2 7 0 R ->> -endobj -7 0 obj -<< -/Length 60 ->> -stream -Minimal TrueType font program for shape testing + endstream endobj xref -0 8 +0 7 0000000000 65535 f 0000000009 00000 n 0000000058 00000 n 0000000115 00000 n -0000000329 00000 n -0000000477 00000 n -0000000536 00000 n -0000000664 00000 n +0000000249 00000 n +0000000441 00000 n +0000000510 00000 n + trailer << -/Size 8 +/Size 7 /Root 1 0 R >> startxref -768 +595 %%EOF diff --git a/tests/fixtures/encoding/shape-match.txt b/tests/fixtures/encoding/shape-match.txt index 78cd4b5..1db515f 100644 --- a/tests/fixtures/encoding/shape-match.txt +++ b/tests/fixtures/encoding/shape-match.txt @@ -1 +1 @@ -Shape \ No newline at end of file +S \ No newline at end of file diff --git a/tests/fixtures/profiles/PROVENANCE.md b/tests/fixtures/profiles/PROVENANCE.md index c765b87..21c1cbc 100644 --- a/tests/fixtures/profiles/PROVENANCE.md +++ b/tests/fixtures/profiles/PROVENANCE.md @@ -305,3 +305,7 @@ bash scripts/check-provenance.sh | scanned/documents/invoice-300dpi-scanned.pdf | pdftoppm + img2pdf from invoice-300dpi.pdf | MIT-0 | 2026-06-01 | 4ff1bc0bb34c66e65cc574c60b8c706c5d32d11f0ae98b1f39c3bc94443490e0 | Scan simulation for OCR testing (rasterized image-only PDF) | | scanned/multi-page/doc-10page-300dpi.pdf | tests/fixtures/scanned/generate_scanned_fixtures.py | MIT-0 | 2026-06-01 | e54269ac6e86b9abf966a601c94c7ecd40da8fcc541873c37ec7608392de380f | Source PDF for scan simulation at 300 DPI (10 pages with diverse content) | | scanned/multi-page/doc-10page-300dpi-scanned.pdf | pdftoppm + img2pdf from doc-10page-300dpi.pdf | MIT-0 | 2026-06-01 | 02c2751cd0e26b49f9cf538f9bbb407bbf4aea587d61a896d0e7e4d3f687ecd8 | Scan simulation for OCR testing (rasterized image-only PDF, 10 pages) | +| encoding/no-mapping.pdf | tests/fixtures/generate_encoding_fixtures.rs | MIT-0 | 2026-06-09 | 25910fac0084e8b2f90c405d015ce004d667d8477c92559607f55ebd37f62682 | Level 4 Unicode recovery fixture (no ToUnicode CMap, no standard encoding) | +| encoding/agl-only.pdf | tests/fixtures/generate_encoding_fixtures.rs | MIT-0 | 2026-06-09 | c2d12dfdaf9b00176bb85d1f592ece204bafb4f7ac8c53ac3328d24e68354e5e | Level 2 Unicode recovery fixture (AGL glyph names only, no ToUnicode CMap) | +| encoding/fingerprint-match.pdf | tests/fixtures/generate_encoding_fixtures.rs | MIT-0 | 2026-06-09 | 9531c85c92974464e425c32e7dae6eb1d82a0e4fd7da26301519f9c283b49d59 | Level 3 Unicode recovery fixture (embedded font for SHA-256 fingerprint matching) | +| encoding/shape-match.pdf | tests/fixtures/generate_encoding_fixtures.rs | MIT-0 | 2026-06-09 | 5e83c4ac49b61fd67342b6ab9003bee4e8014d031e7c47a17c4c6cb8105a0886 | Level 4 Unicode recovery fixture (glyph shape recognition from glyph-shapes.json) |