pdftract/crates/pdftract-cli/src/hash.rs
jedarden 1c6f26ecaa fix(bf-4mkhv): clean up unused imports in hash.rs
The bead description mentioned compile errors in hash.rs from API drift,
but those errors were either already fixed or misattributed. The API usage
was already correct:
- compute_fingerprint already takes 3 arguments with source
- len() already propagates Result with ?
- read_at method already used correctly
- Catalog fields accessed via trailer correctly

Only cleanup: removed unused std::fs::File and std::io imports.

Verification: notes/bf-4mkhv.md
2026-06-01 09:43:48 -04:00

338 lines
11 KiB
Rust

//! PDF structural fingerprint (hash) subcommand.
//!
//! Implements the `pdftract hash` command that computes the PDF fingerprint
//! and outputs it to stdout with appropriate exit codes.
use anyhow::{anyhow, Context, Result};
use pdftract_core::fingerprint::{compute_fingerprint, CatalogFlags, ContentStreamData, FingerprintInput, PageFingerprintData};
use pdftract_core::parser::catalog::parse_catalog;
use pdftract_core::parser::pages::{flatten_page_tree, PageDict};
use pdftract_core::parser::stream::{FileSource, PdfSource};
use pdftract_core::parser::xref::{load_xref_with_prev_chain, XrefResolver};
use std::path::Path;
/// Exit codes for the hash subcommand.
///
/// `EXIT_SUCCESS` is exit code 0. It is not referenced by the non-test code in
/// this file — presumably the caller uses it when `run_hash` returns `Ok`.
pub const EXIT_SUCCESS: i32 = 0;
/// Fallback returned by `map_error_to_exit_code` when the error text matches
/// none of the more specific categories below.
pub const EXIT_CORRUPT: i32 = 2;
/// Returned by `map_error_to_exit_code` when the error text mentions
/// encryption, a password, or decryption.
pub const EXIT_ENCRYPTED: i32 = 3;
/// Returned by `map_error_to_exit_code` for missing files, permission-denied
/// errors, and DNS / hostname resolution failures.
pub const EXIT_NOT_FOUND: i32 = 4;
/// Returned by `map_error_to_exit_code` when the error text mentions a
/// network, timeout, or connection problem.
pub const EXIT_NETWORK_FAILURE: i32 = 5;
/// Returned by `map_error_to_exit_code` when the error text mentions TLS,
/// a certificate, or a handshake.
pub const EXIT_TLS_FAILURE: i32 = 6;
/// Arguments for the hash subcommand.
///
/// Consumed by value by `run_hash`.
pub struct HashArgs {
/// Input path or URL. `run_hash` passes this through `is_url` to decide
/// between the remote and the local-file code path.
pub input: String,
/// Optional password. `run_hash` forwards it only on the local-file path,
/// where the receiving parameter (`_password`) is currently unused.
pub password: Option<String>,
/// Custom HTTP headers (for remote sources); only read on the URL path.
/// Presumably `(name, value)` pairs — confirm against `HttpRangeSource`.
pub headers: Vec<(String, String)>,
}
/// Map an error to the appropriate exit code.
///
/// Classification is a case-insensitive keyword search over the rendered
/// error text. The *whole* context chain is rendered (`{:#}`), not only the
/// outermost message: `run_hash` wraps every failure in a generic
/// "Failed to compute fingerprint from ..." context, so the keywords that
/// identify the real cause only appear further down the chain.
///
/// Rules are evaluated in order and the first match wins:
///
/// 1. encryption / password / decrypt          -> `EXIT_ENCRYPTED`
/// 2. tls / certificate / handshake            -> `EXIT_TLS_FAILURE`
/// 3. network / timeout / connection           -> `EXIT_NETWORK_FAILURE`
/// 4. dns / hostname / resolution              -> `EXIT_NOT_FOUND`
/// 5. not found / no such file                 -> `EXIT_NOT_FOUND`
/// 6. permission denied                        -> `EXIT_NOT_FOUND`
///
/// Anything else is reported as `EXIT_CORRUPT`.
pub fn map_error_to_exit_code(err: &anyhow::Error) -> i32 {
    // Ordered (keywords, exit code) rules. TLS is checked before the generic
    // network rule and before "permission denied", so a TLS failure can never
    // be reported as a plain network or file-permission problem.
    const RULES: &[(&[&str], i32)] = &[
        (&["encryption", "password", "decrypt"], EXIT_ENCRYPTED),
        (&["tls", "certificate", "handshake"], EXIT_TLS_FAILURE),
        (&["network", "timeout", "connection"], EXIT_NETWORK_FAILURE),
        (&["dns", "hostname", "resolution"], EXIT_NOT_FOUND),
        (&["not found", "no such file"], EXIT_NOT_FOUND),
        (&["permission denied"], EXIT_NOT_FOUND),
    ];

    // `{:#}` renders "outer: inner: root cause"; plain `{}` would only show
    // the outermost context and hide the message that carries the keywords.
    let message = format!("{:#}", err).to_lowercase();

    for (keywords, exit_code) in RULES {
        if keywords.iter().any(|keyword| message.contains(*keyword)) {
            return *exit_code;
        }
    }

    // Default to corrupt for unrecognised errors
    EXIT_CORRUPT
}
/// Check if a string is a URL (`http://` or `https://`).
///
/// The scheme is compared ASCII case-insensitively, because URI schemes are
/// case-insensitive: `HTTP://host/doc.pdf` is a URL, not a local path.
/// Only the scheme prefix is inspected; the rest of the string is not
/// validated.
fn is_url(s: &str) -> bool {
    ["http://", "https://"].iter().any(|scheme| {
        // `get` returns `None` when `s` is too short or the cut would land
        // inside a multi-byte character, so this never panics.
        s.get(..scheme.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(scheme))
    })
}
/// Compute the fingerprint for a PDF from a local file.
///
/// Opens `path` as a `FileSource` and delegates to
/// `compute_fingerprint_from_source`.
///
/// `_password` is accepted so callers can already pass it through, but it is
/// not used: nothing in this pipeline performs decryption.
///
/// # Errors
///
/// Fails if the file cannot be opened or if any step of the shared
/// fingerprint pipeline fails.
fn compute_fingerprint_from_file(
    path: &Path,
    _password: Option<&str>,
) -> Result<String> {
    let source = FileSource::open(path).context("Failed to open PDF file")?;
    compute_fingerprint_from_source(&source)
}

/// Compute the fingerprint for a PDF from a remote URL.
///
/// Opens `url` as an `HttpRangeSource`, sending `headers` with it, and
/// delegates to `compute_fingerprint_from_source`.
///
/// # Errors
///
/// Fails if the remote source cannot be opened or if any step of the shared
/// fingerprint pipeline fails.
#[cfg(feature = "remote")]
fn compute_fingerprint_from_url(
    url: &str,
    headers: &[(String, String)],
) -> Result<String> {
    use pdftract_core::source::HttpRangeSource;

    let source = HttpRangeSource::with_headers(url, headers.to_vec())
        .context("Failed to open remote PDF")?;
    compute_fingerprint_from_source(&source)
}

/// Shared fingerprint pipeline for any already-opened PDF source.
///
/// Steps: locate `startxref`, load the xref chain, resolve `/Root` from the
/// trailer, parse the catalog, flatten the page tree, then hash the result.
/// Generic over the concrete source type so local and remote inputs go
/// through exactly the same code.
///
/// # Errors
///
/// Fails if `startxref` cannot be found, the trailer has no `/Root`
/// reference, or catalog / page-tree parsing reports diagnostics (only the
/// first diagnostic message is surfaced).
fn compute_fingerprint_from_source<S: PdfSource>(source: &S) -> Result<String> {
    // Find the startxref offset
    let startxref_offset = find_startxref(source).context("Failed to find startxref offset")?;

    // Load the xref table
    let xref_section = load_xref_with_prev_chain(source, startxref_offset);

    // The resolver takes ownership of a section, while the trailer and the
    // fingerprint input below still need the original, hence the clone.
    let resolver = XrefResolver::from_section(xref_section.clone());

    // Get the root reference from trailer
    let root_ref = xref_section
        .trailer
        .as_ref()
        .and_then(|trailer| trailer.get("Root"))
        .and_then(|obj| obj.as_ref())
        .ok_or_else(|| anyhow::anyhow!("No /Root reference in trailer"))?;

    // Parse the catalog
    let catalog = parse_catalog(&resolver, root_ref, Some(source as &dyn PdfSource))
        .map_err(|diagnostics| {
            let msg = diagnostics
                .first()
                .map(|d| d.message.as_ref())
                .unwrap_or("unknown error");
            anyhow::anyhow!("Failed to parse catalog: {}", msg)
        })?;

    // Flatten the page tree
    let pages = flatten_page_tree(&resolver, catalog.pages_ref).map_err(|diagnostics| {
        let msg = diagnostics
            .first()
            .map(|d| d.message.as_ref())
            .unwrap_or("unknown error");
        anyhow::anyhow!("Failed to flatten page tree: {}", msg)
    })?;

    // Build fingerprint input and hash it
    let fingerprint_input = build_fingerprint_input(&catalog, &pages, &xref_section);
    let fingerprint =
        compute_fingerprint(&fingerprint_input, &resolver, Some(source as &dyn PdfSource));
    Ok(fingerprint)
}
/// Find the startxref offset in a PDF source.
///
/// Reads at most the last 1024 bytes of `source`, locates the *last*
/// occurrence of the `startxref` keyword in that tail, and parses the
/// integer token that follows it.
///
/// The token is delimited like a PDF lexer would: leading PDF white-space is
/// skipped and the token ends at the next white-space byte or at `%` (start
/// of the `%%EOF` comment). This accepts the canonical
/// `startxref\n123\n%%EOF` layout as well as sloppier writers that emit
/// `startxref 123 %%EOF` or `123%%EOF`.
///
/// # Errors
///
/// Fails if the length or tail cannot be read, the keyword is absent from
/// the tail, or the token after it is not a valid UTF-8 unsigned integer.
fn find_startxref(source: &dyn PdfSource) -> Result<u64> {
    const KEYWORD: &[u8] = b"startxref";
    const TAIL_SCAN_BYTES: u64 = 1024;

    // PDF white-space characters: NUL, TAB, LF, FF, CR, SPACE.
    let is_pdf_whitespace = |b: u8| matches!(b, 0x00 | b'\t' | b'\n' | 0x0C | b'\r' | b' ');

    let len = source.len()?;
    // Bounded by 1024, so the narrowing cast cannot truncate.
    let scan_size = TAIL_SCAN_BYTES.min(len) as usize;
    let scan_start = len - scan_size as u64;
    let tail_data = source
        .read_at(scan_start, scan_size)
        .context("Failed to read PDF tail")?;

    // Search backwards: with incremental updates the last keyword wins.
    let keyword_pos = tail_data
        .windows(KEYWORD.len())
        .rposition(|window| window == KEYWORD)
        .ok_or_else(|| anyhow!("startxref not found in PDF"))?;
    let after_keyword = &tail_data[keyword_pos + KEYWORD.len()..];

    // Skip the white-space separating the keyword from the offset.
    let token_start = after_keyword
        .iter()
        .position(|&b| !is_pdf_whitespace(b))
        .unwrap_or(after_keyword.len());
    let rest = &after_keyword[token_start..];

    // The offset token runs until white-space or the `%%EOF` comment.
    let token_len = rest
        .iter()
        .position(|&b| is_pdf_whitespace(b) || b == b'%')
        .unwrap_or(rest.len());

    let offset_str = std::str::from_utf8(&rest[..token_len])
        .context("startxref offset is not valid UTF-8")?;
    // `trim` is kept so stray non-PDF white-space (e.g. vertical tab) around
    // the number is still tolerated.
    let offset: u64 = offset_str
        .trim()
        .parse()
        .context("startxref offset is not a valid number")?;
    Ok(offset)
}
/// Assemble the `FingerprintInput` from the parsed catalog, the flattened
/// page list, and the xref section (for its trailer).
///
/// Per page it records the content-stream references and the geometry
/// fields (`media_box`, `crop_box`, `rotate`); `resources` is always `None`.
/// The catalog flags are derived as follows:
///
/// * `is_encrypted`: trailer has a non-null `/Encrypt` entry.
/// * `contains_xfa`: trailer has an `/AcroForm` dictionary with a non-null
///   `/XFA` entry.
///   NOTE(review): `/AcroForm` normally lives in the document catalog, not
///   the trailer, and `as_dict` presumably does not follow indirect
///   references — confirm this lookup can ever succeed.
/// * `contains_javascript`: catalog has an open action or additional
///   actions. NOTE(review): this is a proxy; it does not inspect whether the
///   action is actually JavaScript.
/// * `ocg_present`: catalog has optional-content properties flagged present.
fn build_fingerprint_input(
    catalog: &pdftract_core::parser::catalog::Catalog,
    pages: &[PageDict],
    xref_section: &pdftract_core::parser::xref::XrefSection,
) -> FingerprintInput {
    use pdftract_core::parser::object::PdfObject;

    // Trailer-derived flags: a key only counts when its value is not `Null`.
    let encrypt_entry = xref_section
        .trailer
        .as_ref()
        .and_then(|trailer| trailer.get("Encrypt"));
    let is_encrypted = match encrypt_entry {
        Some(entry) => !matches!(entry, PdfObject::Null),
        None => false,
    };

    let xfa_entry = xref_section
        .trailer
        .as_ref()
        .and_then(|trailer| trailer.get("AcroForm"))
        .and_then(|form| form.as_dict())
        .and_then(|form_dict| form_dict.get("XFA"));
    let contains_xfa = match xfa_entry {
        Some(entry) => !matches!(entry, PdfObject::Null),
        None => false,
    };

    // Catalog-derived flags.
    let has_actions = catalog.open_action.is_some() || catalog.aa.is_some();
    let has_optional_content =
        matches!(catalog.oc_properties.as_ref(), Some(props) if props.present);

    // One fingerprint record per page; content streams stay as indirect refs.
    let to_fingerprint_page = |page: &PageDict| PageFingerprintData {
        content_streams: page
            .contents
            .iter()
            .copied()
            .map(ContentStreamData::Indirect)
            .collect(),
        resources: None,
        media_box: page.media_box,
        crop_box: page.crop_box,
        rotate: page.rotate,
    };

    FingerprintInput {
        page_count: pages.len() as u32,
        pages: pages.iter().map(to_fingerprint_page).collect(),
        struct_tree_root_ref: catalog.struct_tree_root_ref,
        is_tagged: catalog.mark_info.is_tagged,
        catalog_flags: CatalogFlags {
            is_encrypted,
            contains_javascript: has_actions,
            contains_xfa,
            ocg_present: has_optional_content,
        },
    }
}
/// Run the hash subcommand.
///
/// Computes the fingerprint of `args.input` and prints it on its own line to
/// stdout. Inputs recognised by `is_url` take the remote path, which is only
/// compiled in with the `remote` feature; everything else is treated as a
/// local file path.
///
/// # Errors
///
/// Returns the underlying failure wrapped in a "Failed to compute
/// fingerprint from ..." context, or a dedicated error when a URL is given
/// but the binary was built without the `remote` feature.
pub fn run_hash(args: HashArgs) -> Result<()> {
    let input = args.input.as_str();

    // Local file: handle it first and leave.
    if !is_url(input) {
        let fingerprint =
            compute_fingerprint_from_file(Path::new(input), args.password.as_deref())
                .context("Failed to compute fingerprint from file")?;
        println!("{}", fingerprint);
        return Ok(());
    }

    // Remote URL: exactly one of the two cfg blocks below is compiled.
    #[cfg(feature = "remote")]
    {
        let fingerprint = compute_fingerprint_from_url(input, &args.headers)
            .context("Failed to compute fingerprint from URL")?;
        println!("{}", fingerprint);
        return Ok(());
    }
    #[cfg(not(feature = "remote"))]
    {
        return Err(anyhow::anyhow!(
            "Remote sources are not supported; rebuild with --features remote"
        ));
    }
}
// Unit tests for the pure helpers in this module. Nothing here touches the
// filesystem or the network.
#[cfg(test)]
mod tests {
use super::*;
// `is_url` accepts only the http:// and https:// prefixes; bare names,
// absolute paths and other schemes (ftp) are rejected.
#[test]
fn test_is_url() {
assert!(is_url("http://example.com/file.pdf"));
assert!(is_url("https://example.com/file.pdf"));
assert!(!is_url("file.pdf"));
assert!(!is_url("/path/to/file.pdf"));
assert!(!is_url("ftp://example.com/file.pdf"));
}
// Pins the numeric exit codes so an accidental renumbering fails the build.
// Note that the value 1 is not assigned to any constant.
#[test]
fn test_exit_code_constants() {
assert_eq!(EXIT_SUCCESS, 0);
assert_eq!(EXIT_CORRUPT, 2);
assert_eq!(EXIT_ENCRYPTED, 3);
assert_eq!(EXIT_NOT_FOUND, 4);
assert_eq!(EXIT_NETWORK_FAILURE, 5);
assert_eq!(EXIT_TLS_FAILURE, 6);
}
}