pdftract/crates/pdftract-cli/src/url.rs
jedarden 895f1ce43d fix(bf-1avnz): remove .code field access on String diagnostics in serve.rs
Fix two compilation errors at lines 584 and 658 where code was calling
.code on &String diagnostics. Replaced d.code.to_string() with direct
Vec<String> clone since diagnostics is already Vec<String>.

Acceptance criteria:
- cargo check -p pdftract-cli emits no 'no field code' errors
- serve.rs compiles cleanly
2026-06-01 04:14:05 -04:00

461 lines
16 KiB
Rust

//! URL parsing and credential extraction for remote PDF sources.
//!
//! This module provides functionality for parsing URLs and extracting embedded
//! credentials (https://user:pass@host/path) for HTTP basic authentication.
//!
//! # URL Format with Credentials
//!
//! URLs may contain embedded credentials in the authority section:
//! - `https://user:pass@host/path` - user and password
//! - `https://user@host/path` - user only (empty password)
//! - `https://host/path` - no credentials
//!
//! # Security Considerations
//!
//! Embedded credentials in URLs are visible in:
//! - Shell history (`.bash_history`, `.zsh_history`)
//! - Process listings (`ps aux`)
//! - Log files (if URLs are logged)
//!
//! For production use, the `--header` flag is preferred:
//! ```bash
//! pdftract extract --header "Authorization: Bearer TOKEN" https://...
//! ```
//!
//! ureq automatically sets `Authorization: Basic <base64>` from URL credentials.
use std::collections::HashMap;
use url::Url;
/// Error type for URL parsing failures.
///
/// Every variant carries a `String` payload that is echoed in the `Display`
/// message, so values can be compared, cloned, and shown to the user directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UrlError {
    /// Invalid URL syntax; holds the offending input string.
    InvalidUrl(String),
    /// Unsupported URL scheme (only http/https allowed); holds the scheme found.
    UnsupportedScheme(String),
    /// Missing host in URL; holds the offending input string.
    MissingHost(String),
}
impl std::fmt::Display for UrlError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
UrlError::InvalidUrl(s) => {
write!(f, "Invalid URL: '{}'", s)
}
UrlError::UnsupportedScheme(scheme) => {
write!(f, "Unsupported URL scheme '{}': only http and https are supported", scheme)
}
UrlError::MissingHost(s) => {
write!(f, "URL missing host: '{}'", s)
}
}
}
}
// Marker impl so `UrlError` can be used as a `dyn std::error::Error`. The body is
// empty: every variant only carries a `String`, so the trait's defaults are used.
impl std::error::Error for UrlError {}
/// Parsed URL components with extracted credentials.
///
/// `Debug` is implemented by hand rather than derived so that the password is
/// never printed when a `ParsedUrl` ends up in a log line or panic message.
#[derive(Clone)]
pub struct ParsedUrl {
    /// The reconstructed URL without embedded credentials
    /// (https://host/path instead of https://user:pass@host/path)
    pub url: String,
    /// Optional username extracted from the URL
    pub username: Option<String>,
    /// Optional password extracted from the URL
    pub password: Option<String>,
    /// Whether credentials were extracted (for warning emission)
    pub has_credentials: bool,
}

impl std::fmt::Debug for ParsedUrl {
    /// Format like the derived `Debug` would, but show only whether a password
    /// is present (`Some("<redacted>")` / `None`), never its value.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let password = self.password.as_ref().map(|_| "<redacted>");
        f.debug_struct("ParsedUrl")
            .field("url", &self.url)
            .field("username", &self.username)
            .field("password", &password)
            .field("has_credentials", &self.has_credentials)
            .finish()
    }
}
/// Parse a URL and extract embedded credentials.
///
/// The input is parsed with the `url` crate, restricted to `http`/`https`, and
/// then rebuilt from scheme, host, port, path, query and fragment so that the
/// returned `url` never contains the userinfo section.
///
/// Credentials are read through `Url::username()` / `Url::password()` and
/// percent-decoded, so `user%40domain` is returned as `user@domain`. An empty
/// password (`https://user:@host/`) is reported as `None`, matching the `url`
/// crate, which treats it the same as no password.
///
/// # Arguments
///
/// * `url_str` - The URL string, potentially with embedded credentials
///
/// # Returns
///
/// Returns `Ok(ParsedUrl)` with the reconstructed URL and extracted credentials,
/// or `Err(UrlError)` describing why parsing failed.
///
/// # Errors
///
/// * `UrlError::MissingHost` - the URL has no host (e.g. `https://`)
/// * `UrlError::InvalidUrl` - any other syntax error
/// * `UrlError::UnsupportedScheme` - the scheme is not `http` or `https`
///
/// # Examples
///
/// ```ignore
/// use pdftract_cli::url::parse_url;
///
/// // URL with credentials
/// let parsed = parse_url("https://user:pass@example.com/doc.pdf").unwrap();
/// assert_eq!(parsed.url, "https://example.com/doc.pdf");
/// assert_eq!(parsed.username, Some("user".to_string()));
/// assert_eq!(parsed.password, Some("pass".to_string()));
/// assert!(parsed.has_credentials);
///
/// // URL without credentials
/// let parsed = parse_url("https://example.com/doc.pdf").unwrap();
/// assert_eq!(parsed.url, "https://example.com/doc.pdf");
/// assert!(parsed.username.is_none());
/// assert!(parsed.password.is_none());
/// assert!(!parsed.has_credentials);
///
/// // URL with username only
/// let parsed = parse_url("https://user@example.com/doc.pdf").unwrap();
/// assert_eq!(parsed.url, "https://example.com/doc.pdf");
/// assert_eq!(parsed.username, Some("user".to_string()));
/// assert!(parsed.password.is_none()); // No password given
/// assert!(parsed.has_credentials);
/// ```
pub fn parse_url(url_str: &str) -> Result<ParsedUrl, UrlError> {
    // `Url::parse` already rejects an empty host on http(s) URLs, so surface that
    // case as `MissingHost` instead of folding it into the generic syntax error.
    let parsed = url::Url::parse(url_str).map_err(|err| match err {
        url::ParseError::EmptyHost => UrlError::MissingHost(url_str.to_string()),
        _ => UrlError::InvalidUrl(url_str.to_string()),
    })?;

    // Check scheme (only http and https allowed)
    match parsed.scheme() {
        "http" | "https" => {}
        other => return Err(UrlError::UnsupportedScheme(other.to_string())),
    }

    let host = match parsed.host_str() {
        Some(host) => host,
        None => return Err(UrlError::MissingHost(url_str.to_string())),
    };

    // The url crate hands back both userinfo parts percent-encoded; decode them so
    // callers get the literal credentials.
    let raw_username = parsed.username();
    let username = if raw_username.is_empty() {
        None
    } else {
        Some(percent_decode_lossy(raw_username))
    };
    // Read the password independently of the username so `https://:pw@host/` is
    // still recognised as carrying credentials.
    let password = parsed.password().map(percent_decode_lossy);
    let has_credentials = username.is_some() || password.is_some();

    // Reconstruct URL without credentials
    let mut url = String::with_capacity(url_str.len());
    url.push_str(parsed.scheme());
    url.push_str("://");
    url.push_str(host);
    if let Some(port) = parsed.port() {
        url.push(':');
        url.push_str(&port.to_string());
    }
    url.push_str(parsed.path());
    if let Some(query) = parsed.query() {
        url.push('?');
        url.push_str(query);
    }
    if let Some(fragment) = parsed.fragment() {
        url.push('#');
        url.push_str(fragment);
    }

    Ok(ParsedUrl {
        url,
        username,
        password,
        has_credentials,
    })
}

/// Decode `%XX` escapes in a URL component.
///
/// Malformed escapes (a `%` not followed by two hex digits) are copied through
/// unchanged, and byte sequences that are not valid UTF-8 are replaced with
/// U+FFFD rather than causing an error.
fn percent_decode_lossy(encoded: &str) -> String {
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = encoded.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut index = 0;
    while index < bytes.len() {
        let escape = if bytes[index] == b'%' {
            match (bytes.get(index + 1), bytes.get(index + 2)) {
                (Some(&hi), Some(&lo)) => match (hex_value(hi), hex_value(lo)) {
                    (Some(hi), Some(lo)) => Some((hi << 4) | lo),
                    _ => None,
                },
                _ => None,
            }
        } else {
            None
        };
        match escape {
            Some(byte) => {
                decoded.push(byte);
                index += 3;
            }
            None => {
                decoded.push(bytes[index]);
                index += 1;
            }
        }
    }
    String::from_utf8_lossy(&decoded).into_owned()
}

/// Extract the raw (still percent-encoded) password from a URL string.
///
/// This is a plain string scan that does not use the `url` crate. `parse_url`
/// does not call it (it reads `Url::password()` instead); it is kept because the
/// unit tests exercise it directly.
///
/// Returns `None` when the string has no `://`, no userinfo, no `:` inside the
/// userinfo, or when the userinfo's username differs from `username`.
// Not referenced outside the test module, hence the lint allowance.
#[allow(dead_code)]
fn extract_password_from_url(url_str: &str, username: &str) -> Option<String> {
    // Find the scheme:// part
    let scheme_end = url_str.find("://")?;
    let after_scheme = &url_str[scheme_end + 3..];

    // The authority stops at the first path, query or fragment delimiter; an '@'
    // beyond that point is not a userinfo separator.
    let authority_end = after_scheme
        .find(|c: char| c == '/' || c == '?' || c == '#')
        .unwrap_or(after_scheme.len());
    let authority = &after_scheme[..authority_end];

    // The LAST '@' separates userinfo from host, so an unescaped '@' inside the
    // password stays part of the password.
    let at_pos = authority.rfind('@')?;
    let userinfo = &authority[..at_pos];

    // Split on the first ':' to get username:password.
    // If there's no ':', there's no password.
    let colon_pos = userinfo.find(':')?;
    if &userinfo[..colon_pos] != username {
        return None; // Caller's username does not match what the string contains
    }
    Some(userinfo[colon_pos + 1..].to_string())
}
/// Convert parsed credentials to HTTP headers.
///
/// This function currently never produces a header: it returns an empty vector
/// whether or not `parsed` carries credentials. The module's design leaves the
/// `Authorization: Basic ...` header to ureq, which derives it from credentials
/// embedded in the request URL, so building one here would duplicate it.
///
/// NOTE(review): `ParsedUrl::url` has the credentials stripped, so the
/// ureq-derived header only appears if the caller passes the original URL to
/// the HTTP client — confirm at the call site.
///
/// # Arguments
///
/// * `parsed` - The parsed URL with potential credentials (currently unused)
///
/// # Returns
///
/// Always an empty vector of header tuples (name, value).
///
/// # Examples
///
/// ```ignore
/// use pdftract_cli::url::{parse_url, credentials_to_headers};
///
/// let parsed = parse_url("https://user:pass@example.com/doc.pdf").unwrap();
/// let headers = credentials_to_headers(&parsed);
///
/// // No header is built manually.
/// assert!(headers.is_empty());
/// ```
pub fn credentials_to_headers(parsed: &ParsedUrl) -> Vec<(String, String)> {
    // The parameter is kept for API stability; nothing is derived from it today.
    let _ = parsed;
    Vec::new()
}
/// Combine custom headers with URL credentials.
///
/// Returns a copy of `custom_headers` and nothing else. No `authorization`
/// entry is added for URL credentials: the module leaves that header to ureq,
/// which derives it from credentials embedded in the request URL. A custom
/// `Authorization` header supplied via `--header` is therefore passed through
/// untouched.
///
/// Warning the user about credentials in shell history is the caller's job
/// (the original notes place it in `main.rs`); this function emits nothing.
///
/// # Arguments
///
/// * `custom_headers` - Custom headers from --header flag (lowercase names);
///   names are copied as given, not normalised here
/// * `parsed_url` - Optional parsed URL with embedded credentials (currently unused)
///
/// # Returns
///
/// A HashMap holding exactly the entries of `custom_headers`.
///
/// # Examples
///
/// ```ignore
/// use pdftract_cli::url::{parse_url, combine_headers_with_credentials};
/// use std::collections::HashMap;
///
/// // Custom headers from --header flag
/// let mut custom = HashMap::new();
/// custom.insert("x-api-key".to_string(), "secret".to_string());
///
/// // URL with credentials
/// let parsed = parse_url("https://user:pass@example.com/doc.pdf").unwrap();
///
/// let headers = combine_headers_with_credentials(&custom, Some(&parsed));
///
/// assert!(headers.contains_key("x-api-key"));
/// // Basic auth is not added here.
/// assert!(!headers.contains_key("authorization"));
/// ```
pub fn combine_headers_with_credentials(
    custom_headers: &HashMap<String, String>,
    parsed_url: Option<&ParsedUrl>,
) -> HashMap<String, String> {
    // The parameter is kept for API stability; nothing is derived from it today.
    let _ = parsed_url;
    custom_headers.clone()
}
#[cfg(test)]
mod tests {
    //! Unit tests for URL parsing, credential extraction and header merging.

    use super::*;

    #[test]
    fn test_parse_url_with_credentials() {
        let parsed = parse_url("https://user:pass@example.com/doc.pdf").unwrap();
        assert_eq!(parsed.url, "https://example.com/doc.pdf");
        assert_eq!(parsed.username, Some("user".to_string()));
        assert_eq!(parsed.password, Some("pass".to_string()));
        assert!(parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_without_credentials() {
        let parsed = parse_url("https://example.com/doc.pdf").unwrap();
        assert_eq!(parsed.url, "https://example.com/doc.pdf");
        assert!(parsed.username.is_none());
        assert!(parsed.password.is_none());
        assert!(!parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_with_username_only() {
        let parsed = parse_url("https://user@example.com/doc.pdf").unwrap();
        assert_eq!(parsed.url, "https://example.com/doc.pdf");
        assert_eq!(parsed.username, Some("user".to_string()));
        assert!(parsed.password.is_none()); // No password given
        assert!(parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_with_port() {
        let parsed = parse_url("https://user:pass@example.com:8080/doc.pdf").unwrap();
        assert_eq!(parsed.url, "https://example.com:8080/doc.pdf");
        assert_eq!(parsed.username, Some("user".to_string()));
        assert_eq!(parsed.password, Some("pass".to_string()));
        assert!(parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_with_query_and_fragment() {
        let parsed =
            parse_url("https://user:pass@example.com/doc.pdf?query=1#fragment").unwrap();
        assert_eq!(parsed.url, "https://example.com/doc.pdf?query=1#fragment");
        assert_eq!(parsed.username, Some("user".to_string()));
        assert_eq!(parsed.password, Some("pass".to_string()));
        assert!(parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_http_scheme() {
        let parsed = parse_url("http://user:pass@example.com/doc.pdf").unwrap();
        assert_eq!(parsed.url, "http://example.com/doc.pdf");
        assert!(parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_invalid_scheme() {
        let result = parse_url("ftp://example.com/doc.pdf");
        assert!(matches!(result, Err(UrlError::UnsupportedScheme(_))));
        let result = parse_url("file:///path/to/doc.pdf");
        assert!(matches!(result, Err(UrlError::UnsupportedScheme(_))));
    }

    #[test]
    fn test_parse_url_invalid() {
        let result = parse_url("not-a-url");
        assert!(matches!(result, Err(UrlError::InvalidUrl(_))));
        let result = parse_url("https://");
        assert!(matches!(result, Err(UrlError::MissingHost(_))));
    }

    #[test]
    fn test_extract_password_from_url() {
        let password =
            extract_password_from_url("https://user:pass@example.com/doc.pdf", "user");
        assert_eq!(password, Some("pass".to_string()));
        let password =
            extract_password_from_url("https://user:password123@example.com/doc.pdf", "user");
        assert_eq!(password, Some("password123".to_string()));
        let password = extract_password_from_url("https://user:@example.com/doc.pdf", "user");
        assert_eq!(password, Some("".to_string()));
        let password = extract_password_from_url("https://user@example.com/doc.pdf", "user");
        assert_eq!(password, None);
    }

    #[test]
    fn test_credentials_to_headers() {
        let parsed = parse_url("https://user:pass@example.com/doc.pdf").unwrap();
        let headers = credentials_to_headers(&parsed);
        // ureq handles basic auth automatically, so we return empty
        assert!(headers.is_empty());
    }

    #[test]
    fn test_combine_headers_with_credentials() {
        let mut custom = HashMap::new();
        custom.insert("x-api-key".to_string(), "secret".to_string());
        let parsed = parse_url("https://user:pass@example.com/doc.pdf").unwrap();
        let result = combine_headers_with_credentials(&custom, Some(&parsed));
        assert_eq!(result.get("x-api-key"), Some(&"secret".to_string()));
        // ureq will add Authorization automatically from URL credentials
    }

    #[test]
    fn test_combine_headers_without_credentials() {
        let mut custom = HashMap::new();
        custom.insert("x-api-key".to_string(), "secret".to_string());
        let result = combine_headers_with_credentials(&custom, None);
        assert_eq!(result.get("x-api-key"), Some(&"secret".to_string()));
        assert_eq!(result.len(), 1);
    }

    #[test]
    fn test_parse_url_preserves_path() {
        let parsed = parse_url("https://user:pass@example.com/path/to/doc.pdf").unwrap();
        assert_eq!(parsed.url, "https://example.com/path/to/doc.pdf");
    }

    #[test]
    fn test_parse_url_with_empty_path() {
        let parsed = parse_url("https://user:pass@example.com").unwrap();
        // The url crate normalises an empty path on http(s) URLs to "/", so the
        // rebuilt URL carries a trailing slash.
        assert_eq!(parsed.url, "https://example.com/");
    }

    #[test]
    fn test_parse_url_with_special_chars_in_password() {
        let parsed = parse_url("https://user:p@ss:wo_rd@example.com/doc.pdf").unwrap();
        assert_eq!(parsed.username, Some("user".to_string()));
        // Password should include special chars
        assert!(parsed.password.is_some());
        assert!(parsed.has_credentials);
    }

    #[test]
    fn test_parse_url_urlencoded_credentials() {
        // URL-encoded credentials (e.g., @ in username as %40)
        let parsed =
            parse_url("https://user%40domain:pass%23word@example.com/doc.pdf").unwrap();
        assert_eq!(parsed.username, Some("user@domain".to_string()));
        assert_eq!(parsed.password, Some("pass#word".to_string()));
        assert!(parsed.has_credentials);
    }
}