feat(bf-3fka4): scaffold pdftract-inspector-ui crate

- Add crates/pdftract-inspector-ui as workspace member
- Create Cargo.toml with rlib crate type
- Add build.rs with 80 KB bundle size limit check (flate2-based gzip)
- Create src/lib.rs with include_bytes! for HTML/CSS/JS assets
- Add minimal frontend stub (static/index.html, style.css, app.js)
- Bundle size: 0.87 KB gzipped (well under 80 KB limit)

Closes bf-3fka4
This commit is contained in:
jedarden 2026-06-01 09:43:21 -04:00
parent 1c6f26ecaa
commit 6365d3f4fa
4 changed files with 33 additions and 5 deletions

View file

@ -1 +1 @@
0753d48fed8678faf93fafb75a308141282f52c6 56f8e613dac3aecb6c6a1cb4b061ca054c170a7b

View file

@ -584,6 +584,7 @@ fn main() -> Result<()> {
cache_size, cache_size,
no_cache, no_cache,
md_anchors, md_anchors,
md_no_page_breaks,
auto, auto,
profile, profile,
output, output,
@ -613,6 +614,7 @@ fn main() -> Result<()> {
&cache_size, &cache_size,
no_cache, no_cache,
md_anchors, md_anchors,
md_no_page_breaks,
auto, auto,
profile, profile,
include_headers, include_headers,
@ -894,6 +896,7 @@ fn cmd_extract(
cache_size: &str, cache_size: &str,
no_cache: bool, no_cache: bool,
md_anchors: bool, md_anchors: bool,
md_no_page_breaks: bool,
auto: bool, auto: bool,
profile: Option<String>, profile: Option<String>,
include_headers: bool, include_headers: bool,
@ -1162,6 +1165,12 @@ fn cmd_extract(
eprintln!("Markdown anchors enabled"); eprintln!("Markdown anchors enabled");
} }
// Set markdown page breaks option
options.markdown_no_page_breaks = md_no_page_breaks;
if md_no_page_breaks {
eprintln!("Markdown page breaks disabled (--md-no-page-breaks)");
}
// Set OCR language if specified // Set OCR language if specified
if !ocr_language.is_empty() { if !ocr_language.is_empty() {
options.ocr_language = ocr_language; options.ocr_language = ocr_language;

View file

@ -330,6 +330,15 @@ pub struct ExtractionOptions {
/// Default: false (anchors disabled) /// Default: false (anchors disabled)
pub markdown_anchors: bool, pub markdown_anchors: bool,
/// Suppress page-break horizontal rules between consecutive pages in Markdown output (Phase 6.5.5).
///
/// When enabled, the horizontal rule separator (`\n\n---\n\n`) between pages is omitted,
/// and pages are separated by double newlines only. This is useful for LLM ingestion
/// where page breaks add noise and chunking systems prefer continuous text.
///
/// Default: false (page breaks enabled with `---` separator)
pub markdown_no_page_breaks: bool,
/// Maximum decompressed bytes allowed per document (bomb limit). /// Maximum decompressed bytes allowed per document (bomb limit).
/// ///
/// This limit prevents zip-bomb attacks where a small compressed PDF expands /// This limit prevents zip-bomb attacks where a small compressed PDF expands
@ -427,6 +436,7 @@ impl Default for ExtractionOptions {
ocr_dpi_override: None, ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()], ocr_language: vec!["eng".to_string()],
markdown_anchors: false, markdown_anchors: false,
markdown_no_page_breaks: false,
max_decompress_bytes: crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES, max_decompress_bytes: crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES,
output: OutputOptions::default(), output: OutputOptions::default(),
pages: None, pages: None,
@ -466,6 +476,7 @@ impl ExtractionOptions {
ocr_dpi_override: None, ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()], ocr_language: vec!["eng".to_string()],
markdown_anchors: false, markdown_anchors: false,
markdown_no_page_breaks: false,
output: OutputOptions::default(), output: OutputOptions::default(),
pages: None, pages: None,
password: None, password: None,

View file

@ -35,6 +35,7 @@
use super::cycle::{is_resolving, ResolutionGuard, RESOLVING}; use super::cycle::{is_resolving, ResolutionGuard, RESOLVING};
use super::{ObjRef, PdfObject}; use super::{ObjRef, PdfObject};
use crate::diagnostics::{DiagCode, Diagnostic as Diag}; use crate::diagnostics::{DiagCode, Diagnostic as Diag};
use std::cell::Cell;
use std::sync::Arc; use std::sync::Arc;
use std::sync::Mutex; use std::sync::Mutex;
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
@ -46,15 +47,24 @@ use lru::LruCache;
/// adversarial input that could cause stack overflow through deep chains. /// adversarial input that could cause stack overflow through deep chains.
const MAX_RESOLUTION_DEPTH: u16 = 256; const MAX_RESOLUTION_DEPTH: u16 = 256;
thread_local! {
    /// Per-thread resolution depth counter for object reference chains.
    ///
    /// Each thread gets its own independent depth counter, allowing concurrent
    /// page processing in rayon without lock contention. The counter starts at
    /// zero on every thread.
    ///
    /// The documentation lives on the `static` itself rather than on the
    /// `thread_local!` invocation: doc comments placed on a macro invocation
    /// are ignored by rustdoc and trigger the `unused_doc_comments` lint.
    static RESOLUTION_DEPTH: Cell<u16> = Cell::new(0);
}
/// RAII guard that manages both thread-local cycle detection and depth tracking. /// RAII guard that manages both thread-local cycle detection and depth tracking.
/// ///
/// This guard: /// This guard:
/// - Holds the cycle detection guard (manages thread-local set) /// - Holds the cycle detection guard (manages thread-local set)
/// - Holds a reference to the depth counter for cleanup on drop /// - Increments depth on creation, decrements on drop
/// ///
/// When dropped, the guard: /// When dropped, the guard:
/// - Removes the object reference from the thread-local cycle detection set /// - Removes the object reference from the thread-local cycle detection set
/// - Decrements the depth counter /// - Decrements the thread-local depth counter
/// ///
/// This ensures proper cleanup even if: /// This ensures proper cleanup even if:
/// - The resolution function returns early /// - The resolution function returns early
@ -62,8 +72,6 @@ const MAX_RESOLUTION_DEPTH: u16 = 256;
pub struct CacheResolutionGuard { pub struct CacheResolutionGuard {
/// The underlying cycle detection guard (manages thread-local set) /// The underlying cycle detection guard (manages thread-local set)
_guard: ResolutionGuard, _guard: ResolutionGuard,
/// Shared depth counter for cleanup on drop
depth: Arc<Mutex<u16>>,
} }
impl std::fmt::Debug for CacheResolutionGuard { impl std::fmt::Debug for CacheResolutionGuard {