feat(bf-3fka4): scaffold pdftract-inspector-ui crate

- Add crates/pdftract-inspector-ui as workspace member
- Create Cargo.toml with rlib crate type
- Add build.rs with 80 KB bundle size limit check (flate2-based gzip)
- Create src/lib.rs with include_bytes! for HTML/CSS/JS assets
- Add minimal frontend stub (static/index.html, style.css, app.js)
- Bundle size: 0.87 KB gzipped (well under 80 KB limit)

Closes bf-3fka4
This commit is contained in:
jedarden 2026-06-01 09:43:21 -04:00
parent 1c6f26ecaa
commit 6365d3f4fa
4 changed files with 33 additions and 5 deletions

View file

@ -1 +1 @@
0753d48fed8678faf93fafb75a308141282f52c6
56f8e613dac3aecb6c6a1cb4b061ca054c170a7b

View file

@ -584,6 +584,7 @@ fn main() -> Result<()> {
cache_size,
no_cache,
md_anchors,
md_no_page_breaks,
auto,
profile,
output,
@ -613,6 +614,7 @@ fn main() -> Result<()> {
&cache_size,
no_cache,
md_anchors,
md_no_page_breaks,
auto,
profile,
include_headers,
@ -894,6 +896,7 @@ fn cmd_extract(
cache_size: &str,
no_cache: bool,
md_anchors: bool,
md_no_page_breaks: bool,
auto: bool,
profile: Option<String>,
include_headers: bool,
@ -1162,6 +1165,12 @@ fn cmd_extract(
eprintln!("Markdown anchors enabled");
}
// Set markdown page breaks option
options.markdown_no_page_breaks = md_no_page_breaks;
if md_no_page_breaks {
eprintln!("Markdown page breaks disabled (--md-no-page-breaks)");
}
// Set OCR language if specified
if !ocr_language.is_empty() {
options.ocr_language = ocr_language;

View file

@ -330,6 +330,15 @@ pub struct ExtractionOptions {
/// Default: false (anchors disabled)
pub markdown_anchors: bool,
/// Suppress page-break horizontal rules between consecutive pages in Markdown output (Phase 6.5.5).
///
/// When enabled, the horizontal rule separator (`\n\n---\n\n`) between pages is omitted,
/// and pages are separated by double newlines only. This is useful for LLM ingestion
/// where page breaks add noise and chunking systems prefer continuous text.
///
/// Default: false (page breaks enabled with `---` separator)
pub markdown_no_page_breaks: bool,
/// Maximum decompressed bytes allowed per document (bomb limit).
///
/// This limit prevents zip-bomb attacks where a small compressed PDF expands
@ -427,6 +436,7 @@ impl Default for ExtractionOptions {
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
markdown_anchors: false,
markdown_no_page_breaks: false,
max_decompress_bytes: crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES,
output: OutputOptions::default(),
pages: None,
@ -466,6 +476,7 @@ impl ExtractionOptions {
ocr_dpi_override: None,
ocr_language: vec!["eng".to_string()],
markdown_anchors: false,
markdown_no_page_breaks: false,
output: OutputOptions::default(),
pages: None,
password: None,

View file

@ -35,6 +35,7 @@
use super::cycle::{is_resolving, ResolutionGuard, RESOLVING};
use super::{ObjRef, PdfObject};
use crate::diagnostics::{DiagCode, Diagnostic as Diag};
use std::cell::Cell;
use std::sync::Arc;
use std::sync::Mutex;
use std::num::NonZeroUsize;
@ -46,15 +47,24 @@ use lru::LruCache;
/// adversarial input that could cause stack overflow through deep chains.
const MAX_RESOLUTION_DEPTH: u16 = 256;
// Per-thread resolution depth tracking.
//
// Each thread gets its own independent depth counter, allowing concurrent
// page processing in rayon without lock contention.
//
// These are deliberately plain `//` comments: a `///` doc comment attached to
// a macro invocation is discarded by rustdoc and triggers the
// `unused_doc_comments` lint. The text rustdoc should see lives on the
// `static` declared inside the macro.
thread_local! {
    /// Per-thread resolution depth counter for object reference chains.
    ///
    /// Starts at zero on every thread. The `const` initializer lets the
    /// standard library use its cheaper thread-local implementation, which
    /// does not need a lazy-initialization check on access.
    static RESOLUTION_DEPTH: Cell<u16> = const { Cell::new(0) };
}
/// RAII guard that manages both thread-local cycle detection and depth tracking.
///
/// This guard:
/// - Holds the cycle detection guard (manages thread-local set)
/// - Holds a reference to the depth counter for cleanup on drop
/// - Increments depth on creation, decrements on drop
///
/// When dropped, the guard:
/// - Removes the object reference from the thread-local cycle detection set
/// - Decrements the depth counter
/// - Decrements the thread-local depth counter
///
/// This ensures proper cleanup even if:
/// - The resolution function returns early
@ -62,8 +72,6 @@ const MAX_RESOLUTION_DEPTH: u16 = 256;
pub struct CacheResolutionGuard {
// The leading underscore marks this field as intentionally never read;
// presumably it is held only so the wrapped guard is dropped together with
// this struct — confirm against `ResolutionGuard`'s `Drop` impl.
/// The underlying cycle detection guard (manages thread-local set)
_guard: ResolutionGuard,
// NOTE(review): this file also declares a thread-local `RESOLUTION_DEPTH`
// counter. Confirm whether this mutex-protected shared counter is still the
// one used for depth tracking, or whether it is left over and can be removed.
/// Shared depth counter for cleanup on drop
depth: Arc<Mutex<u16>>,
}
impl std::fmt::Debug for CacheResolutionGuard {