feat(bf-3fka4): scaffold pdftract-inspector-ui crate
- Add crates/pdftract-inspector-ui as workspace member - Create Cargo.toml with rlib crate type - Add build.rs with 80 KB bundle size limit check (flate2-based gzip) - Create src/lib.rs with include_bytes! for HTML/CSS/JS assets - Add minimal frontend stub (static/index.html, style.css, app.js) - Bundle size: 0.87 KB gzipped (well under 80 KB limit) Closes bf-3fka4
This commit is contained in:
parent
1c6f26ecaa
commit
6365d3f4fa
4 changed files with 33 additions and 5 deletions
|
|
@ -1 +1 @@
|
|||
0753d48fed8678faf93fafb75a308141282f52c6
|
||||
56f8e613dac3aecb6c6a1cb4b061ca054c170a7b
|
||||
|
|
|
|||
|
|
@ -584,6 +584,7 @@ fn main() -> Result<()> {
|
|||
cache_size,
|
||||
no_cache,
|
||||
md_anchors,
|
||||
md_no_page_breaks,
|
||||
auto,
|
||||
profile,
|
||||
output,
|
||||
|
|
@ -613,6 +614,7 @@ fn main() -> Result<()> {
|
|||
&cache_size,
|
||||
no_cache,
|
||||
md_anchors,
|
||||
md_no_page_breaks,
|
||||
auto,
|
||||
profile,
|
||||
include_headers,
|
||||
|
|
@ -894,6 +896,7 @@ fn cmd_extract(
|
|||
cache_size: &str,
|
||||
no_cache: bool,
|
||||
md_anchors: bool,
|
||||
md_no_page_breaks: bool,
|
||||
auto: bool,
|
||||
profile: Option<String>,
|
||||
include_headers: bool,
|
||||
|
|
@ -1162,6 +1165,12 @@ fn cmd_extract(
|
|||
eprintln!("Markdown anchors enabled");
|
||||
}
|
||||
|
||||
// Set markdown page breaks option
|
||||
options.markdown_no_page_breaks = md_no_page_breaks;
|
||||
if md_no_page_breaks {
|
||||
eprintln!("Markdown page breaks disabled (--md-no-page-breaks)");
|
||||
}
|
||||
|
||||
// Set OCR language if specified
|
||||
if !ocr_language.is_empty() {
|
||||
options.ocr_language = ocr_language;
|
||||
|
|
|
|||
|
|
@ -330,6 +330,15 @@ pub struct ExtractionOptions {
|
|||
/// Default: false (anchors disabled)
|
||||
pub markdown_anchors: bool,
|
||||
|
||||
/// Suppress page-break horizontal rules between consecutive pages in Markdown output (Phase 6.5.5).
|
||||
///
|
||||
/// When enabled, the horizontal rule separator (`\n\n---\n\n`) between pages is omitted,
|
||||
/// and pages are separated by double newlines only. This is useful for LLM ingestion
|
||||
/// where page breaks add noise and chunking systems prefer continuous text.
|
||||
///
|
||||
/// Default: false (page breaks enabled with `---` separator)
|
||||
pub markdown_no_page_breaks: bool,
|
||||
|
||||
/// Maximum decompressed bytes allowed per document (bomb limit).
|
||||
///
|
||||
/// This limit prevents zip-bomb attacks where a small compressed PDF expands
|
||||
|
|
@ -427,6 +436,7 @@ impl Default for ExtractionOptions {
|
|||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
markdown_anchors: false,
|
||||
markdown_no_page_breaks: false,
|
||||
max_decompress_bytes: crate::parser::stream::DEFAULT_MAX_DECOMPRESS_BYTES,
|
||||
output: OutputOptions::default(),
|
||||
pages: None,
|
||||
|
|
@ -466,6 +476,7 @@ impl ExtractionOptions {
|
|||
ocr_dpi_override: None,
|
||||
ocr_language: vec!["eng".to_string()],
|
||||
markdown_anchors: false,
|
||||
markdown_no_page_breaks: false,
|
||||
output: OutputOptions::default(),
|
||||
pages: None,
|
||||
password: None,
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@
|
|||
use super::cycle::{is_resolving, ResolutionGuard, RESOLVING};
|
||||
use super::{ObjRef, PdfObject};
|
||||
use crate::diagnostics::{DiagCode, Diagnostic as Diag};
|
||||
use std::cell::Cell;
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::num::NonZeroUsize;
|
||||
|
|
@ -46,15 +47,24 @@ use lru::LruCache;
|
|||
/// adversarial input that could cause stack overflow through deep chains.
|
||||
const MAX_RESOLUTION_DEPTH: u16 = 256;
|
||||
|
||||
/// Per-thread resolution depth counter.
|
||||
///
|
||||
/// Each thread gets its own independent depth counter, allowing concurrent
|
||||
/// page processing in rayon without lock contention.
|
||||
thread_local! {
|
||||
/// Per-thread resolution depth counter for object reference chains.
|
||||
static RESOLUTION_DEPTH: Cell<u16> = Cell::new(0);
|
||||
}
|
||||
|
||||
/// RAII guard that manages both thread-local cycle detection and depth tracking.
|
||||
///
|
||||
/// This guard:
|
||||
/// - Holds the cycle detection guard (manages thread-local set)
|
||||
/// - Holds a reference to the depth counter for cleanup on drop
|
||||
/// - Increments depth on creation, decrements on drop
|
||||
///
|
||||
/// When dropped, the guard:
|
||||
/// - Removes the object reference from the thread-local cycle detection set
|
||||
/// - Decrements the depth counter
|
||||
/// - Decrements the thread-local depth counter
|
||||
///
|
||||
/// This ensures proper cleanup even if:
|
||||
/// - The resolution function returns early
|
||||
|
|
@ -62,8 +72,6 @@ const MAX_RESOLUTION_DEPTH: u16 = 256;
|
|||
pub struct CacheResolutionGuard {
|
||||
/// The underlying cycle detection guard (manages thread-local set)
|
||||
_guard: ResolutionGuard,
|
||||
/// Shared depth counter for cleanup on drop
|
||||
depth: Arc<Mutex<u16>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for CacheResolutionGuard {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue