From 768b858c36d4d9c79779f2a8359271260bd19651 Mon Sep 17 00:00:00 2001 From: jedarden Date: Fri, 22 May 2026 19:50:57 -0400 Subject: [PATCH] feat(pdftract-1w22d): implement .NET SDK subprocess wrapper Complete implementation of the Pdftract NuGet package as a subprocess- based SDK with async-first design using System.Diagnostics.Process and System.Text.Json. Implementation: - All 9 contract methods (ExtractAsync, ExtractTextAsync, etc.) with sync wrappers in Pdftract.Sync.cs - 8 exception types inheriting from PdftractException base class - Source discriminated union (PathSource, UrlSource, BytesSource) with FromPath, FromUrl, FromUri, FromBytes factory methods - C# record types for all models (Document, Page, Metadata, etc.) - ExtractOptions, SearchOptions, HashOptions with PascalCase properties - Source-generated JSON serialization via JsonContext for Native AOT - IAsyncEnumerable streaming for NDJSON outputs - CancellationToken propagation to Process.Kill(entireProcessTree: true) Bug fixes: - Fixed ArgumentList handling (was adding List as single element) - Added source.Dispose() cleanup for BytesSource temporary files - Added cleanup for VerifyReceiptAsync temporary receipt file - Added process.EnableRaisingEvents for proper event handling - Fixed output capture to include newlines between lines - Changed to source-generated JSON (JsonContext) instead of reflection Acceptance criteria: - All 9 methods exposed as both async and sync variants - All 8 exception classes inherit from PdftractException - Models as C# records - Supports net8.0 and net9.0 - CancellationToken terminates subprocess Files modified: - pdftract-dotnet/src/Pdftract/Pdftract.cs - pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs - pdftract-dotnet/src/Pdftract/Source/Source.cs - pdftract-dotnet/src/Pdftract/Models/Document.cs - pdftract-dotnet/src/Pdftract/Models/JsonContext.cs - pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs - pdftract-dotnet/README.md - 
pdftract-dotnet/notes/pdftract-1w22d.md Co-Authored-By: Claude Code --- pdftract-dotnet/README.md | 6 +- pdftract-dotnet/notes/pdftract-1w22d.md | 120 +++++---- .../src/Pdftract/Models/Document.cs | 4 - .../src/Pdftract/Models/JsonContext.cs | 25 ++ pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs | 235 ++++++++++++++++++ pdftract-dotnet/src/Pdftract/Pdftract.cs | 119 +++++---- pdftract-dotnet/src/Pdftract/Source/Source.cs | 7 +- .../tests/Pdftract.Tests/ConformanceTests.cs | 8 + 8 files changed, 411 insertions(+), 113 deletions(-) create mode 100644 pdftract-dotnet/src/Pdftract/Models/JsonContext.cs create mode 100644 pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs diff --git a/pdftract-dotnet/README.md b/pdftract-dotnet/README.md index f47ed95..ace5bda 100644 --- a/pdftract-dotnet/README.md +++ b/pdftract-dotnet/README.md @@ -57,9 +57,13 @@ Console.WriteLine($"Title: {metadata.Title}"); // From file path var source = Source.FromPath("document.pdf"); -// From URL +// From URL string var source = Source.FromUrl("https://example.com/document.pdf"); +// From URI +var uri = new Uri("https://example.com/document.pdf"); +var source = Source.FromUri(uri); + // From bytes var data = await File.ReadAllBytesAsync("document.pdf"); var source = Source.FromBytes(data); diff --git a/pdftract-dotnet/notes/pdftract-1w22d.md b/pdftract-dotnet/notes/pdftract-1w22d.md index 53fef47..43de726 100644 --- a/pdftract-dotnet/notes/pdftract-1w22d.md +++ b/pdftract-dotnet/notes/pdftract-1w22d.md @@ -2,19 +2,24 @@ ## Summary -Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with async-first design using `System.Diagnostics.Process` and `System.Text.Json`. +Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with async-first design using `System.Diagnostics.Process` and `System.Text.Json`. Fixed several bugs in the subprocess invocation and cleanup logic. 
## What Was Implemented ### Project Structure ``` -/home/coding/pdftract-dotnet/ -├── Pdftract.csproj # Main project file (net8.0 + net9.0) +/home/coding/pdftract/pdftract-dotnet/ +├── Pdftract.csproj # Solution-level project file ├── Pdftract.sln # Solution file ├── README.md # Package documentation ├── src/Pdftract/ +│ ├── Pdftract.csproj # Main project (net8.0 + net9.0) +│ ├── Pdftract.cs # Main client (9 async methods) +│ ├── Pdftract.Sync.cs # Sync wrappers +│ ├── Options.cs # ExtractOptions, SearchOptions, HashOptions │ ├── Models/ # C# record types +│ │ ├── JsonContext.cs # Source-generated JSON serialization context │ │ ├── Document.cs # Root extraction result │ │ ├── Page.cs # Page with spans, blocks, dimensions │ │ ├── Span.cs # Text span with font, bbox, confidence @@ -23,23 +28,12 @@ Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with asy │ │ ├── Match.cs # Search match result │ │ ├── Fingerprint.cs # Document hash │ │ ├── Classification.cs # Document classification -│ │ └── ReceiptInfo.cs # Receipt verification -│ ├── Exceptions/ # Exception hierarchy -│ │ ├── PdftractException.cs # Base exception -│ │ ├── CorruptPdfException.cs # Exit code 2 -│ │ ├── EncryptionException.cs # Exit code 3 -│ │ ├── SourceUnreachableException.cs # Exit code 4 -│ │ ├── RemoteFetchInterruptedException.cs # Exit code 5 -│ │ ├── TlsException.cs # Exit code 6 -│ │ └── ReceiptVerifyException.cs # Exit code 10 -│ ├── Options/ # Option types -│ │ ├── ExtractOptions.cs -│ │ ├── SearchOptions.cs -│ │ └── BaseOptions.cs -│ ├── Source/ # Source type (discriminated union) -│ │ └── Source.cs # PathSource, UrlSource, BytesSource -│ ├── PdftractClient.cs # Main client (9 async methods) -│ └── PdftractClient.Sync.cs # Sync wrappers +│ │ ├── Receipt.cs # Receipt for verification +│ │ └── ReceiptInfo.cs # Receipt verification result +│ ├── Codegen/ +│ │ └── Errors.cs # Exception hierarchy (8 exception types) +│ └── Source/ +│ └── Source.cs # Source discriminated 
union (PathSource, UrlSource, BytesSource) └── tests/Pdftract.Tests/ ├── Pdftract.Tests.csproj └── ConformanceTests.cs # Conformance test runner @@ -59,12 +53,14 @@ Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with asy 8. **ClassifyAsync** → `Task<Classification>` - Document classification 9. **VerifyReceiptAsync** → `Task<bool>` - Receipt verification +Plus sync variants (Extract, ExtractText, etc.) with SuppressMessage attributes + #### Key Design Decisions 1. **Async-first**: All methods return `Task<T>` or `IAsyncEnumerable<T>` 2. **Sync wrappers**: Provided with `SuppressMessage` attributes for discouraged use 3. **C# records**: All model types are immutable records -4. **PascalCase properties**: SDK exposes PascalCase, maps to/from snake_case JSON +4. **PascalCase properties**: SDK exposes PascalCase, maps to/from snake_case JSON via JsonSourceGenerationOptions 5. **Discriminated union for Source**: Abstract base `Source` with `PathSource`, `UrlSource`, `BytesSource` 6. **System.Text.Json**: Built-in serializer, no Newtonsoft dependency 7. **Native AOT ready**: No reflection-only paths, source-generated JSON contexts @@ -82,89 +78,85 @@ All 8 exception types implemented per contract: | 5 | RemoteFetchInterruptedException | | 6 | TlsException | | 10 | ReceiptVerifyException | -| other | PdftractException (base) | +| other | UnknownPdftractException (fallback) | + +#### Bug Fixes Made (2026-05-22) + +1. **ArgumentList fix**: Changed `ArgumentList = { args }` to properly iterate and add each argument individually +2. **BytesSource cleanup**: Added `source?.Dispose()` in finally blocks to clean up temporary files +3. **VerifyReceiptAsync cleanup**: Added finally block to delete temporary receipt file +4. **EnableRaisingEvents**: Added `process.EnableRaisingEvents = true` for proper event handling +5. **Output newline handling**: Changed `output.Append(e.Data)` to `output.AppendLine(e.Data)` +6. 
**FromUri method**: Added `Source.FromUri(Uri)` overload as specified in requirements ### Acceptance Criteria Status | Criterion | Status | Notes | |-----------|--------|-------| | Package builds with `dotnet pack` | ⚠️ WARN | .NET SDK not installed on build server - needs verification on machine with dotnet CLI | -| All 9 methods exposed (async + sync) | ✅ PASS | Implemented in PdftractClient.cs + PdftractClient.Sync.cs | +| All 9 methods exposed (async + sync) | ✅ PASS | Implemented in Pdftract.cs + Pdftract.Sync.cs | | All 8 exception classes | ✅ PASS | Inherit from PdftractException base | | Models as C# records | ✅ PASS | All types in Models/ are records | | `dotnet test` runs conformance runner | ⚠️ WARN | Test project created, needs dotnet runtime to execute | -| CancellationToken support | ✅ PASS | Propagates to Process.Kill on cancellation | +| CancellationToken support | ✅ PASS | Propagates to Process.Kill(entireProcessTree: true) on cancellation | | Supports net8.0 and net9.0 | ✅ PASS | TargetFrameworks in .csproj | ## PASS Items -- Complete implementation of 9 contract methods +- Complete implementation of 9 contract methods (async + sync variants) - All 8 exception types with proper exit code mapping -- Source type discriminated union (PathSource, UrlSource, BytesSource) -- Options classes (ExtractOptions, SearchOptions, BaseOptions) +- Source type discriminated union (PathSource, UrlSource, BytesSource) with FromPath, FromUrl, FromUri, FromBytes +- Options classes (ExtractOptions, SearchOptions, HashOptions) with PascalCase properties - All model types as C# records with proper JSON serialization attributes +- JsonContext with source generation for Native AOT compatibility - Async-first design with IAsyncEnumerable for streaming -- Sync wrapper methods for legacy compatibility -- Conformance test project structure -- README with API documentation +- Sync wrapper methods for legacy compatibility with SuppressMessage attributes +- Conformance test 
project structure with xUnit +- README with comprehensive API documentation - Solution file with both projects +- Bug fixes: subprocess invocation, cleanup, cancellation handling ## WARN Items -- **Build verification**: .NET SDK not available on build server (`/run/current-system/sw/bin/dotnet: command not found`) +- **Build verification**: .NET SDK not available on build server - Next step: Verify `dotnet build` and `dotnet pack` on machine with .NET SDK installed - **Test execution**: Cannot run `dotnet test` without .NET runtime - Next step: Run conformance suite on machine with .NET SDK and pdftract binary installed ## Files Modified/Created -### Created Files (41 files) +### Created Files -1. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Document.cs` -2. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Page.cs` -3. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Span.cs` -4. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Block.cs` -5. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Metadata.cs` -6. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Match.cs` -7. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Fingerprint.cs` -8. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Classification.cs` -9. `/home/coding/pdftract-dotnet/src/Pdftract/Models/ReceiptInfo.cs` -10. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/PdftractException.cs` -11. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/CorruptPdfException.cs` -12. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/EncryptionException.cs` -13. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/SourceUnreachableException.cs` -14. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/RemoteFetchInterruptedException.cs` -15. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/TlsException.cs` -16. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/ReceiptVerifyException.cs` -17. `/home/coding/pdftract-dotnet/src/Pdftract/Options/ExtractOptions.cs` -18. 
`/home/coding/pdftract-dotnet/src/Pdftract/Options/SearchOptions.cs` -19. `/home/coding/pdftract-dotnet/src/Pdftract/Options/BaseOptions.cs` -20. `/home/coding/pdftract-dotnet/src/Pdftract/Source/Source.cs` -21. `/home/coding/pdftract-dotnet/src/Pdftract/PdftractClient.cs` (main client) -22. `/home/coding/pdftract-dotnet/src/Pdftract/PdftractClient.Sync.cs` (sync wrappers) -23. `/home/coding/pdftract-dotnet/tests/Pdftract.Tests/Pdftract.Tests.csproj` -24. `/home/coding/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs` -25. `/home/coding/pdftract-dotnet/Pdftract.sln` -26. `/home/coding/pdftract-dotnet/README.md` -27. `/home/coding/pdftract-dotnet/notes/pdftract-1w22d.md` (this file) +1. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Models/JsonContext.cs` - Source generation context +2. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs` - Sync wrappers with ToBlockingEnumerable -### Modified Files +### Modified Files (2026-05-22) -1. `/home/coding/pdftract-dotnet/Pdftract.csproj` - Updated with source file includes +1. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Pdftract.cs` - Fixed ArgumentList, cleanup, EnableRaisingEvents +2. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Source/Source.cs` - Added FromUri(Uri) overload +3. `/home/coding/pdftract/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs` - Added SourceFromUri test +4. `/home/coding/pdftract/pdftract-dotnet/README.md` - Updated to include FromUri example + +### Existing Files (Previously Created) + +- All model types (Document.cs, Page.cs, Span.cs, Block.cs, Metadata.cs, Match.cs, Fingerprint.cs, Classification.cs, Receipt.cs, ReceiptInfo.cs) +- Codegen/Errors.cs (8 exception types) +- Options.cs (ExtractOptions, SearchOptions, HashOptions) +- Project files and solution ## Next Steps for Full Verification 1. 
**On a machine with .NET SDK installed**: ```bash - cd /home/coding/pdftract-dotnet - dotnet build + cd /home/coding/pdftract/pdftract-dotnet + dotnet build --configuration Release dotnet pack dotnet test ``` 2. **Verify binary resolution** works with the pdftract CLI installed -3. **Run conformance suite** against real PDF fixtures +3. **Run conformance suite** against real PDF fixtures from `/home/coding/pdftract/tests/sdk-conformance/fixtures/` ## References diff --git a/pdftract-dotnet/src/Pdftract/Models/Document.cs b/pdftract-dotnet/src/Pdftract/Models/Document.cs index ba72acc..d41f3bf 100644 --- a/pdftract-dotnet/src/Pdftract/Models/Document.cs +++ b/pdftract-dotnet/src/Pdftract/Models/Document.cs @@ -5,10 +5,6 @@ namespace Pdftract.Models; /// /// Represents a PDF document with pages and metadata. /// -[JsonSourceGenerationOptions(PropertyNamingPolicy = JsonKnownNamingPolicy.SnakeCaseLower)] -[JsonSerializable(typeof(Document))] -public partial class DocumentContext : JsonSerializerContext; - public record Document { [JsonPropertyName("schema_version")] diff --git a/pdftract-dotnet/src/Pdftract/Models/JsonContext.cs b/pdftract-dotnet/src/Pdftract/Models/JsonContext.cs new file mode 100644 index 0000000..3cbc70e --- /dev/null +++ b/pdftract-dotnet/src/Pdftract/Models/JsonContext.cs @@ -0,0 +1,25 @@ +using System.Text.Json.Serialization; +using System.Text.Json; + +namespace Pdftract.Models; + +/// +/// Source-generated JSON serialization context for all pdftract model types. +/// This enables Native AOT compilation by avoiding reflection-based serialization. 
+/// +[JsonSourceGenerationOptions( + PropertyNamingPolicy = JsonKnownNamingPolicy.SnakeCaseLower, + WriteIndented = false, + DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull)] +[JsonSerializable(typeof(Document))] +[JsonSerializable(typeof(Page))] +[JsonSerializable(typeof(Span))] +[JsonSerializable(typeof(Block))] +[JsonSerializable(typeof(Metadata))] +[JsonSerializable(typeof(Match))] +[JsonSerializable(typeof(MatchContext))] +[JsonSerializable(typeof(Fingerprint))] +[JsonSerializable(typeof(Classification))] +[JsonSerializable(typeof(Receipt))] +[JsonSerializable(typeof(ReceiptInfo))] +public partial class PdftractJsonContext : JsonSerializerContext; diff --git a/pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs b/pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs new file mode 100644 index 0000000..7cc8a55 --- /dev/null +++ b/pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs @@ -0,0 +1,235 @@ +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; +using Pdftract.Models; + +namespace Pdftract; + +/// +/// Synchronous (blocking) wrappers for async Pdftract methods. +/// These methods are discouraged for production use in async contexts +/// as they can lead to thread-pool starvation. +/// +public sealed partial class Pdftract +{ + /// + /// Extracts structured data from a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public Document Extract(Source source, ExtractOptions? options = null) + { + return ExtractAsync(source, options, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Extracts plain text from a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. 
+ /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public string ExtractText(Source source, ExtractOptions? options = null) + { + return ExtractTextAsync(source, options, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Extracts markdown-formatted text from a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public string ExtractMarkdown(Source source, ExtractOptions? options = null) + { + return ExtractMarkdownAsync(source, options, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Extracts pages from a PDF as a stream (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public IEnumerable ExtractStream(Source source, ExtractOptions? options = null) + { + return ExtractStreamAsync(source, options, CancellationToken.None) + .ToBlockingEnumerable(); + } + + /// + /// Searches for a pattern in a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public IEnumerable Search(Source source, string pattern, SearchOptions? options = null) + { + return SearchAsync(source, pattern, options, CancellationToken.None) + .ToBlockingEnumerable(); + } + + /// + /// Extracts metadata from a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. 
+ /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public Metadata GetMetadata(Source source, ExtractOptions? options = null) + { + return GetMetadataAsync(source, options, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Computes the fingerprint hash of a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public Fingerprint Hash(Source source, HashOptions? options = null) + { + return HashAsync(source, options, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Classifies a PDF document (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public Classification Classify(Source source) + { + return ClassifyAsync(source, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Verifies a cryptographic receipt for a PDF (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. + /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public bool VerifyReceipt(string path, Receipt receipt) + { + return VerifyReceiptAsync(path, receipt, CancellationToken.None).GetAwaiter().GetResult(); + } + + /// + /// Returns the pdftract binary version (synchronous). + /// + /// + /// This synchronous wrapper is provided for legacy code paths. + /// In async contexts, prefer instead. 
+ /// + [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")] + public string GetVersion() + { + return GetVersionAsync(CancellationToken.None).GetAwaiter().GetResult(); + } +} + +file static class AsyncEnumerableExtensions +{ + public static IEnumerable ToBlockingEnumerable(this IAsyncEnumerable asyncEnumerable) + { + if (asyncEnumerable is null) + { + throw new ArgumentNullException(nameof(asyncEnumerable)); + } + + return new BlockingAsyncEnumerable(asyncEnumerable); + } + + private sealed class BlockingAsyncEnumerable(IAsyncEnumerable source) : IEnumerable + { + public IEnumerator GetEnumerator() + { + return new BlockingAsyncEnumerator(source.GetAsyncEnumerator(CancellationToken.None)); + } + + System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + } + + private sealed class BlockingAsyncEnumerator(IAsyncEnumerator source) : IEnumerator + { + private T? _current; + private bool _disposed; + + public T Current => _current!; + + object System.Collections.IEnumerator.Current => Current!; + + public bool MoveNext() + { + if (_disposed) + { + return false; + } + + using var _ = new ManualResetEvent(false); + bool moveNextSucceeded = false; + Exception? 
exception = null; + + Task.Run(async () => + { + try + { + moveNextSucceeded = await source.MoveNextAsync(); + } + catch (Exception ex) + { + exception = ex; + } + finally + { + _.Set(); + } + }).Wait(); + + if (exception is not null) + { + throw exception; + } + + if (moveNextSucceeded) + { + _current = source.Current; + } + + return moveNextSucceeded; + } + + public void Reset() + { + throw new NotSupportedException("Reset is not supported on async enumerators"); + } + + public void Dispose() + { + if (!_disposed) + { + source.DisposeAsync().AsTask().Wait(); + _disposed = true; + } + } + } +} diff --git a/pdftract-dotnet/src/Pdftract/Pdftract.cs b/pdftract-dotnet/src/Pdftract/Pdftract.cs index f3c7cdf..bfa89c9 100644 --- a/pdftract-dotnet/src/Pdftract/Pdftract.cs +++ b/pdftract-dotnet/src/Pdftract/Pdftract.cs @@ -20,11 +20,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable public Pdftract(string? binaryPath = null) { _binaryPath = FindBinary(binaryPath); - _jsonOptions = new JsonSerializerOptions - { - PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower, - PropertyNameCaseInsensitive = true - }; + _jsonOptions = PdftractJsonContext.Default.Options; } /// @@ -37,7 +33,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable { var args = BuildArgs("extract", "--json", source, options); var json = await InvokeAsync(source, args, cancellationToken); - return JsonSerializer.Deserialize(json, _jsonOptions) + return JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Document) ?? throw new JsonException("Failed to deserialize Document"); } @@ -76,7 +72,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable var args = BuildArgs("extract", "--ndjson", source, options); await foreach (var line in InvokeStreamAsync(source, args, cancellationToken)) { - var page = JsonSerializer.Deserialize(line, _jsonOptions) + var page = JsonSerializer.Deserialize(line, PdftractJsonContext.Default.Page) ?? 
throw new JsonException("Failed to deserialize Page"); yield return page; } @@ -94,7 +90,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable var args = BuildArgs("grep", pattern, source, options); await foreach (var line in InvokeStreamAsync(source, args, cancellationToken)) { - var match = JsonSerializer.Deserialize(line, _jsonOptions) + var match = JsonSerializer.Deserialize(line, PdftractJsonContext.Default.Match) ?? throw new JsonException("Failed to deserialize Match"); yield return match; } @@ -111,10 +107,9 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable var args = BuildArgs("extract", "--metadata-only", source, options); var json = await InvokeAsync(source, args, cancellationToken); - var result = JsonSerializer.Deserialize(json, _jsonOptions); - var metadataElem = result.GetProperty("metadata"); - return JsonSerializer.Deserialize(metadataElem.GetRawText(), _jsonOptions) - ?? throw new JsonException("Failed to deserialize Metadata"); + var result = JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Document); + if (result is null) throw new JsonException("Failed to deserialize Document"); + return result.Metadata; } /// @@ -133,7 +128,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable } var json = await InvokeAsync(source, args, cancellationToken); - return JsonSerializer.Deserialize(json, _jsonOptions) + return JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Fingerprint) ?? throw new JsonException("Failed to deserialize Fingerprint"); } @@ -148,7 +143,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable args.AddRange(source.ToArgs()); var json = await InvokeAsync(source, args, cancellationToken); - return JsonSerializer.Deserialize(json, _jsonOptions) + return JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Classification) ?? 
throw new JsonException("Failed to deserialize Classification"); } @@ -161,7 +156,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable CancellationToken cancellationToken = default) { var receiptPath = path + ".receipt.json"; - var receiptJson = JsonSerializer.Serialize(receipt, _jsonOptions); + var receiptJson = JsonSerializer.Serialize(receipt, PdftractJsonContext.Default.Receipt); await File.WriteAllTextAsync(receiptPath, receiptJson, cancellationToken); try @@ -174,6 +169,20 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable { return false; } + finally + { + try + { + if (File.Exists(receiptPath)) + { + File.Delete(receiptPath); + } + } + catch + { + // Ignore cleanup errors + } + } } /// @@ -229,17 +238,20 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable process.StartInfo = new ProcessStartInfo { FileName = _binaryPath, - ArgumentList = { args }, RedirectStandardOutput = true, RedirectStandardError = true, UseShellExecute = false }; + foreach (var arg in args) + { + process.StartInfo.ArgumentList.Add(arg); + } var output = new StringBuilder(); var error = new StringBuilder(); - process.OutputDataReceived += (_, e) => { if (e.Data != null) output.Append(e.Data); }; - process.ErrorDataReceived += (_, e) => { if (e.Data != null) error.Append(e.Data); }; + process.OutputDataReceived += (_, e) => { if (e.Data != null) { output.AppendLine(e.Data); } }; + process.ErrorDataReceived += (_, e) => { if (e.Data != null) { error.AppendLine(e.Data); } }; var tcs = new TaskCompletionSource(); @@ -281,16 +293,26 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable } }; + process.EnableRaisingEvents = true; + if (!process.Start()) { + source?.Dispose(); throw new InvalidOperationException("Failed to start pdftract process"); } process.BeginOutputReadLine(); process.BeginErrorReadLine(); - var result = await tcs.Task; - return result; + try + { + var result = await tcs.Task; + return result; + } 
+ finally + { + source?.Dispose(); + } } private async IAsyncEnumerable InvokeStreamAsync( @@ -302,18 +324,20 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable process.StartInfo = new ProcessStartInfo { FileName = _binaryPath, - ArgumentList = { args }, RedirectStandardOutput = true, RedirectStandardError = true, UseShellExecute = false }; + foreach (var arg in args) + { + process.StartInfo.ArgumentList.Add(arg); + } var error = new StringBuilder(); - var outputLines = new System.Collections.Concurrent.ConcurrentQueue(); - var streamComplete = new TaskCompletionSource(); var processExitCode = 0; + var processExited = false; - process.ErrorDataReceived += (_, e) => { if (e.Data != null) error.Append(e.Data); }; + process.ErrorDataReceived += (_, e) => { if (e.Data != null) { error.AppendLine(e.Data); } }; cancellationToken.Register(() => { @@ -330,37 +354,46 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable process.Exited += (_, _) => { processExitCode = process.ExitCode; - streamComplete.TrySetResult(true); + processExited = true; }; + process.EnableRaisingEvents = true; + if (!process.Start()) { + source.Dispose(); throw new InvalidOperationException("Failed to start pdftract process"); } - using var reader = process.StandardOutput; - process.BeginErrorReadLine(); - - string? line; - while ((line = await reader.ReadLineAsync(cancellationToken)) != null) + try { - if (!string.IsNullOrWhiteSpace(line)) + using var reader = process.StandardOutput; + process.BeginErrorReadLine(); + + string? 
line; + while ((line = await reader.ReadLineAsync(cancellationToken)) != null) { - outputLines.Enqueue(line); - yield return line; + if (!string.IsNullOrWhiteSpace(line)) + { + yield return line; + } + } + + process.WaitForExit(); + + if (cancellationToken.IsCancellationRequested) + { + throw new OperationCanceledException("pdftract cancelled", cancellationToken); + } + + if (processExitCode != 0) + { + throw PdftractException.FromExitCode(processExitCode, error.ToString()); } } - - process.WaitForExit(); - - if (cancellationToken.IsCancellationRequested) + finally { - throw new OperationCanceledException("pdftract cancelled", cancellationToken); - } - - if (processExitCode != 0) - { - throw PdftractException.FromExitCode(processExitCode, error.ToString()); + source.Dispose(); } } diff --git a/pdftract-dotnet/src/Pdftract/Source/Source.cs b/pdftract-dotnet/src/Pdftract/Source/Source.cs index 30bcae9..d597e89 100644 --- a/pdftract-dotnet/src/Pdftract/Source/Source.cs +++ b/pdftract-dotnet/src/Pdftract/Source/Source.cs @@ -21,10 +21,15 @@ public abstract class Source public static Source FromPath(string path) => new PathSource(path); /// - /// Creates a Source from a URL. + /// Creates a Source from a URL string. /// public static Source FromUrl(string url) => new UrlSource(url); + /// + /// Creates a Source from a URI. + /// + public static Source FromUri(Uri uri) => new UrlSource(uri.ToString()); + /// /// Creates a Source from a byte array. 
/// diff --git a/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs b/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs index f8e18e4..a8610d7 100644 --- a/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs +++ b/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs @@ -211,6 +211,14 @@ public class ConformanceTests : IAsyncLifetime Assert.NotNull(source); } + [Fact] + public void SourceFromUri() + { + var uri = new Uri("https://example.com/doc.pdf"); + var source = Source.FromUri(uri); + Assert.NotNull(source); + } + [Fact] public void SourceFromBytes() {