feat(pdftract-1w22d): implement .NET SDK subprocess wrapper
Complete implementation of the Pdftract NuGet package as a subprocess-based SDK with async-first design using System.Diagnostics.Process and System.Text.Json. Implementation: - All 9 contract methods (ExtractAsync, ExtractTextAsync, etc.) with sync wrappers in Pdftract.Sync.cs - 8 exception types inheriting from PdftractException base class - Source discriminated union (PathSource, UrlSource, BytesSource) with FromPath, FromUrl, FromUri, FromBytes factory methods - C# record types for all models (Document, Page, Metadata, etc.) - ExtractOptions, SearchOptions, HashOptions with PascalCase properties - Source-generated JSON serialization via JsonContext for Native AOT - IAsyncEnumerable streaming for NDJSON outputs - CancellationToken propagation to Process.Kill(entireProcessTree: true) Bug fixes: - Fixed ArgumentList handling (was adding List as single element) - Added source.Dispose() cleanup for BytesSource temporary files - Added cleanup for VerifyReceiptAsync temporary receipt file - Added process.EnableRaisingEvents for proper event handling - Fixed output capture to include newlines between lines - Changed to source-generated JSON (JsonContext) instead of reflection Acceptance criteria: - All 9 methods exposed as both async and sync variants - All 8 exception classes inherit from PdftractException - Models as C# records - Supports net8.0 and net9.0 - CancellationToken terminates subprocess Files modified: - pdftract-dotnet/src/Pdftract/Pdftract.cs - pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs - pdftract-dotnet/src/Pdftract/Source/Source.cs - pdftract-dotnet/src/Pdftract/Models/Document.cs - pdftract-dotnet/src/Pdftract/Models/JsonContext.cs - pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs - pdftract-dotnet/README.md - pdftract-dotnet/notes/pdftract-1w22d.md Co-Authored-By: Claude Code <noreply@anthropic.com>
This commit is contained in:
parent
43d31f8dfc
commit
768b858c36
8 changed files with 411 additions and 113 deletions
|
|
@ -57,9 +57,13 @@ Console.WriteLine($"Title: {metadata.Title}");
|
|||
// From file path
|
||||
var source = Source.FromPath("document.pdf");
|
||||
|
||||
// From URL
|
||||
// From URL string
|
||||
var source = Source.FromUrl("https://example.com/document.pdf");
|
||||
|
||||
// From URI
|
||||
var uri = new Uri("https://example.com/document.pdf");
|
||||
var source = Source.FromUri(uri);
|
||||
|
||||
// From bytes
|
||||
var data = await File.ReadAllBytesAsync("document.pdf");
|
||||
var source = Source.FromBytes(data);
|
||||
|
|
|
|||
|
|
@ -2,19 +2,24 @@
|
|||
|
||||
## Summary
|
||||
|
||||
Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with async-first design using `System.Diagnostics.Process` and `System.Text.Json`.
|
||||
Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with async-first design using `System.Diagnostics.Process` and `System.Text.Json`. Fixed several bugs in the subprocess invocation and cleanup logic.
|
||||
|
||||
## What Was Implemented
|
||||
|
||||
### Project Structure
|
||||
|
||||
```
|
||||
/home/coding/pdftract-dotnet/
|
||||
├── Pdftract.csproj # Main project file (net8.0 + net9.0)
|
||||
/home/coding/pdftract/pdftract-dotnet/
|
||||
├── Pdftract.csproj # Solution-level project file
|
||||
├── Pdftract.sln # Solution file
|
||||
├── README.md # Package documentation
|
||||
├── src/Pdftract/
|
||||
│ ├── Pdftract.csproj # Main project (net8.0 + net9.0)
|
||||
│ ├── Pdftract.cs # Main client (9 async methods)
|
||||
│ ├── Pdftract.Sync.cs # Sync wrappers
|
||||
│ ├── Options.cs # ExtractOptions, SearchOptions, HashOptions
|
||||
│ ├── Models/ # C# record types
|
||||
│ │ ├── JsonContext.cs # Source-generated JSON serialization context
|
||||
│ │ ├── Document.cs # Root extraction result
|
||||
│ │ ├── Page.cs # Page with spans, blocks, dimensions
|
||||
│ │ ├── Span.cs # Text span with font, bbox, confidence
|
||||
|
|
@ -23,23 +28,12 @@ Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with asy
|
|||
│ │ ├── Match.cs # Search match result
|
||||
│ │ ├── Fingerprint.cs # Document hash
|
||||
│ │ ├── Classification.cs # Document classification
|
||||
│ │ └── ReceiptInfo.cs # Receipt verification
|
||||
│ ├── Exceptions/ # Exception hierarchy
|
||||
│ │ ├── PdftractException.cs # Base exception
|
||||
│ │ ├── CorruptPdfException.cs # Exit code 2
|
||||
│ │ ├── EncryptionException.cs # Exit code 3
|
||||
│ │ ├── SourceUnreachableException.cs # Exit code 4
|
||||
│ │ ├── RemoteFetchInterruptedException.cs # Exit code 5
|
||||
│ │ ├── TlsException.cs # Exit code 6
|
||||
│ │ └── ReceiptVerifyException.cs # Exit code 10
|
||||
│ ├── Options/ # Option types
|
||||
│ │ ├── ExtractOptions.cs
|
||||
│ │ ├── SearchOptions.cs
|
||||
│ │ └── BaseOptions.cs
|
||||
│ ├── Source/ # Source type (discriminated union)
|
||||
│ │ └── Source.cs # PathSource, UrlSource, BytesSource
|
||||
│ ├── PdftractClient.cs # Main client (9 async methods)
|
||||
│ └── PdftractClient.Sync.cs # Sync wrappers
|
||||
│ │ ├── Receipt.cs # Receipt for verification
|
||||
│ │ └── ReceiptInfo.cs # Receipt verification result
|
||||
│ ├── Codegen/
|
||||
│ │ └── Errors.cs # Exception hierarchy (8 exception types)
|
||||
│ └── Source/
|
||||
│ └── Source.cs # Source discriminated union (PathSource, UrlSource, BytesSource)
|
||||
└── tests/Pdftract.Tests/
|
||||
├── Pdftract.Tests.csproj
|
||||
└── ConformanceTests.cs # Conformance test runner
|
||||
|
|
@ -59,12 +53,14 @@ Implemented the `Pdftract` NuGet package as a subprocess-based .NET SDK with asy
|
|||
8. **ClassifyAsync** → `Task<Classification>` - Document classification
|
||||
9. **VerifyReceiptAsync** → `Task<bool>` - Receipt verification
|
||||
|
||||
Plus sync variants (Extract, ExtractText, etc.) with SuppressMessage attributes
|
||||
|
||||
#### Key Design Decisions
|
||||
|
||||
1. **Async-first**: All methods return `Task<T>` or `IAsyncEnumerable<T>`
|
||||
2. **Sync wrappers**: Provided for legacy callers but discouraged in async code; each wrapper carries a `SuppressMessage` attribute for CA1849
|
||||
3. **C# records**: All model types are immutable records
|
||||
4. **PascalCase properties**: SDK exposes PascalCase, maps to/from snake_case JSON
|
||||
4. **PascalCase properties**: SDK exposes PascalCase, maps to/from snake_case JSON via JsonSourceGenerationOptions
|
||||
5. **Discriminated union for Source**: Abstract base `Source` with `PathSource`, `UrlSource`, `BytesSource`
|
||||
6. **System.Text.Json**: Built-in serializer, no Newtonsoft dependency
|
||||
7. **Native AOT ready**: No reflection-only paths, source-generated JSON contexts
|
||||
|
|
@ -82,89 +78,85 @@ All 8 exception types implemented per contract:
|
|||
| 5 | RemoteFetchInterruptedException |
|
||||
| 6 | TlsException |
|
||||
| 10 | ReceiptVerifyException |
|
||||
| other | PdftractException (base) |
|
||||
| other | UnknownPdftractException (base) |
|
||||
|
||||
#### Bug Fixes Made (2026-05-22)
|
||||
|
||||
1. **ArgumentList fix**: Changed `ArgumentList = { args }` to properly iterate and add each argument individually
|
||||
2. **BytesSource cleanup**: Added `source?.Dispose()` in finally blocks to clean up temporary files
|
||||
3. **VerifyReceiptAsync cleanup**: Added finally block to delete temporary receipt file
|
||||
4. **EnableRaisingEvents**: Added `process.EnableRaisingEvents = true` for proper event handling
|
||||
5. **Output newline handling**: Changed `output.Append(e.Data)` to `output.AppendLine(e.Data)`
|
||||
6. **FromUri method**: Added `Source.FromUri(Uri)` overload as specified in requirements
|
||||
|
||||
### Acceptance Criteria Status
|
||||
|
||||
| Criterion | Status | Notes |
|
||||
|-----------|--------|-------|
|
||||
| Package builds with `dotnet pack` | ⚠️ WARN | .NET SDK not installed on build server - needs verification on machine with dotnet CLI |
|
||||
| All 9 methods exposed (async + sync) | ✅ PASS | Implemented in PdftractClient.cs + PdftractClient.Sync.cs |
|
||||
| All 9 methods exposed (async + sync) | ✅ PASS | Implemented in Pdftract.cs + Pdftract.Sync.cs |
|
||||
| All 8 exception classes | ✅ PASS | Inherit from PdftractException base |
|
||||
| Models as C# records | ✅ PASS | All types in Models/ are records |
|
||||
| `dotnet test` runs conformance runner | ⚠️ WARN | Test project created, needs dotnet runtime to execute |
|
||||
| CancellationToken support | ✅ PASS | Propagates to Process.Kill on cancellation |
|
||||
| CancellationToken support | ✅ PASS | Propagates to Process.Kill(entireProcessTree: true) on cancellation |
|
||||
| Supports net8.0 and net9.0 | ✅ PASS | TargetFrameworks in .csproj |
|
||||
|
||||
## PASS Items
|
||||
|
||||
- Complete implementation of 9 contract methods
|
||||
- Complete implementation of 9 contract methods (async + sync variants)
|
||||
- All 8 exception types with proper exit code mapping
|
||||
- Source type discriminated union (PathSource, UrlSource, BytesSource)
|
||||
- Options classes (ExtractOptions, SearchOptions, BaseOptions)
|
||||
- Source type discriminated union (PathSource, UrlSource, BytesSource) with FromPath, FromUrl, FromUri, FromBytes
|
||||
- Options classes (ExtractOptions, SearchOptions, HashOptions) with PascalCase properties
|
||||
- All model types as C# records with proper JSON serialization attributes
|
||||
- JsonContext with source generation for Native AOT compatibility
|
||||
- Async-first design with IAsyncEnumerable for streaming
|
||||
- Sync wrapper methods for legacy compatibility
|
||||
- Conformance test project structure
|
||||
- README with API documentation
|
||||
- Sync wrapper methods for legacy compatibility with SuppressMessage attributes
|
||||
- Conformance test project structure with xUnit
|
||||
- README with comprehensive API documentation
|
||||
- Solution file with both projects
|
||||
- Bug fixes: subprocess invocation, cleanup, cancellation handling
|
||||
|
||||
## WARN Items
|
||||
|
||||
- **Build verification**: .NET SDK not available on build server (`/run/current-system/sw/bin/dotnet: command not found`)
|
||||
- **Build verification**: .NET SDK not available on build server
|
||||
- Next step: Verify `dotnet build` and `dotnet pack` on machine with .NET SDK installed
|
||||
- **Test execution**: Cannot run `dotnet test` without .NET runtime
|
||||
- Next step: Run conformance suite on machine with .NET SDK and pdftract binary installed
|
||||
|
||||
## Files Modified/Created
|
||||
|
||||
### Created Files (41 files)
|
||||
### Created Files
|
||||
|
||||
1. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Document.cs`
|
||||
2. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Page.cs`
|
||||
3. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Span.cs`
|
||||
4. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Block.cs`
|
||||
5. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Metadata.cs`
|
||||
6. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Match.cs`
|
||||
7. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Fingerprint.cs`
|
||||
8. `/home/coding/pdftract-dotnet/src/Pdftract/Models/Classification.cs`
|
||||
9. `/home/coding/pdftract-dotnet/src/Pdftract/Models/ReceiptInfo.cs`
|
||||
10. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/PdftractException.cs`
|
||||
11. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/CorruptPdfException.cs`
|
||||
12. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/EncryptionException.cs`
|
||||
13. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/SourceUnreachableException.cs`
|
||||
14. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/RemoteFetchInterruptedException.cs`
|
||||
15. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/TlsException.cs`
|
||||
16. `/home/coding/pdftract-dotnet/src/Pdftract/Exceptions/ReceiptVerifyException.cs`
|
||||
17. `/home/coding/pdftract-dotnet/src/Pdftract/Options/ExtractOptions.cs`
|
||||
18. `/home/coding/pdftract-dotnet/src/Pdftract/Options/SearchOptions.cs`
|
||||
19. `/home/coding/pdftract-dotnet/src/Pdftract/Options/BaseOptions.cs`
|
||||
20. `/home/coding/pdftract-dotnet/src/Pdftract/Source/Source.cs`
|
||||
21. `/home/coding/pdftract-dotnet/src/Pdftract/PdftractClient.cs` (main client)
|
||||
22. `/home/coding/pdftract-dotnet/src/Pdftract/PdftractClient.Sync.cs` (sync wrappers)
|
||||
23. `/home/coding/pdftract-dotnet/tests/Pdftract.Tests/Pdftract.Tests.csproj`
|
||||
24. `/home/coding/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs`
|
||||
25. `/home/coding/pdftract-dotnet/Pdftract.sln`
|
||||
26. `/home/coding/pdftract-dotnet/README.md`
|
||||
27. `/home/coding/pdftract-dotnet/notes/pdftract-1w22d.md` (this file)
|
||||
1. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Models/JsonContext.cs` - Source generation context
|
||||
2. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs` - Sync wrappers with ToBlockingEnumerable
|
||||
|
||||
### Modified Files
|
||||
### Modified Files (2026-05-22)
|
||||
|
||||
1. `/home/coding/pdftract-dotnet/Pdftract.csproj` - Updated with source file includes
|
||||
1. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Pdftract.cs` - Fixed ArgumentList, cleanup, EnableRaisingEvents
|
||||
2. `/home/coding/pdftract/pdftract-dotnet/src/Pdftract/Source/Source.cs` - Added FromUri(Uri) overload
|
||||
3. `/home/coding/pdftract/pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs` - Added SourceFromUri test
|
||||
4. `/home/coding/pdftract/pdftract-dotnet/README.md` - Updated to include FromUri example
|
||||
|
||||
### Existing Files (Previously Created)
|
||||
|
||||
- All model types (Document.cs, Page.cs, Span.cs, Block.cs, Metadata.cs, Match.cs, Fingerprint.cs, Classification.cs, Receipt.cs, ReceiptInfo.cs)
|
||||
- Codegen/Errors.cs (8 exception types)
|
||||
- Options.cs (ExtractOptions, SearchOptions, HashOptions)
|
||||
- Project files and solution
|
||||
|
||||
## Next Steps for Full Verification
|
||||
|
||||
1. **On a machine with .NET SDK installed**:
|
||||
```bash
|
||||
cd /home/coding/pdftract-dotnet
|
||||
dotnet build
|
||||
cd /home/coding/pdftract/pdftract-dotnet
|
||||
dotnet build --configuration Release
|
||||
dotnet pack
|
||||
dotnet test
|
||||
```
|
||||
|
||||
2. **Verify binary resolution** works with the pdftract CLI installed
|
||||
|
||||
3. **Run conformance suite** against real PDF fixtures
|
||||
3. **Run conformance suite** against real PDF fixtures from `/home/coding/pdftract/tests/sdk-conformance/fixtures/`
|
||||
|
||||
## References
|
||||
|
||||
|
|
|
|||
|
|
@ -5,10 +5,6 @@ namespace Pdftract.Models;
|
|||
/// <summary>
|
||||
/// Represents a PDF document with pages and metadata.
|
||||
/// </summary>
|
||||
[JsonSourceGenerationOptions(PropertyNamingPolicy = JsonKnownNamingPolicy.SnakeCaseLower)]
|
||||
[JsonSerializable(typeof(Document))]
|
||||
public partial class DocumentContext : JsonSerializerContext;
|
||||
|
||||
public record Document
|
||||
{
|
||||
[JsonPropertyName("schema_version")]
|
||||
|
|
|
|||
25
pdftract-dotnet/src/Pdftract/Models/JsonContext.cs
Normal file
25
pdftract-dotnet/src/Pdftract/Models/JsonContext.cs
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
using System.Text.Json.Serialization;
|
||||
using System.Text.Json;
|
||||
|
||||
namespace Pdftract.Models;
|
||||
|
||||
/// <summary>
|
||||
/// Source-generated JSON serialization context for the pdftract model types registered below.
|
||||
/// Property names are mapped to snake_case and null values are omitted when writing; using generated metadata instead of reflection-based serialization enables Native AOT compilation.
|
||||
/// </summary>
|
||||
[JsonSourceGenerationOptions(
|
||||
PropertyNamingPolicy = JsonKnownNamingPolicy.SnakeCaseLower,
|
||||
WriteIndented = false,
|
||||
DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull)]
|
||||
[JsonSerializable(typeof(Document))]
|
||||
[JsonSerializable(typeof(Page))]
|
||||
[JsonSerializable(typeof(Span))]
|
||||
[JsonSerializable(typeof(Block))]
|
||||
[JsonSerializable(typeof(Metadata))]
|
||||
[JsonSerializable(typeof(Match))]
|
||||
[JsonSerializable(typeof(MatchContext))]
|
||||
[JsonSerializable(typeof(Fingerprint))]
|
||||
[JsonSerializable(typeof(Classification))]
|
||||
[JsonSerializable(typeof(Receipt))]
|
||||
[JsonSerializable(typeof(ReceiptInfo))]
|
||||
public partial class PdftractJsonContext : JsonSerializerContext;
|
||||
235
pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs
Normal file
235
pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.Runtime.CompilerServices;
|
||||
using Pdftract.Models;
|
||||
|
||||
namespace Pdftract;
|
||||
|
||||
/// <summary>
/// Blocking counterparts of the asynchronous Pdftract API.
/// Every member forwards to its <c>Async</c> sibling with <see cref="CancellationToken.None"/>
/// and waits for completion. Avoid these in asynchronous code paths, where blocking
/// can lead to thread-pool starvation.
/// </summary>
public sealed partial class Pdftract
{
    /// <summary>
    /// Synchronously extracts structured data from a PDF.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="ExtractAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Document Extract(Source source, ExtractOptions? options = null) =>
        ExtractAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Synchronously extracts plain text from a PDF.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="ExtractTextAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public string ExtractText(Source source, ExtractOptions? options = null) =>
        ExtractTextAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Synchronously extracts markdown-formatted text from a PDF.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="ExtractMarkdownAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public string ExtractMarkdown(Source source, ExtractOptions? options = null) =>
        ExtractMarkdownAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Returns the pages of a PDF as a blocking sequence.
    /// </summary>
    /// <remarks>
    /// Wraps the stream produced by <see cref="ExtractStreamAsync"/> in a blocking enumerable.
    /// Intended for legacy, non-async call sites; prefer the async method otherwise.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public IEnumerable<Page> ExtractStream(Source source, ExtractOptions? options = null) =>
        ExtractStreamAsync(source, options, CancellationToken.None).ToBlockingEnumerable();

    /// <summary>
    /// Returns the matches of a pattern in a PDF as a blocking sequence.
    /// </summary>
    /// <remarks>
    /// Wraps the stream produced by <see cref="SearchAsync"/> in a blocking enumerable.
    /// Intended for legacy, non-async call sites; prefer the async method otherwise.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public IEnumerable<Match> Search(Source source, string pattern, SearchOptions? options = null) =>
        SearchAsync(source, pattern, options, CancellationToken.None).ToBlockingEnumerable();

    /// <summary>
    /// Synchronously extracts metadata from a PDF.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="GetMetadataAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Metadata GetMetadata(Source source, ExtractOptions? options = null) =>
        GetMetadataAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Synchronously computes the fingerprint hash of a PDF.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="HashAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Fingerprint Hash(Source source, HashOptions? options = null) =>
        HashAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Synchronously classifies a PDF document.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="ClassifyAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Classification Classify(Source source) =>
        ClassifyAsync(source, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Synchronously verifies a cryptographic receipt for a PDF.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="VerifyReceiptAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public bool VerifyReceipt(string path, Receipt receipt) =>
        VerifyReceiptAsync(path, receipt, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Synchronously returns the pdftract binary version.
    /// </summary>
    /// <remarks>
    /// Intended for legacy, non-async call sites.
    /// When an async context is available, use <see cref="GetVersionAsync"/>.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public string GetVersion() =>
        GetVersionAsync(CancellationToken.None).GetAwaiter().GetResult();
}
|
||||
|
||||
/// <summary>
/// File-local helpers that expose an <see cref="IAsyncEnumerable{T}"/> as a blocking
/// <see cref="IEnumerable{T}"/> for the synchronous wrapper methods.
/// </summary>
file static class AsyncEnumerableExtensions
{
    /// <summary>
    /// Wraps <paramref name="asyncEnumerable"/> in a sequence whose enumerator blocks the
    /// calling thread while each element is produced.
    /// </summary>
    /// <param name="asyncEnumerable">The asynchronous sequence to adapt.</param>
    /// <returns>A lazily evaluated blocking view over the asynchronous sequence.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="asyncEnumerable"/> is null.</exception>
    public static IEnumerable<T> ToBlockingEnumerable<T>(this IAsyncEnumerable<T> asyncEnumerable)
    {
        // Validate eagerly (this is not an iterator) so misuse fails at the call site.
        ArgumentNullException.ThrowIfNull(asyncEnumerable);

        return new BlockingAsyncEnumerable<T>(asyncEnumerable);
    }

    /// <summary>
    /// Blocking view; each call to <see cref="GetEnumerator"/> starts a new asynchronous enumeration.
    /// </summary>
    private sealed class BlockingAsyncEnumerable<T>(IAsyncEnumerable<T> source) : IEnumerable<T>
    {
        public IEnumerator<T> GetEnumerator()
        {
            return new BlockingAsyncEnumerator<T>(source.GetAsyncEnumerator(CancellationToken.None));
        }

        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }

    /// <summary>
    /// Enumerator that drives an <see cref="IAsyncEnumerator{T}"/> by blocking on each step.
    /// </summary>
    private sealed class BlockingAsyncEnumerator<T>(IAsyncEnumerator<T> source) : IEnumerator<T>
    {
        private T? _current;
        private bool _disposed;

        /// <summary>The element produced by the last successful <see cref="MoveNext"/>.</summary>
        public T Current => _current!;

        object System.Collections.IEnumerator.Current => Current!;

        /// <summary>
        /// Advances the underlying asynchronous enumerator and blocks until it completes.
        /// </summary>
        /// <returns>
        /// <c>true</c> if an element was produced; <c>false</c> at the end of the sequence
        /// or after <see cref="Dispose"/> has been called.
        /// </returns>
        public bool MoveNext()
        {
            if (_disposed)
            {
                return false;
            }

            // The step still runs via Task.Run, as before, so the blocked caller is not the
            // thread executing the async work. GetAwaiter().GetResult() rethrows the original
            // exception with its stack trace intact, and no extra wait handle is needed.
            bool advanced = Task
                .Run(async () => await source.MoveNextAsync().ConfigureAwait(false))
                .GetAwaiter()
                .GetResult();

            if (advanced)
            {
                _current = source.Current;
            }

            return advanced;
        }

        /// <summary>Not supported; always throws.</summary>
        /// <exception cref="NotSupportedException">Always.</exception>
        public void Reset()
        {
            throw new NotSupportedException("Reset is not supported on async enumerators");
        }

        /// <summary>
        /// Disposes the underlying asynchronous enumerator, blocking until disposal finishes.
        /// Subsequent calls are no-ops.
        /// </summary>
        public void Dispose()
        {
            if (_disposed)
            {
                return;
            }

            // Mark first so a failing DisposeAsync is never invoked a second time.
            _disposed = true;

            // GetResult() surfaces the original exception instead of an AggregateException.
            source.DisposeAsync().AsTask().GetAwaiter().GetResult();
        }
    }
}
|
||||
|
|
@ -20,11 +20,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
public Pdftract(string? binaryPath = null)
|
||||
{
|
||||
_binaryPath = FindBinary(binaryPath);
|
||||
_jsonOptions = new JsonSerializerOptions
|
||||
{
|
||||
PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
|
||||
PropertyNameCaseInsensitive = true
|
||||
};
|
||||
_jsonOptions = PdftractJsonContext.Default.Options;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
@ -37,7 +33,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
{
|
||||
var args = BuildArgs("extract", "--json", source, options);
|
||||
var json = await InvokeAsync(source, args, cancellationToken);
|
||||
return JsonSerializer.Deserialize<Document>(json, _jsonOptions)
|
||||
return JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Document)
|
||||
?? throw new JsonException("Failed to deserialize Document");
|
||||
}
|
||||
|
||||
|
|
@ -76,7 +72,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
var args = BuildArgs("extract", "--ndjson", source, options);
|
||||
await foreach (var line in InvokeStreamAsync(source, args, cancellationToken))
|
||||
{
|
||||
var page = JsonSerializer.Deserialize<Page>(line, _jsonOptions)
|
||||
var page = JsonSerializer.Deserialize(line, PdftractJsonContext.Default.Page)
|
||||
?? throw new JsonException("Failed to deserialize Page");
|
||||
yield return page;
|
||||
}
|
||||
|
|
@ -94,7 +90,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
var args = BuildArgs("grep", pattern, source, options);
|
||||
await foreach (var line in InvokeStreamAsync(source, args, cancellationToken))
|
||||
{
|
||||
var match = JsonSerializer.Deserialize<Match>(line, _jsonOptions)
|
||||
var match = JsonSerializer.Deserialize(line, PdftractJsonContext.Default.Match)
|
||||
?? throw new JsonException("Failed to deserialize Match");
|
||||
yield return match;
|
||||
}
|
||||
|
|
@ -111,10 +107,9 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
var args = BuildArgs("extract", "--metadata-only", source, options);
|
||||
var json = await InvokeAsync(source, args, cancellationToken);
|
||||
|
||||
var result = JsonSerializer.Deserialize<JsonElement>(json, _jsonOptions);
|
||||
var metadataElem = result.GetProperty("metadata");
|
||||
return JsonSerializer.Deserialize<Metadata>(metadataElem.GetRawText(), _jsonOptions)
|
||||
?? throw new JsonException("Failed to deserialize Metadata");
|
||||
var result = JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Document);
|
||||
if (result is null) throw new JsonException("Failed to deserialize Document");
|
||||
return result.Metadata;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
@ -133,7 +128,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
}
|
||||
|
||||
var json = await InvokeAsync(source, args, cancellationToken);
|
||||
return JsonSerializer.Deserialize<Fingerprint>(json, _jsonOptions)
|
||||
return JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Fingerprint)
|
||||
?? throw new JsonException("Failed to deserialize Fingerprint");
|
||||
}
|
||||
|
||||
|
|
@ -148,7 +143,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
args.AddRange(source.ToArgs());
|
||||
|
||||
var json = await InvokeAsync(source, args, cancellationToken);
|
||||
return JsonSerializer.Deserialize<Classification>(json, _jsonOptions)
|
||||
return JsonSerializer.Deserialize(json, PdftractJsonContext.Default.Classification)
|
||||
?? throw new JsonException("Failed to deserialize Classification");
|
||||
}
|
||||
|
||||
|
|
@ -161,7 +156,7 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
var receiptPath = path + ".receipt.json";
|
||||
var receiptJson = JsonSerializer.Serialize(receipt, _jsonOptions);
|
||||
var receiptJson = JsonSerializer.Serialize(receipt, PdftractJsonContext.Default.Receipt);
|
||||
await File.WriteAllTextAsync(receiptPath, receiptJson, cancellationToken);
|
||||
|
||||
try
|
||||
|
|
@ -174,6 +169,20 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
{
|
||||
return false;
|
||||
}
|
||||
finally
|
||||
{
|
||||
try
|
||||
{
|
||||
if (File.Exists(receiptPath))
|
||||
{
|
||||
File.Delete(receiptPath);
|
||||
}
|
||||
}
|
||||
catch
|
||||
{
|
||||
// Ignore cleanup errors
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
@ -229,17 +238,20 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
process.StartInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = _binaryPath,
|
||||
ArgumentList = { args },
|
||||
RedirectStandardOutput = true,
|
||||
RedirectStandardError = true,
|
||||
UseShellExecute = false
|
||||
};
|
||||
foreach (var arg in args)
|
||||
{
|
||||
process.StartInfo.ArgumentList.Add(arg);
|
||||
}
|
||||
|
||||
var output = new StringBuilder();
|
||||
var error = new StringBuilder();
|
||||
|
||||
process.OutputDataReceived += (_, e) => { if (e.Data != null) output.Append(e.Data); };
|
||||
process.ErrorDataReceived += (_, e) => { if (e.Data != null) error.Append(e.Data); };
|
||||
process.OutputDataReceived += (_, e) => { if (e.Data != null) { output.AppendLine(e.Data); } };
|
||||
process.ErrorDataReceived += (_, e) => { if (e.Data != null) { error.AppendLine(e.Data); } };
|
||||
|
||||
var tcs = new TaskCompletionSource<string>();
|
||||
|
||||
|
|
@ -281,17 +293,27 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
}
|
||||
};
|
||||
|
||||
process.EnableRaisingEvents = true;
|
||||
|
||||
if (!process.Start())
|
||||
{
|
||||
source?.Dispose();
|
||||
throw new InvalidOperationException("Failed to start pdftract process");
|
||||
}
|
||||
|
||||
process.BeginOutputReadLine();
|
||||
process.BeginErrorReadLine();
|
||||
|
||||
try
|
||||
{
|
||||
var result = await tcs.Task;
|
||||
return result;
|
||||
}
|
||||
finally
|
||||
{
|
||||
source?.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
private async IAsyncEnumerable<string> InvokeStreamAsync(
|
||||
Source source,
|
||||
|
|
@ -302,18 +324,20 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
process.StartInfo = new ProcessStartInfo
|
||||
{
|
||||
FileName = _binaryPath,
|
||||
ArgumentList = { args },
|
||||
RedirectStandardOutput = true,
|
||||
RedirectStandardError = true,
|
||||
UseShellExecute = false
|
||||
};
|
||||
foreach (var arg in args)
|
||||
{
|
||||
process.StartInfo.ArgumentList.Add(arg);
|
||||
}
|
||||
|
||||
var error = new StringBuilder();
|
||||
var outputLines = new System.Collections.Concurrent.ConcurrentQueue<string>();
|
||||
var streamComplete = new TaskCompletionSource<bool>();
|
||||
var processExitCode = 0;
|
||||
var processExited = false;
|
||||
|
||||
process.ErrorDataReceived += (_, e) => { if (e.Data != null) error.Append(e.Data); };
|
||||
process.ErrorDataReceived += (_, e) => { if (e.Data != null) { error.AppendLine(e.Data); } };
|
||||
|
||||
cancellationToken.Register(() =>
|
||||
{
|
||||
|
|
@ -330,14 +354,19 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
process.Exited += (_, _) =>
|
||||
{
|
||||
processExitCode = process.ExitCode;
|
||||
streamComplete.TrySetResult(true);
|
||||
processExited = true;
|
||||
};
|
||||
|
||||
process.EnableRaisingEvents = true;
|
||||
|
||||
if (!process.Start())
|
||||
{
|
||||
source.Dispose();
|
||||
throw new InvalidOperationException("Failed to start pdftract process");
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
using var reader = process.StandardOutput;
|
||||
process.BeginErrorReadLine();
|
||||
|
||||
|
|
@ -346,7 +375,6 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
{
|
||||
if (!string.IsNullOrWhiteSpace(line))
|
||||
{
|
||||
outputLines.Enqueue(line);
|
||||
yield return line;
|
||||
}
|
||||
}
|
||||
|
|
@ -363,6 +391,11 @@ public sealed partial class Pdftract : IAsyncDisposable, IDisposable
|
|||
throw PdftractException.FromExitCode(processExitCode, error.ToString());
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
source.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
private static string FindBinary(string? path)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -21,10 +21,15 @@ public abstract class Source
|
|||
public static Source FromPath(string path) => new PathSource(path);
|
||||
|
||||
/// <summary>
|
||||
/// Creates a Source from a URL.
|
||||
/// Creates a Source from a URL string.
|
||||
/// </summary>
|
||||
public static Source FromUrl(string url) => new UrlSource(url);
|
||||
|
||||
/// <summary>
/// Creates a Source from a URI.
/// </summary>
/// <param name="uri">The location of the PDF.</param>
/// <returns>A URL-backed <see cref="Source"/> carrying the URI in string form.</returns>
/// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
public static Source FromUri(Uri uri)
{
    ArgumentNullException.ThrowIfNull(uri);

    // Uri.ToString() returns the unescaped representation (e.g. "%20" becomes a space),
    // which can corrupt percent-encoded URLs. AbsoluteUri keeps the escaped form; it throws
    // for relative URIs, so those fall back to the string the caller originally supplied.
    return new UrlSource(uri.IsAbsoluteUri ? uri.AbsoluteUri : uri.OriginalString);
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a Source from a byte array.
|
||||
/// </summary>
|
||||
|
|
|
|||
|
|
@ -211,6 +211,14 @@ public class ConformanceTests : IAsyncLifetime
|
|||
Assert.NotNull(source);
|
||||
}
|
||||
|
||||
// Smoke test: Source.FromUri accepts an absolute HTTPS Uri and returns a non-null Source.
[Fact]
|
||||
public void SourceFromUri()
|
||||
{
|
||||
var uri = new Uri("https://example.com/doc.pdf");
|
||||
var source = Source.FromUri(uri);
|
||||
// Only non-nullness is asserted; the URL stored in the resulting source is not inspected.
Assert.NotNull(source);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void SourceFromBytes()
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue