pdftract/pdftract-dotnet/src/Pdftract/Pdftract.cs
jedarden 768b858c36 feat(pdftract-1w22d): implement .NET SDK subprocess wrapper
Complete implementation of the Pdftract NuGet package as a subprocess-
based SDK with async-first design using System.Diagnostics.Process and
System.Text.Json.

Implementation:
- All 9 contract methods (ExtractAsync, ExtractTextAsync, etc.) with sync
  wrappers in Pdftract.Sync.cs
- 8 exception types inheriting from PdftractException base class
- Source discriminated union (PathSource, UrlSource, BytesSource) with
  FromPath, FromUrl, FromUri, FromBytes factory methods
- C# record types for all models (Document, Page, Metadata, etc.)
- ExtractOptions, SearchOptions, HashOptions with PascalCase properties
- Source-generated JSON serialization via JsonContext for Native AOT
- IAsyncEnumerable streaming for NDJSON outputs
- CancellationToken propagation to Process.Kill(entireProcessTree: true)

Bug fixes:
- Fixed ArgumentList handling (was adding List as single element)
- Added source.Dispose() cleanup for BytesSource temporary files
- Added cleanup for VerifyReceiptAsync temporary receipt file
- Added process.EnableRaisingEvents for proper event handling
- Fixed output capture to include newlines between lines
- Changed to source-generated JSON (JsonContext) instead of reflection

Acceptance criteria:
- All 9 methods exposed as both async and sync variants
- All 8 exception classes inherit from PdftractException
- Models as C# records
- Supports net8.0 and net9.0
- CancellationToken terminates subprocess

Files modified:
- pdftract-dotnet/src/Pdftract/Pdftract.cs
- pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs
- pdftract-dotnet/src/Pdftract/Source/Source.cs
- pdftract-dotnet/src/Pdftract/Models/Document.cs
- pdftract-dotnet/src/Pdftract/Models/JsonContext.cs
- pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs
- pdftract-dotnet/README.md
- pdftract-dotnet/notes/pdftract-1w22d.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-22 19:50:57 -04:00

455 lines
14 KiB
C#

using System.Diagnostics;
using System.Text;
using System.Text.Json;
using Pdftract.Models;
namespace Pdftract;
/// <summary>
/// pdftract SDK client for .NET.
/// </summary>
public sealed partial class Pdftract : IAsyncDisposable, IDisposable
{
private readonly string _binaryPath;
private readonly JsonSerializerOptions _jsonOptions;
/// <summary>
/// Initializes a client bound to a resolved pdftract executable.
/// </summary>
/// <param name="binaryPath">
/// Location of the pdftract binary. When null or empty, the directories listed in PATH are searched.
/// </param>
/// <exception cref="FileNotFoundException">No pdftract binary could be located.</exception>
public Pdftract(string? binaryPath = null)
{
    // Right-hand side is evaluated left to right, so binary resolution still happens first.
    (_binaryPath, _jsonOptions) = (FindBinary(binaryPath), PdftractJsonContext.Default.Options);
}
/// <summary>
/// Runs <c>extract --json</c> against <paramref name="source"/> and deserializes the output
/// into a <see cref="Document"/>.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="options">Optional extraction flags appended to the command line.</param>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
/// <exception cref="JsonException">The output deserialized to null.</exception>
public async Task<Document> ExtractAsync(
    Source source,
    ExtractOptions? options = null,
    CancellationToken cancellationToken = default)
{
    var commandLine = BuildArgs("extract", "--json", source, options);
    var payload = await InvokeAsync(source, commandLine, cancellationToken);

    var document = JsonSerializer.Deserialize(payload, PdftractJsonContext.Default.Document);
    if (document is null)
    {
        throw new JsonException("Failed to deserialize Document");
    }

    return document;
}
/// <summary>
/// Runs <c>extract --text</c> against <paramref name="source"/> and returns the raw standard output.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="options">Optional extraction flags appended to the command line.</param>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
public async Task<string> ExtractTextAsync(
    Source source,
    ExtractOptions? options = null,
    CancellationToken cancellationToken = default)
{
    var commandLine = BuildArgs("extract", "--text", source, options);
    var text = await InvokeAsync(source, commandLine, cancellationToken);
    return text;
}
/// <summary>
/// Runs <c>extract --md</c> against <paramref name="source"/> and returns the raw standard output.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="options">Optional extraction flags appended to the command line.</param>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
public async Task<string> ExtractMarkdownAsync(
    Source source,
    ExtractOptions? options = null,
    CancellationToken cancellationToken = default)
{
    var commandLine = BuildArgs("extract", "--md", source, options);
    var markdown = await InvokeAsync(source, commandLine, cancellationToken);
    return markdown;
}
/// <summary>
/// Runs <c>extract --ndjson</c> against <paramref name="source"/> and yields one
/// <see cref="Page"/> per output line as it is produced.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="options">Optional extraction flags appended to the command line.</param>
/// <param name="cancellationToken">Token used to abort the enumeration.</param>
/// <exception cref="JsonException">A line deserialized to null.</exception>
public async IAsyncEnumerable<Page> ExtractStreamAsync(
    Source source,
    ExtractOptions? options = null,
    [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken = default)
{
    var commandLine = BuildArgs("extract", "--ndjson", source, options);
    var lines = InvokeStreamAsync(source, commandLine, cancellationToken);

    await foreach (var ndjsonLine in lines)
    {
        yield return JsonSerializer.Deserialize(ndjsonLine, PdftractJsonContext.Default.Page)
            ?? throw new JsonException("Failed to deserialize Page");
    }
}
/// <summary>
/// Runs <c>grep</c> with <paramref name="pattern"/> against <paramref name="source"/> and yields
/// one <see cref="Match"/> per output line.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="pattern">The pattern passed to the <c>grep</c> command.</param>
/// <param name="options">Optional search flags appended to the command line.</param>
/// <param name="cancellationToken">Token used to abort the enumeration.</param>
/// <exception cref="JsonException">A line deserialized to null.</exception>
public async IAsyncEnumerable<Match> SearchAsync(
    Source source,
    string pattern,
    SearchOptions? options = null,
    [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken = default)
{
    var commandLine = BuildArgs("grep", pattern, source, options);
    var lines = InvokeStreamAsync(source, commandLine, cancellationToken);

    await foreach (var ndjsonLine in lines)
    {
        yield return JsonSerializer.Deserialize(ndjsonLine, PdftractJsonContext.Default.Match)
            ?? throw new JsonException("Failed to deserialize Match");
    }
}
/// <summary>
/// Runs <c>extract --metadata-only</c> against <paramref name="source"/>, deserializes the output
/// as a <see cref="Document"/> and returns its <c>Metadata</c> member.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="options">Optional extraction flags appended to the command line.</param>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
/// <exception cref="JsonException">The output deserialized to null.</exception>
public async Task<Metadata> GetMetadataAsync(
    Source source,
    ExtractOptions? options = null,
    CancellationToken cancellationToken = default)
{
    var commandLine = BuildArgs("extract", "--metadata-only", source, options);
    var payload = await InvokeAsync(source, commandLine, cancellationToken);

    var document = JsonSerializer.Deserialize(payload, PdftractJsonContext.Default.Document)
        ?? throw new JsonException("Failed to deserialize Document");
    return document.Metadata;
}
/// <summary>
/// Runs <c>hash</c> against <paramref name="source"/> and deserializes the output into a
/// <see cref="Fingerprint"/>.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="options">Optional hashing flags appended after the source arguments.</param>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
/// <exception cref="JsonException">The output deserialized to null.</exception>
public async Task<Fingerprint> HashAsync(
    Source source,
    HashOptions? options = null,
    CancellationToken cancellationToken = default)
{
    var commandLine = new List<string>();
    commandLine.Add("hash");
    commandLine.AddRange(source.ToArgs());
    if (options is not null)
    {
        commandLine.AddRange(options.ToArgs());
    }

    var payload = await InvokeAsync(source, commandLine, cancellationToken);
    var fingerprint = JsonSerializer.Deserialize(payload, PdftractJsonContext.Default.Fingerprint);
    if (fingerprint is null)
    {
        throw new JsonException("Failed to deserialize Fingerprint");
    }

    return fingerprint;
}
/// <summary>
/// Runs <c>classify</c> against <paramref name="source"/> and deserializes the output into a
/// <see cref="Classification"/>.
/// </summary>
/// <param name="source">The PDF input.</param>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
/// <exception cref="JsonException">The output deserialized to null.</exception>
public async Task<Classification> ClassifyAsync(
    Source source,
    CancellationToken cancellationToken = default)
{
    var commandLine = new List<string>();
    commandLine.Add("classify");
    commandLine.AddRange(source.ToArgs());

    var payload = await InvokeAsync(source, commandLine, cancellationToken);
    var classification = JsonSerializer.Deserialize(payload, PdftractJsonContext.Default.Classification);
    if (classification is null)
    {
        throw new JsonException("Failed to deserialize Classification");
    }

    return classification;
}
/// <summary>
/// Verifies a cryptographic receipt for a PDF by running <c>verify-receipt</c>.
/// </summary>
/// <param name="path">Path of the PDF the receipt refers to.</param>
/// <param name="receipt">The receipt to verify; it is serialized to a scratch file for the binary to read.</param>
/// <param name="cancellationToken">Token used to abort the write or the invocation.</param>
/// <returns>
/// <c>true</c> when the binary exits successfully; <c>false</c> when the invocation raises
/// <see cref="ReceiptVerifyException"/>. Other failures propagate.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="path"/> or <paramref name="receipt"/> is null.
/// </exception>
public async Task<bool> VerifyReceiptAsync(
    string path,
    Receipt receipt,
    CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(path);
    ArgumentNullException.ThrowIfNull(receipt);

    var receiptJson = JsonSerializer.Serialize(receipt, PdftractJsonContext.Default.Receipt);

    // Use a uniquely named scratch file in the temp directory. Writing to "<path>.receipt.json"
    // would overwrite, and then delete, a file of that name the caller may already keep next
    // to the PDF, and would fail outright when the PDF lives in a read-only directory.
    var receiptPath = Path.Combine(
        Path.GetTempPath(),
        $"pdftract-{Guid.NewGuid():N}.receipt.json");

    try
    {
        // The write is inside the try so a failed or cancelled write is cleaned up as well.
        await File.WriteAllTextAsync(receiptPath, receiptJson, cancellationToken).ConfigureAwait(false);

        var args = new List<string> { "verify-receipt", path, receiptPath };
        await InvokeAsync(null, args, cancellationToken).ConfigureAwait(false);
        return true;
    }
    catch (ReceiptVerifyException)
    {
        return false;
    }
    finally
    {
        try
        {
            // File.Delete is a no-op when the file does not exist.
            File.Delete(receiptPath);
        }
        catch (IOException)
        {
            // Best effort: a leftover scratch file must not mask the verification result.
        }
        catch (UnauthorizedAccessException)
        {
            // Best effort, as above.
        }
    }
}
/// <summary>
/// Gets the path of the pdftract binary this client launches, as resolved at construction.
/// </summary>
public string BinaryPath
{
    get { return _binaryPath; }
}
/// <summary>
/// Runs the binary with <c>--version</c> and returns its standard output.
/// </summary>
/// <param name="cancellationToken">Token used to abort the invocation.</param>
public async Task<string> GetVersionAsync(CancellationToken cancellationToken = default)
{
    var commandLine = new List<string>();
    commandLine.Add("--version");

    var versionOutput = await InvokeAsync(null, commandLine, cancellationToken);
    return versionOutput;
}
/// <summary>
/// Assembles the argument list for an extraction command: the command name, its output flag,
/// the source arguments, then any option arguments.
/// </summary>
private static List<string> BuildArgs(
    string command,
    string flag,
    Source source,
    ExtractOptions? options)
{
    var commandLine = new List<string>();
    commandLine.Add(command);
    commandLine.Add(flag);
    commandLine.AddRange(source.ToArgs());

    if (options is null)
    {
        return commandLine;
    }

    commandLine.AddRange(options.ToArgs());
    return commandLine;
}
/// <summary>
/// Assembles the argument list for a search command: the command name, the pattern,
/// the source arguments, then any option arguments.
/// </summary>
private static List<string> BuildArgs(
    string command,
    string pattern,
    Source source,
    SearchOptions? options)
{
    var commandLine = new List<string>();
    commandLine.Add(command);
    commandLine.Add(pattern);
    commandLine.AddRange(source.ToArgs());

    if (options is null)
    {
        return commandLine;
    }

    commandLine.AddRange(options.ToArgs());
    return commandLine;
}
/// <summary>
/// Launches the pdftract binary with <paramref name="args"/>, waits for it to exit and returns
/// everything it wrote to standard output (each received line followed by a newline).
/// </summary>
/// <param name="source">
/// Optional source to dispose once the invocation has finished, whether it succeeded, failed or
/// was cancelled.
/// </param>
/// <param name="args">Arguments passed to the binary, one list element per argument.</param>
/// <param name="cancellationToken">
/// Token that aborts the wait; on cancellation the process tree is killed.
/// </param>
/// <returns>The captured standard output.</returns>
/// <exception cref="InvalidOperationException">The process could not be started.</exception>
/// <exception cref="OperationCanceledException">The token was cancelled.</exception>
/// <exception cref="PdftractException">
/// The process exited with a non-zero code; built from the exit code and captured standard error.
/// </exception>
private async Task<string> InvokeAsync(
    Source? source,
    List<string> args,
    CancellationToken cancellationToken)
{
    try
    {
        // Do not spawn a process for a request that is already cancelled.
        cancellationToken.ThrowIfCancellationRequested();

        using var process = new Process
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = _binaryPath,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                UseShellExecute = false
            }
        };
        foreach (var arg in args)
        {
            process.StartInfo.ArgumentList.Add(arg);
        }

        var output = new StringBuilder();
        var error = new StringBuilder();
        process.OutputDataReceived += (_, e) => { if (e.Data is not null) { output.AppendLine(e.Data); } };
        process.ErrorDataReceived += (_, e) => { if (e.Data is not null) { error.AppendLine(e.Data); } };

        if (!process.Start())
        {
            throw new InvalidOperationException("Failed to start pdftract process");
        }

        process.BeginOutputReadLine();
        process.BeginErrorReadLine();

        try
        {
            // Waiting here, rather than completing from the Exited event, ensures the
            // redirected streams have been fully drained before the buffers are read.
            await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
        }
        catch (OperationCanceledException)
        {
            try
            {
                process.Kill(entireProcessTree: true);
            }
            catch (Exception)
            {
                // Best effort: the process may already have exited. A kill failure must not
                // replace the cancellation that the caller is expecting.
            }

            throw;
        }

        if (process.ExitCode != 0)
        {
            throw PdftractException.FromExitCode(process.ExitCode, error.ToString());
        }

        return output.ToString();
    }
    finally
    {
        // Covers every exit path, including Process.Start throwing.
        source?.Dispose();
    }
}
/// <summary>
/// Launches the pdftract binary with <paramref name="args"/> and yields each non-blank line of
/// its standard output as it is read.
/// </summary>
/// <param name="source">
/// Source to dispose once enumeration ends, whether it completed, failed, was cancelled or was
/// abandoned by the consumer.
/// </param>
/// <param name="args">Arguments passed to the binary, one list element per argument.</param>
/// <param name="cancellationToken">
/// Token that aborts the read; on cancellation the process tree is killed.
/// </param>
/// <exception cref="InvalidOperationException">The process could not be started.</exception>
/// <exception cref="OperationCanceledException">The token was cancelled.</exception>
/// <exception cref="PdftractException">
/// The process exited with a non-zero code; built from the exit code and captured standard error.
/// </exception>
private async IAsyncEnumerable<string> InvokeStreamAsync(
    Source source,
    List<string> args,
    [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken)
{
    try
    {
        // Do not spawn a process for a request that is already cancelled.
        cancellationToken.ThrowIfCancellationRequested();

        using var process = new Process
        {
            StartInfo = new ProcessStartInfo
            {
                FileName = _binaryPath,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                UseShellExecute = false
            }
        };
        foreach (var arg in args)
        {
            process.StartInfo.ArgumentList.Add(arg);
        }

        var error = new StringBuilder();
        process.ErrorDataReceived += (_, e) => { if (e.Data is not null) { error.AppendLine(e.Data); } };

        if (!process.Start())
        {
            throw new InvalidOperationException("Failed to start pdftract process");
        }

        // Best-effort termination shared by the cancellation callback and the cleanup path.
        void KillIfRunning()
        {
            try
            {
                if (!process.HasExited)
                {
                    process.Kill(entireProcessTree: true);
                }
            }
            catch (Exception)
            {
                // The process may exit between the check and the kill; nothing useful to report.
            }
        }

        // Disposed before the process (reverse declaration order), so the callback can never
        // run against a disposed Process instance.
        using var killOnCancel = cancellationToken.Register(KillIfRunning);

        try
        {
            process.BeginErrorReadLine();
            using var reader = process.StandardOutput;

            string? line;
            while ((line = await reader.ReadLineAsync(cancellationToken).ConfigureAwait(false)) is not null)
            {
                if (!string.IsNullOrWhiteSpace(line))
                {
                    yield return line;
                }
            }

            await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);

            if (cancellationToken.IsCancellationRequested)
            {
                throw new OperationCanceledException("pdftract cancelled", cancellationToken);
            }

            // Read the exit code from the process itself: the Exited event is raised
            // asynchronously and may not have run yet when the wait returns.
            if (process.ExitCode != 0)
            {
                throw PdftractException.FromExitCode(process.ExitCode, error.ToString());
            }
        }
        finally
        {
            // Reached with a live process when the consumer stops enumerating early or an
            // exception escapes the loop; do not leave the child running.
            KillIfRunning();
        }
    }
    finally
    {
        // Covers every exit path, including Process.Start throwing.
        source.Dispose();
    }
}
/// <summary>
/// Resolves the pdftract executable to launch.
/// </summary>
/// <param name="path">
/// Explicit path to the binary. When null or empty, every directory listed in the PATH
/// environment variable is probed for <c>pdftract</c> (<c>pdftract.exe</c> on Windows).
/// </param>
/// <returns>The path of an existing file.</returns>
/// <exception cref="FileNotFoundException">
/// The explicit path does not exist, or no PATH directory contains the binary.
/// </exception>
private static string FindBinary(string? path)
{
    if (!string.IsNullOrEmpty(path))
    {
        if (!File.Exists(path))
        {
            throw new FileNotFoundException($"pdftract binary not found at {path}");
        }

        return path;
    }

    // OperatingSystem and Path.PathSeparator live in System / System.IO, so no
    // System.Runtime.InteropServices import is needed for the platform check.
    var fileName = OperatingSystem.IsWindows() ? "pdftract.exe" : "pdftract";
    var pathEnv = Environment.GetEnvironmentVariable("PATH") ?? string.Empty;

    foreach (var dir in pathEnv.Split(Path.PathSeparator, StringSplitOptions.RemoveEmptyEntries))
    {
        var candidate = Path.Combine(dir, fileName);
        if (File.Exists(candidate))
        {
            return candidate;
        }
    }

    throw new FileNotFoundException(
        "pdftract binary not found. Please install pdftract or provide the binary path.");
}
/// <summary>
/// Does nothing; present to satisfy <see cref="IDisposable"/>.
/// </summary>
public void Dispose()
{
    // No unmanaged resources to dispose
}
/// <summary>
/// Does nothing and completes synchronously; present to satisfy <see cref="IAsyncDisposable"/>.
/// </summary>
public ValueTask DisposeAsync()
{
    // No unmanaged resources to dispose
    return ValueTask.CompletedTask;
}
}