pdftract/pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs
jedarden 768b858c36 feat(pdftract-1w22d): implement .NET SDK subprocess wrapper
Complete implementation of the Pdftract NuGet package as a subprocess-
based SDK with async-first design using System.Diagnostics.Process and
System.Text.Json.

Implementation:
- All 9 contract methods (ExtractAsync, ExtractTextAsync, etc.) with sync
  wrappers in Pdftract.Sync.cs
- 8 exception types inheriting from PdftractException base class
- Source discriminated union (PathSource, UrlSource, BytesSource) with
  FromPath, FromUrl, FromUri, FromBytes factory methods
- C# record types for all models (Document, Page, Metadata, etc.)
- ExtractOptions, SearchOptions, HashOptions with PascalCase properties
- Source-generated JSON serialization via JsonContext for Native AOT
- IAsyncEnumerable streaming for NDJSON outputs
- CancellationToken propagation to Process.Kill(entireProcessTree: true)

Bug fixes:
- Fixed ArgumentList handling (was adding List as single element)
- Added source.Dispose() cleanup for BytesSource temporary files
- Added cleanup for VerifyReceiptAsync temporary receipt file
- Added process.EnableRaisingEvents for proper event handling
- Fixed output capture to include newlines between lines
- Changed to source-generated JSON (JsonContext) instead of reflection

Acceptance criteria:
- All 9 methods exposed as both async and sync variants
- All 8 exception classes inherit from PdftractException
- Models as C# records
- Supports net8.0 and net9.0
- CancellationToken terminates subprocess

Files modified:
- pdftract-dotnet/src/Pdftract/Pdftract.cs
- pdftract-dotnet/src/Pdftract/Pdftract.Sync.cs
- pdftract-dotnet/src/Pdftract/Source/Source.cs
- pdftract-dotnet/src/Pdftract/Models/Document.cs
- pdftract-dotnet/src/Pdftract/Models/JsonContext.cs
- pdftract-dotnet/tests/Pdftract.Tests/ConformanceTests.cs
- pdftract-dotnet/README.md
- pdftract-dotnet/notes/pdftract-1w22d.md

Co-Authored-By: Claude Code <noreply@anthropic.com>
2026-05-22 19:50:57 -04:00

235 lines
8.3 KiB
C#

using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using Pdftract.Models;
namespace Pdftract;
/// <summary>
/// Blocking counterparts of the asynchronous Pdftract API.
/// </summary>
/// <remarks>
/// Every member forwards to its <c>*Async</c> sibling with <see cref="CancellationToken.None"/>,
/// so a call started through this surface cannot be cancelled by the caller.
/// Avoid these members in production code that is already asynchronous:
/// blocking on asynchronous work can lead to thread-pool starvation.
/// </remarks>
public sealed partial class Pdftract
{
    /// <summary>
    /// Blocking variant of <see cref="ExtractAsync"/>: extracts structured data from a PDF.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional extraction options; <c>null</c> is forwarded as-is.</param>
    /// <returns>The <see cref="Document"/> produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Document Extract(Source source, ExtractOptions? options = null)
        => ExtractAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="ExtractTextAsync"/>: extracts plain text from a PDF.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional extraction options; <c>null</c> is forwarded as-is.</param>
    /// <returns>The text produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public string ExtractText(Source source, ExtractOptions? options = null)
        => ExtractTextAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="ExtractMarkdownAsync"/>: extracts markdown-formatted
    /// text from a PDF.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional extraction options; <c>null</c> is forwarded as-is.</param>
    /// <returns>The markdown text produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public string ExtractMarkdown(Source source, ExtractOptions? options = null)
        => ExtractMarkdownAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="ExtractStreamAsync"/>: exposes the pages of a PDF
    /// as an ordinary enumerable sequence.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional extraction options; <c>null</c> is forwarded as-is.</param>
    /// <returns>
    /// The async page stream adapted through <c>ToBlockingEnumerable</c>.
    /// </returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. Unlike the single-value wrappers, this
    /// member does not wait for a result itself; it returns the adapted sequence.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public IEnumerable<Page> ExtractStream(Source source, ExtractOptions? options = null)
        => ExtractStreamAsync(source, options, CancellationToken.None)
            .ToBlockingEnumerable();

    /// <summary>
    /// Blocking variant of <see cref="SearchAsync"/>: searches a PDF for a pattern.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="pattern">The search pattern, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional search options; <c>null</c> is forwarded as-is.</param>
    /// <returns>
    /// The async match stream adapted through <c>ToBlockingEnumerable</c>.
    /// </returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. Unlike the single-value wrappers, this
    /// member does not wait for a result itself; it returns the adapted sequence.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public IEnumerable<Match> Search(Source source, string pattern, SearchOptions? options = null)
        => SearchAsync(source, pattern, options, CancellationToken.None)
            .ToBlockingEnumerable();

    /// <summary>
    /// Blocking variant of <see cref="GetMetadataAsync"/>: reads the metadata of a PDF.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional extraction options; <c>null</c> is forwarded as-is.</param>
    /// <returns>The <see cref="Metadata"/> produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Metadata GetMetadata(Source source, ExtractOptions? options = null)
        => GetMetadataAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="HashAsync"/>: computes the fingerprint hash of a PDF.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <param name="options">Optional hashing options; <c>null</c> is forwarded as-is.</param>
    /// <returns>The <see cref="Fingerprint"/> produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Fingerprint Hash(Source source, HashOptions? options = null)
        => HashAsync(source, options, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="ClassifyAsync"/>: classifies a PDF document.
    /// </summary>
    /// <param name="source">The PDF source, forwarded unchanged to the async method.</param>
    /// <returns>The <see cref="Classification"/> produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public Classification Classify(Source source)
        => ClassifyAsync(source, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="VerifyReceiptAsync"/>: verifies a cryptographic
    /// receipt for a PDF.
    /// </summary>
    /// <param name="path">The path argument, forwarded unchanged to the async method.</param>
    /// <param name="receipt">The receipt to verify, forwarded unchanged to the async method.</param>
    /// <returns>The boolean verification result produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public bool VerifyReceipt(string path, Receipt receipt)
        => VerifyReceiptAsync(path, receipt, CancellationToken.None).GetAwaiter().GetResult();

    /// <summary>
    /// Blocking variant of <see cref="GetVersionAsync"/>: returns the pdftract binary version.
    /// </summary>
    /// <returns>The version string produced by the async method.</returns>
    /// <remarks>
    /// Intended for legacy, non-async call sites. The calling thread is blocked until the
    /// underlying task finishes.
    /// </remarks>
    [SuppressMessage("Usage", "CA1849:Call async methods when in an async context", Justification = "Intentional sync wrapper")]
    public string GetVersion()
        => GetVersionAsync(CancellationToken.None).GetAwaiter().GetResult();
}
/// <summary>
/// File-local adapter that exposes an <see cref="IAsyncEnumerable{T}"/> as a blocking
/// <see cref="IEnumerable{T}"/>.
/// </summary>
/// <remarks>
/// Every asynchronous step (<c>MoveNextAsync</c>, <c>DisposeAsync</c>) is started through
/// <see cref="Task.Run(Func{Task})"/> and then waited on synchronously, so the step starts
/// on a thread-pool thread rather than on the caller's thread.
/// NOTE(review): presumably this is meant to avoid deadlocks when the caller has a
/// synchronization context — confirm against the async iterators being wrapped.
/// </remarks>
file static class AsyncEnumerableExtensions
{
    /// <summary>
    /// Wraps an asynchronous sequence in a lazily evaluated, blocking sequence.
    /// </summary>
    /// <typeparam name="T">Element type of the sequence.</typeparam>
    /// <param name="asyncEnumerable">The asynchronous sequence to adapt.</param>
    /// <returns>
    /// An enumerable whose <c>GetEnumerator</c> obtains a fresh async enumerator from
    /// <paramref name="asyncEnumerable"/> on each call.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="asyncEnumerable"/> is <c>null</c>.
    /// </exception>
    public static IEnumerable<T> ToBlockingEnumerable<T>(this IAsyncEnumerable<T> asyncEnumerable)
    {
        ArgumentNullException.ThrowIfNull(asyncEnumerable);
        return new BlockingAsyncEnumerable<T>(asyncEnumerable);
    }

    /// <summary>
    /// Enumerable facade; all real work happens in <see cref="BlockingAsyncEnumerator{T}"/>.
    /// </summary>
    private sealed class BlockingAsyncEnumerable<T>(IAsyncEnumerable<T> source) : IEnumerable<T>
    {
        /// <summary>
        /// Starts a new asynchronous enumeration (with no cancellation token) and wraps it.
        /// </summary>
        public IEnumerator<T> GetEnumerator()
            => new BlockingAsyncEnumerator<T>(source.GetAsyncEnumerator(CancellationToken.None));

        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
            => GetEnumerator();
    }

    /// <summary>
    /// Synchronous enumerator that drives an <see cref="IAsyncEnumerator{T}"/> by blocking
    /// on each asynchronous step.
    /// </summary>
    private sealed class BlockingAsyncEnumerator<T>(IAsyncEnumerator<T> source) : IEnumerator<T>
    {
        private T? _current;
        private bool _disposed;

        /// <summary>
        /// The element fetched by the most recent successful <see cref="MoveNext"/>.
        /// </summary>
        public T Current => _current!;

        object System.Collections.IEnumerator.Current => Current!;

        /// <summary>
        /// Advances the underlying async enumerator and blocks until the step completes.
        /// </summary>
        /// <returns>
        /// <c>true</c> when a new element is available in <see cref="Current"/>;
        /// <c>false</c> at the end of the sequence or after <see cref="Dispose"/>.
        /// </returns>
        /// <remarks>
        /// A failure inside the async sequence is rethrown as the original exception
        /// (not wrapped in <see cref="AggregateException"/>).
        /// </remarks>
        public bool MoveNext()
        {
            if (_disposed)
            {
                return false;
            }

            // GetAwaiter().GetResult() rethrows the original exception with its stack
            // trace intact, so the failing frame inside the async sequence stays visible.
            bool advanced = Task
                .Run(async () => await source.MoveNextAsync().ConfigureAwait(false))
                .GetAwaiter()
                .GetResult();

            if (advanced)
            {
                _current = source.Current;
            }

            return advanced;
        }

        /// <summary>
        /// Always throws; an async enumerator cannot be rewound.
        /// </summary>
        /// <exception cref="NotSupportedException">Always.</exception>
        public void Reset()
        {
            throw new NotSupportedException("Reset is not supported on async enumerators");
        }

        /// <summary>
        /// Disposes the underlying async enumerator, blocking until disposal completes.
        /// Subsequent calls are no-ops.
        /// </summary>
        /// <remarks>
        /// A failure raised by the underlying <c>DisposeAsync</c> is propagated as the
        /// original exception so cleanup errors are not silently lost.
        /// </remarks>
        public void Dispose()
        {
            if (_disposed)
            {
                return;
            }

            // Mark first: even if DisposeAsync fails, a second Dispose must not run it again
            // and MoveNext must not touch a half-disposed enumerator.
            _disposed = true;

            // Started the same way as MoveNext so both async steps behave consistently.
            Task
                .Run(async () => await source.DisposeAsync().ConfigureAwait(false))
                .GetAwaiter()
                .GetResult();
        }
    }
}