pdftract/sdk/php/src/Pdftract/Models/Span.php
jedarden 246befd8d1 feat(pdftract-2m3gl): implement PHP SDK with Packagist publishing
- Add jedarden/pdftract Composer package (sdk/php/)
- Implement Client.php with proc_open subprocess execution
- Add PSR-3 LoggerInterface integration (defaults to NullLogger)
- Add 9 contract methods: extract, extractText, extractMarkdown, extractStream, search, getMetadata, hash, classify, verifyReceipt
- Add readonly model classes: Document, Page, Metadata, Fingerprint, Classification, Match, Receipt
- Add exception classes: PdftractException base + 8 subclasses
- Add PHPUnit conformance test suite
- Add phpunit.xml configuration
- Add composer.json with jedarden/pdftract package name
- Add .ci/argo-workflows/pdftract-php-publish.yaml (Packagist auto-discovery from git tags)

Also includes Ruby SDK scaffold from parallel workflow.

Closes pdftract-2m3gl
2026-06-01 10:27:03 -04:00

181 lines
4.7 KiB
PHP

<?php
declare(strict_types=1);
namespace Jedarden\Pdftract\Models;
/**
* JSON representation of a text span
*
* A span is the smallest unit of extracted text, representing a
* contiguous run of text with consistent font and styling.
*/
class Span
{
    /**
     * Keys that fromArray() requires to be present and non-null.
     *
     * These correspond to the non-nullable properties that have no default.
     */
    private const REQUIRED_KEYS = ['text', 'bbox', 'font', 'size'];

    /**
     * The extracted text content
     */
    public string $text;

    /**
     * Bounding box in PDF user-space points
     *
     * Format: [x0, y0, x1, y1] where (x0, y0) is the bottom-left
     * corner and (x1, y1) is the top-right corner.
     *
     * @var array<float>
     */
    public array $bbox;

    /**
     * Font name or identifier
     */
    public string $font;

    /**
     * Font size in points
     */
    public float $size;

    /**
     * Fill color as CSS hex string (e.g., "#1a1a1a"), or null if not expressible as RGB
     *
     * Null for spot colors, patterns, or complex color spaces that cannot be
     * accurately represented as RGB hex.
     */
    public ?string $color = null;

    /**
     * PDF Tr operator value (0-7) indicating the text rendering mode
     *
     * 0 = fill, 1 = stroke, 2 = fill then stroke, 3 = invisible,
     * 4 = fill to clip, 5 = stroke to clip, 6 = fill then stroke to clip,
     * 7 = clip.
     */
    public ?int $rendering_mode = null;

    /**
     * Optional confidence score (0.0 to 1.0)
     *
     * This field is present when OCR is used or when the extraction
     * has uncertainty about the text. When confidence is not applicable,
     * this field is null.
     */
    public ?float $confidence = null;

    /**
     * Source of the confidence/text extraction
     *
     * One of: "vector" (native font decoding), "ocr" (pure OCR),
     * "ocr-assisted" (OCR + vector correction), "ocr-fallback" (region-level fallback),
     * "repaired" (text was repaired via heuristics).
     */
    public ?string $confidence_source = null;

    /**
     * BCP-47 language tag if detected, otherwise null
     *
     * Examples: "en", "en-US", "zh-Hans". Null when language detection
     * is not available or not applicable.
     */
    public ?string $lang = null;

    /**
     * Set of style flags applied to this span
     *
     * Possible values: "bold", "italic", "smallcaps", "subscript", "superscript"
     *
     * @var array<string>
     */
    public array $flags = [];

    /**
     * Optional cryptographic receipt for verification
     *
     * This field is present when `--receipts=lite` or `--receipts=svg`
     * is enabled. When receipts are disabled, the field is null.
     */
    public ?Receipt $receipt = null;

    /**
     * Column index (0-based) assigned by Phase 4.3 column detection
     *
     * This field is null for spans outside any detected column
     * (e.g., full-width headings, inter-column gaps).
     */
    public ?int $column = null;

    /**
     * Create Span from JSON array
     *
     * The keys "text", "bbox", "font" and "size" are required and must be
     * non-null. Every other key is optional: a missing or null value leaves
     * the corresponding property at its default (null, or [] for "flags").
     * A non-null "receipt" entry is passed to Receipt::fromArray().
     *
     * @param array<string,mixed> $data JSON data
     * @return self
     *
     * @throws \InvalidArgumentException if a required key is missing or null
     * @throws \TypeError if a value does not match the declared type of the
     *                    property it is assigned to
     */
    public static function fromArray(array $data): self
    {
        // Fail early with a message naming the missing key, instead of an
        // "undefined array key" warning followed by a TypeError on assignment.
        foreach (self::REQUIRED_KEYS as $key) {
            if (!isset($data[$key])) {
                throw new \InvalidArgumentException(
                    sprintf('Span data is missing required key "%s"', $key)
                );
            }
        }

        $span = new self();
        $span->text = $data['text'];
        $span->bbox = $data['bbox'];
        $span->font = $data['font'];
        // An integer size (e.g. JSON `12`) is widened to float by the typed property.
        $span->size = $data['size'];

        $span->color = $data['color'] ?? null;
        $span->rendering_mode = $data['rendering_mode'] ?? null;
        $span->confidence = $data['confidence'] ?? null;
        $span->confidence_source = $data['confidence_source'] ?? null;
        $span->lang = $data['lang'] ?? null;
        $span->flags = $data['flags'] ?? [];
        $span->column = $data['column'] ?? null;

        // isset() is already false for a null value, so no extra null check is needed.
        if (isset($data['receipt'])) {
            $span->receipt = Receipt::fromArray($data['receipt']);
        }

        return $span;
    }

    /**
     * Convert to JSON array
     *
     * "text", "bbox", "font", "size" and "flags" are always emitted; the
     * nullable fields are emitted only when they are not null. The required
     * properties must have been assigned before calling this method.
     *
     * @return array<string,mixed>
     */
    public function toArray(): array
    {
        $data = [
            'text' => $this->text,
            'bbox' => $this->bbox,
            'font' => $this->font,
            'size' => $this->size,
            'flags' => $this->flags,
        ];

        // Listed in output order; null entries are skipped below.
        $optional = [
            'color' => $this->color,
            'rendering_mode' => $this->rendering_mode,
            'confidence' => $this->confidence,
            'confidence_source' => $this->confidence_source,
            'lang' => $this->lang,
            'column' => $this->column,
        ];
        foreach ($optional as $key => $value) {
            if ($value !== null) {
                $data[$key] = $value;
            }
        }

        // The receipt goes last and is serialized through its own toArray().
        if ($this->receipt !== null) {
            $data['receipt'] = $this->receipt->toArray();
        }

        return $data;
    }
}