2021-05-21 08:49:41 +02:00

68 lines
1.7 KiB
PHP

<?php
declare(strict_types=1);
namespace Xentral\Modules\HocrParser\Service;
use DOMDocument;
use Xentral\Modules\HocrParser\Data\BoundingBox;
/**
 * Extracts word-level bounding boxes from an hOCR document.
 *
 * Every `<span class="ocrx_word">` element that has non-blank text and a
 * parsable `bbox` entry in its `title` attribute is turned into one BoundingBox.
 */
final class HocrParser
{
    /**
     * Parses an hOCR document into a list of word bounding boxes.
     *
     * @param string $content hOCR document; it is loaded as XML
     *
     * @return array|BoundingBox[] One box per usable word span, in document order. Empty when
     *                             the content is empty, cannot be loaded, or has no usable word.
     */
    public function parse(string $content): array
    {
        // DOMDocument::loadXML() refuses an empty source (it throws a ValueError on PHP 8),
        // and an empty document cannot contain any words anyway.
        if ($content === '') {
            return [];
        }

        $dom = new DOMDocument();
        if ($dom->loadXML($content) === false) {
            // Nothing was loaded, so there is nothing to extract.
            return [];
        }

        $boxes = [];

        /** @var \DOMElement $span */
        foreach ($dom->getElementsByTagName('span') as $span) {
            if ($span->getAttribute('class') !== 'ocrx_word') {
                continue;
            }

            // Skip boxes without text; this can happen with barcode fonts.
            // Compared against '' instead of using empty(), because empty('0') is true
            // and would drop a recognised word "0".
            $text = trim((string)$span->nodeValue);
            if ($text === '') {
                continue;
            }

            // Skip words whose title carries no usable bounding box instead of
            // indexing into a failed lookup result.
            $coords = $this->extractCoordinates($span->getAttribute('title'));
            if ($coords === null) {
                continue;
            }

            // Collect boxes that have text.
            $boxes[] = new BoundingBox(
                $coords['tlx'],
                $coords['tly'],
                $coords['brx'],
                $coords['bry'],
                ['text' => $text]
            );
        }

        return $boxes;
    }

    /**
     * Reads the `bbox` property out of the value of an hOCR `title` attribute.
     *
     * Example input: "bbox 599 2737 743 2758; x_wconf 96". The `bbox` property is
     * accepted at the start of the string or after any `;` separator.
     *
     * @param string $text Value of the `title` attribute
     *
     * @return int[]|null Coordinates keyed by `tlx`, `tly`, `brx`, `bry`, or null when the
     *                    text contains no `bbox` property with four integer values
     */
    private function extractCoordinates(string $text)
    {
        $pattern = '/(?:^|;)\s*bbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/';
        if (preg_match($pattern, $text, $matches) !== 1) {
            return null;
        }

        return [
            'tlx' => (int)$matches[1], // Top left X
            'tly' => (int)$matches[2], // Top left Y
            'brx' => (int)$matches[3], // Bottom right X
            'bry' => (int)$matches[4], // Bottom right Y
        ];
    }
}