mirror of
https://github.com/OpenXE-org/OpenXE.git
synced 2025-01-26 12:41:13 +01:00
68 lines
1.7 KiB
PHP
68 lines
1.7 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace Xentral\Modules\HocrParser\Service;
|
|
|
|
use DOMDocument;
|
|
use Xentral\Modules\HocrParser\Data\BoundingBox;
|
|
|
|
final class HocrParser
{
    /** Value of the `class` attribute that marks a single recognized word. */
    private const WORD_CLASS = 'ocrx_word';

    /**
     * Finds the `bbox x0 y0 x1 y1` property inside a title attribute.
     *
     * Title properties are separated by semicolons, so `bbox` is accepted
     * either at the very start or directly after a semicolon.
     */
    private const BBOX_PATTERN = '/(?:^|;)\s*bbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/';

    /**
     * Collects one bounding box per recognized word of an hOCR document.
     *
     * Only `span` elements whose `class` attribute equals `ocrx_word` are
     * considered. A span is skipped when its trimmed text is empty or when
     * its `title` attribute carries no usable `bbox` property.
     *
     * @param string $content hOCR document (must be parseable as XML)
     *
     * @return array|BoundingBox[] Boxes in document order; empty if the content
     *                             is empty, cannot be parsed or has no words
     */
    public function parse(string $content): array
    {
        // loadXML() cannot handle an empty string (it raises an error instead
        // of yielding an empty document), and there is nothing to parse anyway.
        if ($content === '') {
            return [];
        }

        $dom = new DOMDocument();
        if ($dom->loadXML($content) === false) {
            return [];
        }

        $boxes = [];

        /** @var \DOMElement $span */
        foreach ($dom->getElementsByTagName('span') as $span) {
            if ($span->getAttribute('class') !== self::WORD_CLASS) {
                continue;
            }

            // Skip boxes without text; this can happen with barcode fonts.
            // Strict comparison on purpose: empty() would also drop the word "0".
            $text = trim((string)$span->nodeValue);
            if ($text === '') {
                continue;
            }

            // Without coordinates no box can be built, so skip the word.
            $coords = $this->extractCoordinates($span->getAttribute('title'));
            if ($coords === null) {
                continue;
            }

            // Collect boxes that carry text.
            $boxes[] = new BoundingBox(
                $coords['tlx'],
                $coords['tly'],
                $coords['brx'],
                $coords['bry'],
                ['text' => $text]
            );
        }

        return $boxes;
    }

    /**
     * Reads the bounding box coordinates from a title attribute.
     *
     * Example input: `bbox 599 2737 743 2758; x_wconf 96`
     *
     * @param string $text Content of the `title` attribute
     *
     * @return array|null Map with the integer keys `tlx`, `tly`, `brx` and `bry`,
     *                    or null if no complete `bbox` property is present
     */
    private function extractCoordinates(string $text): ?array
    {
        if (preg_match(self::BBOX_PATTERN, $text, $matches) !== 1) {
            return null;
        }

        return [
            'tlx' => (int)$matches[1], // Top left X-Coordinate
            'tly' => (int)$matches[2], // Top left Y
            'brx' => (int)$matches[3], // Bottom right X
            'bry' => (int)$matches[4], // Bottom right Y
        ];
    }
}
|