Skip to content

Commit

Permalink
Fix tests (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
markuspoerschke authored Dec 23, 2020
1 parent 696c053 commit c05f790
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 39 deletions.
15 changes: 12 additions & 3 deletions src/Extractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
use Extractum\Parser\DateParser;
use Extractum\Scorer\DocumentScorer;
use Extractum\StopWords\StopWords;
use ML\JsonLD;
use Symfony\Component\DomCrawler\Crawler;

/**
* @SuppressWarnings(PHPMD.CouplingBetweenObjects)
*/
final class Extractor
{
use ExtractJsonLdTrait;
Expand Down Expand Up @@ -76,6 +80,11 @@ public function extract(string $html, string $uri): Essence

$jsonLd = $this->extractJsonLd($document);

return $this->createEssence($document, $jsonLd, $topNode, $language, $scorerLanguage);
}

private function createEssence(Crawler $document, ?JsonLD\Document $jsonLd, Crawler $topNode, string $language, string $scorerLanguage): Essence
{
$date = $this->dateExtractor->extract($document, $jsonLd);
$essence = (new Essence())
->setDate($date)
Expand All @@ -85,11 +94,11 @@ public function extract(string $html, string $uri): Essence
->setLinks($this->linksExtractor->extract($topNode))
->setText($this->textExtractor->extract($topNode))
->setTitle($this->titleExtractor->extract($document))
->setFree($this->freeExtractor->extract($document, $jsonLd))
->setFree($this->freeExtractor->extract($jsonLd))
;

if ($date !== null && $language !== null) {
$essence->setParsedDate($this->dateParser->parse($essence->getDate(), $essence->getLanguage()));
if ($date !== null) {
$essence->setParsedDate($this->dateParser->parse($date, $scorerLanguage));
}

return $essence;
Expand Down
35 changes: 26 additions & 9 deletions src/Extractor/DateExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,16 +50,11 @@ final class DateExtractor extends AbstractExtractor

public function extract(Crawler $crawler, ?JsonLD\Document $jsonLd): ?string
{
if ($jsonLd !== null && ($graph = $jsonLd->getGraph()) !== null) {
foreach ($graph->getNodes() as $node) {
/** @var JsonLD\TypedValue|null $property */
$property = $node->getProperty('http://schema.org/datePublished');
if ($property instanceof JsonLD\TypedValue) {
return $property->getValue();
}
}
}
return $this->extractFromJsonLd($jsonLd) ?? $this->extractFromCrawler($crawler);
}

private function extractFromCrawler(Crawler $crawler): ?string
{
$candidates = $crawler->filter(self::CANDIDATE_SELECTOR);
/** @var DOMNode|DOMElement $node */
foreach ($candidates as $node) {
Expand Down Expand Up @@ -89,4 +84,26 @@ public function extract(Crawler $crawler, ?JsonLD\Document $jsonLd): ?string

return null;
}

/**
 * Looks up the publication date in the JSON-LD data of a page.
 *
 * Walks the nodes of the document graph in order and returns the value of
 * the first `http://schema.org/datePublished` property that is a typed
 * value. Properties of any other kind are ignored.
 *
 * @param JsonLD\Document|null $jsonLd Parsed JSON-LD document, or null when none is available.
 *
 * @return string|null The raw date value, or null when there is no document,
 *                     no graph, or no node with a typed datePublished value.
 */
private function extractFromJsonLd(?JsonLD\Document $jsonLd): ?string
{
    // A missing document or a missing graph both mean there is nothing to scan.
    $graph = $jsonLd !== null ? $jsonLd->getGraph() : null;
    $nodes = $graph !== null ? $graph->getNodes() : [];

    foreach ($nodes as $candidate) {
        $published = $candidate->getProperty('http://schema.org/datePublished');

        // Only typed values are accepted; anything else is skipped.
        if (!$published instanceof JsonLD\TypedValue) {
            continue;
        }

        return $published->getValue();
    }

    return null;
}
}
26 changes: 16 additions & 10 deletions src/Extractor/FreeExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,26 @@
namespace Extractum\Extractor;

use ML\JsonLD;
use Symfony\Component\DomCrawler\Crawler;

class FreeExtractor extends AbstractExtractor
{
public function extract(Crawler $crawler, ?JsonLD\Document $jsonLd): bool
public function extract(?JsonLD\Document $jsonLd): bool
{
if ($jsonLd !== null && ($graph = $jsonLd->getGraph()) !== null) {
$nodes = $graph->getNodes();
if (isset($nodes[0])) {
$property = $nodes[0]->getProperty('http://schema.org/isAccessibleForFree');
if ($property instanceof JsonLD\TypedValue) {
if (strtolower($property->getValue()) === 'false') {
return false;
}
if ($jsonLd === null) {
return true;
}

$graph = $jsonLd->getGraph();
if ($graph === null) {
return true;
}

$nodes = $graph->getNodes();
if (isset($nodes[0])) {
$property = $nodes[0]->getProperty('http://schema.org/isAccessibleForFree');
if ($property instanceof JsonLD\TypedValue) {
if (strtolower($property->getValue()) === 'false') {
return false;
}
}
}
Expand Down
40 changes: 26 additions & 14 deletions src/Extractor/ImageExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,36 @@ final class ImageExtractor extends AbstractExtractor

public function extract(Crawler $crawler, ?JsonLD\Document $jsonLd): ?string
{
if ($jsonLd !== null && ($graph = $jsonLd->getGraph()) !== null) {
$nodes = $graph->getNodes();
if (isset($nodes[0])) {
$images = $nodes[0]->getProperty('http://schema.org/image');
if ($images instanceof JsonLD\NodeInterface) {
$images = [$images];
}
return $this->extractFromJsonLd($jsonLd) ?? $this->extractAttribute('content', self::CANDIDATE_SELECTORS, $crawler);
}

private function extractFromJsonLd(?JsonLD\Document $jsonLd): ?string
{
if ($jsonLd === null) {
return null;
}

$graph = $jsonLd->getGraph();
if ($graph === null) {
return null;
}

$nodes = $graph->getNodes();
if (isset($nodes[0])) {
$images = $nodes[0]->getProperty('http://schema.org/image');
if ($images instanceof JsonLD\NodeInterface) {
$images = [$images];
}

/** @var JsonLD\NodeInterface $image */
foreach ($images ?? [] as $image) {
$url = $image->getProperty('http://schema.org/url');
if ($url instanceof JsonLD\NodeInterface) {
return $url->getId();
}
/** @var JsonLD\NodeInterface $image */
foreach ($images ?? [] as $image) {
$url = $image->getProperty('http://schema.org/url');
if ($url instanceof JsonLD\NodeInterface) {
return $url->getId();
}
}
}

return $this->extractAttribute('content', self::CANDIDATE_SELECTORS, $crawler);
return null;
}
}
4 changes: 2 additions & 2 deletions src/Parser/DateParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ public function parse(string $dateAsString, string $language = 'en'): ?DateTimeI
continue;
}

$formattedDateAsString = preg_replace($pattern, $options[self::REPLACEMENT], $dateAsString, 1);
$date = DateTimeImmutable::createFromFormat($options[self::FORMAT], $formattedDateAsString);
$formattedDate = preg_replace($pattern, $options[self::REPLACEMENT], $dateAsString, 1);
$date = DateTimeImmutable::createFromFormat($options[self::FORMAT], $formattedDate);

return $date ?: null;
}
Expand Down
2 changes: 1 addition & 1 deletion tests/Extractor/FreeExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class FreeExtractorTest extends TestCase
public function testFreeIsExtracted(string $html, $expected): void
{
$crawler = new Crawler($html, 'https://www.example.com');
$actual = (new FreeExtractor())->extract($crawler, $this->extractJsonLd($crawler));
$actual = (new FreeExtractor())->extract($this->extractJsonLd($crawler));

self::assertSame($expected, $actual);
}
Expand Down

0 comments on commit c05f790

Please sign in to comment.