Fix bin/publish: copy docs.dist from project root

Fix bin/publish: use correct .env path for rspade_system
Fix bin/publish script: prevent grep exit code 1 from terminating script

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
root
2025-10-21 02:08:33 +00:00
commit f6fac6c4bc
79758 changed files with 10547827 additions and 0 deletions

View File

@@ -0,0 +1,40 @@
<?php
namespace Spatie\Crawler\CrawlObservers;
use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\UriInterface;
/**
 * Base class for crawl lifecycle observers.
 *
 * Subclasses override the hooks they care about; every hook has an empty
 * default body, so observers only need to implement the events they use.
 * Observers are invoked by the Crawler via a CrawlObserverCollection.
 */
abstract class CrawlObserver
{
    /**
     * Called when the crawler will crawl the url.
     *
     * @param UriInterface $url      The URL that is about to be crawled.
     * @param string|null  $linkText Anchor text of the link that led here, if any.
     */
    public function willCrawl(UriInterface $url, ?string $linkText): void {}

    /**
     * Called when the crawler has crawled the given url successfully.
     *
     * @param UriInterface      $url        The URL that was crawled.
     * @param ResponseInterface $response   The response that was received.
     * @param UriInterface|null $foundOnUrl The page on which the URL was found.
     * @param string|null       $linkText   Anchor text of the link that led here, if any.
     */
    public function crawled(
        UriInterface $url,
        ResponseInterface $response,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void {}

    /**
     * Called when the crawler had a problem crawling the given url.
     *
     * @param UriInterface      $url              The URL that failed.
     * @param RequestException  $requestException The exception describing the failure.
     * @param UriInterface|null $foundOnUrl       The page on which the URL was found.
     * @param string|null       $linkText         Anchor text of the link that led here, if any.
     */
    public function crawlFailed(
        UriInterface $url,
        RequestException $requestException,
        ?UriInterface $foundOnUrl = null,
        ?string $linkText = null,
    ): void {}

    /**
     * Called when the crawl has ended.
     */
    public function finishedCrawling(): void {}
}

View File

@@ -0,0 +1,97 @@
<?php
namespace Spatie\Crawler\CrawlObservers;
use ArrayAccess;
use GuzzleHttp\Exception\RequestException;
use Iterator;
use Psr\Http\Message\ResponseInterface;
use Spatie\Crawler\CrawlUrl;
/**
 * Ordered collection of crawl observers.
 *
 * Implements ArrayAccess and Iterator so the crawler can both index into the
 * observers and loop over them. The crawled()/crawlFailed() helpers fan a
 * single event out to every registered observer.
 */
class CrawlObserverCollection implements ArrayAccess, Iterator
{
    /** Cursor for Iterator support; always reset to 0 on construction. */
    protected int $position;

    /**
     * @param array $observers Initial observers, indexed numerically.
     */
    public function __construct(protected array $observers = [])
    {
        $this->position = 0;
    }

    /**
     * Append an observer to the collection.
     */
    public function addObserver(CrawlObserver $observer): void
    {
        $this->observers[] = $observer;
    }

    /**
     * Notify every observer that a URL was crawled successfully.
     */
    public function crawled(CrawlUrl $crawlUrl, ResponseInterface $response): void
    {
        foreach ($this->observers as $observer) {
            $observer->crawled(
                $crawlUrl->url,
                $response,
                $crawlUrl->foundOnUrl,
                $crawlUrl->linkText,
            );
        }
    }

    /**
     * Notify every observer that crawling a URL failed.
     */
    public function crawlFailed(CrawlUrl $crawlUrl, RequestException $exception): void
    {
        foreach ($this->observers as $observer) {
            $observer->crawlFailed(
                $crawlUrl->url,
                $exception,
                $crawlUrl->foundOnUrl,
                $crawlUrl->linkText,
            );
        }
    }

    /** @see ArrayAccess::offsetExists() */
    public function offsetExists(mixed $offset): bool
    {
        return isset($this->observers[$offset]);
    }

    /** Returns null for unknown offsets instead of raising a notice. */
    public function offsetGet(mixed $offset): mixed
    {
        return $this->observers[$offset] ?? null;
    }

    /** A null offset appends, mirroring native array semantics. */
    public function offsetSet(mixed $offset, mixed $value): void
    {
        if ($offset === null) {
            $this->observers[] = $value;

            return;
        }

        $this->observers[$offset] = $value;
    }

    /** @see ArrayAccess::offsetUnset() */
    public function offsetUnset(mixed $offset): void
    {
        unset($this->observers[$offset]);
    }

    /** @see Iterator::rewind() */
    public function rewind(): void
    {
        $this->position = 0;
    }

    /** @see Iterator::valid() */
    public function valid(): bool
    {
        return isset($this->observers[$this->position]);
    }

    /** @see Iterator::current() */
    public function current(): mixed
    {
        return $this->observers[$this->position];
    }

    /** @see Iterator::key() */
    public function key(): mixed
    {
        return $this->position;
    }

    /** @see Iterator::next() */
    public function next(): void
    {
        $this->position++;
    }
}

View File

@@ -0,0 +1,13 @@
<?php
namespace Spatie\Crawler\CrawlProfiles;
use Psr\Http\Message\UriInterface;
/**
 * Crawl profile that accepts every URL unconditionally.
 *
 * This is the crawler's default profile (see Crawler::__construct()).
 */
class CrawlAllUrls extends CrawlProfile
{
    /**
     * Every URL should be crawled.
     */
    public function shouldCrawl(UriInterface $url): bool
    {
        return true;
    }
}

View File

@@ -0,0 +1,25 @@
<?php
namespace Spatie\Crawler\CrawlProfiles;
use GuzzleHttp\Psr7\Uri;
use Psr\Http\Message\UriInterface;
/**
 * Crawl profile that only accepts URLs on the exact same host as the base URL.
 *
 * Subdomains are NOT accepted; use CrawlSubdomains for that.
 */
class CrawlInternalUrls extends CrawlProfile
{
    /** Base URL whose host defines "internal". */
    protected mixed $baseUrl;

    /**
     * @param UriInterface|string $baseUrl Strings are wrapped in a Guzzle Uri.
     */
    public function __construct($baseUrl)
    {
        $this->baseUrl = $baseUrl instanceof UriInterface
            ? $baseUrl
            : new Uri($baseUrl);
    }

    /**
     * Accept the URL only when its host matches the base host exactly.
     */
    public function shouldCrawl(UriInterface $url): bool
    {
        return $url->getHost() === $this->baseUrl->getHost();
    }
}

View File

@@ -0,0 +1,10 @@
<?php
namespace Spatie\Crawler\CrawlProfiles;
use Psr\Http\Message\UriInterface;
/**
 * A crawl profile decides, per URL, whether the crawler should visit it.
 *
 * Implementations are consulted both when enqueueing discovered links and
 * when dequeuing URLs for crawling (see Crawler::addToCrawlQueue() and
 * Crawler::getCrawlRequests()).
 */
abstract class CrawlProfile
{
    /**
     * Determine whether the given URL should be crawled.
     */
    abstract public function shouldCrawl(UriInterface $url): bool;
}

View File

@@ -0,0 +1,30 @@
<?php
namespace Spatie\Crawler\CrawlProfiles;
use GuzzleHttp\Psr7\Uri;
use Psr\Http\Message\UriInterface;
/**
 * Crawl profile that accepts the base host and any of its subdomains.
 */
class CrawlSubdomains extends CrawlProfile
{
    /** Base URL whose host anchors the subdomain check. */
    protected mixed $baseUrl;

    /**
     * @param UriInterface|string $baseUrl Strings are wrapped in a Guzzle Uri.
     */
    public function __construct($baseUrl)
    {
        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        $this->baseUrl = $baseUrl;
    }

    /**
     * Accept the URL when it lives on the base host or one of its subdomains.
     */
    public function shouldCrawl(UriInterface $url): bool
    {
        return $this->isSubdomainOfHost($url);
    }

    /**
     * Check whether the URL's host is the base host or a true subdomain of it.
     */
    public function isSubdomainOfHost(UriInterface $url): bool
    {
        $host = $url->getHost();
        $baseHost = $this->baseUrl->getHost();

        // The suffix match must be anchored on a dot boundary: a bare
        // str_ends_with($host, $baseHost) would wrongly accept unrelated
        // hosts such as "notexample.com" for base host "example.com".
        return $host === $baseHost
            || str_ends_with($host, '.' . $baseHost);
    }
}

View File

@@ -0,0 +1,102 @@
<?php
namespace Spatie\Crawler\CrawlQueues;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Exceptions\InvalidUrl;
use Spatie\Crawler\Exceptions\UrlNotFoundByIndex;
/**
 * In-memory crawl queue backed by two maps keyed by the URL string:
 * every known URL, and the subset that is still pending. A URL is
 * "processed" once it has been removed from the pending map.
 */
class ArrayCrawlQueue implements CrawlQueue
{
    /**
     * All known URLs, indexed by URL string.
     *
     * @var CrawlUrl[]
     */
    protected array $urls = [];

    /**
     * Pending URLs, indexed by URL string.
     *
     * @var CrawlUrl[]
     */
    protected array $pendingUrls = [];

    /**
     * Add a URL to the queue. Duplicates (same URL string) are ignored.
     * The URL string doubles as the CrawlUrl's id, which is later used by
     * getUrlById() to map pool responses back to their CrawlUrl.
     */
    public function add(CrawlUrl $crawlUrl): CrawlQueue
    {
        $urlString = (string) $crawlUrl->url;

        if (! isset($this->urls[$urlString])) {
            $crawlUrl->setId($urlString);

            $this->urls[$urlString] = $crawlUrl;
            $this->pendingUrls[$urlString] = $crawlUrl;
        }

        return $this;
    }

    /**
     * Whether any URLs are still waiting to be crawled.
     */
    public function hasPendingUrls(): bool
    {
        return (bool) $this->pendingUrls;
    }

    /**
     * Look up a known URL by its id (the URL string, see add()).
     *
     * @throws UrlNotFoundByIndex When the id is unknown.
     */
    public function getUrlById($id): CrawlUrl
    {
        if (! isset($this->urls[$id])) {
            throw new UrlNotFoundByIndex("Crawl url {$id} not found in collection.");
        }

        return $this->urls[$id];
    }

    /**
     * A URL counts as processed when it is known but no longer pending.
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $crawlUrl): bool
    {
        $urlString = (string) $crawlUrl->url;

        if (isset($this->pendingUrls[$urlString])) {
            return false;
        }

        if (isset($this->urls[$urlString])) {
            return true;
        }

        return false;
    }

    /**
     * Remove the URL from the pending set; it stays in the known set.
     */
    public function markAsProcessed(CrawlUrl $crawlUrl): void
    {
        $urlString = (string) $crawlUrl->url;

        unset($this->pendingUrls[$urlString]);
    }

    /**
     * Number of URLs that have been processed so far.
     */
    public function getProcessedUrlCount(): int
    {
        return count($this->urls) - count($this->pendingUrls);
    }

    /**
     * Whether the given URL is already known to the queue.
     */
    public function has(CrawlUrl|UriInterface $crawlUrl): bool
    {
        // The union parameter type guarantees exactly one of these branches
        // matches, so no unreachable error fallback is needed.
        $urlString = $crawlUrl instanceof CrawlUrl
            ? (string) $crawlUrl->url
            : (string) $crawlUrl;

        return isset($this->urls[$urlString]);
    }

    /**
     * Return the next pending URL, or null when the queue is drained.
     */
    public function getPendingUrl(): ?CrawlUrl
    {
        foreach ($this->pendingUrls as $pendingUrl) {
            return $pendingUrl;
        }

        return null;
    }
}

View File

@@ -0,0 +1,25 @@
<?php
namespace Spatie\Crawler\CrawlQueues;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlUrl;
/**
 * Contract for crawl queues: storage for discovered URLs, tracking which are
 * pending versus already processed. See ArrayCrawlQueue for the default
 * in-memory implementation.
 */
interface CrawlQueue
{
    /** Add a URL to the queue; implementations ignore duplicates. */
    public function add(CrawlUrl $url): self;

    /** Whether the given URL is already known to the queue. */
    public function has(CrawlUrl|UriInterface $crawlUrl): bool;

    /** Whether any URLs are still waiting to be crawled. */
    public function hasPendingUrls(): bool;

    /** Look up a known URL by the id assigned to its CrawlUrl. */
    public function getUrlById($id): CrawlUrl;

    /** Next pending URL, or null when the queue is drained. */
    public function getPendingUrl(): ?CrawlUrl;

    /** Whether the URL is known and no longer pending. */
    public function hasAlreadyBeenProcessed(CrawlUrl $url): bool;

    /** Remove the URL from the pending set. */
    public function markAsProcessed(CrawlUrl $crawlUrl): void;

    /** Number of URLs processed so far. */
    public function getProcessedUrlCount(): int;
}

50
vendor/spatie/crawler/src/CrawlUrl.php vendored Executable file
View File

@@ -0,0 +1,50 @@
<?php
namespace Spatie\Crawler;
use Psr\Http\Message\UriInterface;
/**
 * Value object for a URL in the crawl queue: the URL itself, the page it was
 * found on, the anchor text of the link, and a queue-assigned id.
 *
 * Instances are built through the static create() factory; the constructor
 * is protected.
 */
class CrawlUrl
{
    /** The URL to crawl. */
    public UriInterface $url;

    /** Page on which this URL was discovered, if any. */
    public ?UriInterface $foundOnUrl = null;

    /** Anchor text of the link that referenced this URL, if any. */
    public ?string $linkText = null;

    /** Identifier assigned by the crawl queue (see setId()). */
    protected mixed $id;

    /**
     * Named factory; assigns the id only when one was supplied.
     */
    public static function create(
        UriInterface $url,
        ?UriInterface $foundOnUrl = null,
        $id = null,
        ?string $linkText = null,
    ): static {
        $crawlUrl = new static($url, $foundOnUrl, linkText: $linkText);

        if ($id !== null) {
            $crawlUrl->setId($id);
        }

        return $crawlUrl;
    }

    protected function __construct(UriInterface $url, $foundOnUrl = null, $linkText = null)
    {
        $this->url = $url;
        $this->foundOnUrl = $foundOnUrl;
        $this->linkText = $linkText;
    }

    /**
     * Identifier assigned by the queue; unset until setId() is called.
     */
    public function getId(): mixed
    {
        return $this->id;
    }

    public function setId($id): void
    {
        $this->id = $id;
    }
}

644
vendor/spatie/crawler/src/Crawler.php vendored Executable file
View File

@@ -0,0 +1,644 @@
<?php
namespace Spatie\Crawler;
use Generator;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Crawler\CrawlObservers\CrawlObserver;
use Spatie\Crawler\CrawlObservers\CrawlObserverCollection;
use Spatie\Crawler\CrawlProfiles\CrawlAllUrls;
use Spatie\Crawler\CrawlProfiles\CrawlProfile;
use Spatie\Crawler\CrawlQueues\ArrayCrawlQueue;
use Spatie\Crawler\CrawlQueues\CrawlQueue;
use Spatie\Crawler\Exceptions\InvalidCrawlRequestHandler;
use Spatie\Crawler\Handlers\CrawlRequestFailed;
use Spatie\Crawler\Handlers\CrawlRequestFulfilled;
use Spatie\Crawler\UrlParsers\LinkUrlParser;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;
/**
 * Coordinates a concurrent crawl.
 *
 * Holds the crawl queue, crawl profile, observers, limits and the Guzzle
 * client, and streams GET requests through a Guzzle Pool until the queue is
 * drained or a crawl/time limit is reached. Configuration is fluent: all
 * setters return $this.
 */
class Crawler
{
    public const DEFAULT_USER_AGENT = '*';

    protected UriInterface $baseUrl;

    protected CrawlObserverCollection $crawlObservers;

    protected CrawlProfile $crawlProfile;

    protected CrawlQueue $crawlQueue;

    // "total" counters span every startCrawling() run on this instance;
    // "current" counters apply to the run in progress only.
    protected int $totalUrlCount = 0;

    protected int $currentUrlCount = 0;

    protected ?int $totalCrawlLimit = null;

    protected ?int $currentCrawlLimit = null;

    // Unix timestamp of the current run's start; null while not crawling.
    protected ?int $startedAt = null;

    // Accumulated execution time (seconds) of completed runs.
    protected int $executionTime = 0;

    protected ?int $totalExecutionTimeLimit = null;

    protected ?int $currentExecutionTimeLimit = null;

    // Cap on how many response-body bytes are read (default 2 MiB).
    protected int $maximumResponseSize = 1024 * 1024 * 2;

    protected ?int $maximumDepth = null;

    protected bool $respectRobots = true;

    protected bool $rejectNofollowLinks = true;

    // Link-depth tree rooted at the base URL; only consulted when a
    // maximum depth is configured (see addToDepthTree()).
    protected Node $depthTree;

    protected bool $executeJavaScript = false;

    protected ?Browsershot $browsershot = null;

    // Parsed robots.txt of the base URL; null when robots are ignored.
    protected ?RobotsTxt $robotsTxt = null;

    protected string $crawlRequestFulfilledClass;

    protected string $crawlRequestFailedClass;

    protected string $urlParserClass;

    // Delay between requests, stored in MICROseconds
    // (setDelayBetweenRequests() converts from milliseconds).
    protected int $delayBetweenRequests = 0;

    protected array $allowedMimeTypes = [];

    protected string $defaultScheme = 'http';

    protected static array $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
        RequestOptions::HEADERS => [
            'User-Agent' => self::DEFAULT_USER_AGENT,
        ],
    ];

    /**
     * Build a crawler with a fresh Guzzle client.
     *
     * NOTE: passing any options replaces the defaults entirely; the two
     * arrays are not merged.
     */
    public static function create(array $clientOptions = []): static
    {
        $clientOptions = (count($clientOptions))
            ? $clientOptions
            : static::$defaultClientOptions;

        $client = new Client($clientOptions);

        return new static($client);
    }

    /**
     * Wires up the default profile (crawl everything), queue, observer
     * collection, request handlers and URL parser.
     */
    public function __construct(
        protected Client $client,
        protected int $concurrency = 10,
    ) {
        $this->crawlProfile = new CrawlAllUrls;
        $this->crawlQueue = new ArrayCrawlQueue;
        $this->crawlObservers = new CrawlObserverCollection;
        $this->crawlRequestFulfilledClass = CrawlRequestFulfilled::class;
        $this->crawlRequestFailedClass = CrawlRequestFailed::class;
        $this->urlParserClass = LinkUrlParser::class;
    }

    public function getDefaultScheme(): string
    {
        return $this->defaultScheme;
    }

    /** Scheme applied to base URLs that carry none (see startCrawling()). */
    public function setDefaultScheme(string $defaultScheme): self
    {
        $this->defaultScheme = $defaultScheme;

        return $this;
    }

    public function setConcurrency(int $concurrency): self
    {
        $this->concurrency = $concurrency;

        return $this;
    }

    public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self
    {
        $this->maximumResponseSize = $maximumResponseSizeInBytes;

        return $this;
    }

    // Declared nullable although the property is a non-nullable int with a
    // default; kept as-is for interface stability.
    public function getMaximumResponseSize(): ?int
    {
        return $this->maximumResponseSize;
    }

    public function setTotalCrawlLimit(int $totalCrawlLimit): self
    {
        $this->totalCrawlLimit = $totalCrawlLimit;

        return $this;
    }

    public function getTotalCrawlLimit(): ?int
    {
        return $this->totalCrawlLimit;
    }

    public function getTotalCrawlCount(): int
    {
        return $this->totalUrlCount;
    }

    public function setCurrentCrawlLimit(int $currentCrawlLimit): self
    {
        $this->currentCrawlLimit = $currentCrawlLimit;

        return $this;
    }

    public function getCurrentCrawlLimit(): ?int
    {
        return $this->currentCrawlLimit;
    }

    public function getCurrentCrawlCount(): int
    {
        return $this->currentUrlCount;
    }

    public function setTotalExecutionTimeLimit(int $totalExecutionTimeLimitInSecond): self
    {
        $this->totalExecutionTimeLimit = $totalExecutionTimeLimitInSecond;

        return $this;
    }

    public function getTotalExecutionTimeLimit(): ?int
    {
        return $this->totalExecutionTimeLimit;
    }

    /** Completed runs' time plus the elapsed time of the current run. */
    public function getTotalExecutionTime(): int
    {
        return $this->executionTime + $this->getCurrentExecutionTime();
    }

    public function setCurrentExecutionTimeLimit(int $currentExecutionTimeLimitInSecond): self
    {
        $this->currentExecutionTimeLimit = $currentExecutionTimeLimitInSecond;

        return $this;
    }

    public function getCurrentExecutionTimeLimit(): ?int
    {
        return $this->currentExecutionTimeLimit;
    }

    /** Seconds elapsed in the current run; 0 when not crawling. */
    public function getCurrentExecutionTime(): int
    {
        if (is_null($this->startedAt)) {
            return 0;
        }

        return time() - $this->startedAt;
    }

    public function setMaximumDepth(int $maximumDepth): self
    {
        $this->maximumDepth = $maximumDepth;

        return $this;
    }

    public function getMaximumDepth(): ?int
    {
        return $this->maximumDepth;
    }

    /**
     * Accepts milliseconds; stored internally as microseconds so it can be
     * fed directly to usleep() by the request handlers.
     */
    public function setDelayBetweenRequests(int $delayInMilliseconds): self
    {
        $this->delayBetweenRequests = ($delayInMilliseconds * 1000);

        return $this;
    }

    /** @return int Delay in microseconds. */
    public function getDelayBetweenRequests(): int
    {
        return $this->delayBetweenRequests;
    }

    public function setParseableMimeTypes(array $types): self
    {
        $this->allowedMimeTypes = $types;

        return $this;
    }

    public function getParseableMimeTypes(): array
    {
        return $this->allowedMimeTypes;
    }

    public function ignoreRobots(): self
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respectRobots(): self
    {
        $this->respectRobots = true;

        return $this;
    }

    public function mustRespectRobots(): bool
    {
        return $this->respectRobots;
    }

    public function acceptNofollowLinks(): self
    {
        $this->rejectNofollowLinks = false;

        return $this;
    }

    public function rejectNofollowLinks(): self
    {
        $this->rejectNofollowLinks = true;

        return $this;
    }

    public function mustRejectNofollowLinks(): bool
    {
        return $this->rejectNofollowLinks;
    }

    public function getRobotsTxt(): ?RobotsTxt
    {
        return $this->robotsTxt;
    }

    public function setCrawlQueue(CrawlQueue $crawlQueue): self
    {
        $this->crawlQueue = $crawlQueue;

        return $this;
    }

    public function getCrawlQueue(): CrawlQueue
    {
        return $this->crawlQueue;
    }

    public function executeJavaScript(): self
    {
        $this->executeJavaScript = true;

        return $this;
    }

    public function doNotExecuteJavaScript(): self
    {
        $this->executeJavaScript = false;

        return $this;
    }

    // NOTE(review): lowercase "script" here, while callers use
    // mayExecuteJavaScript(); PHP method names are case-insensitive, so
    // both resolve to this method.
    public function mayExecuteJavascript(): bool
    {
        return $this->executeJavaScript;
    }

    /**
     * Accepts a single observer or an array of observers; replaces any
     * previously registered observers.
     */
    public function setCrawlObserver(CrawlObserver|array $crawlObservers): self
    {
        if (! is_array($crawlObservers)) {
            $crawlObservers = [$crawlObservers];
        }

        return $this->setCrawlObservers($crawlObservers);
    }

    public function setCrawlObservers(array $crawlObservers): self
    {
        $this->crawlObservers = new CrawlObserverCollection($crawlObservers);

        return $this;
    }

    public function addCrawlObserver(CrawlObserver $crawlObserver): self
    {
        $this->crawlObservers->addObserver($crawlObserver);

        return $this;
    }

    public function getCrawlObservers(): CrawlObserverCollection
    {
        return $this->crawlObservers;
    }

    public function setCrawlProfile(CrawlProfile $crawlProfile): self
    {
        $this->crawlProfile = $crawlProfile;

        return $this;
    }

    public function getCrawlProfile(): CrawlProfile
    {
        return $this->crawlProfile;
    }

    /**
     * @throws InvalidCrawlRequestHandler When the class does not extend CrawlRequestFulfilled.
     */
    public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): self
    {
        $baseClass = CrawlRequestFulfilled::class;

        if (! is_subclass_of($crawlRequestFulfilledClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFulfilledClass, $baseClass);
        }

        $this->crawlRequestFulfilledClass = $crawlRequestFulfilledClass;

        return $this;
    }

    /**
     * @throws InvalidCrawlRequestHandler When the class does not extend CrawlRequestFailed.
     */
    public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): self
    {
        $baseClass = CrawlRequestFailed::class;

        if (! is_subclass_of($crawlRequestFailedClass, $baseClass)) {
            throw InvalidCrawlRequestHandler::doesNotExtendBaseClass($crawlRequestFailedClass, $baseClass);
        }

        $this->crawlRequestFailedClass = $crawlRequestFailedClass;

        return $this;
    }

    public function setUrlParserClass(string $urlParserClass): self
    {
        $this->urlParserClass = $urlParserClass;

        return $this;
    }

    public function getUrlParserClass(): string
    {
        return $this->urlParserClass;
    }

    public function setBrowsershot(Browsershot $browsershot)
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    /**
     * Replaces the Guzzle client, since client config is immutable after
     * construction. The header lookup is case-insensitive.
     */
    public function setUserAgent(string $userAgent): self
    {
        $clientOptions = $this->client->getConfig();

        $headers = array_change_key_case($clientOptions['headers']);
        $headers['user-agent'] = $userAgent;

        $clientOptions['headers'] = $headers;

        $this->client = new Client($clientOptions);

        return $this;
    }

    /**
     * Returns the client's User-Agent header (case-insensitive lookup),
     * falling back to DEFAULT_USER_AGENT when none is configured.
     */
    public function getUserAgent(): string
    {
        $headers = $this->client->getConfig('headers');

        foreach (array_keys($headers) as $name) {
            if (strtolower($name) === 'user-agent') {
                return (string) $headers[$name];
            }
        }

        return static::DEFAULT_USER_AGENT;
    }

    /** Lazily instantiates a Browsershot when none was injected. */
    public function getBrowsershot(): Browsershot
    {
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot;
        }

        return $this->browsershot;
    }

    public function getBaseUrl(): UriInterface
    {
        return $this->baseUrl;
    }

    /**
     * Run a crawl starting from the given base URL.
     *
     * Normalizes the base URL (default scheme, "/" path), seeds the queue
     * (subject to robots.txt when respected), drains the queue, then
     * notifies observers that crawling finished.
     */
    public function startCrawling(UriInterface|string $baseUrl)
    {
        $this->startedAt = time();

        if (! $baseUrl instanceof UriInterface) {
            $baseUrl = new Uri($baseUrl);
        }

        if ($baseUrl->getScheme() === '') {
            $baseUrl = $baseUrl->withScheme($this->defaultScheme);
        }

        if ($baseUrl->getPath() === '') {
            $baseUrl = $baseUrl->withPath('/');
        }

        $this->totalUrlCount = $this->crawlQueue->getProcessedUrlCount();

        $this->baseUrl = $baseUrl;

        $crawlUrl = CrawlUrl::create($this->baseUrl);

        if ($this->respectRobots) {
            $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url);
        }

        if ($this->shouldAddToCrawlQueue($crawlUrl)) {
            $this->addToCrawlQueue($crawlUrl);
        }

        $this->depthTree = new Node((string) $this->baseUrl);

        $this->startCrawlingQueue();

        foreach ($this->crawlObservers as $crawlObserver) {
            $crawlObserver->finishedCrawling();
        }

        $this->executionTime += time() - $this->startedAt;
        $this->startedAt = null; // To reset currentExecutionTime
    }

    /**
     * Record a discovered URL in the depth tree under its parent.
     *
     * When no maximum depth is configured, the tree is not maintained and a
     * detached node is returned. Otherwise the tree is searched depth-first
     * for the parent (or the original pre-redirect URL) and the new node is
     * attached there; returns null when no matching parent exists.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, ?Node $node = null, ?UriInterface $originalUrl = null): ?Node
    {
        if (is_null($this->maximumDepth)) {
            return new Node((string) $url);
        }

        $node = $node ?? $this->depthTree;

        $returnNode = null;

        if ($node->getValue() === (string) $parentUrl || $node->getValue() === (string) $originalUrl) {
            $newNode = new Node((string) $url);

            $node->addChild($newNode);

            return $newNode;
        }

        foreach ($node->getChildren() as $currentNode) {
            $returnNode = $this->addToDepthTree($url, $parentUrl, $currentNode, $originalUrl);

            if (! is_null($returnNode)) {
                break;
            }
        }

        return $returnNode;
    }

    /**
     * Whether the seed URL may be enqueued: always when robots are ignored,
     * otherwise only when robots.txt was fetched and allows it.
     */
    protected function shouldAddToCrawlQueue($crawlUrl): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        if ($this->robotsTxt === null) {
            return false;
        }

        if ($this->robotsTxt->allows((string) $crawlUrl->url, $this->getUserAgent())) {
            return true;
        }

        return false;
    }

    /**
     * Drain the queue in batches: each Pool run consumes the request
     * generator until it stops yielding, then the loop re-checks limits and
     * pending URLs (new URLs may have been enqueued by the handlers).
     */
    protected function startCrawlingQueue(): void
    {
        while (
            $this->reachedCrawlLimits() === false &&
            $this->reachedTimeLimits() === false &&
            $this->crawlQueue->hasPendingUrls()
        ) {
            $pool = new Pool($this->client, $this->getCrawlRequests(), [
                'concurrency' => $this->concurrency,
                'options' => $this->client->getConfig(),
                'fulfilled' => new $this->crawlRequestFulfilledClass($this),
                'rejected' => new $this->crawlRequestFailedClass($this),
            ]);

            $promise = $pool->promise();

            $promise->wait();
        }
    }

    /** Fetch and parse /robots.txt on the given URL's host. */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
    {
        return RobotsTxt::create($uri->withPath('/robots.txt'));
    }

    /**
     * Generator feeding the Guzzle Pool: yields a GET request per pending
     * URL, keyed by the CrawlUrl id so handlers can look the URL back up.
     * URLs rejected by the profile or already processed are marked processed
     * and skipped; observers get willCrawl() before each yielded request.
     */
    protected function getCrawlRequests(): Generator
    {
        while (
            $this->reachedCrawlLimits() === false &&
            $this->reachedTimeLimits() === false &&
            $crawlUrl = $this->crawlQueue->getPendingUrl()
        ) {
            if (
                $this->crawlProfile->shouldCrawl($crawlUrl->url) === false ||
                $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)
            ) {
                $this->crawlQueue->markAsProcessed($crawlUrl);

                continue;
            }

            foreach ($this->crawlObservers as $crawlObserver) {
                $crawlObserver->willCrawl($crawlUrl->url, $crawlUrl->linkText);
            }

            $this->totalUrlCount++;
            $this->currentUrlCount++;
            $this->crawlQueue->markAsProcessed($crawlUrl);

            yield $crawlUrl->getId() => new Request('GET', $crawlUrl->url);
        }
    }

    /**
     * Enqueue a URL unless the profile rejects it or it is already queued.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): self
    {
        if (! $this->getCrawlProfile()->shouldCrawl($crawlUrl->url)) {
            return $this;
        }

        if ($this->getCrawlQueue()->has($crawlUrl->url)) {
            return $this;
        }

        $this->crawlQueue->add($crawlUrl);

        return $this;
    }

    /** True when either the total or the per-run crawl count limit is hit. */
    public function reachedCrawlLimits(): bool
    {
        $totalCrawlLimit = $this->getTotalCrawlLimit();
        if (! is_null($totalCrawlLimit) && $this->getTotalCrawlCount() >= $totalCrawlLimit) {
            return true;
        }

        $currentCrawlLimit = $this->getCurrentCrawlLimit();
        if (! is_null($currentCrawlLimit) && $this->getCurrentCrawlCount() >= $currentCrawlLimit) {
            return true;
        }

        return false;
    }

    /** True when either the total or the per-run time limit is hit. */
    public function reachedTimeLimits(): bool
    {
        $totalExecutionTimeLimit = $this->getTotalExecutionTimeLimit();
        if (! is_null($totalExecutionTimeLimit) && $this->getTotalExecutionTime() >= $totalExecutionTimeLimit) {
            return true;
        }

        $currentExecutionTimeLimit = $this->getCurrentExecutionTimeLimit();
        if (! is_null($currentExecutionTimeLimit) && $this->getCurrentExecutionTime() >= $currentExecutionTimeLimit) {
            return true;
        }

        return false;
    }
}

58
vendor/spatie/crawler/src/CrawlerRobots.php vendored Executable file
View File

@@ -0,0 +1,58 @@
<?php
namespace Spatie\Crawler;
use Spatie\Robots\RobotsHeaders;
use Spatie\Robots\RobotsMeta;
/**
 * Combines the robots directives from the response headers (X-Robots-Tag)
 * and the page's meta robots tag for a single crawled page.
 *
 * When robots are not respected, both checks unconditionally return true.
 */
class CrawlerRobots
{
    protected RobotsHeaders $robotsHeaders;

    protected RobotsMeta $robotsMeta;

    protected bool $mustRespectRobots;

    /**
     * @param array  $headers           Response headers of the crawled page.
     * @param string $body              Response body (scanned for meta robots tags).
     * @param bool   $mustRespectRobots Whether robots directives are honored at all.
     */
    public function __construct(array $headers, string $body, bool $mustRespectRobots)
    {
        $this->robotsHeaders = RobotsHeaders::create($headers);
        $this->robotsMeta = RobotsMeta::create($body);
        $this->mustRespectRobots = $mustRespectRobots;
    }

    /**
     * Whether the page may be handed to observers: both the headers and the
     * meta tag must allow indexing (unless robots are ignored).
     */
    public function mayIndex(): bool
    {
        if (! $this->mustRespectRobots) {
            return true;
        }

        return $this->robotsHeaders->mayIndex()
            && $this->robotsMeta->mayIndex();
    }

    /**
     * Whether links on the page may be followed: both the headers and the
     * meta tag must allow following (unless robots are ignored).
     */
    public function mayFollow(): bool
    {
        if (! $this->mustRespectRobots) {
            return true;
        }

        return $this->robotsHeaders->mayFollow()
            && $this->robotsMeta->mayFollow();
    }
}

View File

@@ -0,0 +1,13 @@
<?php
namespace Spatie\Crawler\Exceptions;
use RuntimeException;
/**
 * Thrown when a configured crawl request handler class does not extend the
 * required base class (see Crawler::setCrawlFulfilledHandlerClass() and
 * Crawler::setCrawlFailedHandlerClass()).
 */
class InvalidCrawlRequestHandler extends RuntimeException
{
    /**
     * @param string $handlerClass Fully qualified name of the rejected handler class.
     * @param string $baseClass    Fully qualified name of the required base class.
     */
    public static function doesNotExtendBaseClass(string $handlerClass, string $baseClass): static
    {
        // The original message had an unbalanced backtick after the handler
        // class name; both class names are now consistently quoted.
        return new static("`{$handlerClass}` is not a valid handler class. A valid handler class should extend `{$baseClass}`.");
    }
}

View File

@@ -0,0 +1,19 @@
<?php
namespace Spatie\Crawler\Exceptions;
use Exception;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\CrawlUrl;
/**
 * Thrown when a URL argument is neither a CrawlUrl nor a UriInterface.
 */
class InvalidUrl extends Exception
{
    /**
     * @param mixed $url The rejected value; its class name (or native type
     *                   for non-objects) is embedded in the message.
     */
    public static function unexpectedType(mixed $url): static
    {
        $crawlUrlClass = CrawlUrl::class;
        $uriInterfaceClass = UriInterface::class;
        $givenUrlClass = is_object($url) ? get_class($url) : gettype($url);

        // All three class/type names are consistently backtick-quoted (the
        // CrawlUrl name was previously unquoted) and the sentence is terminated.
        return new static("You passed an invalid url of type `{$givenUrlClass}`. This should be either a `{$crawlUrlClass}` or `{$uriInterfaceClass}`.");
    }
}

View File

@@ -0,0 +1,7 @@
<?php
namespace Spatie\Crawler\Exceptions;
use RuntimeException;
/**
 * Thrown by a CrawlQueue when no URL is known under the requested id
 * (see ArrayCrawlQueue::getUrlById()).
 */
class UrlNotFoundByIndex extends RuntimeException {}

View File

@@ -0,0 +1,31 @@
<?php
namespace Spatie\Crawler\Handlers;
use Exception;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use Spatie\Crawler\Crawler;
/**
 * Guzzle Pool "rejected" handler: translates a failed request into
 * crawlFailed() notifications for the registered observers.
 */
class CrawlRequestFailed
{
    public function __construct(protected Crawler $crawler)
    {
        //
    }

    /**
     * @param Exception $exception The failure reported by Guzzle.
     * @param mixed     $index     CrawlUrl id used to look the URL back up.
     */
    public function __invoke(Exception $exception, $index)
    {
        // Connection-level failures are normalized into a RequestException
        // so observers only ever see one exception type.
        if ($exception instanceof ConnectException) {
            $exception = new RequestException($exception->getMessage(), $exception->getRequest());
        }

        if ($exception instanceof RequestException) {
            $failedUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

            $this->crawler->getCrawlObservers()->crawlFailed($failedUrl, $exception);
        }

        // Honor the configured throttle even on failure.
        usleep($this->crawler->getDelayBetweenRequests());
    }
}

View File

@@ -0,0 +1,173 @@
<?php
namespace Spatie\Crawler\Handlers;
use Exception;
use GuzzleHttp\Exception\RequestException;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Psr7\Uri;
use GuzzleHttp\Psr7\Utils;
use GuzzleHttp\RedirectMiddleware;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\StreamInterface;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlerRobots;
use Spatie\Crawler\CrawlProfiles\CrawlSubdomains;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\ResponseWithCachedBody;
use Spatie\Crawler\UrlParsers\UrlParser;
use Symfony\Component\Process\Exception\ProcessFailedException;
/**
 * Guzzle Pool "fulfilled" handler: processes a successful response —
 * robots checks, optional JavaScript rendering, observer notification,
 * and link extraction for further crawling.
 */
class CrawlRequestFulfilled
{
    // Parser used to extract further URLs from the response body.
    protected UrlParser $urlParser;

    public function __construct(protected Crawler $crawler)
    {
        $urlParserClass = $this->crawler->getUrlParserClass();

        $this->urlParser = new $urlParserClass($this->crawler);
    }

    /**
     * @param ResponseInterface $response The fulfilled response.
     * @param mixed             $index    CrawlUrl id used to look the URL back up.
     */
    public function __invoke(ResponseInterface $response, $index)
    {
        $body = $this->getBody($response);

        // Empty body (or disallowed mime type — getBody() returns '' for
        // those): nothing to report or parse.
        if (empty($body)) {
            usleep($this->crawler->getDelayBetweenRequests());

            return;
        }

        $robots = new CrawlerRobots(
            $response->getHeaders(),
            $body,
            $this->crawler->mustRespectRobots()
        );

        $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

        if ($this->crawler->mayExecuteJavaScript()) {
            try {
                $body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);
            } catch (ProcessFailedException $exception) {
                // Browsershot failure is reported to observers as a request
                // failure for this URL, then processing stops.
                $request = new Request('GET', $crawlUrl->url);
                $exception = new RequestException($exception->getMessage(), $request);
                $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);
                $this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception);

                usleep($this->crawler->getDelayBetweenRequests());

                return;
            }

            $response = $response->withBody(Utils::streamFor($body));
        }

        // Wrap the response so observers can re-read the (already consumed)
        // body via getCachedBody().
        $responseWithCachedBody = ResponseWithCachedBody::fromGuzzlePsr7Response($response);
        $responseWithCachedBody->setCachedBody($body);

        if ($robots->mayIndex()) {
            $this->handleCrawled($responseWithCachedBody, $crawlUrl);
        }

        // Unless crawling subdomains, only follow links on pages whose host
        // matches the base URL's host.
        if (! $this->crawler->getCrawlProfile() instanceof CrawlSubdomains) {
            if ($crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()) {
                return;
            }
        }

        if (! $robots->mayFollow()) {
            return;
        }

        $baseUrl = $this->getBaseUrl($response, $crawlUrl);
        $originalUrl = $crawlUrl->url;

        $this->urlParser->addFromHtml($body, $baseUrl, $originalUrl);

        usleep($this->crawler->getDelayBetweenRequests());
    }

    /**
     * Base URL for resolving relative links: the final URL from Guzzle's
     * redirect history when the request was redirected, otherwise the
     * crawled URL itself.
     */
    protected function getBaseUrl(ResponseInterface $response, CrawlUrl $crawlUrl): UriInterface
    {
        $redirectHistory = $response->getHeader(RedirectMiddleware::HISTORY_HEADER);

        if (empty($redirectHistory)) {
            return $crawlUrl->url;
        }

        return new Uri(end($redirectHistory));
    }

    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl): void
    {
        $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
    }

    /**
     * Response body as a string, capped at the crawler's maximum response
     * size; empty string when the content type is not parseable.
     */
    protected function getBody(ResponseInterface $response): string
    {
        $contentType = $response->getHeaderLine('Content-Type');

        if (! $this->isMimetypeAllowedToParse($contentType)) {
            return '';
        }

        return $this->convertBodyToString($response->getBody(), $this->crawler->getMaximumResponseSize());
    }

    /**
     * Read the stream in chunks of up to 512 bytes until EOF, a read error,
     * or the byte cap is reached.
     */
    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
    {
        if ($bodyStream->isSeekable()) {
            $bodyStream->rewind();
        }

        $body = '';
        $chunksToRead = $readMaximumBytes < 512 ? $readMaximumBytes : 512;

        for ($bytesRead = 0; $bytesRead < $readMaximumBytes; $bytesRead += $chunksToRead) {
            try {
                $newDataRead = $bodyStream->read($chunksToRead);
            } catch (Exception $exception) {
                // A failed read terminates the loop below via the falsy check.
                $newDataRead = null;
            }

            if (! $newDataRead) {
                break;
            }

            $body .= $newDataRead;
        }

        return $body;
    }

    /** Render the page with Browsershot and return the decoded body HTML. */
    protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
    {
        $browsershot = $this->crawler->getBrowsershot();

        $html = $browsershot->setUrl((string) $url)->bodyHtml();

        return html_entity_decode($html);
    }

    /**
     * A content type is parseable when no whitelist is configured, when it
     * is empty, or when it matches one of the allowed mime types
     * (case-insensitive substring match).
     */
    protected function isMimetypeAllowedToParse($contentType): bool
    {
        if (empty($contentType)) {
            return true;
        }

        if (! count($this->crawler->getParseableMimeTypes())) {
            return true;
        }

        foreach ($this->crawler->getParseableMimeTypes() as $allowedType) {
            if (stristr($contentType, $allowedType)) {
                return true;
            }
        }

        return false;
    }
}

View File

@@ -0,0 +1,32 @@
<?php
namespace Spatie\Crawler;
use GuzzleHttp\Psr7\Response;
use Psr\Http\Message\ResponseInterface;
/**
 * PSR-7 response that also carries an already-read copy of its body, so
 * observers can access the content even after the underlying stream has
 * been consumed (see CrawlRequestFulfilled).
 */
class ResponseWithCachedBody extends Response
{
    /** The pre-read body; null until setCachedBody() is called. */
    protected ?string $cachedBody = null;

    /**
     * Copy status, headers, body stream, protocol version and reason phrase
     * from an arbitrary PSR-7 response into this subclass.
     */
    public static function fromGuzzlePsr7Response(ResponseInterface $response): static
    {
        $status = $response->getStatusCode();
        $headers = $response->getHeaders();
        $body = $response->getBody();
        $protocolVersion = $response->getProtocolVersion();
        $reasonPhrase = $response->getReasonPhrase();

        return new static($status, $headers, $body, $protocolVersion, $reasonPhrase);
    }

    /**
     * Store a pre-read copy of the body.
     */
    public function setCachedBody(?string $body = null): void
    {
        $this->cachedBody = $body;
    }

    /**
     * The pre-read body, or null when none was cached.
     */
    public function getCachedBody(): ?string
    {
        return $this->cachedBody;
    }
}

20
vendor/spatie/crawler/src/Url.php vendored Executable file
View File

@@ -0,0 +1,20 @@
<?php
namespace Spatie\Crawler;
use GuzzleHttp\Psr7\Uri;
/**
 * A Guzzle Uri that additionally remembers the anchor text of the link it
 * was extracted from (used by the URL parsers).
 */
class Url extends Uri
{
    /**
     * @param string      $link     The URL string; parsed by the parent Uri.
     * @param string|null $linkText Anchor text of the originating link, if any.
     */
    public function __construct(
        protected string $link,
        protected ?string $linkText,
    ) {
        parent::__construct($link);
    }

    /**
     * Anchor text of the link this URL was extracted from, if any.
     */
    public function linkText(): ?string
    {
        return $this->linkText;
    }
}

View File

@@ -0,0 +1,125 @@
<?php
namespace Spatie\Crawler\UrlParsers;
use Illuminate\Support\Collection;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Symfony\Component\DomCrawler\Link;
use Tree\Node\Node;
/**
 * Default URL parser: extracts anchor and rel=next/prev link elements from
 * an HTML document and feeds crawlable ones back into the crawler's queue.
 */
class LinkUrlParser implements UrlParser
{
    protected Crawler $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract links from the HTML and enqueue those that pass all filters:
     * crawlable scheme, accepted into the depth tree, within the depth and
     * robots constraints, and not a "/tel:" path.
     *
     * @param string            $html        The page body to scan.
     * @param UriInterface      $foundOnUrl  URL used to resolve relative links.
     * @param UriInterface|null $originalUrl Pre-redirect URL, used to locate
     *                                       the parent node in the depth tree.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
            ->map(fn (Url $url) => $this->normalizeUrl($url))
            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                    return false;
                }

                return $this->shouldCrawl($node);
            })
            ->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
            ->each(function (Url $url) use ($foundOnUrl) {
                $crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Collect Url objects for every usable <a> and rel=next/prev <link>
     * element; nofollow links are dropped when the crawler rejects them,
     * unparseable URIs are mapped to null and filtered out at the end.
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXpath('//a | //link[@rel="next" or @rel="prev"]')->links())
            ->reject(function (Link $link) {
                if ($this->isInvalidHrefNode($link)) {
                    return true;
                }

                if ($this->crawler->mustRejectNofollowLinks() && str_contains($link->getNode()->getAttribute('rel'), 'nofollow')) {
                    return true;
                }

                return false;
            })
            ->map(function (Link $link) {
                try {
                    // Anchor text is capped at 4000 characters.
                    $linkText = $link->getNode()->textContent;

                    if ($linkText) {
                        $linkText = substr($linkText, 0, 4000);
                    }

                    return new Url($link->getUri(), $linkText);
                } catch (InvalidArgumentException $exception) {
                    return null;
                }
            })
            ->filter();
    }

    /** Only http and https URLs are crawlable. */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    /** Strip the fragment so URLs differing only by fragment dedupe. */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    /**
     * Depth-tree node passes when robots.txt allows the URL (if respected)
     * and the node is within the configured maximum depth.
     */
    protected function shouldCrawl(Node $node): bool
    {
        $mustRespectRobots = $this->crawler->mustRespectRobots();
        $robotsTxt = $this->crawler->getRobotsTxt();

        if ($mustRespectRobots && $robotsTxt !== null) {
            $isAllowed = $robotsTxt->allows($node->getValue(), $this->crawler->getUserAgent());

            if (! $isAllowed) {
                return false;
            }
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }

    /**
     * Reject <a> elements that have no next sibling and no child nodes —
     * i.e. empty trailing anchors that carry no usable link content.
     */
    protected function isInvalidHrefNode(Link $link): bool
    {
        if ($link->getNode()->nodeName !== 'a') {
            return false;
        }

        if ($link->getNode()->nextSibling !== null) {
            return false;
        }

        if ($link->getNode()->childNodes->length !== 0) {
            return false;
        }

        return true;
    }
}

View File

@@ -0,0 +1,95 @@
<?php
namespace Spatie\Crawler\UrlParsers;
use Illuminate\Support\Collection;
use InvalidArgumentException;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\Url;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
use Tree\Node\Node;
/**
 * URL parser for XML sitemaps: extracts <loc> entries and feeds crawlable
 * ones back into the crawler's queue.
 */
class SitemapUrlParser implements UrlParser
{
    protected Crawler $crawler;

    public function __construct(Crawler $crawler)
    {
        $this->crawler = $crawler;
    }

    /**
     * Extract <loc> URLs from the sitemap body and enqueue those that pass
     * all filters: crawlable scheme, accepted into the depth tree, within
     * the depth and robots constraints, and not a "/tel:" path.
     *
     * @param string            $html        The sitemap body to scan.
     * @param UriInterface      $foundOnUrl  URL the sitemap was found on.
     * @param UriInterface|null $originalUrl Pre-redirect URL, used to locate
     *                                       the parent node in the depth tree.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void
    {
        $allLinks = $this->extractLinksFromHtml($html, $foundOnUrl);

        collect($allLinks)
            ->filter(fn (Url $url) => $this->hasCrawlableScheme($url))
            ->map(fn (Url $url) => $this->normalizeUrl($url))
            ->filter(function (Url $url) use ($foundOnUrl, $originalUrl) {
                if (! $node = $this->crawler->addToDepthTree($url, $foundOnUrl, null, $originalUrl)) {
                    return false;
                }

                return $this->shouldCrawl($node);
            })
            ->filter(fn (Url $url) => ! str_contains($url->getPath(), '/tel:'))
            ->each(function (Url $url) use ($foundOnUrl) {
                $crawlUrl = CrawlUrl::create($url, $foundOnUrl, linkText: $url->linkText());

                $this->crawler->addToCrawlQueue($crawlUrl);
            });
    }

    /**
     * Collect a Url per <loc> element; unparseable entries are mapped to
     * null and then filtered out so the Url-typed closures in addFromHtml()
     * never receive null (consistent with LinkUrlParser — previously the
     * nulls were kept and would trigger a TypeError downstream).
     */
    protected function extractLinksFromHtml(string $html, UriInterface $foundOnUrl): ?Collection
    {
        $domCrawler = new DomCrawler($html, $foundOnUrl);

        return collect($domCrawler->filterXPath('//loc')
            ->each(function (DomCrawler $node) {
                try {
                    // The <loc> text is both the URL and its "link text",
                    // capped at 4000 characters.
                    $linkText = $node->text();

                    if ($linkText) {
                        $linkText = substr($linkText, 0, 4000);
                    }

                    return new Url($linkText, $linkText);
                } catch (InvalidArgumentException $exception) {
                    return null;
                }
            }))
            ->filter();
    }

    /** Only http and https URLs are crawlable. */
    protected function hasCrawlableScheme(UriInterface $uri): bool
    {
        return in_array($uri->getScheme(), ['http', 'https']);
    }

    /** Strip the fragment so URLs differing only by fragment dedupe. */
    protected function normalizeUrl(UriInterface $url): UriInterface
    {
        return $url->withFragment('');
    }

    /**
     * Depth-tree node passes when robots.txt allows the URL (if respected)
     * and the node is within the configured maximum depth.
     */
    protected function shouldCrawl(Node $node): bool
    {
        $mustRespectRobots = $this->crawler->mustRespectRobots();
        $robotsTxt = $this->crawler->getRobotsTxt();

        if ($mustRespectRobots && $robotsTxt !== null) {
            $isAllowed = $robotsTxt->allows($node->getValue(), $this->crawler->getUserAgent());

            if (! $isAllowed) {
                return false;
            }
        }

        $maximumDepth = $this->crawler->getMaximumDepth();

        if (is_null($maximumDepth)) {
            return true;
        }

        return $node->getDepth() <= $maximumDepth;
    }
}

View File

@@ -0,0 +1,13 @@
<?php
namespace Spatie\Crawler\UrlParsers;
use Psr\Http\Message\UriInterface;
use Spatie\Crawler\Crawler;
/**
 * Contract for URL parsers: given a response body, discover further URLs
 * and feed them into the crawler's queue. Implementations are instantiated
 * by class name (see Crawler::setUrlParserClass()), hence the constructor
 * is part of the contract.
 */
interface UrlParser
{
    public function __construct(Crawler $crawler);

    /**
     * Discover URLs in the body and enqueue the crawlable ones.
     *
     * @param string            $html        The response body to scan.
     * @param UriInterface      $foundOnUrl  URL used to resolve relative links.
     * @param UriInterface|null $originalUrl Pre-redirect URL, if the request was redirected.
     */
    public function addFromHtml(string $html, UriInterface $foundOnUrl, ?UriInterface $originalUrl = null): void;
}