Split out feed parsing from document
WIP - still moving things around...
This commit is contained in:
315
src/lib/Feed.php
315
src/lib/Feed.php
@@ -12,44 +12,16 @@ use BitBadger\Documents\SQLite\Exists;
|
||||
use BitBadger\Documents\SQLite\Find;
|
||||
use BitBadger\Documents\SQLite\Patch;
|
||||
use DateTimeInterface;
|
||||
use DOMDocument;
|
||||
use DOMElement;
|
||||
use DOMException;
|
||||
use DOMNode;
|
||||
use FeedReaderCentral\Domain\Feed as FeedDocument;
|
||||
use FeedReaderCentral\Domain\Item;
|
||||
use FeedReaderCentral\Domain\Table;
|
||||
use SQLite3;
|
||||
|
||||
/**
|
||||
* Feed retrieval, parsing, and manipulation
|
||||
* An RSS or Atom feed
|
||||
*/
|
||||
class Feed {
|
||||
|
||||
/** @var string The URL for the feed */
|
||||
public string $url = '';
|
||||
|
||||
/** @var string The title of the feed */
|
||||
public string $title = '';
|
||||
|
||||
/** @var ?string When the feed was last updated */
|
||||
public ?string $updatedOn = null;
|
||||
|
||||
/** @var FeedItem[] The items contained in the feed */
|
||||
public array $items = [];
|
||||
|
||||
/** @var string The XML namespace for Atom feeds */
|
||||
public const string ATOM_NS = 'http://www.w3.org/2005/Atom';
|
||||
|
||||
/** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
|
||||
public const string CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
|
||||
|
||||
/** @var string The XML namespace for XHTML */
|
||||
public const string XHTML_NS = 'http://www.w3.org/1999/xhtml';
|
||||
|
||||
/** @var string The user agent for Feed Reader Central's refresh requests */
|
||||
private const string USER_AGENT =
|
||||
'FeedReaderCentral/' . FRC_VERSION . ' +https://bitbadger.solutions/open-source/feed-reader-central';
|
||||
class Feed
|
||||
{
|
||||
// ***** CONSTANTS *****
|
||||
|
||||
/** @var int Do not purge items */
|
||||
public const int PURGE_NONE = 0;
|
||||
@@ -63,231 +35,57 @@ class Feed {
|
||||
/** @var int Purge items in number greater than the specified number of items to keep */
|
||||
public const int PURGE_BY_COUNT = 3;
|
||||
|
||||
/**
|
||||
* When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
|
||||
*
|
||||
* @param int $errno The error level encountered
|
||||
* @param string $errstr The text of the error encountered
|
||||
* @return bool False, to delegate to the next error handler in the chain
|
||||
* @throws DOMException If the error is a warning
|
||||
*/
|
||||
private static function xmlParseError(int $errno, string $errstr): bool {
    // Only warnings raised by DOMDocument::loadXML() are promoted to exceptions
    $isLoadXmlWarning = $errno === E_WARNING && str_contains($errstr, 'DOMDocument::loadXML()');
    if (!$isLoadXmlWarning) {
        // Returning false hands the error to the next handler in the chain
        return false;
    }
    throw new DOMException($errstr, $errno);
}
|
||||
// ***** PROPERTIES *****
|
||||
|
||||
/** @var int The ID of the feed */
|
||||
public int $id = 0;
|
||||
|
||||
/** @var int The ID of the user to whom this subscription belongs */
|
||||
public int $user_id = 0;
|
||||
|
||||
/** @var string The URL of the feed */
|
||||
public string $url = '';
|
||||
|
||||
/** @var string|null The title of this feed */
|
||||
public ?string $title = null;
|
||||
|
||||
/** @var string|null The date/time items in this feed were last updated */
|
||||
public ?string $updated_on = null;
|
||||
|
||||
/** @var string|null The date/time this feed was last checked */
|
||||
public ?string $checked_on = null;
|
||||
|
||||
// ***** STATIC FUNCTIONS *****
|
||||
|
||||
/**
|
||||
* Parse a feed into an XML tree
|
||||
* Create a document from the parsed feed
|
||||
*
|
||||
* @param string $content The feed's RSS content
|
||||
* @return array|DOMDocument[]|string[] ['ok' => feed] if successful, ['error' => message] if not
|
||||
* @param ParsedFeed $parsed The parsed feed
|
||||
* @return static The document constructed from the parsed feed
|
||||
*/
|
||||
public static function parseFeed(string $content): array {
    // DOMDocument::loadXML() rejects an empty source outright (a ValueError on PHP 8), which the DOMException
    // handling below would not catch; report it through the normal error result instead
    if ($content === '') {
        return ['error' => 'Feed content is empty'];
    }

    // XML parse problems surface as warnings; xmlParseError() turns them into DOMExceptions while this runs
    set_error_handler(self::xmlParseError(...));
    try {
        $feed = new DOMDocument();
        // loadXML() returns false on failure; do not report an unparsed document as a success
        if (!$feed->loadXML($content)) {
            return ['error' => 'Feed content could not be parsed as XML'];
        }
        return ['ok' => $feed];
    } catch (DOMException $ex) {
        return ['error' => $ex->getMessage()];
    } finally {
        // Reinstate the previous handler whether or not parsing succeeded
        restore_error_handler();
    }
}
|
||||
public static function fromParsed(ParsedFeed $parsed): static
|
||||
{
|
||||
$it = new static();
|
||||
$it->user_id = $_SESSION[Key::USER_ID];
|
||||
$it->url = $parsed->url;
|
||||
$it->title = $parsed->title;
|
||||
$it->updated_on = $parsed->updatedOn;
|
||||
$it->checked_on = Data::formatDate('now');
|
||||
|
||||
/**
|
||||
* Get the value of a child element by its tag name for an RSS feed
|
||||
*
|
||||
* @param DOMNode $element The parent element
|
||||
* @param string $tagName The name of the tag whose value should be obtained
|
||||
* @return string The value of the element (or "[element] not found" if that element does not exist)
|
||||
*/
|
||||
public static function rssValue(DOMNode $element, string $tagName): string {
    // Descendants are searched by tag name; only the first match is used
    $matches = $element->getElementsByTagName($tagName);
    if ($matches->length > 0) {
        return $matches->item(0)->textContent;
    }
    // Callers compare against this sentinel text to detect a missing element
    return "$tagName not found";
}
|
||||
|
||||
/**
|
||||
* Extract items from an RSS feed
|
||||
*
|
||||
* @param DOMDocument $xml The XML received from the feed
|
||||
* @param string $url The actual URL for the feed
|
||||
* @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
|
||||
*/
|
||||
private static function fromRSS(DOMDocument $xml, string $url): array {
    $channel = $xml->getElementsByTagName('channel')->item(0);
    if (!$channel instanceof DOMElement) {
        // -1 is reported when there was no channel node at all
        $type = $channel?->nodeType ?? -1;
        return ['error' => "Channel element not found ($type)"];
    }

    // Prefer the channel's lastBuildDate; fall back to pubDate, and to null when neither is present
    $updatedOn = self::rssValue($channel, 'lastBuildDate');
    if ($updatedOn === 'lastBuildDate not found') {
        $updatedOn = self::rssValue($channel, 'pubDate');
        if ($updatedOn === 'pubDate not found') {
            $updatedOn = null;
        }
    }

    $feed = new static();
    $feed->title = self::rssValue($channel, 'title');
    $feed->url = $url;
    $feed->updatedOn = Data::formatDate($updatedOn);

    // Each <item> under the channel becomes one feed item, in document order
    foreach ($channel->getElementsByTagName('item') as $item) {
        $feed->items[] = FeedItem::fromRSS($item);
    }

    return ['ok' => $feed];
}
|
||||
|
||||
/**
|
||||
* Get an attribute value from a DOM node
|
||||
*
|
||||
* @param DOMNode $node The node with an attribute value to obtain
|
||||
* @param string $name The name of the attribute whose value should be obtained
|
||||
* @return string The attribute value if it exists, an empty string if not
|
||||
*/
|
||||
private static function attrValue(DOMNode $node, string $name): string {
    // Nodes without any attributes cannot have the requested one
    if (!$node->hasAttributes()) {
        return '';
    }
    // A missing attribute yields null here, which collapses to the empty string
    $attr = $node->attributes->getNamedItem($name);
    return $attr?->value ?? '';
}
|
||||
/**
|
||||
* Get the value of a child element by its tag name for an Atom feed
|
||||
*
|
||||
* (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
|
||||
* regular string values; for "xhtml", though, we will need to get the `<div>` and extract its contents instead.)
|
||||
*
|
||||
* @param DOMNode $element The parent element
|
||||
* @param string $tagName The name of the tag whose value should be obtained
|
||||
* @return string The value of the element (or "[element] not found" if that element does not exist)
|
||||
*/
|
||||
public static function atomValue(DOMNode $element, string $tagName): string {
    $matches = $element->getElementsByTagName($tagName);
    if ($matches->length == 0) {
        // Callers compare against this sentinel text to detect a missing element
        return "$tagName not found";
    }

    $tag = $matches->item(0);
    // Only elements explicitly typed as "xhtml" need their <div> wrapper unwrapped; everything else is plain text
    $isXhtml = $tag instanceof DOMElement && self::attrValue($tag, 'type') == 'xhtml';
    if (!$isXhtml) {
        return $tag->textContent;
    }

    $divs = $tag->getElementsByTagNameNS(Feed::XHTML_NS, 'div');
    return $divs->length == 0 ? "-- invalid XHTML content --" : $divs->item(0)->textContent;
}
|
||||
|
||||
/**
|
||||
* Extract items from an Atom feed
|
||||
*
|
||||
* @param DOMDocument $xml The XML received from the feed
|
||||
* @param string $url The actual URL for the feed
|
||||
* @return array|Feed[] ['ok' => feed]
|
||||
*/
|
||||
private static function fromAtom(DOMDocument $xml, string $url): array {
    $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);
    if (!$root instanceof DOMElement) {
        // Mirrors the missing-channel handling in fromRSS rather than failing on a null root
        $type = $root?->nodeType ?? -1;
        return ['error' => "Feed element not found ($type)"];
    }

    // atomValue() reports a missing element as "<tag> not found", so the sentinel must name the tag that was
    // requested ("updated"); comparing against the RSS "pubDate" text never matched and let the sentinel
    // through to the date formatter
    $updatedOn = self::atomValue($root, 'updated');
    if ($updatedOn === 'updated not found') {
        $updatedOn = null;
    }

    $feed = new Feed();
    $feed->title = self::atomValue($root, 'title');
    $feed->url = $url;
    $feed->updatedOn = Data::formatDate($updatedOn);

    // Each <entry> under the feed root becomes one feed item, in document order
    foreach ($root->getElementsByTagName('entry') as $entry) {
        $feed->items[] = FeedItem::fromAtom($entry);
    }

    return ['ok' => $feed];
}
|
||||
|
||||
/**
|
||||
* Retrieve a document (http/https)
|
||||
*
|
||||
* @param string $url The URL of the document to retrieve
|
||||
* @return array ['content' => document content, 'error' => error message, 'code' => HTTP response code,
|
||||
* 'url' => effective URL]
|
||||
*/
|
||||
private static function retrieveDocument(string $url): array {
    $request = curl_init($url);

    // Follow redirects, return the body instead of printing it, and bound connect / total time (seconds)
    $options = [
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_CONNECTTIMEOUT => 5,
        CURLOPT_TIMEOUT        => 15,
        CURLOPT_USERAGENT      => self::USER_AGENT,
    ];
    foreach ($options as $option => $value) {
        curl_setopt($request, $option, $value);
    }

    // The transfer has to run before the error text, status code, and effective URL can be read
    $content = curl_exec($request);
    $error = curl_error($request);
    $code = curl_getinfo($request, CURLINFO_RESPONSE_CODE);
    $effectiveUrl = curl_getinfo($request, CURLINFO_EFFECTIVE_URL);
    curl_close($request);

    return ['content' => $content, 'error' => $error, 'code' => $code, 'url' => $effectiveUrl];
}
|
||||
|
||||
/**
|
||||
* Derive a feed URL from an HTML document
|
||||
*
|
||||
* @param string $content The HTML document content from which to derive a feed URL
|
||||
* @return array|string[] ['ok' => feed URL] if successful, ['error' => message] if not
|
||||
*/
|
||||
private static function deriveFeedFromHTML(string $content): array {
    // Only the <head> is needed. The lookup returns false when there is no closing tag; adding 7 to that result
    // used to truncate the document to its first 7 bytes, so fall back to parsing the whole document instead.
    // The search is case-insensitive because HTML tag names are.
    $headEnd = stripos($content, '</head>');
    $markup = $headEnd === false ? $content : substr($content, 0, $headEnd + strlen('</head>'));

    $html = new DOMDocument();
    $html->loadHTML($markup);
    $headTags = $html->getElementsByTagName('head');
    if ($headTags->length < 1) return ['error' => 'Cannot find feed at this URL'];

    // The first <link rel="alternate"> with an RSS or Atom type and a usable href wins
    foreach ($headTags->item(0)->getElementsByTagName('link') as $link) {
        if (self::attrValue($link, 'rel') != 'alternate') continue;
        $type = self::attrValue($link, 'type');
        if ($type != 'application/rss+xml' && $type != 'application/atom+xml') continue;
        $href = self::attrValue($link, 'href');
        // A link without an href cannot identify a feed; keep looking
        if ($href === '') continue;
        return ['ok' => $href];
    }
    return ['error' => 'Cannot find feed at this URL'];
}
|
||||
|
||||
/**
|
||||
* Retrieve the feed
|
||||
*
|
||||
* @param string $url The URL of the feed to retrieve
|
||||
* @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function retrieveFeed(string $url): array {
    $doc = self::retrieveDocument($url);

    if ($doc['error'] != '') return ['error' => $doc['error']];
    if ($doc['code'] != 200) {
        return ['error' => "Prospective feed URL $url returned HTTP Code {$doc['code']}: {$doc['content']}"];
    }

    // An HTML page is not a feed itself, but may advertise one in its <head>
    $start = strtolower(substr($doc['content'], 0, 9));
    if ($start == '<!doctype' || str_starts_with($start, '<html')) {
        $derivedURL = self::deriveFeedFromHTML($doc['content']);
        if (key_exists('error', $derivedURL)) return ['error' => $derivedURL['error']];
        $feedURL = $derivedURL['ok'];
        if (!str_starts_with($feedURL, 'http')) {
            // Relative URL; feed should be retrieved in the context of the original URL
            $original = parse_url($url);
            $port = key_exists('port', $original) ? ":{$original['port']}" : '';
            $origin = $original['scheme'] . '://' . $original['host'] . $port;
            if (str_starts_with($feedURL, '//')) {
                // Scheme-relative: the link names its own host, so only the scheme is borrowed
                $feedURL = $original['scheme'] . ':' . $feedURL;
            } elseif (str_starts_with($feedURL, '/')) {
                // Host-relative: the link is an absolute path on the original host
                $feedURL = $origin . $feedURL;
            } else {
                // Path-relative: resolve against the directory portion of the original URL's path
                $path = $original['path'] ?? '/';
                $slash = strrpos($path, '/');
                $dir = $slash === false ? '/' : substr($path, 0, $slash + 1);
                $feedURL = $origin . $dir . $feedURL;
            }
        }

        // The derived URL is a second, independent request; its failures must be reported as well
        $doc = self::retrieveDocument($feedURL);
        if ($doc['error'] != '') return ['error' => $doc['error']];
        if ($doc['code'] != 200) {
            return [
                'error' => "Prospective feed URL $feedURL returned HTTP Code {$doc['code']}: {$doc['content']}"
            ];
        }
    }

    $parsed = self::parseFeed($doc['content']);
    if (key_exists('error', $parsed)) return ['error' => $parsed['error']];

    // An Atom-namespaced <feed> element selects the Atom extractor; anything else is treated as RSS
    $extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
        ? self::fromAtom(...) : self::fromRSS(...);
    return $extract($parsed['ok'], $doc['url']);
}
|
||||
|
||||
/**
|
||||
* Update a feed's items
|
||||
*
|
||||
* @param int $feedId The ID of the feed to which these items belong
|
||||
* @param Feed $feed The extracted Atom or RSS feed items
|
||||
* @param ParsedFeed $parsed The extracted Atom or RSS feed items
|
||||
* @param DateTimeInterface $lastChecked When this feed was last checked (only new items will be added)
|
||||
* @return array ['ok' => true] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function updateItems(int $feedId, Feed $feed, DateTimeInterface $lastChecked, SQLite3 $db): array {
|
||||
public static function updateItems(int $feedId, ParsedFeed $parsed, DateTimeInterface $lastChecked,
|
||||
SQLite3 $db): array
|
||||
{
|
||||
$results =
|
||||
array_map(function ($item) use ($db, $feedId) {
|
||||
try {
|
||||
@@ -305,7 +103,7 @@ class Feed {
|
||||
} catch (DocumentException $ex) {
|
||||
return ['error' => "$ex"];
|
||||
}
|
||||
}, array_filter($feed->items,
|
||||
}, array_filter($parsed->items,
|
||||
fn($it) => date_create_immutable($it->updatedOn ?? $it->publishedOn) >= $lastChecked));
|
||||
$errors = array_map(fn($it) => $it['error'], array_filter($results, fn($it) => array_key_exists('error', $it)));
|
||||
return sizeof($errors) > 0 ? ['error' => implode("\n", $errors)] : ['ok' => true];
|
||||
@@ -318,7 +116,8 @@ class Feed {
|
||||
* @param SQLite3 $db The database connection on which items should be purged
|
||||
* @return array|string[]|true[] ['ok' => true] if purging was successful, ['error' => message] if not
|
||||
*/
|
||||
private static function purgeItems(int $feedId, SQLite3 $db): array {
|
||||
private static function purgeItems(int $feedId, SQLite3 $db): array
|
||||
{
|
||||
if (!array_search(PURGE_TYPE, [self::PURGE_READ, self::PURGE_BY_DAYS, self::PURGE_BY_COUNT])) {
|
||||
return ['error' => 'Unrecognized purge type ' . PURGE_TYPE];
|
||||
}
|
||||
@@ -362,12 +161,12 @@ class Feed {
|
||||
* @return array|string[]|true[] ['ok' => true] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function refreshFeed(int $feedId, string $url, SQLite3 $db): array {
|
||||
$feedRetrieval = self::retrieveFeed($url);
|
||||
$feedRetrieval = ParsedFeed::retrieve($url);
|
||||
if (key_exists('error', $feedRetrieval)) return $feedRetrieval;
|
||||
$feed = $feedRetrieval['ok'];
|
||||
|
||||
try {
|
||||
$feedDoc = Find::byId(Table::FEED, $feedId, FeedDocument::class);
|
||||
$feedDoc = Find::byId(Table::FEED, $feedId, self::class);
|
||||
if (!$feedDoc) return ['error' => 'Could not derive date last checked for feed'];
|
||||
$lastChecked = date_create_immutable($feedDoc->checked_on ?? WWW_EPOCH);
|
||||
|
||||
@@ -395,7 +194,7 @@ class Feed {
|
||||
* @return array ['ok' => feedId] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function add(string $url, SQLite3 $db): array {
|
||||
$feedExtract = self::retrieveFeed($url);
|
||||
$feedExtract = ParsedFeed::retrieve($url);
|
||||
if (key_exists('error', $feedExtract)) return $feedExtract;
|
||||
|
||||
$feed = $feedExtract['ok'];
|
||||
@@ -406,9 +205,9 @@ class Feed {
|
||||
return ['error' => "Already subscribed to feed $feed->url"];
|
||||
}
|
||||
|
||||
Document::insert(Table::FEED, FeedDocument::fromParsed($feed), $db);
|
||||
Document::insert(Table::FEED, self::fromParsed($feed), $db);
|
||||
|
||||
$doc = Find::firstByFields(Table::FEED, $fields, FeedDocument::class);
|
||||
$doc = Find::firstByFields(Table::FEED, $fields, self::class);
|
||||
if (!$doc) return ['error' => 'Could not retrieve inserted feed'];
|
||||
} catch (DocumentException $ex) {
|
||||
return ['error' => "$ex"];
|
||||
@@ -423,12 +222,12 @@ class Feed {
|
||||
/**
|
||||
* Update an RSS feed
|
||||
*
|
||||
* @param FeedDocument $existing The existing RSS feed
|
||||
* @param Feed $existing The existing feed
|
||||
* @param string $url The URL with which the existing feed should be modified
|
||||
* @param SQLite3 $db The database connection on which to execute the update
|
||||
* @return bool[]|string[] [ 'ok' => true ] if successful, [ 'error' => message ] if not
|
||||
*/
|
||||
public static function update(FeedDocument $existing, string $url, SQLite3 $db): array {
|
||||
public static function update(Feed $existing, string $url, SQLite3 $db): array {
|
||||
try {
|
||||
Patch::byFields(Table::FEED,
|
||||
[Field::EQ(Configuration::idField(), $existing->id), Field::EQ('user_id', $_SESSION[Key::USER_ID])],
|
||||
@@ -444,14 +243,14 @@ class Feed {
|
||||
* Retrieve all feeds, optionally for a specific user
|
||||
*
|
||||
* @param int $user The ID of the user whose feeds should be retrieved (optional, defaults to all feeds)
|
||||
* @return DocumentList<FeedDocument> A list of feeds
|
||||
* @return DocumentList<Feed> A list of feeds
|
||||
* @throws DocumentException If any is encountered
|
||||
*/
|
||||
public static function retrieveAll(int $user = 0): DocumentList
|
||||
{
|
||||
return $user == 0
|
||||
? Find::all(Table::FEED, FeedDocument::class)
|
||||
: Find::byFields(Table::FEED, [Field::EQ('user_id', $user)], FeedDocument::class);
|
||||
? Find::all(Table::FEED, self::class)
|
||||
: Find::byFields(Table::FEED, [Field::EQ('user_id', $user)], self::class);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -482,11 +281,11 @@ class Feed {
|
||||
* Retrieve a feed by its ID for the current user
|
||||
*
|
||||
* @param int $feedId The ID of the feed to retrieve
|
||||
* @return FeedDocument|false The data for the feed if found, false if not found
|
||||
* @return static|false The data for the feed if found, false if not found
|
||||
* @throws DocumentException If any is encountered
|
||||
*/
|
||||
public static function retrieveById(int $feedId): FeedDocument|false {
|
||||
$doc = Find::byId(Table::FEED, $feedId, FeedDocument::class);
|
||||
public static function retrieveById(int $feedId): static|false {
|
||||
$doc = Find::byId(Table::FEED, $feedId, self::class);
|
||||
return $doc && $doc->user_id == $_SESSION[Key::USER_ID] ? $doc : false;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user