Change from SimpleXML to DOM (#4)

This API is more reliable, and should help when implementing the "load a site's HTML and look for feed links" functionality that is planned before the final release.
This commit is contained in:
2024-04-10 20:50:45 -04:00
parent 0530ed0dc9
commit 8ca4bf2109
2 changed files with 115 additions and 50 deletions

View File

@@ -4,23 +4,52 @@
*/
class Feed {
/** @var string The XML namespace for Atom feeds (used to look up namespaced elements such as `updated`) */
public const ATOM_NS = 'http://www.w3.org/2005/Atom';
/** @var string The XML namespace of the RSS content module, whose `encoded` element carries a feed item's HTML content */
public const CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
/**
 * Convert XML parsing warnings into exceptions
 *
 * When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them. It is
 * meant to be installed with set_error_handler() around a DOMDocument::loadXML() call.
 *
 * @param int $errno The error level encountered
 * @param string $errstr The text of the error encountered
 * @return bool False, to delegate to the next error handler in the chain
 * @throws DOMException If the error is a warning raised by DOMDocument::loadXML()
 */
private static function xmlParseError(int $errno, string $errstr): bool {
    // PHP prefixes these warnings with "DOMDocument::loadXML()" (capital "XML"); match case-insensitively so the
    // check does not silently fail on the capitalization of the method name
    if ($errno === E_WARNING && stripos($errstr, 'DOMDocument::loadXML()') !== false) {
        throw new DOMException($errstr, $errno);
    }
    return false;
}
/**
 * Parse a feed into an XML tree
 *
 * @param string $content The feed's RSS content
 * @return array|DOMDocument[]|string[] [ 'ok' => feed ] if successful, [ 'error' => message] if not
 */
public static function parseFeed(string $content): array {
    // DOMDocument::loadXML() throws a ValueError (not a DOMException) for an empty string; report it as a normal error
    if ($content === '') return [ 'error' => 'Feed content is empty' ];
    // Parse warnings are turned into DOMExceptions by xmlParseError while this handler is installed
    set_error_handler(self::xmlParseError(...));
    try {
        $feed = new DOMDocument();
        // A false return covers parse failures that did not surface as a warning
        if ($feed->loadXML($content) === false) return [ 'error' => 'Unable to parse feed content as XML' ];
        return [ 'ok' => $feed ];
    } catch (DOMException $ex) {
        return [ 'error' => $ex->getMessage() ];
    } finally {
        // Always put the previous error handler back, whether parsing succeeded or not
        restore_error_handler();
    }
}
/**
* Retrieve the feed
*
* @param string $url
* @return array|SimpleXMLElement[]|string[] [ 'ok' => feedXml, 'url' => actualUrl ] if successful, [ 'error' => message ] if not
* @return array|DOMDocument[]|string[] [ 'ok' => feedXml, 'url' => actualUrl ] if successful,
* [ 'error' => message ] if not
*/
public static function retrieveFeed(string $url): array {
$feedReq = curl_init($url);
@@ -52,32 +81,46 @@ class Feed {
return $result;
}
/**
 * Get the value of a child element by its tag name
 *
 * Direct children of the element are checked first (compared by local name), so that a nested element with the same
 * name (for example, a `title` inside an `image`) that appears earlier in the document is not picked up by mistake.
 * If no direct child matches, the first matching descendant is used.
 *
 * @param DOMElement $element The parent element
 * @param string $tagName The name of the tag whose value should be obtained
 * @return string The value of the element (or "[element] not found" if that element does not exist)
 */
private static function eltValue(DOMElement $element, string $tagName): string {
    foreach ($element->childNodes as $child) {
        if ($child instanceof DOMElement && $child->localName === $tagName) {
            return $child->textContent;
        }
    }
    // No direct child by that name; fall back to the first descendant in document order
    $tags = $element->getElementsByTagName($tagName);
    return $tags->length === 0 ? "$tagName not found" : $tags->item(0)->textContent;
}
/**
* Update a feed's items
*
* @param int $feedId The ID of the feed to which these items belong
* @param SimpleXMLElement $channel The RSS feed items
* @param DOMElement $channel The RSS feed items
* @return array [ 'ok' => true ] if successful, [ 'error' => message ] if not
*/
public static function updateItems(int $feedId, SimpleXMLElement $channel): array {
public static function updateItems(int $feedId, DOMElement $channel): array {
try {
for ($i = 0; $i < sizeof($channel->item); $i++) {
$item = $channel->item[$i];
$itemGuid = (string)$item->guid ? $item->guid : $item->link;
foreach ($channel->getElementsByTagName('item') as $item) {
$itemGuid = self::eltValue($item, 'guid');
if ($itemGuid == 'guid not found') $itemGuid = self::eltValue($item, 'link');
$isNew = !Data::itemExists($feedId, $itemGuid);
if ($isNew) {
$title = (string)$item->title;
$link = (string)$item->link;
$published = (string)$item->pubDate;
// TODO: why is this getting all encoded content, and not just the one for the current item?
$encodedContent = $item->xpath('//content:encoded');
if ($encodedContent) {
$content = (string) $encodedContent[$i];
$title = self::eltValue($item, 'title');
$link = self::eltValue($item, 'link');
$published = self::eltValue($item, 'pubDate');
$updNodes = $item->getElementsByTagNameNS(self::ATOM_NS, 'updated');
$updated = $updNodes->length > 0 ? $updNodes->item(0)->textContent : null;
$encNodes = $item->getElementsByTagNameNS(self::CONTENT_NS, 'encoded');
if ($encNodes->length > 0) {
$content = $encNodes->item(0)->textContent;
$isEncoded = true;
} else {
$content = $item->description;
$content = self::eltValue($item, 'description');
$isEncoded = false;
}
Data::addItem($feedId, $itemGuid, $link, $title, $published, $content, $isEncoded);
Data::addItem($feedId, $itemGuid, $link, $title, $published, $updated, $content, $isEncoded);
} // TODO: else check updated date; may want to return that from the isNew check instead
}
} catch (Exception $ex) {
@@ -88,6 +131,7 @@ class Feed {
/**
* Add an RSS feed
*
* @param string $url The URL of the RSS feed to add
* @return array [ 'ok' => true ] if successful, [ 'error' => message ] if not
*/
@@ -95,8 +139,11 @@ class Feed {
$feed = self::retrieveFeed($url);
if (array_key_exists('error', $feed)) return $feed;
$channel = $feed['ok']->channel;
$feedId = Data::addFeed($feed['url'], (string) $channel->title, (string) $channel->lastBuildDate);
$channel = $feed['ok']->getElementsByTagName('channel')->item(0);
if (!$channel instanceof DOMElement) return [ 'error' => "Channel element not found ($channel->nodeType)" ];
$feedId = Data::addFeed($feed['url'], self::eltValue($channel, 'title'),
self::eltValue($channel, 'lastBuildDate'));
$result = self::updateItems($feedId, $channel);
if (array_key_exists('error', $result)) return $result;