From 8ca4bf2109be046ed6c22f95ad45def556180323 Mon Sep 17 00:00:00 2001
From: "Daniel J. Summers"
Date: Wed, 10 Apr 2024 20:50:45 -0400
Subject: [PATCH] Change from SimpleXML to DOM (#4)

This API is more reliable, and should help when implementing the "load a
site's HTML and look for feed links" functionality coming before the
final release
---
 src/lib/Data.php | 78 ++++++++++++++++++++++++++-----------------
 src/lib/Feed.php | 87 +++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 115 insertions(+), 50 deletions(-)

diff --git a/src/lib/Data.php b/src/lib/Data.php
index dcd599b..0969f36 100644
--- a/src/lib/Data.php
+++ b/src/lib/Data.php
@@ -99,8 +99,23 @@ class Data {
         $query->execute();
     }
 
+    /**
+     * Parse/format a date/time from a string
+     *
+     * @param ?string $value The date/time to be parsed and formatted
+     * @return string|null The date/time in `DateTimeInterface::ATOM` format, or `null` if the input cannot be parsed
+     */
+    private static function formatDate(?string $value): ?string {
+        try {
+            return $value ? (new DateTimeImmutable($value))->format(DateTimeInterface::ATOM) : null;
+        } catch (Exception) {
+            return null;
+        }
+    }
+
     /**
      * Add an RSS feed
+     *
      * @param string $url The URL for the RSS feed
      * @param string $title The title of the RSS feed
      * @param string $updatedOn The date/time the RSS feed was last updated (from the XML, not when we checked)
@@ -108,28 +123,25 @@ class Data {
      */
     public static function addFeed(string $url, string $title, string $updatedOn): int {
         $db = self::getConnection();
-        if ($updatedOn) {
-            try {
-                $updated = (new DateTimeImmutable($updatedOn))->format(DateTimeInterface::ATOM);
-            } catch (Exception) {
-                $updated = null;
-            }
-        } else {
-            $updated = null;
-        }
-        $query = $db->prepare('INSERT INTO feed (user_id, url, title, updated_on, checked_on)'
-            . ' VALUES (:user, :url, :title, :updated, :checked)');
-        $query->bindValue(':user', $_REQUEST['FRC_USER_ID']);
-        $query->bindValue(':url', $url);
-        $query->bindValue(':title', $title);
-        $query->bindValue(':updated', $updated);
-        $query->bindValue(':checked', (new DateTimeImmutable())->format(DateTimeInterface::ATOM));
+        $query = $db->prepare(<<<'SQL'
+            INSERT INTO feed (
+                user_id, url, title, updated_on, checked_on
+            ) VALUES (
+                :user, :url, :title, :updated, :checked
+            )
+            SQL);
+        $query->bindValue(':user',    $_REQUEST['FRC_USER_ID']);
+        $query->bindValue(':url',     $url);
+        $query->bindValue(':title',   $title);
+        $query->bindValue(':updated', self::formatDate($updatedOn));
+        $query->bindValue(':checked', self::formatDate('now'));
         $result = $query->execute();
         return $result ? $db->lastInsertRowID() : -1;
     }
 
     /**
      * Does a feed item already exist?
+     *
      * @param int $feedId The ID of the feed to which the item belongs
      * @param string $guid The GUID from the RSS feed, uniquely identifying the item
      * @return bool True if the item exists, false if not
@@ -145,28 +157,34 @@ class Data {
 
     /**
      * Add a feed item
+     *
      * @param int $feedId The ID of the feed to which the item should be added
      * @param string $guid The GUID from the RSS feed (uses link if `<guid>` not specified)
      * @param string $link The link to this item
      * @param string $title The title of the item
-     * @param string $published The date/time the item was published
+     * @param string $publishedOn The date/time the item was published
+     * @param ?string $updatedOn The date/time the item was last updated
      * @param string $content The content of the item
      * @param bool $isEncoded Whether the content has HTML (true) or is plaintext (false)
-     * @throws Exception If the published date is not valid
      */
-    public static function addItem(int $feedId, string $guid, string $link, string $title, string $published,
-                                   string $content, bool $isEncoded): void {
+    public static function addItem(int $feedId, string $guid, string $link, string $title, string $publishedOn,
+                                   ?string $updatedOn, string $content, bool $isEncoded): void {
         $db = self::getConnection();
-        $query = $db->prepare(
-            'INSERT INTO item (feed_id, item_guid, item_link, title, published_on, content, is_encoded)'
-            . ' VALUES (:feed, :guid, :link, :title, :published, :content, :encoded)');
-        $query->bindValue(':feed', $feedId);
-        $query->bindValue(':guid', $guid);
-        $query->bindValue(':link', $link);
-        $query->bindValue(':title', $title);
-        $query->bindValue(':published', (new DateTimeImmutable($published))->format(DateTimeInterface::ATOM));
-        $query->bindValue(':content', $content);
-        $query->bindValue(':encoded', $isEncoded);
+        $query = $db->prepare(<<<'SQL'
+            INSERT INTO item (
+                feed_id, item_guid, item_link, title, published_on, updated_on, content, is_encoded
+            ) VALUES (
+                :feed, :guid, :link, :title, :published, :updated, :content, :encoded
+            )
+            SQL);
+        $query->bindValue(':feed',      $feedId);
+        $query->bindValue(':guid',      $guid);
+        $query->bindValue(':link',      $link);
+        $query->bindValue(':title',     $title);
+        $query->bindValue(':published', self::formatDate($publishedOn));
+        $query->bindValue(':updated',   self::formatDate($updatedOn));
+        $query->bindValue(':content',   $content);
+        $query->bindValue(':encoded',   $isEncoded);
         $query->execute();
     }
 }
diff --git a/src/lib/Feed.php b/src/lib/Feed.php
index 1b0e09a..fc8120d 100644
--- a/src/lib/Feed.php
+++ b/src/lib/Feed.php
@@ -4,23 +4,52 @@
  */
 class Feed {
 
+    /** @var string The XML namespace for Atom feeds */
+    public const ATOM_NS = 'http://www.w3.org/2005/Atom';
+
+    /** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
+    public const CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
+
+    /**
+     * When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
+     *
+     * @param int $errno The error level encountered
+     * @param string $errstr The text of the error encountered
+     * @return bool False, to delegate to the next error handler in the chain
+     * @throws DOMException If the error is a warning raised by `DOMDocument::loadXML()`
+     */
+    private static function xmlParseError(int $errno, string $errstr): bool {
+        if ($errno === E_WARNING && stripos($errstr, 'DOMDocument::loadXML()') !== false) {
+            throw new DOMException($errstr, $errno);
+        }
+        return false;
+    }
+
     /**
      * Parse a feed into an XML tree
+     *
      * @param string $content The feed's RSS content
-     * @return array|SimpleXMLElement[]|string[] [ 'ok' => feed ] if successful, [ 'error' => message] if not
+     * @return array|DOMDocument[]|string[] [ 'ok' => feed ] if successful, [ 'error' => message] if not
      */
     public static function parseFeed(string $content): array {
+        set_error_handler(self::xmlParseError(...));
         try {
-            return [ 'ok' => new SimpleXMLElement($content) ];
-        } catch (Exception $ex) {
+            $feed = new DOMDocument();
+            $feed->loadXML($content);
+            return [ 'ok' => $feed ];
+        } catch (DOMException|ValueError $ex) {
             return [ 'error' => $ex->getMessage() ];
+        } finally {
+            restore_error_handler();
         }
     }
 
     /**
      * Retrieve the feed
+     *
      * @param string $url
-     * @return array|SimpleXMLElement[]|string[] [ 'ok' => feedXml, 'url' => actualUrl ] if successful, [ 'error' => message ] if not
+     * @return array|DOMDocument[]|string[] [ 'ok' => feedXml, 'url' => actualUrl ] if successful,
+     *                                      [ 'error' => message ] if not
      */
     public static function retrieveFeed(string $url): array {
         $feedReq = curl_init($url);
@@ -52,32 +81,46 @@ class Feed {
         return $result;
     }
 
+    /**
+     * Get the value of a child element by its tag name
+     *
+     * @param DOMElement $element The parent element
+     * @param string $tagName The name of the tag whose value should be obtained
+     * @return string The value of the element (or "[element] not found" if that element does not exist)
+     */
+    private static function eltValue(DOMElement $element, string $tagName): string {
+        $tags = $element->getElementsByTagName($tagName);
+        return $tags->length == 0 ? "$tagName not found" : $tags->item(0)->textContent;
+    }
+
     /**
      * Update a feed's items
+     *
      * @param int $feedId The ID of the feed to which these items belong
-     * @param SimpleXMLElement $channel The RSS feed items
+     * @param DOMElement $channel The RSS feed items
      * @return array [ 'ok' => true ] if successful, [ 'error' => message ] if not
      */
-    public static function updateItems(int $feedId, SimpleXMLElement $channel): array {
+    public static function updateItems(int $feedId, DOMElement $channel): array {
         try {
-            for ($i = 0; $i < sizeof($channel->item); $i++) {
-                $item = $channel->item[$i];
-                $itemGuid = (string)$item->guid ? $item->guid : $item->link;
+            foreach ($channel->getElementsByTagName('item') as $item) {
+                $itemGuid = self::eltValue($item, 'guid');
+                if ($itemGuid == 'guid not found') $itemGuid = self::eltValue($item, 'link');
                 $isNew = !Data::itemExists($feedId, $itemGuid);
                 if ($isNew) {
-                    $title = (string)$item->title;
-                    $link = (string)$item->link;
-                    $published = (string)$item->pubDate;
-                    // TODO: why is this getting all encoded content, and not just the one for the current item?
-                    $encodedContent = $item->xpath('//content:encoded');
-                    if ($encodedContent) {
-                        $content = (string) $encodedContent[$i];
+                    $title = self::eltValue($item, 'title');
+                    $link = self::eltValue($item, 'link');
+                    $published = self::eltValue($item, 'pubDate');
+                    $updNodes = $item->getElementsByTagNameNS(self::ATOM_NS, 'updated');
+                    $updated = $updNodes->length > 0 ? $updNodes->item(0)->textContent : null;
+                    $encNodes = $item->getElementsByTagNameNS(self::CONTENT_NS, 'encoded');
+                    if ($encNodes->length > 0) {
+                        $content = $encNodes->item(0)->textContent;
                         $isEncoded = true;
                     } else {
-                        $content = $item->description;
+                        $content = self::eltValue($item, 'description');
                         $isEncoded = false;
                     }
-                    Data::addItem($feedId, $itemGuid, $link, $title, $published, $content, $isEncoded);
+                    Data::addItem($feedId, $itemGuid, $link, $title, $published, $updated, $content, $isEncoded);
                 } // TODO: else check updated date; may want to return that from the isNew check instead
             }
         } catch (Exception $ex) {
@@ -88,6 +131,7 @@ class Feed {
 
     /**
      * Add an RSS feed
+     *
      * @param string $url The URL of the RSS feed to add
      * @return array [ 'ok' => true ] if successful, [ 'error' => message ] if not
      */
@@ -95,8 +139,11 @@ class Feed {
         $feed = self::retrieveFeed($url);
         if (array_key_exists('error', $feed)) return $feed;
 
-        $channel = $feed['ok']->channel;
-        $feedId = Data::addFeed($feed['url'], (string) $channel->title, (string) $channel->lastBuildDate);
+        $channel = $feed['ok']->getElementsByTagName('channel')->item(0);
+        if (!$channel instanceof DOMElement) return [ 'error' => "Channel element not found ({$channel?->nodeType})" ];
+
+        $feedId = Data::addFeed($feed['url'], self::eltValue($channel, 'title'),
+                                self::eltValue($channel, 'lastBuildDate'));
         $result = self::updateItems($feedId, $channel);
         if (array_key_exists('error', $result)) return $result;
 