diff --git a/src/lib/Feed.php b/src/lib/Feed.php index 79a9859..14ec00d 100644 --- a/src/lib/Feed.php +++ b/src/lib/Feed.php @@ -1,15 +1,55 @@ ` tag that allows HTML content in a feed */ public const string CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/'; + /** @var string The XML namespace for XHTML */ + public const string XHTML_NS = 'http://www.w3.org/1999/xhtml'; + /** * When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them * @@ -45,24 +85,135 @@ class Feed { } /** - * Get the value of a child element by its tag name + * Get the value of a child element by its tag name for an RSS feed * * @param DOMElement $element The parent element * @param string $tagName The name of the tag whose value should be obtained * @return string The value of the element (or "[element] not found" if that element does not exist) */ - private static function eltValue(DOMElement $element, string $tagName): string { + private static function rssValue(DOMElement $element, string $tagName): string { $tags = $element->getElementsByTagName($tagName); return $tags->length == 0 ? "$tagName not found" : $tags->item(0)->textContent; } + /** + * Extract items from an RSS feed + * + * @param DOMDocument $xml The XML received from the feed + * @param string $url The actual URL for the feed + * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not + */ + private static function fromRSS(DOMDocument $xml, string $url): array { + $channel = $xml->getElementsByTagName('channel')->item(0); + if (!($channel instanceof DOMElement)) { + return ['error' => "Channel element not found ($channel->nodeType)"]; + } + + $feed = new Feed(); + $feed->title = self::rssValue($channel, 'title'); + $feed->url = $url; + + // The Atom namespace provides a lastBuildDate, which contains the last time an item in the feed was updated; if + // that is not present, use the pubDate element instead + $feed->updatedOn = self::rssValue($channel, 'lastBuildDate'); + if ($feed->updatedOn == 'lastBuildDate not found') { + $feed->updatedOn = self::rssValue($channel, 'pubDate'); + if ($feed->updatedOn == 'pubDate not found') $feed->updatedOn = null; + } + $feed->updatedOn = Data::formatDate($feed->updatedOn); + + foreach ($channel->getElementsByTagName('item') as $xmlItem) { + $itemGuid = self::rssValue($xmlItem, 'guid'); + $updNodes = $xmlItem->getElementsByTagNameNS(Feed::ATOM_NS, 'updated'); + $encNodes = $xmlItem->getElementsByTagNameNS(Feed::CONTENT_NS, 'encoded'); + $item = new FeedItem(); + $item->guid = $itemGuid == 'guid not found' ? self::rssValue($xmlItem, 'link') : $itemGuid; + $item->title = self::rssValue($xmlItem, 'title'); + $item->link = self::rssValue($xmlItem, 'link'); + $item->publishedOn = Data::formatDate(self::rssValue($xmlItem, 'pubDate')); + $item->updatedOn = Data::formatDate($updNodes->length > 0 ? $updNodes->item(0)->textContent : null); + $item->content = $encNodes->length > 0 + ? $encNodes->item(0)->textContent + : self::rssValue($xmlItem, 'description'); + $feed->items[] = $item; + } + + return ['ok' => $feed]; + } + + /** + * Get the value of a child element by its tag name for an Atom feed + * + * (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as + * regular string values; for "xhtml", though, we will need to get the `
` and extract its contents instead.) + * + * @param DOMElement $element The parent element + * @param string $tagName The name of the tag whose value should be obtained + * @return string The value of the element (or "[element] not found" if that element does not exist) + */ + private static function atomValue(DOMElement $element, string $tagName): string { + $tags = $element->getElementsByTagName($tagName); + if ($tags->length == 0) return "$tagName not found"; + $tag = $tags->item(0); + if (!($tag instanceof DOMElement)) return $tag->textContent; + if ($tag->hasAttributes() && $tag->attributes->getNamedItem('type') == 'xhtml') { + $div = $tag->getElementsByTagNameNS(Feed::XHTML_NS, 'div'); + if ($div->length == 0) return "-- invalid XHTML content --"; + return $div->item(0)->textContent; + } + return $tag->textContent; + } + + /** + * Extract items from an Atom feed + * + * @param DOMDocument $xml The XML received from the feed + * @param string $url The actual URL for the feed + * @return array|Feed[] ['ok' => feed] + */ + private static function fromAtom(DOMDocument $xml, string $url): array { + /** @var DOMElement $root */ + $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0); + $feed = new Feed(); + $feed->title = self::atomValue($root, 'title'); + $feed->url = $url; + + $feed->updatedOn = self::atomValue($root, 'updated'); + if ($feed->updatedOn == 'pubDate not found') $feed->updatedOn = null; + $feed->updatedOn = Data::formatDate($feed->updatedOn); + + foreach ($root->getElementsByTagName('entry') as $xmlItem) { + $guid = self::atomValue($xmlItem, 'id'); + $link = ''; + foreach ($xmlItem->getElementsByTagName('link') as $linkElt) { + if ($linkElt->hasAttributes()) { + $relAttr = $linkElt->attributes->getNamedItem('rel'); + if ($relAttr && $relAttr->value == 'alternate') { + $link = $linkElt->attributes->getNamedItem('href')->value; + break; + } + } + } + if ($link == '' && str_starts_with($guid, 'http')) $link = $guid; + + $item = new FeedItem(); + $item->guid = $guid; + $item->title = self::atomValue($xmlItem, 'title'); + $item->link = $link; + $item->publishedOn = Data::formatDate(self::atomValue($xmlItem, 'published')); + $item->updatedOn = Data::formatDate(self::atomValue($xmlItem, 'updated')); + $item->content = self::atomValue($xmlItem, 'content'); + $feed->items[] = $item; + } + + return ['ok' => $feed]; + } + /** * Retrieve the feed * - * @param string $url - * @return array|DOMDocument[]|string[]|DOMElement[] - * ['ok' => feedXml, 'url' => actualUrl, 'channel' => channel, 'updated' => updatedDate] if successful, - * ['error' => message] if not + * @param string $url The URL of the feed to retrieve + * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not */ public static function retrieveFeed(string $url): array { $feedReq = curl_init($url); @@ -83,26 +234,9 @@ class Feed { if (array_key_exists('error', $parsed)) { $result['error'] = $parsed['error']; } else { - $result['ok'] = $parsed['ok']; - $result['url'] = curl_getinfo($feedReq, CURLINFO_EFFECTIVE_URL); - - $channel = $result['ok']->getElementsByTagName('channel')->item(0); - if ($channel instanceof DOMElement) { - $result['channel'] = $channel; - } else { - return ['error' => "Channel element not found ($channel->nodeType)"]; - } - - // In Atom feeds, lastBuildDate contains the last time an item in the feed was updated; if that is not - // present, use the pubDate element instead - $updated = self::eltValue($channel, 'lastBuildDate'); - if ($updated == 'lastBuildDate not found') { - $updated = self::eltValue($channel, 'pubDate'); - if ($updated == 'pubDate not found') $updated = null; - } - $result['updated'] = Data::formatDate($updated); - return $result; - + $extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0 + ? self::fromAtom(...) : self::fromRSS(...); + $result = $extract($parsed['ok'], curl_getinfo($feedReq, CURLINFO_EFFECTIVE_URL)); } } else { $result['error'] = "Prospective feed URL $url returned HTTP Code $code: $feedContent"; @@ -112,35 +246,14 @@ class Feed { return $result; } - /** - * Extract the fields we need to keep from the feed - * - * @param DOMElement $item The item from the feed - * @return array The fields for the item as an associative array - */ - private static function itemFields(DOMElement $item): array { - $itemGuid = self::eltValue($item, 'guid'); - $updNodes = $item->getElementsByTagNameNS(self::ATOM_NS, 'updated'); - $encNodes = $item->getElementsByTagNameNS(self::CONTENT_NS, 'encoded'); - return [ - 'guid' => $itemGuid == 'guid not found' ? self::eltValue($item, 'link') : $itemGuid, - 'title' => self::eltValue($item, 'title'), - 'link' => self::eltValue($item, 'link'), - 'published' => Data::formatDate(self::eltValue($item, 'pubDate')), - 'updated' => Data::formatDate($updNodes->length > 0 ? $updNodes->item(0)->textContent : null), - 'content' => $encNodes->length > 0 ? $encNodes->item(0)->textContent - : self::eltValue($item, 'description') - ]; - } - /** * Update a feed item * * @param int $itemId The ID of the item to be updated - * @param array $item The fields from the updated item + * @param FeedItem $item The item to be updated * @param SQLite3 $db A database connection to use for the update */ - private static function updateItem(int $itemId, array $item, SQLite3 $db): void { + private static function updateItem(int $itemId, FeedItem $item, SQLite3 $db): void { $query = $db->prepare(<<<'SQL' UPDATE item SET title = :title, @@ -150,10 +263,10 @@ class Feed { is_read = 0 WHERE id = :id SQL); - $query->bindValue(':title', $item['title']); - $query->bindValue(':published', $item['published']); - $query->bindValue(':updated', $item['updated']); - $query->bindValue(':content', $item['content']); + $query->bindValue(':title', $item->title); + $query->bindValue(':published', $item->publishedOn); + $query->bindValue(':updated', $item->updatedOn); + $query->bindValue(':content', $item->content); $query->bindValue(':id', $itemId); $query->execute(); } @@ -162,10 +275,10 @@ class Feed { * Add a feed item * * @param int $feedId The ID of the feed to which the item should be added - * @param array $item The fields for the item + * @param FeedItem $item The item to be added * @param SQLite3 $db A database connection to use for the addition */ - private static function addItem(int $feedId, array $item, SQLite3 $db): void { + private static function addItem(int $feedId, FeedItem $item, SQLite3 $db): void { $query = $db->prepare(<<<'SQL' INSERT INTO item ( feed_id, item_guid, item_link, title, published_on, updated_on, content @@ -174,12 +287,12 @@ class Feed { ) SQL); $query->bindValue(':feed', $feedId); - $query->bindValue(':guid', $item['guid']); - $query->bindValue(':link', $item['link']); - $query->bindValue(':title', $item['title']); - $query->bindValue(':published', $item['published']); - $query->bindValue(':updated', $item['updated']); - $query->bindValue(':content', $item['content']); + $query->bindValue(':guid', $item->guid); + $query->bindValue(':link', $item->link); + $query->bindValue(':title', $item->title); + $query->bindValue(':published', $item->publishedOn); + $query->bindValue(':updated', $item->updatedOn); + $query->bindValue(':content', $item->content); $query->execute(); } @@ -187,23 +300,22 @@ class Feed { * Update a feed's items * * @param int $feedId The ID of the feed to which these items belong - * @param DOMElement $channel The RSS feed items + * @param Feed $feed The extracted Atom or RSS feed items * @return array ['ok' => true] if successful, ['error' => message] if not */ - public static function updateItems(int $feedId, DOMElement $channel, SQLite3 $db): array { + public static function updateItems(int $feedId, Feed $feed, SQLite3 $db): array { try { - foreach ($channel->getElementsByTagName('item') as $rawItem) { - $item = self::itemFields($rawItem); + foreach ($feed->items as $item) { $existsQuery = $db->prepare( 'SELECT id, published_on, updated_on FROM item WHERE feed_id = :feed AND item_guid = :guid'); $existsQuery->bindValue(':feed', $feedId); - $existsQuery->bindValue(':guid', $item['guid']); + $existsQuery->bindValue(':guid', $item->guid); $exists = $existsQuery->execute(); if ($exists) { $existing = $exists->fetchArray(SQLITE3_ASSOC); if ($existing) { - if ( $existing['published_on'] != $item['published'] - || $existing['updated_on'] ?? '' != $item['updated'] ?? '') { + if ( $existing['published_on'] != $item->publishedOn + || ($existing['updated_on'] ?? '') != ($item->updatedOn ?? '')) { self::updateItem($existing['id'], $item, $db); } } else { @@ -234,13 +346,14 @@ class Feed { $feedId = $feedResult ? $feedResult->fetchArray(SQLITE3_NUM)[0] : -1; if ($feedId < 0) return ['error' => "No feed for URL $url found"]; - $feed = self::retrieveFeed($url); - if (array_key_exists('error', $feed)) return $feed; + $feedExtract = self::retrieveFeed($url); + if (array_key_exists('error', $feedExtract)) return $feedExtract; - $itemUpdate = self::updateItems($feedId, $feed['channel'], $db); + $feed = $feedExtract['ok']; + $itemUpdate = self::updateItems($feedId, $feed, $db); if (array_key_exists('error', $itemUpdate)) return $itemUpdate; - $urlUpdate = $url == $feed['url'] ? '' : ', url = :url'; + $urlUpdate = $url == $feed->url ? '' : ', url = :url'; $feedUpdate = $db->prepare(<<bindValue(':title', self::eltValue($feed['channel'], 'title')); - $feedUpdate->bindValue(':updated', $feed['updated']); + $feedUpdate->bindValue(':title', $feed->title); + $feedUpdate->bindValue(':updated', $feed->updatedOn); $feedUpdate->bindValue(':checked', Data::formatDate('now')); $feedUpdate->bindValue(':id', $feedId); - if ($urlUpdate != '') $feedUpdate->bindValue(':url', $feed['url']); + if ($urlUpdate != '') $feedUpdate->bindValue(':url', $feed->url); $feedUpdate->execute(); return ['ok' => true]; @@ -266,24 +379,25 @@ class Feed { * @return array ['ok' => feedId] if successful, ['error' => message] if not */ public static function add(string $url, SQLite3 $db): array { - $feed = self::retrieveFeed($url); - if (array_key_exists('error', $feed)) return $feed; + $feedExtract = self::retrieveFeed($url); + if (array_key_exists('error', $feedExtract)) return $feedExtract; + $feed = $feedExtract['ok']; $query = $db->prepare(<<<'SQL' INSERT INTO feed (user_id, url, title, updated_on, checked_on) VALUES (:user, :url, :title, :updated, :checked) SQL); $query->bindValue(':user', $_SESSION[Key::USER_ID]); - $query->bindValue(':url', $feed['url']); - $query->bindValue(':title', self::eltValue($feed['channel'], 'title')); - $query->bindValue(':updated', $feed['updated']); + $query->bindValue(':url', $feed->url); + $query->bindValue(':title', $feed->title); + $query->bindValue(':updated', $feed->updatedOn); $query->bindValue(':checked', Data::formatDate('now')); $result = $query->execute(); $feedId = $result ? $db->lastInsertRowID() : -1; if ($feedId < 0) return ['error' => $db->lastErrorMsg()]; - $result = self::updateItems($feedId, $feed['channel'], $db); + $result = self::updateItems($feedId, $feed, $db); if (array_key_exists('error', $result)) return $result; return ['ok' => $feedId]; @@ -294,6 +408,7 @@ class Feed { * * @param array $existing The existing RSS feed * @param string $url The URL with which the existing feed should be modified + * @param SQLite3 $db The database connection on which to execute the update * @return bool[]|string[] [ 'ok' => true ] if successful, [ 'error' => message ] if not */ public static function update(array $existing, string $url, SQLite3 $db): array { @@ -307,7 +422,9 @@ class Feed { } /** - * @param SQLite3 $db + * Refresh all feeds + * + * @param SQLite3 $db The database connection to use for refreshing feeds * @return array|true[] ['ok => true] if successful, ['error' => message] if not (may have multiple error lines) */ public static function refreshAll(SQLite3 $db): array { diff --git a/src/public/index.php b/src/public/index.php index 9b75399..3775c26 100644 --- a/src/public/index.php +++ b/src/public/index.php @@ -35,7 +35,7 @@ page_head('Welcome'); ?> if ($item) { while ($item) { ?>

>
-

fetchArray(SQLITE3_ASSOC); } } else { ?> diff --git a/src/public/item.php b/src/public/item.php index ee4a714..43fe030 100644 --- a/src/public/item.php +++ b/src/public/item.php @@ -31,7 +31,7 @@ if ($_SERVER['REQUEST_METHOD'] == 'POST') { } $query = $db->prepare(<<<'SQL' - SELECT item.title AS item_title, item.item_link, item.published_on, item.updated_on, item.content, item.is_encoded, + SELECT item.title AS item_title, item.item_link, item.published_on, item.updated_on, item.content, feed.title AS feed_title FROM item INNER JOIN feed ON feed.id = item.feed_id WHERE item.id = :id