Add Atom support (#17)

This commit is contained in:
Daniel J. Summers 2024-04-25 21:19:29 -04:00
parent 9d476b644b
commit ce83b2a389
2 changed files with 200 additions and 83 deletions

View File

@ -1,15 +1,55 @@
<?php
/**
* Information for a feed item
*
* A plain data holder with public, defaulted properties; instances are filled in by the feed extraction methods on
* `Feed` (`fromRSS` / `fromAtom`) and then handed to the item add / update queries.
*/
class FeedItem {
/** @var string The title of the feed item */
public string $title = '';
/**
* @var string The unique ID for the feed item (for RSS, the `guid` element, falling back to the item's link when no
* `guid` is present; for Atom, the entry's `id` element)
*/
public string $guid = '';
/** @var string The link to the original content */
public string $link = '';
/** @var string When this item was published (set from the result of `Data::formatDate`) */
public string $publishedOn = '';
/** @var ?string When this item was last updated (null until an updated date has been assigned) */
public ?string $updatedOn = null;
/** @var string The content for the item */
public string $content = '';
}
/**
* Feed retrieval, parsing, and manipulation
*/
class Feed {
/** @var string The URL for the feed */
public string $url = '';
/** @var string The title of the feed */
public string $title = '';
/** @var ?string When the feed was last updated */
public ?string $updatedOn = null;
/** @var FeedItem[] The items contained in the feed */
public array $items = [];
/** @var string The XML namespace for Atom feeds */
public const string ATOM_NS = 'http://www.w3.org/2005/Atom';
/** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
public const string CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
/** @var string The XML namespace for XHTML */
public const string XHTML_NS = 'http://www.w3.org/1999/xhtml';
/**
* When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
*
@ -45,24 +85,135 @@ class Feed {
}
/**
* Get the value of a child element by its tag name
* Get the value of a child element by its tag name for an RSS feed
*
* @param DOMElement $element The parent element
* @param string $tagName The name of the tag whose value should be obtained
* @return string The value of the element (or "[element] not found" if that element does not exist)
*/
private static function eltValue(DOMElement $element, string $tagName): string {
private static function rssValue(DOMElement $element, string $tagName): string {
    // Look up every descendant with the requested tag name; only the first match is of interest
    $matches = $element->getElementsByTagName($tagName);
    if ($matches->length == 0) {
        // Callers compare against this sentinel text to detect a missing element
        return "$tagName not found";
    }
    return $matches->item(0)->textContent;
}
/**
 * Extract items from an RSS feed
 *
 * @param DOMDocument $xml The XML received from the feed
 * @param string $url The actual URL for the feed
 * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
 */
private static function fromRSS(DOMDocument $xml, string $url): array {
    $channel = $xml->getElementsByTagName('channel')->item(0);
    if (!($channel instanceof DOMElement)) {
        // item(0) is null when the document has no <channel>; use null-safe access so that building the error
        // message does not itself raise an "attempt to read property on null" warning
        $nodeType = $channel?->nodeType;
        return ['error' => "Channel element not found ($nodeType)"];
    }
    $feed = new Feed();
    $feed->title = self::rssValue($channel, 'title');
    $feed->url = $url;
    // RSS 2.0 provides a lastBuildDate element on the channel, which contains the last time the content of the
    // feed changed; if that is not present, use the pubDate element instead
    $feed->updatedOn = self::rssValue($channel, 'lastBuildDate');
    if ($feed->updatedOn == 'lastBuildDate not found') {
        $feed->updatedOn = self::rssValue($channel, 'pubDate');
        if ($feed->updatedOn == 'pubDate not found') $feed->updatedOn = null;
    }
    $feed->updatedOn = Data::formatDate($feed->updatedOn);
    foreach ($channel->getElementsByTagName('item') as $xmlItem) {
        $itemGuid = self::rssValue($xmlItem, 'guid');
        $itemLink = self::rssValue($xmlItem, 'link');
        // Items may carry an Atom <updated> element and/or a <content:encoded> element with full HTML content
        $updNodes = $xmlItem->getElementsByTagNameNS(self::ATOM_NS, 'updated');
        $encNodes = $xmlItem->getElementsByTagNameNS(self::CONTENT_NS, 'encoded');
        $item = new FeedItem();
        // When an item has no guid, its link serves as its unique ID
        $item->guid = $itemGuid == 'guid not found' ? $itemLink : $itemGuid;
        $item->title = self::rssValue($xmlItem, 'title');
        $item->link = $itemLink;
        $item->publishedOn = Data::formatDate(self::rssValue($xmlItem, 'pubDate'));
        $item->updatedOn = Data::formatDate($updNodes->length > 0 ? $updNodes->item(0)->textContent : null);
        // Prefer the encoded (HTML) content; fall back to the plain description
        $item->content = $encNodes->length > 0
            ? $encNodes->item(0)->textContent
            : self::rssValue($xmlItem, 'description');
        $feed->items[] = $item;
    }
    return ['ok' => $feed];
}
/**
 * Get the value of a child element by its tag name for an Atom feed
 *
 * (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
 * regular string values; for "xhtml", though, we will need to get the `<div>` and extract its contents instead.)
 *
 * @param DOMElement $element The parent element
 * @param string $tagName The name of the tag whose value should be obtained
 * @return string The value of the element (or "[element] not found" if that element does not exist); for "xhtml"
 *                typed elements without the required `<div>`, "-- invalid XHTML content --"
 */
private static function atomValue(DOMElement $element, string $tagName): string {
    $tags = $element->getElementsByTagName($tagName);
    if ($tags->length == 0) return "$tagName not found";
    $tag = $tags->item(0);
    if (!($tag instanceof DOMElement)) return $tag->textContent;
    // Compare the attribute's string value; getNamedItem() returns a DOMAttr object, which never equals the string
    // 'xhtml', so the XHTML branch could not be reached. getAttribute() returns '' when the attribute is absent.
    if ($tag->getAttribute('type') === 'xhtml') {
        $div = $tag->getElementsByTagNameNS(self::XHTML_NS, 'div');
        if ($div->length == 0) return "-- invalid XHTML content --";
        // NOTE(review): textContent drops the XHTML markup inside the div - confirm plain text is what is wanted
        return $div->item(0)->textContent;
    }
    return $tag->textContent;
}
/**
 * Extract items from an Atom feed
 *
 * The caller is expected to have verified that the document contains an Atom `<feed>` element (retrieveFeed only
 * dispatches here when one is present).
 *
 * @param DOMDocument $xml The XML received from the feed
 * @param string $url The actual URL for the feed
 * @return array|Feed[] ['ok' => feed]
 */
private static function fromAtom(DOMDocument $xml, string $url): array {
    /** @var DOMElement $root */
    $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);
    $feed = new Feed();
    $feed->title = self::atomValue($root, 'title');
    $feed->url = $url;
    $feed->updatedOn = self::atomValue($root, 'updated');
    // atomValue() reports a missing element as "[tag] not found"; the tag requested here is "updated"
    if ($feed->updatedOn == 'updated not found') $feed->updatedOn = null;
    $feed->updatedOn = Data::formatDate($feed->updatedOn);
    foreach ($root->getElementsByTagName('entry') as $xmlItem) {
        $guid = self::atomValue($xmlItem, 'id');
        $link = '';
        foreach ($xmlItem->getElementsByTagName('link') as $linkElt) {
            if (!($linkElt instanceof DOMElement)) continue;
            // Per RFC 4287, a link without a rel attribute is interpreted as rel="alternate"
            $rel = $linkElt->hasAttribute('rel') ? $linkElt->getAttribute('rel') : 'alternate';
            // Skip alternate links that carry no href rather than dereferencing a missing attribute
            if ($rel == 'alternate' && $linkElt->hasAttribute('href')) {
                $link = $linkElt->getAttribute('href');
                break;
            }
        }
        // With no usable alternate link, an ID that looks like a URL doubles as the link
        if ($link == '' && str_starts_with($guid, 'http')) $link = $guid;
        $item = new FeedItem();
        $item->guid = $guid;
        $item->title = self::atomValue($xmlItem, 'title');
        $item->link = $link;
        $item->publishedOn = Data::formatDate(self::atomValue($xmlItem, 'published'));
        $item->updatedOn = Data::formatDate(self::atomValue($xmlItem, 'updated'));
        $item->content = self::atomValue($xmlItem, 'content');
        $feed->items[] = $item;
    }
    return ['ok' => $feed];
}
/**
* Retrieve the feed
*
* @param string $url
* @return array|DOMDocument[]|string[]|DOMElement[]
* ['ok' => feedXml, 'url' => actualUrl, 'channel' => channel, 'updated' => updatedDate] if successful,
* ['error' => message] if not
* @param string $url The URL of the feed to retrieve
* @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
*/
public static function retrieveFeed(string $url): array {
$feedReq = curl_init($url);
@ -83,26 +234,9 @@ class Feed {
if (array_key_exists('error', $parsed)) {
$result['error'] = $parsed['error'];
} else {
$result['ok'] = $parsed['ok'];
$result['url'] = curl_getinfo($feedReq, CURLINFO_EFFECTIVE_URL);
$channel = $result['ok']->getElementsByTagName('channel')->item(0);
if ($channel instanceof DOMElement) {
$result['channel'] = $channel;
} else {
return ['error' => "Channel element not found ($channel->nodeType)"];
}
// In Atom feeds, lastBuildDate contains the last time an item in the feed was updated; if that is not
// present, use the pubDate element instead
$updated = self::eltValue($channel, 'lastBuildDate');
if ($updated == 'lastBuildDate not found') {
$updated = self::eltValue($channel, 'pubDate');
if ($updated == 'pubDate not found') $updated = null;
}
$result['updated'] = Data::formatDate($updated);
return $result;
$extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
? self::fromAtom(...) : self::fromRSS(...);
$result = $extract($parsed['ok'], curl_getinfo($feedReq, CURLINFO_EFFECTIVE_URL));
}
} else {
$result['error'] = "Prospective feed URL $url returned HTTP Code $code: $feedContent";
@ -112,35 +246,14 @@ class Feed {
return $result;
}
/**
 * Pull the values we persist out of a single feed item element
 *
 * @param DOMElement $item The item from the feed
 * @return array The fields for the item as an associative array (guid, title, link, published, updated, content)
 */
private static function itemFields(DOMElement $item): array {
    $guid = self::eltValue($item, 'guid');
    $updatedNodes = $item->getElementsByTagNameNS(self::ATOM_NS, 'updated');
    $encodedNodes = $item->getElementsByTagNameNS(self::CONTENT_NS, 'encoded');

    $fields = [];
    // An item without a guid is identified by its link instead
    if ($guid == 'guid not found') {
        $guid = self::eltValue($item, 'link');
    }
    $fields['guid'] = $guid;
    $fields['title'] = self::eltValue($item, 'title');
    $fields['link'] = self::eltValue($item, 'link');
    $fields['published'] = Data::formatDate(self::eltValue($item, 'pubDate'));

    $updatedText = null;
    if ($updatedNodes->length > 0) {
        $updatedText = $updatedNodes->item(0)->textContent;
    }
    $fields['updated'] = Data::formatDate($updatedText);

    // Encoded content wins over the description when both are available
    if ($encodedNodes->length > 0) {
        $fields['content'] = $encodedNodes->item(0)->textContent;
    } else {
        $fields['content'] = self::eltValue($item, 'description');
    }
    return $fields;
}
/**
* Update a feed item
*
* @param int $itemId The ID of the item to be updated
* @param array $item The fields from the updated item
* @param FeedItem $item The item to be updated
* @param SQLite3 $db A database connection to use for the update
*/
private static function updateItem(int $itemId, array $item, SQLite3 $db): void {
private static function updateItem(int $itemId, FeedItem $item, SQLite3 $db): void {
$query = $db->prepare(<<<'SQL'
UPDATE item
SET title = :title,
@ -150,10 +263,10 @@ class Feed {
is_read = 0
WHERE id = :id
SQL);
$query->bindValue(':title', $item['title']);
$query->bindValue(':published', $item['published']);
$query->bindValue(':updated', $item['updated']);
$query->bindValue(':content', $item['content']);
$query->bindValue(':title', $item->title);
$query->bindValue(':published', $item->publishedOn);
$query->bindValue(':updated', $item->updatedOn);
$query->bindValue(':content', $item->content);
$query->bindValue(':id', $itemId);
$query->execute();
}
@ -162,10 +275,10 @@ class Feed {
* Add a feed item
*
* @param int $feedId The ID of the feed to which the item should be added
* @param array $item The fields for the item
* @param FeedItem $item The item to be added
* @param SQLite3 $db A database connection to use for the addition
*/
private static function addItem(int $feedId, array $item, SQLite3 $db): void {
private static function addItem(int $feedId, FeedItem $item, SQLite3 $db): void {
$query = $db->prepare(<<<'SQL'
INSERT INTO item (
feed_id, item_guid, item_link, title, published_on, updated_on, content
@ -174,12 +287,12 @@ class Feed {
)
SQL);
$query->bindValue(':feed', $feedId);
$query->bindValue(':guid', $item['guid']);
$query->bindValue(':link', $item['link']);
$query->bindValue(':title', $item['title']);
$query->bindValue(':published', $item['published']);
$query->bindValue(':updated', $item['updated']);
$query->bindValue(':content', $item['content']);
$query->bindValue(':guid', $item->guid);
$query->bindValue(':link', $item->link);
$query->bindValue(':title', $item->title);
$query->bindValue(':published', $item->publishedOn);
$query->bindValue(':updated', $item->updatedOn);
$query->bindValue(':content', $item->content);
$query->execute();
}
@ -187,23 +300,22 @@ class Feed {
* Update a feed's items
*
* @param int $feedId The ID of the feed to which these items belong
* @param DOMElement $channel The RSS feed items
* @param Feed $feed The extracted Atom or RSS feed items
* @return array ['ok' => true] if successful, ['error' => message] if not
*/
public static function updateItems(int $feedId, DOMElement $channel, SQLite3 $db): array {
public static function updateItems(int $feedId, Feed $feed, SQLite3 $db): array {
try {
foreach ($channel->getElementsByTagName('item') as $rawItem) {
$item = self::itemFields($rawItem);
foreach ($feed->items as $item) {
$existsQuery = $db->prepare(
'SELECT id, published_on, updated_on FROM item WHERE feed_id = :feed AND item_guid = :guid');
$existsQuery->bindValue(':feed', $feedId);
$existsQuery->bindValue(':guid', $item['guid']);
$existsQuery->bindValue(':guid', $item->guid);
$exists = $existsQuery->execute();
if ($exists) {
$existing = $exists->fetchArray(SQLITE3_ASSOC);
if ($existing) {
if ( $existing['published_on'] != $item['published']
|| $existing['updated_on'] ?? '' != $item['updated'] ?? '') {
if ( $existing['published_on'] != $item->publishedOn
|| ($existing['updated_on'] ?? '') != ($item->updatedOn ?? '')) {
self::updateItem($existing['id'], $item, $db);
}
} else {
@ -234,13 +346,14 @@ class Feed {
$feedId = $feedResult ? $feedResult->fetchArray(SQLITE3_NUM)[0] : -1;
if ($feedId < 0) return ['error' => "No feed for URL $url found"];
$feed = self::retrieveFeed($url);
if (array_key_exists('error', $feed)) return $feed;
$feedExtract = self::retrieveFeed($url);
if (array_key_exists('error', $feedExtract)) return $feedExtract;
$itemUpdate = self::updateItems($feedId, $feed['channel'], $db);
$feed = $feedExtract['ok'];
$itemUpdate = self::updateItems($feedId, $feed, $db);
if (array_key_exists('error', $itemUpdate)) return $itemUpdate;
$urlUpdate = $url == $feed['url'] ? '' : ', url = :url';
$urlUpdate = $url == $feed->url ? '' : ', url = :url';
$feedUpdate = $db->prepare(<<<SQL
UPDATE feed
SET title = :title,
@ -249,11 +362,11 @@ class Feed {
$urlUpdate
WHERE id = :id
SQL);
$feedUpdate->bindValue(':title', self::eltValue($feed['channel'], 'title'));
$feedUpdate->bindValue(':updated', $feed['updated']);
$feedUpdate->bindValue(':title', $feed->title);
$feedUpdate->bindValue(':updated', $feed->updatedOn);
$feedUpdate->bindValue(':checked', Data::formatDate('now'));
$feedUpdate->bindValue(':id', $feedId);
if ($urlUpdate != '') $feedUpdate->bindValue(':url', $feed['url']);
if ($urlUpdate != '') $feedUpdate->bindValue(':url', $feed->url);
$feedUpdate->execute();
return ['ok' => true];
@ -266,24 +379,25 @@ class Feed {
* @return array ['ok' => feedId] if successful, ['error' => message] if not
*/
public static function add(string $url, SQLite3 $db): array {
$feed = self::retrieveFeed($url);
if (array_key_exists('error', $feed)) return $feed;
$feedExtract = self::retrieveFeed($url);
if (array_key_exists('error', $feedExtract)) return $feedExtract;
$feed = $feedExtract['ok'];
$query = $db->prepare(<<<'SQL'
INSERT INTO feed (user_id, url, title, updated_on, checked_on)
VALUES (:user, :url, :title, :updated, :checked)
SQL);
$query->bindValue(':user', $_REQUEST[Key::USER_ID]);
$query->bindValue(':url', $feed['url']);
$query->bindValue(':title', self::eltValue($feed['channel'], 'title'));
$query->bindValue(':updated', $feed['updated']);
$query->bindValue(':url', $feed->url);
$query->bindValue(':title', $feed->title);
$query->bindValue(':updated', $feed->updatedOn);
$query->bindValue(':checked', Data::formatDate('now'));
$result = $query->execute();
$feedId = $result ? $db->lastInsertRowID() : -1;
if ($feedId < 0) return ['error' => $db->lastErrorMsg()];
$result = self::updateItems($feedId, $feed['channel'], $db);
$result = self::updateItems($feedId, $feed, $db);
if (array_key_exists('error', $result)) return $result;
return ['ok' => $feedId];
@ -294,6 +408,7 @@ class Feed {
*
* @param array $existing The existing RSS feed
* @param string $url The URL with which the existing feed should be modified
* @param SQLite3 $db The database connection on which to execute the update
* @return bool[]|string[] [ 'ok' => true ] if successful, [ 'error' => message ] if not
*/
public static function update(array $existing, string $url, SQLite3 $db): array {
@ -307,7 +422,9 @@ class Feed {
}
/**
* @param SQLite3 $db
* Refresh all feeds
*
* @param SQLite3 $db The database connection to use for refreshing feeds
* @return array|true[] ['ok' => true] if successful, ['error' => message] if not (may have multiple error lines)
*/
public static function refreshAll(SQLite3 $db): array {

View File

@ -36,7 +36,7 @@ page_head('Welcome'); ?>
if ($item) {
while ($item) { ?>
<p><a href=/item?id=<?=$item['id']?>><?=$item['item_title']?></a><br>
<?=$item['feed_title']?><br><small><em><?=date_time($item['as_of'])?></em></small><?php
<?=htmlentities($item['feed_title'])?><br><small><em><?=date_time($item['as_of'])?></em></small><?php
$item = $result->fetchArray(SQLITE3_ASSOC);
}
} else { ?>