Merge branch 'main' into security-models
This commit is contained in:
commit
7b21b86550
281
src/lib/Feed.php
281
src/lib/Feed.php
@ -1,15 +1,55 @@
|
||||
<?php
|
||||
|
||||
/**
 * A single entry extracted from an RSS or Atom feed
 *
 * This is a plain data holder; every field starts out empty (or null for the optional update date) and is filled in
 * by the feed extraction code.
 */
class FeedItem {

    /**
     * The title of the feed item
     *
     * @var string
     */
    public string $title = '';

    /**
     * The unique ID for the feed item
     *
     * @var string
     */
    public string $guid = '';

    /**
     * The link to the original content
     *
     * @var string
     */
    public string $link = '';

    /**
     * When this item was published
     *
     * @var string
     */
    public string $publishedOn = '';

    /**
     * When this item was last updated (null if the feed did not say)
     *
     * @var ?string
     */
    public ?string $updatedOn = null;

    /**
     * The content for the item
     *
     * @var string
     */
    public string $content = '';
}
|
||||
|
||||
/**
|
||||
* Feed retrieval, parsing, and manipulation
|
||||
*/
|
||||
class Feed {
|
||||
|
||||
/** @var string The URL for the feed */
|
||||
public string $url = '';
|
||||
|
||||
/** @var string The title of the feed */
|
||||
public string $title = '';
|
||||
|
||||
/** @var ?string When the feed was last updated */
|
||||
public ?string $updatedOn = null;
|
||||
|
||||
/** @var FeedItem[] The items contained in the feed */
|
||||
public array $items = [];
|
||||
|
||||
/** @var string The XML namespace for Atom feeds */
|
||||
public const string ATOM_NS = 'http://www.w3.org/2005/Atom';
|
||||
|
||||
/** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
|
||||
public const string CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
|
||||
|
||||
/** @var string The XML namespace for XHTML */
|
||||
public const string XHTML_NS = 'http://www.w3.org/1999/xhtml';
|
||||
|
||||
/**
|
||||
* When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
|
||||
*
|
||||
@ -45,24 +85,135 @@ class Feed {
|
||||
}
|
||||
|
||||
/**
 * Get the text of the first element with the given tag name for an RSS feed
 *
 * The lookup uses `getElementsByTagName`, so it matches the first descendant with that name, not only direct
 * children of the given element.
 *
 * @param DOMElement $element The element under which to search
 * @param string $tagName The name of the tag whose value should be obtained
 * @return string The text content of the first match (or "[tagName] not found" if no such element exists)
 */
private static function rssValue(DOMElement $element, string $tagName): string {
    $firstMatch = $element->getElementsByTagName($tagName)->item(0);
    if ($firstMatch === null) {
        // Callers compare against this sentinel text, so it must stay exactly in this form
        return "$tagName not found";
    }
    return $firstMatch->textContent;
}
|
||||
|
||||
/**
 * Extract items from an RSS feed
 *
 * Reads the first `<channel>` element in the document, takes the feed title and last-updated date from it, and turns
 * each `<item>` beneath it into a FeedItem.
 *
 * @param DOMDocument $xml The XML received from the feed
 * @param string $url The actual URL for the feed
 * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
 */
private static function fromRSS(DOMDocument $xml, string $url): array {
    $channel = $xml->getElementsByTagName('channel')->item(0);
    if (!($channel instanceof DOMElement)) {
        // item(0) is null when nothing matched, so there is no node whose type could be reported here; reading
        // a property from it would only raise a warning
        return ['error' => 'Channel element not found'];
    }

    $feed        = new Feed();
    $feed->title = self::rssValue($channel, 'title');
    $feed->url   = $url;

    // An RSS channel may provide a lastBuildDate, which contains the last time the content of the feed changed; if
    // that is not present, use the pubDate element instead
    $feedUpdated = self::rssValue($channel, 'lastBuildDate');
    if ($feedUpdated === 'lastBuildDate not found') {
        $feedUpdated = self::rssValue($channel, 'pubDate');
        if ($feedUpdated === 'pubDate not found') $feedUpdated = null;
    }
    $feed->updatedOn = Data::formatDate($feedUpdated);

    foreach ($channel->getElementsByTagName('item') as $xmlItem) {
        $itemGuid = self::rssValue($xmlItem, 'guid');
        $itemLink = self::rssValue($xmlItem, 'link');
        // Optional extensions: an Atom <updated> date and HTML content in <content:encoded>
        $updNode  = $xmlItem->getElementsByTagNameNS(self::ATOM_NS, 'updated')->item(0);
        $encNode  = $xmlItem->getElementsByTagNameNS(self::CONTENT_NS, 'encoded')->item(0);

        $item        = new FeedItem();
        // Items without a guid are identified by their link instead
        $item->guid  = $itemGuid === 'guid not found' ? $itemLink : $itemGuid;
        $item->title = self::rssValue($xmlItem, 'title');
        $item->link  = $itemLink;
        // NOTE(review): when an item has no pubDate, the "pubDate not found" text is what gets passed to
        // Data::formatDate - confirm that helper copes with it
        $item->publishedOn = Data::formatDate(self::rssValue($xmlItem, 'pubDate'));
        $item->updatedOn   = Data::formatDate($updNode?->textContent);
        // Prefer the encoded (HTML) content; fall back to the plain description
        $item->content     = $encNode?->textContent ?? self::rssValue($xmlItem, 'description');
        $feed->items[] = $item;
    }

    return ['ok' => $feed];
}
|
||||
|
||||
/**
 * Get the text of the first element with the given tag name for an Atom feed
 *
 * (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
 * regular string values; for "xhtml", though, the value is wrapped in an XHTML `<div>`, and the text of that `<div>`
 * is returned instead.)
 *
 * The lookup uses `getElementsByTagName`, so it matches the first descendant with that name, not only direct
 * children of the given element.
 *
 * @param DOMElement $element The element under which to search
 * @param string $tagName The name of the tag whose value should be obtained
 * @return string The value of the element, "[tagName] not found" if that element does not exist, or
 *                "-- invalid XHTML content --" if an "xhtml" value has no XHTML `<div>`
 */
private static function atomValue(DOMElement $element, string $tagName): string {
    $tag = $element->getElementsByTagName($tagName)->item(0);
    if ($tag === null) return "$tagName not found";
    if (!($tag instanceof DOMElement)) return $tag->textContent;

    // getAttribute() returns the attribute's string value ('' when absent). Comparing the DOMAttr node itself to a
    // string is never equal, which would make this branch unreachable.
    if ($tag->getAttribute('type') === 'xhtml') {
        $div = $tag->getElementsByTagNameNS(self::XHTML_NS, 'div')->item(0);
        if ($div === null) return "-- invalid XHTML content --";
        // NOTE(review): textContent yields the text only, without the XHTML markup inside the <div>
        return $div->textContent;
    }

    return $tag->textContent;
}
|
||||
|
||||
/**
 * Extract items from an Atom feed
 *
 * Reads the first Atom `<feed>` element in the document, takes the feed title and last-updated date from it, and
 * turns each `<entry>` beneath it into a FeedItem.
 *
 * @param DOMDocument $xml The XML received from the feed
 * @param string $url The actual URL for the feed
 * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if no Atom feed element exists
 */
private static function fromAtom(DOMDocument $xml, string $url): array {
    $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);
    if (!($root instanceof DOMElement)) {
        return ['error' => 'Atom feed element not found'];
    }

    $feed        = new Feed();
    $feed->title = self::atomValue($root, 'title');
    $feed->url   = $url;

    // atomValue reports a missing element as "[tagName] not found"; treat that as "no date"
    $feedUpdated     = self::atomValue($root, 'updated');
    $feed->updatedOn = Data::formatDate($feedUpdated === 'updated not found' ? null : $feedUpdated);

    foreach ($root->getElementsByTagName('entry') as $xmlItem) {
        $guid = self::atomValue($xmlItem, 'id');

        // Use the first "alternate" link that actually has an href
        $link = '';
        foreach ($xmlItem->getElementsByTagName('link') as $linkElt) {
            // Per RFC 4287, a link without a rel attribute is to be interpreted as rel="alternate"
            $rel = $linkElt->hasAttribute('rel') ? $linkElt->getAttribute('rel') : 'alternate';
            if ($rel === 'alternate' && $linkElt->hasAttribute('href')) {
                $link = $linkElt->getAttribute('href');
                break;
            }
        }
        // With no usable link, an ID that looks like a URL is the next best thing
        if ($link === '' && str_starts_with($guid, 'http')) $link = $guid;

        $itemUpdated = self::atomValue($xmlItem, 'updated');

        $item        = new FeedItem();
        $item->guid  = $guid;
        $item->title = self::atomValue($xmlItem, 'title');
        $item->link  = $link;
        // NOTE(review): when an entry has no <published>, the "published not found" text is what gets passed to
        // Data::formatDate - confirm that helper copes with it
        $item->publishedOn = Data::formatDate(self::atomValue($xmlItem, 'published'));
        $item->updatedOn   = Data::formatDate($itemUpdated === 'updated not found' ? null : $itemUpdated);
        $item->content     = self::atomValue($xmlItem, 'content');
        $feed->items[] = $item;
    }

    return ['ok' => $feed];
}
|
||||
|
||||
/**
|
||||
* Retrieve the feed
|
||||
*
|
||||
* @param string $url
|
||||
* @return array|DOMDocument[]|string[]|DOMElement[]
|
||||
* ['ok' => feedXml, 'url' => actualUrl, 'channel' => channel, 'updated' => updatedDate] if successful,
|
||||
* ['error' => message] if not
|
||||
* @param string $url The URL of the feed to retrieve
|
||||
* @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function retrieveFeed(string $url): array {
|
||||
$feedReq = curl_init($url);
|
||||
@ -83,26 +234,9 @@ class Feed {
|
||||
if (array_key_exists('error', $parsed)) {
|
||||
$result['error'] = $parsed['error'];
|
||||
} else {
|
||||
$result['ok'] = $parsed['ok'];
|
||||
$result['url'] = curl_getinfo($feedReq, CURLINFO_EFFECTIVE_URL);
|
||||
|
||||
$channel = $result['ok']->getElementsByTagName('channel')->item(0);
|
||||
if ($channel instanceof DOMElement) {
|
||||
$result['channel'] = $channel;
|
||||
} else {
|
||||
return ['error' => "Channel element not found ($channel->nodeType)"];
|
||||
}
|
||||
|
||||
// In Atom feeds, lastBuildDate contains the last time an item in the feed was updated; if that is not
|
||||
// present, use the pubDate element instead
|
||||
$updated = self::eltValue($channel, 'lastBuildDate');
|
||||
if ($updated == 'lastBuildDate not found') {
|
||||
$updated = self::eltValue($channel, 'pubDate');
|
||||
if ($updated == 'pubDate not found') $updated = null;
|
||||
}
|
||||
$result['updated'] = Data::formatDate($updated);
|
||||
return $result;
|
||||
|
||||
$extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
|
||||
? self::fromAtom(...) : self::fromRSS(...);
|
||||
$result = $extract($parsed['ok'], curl_getinfo($feedReq, CURLINFO_EFFECTIVE_URL));
|
||||
}
|
||||
} else {
|
||||
$result['error'] = "Prospective feed URL $url returned HTTP Code $code: $feedContent";
|
||||
@ -112,35 +246,14 @@ class Feed {
|
||||
return $result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the fields we need to keep from the feed
|
||||
*
|
||||
* @param DOMElement $item The item from the feed
|
||||
* @return array The fields for the item as an associative array
|
||||
*/
|
||||
private static function itemFields(DOMElement $item): array {
|
||||
$itemGuid = self::eltValue($item, 'guid');
|
||||
$updNodes = $item->getElementsByTagNameNS(self::ATOM_NS, 'updated');
|
||||
$encNodes = $item->getElementsByTagNameNS(self::CONTENT_NS, 'encoded');
|
||||
return [
|
||||
'guid' => $itemGuid == 'guid not found' ? self::eltValue($item, 'link') : $itemGuid,
|
||||
'title' => self::eltValue($item, 'title'),
|
||||
'link' => self::eltValue($item, 'link'),
|
||||
'published' => Data::formatDate(self::eltValue($item, 'pubDate')),
|
||||
'updated' => Data::formatDate($updNodes->length > 0 ? $updNodes->item(0)->textContent : null),
|
||||
'content' => $encNodes->length > 0 ? $encNodes->item(0)->textContent
|
||||
: self::eltValue($item, 'description')
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Update a feed item
|
||||
*
|
||||
* @param int $itemId The ID of the item to be updated
|
||||
* @param array $item The fields from the updated item
|
||||
* @param FeedItem $item The item to be updated
|
||||
* @param SQLite3 $db A database connection to use for the update
|
||||
*/
|
||||
private static function updateItem(int $itemId, array $item, SQLite3 $db): void {
|
||||
private static function updateItem(int $itemId, FeedItem $item, SQLite3 $db): void {
|
||||
$query = $db->prepare(<<<'SQL'
|
||||
UPDATE item
|
||||
SET title = :title,
|
||||
@ -150,10 +263,10 @@ class Feed {
|
||||
is_read = 0
|
||||
WHERE id = :id
|
||||
SQL);
|
||||
$query->bindValue(':title', $item['title']);
|
||||
$query->bindValue(':published', $item['published']);
|
||||
$query->bindValue(':updated', $item['updated']);
|
||||
$query->bindValue(':content', $item['content']);
|
||||
$query->bindValue(':title', $item->title);
|
||||
$query->bindValue(':published', $item->publishedOn);
|
||||
$query->bindValue(':updated', $item->updatedOn);
|
||||
$query->bindValue(':content', $item->content);
|
||||
$query->bindValue(':id', $itemId);
|
||||
$query->execute();
|
||||
}
|
||||
@ -162,10 +275,10 @@ class Feed {
|
||||
* Add a feed item
|
||||
*
|
||||
* @param int $feedId The ID of the feed to which the item should be added
|
||||
* @param array $item The fields for the item
|
||||
* @param FeedItem $item The item to be added
|
||||
* @param SQLite3 $db A database connection to use for the addition
|
||||
*/
|
||||
private static function addItem(int $feedId, array $item, SQLite3 $db): void {
|
||||
private static function addItem(int $feedId, FeedItem $item, SQLite3 $db): void {
|
||||
$query = $db->prepare(<<<'SQL'
|
||||
INSERT INTO item (
|
||||
feed_id, item_guid, item_link, title, published_on, updated_on, content
|
||||
@ -174,12 +287,12 @@ class Feed {
|
||||
)
|
||||
SQL);
|
||||
$query->bindValue(':feed', $feedId);
|
||||
$query->bindValue(':guid', $item['guid']);
|
||||
$query->bindValue(':link', $item['link']);
|
||||
$query->bindValue(':title', $item['title']);
|
||||
$query->bindValue(':published', $item['published']);
|
||||
$query->bindValue(':updated', $item['updated']);
|
||||
$query->bindValue(':content', $item['content']);
|
||||
$query->bindValue(':guid', $item->guid);
|
||||
$query->bindValue(':link', $item->link);
|
||||
$query->bindValue(':title', $item->title);
|
||||
$query->bindValue(':published', $item->publishedOn);
|
||||
$query->bindValue(':updated', $item->updatedOn);
|
||||
$query->bindValue(':content', $item->content);
|
||||
$query->execute();
|
||||
}
|
||||
|
||||
@ -187,23 +300,22 @@ class Feed {
|
||||
* Update a feed's items
|
||||
*
|
||||
* @param int $feedId The ID of the feed to which these items belong
|
||||
* @param DOMElement $channel The RSS feed items
|
||||
* @param Feed $feed The extracted Atom or RSS feed items
|
||||
* @return array ['ok' => true] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function updateItems(int $feedId, DOMElement $channel, SQLite3 $db): array {
|
||||
public static function updateItems(int $feedId, Feed $feed, SQLite3 $db): array {
|
||||
try {
|
||||
foreach ($channel->getElementsByTagName('item') as $rawItem) {
|
||||
$item = self::itemFields($rawItem);
|
||||
foreach ($feed->items as $item) {
|
||||
$existsQuery = $db->prepare(
|
||||
'SELECT id, published_on, updated_on FROM item WHERE feed_id = :feed AND item_guid = :guid');
|
||||
$existsQuery->bindValue(':feed', $feedId);
|
||||
$existsQuery->bindValue(':guid', $item['guid']);
|
||||
$existsQuery->bindValue(':guid', $item->guid);
|
||||
$exists = $existsQuery->execute();
|
||||
if ($exists) {
|
||||
$existing = $exists->fetchArray(SQLITE3_ASSOC);
|
||||
if ($existing) {
|
||||
if ( $existing['published_on'] != $item['published']
|
||||
|| $existing['updated_on'] ?? '' != $item['updated'] ?? '') {
|
||||
if ( $existing['published_on'] != $item->publishedOn
|
||||
|| ($existing['updated_on'] ?? '') != ($item->updatedOn ?? '')) {
|
||||
self::updateItem($existing['id'], $item, $db);
|
||||
}
|
||||
} else {
|
||||
@ -234,13 +346,14 @@ class Feed {
|
||||
$feedId = $feedResult ? $feedResult->fetchArray(SQLITE3_NUM)[0] : -1;
|
||||
if ($feedId < 0) return ['error' => "No feed for URL $url found"];
|
||||
|
||||
$feed = self::retrieveFeed($url);
|
||||
if (array_key_exists('error', $feed)) return $feed;
|
||||
$feedExtract = self::retrieveFeed($url);
|
||||
if (array_key_exists('error', $feedExtract)) return $feedExtract;
|
||||
|
||||
$itemUpdate = self::updateItems($feedId, $feed['channel'], $db);
|
||||
$feed = $feedExtract['ok'];
|
||||
$itemUpdate = self::updateItems($feedId, $feed, $db);
|
||||
if (array_key_exists('error', $itemUpdate)) return $itemUpdate;
|
||||
|
||||
$urlUpdate = $url == $feed['url'] ? '' : ', url = :url';
|
||||
$urlUpdate = $url == $feed->url ? '' : ', url = :url';
|
||||
$feedUpdate = $db->prepare(<<<SQL
|
||||
UPDATE feed
|
||||
SET title = :title,
|
||||
@ -249,11 +362,11 @@ class Feed {
|
||||
$urlUpdate
|
||||
WHERE id = :id
|
||||
SQL);
|
||||
$feedUpdate->bindValue(':title', self::eltValue($feed['channel'], 'title'));
|
||||
$feedUpdate->bindValue(':updated', $feed['updated']);
|
||||
$feedUpdate->bindValue(':title', $feed->title);
|
||||
$feedUpdate->bindValue(':updated', $feed->updatedOn);
|
||||
$feedUpdate->bindValue(':checked', Data::formatDate('now'));
|
||||
$feedUpdate->bindValue(':id', $feedId);
|
||||
if ($urlUpdate != '') $feedUpdate->bindValue(':url', $feed['url']);
|
||||
if ($urlUpdate != '') $feedUpdate->bindValue(':url', $feed->url);
|
||||
$feedUpdate->execute();
|
||||
|
||||
return ['ok' => true];
|
||||
@ -266,24 +379,25 @@ class Feed {
|
||||
* @return array ['ok' => feedId] if successful, ['error' => message] if not
|
||||
*/
|
||||
public static function add(string $url, SQLite3 $db): array {
|
||||
$feed = self::retrieveFeed($url);
|
||||
if (array_key_exists('error', $feed)) return $feed;
|
||||
$feedExtract = self::retrieveFeed($url);
|
||||
if (array_key_exists('error', $feedExtract)) return $feedExtract;
|
||||
|
||||
$feed = $feedExtract['ok'];
|
||||
$query = $db->prepare(<<<'SQL'
|
||||
INSERT INTO feed (user_id, url, title, updated_on, checked_on)
|
||||
VALUES (:user, :url, :title, :updated, :checked)
|
||||
SQL);
|
||||
$query->bindValue(':user', $_SESSION[Key::USER_ID]);
|
||||
$query->bindValue(':url', $feed['url']);
|
||||
$query->bindValue(':title', self::eltValue($feed['channel'], 'title'));
|
||||
$query->bindValue(':updated', $feed['updated']);
|
||||
$query->bindValue(':url', $feed->url);
|
||||
$query->bindValue(':title', $feed->title);
|
||||
$query->bindValue(':updated', $feed->updatedOn);
|
||||
$query->bindValue(':checked', Data::formatDate('now'));
|
||||
$result = $query->execute();
|
||||
|
||||
$feedId = $result ? $db->lastInsertRowID() : -1;
|
||||
if ($feedId < 0) return ['error' => $db->lastErrorMsg()];
|
||||
|
||||
$result = self::updateItems($feedId, $feed['channel'], $db);
|
||||
$result = self::updateItems($feedId, $feed, $db);
|
||||
if (array_key_exists('error', $result)) return $result;
|
||||
|
||||
return ['ok' => $feedId];
|
||||
@ -294,6 +408,7 @@ class Feed {
|
||||
*
|
||||
* @param array $existing The existing RSS feed
|
||||
* @param string $url The URL with which the existing feed should be modified
|
||||
* @param SQLite3 $db The database connection on which to execute the update
|
||||
* @return bool[]|string[] [ 'ok' => true ] if successful, [ 'error' => message ] if not
|
||||
*/
|
||||
public static function update(array $existing, string $url, SQLite3 $db): array {
|
||||
@ -307,7 +422,9 @@ class Feed {
|
||||
}
|
||||
|
||||
/**
|
||||
* @param SQLite3 $db
|
||||
* Refresh all feeds
|
||||
*
|
||||
* @param SQLite3 $db The database connection to use for refreshing feeds
|
||||
* @return array|true[] ['ok' => true] if successful, ['error' => message] if not (may have multiple error lines)
|
||||
*/
|
||||
public static function refreshAll(SQLite3 $db): array {
|
||||
|
@ -35,7 +35,7 @@ page_head('Welcome'); ?>
|
||||
if ($item) {
|
||||
while ($item) { ?>
|
||||
<p><a href=/item?id=<?=$item['id']?>><?=$item['item_title']?></a><br>
|
||||
<?=$item['feed_title']?><br><small><em><?=date_time($item['as_of'])?></em></small><?php
|
||||
<?=htmlentities($item['feed_title'])?><br><small><em><?=date_time($item['as_of'])?></em></small><?php
|
||||
$item = $result->fetchArray(SQLITE3_ASSOC);
|
||||
}
|
||||
} else { ?>
|
||||
|
@ -31,7 +31,7 @@ if ($_SERVER['REQUEST_METHOD'] == 'POST') {
|
||||
}
|
||||
|
||||
$query = $db->prepare(<<<'SQL'
|
||||
SELECT item.title AS item_title, item.item_link, item.published_on, item.updated_on, item.content, item.is_encoded,
|
||||
SELECT item.title AS item_title, item.item_link, item.published_on, item.updated_on, item.content,
|
||||
feed.title AS feed_title
|
||||
FROM item INNER JOIN feed ON feed.id = item.feed_id
|
||||
WHERE item.id = :id
|
||||
|
Loading…
Reference in New Issue
Block a user