Split out feed parsing from document

WIP - still moving things around...
This commit is contained in:
Daniel J. Summers 2024-05-31 16:00:04 -04:00
parent 67747899ac
commit f7f5dba795
12 changed files with 371 additions and 396 deletions

View File

@ -15,7 +15,8 @@
"bit-badger/documents-sqlite": "dev-conversion", "bit-badger/documents-sqlite": "dev-conversion",
"ext-sqlite3": "*", "ext-sqlite3": "*",
"ext-dom": "*", "ext-dom": "*",
"ext-curl": "*" "ext-curl": "*",
"ext-readline": "*"
}, },
"autoload": { "autoload": {
"psr-4": { "psr-4": {

5
src/composer.lock generated
View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically" "This file is @generated automatically"
], ],
"content-hash": "6919c5b5b8f417396276d24c8f8edbde", "content-hash": "029a3af4ce4e5cc5488c1ca634a8af61",
"packages": [ "packages": [
{ {
"name": "bit-badger/documents-common", "name": "bit-badger/documents-common",
@ -44,7 +44,8 @@
"platform": { "platform": {
"ext-sqlite3": "*", "ext-sqlite3": "*",
"ext-dom": "*", "ext-dom": "*",
"ext-curl": "*" "ext-curl": "*",
"ext-readline": "*"
}, },
"platform-dev": [], "platform-dev": [],
"plugin-api-version": "2.6.0" "plugin-api-version": "2.6.0"

View File

@ -1,72 +0,0 @@
<?php
namespace FeedReaderCentral\Domain;
use DateTimeImmutable;
use Exception;
use FeedReaderCentral\Data;
use FeedReaderCentral\Feed as FeedParsed;
use FeedReaderCentral\Key;
/**
 * A subscribed RSS or Atom feed, as stored in the document database
 */
class Feed
{
    /** @var int The ID of the feed */
    public int $id = 0;

    /** @var int The ID of the user to whom this subscription belongs */
    public int $user_id = 0;

    /** @var string The URL of the feed */
    public string $url = '';

    /** @var string|null The title of this feed */
    public ?string $title = null;

    /** @var string|null The date/time items in this feed were last updated */
    public ?string $updated_on = null;

    /** @var string|null The date/time this feed was last checked */
    public ?string $checked_on = null;

    /**
     * Get the stored "items last updated" value as a date/time object
     *
     * @return DateTimeImmutable|null The updated date, or null if no value is stored
     * @throws Exception If the stored value cannot be parsed as a date/time
     */
    public function updatedOn(): ?DateTimeImmutable
    {
        if ($this->updated_on === null) {
            return null;
        }

        return new DateTimeImmutable($this->updated_on);
    }

    /**
     * Get the stored "last checked" value as a date/time object
     *
     * @return DateTimeImmutable|null The last checked date, or null if no value is stored
     * @throws Exception If the stored value cannot be parsed as a date/time
     */
    public function checkedOn(): ?DateTimeImmutable
    {
        if ($this->checked_on === null) {
            return null;
        }

        return new DateTimeImmutable($this->checked_on);
    }

    /**
     * Build a feed document from a parsed feed
     *
     * The owning user is read from the current session, and the feed is marked as checked "now".
     *
     * @param FeedParsed $feed The parsed feed
     * @return static The document constructed from the parsed feed
     */
    public static function fromParsed(FeedParsed $feed): static
    {
        $document = new static();

        $document->user_id    = $_SESSION[Key::USER_ID];
        $document->url        = $feed->url;
        $document->title      = $feed->title;
        $document->updated_on = $feed->updatedOn;
        $document->checked_on = Data::formatDate('now');

        return $document;
    }
}

View File

@ -1,7 +1,7 @@
<?php <?php
namespace FeedReaderCentral\Domain; namespace FeedReaderCentral\Domain;
use FeedReaderCentral\FeedItem; use FeedReaderCentral\ParsedItem;
/** /**
* An item from a feed * An item from a feed
@ -62,10 +62,10 @@ class Item
* Create an item document from a parsed feed item * Create an item document from a parsed feed item
* *
* @param int $feedId The ID of the feed to which this item belongs * @param int $feedId The ID of the feed to which this item belongs
* @param FeedItem $item The parsed feed item * @param ParsedItem $item The parsed feed item
* @return static The item document * @return static The item document
*/ */
public static function fromFeedItem(int $feedId, FeedItem $item): static public static function fromFeedItem(int $feedId, ParsedItem $item): static
{ {
$it = new static(); $it = new static();
$it->feed_id = $feedId; $it->feed_id = $feedId;

View File

@ -12,44 +12,16 @@ use BitBadger\Documents\SQLite\Exists;
use BitBadger\Documents\SQLite\Find; use BitBadger\Documents\SQLite\Find;
use BitBadger\Documents\SQLite\Patch; use BitBadger\Documents\SQLite\Patch;
use DateTimeInterface; use DateTimeInterface;
use DOMDocument;
use DOMElement;
use DOMException;
use DOMNode;
use FeedReaderCentral\Domain\Feed as FeedDocument;
use FeedReaderCentral\Domain\Item; use FeedReaderCentral\Domain\Item;
use FeedReaderCentral\Domain\Table; use FeedReaderCentral\Domain\Table;
use SQLite3; use SQLite3;
/** /**
* Feed retrieval, parsing, and manipulation * An RSS or Atom feed
*/ */
class Feed { class Feed
{
/** @var string The URL for the feed */ // ***** CONSTANTS *****
public string $url = '';
/** @var string The title of the feed */
public string $title = '';
/** @var ?string When the feed was last updated */
public ?string $updatedOn = null;
/** @var FeedItem[] The items contained in the feed */
public array $items = [];
/** @var string The XML namespace for Atom feeds */
public const string ATOM_NS = 'http://www.w3.org/2005/Atom';
/** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
public const string CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
/** @var string The XML namespace for XHTML */
public const string XHTML_NS = 'http://www.w3.org/1999/xhtml';
/** @var string The user agent for Feed Reader Central's refresh requests */
private const string USER_AGENT =
'FeedReaderCentral/' . FRC_VERSION . ' +https://bitbadger.solutions/open-source/feed-reader-central';
/** @var int Do not purge items */ /** @var int Do not purge items */
public const int PURGE_NONE = 0; public const int PURGE_NONE = 0;
@ -63,231 +35,57 @@ class Feed {
/** @var int Purge items in number greater than the specified number of items to keep */ /** @var int Purge items in number greater than the specified number of items to keep */
public const int PURGE_BY_COUNT = 3; public const int PURGE_BY_COUNT = 3;
/** // ***** PROPERTIES *****
* When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
* /** @var int The ID of the feed */
* @param int $errno The error level encountered public int $id = 0;
* @param string $errstr The text of the error encountered
* @return bool False, to delegate to the next error handler in the chain /** @var int The ID of the user to whom this subscription belongs */
* @throws DOMException If the error is a warning public int $user_id = 0;
*/
private static function xmlParseError(int $errno, string $errstr): bool { /** @var string The URL of the feed */
if ($errno == E_WARNING && substr_count($errstr, 'DOMDocument::loadXML()') > 0) { public string $url = '';
throw new DOMException($errstr, $errno);
} /** @var string|null The title of this feed */
return false; public ?string $title = null;
}
/** @var string|null The date/time items in this feed were last updated */
public ?string $updated_on = null;
/** @var string|null The date/time this feed was last checked */
public ?string $checked_on = null;
// ***** STATIC FUNCTIONS *****
/** /**
* Parse a feed into an XML tree * Create a document from the parsed feed
* *
* @param string $content The feed's RSS content * @param ParsedFeed $parsed The parsed feed
* @return array|DOMDocument[]|string[] ['ok' => feed] if successful, ['error' => message] if not * @return static The document constructed from the parsed feed
*/ */
public static function parseFeed(string $content): array { public static function fromParsed(ParsedFeed $parsed): static
set_error_handler(self::xmlParseError(...)); {
try { $it = new static();
$feed = new DOMDocument(); $it->user_id = $_SESSION[Key::USER_ID];
$feed->loadXML($content); $it->url = $parsed->url;
return ['ok' => $feed]; $it->title = $parsed->title;
} catch (DOMException $ex) { $it->updated_on = $parsed->updatedOn;
return ['error' => $ex->getMessage()]; $it->checked_on = Data::formatDate('now');
} finally {
restore_error_handler();
}
}
/** return $it;
* Get the value of a child element by its tag name for an RSS feed
*
* @param DOMNode $element The parent element
* @param string $tagName The name of the tag whose value should be obtained
* @return string The value of the element (or "[element] not found" if that element does not exist)
*/
public static function rssValue(DOMNode $element, string $tagName): string {
$tags = $element->getElementsByTagName($tagName);
return $tags->length == 0 ? "$tagName not found" : $tags->item(0)->textContent;
}
/**
* Extract items from an RSS feed
*
* @param DOMDocument $xml The XML received from the feed
* @param string $url The actual URL for the feed
* @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
*/
private static function fromRSS(DOMDocument $xml, string $url): array {
$channel = $xml->getElementsByTagName('channel')->item(0);
if (!($channel instanceof DOMElement)) {
$type = $channel?->nodeType ?? -1;
return ['error' => "Channel element not found ($type)"];
}
// The Atom namespace provides a lastBuildDate, which contains the last time an item in the feed was updated; if
// that is not present, use the pubDate element instead
if (($updatedOn = self::rssValue($channel, 'lastBuildDate')) == 'lastBuildDate not found') {
if (($updatedOn = self::rssValue($channel, 'pubDate')) == 'pubDate not found') {
$updatedOn = null;
}
}
$feed = new static();
$feed->title = self::rssValue($channel, 'title');
$feed->url = $url;
$feed->updatedOn = Data::formatDate($updatedOn);
foreach ($channel->getElementsByTagName('item') as $item) $feed->items[] = FeedItem::fromRSS($item);
return ['ok' => $feed];
}
/**
* Get an attribute value from a DOM node
*
* @param DOMNode $node The node with an attribute value to obtain
* @param string $name The name of the attribute whose value should be obtained
* @return string The attribute value if it exists, an empty string if not
*/
private static function attrValue(DOMNode $node, string $name): string {
return ($node->hasAttributes() ? $node->attributes->getNamedItem($name)?->value : null) ?? '';
}
/**
* Get the value of a child element by its tag name for an Atom feed
*
* (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
* regular string values; for "xhtml", though, we will need to get the `<div>` and extract its contents instead.)
*
* @param DOMNode $element The parent element
* @param string $tagName The name of the tag whose value should be obtained
* @return string The value of the element (or "[element] not found" if that element does not exist)
*/
public static function atomValue(DOMNode $element, string $tagName): string {
$tags = $element->getElementsByTagName($tagName);
if ($tags->length == 0) return "$tagName not found";
$tag = $tags->item(0);
if (!($tag instanceof DOMElement)) return $tag->textContent;
if (self::attrValue($tag, 'type') == 'xhtml') {
$div = $tag->getElementsByTagNameNS(Feed::XHTML_NS, 'div');
if ($div->length == 0) return "-- invalid XHTML content --";
return $div->item(0)->textContent;
}
return $tag->textContent;
}
/**
* Extract items from an Atom feed
*
* @param DOMDocument $xml The XML received from the feed
* @param string $url The actual URL for the feed
* @return array|Feed[] ['ok' => feed]
*/
private static function fromAtom(DOMDocument $xml, string $url): array {
$root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);
if (($updatedOn = self::atomValue($root, 'updated')) == 'pubDate not found') $updatedOn = null;
$feed = new Feed();
$feed->title = self::atomValue($root, 'title');
$feed->url = $url;
$feed->updatedOn = Data::formatDate($updatedOn);
foreach ($root->getElementsByTagName('entry') as $entry) $feed->items[] = FeedItem::fromAtom($entry);
return ['ok' => $feed];
}
/**
* Retrieve a document (http/https)
*
* @param string $url The URL of the document to retrieve
* @return array ['content' => document content, 'error' => error message, 'code' => HTTP response code,
* 'url' => effective URL]
*/
private static function retrieveDocument(string $url): array {
$docReq = curl_init($url);
curl_setopt($docReq, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($docReq, CURLOPT_RETURNTRANSFER, true);
curl_setopt($docReq, CURLOPT_CONNECTTIMEOUT, 5);
curl_setopt($docReq, CURLOPT_TIMEOUT, 15);
curl_setopt($docReq, CURLOPT_USERAGENT, self::USER_AGENT);
$result = [
'content' => curl_exec($docReq),
'error' => curl_error($docReq),
'code' => curl_getinfo($docReq, CURLINFO_RESPONSE_CODE),
'url' => curl_getinfo($docReq, CURLINFO_EFFECTIVE_URL)
];
curl_close($docReq);
return $result;
}
/**
* Derive a feed URL from an HTML document
*
* @param string $content The HTML document content from which to derive a feed URL
* @return array|string[] ['ok' => feed URL] if successful, ['error' => message] if not
*/
private static function deriveFeedFromHTML(string $content): array {
$html = new DOMDocument();
$html->loadHTML(substr($content, 0, strpos($content, '</head>') + 7));
$headTags = $html->getElementsByTagName('head');
if ($headTags->length < 1) return ['error' => 'Cannot find feed at this URL'];
$head = $headTags->item(0);
foreach ($head->getElementsByTagName('link') as $link) {
if (self::attrValue($link, 'rel') == 'alternate') {
$type = self::attrValue($link, 'type');
if ($type == 'application/rss+xml' || $type == 'application/atom+xml') {
return ['ok' => self::attrValue($link, 'href')];
}
}
}
return ['error' => 'Cannot find feed at this URL'];
}
/**
* Retrieve the feed
*
* @param string $url The URL of the feed to retrieve
* @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
*/
public static function retrieveFeed(string $url): array {
$doc = self::retrieveDocument($url);
if ($doc['error'] != '') return ['error' => $doc['error']];
if ($doc['code'] != 200) {
return ['error' => "Prospective feed URL $url returned HTTP Code {$doc['code']}: {$doc['content']}"];
}
$start = strtolower(strlen($doc['content']) >= 9 ? substr($doc['content'], 0, 9) : $doc['content']);
if ($start == '<!doctype' || str_starts_with($start, '<html')) {
$derivedURL = self::deriveFeedFromHTML($doc['content']);
if (key_exists('error', $derivedURL)) return ['error' => $derivedURL['error']];
$feedURL = $derivedURL['ok'];
if (!str_starts_with($feedURL, 'http')) {
// Relative URL; feed should be retrieved in the context of the original URL
$original = parse_url($url);
$port = key_exists('port', $original) ? ":{$original['port']}" : '';
$feedURL = $original['scheme'] . '://' . $original['host'] . $port . $feedURL;
}
$doc = self::retrieveDocument($feedURL);
}
$parsed = self::parseFeed($doc['content']);
if (key_exists('error', $parsed)) return ['error' => $parsed['error']];
$extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
? self::fromAtom(...) : self::fromRSS(...);
return $extract($parsed['ok'], $doc['url']);
} }
/** /**
* Update a feed's items * Update a feed's items
* *
* @param int $feedId The ID of the feed to which these items belong * @param int $feedId The ID of the feed to which these items belong
* @param Feed $feed The extracted Atom or RSS feed items * @param ParsedFeed $parsed The extracted Atom or RSS feed items
* @param DateTimeInterface $lastChecked When this feed was last checked (only new items will be added) * @param DateTimeInterface $lastChecked When this feed was last checked (only new items will be added)
* @return array ['ok' => true] if successful, ['error' => message] if not * @return array ['ok' => true] if successful, ['error' => message] if not
*/ */
public static function updateItems(int $feedId, Feed $feed, DateTimeInterface $lastChecked, SQLite3 $db): array { public static function updateItems(int $feedId, ParsedFeed $parsed, DateTimeInterface $lastChecked,
SQLite3 $db): array
{
$results = $results =
array_map(function ($item) use ($db, $feedId) { array_map(function ($item) use ($db, $feedId) {
try { try {
@ -305,7 +103,7 @@ class Feed {
} catch (DocumentException $ex) { } catch (DocumentException $ex) {
return ['error' => "$ex"]; return ['error' => "$ex"];
} }
}, array_filter($feed->items, }, array_filter($parsed->items,
fn($it) => date_create_immutable($it->updatedOn ?? $it->publishedOn) >= $lastChecked)); fn($it) => date_create_immutable($it->updatedOn ?? $it->publishedOn) >= $lastChecked));
$errors = array_map(fn($it) => $it['error'], array_filter($results, fn($it) => array_key_exists('error', $it))); $errors = array_map(fn($it) => $it['error'], array_filter($results, fn($it) => array_key_exists('error', $it)));
return sizeof($errors) > 0 ? ['error' => implode("\n", $errors)] : ['ok' => true]; return sizeof($errors) > 0 ? ['error' => implode("\n", $errors)] : ['ok' => true];
@ -318,7 +116,8 @@ class Feed {
* @param SQLite3 $db The database connection on which items should be purged * @param SQLite3 $db The database connection on which items should be purged
* @return array|string[]|true[] ['ok' => true] if purging was successful, ['error' => message] if not * @return array|string[]|true[] ['ok' => true] if purging was successful, ['error' => message] if not
*/ */
private static function purgeItems(int $feedId, SQLite3 $db): array { private static function purgeItems(int $feedId, SQLite3 $db): array
{
if (!array_search(PURGE_TYPE, [self::PURGE_READ, self::PURGE_BY_DAYS, self::PURGE_BY_COUNT])) { if (!array_search(PURGE_TYPE, [self::PURGE_READ, self::PURGE_BY_DAYS, self::PURGE_BY_COUNT])) {
return ['error' => 'Unrecognized purge type ' . PURGE_TYPE]; return ['error' => 'Unrecognized purge type ' . PURGE_TYPE];
} }
@ -362,12 +161,12 @@ class Feed {
* @return array|string[]|true[] ['ok' => true] if successful, ['error' => message] if not * @return array|string[]|true[] ['ok' => true] if successful, ['error' => message] if not
*/ */
public static function refreshFeed(int $feedId, string $url, SQLite3 $db): array { public static function refreshFeed(int $feedId, string $url, SQLite3 $db): array {
$feedRetrieval = self::retrieveFeed($url); $feedRetrieval = ParsedFeed::retrieve($url);
if (key_exists('error', $feedRetrieval)) return $feedRetrieval; if (key_exists('error', $feedRetrieval)) return $feedRetrieval;
$feed = $feedRetrieval['ok']; $feed = $feedRetrieval['ok'];
try { try {
$feedDoc = Find::byId(Table::FEED, $feedId, FeedDocument::class); $feedDoc = Find::byId(Table::FEED, $feedId, self::class);
if (!$feedDoc) return ['error' => 'Could not derive date last checked for feed']; if (!$feedDoc) return ['error' => 'Could not derive date last checked for feed'];
$lastChecked = date_create_immutable($feedDoc->checked_on ?? WWW_EPOCH); $lastChecked = date_create_immutable($feedDoc->checked_on ?? WWW_EPOCH);
@ -395,7 +194,7 @@ class Feed {
* @return array ['ok' => feedId] if successful, ['error' => message] if not * @return array ['ok' => feedId] if successful, ['error' => message] if not
*/ */
public static function add(string $url, SQLite3 $db): array { public static function add(string $url, SQLite3 $db): array {
$feedExtract = self::retrieveFeed($url); $feedExtract = ParsedFeed::retrieve($url);
if (key_exists('error', $feedExtract)) return $feedExtract; if (key_exists('error', $feedExtract)) return $feedExtract;
$feed = $feedExtract['ok']; $feed = $feedExtract['ok'];
@ -406,9 +205,9 @@ class Feed {
return ['error' => "Already subscribed to feed $feed->url"]; return ['error' => "Already subscribed to feed $feed->url"];
} }
Document::insert(Table::FEED, FeedDocument::fromParsed($feed), $db); Document::insert(Table::FEED, self::fromParsed($feed), $db);
$doc = Find::firstByFields(Table::FEED, $fields, FeedDocument::class); $doc = Find::firstByFields(Table::FEED, $fields, self::class);
if (!$doc) return ['error' => 'Could not retrieve inserted feed']; if (!$doc) return ['error' => 'Could not retrieve inserted feed'];
} catch (DocumentException $ex) { } catch (DocumentException $ex) {
return ['error' => "$ex"]; return ['error' => "$ex"];
@ -423,12 +222,12 @@ class Feed {
/** /**
* Update an RSS feed * Update an RSS feed
* *
* @param FeedDocument $existing The existing RSS feed * @param Feed $existing The existing feed
* @param string $url The URL with which the existing feed should be modified * @param string $url The URL with which the existing feed should be modified
* @param SQLite3 $db The database connection on which to execute the update * @param SQLite3 $db The database connection on which to execute the update
* @return bool[]|string[] [ 'ok' => true ] if successful, [ 'error' => message ] if not * @return bool[]|string[] [ 'ok' => true ] if successful, [ 'error' => message ] if not
*/ */
public static function update(FeedDocument $existing, string $url, SQLite3 $db): array { public static function update(Feed $existing, string $url, SQLite3 $db): array {
try { try {
Patch::byFields(Table::FEED, Patch::byFields(Table::FEED,
[Field::EQ(Configuration::idField(), $existing->id), Field::EQ('user_id', $_SESSION[Key::USER_ID])], [Field::EQ(Configuration::idField(), $existing->id), Field::EQ('user_id', $_SESSION[Key::USER_ID])],
@ -444,14 +243,14 @@ class Feed {
* Retrieve all feeds, optionally for a specific user * Retrieve all feeds, optionally for a specific user
* *
* @param int $user The ID of the user whose feeds should be retrieved (optional, defaults to all feeds) * @param int $user The ID of the user whose feeds should be retrieved (optional, defaults to all feeds)
* @return DocumentList<FeedDocument> A list of feeds * @return DocumentList<Feed> A list of feeds
* @throws DocumentException If any is encountered * @throws DocumentException If any is encountered
*/ */
public static function retrieveAll(int $user = 0): DocumentList public static function retrieveAll(int $user = 0): DocumentList
{ {
return $user == 0 return $user == 0
? Find::all(Table::FEED, FeedDocument::class) ? Find::all(Table::FEED, self::class)
: Find::byFields(Table::FEED, [Field::EQ('user_id', $user)], FeedDocument::class); : Find::byFields(Table::FEED, [Field::EQ('user_id', $user)], self::class);
} }
/** /**
@ -482,11 +281,11 @@ class Feed {
* Retrieve a feed by its ID for the current user * Retrieve a feed by its ID for the current user
* *
* @param int $feedId The ID of the feed to retrieve * @param int $feedId The ID of the feed to retrieve
* @return FeedDocument|false The data for the feed if found, false if not found * @return static|false The data for the feed if found, false if not found
* @throws DocumentException If any is encountered * @throws DocumentException If any is encountered
*/ */
public static function retrieveById(int $feedId): FeedDocument|false { public static function retrieveById(int $feedId): static|false {
$doc = Find::byId(Table::FEED, $feedId, FeedDocument::class); $doc = Find::byId(Table::FEED, $feedId, self::class);
return $doc && $doc->user_id == $_SESSION[Key::USER_ID] ? $doc : false; return $doc && $doc->user_id == $_SESSION[Key::USER_ID] ? $doc : false;
} }
} }

252
src/lib/ParsedFeed.php Normal file
View File

@ -0,0 +1,252 @@
<?php
namespace FeedReaderCentral;
use DOMDocument;
use DOMElement;
use DOMException;
use DOMNode;
/**
 * An RSS or Atom feed as retrieved over HTTP and parsed from XML
 *
 * Static helpers in this class return "result arrays": `['ok' => value]` on success or `['error' => message]` on
 * failure.
 */
class ParsedFeed
{
    /** @var string The URL for the feed */
    public string $url = '';

    /** @var string The title of the feed */
    public string $title = '';

    /** @var ?string When the feed was last updated */
    public ?string $updatedOn = null;

    /** @var ParsedItem[] The items contained in the feed */
    public array $items = [];

    /** @var string The XML namespace for Atom feeds */
    public const string ATOM_NS = 'http://www.w3.org/2005/Atom';

    /** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
    public const string CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';

    /** @var string The XML namespace for XHTML */
    public const string XHTML_NS = 'http://www.w3.org/1999/xhtml';

    /** @var string The user agent for Feed Reader Central's refresh requests */
    private const string USER_AGENT =
        'FeedReaderCentral/' . FRC_VERSION . ' +https://bitbadger.solutions/open-source/feed-reader-central';

    /**
     * When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
     *
     * @param int $errno The error level encountered
     * @param string $errstr The text of the error encountered
     * @return bool False, to delegate to the next error handler in the chain
     * @throws DOMException If the error is a warning raised by `DOMDocument::loadXML()`
     */
    private static function xmlParseError(int $errno, string $errstr): bool
    {
        if ($errno == E_WARNING && str_contains($errstr, 'DOMDocument::loadXML()')) {
            throw new DOMException($errstr, $errno);
        }
        return false;
    }

    /**
     * Parse a feed into an XML tree
     *
     * @param string $content The feed's XML content
     * @return array|DOMDocument[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    public static function parseFeed(string $content): array
    {
        // loadXML() throws a ValueError (not a warning) for an empty source, which would bypass the DOMException
        // handling below; report it through the normal error result instead
        if ($content === '') return ['error' => 'Feed content is empty'];

        set_error_handler(self::xmlParseError(...));
        try {
            $feed = new DOMDocument();
            $feed->loadXML($content);
            return ['ok' => $feed];
        } catch (DOMException $ex) {
            return ['error' => $ex->getMessage()];
        } finally {
            // Always put the previous handler back, whether parsing succeeded or not
            restore_error_handler();
        }
    }

    /**
     * Get the value of a child element by its tag name for an RSS feed
     *
     * @param DOMNode $element The parent element
     * @param string $tagName The name of the tag whose value should be obtained
     * @return string The value of the element (or "[element] not found" if that element does not exist)
     */
    public static function rssValue(DOMNode $element, string $tagName): string
    {
        $tags = $element->getElementsByTagName($tagName);
        return $tags->length == 0 ? "$tagName not found" : $tags->item(0)->textContent;
    }

    /**
     * Extract items from an RSS feed
     *
     * @param DOMDocument $xml The XML received from the feed
     * @param string $url The actual URL for the feed
     * @return array|ParsedFeed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    private static function fromRSS(DOMDocument $xml, string $url): array
    {
        $channel = $xml->getElementsByTagName('channel')->item(0);
        if (!($channel instanceof DOMElement)) {
            $type = $channel?->nodeType ?? -1;
            return ['error' => "Channel element not found ($type)"];
        }

        // Prefer the channel's lastBuildDate as the "last updated" value; if that is not present, use the pubDate
        // element instead
        if (($updatedOn = self::rssValue($channel, 'lastBuildDate')) == 'lastBuildDate not found') {
            if (($updatedOn = self::rssValue($channel, 'pubDate')) == 'pubDate not found') {
                $updatedOn = null;
            }
        }

        $feed = new static();
        $feed->title     = self::rssValue($channel, 'title');
        $feed->url       = $url;
        $feed->updatedOn = Data::formatDate($updatedOn);
        foreach ($channel->getElementsByTagName('item') as $item) $feed->items[] = ParsedItem::fromRSS($item);

        return ['ok' => $feed];
    }

    /**
     * Get an attribute value from a DOM node
     *
     * @param DOMNode $node The node with an attribute value to obtain
     * @param string $name The name of the attribute whose value should be obtained
     * @return string The attribute value if it exists, an empty string if not
     */
    private static function attrValue(DOMNode $node, string $name): string
    {
        return ($node->hasAttributes() ? $node->attributes->getNamedItem($name)?->value : null) ?? '';
    }

    /**
     * Get the value of a child element by its tag name for an Atom feed
     *
     * (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
     * regular string values; for "xhtml", though, we will need to get the `<div>` and extract its contents instead.)
     *
     * @param DOMNode $element The parent element
     * @param string $tagName The name of the tag whose value should be obtained
     * @return string The value of the element (or "[element] not found" if that element does not exist)
     */
    public static function atomValue(DOMNode $element, string $tagName): string
    {
        $tags = $element->getElementsByTagName($tagName);
        if ($tags->length == 0) return "$tagName not found";
        $tag = $tags->item(0);
        if (!($tag instanceof DOMElement)) return $tag->textContent;
        if (self::attrValue($tag, 'type') == 'xhtml') {
            $div = $tag->getElementsByTagNameNS(self::XHTML_NS, 'div');
            if ($div->length == 0) return "-- invalid XHTML content --";
            return $div->item(0)->textContent;
        }
        return $tag->textContent;
    }

    /**
     * Extract items from an Atom feed
     *
     * @param DOMDocument $xml The XML received from the feed
     * @param string $url The actual URL for the feed
     * @return array|ParsedFeed[] ['ok' => feed]
     */
    private static function fromAtom(DOMDocument $xml, string $url): array
    {
        $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);

        // atomValue() reports a missing tag as "[tag] not found", so the sentinel must name the tag requested
        if (($updatedOn = self::atomValue($root, 'updated')) == 'updated not found') $updatedOn = null;

        $feed = new static();
        $feed->title     = self::atomValue($root, 'title');
        $feed->url       = $url;
        $feed->updatedOn = Data::formatDate($updatedOn);
        foreach ($root->getElementsByTagName('entry') as $entry) $feed->items[] = ParsedItem::fromAtom($entry);

        return ['ok' => $feed];
    }

    /**
     * Retrieve a document (http/https)
     *
     * @param string $url The URL of the document to retrieve
     * @return array ['content' => document content, 'error' => error message, 'code' => HTTP response code,
     *               'url' => effective URL]
     */
    private static function retrieveDocument(string $url): array
    {
        // NOTE(review): no CURLOPT_PROTOCOLS restriction is applied here, so whatever schemes the local cURL build
        // allows will be attempted - confirm whether user-entered URLs should be limited to http/https
        $docReq = curl_init($url);
        curl_setopt($docReq, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($docReq, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($docReq, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($docReq, CURLOPT_TIMEOUT, 15);
        curl_setopt($docReq, CURLOPT_USERAGENT, self::USER_AGENT);

        $result = [
            'content' => curl_exec($docReq),
            'error'   => curl_error($docReq),
            'code'    => curl_getinfo($docReq, CURLINFO_RESPONSE_CODE),
            'url'     => curl_getinfo($docReq, CURLINFO_EFFECTIVE_URL)
        ];

        curl_close($docReq);
        return $result;
    }

    /**
     * Determine whether a retrieved document represents a failed request
     *
     * @param array $doc The result from `retrieveDocument()`
     * @param string $url The URL that was requested (used in the error message)
     * @return string|null The error message if the request failed, null if it succeeded with an HTTP 200
     */
    private static function documentError(array $doc, string $url): ?string
    {
        if ($doc['error'] != '') return $doc['error'];
        if ($doc['code'] != 200) {
            return "Prospective feed URL $url returned HTTP Code {$doc['code']}: {$doc['content']}";
        }
        return null;
    }

    /**
     * Derive a feed URL from an HTML document
     *
     * @param string $content The HTML document content from which to derive a feed URL
     * @return array|string[] ['ok' => feed URL] if successful, ['error' => message] if not
     */
    private static function deriveFeedFromHTML(string $content): array
    {
        // Only the document head is needed; without a closing head tag there is nothing reliable to inspect (this
        // also covers empty content, which loadHTML() rejects)
        $headEnd = stripos($content, '</head>');
        if ($headEnd === false) return ['error' => 'Cannot find feed at this URL'];

        $html = new DOMDocument();
        $html->loadHTML(substr($content, 0, $headEnd + 7));
        $headTags = $html->getElementsByTagName('head');
        if ($headTags->length < 1) return ['error' => 'Cannot find feed at this URL'];
        $head = $headTags->item(0);
        foreach ($head->getElementsByTagName('link') as $link) {
            if (self::attrValue($link, 'rel') == 'alternate') {
                $type = self::attrValue($link, 'type');
                if ($type == 'application/rss+xml' || $type == 'application/atom+xml') {
                    return ['ok' => self::attrValue($link, 'href')];
                }
            }
        }
        return ['error' => 'Cannot find feed at this URL'];
    }

    /**
     * Resolve a feed link found in an HTML page against the URL of that page
     *
     * @param string $feedURL The feed link as it appeared in the page (absolute, protocol-relative, or relative)
     * @param string $baseURL The URL of the page in which the link was found
     * @return string The feed URL to request (returned unchanged if it is absolute or the base cannot be parsed)
     */
    private static function resolveFeedURL(string $feedURL, string $baseURL): string
    {
        if (preg_match('#^https?://#i', $feedURL) === 1) return $feedURL;

        $base = parse_url($baseURL);
        if (!is_array($base) || !isset($base['scheme'], $base['host'])) return $feedURL;

        // Protocol-relative link; it carries its own host, so only the scheme is borrowed
        if (str_starts_with($feedURL, '//')) return "{$base['scheme']}:$feedURL";

        $port   = isset($base['port']) ? ":{$base['port']}" : '';
        $origin = "{$base['scheme']}://{$base['host']}$port";

        // Host-relative link
        if (str_starts_with($feedURL, '/')) return $origin . $feedURL;

        // Path-relative link; resolve against the directory portion of the page's path
        $path      = $base['path'] ?? '/';
        $lastSlash = strrpos($path, '/');
        $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);
        return $origin . $directory . $feedURL;
    }

    /**
     * Retrieve the feed
     *
     * If the URL returns an HTML page rather than a feed, the page's head is searched for an RSS or Atom
     * `<link rel="alternate">`, and that feed is retrieved instead.
     *
     * @param string $url The URL of the feed to retrieve
     * @return array|ParsedFeed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    public static function retrieve(string $url): array
    {
        $doc = self::retrieveDocument($url);
        if (!is_null($error = self::documentError($doc, $url))) return ['error' => $error];

        // Leading whitespace is ignored when deciding whether this is an HTML page
        $start = strtolower(substr(ltrim($doc['content']), 0, 9));
        if ($start == '<!doctype' || str_starts_with($start, '<html')) {
            $derivedURL = self::deriveFeedFromHTML($doc['content']);
            if (key_exists('error', $derivedURL)) return ['error' => $derivedURL['error']];

            // Relative URLs should be retrieved in the context of the original URL
            $feedURL = self::resolveFeedURL($derivedURL['ok'], $url);
            $doc     = self::retrieveDocument($feedURL);
            if (!is_null($error = self::documentError($doc, $feedURL))) return ['error' => $error];
        }

        $parsed = self::parseFeed($doc['content']);
        if (key_exists('error', $parsed)) return ['error' => $parsed['error']];

        $extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
            ? self::fromAtom(...) : self::fromRSS(...);
        return $extract($parsed['ok'], $doc['url']);
    }
}

View File

@ -6,8 +6,8 @@ use DOMNode;
/** /**
* Information for a feed item * Information for a feed item
*/ */
class FeedItem { class ParsedItem
{
/** @var string The title of the feed item */ /** @var string The title of the feed item */
public string $title = ''; public string $title = '';
@ -50,7 +50,7 @@ class FeedItem {
*/ */
public static function fromAtom(DOMNode $node): static public static function fromAtom(DOMNode $node): static
{ {
$guid = Feed::atomValue($node, 'id'); $guid = ParsedFeed::atomValue($node, 'id');
$link = ''; $link = '';
foreach ($node->getElementsByTagName('link') as $linkElt) { foreach ($node->getElementsByTagName('link') as $linkElt) {
if ($linkElt->hasAttributes()) { if ($linkElt->hasAttributes()) {
@ -65,11 +65,11 @@ class FeedItem {
$item = new static(); $item = new static();
$item->guid = $guid; $item->guid = $guid;
$item->title = Feed::atomValue($node, 'title'); $item->title = ParsedFeed::atomValue($node, 'title');
$item->link = $link; $item->link = $link;
$item->publishedOn = Data::formatDate(Feed::atomValue($node, 'published')); $item->publishedOn = Data::formatDate(ParsedFeed::atomValue($node, 'published'));
$item->updatedOn = Data::formatDate(Feed::atomValue($node, 'updated')); $item->updatedOn = Data::formatDate(ParsedFeed::atomValue($node, 'updated'));
$item->content = Feed::atomValue($node, 'content'); $item->content = ParsedFeed::atomValue($node, 'content');
return $item; return $item;
} }
@ -82,19 +82,19 @@ class FeedItem {
*/ */
public static function fromRSS(DOMNode $node): static public static function fromRSS(DOMNode $node): static
{ {
$itemGuid = Feed::rssValue($node, 'guid'); $itemGuid = ParsedFeed::rssValue($node, 'guid');
$updNodes = $node->getElementsByTagNameNS(Feed::ATOM_NS, 'updated'); $updNodes = $node->getElementsByTagNameNS(ParsedFeed::ATOM_NS, 'updated');
$encNodes = $node->getElementsByTagNameNS(Feed::CONTENT_NS, 'encoded'); $encNodes = $node->getElementsByTagNameNS(ParsedFeed::CONTENT_NS, 'encoded');
$item = new static(); $item = new static();
$item->guid = $itemGuid == 'guid not found' ? Feed::rssValue($node, 'link') : $itemGuid; $item->guid = $itemGuid == 'guid not found' ? ParsedFeed::rssValue($node, 'link') : $itemGuid;
$item->title = Feed::rssValue($node, 'title'); $item->title = ParsedFeed::rssValue($node, 'title');
$item->link = Feed::rssValue($node, 'link'); $item->link = ParsedFeed::rssValue($node, 'link');
$item->publishedOn = Data::formatDate(Feed::rssValue($node, 'pubDate')); $item->publishedOn = Data::formatDate(ParsedFeed::rssValue($node, 'pubDate'));
$item->updatedOn = Data::formatDate($updNodes->length > 0 ? $updNodes->item(0)->textContent : null); $item->updatedOn = Data::formatDate($updNodes->length > 0 ? $updNodes->item(0)->textContent : null);
$item->content = $encNodes->length > 0 $item->content = $encNodes->length > 0
? $encNodes->item(0)->textContent ? $encNodes->item(0)->textContent
: Feed::rssValue($node, 'description'); : ParsedFeed::rssValue($node, 'description');
return $item; return $item;
} }

View File

@ -6,8 +6,11 @@
* This will display a button which will either add or remove a bookmark for a given item. * This will display a button which will either add or remove a bookmark for a given item.
*/ */
use BitBadger\Documents\DocumentException;
use BitBadger\Documents\SQLite\Find;
use BitBadger\Documents\SQLite\Patch; use BitBadger\Documents\SQLite\Patch;
use FeedReaderCentral\Data; use FeedReaderCentral\Data;
use FeedReaderCentral\Domain\Item;
use FeedReaderCentral\Domain\Table; use FeedReaderCentral\Domain\Table;
use FeedReaderCentral\Key; use FeedReaderCentral\Key;
use FeedReaderCentral\Security; use FeedReaderCentral\Security;
@ -36,21 +39,18 @@ if (key_exists('action', $_GET)) {
$flag = 0; $flag = 0;
} }
if (isset($flag)) { if (isset($flag)) {
Patch::byId(Table::ITEM, $id, ['is_bookmarked' => $flag], $db); try {
// $update = $db->prepare('UPDATE item SET is_bookmarked = :flag WHERE id = :id'); Patch::byId(Table::ITEM, $id, ['is_bookmarked' => $flag], $db);
// $update->bindValue(':id', $id); } catch (DocumentException $ex) {
// $update->bindValue(':flag', $flag); add_error("$ex");
// if (!$update->execute()) die(Data::error($db)['error']); }
} }
} }
$bookQuery = $db->prepare('SELECT id, is_bookmarked FROM item WHERE id = :id'); if (!$item = Find::byId(Table::ITEM, $id, Item::class)) not_found();
$bookQuery->bindValue(':id', $id);
$bookResult = $bookQuery->execute();
$bookmark = $bookResult ? $bookResult->fetchArray(SQLITE3_ASSOC) : ['id' => $id, 'is_bookmarked' => 0];
$action = $bookmark['is_bookmarked'] ? 'remove' : 'add'; $action = $item->isBookmarked() ? 'remove' : 'add';
$icon = $bookmark['is_bookmarked'] ? 'added' : 'add'; ?> $icon = $item->isBookmarked() ? 'added' : 'add'; ?>
<button class="bookmark <?=$action?>" type=button role=button hx-patch="/bookmark?id=<?=$id?>&action=<?=$action?>" <button class="bookmark <?=$action?>" type=button role=button hx-patch="/bookmark?id=<?=$id?>&action=<?=$action?>"
hx-target=this hx-swap=outerHTML hx-push-url=false title="<?=init_cap($action)?> Bookmark"> hx-target=this hx-swap=outerHTML hx-push-url=false title="<?=init_cap($action)?> Bookmark">
<img src=/assets/bookmark-<?=$icon?>.png alt="<?=$action?> bookmark"> <img src=/assets/bookmark-<?=$icon?>.png alt="<?=$action?> bookmark">

View File

@ -15,7 +15,7 @@ include '../../start.php';
$db = Data::getConnection(); $db = Data::getConnection();
Security::verifyUser($db); Security::verifyUser($db);
if (!($feed = Feed::retrieveById($_GET['id'], $db))) not_found(); if (!($feed = Feed::retrieveById($_GET['id']))) not_found();
$list = match (true) { $list = match (true) {
key_exists('unread', $_GET) => ItemList::unreadForFeed($feed->id, $db), key_exists('unread', $_GET) => ItemList::unreadForFeed($feed->id, $db),

View File

@ -1,6 +1,7 @@
<?php <?php
include '../../start.php'; include '../../start.php';
use FeedReaderCentral\Data;
use FeedReaderCentral\Key; use FeedReaderCentral\Key;
use FeedReaderCentral\Security; use FeedReaderCentral\Security;

View File

@ -3,6 +3,7 @@
use BitBadger\Documents\DocumentException; use BitBadger\Documents\DocumentException;
use BitBadger\Documents\SQLite\Find; use BitBadger\Documents\SQLite\Find;
use FeedReaderCentral\Data; use FeedReaderCentral\Data;
use FeedReaderCentral\Domain\Feed as FeedDocument;
use FeedReaderCentral\Domain\Table; use FeedReaderCentral\Domain\Table;
use FeedReaderCentral\Domain\User; use FeedReaderCentral\Domain\User;
use FeedReaderCentral\Feed; use FeedReaderCentral\Feed;
@ -36,12 +37,11 @@ function display_help(): never
function refresh_all(): void function refresh_all(): void
{ {
$db = Data::getConnection(); $db = Data::getConnection();
$users = [];
try { try {
$feeds = Feed::retrieveAll(); $users = [];
foreach ($feeds->items() as /** @var Feed $feed */ $feed) { iterator_apply(Feed::retrieveAll()->items(), function (FeedDocument $feed) use ($db, $users) {
$result = Feed::refreshFeed($feed->id, $feed->url, $db); $result = Feed::refreshFeed($feed->id, $feed->url, $db);
$userKey = "$feed->user_id"; $userKey = "$feed->user_id";
if (!key_exists($userKey, $users)) $users[$userKey] = Find::byId(Table::USER, $feed->user_id, User::class); if (!key_exists($userKey, $users)) $users[$userKey] = Find::byId(Table::USER, $feed->user_id, User::class);
@ -51,7 +51,7 @@ function refresh_all(): void
} else { } else {
printfn('OK (%s) %s', $users[$userKey]->email, $feed->url); printfn('OK (%s) %s', $users[$userKey]->email, $feed->url);
} }
} });
printfn(PHP_EOL . 'All feeds refreshed'); printfn(PHP_EOL . 'All feeds refreshed');
} catch (DocumentException $ex) { } catch (DocumentException $ex) {
printfn("ERR $ex"); printfn("ERR $ex");

View File

@ -1,5 +1,6 @@
<?php <?php
use BitBadger\Documents\DocumentException;
use BitBadger\Documents\Field; use BitBadger\Documents\Field;
use BitBadger\Documents\SQLite\Count; use BitBadger\Documents\SQLite\Count;
use BitBadger\Documents\SQLite\Delete; use BitBadger\Documents\SQLite\Delete;
@ -161,22 +162,19 @@ function delete_user(string $email): void
try { try {
$displayUser = display_user($email); $displayUser = display_user($email);
// Get the ID for the provided e-mail address // Get the user for the provided e-mail address
$user = Security::findUserByEmail($email, $db); $user = Security::findUserByEmail($email, $db);
if (!$user) { if (!$user) {
printfn('No %s exists', $displayUser); printfn('No %s exists', $displayUser);
return; return;
} }
$feedCount = Count::byField(Table::FEED, Field::EQ('user_id', $user->id)); try {
// $feedCountQuery = $db->prepare('SELECT COUNT(*) FROM feed WHERE user_id = :user'); $feedCount = Count::byFields(Table::FEED, [Field::EQ('user_id', $user->id)], $db);
// $feedCountQuery->bindValue(':user', $user['id']); } catch (DocumentException $ex) {
// $feedCountResult = $feedCountQuery->execute(); printfn("$ex");
// if (!$feedCountResult) { return;
// printfn('SQLite error: %s', $db->lastErrorMsg()); }
// return;
// }
// $feedCount = $feedCountResult->fetchArray(SQLITE3_NUM);
$proceed = readline("Delete the $displayUser and their $feedCount feed(s)? (y/N)" . PHP_EOL); $proceed = readline("Delete the $displayUser and their $feedCount feed(s)? (y/N)" . PHP_EOL);
if (!$proceed || !str_starts_with(strtolower($proceed), 'y')) { if (!$proceed || !str_starts_with(strtolower($proceed), 'y')) {
@ -184,21 +182,19 @@ function delete_user(string $email): void
return; return;
} }
$itemDelete = $db->prepare('DELETE FROM item WHERE feed_id IN (SELECT id FROM feed WHERE user_id = :user)'); try {
$itemDelete->bindValue(':user', $user->id); // TODO: convert query
$itemDelete->execute(); $itemDelete = $db->prepare('DELETE FROM item WHERE feed_id IN (SELECT id FROM feed WHERE user_id = :user)');
$itemDelete->bindValue(':user', $user->id);
$itemDelete->execute();
Delete::byField(Table::FEED, Field::EQ('user_id', $user['id']), $db); Delete::byFields(Table::FEED, [Field::EQ('user_id', $user->id)], $db);
// $feedDelete = $db->prepare('DELETE FROM feed WHERE user_id = :user'); Delete::byId(Table::USER, $user->id, $db);
// $feedDelete->bindValue(':user', $user['id']);
// $feedDelete->execute();
Delete::byId(Table::USER, $user->id, $db); printfn('%s deleted successfully', init_cap($displayUser));
// $userDelete = $db->prepare('DELETE FROM frc_user WHERE id = :user'); } catch (DocumentException $ex) {
// $userDelete->bindValue(':user', $user['id']); printfn("$ex");
// $userDelete->execute(); }
printfn('%s deleted successfully', init_cap($displayUser));
} finally { } finally {
$db->close(); $db->close();
} }
@ -221,13 +217,10 @@ function migrate_single_user(): void
Patch::byId(Table::USER, $single->id, Patch::byId(Table::USER, $single->id,
['email' => $argv[2], 'password' => password_hash($argv[3], Security::PW_ALGORITHM)], $db); ['email' => $argv[2], 'password' => password_hash($argv[3], Security::PW_ALGORITHM)], $db);
// $migrateQuery = $db->prepare('UPDATE frc_user SET email = :email, password = :password WHERE id = :id');
// $migrateQuery->bindValue(':email', $argv[2]);
// $migrateQuery->bindValue(':password', password_hash($argv[3], Security::PW_ALGORITHM));
// $migrateQuery->bindValue(':id', $single['id']);
// $migrateQuery->execute();
printfn('The single user has been moved to "%s", with password "%s"', $argv[2], $argv[3]); printfn('The single user has been moved to "%s", with password "%s"', $argv[2], $argv[3]);
} catch (DocumentException $ex) {
printfn("$ex");
} finally { } finally {
$db->close(); $db->close();
} }