Change from SimpleXML to DOM (#4)
This API is more reliable, and should help when implementing the "load a site's HTML and look for feed links" functionality coming before the final release.
This commit is contained in:
parent
0530ed0dc9
commit
8ca4bf2109
@ -99,8 +99,23 @@ class Data {
|
||||
$query->execute();
|
||||
}
|
||||
|
||||
/**
 * Normalize a date/time string to the ATOM format
 *
 * @param ?string $value The raw date/time text; `null` and other falsy strings are treated as "no date"
 * @return string|null The date/time in `DateTimeInterface::ATOM` format, or `null` if there is nothing to parse or
 *                     the text cannot be parsed
 */
private static function formatDate(?string $value): ?string {
    // Nothing to parse
    if (!$value) return null;

    try {
        $parsed = new DateTimeImmutable($value);
    } catch (Exception) {
        // An unparseable date is reported as "no date" rather than as an error
        return null;
    }
    return $parsed->format(DateTimeInterface::ATOM);
}
|
||||
|
||||
/**
|
||||
* Add an RSS feed
|
||||
*
|
||||
* @param string $url The URL for the RSS feed
|
||||
* @param string $title The title of the RSS feed
|
||||
* @param string $updatedOn The date/time the RSS feed was last updated (from the XML, not when we checked)
|
||||
@ -108,28 +123,25 @@ class Data {
|
||||
*/
|
||||
public static function addFeed(string $url, string $title, string $updatedOn): int {
|
||||
$db = self::getConnection();
|
||||
if ($updatedOn) {
|
||||
try {
|
||||
$updated = (new DateTimeImmutable($updatedOn))->format(DateTimeInterface::ATOM);
|
||||
} catch (Exception) {
|
||||
$updated = null;
|
||||
}
|
||||
} else {
|
||||
$updated = null;
|
||||
}
|
||||
$query = $db->prepare('INSERT INTO feed (user_id, url, title, updated_on, checked_on)'
|
||||
. ' VALUES (:user, :url, :title, :updated, :checked)');
|
||||
$query->bindValue(':user', $_REQUEST['FRC_USER_ID']);
|
||||
$query->bindValue(':url', $url);
|
||||
$query->bindValue(':title', $title);
|
||||
$query->bindValue(':updated', $updated);
|
||||
$query->bindValue(':checked', (new DateTimeImmutable())->format(DateTimeInterface::ATOM));
|
||||
$query = $db->prepare(<<<'SQL'
|
||||
INSERT INTO feed (
|
||||
user_id, url, title, updated_on, checked_on
|
||||
) VALUES (
|
||||
:user, :url, :title, :updated, :checked
|
||||
)
|
||||
SQL);
|
||||
$query->bindValue(':user', $_REQUEST['FRC_USER_ID']);
|
||||
$query->bindValue(':url', $url);
|
||||
$query->bindValue(':title', $title);
|
||||
$query->bindValue(':updated', self::formatDate($updatedOn));
|
||||
$query->bindValue(':checked', self::formatDate('now'));
|
||||
$result = $query->execute();
|
||||
return $result ? $db->lastInsertRowID() : -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does a feed item already exist?
|
||||
*
|
||||
* @param int $feedId The ID of the feed to which the item belongs
|
||||
* @param string $guid The GUID from the RSS feed, uniquely identifying the item
|
||||
* @return bool True if the item exists, false if not
|
||||
@ -145,28 +157,34 @@ class Data {
|
||||
|
||||
/**
|
||||
* Add a feed item
|
||||
*
|
||||
* @param int $feedId The ID of the feed to which the item should be added
|
||||
* @param string $guid The GUID from the RSS feed (uses link if `<guid>` not specified)
|
||||
* @param string $link The link to this item
|
||||
* @param string $title The title of the item
|
||||
* @param string $published The date/time the item was published
|
||||
* @param string $publishedOn The date/time the item was published
|
||||
* @param ?string $updatedOn The date/time the item was last updated
|
||||
* @param string $content The content of the item
|
||||
* @param bool $isEncoded Whether the content has HTML (true) or is plaintext (false)
|
||||
* @throws Exception If the published date is not valid
|
||||
*/
|
||||
public static function addItem(int $feedId, string $guid, string $link, string $title, string $published,
|
||||
string $content, bool $isEncoded): void {
|
||||
public static function addItem(int $feedId, string $guid, string $link, string $title, string $publishedOn,
|
||||
?string $updatedOn, string $content, bool $isEncoded): void {
|
||||
$db = self::getConnection();
|
||||
$query = $db->prepare(
|
||||
'INSERT INTO item (feed_id, item_guid, item_link, title, published_on, content, is_encoded)'
|
||||
. ' VALUES (:feed, :guid, :link, :title, :published, :content, :encoded)');
|
||||
$query->bindValue(':feed', $feedId);
|
||||
$query->bindValue(':guid', $guid);
|
||||
$query->bindValue(':link', $link);
|
||||
$query->bindValue(':title', $title);
|
||||
$query->bindValue(':published', (new DateTimeImmutable($published))->format(DateTimeInterface::ATOM));
|
||||
$query->bindValue(':content', $content);
|
||||
$query->bindValue(':encoded', $isEncoded);
|
||||
$query = $db->prepare(<<<'SQL'
|
||||
INSERT INTO item (
|
||||
feed_id, item_guid, item_link, title, published_on, updated_on, content, is_encoded
|
||||
) VALUES (
|
||||
:feed, :guid, :link, :title, :published, :updated, :content, :encoded
|
||||
)
|
||||
SQL);
|
||||
$query->bindValue(':feed', $feedId);
|
||||
$query->bindValue(':guid', $guid);
|
||||
$query->bindValue(':link', $link);
|
||||
$query->bindValue(':title', $title);
|
||||
$query->bindValue(':published', self::formatDate($publishedOn));
|
||||
$query->bindValue(':updated', self::formatDate($updatedOn));
|
||||
$query->bindValue(':content', $content);
|
||||
$query->bindValue(':encoded', $isEncoded);
|
||||
$query->execute();
|
||||
}
|
||||
}
|
||||
|
@ -4,23 +4,52 @@
|
||||
*/
|
||||
class Feed {
|
||||
|
||||
/** @var string The XML namespace for Atom feeds */
|
||||
public const ATOM_NS = 'http://www.w3.org/2005/Atom';
|
||||
|
||||
/** @var string The XML namespace for the `<content>` tag that allows HTML content in a feed */
|
||||
public const CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';
|
||||
|
||||
/**
 * Error handler that turns XML parse warnings from `DOMDocument::loadXML()` into exceptions
 *
 * When parsing XML into a DOMDocument, errors are presented as warnings rather than exceptions. PHP names the
 * method in the warning text as `DOMDocument::loadXML()`; the match is case-insensitive so that the check does not
 * depend on the exact capitalization of the method name.
 *
 * @param int $errno The error level encountered
 * @param string $errstr The text of the error encountered
 * @return bool False, to delegate to the next error handler in the chain
 * @throws DOMException If the error is a warning raised by `DOMDocument::loadXML()`
 */
private static function xmlParseError(int $errno, string $errstr): bool {
    if ($errno === E_WARNING && stripos($errstr, 'DOMDocument::loadXML()') !== false) {
        throw new DOMException($errstr, $errno);
    }
    // Not an XML parse warning; let the next handler deal with it
    return false;
}
|
||||
|
||||
/**
|
||||
* Parse a feed into an XML tree
|
||||
*
|
||||
* @param string $content The feed's RSS content
|
||||
* @return array|SimpleXMLElement[]|string[] [ 'ok' => feed ] if successful, [ 'error' => message] if not
|
||||
* @return array|DOMDocument[]|string[] [ 'ok' => feed ] if successful, [ 'error' => message] if not
|
||||
*/
|
||||
public static function parseFeed(string $content): array {
|
||||
set_error_handler(self::xmlParseError(...));
|
||||
try {
|
||||
return [ 'ok' => new SimpleXMLElement($content) ];
|
||||
} catch (Exception $ex) {
|
||||
$feed = new DOMDocument();
|
||||
$feed->loadXML($content);
|
||||
return [ 'ok' => $feed ];
|
||||
} catch (DOMException $ex) {
|
||||
return [ 'error' => $ex->getMessage() ];
|
||||
} finally {
|
||||
restore_error_handler();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the feed
|
||||
*
|
||||
* @param string $url
|
||||
* @return array|SimpleXMLElement[]|string[] [ 'ok' => feedXml, 'url' => actualUrl ] if successful, [ 'error' => message ] if not
|
||||
* @return array|DOMDocument[]|string[] [ 'ok' => feedXml, 'url' => actualUrl ] if successful,
|
||||
* [ 'error' => message ] if not
|
||||
*/
|
||||
public static function retrieveFeed(string $url): array {
|
||||
$feedReq = curl_init($url);
|
||||
@ -52,32 +81,46 @@ class Feed {
|
||||
return $result;
|
||||
}
|
||||
|
||||
/**
 * Get the value of a child element by its tag name
 *
 * A direct child of the given element is preferred, so that a nested element with the same name (for example, the
 * `<title>` inside a channel's `<image>`) cannot be picked up ahead of the element's own child. If no direct child
 * matches, the first matching descendant in document order is used.
 *
 * @param DOMElement $element The parent element
 * @param string $tagName The name of the tag whose value should be obtained
 * @return string The value of the element (or "[element] not found" if that element does not exist)
 */
private static function eltValue(DOMElement $element, string $tagName): string {
    // First choice: an immediate child with this tag name
    foreach ($element->childNodes as $child) {
        if ($child instanceof DOMElement && $child->tagName === $tagName) return $child->textContent;
    }

    // Otherwise, fall back to the first matching descendant anywhere below the element
    $tags = $element->getElementsByTagName($tagName);
    return $tags->length === 0 ? "$tagName not found" : $tags->item(0)->textContent;
}
|
||||
|
||||
/**
|
||||
* Update a feed's items
|
||||
*
|
||||
* @param int $feedId The ID of the feed to which these items belong
|
||||
* @param SimpleXMLElement $channel The RSS feed items
|
||||
* @param DOMElement $channel The RSS feed items
|
||||
* @return array [ 'ok' => true ] if successful, [ 'error' => message ] if not
|
||||
*/
|
||||
public static function updateItems(int $feedId, SimpleXMLElement $channel): array {
|
||||
public static function updateItems(int $feedId, DOMElement $channel): array {
|
||||
try {
|
||||
for ($i = 0; $i < sizeof($channel->item); $i++) {
|
||||
$item = $channel->item[$i];
|
||||
$itemGuid = (string)$item->guid ? $item->guid : $item->link;
|
||||
foreach ($channel->getElementsByTagName('item') as $item) {
|
||||
$itemGuid = self::eltValue($item, 'guid');
|
||||
if ($itemGuid == 'guid not found') $itemGuid = self::eltValue($item, 'link');
|
||||
$isNew = !Data::itemExists($feedId, $itemGuid);
|
||||
if ($isNew) {
|
||||
$title = (string)$item->title;
|
||||
$link = (string)$item->link;
|
||||
$published = (string)$item->pubDate;
|
||||
// TODO: why is this getting all encoded content, and not just the one for the current item?
|
||||
$encodedContent = $item->xpath('//content:encoded');
|
||||
if ($encodedContent) {
|
||||
$content = (string) $encodedContent[$i];
|
||||
$title = self::eltValue($item, 'title');
|
||||
$link = self::eltValue($item, 'link');
|
||||
$published = self::eltValue($item, 'pubDate');
|
||||
$updNodes = $item->getElementsByTagNameNS(self::ATOM_NS, 'updated');
|
||||
$updated = $updNodes->length > 0 ? $updNodes->item(0)->textContent : null;
|
||||
$encNodes = $item->getElementsByTagNameNS(self::CONTENT_NS, 'encoded');
|
||||
if ($encNodes->length > 0) {
|
||||
$content = $encNodes->item(0)->textContent;
|
||||
$isEncoded = true;
|
||||
} else {
|
||||
$content = $item->description;
|
||||
$content = self::eltValue($item, 'description');
|
||||
$isEncoded = false;
|
||||
}
|
||||
Data::addItem($feedId, $itemGuid, $link, $title, $published, $content, $isEncoded);
|
||||
Data::addItem($feedId, $itemGuid, $link, $title, $published, $updated, $content, $isEncoded);
|
||||
} // TODO: else check updated date; may want to return that from the isNew check instead
|
||||
}
|
||||
} catch (Exception $ex) {
|
||||
@ -88,6 +131,7 @@ class Feed {
|
||||
|
||||
/**
|
||||
* Add an RSS feed
|
||||
*
|
||||
* @param string $url The URL of the RSS feed to add
|
||||
* @return array [ 'ok' => true ] if successful, [ 'error' => message ] if not
|
||||
*/
|
||||
@ -95,8 +139,11 @@ class Feed {
|
||||
$feed = self::retrieveFeed($url);
|
||||
if (array_key_exists('error', $feed)) return $feed;
|
||||
|
||||
$channel = $feed['ok']->channel;
|
||||
$feedId = Data::addFeed($feed['url'], (string) $channel->title, (string) $channel->lastBuildDate);
|
||||
$channel = $feed['ok']->getElementsByTagName('channel')->item(0);
|
||||
if (!$channel instanceof DOMElement) return [ 'error' => "Channel element not found ($channel->nodeType)" ];
|
||||
|
||||
$feedId = Data::addFeed($feed['url'], self::eltValue($channel, 'title'),
|
||||
self::eltValue($channel, 'lastBuildDate'));
|
||||
|
||||
$result = self::updateItems($feedId, $channel);
|
||||
if (array_key_exists('error', $result)) return $result;
|
||||
|
Loading…
Reference in New Issue
Block a user