` tag that allows HTML content in a feed */
    public const CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';

    /** @var string The XML namespace for XHTML */
    public const XHTML_NS = 'http://www.w3.org/1999/xhtml';

    /** @var string The user agent for Feed Reader Central's refresh requests */
    private const USER_AGENT =
        'FeedReaderCentral/' . FRC_VERSION . ' +https://bitbadger.solutions/open-source/feed-reader-central';

    /**
     * When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
     *
     * Only warnings whose text mentions `DOMDocument::loadXML()` are converted; anything else is passed along to the
     * next error handler.
     *
     * @param int $errno The error level encountered
     * @param string $errstr The text of the error encountered
     * @return bool False, to delegate to the next error handler in the chain
     * @throws DOMException If the error is a warning raised by DOMDocument::loadXML()
     */
    private static function xmlParseError(int $errno, string $errstr): bool
    {
        if ($errno == E_WARNING && substr_count($errstr, 'DOMDocument::loadXML()') > 0) {
            throw new DOMException($errstr, $errno);
        }
        return false;
    }

    /**
     * Parse a feed into an XML tree
     *
     * @param string $content The feed's RSS content
     * @return array|DOMDocument[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    public static function parseFeed(string $content): array
    {
        // DOMDocument::loadXML() throws a ValueError (rather than raising a warning) for an empty source; that would
        // escape the DOMException handling below, so report it as a regular feed error instead
        if ($content === '') return ['error' => 'Feed content is empty'];

        set_error_handler(self::xmlParseError(...));
        try {
            $feed = new DOMDocument();
            // loadXML() can also fail by returning false without a warning reaching the handler above
            if (!$feed->loadXML($content)) return ['error' => 'Feed content could not be parsed as XML'];
            return ['ok' => $feed];
        } catch (DOMException $ex) {
            return ['error' => $ex->getMessage()];
        } finally {
            // Always put the previous error handler back, whether parsing succeeded or not
            restore_error_handler();
        }
    }

    /**
     * Get the value of a child element by its tag name for an RSS feed
     *
     * (The lookup uses getElementsByTagName(), so the first matching descendant in document order is used.)
     *
     * @param DOMNode $element The parent element
     * @param string $tagName The name of the tag whose value should be obtained
     * @return string The value of the element (or "[element] not found" if that element does not exist)
     */
    public static function rssValue(DOMNode $element, string $tagName): string
    {
        $tags = $element->getElementsByTagName($tagName);
        return $tags->length == 0 ? "$tagName not found" : $tags->item(0)->textContent;
    }

    /**
     * Extract items from an RSS feed
     *
     * @param DOMDocument $xml The XML received from the feed
     * @param string $url The actual URL for the feed
     * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    private static function fromRSS(DOMDocument $xml, string $url): array
    {
        $channel = $xml->getElementsByTagName('channel')->item(0);
        if (!($channel instanceof DOMElement)) {
            $type = $channel?->nodeType ?? -1;
            return ['error' => "Channel element not found ($type)"];
        }

        // An RSS channel may provide a lastBuildDate, which contains the last time the content of the feed changed;
        // if that is not present, use the pubDate element instead
        if (($updatedOn = self::rssValue($channel, 'lastBuildDate')) == 'lastBuildDate not found') {
            if (($updatedOn = self::rssValue($channel, 'pubDate')) == 'pubDate not found') {
                $updatedOn = null;
            }
        }

        $feed            = new static();
        $feed->title     = self::rssValue($channel, 'title');
        $feed->url       = $url;
        $feed->updatedOn = Data::formatDate($updatedOn);
        foreach ($channel->getElementsByTagName('item') as $item) $feed->items[] = ParsedItem::fromRSS($item);

        return ['ok' => $feed];
    }

    /**
     * Get an attribute value from a DOM node
     *
     * @param DOMNode $node The node with an attribute value to obtain
     * @param string $name The name of the attribute whose value should be obtained
     * @return string The attribute value if it exists, an empty string if not
     */
    private static function attrValue(DOMNode $node, string $name): string
    {
        return ($node->hasAttributes() ? $node->attributes->getNamedItem($name)?->value : null) ?? '';
    }

    /**
     * Get the value of a child element by its tag name for an Atom feed
     *
     * (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
     * regular string values; for "xhtml", though, we will need to get the `<div>` and extract its contents instead.)
     *
     * @param DOMNode $element The parent element
     * @param string $tagName The name of the tag whose value should be obtained
     * @return string The value of the element (or "[element] not found" if that element does not exist)
     */
    public static function atomValue(DOMNode $element, string $tagName): string
    {
        $tags = $element->getElementsByTagName($tagName);
        if ($tags->length == 0) return "$tagName not found";

        $tag = $tags->item(0);
        if (!($tag instanceof DOMElement)) return $tag->textContent;

        if (self::attrValue($tag, 'type') == 'xhtml') {
            // XHTML content is wrapped in a div in the XHTML namespace; its text is the value we want
            $div = $tag->getElementsByTagNameNS(self::XHTML_NS, 'div');
            if ($div->length == 0) return "-- invalid XHTML content --";
            return $div->item(0)->textContent;
        }
        return $tag->textContent;
    }

    /**
     * Extract items from an Atom feed
     *
     * @param DOMDocument $xml The XML received from the feed
     * @param string $url The actual URL for the feed
     * @return array|Feed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    private static function fromAtom(DOMDocument $xml, string $url): array
    {
        $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);
        if (!($root instanceof DOMElement)) {
            $type = $root?->nodeType ?? -1;
            return ['error' => "Feed element not found ($type)"];
        }

        // atomValue() reports a missing element as "[element] not found"; the sentinel must name the element that was
        // requested, or the placeholder text ends up being treated as a date
        if (($updatedOn = self::atomValue($root, 'updated')) == 'updated not found') $updatedOn = null;

        $feed            = new static();
        $feed->title     = self::atomValue($root, 'title');
        $feed->url       = $url;
        $feed->updatedOn = Data::formatDate($updatedOn);
        foreach ($root->getElementsByTagName('entry') as $entry) $feed->items[] = ParsedItem::fromAtom($entry);

        return ['ok' => $feed];
    }

    /**
     * Retrieve a document (http/https)
     *
     * Redirects are followed, the connection timeout is 5 seconds, and the overall timeout is 15 seconds. The
     * 'content' entry is whatever curl_exec() returned, which is false if the transfer failed; callers should check
     * 'error' before using it.
     *
     * @param string $url The URL of the document to retrieve
     * @return array ['content' => document content, 'error' => error message, 'code' => HTTP response code,
     *               'url' => effective URL]
     */
    private static function retrieveDocument(string $url): array
    {
        $docReq = curl_init($url);
        curl_setopt($docReq, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($docReq, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($docReq, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($docReq, CURLOPT_TIMEOUT, 15);
        curl_setopt($docReq, CURLOPT_USERAGENT, self::USER_AGENT);

        $result = [
            'content' => curl_exec($docReq),
            'error'   => curl_error($docReq),
            'code'    => curl_getinfo($docReq, CURLINFO_RESPONSE_CODE),
            'url'     => curl_getinfo($docReq, CURLINFO_EFFECTIVE_URL)
        ];

        curl_close($docReq);
        return $result;
    }

    /**
     * Derive a feed URL from an HTML document
     *
     * The first `link` element in the document's head with rel="alternate" and an RSS or Atom content type wins.
     *
     * @param string $content The HTML document content from which to derive a feed URL
     * @return array|string[] ['ok' => feed URL] if successful, ['error' => message] if not
     */
    private static function deriveFeedFromHTML(string $content): array
    {
        // Only the head is needed. Tag names are case-insensitive in HTML, and if there is no closing head tag at
        // all, the whole document is parsed rather than an arbitrary few leading characters.
        $headEnd = stripos($content, '</head>');
        $markup  = $headEnd === false ? $content : substr($content, 0, $headEnd + 7);
        // DOMDocument::loadHTML() throws a ValueError for an empty source
        if ($markup === '') return ['error' => 'Cannot find feed at this URL'];

        $html = new DOMDocument();
        $html->loadHTML($markup);
        $headTags = $html->getElementsByTagName('head');
        if ($headTags->length < 1) return ['error' => 'Cannot find feed at this URL'];
        $head = $headTags->item(0);
        foreach ($head->getElementsByTagName('link') as $link) {
            if (self::attrValue($link, 'rel') == 'alternate') {
                $type = self::attrValue($link, 'type');
                if ($type == 'application/rss+xml' || $type == 'application/atom+xml') {
                    return ['ok' => self::attrValue($link, 'href')];
                }
            }
        }
        return ['error' => 'Cannot find feed at this URL'];
    }

    /**
     * Retrieve the feed
     *
     * If the URL returns an HTML document rather than a feed, the feed URL is derived from the document's head and
     * that URL is retrieved instead.
     *
     * @param string $url The URL of the feed to retrieve
     * @return array|ParsedFeed[]|string[] ['ok' => feed] if successful, ['error' => message] if not
     */
    public static function retrieve(string $url): array
    {
        $doc = self::retrieveDocument($url);

        if ($doc['error'] != '') return ['error' => $doc['error']];
        if ($doc['code'] != 200) {
            return ['error' => "Prospective feed URL $url returned HTTP Code {$doc['code']}: {$doc['content']}"];
        }

        // '<!doctype' is 9 characters long, which is why the first 9 characters are inspected
        $start = strtolower(strlen($doc['content']) >= 9 ? substr($doc['content'], 0, 9) : $doc['content']);
        if ($start == '<!doctype' || str_starts_with($start, '<html')) {
            $derivedURL = self::deriveFeedFromHTML($doc['content']);
            if (key_exists('error', $derivedURL)) return ['error' => $derivedURL['error']];
            $feedURL = $derivedURL['ok'];
            if (!str_starts_with($feedURL, 'http')) {
                // Relative URL; feed should be retrieved in the context of the original URL
                $original = parse_url($url);
                $port     = key_exists('port', $original) ? ":{$original['port']}" : '';
                $origin   = $original['scheme'] . '://' . $original['host'] . $port;
                if (str_starts_with($feedURL, '//')) {
                    // Protocol-relative; only the scheme comes from the original URL
                    $feedURL = $original['scheme'] . ':' . $feedURL;
                } elseif (str_starts_with($feedURL, '/')) {
                    // Host-relative
                    $feedURL = $origin . $feedURL;
                } else {
                    // Path-relative; resolve against the directory of the original document
                    $path    = $original['path'] ?? '/';
                    $slash   = strrpos($path, '/');
                    $dir     = $slash === false ? '/' : substr($path, 0, $slash + 1);
                    $feedURL = $origin . $dir . $feedURL;
                }
            }
            $doc = self::retrieveDocument($feedURL);

            // The derived feed URL can fail just like the original one; do not try to parse a failed response
            if ($doc['error'] != '') return ['error' => $doc['error']];
            if ($doc['code'] != 200) {
                return [
                    'error' => "Prospective feed URL $feedURL returned HTTP Code {$doc['code']}: {$doc['content']}"
                ];
            }
        }

        $parsed = self::parseFeed($doc['content']);
        if (key_exists('error', $parsed)) return ['error' => $parsed['error']];

        // A document with a feed element in the Atom namespace is Atom; anything else is treated as RSS
        $extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
            ? self::fromAtom(...) : self::fromRSS(...);
        return $extract($parsed['ok'], $doc['url']);
    }
}