feed-reader-central/src/lib/ParsedFeed.php

<?php declare(strict_types=1);

namespace FeedReaderCentral;

use DOMDocument;
use DOMElement;
use DOMException;
use DOMNode;

/**
 * A feed that has been retrieved over HTTP(S) and parsed from RSS or Atom XML
 *
 * Operations that can fail return a result array rather than throwing: `['ok' => value]` on success or
 * `['error' => message]` on failure.
 */
class ParsedFeed
{
    /** @var string The URL for the feed */
    public string $url = '';

    /** @var string The title of the feed */
    public string $title = '';

    /** @var ?string When the feed was last updated */
    public ?string $updatedOn = null;

    /** @var ParsedItem[] The items contained in the feed */
    public array $items = [];

    /** @var string The XML namespace for Atom feeds */
    public const ATOM_NS = 'http://www.w3.org/2005/Atom';

    /** @var string The XML namespace for the `<content:encoded>` tag that allows HTML content in a feed */
    public const CONTENT_NS = 'http://purl.org/rss/1.0/modules/content/';

    /** @var string The XML namespace for XHTML */
    public const XHTML_NS = 'http://www.w3.org/1999/xhtml';

    /** @var string The user agent for Feed Reader Central's refresh requests */
    private const USER_AGENT =
        'FeedReaderCentral/' . FRC_VERSION . ' +https://bitbadger.solutions/open-source/feed-reader-central';

    /**
     * When parsing XML into a DOMDocument, errors are presented as warnings; this creates an exception for them
     *
     * @param int $errno The error level encountered
     * @param string $errstr The text of the error encountered
     * @return bool False, to delegate to the next error handler in the chain
     * @throws DOMException If the error is a warning raised by `DOMDocument::loadXML()`
     */
    private static function xmlParseError(int $errno, string $errstr): bool {
        if ($errno === E_WARNING && str_contains($errstr, 'DOMDocument::loadXML()')) {
            throw new DOMException($errstr, $errno);
        }
        return false;
    }

    /**
     * Parse a feed into an XML tree
     *
     * @param string $content The feed's RSS content
     * @return array{ok: DOMDocument}|array{error: string} ['ok' => feed] if successful, ['error' => message] if not
     */
    public static function parseFeed(string $content): array {
        // loadXML() throws a ValueError (not a warning) for an empty string, so handle that case up front
        if ($content === '') return ['error' => 'Feed content is empty'];

        set_error_handler(self::xmlParseError(...));
        try {
            $feed = new DOMDocument();
            // A false return without a warning is possible (e.g., when libxml errors are collected internally)
            if (!$feed->loadXML($content)) return ['error' => 'Unable to parse feed XML'];
            return ['ok' => $feed];
        } catch (DOMException $ex) {
            return ['error' => $ex->getMessage()];
        } finally {
            restore_error_handler();
        }
    }

    /**
     * Get the value of a child element by its tag name for an RSS feed
     *
     * The first matching descendant (in document order) is used, not just direct children.
     *
     * @param DOMNode $element The parent element
     * @param string $tagName The name of the tag whose value should be obtained
     * @return string The value of the element (or "[element] not found" if that element does not exist)
     */
    public static function rssValue(DOMNode $element, string $tagName): string {
        // Only elements and documents can be searched by tag name; other node types cannot contain the tag
        if (!($element instanceof DOMElement || $element instanceof DOMDocument)) return "$tagName not found";

        $tag = $element->getElementsByTagName($tagName)->item(0);
        return $tag === null ? "$tagName not found" : $tag->textContent;
    }

    /**
     * Extract items from an RSS feed
     *
     * @param DOMDocument $xml The XML received from the feed
     * @param string $url The actual URL for the feed
     * @return array{ok: static}|array{error: string} ['ok' => feed] if successful, ['error' => message] if not
     */
    private static function fromRSS(DOMDocument $xml, string $url): array {
        $channel = $xml->getElementsByTagName('channel')->item(0);
        if (!($channel instanceof DOMElement)) {
            $type = $channel?->nodeType ?? -1;
            return ['error' => "Channel element not found ($type)"];
        }

        // RSS channels may provide a lastBuildDate, which contains the last time the content of the feed changed; if
        // that is not present, use the pubDate element instead
        $updatedOn = self::rssValue($channel, 'lastBuildDate');
        if ($updatedOn === 'lastBuildDate not found') {
            $updatedOn = self::rssValue($channel, 'pubDate');
            if ($updatedOn === 'pubDate not found') $updatedOn = null;
        }

        $feed            = new static();
        $feed->title     = self::rssValue($channel, 'title');
        $feed->url       = $url;
        $feed->updatedOn = Data::formatDate($updatedOn);
        foreach ($channel->getElementsByTagName('item') as $item) $feed->items[] = ParsedItem::fromRSS($item);

        return ['ok' => $feed];
    }

    /**
     * Get an attribute value from a DOM node
     *
     * @param DOMNode $node The node with an attribute value to obtain
     * @param string $name The name of the attribute whose value should be obtained
     * @return string The attribute value if it exists, an empty string if not
     */
    private static function attrValue(DOMNode $node, string $name): string {
        // Only elements carry attributes; getAttribute() already yields '' for a missing attribute
        return $node instanceof DOMElement ? $node->getAttribute($name) : '';
    }

    /**
     * Get the value of a child element by its tag name for an Atom feed
     *
     * (Atom feeds can have type attributes on nearly any value. For our purposes, types "text" and "html" will work as
     * regular string values; for "xhtml", though, we will need to get the `<div>` and extract its contents instead.)
     *
     * The first matching descendant (in document order) is used, not just direct children.
     *
     * @param DOMNode $element The parent element
     * @param string $tagName The name of the tag whose value should be obtained
     * @return string The value of the element (or "[element] not found" if that element does not exist)
     */
    public static function atomValue(DOMNode $element, string $tagName): string {
        // Only elements and documents can be searched by tag name; other node types cannot contain the tag
        if (!($element instanceof DOMElement || $element instanceof DOMDocument)) return "$tagName not found";

        $tag = $element->getElementsByTagName($tagName)->item(0);
        if ($tag === null) return "$tagName not found";
        if (!($tag instanceof DOMElement)) return $tag->textContent;

        if (self::attrValue($tag, 'type') === 'xhtml') {
            $div = $tag->getElementsByTagNameNS(self::XHTML_NS, 'div')->item(0);
            return $div === null ? "-- invalid XHTML content --" : $div->textContent;
        }
        return $tag->textContent;
    }

    /**
     * Extract items from an Atom feed
     *
     * @param DOMDocument $xml The XML received from the feed
     * @param string $url The actual URL for the feed
     * @return array{ok: static}|array{error: string} ['ok' => feed] if successful, ['error' => message] if not
     */
    private static function fromAtom(DOMDocument $xml, string $url): array {
        $root = $xml->getElementsByTagNameNS(self::ATOM_NS, 'feed')->item(0);
        if (!($root instanceof DOMElement)) return ['error' => 'Feed element not found'];

        // atomValue() reports a missing element as "[tag] not found"; translate that to "no date"
        $updatedOn = self::atomValue($root, 'updated');
        if ($updatedOn === 'updated not found') $updatedOn = null;

        $feed            = new static();
        $feed->title     = self::atomValue($root, 'title');
        $feed->url       = $url;
        $feed->updatedOn = Data::formatDate($updatedOn);
        foreach ($root->getElementsByTagName('entry') as $entry) $feed->items[] = ParsedItem::fromAtom($entry);

        return ['ok' => $feed];
    }

    /**
     * Retrieve a document (http/https)
     *
     * @param string $url The URL of the document to retrieve
     * @return array{content: string, error: string, code: int, url: string} ['content' => document content (empty if
     *      the request failed), 'error' => error message, 'code' => HTTP response code, 'url' => effective URL]
     */
    private static function retrieveDocument(string $url): array {
        $docReq = curl_init($url);
        if ($docReq === false) {
            return ['content' => '', 'error' => "Unable to initialize a request for $url", 'code' => 0, 'url' => $url];
        }

        curl_setopt_array($docReq, [
            CURLOPT_FOLLOWLOCATION  => true,
            CURLOPT_RETURNTRANSFER  => true,
            CURLOPT_CONNECTTIMEOUT  => 5,
            CURLOPT_TIMEOUT         => 15,
            CURLOPT_USERAGENT       => self::USER_AGENT,
            // URLs come from users and from remote HTML; never let them reach file:// or other non-web schemes
            CURLOPT_PROTOCOLS       => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
        ]);

        // curl_exec() returns false on failure; keep 'content' a string so callers can use it without type checks
        $content = curl_exec($docReq);
        $result  = [
            'content' => is_string($content) ? $content : '',
            'error'   => curl_error($docReq),
            'code'    => (int) curl_getinfo($docReq, CURLINFO_RESPONSE_CODE),
            'url'     => (string) curl_getinfo($docReq, CURLINFO_EFFECTIVE_URL),
        ];

        curl_close($docReq);
        return $result;
    }

    /**
     * Describe why a retrieved document cannot be used, if that is the case
     *
     * @param string $url The URL that was requested
     * @param array $doc The result from `retrieveDocument()` for that URL
     * @return ?string The error message, or null if the document was retrieved successfully (HTTP 200)
     */
    private static function retrievalError(string $url, array $doc): ?string {
        if ($doc['error'] !== '') return $doc['error'];
        if ($doc['code'] !== 200) {
            return "Prospective feed URL $url returned HTTP Code {$doc['code']}: {$doc['content']}";
        }
        return null;
    }

    /**
     * Resolve a possibly-relative link against the URL of the document in which it was found
     *
     * Handles absolute http(s) URLs, protocol-relative (`//host/path`), root-relative (`/path`), and path-relative
     * (`path`) links. Dot segments (`./`, `../`) are not collapsed here.
     *
     * @param string $base The URL of the document containing the link
     * @param string $link The link as it appeared in the document
     * @return string The absolute URL (or the link unchanged if the base URL has no scheme and host)
     */
    private static function resolveURL(string $base, string $link): string {
        if (preg_match('#^https?://#i', $link) === 1) return $link;

        $parts = parse_url($base);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) return $link;

        if (str_starts_with($link, '//')) return "{$parts['scheme']}:$link";

        $port   = isset($parts['port']) ? ":{$parts['port']}" : '';
        $origin = "{$parts['scheme']}://{$parts['host']}$port";
        if (str_starts_with($link, '/')) return $origin . $link;

        // Path-relative; the link lives in the same "directory" as the containing document
        $path      = $parts['path'] ?? '/';
        $directory = substr($path, 0, (int) strrpos($path, '/') + 1);
        return $origin . ($directory === '' ? '/' : $directory) . $link;
    }

    /**
     * Derive a feed URL from an HTML document
     *
     * @param string $content The HTML document content from which to derive a feed URL
     * @return array{ok: string}|array{error: string} ['ok' => feed URL] if successful, ['error' => message] if not
     */
    private static function deriveFeedFromHTML(string $content): array {
        $notFound = ['error' => 'Cannot find feed at this URL'];

        // Only the head is needed; if its closing tag cannot be found (in any letter case), parse the whole document
        $headEnd = stripos($content, '</head>');
        $markup  = $headEnd === false ? $content : substr($content, 0, $headEnd + 7);
        if (trim($markup) === '') return $notFound;

        // Real-world HTML routinely triggers libxml warnings; collect them internally instead of emitting them
        $previous = libxml_use_internal_errors(true);
        try {
            $html = new DOMDocument();
            if (!$html->loadHTML($markup)) return $notFound;
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        $head = $html->getElementsByTagName('head')->item(0);
        if (!($head instanceof DOMElement)) return $notFound;

        foreach ($head->getElementsByTagName('link') as $link) {
            if (strtolower(trim(self::attrValue($link, 'rel'))) !== 'alternate') continue;
            $type = strtolower(trim(self::attrValue($link, 'type')));
            if ($type !== 'application/rss+xml' && $type !== 'application/atom+xml') continue;
            $href = trim(self::attrValue($link, 'href'));
            if ($href !== '') return ['ok' => $href];
        }
        return $notFound;
    }

    /**
     * Retrieve the feed
     *
     * If the URL returns an HTML document, the feed is discovered from its `<link rel="alternate">` tags and retrieved
     * from there instead.
     *
     * @param string $url The URL of the feed to retrieve
     * @return array{ok: ParsedFeed}|array{error: string} ['ok' => feed] if successful, ['error' => message] if not
     */
    public static function retrieve(string $url): array {
        $doc = self::retrieveDocument($url);
        if (($error = self::retrievalError($url, $doc)) !== null) return ['error' => $error];

        $start = strtolower(substr(ltrim($doc['content']), 0, 9));
        if ($start === '<!doctype' || str_starts_with($start, '<html')) {
            $derivedURL = self::deriveFeedFromHTML($doc['content']);
            if (key_exists('error', $derivedURL)) return ['error' => $derivedURL['error']];

            // Relative links are resolved against the URL that actually served the page (i.e., after redirects)
            $feedURL = self::resolveURL($doc['url'] !== '' ? $doc['url'] : $url, $derivedURL['ok']);
            $doc     = self::retrieveDocument($feedURL);
            if (($error = self::retrievalError($feedURL, $doc)) !== null) return ['error' => $error];
        }

        $parsed = self::parseFeed($doc['content']);
        if (key_exists('error', $parsed)) return ['error' => $parsed['error']];

        $extract = $parsed['ok']->getElementsByTagNameNS(self::ATOM_NS, 'feed')->length > 0
            ? self::fromAtom(...) : self::fromRSS(...);
        return $extract($parsed['ok'], $doc['url']);
    }
}