gnu-social/plugins/OStatus/lib/feeddiscovery.php
Brion Vibber 9a53be4669 Initial support for third-party fallback hub such as Superfeedr for feed subscriptions.
If set up, this hub will be used to subscribe to feeds that don't specify a hub of their own.
Assumes that the fallback hub will, in fact, handle polling and updates for any feed we throw at it!
Authentication may be specified for the fallback hub.

Example:

  $config['feedsub']['fallback_hub'] = 'https://superfeedr.com/hubbub';
  $config['feedsub']['hub_user'] = 'abcd';
  $config['feedsub']['hub_pass'] = 'ckcmdkmckdmkcdk';

Also:
* Fix for WordPress-RSS-via-Superfeedr-Atom; if we have <author> info but no ID from a native ActivityStreams actor, don't freak out in the low-level processing code that checks for identity matches.
* enhanced messages for low-level FeedSub exceptions if they make it to outside display
2010-08-10 12:57:40 -07:00

289 lines
8.6 KiB
PHP

<?php
/*
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2009, StatusNet, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
* @package FeedSubPlugin
* @maintainer Brion Vibber <brion@status.net>
*/
// Stop with exit status 1 unless one of the STATUSNET / LACONICA
// constants has been defined before this file is loaded.
if (!(defined('STATUSNET') || defined('LACONICA'))) {
    exit(1);
}
/**
 * Thrown by FeedDiscovery::discoverFromURL() when the HTTP client raises an
 * HTTP_Request2_Exception while fetching the URL; the message is copied from
 * that underlying exception.
 */
class FeedSubBadURLException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::initFromResponse() when the HTTP response is not
 * OK; the message carries the response status.
 */
class FeedSubBadResponseException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::initFromResponse() when the response body is
 * empty; the message carries the source URL.
 */
class FeedSubEmptyException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::discoverFromHTML() when DOMDocument::loadHTML()
 * reports failure for the page body.
 */
class FeedSubBadHTMLException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::initFromResponse() when the Content-Type is not
 * one of the accepted XML / RSS / Atom types; the message carries the
 * offending content type.
 */
class FeedSubUnrecognizedTypeException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::discoverFromURL() when an HTML page yields no
 * feed autodiscovery link; the message carries the requested URL.
 */
class FeedSubNoFeedException extends FeedSubException
{
}
/**
 * Thrown by FeedDiscovery::init() when the body cannot be parsed as XML, or
 * when the document is neither an <rss> element containing a <channel> nor a
 * <feed> element; the message carries the source URL.
 */
class FeedSubBadXmlException extends FeedSubException
{
}
/**
 * Not thrown anywhere in this file.
 * NOTE(review): presumably raised by subscription code when a feed offers no
 * hub link and no fallback hub is configured -- confirm against callers.
 */
class FeedSubNoHubException extends FeedSubException
{
}
/**
* Given a web page or feed URL, discover the final location of the feed
* and return its current contents.
*
* @example
* $feed = new FeedDiscovery();
* if ($feed->discoverFromURL($url)) {
* print $feed->uri;
* print $feed->type;
* processFeed($feed->feed); // DOMDocument
* }
*/
class FeedDiscovery
{
    /** @var string URL the feed was loaded from; set by init() */
    public $uri;
    /** @var string Content-Type the feed was served with; set by init() */
    public $type;
    /** @var DOMDocument parsed feed document; set by init() */
    public $feed;
    /** @var DOMElement RSS <channel> or Atom <feed> element; set by init() */
    public $root;

    /**
     * Post-initialize query helper: look up a link on the feed root.
     *
     * @param string $rel link relation to look for
     * @param string $type optional link type to match
     * @return mixed whatever ActivityUtils::getLink() returns for the root
     */
    public function getLink($rel, $type=null)
    {
        // @fixme check for non-Atom links in RSS2 feeds as well
        // Instance call (not self::) since getAtomLink() reads $this->root.
        return $this->getAtomLink($rel, $type);
    }

    /**
     * Look up an Atom-style link on the feed root element by delegating to
     * ActivityUtils::getLink().
     *
     * @param string $rel link relation to look for
     * @param string $type optional link type to match
     * @return mixed result of ActivityUtils::getLink()
     */
    public function getAtomLink($rel, $type=null)
    {
        return ActivityUtils::getLink($this->root, $rel, $type);
    }

    /**
     * Get the referenced PuSH hub link from an Atom feed.
     *
     * @return mixed string or false
     */
    public function getHubLink()
    {
        return $this->getAtomLink('hub');
    }

    /**
     * Fetch a URL and initialize this object from the feed found there.
     * When the response is an HTML page and $htmlOk is set, the page is
     * searched for a feed autodiscovery link, which is then fetched instead
     * (one level only: the follow-up request is made with $htmlOk = false).
     *
     * @param string $url
     * @param bool $htmlOk pass false here if you don't want to follow web pages.
     * @return string with validated URL
     * @throws FeedSubBadURLException
     * @throws FeedSubBadResponseException
     * @throws FeedSubBadHTMLException
     * @throws FeedSubBadXmlException
     * @throws FeedSubNoFeedException
     * @throws FeedSubEmptyException
     * @throws FeedSubUnrecognizedTypeException
     */
    public function discoverFromURL($url, $htmlOk=true)
    {
        try {
            $client = new HTTPClient();
            $response = $client->get($url);
        } catch (HTTP_Request2_Exception $e) {
            common_log(LOG_ERR, __METHOD__ . " Failure for $url - " . $e->getMessage());
            throw new FeedSubBadURLException($e->getMessage());
        }

        if ($htmlOk) {
            // The header may be absent; cast so preg_match() always gets a string.
            $type = (string)$response->getHeader('Content-Type');
            if (preg_match('!^(text/html|application/xhtml\+xml)!i', $type)) {
                $target = $this->discoverFromHTML($response->getUrl(), $response->getBody());
                if (!$target) {
                    throw new FeedSubNoFeedException($url);
                }
                return $this->discoverFromURL($target, false);
            }
        }

        return $this->initFromResponse($response);
    }

    /**
     * Like discoverFromURL(), but never follows HTML pages.
     *
     * @param string $url
     * @return string with validated URL
     */
    public function discoverFromFeedURL($url)
    {
        return $this->discoverFromURL($url, false);
    }

    /**
     * Validate an HTTP response and initialize this object from its body.
     *
     * @param object $response HTTP response offering isOk(), getStatus(),
     *                         getUrl(), getBody() and getHeader()
     * @return string the feed's source URL
     * @throws FeedSubBadResponseException if the response is not OK
     * @throws FeedSubEmptyException if the body is empty
     * @throws FeedSubUnrecognizedTypeException if the Content-Type is not an XML/RSS/Atom type
     * @throws FeedSubBadXmlException if the body is not a usable feed document
     */
    public function initFromResponse($response)
    {
        if (!$response->isOk()) {
            throw new FeedSubBadResponseException($response->getStatus());
        }

        $sourceurl = $response->getUrl();
        $body = $response->getBody();
        if (!$body) {
            throw new FeedSubEmptyException($sourceurl);
        }

        // A missing header becomes '' so it is reported as an unrecognized
        // type rather than handing null to preg_match() and the exception.
        $type = (string)$response->getHeader('Content-Type');
        if (!preg_match('!^(text/xml|application/xml|application/(rss|atom)\+xml)!i', $type)) {
            common_log(LOG_WARNING, "Unrecognized feed type $type for $sourceurl");
            throw new FeedSubUnrecognizedTypeException($type);
        }

        return $this->init($sourceurl, $type, $body);
    }

    /**
     * Parse a feed body and record its URL, type, document and root element.
     *
     * @param string $sourceurl URL the body was fetched from
     * @param string $type Content-Type of the body
     * @param string $body XML text of the feed
     * @return string $sourceurl
     * @throws FeedSubBadXmlException if the body is empty, is not well-formed
     *         XML, or is neither <rss> with a <channel> nor <feed>
     */
    public function init($sourceurl, $type, $body)
    {
        $feed = new DOMDocument();
        // Empty input is rejected up front: newer PHP versions raise a
        // ValueError for it instead of returning false.
        // LIBXML_NONET keeps the parser from touching the network while
        // reading this remote, untrusted document.
        if (!$body || !$feed->loadXML($body, LIBXML_NONET)) {
            throw new FeedSubBadXmlException($sourceurl);
        }

        $this->uri = $sourceurl;
        $this->type = $type;
        $this->feed = $feed;

        // Looking for the "root" element: RSS channel or Atom feed
        $el = $this->feed->documentElement;
        if ($el->tagName == 'rss') {
            $channels = $el->getElementsByTagName('channel');
            if ($channels->length == 0) {
                throw new FeedSubBadXmlException($sourceurl);
            }
            $this->root = $channels->item(0);
        } else if ($el->tagName == 'feed') {
            $this->root = $el;
        } else {
            throw new FeedSubBadXmlException($sourceurl);
        }

        return $this->uri;
    }

    /**
     * Search an HTML page for feed autodiscovery links
     * (<link rel="alternate" type="application/atom+xml|application/rss+xml">).
     * Atom is preferred over RSS; within a type the first link wins.
     *
     * @param string $url source URL, used to resolve relative links
     * @param string $body HTML body text
     * @return mixed string with URL or false if no target found
     * @throws FeedSubBadHTMLException if the body is empty or cannot be parsed
     */
    public function discoverFromHTML($url, $body)
    {
        // Empty input used to make loadHTML() return false; newer PHP
        // versions raise a ValueError instead, so normalize it here.
        if ((string)$body === '') {
            throw new FeedSubBadHTMLException();
        }

        // DOMDocument::loadHTML may throw warnings on unrecognized elements.
        $old = error_reporting(error_reporting() & ~E_WARNING);
        $dom = new DOMDocument();
        $ok = $dom->loadHTML($body);
        error_reporting($old);
        if (!$ok) {
            throw new FeedSubBadHTMLException();
        }

        // Autodiscovery links may be relative to the page's URL or <base href>.
        // Per the HTML standard the first <base> carrying an href wins.
        $base = false;
        foreach ($dom->getElementsByTagName('base') as $node) {
            if ($node->hasAttribute('href')) {
                $base = trim($node->getAttribute('href'));
                break;
            }
        }
        if ($base) {
            $base = $this->resolveURI($base, $url);
        } else {
            $base = $url;
        }

        // Ok... now on to the links!
        // Types listed in order of priority -- we'll prefer Atom if available.
        // @fixme merge with the munger link checks
        $feeds = array(
            'application/atom+xml' => false,
            'application/rss+xml' => false,
        );
        foreach ($dom->getElementsByTagName('link') as $node) {
            if (!$node->hasAttribute('rel')
                || !$node->hasAttribute('type')
                || !$node->hasAttribute('href')) {
                continue;
            }

            // rel is a whitespace-separated token list; both rel tokens and
            // MIME types are matched case-insensitively.
            $rel = preg_split('/\s+/', strtolower($node->getAttribute('rel')), -1, PREG_SPLIT_NO_EMPTY);
            $type = strtolower(trim($node->getAttribute('type')));
            $href = trim($node->getAttribute('href'));
            if ($href === '') {
                // An empty href points back at this HTML page, not at a feed.
                continue;
            }

            if (in_array('alternate', $rel, true) && array_key_exists($type, $feeds) && empty($feeds[$type])) {
                // Save the first feed found of each type...
                $feeds[$type] = $this->resolveURI($href, $base);
            }
        }

        // Return the highest-priority feed found
        foreach ($feeds as $feedUrl) {
            if ($feedUrl) {
                return $feedUrl;
            }
        }

        return false;
    }

    /**
     * Resolve a possibly relative URL against some absolute base URL
     * @param string $rel relative or absolute URL
     * @param string $base absolute URL
     * @return string absolute URL, or original URL if could not be resolved.
     */
    public function resolveURI($rel, $base)
    {
        require_once "Net/URL2.php";
        try {
            $relUrl = new Net_URL2($rel);
            if ($relUrl->isAbsolute()) {
                return $rel;
            }
            $baseUrl = new Net_URL2($base);
            $absUrl = $baseUrl->resolve($relUrl);
            return $absUrl->getURL();
        } catch (Exception $e) {
            // Best effort: log and hand back the unresolved link.
            common_log(LOG_WARNING, 'Unable to resolve relative link "' .
                $rel . '" against base "' . $base . '": ' . $e->getMessage());
            return $rel;
        }
    }
}