Merge branch 'testing' of gitorious.org:statusnet/mainline into testing

* 'testing' of gitorious.org:statusnet/mainline:
  Parse RSS items as activities
  Remove hkit and do our own hcard parsing
  Work around weird bug with HTML normalization via PHP DOM module; if source had xmlns and xml:lang I ended up with double output, breaking the subsequent parsing. Will have to track this down later and report upstream if not already resolved.
This commit is contained in:
Zach Copley 2010-03-19 10:08:47 -07:00
commit fb50a2d83c
5 changed files with 467 additions and 709 deletions

View File

@ -643,38 +643,11 @@ class ActivityObject
);
if ($element->tagName == 'author') {
$this->type = self::PERSON; // XXX: is this fair?
$this->title = $this->_childContent($element, self::NAME);
$this->id = $this->_childContent($element, self::URI);
if (empty($this->id)) {
$email = $this->_childContent($element, self::EMAIL);
if (!empty($email)) {
// XXX: acct: ?
$this->id = 'mailto:'.$email;
}
}
$this->_fromAuthor($element);
} else if ($element->tagName == 'item') {
$this->_fromRssItem($element);
} else {
$this->type = $this->_childContent($element, Activity::OBJECTTYPE,
Activity::SPEC);
if (empty($this->type)) {
$this->type = ActivityObject::NOTE;
}
$this->id = $this->_childContent($element, self::ID);
$this->title = $this->_childContent($element, self::TITLE);
$this->summary = $this->_childContent($element, self::SUMMARY);
$this->source = $this->_getSource($element);
$this->content = ActivityUtils::getContent($element);
$this->link = ActivityUtils::getPermalink($element);
$this->_fromAtomEntry($element);
}
// Some per-type attributes...
@ -697,6 +670,72 @@ class ActivityObject
}
}
/**
 * Populate this object from an author element.
 *
 * Always marks the object as a person, takes the title from the name
 * child and the id from the URI child. When no URI is present, the
 * e-mail child (if any) is turned into a mailto: id instead.
 *
 * @param DOMElement $element author element to read from
 *
 * @return void
 */
private function _fromAuthor($element)
{
    $this->type  = self::PERSON; // XXX: is this fair?
    $this->title = $this->_childContent($element, self::NAME);

    $uri      = $this->_childContent($element, self::URI);
    $this->id = $uri;
    if (!empty($uri)) {
        return;
    }

    // No URI: fall back on the e-mail address, if there is one.
    $address = $this->_childContent($element, self::EMAIL);
    if (empty($address)) {
        return;
    }

    // XXX: acct: ?
    $this->id = 'mailto:'.$address;
}
/**
 * Populate this object from an entry-style element.
 *
 * The object type comes from the activity object-type child and
 * defaults to a note when that child is missing or empty. The other
 * fields are copied from the corresponding children and helpers.
 *
 * @param DOMElement $element element to read from
 *
 * @return void
 */
private function _fromAtomEntry($element)
{
    $declaredType = $this->_childContent(
        $element,
        Activity::OBJECTTYPE,
        Activity::SPEC
    );
    $this->type = empty($declaredType) ? ActivityObject::NOTE : $declaredType;

    $this->id      = $this->_childContent($element, self::ID);
    $this->title   = $this->_childContent($element, self::TITLE);
    $this->summary = $this->_childContent($element, self::SUMMARY);

    $this->source  = $this->_getSource($element);
    $this->content = ActivityUtils::getContent($element);
    $this->link    = ActivityUtils::getPermalink($element);
}
/**
 * Populate this object from an RSS <item> element.
 *
 * Reads title, content (falling back to the description child), link
 * and guid. A guid only replaces the link when it carries an
 * isPermaLink attribute whose value is not "false"; this matches the
 * check made in Activity::_fromRssItem().
 *
 * @fixme rationalize with Activity::_fromRssItem()
 *
 * @param DOMElement $item RSS item element
 *
 * @return void
 */
private function _fromRssItem($item)
{
    $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, Activity::RSS);

    // Prefer the content-module element; otherwise use <description>.
    $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, Activity::CONTENTNS);

    if (empty($contentEl)) {
        $contentEl = ActivityUtils::child($item, Activity::DESCRIPTION, Activity::RSS);
    }

    if (!empty($contentEl)) {
        $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES);
    }

    $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, Activity::RSS);

    $guidEl = ActivityUtils::child($item, Activity::GUID, Activity::RSS);

    if (!empty($guidEl)) {
        $this->id = $guidEl->textContent;

        // An explicit isPermaLink="false" means the guid is an opaque
        // identifier, not a URL, so it must not replace <link>.
        if ($guidEl->hasAttribute('isPermaLink')
            && $guidEl->getAttribute('isPermaLink') != 'false') {
            // overwrites <link>
            $this->link = $this->id;
        }
    }
}
private function _childContent($element, $tag, $namespace=ActivityUtils::ATOM)
{
return ActivityUtils::childContent($element, $tag, $namespace);
@ -1051,6 +1090,21 @@ class Activity
const PUBLISHED = 'published';
const UPDATED = 'updated';
// Namespace used when looking up plain RSS elements.
const RSS = null; // no namespace!
// Local names of RSS elements and link relations read by the RSS parsing code.
const PUBDATE = 'pubDate';
const DESCRIPTION = 'description';
const GUID = 'guid';
const SELF = 'self';
const IMAGE = 'image';
const URL = 'url';
// Dublin Core namespace and the creator element looked up in it.
const DC = 'http://purl.org/dc/elements/1.1/';
const CREATOR = 'creator';
// Namespace of the RSS 1.0 content module.
const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/';
public $actor; // an ActivityObject
public $verb; // a string (the URL)
public $object; // an ActivityObject
@ -1081,8 +1135,6 @@ class Activity
return;
}
$this->entry = $entry;
// Insist on a feed's root DOMElement; don't allow a DOMDocument
if ($feed instanceof DOMDocument) {
throw new ClientException(
@ -1090,8 +1142,22 @@ class Activity
);
}
$this->entry = $entry;
$this->feed = $feed;
if ($entry->namespaceURI == Activity::ATOM &&
$entry->localName == 'entry') {
$this->_fromAtomEntry($entry, $feed);
} else if ($entry->namespaceURI == Activity::RSS &&
$entry->localName == 'item') {
$this->_fromRssItem($entry, $feed);
} else {
throw new Exception("Unknown DOM element: {$entry->namespaceURI} {$entry->localName}");
}
}
function _fromAtomEntry($entry, $feed)
{
$pubEl = $this->_child($entry, self::PUBLISHED, self::ATOM);
if (!empty($pubEl)) {
@ -1177,6 +1243,69 @@ class Activity
}
}
/**
 * Fill in this activity from an RSS <item> element.
 *
 * The verb defaults to a post when the item has no verb child. The
 * actor is taken from the item's author, else its dc:creator, else
 * from the enclosing feed element when one was supplied.
 *
 * @param DOMElement $item RSS item element
 * @param DOMElement $rss  enclosing feed element used as the actor
 *                         fallback; may be empty
 *
 * @return void
 */
function _fromRssItem($item, $rss)
{
    $verbNode = $this->_child($item, self::VERB);
    if (empty($verbNode)) {
        $this->verb = ActivityVerb::POST;
        // XXX: do other implied stuff here
    } else {
        $this->verb = trim($verbNode->textContent);
    }

    $dateNode = $this->_child($item, self::PUBDATE, self::RSS);
    if (!empty($dateNode)) {
        $this->time = strtotime($dateNode->textContent);
    }

    // Actor: <author>, then <dc:creator>, then the feed itself.
    $authorNode = $this->_child($item, self::AUTHOR, self::RSS);
    if (!empty($authorNode)) {
        $this->actor = $this->_fromRssAuthor($authorNode);
    } else {
        $creatorNode = $this->_child($item, self::CREATOR, self::DC);
        if (!empty($creatorNode)) {
            $this->actor = $this->_fromDcCreator($creatorNode);
        } else if (!empty($rss)) {
            $this->actor = $this->_fromRss($rss);
        }
    }

    $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS);

    // Body: content-module element first, <description> as fallback.
    $bodyNode = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS);
    if (empty($bodyNode)) {
        $bodyNode = ActivityUtils::child($item, self::DESCRIPTION, self::RSS);
    }
    if (!empty($bodyNode)) {
        $this->content = htmlspecialchars_decode($bodyNode->textContent, ENT_QUOTES);
    }

    $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, self::RSS);

    // @fixme enclosures
    // @fixme thumbnails... maybe

    $guidNode = ActivityUtils::child($item, self::GUID, self::RSS);
    if (!empty($guidNode)) {
        $this->id = $guidNode->textContent;

        if ($guidNode->hasAttribute('isPermaLink')) {
            if ($guidNode->getAttribute('isPermaLink') != 'false') {
                // overwrites <link>
                $this->link = $this->id;
            }
        }
    }

    $this->object  = new ActivityObject($item);
    $this->context = new ActivityContext($item);
}
/**
* Returns an Atom <entry> based on this activity
*
@ -1249,6 +1378,83 @@ class Activity
return $xs->getString();
}
/**
 * Build a person object from the text of an RSS <author> element.
 *
 * Recognised forms, tried in this order:
 *   "email (Name)", "Name <email>", anything containing "@" (treated
 *   as a bare e-mail address), and finally a bare name.
 *
 * Surrounding whitespace is stripped first: the patterns are anchored,
 * so a pretty-printed element would otherwise fall through to the
 * bare-address branch and produce a mailto: id containing the name.
 *
 * @param DOMElement $el author element
 *
 * @return ActivityObject person object; id is set only when an e-mail
 *                        address was found
 */
function _fromRssAuthor($el)
{
    $text = trim($el->textContent);

    if (preg_match('/^(.*?) \((.*)\)$/', $text, $match)) {
        $email = trim($match[1]);
        $name  = trim($match[2]);
    } else if (preg_match('/^(.*?) <(.*)>$/', $text, $match)) {
        $name  = trim($match[1]);
        $email = trim($match[2]);
    } else if (preg_match('/.*@.*/', $text)) {
        $email = $text;
        $name  = null;
    } else {
        $name  = $text;
        $email = null;
    }

    // Not really enough info

    $actor = new ActivityObject();

    $actor->element = $el;

    $actor->type  = ActivityObject::PERSON;
    $actor->title = $name;

    if (!empty($email)) {
        $actor->id = 'mailto:'.$email;
    }

    return $actor;
}
/**
 * Build a person object from a dc:creator element.
 *
 * Only the element's text is available, so it becomes the title and
 * no id is set.
 *
 * @param DOMElement $el creator element
 *
 * @return ActivityObject
 */
function _fromDcCreator($el)
{
    // Not really enough info
    $creator = new ActivityObject();

    $creator->element = $el;
    $creator->title   = $el->textContent;
    $creator->type    = ActivityObject::PERSON;

    return $creator;
}
/**
 * Build an actor object describing the feed itself.
 *
 * Used when an item names no author of its own. Title, link and
 * description are read from the element's children, the id from its
 * "self" link, and the image URL (when an image child exists) is
 * appended to the avatar links.
 *
 * @param DOMElement $el feed-level element
 *
 * @return ActivityObject
 */
function _fromRss($el)
{
    $feedActor = new ActivityObject();

    $feedActor->element = $el;
    $feedActor->type    = ActivityObject::PERSON; // @fixme guess better

    $feedActor->title = ActivityUtils::childContent($el, ActivityObject::TITLE, self::RSS);
    $feedActor->link  = ActivityUtils::childContent($el, ActivityUtils::LINK, self::RSS);
    $feedActor->id    = ActivityUtils::getLink($el, self::SELF);

    $description = ActivityUtils::childContent($el, self::DESCRIPTION, self::RSS);
    if (!empty($description)) {
        $feedActor->content = htmlspecialchars_decode($description, ENT_QUOTES);
    }

    $imageNode = ActivityUtils::child($el, self::IMAGE, self::RSS);
    if (!empty($imageNode)) {
        $imageUrl = ActivityUtils::childContent($imageNode, self::URL, self::RSS);
        $feedActor->avatarLinks[] = $imageUrl;
    }

    return $feedActor;
}
private function _child($element, $tag, $namespace=self::SPEC)
{
return ActivityUtils::child($element, $tag, $namespace);

View File

@ -1,105 +0,0 @@
<?php
// hcard profile for hkit
// NOTE(review): this file assigns to $this, so it is presumably meant to be
// included from inside an hKit method (hKit::loadProfile() does a
// require_once of "<profile>.profile.php") -- confirm.
// Class name that marks the root element of one card.
$this->root_class = 'vcard';
// Class names to extract. A name immediately followed by an array declares
// the nested sub-property class names looked up beneath that class.
$this->classes = array(
'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'),
'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'),
'label', 'bday', 'agent', 'nickname', 'photo', 'class',
'email', array('type', 'value'),
'category', 'key', 'logo', 'mailer', 'note',
'org', array('organization-name', 'organization-unit'),
'tel', array('type', 'value'),
'geo', array('latitude', 'longitude'),
'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title'
);
// classes that must only appear once per card
$this->singles = array(
'fn'
);
// classes that are required (not strictly enforced - give at least one!)
$this->required = array(
'fn'
);
// For each class, "TAG|attribute" pairs: when the matched element has that
// tag name, its value is read from the named attribute instead of its text.
$this->att_map = array(
'fn' => array('IMG|alt'),
'url' => array('A|href', 'IMG|src', 'AREA|href'),
'photo' => array('IMG|src'),
'bday' => array('ABBR|title'),
'logo' => array('IMG|src'),
'email' => array('A|href'),
'geo' => array('ABBR|title')
);
// Per-class callbacks applied to an extracted value.
$this->callbacks = array(
'url' => array($this, 'resolvePath'),
'photo' => array($this, 'resolvePath'),
'logo' => array($this, 'resolvePath'),
'email' => array($this, 'resolveEmail')
);
/**
 * Post-processing hook for the hcard profile.
 *
 * Runs both implied-"n" passes over every card in the list.
 *
 * @param array $a list of card arrays
 *
 * @return array the same list with 'n' filled in where it could be implied
 */
function hKit_hcard_post($a)
{
    foreach ($a as $idx => $unused) {
        hKit_implied_n_optimization($a[$idx]);
        hKit_implied_n_from_fn($a[$idx]);
    }
    return $a;
}
/**
 * Derive an 'n' entry from a plain-string two-word 'fn'.
 *
 * Nothing happens when 'fn' is missing or is an array, when 'n' is
 * already present, when 'fn' equals 'org', or when 'fn' does not split
 * into exactly two space-separated parts. Otherwise the first matching
 * pattern decides which part is the given name and which the family name.
 *
 * @param array &$vcard card array, modified in place
 *
 * @return void
 */
function hKit_implied_n_optimization(&$vcard)
{
    if (!array_key_exists('fn', $vcard) || is_array($vcard['fn'])) {
        return;
    }
    if (array_key_exists('n', $vcard)) {
        return;
    }
    if (array_key_exists('org', $vcard) && $vcard['fn'] == $vcard['org']) {
        return;
    }
    if (count(explode(' ', $vcard['fn'])) != 2) {
        return;
    }

    // Each rule: pattern, capture index of given name, capture index of family name.
    $rules = array(
        array('/^(\S+),\s*(\S{1})$/', 2, 1),    // Lastname, Initial
        array('/^(\S+)\s*(\S{1})\.*$/', 2, 1),  // Lastname Initial(.)
        array('/^(\S+),\s*(\S+)$/', 2, 1),      // Lastname, Firstname
        array('/^(\S+)\s*(\S+)$/', 1, 2),       // Firstname Lastname
    );

    foreach ($rules as $rule) {
        list($regex, $givenIdx, $familyIdx) = $rule;
        if (preg_match($regex, $vcard['fn'], $hit) === 1) {
            $vcard['n'] = array(
                'given-name'  => $hit[$givenIdx],
                'family-name' => $hit[$familyIdx],
            );
            return;
        }
    }
}
/**
 * Handle an 'fn' that was parsed into an array of sub-properties.
 *
 * When there is no 'n' yet (and 'fn' differs from 'org'), the whole
 * array is copied to 'n'. In every case an array 'fn' is then reduced
 * to its 'text' element.
 *
 * @param array &$vcard card array, modified in place
 *
 * @return void
 */
function hKit_implied_n_from_fn(&$vcard)
{
    $structuredFn = array_key_exists('fn', $vcard) && is_array($vcard['fn']);
    if (!$structuredFn) {
        return;
    }

    if (!array_key_exists('n', $vcard)
        && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])) {
        $vcard['n'] = $vcard['fn'];
    }

    $vcard['fn'] = $vcard['fn']['text'];
}
?>

View File

@ -1,475 +0,0 @@
<?php
/*
hKit Library for PHP5 - a generic library for parsing Microformats
Copyright (C) 2006 Drew McLellan
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Author
Drew McLellan - http://allinthehead.com/
Contributors:
Scott Reynen - http://www.randomchaos.com/
Version 0.5, 22-Jul-2006
fixed by-ref issue cropping up in PHP 5.0.5
fixed a bug with a@title
added support for new fn=n optimisation
added support for new a.include include-pattern
Version 0.4, 23-Jun-2006
prevented nested includes from causing infinite loops
returns false if URL can't be fetched
added pre-flight check for base support level
added deduping of once-only classnames
prevented accumulation of multiple 'value' values
tuned whitespace handling and treatment of DEL elements
Version 0.3, 21-Jun-2006
added post-processor callback method into profiles
fixed minor problems raised by hcard testsuite
added support for include-pattern
added support for td@headers pattern
added implied-n optimization into default hcard profile
Version 0.2, 20-Jun-2006
added class callback mechanism
added resolvePath & resolveEmail
added basic BASE support
Version 0.1.1, 19-Jun-2006 (different timezone, no time machine)
added external Tidy option
Version 0.1, 20-Jun-2006
initial release
*/
class hKit
{
public $tidy_mode = 'proxy'; // 'proxy', 'exec', 'php' or 'none'
public $tidy_proxy = 'http://cgi.w3.org/cgi-bin/tidy?forceXML=on&docAddr='; // required only for tidy_mode=proxy
public $tmp_dir = '/path/to/writable/dir/'; // required only for tidy_mode=exec
private $root_class = '';
private $classes = '';
private $singles = '';
private $required = '';
private $att_map = '';
private $callbacks = '';
private $processor = '';
private $url = '';
private $base = '';
private $doc = '';
/**
 * Pre-flight check: terminate the script if any function the library
 * depends on is unavailable.
 */
public function hKit()
{
    $missing = array();

    foreach (array('dom_import_simplexml', 'file_get_contents', 'simplexml_load_string') as $fn) {
        if (!function_exists($fn)) {
            $missing[] = $fn . '()';
        }
    }

    if (count($missing) > 0) {
        die('hKit error: these required functions are not available: <strong>' . implode(', ', $missing) . '</strong>');
    }
}
/**
 * Fetch a document by URL and extract microformat data from it.
 *
 * @param string $profile profile name passed to loadProfile()
 * @param string $url     document URL; the part after the last '#', if
 *                        any, selects the element with that id
 *
 * @return array|false extracted data, or false when an argument is
 *                     empty or the document could not be fetched
 */
public function getByURL($profile='', $url='')
{
    if ($profile == '' || $url == '') {
        return false;
    }

    $this->loadProfile($profile);

    $source = $this->loadURL($url);
    if (!$source) {
        return false;
    }

    $tidy_xhtml = $this->tidyThis($source);

    $fragment = false;
    if (strrchr($url, '#')) {
        // array_pop() takes its argument by reference, so the exploded
        // pieces must be held in a variable first.
        $pieces   = explode('#', $url);
        $fragment = array_pop($pieces);
    }

    $doc = $this->loadDoc($tidy_xhtml, $fragment);
    $s   = $this->processNodes($doc, $this->classes);
    $s   = $this->postProcess($profile, $s);

    return $s;
}
/**
 * Extract microformat data from an XML string.
 *
 * @param string $profile   profile name passed to loadProfile()
 * @param string $input_xml document source
 *
 * @return array|false extracted data, or false when an argument is empty
 */
public function getByString($profile='', $input_xml='')
{
    if ($profile == '' || $input_xml == '') {
        return false;
    }

    $this->loadProfile($profile);

    $roots = $this->loadDoc($input_xml);
    $found = $this->processNodes($roots, $this->classes);

    return $this->postProcess($profile, $found);
}
/**
 * Walk a list of nodes and collect the values of the given classes.
 *
 * $classes is a flat list in which a class name may be followed by an
 * array of sub-class names; such a pair is handled by recursing with the
 * sub-class list.
 *
 * @param mixed $items          nodes to process (iterated with foreach;
 *                              each must offer an xpath() method)
 * @param array $classes        class names, optionally followed by arrays
 *                              of sub-class names
 * @param bool  $allow_includes whether to follow the include-pattern
 *
 * @return array list of per-item data arrays when more than one item was
 *               processed, the single data array when there was one,
 *               or an empty array when $items was empty
 */
private function processNodes($items, $classes, $allow_includes=true){
$out = array();
foreach($items as $item){
$data = array();
for ($i=0; $i<sizeof($classes); $i++){
// array entries are sub-class lists belonging to the previous name; skip them here
if (!is_array($classes[$i])){
$xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' " . $classes[$i] . " ')]";
$results = $item->xpath($xpath);
if ($results){
foreach ($results as $result){
if (isset($classes[$i+1]) && is_array($classes[$i+1])){
// class has sub-properties: recurse with the sub-class list
// NOTE(review): the recursion is given $results (all matches), not $result -- confirm intended
$nodes = $this->processNodes($results, $classes[$i+1]);
if (sizeof($nodes) > 0){
// keep the element's own value under 'text' next to the sub-properties
$nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes);
$data[$classes[$i]] = $nodes;
}else{
$data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
}
}else{
if (isset($data[$classes[$i]])){
if (is_array($data[$classes[$i]])){
// is already an array - append
$data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]);
}else{
// make it an array
if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern
$data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]);
}else{
$old_val = $data[$classes[$i]];
$data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i]));
$old_val = false;
}
}
}else{
// set as normal value
$data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]);
}
}
// td@headers pattern
// a TD with a headers attribute pulls in data from the parents of the referenced ids
if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){
$include_ids = explode(' ', $result['headers']);
$doc = $this->doc;
foreach ($include_ids as $id){
$xpath = "//*[@id='$id']/..";
$includes = $doc->xpath($xpath);
foreach ($includes as $include){
$tmp = $this->processNodes($include, $this->classes);
if (is_array($tmp)) $data = array_merge($data, $tmp);
}
}
}
}
}
}
$result = false;
}
// include-pattern
if ($allow_includes){
$xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]";
$results = $item->xpath($xpath);
if ($results){
foreach ($results as $result){
$tagName = strtoupper(dom_import_simplexml($result)->tagName);
// only OBJECT@data and A@href elements can reference an include target
if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href']))
&& preg_match('/\binclude\b/', $result['class'])){
$att = ($tagName == "OBJECT" ? 'data' : 'href');
$id = str_replace('#', '', $result[$att]);
$doc = $this->doc;
$xpath = "//*[@id='$id']";
$includes = $doc->xpath($xpath);
foreach ($includes as $include){
$include = simplexml_load_string('<root1><root2>'.$include->asXML().'</root2></root1>'); // don't ask.
// includes are disabled for the nested call (third argument false)
$tmp = $this->processNodes($include, $this->classes, false);
if (is_array($tmp)) $data = array_merge($data, $tmp);
}
}
}
}
}
$out[] = $data;
}
if (sizeof($out) > 1){
return $out;
}else if (isset($data)){
return $data;
}else{
return array();
}
}
/**
 * Extract the value of one matched element for a given class name.
 *
 * Lookup order: the profile's attribute map for this class and tag,
 * then OBJECT@data, IMG@alt, AREA@alt, @title (except on A), and
 * finally the element's non-blank child nodes joined by spaces. The
 * profile callback for the class, if any, is then applied, and
 * whitespace is normalised unless the element is a PRE.
 *
 * @param SimpleXMLElement $node      matched element
 * @param string           $className class name it was matched for
 *
 * @return string|false the value, or false for DEL elements
 */
private function getNodeValue($node, $className)
{
    $tag_name = strtoupper(dom_import_simplexml($node)->tagName);
    $s = false;

    // ignore DEL tags
    if ($tag_name == 'DEL') {
        return $s;
    }

    // look up att map values
    if (array_key_exists($className, $this->att_map)) {
        foreach ($this->att_map[$className] as $map) {
            // Entries look like "TAG|attribute". Compare the tag part
            // exactly, so that "A" does not also match "AREA|href".
            $pieces = explode('|', $map);
            if (count($pieces) > 1 && $pieces[0] === $tag_name) {
                $s = '' . $node[end($pieces)];
            }
        }
    }

    // if nothing and OBJ, try data.
    if (!$s && $tag_name == 'OBJECT' && $node['data']) $s = '' . $node['data'];

    // if nothing and IMG, try alt.
    if (!$s && $tag_name == 'IMG' && $node['alt']) $s = '' . $node['alt'];

    // if nothing and AREA, try alt.
    if (!$s && $tag_name == 'AREA' && $node['alt']) $s = '' . $node['alt'];

    // if nothing and not A, try title.
    if (!$s && $tag_name != 'A' && $node['title']) $s = '' . $node['title'];

    // if nothing found, go with node text
    if (!$s) {
        $children = $node->xpath('child::node()');
        if (!is_array($children)) {
            $children = array();
        }
        // Separator first: the (array, separator) argument order is no
        // longer accepted by current PHP versions.
        $s = implode(' ', array_filter($children, array($this, 'filterBlankValues')));
    }

    // callbacks
    if (array_key_exists($className, $this->callbacks)) {
        $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1);
    }

    // trim and remove line breaks
    if ($tag_name != 'PRE') {
        $s = trim(preg_replace('/[\r\n\t]+/', '', $s));
        $s = trim(preg_replace('/(\s{2})+/', ' ', $s));
    }

    return $s;
}
/**
 * array_filter() callback: keep only values that contain at least one
 * word character.
 *
 * @param mixed $s value to test
 *
 * @return int|false result of preg_match()
 */
private function filterBlankValues($s){
    $hasWordChars = preg_match("/\w+/", $s);
    return $hasWordChars;
}
/**
 * Clean up markup according to $this->tidy_mode.
 *
 * 'exec' writes the source to a temporary file under $this->tmp_dir and
 * runs the external tidy binary on it; 'php' uses the tidy extension;
 * any other mode returns the source unchanged.
 *
 * @param string $source markup to clean
 *
 * @return string cleaned markup
 */
private function tidyThis($source)
{
    switch ($this->tidy_mode) {
    case 'exec':
        $tmp_file = $this->tmp_dir . md5($source) . '.txt';
        file_put_contents($tmp_file, $source);
        $output = array();
        // Quote the path so that a tmp_dir containing spaces or shell
        // metacharacters cannot break or alter the command.
        exec('tidy -utf8 -indent -asxhtml -numeric -bare -quiet ' . escapeshellarg($tmp_file), $output);
        unlink($tmp_file);
        return implode("\n", $output);

    case 'php':
        $tidy = tidy_parse_string($source);
        // tidy_clean_repair() only reports success as a boolean; the
        // repaired markup has to be read back from the tidy object.
        tidy_clean_repair($tidy);
        return tidy_get_output($tidy);

    default:
        return $source;
    }
}
/**
 * Include the profile file "<profile>.profile.php" (at most once per
 * request). The include runs inside this method, so the profile file
 * executes with $this in scope.
 *
 * @param string $profile profile name
 *
 * @return void
 */
private function loadProfile($profile)
{
    require_once($profile . '.profile.php');
}
/**
 * Parse the document and return the root elements of the current profile.
 *
 * The full parsed document is remembered in $this->doc. When a fragment
 * id is given, the search is narrowed to the first element with that id.
 * The base URL is recorded in $this->base from a <base href> tag and/or an
 * xml:base attribute; when both are present the xml:base value wins because
 * it is assigned last.
 *
 * @param string       $input_xml document source
 * @param string|false $fragment  optional element id to restrict to
 *
 * @return array|false result of the xpath() query for elements whose class
 *                     contains $this->root_class
 */
private function loadDoc($input_xml, $fragment=false)
{
$xml = simplexml_load_string($input_xml);
$this->doc = $xml;
if ($fragment){
// re-parse just the fragment so later queries see only that subtree
// NOTE(review): $doc[0] is used without checking that the id was found
$doc = $xml->xpath("//*[@id='$fragment']");
$xml = simplexml_load_string($doc[0]->asXML());
$doc = null;
}
// base tag
if ($xml->head->base['href']) $this->base = $xml->head->base['href'];
// xml:base attribute - PITA with SimpleXML
preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches);
if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1];
return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]");
}
/**
 * Fetch a document, going through the tidy proxy when tidy_mode is
 * 'proxy' and a proxy URL is configured. The original URL is kept in
 * $this->url.
 *
 * @param string $url document URL
 *
 * @return string|false document source, or false on failure (warnings
 *                      are suppressed)
 */
private function loadURL($url)
{
    $this->url = $url;

    $viaProxy = ($this->tidy_mode == 'proxy') && ($this->tidy_proxy != '');
    $target   = $viaProxy ? $this->tidy_proxy . $url : $url;

    return @file_get_contents($target);
}
/**
 * Normalise extraction results.
 *
 * A single result (recognised by the first required class being a key)
 * is wrapped into a list; once-only classes are de-duplicated; the
 * profile's hKit_<profile>_post() hook is called when it exists; and
 * 'text' helper values are removed.
 *
 * @param string $profile profile name
 * @param mixed  $s       raw result from processNodes()
 *
 * @return array
 */
private function postProcess($profile, $s)
{
    if (is_array($s) && array_key_exists($this->required[0], $s)) {
        $s = array($s);
    }

    $s = $this->dedupeSingles($s);

    $hook = 'hKit_'.$profile.'_post';
    if (function_exists($hook)) {
        $s = call_user_func($hook, $s);
    }

    return $this->removeTextVals($s);
}
/**
 * Callback: resolve a possibly-relative path against the document URL.
 *
 * The reference URL is $this->url, or $this->base when that is set and
 * contains "://".
 *
 * @param array $filepath regex match array; element 0 is the path
 *
 * @return string the path unchanged when it contains "://" or "data:",
 *                otherwise scheme://host followed by the resolved path
 */
private function resolvePath($filepath)
{ // ugly code ahoy: needs a serious tidy up
$filepath = $filepath[0];
$base = $this->base;
$url = $this->url;
if ($base != '' && strpos($base, '://') !== false)
$url = $base;
$r = parse_url($url);
$domain = $r['scheme'] . '://' . $r['host'];
if (!isset($r['path'])) $r['path'] = '/';
$path = explode('/', $r['path']);
$file = explode('/', $filepath);
// leading empty element makes the later implode() start with '/'
$new = array('');
if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){
return $filepath;
}
if ($file[0] == ''){
// absolute path
return ''.$domain . implode('/', $file);
}else{
// relative path
// drop a trailing empty segment and a trailing file name (anything containing '.')
if ($path[sizeof($path)-1] == '') array_pop($path);
if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path);
foreach ($file as $segment){
if ($segment == '..'){
// go up one directory of the reference path
array_pop($path);
}else{
$new[] = $segment;
}
}
return ''.$domain . implode('/', $path) . implode('/', $new);
}
}
/**
 * Callback: reduce an e-mail URL to the path component reported by
 * parse_url().
 *
 * @param array $v regex match array; element 0 is the URL
 *
 * @return string path component of the URL
 */
private function resolveEmail($v)
{
    $components = parse_url($v[0]);
    $address    = $components['path'];
    return ($address);
}
/**
 * For every class listed in $this->singles, collapse a list value to
 * its first element.
 *
 * @param array $s list of result arrays
 *
 * @return array the list with once-only classes reduced to one value
 */
private function dedupeSingles($s)
{
    foreach ($s as $idx => $unused) {
        foreach ($this->singles as $name) {
            if (!array_key_exists($name, $s[$idx])) {
                continue;
            }
            if (!is_array($s[$idx][$name])) {
                continue;
            }
            if (isset($s[$idx][$name][0])) {
                $s[$idx][$name] = $s[$idx][$name][0];
            }
        }
    }
    return $s;
}
/**
 * Recursively blank every scalar stored under the key 'text' and drop
 * all empty entries.
 *
 * @param array $s result array
 *
 * @return array filtered copy
 */
private function removeTextVals($s)
{
    foreach ($s as $key => $unused) {
        if (is_array($s[$key])) {
            $s[$key] = $this->removeTextVals($s[$key]);
            continue;
        }

        // Falsy keys (such as index 0) are compared as '' instead.
        $label = $key ? $key : '';
        if ($label == 'text') {
            $s[$key] = '';
        }
    }

    return array_filter($s);
}
}
?>

View File

@ -63,54 +63,12 @@ class DiscoveryHints {
static function hcardHints($body, $url)
{
common_debug("starting tidy");
$body = self::_tidy($body, $url);
common_debug("done with tidy");
set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/');
require_once('hkit.class.php');
// hKit code is not clean for notices and warnings
$old = error_reporting();
error_reporting($old & ~E_NOTICE & ~E_WARNING);
$h = new hKit;
$hcards = $h->getByString('hcard', $body);
error_reporting($old);
if (empty($hcards)) {
return array();
}
if (count($hcards) == 1) {
$hcard = $hcards[0];
} else {
foreach ($hcards as $try) {
if (array_key_exists('url', $try)) {
if (is_string($try['url']) && $try['url'] == $url) {
$hcard = $try;
break;
} else if (is_array($try['url'])) {
foreach ($try['url'] as $tryurl) {
if ($tryurl == $url) {
$hcard = $try;
break 2;
}
}
}
}
}
// last chance; grab the first one
if (empty($hcard)) {
$hcard = $hcards[0];
}
}
$hcard = self::_hcard($body, $url);
$hints = array();
// XXX: don't copy stuff into an array and then copy it again
if (array_key_exists('nickname', $hcard)) {
$hints['nickname'] = $hcard['nickname'];
}
@ -122,7 +80,7 @@ class DiscoveryHints {
}
if (array_key_exists('photo', $hcard)) {
$hints['avatar'] = $hcard['photo'];
$hints['avatar'] = $hcard['photo'][0];
}
if (array_key_exists('note', $hcard)) {
@ -149,61 +107,142 @@ class DiscoveryHints {
return $hints;
}
/**
* hKit needs well-formed XML for its parsing.
* We'll take the HTML body here and normalize it to XML.
*
* @param string $body HTML document source, possibly not-well-formed
* @param string $url source URL
* @return string well-formed XML document source
* @throws Exception if HTML parsing failed.
*/
private static function _tidy($body, $url)
static function _hcard($body, $url)
{
if (empty($body)) {
throw new Exception("Empty HTML could not be parsed.");
}
$dom = new DOMDocument();
// DOMDocument::loadHTML may throw warnings on unrecognized elements.
// Some HTML errors will trigger warnings, but still work.
$old = error_reporting();
error_reporting($old & ~E_WARNING);
$old = error_reporting(error_reporting() & ~E_WARNING);
$ok = $dom->loadHTML($body);
$doc = new DOMDocument();
$doc->loadHTML($body);
error_reporting($old);
if ($ok) {
// hKit doesn't give us a chance to pass the source URL for
// resolving relative links, such as the avatar photo on a
// Google profile. We'll slip it into a <base> tag if there's
// not already one present.
$bases = $dom->getElementsByTagName('base');
if ($bases && $bases->length >= 1) {
$base = $bases->item(0);
if ($base->hasAttribute('href')) {
$base->setAttribute('href', $url);
}
} else {
$base = $dom->createElement('base');
$base->setAttribute('href', $url);
$heads = $dom->getElementsByTagName('head');
if ($heads || $heads->length) {
$head = $heads->item(0);
} else {
$head = $dom->createElement('head');
$root = $dom->documentRoot;
if ($root->firstChild) {
$root->insertBefore($head, $root->firstChild);
} else {
$root->appendChild($head);
}
}
$head->appendChild($base);
$xp = new DOMXPath($doc);
$hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp);
$hcards = array();
for ($i = 0; $i < $hcardNodes->length; $i++) {
$hcardNode = $hcardNodes->item($i);
$hcard = self::_hcardFromNode($hcardNode, $xp, $url);
$hcards[] = $hcard;
}
$repr = null;
foreach ($hcards as $hcard) {
if (in_array($url, $hcard['url'])) {
$repr = $hcard;
break;
}
return $dom->saveXML();
}
if (!is_null($repr)) {
return $repr;
} else if (count($hcards) > 0) {
return $hcards[0];
} else {
throw new Exception("Invalid HTML could not be parsed.");
return null;
}
}
/**
 * Find all descendants of an element that carry a given class name.
 *
 * Declared static because it is invoked as self::_getChildrenByClass()
 * from static code and uses no instance state.
 *
 * @param DOMNode  $el  context node to search beneath
 * @param string   $cls class name to look for (matched as a whole word
 *                      within the class attribute)
 * @param DOMXPath $xp  XPath evaluator bound to the node's document
 *
 * @return DOMNodeList matching elements
 */
static function _getChildrenByClass($el, $cls, $xp)
{
    // borrowed from hkit. Thanks dudes!

    $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]";

    $nodes = $xp->query($qry, $el);

    return $nodes;
}
/**
 * Extract the hCard properties we care about from one vcard element.
 *
 * 'url' and 'photo' are always present as (possibly empty) lists of
 * absolute URLs; values that cannot be resolved are skipped.
 * 'nickname', 'note', 'fn', 'n' and 'adr' are set to the text of the
 * first matching element, and are absent when nothing matches.
 *
 * Declared static because it is invoked as self::_hcardFromNode() from
 * static code and uses no instance state.
 *
 * @param DOMElement $hcardNode element carrying the vcard class
 * @param DOMXPath   $xp        XPath evaluator for the document
 * @param string     $base      URL that relative links are resolved against
 *
 * @return array hcard data
 */
static function _hcardFromNode($hcardNode, $xp, $base)
{
    $hcard = array('url' => array(), 'photo' => array());

    foreach (self::_getChildrenByClass($hcardNode, 'url', $xp) as $urlNode) {
        if ($urlNode->hasAttribute('href')) {
            $raw = $urlNode->getAttribute('href');
        } else {
            $raw = $urlNode->textContent;
        }

        // _rel2abs() returns false for URLs it cannot parse; do not
        // record those as if they were URLs.
        $abs = self::_rel2abs($raw, $base);
        if ($abs !== false) {
            $hcard['url'][] = $abs;
        }
    }

    foreach (self::_getChildrenByClass($hcardNode, 'photo', $xp) as $photoNode) {
        if ($photoNode->hasAttribute('src')) {
            $raw = $photoNode->getAttribute('src');
        } else if ($photoNode->hasAttribute('href')) {
            $raw = $photoNode->getAttribute('href');
        } else {
            $raw = $photoNode->textContent;
        }

        $abs = self::_rel2abs($raw, $base);
        if ($abs !== false) {
            $hcard['photo'][] = $abs;
        }
    }

    $singles = array('nickname', 'note', 'fn', 'n', 'adr');

    foreach ($singles as $single) {
        $nodes = self::_getChildrenByClass($hcardNode, $single, $xp);

        if ($nodes->length > 0) {
            $hcard[$single] = $nodes->item(0)->textContent;
        }
    }

    return $hcard;
}
/**
 * Resolve a possibly-relative URL against a base URL.
 *
 * XXX: this is a first pass; we probably need
 * to handle things like ../ and ./ and so on
 *
 * @param string $rel URL to resolve; returned unchanged when it already
 *                    has a scheme
 * @param string $wrt absolute URL of the document $rel was found in
 *
 * @return string|false absolute URL, or false when either URL cannot be
 *                      parsed or $wrt has no scheme or host
 */
static function _rel2abs($rel, $wrt)
{
    $parts = parse_url($rel);

    if ($parts === false) {
        return false;
    }

    // If it's got a scheme, use it
    // (parse_url() omits the key entirely when there is none)
    if (!empty($parts['scheme'])) {
        return $rel;
    }

    $w = parse_url($wrt);

    if ($w === false || empty($w['scheme']) || empty($w['host'])) {
        return false;
    }

    // Protocol-relative reference ("//host/path"): only the scheme is
    // inherited from the base.
    if (!empty($parts['host'])) {
        return $w['scheme'].':'.$rel;
    }

    $base = $w['scheme'].'://'.$w['host'];

    // Keep a non-default port, otherwise links point at the wrong server.
    if (isset($w['port'])) {
        $base .= ':'.$w['port'];
    }

    if (strncmp($rel, '/', 1) === 0) {
        return $base.$rel;
    }

    // Relative to the directory of the base path; a base URL without a
    // path is treated as "/".
    $wp = explode('/', isset($w['path']) ? $w['path'] : '/');

    array_pop($wp);

    return $base.implode('/', $wp).'/'.$rel;
}
}

View File

@ -138,9 +138,38 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase
$this->assertEquals($poco->urls[0]->value, 'http://example.com/blog.html');
$this->assertEquals($poco->urls[0]->primary, 'true');
$this->assertEquals($act->actor->geopoint, '37.7749295 -122.4194155');
}
/**
 * An RSS 2.0 item is parsed into an Activity: implied post verb, guid
 * as id, <link> kept as the link (the guid is isPermaLink="false"),
 * pubDate as the timestamp and dc:creator as the actor.
 */
public function testExample6()
{
    global $_example6;

    // loadXML() is an instance method; calling it statically is not
    // supported on current PHP versions.
    $dom = new DOMDocument();
    $this->assertTrue($dom->loadXML($_example6));

    $channels = $dom->getElementsByTagName('channel');

    $channel = $channels->item(0);

    $items = $channel->getElementsByTagName('item');

    $item = $items->item(0);

    $act = new Activity($item, $channel);

    // assertEquals() takes the expected value first.
    $this->assertEquals(ActivityVerb::POST, $act->verb);

    $this->assertEquals('http://en.blog.wordpress.com/?p=3857', $act->id);
    $this->assertEquals('http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/', $act->link);
    $this->assertEquals('Rub-a-Dub-Dub in the PubSubHubbub', $act->title);
    $this->assertEquals(1267634892, $act->time);

    $actor = $act->actor;

    $this->assertFalse(empty($actor));
    $this->assertEquals("Joseph Scott", $actor->title);
}
}
$_example1 = <<<EXAMPLE1
@ -330,3 +359,67 @@ $_example5 = <<<EXAMPLE5
</entry>
</feed>
EXAMPLE5;
$_example6 = <<<EXAMPLE6
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
xmlns:georss="http://www.georss.org/georss" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:media="http://search.yahoo.com/mrss/"
>
<channel>
<title>WordPress.com News</title>
<atom:link href="http://en.blog.wordpress.com/feed/" rel="self" type="application/rss+xml" />
<link>http://en.blog.wordpress.com</link>
<description>The latest news on WordPress.com and the WordPress community.</description>
<lastBuildDate>Thu, 18 Mar 2010 23:25:35 +0000</lastBuildDate>
<generator>http://wordpress.com/</generator>
<language>en</language>
<sy:updatePeriod>hourly</sy:updatePeriod>
<sy:updateFrequency>1</sy:updateFrequency>
<cloud domain='en.blog.wordpress.com' port='80' path='/?rsscloud=notify' registerProcedure='' protocol='http-post' />
<image>
<url>http://www.gravatar.com/blavatar/e6392390e3bcfadff3671c5a5653d95b?s=96&#038;d=http://s2.wp.com/i/buttonw-com.png</url>
<title>WordPress.com News</title>
<link>http://en.blog.wordpress.com</link>
</image>
<atom:link rel="search" type="application/opensearchdescription+xml" href="http://en.blog.wordpress.com/osd.xml" title="WordPress.com News" />
<atom:link rel='hub' href='http://en.blog.wordpress.com/?pushpress=hub'/>
<item>
<title>Rub-a-Dub-Dub in the PubSubHubbub</title>
<link>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/</link>
<comments>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/#comments</comments>
<pubDate>Wed, 03 Mar 2010 16:48:12 +0000</pubDate>
<dc:creator>Joseph Scott</dc:creator>
<category><![CDATA[Feeds]]></category>
<category><![CDATA[atom]]></category>
<category><![CDATA[pubsubhubbub]]></category>
<category><![CDATA[rss]]></category>
<guid isPermaLink="false">http://en.blog.wordpress.com/?p=3857</guid>
<description><![CDATA[From the tongue twisting name department we welcome PubSubHubbub, or as some people have shortened it to: PuSH. Like rssCloud, PuSH is a way for services that subscribe to updates from your blog (think Google Reader, Bloglines or Netvibes) to get updates even faster. In a nutshell, instead of having to periodically ask [...]<img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=en.blog.wordpress.com&blog=3584907&post=3857&subd=en.blog&ref=&feed=1" />]]></description>
<content:encoded><![CDATA[<p>From the tongue twisting name department we welcome <a href="http://code.google.com/p/pubsubhubbub/">PubSubHubbub</a>, or as some people have shortened it to: PuSH. Like <a href="http://en.blog.wordpress.com/2009/09/07/rss-in-the-clouds/">rssCloud</a>, PuSH is a way for services that subscribe to updates from your blog (think Google Reader, Bloglines or Netvibes) to get updates even faster. In a nutshell, instead of having to periodically ask your blog if there are any updates they can now register to automatically receive updates each time you publish new content. In most cases these updates are sent out within a second or two of when you hit the publish button.</p>
<p>Today we&#8217;ve turned on PuSH support for the more than 10.5 million blogs on WordPress.com. There&#8217;s nothing to configure, it&#8217;s working right now behind the scenes to help others keep up to date with your posts.</p>
<p>For those using the WordPress.org software we are releasing a new PuSH plugin: <a href="http://wordpress.org/extend/plugins/pushpress/">PuSHPress</a>. This plugin differs from the current PuSH related plugins by including a built-in hub.</p>
<p>For more PuSH related reading check out the <a href="http://code.google.com/p/pubsubhubbub/">PubSubHubbub project site</a> and <a href="http://groups.google.com/group/pubsubhubbub?pli=1">Google Group</a>. And if you really want to geek out there&#8217;s always the <a href="http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.3.html">PubSubHubbub Spec</a> <img src='http://s.wordpress.com/wp-includes/images/smilies/icon_smile.gif' alt=':-)' class='wp-smiley' /> </p>
<br /> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gocomments/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/comments/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godelicious/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/delicious/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/gostumble/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/stumble/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/godigg/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/digg/en.blog.wordpress.com/3857/" /></a> <a rel="nofollow" href="http://feeds.wordpress.com/1.0/goreddit/en.blog.wordpress.com/3857/"><img alt="" border="0" src="http://feeds.wordpress.com/1.0/reddit/en.blog.wordpress.com/3857/" /></a> <img alt="" border="0" src="http://stats.wordpress.com/b.gif?host=en.blog.wordpress.com&blog=3584907&post=3857&subd=en.blog&ref=&feed=1" />]]></content:encoded>
<wfw:commentRss>http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/feed/</wfw:commentRss>
<slash:comments>96</slash:comments>
<media:content url="http://1.gravatar.com/avatar/582b66ad5ae1b69c7601a990cb9a661a?s=96&#38;d=identicon" medium="image">
<media:title type="html">josephscott</media:title>
</media:content>
</item>
</channel>
</rss>
EXAMPLE6;