diff --git a/lib/activity.php b/lib/activity.php index c67d090f72..5b304020d6 100644 --- a/lib/activity.php +++ b/lib/activity.php @@ -643,38 +643,11 @@ class ActivityObject ); if ($element->tagName == 'author') { - - $this->type = self::PERSON; // XXX: is this fair? - $this->title = $this->_childContent($element, self::NAME); - $this->id = $this->_childContent($element, self::URI); - - if (empty($this->id)) { - $email = $this->_childContent($element, self::EMAIL); - if (!empty($email)) { - // XXX: acct: ? - $this->id = 'mailto:'.$email; - } - } - + $this->_fromAuthor($element); + } else if ($element->tagName == 'item') { + $this->_fromRssItem($element); } else { - - $this->type = $this->_childContent($element, Activity::OBJECTTYPE, - Activity::SPEC); - - if (empty($this->type)) { - $this->type = ActivityObject::NOTE; - } - - $this->id = $this->_childContent($element, self::ID); - $this->title = $this->_childContent($element, self::TITLE); - $this->summary = $this->_childContent($element, self::SUMMARY); - - $this->source = $this->_getSource($element); - - $this->content = ActivityUtils::getContent($element); - - $this->link = ActivityUtils::getPermalink($element); - + $this->_fromAtomEntry($element); } // Some per-type attributes... @@ -697,6 +670,72 @@ class ActivityObject } } + private function _fromAuthor($element) + { + $this->type = self::PERSON; // XXX: is this fair? + $this->title = $this->_childContent($element, self::NAME); + $this->id = $this->_childContent($element, self::URI); + + if (empty($this->id)) { + $email = $this->_childContent($element, self::EMAIL); + if (!empty($email)) { + // XXX: acct: ? + $this->id = 'mailto:'.$email; + } + } + } + + private function _fromAtomEntry($element) + { + $this->type = $this->_childContent($element, Activity::OBJECTTYPE, + Activity::SPEC); + + if (empty($this->type)) { + $this->type = ActivityObject::NOTE; + } + + $this->id = $this->_childContent($element, self::ID); + $this->title = $this->_childContent($element, self::TITLE); + $this->summary = $this->_childContent($element, self::SUMMARY); + + $this->source = $this->_getSource($element); + + $this->content = ActivityUtils::getContent($element); + + $this->link = ActivityUtils::getPermalink($element); + } + + // @fixme rationalize with Activity::_fromRssItem() + + private function _fromRssItem($item) + { + $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, Activity::RSS); + + $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, Activity::CONTENTNS); + + if (!empty($contentEl)) { + $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + } else { + $descriptionEl = ActivityUtils::child($item, Activity::DESCRIPTION, Activity::RSS); + if (!empty($descriptionEl)) { + $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + } + } + + $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, Activity::RSS); + + $guidEl = ActivityUtils::child($item, Activity::GUID, Activity::RSS); + + if (!empty($guidEl)) { + $this->id = $guidEl->textContent; + + if ($guidEl->hasAttribute('isPermaLink')) { + // overwrites + $this->link = $this->id; + } + } + } + private function _childContent($element, $tag, $namespace=ActivityUtils::ATOM) { return ActivityUtils::childContent($element, $tag, $namespace); @@ -1051,6 +1090,21 @@ class Activity const PUBLISHED = 'published'; const UPDATED = 'updated'; + const RSS = null; // no namespace! + + const PUBDATE = 'pubDate'; + const DESCRIPTION = 'description'; + const GUID = 'guid'; + const SELF = 'self'; + const IMAGE = 'image'; + const URL = 'url'; + + const DC = 'http://purl.org/dc/elements/1.1/'; + + const CREATOR = 'creator'; + + const CONTENTNS = 'http://purl.org/rss/1.0/modules/content/'; + public $actor; // an ActivityObject public $verb; // a string (the URL) public $object; // an ActivityObject @@ -1081,8 +1135,6 @@ class Activity return; } - $this->entry = $entry; - // Insist on a feed's root DOMElement; don't allow a DOMDocument if ($feed instanceof DOMDocument) { throw new ClientException( @@ -1090,8 +1142,22 @@ class Activity ); } + $this->entry = $entry; $this->feed = $feed; + if ($entry->namespaceURI == Activity::ATOM && + $entry->localName == 'entry') { + $this->_fromAtomEntry($entry, $feed); + } else if ($entry->namespaceURI == Activity::RSS && + $entry->localName == 'item') { + $this->_fromRssItem($entry, $feed); + } else { + throw new Exception("Unknown DOM element: {$entry->namespaceURI} {$entry->localName}"); + } + } + + function _fromAtomEntry($entry, $feed) + { $pubEl = $this->_child($entry, self::PUBLISHED, self::ATOM); if (!empty($pubEl)) { @@ -1177,6 +1243,69 @@ class Activity } } + function _fromRssItem($item, $rss) + { + $verbEl = $this->_child($item, self::VERB); + + if (!empty($verbEl)) { + $this->verb = trim($verbEl->textContent); + } else { + $this->verb = ActivityVerb::POST; + // XXX: do other implied stuff here + } + + $pubDateEl = $this->_child($item, self::PUBDATE, self::RSS); + + if (!empty($pubDateEl)) { + $this->time = strtotime($pubDateEl->textContent); + } + + $authorEl = $this->_child($item, self::AUTHOR, self::RSS); + + if (!empty($authorEl)) { + $this->actor = $this->_fromRssAuthor($authorEl); + } else { + $dcCreatorEl = $this->_child($item, self::CREATOR, self::DC); + if (!empty($dcCreatorEl)) { + $this->actor = $this->_fromDcCreator($dcCreatorEl); + } else if (!empty($rss)) { + $this->actor = $this->_fromRss($rss); + } + } + + $this->title = ActivityUtils::childContent($item, ActivityObject::TITLE, self::RSS); + + $contentEl = ActivityUtils::child($item, ActivityUtils::CONTENT, self::CONTENTNS); + + if (!empty($contentEl)) { + $this->content = htmlspecialchars_decode($contentEl->textContent, ENT_QUOTES); + } else { + $descriptionEl = ActivityUtils::child($item, self::DESCRIPTION, self::RSS); + if (!empty($descriptionEl)) { + $this->content = htmlspecialchars_decode($descriptionEl->textContent, ENT_QUOTES); + } + } + + $this->link = ActivityUtils::childContent($item, ActivityUtils::LINK, self::RSS); + + // @fixme enclosures + // @fixme thumbnails... maybe + + $guidEl = ActivityUtils::child($item, self::GUID, self::RSS); + + if (!empty($guidEl)) { + $this->id = $guidEl->textContent; + + if ($guidEl->hasAttribute('isPermaLink') && $guidEl->getAttribute('isPermaLink') != 'false') { + // overwrites + $this->link = $this->id; + } + } + + $this->object = new ActivityObject($item); + $this->context = new ActivityContext($item); + } + /** * Returns an Atom based on this activity * @@ -1249,6 +1378,83 @@ class Activity return $xs->getString(); } + function _fromRssAuthor($el) + { + $text = $el->textContent; + + if (preg_match('/^(.*?) \((.*)\)$/', $text, $match)) { + $email = $match[1]; + $name = $match[2]; + } else if (preg_match('/^(.*?) <(.*)>$/', $text, $match)) { + $name = $match[1]; + $email = $match[2]; + } else if (preg_match('/.*@.*/', $text)) { + $email = $text; + $name = null; + } else { + $name = $text; + $email = null; + } + + // Not really enough info + + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->type = ActivityObject::PERSON; + $actor->title = $name; + + if (!empty($email)) { + $actor->id = 'mailto:'.$email; + } + + return $actor; + } + + function _fromDcCreator($el) + { + // Not really enough info + + $text = $el->textContent; + + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->title = $text; + $actor->type = ActivityObject::PERSON; + + return $actor; + } + + function _fromRss($el) + { + $actor = new ActivityObject(); + + $actor->element = $el; + + $actor->type = ActivityObject::PERSON; // @fixme guess better + + $actor->title = ActivityUtils::childContent($el, ActivityObject::TITLE, self::RSS); + $actor->link = ActivityUtils::childContent($el, ActivityUtils::LINK, self::RSS); + $actor->id = ActivityUtils::getLink($el, self::SELF); + + $desc = ActivityUtils::childContent($el, self::DESCRIPTION, self::RSS); + + if (!empty($desc)) { + $actor->content = htmlspecialchars_decode($desc, ENT_QUOTES); + } + + $imageEl = ActivityUtils::child($el, self::IMAGE, self::RSS); + + if (!empty($imageEl)) { + $actor->avatarLinks[] = ActivityUtils::childContent($imageEl, self::URL, self::RSS); + } + + return $actor; + } + private function _child($element, $tag, $namespace=self::SPEC) { return ActivityUtils::child($element, $tag, $namespace); diff --git a/plugins/OStatus/extlib/hkit/hcard.profile.php b/plugins/OStatus/extlib/hkit/hcard.profile.php deleted file mode 100644 index 6ec0dc8906..0000000000 --- a/plugins/OStatus/extlib/hkit/hcard.profile.php +++ /dev/null @@ -1,105 +0,0 @@ -root_class = 'vcard'; - - $this->classes = array( - 'fn', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'n', array('honorific-prefix', 'given-name', 'additional-name', 'family-name', 'honorific-suffix'), - 'adr', array('post-office-box', 'extended-address', 'street-address', 'postal-code', 'country-name', 'type', 'region', 'locality'), - 'label', 'bday', 'agent', 'nickname', 'photo', 'class', - 'email', array('type', 'value'), - 'category', 'key', 'logo', 'mailer', 'note', - 'org', array('organization-name', 'organization-unit'), - 'tel', array('type', 'value'), - 'geo', array('latitude', 'longitude'), - 'tz', 'uid', 'url', 'rev', 'role', 'sort-string', 'sound', 'title' - ); - - // classes that must only appear once per card - $this->singles = array( - 'fn' - ); - - // classes that are required (not strictly enforced - give at least one!) - $this->required = array( - 'fn' - ); - - $this->att_map = array( - 'fn' => array('IMG|alt'), - 'url' => array('A|href', 'IMG|src', 'AREA|href'), - 'photo' => array('IMG|src'), - 'bday' => array('ABBR|title'), - 'logo' => array('IMG|src'), - 'email' => array('A|href'), - 'geo' => array('ABBR|title') - ); - - - $this->callbacks = array( - 'url' => array($this, 'resolvePath'), - 'photo' => array($this, 'resolvePath'), - 'logo' => array($this, 'resolvePath'), - 'email' => array($this, 'resolveEmail') - ); - - - - function hKit_hcard_post($a) - { - - foreach ($a as &$vcard){ - - hKit_implied_n_optimization($vcard); - hKit_implied_n_from_fn($vcard); - - } - - return $a; - - } - - - function hKit_implied_n_optimization(&$vcard) - { - if (array_key_exists('fn', $vcard) && !is_array($vcard['fn']) && - !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - if (sizeof(explode(' ', $vcard['fn'])) == 2){ - $patterns = array(); - $patterns[] = array('/^(\S+),\s*(\S{1})$/', 2, 1); // Lastname, Initial - $patterns[] = array('/^(\S+)\s*(\S{1})\.*$/', 2, 1); // Lastname Initial(.) - $patterns[] = array('/^(\S+),\s*(\S+)$/', 2, 1); // Lastname, Firstname - $patterns[] = array('/^(\S+)\s*(\S+)$/', 1, 2); // Firstname Lastname - - foreach ($patterns as $pattern){ - if (preg_match($pattern[0], $vcard['fn'], $matches) === 1){ - $n = array(); - $n['given-name'] = $matches[$pattern[1]]; - $n['family-name'] = $matches[$pattern[2]]; - $vcard['n'] = $n; - - - break; - } - } - } - } - } - - - function hKit_implied_n_from_fn(&$vcard) - { - if (array_key_exists('fn', $vcard) && is_array($vcard['fn']) - && !array_key_exists('n', $vcard) && (!array_key_exists('org', $vcard) || $vcard['fn'] != $vcard['org'])){ - - $vcard['n'] = $vcard['fn']; - } - - if (array_key_exists('fn', $vcard) && is_array($vcard['fn'])){ - $vcard['fn'] = $vcard['fn']['text']; - } - } - -?> \ No newline at end of file diff --git a/plugins/OStatus/extlib/hkit/hkit.class.php b/plugins/OStatus/extlib/hkit/hkit.class.php deleted file mode 100644 index c3a54cff65..0000000000 --- a/plugins/OStatus/extlib/hkit/hkit.class.php +++ /dev/null @@ -1,475 +0,0 @@ -' . implode(', ', $missing) . ''); - - } - - - public function getByURL($profile='', $url='') - { - - if ($profile=='' || $url == '') return false; - - $this->loadProfile($profile); - - $source = $this->loadURL($url); - - if ($source){ - $tidy_xhtml = $this->tidyThis($source); - - $fragment = false; - - if (strrchr($url, '#')) - $fragment = array_pop(explode('#', $url)); - - $doc = $this->loadDoc($tidy_xhtml, $fragment); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - }else{ - return false; - } - } - - public function getByString($profile='', $input_xml='') - { - if ($profile=='' || $input_xml == '') return false; - - $this->loadProfile($profile); - - $doc = $this->loadDoc($input_xml); - $s = $this->processNodes($doc, $this->classes); - $s = $this->postProcess($profile, $s); - - return $s; - - } - - private function processNodes($items, $classes, $allow_includes=true){ - - $out = array(); - - foreach($items as $item){ - $data = array(); - - for ($i=0; $ixpath($xpath); - - if ($results){ - foreach ($results as $result){ - if (isset($classes[$i+1]) && is_array($classes[$i+1])){ - $nodes = $this->processNodes($results, $classes[$i+1]); - if (sizeof($nodes) > 0){ - $nodes = array_merge(array('text'=>$this->getNodeValue($result, $classes[$i])), $nodes); - $data[$classes[$i]] = $nodes; - }else{ - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - } - - }else{ - if (isset($data[$classes[$i]])){ - if (is_array($data[$classes[$i]])){ - // is already an array - append - $data[$classes[$i]][] = $this->getNodeValue($result, $classes[$i]); - - }else{ - // make it an array - if ($classes[$i] == 'value'){ // unless it's the 'value' of a type/value pattern - $data[$classes[$i]] .= $this->getNodeValue($result, $classes[$i]); - }else{ - $old_val = $data[$classes[$i]]; - $data[$classes[$i]] = array($old_val, $this->getNodeValue($result, $classes[$i])); - $old_val = false; - } - } - }else{ - // set as normal value - $data[$classes[$i]] = $this->getNodeValue($result, $classes[$i]); - - } - } - - // td@headers pattern - if (strtoupper(dom_import_simplexml($result)->tagName)== "TD" && $result['headers']){ - $include_ids = explode(' ', $result['headers']); - $doc = $this->doc; - foreach ($include_ids as $id){ - $xpath = "//*[@id='$id']/.."; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $tmp = $this->processNodes($include, $this->classes); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - } - $result = false; - } - - // include-pattern - if ($allow_includes){ - $xpath = ".//*[contains(concat(' ',normalize-space(@class),' '),' include ')]"; - $results = $item->xpath($xpath); - - if ($results){ - foreach ($results as $result){ - $tagName = strtoupper(dom_import_simplexml($result)->tagName); - if ((($tagName == "OBJECT" && $result['data']) || ($tagName == "A" && $result['href'])) - && preg_match('/\binclude\b/', $result['class'])){ - $att = ($tagName == "OBJECT" ? 'data' : 'href'); - $id = str_replace('#', '', $result[$att]); - $doc = $this->doc; - $xpath = "//*[@id='$id']"; - $includes = $doc->xpath($xpath); - foreach ($includes as $include){ - $include = simplexml_load_string(''.$include->asXML().''); // don't ask. - $tmp = $this->processNodes($include, $this->classes, false); - if (is_array($tmp)) $data = array_merge($data, $tmp); - } - } - } - } - } - $out[] = $data; - } - - if (sizeof($out) > 1){ - return $out; - }else if (isset($data)){ - return $data; - }else{ - return array(); - } - } - - - private function getNodeValue($node, $className) - { - - $tag_name = strtoupper(dom_import_simplexml($node)->tagName); - $s = false; - - // ignore DEL tags - if ($tag_name == 'DEL') return $s; - - // look up att map values - if (array_key_exists($className, $this->att_map)){ - - foreach ($this->att_map[$className] as $map){ - if (preg_match("/$tag_name\|/", $map)){ - $s = ''.$node[array_pop($foo = explode('|', $map))]; - } - } - } - - // if nothing and OBJ, try data. - if (!$s && $tag_name=='OBJECT' && $node['data']) $s = ''.$node['data']; - - // if nothing and IMG, try alt. - if (!$s && $tag_name=='IMG' && $node['alt']) $s = ''.$node['alt']; - - // if nothing and AREA, try alt. - if (!$s && $tag_name=='AREA' && $node['alt']) $s = ''.$node['alt']; - - //if nothing and not A, try title. - if (!$s && $tag_name!='A' && $node['title']) $s = ''.$node['title']; - - - // if nothing found, go with node text - $s = ($s ? $s : implode(array_filter($node->xpath('child::node()'), array(&$this, "filterBlankValues")), ' ')); - - // callbacks - if (array_key_exists($className, $this->callbacks)){ - $s = preg_replace_callback('/.*/', $this->callbacks[$className], $s, 1); - } - - // trim and remove line breaks - if ($tag_name != 'PRE'){ - $s = trim(preg_replace('/[\r\n\t]+/', '', $s)); - $s = trim(preg_replace('/(\s{2})+/', ' ', $s)); - } - - return $s; - } - - private function filterBlankValues($s){ - return preg_match("/\w+/", $s); - } - - - private function tidyThis($source) - { - switch ( $this->tidy_mode ) - { - case 'exec': - $tmp_file = $this->tmp_dir.md5($source).'.txt'; - file_put_contents($tmp_file, $source); - exec("tidy -utf8 -indent -asxhtml -numeric -bare -quiet $tmp_file", $tidy); - unlink($tmp_file); - return implode("\n", $tidy); - break; - - case 'php': - $tidy = tidy_parse_string($source); - return tidy_clean_repair($tidy); - break; - - default: - return $source; - break; - } - - } - - - private function loadProfile($profile) - { - require_once("$profile.profile.php"); - } - - - private function loadDoc($input_xml, $fragment=false) - { - $xml = simplexml_load_string($input_xml); - - $this->doc = $xml; - - if ($fragment){ - $doc = $xml->xpath("//*[@id='$fragment']"); - $xml = simplexml_load_string($doc[0]->asXML()); - $doc = null; - } - - // base tag - if ($xml->head->base['href']) $this->base = $xml->head->base['href']; - - // xml:base attribute - PITA with SimpleXML - preg_match('/xml:base="(.*)"/', $xml->asXML(), $matches); - if (is_array($matches) && sizeof($matches)>1) $this->base = $matches[1]; - - return $xml->xpath("//*[contains(concat(' ',normalize-space(@class),' '),' $this->root_class ')]"); - - } - - - private function loadURL($url) - { - $this->url = $url; - - if ($this->tidy_mode == 'proxy' && $this->tidy_proxy != ''){ - $url = $this->tidy_proxy . $url; - } - - return @file_get_contents($url); - - } - - - private function postProcess($profile, $s) - { - $required = $this->required; - - if (is_array($s) && array_key_exists($required[0], $s)){ - $s = array($s); - } - - $s = $this->dedupeSingles($s); - - if (function_exists('hKit_'.$profile.'_post')){ - $s = call_user_func('hKit_'.$profile.'_post', $s); - } - - $s = $this->removeTextVals($s); - - return $s; - } - - - private function resolvePath($filepath) - { // ugly code ahoy: needs a serious tidy up - - $filepath = $filepath[0]; - - $base = $this->base; - $url = $this->url; - - if ($base != '' && strpos($base, '://') !== false) - $url = $base; - - $r = parse_url($url); - $domain = $r['scheme'] . '://' . $r['host']; - - if (!isset($r['path'])) $r['path'] = '/'; - $path = explode('/', $r['path']); - $file = explode('/', $filepath); - $new = array(''); - - if (strpos($filepath, '://') !== false || strpos($filepath, 'data:') !== false){ - return $filepath; - } - - if ($file[0] == ''){ - // absolute path - return ''.$domain . implode('/', $file); - }else{ - // relative path - if ($path[sizeof($path)-1] == '') array_pop($path); - if (strpos($path[sizeof($path)-1], '.') !== false) array_pop($path); - - foreach ($file as $segment){ - if ($segment == '..'){ - array_pop($path); - }else{ - $new[] = $segment; - } - } - return ''.$domain . implode('/', $path) . implode('/', $new); - } - } - - private function resolveEmail($v) - { - $parts = parse_url($v[0]); - return ($parts['path']); - } - - - private function dedupeSingles($s) - { - $singles = $this->singles; - - foreach ($s as &$item){ - foreach ($singles as $classname){ - if (array_key_exists($classname, $item) && is_array($item[$classname])){ - if (isset($item[$classname][0])) $item[$classname] = $item[$classname][0]; - } - } - } - - return $s; - } - - private function removeTextVals($s) - { - foreach ($s as $key => &$val){ - if ($key){ - $k = $key; - }else{ - $k = ''; - } - - if (is_array($val)){ - $val = $this->removeTextVals($val); - }else{ - if ($k == 'text'){ - $val = ''; - } - } - } - - return array_filter($s); - } - - } - - -?> \ No newline at end of file diff --git a/plugins/OStatus/lib/discoveryhints.php b/plugins/OStatus/lib/discoveryhints.php index 4da2ec0f1e..1bb0ad2aea 100644 --- a/plugins/OStatus/lib/discoveryhints.php +++ b/plugins/OStatus/lib/discoveryhints.php @@ -63,54 +63,12 @@ class DiscoveryHints { static function hcardHints($body, $url) { - common_debug("starting tidy"); - - $body = self::_tidy($body, $url); - - common_debug("done with tidy"); - - set_include_path(get_include_path() . PATH_SEPARATOR . INSTALLDIR . '/plugins/OStatus/extlib/hkit/'); - require_once('hkit.class.php'); - - // hKit code is not clean for notices and warnings - $old = error_reporting(); - error_reporting($old & ~E_NOTICE & ~E_WARNING); - - $h = new hKit; - $hcards = $h->getByString('hcard', $body); - - error_reporting($old); - - if (empty($hcards)) { - return array(); - } - - if (count($hcards) == 1) { - $hcard = $hcards[0]; - } else { - foreach ($hcards as $try) { - if (array_key_exists('url', $try)) { - if (is_string($try['url']) && $try['url'] == $url) { - $hcard = $try; - break; - } else if (is_array($try['url'])) { - foreach ($try['url'] as $tryurl) { - if ($tryurl == $url) { - $hcard = $try; - break 2; - } - } - } - } - } - // last chance; grab the first one - if (empty($hcard)) { - $hcard = $hcards[0]; - } - } + $hcard = self::_hcard($body, $url); $hints = array(); + // XXX: don't copy stuff into an array and then copy it again + if (array_key_exists('nickname', $hcard)) { $hints['nickname'] = $hcard['nickname']; } @@ -122,7 +80,7 @@ class DiscoveryHints { } if (array_key_exists('photo', $hcard)) { - $hints['avatar'] = $hcard['photo']; + $hints['avatar'] = $hcard['photo'][0]; } if (array_key_exists('note', $hcard)) { @@ -149,61 +107,142 @@ class DiscoveryHints { return $hints; } - /** - * hKit needs well-formed XML for its parsing. - * We'll take the HTML body here and normalize it to XML. - * - * @param string $body HTML document source, possibly not-well-formed - * @param string $url source URL - * @return string well-formed XML document source - * @throws Exception if HTML parsing failed. - */ - private static function _tidy($body, $url) + static function _hcard($body, $url) { - if (empty($body)) { - throw new Exception("Empty HTML could not be parsed."); - } - $dom = new DOMDocument(); + // DOMDocument::loadHTML may throw warnings on unrecognized elements. - // Some HTML errors will trigger warnings, but still work. - $old = error_reporting(); - error_reporting($old & ~E_WARNING); - - $ok = $dom->loadHTML($body); + $old = error_reporting(error_reporting() & ~E_WARNING); + + $doc = new DOMDocument(); + $doc->loadHTML($body); error_reporting($old); - - if ($ok) { - // hKit doesn't give us a chance to pass the source URL for - // resolving relative links, such as the avatar photo on a - // Google profile. We'll slip it into a tag if there's - // not already one present. - $bases = $dom->getElementsByTagName('base'); - if ($bases && $bases->length >= 1) { - $base = $bases->item(0); - if ($base->hasAttribute('href')) { - $base->setAttribute('href', $url); - } - } else { - $base = $dom->createElement('base'); - $base->setAttribute('href', $url); - $heads = $dom->getElementsByTagName('head'); - if ($heads || $heads->length) { - $head = $heads->item(0); - } else { - $head = $dom->createElement('head'); - $root = $dom->documentRoot; - if ($root->firstChild) { - $root->insertBefore($head, $root->firstChild); - } else { - $root->appendChild($head); - } - } - $head->appendChild($base); + + $xp = new DOMXPath($doc); + + $hcardNodes = self::_getChildrenByClass($doc->documentElement, 'vcard', $xp); + + $hcards = array(); + + for ($i = 0; $i < $hcardNodes->length; $i++) { + + $hcardNode = $hcardNodes->item($i); + + $hcard = self::_hcardFromNode($hcardNode, $xp, $url); + + $hcards[] = $hcard; + } + + $repr = null; + + foreach ($hcards as $hcard) { + if (in_array($url, $hcard['url'])) { + $repr = $hcard; + break; } - return $dom->saveXML(); + } + + if (!is_null($repr)) { + return $repr; + } else if (count($hcards) > 0) { + return $hcards[0]; } else { - throw new Exception("Invalid HTML could not be parsed."); + return null; } } + + function _getChildrenByClass($el, $cls, $xp) + { + // borrowed from hkit. Thanks dudes! + + $qry = ".//*[contains(concat(' ',normalize-space(@class),' '),' $cls ')]"; + + $nodes = $xp->query($qry, $el); + + return $nodes; + } + + function _hcardFromNode($hcardNode, $xp, $base) + { + $hcard = array(); + + $hcard['url'] = array(); + + $urlNodes = self::_getChildrenByClass($hcardNode, 'url', $xp); + + for ($j = 0; $j < $urlNodes->length; $j++) { + + $urlNode = $urlNodes->item($j); + + if ($urlNode->hasAttribute('href')) { + $url = $urlNode->getAttribute('href'); + } else { + $url = $urlNode->textContent; + } + + $hcard['url'][] = self::_rel2abs($url, $base); + } + + $hcard['photo'] = array(); + + $photoNodes = self::_getChildrenByClass($hcardNode, 'photo', $xp); + + for ($j = 0; $j < $photoNodes->length; $j++) { + $photoNode = $photoNodes->item($j); + if ($photoNode->hasAttribute('src')) { + $url = $photoNode->getAttribute('src'); + } else if ($photoNode->hasAttribute('href')) { + $url = $photoNode->getAttribute('href'); + } else { + $url = $photoNode->textContent; + } + $hcard['photo'][] = self::_rel2abs($url, $base); + } + + $singles = array('nickname', 'note', 'fn', 'n', 'adr'); + + foreach ($singles as $single) { + + $nodes = self::_getChildrenByClass($hcardNode, $single, $xp); + + if ($nodes->length > 0) { + $node = $nodes->item(0); + $hcard[$single] = $node->textContent; + } + } + + return $hcard; + } + + // XXX: this is a first pass; we probably need + // to handle things like ../ and ./ and so on + + static function _rel2abs($rel, $wrt) + { + $parts = parse_url($rel); + + if ($parts === false) { + return false; + } + + // If it's got a scheme, use it + + if ($parts['scheme'] != '') { + return $rel; + } + + $w = parse_url($wrt); + + $base = $w['scheme'].'://'.$w['host']; + + if ($rel[0] == '/') { + return $base.$rel; + } + + $wp = explode('/', $w['path']); + + array_pop($wp); + + return $base.implode('/', $wp).'/'.$rel; + } } diff --git a/tests/ActivityParseTests.php b/tests/ActivityParseTests.php index 7bf9cec7c4..b6980a6bb9 100644 --- a/tests/ActivityParseTests.php +++ b/tests/ActivityParseTests.php @@ -138,9 +138,38 @@ class ActivityParseTests extends PHPUnit_Framework_TestCase $this->assertEquals($poco->urls[0]->value, 'http://example.com/blog.html'); $this->assertEquals($poco->urls[0]->primary, 'true'); $this->assertEquals($act->actor->geopoint, '37.7749295 -122.4194155'); - } + public function testExample6() + { + global $_example6; + + $dom = DOMDocument::loadXML($_example6); + + $rss = $dom->documentElement; + + $channels = $dom->getElementsByTagName('channel'); + + $channel = $channels->item(0); + + $items = $channel->getElementsByTagName('item'); + + $item = $items->item(0); + + $act = new Activity($item, $channel); + + $this->assertEquals($act->verb, ActivityVerb::POST); + + $this->assertEquals($act->id, 'http://en.blog.wordpress.com/?p=3857'); + $this->assertEquals($act->link, 'http://en.blog.wordpress.com/2010/03/03/rub-a-dub-dub-in-the-pubsubhubbub/'); + $this->assertEquals($act->title, 'Rub-a-Dub-Dub in the PubSubHubbub'); + $this->assertEquals($act->time, 1267634892); + + $actor = $act->actor; + + $this->assertFalse(empty($actor)); + $this->assertEquals($actor->title, "Joseph Scott"); + } } $_example1 = << EXAMPLE5; + +$_example6 = << + + + + WordPress.com News + + http://en.blog.wordpress.com + The latest news on WordPress.com and the WordPress community. + Thu, 18 Mar 2010 23:25:35 +0000 + + http://wordpress.com/ + en + hourly + 1 + + + http://www.gravatar.com/blavatar/e6392390e3bcfadff3671c5a5653d95b?s=96&d=http://s2.wp.com/i/buttonw-com.png + WordPress.com News + http://en.blog.wordpress.com + + + +EXAMPLE6; +