gnu-social/plugins/Bookmark/deliciousbackupimporter.php
Brion Vibber 3368c33be7 Bookmark plugin: fix for delicious import with queues enabled
We were passing DOM nodes directly into the queues for the final bookmark import stage; unfortunately these don't actually survive serialization.
Moved the extraction of properties from the HTML up to the first-stage handler, so now we don't have to worry about moving DOM nodes from one handler to the next. Instead passing an associative array of properties, which is fed into the Bookmark::saveNew by the per-bookmark handler.
2010-12-31 12:33:51 -08:00

311 lines
9.3 KiB
PHP

<?php
/**
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2010, StatusNet, Inc.
*
* Importer class for Delicious.com backups
*
* PHP version 5
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @category Bookmark
* @package StatusNet
* @author Evan Prodromou <evan@status.net>
* @copyright 2010 StatusNet, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
if (!defined('STATUSNET')) {
// This check helps protect against security problems;
// your code file can't be executed directly from the web.
exit(1);
}
/**
* Importer class for Delicious bookmarks
*
* @category Bookmark
* @package StatusNet
* @author Evan Prodromou <evan@status.net>
* @copyright 2010 StatusNet, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
class DeliciousBackupImporter extends QueueHandler
{
/**
* Transport of the importer
*
* @return string transport string
*/
function transport()
{
return 'dlcsback';
}
/**
* Import an in-memory bookmark list to a user's account
*
* Take a delicious.com backup file (same as Netscape bookmarks.html)
* and import to StatusNet as Bookmark activities.
*
* The document format is terrible. It consists of a <dl> with
* a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
* There are sometimes <p>'s lost inside.
*
* @param array $data pair of user, text
*
* @return boolean success value
*/
function handle($data)
{
list($user, $body) = $data;
$doc = $this->importHTML($body);
$dls = $doc->getElementsByTagName('dl');
if ($dls->length != 1) {
throw new ClientException(_("Bad import file."));
}
$dl = $dls->item(0);
$children = $dl->childNodes;
$dt = null;
for ($i = 0; $i < $children->length; $i++) {
try {
$child = $children->item($i);
if ($child->nodeType != XML_ELEMENT_NODE) {
continue;
}
switch (strtolower($child->tagName)) {
case 'dt':
// <dt> nodes contain primary information about a bookmark.
// We can't import the current one just yet though, since
// it may be followed by a <dd>.
if (!empty($dt)) {
// No DD provided
$this->importBookmark($user, $dt);
$dt = null;
}
$dt = $child;
break;
case 'dd':
$dd = $child;
// This <dd> contains a description for the bookmark in
// the preceding <dt> node.
$saved = $this->importBookmark($user, $dt, $dd);
$dt = null;
$dd = null;
break;
case 'p':
common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
break;
default:
common_log(LOG_WARNING,
"Unexpected element $child->tagName ".
" found in import.");
}
} catch (Exception $e) {
common_log(LOG_ERR, $e->getMessage());
$dt = $dd = null;
}
}
if (!empty($dt)) {
// There was a final bookmark without a description.
try {
$this->importBookmark($user, $dt);
} catch (Exception $e) {
common_log(LOG_ERR, $e->getMessage());
}
}
return true;
}
/**
* Import a single bookmark
*
* Takes a <dt>/<dd> pair. The <dt> has a single
* <a> in it with some non-standard attributes.
*
* A <dt><dt><dd> sequence will appear as a <dt> with
* anothe <dt> as a child. We handle this case recursively.
*
* @param User $user User to import data as
* @param DOMElement $dt <dt> element
* @param DOMElement $dd <dd> element
*
* @return Notice imported notice
*/
function importBookmark($user, $dt, $dd = null)
{
$as = $dt->getElementsByTagName('a');
if ($as->length == 0) {
throw new ClientException(_("No <A> tag in a <DT>."));
}
$a = $as->item(0);
$private = $a->getAttribute('private');
if ($private != 0) {
throw new ClientException(_('Skipping private bookmark.'));
}
if (!empty($dd)) {
$description = $dd->nodeValue;
} else {
$description = null;
}
$addDate = $a->getAttribute('add_date');
$data = array(
'profile_id' => $user->id,
'title' => $a->nodeValue,
'description' => $description,
'url' => $a->getAttribute('href'),
'tags' => $a->getAttribute('tags'),
'created' => common_sql_date(intval($addDate))
);
$qm = QueueManager::get();
$qm->enqueue($data, 'dlcsbkmk');
}
/**
* Parse some HTML
*
* Hides the errors that the dom parser returns
*
* @param string $body Data to import
*
* @return DOMDocument parsed document
*/
function importHTML($body)
{
// DOMDocument::loadHTML may throw warnings on unrecognized elements,
// and notices on unrecognized namespaces.
$old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
$dom = new DOMDocument();
$ok = $dom->loadHTML($body);
error_reporting($old);
if ($ok) {
foreach ($dom->getElementsByTagName('body') as $node) {
$this->fixListsIn($node);
}
return $dom;
} else {
return null;
}
}
function fixListsIn(DOMNode $body) {
$toFix = array();
foreach ($body->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dl') {
$toFix[] = $node;
}
}
}
foreach ($toFix as $node) {
$this->fixList($node);
}
}
function fixList(DOMNode $list) {
$toFix = array();
foreach ($list->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dt' || $el == 'dd') {
$toFix[] = $node;
}
if ($el == 'dl') {
// Sublist.
// Technically, these can only appear inside a <dd>...
$this->fixList($node);
}
}
}
foreach ($toFix as $node) {
$this->fixListItem($node);
}
}
function fixListItem(DOMNode $item) {
// The HTML parser in libxml2 doesn't seem to properly handle
// many cases of implied close tags, apparently because it doesn't
// understand the nesting rules specified in the HTML DTD.
//
// This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
// interpreted as parent->child trees instead of siblings:
//
// When parsing this input: "<dt>aaa <dt>bbb"
// should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
// but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
//
// It does at least know that going from dt to dd, or dd to dt,
// should make a break.
$toMove = array();
foreach ($item->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dt' || $el == 'dd') {
// dt & dd cannot contain each other;
// This node was incorrectly placed; move it up a level!
$toMove[] = $node;
}
if ($el == 'dl') {
// Sublist.
// Technically, these can only appear inside a <dd>.
$this->fixList($node);
}
}
}
$parent = $item->parentNode;
$next = $item->nextSibling;
foreach ($toMove as $node) {
$item->removeChild($node);
$parent->insertBefore($node, $next);
$this->fixListItem($node);
}
}
}