gnu-social/plugins/Bookmark/deliciousbackupimporter.php
Brion Vibber fedfde9bbb Bookmark plugin: fixes for bad DOM element nesting in delicious import data
delicious bookmark exports use the godawful HTML bookmark file format that ancient versions of Netscape used (and has thus been the common import/export format for bookmarks since the dark ages of the web :)
This arranges bookmark entries as an HTML definition list, using a lot of implied close tags (leaving off the </dt> and </dd>).
DOMDocument->loadHTML() uses libxml2's HTML mode, which generally does ok with muddling through things but apparently is really, really bad about handling those implied close tags.

Sequences of adjacent <dt> elements (eg bookmark without a description, followed by another bookmark "<dt><dt>"), end up interpreted as nested ("<dt><dt></dt></dt>") instead of as siblings ("<dt></dt><dt></dt>").
The first round of code tried to resolve the nesting inline, but ended up a bit funky in places.
I've replaced this with a standalone run through the data to re-order the elements, based on our knowing that <dt> and <dd> cannot directly contain one another; once that's done, our main logic loop can be a bit cleaner. I'm not 100% sure it's doing nested sublists correctly, but these don't seem to show up in delicious export (and even if they do, with the way we flatten the input it shouldn't make a difference).

Also fixed a clearer edge case where some bookmarks didn't get imported when missing descriptions.
2010-12-31 12:09:54 -08:00

282 lines
8.5 KiB
PHP

<?php
/**
* StatusNet - the distributed open-source microblogging tool
* Copyright (C) 2010, StatusNet, Inc.
*
* Importer class for Delicious.com backups
*
* PHP version 5
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @category Bookmark
* @package StatusNet
* @author Evan Prodromou <evan@status.net>
* @copyright 2010 StatusNet, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
if (!defined('STATUSNET')) {
// This check helps protect against security problems;
// your code file can't be executed directly from the web.
exit(1);
}
/**
* Importer class for Delicious bookmarks
*
* @category Bookmark
* @package StatusNet
* @author Evan Prodromou <evan@status.net>
* @copyright 2010 StatusNet, Inc.
* @license http://www.fsf.org/licensing/licenses/agpl-3.0.html AGPL 3.0
* @link http://status.net/
*/
class DeliciousBackupImporter extends QueueHandler
{
/**
* Transport of the importer
*
* @return string transport string
*/
function transport()
{
return 'dlcsback';
}
/**
* Import an in-memory bookmark list to a user's account
*
* Take a delicious.com backup file (same as Netscape bookmarks.html)
* and import to StatusNet as Bookmark activities.
*
* The document format is terrible. It consists of a <dl> with
* a bunch of <dt>'s, occasionally with <dd>'s adding descriptions.
* There are sometimes <p>'s lost inside.
*
* @param array $data pair of user, text
*
* @return boolean success value
*/
function handle($data)
{
list($user, $body) = $data;
$doc = $this->importHTML($body);
$dls = $doc->getElementsByTagName('dl');
if ($dls->length != 1) {
throw new ClientException(_("Bad import file."));
}
$dl = $dls->item(0);
$children = $dl->childNodes;
$dt = null;
for ($i = 0; $i < $children->length; $i++) {
try {
$child = $children->item($i);
if ($child->nodeType != XML_ELEMENT_NODE) {
continue;
}
switch (strtolower($child->tagName)) {
case 'dt':
// <dt> nodes contain primary information about a bookmark.
// We can't import the current one just yet though, since
// it may be followed by a <dd>.
if (!empty($dt)) {
// No DD provided
$this->importBookmark($user, $dt);
$dt = null;
}
$dt = $child;
break;
case 'dd':
$dd = $child;
// This <dd> contains a description for the bookmark in
// the preceding <dt> node.
$saved = $this->importBookmark($user, $dt, $dd);
$dt = null;
$dd = null;
break;
case 'p':
common_log(LOG_INFO, 'Skipping the <p> in the <dl>.');
break;
default:
common_log(LOG_WARNING,
"Unexpected element $child->tagName ".
" found in import.");
}
} catch (Exception $e) {
common_log(LOG_ERR, $e->getMessage());
$dt = $dd = null;
}
}
if (!empty($dt)) {
// There was a final bookmark without a description.
try {
$this->importBookmark($user, $dt);
} catch (Exception $e) {
common_log(LOG_ERR, $e->getMessage());
}
}
return true;
}
/**
* Import a single bookmark
*
* Takes a <dt>/<dd> pair. The <dt> has a single
* <a> in it with some non-standard attributes.
*
* A <dt><dt><dd> sequence will appear as a <dt> with
* anothe <dt> as a child. We handle this case recursively.
*
* @param User $user User to import data as
* @param DOMElement $dt <dt> element
* @param DOMElement $dd <dd> element
*
* @return Notice imported notice
*/
function importBookmark($user, $dt, $dd = null)
{
$qm = QueueManager::get();
$qm->enqueue(array($user, $dt, $dd), 'dlcsbkmk');
}
/**
* Parse some HTML
*
* Hides the errors that the dom parser returns
*
* @param string $body Data to import
*
* @return DOMDocument parsed document
*/
function importHTML($body)
{
// DOMDocument::loadHTML may throw warnings on unrecognized elements,
// and notices on unrecognized namespaces.
$old = error_reporting(error_reporting() & ~(E_WARNING | E_NOTICE));
$dom = new DOMDocument();
$ok = $dom->loadHTML($body);
error_reporting($old);
if ($ok) {
foreach ($dom->getElementsByTagName('body') as $node) {
$this->fixListsIn($node);
}
return $dom;
} else {
return null;
}
}
function fixListsIn(DOMNode $body) {
$toFix = array();
foreach ($body->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dl') {
$toFix[] = $node;
}
}
}
foreach ($toFix as $node) {
$this->fixList($node);
}
}
function fixList(DOMNode $list) {
$toFix = array();
foreach ($list->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dt' || $el == 'dd') {
$toFix[] = $node;
}
if ($el == 'dl') {
// Sublist.
// Technically, these can only appear inside a <dd>...
$this->fixList($node);
}
}
}
foreach ($toFix as $node) {
$this->fixListItem($node);
}
}
function fixListItem(DOMNode $item) {
// The HTML parser in libxml2 doesn't seem to properly handle
// many cases of implied close tags, apparently because it doesn't
// understand the nesting rules specified in the HTML DTD.
//
// This leads to sequences of adjacent <dt>s or <dd>s being incorrectly
// interpreted as parent->child trees instead of siblings:
//
// When parsing this input: "<dt>aaa <dt>bbb"
// should be equivalent to: "<dt>aaa </dt><dt>bbb</dt>"
// but we're seeing instead: "<dt>aaa <dt>bbb</dt></dt>"
//
// It does at least know that going from dt to dd, or dd to dt,
// should make a break.
$toMove = array();
foreach ($item->childNodes as $node) {
if ($node->nodeType == XML_ELEMENT_NODE) {
$el = strtolower($node->nodeName);
if ($el == 'dt' || $el == 'dd') {
// dt & dd cannot contain each other;
// This node was incorrectly placed; move it up a level!
$toMove[] = $node;
}
if ($el == 'dl') {
// Sublist.
// Technically, these can only appear inside a <dd>.
$this->fixList($node);
}
}
}
$parent = $item->parentNode;
$next = $item->nextSibling;
foreach ($toMove as $node) {
$item->removeChild($node);
$parent->insertBefore($node, $next);
$this->fixListItem($node);
}
}
}