Fix ticket #3057: apply HTML escaping on special characters in Twitter import

Changes the replacement of Twitter "entities" from in-place reverse ordering ('to preserve indices') to a forward-facing append-in-chunks that pulls in both the text and link portions, and escapes them all.
This unfortunately means first *de*-escaping the &lt; and &gt; that Twitter helpfully adds for us.... and any literal &blah;s that get written. This seems to match Twitter's web UI, however horrid it is.
This commit is contained in:
Brion Vibber 2011-02-21 16:18:45 -08:00
parent 2a42dac72a
commit 8eca1b8dac

View File

@ -554,8 +554,8 @@ class TwitterImport
} }
// Move all the entities into order so we can // Move all the entities into order so we can
// replace them in reverse order and thus // replace them and escape surrounding plaintext
// not mess up their indices // in order
$toReplace = array(); $toReplace = array();
@ -577,13 +577,23 @@ class TwitterImport
} }
} }
// sort in reverse order by key // sort in forward order by key
krsort($toReplace); ksort($toReplace);
$result = '';
$cursor = 0;
foreach ($toReplace as $part) { foreach ($toReplace as $part) {
list($type, $object) = $part; list($type, $object) = $part;
$orig = mb_substr($text, $object->indices[0], $object->indices[1] - $object->indices[0]); $start = $object->indices[0];
$end = $object->indices[1];
if ($cursor < $start) {
// Copy in the preceding plaintext
$result .= $this->twitEscape(mb_substr($text, $cursor, $start - $cursor));
$cursor = $start;
}
$orig = $this->twitEscape(mb_substr($text, $start, $end - $start));
switch($type) { switch($type) {
case self::URL: case self::URL:
$linkText = $this->makeUrlLink($object, $orig); $linkText = $this->makeUrlLink($object, $orig);
@ -595,11 +605,29 @@ class TwitterImport
$linkText = $this->makeMentionLink($object, $orig); $linkText = $this->makeMentionLink($object, $orig);
break; break;
default: default:
$linkText = $orig;
continue; continue;
} }
$text = mb_substr($text, 0, $object->indices[0]) . $linkText . mb_substr($text, $object->indices[1]); $result .= $linkText;
$cursor = $end;
} }
return $text; $last = $this->twitEscape(mb_substr($text, $cursor));
$result .= $last;
return $result;
}
/**
 * Normalize the HTML escaping of a piece of tweet text.
 *
 * Decodes the HTML entities in the input and then re-encodes the HTML
 * special characters, so that text which arrives partially escaped
 * comes out escaped consistently.
 *
 * @param string $str text taken from a tweet (plaintext or entity portion)
 *
 * @return string the same text with HTML special characters escaped
 */
function twitEscape($str)
{
// Twitter seems to preemptively turn < and > into &lt; and &gt;
// but doesn't for &, so while you may have some magic protection
// against XSS by not bothering to escape manually, you still get
// invalid XHTML. Thanks!
//
// Looks like their web interface pretty much sends anything
// through intact, so.... to do the equivalent, decode all entities
// and then re-encode the special ones.
//
// NOTE(review): ENT_COMPAT leaves &#039; (single-quote entities)
// undecoded, and htmlspecialchars() runs with its default flags
// here -- confirm that is the intended handling of quotes.
return htmlspecialchars(html_entity_decode($str, ENT_COMPAT, 'UTF-8'));
} }
function makeUrlLink($object, $orig) function makeUrlLink($object, $orig)