Fix ticket #3057: apply HTML escaping on special characters in Twitter import
Changes the replacement of Twitter "entities" from in-place reverse ordering ('to preserve indices') to a forward-facing append-in-chunks that pulls in both the text and link portions, and escapes them all. This unfortunately means first *de*-escaping the &lt; and &gt; that Twitter helpfully adds for us.... and any literal &blah;s that get written. This seems to match Twitter's web UI, however horrid it is.
This commit is contained in:
parent
2a42dac72a
commit
8eca1b8dac
|
@ -554,8 +554,8 @@ class TwitterImport
|
||||||
}
|
}
|
||||||
|
|
||||||
// Move all the entities into order so we can
|
// Move all the entities into order so we can
|
||||||
// replace them in reverse order and thus
|
// replace them and escape surrounding plaintext
|
||||||
// not mess up their indices
|
// in order
|
||||||
|
|
||||||
$toReplace = array();
|
$toReplace = array();
|
||||||
|
|
||||||
|
@ -577,13 +577,23 @@ class TwitterImport
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// sort in reverse order by key
|
// sort in forward order by key
|
||||||
|
|
||||||
krsort($toReplace);
|
ksort($toReplace);
|
||||||
|
|
||||||
|
$result = '';
|
||||||
|
$cursor = 0;
|
||||||
|
|
||||||
foreach ($toReplace as $part) {
|
foreach ($toReplace as $part) {
|
||||||
list($type, $object) = $part;
|
list($type, $object) = $part;
|
||||||
$orig = mb_substr($text, $object->indices[0], $object->indices[1] - $object->indices[0]);
|
$start = $object->indices[0];
|
||||||
|
$end = $object->indices[1];
|
||||||
|
if ($cursor < $start) {
|
||||||
|
// Copy in the preceding plaintext
|
||||||
|
$result .= $this->twitEscape(mb_substr($text, $cursor, $start - $cursor));
|
||||||
|
$cursor = $start;
|
||||||
|
}
|
||||||
|
$orig = $this->twitEscape(mb_substr($text, $start, $end - $start));
|
||||||
switch($type) {
|
switch($type) {
|
||||||
case self::URL:
|
case self::URL:
|
||||||
$linkText = $this->makeUrlLink($object, $orig);
|
$linkText = $this->makeUrlLink($object, $orig);
|
||||||
|
@ -595,11 +605,29 @@ class TwitterImport
|
||||||
$linkText = $this->makeMentionLink($object, $orig);
|
$linkText = $this->makeMentionLink($object, $orig);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
$linkText = $orig;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$text = mb_substr($text, 0, $object->indices[0]) . $linkText . mb_substr($text, $object->indices[1]);
|
$result .= $linkText;
|
||||||
|
$cursor = $end;
|
||||||
}
|
}
|
||||||
return $text;
|
$last = $this->twitEscape(mb_substr($text, $cursor));
|
||||||
|
$result .= $last;
|
||||||
|
|
||||||
|
return $result;
|
||||||
|
}
|
||||||
|
|
||||||
|
function twitEscape($str)
|
||||||
|
{
|
||||||
|
// Twitter seems to preemptively turn < and > into &lt; and &gt;
|
||||||
|
// but doesn't for &, so while you may have some magic protection
|
||||||
|
// against XSS by not bothering to escape manually, you still get
|
||||||
|
// invalid XHTML. Thanks!
|
||||||
|
//
|
||||||
|
// Looks like their web interface pretty much sends anything
|
||||||
|
// through intact, so.... to do equivalent, decode all entities
|
||||||
|
// and then re-encode the special ones.
|
||||||
|
return htmlspecialchars(html_entity_decode($str, ENT_COMPAT, 'UTF-8'));
|
||||||
}
|
}
|
||||||
|
|
||||||
function makeUrlLink($object, $orig)
|
function makeUrlLink($object, $orig)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user