<?php
/**
 * Phergie
 *
 * PHP version 5
 *
 * LICENSE
 *
 * This source file is subject to the new BSD license that is bundled
 * with this package in the file LICENSE.
 * It is also available through the world-wide-web at this URL:
 * http://phergie.org/license
 *
 * @category  Phergie
 * @package   Phergie_Plugin_Encoding
 * @author    Phergie Development Team <team@phergie.org>
 * @copyright 2008-2010 Phergie Development Team (http://phergie.org)
 * @license   http://phergie.org/license New BSD License
 * @link      http://pear.phergie.org/package/Phergie_Plugin_Encoding
 */

/**
 * Handles decoding markup entities and converting text between character
 * encodings.
 *
 * @category Phergie
 * @package  Phergie_Plugin_Encoding
 * @author   Phergie Development Team <team@phergie.org>
 * @license  http://phergie.org/license New BSD License
 * @link     http://pear.phergie.org/package/Phergie_Plugin_Encoding
 */
class Phergie_Plugin_Encoding extends Phergie_Plugin_Abstract
{
    /**
     * Lookup table for entity conversions not supported by
     * html_entity_decode()
     *
     * Keys are case-sensitive named entity references; values are the
     * Unicode code point each reference is replaced with. Three entries
     * (&circ;, &minus; and &tilde;) keep the ASCII look-alike code
     * (^, - and ~) rather than the exact code point of the entity.
     *
     * @var array
     * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73409
     * @link http://us.php.net/manual/en/function.get-html-translation-table.php#73410
     */
    protected static $entities = array(
        '&alpha;'    => 945,
        '&apos;'     => 39,
        '&beta;'     => 946,
        '&bull;'     => 8226,
        '&chi;'      => 967,
        '&circ;'     => 94,
        '&delta;'    => 948,
        '&epsilon;'  => 949,
        '&eta;'      => 951,
        '&fnof;'     => 402,
        '&gamma;'    => 947,
        '&iota;'     => 953,
        '&kappa;'    => 954,
        '&lambda;'   => 955,
        '&ldquo;'    => 8220,
        '&lsaquo;'   => 8249,
        '&lsquo;'    => 8216,
        '&mdash;'    => 8212,
        '&minus;'    => 45,
        '&mu;'       => 956,
        '&ndash;'    => 8211,
        '&nu;'       => 957,
        '&oelig;'    => 339,
        '&omega;'    => 969,
        '&omicron;'  => 959,
        '&phi;'      => 966,
        '&pi;'       => 960,
        '&piv;'      => 982,
        '&psi;'      => 968,
        '&rdquo;'    => 8221,
        '&rho;'      => 961,
        '&rsaquo;'   => 8250,
        '&rsquo;'    => 8217,
        '&scaron;'   => 353,
        '&sigma;'    => 963,
        '&sigmaf;'   => 962,
        '&tau;'      => 964,
        '&theta;'    => 952,
        '&thetasym;' => 977,
        '&tilde;'    => 126,
        '&trade;'    => 8482,
        '&upsih;'    => 978,
        '&upsilon;'  => 965,
        '&xi;'       => 958,
        '&yuml;'     => 255,
        '&zeta;'     => 950,
    );

    /**
     * Decodes markup entities in a given string.
     *
     * Decoding happens in three passes, in this order:
     * 1. named references listed in self::$entities,
     * 2. everything html_entity_decode() handles for ENT_QUOTES,
     * 3. decimal (&#NNN;) and hexadecimal (&#xHHH;) references that are
     *    still present after pass 2.
     *
     * Numeric references that do not denote a valid Unicode scalar value
     * are removed from the output.
     *
     * @param string $string  String containing markup entities
     * @param string $charset Optional character set name to use in decoding
     *        entities, defaults to UTF-8
     *
     * @return string String with markup entities decoded
     */
    public function decodeEntities($string, $charset = 'UTF-8')
    {
        $string = (string) $string;

        // Pass 1: table lookup. strtr() matches case-sensitively (entity
        // names are case-sensitive) and never re-scans text it has already
        // replaced.
        $named = array();
        foreach (self::$entities as $entity => $code) {
            $named[$entity] = $this->codeToCharset($code, $charset);
        }
        $string = strtr($string, $named);

        // Pass 2: the built-in decoder.
        $string = html_entity_decode($string, ENT_QUOTES, $charset);

        // Pass 3: leftover numeric references. The matches are collected
        // into a translation map instead of using the removed /e modifier,
        // so captured text is never evaluated as PHP code.
        $pattern = '/&#(?:x0*([0-9a-f]+)|0*([0-9]+));/i';
        if (preg_match_all($pattern, $string, $matches, PREG_SET_ORDER)) {
            $numeric = array();
            foreach ($matches as $match) {
                if (isset($match[2]) && $match[2] !== '') {
                    // Decimal form. Leading zeros are already stripped by
                    // the pattern, so more than 7 digits cannot be a code
                    // point and would risk integer overflow.
                    $code = strlen($match[2]) > 7 ? -1 : (int) $match[2];
                } else {
                    // Hexadecimal form; 0x10FFFF has 6 digits.
                    $code = strlen($match[1]) > 6 ? -1 : (int) hexdec($match[1]);
                }
                $numeric[$match[0]] = $this->codeToCharset($code, $charset);
            }
            $string = strtr($string, $numeric);
        }

        return $string;
    }

    /**
     * Encodes a Unicode code point as a character in a given character set.
     *
     * The character is produced as UTF-8 first. For any other character
     * set it is converted with iconv() when that extension is loaded; if
     * the conversion is unavailable or fails, the UTF-8 bytes are returned
     * unchanged.
     *
     * @param int    $code    Unicode code point
     * @param string $charset Target character set name
     *
     * @return string Encoded character, or an empty string if the code is
     *         not a valid Unicode scalar value
     */
    protected function codeToCharset($code, $charset)
    {
        $char = $this->codeToUtf8($code);

        if ($char === ''
            || strcasecmp($charset, 'UTF-8') === 0
            || strcasecmp($charset, 'UTF8') === 0
            || !function_exists('iconv')
        ) {
            return $char;
        }

        $converted = iconv('UTF-8', $charset . '//TRANSLIT', $char);
        if ($converted === false || $converted === '') {
            return $char;
        }

        return $converted;
    }

    /**
     * Converts a given unicode to its UTF-8 equivalent.
     *
     * @param int $code Code to convert
     *
     * @return string Character corresponding to code, or an empty string
     *         if the code is negative, a surrogate (U+D800..U+DFFF) or
     *         greater than U+10FFFF
     */
    public function codeToUtf8($code)
    {
        $code = (int) $code;

        // Reject values that have no valid UTF-8 encoding.
        if ($code < 0
            || $code > 0x10FFFF
            || ($code >= 0xD800 && $code <= 0xDFFF)
        ) {
            return '';
        }

        // 1 byte, 7 bits
        if ($code < 0x80) {
            return chr($code);
        }

        // 2 bytes, 11 bits
        if ($code < 0x800) {
            return chr(0xC0 | ($code >> 6)) .
                chr(0x80 | ($code & 0x3F));
        }

        // 3 bytes, 16 bits
        if ($code < 0x10000) {
            return chr(0xE0 | ($code >> 12)) .
                chr(0x80 | (($code >> 6) & 0x3F)) .
                chr(0x80 | ($code & 0x3F));
        }

        // 4 bytes, 21 bits
        return chr(0xF0 | ($code >> 18)) .
            chr(0x80 | (($code >> 12) & 0x3F)) .
            chr(0x80 | (($code >> 6) & 0x3F)) .
            chr(0x80 | ($code & 0x3F));
    }

    /**
     * Transliterates characters in a given string where possible.
     *
     * Uses the first available strategy: the PECL translit extension,
     * then iconv() with //TRANSLIT, then an entity-based fallback that
     * strips common diacritic suffixes. The fallback performs no character
     * set conversion, so its result stays in $charsetFrom.
     *
     * @param string $string      String containing characters to
     *        transliterate
     * @param string $charsetFrom Optional character set of the string,
     *        defaults to UTF-8
     * @param string $charsetTo   Optional character set to which the string
     *        should be converted, defaults to ISO-8859-1
     *
     * @return string String with characters transliterated or the original
     *         string if transliteration was not possible
     */
    public function transliterate($string, $charsetFrom = 'UTF-8', $charsetTo = 'ISO-8859-1')
    {
        // @link http://pecl.php.net/package/translit
        if (function_exists('transliterate')) {
            $result = transliterate(
                $string,
                array('han_transliterate', 'diacritical_remove'),
                $charsetFrom,
                $charsetTo
            );
        } elseif (function_exists('iconv')) {
            // iconv() returns false on failure; handled below.
            $result = iconv($charsetFrom, $charsetTo . '//TRANSLIT', $string);
        } else {
            // @link http://stackoverflow.com/questions/1284535/php-transliteration/1285491#1285491
            $result = false;
            $encoded = htmlentities($string, ENT_COMPAT, $charsetFrom);
            // htmlentities() yields an empty string when the input is not
            // valid in $charsetFrom; treat that as a failure.
            if ($encoded !== '' || $string === '') {
                $stripped = preg_replace(
                    '~&([a-z]{1,2})(acute|cedil|circ|grave|lig|orn|ring|slash|th|tilde|uml);~i',
                    '$1',
                    $encoded
                );
                if (is_string($stripped)) {
                    // Undo the escaping of everything that was not
                    // transliterated, so that e.g. "&" does not come back
                    // as "&amp;".
                    $result = html_entity_decode($stripped, ENT_COMPAT, $charsetFrom);
                }
            }
        }

        // Honour the documented contract: hand back the input untouched
        // when no strategy produced a string.
        if (!is_string($result)) {
            return $string;
        }

        return $result;
    }
}
|