root / drupal7 / sites / all / libraries / simplepie / idn / idna_convert.class.php @ 41cc1b08
1 |
<?php
|
---|---|
2 |
// {{{ license
|
3 |
|
4 |
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
|
5 |
//
|
6 |
// +----------------------------------------------------------------------+
|
7 |
// | This library is free software; you can redistribute it and/or modify |
|
8 |
// | it under the terms of the GNU Lesser General Public License as |
|
9 |
// | published by the Free Software Foundation; either version 2.1 of the |
|
10 |
// | License, or (at your option) any later version. |
|
11 |
// | |
|
12 |
// | This library is distributed in the hope that it will be useful, but |
|
13 |
// | WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 |
// | Lesser General Public License for more details. |
|
16 |
// | |
|
17 |
// | You should have received a copy of the GNU Lesser General Public |
|
18 |
// | License along with this library; if not, write to the Free Software |
|
19 |
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
|
20 |
// | USA. |
|
21 |
// +----------------------------------------------------------------------+
|
22 |
//
|
23 |
|
24 |
// }}}
|
25 |
|
26 |
/**
|
27 |
* Encode/decode Internationalized Domain Names.
|
28 |
*
|
29 |
* The class allows to convert internationalized domain names
|
30 |
* (see RFC 3490 for details) as they can be used with various registries worldwide
|
31 |
* to be translated between their original (localized) form and their encoded form
|
32 |
* as it will be used in the DNS (Domain Name System).
|
33 |
*
|
34 |
* The class provides two public methods, encode() and decode(), which do exactly
|
35 |
* what you would expect them to do. You are allowed to use complete domain names,
|
36 |
* simple strings and complete email addresses as well. That means, that you might
|
37 |
* use any of the following notations:
|
38 |
*
|
39 |
* - www.nörgler.com
|
40 |
* - xn--nrgler-wxa
|
41 |
* - xn--brse-5qa.xn--knrz-1ra.info
|
42 |
*
|
43 |
* Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
|
44 |
* array. Unicode output is available in the same formats.
|
45 |
* You can select your preferred format via {@link set_parameter()}.
|
46 |
*
|
47 |
* ACE input and output is always expected to be ASCII.
|
48 |
*
|
49 |
* @author Matthias Sommerfeld <mso@phlylabs.de>
|
50 |
* @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
|
51 |
* @version 0.5.1
|
52 |
*
|
53 |
*/
|
54 |
class idna_convert |
55 |
{ |
56 |
/**
|
57 |
* Holds all relevant mapping tables, loaded from a separate file on construct
|
58 |
* See RFC3454 for details
|
59 |
*
|
60 |
* @var array
|
61 |
* @access private
|
62 |
*/
|
63 |
var $NP = array(); |
64 |
|
65 |
// Internal settings, do not mess with them
|
66 |
var $_punycode_prefix = 'xn--'; |
67 |
var $_invalid_ucs = 0x80000000; |
68 |
var $_max_ucs = 0x10FFFF; |
69 |
var $_base = 36; |
70 |
var $_tmin = 1; |
71 |
var $_tmax = 26; |
72 |
var $_skew = 38; |
73 |
var $_damp = 700; |
74 |
var $_initial_bias = 72; |
75 |
var $_initial_n = 0x80; |
76 |
var $_sbase = 0xAC00; |
77 |
var $_lbase = 0x1100; |
78 |
var $_vbase = 0x1161; |
79 |
var $_tbase = 0x11A7; |
80 |
var $_lcount = 19; |
81 |
var $_vcount = 21; |
82 |
var $_tcount = 28; |
83 |
var $_ncount = 588; // _vcount * _tcount |
84 |
var $_scount = 11172; // _lcount * _tcount * _vcount |
85 |
var $_error = false; |
86 |
|
87 |
// See {@link set_parameter()} for details of how to change the following
|
88 |
// settings from within your script / application
|
89 |
var $_api_encoding = 'utf8'; // Default input charset is UTF-8 |
90 |
var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden |
91 |
var $_strict_mode = false; // Behave strict or not |
92 |
|
93 |
/**
 * Constructor: loads the Nameprep tables and applies optional settings.
 *
 * The tables are unserialized from the file "npdata.ser" located in the
 * directory of this file. If they cannot be loaded, an error message is
 * recorded (see get_last_error()) and the tables stay as they were.
 *
 * Declared as __construct() so that it is actually invoked on PHP versions
 * that no longer treat a method named like the class as a constructor.
 *
 * @param    mixed    Array of option => value pairs passed on to set_parameter();
 *                    any non-array value is ignored
 * @access   public
 */
function __construct($options = false)
{
    // NOTE(review): $slast is not declared as a property in the visible class
    // body; it is created dynamically here exactly as before.
    $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

    $np_file = dirname(__FILE__).'/npdata.ser';
    if (function_exists('file_get_contents')) {
        $raw = file_get_contents($np_file);
    } else {
        // Fallback for PHP versions without file_get_contents()
        $lines = file($np_file);
        $raw = is_array($lines) ? join('', $lines) : false;
    }

    $tables = is_string($raw) ? unserialize($raw) : false;
    if (is_array($tables)) {
        $this->NP = $tables;
    } else {
        $this->_error('Unable to load the NAMEPREP tables from '.$np_file);
    }

    // If parameters are given, pass these to the respective method
    if (is_array($options)) {
        return $this->set_parameter($options);
    }
    return true;
}

/**
 * PHP 4 style constructor, kept so that PHP 4 and code calling
 * idna_convert() explicitly keep working; simply delegates to __construct().
 *
 * @param    mixed    See __construct()
 * @access   public
 */
function idna_convert($options = false)
{
    return $this->__construct($options);
}
108 |
|
109 |
/**
 * Change one or more settings of the converter.
 *
 * Recognised options:
 *  - encoding: format used for Unicode data; one of 'utf8', 'ucs4_string'
 *              or 'ucs4_array'
 *  - overlong: a true value allows overlong UTF-8 sequences, a false value
 *              forbids them
 *  - strict:   a true value switches strict mode on, a false value off
 *
 * Processing stops at the first unknown option or unknown encoding value;
 * options handled before that point stay applied.
 *
 * @param    mixed     Option name, or array of option => value pairs
 * @param    mixed     Value to use if the first argument is a single option name
 * @return   boolean   true on success, false otherwise (see get_last_error())
 * @access   public
 */
function set_parameter($option, $value = false)
{
    $settings = is_array($option) ? $option : array($option => $value);

    // Loose (==) comparisons are kept on purpose to match the previous
    // switch-based implementation exactly.
    foreach ($settings as $name => $setting) {
        if ($name == 'encoding') {
            if ($setting == 'utf8' || $setting == 'ucs4_string' || $setting == 'ucs4_array') {
                $this->_api_encoding = $setting;
                continue;
            }
            $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
            return false;
        }
        if ($name == 'overlong') {
            $this->_allow_overlong = (bool) $setting;
            continue;
        }
        if ($name == 'strict') {
            $this->_strict_mode = (bool) $setting;
            continue;
        }
        $this->_error('Set Parameter: Unknown option '.$name);
        return false;
    }
    return true;
}
157 |
|
158 |
/**
 * Decode the ACE (Punycode) labels of a domain name, email address or URL.
 *
 * The input is trimmed first. It is then treated as
 *  - an email address, if it contains an "@" anywhere but at position 0;
 *    only labels starting with the Punycode prefix are decoded, on both
 *    sides of the "@",
 *  - a URL or dotted domain name, if it contains ":", "." or "/",
 *  - a single label otherwise.
 * Labels that cannot be decoded are left untouched. In strict mode only the
 * single-label form is accepted.
 *
 * @param    string   Domain name, email address or URL (ACE string)
 * @param    string   Optional output format for this call only, see set_parameter()
 * @return   mixed    Decoded value as UTF-8 string, UCS-4 string or UCS-4 array;
 *                    false on failure (see get_last_error())
 * @access   public
 */
function decode($input, $one_time_encoding = false)
{
    // A format requested for this call must be one of the known ones
    if ($one_time_encoding) {
        $known = ($one_time_encoding == 'utf8'
            || $one_time_encoding == 'ucs4_string'
            || $one_time_encoding == 'ucs4_array');
        if (!$known) {
            $this->_error('Unknown encoding '.$one_time_encoding);
            return false;
        }
    }

    // Drop surrounding whitespace, including newline characters
    $input = trim($input);

    if (strpos($input, '@')) {
        // Looks like an email address
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $prefix_regex = '!^'.preg_quote($this->_punycode_prefix, '!').'!';
        $parts = explode('@', $input, 2);
        $email_pref = $parts[0];
        $input = $parts[1];

        // Domain part first, then the local part
        $labels = explode('.', $input);
        foreach ($labels as $pos => $label) {
            if (!preg_match($prefix_regex, $label)) {
                continue;
            }
            $conv = $this->_decode($label);
            if ($conv) {
                $labels[$pos] = $conv;
            }
        }
        $input = join('.', $labels);

        $labels = explode('.', $email_pref);
        foreach ($labels as $pos => $label) {
            if (!preg_match($prefix_regex, $label)) {
                continue;
            }
            $conv = $this->_decode($label);
            if ($conv) {
                $labels[$pos] = $conv;
            }
        }
        $email_pref = join('.', $labels);

        $return = $email_pref . '@' . $input;
    } elseif (preg_match('![:\./]!', $input)) {
        // A complete domain name, with or without scheme, path or parameters
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $parsed = parse_url($input);
        if (isset($parsed['host'])) {
            $labels = explode('.', $parsed['host']);
            foreach ($labels as $pos => $label) {
                $conv = $this->_decode($label);
                if ($conv) {
                    $labels[$pos] = $conv;
                }
            }
            $parsed['host'] = join('.', $labels);

            // Put the URL back together from its parts
            $return = '';
            if (!empty($parsed['scheme'])) {
                $return .= $parsed['scheme'].(strtolower($parsed['scheme']) == 'mailto' ? ':' : '://');
            }
            if (!empty($parsed['user'])) {
                $return .= $parsed['user'];
                if (!empty($parsed['pass'])) {
                    $return .= ':'.$parsed['pass'];
                }
                $return .= '@';
            }
            $return .= $parsed['host'];
            if (!empty($parsed['port'])) {
                $return .= ':'.$parsed['port'];
            }
            if (!empty($parsed['path'])) {
                $return .= $parsed['path'];
            }
            if (!empty($parsed['query'])) {
                $return .= '?'.$parsed['query'];
            }
            if (!empty($parsed['fragment'])) {
                $return .= '#'.$parsed['fragment'];
            }
        } else {
            // parse_url() found no host, so handle the input as dotted labels
            $labels = explode('.', $input);
            foreach ($labels as $pos => $label) {
                $conv = $this->_decode($label);
                if ($conv) {
                    $labels[$pos] = $conv;
                }
            }
            $return = join('.', $labels);
        }
    } else {
        // A single label
        $return = $this->_decode($input);
        if (!$return) {
            $return = $input;
        }
    }

    // $return is UTF-8 at this point; convert if another format is wanted.
    // The format given for this call wins over the object's setting.
    $format = ($one_time_encoding) ? $one_time_encoding : $this->_api_encoding;
    if ($format == 'utf8') {
        return $return;
    }
    if ($format == 'ucs4_string') {
        return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
    }
    if ($format == 'ucs4_array') {
        return $this->_utf8_to_ucs4($return);
    }
    $this->_error('Unsupported output format');
    return false;
}
259 |
|
260 |
/**
 * Encode a domain name, email address or URL into its ACE form.
 *
 * The input is converted to a UCS-4 array and cut at the characters
 * ".", "/", ":", "?" and "@" (the ideographic and fullwidth dots U+3002,
 * U+FF0E and U+FF61 are turned into a plain dot first). Every piece is
 * handed to _encode(); pieces that _encode() rejects are copied as UTF-8.
 * In strict mode any of the separator characters causes a failure.
 *
 * @param    mixed    Value to encode (UTF-8 string, UCS-4 string or UCS-4 array)
 * @param    string   Optional input format for this call only, see set_parameter()
 * @return   mixed    Encoded string; '' for empty input; false on failure
 *                    (see get_last_error())
 * @access   public
 */
function encode($decoded, $one_time_encoding = false)
{
    // Bring the input into UCS-4 array form; the format given for this
    // call wins over the object's setting
    $format = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
    if ($format == 'utf8') {
        $decoded = $this->_utf8_to_ucs4($decoded);
    } elseif ($format == 'ucs4_string') {
        $decoded = $this->_ucs4_string_to_ucs4($decoded);
    } elseif ($format != 'ucs4_array') {
        $this->_error('Unsupported input format: '.$format);
        return false;
    }

    // Nothing to do for empty input (or input that failed to convert)
    if (empty($decoded)) {
        return '';
    }

    $dot_variants = array(0x3002, 0xFF0E, 0xFF61);
    $separators   = array(0x2E, 0x2F, 0x3A, 0x3F, 0x40);
    $piece_start  = 0;  // index of the first code point of the current piece
    $output       = '';

    foreach ($decoded as $pos => $code) {
        if (in_array($code, $dot_variants)) {
            // Make sure to use just the plain dot
            $decoded[$pos] = 0x2E;
        } elseif (!in_array($code, $separators)) {
            continue;
        }

        // Neither email addresses nor URLs allowed in strict mode
        if ($this->_strict_mode) {
            $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
            return false;
        }

        // A separator at index 0 is skipped without producing output
        if ($pos) {
            $piece = array_slice($decoded, $piece_start, (($pos) - $piece_start));
            $encoded = $this->_encode($piece);
            if ($encoded) {
                $output .= $encoded;
            } else {
                $output .= $this->_ucs4_to_utf8($piece);
            }
            $output .= chr($decoded[$pos]);
        }
        $piece_start = $pos + 1;
    }

    if ($piece_start) {
        // Separators were found: handle what follows the last one
        $piece = array_slice($decoded, $piece_start, (sizeof($decoded) - $piece_start));
        $encoded = $this->_encode($piece);
        return $output.($encoded ? $encoded : $this->_ucs4_to_utf8($piece));
    }

    // No separator at all: the whole input is a single label
    $output = $this->_encode($decoded);
    return $output ? $output : $this->_ucs4_to_utf8($decoded);
}
344 |
|
345 |
/**
 * Return the message recorded by the most recent failure.
 *
 * @return   mixed   Last error message; false (the initial value of the
 *                   property) if no error has been recorded
 * @access   public
 */
function get_last_error()
{
    $last_error = $this->_error;
    return $last_error;
}
355 |
|
356 |
/**
 * The actual decoding algorithm: turns one Punycode label into UTF-8.
 *
 * The label must start with the Punycode prefix. Everything between the
 * prefix and the last "-" is copied literally; the remainder is decoded
 * as generalized variable-length integers (see RFC 3492, section 6.2).
 *
 * @param    string   Single ACE label including the prefix
 * @return   mixed    Decoded label as UTF-8 string; false on failure
 *                    (see get_last_error())
 * @access   private
 */
function _decode($encoded)
{
    $prefix_len = strlen($this->_punycode_prefix);
    $prefix_regex = '!^'.preg_quote($this->_punycode_prefix, '!').'!';

    // We do need to find the Punycode prefix
    if (!preg_match($prefix_regex, $encoded)) {
        $this->_error('This is not a punycode string');
        return false;
    }
    $encode_test = preg_replace($prefix_regex, '', $encoded);
    // If nothing left after removing the prefix, it is hopeless
    if (!$encode_test) {
        $this->_error('The given encoded string was empty');
        return false;
    }

    // Find last occurrence of the delimiter; everything between the prefix
    // and that delimiter consists of literal (basic) code points
    $delim_pos = strrpos($encoded, '-');
    $decoded = array();
    if ($delim_pos > $prefix_len) {
        for ($k = $prefix_len; $k < $delim_pos; ++$k) {
            $decoded[] = ord($encoded[$k]);
        }
    }
    $deco_len = count($decoded);
    $enco_len = strlen($encoded);

    // Wandering through the strings; init
    $is_first = true;
    $bias = $this->_initial_bias;
    $idx = 0;
    $char = $this->_initial_n;

    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
        // Decode one variable-length integer into $idx
        for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
            // A truncated label would otherwise be read past its end
            if ($enco_idx >= $enco_len) {
                $this->_error('The given punycode string ends prematurely');
                return false;
            }
            $digit = $this->_decode_digit($encoded[$enco_idx++]);
            if ($digit < 0 || $digit >= $this->_base) {
                $this->_error('The given punycode string contains an invalid digit');
                return false;
            }
            $idx += $digit * $w;
            $t = ($k <= $bias) ? $this->_tmin :
                    (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
            if ($digit < $t) break;
            $w = (int) ($w * ($this->_base - $t));
        }
        $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
        $is_first = false;
        // $idx encodes both the code point increment and the insert position
        $char += (int) ($idx / ($deco_len + 1));
        $idx %= ($deco_len + 1);
        if ($deco_len > 0) {
            // Make room for the decoded char
            for ($i = $deco_len; $i > $idx; $i--) {
                $decoded[$i] = $decoded[($i - 1)];
            }
        }
        $decoded[$idx++] = $char;
    }
    return $this->_ucs4_to_utf8($decoded);
}
414 |
|
415 |
/**
 * The actual encoding algorithm: turns one label into its Punycode form.
 *
 * The label is refused if it already starts with the Punycode prefix or if
 * it contains no code point above 0x7A. Otherwise it is run through
 * _nameprep() and encoded as described in RFC 3492, section 6.3.
 *
 * All code points below 0x80 are treated as basic code points and copied
 * to the output, as RFC 3492 defines them. The previous hand-written
 * ranges left some ASCII characters (for instance "_" or a space) neither
 * copied nor encodable, which kept the main loop from ever terminating.
 *
 * @param    array    Label as UCS-4 array
 * @return   mixed    Encoded label (string); false on failure
 *                    (see get_last_error())
 * @access   private
 */
function _encode($decoded)
{
    // We cannot encode a domain name containing the Punycode prefix
    $extract = strlen($this->_punycode_prefix);
    $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
    $check_deco = array_slice($decoded, 0, $extract);

    if ($check_pref == $check_deco) {
        $this->_error('This is already a punycode string');
        return false;
    }
    // We will not try to encode strings consisting of basic code points only
    $encodable = false;
    foreach ($decoded as $k => $v) {
        if ($v > 0x7a) {
            $encodable = true;
            break;
        }
    }
    if (!$encodable) {
        $this->_error('The given string does not contain encodable chars');
        return false;
    }

    // Do NAMEPREP
    $decoded = $this->_nameprep($decoded);
    if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

    $deco_len = count($decoded);
    if (!$deco_len) return false; // Empty array

    $codecount = 0; // How many chars have been consumed

    $encoded = '';
    // Copy all basic code points (everything below 0x80) to output
    for ($i = 0; $i < $deco_len; ++$i) {
        if ($decoded[$i] < $this->_initial_n) {
            $encoded .= chr($decoded[$i]);
            $codecount++;
        }
    }
    if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

    // Start with the prefix; copy it to output
    $encoded = $this->_punycode_prefix.$encoded;

    // If we have basic code points in output, add an hyphen to the end
    if ($codecount) $encoded .= '-';

    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = $this->_initial_n;
    $bias = $this->_initial_bias;
    $delta = 0;
    while ($codecount < $deco_len) {
        // Find the smallest code point >= the current code point
        $found = false;
        for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
            if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                $next_code = $decoded[$i];
                $found = true;
            }
        }
        // Nothing left inside the valid range although input remains
        // (e.g. a code point above _max_ucs): give up instead of looping
        if (!$found) {
            $this->_error('The given string contains a code point that cannot be encoded');
            return false;
        }

        $delta += ($next_code - $cur_code) * ($codecount + 1);
        $cur_code = $next_code;

        // Scan input again and encode all characters whose code point is $cur_code
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] < $cur_code) {
                $delta++;
            } elseif ($decoded[$i] == $cur_code) {
                // Emit $delta as a generalized variable-length integer
                for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                    $t = ($k <= $bias) ? $this->_tmin :
                            (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                    if ($q < $t) break;
                    $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t))));
                    $q = (int) (($q - $t) / ($this->_base - $t));
                }
                $encoded .= $this->_encode_digit($q);
                $bias = $this->_adapt($delta, $codecount+1, $is_first);
                $codecount++;
                $delta = 0;
                $is_first = false;
            }
        }
        $delta++;
        $cur_code++;
    }
    return $encoded;
}
512 |
|
513 |
/**
 * Compute the new bias after a delta has been encoded or decoded
 * (bias adaptation of the Punycode algorithm).
 *
 * @param    integer   Delta that was just processed
 * @param    integer   Number of code points handled so far, including the current one
 * @param    boolean   true for the very first delta of a label
 * @return   integer   New bias
 * @access   private
 */
function _adapt($delta, $npoints, $is_first)
{
    // The first delta is damped much more strongly than later ones
    $divisor = $is_first ? $this->_damp : 2;
    $delta = intval($delta / $divisor);
    $delta += intval($delta / $npoints);

    $span = $this->_base - $this->_tmin;
    $limit = ($span * $this->_tmax) / 2;

    $k = 0;
    while ($delta > $limit) {
        $delta = intval($delta / $span);
        $k += $this->_base;
    }
    return intval($k + ($span + 1) * $delta / ($delta + $this->_skew));
}
526 |
|
527 |
/**
 * Map a digit value to its Punycode character:
 * 0..25 become "a".."z", 26..35 become "0".."9".
 *
 * @param    integer   Digit value
 * @return   string    Single character
 * @access   private
 */
function _encode_digit($d)
{
    // 97 is ord('a'); 22 is ord('0') - 26
    $offset = ($d < 26) ? 97 : 22;
    return chr($d + $offset);
}
535 |
|
536 |
/**
 * Map a Punycode character to its digit value.
 *
 * "0".."9" yield 26..35, "A".."Z" and "a".."z" yield 0..25. Every other
 * character yields the base (36), which is not a valid digit. Explicit
 * range checks are used because the subtraction-based test only works
 * with unsigned arithmetic and let other characters through with bogus
 * or negative values.
 *
 * @param    string    Single character
 * @return   integer   Digit value, or the base for an invalid character
 * @access   private
 */
function _decode_digit($cp)
{
    $cp = ord($cp);
    if ($cp >= 48 && $cp <= 57) {
        return $cp - 22;    // '0'..'9' => 26..35
    }
    if ($cp >= 65 && $cp <= 90) {
        return $cp - 65;    // 'A'..'Z' => 0..25
    }
    if ($cp >= 97 && $cp <= 122) {
        return $cp - 97;    // 'a'..'z' => 0..25
    }
    return $this->_base;
}
545 |
|
546 |
/**
 * Remember an error message so that get_last_error() can report it.
 * Each call overwrites the previously stored message.
 *
 * @param    string   Error message
 * @access   private
 */
function _error($error = '')
{
    $message = $error;
    $this->_error = $message;
}
554 |
|
555 |
/**
 * Do Nameprep according to RFC3491 and RFC3454
 *
 * Works in three steps: map or reject every input code point using the
 * tables in $this->NP, recompose Hangul syllables, then try to combine
 * starters with the code points following them.
 *
 * @param    array    Unicode characters (UCS-4 array)
 * @return   mixed    Nameprep'd UCS-4 array; false if prohibited input was
 *                    found (see get_last_error())
 * @access   private
 */
function _nameprep($input)
{
    $output = array();
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
        // Map to nothing == skip that code point
        if (in_array($v, $this->NP['map_nothing'])) continue;

        // Try to find prohibited input
        if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
            $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
            return false;
        }
        foreach ($this->NP['prohibit_ranges'] as $range) {
            if ($range[0] <= $v && $v <= $range[1]) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
        }
        //
        // Hangul syllable decomposition
        if (0xAC00 <= $v && $v <= 0xD7AF) {
            foreach ($this->_hangul_decompose($v) as $out) {
                $output[] = (int) $out;
            }
        // There's a decomposition mapping for that code point
        } elseif (isset($this->NP['replacemaps'][$v])) {
            foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                $output[] = (int) $out;
            }
        } else {
            $output[] = (int) $v;
        }
    }
    // Before applying any Combining, try to rearrange any Hangul syllables
    $output = $this->_hangul_compose($output);
    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
        $class = $this->_get_combining_class($output[$i]);
        if ((!$last_class || $last_class > $class) && $class) {
            // Try to match
            $seq_len = $i - $last_starter;
            $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
            // On match: Replace the last starter with the composed character and remove
            // the now redundant non-starter(s)
            if ($out) {
                $output[$last_starter] = $out;
                // _combine() returns a single code point (a key of the
                // replacement map). The former count($out) evaluated to 1
                // for such a scalar on old PHP versions and throws a
                // TypeError on PHP 8, hence the literal.
                if (1 != $seq_len) {
                    for ($j = $i+1; $j < $out_len; ++$j) {
                        $output[$j-1] = $output[$j];
                    }
                    // NOTE(review): index $out_len is past the last element, so
                    // this looks like a no-op - kept as is, confirm the intent
                    unset($output[$out_len]);
                }
                // Rewind the for loop by one, since there can be more possible compositions
                $i--;
                $out_len--;
                $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                continue;
            }
        }
        // The current class is 0
        if (!$class) $last_starter = $i;
        $last_class = $class;
    }
    return $output;
}
637 |
|
638 |
/**
 * Decomposes a Hangul syllable
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * The leading consonant and vowel indexes are truncated to integers here.
 * Formerly the (int) cast applied to the base value only, so the function
 * returned fractional floats and relied on the caller to cast them.
 *
 * @param    integer  32bit UCS4 code point
 * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
 * @access   private
 */
function _hangul_decompose($char)
{
    $sindex = (int) $char - $this->_sbase;
    // Not a precomposed Hangul syllable: hand the code point back untouched
    if ($sindex < 0 || $sindex >= $this->_scount) {
        return array($char);
    }
    $result = array();
    $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
    $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
    $T = intval($this->_tbase + $sindex % $this->_tcount);
    // A trailing consonant only exists if its index is not zero
    if ($T != $this->_tbase) $result[] = $T;
    return $result;
}
658 |
/**
 * Composes Hangul syllables
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * Walks through the sequence and merges a leading consonant followed by a
 * vowel into an LV syllable, and an LV syllable followed by a trailing
 * consonant into an LVT syllable.
 *
 * The trailing consonant index must lie strictly between 0 and _tcount.
 * The former inclusive bounds also swallowed the code point at _tbase
 * itself and mis-composed the one at _tbase + _tcount.
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    UCS4 sequence with syllables composed
 * @access   private
 */
function _hangul_compose($input)
{
    $inp_len = count($input);
    if (!$inp_len) return array();
    $result = array();
    $last = (int) $input[0];
    $result[] = $last; // copy first char from input to output

    for ($i = 1; $i < $inp_len; ++$i) {
        $char = (int) $input[$i];
        $sindex = $last - $this->_sbase;
        $lindex = $last - $this->_lbase;
        $vindex = $char - $this->_vbase;
        $tindex = $char - $this->_tbase;
        // Find out, whether two current characters are LV and T
        if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                && 0 < $tindex && $tindex < $this->_tcount) {
            // create syllable of form LVT
            $last += $tindex;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // Find out, whether two current characters form L and V
        if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
            // create syllable of form LV
            $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
700 |
|
701 |
/**
 * Look up the combining class of a code point in the 'norm_combcls' table.
 *
 * @param    integer  Code point to check (32bit integer)
 * @return   integer  Combining class from the table; 0 if the table has no entry
 * @access   private
 */
function _get_combining_class($char)
{
    if (isset($this->NP['norm_combcls'][$char])) {
        return $this->NP['norm_combcls'][$char];
    }
    return 0;
}
711 |
|
712 |
/**
 * Applies the canonical ordering to a decomposed UCS4 sequence.
 *
 * A code point with a non-zero combining class is moved leftward past
 * every neighbour whose combining class is greater than its own. Passes
 * over the sequence are repeated until one of them moves nothing.
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    Ordered UCS4 sequence
 * @access   private
 */
function _apply_cannonical_ordering($input)
{
    $size = count($input);
    do {
        $swap = false;
        $last = $this->_get_combining_class(intval($input[0]));
        for ($i = 0; $i < $size-1; ++$i) {
            $next = $this->_get_combining_class(intval($input[$i+1]));
            if (!($next != 0 && $last > $next)) {
                // Already in order: go on with the next pair
                $last = $next;
                continue;
            }
            // Move item leftward until it fits
            $j = $i + 1;
            while ($j > 0 && !($this->_get_combining_class(intval($input[$j-1])) <= $next)) {
                $moved = intval($input[$j]);
                $input[$j] = intval($input[$j-1]);
                $input[$j-1] = $moved;
                $swap = true;
                --$j;
            }
            // $last is left unchanged, so the next comparison looks at the
            // old character again
        }
    } while ($swap);
    return $input;
}
744 |
|
745 |
/**
 * Try to compose a sequence of starter and non-starters into one code point.
 *
 * Searches the 'replacemaps' table for an entry whose replacement sequence
 * equals the given sequence element by element.
 *
 * @param    array    UCS4 decomposed sequence
 * @return   mixed    Key of the first matching table entry (the composed
 *                    code point); false if nothing matches
 * @access   private
 */
function _combine($input)
{
    $inp_len = count($input);
    foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
        // Cheap rejections first: different first element or different length
        if ($np_target[0] != $input[0]) {
            continue;
        }
        if (count($np_target) != $inp_len) {
            continue;
        }
        // An empty sequence never counts as a match
        $all_equal = !empty($input);
        foreach ($input as $pos => $code) {
            if (!($code == $np_target[$pos])) {
                $all_equal = false;
                break;
            }
        }
        if ($all_equal) {
            return $np_src;
        }
    }
    return false;
}
770 |
|
771 |
/**
 * This converts an UTF-8 encoded string to its UCS-4 representation
 * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
 * each of the "chars". This is due to PHP not being able to handle strings with
 * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
 * The following UTF-8 encodings are supported:
 * bytes bits  representation
 * 1        7  0xxxxxxx
 * 2       11  110xxxxx 10xxxxxx
 * 3       16  1110xxxx 10xxxxxx 10xxxxxx
 * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * Each x represents a bit that can be used to store character data.
 * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
 *
 * Bytes are read with square bracket offsets; the curly brace form used
 * before is a parse error on PHP 8.
 *
 * A multi-byte sequence that is cut off by the end of the input is not
 * reported; the partly assembled code point stays in the result.
 *
 * @param    string   UTF-8 encoded string
 * @return   mixed    Array of code points; false on malformed input
 *                    (see get_last_error())
 * @access   private
 */
function _utf8_to_ucs4($input)
{
    $output = array();
    $out_len = 0;
    $inp_len = strlen($input);
    $mode = 'next';     // 'next': expecting a start byte, 'add': expecting a continuation byte
    $test = 'none';     // 'range': the next continuation byte still needs its range check
    for ($k = 0; $k < $inp_len; ++$k) {
        $v = ord($input[$k]); // Extract byte from input string

        if ($v < 128) { // We found an ASCII char - put into string as is
            $output[$out_len] = $v;
            ++$out_len;
            // An ASCII byte in the middle of a multi-byte sequence is malformed
            if ('add' == $mode) {
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            continue;
        }
        if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
            $start_byte = $v;
            $mode = 'add';
            $test = 'range';
            if ($v >> 5 == 6) { // &110xxxxx 10xxxxx
                $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                $v = ($v - 192) << 6;
            } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                $next_byte = 1;
                $v = ($v - 224) << 12;
            } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 2;
                $v = ($v - 240) << 18;
            } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 3;
                $v = ($v - 248) << 24;
            } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                $next_byte = 4;
                $v = ($v - 252) << 30;
            } else {
                $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                return false;
            }
            if ('add' == $mode) {
                // Store the bits of the start byte; continuation bytes add theirs
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }
        }
        if ('add' == $mode) {
            // Only the first continuation byte of a sequence is range-checked
            if (!$this->_allow_overlong && $test == 'range') {
                $test = 'none';
                if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                    $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                    return false;
                }
            }
            if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                $v = ($v - 128) << ($next_byte * 6);
                $output[($out_len - 1)] += $v;
                --$next_byte;
            } else {
                $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                return false;
            }
            if ($next_byte < 0) {
                $mode = 'next';
            }
        }
    } // for
    return $output;
}
859 |
|
860 |
/**
 * Convert a UCS-4 array into a UTF-8 string.
 *
 * Values below 128 are copied as single bytes. Larger values are written
 * as sequences of two to six bytes, depending on how many bits they need.
 * See _utf8_to_ucs4() for the byte layouts.
 *
 * @param    array    Code points
 * @return   mixed    UTF-8 string; false if a value needs more than 31 bits
 *                    (see get_last_error())
 * @access   private
 */
function _ucs4_to_utf8($input)
{
    $output = '';
    $position = 0;  // 1-based position of the current value, used in the error message
    foreach ($input as $code) {
        ++$position;
        if ($code < 128) { // 7bit are transferred literally
            $output .= chr($code);
            continue;
        }
        // Number of continuation bytes and marker bits of the lead byte
        if ($code < (1 << 11)) {
            $trail = 1;
            $lead = 192;
        } elseif ($code < (1 << 16)) {
            $trail = 2;
            $lead = 224;
        } elseif ($code < (1 << 21)) {
            $trail = 3;
            $lead = 240;
        } elseif ($code < (1 << 26)) {
            $trail = 4;
            $lead = 248;
        } elseif ($code < (1 << 31)) {
            $trail = 5;
            $lead = 252;
        } else {
            $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$position);
            return false;
        }
        // Lead byte takes the topmost bits, each continuation byte the next six
        $output .= chr($lead + ($code >> (6 * $trail)));
        for ($shift = 6 * ($trail - 1); $shift >= 0; $shift -= 6) {
            $output .= chr(128 + (($code >> $shift) & 63));
        }
    }
    return $output;
}
896 |
|
897 |
/**
 * Convert a UCS-4 array into a UCS-4 string.
 *
 * Every value contributes four bytes, most significant byte first.
 *
 * @param    array    Code points
 * @return   string   Byte string, four bytes per code point
 * @access   private
 */
function _ucs4_to_ucs4_string($input)
{
    $output = '';
    foreach ($input as $code) {
        // Shift the wanted byte down and cut it out with the mask 255
        for ($shift = 24; $shift >= 0; $shift -= 8) {
            $output .= chr(($code >> $shift) & 255);
        }
    }
    return $output;
}
912 |
|
913 |
/**
 * Convert a UCS-4 string into a UCS-4 array.
 *
 * Every group of four bytes, most significant byte first, becomes one
 * array element. Bytes are read with square bracket offsets; the curly
 * brace form used before is a parse error on PHP 8.
 *
 * @param    string   Byte string whose length is a multiple of four
 * @return   mixed    Array of code points; false if the length does not fit
 *                    (see get_last_error())
 * @access   private
 */
function _ucs4_string_to_ucs4($input)
{
    $output = array();
    $inp_len = strlen($input);
    // Input length must be dividable by 4
    if ($inp_len % 4) {
        $this->_error('Input UCS4 string is broken');
        return false;
    }
    // Empty input yields an empty array, as the loop body never runs
    for ($i = 0; $i < $inp_len; $i += 4) {
        $output[] = (ord($input[$i]) << 24)
                + (ord($input[$i + 1]) << 16)
                + (ord($input[$i + 2]) << 8)
                + ord($input[$i + 3]);
    }
    return $output;
}
939 |
} |
940 |
|
941 |
/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value; alias of the inherited set_parameter().
     *
     * The class extends idna_convert, so the call goes to the inherited
     * method directly. The former body called set_parameters() on an
     * undefined $this->IC member and could never succeed.
     *
     * Available options and values:
     *  - encoding: 'utf8', 'ucs4_string' or 'ucs4_array'
     *  - overlong: true to allow overlong UTF-8 sequences, false to forbid them
     *  - strict:   true for strict mode, false for loose mode
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        return $this->set_parameter($option, $param);
    }
}
968 |
|
969 |
?>
|