root / drupal7 / sites / all / libraries / simplepie-1.3.1 / idn / idna_convert.class.php @ 7295e063
1 | 41cc1b08 | Assos Assos | <?php
|
---|---|---|---|
2 | // {{{ license
|
||
3 | |||
4 | /* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
|
||
5 | //
|
||
6 | // +----------------------------------------------------------------------+
|
||
7 | // | This library is free software; you can redistribute it and/or modify |
|
||
8 | // | it under the terms of the GNU Lesser General Public License as |
|
||
9 | // | published by the Free Software Foundation; either version 2.1 of the |
|
||
10 | // | License, or (at your option) any later version. |
|
||
11 | // | |
|
||
12 | // | This library is distributed in the hope that it will be useful, but |
|
||
13 | // | WITHOUT ANY WARRANTY; without even the implied warranty of |
|
||
14 | // | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
||
15 | // | Lesser General Public License for more details. |
|
||
16 | // | |
|
||
17 | // | You should have received a copy of the GNU Lesser General Public |
|
||
18 | // | License along with this library; if not, write to the Free Software |
|
||
19 | // | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 |
|
||
20 | // | USA. |
|
||
21 | // +----------------------------------------------------------------------+
|
||
22 | //
|
||
23 | |||
24 | // }}}
|
||
25 | |||
26 | /**
|
||
27 | * Encode/decode Internationalized Domain Names.
|
||
28 | *
|
||
29 | * The class allows to convert internationalized domain names
|
||
30 | * (see RFC 3490 for details) as they can be used with various registries worldwide
|
||
31 | * to be translated between their original (localized) form and their encoded form
|
||
32 | * as it will be used in the DNS (Domain Name System).
|
||
33 | *
|
||
34 | * The class provides two public methods, encode() and decode(), which do exactly
|
||
35 | * what you would expect them to do. You are allowed to use complete domain names,
|
||
36 | * simple strings and complete email addresses as well. That means, that you might
|
||
37 | * use any of the following notations:
|
||
38 | *
|
||
39 | * - www.nörgler.com
|
||
40 | * - xn--nrgler-wxa
|
||
41 | * - xn--brse-5qa.xn--knrz-1ra.info
|
||
42 | *
|
||
43 | * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
|
||
44 | * array. Unicode output is available in the same formats.
|
||
45 | * You can select your preferred format via {@link set_parameter()}.
|
||
46 | *
|
||
47 | * ACE input and output is always expected to be ASCII.
|
||
48 | *
|
||
49 | * @author Matthias Sommerfeld <mso@phlylabs.de>
|
||
50 | * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
|
||
51 | * @version 0.5.1
|
||
52 | *
|
||
53 | */
|
||
54 | class idna_convert |
||
55 | { |
||
56 | /**
|
||
57 | * Holds all relevant mapping tables, loaded from a separate file on construct
|
||
58 | * See RFC3454 for details
|
||
59 | *
|
||
60 | * @var array
|
||
61 | * @access private
|
||
62 | */
|
||
63 | var $NP = array(); |
||
64 | |||
65 | // Internal settings, do not mess with them
|
||
66 | var $_punycode_prefix = 'xn--'; |
||
67 | var $_invalid_ucs = 0x80000000; |
||
68 | var $_max_ucs = 0x10FFFF; |
||
69 | var $_base = 36; |
||
70 | var $_tmin = 1; |
||
71 | var $_tmax = 26; |
||
72 | var $_skew = 38; |
||
73 | var $_damp = 700; |
||
74 | var $_initial_bias = 72; |
||
75 | var $_initial_n = 0x80; |
||
76 | var $_sbase = 0xAC00; |
||
77 | var $_lbase = 0x1100; |
||
78 | var $_vbase = 0x1161; |
||
79 | var $_tbase = 0x11A7; |
||
80 | var $_lcount = 19; |
||
81 | var $_vcount = 21; |
||
82 | var $_tcount = 28; |
||
83 | var $_ncount = 588; // _vcount * _tcount |
||
84 | var $_scount = 11172; // _lcount * _tcount * _vcount |
||
85 | var $_error = false; |
||
86 | |||
87 | // See {@link set_parameter()} for details of how to change the following
|
||
88 | // settings from within your script / application
|
||
89 | var $_api_encoding = 'utf8'; // Default input charset is UTF-8 |
||
90 | var $_allow_overlong = false; // Overlong UTF-8 encodings are forbidden |
||
91 | var $_strict_mode = false; // Behave strict or not |
||
92 | |||
/**
 * Constructor
 *
 * Delegates to {@link idna_convert()}, which loads the mapping tables and
 * applies the given options. A PHP 4 style constructor (method named like
 * the class) is no longer called automatically by PHP 8, so without this
 * method the tables would never be loaded there.
 *
 * @param    mixed    Optional array of Parameter => Value pairs, see {@link set_parameter()}
 * @access   public
 */
function __construct($options = false)
{
    $this->idna_convert($options);
}

/**
 * Former PHP 4 style constructor, kept as a regular method for backward compatibility
 *
 * Loads the mapping tables from the file npdata.ser located next to this
 * file and passes the options, if any, to {@link set_parameter()}.
 * If the tables cannot be loaded, the problem is recorded and can be
 * retrieved via {@link get_last_error()}; $NP then keeps its previous value.
 *
 * @param    mixed    Optional array of Parameter => Value pairs
 * @return   boolean  Result of set_parameter() if options were given, true otherwise
 * @access   public
 */
function idna_convert($options = false)
{
    // NOTE(review): $slast is not a declared property and is not read anywhere
    // in the visible part of this class; kept in case other code relies on it
    $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

    $np_file = dirname(__FILE__).'/npdata.ser';
    $raw = is_readable($np_file) ? file_get_contents($np_file) : false;
    $tables = ($raw === false) ? false : unserialize($raw);
    if (is_array($tables)) {
        $this->NP = $tables;
    } else {
        $this->_error('Unable to load NAMEPREP data from '.$np_file);
    }

    // If parameters are given, pass these to the respective method
    if (is_array($options)) {
        return $this->set_parameter($options);
    }
    return true;
}
108 | |||
/**
 * Sets a new option value. Available options and values:
 * [encoding - Format of Unicode data: 'utf8' for UTF-8, 'ucs4_string' for UCS4
 *         as string, 'ucs4_array' for UCS4 as array. Used as the input format
 *         of encode() and as the output format of decode().]
 * [overlong - Unicode does not allow unnecessarily long encodings of chars,
 *         to allow this, set this parameter to true, else to false;
 *         default is false.]
 * [strict - true: strict mode, good for registration purposes - Causes errors
 *         on failures; false: loose mode, ideal for "wildlife" applications
 *         by silently ignoring errors and returning the original input instead]
 *
 * Option names and encoding values are compared strictly. A loose switch
 * would let an integer key 0 (PHP < 8) match 'encoding' and the value true
 * match 'utf8'.
 *
 * Options are applied in order; processing stops at the first unknown
 * option or value, options set before that point stay in effect.
 *
 * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
 * @param    string    Value to use (if parameter 1 is a string)
 * @return   boolean   true on success, false otherwise
 * @access   public
 */
function set_parameter($option, $value = false)
{
    if (!is_array($option)) {
        $option = array($option => $value);
    }
    $known_encodings = array('utf8', 'ucs4_string', 'ucs4_array');
    foreach ($option as $k => $v) {
        if ($k === 'encoding') {
            if (!in_array($v, $known_encodings, true)) {
                // Avoid "Array to string conversion" for non-scalar values
                $shown = is_scalar($v) ? $v : gettype($v);
                $this->_error('Set Parameter: Unknown parameter '.$shown.' for option '.$k);
                return false;
            }
            $this->_api_encoding = $v;
        } elseif ($k === 'overlong') {
            $this->_allow_overlong = ($v) ? true : false;
        } elseif ($k === 'strict') {
            $this->_strict_mode = ($v) ? true : false;
        } else {
            $this->_error('Set Parameter: Unknown option '.$k);
            return false;
        }
    }
    return true;
}
157 | |||
/**
 * Decode a given ACE domain name
 *
 * Accepts a single label, a dotted domain name, an email address or a URL.
 * Labels that cannot be decoded are passed through unchanged. In strict
 * mode only a single label is accepted.
 *
 * @param    string   Domain name (ACE string)
 * [@param    string   Desired output encoding, see {@link set_parameter}]
 * @return   mixed    Decoded Domain name (UTF-8 string, UCS-4 string or UCS-4 array,
 *                    depending on the encoding in effect); false on error
 * @access   public
 */
function decode($input, $one_time_encoding = false)
{
    // Optionally set
    if ($one_time_encoding) {
        switch ($one_time_encoding) {
            case 'utf8':
            case 'ucs4_string':
            case 'ucs4_array':
                break;
            default:
                $this->_error('Unknown encoding '.$one_time_encoding);
                return false;
        }
    }
    // Make sure to drop any newline characters around
    $input = trim($input);

    // Negotiate input and try to determine, whether it is a plain string,
    // an email address or something like a complete URL
    if (strpos($input, '@')) { // Maybe it is an email address ('@' at position 0 does not count)
        // No no in strict mode
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        list ($email_pref, $input) = explode('@', $input, 2);
        $prefix_pattern = '!^'.preg_quote($this->_punycode_prefix, '!').'!';
        // Same treatment for both sides of the '@'; domain part first, as before
        $sides = array('domain' => $input, 'local' => $email_pref);
        foreach ($sides as $side => $text) {
            $arr = explode('.', $text);
            foreach ($arr as $k => $v) {
                if (preg_match($prefix_pattern, $v)) {
                    $conv = $this->_decode($v);
                    if ($conv) $arr[$k] = $conv;
                }
            }
            $sides[$side] = join('.', $arr);
        }
        $return = $sides['local'] . '@' . $sides['domain'];
    } elseif (preg_match('![:\./]!', $input)) { // Or a complete domain name (with or without paths / parameters)
        // No no in strict mode
        if ($this->_strict_mode) {
            $this->_error('Only simple domain name parts can be handled in strict mode');
            return false;
        }
        $parsed = parse_url($input);
        if (isset($parsed['host'])) {
            $arr = explode('.', $parsed['host']);
            foreach ($arr as $k => $v) {
                $conv = $this->_decode($v);
                if ($conv) $arr[$k] = $conv;
            }
            $parsed['host'] = join('.', $arr);
            // isset() instead of empty(): components such as port 0, user "0"
            // or query "0" are falsy but must not be dropped from the result
            $return =
                    (isset($parsed['scheme']) ? $parsed['scheme'].(strtolower($parsed['scheme']) === 'mailto' ? ':' : '://') : '')
                    .(isset($parsed['user']) ? $parsed['user'].(isset($parsed['pass']) ? ':'.$parsed['pass'] : '').'@' : '')
                    .$parsed['host']
                    .(isset($parsed['port']) ? ':'.$parsed['port'] : '')
                    .(isset($parsed['path']) ? $parsed['path'] : '')
                    .(isset($parsed['query']) ? '?'.$parsed['query'] : '')
                    .(isset($parsed['fragment']) ? '#'.$parsed['fragment'] : '');
        } else { // parse_url seems to have failed, try without it
            $arr = explode('.', $input);
            foreach ($arr as $k => $v) {
                $conv = $this->_decode($v);
                $arr[$k] = ($conv) ? $conv : $v;
            }
            $return = join('.', $arr);
        }
    } else { // Otherwise we consider it being a pure domain name string
        $return = $this->_decode($input);
        if (!$return) $return = $input;
    }
    // The output is UTF-8 by default, other output formats need conversion here
    // If one time encoding is given, use this, else the objects property
    switch (($one_time_encoding) ? $one_time_encoding : $this->_api_encoding) {
        case 'utf8':
            return $return;
        case 'ucs4_string':
            return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($return));
        case 'ucs4_array':
            return $this->_utf8_to_ucs4($return);
        default:
            $this->_error('Unsupported output format');
            return false;
    }
}
259 | |||
/**
 * Encode a given UTF-8 domain name
 *
 * The input is converted to an array of code points and split at the
 * characters '.', '/', ':', '?' and '@' (ideographic and fullwidth dots are
 * turned into '.' first). Each part is encoded separately; parts that cannot
 * be encoded are passed through. In strict mode any of these separators
 * causes an error.
 *
 * @param    string   Domain name (UTF-8 or UCS-4)
 * [@param    string   Desired input encoding, see {@link set_parameter}]
 * @return   mixed    Encoded Domain name (ACE string); '' for empty input; false on error
 * @access   public
 */
function encode($decoded, $one_time_encoding = false)
{
    // Forcing conversion of input to UCS4 array
    // If one time encoding is given, use this, else the objects property
    $encoding = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
    switch ($encoding) {
        case 'utf8':
            $decoded = $this->_utf8_to_ucs4($decoded);
            break;
        case 'ucs4_string':
            $decoded = $this->_ucs4_string_to_ucs4($decoded);
            break;
        case 'ucs4_array':
            // Already in the required format
            break;
        default:
            $this->_error('Unsupported input format: '.$encoding);
            return false;
    }

    // No input, no output, what else did you expect?
    if (empty($decoded)) return '';

    // Everything below works on an array of code points
    if (!is_array($decoded)) {
        $this->_error('Input could not be converted to a UCS-4 array');
        return false;
    }
    // Positions below are used as array_slice() offsets, so keys must be 0..n-1
    $decoded = array_values($decoded);

    // Anchors for iteration
    $last_begin = 0;
    // Output string
    $output = '';
    foreach ($decoded as $k => $v) {
        // Make sure to use just the plain dot
        switch ($v) {
            case 0x3002:
            case 0xFF0E:
            case 0xFF61:
                $decoded[$k] = 0x2E;
                // Right, no break here, the above are converted to dots anyway
            // Stumbling across an anchoring character
            case 0x2E:
            case 0x2F:
            case 0x3A:
            case 0x3F:
            case 0x40:
                // Neither email addresses nor URLs allowed in strict mode
                if ($this->_strict_mode) {
                    $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
                    return false;
                }
                // Skip first char: a separator at position 0 is not copied to the output
                if ($k) {
                    $part = array_slice($decoded, $last_begin, $k - $last_begin);
                    $encoded = $this->_encode($part);
                    $output .= ($encoded) ? $encoded : $this->_ucs4_to_utf8($part);
                    $output .= chr($decoded[$k]);
                }
                $last_begin = $k + 1;
        }
    }
    // Catch the rest of the string
    if ($last_begin) {
        $part = array_slice($decoded, $last_begin, count($decoded) - $last_begin);
        $encoded = $this->_encode($part);
        $output .= ($encoded) ? $encoded : $this->_ucs4_to_utf8($part);
        return $output;
    }
    // No separator was found: treat the whole input as a single part
    $output = $this->_encode($decoded);
    return ($output) ? $output : $this->_ucs4_to_utf8($decoded);
}
344 | |||
/**
 * Returns the most recently recorded error
 * @param    void
 * @return   mixed    Message of the last error that occurred; false if none was recorded yet
 * @access   public
 */
function get_last_error()
{
    $last_error = $this->_error;
    return $last_error;
}
355 | |||
/**
 * The actual decoding algorithm
 *
 * Decodes a single label carrying the Punycode prefix.
 *
 * @param    string   Encoded label, including the Punycode prefix
 * @return   mixed    Result of _ucs4_to_utf8() for the decoded code points;
 *                    false if the input is not a well-formed Punycode label
 * @access   private
 */
function _decode($encoded)
{
    $prefix = $this->_punycode_prefix;
    $prefix_len = strlen($prefix);

    // We do need to find the Punycode prefix
    if (strncmp($encoded, $prefix, $prefix_len) !== 0) {
        $this->_error('This is not a punycode string');
        return false;
    }
    // If nothing left after removing the prefix, it is hopeless
    // (compare lengths: a payload of "0" is not empty)
    $enco_len = strlen($encoded);
    if ($enco_len <= $prefix_len) {
        $this->_error('The given encoded string was empty');
        return false;
    }
    // Find last occurence of the delimiter; everything between the prefix
    // and the delimiter are basic code points that are copied as they are
    $decoded = array();
    $delim_pos = strrpos($encoded, '-');
    if ($delim_pos > $prefix_len) {
        for ($k = $prefix_len; $k < $delim_pos; ++$k) {
            $decoded[] = ord($encoded[$k]);
        }
    }
    $deco_len = count($decoded);

    // Wandering through the strings; init
    $is_first = true;
    $bias = $this->_initial_bias;
    $idx = 0;
    $char = $this->_initial_n;

    for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
        // Decode one generalized variable-length integer into $idx
        for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
            // Never read past the end of truncated input
            if ($enco_idx >= $enco_len) {
                $this->_error('The given encoded string is truncated');
                return false;
            }
            $digit = $this->_decode_digit($encoded[$enco_idx++]);
            if ($digit < 0 || $digit >= $this->_base) {
                $this->_error('The given encoded string contains an invalid character');
                return false;
            }
            $idx += $digit * $w;
            $t = ($k <= $bias) ? $this->_tmin :
                    (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
            if ($digit < $t) break;
            $w = (int) ($w * ($this->_base - $t));
        }
        $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
        $is_first = false;
        $char += (int) ($idx / ($deco_len + 1));
        $idx %= ($deco_len + 1);
        // Insert the decoded char at position $idx, moving the rest to the right
        array_splice($decoded, $idx, 0, array($char));
        $idx++;
    }
    return $this->_ucs4_to_utf8($decoded);
}
414 | |||
/**
 * The actual encoding algorithm
 *
 * Runs Nameprep on the given code points and encodes the result as a
 * Punycode label.
 *
 * @param    array    Code points of a single label
 * @return   mixed    Encoded label (with Punycode prefix if non-basic code points
 *                    are present); false if the input already carries the prefix,
 *                    contains nothing to encode or fails Nameprep
 * @access   private
 */
function _encode($decoded)
{
    // We cannot encode a domain name containing the Punycode prefix
    $extract = strlen($this->_punycode_prefix);
    $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
    $check_deco = array_slice($decoded, 0, $extract);

    if ($check_pref == $check_deco) {
        $this->_error('This is already a punycode string');
        return false;
    }
    // We will not try to encode strings consisting of basic code points only
    // (threshold kept from the original code: anything above 'z' counts)
    $encodable = false;
    foreach ($decoded as $v) {
        if ($v > 0x7a) {
            $encodable = true;
            break;
        }
    }
    if (!$encodable) {
        $this->_error('The given string does not contain encodable chars');
        return false;
    }

    // Do NAMEPREP
    $decoded = $this->_nameprep($decoded);
    if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

    $deco_len = count($decoded);
    if (!$deco_len) return false; // Empty array

    $codecount = 0; // How many chars have been consumed

    $encoded = '';
    // Copy all basic code points to output. Every code point below the
    // initial n (0x80) is basic. Testing for selected ASCII ranges only
    // would leave chars like ' ' or '_' unconsumed, and the loop below
    // would then never terminate.
    for ($i = 0; $i < $deco_len; ++$i) {
        if ($decoded[$i] < $this->_initial_n) {
            $encoded .= chr($decoded[$i]);
            $codecount++;
        }
    }
    if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

    // Start with the prefix; copy it to output
    $encoded = $this->_punycode_prefix.$encoded;

    // If we have basic code points in output, add an hyphen to the end
    if ($codecount) $encoded .= '-';

    // Now find and encode all non-basic code points
    $is_first = true;
    $cur_code = $this->_initial_n;
    $bias = $this->_initial_bias;
    $delta = 0;
    while ($codecount < $deco_len) {
        // Find the smallest code point >= the current code point
        for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
            if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                $next_code = $decoded[$i];
            }
        }

        $delta += ($next_code - $cur_code) * ($codecount + 1);
        $cur_code = $next_code;

        // Scan input again and encode all characters whose code point is $cur_code
        for ($i = 0; $i < $deco_len; $i++) {
            if ($decoded[$i] < $cur_code) {
                $delta++;
            } elseif ($decoded[$i] == $cur_code) {
                // Write $delta as a generalized variable-length integer
                for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                    $t = ($k <= $bias) ? $this->_tmin :
                            (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                    if ($q < $t) break;
                    $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
                    $q = (int) (($q - $t) / ($this->_base - $t));
                }
                $encoded .= $this->_encode_digit($q);
                $bias = $this->_adapt($delta, $codecount+1, $is_first);
                $codecount++;
                $delta = 0;
                $is_first = false;
            }
        }
        $delta++;
        $cur_code++;
    }
    return $encoded;
}
512 | |||
/**
 * Adapt the bias according to the current code point and position
 *
 * @param    integer  Delta that has just been encoded / decoded
 * @param    integer  Number of code points handled so far (including the current one)
 * @param    boolean  Whether this is the first adaptation for the string
 * @return   integer  The new bias
 * @access   private
 */
function _adapt($delta, $npoints, $is_first)
{
    $span = $this->_base - $this->_tmin;
    $threshold = ($span * $this->_tmax) / 2;

    // The very first delta is scaled down by the damping factor, later ones are halved
    if ($is_first) {
        $delta = intval($delta / $this->_damp);
    } else {
        $delta = intval($delta / 2);
    }
    $delta += intval($delta / $npoints);

    $k = 0;
    while ($delta > $threshold) {
        $delta = intval($delta / $span);
        $k += $this->_base;
    }
    return intval($k + ($span + 1) * $delta / ($delta + $this->_skew));
}
526 | |||
/**
 * Encoding a certain digit
 *
 * Values 0..25 map to 'a'..'z', values 26..35 map to '0'..'9'.
 *
 * @param    integer  Digit value
 * @return   string   Character representing the digit
 * @access   private
 */
function _encode_digit($d)
{
    // 97 is ord('a'); 22 is ord('0') - 26
    if ($d < 26) {
        $shift = 97;
    } else {
        $shift = 22;
    }
    return chr($d + $shift);
}
535 | |||
/**
 * Decode a certain digit
 *
 * '0'..'9' map to 26..35, 'A'..'Z' and 'a'..'z' map to 0..25. Every other
 * input yields the base, which is not a valid digit value. Each range is
 * checked on both ends; testing the upper bound alone would turn chars
 * like '-' or ':' into bogus digit values.
 *
 * @param    string   Character to decode (only the first byte is looked at)
 * @return   integer  Digit value, or the base for anything that is not a digit
 * @access   private
 */
function _decode_digit($cp)
{
    if (!is_string($cp) || $cp === '') {
        return $this->_base;
    }
    $cp = ord($cp);
    if ($cp >= 48 && $cp <= 57) { // 0-9
        return $cp - 22;
    }
    if ($cp >= 65 && $cp <= 90) { // A-Z
        return $cp - 65;
    }
    if ($cp >= 97 && $cp <= 122) { // a-z
        return $cp - 97;
    }
    return $this->_base;
}
545 | |||
/**
 * Internal error handling method
 *
 * Stores the message so that it can be fetched via {@link get_last_error()}.
 * A previously stored message is overwritten.
 *
 * @param    string   Error message
 * @access   private
 */
function _error($error = '')
{
    $message = $error;
    $this->_error = $message;
}
554 | |||
/**
 * Do Nameprep according to RFC3491 and RFC3454
 *
 * Uses the tables held in $NP: 'map_nothing', 'prohibit',
 * 'general_prohibited', 'prohibit_ranges' and 'replacemaps'.
 *
 * @param    array    Unicode Characters
 * @return   mixed    Array of Unicode Characters, Nameprep'd; false on prohibited input
 * @access   private
 */
function _nameprep($input)
{
    $output = array();
    //
    // Mapping
    // Walking through the input array, performing the required steps on each of
    // the input chars and putting the result into the output array
    // While mapping required chars we apply the canonical ordering
    foreach ($input as $v) {
        // Map to nothing == skip that code point
        if (in_array($v, $this->NP['map_nothing'])) continue;

        // Try to find prohibited input
        if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
            $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
            return false;
        }
        foreach ($this->NP['prohibit_ranges'] as $range) {
            if ($range[0] <= $v && $v <= $range[1]) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
        }
        //
        // Hangul syllable decomposition
        if (0xAC00 <= $v && $v <= 0xD7AF) {
            foreach ($this->_hangul_decompose($v) as $out) {
                $output[] = (int) $out;
            }
        // There's a decomposition mapping for that code point
        } elseif (isset($this->NP['replacemaps'][$v])) {
            foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                $output[] = (int) $out;
            }
        } else {
            $output[] = (int) $v;
        }
    }
    // Before applying any Combining, try to rearrange any Hangul syllables
    $output = $this->_hangul_compose($output);
    //
    // Combine code points
    //
    $last_class = 0;
    $last_starter = 0;
    $out_len = count($output);
    for ($i = 0; $i < $out_len; ++$i) {
        $class = $this->_get_combining_class($output[$i]);
        if ((!$last_class || $last_class > $class) && $class) {
            // Try to match
            $seq_len = $i - $last_starter;
            $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
            // On match: Replace the last starter with the composed character and remove
            // the now redundant non-starter(s)
            if ($out) {
                $output[$last_starter] = $out;
                // $out is a single code point. The original called count() on it,
                // which older PHP versions evaluated to 1 and which PHP 8 rejects
                // with a TypeError
                if (1 != $seq_len) {
                    for ($j = $i+1; $j < $out_len; ++$j) {
                        $output[$j-1] = $output[$j];
                    }
                    // NOTE(review): index $out_len is one past the last element, so
                    // this removes nothing; kept to leave the result unchanged - confirm intent
                    unset($output[$out_len]);
                }
                // Rewind the for loop by one, since there can be more possible compositions
                $i--;
                $out_len--;
                $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                continue;
            }
        }
        // The current class is 0
        if (!$class) $last_starter = $i;
        $last_class = $class;
    }
    return $output;
}
637 | |||
/**
 * Decomposes a Hangul syllable
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * All returned code points are integers. The divisions are truncated
 * before the base is added; casting only the base, as "(int) $base + $a / $b"
 * does, would return fractional floats.
 *
 * @param    integer  32bit UCS4 code point
 * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
 * @access   private
 */
function _hangul_decompose($char)
{
    $sindex = (int) $char - $this->_sbase;
    if ($sindex < 0 || $sindex >= $this->_scount) {
        return array($char);
    }
    $result = array();
    // Leading consonant
    $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
    // Vowel
    $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
    // Trailing consonant, only present if the syllable has one
    $T = intval($this->_tbase + $sindex % $this->_tcount);
    if ($T != $this->_tbase) $result[] = $T;
    return $result;
}
/**
 * Composes a Hangul syllable
 * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
 *
 * Walks through the sequence and merges a leading consonant followed by a
 * vowel into an LV syllable, and an LV syllable followed by a trailing
 * consonant into an LVT syllable. Everything else is copied unchanged.
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    UCS4 sequence with syllables composed
 * @access   private
 */
function _hangul_compose($input)
{
    $inp_len = count($input);
    if (!$inp_len) return array();
    $result = array();
    $last = (int) $input[0];
    $result[] = $last; // copy first char from input to output

    for ($i = 1; $i < $inp_len; ++$i) {
        $char = (int) $input[$i];
        $sindex = $last - $this->_sbase;
        $lindex = $last - $this->_lbase;
        $vindex = $char - $this->_vbase;
        $tindex = $char - $this->_tbase;
        // Find out, whether two current characters are LV and T
        // A trailing consonant has an index of 1 .. _tcount - 1: index 0 is the
        // base itself (no trailing consonant) and index _tcount would already
        // lead into the next LV syllable
        if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                && 0 < $tindex && $tindex < $this->_tcount) {
            // create syllable of form LVT
            $last += $tindex;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // Find out, whether two current characters form L and V
        if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
            // create syllable of form LV
            $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
            $result[(count($result) - 1)] = $last; // reset last
            continue; // discard char
        }
        // if neither case was true, just add the character
        $last = $char;
        $result[] = $char;
    }
    return $result;
}
700 | |||
/**
 * Looks up the combining class of a wide char in the 'norm_combcls' table
 * @param    integer  Wide char to check (32bit integer)
 * @return   integer  Combining class if the table has an entry, else 0
 * @access   private
 */
function _get_combining_class($char)
{
    if (isset($this->NP['norm_combcls'][$char])) {
        return $this->NP['norm_combcls'][$char];
    }
    return 0;
}
711 | |||
/**
 * Applies the canonical ordering of a decomposed UCS4 sequence
 *
 * Code points with a non-zero combining class are moved leftward past
 * neighbours with a higher combining class; passes are repeated until no
 * swap happens anymore.
 *
 * @param    array    Decomposed UCS4 sequence
 * @return   array    Ordered UCS4 sequence
 * @access   private
 */
function _apply_cannonical_ordering($input)
{
    $size = count($input);
    // Nothing to reorder; also avoids reading $input[0] of an empty array
    if ($size < 2) {
        return $input;
    }
    $swap = true;
    while ($swap) {
        $swap = false;
        $last = $this->_get_combining_class(intval($input[0]));
        for ($i = 0; $i < $size-1; ++$i) {
            $next = $this->_get_combining_class(intval($input[$i+1]));
            if ($next != 0 && $last > $next) {
                // Move item leftward until it fits
                for ($j = $i + 1; $j > 0; --$j) {
                    if ($this->_get_combining_class(intval($input[$j-1])) <= $next) break;
                    $t = intval($input[$j]);
                    $input[$j] = intval($input[$j-1]);
                    $input[$j-1] = $t;
                    $swap = true;
                }
                // Reentering the loop looking at the old character again
                $next = $last;
            }
            $last = $next;
        }
    }
    return $input;
}
744 | |||
/**
 * Do composition of a sequence of starter and non-starter
 *
 * Searches the 'replacemaps' table for an entry whose target sequence
 * equals the given sequence and returns the key of the first such entry.
 *
 * @param    array    UCS4 Decomposed sequence
 * @return   mixed    Code point (table key) of the first matching entry; false if there is none
 * @access   private
 */
function _combine($input)
{
    $inp_len = count($input);
    // An empty sequence cannot match; also avoids reading $input[0] below
    if (!$inp_len) {
        return false;
    }
    foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
        // Cheap checks first: first code point and length have to agree
        if ($np_target[0] != $input[0]) continue;
        if (count($np_target) != $inp_len) continue;
        $hit = true;
        foreach ($input as $k2 => $v2) {
            if ($v2 != $np_target[$k2]) {
                $hit = false;
                break;
            }
        }
        if ($hit) return $np_src;
    }
    return false;
}
770 | |||
771 | /**
|
||
772 | * This converts an UTF-8 encoded string to its UCS-4 representation
|
||
773 | * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
|
||
774 | * each of the "chars". This is due to PHP not being able to handle strings with
|
||
775 | * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
|
||
776 | * The following UTF-8 encodings are supported:
|
||
777 | * bytes bits representation
|
||
778 | * 1 7 0xxxxxxx
|
||
779 | * 2 11 110xxxxx 10xxxxxx
|
||
780 | * 3 16 1110xxxx 10xxxxxx 10xxxxxx
|
||
781 | * 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
782 | * 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
783 | * 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||
784 | * Each x represents a bit that can be used to store character data.
|
||
785 | * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
|
||
786 | * @access private
|
||
787 | */
|
||
788 | function _utf8_to_ucs4($input) |
||
789 | { |
||
790 | $output = array(); |
||
791 | $out_len = 0; |
||
792 | $inp_len = strlen($input); |
||
793 | $mode = 'next'; |
||
794 | $test = 'none'; |
||
795 | for ($k = 0; $k < $inp_len; ++$k) { |
||
796 | $v = ord($input{$k}); // Extract byte from input string |
||
797 | |||
798 | if ($v < 128) { // We found an ASCII char - put into stirng as is |
||
799 | $output[$out_len] = $v; |
||
800 | ++$out_len;
|
||
801 | if ('add' == $mode) { |
||
802 | $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k); |
||
803 | return false; |
||
804 | } |
||
805 | continue;
|
||
806 | } |
||
807 | if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char |
||
808 | $start_byte = $v; |
||
809 | $mode = 'add'; |
||
810 | $test = 'range'; |
||
811 | if ($v >> 5 == 6) { // &110xxxxx 10xxxxx |
||
812 | $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left |
||
813 | $v = ($v - 192) << 6; |
||
814 | } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx |
||
815 | $next_byte = 1; |
||
816 | $v = ($v - 224) << 12; |
||
817 | } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
||
818 | $next_byte = 2; |
||
819 | $v = ($v - 240) << 18; |
||
820 | } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
||
821 | $next_byte = 3; |
||
822 | $v = ($v - 248) << 24; |
||
823 | } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
||
824 | $next_byte = 4; |
||
825 | $v = ($v - 252) << 30; |
||
826 | } else {
|
||
827 | $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k); |
||
828 | return false; |
||
829 | } |
||
830 | if ('add' == $mode) { |
||
831 | $output[$out_len] = (int) $v; |
||
832 | ++$out_len;
|
||
833 | continue;
|
||
834 | } |
||
835 | } |
||
836 | if ('add' == $mode) { |
||
837 | if (!$this->_allow_overlong && $test == 'range') { |
||
838 | $test = 'none'; |
||
839 | if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) { |
||
840 | $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k); |
||
841 | return false; |
||
842 | } |
||
843 | } |
||
844 | if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx |
||
845 | $v = ($v - 128) << ($next_byte * 6); |
||
846 | $output[($out_len - 1)] += $v; |
||
847 | --$next_byte;
|
||
848 | } else {
|
||
849 | $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k); |
||
850 | return false; |
||
851 | } |
||
852 | if ($next_byte < 0) { |
||
853 | $mode = 'next'; |
||
854 | } |
||
855 | } |
||
856 | } // for
|
||
857 | return $output; |
||
858 | } |
||
859 | |||
/**
 * Convert UCS-4 string into UTF-8 string
 * See _utf8_to_ucs4() for details on the byte layouts (1 to 6 bytes per character)
 *
 * @param  array $input  UCS-4 "string", i.e. an array of integer code points
 * @return mixed  UTF-8 encoded string; false if a value does not fit into 31 bits,
 *                in which case the problem has been passed to $this->_error()
 * @access private
 */
function _ucs4_to_utf8($input)
{
    $utf8 = '';
    $position = 0; // 1-based position of the current value, used in the error message
    foreach ($input as $codepoint) {
        ++$position;

        // 7bit are transferred literally
        if ($codepoint < 128) {
            $utf8 .= chr($codepoint);
            continue;
        }

        // Pick the lead byte marker and the number of continuation bytes
        if ($codepoint < (1 << 11)) {        // 2 bytes
            $lead  = 192;
            $trail = 1;
        } elseif ($codepoint < (1 << 16)) {  // 3 bytes
            $lead  = 224;
            $trail = 2;
        } elseif ($codepoint < (1 << 21)) {  // 4 bytes
            $lead  = 240;
            $trail = 3;
        } elseif ($codepoint < (1 << 26)) {  // 5 bytes
            $lead  = 248;
            $trail = 4;
        } elseif ($codepoint < (1 << 31)) {  // 6 bytes
            $lead  = 252;
            $trail = 5;
        } else {
            $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$position);
            return false;
        }

        // The lead byte carries the topmost bits, every continuation byte the next 6 bits
        $utf8 .= chr($lead + ($codepoint >> (6 * $trail)));
        while ($trail-- > 0) {
            $utf8 .= chr(128 + (($codepoint >> (6 * $trail)) & 63));
        }
    }
    return $utf8;
}
||
896 | |||
/**
 * Convert UCS-4 array into UCS-4 string
 *
 * Every array value becomes 4 bytes of output, most significant byte first.
 *
 * @param  array $input  Array of integer code points
 * @return string  Byte string holding 4 bytes per input value
 * @access private
 */
function _ucs4_to_ucs4_string($input)
{
    $bytes = '';
    foreach ($input as $codepoint) {
        // Walk from the most significant byte down to the least significant one;
        // masking with 255 (&11111111) isolates a single byte
        foreach (array(24, 16, 8, 0) as $shift) {
            $bytes .= chr(($codepoint >> $shift) & 255);
        }
    }
    return $bytes;
}
||
912 | |||
/**
 * Convert UCS-4 string into UCS-4 array
 *
 * The input is read in groups of 4 bytes, most significant byte first; each group
 * yields one integer in the result.
 *
 * @param  string $input  Byte string whose length is a multiple of 4
 * @return mixed  Array of integers (empty for empty input); false if the length is
 *                not a multiple of 4, in which case the problem has been passed to
 *                $this->_error()
 * @access private
 */
function _ucs4_string_to_ucs4($input)
{
    $output = array();
    $inp_len = strlen($input);
    // Input length must be divisible by 4
    if ($inp_len % 4) {
        $this->_error('Input UCS4 string is broken');
        return false;
    }
    // Empty input simply skips the loop and returns the empty array.
    // Bracket syntax is used for the byte access: the curly-brace offset form
    // ($input{$i}) no longer parses on PHP 8.
    for ($i = 0; $i < $inp_len; $i += 4) {
        $output[] = (ord($input[$i]) << 24)
                  | (ord($input[$i + 1]) << 16)
                  | (ord($input[$i + 2]) << 8)
                  | ord($input[$i + 3]);
    }
    return $output;
}
||
939 | } |
||
940 | |||
/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // Historically this forwarded to $this->IC, a property this adapter neither
        // declares nor assigns; with the property unset that call is a fatal error.
        // Keep honouring a wrapped converter if one is present, otherwise use this
        // object itself, which already is a converter by inheritance.
        $target = (isset($this->IC) && is_object($this->IC)) ? $this->IC : $this;

        // NOTE(review): the parent class is not visible from this part of the file.
        // 'set_parameters' is the name used so far; 'set_parameter' is tried as well
        // because the setter may be named in the singular - confirm against
        // idna_convert and drop whichever name does not exist.
        foreach (array('set_parameters', 'set_parameter') as $method) {
            if (method_exists($target, $method)) {
                return $target->$method($option, $param);
            }
        }
        // No usable setter found: report failure as documented instead of crashing
        return false;
    }
}
||
968 | |||
969 | ?> |