Projet

Général

Profil

Paste
Télécharger (37,5 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / libraries / simplepie / idn / idna_convert.class.php @ 41cc1b08

1
<?php
2
// {{{ license
3

    
4
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4 foldmethod=marker: */
5
//
6
// +----------------------------------------------------------------------+
7
// | This library is free software; you can redistribute it and/or modify |
8
// | it under the terms of the GNU Lesser General Public License as       |
9
// | published by the Free Software Foundation; either version 2.1 of the |
10
// | License, or (at your option) any later version.                      |
11
// |                                                                      |
12
// | This library is distributed in the hope that it will be useful, but  |
13
// | WITHOUT ANY WARRANTY; without even the implied warranty of           |
14
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    |
15
// | Lesser General Public License for more details.                      |
16
// |                                                                      |
17
// | You should have received a copy of the GNU Lesser General Public     |
18
// | License along with this library; if not, write to the Free Software  |
19
// | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 |
20
// | USA.                                                                 |
21
// +----------------------------------------------------------------------+
22
//
23

    
24
// }}}
25

    
26
/**
27
 * Encode/decode Internationalized Domain Names.
28
 *
29
 * The class allows to convert internationalized domain names
30
 * (see RFC 3490 for details) as they can be used with various registries worldwide
31
 * to be translated between their original (localized) form and their encoded form
32
 * as it will be used in the DNS (Domain Name System).
33
 *
34
 * The class provides two public methods, encode() and decode(), which do exactly
35
 * what you would expect them to do. You are allowed to use complete domain names,
36
 * simple strings and complete email addresses as well. That means, that you might
37
 * use any of the following notations:
38
 *
39
 * - www.nörgler.com
40
 * - xn--nrgler-wxa
41
 * - xn--brse-5qa.xn--knrz-1ra.info
42
 *
43
 * Unicode input might be given as either UTF-8 string, UCS-4 string or UCS-4
44
 * array. Unicode output is available in the same formats.
45
 * You can select your preferred format via {@link set_parameter()}.
46
 *
47
 * ACE input and output is always expected to be ASCII.
48
 *
49
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
50
 * @copyright 2004-2007 phlyLabs Berlin, http://phlylabs.de
51
 * @version 0.5.1
52
 *
53
 */
54
class idna_convert
55
{
56
    /**
57
     * Holds all relevant mapping tables, loaded from a separate file on construct
58
     * See RFC3454 for details
59
     *
60
     * @var array
61
     * @access private
62
     */
63
    var $NP = array();
64

    
65
    // Internal settings, do not mess with them
66
    var $_punycode_prefix = 'xn--';
67
    var $_invalid_ucs =     0x80000000;
68
    var $_max_ucs =         0x10FFFF;
69
    var $_base =            36;
70
    var $_tmin =            1;
71
    var $_tmax =            26;
72
    var $_skew =            38;
73
    var $_damp =            700;
74
    var $_initial_bias =    72;
75
    var $_initial_n =       0x80;
76
    var $_sbase =           0xAC00;
77
    var $_lbase =           0x1100;
78
    var $_vbase =           0x1161;
79
    var $_tbase =           0x11A7;
80
    var $_lcount =          19;
81
    var $_vcount =          21;
82
    var $_tcount =          28;
83
    var $_ncount =          588;   // _vcount * _tcount
84
    var $_scount =          11172; // _lcount * _tcount * _vcount
85
    var $_error =           false;
86

    
87
    // See {@link set_parameter()} for details of how to change the following
88
    // settings from within your script / application
89
    var $_api_encoding   =  'utf8'; // Default input charset is UTF-8
90
    var $_allow_overlong =  false;  // Overlong UTF-8 encodings are forbidden
91
    var $_strict_mode    =  false;  // Behave strict or not
92

    
93
    /**
     * PHP 5+ constructor.
     *
     * PHP 8 no longer treats a method named like its class as the constructor,
     * so without this method the NAMEPREP tables would never be loaded.
     * It simply delegates to the legacy constructor below.
     *
     * @param    mixed    Optional array of Parameter => Value pairs, see {@link set_parameter()}
     * @access   public
     */
    function __construct($options = false)
    {
        $this->idna_convert($options);
    }

    /**
     * Legacy (PHP 4 style) constructor, kept so that existing code calling
     * it explicitly keeps working.
     *
     * Loads the serialized NAMEPREP tables from npdata.ser (located next to
     * this file) into $this->NP and applies the given options, if any.
     *
     * @param    mixed     Optional array of Parameter => Value pairs, see {@link set_parameter()}
     * @return   boolean   Result of set_parameter() if options were given, true otherwise
     * @access   public
     */
    function idna_convert($options = false)
    {
        // NOTE(review): $slast is not declared as a class property in the
        // visible part of the file; it is kept as is for compatibility.
        $this->slast = $this->_sbase + $this->_lcount * $this->_vcount * $this->_tcount;

        $np_file = dirname(__FILE__).'/npdata.ser';
        if (function_exists('file_get_contents')) {
            $raw = file_get_contents($np_file);
        } else {
            $lines = file($np_file);
            $raw   = ($lines === false) ? false : join('', $lines);
        }

        // Do not store a failed read / unserialize result as mapping table
        $tables = ($raw === false) ? false : unserialize($raw);
        if (is_array($tables)) {
            $this->NP = $tables;
        } else {
            $this->NP = array();
            $this->_error('Unable to load the NAMEPREP tables from '.$np_file);
        }

        // If parameters are given, pass these to the respective method
        if (is_array($options)) {
            return $this->set_parameter($options);
        }
        return true;
    }
108

    
109
    /**
     * Sets one or more option values. Available options and values:
     * [encoding - Unicode format used by the public API: 'utf8' for UTF-8,
     *             'ucs4_string' for UCS4 as string, 'ucs4_array' for UCS4 as array.
     *             It is the input format of encode() and the output format of decode().]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * Option names and encoding values are compared strictly, so that values
     * like true or 0 are not mistaken for a valid name.
     * Processing stops at the first unknown option or value; options handled
     * before that one stay applied.
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function set_parameter($option, $value = false)
    {
        if (!is_array($option)) {
            $option = array($option => $value);
        }
        foreach ($option as $name => $setting) {
            if ($name === 'encoding') {
                if (!in_array($setting, array('utf8', 'ucs4_string', 'ucs4_array'), true)) {
                    $this->_error('Set Parameter: Unknown parameter '.$setting.' for option '.$name);
                    return false;
                }
                $this->_api_encoding = $setting;
            } elseif ($name === 'overlong') {
                $this->_allow_overlong = ($setting) ? true : false;
            } elseif ($name === 'strict') {
                $this->_strict_mode = ($setting) ? true : false;
            } else {
                $this->_error('Set Parameter: Unknown option '.$name);
                return false;
            }
        }
        return true;
    }
157

    
158
    /**
     * Decode a given ACE domain name.
     *
     * The input may be a single label, an email address or a complete URL.
     * In strict mode only a single label is accepted. Labels that cannot be
     * decoded are left untouched.
     *
     * @param    string   Domain name (ACE string)
     * [@param    string   Desired output encoding, see {@link set_parameter}]
     * @return   mixed    Decoded domain name in the selected format
     *                    (UTF-8 string, UCS-4 string or UCS-4 array), false on error
     * @access   public
     */
    function decode($input, $one_time_encoding = false)
    {
        // A one time encoding, if given, must be one of the known formats
        if ($one_time_encoding
                && !in_array($one_time_encoding, array('utf8', 'ucs4_string', 'ucs4_array'))) {
            $this->_error('Unknown encoding '.$one_time_encoding);
            return false;
        }

        // Make sure to drop any newline characters around
        $input = trim($input);

        // Matches labels starting with the Punycode prefix
        $ace_pattern = '!^'.preg_quote($this->_punycode_prefix, '!').'!';

        if (strpos($input, '@')) {
            // Maybe it is an email address - not allowed in strict mode
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            $pieces = explode('@', $input, 2);

            // Domain part first, then the local part
            $domain_labels = explode('.', $pieces[1]);
            foreach ($domain_labels as $pos => $label) {
                if (!preg_match($ace_pattern, $label)) {
                    continue;
                }
                $plain = $this->_decode($label);
                if ($plain) {
                    $domain_labels[$pos] = $plain;
                }
            }
            $local_labels = explode('.', $pieces[0]);
            foreach ($local_labels as $pos => $label) {
                if (!preg_match($ace_pattern, $label)) {
                    continue;
                }
                $plain = $this->_decode($label);
                if ($plain) {
                    $local_labels[$pos] = $plain;
                }
            }
            $result = implode('.', $local_labels) . '@' . implode('.', $domain_labels);
        } elseif (preg_match('![:\./]!', $input)) {
            // A complete domain name (with or without paths / parameters)
            // is not allowed in strict mode either
            if ($this->_strict_mode) {
                $this->_error('Only simple domain name parts can be handled in strict mode');
                return false;
            }
            $parsed = parse_url($input);
            if (isset($parsed['host'])) {
                $labels = explode('.', $parsed['host']);
                foreach ($labels as $pos => $label) {
                    $plain = $this->_decode($label);
                    if ($plain) {
                        $labels[$pos] = $plain;
                    }
                }
                $parsed['host'] = implode('.', $labels);

                // Put the URL together again, piece by piece
                $result = '';
                if (!empty($parsed['scheme'])) {
                    $result .= $parsed['scheme'];
                    $result .= (strtolower($parsed['scheme']) == 'mailto') ? ':' : '://';
                }
                if (!empty($parsed['user'])) {
                    $result .= $parsed['user'];
                    if (!empty($parsed['pass'])) {
                        $result .= ':'.$parsed['pass'];
                    }
                    $result .= '@';
                }
                $result .= $parsed['host'];
                if (!empty($parsed['port'])) {
                    $result .= ':'.$parsed['port'];
                }
                if (!empty($parsed['path'])) {
                    $result .= $parsed['path'];
                }
                if (!empty($parsed['query'])) {
                    $result .= '?'.$parsed['query'];
                }
                if (!empty($parsed['fragment'])) {
                    $result .= '#'.$parsed['fragment'];
                }
            } else {
                // parse_url() did not yield a host, split the input on dots instead
                $labels = explode('.', $input);
                foreach ($labels as $pos => $label) {
                    $plain = $this->_decode($label);
                    $labels[$pos] = ($plain) ? $plain : $label;
                }
                $result = implode('.', $labels);
            }
        } else {
            // Otherwise we consider it being a pure domain name string
            $result = $this->_decode($input);
            if (!$result) {
                $result = $input;
            }
        }

        // $result is UTF-8 at this point; convert it if another format is wanted.
        // The one time encoding takes precedence over the object's property.
        $format = ($one_time_encoding) ? $one_time_encoding : $this->_api_encoding;
        if ($format == 'utf8') {
            return $result;
        }
        if ($format == 'ucs4_string') {
            return $this->_ucs4_to_ucs4_string($this->_utf8_to_ucs4($result));
        }
        if ($format == 'ucs4_array') {
            return $this->_utf8_to_ucs4($result);
        }
        $this->_error('Unsupported output format');
        return false;
    }
259

    
260
    /**
     * Encode a given Unicode domain name.
     *
     * The input is split at the separator characters '.', '/', ':', '?' and '@'
     * (the ideographic / fullwidth / halfwidth full stops are turned into a
     * plain dot first). Each part is encoded on its own; parts that cannot be
     * encoded are passed through as UTF-8. In strict mode any separator
     * character causes an error.
     *
     * @param    mixed    Domain name (UTF-8 string, UCS-4 string or UCS-4 array)
     * [@param    string   Desired input encoding, see {@link set_parameter}]
     * @return   mixed    Encoded domain name (ACE string), false on error
     * @access   public
     */
    function encode($decoded, $one_time_encoding = false)
    {
        // Force the input into an UCS4 array.
        // The one time encoding takes precedence over the object's property.
        $format = $one_time_encoding ? $one_time_encoding : $this->_api_encoding;
        if ($format == 'utf8') {
            $decoded = $this->_utf8_to_ucs4($decoded);
        } elseif ($format == 'ucs4_string') {
            $decoded = $this->_ucs4_string_to_ucs4($decoded);
        } elseif ($format != 'ucs4_array') {
            $this->_error('Unsupported input format: '.$format);
            return false;
        }

        // No input, no output, what else did you expect?
        if (empty($decoded)) {
            return '';
        }

        $dot_variants = array(0x3002, 0xFF0E, 0xFF61);
        $separators   = array(0x2E, 0x2F, 0x3A, 0x3F, 0x40);

        $last_begin = 0;  // Index at which the current part starts
        $output     = ''; // Output string
        foreach ($decoded as $k => $v) {
            if (in_array($v, $dot_variants)) {
                // Make sure to use just the plain dot
                $decoded[$k] = 0x2E;
            } elseif (!in_array($v, $separators)) {
                // Not an anchoring character, carry on
                continue;
            }

            // Neither email addresses nor URLs allowed in strict mode
            if ($this->_strict_mode) {
                $this->_error('Neither email addresses nor URLs are allowed in strict mode.');
                return false;
            }

            // A separator at the very first position is skipped
            if ($k) {
                $part    = array_slice($decoded, $last_begin, (($k)-$last_begin));
                $encoded = $this->_encode($part);
                $output .= ($encoded) ? $encoded : $this->_ucs4_to_utf8($part);
                $output .= chr($decoded[$k]);
            }
            $last_begin = $k + 1;
        }

        // No part boundary was recorded: handle the input as a whole
        if (!$last_begin) {
            $output = $this->_encode($decoded);
            if ($output) {
                return $output;
            }
            return $this->_ucs4_to_utf8($decoded);
        }

        // Catch the rest of the string
        $inp_len = sizeof($decoded);
        $part    = array_slice($decoded, $last_begin, (($inp_len)-$last_begin));
        $encoded = $this->_encode($part);
        $output .= ($encoded) ? $encoded : $this->_ucs4_to_utf8($part);
        return $output;
    }
344

    
345
    /**
     * Returns the most recently recorded error.
     *
     * @param    void
     * @return   mixed    Message of the last error that occurred,
     *                    or false if none has been recorded
     * @access   public
     */
    function get_last_error()
    {
        $last_error = $this->_error;
        return $last_error;
    }
355

    
356
    /**
     * The actual decoding algorithm (Punycode, see RFC 3492).
     *
     * @param    string   A single ACE label, including the Punycode prefix
     * @return   mixed    Result of _ucs4_to_utf8() for the decoded label, or
     *                    false (with the error set) if the input is not a
     *                    well-formed Punycode label
     * @access   private
     */
    function _decode($encoded)
    {
        $prefix_len = strlen($this->_punycode_prefix);

        // We do need to find the Punycode prefix
        if (!preg_match('!^'.preg_quote($this->_punycode_prefix, '!').'!', $encoded)) {
            $this->_error('This is not a punycode string');
            return false;
        }
        // If nothing is left after removing the prefix, it is hopeless.
        // Lengths are compared instead of testing the remainder for
        // truthiness, since a remainder of "0" is falsy but not empty.
        $enco_len = strlen($encoded);
        if ($enco_len <= $prefix_len) {
            $this->_error('The given encoded string was empty');
            return false;
        }

        // Find the last occurrence of the delimiter; everything between the
        // prefix and that delimiter are basic code points, copied literally
        $delim_pos = strrpos($encoded, '-');
        $decoded   = array();
        if ($delim_pos > $prefix_len) {
            for ($k = $prefix_len; $k < $delim_pos; ++$k) {
                $decoded[] = ord($encoded[$k]);
            }
        }
        $deco_len = count($decoded);

        // Wandering through the strings; init
        $is_first = true;
        $bias     = $this->_initial_bias;
        $idx      = 0;
        $char     = $this->_initial_n;

        for ($enco_idx = ($delim_pos) ? ($delim_pos + 1) : 0; $enco_idx < $enco_len; ++$deco_len) {
            // Decode one generalized variable-length integer into $idx
            for ($old_idx = $idx, $w = 1, $k = $this->_base; 1 ; $k += $this->_base) {
                // Never read beyond the end of the input
                if ($enco_idx >= $enco_len) {
                    $this->_error('The given punycode string is truncated');
                    return false;
                }
                $digit = $this->_decode_digit($encoded[$enco_idx++]);
                // Anything outside 0 .. base - 1 is not a Punycode digit
                if ($digit < 0 || $digit >= $this->_base) {
                    $this->_error('The given punycode string contains an invalid character');
                    return false;
                }
                $idx += $digit * $w;
                $t = ($k <= $bias) ? $this->_tmin :
                        (($k >= $bias + $this->_tmax) ? $this->_tmax : ($k - $bias));
                if ($digit < $t) break;
                $w = (int) ($w * ($this->_base - $t));
            }
            $bias = $this->_adapt($idx - $old_idx, $deco_len + 1, $is_first);
            $is_first = false;
            $char += (int) ($idx / ($deco_len + 1));
            $idx %= ($deco_len + 1);
            // Insert the decoded char at position $idx, moving the rest up
            array_splice($decoded, $idx, 0, array($char));
            $idx++;
        }
        return $this->_ucs4_to_utf8($decoded);
    }
414

    
415
    /**
     * The actual encoding algorithm (NAMEPREP followed by Punycode, see RFC 3492).
     *
     * @param    array    A single label as UCS4 array
     * @return   mixed    The encoded label as string. false (with the error set)
     *                    if the label already starts with the Punycode prefix,
     *                    holds no code point above 0x7A, or cannot be
     *                    prepared / encoded
     * @access   private
     */
    function _encode($decoded)
    {
        // We cannot encode a domain name containing the Punycode prefix
        $extract    = strlen($this->_punycode_prefix);
        $check_pref = $this->_utf8_to_ucs4($this->_punycode_prefix);
        $check_deco = array_slice($decoded, 0, $extract);

        if ($check_pref == $check_deco) {
            $this->_error('This is already a punycode string');
            return false;
        }
        // We will not try to encode strings without any code point above 0x7A
        $encodable = false;
        foreach ($decoded as $k => $v) {
            if ($v > 0x7a) {
                $encodable = true;
                break;
            }
        }
        if (!$encodable) {
            $this->_error('The given string does not contain encodable chars');
            return false;
        }

        // Do NAMEPREP
        $decoded = $this->_nameprep($decoded);
        if (!$decoded || !is_array($decoded)) return false; // NAMEPREP failed

        $deco_len  = count($decoded);
        if (!$deco_len) return false; // Empty array

        $codecount = 0; // How many chars have been consumed

        $encoded = '';
        // Copy all basic code points to output. Every code point below the
        // initial n (0x80) is a basic one. The main loop below only handles
        // code points >= 0x80, so any basic code point skipped here would
        // never be consumed and the loop would not terminate.
        for ($i = 0; $i < $deco_len; ++$i) {
            if ($decoded[$i] < $this->_initial_n) {
                $encoded .= chr($decoded[$i]);
                $codecount++;
            }
        }
        if ($codecount == $deco_len) return $encoded; // All codepoints were basic ones

        // Start with the prefix; copy it to output
        $encoded = $this->_punycode_prefix.$encoded;

        // If we have basic code points in output, add a hyphen to the end
        if ($codecount) $encoded .= '-';

        // Now find and encode all non-basic code points
        $is_first  = true;
        $cur_code  = $this->_initial_n;
        $bias      = $this->_initial_bias;
        $delta     = 0;
        while ($codecount < $deco_len) {
            // Find the smallest code point >= the current code point
            $found = false;
            for ($i = 0, $next_code = $this->_max_ucs; $i < $deco_len; $i++) {
                if ($decoded[$i] >= $cur_code && $decoded[$i] <= $next_code) {
                    $next_code = $decoded[$i];
                    $found = true;
                }
            }
            // Nothing left within the valid range although input remains:
            // give up instead of looping forever
            if (!$found) {
                $this->_error('The given string contains code points that cannot be encoded');
                return false;
            }

            $delta += ($next_code - $cur_code) * ($codecount + 1);
            $cur_code = $next_code;

            // Scan input again and encode all characters whose code point is $cur_code
            for ($i = 0; $i < $deco_len; $i++) {
                if ($decoded[$i] < $cur_code) {
                    $delta++;
                } elseif ($decoded[$i] == $cur_code) {
                    // Write $delta as generalized variable-length integer
                    for ($q = $delta, $k = $this->_base; 1; $k += $this->_base) {
                        $t = ($k <= $bias) ? $this->_tmin :
                                (($k >= $bias + $this->_tmax) ? $this->_tmax : $k - $bias);
                        if ($q < $t) break;
                        $encoded .= $this->_encode_digit(intval($t + (($q - $t) % ($this->_base - $t)))); //v0.4.5 Changed from ceil() to intval()
                        $q = (int) (($q - $t) / ($this->_base - $t));
                    }
                    $encoded .= $this->_encode_digit($q);
                    $bias = $this->_adapt($delta, $codecount+1, $is_first);
                    $codecount++;
                    $delta = 0;
                    $is_first = false;
                }
            }
            $delta++;
            $cur_code++;
        }
        return $encoded;
    }
512

    
513
    /**
     * Computes the new bias from the delta just encoded / decoded.
     *
     * @param    integer   Delta of the current step
     * @param    integer   Number of code points handled so far (including the current one)
     * @param    boolean   Whether this is the very first adaption (damps the delta)
     * @return   integer   The adapted bias
     * @access   private
     */
    function _adapt($delta, $npoints, $is_first)
    {
        $span  = $this->_base - $this->_tmin;
        $limit = ($span * $this->_tmax) / 2;

        if ($is_first) {
            $scaled = intval($delta / $this->_damp);
        } else {
            $scaled = intval($delta / 2);
        }
        $scaled += intval($scaled / $npoints);

        // Scale the delta down until it fits, adding one base per step
        $offset = 0;
        while ($scaled > $limit) {
            $scaled  = intval($scaled / $span);
            $offset += $this->_base;
        }
        return intval($offset + ($span + 1) * $scaled / ($scaled + $this->_skew));
    }
526

    
527
    /**
     * Maps a digit value to its character:
     * values below 26 yield chr(value + 97) ('a' for 0, 'z' for 25),
     * all other values yield chr(value + 22) ('0' for 26, '9' for 35).
     *
     * @param    integer  Digit value
     * @return   string   The single character standing for that value
     * @access   private
     */
    function _encode_digit($d)
    {
        if ($d < 26) {
            return chr($d + 97);
        }
        return chr($d + 22);
    }
535

    
536
    /**
     * Maps a character to its digit value:
     * '0'-'9' yield 26-35, 'A'-'Z' and 'a'-'z' both yield 0-25.
     *
     * Note that the three range tests only check the upper bound:
     * - every byte value below 58 (not just '0'-'9') yields value - 22,
     * - the byte values 58-64 and 91-96 yield a negative number,
     * - only byte values above 122 yield the base.
     *
     * @param    string   Character to decode (only the first byte is looked at)
     * @return   integer  The digit value as described above
     * @access   private
     */
    function _decode_digit($cp)
    {
        $code = ord($cp);
        if ($code - 48 < 10) {
            return $code - 22;
        }
        if ($code - 65 < 26) {
            return $code - 65;
        }
        if ($code - 97 < 26) {
            return $code - 97;
        }
        return $this->_base;
    }
545

    
546
    /**
     * Internal error handling method: remembers the given message so that it
     * can be fetched via {@link get_last_error()}. A previously stored
     * message is overwritten.
     *
     * @param    string   The error message (defaults to an empty string)
     * @access   private
     */
    function _error($error = '')
    {
        $message = $error;
        $this->_error = $message;
    }
554

    
555
    /**
     * Do Nameprep according to RFC3491 and RFC3454
     *
     * Works in three steps: mapping (including the check for prohibited
     * input and the decomposition of Hangul syllables), re-composition of
     * Hangul syllables and finally the combining of code points.
     *
     * @param    array    Unicode Characters (UCS4 array)
     * @return   mixed    Unicode Characters, Nameprep'd (UCS4 array);
     *                    false (with the error set) on prohibited input
     * @access   private
     */
    function _nameprep($input)
    {
        $output = array();
        //
        // Mapping
        // Walking through the input array, performing the required steps on each of
        // the input chars and putting the result into the output array
        // While mapping required chars we apply the canonical ordering
        foreach ($input as $v) {
            // Map to nothing == skip that code point
            if (in_array($v, $this->NP['map_nothing'])) continue;

            // Try to find prohibited input
            if (in_array($v, $this->NP['prohibit']) || in_array($v, $this->NP['general_prohibited'])) {
                $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                return false;
            }
            foreach ($this->NP['prohibit_ranges'] as $range) {
                if ($range[0] <= $v && $v <= $range[1]) {
                    $this->_error('NAMEPREP: Prohibited input U+'.sprintf('%08X', $v));
                    return false;
                }
            }
            //
            // Hangul syllable decomposition
            if (0xAC00 <= $v && $v <= 0xD7AF) {
                foreach ($this->_hangul_decompose($v) as $out) {
                    $output[] = (int) $out;
                }
            // There's a decomposition mapping for that code point
            } elseif (isset($this->NP['replacemaps'][$v])) {
                foreach ($this->_apply_cannonical_ordering($this->NP['replacemaps'][$v]) as $out) {
                    $output[] = (int) $out;
                }
            } else {
                $output[] = (int) $v;
            }
        }
        // Before applying any Combining, try to rearrange any Hangul syllables
        $output = $this->_hangul_compose($output);
        //
        // Combine code points
        //
        $last_class   = 0;
        $last_starter = 0;
        $out_len      = count($output);
        for ($i = 0; $i < $out_len; ++$i) {
            $class = $this->_get_combining_class($output[$i]);
            if ((!$last_class || $last_class > $class) && $class) {
                // Try to match
                $seq_len = $i - $last_starter;
                $out = $this->_combine(array_slice($output, $last_starter, $seq_len));
                // On match: Replace the last starter with the composed character and remove
                // the now redundant non-starter(s)
                if ($out) {
                    $output[$last_starter] = $out;
                    // $out is a single code point (an integer key of the
                    // replacement map), so its "length" is always 1. This
                    // used to be count($out), which yielded 1 as well but
                    // is not allowed on integers in current PHP versions.
                    if (1 != $seq_len) {
                        for ($j = $i+1; $j < $out_len; ++$j) {
                            $output[$j-1] = $output[$j];
                        }
                        // NOTE(review): this index lies one past the last
                        // element, so the unset() looks like a no-op - confirm
                        unset($output[$out_len]);
                    }
                    // Rewind the for loop by one, since there can be more possible compositions
                    $i--;
                    $out_len--;
                    $last_class = ($i == $last_starter) ? 0 : $this->_get_combining_class($output[$i-1]);
                    continue;
                }
            }
            // The current class is 0
            if (!$class) $last_starter = $i;
            $last_class = $class;
        }
        return $output;
    }
637

    
638
    /**
     * Decomposes a Hangul syllable into its leading consonant, vowel and
     * (if there is one) trailing consonant
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     *
     * @param    integer  32bit UCS4 code point
     * @return   array    Either Hangul Syllable decomposed or original 32bit value as one value array
     * @access   private
     */
    function _hangul_decompose($char)
    {
        $sindex = (int) $char - $this->_sbase;
        if ($sindex < 0 || $sindex >= $this->_scount) {
            return array($char);
        }
        $result = array();
        // The quotient has to be truncated as a whole. Casting only the base
        // value (as "(int) $base + $a / $b" does) leaves a fractional float.
        $result[] = $this->_lbase + (int) ($sindex / $this->_ncount);
        $result[] = $this->_vbase + (int) (($sindex % $this->_ncount) / $this->_tcount);
        $T = intval($this->_tbase + $sindex % $this->_tcount);
        // A result equal to the base value means: no trailing consonant
        if ($T != $this->_tbase) $result[] = $T;
        return $result;
    }
658
    /**
     * Composes Hangul syllables
     * (see http://www.unicode.org/unicode/reports/tr15/#Hangul)
     *
     * Walks through the sequence and merges a leading consonant with a
     * following vowel (L + V => LV) and an LV syllable with a following
     * trailing consonant (LV + T => LVT).
     *
     * @param    array    Decomposed UCS4 sequence
     * @return   array    UCS4 sequence with syllables composed
     * @access   private
     */
    function _hangul_compose($input)
    {
        $inp_len = count($input);
        if (!$inp_len) return array();
        $result = array();
        $last = (int) $input[0];
        $result[] = $last; // copy first char from input to output

        for ($i = 1; $i < $inp_len; ++$i) {
            $char = (int) $input[$i];
            $sindex = $last - $this->_sbase;
            $lindex = $last - $this->_lbase;
            $vindex = $char - $this->_vbase;
            $tindex = $char - $this->_tbase;
            // Find out, whether two current characters are LV and T.
            // The bounds on $tindex are exclusive on both ends: the base
            // value itself (index 0) stands for "no trailing consonant",
            // and adding the full count would lead to the next LV syllable.
            if (0 <= $sindex && $sindex < $this->_scount && ($sindex % $this->_tcount == 0)
                    && 0 < $tindex && $tindex < $this->_tcount) {
                // create syllable of form LVT
                $last += $tindex;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // Find out, whether two current characters form L and V
            if (0 <= $lindex && $lindex < $this->_lcount && 0 <= $vindex && $vindex < $this->_vcount) {
                // create syllable of form LV
                $last = (int) $this->_sbase + ($lindex * $this->_vcount + $vindex) * $this->_tcount;
                $result[(count($result) - 1)] = $last; // reset last
                continue; // discard char
            }
            // if neither case was true, just add the character
            $last = $char;
            $result[] = $char;
        }
        return $result;
    }
700

    
701
    /**
     * Looks up the combining class of a wide char in the 'norm_combcls' table
     *
     * @param    integer    Wide char to check (32bit integer)
     * @return   integer    Combining class if found, else 0
     * @access   private
     */
    function _get_combining_class($char)
    {
        if (isset($this->NP['norm_combcls'][$char])) {
            return $this->NP['norm_combcls'][$char];
        }
        return 0;
    }
711

    
712
    /**
     * Applies the canonical ordering to a decomposed UCS4 sequence:
     * a code point with a non-zero combining class is moved leftward past
     * all direct predecessors having a higher combining class. Passes are
     * repeated until nothing has been moved any more.
     *
     * @param    array      Decomposed UCS4 sequence
     * @return   array      Ordered UCS4 sequence
     * @access   private
     */
    function _apply_cannonical_ordering($input)
    {
        $length = count($input);
        do {
            $moved      = false;
            $prev_class = $this->_get_combining_class(intval($input[0]));
            for ($pos = 1; $pos < $length; ++$pos) {
                $cur_class = $this->_get_combining_class(intval($input[$pos]));
                if ($cur_class == 0 || $prev_class <= $cur_class) {
                    // Already in order, carry on with the next pair
                    $prev_class = $cur_class;
                    continue;
                }
                // Move item leftward until it fits
                for ($j = $pos; $j > 0; --$j) {
                    if ($this->_get_combining_class(intval($input[$j - 1])) <= $cur_class) {
                        break;
                    }
                    $moving        = intval($input[$j]);
                    $input[$j]     = intval($input[$j - 1]);
                    $input[$j - 1] = $moving;
                    $moved = true;
                }
                // $prev_class is deliberately left untouched here, so the
                // next comparison is made against the old character again
            }
        } while ($moved);
        return $input;
    }
744

    
745
    /**
     * Try to compose a sequence of starter and non-starter into one char
     *
     * Searches the 'replacemaps' table of the NP data array for an entry
     * whose value has the same length as the input and equals it element by
     * element (loose comparison). The key of the first such entry is returned.
     *
     * @param    array      UCS4 decomposed sequence
     * @return   mixed      Key of the matching 'replacemaps' entry, false if nothing matches
     * @access   private
     */
    function _combine($input)
    {
        $inp_len = count($input);
        // An empty sequence cannot match any mapping; bail out before offset 0 is read
        if ($inp_len == 0) {
            return false;
        }
        foreach ($this->NP['replacemaps'] as $np_src => $np_target) {
            // Cheap rejections first: differing first element or differing length
            if ($np_target[0] != $input[0]) continue;
            if (count($np_target) != $inp_len) continue;
            $hit = true;
            foreach ($input as $k2 => $v2) {
                if ($v2 != $np_target[$k2]) {
                    $hit = false;
                    break;
                }
            }
            if ($hit) return $np_src;
        }
        return false;
    }
770

    
771
    /**
     * This converts an UTF-8 encoded string to its UCS-4 representation
     * By talking about UCS-4 "strings" we mean arrays of 32bit integers representing
     * each of the "chars". This is due to PHP not being able to handle strings with
     * bit depth different from 8. This applies to the reverse method _ucs4_to_utf8(), too.
     * The following UTF-8 encodings are supported:
     * bytes bits  representation
     * 1        7  0xxxxxxx
     * 2       11  110xxxxx 10xxxxxx
     * 3       16  1110xxxx 10xxxxxx 10xxxxxx
     * 4       21  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 5       26  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 6       31  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * Each x represents a bit that can be used to store character data.
     * The five and six byte sequences are part of Annex D of ISO/IEC 10646-1:2000
     *
     * Malformed input (a stray continuation byte, an ASCII byte inside a
     * multi-byte sequence, an out-of-range second byte while _allow_overlong
     * is off, or input ending inside a multi-byte sequence) is reported
     * through _error() and makes the method return false.
     *
     * @param    string     UTF-8 encoded input
     * @return   mixed      Array of integers (one per char), false on malformed input
     * @access   private
     */
    function _utf8_to_ucs4($input)
    {
        $output = array();
        $out_len = 0;
        $inp_len = strlen($input);
        $mode = 'next'; // 'next': expecting a start byte, 'add': expecting a continuation byte
        $test = 'none'; // 'range': the byte following a start byte still needs its range check
        for ($k = 0; $k < $inp_len; ++$k) {
            // Square-bracket offsets: the curly-brace form is a parse error since PHP 8.0
            $v = ord($input[$k]); // Extract byte from input string

            if ($v < 128) { // We found an ASCII char - put into string as is
                if ('add' == $mode) {
                    // ASCII byte where a continuation byte was required
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                $output[$out_len] = $v;
                ++$out_len;
                continue;
            }
            if ('next' == $mode) { // Try to find the next start byte; determine the width of the Unicode char
                $start_byte = $v;
                $mode = 'add';
                $test = 'range';
                if ($v >> 5 == 6) { // &110xxxxx 10xxxxxx
                    $next_byte = 0; // Tells, how many times subsequent bitmasks must rotate 6bits to the left
                    $v = ($v - 192) << 6;
                } elseif ($v >> 4 == 14) { // &1110xxxx 10xxxxxx 10xxxxxx
                    $next_byte = 1;
                    $v = ($v - 224) << 12;
                } elseif ($v >> 3 == 30) { // &11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 2;
                    $v = ($v - 240) << 18;
                } elseif ($v >> 2 == 62) { // &111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 3;
                    $v = ($v - 248) << 24;
                } elseif ($v >> 1 == 126) { // &1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
                    $next_byte = 4;
                    $v = ($v - 252) << 30;
                } else {
                    $this->_error('This might be UTF-8, but I don\'t understand it at byte '.$k);
                    return false;
                }
                // Store the payload bits of the start byte; continuation bytes are added onto it
                $output[$out_len] = (int) $v;
                ++$out_len;
                continue;
            }
            if ('add' == $mode) {
                if (!$this->_allow_overlong && $test == 'range') {
                    // Only the byte directly after the start byte is range-checked
                    $test = 'none';
                    if (($v < 0xA0 && $start_byte == 0xE0) || ($v < 0x90 && $start_byte == 0xF0) || ($v > 0x8F && $start_byte == 0xF4)) {
                        $this->_error('Bogus UTF-8 character detected (out of legal range) at byte '.$k);
                        return false;
                    }
                }
                if ($v >> 6 == 2) { // Bit mask must be 10xxxxxx
                    $v = ($v - 128) << ($next_byte * 6);
                    $output[($out_len - 1)] += $v;
                    --$next_byte;
                } else {
                    $this->_error('Conversion from UTF-8 to UCS-4 failed: malformed input at byte '.$k);
                    return false;
                }
                if ($next_byte < 0) {
                    // All continuation bytes consumed, look for a new start byte
                    $mode = 'next';
                }
            }
        } // for
        if ('add' == $mode) {
            // Input ended while continuation bytes were still outstanding
            $this->_error('Conversion from UTF-8 to UCS-4 failed: incomplete multi-byte sequence at end of input');
            return false;
        }
        return $output;
    }
859

    
860
    /**
     * Convert UCS-4 array into UTF-8 string
     *
     * Values below 128 are emitted as a single byte. Larger values are
     * written as a lead byte followed by one to five continuation bytes of
     * six payload bits each, see _utf8_to_ucs4() for the bit layout.
     * A value that fits none of the ranges is reported through _error()
     * and makes the method return false.
     *
     * @param    array      UCS-4 sequence (array of integers)
     * @return   mixed      UTF-8 string, false on failure
     * @access   private
     */
    function _ucs4_to_utf8($input)
    {
        $utf8 = '';
        $position = 0; // 1-based index of the current value, used in the error message
        foreach ($input as $code) {
            ++$position;
            if ($code < 128) { // 7bit are transferred literally
                $utf8 .= chr($code);
                continue;
            }
            // Pick the lead byte marker and the number of continuation bytes
            if ($code < (1 << 11)) {
                $lead = 192;
                $trail = 1;
            } elseif ($code < (1 << 16)) {
                $lead = 224;
                $trail = 2;
            } elseif ($code < (1 << 21)) {
                $lead = 240;
                $trail = 3;
            } elseif ($code < (1 << 26)) {
                $lead = 248;
                $trail = 4;
            } elseif ($code < (1 << 31)) {
                $lead = 252;
                $trail = 5;
            } else {
                $this->_error('Conversion from UCS-4 to UTF-8 failed: malformed input at byte '.$position);
                return false;
            }
            // Lead byte carries the topmost bits ...
            $utf8 .= chr($lead + ($code >> (6 * $trail)));
            // ... every continuation byte the next lower six bits
            for ($shift = 6 * ($trail - 1); $shift >= 0; $shift -= 6) {
                $utf8 .= chr(128 + (($code >> $shift) & 63));
            }
        }
        return $utf8;
    }
896

    
897
    /**
     * Convert UCS-4 array into UCS-4 string
     *
     * Every array value becomes four bytes of the result, most significant
     * byte first.
     *
     * @param    array      UCS-4 sequence (array of integers)
     * @return   string     Four bytes per input value
     * @access   private
     */
    function _ucs4_to_ucs4_string($input)
    {
        $bytes = '';
        foreach ($input as $code) {
            // Walk from the highest byte down to the lowest; 255 reads &11111111
            for ($shift = 24; $shift >= 0; $shift -= 8) {
                $bytes .= chr(($code >> $shift) & 255);
            }
        }
        return $bytes;
    }
912

    
913
    /**
     * Convert UCS-4 string into UCS-4 array
     *
     * Every group of four input bytes is read most significant byte first
     * and becomes one integer of the result. An input whose length is not
     * a multiple of four is reported through _error() and yields false.
     *
     * @param    string     UCS-4 string, four bytes per char
     * @return   mixed      Array of integers (empty for empty input), false on broken input
     * @access   private
     */
    function _ucs4_string_to_ucs4($input)
    {
        $output = array();
        $inp_len = strlen($input);
        // Input length must be divisible by 4
        if ($inp_len % 4) {
            $this->_error('Input UCS4 string is broken');
            return false;
        }
        // Consume four bytes per step; empty input simply skips the loop.
        // Square-bracket offsets: the curly-brace form is a parse error since PHP 8.0
        for ($i = 0; $i < $inp_len; $i += 4) {
            $output[] = (ord($input[$i]) << 24)
                      + (ord($input[$i + 1]) << 16)
                      + (ord($input[$i + 2]) << 8)
                      + ord($input[$i + 3]);
        }
        return $output;
    }
939
}
940

    
941
/**
 * Adapter class for aligning the API of idna_convert with that of Net_IDNA
 * @author  Matthias Sommerfeld <mso@phlylabs.de>
 */
class Net_IDNA_php4 extends idna_convert
{
    /**
     * Sets a new option value. Available options and values:
     * [encoding - Use either UTF-8, UCS4 as array or UCS4 as string as input ('utf8' for UTF-8,
     *         'ucs4_string' and 'ucs4_array' respectively for UCS4); The output is always UTF-8]
     * [overlong - Unicode does not allow unnecessarily long encodings of chars,
     *             to allow this, set this parameter to true, else to false;
     *             default is false.]
     * [strict - true: strict mode, good for registration purposes - Causes errors
     *           on failures; false: loose mode, ideal for "wildlife" applications
     *           by silently ignoring errors and returning the original input instead]
     *
     * @param    mixed     Parameter to set (string: single parameter; array of Parameter => Value pairs)
     * @param    string    Value to use (if parameter 1 is a string)
     * @return   boolean   true on success, false otherwise
     * @access   public
     */
    function setParams($option, $param = false)
    {
        // Keep working for callers that attached a converter object as ->IC themselves
        if (isset($this->IC) && is_object($this->IC)) {
            return $this->IC->set_parameters($option, $param);
        }
        // This class IS an idna_convert, so hand the option to the inherited setter
        // instead of calling a method on the never-assigned ->IC property.
        // NOTE(review): set_parameter() is expected to be defined by idna_convert,
        // outside this excerpt - confirm the method name against the parent class.
        return $this->set_parameter($option, $param);
    }
}
968

    
969
?>