/htmltest/includes/unicode.inc - Annoter - Club Drupal - Forge Centrale Marseille

85ad3d82

Assos Assos

<?php

2

3

/**

4

* @file

5

* Provides Unicode-related conversions and operations.

6

*/

7

8

/**
 * Status code: the check for PHP Unicode support reported an error.
 */
define('UNICODE_ERROR', -1);

/**
 * Status code: standard PHP string functions are used and Unicode handling is
 * emulated.
 */
define('UNICODE_SINGLEBYTE', 0);

/**
 * Status code: the PHP mbstring extension provides full Unicode support.
 */
define('UNICODE_MULTIBYTE', 1);

23

24

/**
 * PCRE character-class body matching Unicode word-boundary characters.
 *
 * The value is meant to be placed between "[" and "]" in a pattern that uses
 * the "u" modifier.
 *
 * Characters with the following General_category (gc) property values are
 * treated as word boundaries. This does not fully conform to the Word
 * Boundaries algorithm described in http://unicode.org/reports/tr29, because
 * PCRE does not contain the Word_Break property table; this simpler approach
 * is used instead.
 * - Cc, Cf, Cn, Co, Cs: Other.
 * - Pc, Pd, Pe, Pf, Pi, Po, Ps: Punctuation.
 * - Sc, Sk, Sm, So: Symbols.
 * - Zl, Zp, Zs: Separators.
 *
 * Characters with the following General_category (gc) property values are
 * not boundaries:
 * - Ll, Lm, Lo, Lt, Lu: Letters.
 * - Mc, Me, Mn: Combining Marks.
 * - Nd, Nl, No: Numbers.
 *
 * The PCRE property matcher is deliberately not used, so that the class stays
 * compatible with Unicode 5.2.0 regardless of the PCRE version in use (and of
 * any bugs in PCRE property tables).
 *
 * @see http://unicode.org/glossary
 */
define('PREG_CLASS_UNICODE_WORD_BOUNDARY',
  '\x{0}-\x{2F}\x{3A}-\x{40}\x{5B}-\x{60}\x{7B}-\x{A9}\x{AB}-\x{B1}\x{B4}' .
  '\x{B6}-\x{B8}\x{BB}\x{BF}\x{D7}\x{F7}\x{2C2}-\x{2C5}\x{2D2}-\x{2DF}' .
  '\x{2E5}-\x{2EB}\x{2ED}\x{2EF}-\x{2FF}\x{375}\x{37E}-\x{385}\x{387}\x{3F6}' .
  '\x{482}\x{55A}-\x{55F}\x{589}-\x{58A}\x{5BE}\x{5C0}\x{5C3}\x{5C6}' .
  '\x{5F3}-\x{60F}\x{61B}-\x{61F}\x{66A}-\x{66D}\x{6D4}\x{6DD}\x{6E9}' .
  '\x{6FD}-\x{6FE}\x{700}-\x{70F}\x{7F6}-\x{7F9}\x{830}-\x{83E}' .
  '\x{964}-\x{965}\x{970}\x{9F2}-\x{9F3}\x{9FA}-\x{9FB}\x{AF1}\x{B70}' .
  '\x{BF3}-\x{BFA}\x{C7F}\x{CF1}-\x{CF2}\x{D79}\x{DF4}\x{E3F}\x{E4F}' .
  '\x{E5A}-\x{E5B}\x{F01}-\x{F17}\x{F1A}-\x{F1F}\x{F34}\x{F36}\x{F38}' .
  '\x{F3A}-\x{F3D}\x{F85}\x{FBE}-\x{FC5}\x{FC7}-\x{FD8}\x{104A}-\x{104F}' .
  '\x{109E}-\x{109F}\x{10FB}\x{1360}-\x{1368}\x{1390}-\x{1399}\x{1400}' .
  '\x{166D}-\x{166E}\x{1680}\x{169B}-\x{169C}\x{16EB}-\x{16ED}' .
  '\x{1735}-\x{1736}\x{17B4}-\x{17B5}\x{17D4}-\x{17D6}\x{17D8}-\x{17DB}' .
  '\x{1800}-\x{180A}\x{180E}\x{1940}-\x{1945}\x{19DE}-\x{19FF}' .
  '\x{1A1E}-\x{1A1F}\x{1AA0}-\x{1AA6}\x{1AA8}-\x{1AAD}\x{1B5A}-\x{1B6A}' .
  '\x{1B74}-\x{1B7C}\x{1C3B}-\x{1C3F}\x{1C7E}-\x{1C7F}\x{1CD3}\x{1FBD}' .
  '\x{1FBF}-\x{1FC1}\x{1FCD}-\x{1FCF}\x{1FDD}-\x{1FDF}\x{1FED}-\x{1FEF}' .
  '\x{1FFD}-\x{206F}\x{207A}-\x{207E}\x{208A}-\x{208E}\x{20A0}-\x{20B8}' .
  '\x{2100}-\x{2101}\x{2103}-\x{2106}\x{2108}-\x{2109}\x{2114}' .
  '\x{2116}-\x{2118}\x{211E}-\x{2123}\x{2125}\x{2127}\x{2129}\x{212E}' .
  '\x{213A}-\x{213B}\x{2140}-\x{2144}\x{214A}-\x{214D}\x{214F}' .
  '\x{2190}-\x{244A}\x{249C}-\x{24E9}\x{2500}-\x{2775}\x{2794}-\x{2B59}' .
  '\x{2CE5}-\x{2CEA}\x{2CF9}-\x{2CFC}\x{2CFE}-\x{2CFF}\x{2E00}-\x{2E2E}' .
  '\x{2E30}-\x{3004}\x{3008}-\x{3020}\x{3030}\x{3036}-\x{3037}' .
  '\x{303D}-\x{303F}\x{309B}-\x{309C}\x{30A0}\x{30FB}\x{3190}-\x{3191}' .
  '\x{3196}-\x{319F}\x{31C0}-\x{31E3}\x{3200}-\x{321E}\x{322A}-\x{3250}' .
  '\x{3260}-\x{327F}\x{328A}-\x{32B0}\x{32C0}-\x{33FF}\x{4DC0}-\x{4DFF}' .
  '\x{A490}-\x{A4C6}\x{A4FE}-\x{A4FF}\x{A60D}-\x{A60F}\x{A673}\x{A67E}' .
  '\x{A6F2}-\x{A716}\x{A720}-\x{A721}\x{A789}-\x{A78A}\x{A828}-\x{A82B}' .
  '\x{A836}-\x{A839}\x{A874}-\x{A877}\x{A8CE}-\x{A8CF}\x{A8F8}-\x{A8FA}' .
  '\x{A92E}-\x{A92F}\x{A95F}\x{A9C1}-\x{A9CD}\x{A9DE}-\x{A9DF}' .
  '\x{AA5C}-\x{AA5F}\x{AA77}-\x{AA79}\x{AADE}-\x{AADF}\x{ABEB}' .
  '\x{E000}-\x{F8FF}\x{FB29}\x{FD3E}-\x{FD3F}\x{FDFC}-\x{FDFD}' .
  '\x{FE10}-\x{FE19}\x{FE30}-\x{FE6B}\x{FEFF}-\x{FF0F}\x{FF1A}-\x{FF20}' .
  '\x{FF3B}-\x{FF40}\x{FF5B}-\x{FF65}\x{FFE0}-\x{FFFD}');

84

85

/**
 * Runs _unicode_check() and records the resulting status globally.
 *
 * Only the first element of the result (the status code) is kept; it is
 * stored in $GLOBALS['multibyte'], which the drupal_*() string wrappers in
 * this file read to choose between mbstring and emulated code paths.
 */
function unicode_check() {
  $result = _unicode_check();
  $GLOBALS['multibyte'] = $result[0];
}

91

92

/**
 * Checks PHP's Unicode support and applies the required mbstring settings.
 *
 * Because Drupal needs to be able to handle text in various encodings,
 * mbstring function overloading is not supported. HTTP input/output
 * conversion must be disabled for similar reasons.
 *
 * @return array
 *   A two-element array:
 *   - The status code: UNICODE_SINGLEBYTE, UNICODE_MULTIBYTE or UNICODE_ERROR.
 *   - A translated description of the limitation or error, or an empty
 *     string when mbstring is usable.
 */
function _unicode_check() {
  // Ensure translations don't break during installation.
  $t = get_t();

  // Check for mbstring extension.
  if (!function_exists('mb_strlen')) {
    return array(UNICODE_SINGLEBYTE, $t('Operations on Unicode strings are emulated on a best-effort basis. Install the <a href="@url">PHP mbstring extension</a> for improved Unicode support.', array('@url' => 'http://www.php.net/mbstring')));
  }

  // Check mbstring configuration.
  if (ini_get('mbstring.func_overload') != 0) {
    return array(UNICODE_ERROR, $t('Multibyte string function overloading in PHP is active and must be disabled. Check the php.ini <em>mbstring.func_overload</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }
  if (ini_get('mbstring.encoding_translation') != 0) {
    return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.encoding_translation</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
  }

  // mbstring.http_input and mbstring.http_output are deprecated as of PHP 5.6
  // and are empty by default there, so comparing them against 'pass' would
  // report a spurious error. Only enforce the check on older PHP versions.
  if (version_compare(PHP_VERSION, '5.6.0', '<')) {
    if (ini_get('mbstring.http_input') != 'pass') {
      return array(UNICODE_ERROR, $t('Multibyte string input conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_input</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
    }
    if (ini_get('mbstring.http_output') != 'pass') {
      return array(UNICODE_ERROR, $t('Multibyte string output conversion in PHP is active and must be disabled. Check the php.ini <em>mbstring.http_output</em> setting. Please refer to the <a href="@url">PHP mbstring documentation</a> for more information.', array('@url' => 'http://www.php.net/mbstring')));
    }
  }

  // Set appropriate configuration.
  mb_internal_encoding('utf-8');
  mb_language('uni');
  return array(UNICODE_MULTIBYTE, '');
}

131

132

/**
 * Builds the requirements entry describing the Unicode library in use.
 *
 * @return array
 *   An array with a single 'unicode' key whose value contains 'title',
 *   'value', an optional 'description' (only when _unicode_check() returned
 *   one) and 'severity'.
 */
function unicode_requirements() {
  // Ensure translations don't break during installation.
  $t = get_t();

  // Human-readable label for each status code.
  $labels = array(
    UNICODE_SINGLEBYTE => $t('Standard PHP'),
    UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'),
    UNICODE_ERROR => $t('Error'),
  );
  // Requirement severity for each status code.
  $levels = array(
    UNICODE_SINGLEBYTE => REQUIREMENT_WARNING,
    UNICODE_MULTIBYTE => REQUIREMENT_OK,
    UNICODE_ERROR => REQUIREMENT_ERROR,
  );
  list($status, $message) = _unicode_check();

  $entry = array(
    'title' => $t('Unicode library'),
    'value' => $labels[$status],
  );
  if ($message) {
    $entry['description'] = $message;
  }
  $entry['severity'] = $levels[$status];

  return array('unicode' => $entry);
}

163

164

/**
 * Prepares a new XML parser.
 *
 * This is a wrapper around xml_parser_create() which extracts the encoding
 * from the XML data first and sets the output encoding to UTF-8. This function
 * should be used instead of xml_parser_create(), because PHP 4's XML parser
 * doesn't check the input encoding itself. "Starting from PHP 5, the input
 * encoding is automatically detected, so that the encoding parameter specifies
 * only the output encoding."
 *
 * This is also where unsupported encodings will be converted. Callers should
 * take this into account: $data might have been changed after the call (a
 * leading UTF-8 byte order mark is stripped, and data in an unsupported
 * encoding is converted to UTF-8 with its prolog rewritten).
 *
 * The encoding declaration is recognised with either double or single quotes
 * and with optional whitespace around "=", as permitted by the XML grammar.
 *
 * @param $data
 *   The XML data which will be parsed later. Passed by reference.
 *
 * @return
 *   An XML parser object or FALSE on error.
 *
 * @ingroup php_wrappers
 */
function drupal_xml_parser_create(&$data) {
  // Default XML encoding is UTF-8.
  $encoding = 'utf-8';
  $bom = FALSE;

  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle it).
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
    $bom = TRUE;
    $data = substr($data, 3);
  }

  // Check for an encoding declaration in the XML prolog if no BOM was found.
  // Group 1 captures the quote character so the closing quote must match it.
  if (!$bom && preg_match('/^<\?xml[^>]+encoding\s*=\s*(["\'])(.+?)\1/', $data, $match)) {
    $encoding = $match[2];
  }

  // Unsupported encodings are converted here into UTF-8.
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
  if (!in_array(strtolower($encoding), $php_supported)) {
    $out = drupal_convert_to_utf8($data, $encoding);
    if ($out !== FALSE) {
      $encoding = 'utf-8';
      // Rewrite the declared encoding so the prolog matches the new content.
      $data = preg_replace('/^(<\?xml[^>]+encoding)\s*=\s*(["\']).+?\2/', '\\1="utf-8"', $out);
    }
    else {
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.', array('%s' => $encoding), WATCHDOG_WARNING);
      return FALSE;
    }
  }

  $xml_parser = xml_parser_create($encoding);
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
  return $xml_parser;
}

219

220

/**
 * Converts data from a given encoding to UTF-8.
 *
 * The first available backend is used, in this order: iconv, mbstring, GNU
 * recode. Warnings raised by the backend are suppressed; its return value is
 * passed through unchanged.
 *
 * @param $data
 *   The data to be converted.
 * @param $encoding
 *   The encoding that the data is in.
 *
 * @return
 *   The converted data, or FALSE when no conversion backend is installed (in
 *   which case an error is logged through watchdog()).
 */
function drupal_convert_to_utf8($data, $encoding) {
  if (function_exists('iconv')) {
    return @iconv($encoding, 'utf-8', $data);
  }
  if (function_exists('mb_convert_encoding')) {
    return @mb_convert_encoding($data, 'utf-8', $encoding);
  }
  if (function_exists('recode_string')) {
    return @recode_string($encoding . '..utf-8', $data);
  }

  // No backend available: report and signal failure.
  watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
  return FALSE;
}

250

251

/**
 * Truncates a UTF-8-encoded string safely to a number of bytes.
 *
 * If the end position is in the middle of a UTF-8 sequence, it scans backwards
 * until the beginning of the byte sequence.
 *
 * Use this function whenever you want to chop off a string at an unsure
 * location. On the other hand, if you're sure that you're splitting on a
 * character boundary (e.g. after using strpos() or similar), you can safely
 * use substr() instead.
 *
 * Malformed input is handled defensively: if the backward scan reaches the
 * start of the string without finding a lead byte, the string is cut at the
 * requested byte limit so the result never exceeds $len bytes.
 *
 * @param $string
 *   The string to truncate.
 * @param $len
 *   An upper limit on the returned string length, in bytes. Values of zero or
 *   less yield an empty string.
 *
 * @return
 *   The truncated string.
 */
function drupal_truncate_bytes($string, $len) {
  // A non-positive limit cannot hold any byte.
  if ($len <= 0) {
    return '';
  }
  if (strlen($string) <= $len) {
    return $string;
  }
  // The byte right after the cut starts a new character (ASCII or lead byte),
  // so the cut already falls on a character boundary.
  if ((ord($string[$len]) < 0x80) || (ord($string[$len]) >= 0xC0)) {
    return substr($string, 0, $len);
  }
  $limit = $len;
  // Scan backwards to beginning of the byte sequence.
  while (--$len >= 0 && ord($string[$len]) >= 0x80 && ord($string[$len]) < 0xC0);

  // Only continuation bytes were found: the input is not valid UTF-8. Passing
  // the resulting -1 to substr() would strip just the last byte and return
  // more than $limit bytes, so fall back to a plain byte cut.
  if ($len < 0) {
    return substr($string, 0, $limit);
  }
  return substr($string, 0, $len);
}

282

283

/**
 * Truncates a UTF-8-encoded string safely to a number of characters.
 *
 * @param $string
 *   The string to truncate.
 * @param $max_length
 *   An upper limit on the returned string length, including trailing ellipsis
 *   if $add_ellipsis is TRUE.
 * @param $wordsafe
 *   If TRUE, attempt to truncate on a word boundary. Word boundaries are
 *   spaces, punctuation, and Unicode characters used as word boundaries in
 *   non-Latin languages; see PREG_CLASS_UNICODE_WORD_BOUNDARY for more
 *   information. If a word boundary cannot be found that would make the length
 *   of the returned string fall within length guidelines (see parameters
 *   $max_length and $min_wordsafe_length), word boundaries are ignored.
 * @param $add_ellipsis
 *   If TRUE, add t('...') to the end of the truncated string (defaults to
 *   FALSE). The string length will still fall within $max_length.
 * @param $min_wordsafe_length
 *   If $wordsafe is TRUE, the minimum acceptable length for truncation (before
 *   adding an ellipsis, if $add_ellipsis is TRUE). Has no effect if $wordsafe
 *   is FALSE. This can be used to prevent having a very short resulting string
 *   that will not be understandable. For instance, if you are truncating the
 *   string "See myverylongurlexample.com for more information" to a word-safe
 *   return length of 20, the only available word boundary within 20 characters
 *   is after the word "See", which wouldn't leave a very informative string. If
 *   you had set $min_wordsafe_length to 10, though, the function would realise
 *   that "See" alone is too short, and would then just truncate ignoring word
 *   boundaries, giving you "See myverylongurl..." (assuming you had set
 *   $add_ellipsis to TRUE).
 *
 * @return string
 *   The truncated string.
 */
function truncate_utf8($string, $max_length, $wordsafe = FALSE, $add_ellipsis = FALSE, $min_wordsafe_length = 1) {
  $ellipsis = '';
  $max_length = max($max_length, 0);
  $min_wordsafe_length = max($min_wordsafe_length, 0);

  if (drupal_strlen($string) <= $max_length) {
    // No truncation needed, so don't add ellipsis, just return.
    return $string;
  }

  if ($add_ellipsis) {
    // Truncate ellipsis in case $max_length is small.
    $ellipsis = drupal_substr(t('...'), 0, $max_length);
    $max_length -= drupal_strlen($ellipsis);
    $max_length = max($max_length, 0);
  }

  if ($max_length <= $min_wordsafe_length) {
    // Do not attempt word-safe if lengths are bad.
    $wordsafe = FALSE;
  }

  if ($wordsafe) {
    $matches = array();
    // Find the last word boundary, if there is one within $min_wordsafe_length
    // to $max_length characters. preg_match() is always greedy, so it will
    // find the longest string possible. The "s" modifier lets "." match
    // newlines; without it the match could never extend past the first line
    // break, cutting multi-line text far shorter than necessary.
    $found = preg_match('/^(.{' . $min_wordsafe_length . ',' . $max_length . '})[' . PREG_CLASS_UNICODE_WORD_BOUNDARY . ']/us', $string, $matches);
    if ($found) {
      $string = $matches[1];
    }
    else {
      $string = drupal_substr($string, 0, $max_length);
    }
  }
  else {
    $string = drupal_substr($string, 0, $max_length);
  }

  if ($add_ellipsis) {
    $string .= $ellipsis;
  }

  return $string;
}

362

363

/**
 * Encodes MIME/HTTP header values that contain incorrectly encoded characters.
 *
 * For example, mime_header_encode('tést.txt') returns "=?UTF-8?B?dMOpc3QudHh0?=".
 *
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
 *
 * Notes:
 * - Strings made up only of printable ASCII (0x20-0x7E) are returned as is.
 * - The string is consumed chunk by chunk with drupal_truncate_bytes(), so
 *   that each chunk starts and ends on a character boundary.
 * - Using \n as the chunk separator may cause problems on some systems and may
 *   have to be changed to \r\n or \r.
 *
 * @param $string
 *   The header to encode.
 *
 * @return string
 *   The mime-encoded header.
 *
 * @see mime_header_decode()
 */
function mime_header_encode($string) {
  // Nothing to do when every byte is printable ASCII.
  if (!preg_match('/[^\x20-\x7E]/', $string)) {
    return $string;
  }

  $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
  $encoded = '';
  for ($remaining = strlen($string); $remaining > 0; $remaining -= $taken) {
    $piece = drupal_truncate_bytes($string, $chunk_size);
    $encoded .= ' =?UTF-8?B?' . base64_encode($piece) . "?=\n";
    $taken = strlen($piece);
    $string = substr($string, $taken);
  }
  // Drop the leading space and the trailing newline added by the loop.
  return trim($encoded);
}

401

402

/**
 * Decodes MIME/HTTP encoded header values.
 *
 * Decoding happens in two passes, both delegating to _mime_header_decode():
 * first the encoded chunks that are followed by another encoded chunk (the
 * whitespace between them is consumed), then all remaining encoded chunks
 * (surrounding whitespace is preserved).
 *
 * @param $header
 *   The header to decode.
 *
 * @return string
 *   The mime-decoded header.
 *
 * @see mime_header_encode()
 */
function mime_header_decode($header) {
  // Encoded chunk followed by whitespace and another encoded chunk.
  $chained_chunk = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/';
  // Any encoded chunk.
  $single_chunk = '/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/';

  $collapsed = preg_replace_callback($chained_chunk, '_mime_header_decode', $header);
  return preg_replace_callback($single_chunk, '_mime_header_decode', $collapsed);
}

419

420

/**
 * Decodes encoded header data passed from mime_header_decode().
 *
 * Callback for preg_replace_callback() within mime_header_decode().
 *
 * @param $matches
 *   The array of matches from preg_replace_callback():
 *   - 1: Character set name.
 *   - 2: Escaping method (Q or B).
 *   - 3: Encoded data.
 *
 * @return string
 *   The mime-decoded string, converted to UTF-8 when the declared character
 *   set is not UTF-8.
 *
 * @see mime_header_decode()
 */
function _mime_header_decode($matches) {
  if ($matches[2] == 'B') {
    $data = base64_decode($matches[3]);
  }
  else {
    // In the "Q" encoding an underscore stands for a space, while a literal
    // underscore is written as "=5F". The underscores therefore have to be
    // translated before the "=XX" escapes are decoded; doing it afterwards
    // would also turn decoded literal underscores into spaces.
    $data = quoted_printable_decode(str_replace('_', ' ', $matches[3]));
  }
  if (strtolower($matches[1]) != 'utf-8') {
    $data = drupal_convert_to_utf8($data, $matches[1]);
  }
  return $data;
}

444

445

/**
 * Decodes all HTML entities (including numerical ones) to regular UTF-8 bytes.
 *
 * Entities are decoded exactly once, so double-escaped input such as
 * "&amp;lt;" becomes "&lt;", not "<". Use with care: decoding can undo earlier
 * sanitization (&lt;script&gt; will become <script>).
 *
 * @param $text
 *   The text to decode entities in.
 *
 * @return
 *   The input $text, with all HTML entities decoded once.
 */
function decode_entities($text) {
  // ENT_QUOTES: decode both double- and single-quote entities.
  $flags = ENT_QUOTES;
  $charset = 'UTF-8';
  return html_entity_decode($text, $flags, $charset);
}

461

462

/**
 * Counts the number of characters in a UTF-8 string.
 *
 * This is less than or equal to the byte count.
 *
 * @param $text
 *   The string to run the operation on.
 *
 * @return integer
 *   The length of the string.
 *
 * @ingroup php_wrappers
 */
function drupal_strlen($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Emulation: strip UTF-8 continuation bytes so that every remaining byte
    // stands for exactly one character, then count bytes.
    $lead_bytes_only = preg_replace("/[\x80-\xBF]/", '', $text);
    return strlen($lead_bytes_only);
  }
  return mb_strlen($text);
}

/**
 * Converts a UTF-8 string to uppercase.
 *
 * Uses mb_strtoupper() when mbstring support is active. Otherwise ASCII
 * letters are converted with strtoupper() and the two-byte sequences matched
 * by the pattern below (lead byte 0xC3) are case-flipped through
 * _unicode_caseflip().
 *
 * @param $text
 *   The string to run the operation on.
 *
 * @return string
 *   The string in uppercase.
 *
 * @ingroup php_wrappers
 */
function drupal_strtoupper($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Use C-locale for ASCII-only uppercase, then case flip Latin-1 accented
    // letters.
    $upper = strtoupper($text);
    return preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/', '_unicode_caseflip', $upper);
  }
  return mb_strtoupper($text);
}

/**
 * Converts a UTF-8 string to lowercase.
 *
 * Uses mb_strtolower() when mbstring support is active. Otherwise ASCII
 * letters are converted with strtolower() and the two-byte sequences matched
 * by the pattern below (lead byte 0xC3) are case-flipped through
 * _unicode_caseflip().
 *
 * @param $text
 *   The string to run the operation on.
 *
 * @return string
 *   The string in lowercase.
 *
 * @ingroup php_wrappers
 */
function drupal_strtolower($text) {
  global $multibyte;
  if ($multibyte != UNICODE_MULTIBYTE) {
    // Use C-locale for ASCII-only lowercase, then case flip Latin-1 accented
    // letters.
    $lower = strtolower($text);
    return preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/', '_unicode_caseflip', $lower);
  }
  return mb_strtolower($text);
}

/**
 * Flips U+C0-U+DE to U+E0-U+FE and back.
 *
 * Callback for preg_replace_callback() in drupal_strtoupper() and
 * drupal_strtolower(). The match is a two-byte UTF-8 sequence; the first byte
 * is kept and bit 0x20 of the second byte is toggled, which switches the case
 * of the accented Latin-1 letters those callers match.
 *
 * @param $matches
 *   An array of matches; only the full match in $matches[0] is used.
 *
 * @return string
 *   The matched two-byte sequence with its case flipped.
 *
 * @see drupal_strtolower()
 * @see drupal_strtoupper()
 */
function _unicode_caseflip($matches) {
  return $matches[0][0] . chr(ord($matches[0][1]) ^ 32);
}

550

551

/**
 * Capitalizes the first character of a UTF-8 string.
 *
 * @param $text
 *   The string to convert.
 *
 * @return
 *   The string with the first letter as uppercase.
 *
 * @ingroup php_wrappers
 */
function drupal_ucfirst($text) {
  // Note: no mbstring equivalent!
  $initial = drupal_substr($text, 0, 1);
  $remainder = drupal_substr($text, 1);
  return drupal_strtoupper($initial) . $remainder;
}

566

567

/**
 * Cuts off a piece of a string based on character indices and counts.
 *
 * Follows the same behavior as PHP's own substr() function. Note that for
 * cutting off a string at a known character/substring location, the usage of
 * PHP's normal strpos/substr is safe and much faster.
 *
 * When mbstring support is active this delegates to mb_substr(). Otherwise the
 * byte offsets are found by walking the string and counting every byte that is
 * not a UTF-8 continuation byte (0x80-0xBF) as the start of a character.
 *
 * @param $text
 *   The input string.
 * @param $start
 *   The position at which to start reading. A negative value counts from the
 *   end of the string. A positive value at or beyond the number of characters
 *   yields an empty string.
 * @param $length
 *   The number of characters to read. NULL reads to the end of the string; a
 *   negative value omits that many characters from the end.
 *
 * @return
 *   The shortened string.
 *
 * @ingroup php_wrappers
 */
function drupal_substr($text, $start, $length = NULL) {
  global $multibyte;
  if ($multibyte == UNICODE_MULTIBYTE) {
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text, $start, $length);
  }
  else {
    $strlen = strlen($text);
    // Find the starting byte offset.
    $bytes = 0;
    if ($start > 0) {
      // Count all the continuation bytes from the start until we have found
      // $start characters or the end of the string.
      $bytes = -1; $chars = -1;
      while ($bytes < $strlen - 1 && $chars < $start) {
        $bytes++;
        $c = ord($text[$bytes]);
        if ($c < 0x80 || $c >= 0xC0) {
          $chars++;
        }
      }
      // The string ended before character $start was reached. Without this
      // guard $bytes would be left on the final byte and the last character
      // (or a dangling continuation byte) would be returned.
      if ($chars < $start) {
        return '';
      }
    }
    elseif ($start < 0) {
      // Count all the continuation bytes from the end until we have found
      // abs($start) characters.
      $start = abs($start);
      $bytes = $strlen; $chars = 0;
      while ($bytes > 0 && $chars < $start) {
        $bytes--;
        $c = ord($text[$bytes]);
        if ($c < 0x80 || $c >= 0xC0) {
          $chars++;
        }
      }
    }
    $istart = $bytes;

    // Find the ending byte offset.
    if ($length === NULL) {
      $iend = $strlen;
    }
    elseif ($length > 0) {
      // Count all the continuation bytes from the starting index until we have
      // found $length characters or reached the end of the string, then
      // backtrace one byte.
      $iend = $istart - 1;
      $chars = -1;
      $last_real = FALSE;
      while ($iend < $strlen - 1 && $chars < $length) {
        $iend++;
        $c = ord($text[$iend]);
        $last_real = FALSE;
        if ($c < 0x80 || $c >= 0xC0) {
          $chars++;
          $last_real = TRUE;
        }
      }
      // Backtrace one byte if the last character we found was a real character
      // and we don't need it.
      if ($last_real && $chars >= $length) {
        $iend--;
      }
    }
    elseif ($length < 0) {
      // Count all the continuation bytes from the end until we have found
      // abs($length) characters, then backtrace one byte.
      $length = abs($length);
      $iend = $strlen; $chars = 0;
      while ($iend > 0 && $chars < $length) {
        $iend--;
        $c = ord($text[$iend]);
        if ($c < 0x80 || $c >= 0xC0) {
          $chars++;
        }
      }
      // Backtrace one byte if we are not at the beginning of the string.
      if ($iend > 0) {
        $iend--;
      }
    }
    else {
      // $length == 0, return an empty string.
      return '';
    }

    // $iend is inclusive, hence the + 1; never pass a negative length.
    return substr($text, $istart, max(0, $iend - $istart + 1));
  }
}

672

Projet

Général

Profil

Club Drupal

root / htmltest / includes / unicode.inc @ a5572547