Projet

Général

Profil

Paste
Télécharger (19,9 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / modules / feeds / libraries / http_request.inc @ ed9a13f1

1
<?php
2

    
3
/**
4
 * @file
5
 * Download via HTTP.
6
 *
7
 * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
8
 * redirects.
9
 */
10

    
11
/**
 * Result code used when the requested URL could not be parsed at all.
 *
 * @var int
 */
define('FEEDS_ERROR_PARSE_ERROR', -1001);

/**
 * Result code used when the requested URL does not carry a scheme.
 *
 * The scheme is the leading part of a URL, for example 'http'.
 *
 * @var int
 */
define('FEEDS_ERROR_NO_SCHEME', -1002);

/**
 * Result code used when the scheme of the requested URL is not supported.
 *
 * @var int
 */
define('FEEDS_ERROR_INVALID_SCHEME', -1003);
33

    
34
/**
 * PCRE that locates <link> tags in an HTML document.
 *
 * Capture group 1 holds the raw attribute string of the tag (including its
 * leading whitespace); the pattern is case-insensitive and lets '.' match
 * newlines.
 */
define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');

/**
 * PCRE that splits the attribute string of a tag into its attributes.
 *
 * Capture group 1 is the attribute name. The value ends up in group 2 when it
 * was double quoted, in group 3 when it was single quoted and in group 4 when
 * it was not quoted.
 */
define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
43

    
44
/**
 * Exception thrown when the cURL transfer itself reports an error.
 */
class HRCurlException extends Exception {
}
48

    
49
/**
 * Exception thrown for HTTP requests whose result code is not in the 2xx range.
 */
class FeedsHTTPRequestException extends Exception {
}
53

    
54
/**
 * Discovers RSS or Atom feeds at the given URL.
 *
 * The URL is downloaded first. If the response itself is a feed (judged by its
 * Content-Type header), the URL is returned unchanged. Otherwise the response
 * is treated as HTML and scanned for feed <link> tags; the first one that can
 * be turned into an absolute URL is returned.
 *
 * @param string $url
 *   The url of the feed to retrieve.
 * @param array $options
 *   An optional array of options.
 *   For valid options, see feeds_http_request().
 *
 * @return bool|string
 *   The discovered feed URL, or FALSE if the URL is not reachable, there was an
 *   error, or no feed could be discovered.
 */
function http_request_get_common_syndication($url, $options = array()) {
  $download = feeds_http_request($url, $options);

  // Cannot get the feed, return.
  // feeds_http_request() always returns 200 even if its 304.
  if ($download->code != 200) {
    return FALSE;
  }

  // Work on a copy of the data so that manipulating the HTML never affects the
  // result object that lives in the static cache.
  // @see feeds_http_request()
  $downloaded_string = $download->data;

  // If this happens to be a feed then just return the url.
  if (isset($download->headers['content-type']) && http_request_is_feed($download->headers['content-type'], $downloaded_string)) {
    return $url;
  }

  foreach (http_request_find_feeds($downloaded_string) as $feed_url) {
    $absolute = http_request_create_absolute_url($feed_url, $url);
    if (!empty($absolute)) {
      // @TODO: something more intelligent?
      return $absolute;
    }
  }

  // Nothing usable was discovered. Return FALSE explicitly, as documented,
  // instead of falling off the end of the function (which yields NULL).
  return FALSE;
}
97

    
98
/**
 * Downloads the content of a URL using positional arguments.
 *
 * Thin wrapper that packs its arguments into an options array and delegates
 * to feeds_http_request().
 *
 * @param string $url
 *   A valid URL (not only web URLs).
 * @param string $username
 *   The username to use if the URL requires authentication.
 * @param string $password
 *   The password to use if the URL requires authentication.
 * @param bool $accept_invalid_cert
 *   Whether invalid certificates should be accepted.
 * @param int $timeout
 *   Number of seconds to wait for the HTTP GET request to finish.
 *
 * @return object
 *   An object that describes the data downloaded from $url.
 */
function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE, $timeout = NULL) {
  $options = array();
  $options['username'] = $username;
  $options['password'] = $password;
  $options['accept_invalid_cert'] = $accept_invalid_cert;
  $options['timeout'] = $timeout;

  return feeds_http_request($url, $options);
}
123

    
124
/**
 * Get the content from the given URL.
 *
 * Uses cURL when http_request_use_curl() allows it and drupal_http_request()
 * otherwise. When a previous result is cached, a conditional request
 * (If-None-Match / If-Modified-Since) is sent and the cached result is
 * returned on a 304 response.
 *
 * @param string $url
 *   A valid URL (not only web URLs).
 * @param array $options
 *   (optional) An array that can have one or more of the following elements:
 *   - username: (string) If the URL uses authentication, supply the username.
 *   - password: (string) If the URL uses authentication, supply the password.
 *   - accept_invalid_cert: (bool) Whether to accept invalid certificates.
 *     Defaults to FALSE.
 *   - timeout: (integer) Timeout in seconds to wait for an HTTP get request to
 *     finish. Defaults to the 'http_request_timeout' variable (30 seconds).
 *   - cache_http_result: (bool) Whether to cache the HTTP result. Defaults to
 *     TRUE.
 *
 * @return object
 *   An object that describes the data downloaded from $url. It has at least
 *   the properties 'code' and 'headers'; 'data' is set when a transfer took
 *   place and 'error' when the URL was rejected before the transfer.
 *
 * @throws HRCurlException
 *   When the cURL transfer reports an error.
 */
function feeds_http_request($url, array $options = array()) {
  $options += array(
    'username' => NULL,
    'password' => NULL,
    'accept_invalid_cert' => FALSE,
    'cache_http_result' => TRUE,
  );

  // Make sure a request timeout is set.
  if (empty($options['timeout'])) {
    $options['timeout'] = variable_get('http_request_timeout', 30);
  }

  // Intra-page-request cache: avoid downloading the same content twice within
  // one page request.
  $cached_urls = &drupal_static(__FUNCTION__, array());
  if (!empty($cached_urls[$url])) {
    $cache = http_request_get_cache($url);
    // The cache entry may have vanished in the meantime; cache_get() then
    // returns FALSE, which must not be dereferenced.
    if ($cache && $cache->data) {
      return $cache->data;
    }
  }

  if (!$options['username'] && valid_url($url, TRUE)) {
    // Handle password protected feeds: take the credentials from the URL.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      // A URL may carry a user name without a password.
      $options['password'] = isset($url_parts['pass']) ? urldecode($url_parts['pass']) : '';
      $options['username'] = urldecode($url_parts['user']);
    }
  }

  $curl = http_request_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  $headers = array();
  if ($options['cache_http_result'] && $cache = http_request_get_cache($url)) {
    $last_result = $cache->data;
    $last_headers = array_change_key_case($last_result->headers);

    // cURL expects a list of "Name: value" strings, drupal_http_request()
    // expects an array keyed by header name.
    if (!empty($last_headers['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: ' . $last_headers['etag'];
      }
      else {
        $headers['If-None-Match'] = $last_headers['etag'];
      }
    }
    if (!empty($last_headers['last-modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: ' . $last_headers['last-modified'];
      }
      else {
        $headers['If-Modified-Since'] = $last_headers['last-modified'];
      }
    }
  }

  // Without cURL, HTTP Basic Authentication has to be sent as a header. This
  // must not depend on a cache entry being present, otherwise the very first
  // (or any uncached) request would be sent without credentials.
  if (!empty($options['username']) && !$curl) {
    $headers['Authorization'] = 'Basic ' . base64_encode($options['username'] . ':' . $options['password']);
  }

  // Support the 'feed' and 'webcal' schemes by converting them into 'http'.
  $url = strtr($url, array('feed://' => 'http://', 'webcal://' => 'http://'));

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();
    $result->headers = array();

    // Parse the URL and make sure we can handle the schema.
    // cURL can only support either http:// or https://.
    // CURLOPT_PROTOCOLS is only supported with cURL 7.19.4.
    $uri = parse_url($url);
    if ($uri === FALSE) {
      $result->error = 'unable to parse URL';
      $result->code = FEEDS_ERROR_PARSE_ERROR;
    }
    elseif (!isset($uri['scheme'])) {
      $result->error = 'missing schema';
      $result->code = FEEDS_ERROR_NO_SCHEME;
    }
    else {
      switch ($uri['scheme']) {
        case 'http':
        case 'https':
          // Valid scheme.
          break;

        default:
          $result->error = 'invalid schema ' . $uri['scheme'];
          $result->code = FEEDS_ERROR_INVALID_SCHEME;
          break;
      }
    }

    // If the scheme was valid, continue to request the feed using cURL.
    if (empty($result->error)) {
      $download = curl_init($url);
      curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
      if (!empty($options['username'])) {
        curl_setopt($download, CURLOPT_USERPWD, $options['username'] . ':' . $options['password']);
        curl_setopt($download, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
      }
      curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
      // Headers are returned as part of the response so they can be parsed
      // below.
      curl_setopt($download, CURLOPT_HEADER, TRUE);
      curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
      curl_setopt($download, CURLOPT_ENCODING, '');
      curl_setopt($download, CURLOPT_TIMEOUT, $options['timeout']);

      $proxy_server = variable_get('proxy_server');

      if ($proxy_server && _drupal_http_use_proxy($uri['host'])) {
        curl_setopt($download, CURLOPT_PROXY, $proxy_server);
        curl_setopt($download, CURLOPT_PROXYPORT, variable_get('proxy_port', 8080));

        // Proxy user/password.
        if ($proxy_username = variable_get('proxy_username')) {
          $username_password = $proxy_username . ':' . variable_get('proxy_password', '');

          curl_setopt($download, CURLOPT_PROXYUSERPWD, $username_password);
          curl_setopt($download, CURLOPT_PROXYAUTH, variable_get('proxy_auth_method', CURLAUTH_BASIC));
        }
      }

      if ($options['accept_invalid_cert']) {
        curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($download, CURLOPT_SSL_VERIFYHOST, 0);
      }

      $result->data = curl_exec($download);
      if (curl_error($download)) {
        // Collect the error details first so the handle can be released
        // before the exception leaves this function.
        $curl_errno = curl_errno($download);
        $curl_error = curl_error($download);
        curl_close($download);

        throw new HRCurlException(
          t('cURL error (@code) @error for @url', array(
            '@code' => $curl_errno,
            '@error' => $curl_error,
            '@url' => $url,
          )), $curl_errno
        );
      }

      // When using a proxy, remove extra data from the header which is not
      // considered by CURLINFO_HEADER_SIZE (possibly cURL bug).
      // This data is only added to the HTTP header when working with a proxy.
      // Example string added: <HTTP/1.0 200 Connection established\r\n\r\n>
      // This was fixed in libcurl version 7.30.0 (0x71e00) (April 12, 2013),
      // so this workaround only removes the proxy-added headers if we are using
      // an older version of libcurl.
      $curl_ver = curl_version();

      if ($proxy_server && $curl_ver['version_number'] < 0x71e00 && _drupal_http_use_proxy($uri['host'])) {
        $http_header_break = "\r\n\r\n";
        $response = explode($http_header_break, $result->data);
        if (count($response) > 2) {
          $result->data = substr($result->data, strlen($response[0] . $http_header_break));
        }
      }

      // Split the raw response into its header part and its body.
      $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
      $raw_header = substr($result->data, 0, $header_size - 1);
      $result->data = substr($result->data, $header_size);

      // With redirects there is one header block per response; only the last
      // one describes the data that was actually downloaded.
      $header_blocks = preg_split("/(\r\n){2}/", $raw_header);
      $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));
      // Skip HTTP response status.
      array_shift($header_lines);

      foreach ($header_lines as $line) {
        $line = trim($line);
        if (!$line) {
          // An empty line marks the end of the header block.
          break;
        }
        if (strpos($line, ':') === FALSE) {
          // Not a "Name: value" header line; ignore it.
          continue;
        }
        list($name, $value) = explode(':', $line, 2);
        // Normalize the headers.
        $name = strtolower($name);

        if (isset($result->headers[$name]) && $name == 'set-cookie') {
          // RFC 2109: the Set-Cookie response header comprises the token Set-
          // Cookie:, followed by a comma-separated list of one or more cookies.
          $result->headers[$name] .= ',' . trim($value);
        }
        else {
          $result->headers[$name] = trim($value);
        }
      }
      $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);

      curl_close($download);
    }
  }
  else {
    $result = drupal_http_request($url, array(
      'headers' => $headers,
      'timeout' => $options['timeout'],
    ));
    $result->headers = isset($result->headers) ? $result->headers : array();
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // In case of 304 Not Modified try to return cached data.
  if ($result->code == 304) {
    if (isset($last_result->data)) {
      $last_result->from_cache = TRUE;
      return $last_result;
    }
    else {
      // It's a tragedy, this file must exist and contain good data.
      // In this case, clear cache and repeat.
      http_request_clear_cache($url);
      return feeds_http_request($url, $options);
    }
  }

  // Set caches if asked.
  if ($options['cache_http_result']) {
    http_request_set_cache($url, $result);
    // In the static cache, mark this URL as being cached.
    $cached_urls[$url] = TRUE;
  }

  return $result;
}
362

    
363
/**
 * Verifies that an HTTP request ended with a 2xx result code.
 *
 * Returns silently for the codes 200 to 206. For every other code an
 * exception with a translated, human readable message is thrown.
 *
 * @param string $url
 *   The URL that was requested.
 * @param object $result
 *   The HTTP Request result. Its 'code' property is inspected and its 'error'
 *   property, when set, is included in the message.
 *
 * @throws FeedsHTTPRequestException
 *   In case the result code of the HTTP request is not in the 2xx series.
 */
function http_request_check_result($url, $result) {
  $success_codes = array(200, 201, 202, 203, 204, 205, 206);
  if (in_array($result->code, $success_codes)) {
    // Nothing to complain about.
    return;
  }

  $has_error = isset($result->error);
  $vars = array(
    '@url' => $url,
    '@code' => $result->code,
    '@error' => $has_error ? $result->error : 'Unknown error',
  );

  if ($result->code == FEEDS_ERROR_PARSE_ERROR) {
    $message = t('Download of @url failed because it could not be parsed.', $vars);
  }
  elseif ($result->code == FEEDS_ERROR_NO_SCHEME) {
    $message = t("Download of @url failed because its scheme could not be determined. The URL is expected to start with something like '@example'.", $vars + array(
      '@example' => 'http://',
    ));
  }
  elseif ($result->code == FEEDS_ERROR_INVALID_SCHEME) {
    $message = t('Download of @url failed because its scheme is not supported: @error. Examples of supported schemes are: @supported.', $vars + array(
      '@supported' => implode(', ', array('http', 'https')),
    ));
  }
  elseif ($has_error) {
    $message = t('Download of @url failed with code @code and the following error: @error.', $vars);
  }
  else {
    $message = t('Download of @url failed with code @code.', $vars);
  }

  throw new FeedsHTTPRequestException($message);
}
412

    
413
/**
 * Determines whether requests may be performed with cURL.
 *
 * @return bool
 *   TRUE if cURL may be used, FALSE otherwise.
 */
function http_request_use_curl() {
  // Site administrators can opt out of cURL, and without the PHP cURL
  // extension there is nothing to use in the first place.
  $disabled_by_admin = variable_get('feeds_never_use_curl', FALSE);
  if ($disabled_by_admin || !extension_loaded('curl')) {
    return FALSE;
  }

  // cURL in PHP 5.6.0 and above re-enables CURLOPT_FOLLOWLOCATION with
  // open_basedir so there is no need to check for this.
  if (version_compare(PHP_VERSION, '5.6.0', '>=')) {
    return TRUE;
  }

  // cURL below PHP 5.6.0 must not have open_basedir or safe_mode enabled.
  // phpcs:ignore PHPCompatibility.IniDirectives.RemovedIniDirectives.safe_modeDeprecatedRemoved
  return !(ini_get('safe_mode') || ini_get('open_basedir'));
}
440

    
441
/**
 * Removes the cached HTTP result of a single URL.
 *
 * @param string $url
 *   The URL whose cache entry should be cleared.
 */
function http_request_clear_cache($url) {
  // Cache entries are keyed by the SHA-256 hash of the URL.
  $cid = hash('sha256', $url);
  cache_clear_all($cid, 'cache_feeds_http');
}
450

    
451
/**
 * Fetches the cached HTTP result of a single URL.
 *
 * @param string $url
 *   The URL whose cache entry should be looked up.
 *
 * @return object|false
 *   The cache item as returned by cache_get(), or FALSE on failure.
 */
function http_request_get_cache($url) {
  // Cache entries are keyed by the SHA-256 hash of the URL.
  $cid = hash('sha256', $url);
  return cache_get($cid, 'cache_feeds_http');
}
463

    
464
/**
 * Stores the HTTP result of a single URL in the cache.
 *
 * @param string $url
 *   The URL to cache.
 * @param object $result
 *   The result of the HTTP request. A FeedsHTTPCacheItem is stored as is, any
 *   other object is wrapped in a new FeedsHTTPCacheItem first.
 */
function http_request_set_cache($url, $result) {
  if ($result instanceof FeedsHTTPCacheItem) {
    $item = $result;
  }
  else {
    // The cache ID is the SHA-256 hash of the URL.
    $item = new FeedsHTTPCacheItem(hash('sha256', $url), $result);
  }

  $item->cacheSet();
}
476

    
477
/**
 * Tells whether a Content-Type header announces a feed.
 *
 * Only the media type (the part before the first ';') is inspected, case
 * insensitively; it is regarded as a feed when it contains 'xml'.
 *
 * @param string $content_type
 *   The Content-Type header.
 * @param string $data
 *   The actual data from the http request. Currently not inspected.
 *
 * @return bool
 *   Returns TRUE if this is a parsable feed.
 */
function http_request_is_feed($content_type, $data) {
  // Strip parameters such as "; charset=utf-8".
  list($media_type) = explode(';', $content_type, 2);

  // @TODO: Sometimes the content-type can be text/html but still be a valid
  // feed.
  return strpos(strtolower($media_type), 'xml') !== FALSE;
}
503

    
504
/**
 * Finds potential feed tags in the HTML document.
 *
 * A <link> tag qualifies when its rel attribute is 'alternate', its type
 * attribute contains 'xml' and it has an href attribute.
 *
 * @param string $html
 *   The html string to search.
 *
 * @return array
 *   An array of href values pointing to feeds, in document order. The hrefs
 *   are entity-decoded but otherwise returned as written in the document.
 */
function http_request_find_feeds($html) {
  $valid_links = array();

  $matches = array();
  if (!preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches)) {
    // No <link> tags at all, or the pattern could not be applied.
    return $valid_links;
  }

  // $matches[1] holds the raw attribute string of every <link> tag.
  foreach ($matches[1] as $link_tag) {
    $attributes = array();
    $candidate = array();

    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
    foreach ($attributes as $attribute) {
      // $attribute[1] is the attribute name. The value is in group 2 (double
      // quoted), 3 (single quoted) or 4 (unquoted). Trailing groups that did
      // not take part in the match are absent from the array, so each one has
      // to be checked with isset().
      $value = '';
      foreach (array(2, 3, 4) as $group) {
        if (isset($attribute[$group]) && $attribute[$group] !== '') {
          $value = $attribute[$group];
          break;
        }
      }

      if (empty($attribute[1]) || empty($value)) {
        continue;
      }

      $name = drupal_strtolower($attribute[1]);
      $value = decode_entities($value);
      // 'rel' and 'type' are compared case insensitively below, but the href
      // is a URL whose path and query are case sensitive, so it must keep its
      // original case.
      $candidate[$name] = ($name == 'href') ? $value : drupal_strtolower($value);
    }

    // Examine candidate to see if it is a feed.
    // @TODO: could/should use http_request_is_feed ??
    if (isset($candidate['rel']) && $candidate['rel'] == 'alternate') {
      if (isset($candidate['href']) && isset($candidate['type']) && strpos($candidate['type'], 'xml') !== FALSE) {
        // All tests pass, it is a valid candidate.
        $valid_links[] = $candidate['href'];
      }
    }
  }

  return $valid_links;
}
549

    
550
/**
 * Create an absolute url.
 *
 * An already absolute $url is returned unchanged. A relative $url is resolved
 * against $base_url: '.' and '..' segments are collapsed and the scheme,
 * credentials, host and port of $base_url are prepended.
 *
 * @param string $url
 *   The href to transform.
 * @param string $base_url
 *   The url to be used as the base for a relative $url.
 *
 * @return string|false
 *   An absolute url, or FALSE when $url is not a valid URL, $base_url cannot
 *   be parsed or the combination does not form a valid absolute URL.
 */
function http_request_create_absolute_url($url, $base_url) {
  $url = trim($url);
  if (valid_url($url, TRUE)) {
    // Valid absolute url already.
    return $url;
  }

  if (!valid_url($url, FALSE)) {
    // Neither an absolute nor a relative URL.
    return FALSE;
  }

  $parsed_url = parse_url($base_url);
  if ($parsed_url === FALSE) {
    // Invalid $base_url.
    return FALSE;
  }

  if ($url[0] == '/') {
    // Root-relative URL: the path of the base URL is irrelevant.
    $segments = explode('/', $url);
  }
  else {
    $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
    if (strlen($path) > 0 && substr($path, -1) != '/') {
      // The base path points to a document, so drop everything after its last
      // '/'. This is done by hand because dirname() is platform dependent.
      $last_slash = strrpos($path, '/');
      $path = ($last_slash === FALSE) ? '' : substr($path, 0, $last_slash);
    }
    $segments = array_merge(explode('/', $path), explode('/', $url));
  }

  // Collapse '.' and '..' segments. Segments are compared strictly so that a
  // literal "0" segment survives.
  $resolved = array();
  foreach ($segments as $segment) {
    if ($segment === '' || $segment === '.') {
      continue;
    }
    if ($segment === '..') {
      // Go one level up; a '..' with nothing left to remove is ignored.
      array_pop($resolved);
      continue;
    }
    $resolved[] = $segment;
  }
  $path = implode('/', $resolved);

  // Build the prefix to the path.
  $absolute_url = '';
  if (isset($parsed_url['scheme'])) {
    $absolute_url = $parsed_url['scheme'] . '://';
  }

  if (isset($parsed_url['user'])) {
    $absolute_url .= $parsed_url['user'];
    if (isset($parsed_url['pass'])) {
      $absolute_url .= ':' . $parsed_url['pass'];
    }
    $absolute_url .= '@';
  }

  if (isset($parsed_url['host'])) {
    $absolute_url .= $parsed_url['host'];
    if (isset($parsed_url['port'])) {
      // Keep a non-default port of the base URL.
      $absolute_url .= ':' . $parsed_url['port'];
    }
    $absolute_url .= '/';
  }

  $absolute_url .= $path;

  return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
}