Projet

Général

Profil

Paste
Télécharger (16 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / modules / feeds / libraries / http_request.inc @ a192dc0b

1
<?php
2

    
3
/**
4
 * @file
5
 * Download via HTTP.
6
 *
7
 * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
8
 * redirects.
9
 */
10

    
11
/**
12
 * PCRE for finding the link tags in html.
13
 */
14
// Capture group 1 receives the raw attribute string of each <link> tag (the
// text between "<link" and the closing ">" or "/>"). The character class
// [\x09\x0A\x0B\x0C\x0D\x20] is tab, LF, VT, FF, CR and space. The "s" modifier
// lets "." span newlines and "i" makes the match case-insensitive.
define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
15

    
16
/**
17
 * PCRE for matching all the attributes in a tag.
18
 */
19
// Capture groups: 1 = attribute name, 2 = double-quoted value, 3 = single-quoted
// value, 4 = unquoted value. At most one of groups 2-4 is filled per match, and
// none of them is when the attribute has no "=value" part.
define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
20

    
21
/**
22
 * For cUrl specific errors.
23
 */
24
// Thrown by http_request_get() when cURL reports an error after curl_exec();
// the exception code carries the cURL error number.
class HRCurlException extends Exception {}
25

    
26
/**
 * Discovers RSS or atom feeds at the given URL.
 *
 * If the document at the given URL is itself recognised as a feed (judged by
 * its Content-Type header), the URL is returned unchanged. Otherwise the
 * document is scanned for feed <link> tags and the first one that can be
 * turned into an absolute URL is returned.
 *
 * @param string $url
 *   The url of the feed to retrieve.
 * @param array $settings
 *   An optional array of settings. Valid options are: accept_invalid_cert.
 *
 * @return bool|string
 *   The discovered feed URL, or FALSE if the URL is not reachable, there was
 *   an error, or no feed could be discovered.
 */
function http_request_get_common_syndication($url, $settings = array()) {
  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $download = http_request_get($url, NULL, NULL, $accept_invalid_cert);

  // Cannot get the feed, return.
  // http_request_get() answers a 304 with the previously cached result, so a
  // 304 is not expected to show up here.
  if ($download->code != 200) {
    return FALSE;
  }

  // Drop the data into a separate variable so all manipulations of the html
  // will not affect the actual object that exists in the static cache.
  // @see http_request_get()
  $downloaded_string = isset($download->data) ? $download->data : '';

  // If this happens to be a feed then just return the url.
  if (isset($download->headers['content-type']) && http_request_is_feed($download->headers['content-type'], $downloaded_string)) {
    return $url;
  }

  $discovered_feeds = http_request_find_feeds($downloaded_string);
  foreach ($discovered_feeds as $feed_url) {
    $absolute = http_request_create_absolute_url($feed_url, $url);
    if (!empty($absolute)) {
      // @TODO: something more intelligent?
      return $absolute;
    }
  }

  // Nothing usable was discovered. Return FALSE explicitly so the result
  // matches the documented return type instead of an implicit NULL.
  return FALSE;
}
70

    
71
/**
 * Get the content from the given URL.
 *
 * The request is made with cURL when http_request_use_curl() allows it and
 * with drupal_http_request() otherwise. When a cached copy exists, its ETag
 * and Last-Modified headers are sent as conditional request headers, and a
 * 304 response is answered with that cached copy.
 *
 * Every result that is not answered from the cache is stored both in the
 * persistent cache (http_request_set_cache()) and in a per-request static
 * cache. Both are keyed by the URL after 'feed://' and 'webcal://' have been
 * rewritten to 'http://'.
 *
 * @param string $url
 *   A valid URL (not only web URLs).
 * @param string $username
 *   If the URL uses authentication, supply the username.
 * @param string $password
 *   If the URL uses authentication, supply the password.
 * @param bool $accept_invalid_cert
 *   Whether to accept invalid certificates.
 * @param integer $timeout
 *   Timeout in seconds to wait for an HTTP get request to finish.
 *
 * @return stdClass
 *   An object that describes the data downloaded from $url. It carries
 *   'code' and 'headers'; 'data' after a completed request; 'error' when the
 *   cURL path rejects the URL scheme; and 'from_cache' set to TRUE when the
 *   cached copy is returned after a 304.
 *
 * @throws HRCurlException
 *   When cURL reports an error for the request.
 */
function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE, $timeout = NULL) {
  // Intra-page download cache: avoid downloading the same content twice
  // within one page request (it's possible, compatible and parse calls).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }

  // Determine request timeout.
  $request_timeout = !empty($timeout) ? $timeout : variable_get('http_request_timeout', 30);

  if (!$username && valid_url($url, TRUE)) {
    // Handle password protected feeds: credentials embedded in the URL.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      // The password part is optional ("user@host"), so do not assume that it
      // is present.
      $password = isset($url_parts['pass']) ? urldecode($url_parts['pass']) : '';
      $username = urldecode($url_parts['user']);
    }
  }

  $curl = http_request_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  // cURL expects a list of "Name: value" strings, drupal_http_request() an
  // array keyed by header name, hence the two forms below.
  $headers = array();
  if ($cache = http_request_get_cache($url)) {
    $last_result = $cache->data;
    $last_headers = (isset($last_result->headers) && is_array($last_result->headers)) ? array_change_key_case($last_result->headers) : array();

    if (!empty($last_headers['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: ' . $last_headers['etag'];
      }
      else {
        $headers['If-None-Match'] = $last_headers['etag'];
      }
    }
    if (!empty($last_headers['last-modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: ' . $last_headers['last-modified'];
      }
      else {
        $headers['If-Modified-Since'] = $last_headers['last-modified'];
      }
    }
  }

  // Without cURL the credentials travel as a Basic Authorization header. This
  // has to happen whether or not a cached copy exists; otherwise the very
  // first request to a protected feed would be sent unauthenticated. With
  // cURL the credentials are passed through CURLOPT_USERPWD further down.
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic ' . base64_encode("$username:$password");
  }

  // Support the 'feed' and 'webcal' schemes by converting them into 'http'.
  $url = strtr($url, array('feed://' => 'http://', 'webcal://' => 'http://'));

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();
    $result->headers = array();

    // Parse the URL and make sure we can handle the schema.
    // cURL can only support either http:// or https://.
    // CURLOPT_PROTOCOLS is only supported with cURL 7.19.4
    $uri = parse_url($url);
    if (!isset($uri['scheme'])) {
      $result->error = 'missing schema';
      $result->code = -1002;
    }
    else {
      switch ($uri['scheme']) {
        case 'http':
        case 'https':
          // Valid scheme.
          break;

        default:
          $result->error = 'invalid schema ' . $uri['scheme'];
          $result->code = -1003;
          break;
      }
    }

    // If the scheme was valid, continue to request the feed using cURL.
    if (empty($result->error)) {
      $download = curl_init($url);
      curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
      if (!empty($username)) {
        curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
        curl_setopt($download, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
      }
      curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
      // Response headers are returned in front of the body and separated
      // again below with the help of CURLINFO_HEADER_SIZE.
      curl_setopt($download, CURLOPT_HEADER, TRUE);
      curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
      // An empty string asks cURL to accept every encoding it supports.
      curl_setopt($download, CURLOPT_ENCODING, '');
      curl_setopt($download, CURLOPT_TIMEOUT, $request_timeout);

      $proxy_server = variable_get('proxy_server');
      $use_proxy = $proxy_server && _drupal_http_use_proxy($uri['host']);

      if ($use_proxy) {
        curl_setopt($download, CURLOPT_PROXY, $proxy_server);
        curl_setopt($download, CURLOPT_PROXYPORT, variable_get('proxy_port', 8080));

        // Proxy user/password.
        if ($proxy_username = variable_get('proxy_username')) {
          $username_password = $proxy_username . ':' . variable_get('proxy_password', '');

          curl_setopt($download, CURLOPT_PROXYUSERPWD, $username_password);
          curl_setopt($download, CURLOPT_PROXYAUTH, variable_get('proxy_auth_method', CURLAUTH_BASIC));
        }
      }

      if ($accept_invalid_cert) {
        curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($download, CURLOPT_SSL_VERIFYHOST, 0);
      }

      $data = curl_exec($download);
      if (curl_error($download)) {
        // Read the error details first, then release the handle so it does
        // not stay open once the exception leaves this function.
        $curl_errno = curl_errno($download);
        $curl_error = curl_error($download);
        curl_close($download);

        throw new HRCurlException(
          t('cURL error (@code) @error for @url', array(
            '@code' => $curl_errno,
            '@error' => $curl_error,
            '@url' => $url,
          )), $curl_errno
        );
      }

      // When using a proxy, remove extra data from the header which is not
      // considered by CURLINFO_HEADER_SIZE (possibly cURL bug).
      // This data is only added to the HTTP header when working with a proxy.
      // Example string added: <HTTP/1.0 200 Connection established\r\n\r\n>
      // This was fixed in libcurl version 7.30.0 (0x71e00) (April 12, 2013),
      // so this workaround only removes the proxy-added headers if we are using
      // an older version of libcurl.
      $curl_ver = curl_version();

      if ($use_proxy && $curl_ver['version_number'] < 0x71e00) {
        $http_header_break = "\r\n\r\n";
        $response = explode($http_header_break, $data);
        if (count($response) > 2) {
          $data = substr($data, strlen($response[0] . $http_header_break), strlen($data));
        }
      }

      $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
      $raw_header = substr($data, 0, $header_size - 1);
      $result->data = substr($data, $header_size);

      // Redirects produce one header block per response; only the last block
      // belongs to the document that was finally delivered.
      $header_blocks = preg_split("/(\r\n){2}/", $raw_header);
      $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));
      // Skip HTTP response status.
      array_shift($header_lines);

      foreach ($header_lines as $line) {
        $line = trim($line);
        if ($line === '') {
          // An empty line marks the end of the header block.
          break;
        }
        if (strpos($line, ':') === FALSE) {
          // Not a "Name: value" line; nothing sensible can be stored for it.
          continue;
        }
        list($name, $value) = explode(':', $line, 2);
        // Normalize the headers.
        $name = strtolower($name);

        if (isset($result->headers[$name]) && $name == 'set-cookie') {
          // RFC 2109: the Set-Cookie response header comprises the token Set-
          // Cookie:, followed by a comma-separated list of one or more cookies.
          $result->headers[$name] .= ',' . trim($value);
        }
        else {
          $result->headers[$name] = trim($value);
        }
      }
      $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);

      curl_close($download);
    }
  }
  else {
    $result = drupal_http_request($url, array('headers' => $headers, 'timeout' => $request_timeout));
    $result->headers = isset($result->headers) ? $result->headers : array();
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // In case of 304 Not Modified try to return cached data.
  if ($result->code == 304) {
    if (isset($last_result)) {
      $last_result->from_cache = TRUE;
      return $last_result;
    }
    else {
      // It's a tragedy, this file must exist and contain good data.
      // In this case, clear cache and repeat.
      http_request_clear_cache($url);
      return http_request_get($url, $username, $password, $accept_invalid_cert, $request_timeout);
    }
  }

  // Set caches.
  http_request_set_cache($url, $result);
  $download_cache[$url] = $result;

  return $result;
}
285

    
286
/**
 * Tells whether requests may be performed with cURL.
 *
 * cURL is ruled out when the 'feeds_never_use_curl' variable is set, when the
 * PHP cURL extension is not loaded, or when PHP is older than 5.6.0 and
 * either safe_mode or open_basedir is in effect.
 *
 * @return bool
 *   TRUE if cURL may be used, FALSE otherwise.
 */
function http_request_use_curl() {
  // Site administrators may opt out of cURL, and without the extension there
  // is nothing to use in the first place.
  $curl_disabled = variable_get('feeds_never_use_curl', FALSE);
  if ($curl_disabled || !extension_loaded('curl')) {
    return FALSE;
  }

  // cURL in PHP 5.6.0 and above re-enables CURLOPT_FOLLOWLOCATION with
  // open_basedir, so no further checks are needed there.
  if (version_compare(PHP_VERSION, '5.6.0', '>=')) {
    return TRUE;
  }

  // Older PHP: cURL must not be combined with safe_mode or open_basedir.
  return !(ini_get('safe_mode') || ini_get('open_basedir'));
}
312

    
313
/**
 * Removes the cached download of a single URL.
 *
 * @param string $url
 *   The URL whose entry in the 'cache_feeds_http' bin is to be cleared.
 */
function http_request_clear_cache($url) {
  // Cache IDs are the SHA-256 hash of the URL.
  $cid = hash('sha256', $url);
  cache_clear_all($cid, 'cache_feeds_http');
}
322

    
323
/**
 * Looks up the cached download of a single URL.
 *
 * @param string $url
 *   The URL whose entry in the 'cache_feeds_http' bin is requested.
 *
 * @return object|false
 *   Whatever cache_get() returns for the entry: the cache or FALSE on failure.
 */
function http_request_get_cache($url) {
  // Cache IDs are the SHA-256 hash of the URL.
  $cid = hash('sha256', $url);
  $cached = cache_get($cid, 'cache_feeds_http');
  return $cached;
}
335

    
336
/**
 * Stores the download result of a single URL in the cache.
 *
 * @param string $url
 *   The URL the result belongs to.
 * @param stdClass $result
 *   The result of the HTTP request, written to the 'cache_feeds_http' bin.
 */
function http_request_set_cache($url, stdClass $result) {
  // Cache IDs are the SHA-256 hash of the URL.
  $cid = hash('sha256', $url);
  cache_set($cid, $result, 'cache_feeds_http');
}
347

    
348
/**
 * Decides from a Content-Type header whether a response is a feed.
 *
 * Only the media type is inspected: everything from the first ';' onwards
 * (for example a charset parameter) is ignored, and the comparison is
 * case-insensitive. Any media type containing 'xml' counts as a feed.
 *
 * @param string $content_type
 *   The Content-Type header.
 * @param string $data
 *   The actual data from the http request. Currently not inspected.
 *
 * @return bool
 *   Returns TRUE if this is a parsable feed.
 */
function http_request_is_feed($content_type, $data) {
  // Keep the part in front of the first ';', i.e. the bare media type.
  $pieces = explode(';', $content_type, 2);
  $media_type = strtolower($pieces[0]);

  // @TODO: Sometimes the content-type can be text/html but still be a valid
  // feed.
  return strpos($media_type, 'xml') !== FALSE;
}
374

    
375
/**
 * Finds potential feed tags in the HTML document.
 *
 * A <link> tag qualifies when its rel attribute equals 'alternate' and its
 * type attribute contains 'xml' (both compared case-insensitively) and it has
 * a non-empty href. Attribute values may be double-quoted, single-quoted or
 * unquoted.
 *
 * @param string $html
 *   The html string to search.
 *
 * @return array
 *   An array of href to feeds, in document order. Entities in the href are
 *   decoded; its letter case is left untouched because URL paths and queries
 *   may be case-sensitive.
 */
function http_request_find_feeds($html) {
  $valid_links = array();

  $matches = array();
  if (!preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches)) {
    // No <link> tag at all (or the pattern failed to run).
    return $valid_links;
  }

  // $matches[1] holds the raw attribute string of every <link> tag.
  foreach ($matches[1] as $link_tag) {
    $attributes = array();
    $candidate = array();

    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
    foreach ($attributes as $attribute) {
      // $attribute[1] is the attribute name. The value sits in group 2
      // (double-quoted), 3 (single-quoted) or 4 (unquoted). Trailing groups
      // that did not take part in the match are absent from the array, so
      // each one is checked with isset().
      $value = '';
      foreach (array(2, 3, 4) as $group) {
        if (isset($attribute[$group]) && $attribute[$group] !== '') {
          $value = $attribute[$group];
          break;
        }
      }

      if (!empty($attribute[1]) && $value !== '') {
        $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
      }
    }

    // Examine candidate to see if it's a feed.
    // @TODO: could/should use http_request_is_feed ??
    if (!isset($candidate['rel'], $candidate['href'], $candidate['type'])) {
      continue;
    }
    $is_alternate = drupal_strtolower($candidate['rel']) == 'alternate';
    $is_xml_type = strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE;
    if ($is_alternate && $is_xml_type) {
      // All tests pass, it's a valid candidate.
      $valid_links[] = $candidate['href'];
    }
  }

  return $valid_links;
}
420

    
421
/**
 * Create an absolute url.
 *
 * An $url that is already absolute is returned as is. A relative $url is
 * resolved against $base_url: scheme, credentials, host and port are taken
 * from the base, '.' and '..' segments are collapsed, and a leading '/'
 * replaces the base path instead of extending it. A protocol-relative $url
 * ('//host/path') only inherits the scheme of the base.
 *
 * @param string $url
 *   The href to transform.
 * @param string $base_url
 *   The url to be used as the base for a relative $url.
 *
 * @return string|false
 *   An absolute url, or FALSE if $url or $base_url is not usable or the
 *   result does not pass valid_url().
 */
function http_request_create_absolute_url($url, $base_url) {
  $url = trim($url);
  if (valid_url($url, TRUE)) {
    // Valid absolute url already.
    return $url;
  }

  // Anything that is not even a valid relative url cannot be resolved.
  if (!valid_url($url, FALSE)) {
    return FALSE;
  }

  $parsed_url = parse_url($base_url);
  if ($parsed_url === FALSE) {
    // Invalid $base_url.
    return FALSE;
  }

  // Protocol-relative reference: the href names its own host, so only the
  // scheme comes from the base url.
  if (substr($url, 0, 2) === '//') {
    if (!isset($parsed_url['scheme'])) {
      return FALSE;
    }
    $absolute_url = $parsed_url['scheme'] . ':' . $url;
    return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
  }

  $path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
  if (strlen($path) > 0 && substr($path, -1) != '/') {
    // Path ends not with '/', so drop everything after the last '/'. Done
    // with string functions because dirname() is platform dependent.
    $last_slash = strrpos($path, '/');
    $path = ($last_slash === FALSE) ? '' : substr($path, 0, $last_slash);
  }

  if (substr($url, 0, 1) === '/') {
    // Root-relative href: the base path plays no role.
    $segments = explode('/', $url);
  }
  else {
    // Relative href: continue from the directory of the base path.
    $segments = array_merge(explode('/', $path), explode('/', $url));
  }

  // Collapse '.' and '..'. Each '..' removes the closest preceding segment
  // that is still present; a '..' with nothing left to remove is dropped.
  // Segments are compared as strings so that a segment "0" survives.
  $resolved = array();
  foreach ($segments as $segment) {
    if ($segment === '' || $segment === '.') {
      continue;
    }
    if ($segment === '..') {
      array_pop($resolved);
      continue;
    }
    $resolved[] = $segment;
  }
  $path = implode('/', $resolved);

  // Build the prefix to the path.
  $absolute_url = '';
  if (isset($parsed_url['scheme'])) {
    $absolute_url = $parsed_url['scheme'] . '://';
  }

  if (isset($parsed_url['user'])) {
    $absolute_url .= $parsed_url['user'];
    if (isset($parsed_url['pass'])) {
      $absolute_url .= ':' . $parsed_url['pass'];
    }
    $absolute_url .= '@';
  }

  if (isset($parsed_url['host'])) {
    $absolute_url .= $parsed_url['host'];
    if (isset($parsed_url['port'])) {
      $absolute_url .= ':' . $parsed_url['port'];
    }
    $absolute_url .= '/';
  }

  $absolute_url .= $path;

  return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
}