Projet

Général

Profil

Paste
Télécharger (18,9 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / modules / feeds / libraries / common_syndication_parser.inc @ 13755f8d

1
<?php
2

    
3
/**
 * @file
 *   Downloading and parsing functions for Common Syndication Parser.
 *   Pillaged from FeedAPI common syndication parser.
 *
 * @todo Restructure. OO could work wonders here.
 * @todo Write unit tests.
 * @todo Keep in Feeds project or host on Drupal?
 */
12

    
13
/**
 * Parse a feed document into a data structure.
 *
 * @param $string
 *   The raw XML of the feed.
 * @return
 *   The structured data produced by the format-specific parser (Atom 1.0,
 *   RSS 0.91/0.92/2.0 or RDF), or FALSE if the XML is malformed or the feed
 *   format is not recognized.
 */
function common_syndication_parser_parse($string) {
  // Parser diagnostics are silenced; failure is signalled through the return
  // value checked just below.
  $document = @simplexml_load_string($string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);

  // Got a malformed XML.
  if ($document === FALSE || is_null($document)) {
    return FALSE;
  }

  // Dispatch on the detected format; RSS 0.91 and 0.92 share the 2.0 parser.
  switch (_parser_common_syndication_feed_format_detect($document)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($document);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($document);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($document);
  }

  return FALSE;
}
40

    
41
/**
 * Fetch the cached data stored for <var>$url</var>.
 *
 * @param $url
 *   The URL whose md5 hash names the cache file.
 * @return
 *   The unserialized content of the cache file, or FALSE when no cache file
 *   exists for the URL.
 */
function _parser_common_syndication_cache_get($url) {
  $path = _parser_common_syndication_sanitize_cache() . '/' . md5($url);
  if (!file_exists($path)) {
    return FALSE;
  }
  // NOTE(review): unserialize() assumes the cache directory is only written
  // by this module - confirm it is not writable by untrusted parties.
  return unserialize(file_get_contents($path));
}
52

    
53
/**
 * Work out which syndication format a SimpleXML document uses.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE if
 *   the document is not an object or matches none of these formats.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $attributes = $xml->attributes();
  $root_name = strtolower($xml->getName());

  if ($root_name == "feed") {
    // Atom is only recognized when at least one <entry> is present.
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }

  if ($root_name == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }

  if ($root_name == "rss") {
    // The comparison is deliberately loose, as the version attribute is
    // matched against the textual version number.
    $version = (string) $attributes["version"];
    $labels = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($labels as $number => $label) {
      if ($version == $number) {
        return $label;
      }
    }
  }

  return FALSE;
}
84

    
85
/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   SimpleXML object of the feed's root element.
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with 'title', 'description', 'author_name', 'url',
 *   'guid', 'geolocations', 'tags' and 'domains', plus 'timestamp' when the
 *   entry carries a published, issued or updated element.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  // xpath() may return FALSE instead of a list, so only shift a real array.
  $base = $feed_XML->xpath("@base");
  $base = is_array($base) ? (string) array_shift($base) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  // A relative feed link is resolved against the base URL when there is one.
  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {

    $guid = !empty($news->id) ? "{$news->id}" : NULL;

    $georss = (array) $news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    // A point is only accepted when it holds both coordinates. They are kept
    // as strings and tested with isset() further down, so that a coordinate
    // of "0" is not mistaken for a missing one.
    $lat = $lon = NULL;
    if (isset($georss['point'])) {
      $latlon = preg_split('/\s+/', trim((string) $georss['point']));
      if (count($latlon) >= 2) {
        $lat = $latlon[0];
        $lon = $latlon[1];
        if (!$geoname) {
          $geoname = "{$lat} {$lon}";
        }
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Store the term first, so that the index recorded for its domain
        // below points at this term.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // The body is the serialized child markup followed by the text content,
    // taken from <content> and falling back to <summary>.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    // Author precedence: source author, entry author, then feed author.
    // Reset for every entry so a value never carries over from the previous
    // entry.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    // The item URL always comes from the entry's link elements.
    // NOTE(review): the entry id and the content "src" attribute are not
    // consulted as URL fallbacks - confirm whether they should be.
    $original_url = _parser_common_syndication_link($news->link);

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    $item['url'] = trim($original_url);
    if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
      $item['url'] = $base . $item['url'];
    }
    // Fall back on URL if GUID is empty.
    if (!empty($guid)) {
      $item['guid'] = $guid;
    }
    else {
      $item['guid'] = $item['url'];
    }
    $item['geolocations'] = array();
    if (isset($lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }
    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }
  return $parsed_source;
}
239

    
240
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   SimpleXML object of the feed's root element.
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item holds the mapped fields 'title', 'description', 'url',
 *   'author_name', 'guid', 'timestamp' and 'tags', plus 'rdf', which lists
 *   every RDF property found on the item.
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf'      => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs'     => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi'      => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd'      => 'http://www.w3.org/2001/XMLSchema#',
    'owl'      => 'http://www.w3.org/2002/07/owl#',
    'dc'       => 'http://purl.org/dc/elements/1.1/',
    'dcterms'  => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf'     => 'http://xmlns.com/foaf/0.1/',
    'rss'      => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element.
  $namespaces = $feed_XML->getNamespaces(TRUE);

  // Normalize the prefixes used by the feed to well-known 'standard'
  // prefixes where possible, as the feed may in principle use any arbitrary
  // prefixes and we should still be able to correctly handle it. This is
  // the same for every item, so it is worked out once.
  $prefixes = array();
  foreach ($namespaces as $ns => $ns_uri) {
    $canonical_prefix = array_search($ns_uri, $canonical_namespaces);
    $prefixes[$ns] = $canonical_prefix ? $canonical_prefix : $ns;
  }

  // Start from empty metadata so that a feed without an <rss:channel> still
  // produces a complete result.
  $parsed_source = array(
    'title'       => '',
    'description' => '',
    'link'        => '',
    'items'       => array(),
  );

  // Process the <rss:channel> resource containing feed metadata; only the
  // first channel is used.
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source = array(
      'title'       => _parser_common_syndication_title((string) $rss_channel->title),
      'description' => (string) $rss_channel->description,
      'link'        => (string) $rss_channel->link,
      'items'       => array(),
    );
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      $ns_prefix = $prefixes[$ns];
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the result object.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title'       => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url'         => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid'        => 'rdf:about',
      'timestamp'   => 'dc:date',
      'tags'        => 'dc:subject'
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps. An item carrying
    // several dates maps to an array, of which the first value is used.
    $date = is_array($item['timestamp']) ? reset($item['timestamp']) : $item['timestamp'];
    $item['timestamp'] = _parser_common_syndication_parse_date($date);

    // If no GUID found, use the URL of the item.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // looks nicer in the mapper UI
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
332

    
333
/**
 * Return the values of the first requested RDF property that has data.
 *
 * @param $rdf_data
 *   Array of value lists keyed by property name.
 * @param $rdf_properties
 *   Either an array of property names to try in order, or the first of any
 *   number of property names passed as separate arguments.
 * @return
 *   The values of the first non-empty property with empty strings removed
 *   (original keys are preserved), or NULL if none of the properties has
 *   data.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  // Variadic call style: every argument after $rdf_data is a property name.
  if (!is_array($rdf_properties)) {
    $rdf_properties = array_slice(func_get_args(), 1);
  }

  foreach ($rdf_properties as $candidate) {
    if (!$candidate || empty($rdf_data[$candidate])) {
      continue;
    }
    // remove empty strings
    return array_filter($rdf_data[$candidate], 'strlen');
  }

  return NULL;
}
342

    
343
/**
 * Build an item array by resolving each mapping against the RDF data.
 *
 * @param $rdf_data
 *   Array of value lists keyed by property name.
 * @param $mappings
 *   Array keyed by result field; each value is a property name or an array
 *   of property names handed to _parser_common_syndication_RDF10_property().
 * @return
 *   Array with the same keys as $mappings. A lookup that yields an array of
 *   at most one value is collapsed with reset(); any other lookup result is
 *   stored unchanged.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $item = array();
  foreach ($mappings as $field => $properties) {
    $found = _parser_common_syndication_RDF10_property($rdf_data, $properties);
    if (is_array($found) && count($found) <= 1) {
      $item[$field] = reset($found);
    }
    else {
      $item[$field] = $found;
    }
  }
  return $item;
}
350

    
351
/**
 * Parse RSS2.0 feeds.
 *
 * @param $feed_XML
 *   SimpleXML object of the feed's root element.
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with 'title', 'description', 'author_name',
 *   'timestamp', 'url', 'guid', 'geolocations', 'domains' and 'tags'.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {

  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();
  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source['items'] = array();

  // xpath() may return FALSE instead of a list; treat that as "no items".
  $news_items = $feed_XML->xpath('//item');
  if (!is_array($news_items)) {
    $news_items = array();
  }

  foreach ($news_items as $news) {
    $title = $body = $original_author = $original_url = $guid = '';

    // The category elements are collected before the item is cast to an
    // array, and the collected list is what gets iterated further down.
    $category = $news->xpath('category');
    // Get children for current namespace.
    $content = (array) $news->children($ns["content"]);
    $dc      = (array) $news->children($ns["dc"]);
    $georss  = (array) $news->children($ns["georss"]);
    $news = (array) $news;
    $news['category'] = is_array($category) ? $category : array();

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts wins.
    if (isset($news['encoded'])) {  // content:encoded for PHP < 5.1.2.
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    // The link doubles as the GUID unless an explicit guid is present.
    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    // Initialise the geo variables first, so that a feature name read from
    // the item is not wiped out again before it is used.
    $lat = $lon = $geoname = NULL;
    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }
    if (!empty($georss['point'])) {
      // A point is only accepted when it holds both coordinates.
      $latlon = preg_split('/\s+/', trim((string) $georss['point']));
      if (count($latlon) >= 2) {
        $lat = $latlon[0];
        $lon = $latlon[1];
        if (!$geoname) {
          $geoname = "$lat $lon";
        }
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    foreach ($news['category'] as $category) {
      $additional_taxonomies['RSS Categories'][] = "{$category}";
      if (isset($category['domain'])) {
        $domain = "{$category['domain']}";
        if (!empty($domain)) {
          if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
            $additional_taxonomies['RSS Domains'][$domain] = array();
          }
          // Index of the category that was just appended.
          $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;
    // Use pubDate, then dc:date, then the current time.
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }
    $item['url'] = trim($original_url);
    $item['guid'] = $guid;

    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }
  return $parsed_source;
}
489

    
490
/**
 * Convert a date string found in a feed into a timestamp.
 *
 * @param $date_str
 *   The date string in various formats.
 * @return
 *   The timestamp of the string, or the current time if parsing returned
 *   FALSE.
 */
function _parser_common_syndication_parse_date($date_str) {
  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  $date_str = str_replace(array('GMT-', 'GMT+'), array('-', '+'), $date_str);

  // First attempt: PHP's own parser. Second attempt: W3C date/time format.
  $timestamp = strtotime($date_str);
  if ($timestamp === FALSE || $timestamp == -1) {
    $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  if ($timestamp === FALSE || $timestamp == -1) {
    // PHP does not support the UT timezone. Fake it. The system that generated
    // this, Google Groups, probably meant UTC.
    $normalized = strtolower(trim($date_str));
    $suffix = substr($normalized, strlen($normalized) - 3, 3);
    if ($suffix == ' ut') {
      // Appending "c" turns the trailing " ut" into " utc".
      $timestamp = strtotime($normalized . 'c');
    }
  }

  if ($timestamp === FALSE) {
    return time();
  }
  return $timestamp;
}
521

    
522
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes,
  // 6 ":seconds" (with the colon), 7 seconds, 8 offset sign, 9 offset hours,
  // 10 offset minutes, 11 "Z".
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
  if (!preg_match($pattern, $date_str, $match)) {
    return FALSE;
  }

  // Optional groups at the end of the pattern are missing from $match when
  // they did not take part in the match, so each one is checked before use.
  $seconds = (isset($match[7]) && $match[7] !== '') ? (int) $match[7] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT: nothing to adjust.
  $is_zulu = isset($match[11]) && $match[11] == 'Z';
  $has_offset = isset($match[8]) && $match[8] !== '';
  if (!$is_zulu && $has_offset) {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT?  If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}
561

    
562
/**
 * Extract the link that points to the original content (back to site or
 * original article).
 *
 * @param $links
 *   Array of SimpleXML objects
 * @return
 *   The href of the first link whose rel attribute is "alternate". If there
 *   is none, the href of the last link examined. A link without an href
 *   counts as an empty string, which is also the result when there are no
 *   links.
 */
function _parser_common_syndication_link($links) {
  $href = '';
  if (count($links) <= 0) {
    return $href;
  }

  foreach ($links as $element) {
    $attributes = $element->attributes();
    $href = isset($attributes["href"]) ? (string) $attributes["href"] : "";
    // Stop at the first "alternate" link; $href then holds its target.
    if (isset($attributes["rel"]) && (string) $attributes["rel"] == 'alternate') {
      break;
    }
  }

  return $href;
}
584

    
585
/**
 * Prepare raw data to be a title.
 *
 * @param $title
 *   The title as found in the feed.
 * @param $body
 *   Optional body text used to derive a title when $title is empty.
 * @return
 *   $title unchanged when it is non-empty or when no body is available;
 *   otherwise the first three words of the body with its tags stripped.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (!empty($title) || empty($body)) {
    return $title;
  }

  // Explode to words (split on whitespace and commas) and use the first 3.
  $plain_text = strip_tags($body);
  $leading_words = array_slice(preg_split('/[\s,]+/', $plain_text), 0, 3);
  return implode(' ', $leading_words);
}