Projet

Général

Profil

Paste
Télécharger (18,6 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / modules / feeds / libraries / common_syndication_parser.inc @ 41cc1b08

1
<?php
2

    
3
/**
4
 * @file
5
 *   Downloading and parsing functions for Common Syndication Parser.
6
 *   Pillaged from FeedAPI common syndication parser.
7
 *
8
 * @todo Restructure. OO could work wonders here.
9
 * @todo Write unit tests.
10
 * @todo Keep in Feeds project or host on Drupal?
11
 */
12

    
13
/**
 * Parse a raw feed document into a structured array.
 *
 * The document is loaded with SimpleXML, its format is detected and the
 * matching format-specific parser is invoked.
 *
 * @param $string
 *   The feed document as an XML string.
 *
 * @return
 *   The result of the format-specific parser, or FALSE if the document is
 *   empty, is not well-formed XML or is not in a supported feed format.
 */
function common_syndication_parser_parse($string) {
  // SimpleXML only accepts an XML declaration at the very start of the
  // document, so strip surrounding whitespace before parsing.
  $string = trim((string) $string);
  if ($string === '') {
    return FALSE;
  }

  // Broken feeds are expected input: silence parser diagnostics and report
  // the failure through the return value instead.
  $xml = @simplexml_load_string($string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);

  // Got a malformed XML.
  if ($xml === FALSE || is_null($xml)) {
    return FALSE;
  }

  $feed_type = _parser_common_syndication_feed_format_detect($xml);
  if ($feed_type === "atom1.0") {
    return _parser_common_syndication_atom10_parse($xml);
  }
  // RSS 0.91 and 0.92 are handled by the RSS 2.0 parser.
  if ($feed_type === "RSS2.0" || $feed_type === "RSS0.91" || $feed_type === "RSS0.92") {
    return _parser_common_syndication_RSS20_parse($xml);
  }
  if ($feed_type === "RDF") {
    return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}
40

    
41
/**
 * Work out which syndication format a parsed feed document uses.
 *
 * @param $xml
 *   The SimpleXML object of the feed's root element.
 *
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE if
 *   the document is not in one of these formats.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $root = strtolower($xml->getName());

  // Atom: a <feed> root that holds at least one <entry>.
  if ($root == "feed") {
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }

  // RSS 1.0: an <RDF> root that holds a <channel>.
  if ($root == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }

  if ($root != "rss") {
    return FALSE;
  }

  // Plain RSS: the flavour is given by the root's version attribute.
  $attributes = $xml->attributes();
  $formats = array(
    "2.0" => "RSS2.0",
    "0.91" => "RSS0.91",
    "0.92" => "RSS0.92",
  );
  foreach ($formats as $version => $format) {
    if ($attributes["version"] == $version) {
      return $format;
    }
  }
  return FALSE;
}
72

    
73
/**
 * Parse Atom feeds.
 *
 * @param $feed_XML
 *   The SimpleXML object of the feed's root element.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'url', 'guid', 'geolocations', 'tags' and 'domains', plus 'timestamp'
 *   when the entry has a published, issued or updated element.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  // Only an absolute base URL is usable for completing relative links.
  $base_nodes = $feed_XML->xpath("@base");
  $base = is_array($base_nodes) ? (string) array_shift($base_nodes) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {

    // The entry id doubles as a URL candidate when it is an absolute URL.
    $original_url = NULL;
    $guid = !empty($news->id) ? "{$news->id}" : NULL;
    if ($guid !== NULL && valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $georss = (array) $news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    $lat = $lon = NULL;
    if (isset($georss['point'])) {
      // Repeated <georss:point> elements arrive as an array; use the first.
      $point = is_array($georss['point']) ? reset($georss['point']) : $georss['point'];
      $latlon = preg_split('/\s+/', trim("{$point}"), -1, PREG_SPLIT_NO_EMPTY);
      // Ignore points that do not provide both a latitude and a longitude.
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "{$lat} {$lon}";
        }
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Store the term first so that the index recorded for its domain
        // below points at this very term.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // Use <content> as the body, or <summary> when there is no content.
    // Child elements are serialized as markup, followed by the text value.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    // Reset the author for every entry so that an entry without an author
    // does not inherit the one found for the previous entry. The feed-level
    // author is only used when the entry itself names none.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    // The entry's <link> wins; the content src / id candidates collected
    // above are only kept when the entry provides no link at all.
    $link_url = _parser_common_syndication_link($news->link);
    if ($link_url !== '') {
      $original_url = $link_url;
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    $item['url'] = trim((string) $original_url);
    if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
      $item['url'] = $base . $item['url'];
    }
    // Fall back on URL if GUID is empty.
    if (!empty($guid)) {
      $item['guid'] = $guid;
    }
    else {
      $item['guid'] = $item['url'];
    }

    // Compare against NULL rather than testing truthiness: "0" is a valid
    // coordinate.
    $item['geolocations'] = array();
    if ($lat !== NULL && $lon !== NULL) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }
    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }
  return $parsed_source;
}
227

    
228
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   The SimpleXML object of the feed's root element.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. The
 *   first three are empty strings when the feed has no <rss:channel>. Each
 *   item holds the mapped 'title', 'description', 'url', 'author_name',
 *   'guid', 'timestamp' and 'tags' values plus an 'rdf' array with every RDF
 *   property found on the item.
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf'      => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs'     => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi'      => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd'      => 'http://www.w3.org/2001/XMLSchema#',
    'owl'      => 'http://www.w3.org/2002/07/owl#',
    'dc'       => 'http://purl.org/dc/elements/1.1/',
    'dcterms'  => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf'     => 'http://xmlns.com/foaf/0.1/',
    'rss'      => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element.
  $namespaces = $feed_XML->getNamespaces(TRUE);

  // Start from a complete, empty result so that a feed without a
  // <rss:channel> still yields a well-formed array.
  $parsed_source = array(
    'title'       => '',
    'description' => '',
    'link'        => '',
    'items'       => array(),
  );

  // Process the <rss:channel> resource containing feed metadata; only the
  // first channel is used.
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source['title'] = _parser_common_syndication_title((string) $rss_channel->title);
    $parsed_source['description'] = (string) $rss_channel->description;
    $parsed_source['link'] = (string) $rss_channel->link;
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $canonical_prefix = array_search($ns_uri, $canonical_namespaces);
      $ns_prefix = ($canonical_prefix !== FALSE) ? $canonical_prefix : $ns;

      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the result object.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title'       => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url'         => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid'        => 'rdf:about',
      'timestamp'   => 'dc:date',
      'tags'        => 'dc:subject'
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps. Several dc:date
    // values are mapped to an array, while the date parser works on a single
    // string, so only the first one is used.
    $date = is_array($item['timestamp']) ? reset($item['timestamp']) : $item['timestamp'];
    $item['timestamp'] = _parser_common_syndication_parse_date((string) $date);

    // If no GUID found, use the URL of the item.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // looks nicer in the mapper UI
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
320

    
321
/**
 * Fetch the values of the first listed RDF property that has any.
 *
 * @param $rdf_data
 *   Array of value lists keyed by RDF property name.
 * @param $rdf_properties
 *   The property names to try, in order of preference: either one array of
 *   names, or the names passed as separate arguments.
 *
 * @return
 *   The values of the first property with a non-empty value list, with empty
 *   strings removed and the original keys preserved, or NULL if none of the
 *   properties has values.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  // Names given as separate arguments: collect everything after $rdf_data.
  if (!is_array($rdf_properties)) {
    $rdf_properties = array_slice(func_get_args(), 1);
  }

  foreach ($rdf_properties as $name) {
    if (!$name || empty($rdf_data[$name])) {
      continue;
    }
    // remove empty strings
    return array_filter($rdf_data[$name], 'strlen');
  }

  return NULL;
}
330

    
331
/**
 * Build a feed item from RDF data according to a set of mappings.
 *
 * @param $rdf_data
 *   Array of value lists keyed by RDF property name.
 * @param $mappings
 *   Array keyed by item key; each value is an RDF property name or an array
 *   of property names in order of preference.
 *
 * @return
 *   Array with the same keys as $mappings. A value list with at most one
 *   entry is collapsed to its first element (FALSE for an empty list); longer
 *   lists and non-array lookup results are stored unchanged.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $item = array();
  foreach ($mappings as $key => $properties) {
    $found = _parser_common_syndication_RDF10_property($rdf_data, $properties);
    if (is_array($found) && count($found) <= 1) {
      $found = reset($found);
    }
    $item[$key] = $found;
  }
  return $item;
}
338

    
339
/**
 * Parse RSS2.0 feeds.
 *
 * @param $feed_XML
 *   The SimpleXML object of the feed's root element.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'timestamp', 'url', 'guid', 'geolocations', 'domains' and 'tags'.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {

  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();
  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source['items'] = array();

  // xpath() does not return an array when the query fails.
  $news_items = $feed_XML->xpath('//item');
  if (!is_array($news_items)) {
    $news_items = array();
  }

  foreach ($news_items as $news) {
    $title = $body = $original_author = $original_url = $guid = '';
    $lat = $lon = $geoname = NULL;

    $category = $news->xpath('category');
    // Get children for current namespace.
    $content = (array) $news->children($ns["content"]);
    $dc      = (array) $news->children($ns["dc"]);
    $georss  = (array) $news->children($ns["georss"]);
    $news = (array) $news;
    $news['category'] = $category;

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts is kept.
    if (isset($news['encoded'])) {  // content:encoded for PHP < 5.1.2.
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    // The link is the default GUID; an explicit <guid> overrides it.
    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    // The geo variables were reset at the top of the loop, so a feature
    // name found here is kept for the geolocation built below.
    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    if (!empty($georss['point'])) {
      // Repeated <georss:point> elements arrive as an array; use the first.
      $point = is_array($georss['point']) ? reset($georss['point']) : $georss['point'];
      $latlon = preg_split('/\s+/', trim("{$point}"), -1, PREG_SPLIT_NO_EMPTY);
      // Ignore points that do not provide both a latitude and a longitude.
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "$lat $lon";
        }
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    if (isset($news['category'])) {
      foreach ($news['category'] as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            // Index of the category that was just appended.
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }
    $item['url'] = trim($original_url);
    $item['guid'] = $guid;

    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }
  return $parsed_source;
}
477

    
478
/**
 * Convert a date string found in a feed into a Unix timestamp.
 *
 * @param $date_str
 *   The date string, in one of the various formats used by feeds.
 *
 * @return
 *   The timestamp for the date, or the current time if the string cannot be
 *   parsed.
 */
function _parser_common_syndication_parse_date($date_str) {
  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  $date_str = str_replace(array('GMT-', 'GMT+'), array('-', '+'), $date_str);

  // First attempt: PHP's own date parser.
  $timestamp = strtotime($date_str);

  // Second attempt: the W3C date/time format.
  if ($timestamp === FALSE || $timestamp == -1) {
    $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  // Third attempt: PHP does not support the UT timezone. Fake it. The system
  // that generated this, Google Groups, probably meant UTC.
  if ($timestamp === FALSE || $timestamp == -1) {
    $date_str = strtolower(trim($date_str));
    if (substr($date_str, -3) == ' ut') {
      // Appending 'c' turns the trailing "ut" into "utc".
      $timestamp = strtotime($date_str . 'c');
    }
  }

  if ($timestamp === FALSE) {
    return time();
  }
  return $timestamp;
}
509

    
510
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * Handles the variants that carry a time: YYYY-MM-DDThh:mm, optionally
 * followed by seconds (with or without a fraction) and by a time zone
 * designator ("Z", "+hh:mm", "-hh:mm", "+hhmm" or "-hhmm"). A date without a
 * time zone designator is taken as GMT.
 *
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 *
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Captures: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 seconds,
  // 7 offset sign, 8 offset hours, 9 offset minutes. A fraction of a second
  // is accepted but ignored.
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(?::(\d{2})(?:\.\d+)?)?(?:([-+])(\d{2}):?(\d{2})|Z)?/';
  if (!preg_match($pattern, (string) $date_str, $match)) {
    return FALSE;
  }

  // Optional groups that did not take part in the match are either missing
  // from $match or empty strings.
  $seconds = (isset($match[6]) && $match[6] !== '') ? (int) $match[6] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // "Z" (zulu time, aka GMT) and a missing designator need no adjustment.
  if (isset($match[7]) && $match[7] !== '') {
    $offset_secs = (((int) $match[8] * 60) + (int) $match[9]) * 60;
    // Is timezone ahead of GMT?  If yes, subtract offset.
    if ($match[7] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }
  return $epoch;
}
549

    
550
/**
 * Extract the link that points to the original content (back to site or
 * original article).
 *
 * The first link whose rel attribute is "alternate" is used. A link without
 * a rel attribute counts as "alternate", as required by the Atom
 * specification. If there is no such link, the href of the last link is
 * returned.
 *
 * @param $links
 *   Array of SimpleXML objects, or a SimpleXML object to iterate over.
 *
 * @return
 *   The href of the selected link, or an empty string if there are no links
 *   or the selected link has no href attribute.
 */
function _parser_common_syndication_link($links) {
  $to_link = '';
  if (!is_array($links) && !($links instanceof Traversable)) {
    return $to_link;
  }

  foreach ($links as $link) {
    $attributes = $link->attributes();
    $to_link = isset($attributes["href"]) ? (string) $attributes["href"] : "";
    // A missing rel attribute means rel="alternate".
    $rel = isset($attributes["rel"]) ? (string) $attributes["rel"] : 'alternate';
    if ($rel == 'alternate') {
      break;
    }
  }
  return $to_link;
}
572

    
573
/**
 * Prepare raw data to be a title.
 *
 * @param $title
 *   The title found in the feed, possibly empty.
 * @param $body
 *   Optional body text (may contain markup) to derive a title from when
 *   $title is empty.
 *
 * @return
 *   $title unchanged if it is not empty or if no body is given; otherwise
 *   the first three words of the body with its tags stripped.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. Empty pieces are dropped
    // so that whitespace at the start of the body does not count as a word.
    $words = preg_split('/[\s,]+/', strip_tags($body), -1, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}