1
|
<?php
|
2
|
|
3
|
/**
|
4
|
* @file
|
5
|
* Downloading and parsing functions for Common Syndication Parser.
|
6
|
* Pillaged from FeedAPI common syndication parser.
|
7
|
*
|
8
|
* @todo Restructure. OO could work wonders here.
|
9
|
* @todo Write unit tests.
|
10
|
* @todo Keep in Feeds project or host on Drupal?
|
11
|
*/
|
12
|
|
13
|
/**
 * Parses a raw feed document into a normalized data structure.
 *
 * @param string $string
 *   The raw XML source of the feed.
 *
 * @return array|false
 *   The structured data extracted from the feed, or FALSE if the string is not
 *   well-formed XML or the feed format is not recognized.
 */
function common_syndication_parser_parse($string) {
  // SimpleXML only accepts an XML declaration at the very start of the
  // document, so surrounding whitespace has to go first.
  $document = trim($string);

  $xml = @simplexml_load_string($document, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);

  // Malformed XML: there is nothing to parse.
  if ($xml === FALSE || is_null($xml)) {
    return FALSE;
  }

  // Hand the document over to the parser matching the detected format.
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      // All supported RSS 0.9x/2.0 flavours share one parser.
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }

  // Unknown or unsupported format.
  return FALSE;
}
|
45
|
|
46
|
/**
 * Determines the feed format of a SimpleXML parsed document.
 *
 * The decision is based on the (case-insensitive) name of the root element,
 * plus the presence of an entry/channel child or the RSS version attribute.
 *
 * @param SimpleXMLElement $xml
 *   SimpleXML-preprocessed feed.
 *
 * @return string|false
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE if
 *   the document is not a compatible feed.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $attributes = $xml->attributes();
  $root = strtolower($xml->getName());

  // Atom: a <feed> root that contains at least one <entry>.
  if ($root == "feed") {
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }

  // RDF (RSS 1.0): an <rdf> root that contains a <channel>.
  if ($root == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }

  // RSS: the version attribute selects the flavour.
  if ($root == "rss") {
    $rss_formats = array(
      "RSS2.0" => "2.0",
      "RSS0.91" => "0.91",
      "RSS0.92" => "0.92",
    );
    foreach ($rss_formats as $format => $version) {
      if ($attributes["version"] == $version) {
        return $format;
      }
    }
  }

  return FALSE;
}
|
78
|
|
79
|
/**
 * Parses an Atom 1.0 feed.
 *
 * @param SimpleXMLElement $feed_XML
 *   The root element of the Atom document.
 *
 * @return array
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'guid', 'url', 'geolocations', 'tags' and 'domains', plus 'timestamp'
 *   when the entry carries a published, issued or updated date. 'domains'
 *   maps each category scheme to the indexes (into 'tags') of the categories
 *   that declare it.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  $base = _parser_common_syndication_atom10_parse_base_url($feed_XML);

  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  // Resolve a relative feed link against the base URL, if there is one.
  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  if ($base && !valid_url($parsed_source['link'], TRUE) && valid_url($parsed_source['link'])) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {
    $georss = (array) $news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    $lat = $lon = NULL;
    if (isset($georss['point'])) {
      // A point is "latitude longitude" separated by whitespace. Only accept
      // it when both parts are present, so a malformed point is ignored
      // instead of reading an undefined index.
      $latlon = preg_split('/\s+/', trim((string) $georss['point']));
      if (count($latlon) >= 2) {
        list($lat, $lon) = $latlon;
        if (!$geoname) {
          $geoname = "{$lat} {$lon}";
        }
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Store the term first, so that the index recorded for the domain
        // below refers to this category and not to the previous one.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // Prefer <content> over <summary>; serialize child markup followed by
    // the element's own text.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    // Author precedence: source author, entry author, feed author.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    $item['guid'] = (string) $news->id;

    $item['url'] = _parser_common_syndication_link($news->link);

    // No usable link: try the content source, then an URL-shaped GUID.
    if (!$item['url'] && !empty($news->content['src']) && valid_url($news->content['src'], TRUE)) {
      $item['url'] = (string) $news->content['src'];
    }

    if (!strlen($item['url']) && $item['guid'] && valid_url($item['guid'], TRUE)) {
      $item['url'] = $item['guid'];
    }

    // Resolve a relative item URL against the entry base, then the feed base.
    if (!valid_url($item['url'], TRUE) && valid_url($item['url'])) {
      if ($item_base = _parser_common_syndication_atom10_parse_base_url($news)) {
        $item['url'] = $item_base . $item['url'];
      }
      elseif ($base) {
        $item['url'] = $base . $item['url'];
      }
    }

    // Fall back on URL if GUID is empty.
    if (!strlen($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    $item['geolocations'] = array();
    // Compare against NULL rather than testing truthiness: "0" is a valid
    // latitude/longitude and must not be discarded.
    if ($lat !== NULL && $lon !== NULL) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }
    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
|
227
|
|
228
|
/**
 * Finds the base URL of an Atom document or entry.
 *
 * Looks, in order, at the xml:base attribute, a plain "base" attribute, the
 * first valid rel="self" link and the first valid rel="alternate" link.
 *
 * @param SimpleXMLElement $xml
 *   The XML element to inspect.
 *
 * @return string|false
 *   The base URL ending in a slash, or FALSE if none could be determined.
 */
function _parser_common_syndication_atom10_parse_base_url(SimpleXMLElement $xml) {
  // An explicit xml:base wins; a non-namespaced "base" attribute is next.
  $base = $xml->attributes('xml', TRUE)->base;
  $base = $base ? $base : $xml['base'];

  if ($base && valid_url($base, TRUE)) {
    // Normalize to exactly one trailing slash.
    return rtrim($base, '/') . '/';
  }

  // Otherwise try to build a base from the self link, then from the
  // alternate link. The first valid absolute href wins.
  $link_queries = array(
    '*[local-name() = "link" and @rel="self" and @href]',
    '*[local-name() = "link" and @rel="alternate" and @href]',
  );
  foreach ($link_queries as $query) {
    foreach ($xml->xpath($query) as $link) {
      if (valid_url($link['href'], TRUE)) {
        return _parser_common_syndication_string_url_path((string) $link['href']);
      }
    }
  }

  return FALSE;
}
|
263
|
|
264
|
/**
 * Strips the path from an absolute URL.
 *
 * @param string $url
 *   The absolute URL.
 *
 * @return string
 *   The URL up to and including the first slash after the "//" separator. If
 *   there is no such slash, the URL with a slash appended.
 */
function _parser_common_syndication_string_url_path($url) {
  // Skip past the "//" that follows the scheme, then look for the slash
  // that starts the path.
  $authority_start = strpos($url, '//') + 2;
  $path_start = strpos($url, '/', $authority_start);

  if (!$path_start) {
    // No path at all: the whole URL is the base.
    return $url . '/';
  }

  return substr($url, 0, $path_start + 1);
}
|
278
|
|
279
|
/**
 * Parses RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param SimpleXMLElement $feed_XML
 *   The root element of the RDF document.
 *
 * @return array
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item holds the mapped keys 'title', 'description', 'url', 'author_name',
 *   'guid', 'timestamp' and 'tags', plus 'rdf' with every RDF property found
 *   on the item (keyed "prefix_name").
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd' => 'http://www.w3.org/2001/XMLSchema#',
    'owl' => 'http://www.w3.org/2002/07/owl#',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'dcterms' => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf' => 'http://xmlns.com/foaf/0.1/',
    'rss' => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element.
  $namespaces = $feed_XML->getNamespaces(TRUE);

  // Normalize the prefixes used by the feed to well-known 'standard' prefixes
  // where possible: the feed may in principle use any arbitrary prefixes and
  // we should still be able to handle it. This only depends on the namespace
  // URI, so it is resolved once instead of once per attribute and child.
  $prefixes = array();
  foreach ($namespaces as $ns => $ns_uri) {
    $canonical = array_search($ns_uri, $canonical_namespaces, TRUE);
    $prefixes[$ns] = ($canonical !== FALSE) ? $canonical : $ns;
  }

  // Start from a complete, empty result so that a document without a
  // <rss:channel> still yields a well-formed structure.
  $parsed_source = array(
    'title' => '',
    'description' => '',
    'link' => '',
    'items' => array(),
  );

  $rss_elements = $feed_XML->children($canonical_namespaces['rss']);

  // Process the first <rss:channel> resource containing feed metadata:
  foreach ($rss_elements->channel as $rss_channel) {
    $parsed_source['title'] = _parser_common_syndication_title((string) $rss_channel->title);
    $parsed_source['description'] = (string) $rss_channel->description;
    $parsed_source['link'] = (string) $rss_channel->link;
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($rss_elements->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      $ns_prefix = $prefixes[$ns];
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // result object.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title' => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url' => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid' => 'rdf:about',
      'timestamp' => 'dc:date',
      'tags' => 'dc:subject',
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps. The mapping yields an
    // array when the item carries several dc:date values; use the first one,
    // since the date parser only accepts a single string.
    $date = $item['timestamp'];
    if (is_array($date)) {
      $date = reset($date);
    }
    $item['timestamp'] = _parser_common_syndication_parse_date((string) $date);

    // If no GUID found, use the URL of the feed.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // Looks nicer in the mapper UI.
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
|
371
|
|
372
|
/**
 * Returns the values of the first available RDF property.
 *
 * @param array $rdf_data
 *   RDF values keyed by property name; each entry is an array of strings.
 * @param array|string $rdf_properties
 *   Property names to try, in order of preference. Either an array of names,
 *   or the names passed as separate arguments after $rdf_data.
 *
 * @return array|null
 *   The values of the first property that has data, with empty strings
 *   removed (original keys are kept), or NULL if no property has data.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  // Support both call styles: a single array, or a variable argument list.
  $candidates = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);

  foreach ($candidates as $name) {
    if (!$name || empty($rdf_data[$name])) {
      continue;
    }
    // Remove empty strings.
    return array_filter($rdf_data[$name], 'strlen');
  }

  return NULL;
}
|
384
|
|
385
|
/**
 * Builds a feed item from RDF data according to a set of mappings.
 *
 * @param array $rdf_data
 *   RDF values keyed by property name.
 * @param array $mappings
 *   Item keys mapped to the RDF property name(s) to read them from.
 *
 * @return array
 *   The item, keyed like $mappings. A key holds the single value when exactly
 *   one (or no) value was found in the property's list, and the raw lookup
 *   result otherwise (an array of several values, or NULL when no property
 *   had data).
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $item = array();

  foreach ($mappings as $key => $properties) {
    $values = _parser_common_syndication_RDF10_property($rdf_data, $properties);

    if (is_array($values) && count($values) <= 1) {
      // Collapse a list of at most one value to that value.
      $item[$key] = reset($values);
    }
    else {
      $item[$key] = $values;
    }
  }

  return $item;
}
|
395
|
|
396
|
/**
 * Parses RSS 2.0 feeds (also used for RSS 0.91 and 0.92).
 *
 * @param SimpleXMLElement $feed_XML
 *   The root element of the RSS document.
 *
 * @return array
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'timestamp', 'url', 'guid', 'source:title', 'source:url', 'geolocations',
 *   'domains' and 'tags'. 'domains' maps each category domain to the indexes
 *   (into 'tags') of the categories that declare it.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {

  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();
  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source['items'] = array();

  foreach ($feed_XML->xpath('//item') as $news) {
    $title = $body = $original_author = $original_url = $guid = '';
    // Reset the geo data up front, before anything is read from the item, so
    // that a feature name found below is not wiped out again.
    $lat = $lon = $geoname = NULL;

    // Get optional source url.
    $source_url = (string) $news->source['url'];

    $category = $news->xpath('category');
    // Get children for current namespace.
    $content = (array) $news->children($ns["content"]);
    $dc = (array) $news->children($ns["dc"]);
    $georss = (array) $news->children($ns["georss"]);
    // From here on the item is a plain array; the category elements are kept
    // as objects because their 'domain' attribute is needed below.
    $news = (array) $news;
    $news['category'] = $category;

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts wins.
    // content:encoded for PHP < 5.1.2.
    if (isset($news['encoded'])) {
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    // content:encoded for PHP >= 5.1.2.
    if (isset($content['encoded'])) {
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    // The link doubles as GUID unless an explicit <guid> is present.
    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    if (!empty($georss['point'])) {
      // A point is "latitude longitude" separated by whitespace. Only accept
      // it when both parts are present, so a malformed point is ignored
      // instead of reading an undefined index.
      $latlon = preg_split('/\s+/', trim((string) $georss['point']));
      if (count($latlon) >= 2) {
        list($lat, $lon) = $latlon;
        if (!$geoname) {
          $geoname = "$lat $lon";
        }
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    if (isset($news['category'])) {
      foreach ($news['category'] as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            // Index of the category that was just appended.
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;
    // Prefer pubDate, then dc:date, then the time of parsing.
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }
    $item['url'] = trim($original_url);
    $item['guid'] = $guid;
    if (!empty($news['source'])) {
      $item['source:title'] = $news['source'];
    }
    else {
      $item['source:title'] = NULL;
    }
    $item['source:url'] = trim($source_url);

    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }
  return $parsed_source;
}
|
546
|
|
547
|
/**
 * Parses a date that comes from a feed.
 *
 * Tries strtotime() first, then the W3C date/time parser, and finally a
 * workaround for dates ending in the "UT" timezone.
 *
 * @param string $date_str
 *   The date string in various formats.
 *
 * @return int
 *   The timestamp of the string or the current time if it can't be parsed.
 */
function _parser_common_syndication_parse_date($date_str) {
  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  $date_str = str_replace(array('GMT-', 'GMT+'), array('-', '+'), $date_str);

  $timestamp = strtotime($date_str);

  // First fallback: the W3C date/time format.
  if ($timestamp === FALSE || $timestamp == -1) {
    $timestamp = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  // Second fallback: PHP does not support the UT timezone. Fake it. The
  // system that generated this, Google Groups, probably meant UTC.
  if ($timestamp === FALSE || $timestamp == -1) {
    $date_str = strtolower(trim($date_str));

    if (substr($date_str, strlen($date_str) - 3, 3) == ' ut') {
      // Appending "c" turns the trailing "ut" into "utc".
      $timestamp = strtotime($date_str . 'c');
    }
  }

  if ($timestamp === FALSE) {
    return time();
  }

  return $timestamp;
}
|
579
|
|
580
|
/**
 * Parses the W3C date/time format, a subset of ISO 8601.
 *
 * Used as a fallback for dates that strtotime() could not parse.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param string $date_str
 *   A potentially W3C DTF date, e.g. "2004-02-12T15:19:21+00:00". Seconds
 *   and the timezone designator are optional; without a timezone the date is
 *   taken as GMT.
 *
 * @return int|false
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":ss",
  // 7 seconds, 8 offset sign, 9 offset hours, 10 offset minutes, 11 "Z".
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';

  if (!preg_match($pattern, $date_str, $match)) {
    return FALSE;
  }

  // preg_match() leaves out trailing groups that did not take part in the
  // match; pad them so every group can be read safely.
  $match += array_fill(0, 12, '');

  // Seconds live in group 7; group 6 still carries the leading colon.
  $seconds = ($match[7] === '') ? 0 : (int) $match[7];

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT, and a missing designator is treated the same
  // way. Only an explicit numeric offset shifts the result.
  if ($match[11] !== 'Z' && $match[8] !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}
|
621
|
|
622
|
/**
 * Extracts the link that points to the original content (back to the site or
 * the original article).
 *
 * The first link with rel="alternate" wins; if there is none, the href of the
 * last link is used.
 *
 * @param SimpleXMLElement|array $links
 *   The link elements to inspect.
 *
 * @return string
 *   An URL if found. An empty string otherwise.
 */
function _parser_common_syndication_link($links) {
  $href = '';

  if (count($links) > 0) {
    foreach ($links as $candidate) {
      $attributes = $candidate->attributes();
      $href = isset($attributes["href"]) ? (string) $attributes["href"] : "";

      // Stop at the first alternate link; its href is the one to return.
      if (isset($attributes["rel"]) && (string) $attributes["rel"] == 'alternate') {
        break;
      }
    }
  }

  return trim($href);
}
|
647
|
|
648
|
/**
 * Prepares raw data to be a title.
 *
 * @param string $title
 *   The title as found in the feed.
 * @param string|false $body
 *   Optional body text to derive a title from when $title is empty.
 *
 * @return string
 *   The given title, or the first three words of the tag-stripped body if the
 *   title is empty and a body is available.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  // Nothing to do when there is a title, or no body to derive one from.
  if (!empty($title) || empty($body)) {
    return $title;
  }

  // Explode to words and use the first 3 words.
  $words = preg_split('/[\s,]+/', strip_tags($body));

  return implode(' ', array_slice($words, 0, 3));
}
|