1 |
85ad3d82
|
Assos Assos
|
<?php
|
2 |
|
|
|
3 |
|
|
/**
|
4 |
|
|
* @file
|
5 |
|
|
* Downloading and parsing functions for Common Syndication Parser.
|
6 |
|
|
* Pillaged from FeedAPI common syndication parser.
|
7 |
|
|
*
|
8 |
|
|
* @todo Restructure. OO could work wonders here.
|
9 |
|
|
* @todo Write unit tests.
|
10 |
|
|
* @todo Keep in Feeds project or host on Drupal?
|
11 |
|
|
*/
|
12 |
|
|
|
13 |
|
|
/**
 * Parse a raw feed document and dispatch it to the matching format parser.
 *
 * @param $string
 *   The feed document as an XML string.
 *
 * @return
 *   Whatever the format-specific parser returns for Atom 1.0, RSS 0.91/0.92/2.0
 *   or RDF feeds, or FALSE when the string is not well-formed XML or the feed
 *   format is not recognised.
 */
function common_syndication_parser_parse($string) {
  // Parse errors are silenced twice over: by the @ operator and by the libxml
  // flags. LIBXML_NOCDATA merges CDATA sections into plain text nodes.
  @ $document = simplexml_load_string($string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);

  // Got a malformed XML.
  if (is_null($document) || $document === FALSE) {
    return FALSE;
  }

  switch (_parser_common_syndication_feed_format_detect($document)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($document);

    // All plain RSS versions share a single parser.
    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($document);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($document);
  }

  // Well-formed XML, but not a feed format we know.
  return FALSE;
}
|
40 |
|
|
|
41 |
|
|
/**
 * Identify which syndication format a parsed feed document uses.
 *
 * Detection is based on the lower-cased name of the root element, plus the
 * "version" attribute for RSS or the presence of a child element for Atom
 * (entry) and RDF (channel).
 *
 * @param $xml
 *   The feed's root element as a SimpleXML object.
 *
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE if
 *   $xml is not an object or matches none of these formats.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $root_name = strtolower($xml->getName());

  switch ($root_name) {
    case "feed":
      // An Atom feed is only recognised when it has at least one entry.
      return isset($xml->entry) ? "atom1.0" : FALSE;

    case "rdf":
      return isset($xml->channel) ? "RDF" : FALSE;

    case "rss":
      $attributes = $xml->attributes();
      $version = $attributes["version"];
      $known_versions = array(
        "2.0" => "RSS2.0",
        "0.91" => "RSS0.91",
        "0.92" => "RSS0.92",
      );
      foreach ($known_versions as $number => $format) {
        // Loose comparison on purpose: $version is a SimpleXML attribute (or
        // NULL when absent), not a plain string.
        if ($version == $number) {
          return $format;
        }
      }
      return FALSE;
  }

  return FALSE;
}
|
72 |
|
|
|
73 |
|
|
/**
 * Parse an Atom 1.0 feed into the common feed array structure.
 *
 * @param $feed_XML
 *   The <feed> root element as a SimpleXML object.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'url', 'guid', 'geolocations', 'tags' and 'domains', plus 'timestamp' when
 *   the entry has a published, issued or updated element. 'domains' maps each
 *   category scheme to the indexes (into 'tags') of the categories using it.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  // NOTE(review): this XPath matches an un-namespaced "base" attribute on the
  // feed element, not xml:base - confirm that this is intended.
  $base = $feed_XML->xpath("@base");
  $base = is_array($base) ? (string) array_shift($base) : '';
  // valid_url() is defined outside this file (presumably Drupal's); the TRUE
  // argument asks for an absolute URL.
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  // A link that is valid but not absolute gets the base prepended.
  if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {

    $guid = !empty($news->id) ? "{$news->id}" : NULL;

    // URL candidates that are used only when the entry has no link href: an
    // <id> that is itself an absolute URL, overridden by an absolute
    // content@src when there is one.
    $fallback_url = NULL;
    if (!empty($guid) && valid_url($guid, TRUE)) {
      $fallback_url = $guid;
    }
    if (!empty($news->content['src'])) {
      // Some src elements in some valid atom feeds contained no urls at all.
      if (valid_url("{$news->content['src']}", TRUE)) {
        $fallback_url = "{$news->content['src']}";
      }
    }

    $georss = (array) $news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    $lat = $lon = NULL;
    if (isset($georss['point'])) {
      // A point is "latitude longitude"; tolerate surrounding and repeated
      // whitespace, and ignore values that do not have both parts.
      $latlon = preg_split('/\s+/', trim("{$georss['point']}"));
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "{$lat} {$lon}";
        }
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Append the term first so that the index recorded for its scheme
        // below refers to this category and not to the previous one.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // The body is the serialized child markup followed by the element's own
    // text, taken from <content> or, failing that, from <summary>.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    // Author precedence: source author, then entry author, then feed author.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    // Prefer the entry's link; fall back on the candidates collected above.
    $original_url = _parser_common_syndication_link($news->link);
    if (trim($original_url) === '' && $fallback_url !== NULL) {
      $original_url = $fallback_url;
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty. No 'timestamp' key is set when none of the three is present.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    $item['url'] = trim($original_url);
    if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
      $item['url'] = $base . $item['url'];
    }

    // Fall back on URL if GUID is empty.
    if (!empty($guid)) {
      $item['guid'] = $guid;
    }
    else {
      $item['guid'] = $item['url'];
    }

    $item['geolocations'] = array();
    // isset() rather than a truthiness test, so a coordinate of "0" is kept.
    if (isset($lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
|
225 |
|
|
|
226 |
|
|
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   The <rdf:RDF> root element as a SimpleXML object.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. The
 *   first three are empty strings when the feed has no <rss:channel>. Each
 *   item has the keys 'title', 'description', 'url', 'author_name', 'guid',
 *   'timestamp', 'tags' and 'rdf'; 'rdf' holds every RDF property found on
 *   the item, keyed by "prefix_name".
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd' => 'http://www.w3.org/2001/XMLSchema#',
    'owl' => 'http://www.w3.org/2002/07/owl#',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'dcterms' => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf' => 'http://xmlns.com/foaf/0.1/',
    'rss' => 'http://purl.org/rss/1.0/',
  );

  // Get the namespaces used anywhere in the document (recursive lookup).
  $namespaces = $feed_XML->getNamespaces(TRUE);

  // Start from a complete, empty result so that a feed without a channel
  // and/or without items still yields a well-formed array.
  $parsed_source = array(
    'title' => '',
    'description' => '',
    'link' => '',
    'items' => array(),
  );

  // Process the <rss:channel> resource containing feed metadata. Only the
  // first channel is used.
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source['title'] = _parser_common_syndication_title((string) $rss_channel->title);
    $parsed_source['description'] = (string) $rss_channel->description;
    $parsed_source['link'] = (string) $rss_channel->link;
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $canonical_prefix = array_search($ns_uri, $canonical_namespaces);
      $ns_prefix = ($canonical_prefix !== FALSE) ? $canonical_prefix : $ns;

      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the
    // result object. For each field the first property that has a value wins.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title' => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url' => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid' => 'rdf:about',
      'timestamp' => 'dc:date',
      'tags' => 'dc:subject',
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps:
    $item['timestamp'] = _parser_common_syndication_parse_date($item['timestamp']);

    // If no GUID found, use the URL of the item.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // looks nicer in the mapper UI
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
|
318 |
|
|
|
319 |
|
|
/**
 * Return the values of the first RDF property that has any non-empty value.
 *
 * @param $rdf_data
 *   Array of RDF statements keyed by prefixed property name (for example
 *   'dc:title'); each entry is an array of string values.
 * @param $rdf_properties
 *   Either an array of property names in order of preference, or a single
 *   property name, optionally followed by more names as extra arguments.
 *
 * @return
 *   A zero-indexed array with the non-empty values of the first listed
 *   property that has at least one, or NULL if none of them does.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  $rdf_properties = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);
  foreach ($rdf_properties as $rdf_property) {
    if ($rdf_property && !empty($rdf_data[$rdf_property])) {
      // Remove empty strings and close the gaps this leaves in the keys.
      $values = array_values(array_filter((array) $rdf_data[$rdf_property], 'strlen'));
      // A property that only had empty values must not hide a less
      // preferred property that carries real data.
      if (!empty($values)) {
        return $values;
      }
    }
  }
  return NULL;
}
|
328 |
|
|
|
329 |
|
|
/**
 * Build a feed item array from RDF statements according to a field mapping.
 *
 * @param $rdf_data
 *   Array of RDF statements keyed by prefixed property name.
 * @param $mappings
 *   Array keyed by item field name. Each value is a property name or an
 *   array of property names, handed to
 *   _parser_common_syndication_RDF10_property() to look the field up.
 *
 * @return
 *   An array with the same keys as $mappings. A lookup result that is an
 *   array of more than one value, or that is not an array at all, is stored
 *   unchanged; an array of at most one value is collapsed with reset().
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $item = array();
  foreach ($mappings as $field => $properties) {
    $found = _parser_common_syndication_RDF10_property($rdf_data, $properties);
    if (is_array($found) && count($found) <= 1) {
      // Single-valued (or empty) result: store the bare value.
      $item[$field] = reset($found);
    }
    else {
      $item[$field] = $found;
    }
  }
  return $item;
}
|
336 |
|
|
|
337 |
|
|
/**
 * Parse RSS 2.0 feeds (also used for RSS 0.91 and 0.92).
 *
 * @param $feed_XML
 *   The <rss> root element as a SimpleXML object.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'timestamp', 'url', 'guid', 'geolocations', 'domains' and 'tags'.
 *   'domains' maps each category domain to the indexes (into 'tags') of the
 *   categories using it.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {

  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();
  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source['items'] = array();

  // Guard against a non-array XPath result before iterating.
  $news_items = $feed_XML->xpath('//item');
  if (!is_array($news_items)) {
    $news_items = array();
  }

  foreach ($news_items as $news) {
    $title = $body = $original_author = $original_url = $guid = '';

    // Categories are fetched as elements (not via the array cast below) so
    // that their "domain" attribute stays accessible.
    $category = $news->xpath('category');
    if (!is_array($category)) {
      $category = array();
    }
    // Get children for current namespace.
    $content = (array) $news->children($ns["content"]);
    $dc = (array) $news->children($ns["dc"]);
    $georss = (array) $news->children($ns["georss"]);
    $news = (array) $news;
    $news['category'] = $category;

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts wins.
    if (isset($news['encoded'])) { // content:encoded for PHP < 5.1.2.
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    // The link doubles as GUID unless an explicit <guid> is present.
    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    // Initialise the geo values first, then fill them in, so that a
    // georss:featureName is not wiped out again.
    $lat = $lon = $geoname = NULL;
    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }
    if (!empty($georss['point'])) {
      // A point is "latitude longitude"; tolerate surrounding and repeated
      // whitespace, and ignore values that do not have both parts.
      $latlon = preg_split('/\s+/', trim("{$georss['point']}"));
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "$lat $lon";
        }
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    foreach ($news['category'] as $category) {
      $additional_taxonomies['RSS Categories'][] = "{$category}";
      if (isset($category['domain'])) {
        $domain = "{$category['domain']}";
        if (!empty($domain)) {
          if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
            $additional_taxonomies['RSS Domains'][$domain] = array();
          }
          // Index of the category that was just appended.
          $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;
    // Timestamp precedence: pubDate, then dc:date, then the current time.
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }
    $item['url'] = trim($original_url);
    $item['guid'] = $guid;

    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
|
475 |
|
|
|
476 |
|
|
/**
 * Parse a date that comes from a feed.
 *
 * Tries strtotime() first, then the W3C date/time parser, then a workaround
 * for dates that end in the "UT" timezone abbreviation.
 *
 * @param $date_str
 *   The date string in various formats. NULL is treated as an empty string;
 *   for an array only the first element is used.
 *
 * @return
 *   The timestamp of the string, or the current time if it can't be parsed.
 */
function _parser_common_syndication_parse_date($date_str) {
  // A property that occurs several times arrives as an array; use its first
  // value rather than failing on a non-string.
  if (is_array($date_str)) {
    $date_str = reset($date_str);
  }
  $date_str = (string) $date_str;

  // Nothing to parse: same result as an unparseable string.
  if ($date_str === '') {
    return time();
  }

  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  $date_str = str_replace('GMT-', '-', $date_str);
  $date_str = str_replace('GMT+', '+', $date_str);
  $parsed_date = strtotime($date_str);

  // The -1 check is kept from the original code alongside FALSE.
  if ($parsed_date === FALSE || $parsed_date == -1) {
    $parsed_date = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  if ($parsed_date === FALSE || $parsed_date == -1) {
    // PHP does not support the UT timezone. Fake it. The system that generated
    // this, Google Groups, probably meant UTC.
    $date_str = strtolower(trim($date_str));
    if (substr($date_str, -3) == ' ut') {
      // Appending "c" turns the trailing "ut" into "utc".
      $parsed_date = strtotime($date_str . 'c');
    }
  }

  return $parsed_date === FALSE ? time() : $parsed_date;
}
|
507 |
|
|
|
508 |
|
|
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * Used as a fallback when PHP's own date parsing does not handle the string.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * Accepted shape: YYYY-MM-DDThh:mm[:ss[.fraction]][Z|+hh:mm|-hh:mm] (the
 * colon in the offset is optional). A missing timezone is treated as GMT and
 * a fraction of a second is ignored.
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 *
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":ss" with
  // its colon (and fraction), 7 seconds, 8 offset sign, 9 offset hours,
  // 10 offset minutes, 11 "Z".
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2})(?:\.\d+)?)?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
  if (!preg_match($pattern, (string) $date_str, $match)) {
    return FALSE;
  }

  // Unmatched trailing groups are absent from $match, hence the isset().
  $seconds = (isset($match[7]) && $match[7] !== '') ? (int) $match[7] : 0;

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime((int) $match[4], (int) $match[5], $seconds, (int) $match[2], (int) $match[3], (int) $match[1]);

  // Only a numeric offset needs correcting; "Z" (zulu time, aka GMT) and a
  // missing timezone both leave the GMT value as it is.
  if (isset($match[8]) && $match[8] !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}
|
547 |
|
|
|
548 |
|
|
/**
 * Extract the link that points to the original content (back to site or
 * original article).
 *
 * The href of the first link whose rel attribute is "alternate" is returned.
 * A link without a rel attribute counts as "alternate" too, as required by
 * the Atom specification (RFC 4287, section 4.2.7.2). If there is no such
 * link, the href of the last link is used.
 *
 * @param $links
 *   The Atom <link> elements: a SimpleXML element (iterating it yields the
 *   sibling link elements) or an array of SimpleXML objects.
 *
 * @return
 *   The href as a string, or an empty string if there are no links or the
 *   chosen link has no href attribute.
 */
function _parser_common_syndication_link($links) {
  // Anything that cannot be iterated has no links to offer.
  if (!is_array($links) && !($links instanceof Traversable)) {
    return '';
  }

  $fallback = '';
  foreach ($links as $link) {
    $attributes = $link->attributes();
    $href = isset($attributes["href"]) ? "{$attributes["href"]}" : "";
    $rel = isset($attributes["rel"]) ? "{$attributes["rel"]}" : 'alternate';
    if ($rel == 'alternate') {
      return $href;
    }
    // Not an alternate link: remember it in case nothing better turns up.
    $fallback = $href;
  }
  return $fallback;
}
|
570 |
|
|
|
571 |
|
|
/**
 * Prepare raw data to be a title.
 *
 * @param $title
 *   The title found in the feed, possibly empty.
 * @param $body
 *   Optional body text (may contain markup) to derive a title from when
 *   $title is empty.
 *
 * @return
 *   $title unchanged if it is not empty or if there is no body; otherwise
 *   the first three words of the body with tags stripped, joined by single
 *   spaces.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. PREG_SPLIT_NO_EMPTY keeps
    // leading whitespace or commas (common once tags are stripped) from
    // producing an empty first "word".
    $words = preg_split('/[\s,]+/', strip_tags($body), -1, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}