/drupal7/sites/all/modules/feeds/libraries/common_syndication_parser.inc - Annoter - Club Drupal - Forge Centrale Marseille

85ad3d82

Assos Assos

<?php

2

3

/**
 * @file
 *   Downloading and parsing functions for Common Syndication Parser.
 *   Pillaged from FeedAPI common syndication parser.
 *
 * @todo Restructure. OO could work wonders here.
 * @todo Write unit tests.
 * @todo Keep in Feeds project or host on Drupal?
 */

12

13

/**
 * Parse a feed document into a data structure.
 *
 * The XML is loaded with SimpleXML, its format is detected and the document
 * is handed to the matching format-specific parser.
 *
 * @param $string
 *   The feed document as an XML string.
 *
 * @return
 *   The value returned by the Atom, RSS or RDF parser for the detected
 *   format, or FALSE if the string is not well-formed XML or the format is
 *   not one of the supported ones.
 */
function common_syndication_parser_parse($string) {
  // libxml errors are silenced on purpose: a broken feed is reported to the
  // caller through the FALSE return value, not through PHP warnings.
  @ $xml = simplexml_load_string($string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);

  // Got a malformed XML.
  if ($xml === FALSE || is_null($xml)) {
    return FALSE;
  }

  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    // RSS 0.91 and 0.92 are handled by the RSS 2.0 parser.
    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }

  return FALSE;
}

39

40

41

/**
 * Determine the feed format of a SimpleXML parsed object structure.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 *
 * @return
 *   One of "atom1.0", "RSS2.0", "RSS0.91", "RSS0.92" or "RDF", or FALSE if
 *   $xml is not an object or matches none of these formats.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }

  $attr = $xml->attributes();
  $type = strtolower($xml->getName());
  // Compare the version as a plain string; a missing attribute becomes ''.
  $version = isset($attr["version"]) ? (string) $attr["version"] : '';

  // A <feed> root is only treated as Atom when it has at least one entry.
  if (isset($xml->entry) && $type == "feed") {
    return "atom1.0";
  }
  if ($type == "rss" && $version === "2.0") {
    return "RSS2.0";
  }
  if ($type == "rdf" && isset($xml->channel)) {
    return "RDF";
  }
  if ($type == "rss" && $version === "0.91") {
    return "RSS0.91";
  }
  if ($type == "rss" && $version === "0.92") {
    return "RSS0.92";
  }

  return FALSE;
}

71

72

73

/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   SimpleXML-preprocessed Atom feed.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'url', 'guid', 'geolocations', 'tags' and 'domains', plus 'timestamp'
 *   when the entry has a published, issued or updated element.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  // Base URL used to complete relative links; ignored unless it is absolute.
  $base_nodes = $feed_XML->xpath("@base");
  $base = (is_array($base_nodes) && $base_nodes) ? (string) reset($base_nodes) : '';
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {

    // URL candidates, weakest first: the entry id (when it is an absolute
    // URL), then content@src, then the entry's link element.
    $original_url = NULL;
    $guid = !empty($news->id) ? "{$news->id}" : NULL;
    if (valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $georss = (array) $news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    $lat = $lon = NULL;
    if (isset($georss['point'])) {
      // A point is "latitude longitude"; anything with fewer than two parts
      // is ignored instead of producing a half-filled location.
      $latlon = preg_split('/\s+/', trim($georss['point']));
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "{$lat} {$lon}";
        }
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Store the term first so that the index recorded for its domain
        // below points at this term and not at the previous one.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // The body is the content element if present, otherwise the summary.
    // Child elements are serialized as XML, followed by the text content.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    // Author of the source entry wins over the entry author, which wins over
    // the feed-level author.
    $original_author = '';
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    // The link element is authoritative, but only replaces the fallbacks
    // found above when it actually yields a URL.
    $link_url = _parser_common_syndication_link($news->link);
    if ($link_url !== '') {
      $original_url = $link_url;
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    $item['url'] = trim((string) $original_url);
    if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
      $item['url'] = $base . $item['url'];
    }
    // Fall back on URL if GUID is empty.
    if (!empty($guid)) {
      $item['guid'] = $guid;
    }
    else {
      $item['guid'] = $item['url'];
    }
    $item['geolocations'] = array();
    // Compare against NULL so that a coordinate of "0" is kept.
    if ($lat !== NULL && $lon !== NULL) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }
    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}

224

225

226

/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   SimpleXML-preprocessed RSS 1.0 feed.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item has the keys 'title', 'description', 'url', 'author_name', 'guid',
 *   'timestamp' and 'tags', plus 'rdf', which holds every property found on
 *   the item keyed by "prefix_name".
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf'      => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs'     => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi'      => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd'      => 'http://www.w3.org/2001/XMLSchema#',
    'owl'      => 'http://www.w3.org/2002/07/owl#',
    'dc'       => 'http://purl.org/dc/elements/1.1/',
    'dcterms'  => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf'     => 'http://xmlns.com/foaf/0.1/',
    'rss'      => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element.
  $namespaces = $feed_XML->getNamespaces(TRUE);

  // Defaults, so that a feed without a <rss:channel> still yields a complete
  // result instead of an undefined variable.
  $parsed_source = array(
    'title'       => '',
    'description' => '',
    'link'        => '',
    'items'       => array(),
  );

  // Process the <rss:channel> resource containing feed metadata; only the
  // first channel is used.
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source = array(
      'title'       => _parser_common_syndication_title((string) $rss_channel->title),
      'description' => (string) $rss_channel->description,
      'link'        => (string) $rss_channel->link,
      'items'       => array(),
    );
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $ns_prefix = array_search($ns_uri, $canonical_namespaces, TRUE);
      if (!$ns_prefix) {
        $ns_prefix = $ns;
      }
      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the result object.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title'       => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url'         => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid'        => 'rdf:about',
      'timestamp'   => 'dc:date',
      'tags'        => 'dc:subject'
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps:
    $item['timestamp'] = _parser_common_syndication_parse_date($item['timestamp']);

    // If no GUID found, use the URL of the feed.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // looks nicer in the mapper UI
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}

317

318

319

/**
 * Look up the values of the first usable RDF property among candidates.
 *
 * @param $rdf_data
 *   Array keyed by property name ("prefix:name"), each value being an array
 *   of strings.
 * @param $rdf_properties
 *   Either an array of candidate property names in order of preference, or
 *   a single name; further names may be passed as additional arguments.
 *
 * @return
 *   The values of the first candidate that has at least one non-empty
 *   string, with empty strings removed (original keys are kept). An empty
 *   array if candidates were present but held only empty strings, NULL if
 *   no candidate was present at all.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  $rdf_properties = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);
  $found_only_blanks = FALSE;
  foreach ($rdf_properties as $rdf_property) {
    if ($rdf_property && !empty($rdf_data[$rdf_property])) {
      // remove empty strings
      $values = array_filter($rdf_data[$rdf_property], 'strlen');
      if ($values) {
        return $values;
      }
      // Present but blank: keep looking at the remaining candidates.
      $found_only_blanks = TRUE;
    }
  }
  return $found_only_blanks ? array() : NULL;
}

/**
 * Build a feed item from RDF data according to a set of mappings.
 *
 * @param $rdf_data
 *   Array keyed by property name ("prefix:name"), each value being an array
 *   of strings.
 * @param $mappings
 *   Array keyed by result key; each value is a property name or an array of
 *   candidate property names, as accepted by
 *   _parser_common_syndication_RDF10_property().
 *
 * @return
 *   Array with the same keys as $mappings. Each value is whatever the
 *   property lookup returned, except that an array with at most one element
 *   is replaced by that element (FALSE for an empty array, as returned by
 *   reset()).
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  $item = array();
  foreach ($mappings as $key => $candidates) {
    $values = _parser_common_syndication_RDF10_property($rdf_data, $candidates);
    if (is_array($values) && count($values) <= 1) {
      // Collapse a single-value list to the bare value.
      $values = reset($values);
    }
    $item[$key] = $values;
  }
  return $item;
}

335

336

337

/**
 * Parse RSS2.0 feeds.
 *
 * @param $feed_XML
 *   SimpleXML-preprocessed RSS feed.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'timestamp', 'url', 'guid', 'geolocations', 'domains' and 'tags'.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {

  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();
  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source['items'] = array();

  // xpath() returns FALSE on error; treat that as "no items".
  $rss_items = $feed_XML->xpath('//item');
  if (!is_array($rss_items)) {
    $rss_items = array();
  }

  foreach ($rss_items as $news) {
    $title = $body = $original_author = $original_url = $guid = '';

    // Categories are fetched before the array cast below so that they stay
    // SimpleXML elements and their "domain" attribute can still be read.
    $category = $news->xpath('category');
    // Get children for current namespace.
    $content = (array) $news->children($ns["content"]);
    $dc      = (array) $news->children($ns["dc"]);
    $georss  = (array) $news->children($ns["georss"]);
    $news = (array) $news;
    $news['category'] = is_array($category) ? $category : array();

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }
    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts is used.
    if (isset($news['encoded'])) {  // content:encoded for PHP < 5.1.2.
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    // The link doubles as GUID unless an explicit guid element is present.
    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    // Reset the location variables before reading the feature name, so the
    // name found here is not wiped out again.
    $lat = $lon = $geoname = NULL;
    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }
    if (!empty($georss['point'])) {
      // A point is "latitude longitude"; anything with fewer than two parts
      // is ignored.
      $latlon = preg_split('/\s+/', trim($georss['point']));
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "$lat $lon";
        }
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    foreach ($news['category'] as $category) {
      $additional_taxonomies['RSS Categories'][] = "{$category}";
      if (isset($category['domain'])) {
        $domain = "{$category['domain']}";
        if (!empty($domain)) {
          if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
            $additional_taxonomies['RSS Domains'][$domain] = array();
          }
          // Index of the category that was just appended.
          $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }
    $item['url'] = trim($original_url);
    $item['guid'] = $guid;

    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}

474

475

476

/**
 * Parse a date that comes from a feed.
 *
 * The string is first given to strtotime(), then to the W3C date/time
 * parser, and finally retried with a trailing "UT" rewritten to "UTC".
 *
 * @param $date_str
 *   The date string in various formats.
 *
 * @return
 *   The timestamp of the string or the current time if it can't be parsed.
 */
function _parser_common_syndication_parse_date($date_str) {
  // Callers may pass NULL or a non-string value; work on a string throughout.
  $date_str = (string) $date_str;

  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  $date_str = str_replace('GMT-', '-', $date_str);
  $date_str = str_replace('GMT+', '+', $date_str);
  $parsed_date = strtotime($date_str);

  // Only FALSE means failure: -1 is a valid timestamp (one second before the
  // epoch) and must not be mistaken for an error.
  if ($parsed_date === FALSE) {
    $parsed_date = _parser_common_syndication_parse_w3cdtf($date_str);
  }

  if ($parsed_date === FALSE) {
    // PHP does not support the UT timezone. Fake it. The system that generated
    // this, Google Groups, probably meant UTC.
    $date_str = strtolower(trim($date_str));
    if (substr($date_str, -3) == ' ut') {
      // Appending "c" turns the trailing "ut" into "utc".
      $parsed_date = strtotime($date_str . 'c');
    }
  }

  return $parsed_date === FALSE ? time() : $parsed_date;
}

506

507

508

/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * A date without a timezone designator is interpreted as GMT. Seconds are
 * optional and default to 0.
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 *
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes,
  // 6 ":seconds", 7 seconds, 8 offset sign, 9 offset hours,
  // 10 offset minutes, 11 "Z".
  if (!preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', (string) $date_str, $match)) {
    return FALSE;
  }

  // Optional groups at the end that did not take part in the match are
  // missing from $match; fill them in so every index can be read safely.
  $match += array_fill(0, 12, '');

  $year = (int) $match[1];
  $month = (int) $match[2];
  $day = (int) $match[3];
  $hours = (int) $match[4];
  $minutes = (int) $match[5];
  $seconds = (int) $match[7];

  // Calculate the epoch for current date assuming GMT.
  $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year);

  // Z is zulu time, aka GMT; only an explicit +hh:mm / -hh:mm needs work.
  if ($match[11] !== 'Z' && $match[8] !== '') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT?  If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }

  return $epoch;
}

/**
 * Extract the link that points to the original content (back to site or
 * original article).
 *
 * The href is chosen in this order: the first link whose rel attribute is
 * "alternate"; otherwise the first link with an href and no rel attribute
 * (Atom treats a missing rel as "alternate"); otherwise the href of the last
 * link.
 *
 * @param $links
 *   Array of SimpleXML objects, or a SimpleXML element list of links.
 *
 * @return
 *   The selected href, or an empty string if there is none.
 */
function _parser_common_syndication_link($links) {
  // Nothing to iterate over (for example NULL): no link.
  if (!is_object($links) && !is_array($links)) {
    return '';
  }

  $implicit_alternate = NULL;
  $last_href = '';
  foreach ($links as $link) {
    $attributes = $link->attributes();
    $href = isset($attributes["href"]) ? (string) $attributes["href"] : "";
    $last_href = $href;

    if (isset($attributes["rel"])) {
      if ((string) $attributes["rel"] == 'alternate') {
        // An explicit alternate link always wins.
        return $href;
      }
    }
    elseif ($implicit_alternate === NULL && $href !== '') {
      // Remember the first link without rel, but keep looking for an
      // explicit alternate.
      $implicit_alternate = $href;
    }
  }

  return $implicit_alternate !== NULL ? $implicit_alternate : $last_href;
}

569

570

571

/**
 * Prepare raw data to be a title.
 *
 * @param $title
 *   The title found in the feed, possibly empty.
 * @param $body
 *   Optional body text (may contain markup) used to derive a title when
 *   $title is empty.
 *
 * @return
 *   $title unchanged if it is not empty or if no body is given; otherwise
 *   the first three words of the body with tags stripped.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. Empty pieces are skipped
    // so that leading whitespace or commas do not count as a word.
    $words = preg_split('/[\s,]+/', strip_tags($body), -1, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}

581

Projet

Général

Profil

Club Drupal

root / drupal7 / sites / all / modules / feeds / libraries / common_syndication_parser.inc @ d756b39a