1
|
<?php
|
2
|
|
3
|
/**
|
4
|
* @file
|
5
|
* Provides the base class for FeedsXPathParserHTML and FeedsXPathParserXML.
|
6
|
*/
|
7
|
|
8
|
/**
|
9
|
* Base class for the HTML and XML parsers.
|
10
|
*/
|
11
|
abstract class FeedsXPathParserBase extends FeedsParser {
|
12
|
|
13
|
/**
 * Source keys whose query results are returned as raw XML/HTML.
 *
 * Filled in by parse() from the checked entries of the 'rawXML' setting and
 * consulted by parseSourceElement().
 *
 * @var array
 */
protected $rawXML = array();

/**
 * The document returned by setup() for the current parse() call.
 *
 * @var DOMDocument|null
 */
protected $doc = NULL;

/**
 * The XPath helper created by parse() for the current document.
 *
 * @var FeedsXPathParserDOMXPath|null
 */
protected $xpath = NULL;
|
16
|
|
17
|
/**
 * Builds the document that the XPath queries are run against.
 *
 * Every concrete parser extending FeedsXPathParserBase has to provide this;
 * parse() stores the returned document and queries it.
 *
 * @param array $source_config
 *   The configuration for the source.
 * @param FeedsFetcherResult $fetcher_result
 *   The fetcher result that is being parsed.
 *
 * @return DOMDocument
 *   The DOMDocument to perform XPath queries on.
 */
abstract protected function setup($source_config, FeedsFetcherResult $fetcher_result);
|
29
|
|
30
|
/**
 * Converts a node to its raw string form.
 *
 * Used by parseSourceElement() for sources that are configured to return raw
 * XML or HTML instead of the node value.
 *
 * @param DOMNode $node
 *   The DOMNode to convert to a string.
 *
 * @return string
 *   The string representation of the DOMNode.
 */
abstract protected function getRaw(DOMNode $node);
|
40
|
|
41
|
/**
 * Implements FeedsParser::parse().
 *
 * Runs the configured context query against the document produced by
 * setup(), then evaluates every configured source query for each context node
 * that falls into the current batch.
 *
 * @param FeedsSource $source
 *   The source being imported.
 * @param FeedsFetcherResult $fetcher_result
 *   The fetcher result to parse.
 *
 * @return FeedsParserResult
 *   The parsed items; its link is set to the fetcher's 'source' setting.
 */
public function parse(FeedsSource $source, FeedsFetcherResult $fetcher_result) {
  $source_config = $source->getConfigFor($this);
  $state = $source->state(FEEDS_PARSE);

  // Fall back to the parser configuration when the source has none.
  if (empty($source_config)) {
    $source_config = $this->getConfig();
  }

  $this->doc = $this->setup($source_config, $fetcher_result);

  $parser_result = new FeedsParserResult();

  $mappings = $this->getOwnMappings();
  // Only the checked entries survive array_filter(); their keys are the
  // source names that should return raw markup.
  $this->rawXML = array_keys(array_filter($source_config['rawXML']));
  // Set link.
  $fetcher_config = $source->getConfigFor($source->importer->fetcher);
  $parser_result->link = $fetcher_config['source'];

  $this->xpath = new FeedsXPathParserDOMXPath($this->doc);
  $config = array();
  $config['debug'] = array_keys(array_filter($source_config['exp']['debug']));
  $config['errors'] = $source_config['exp']['errors'];

  $this->xpath->setConfig($config);

  $context_query = '(' . $source_config['context'] . ')';
  // Count the context nodes only when the state does not hold a total yet.
  if (empty($state->total)) {
    $state->total = $this->xpath->namespacedQuery('count(' . $context_query . ')', $this->doc, 'count');
  }

  // Work out the window of context nodes handled by this call.
  $start = $state->pointer ? $state->pointer : 0;
  $limit = $start + $source->importer->getLimit();
  $end = ($limit > $state->total) ? $state->total : $limit;
  $state->pointer = $end;

  $context_query .= "[position() > $start and position() <= $end]";

  $progress = $state->pointer ? $state->pointer : 0;

  $all_nodes = $this->xpath->namespacedQuery($context_query, NULL, 'context');

  foreach ($all_nodes as $node) {
    // Invoke a hook to check whether the domnode should be skipped. A node is
    // skipped as soon as one implementation returns exactly TRUE.
    if (in_array(TRUE, module_invoke_all('feeds_xpathparser_filter_domnode', $node, $this->doc, $source), TRUE)) {
      continue;
    }

    $parsed_item = $variables = array();
    foreach ($source_config['sources'] as $element_key => $query) {
      // Variable substitution, using the results of the preceding sources.
      $query = strtr($query, $variables);
      // Parse the item.
      $result = $this->parseSourceElement($query, $node, $element_key);
      if (isset($result)) {
        // Only sources that still have a mapping can be referenced as
        // variables. Without this check an unmapped key would register the
        // bare '$' as a variable, and strtr() would then rewrite every '$' in
        // the queries that follow.
        if (isset($mappings[$element_key])) {
          // Multi-valued results cannot be substituted; they become ''.
          $variables['$' . $mappings[$element_key]] = is_array($result) ? '' : $result;
        }
        $parsed_item[$element_key] = $result;
      }
    }
    if (!empty($parsed_item)) {
      $parser_result->items[] = $parsed_item;
    }
  }

  $state->progress($state->total, $progress);
  // Release the document and the XPath helper. Assigning NULL keeps the
  // declared properties defined, whereas unset() would remove them from the
  // object.
  $this->doc = NULL;
  $this->xpath = NULL;
  return $parser_result;
}
|
117
|
|
118
|
/**
 * Runs one source query against a single context node.
 *
 * @param string $query
 *   An XPath query.
 * @param DOMNode $context
 *   The current context DOMNode.
 * @param string $source
 *   The name of the source for this query.
 *
 * @return mixed
 *   NULL when the query is empty or matched nothing, the single value when
 *   exactly one node matched, an array of values when several nodes matched,
 *   or whatever namespacedQuery() returned when that is not a DOMNodeList.
 */
protected function parseSourceElement($query, $context, $source) {
  // Nothing configured for this source.
  if (empty($query)) {
    return NULL;
  }

  $found = $this->xpath->namespacedQuery($query, $context, $source);

  // A value was returned directly from namespacedQuery().
  if (!($found instanceof DOMNodeList)) {
    return $found;
  }

  // Collect one value per matched node: the raw markup when this source is
  // configured for it, the node value otherwise.
  $as_raw = in_array($source, $this->rawXML);
  $values = array();
  foreach ($found as $item) {
    $values[] = $as_raw ? $this->getRaw($item) : $item->nodeValue;
  }

  switch (count($values)) {
    case 0:
      // Empty result returns NULL, that way callers can check with isset().
      return NULL;

    case 1:
      // Unwrap a single result.
      return $values[0];

    default:
      return $values;
  }
}
|
170
|
|
171
|
/**
 * Overrides parent::sourceForm().
 *
 * Builds the "XPath Parser Settings" fieldset: the context query, one query
 * field per "xpathparser:" mapping, the raw XML/HTML selection and the debug
 * options.
 *
 * @param array $source_config
 *   The current source configuration. When empty, the parser's own
 *   configuration is used instead.
 *
 * @return array|null
 *   The form array, or nothing when overriding is disabled and the form is
 *   not being built for the importer configuration.
 */
public function sourceForm($source_config) {
  $form = array();
  $importer = feeds_importer($this->id);
  $importer_config = $importer->getConfig();
  $mappings_ = $importer_config['processor']['config']['mappings'];

  if (empty($source_config)) {
    $source_config = $this->getConfig();
  }

  // Sources may not override the importer settings. configForm() passes a
  // 'config' flag so that the importer form itself is still built.
  if (isset($source_config['allow_override']) &&
      !$source_config['allow_override'] &&
      empty($source_config['config'])) {
    return;
  }

  // Add extensions that might get imported.
  $allowed_extensions = isset($importer_config['fetcher']['config']['allowed_extensions']) ? $importer_config['fetcher']['config']['allowed_extensions'] : FALSE;
  if ($allowed_extensions) {
    if (strpos($allowed_extensions, 'html') === FALSE) {
      $importer->fetcher->config['allowed_extensions'] .= ' html htm';
    }
  }

  $uniques = $mappings = array();
  foreach ($mappings_ as $mapping) {
    if (strpos($mapping['source'], 'xpathparser:') === 0) {
      $mappings[$mapping['source']] = $mapping['target'];
      // Treat a missing 'unique' key as "not unique" instead of raising an
      // undefined index notice.
      if (!empty($mapping['unique'])) {
        $uniques[] = $mapping['target'];
      }
    }
  }
  $form['xpath'] = array(
    '#type' => 'fieldset',
    '#tree' => TRUE,
    '#title' => t('XPath Parser Settings'),
    '#collapsible' => TRUE,
    '#collapsed' => TRUE,
  );
  if (empty($mappings)) {
    // The mapping page lives under a different base path depending on
    // whether the Feeds UI menu defines 'admin/structure/feeds/list'. The
    // Feeds UI function is only called when it is available.
    $feeds_menu = function_exists('feeds_ui_menu') ? feeds_ui_menu() : array();
    if (isset($feeds_menu['admin/structure/feeds/list'])) {
      $feeds_base = 'admin/structure/feeds/edit/';
    }
    else {
      $feeds_base = 'admin/structure/feeds/';
    }
    $form['xpath']['error_message']['#markup'] = '<div class="help">' . t('No XPath mappings are defined. Define mappings !link.', array('!link' => l(t('here'), $feeds_base . $this->id . '/mapping'))) . '</div><br />';
    return $form;
  }
  $form['xpath']['context'] = array(
    '#type' => 'textfield',
    '#title' => t('Context'),
    '#required' => TRUE,
    '#description' => t('This is the base query, all other queries will run in this context.'),
    '#default_value' => isset($source_config['context']) ? $source_config['context'] : '',
    '#maxlength' => 1024,
    '#size' => 80,
  );
  $form['xpath']['sources'] = array(
    '#type' => 'fieldset',
    '#tree' => TRUE,
  );
  if (!empty($uniques)) {
    $unique_list = implode(', ', $uniques);
    // format_plural() translates the strings itself, so it is given the
    // untranslated strings and the replacements rather than t() output.
    $items = array(
      format_plural(count($uniques),
        'Field <strong>!column</strong> is mandatory and considered unique: only one item per !column value will be created.',
        'Fields <strong>!columns</strong> are mandatory and values in these columns are considered unique: only one entry per value in one of these columns will be created.',
        array('!column' => $unique_list, '!columns' => $unique_list)),
    );
    $form['xpath']['sources']['help']['#markup'] = '<div class="help">' . theme('item_list', array('items' => $items)) . '</div>';
  }
  // Targets of the sources listed so far; each query field advertises the
  // variables of the fields above it.
  $variables = array();
  foreach ($mappings as $source => $target) {
    $form['xpath']['sources'][$source] = array(
      '#type' => 'textfield',
      '#title' => check_plain($target),
      '#description' => t('The XPath query to run.'),
      '#default_value' => isset($source_config['sources'][$source]) ? $source_config['sources'][$source] : '',
      '#maxlength' => 1024,
      '#size' => 80,
    );
    if (!empty($variables)) {
      $variable_text = format_plural(count($variables),
        'The variable %variable is available for replacement.',
        'The variables %variable are available for replacement.',
        array('%variable' => implode(', ', $variables))
      );
      $form['xpath']['sources'][$source]['#description'] .= '<br />' . $variable_text;
    }
    $variables[] = '$' . $target;
  }
  $form['xpath']['rawXML'] = array(
    '#type' => 'checkboxes',
    '#title' => t('Select the queries you would like to return raw XML or HTML'),
    '#options' => $mappings,
    '#default_value' => isset($source_config['rawXML']) ? $source_config['rawXML'] : array(),
  );
  $form['xpath']['exp'] = array(
    '#type' => 'fieldset',
    '#collapsible' => TRUE,
    '#collapsed' => TRUE,
    '#tree' => TRUE,
    '#title' => t('Debug Options'),
  );
  $form['xpath']['exp']['errors'] = array(
    '#type' => 'checkbox',
    '#title' => t('Show error messages.'),
    '#default_value' => isset($source_config['exp']['errors']) ? $source_config['exp']['errors'] : FALSE,
  );
  // The Tidy options are only offered when the extension is loaded.
  if (extension_loaded('tidy')) {
    $form['xpath']['exp']['tidy'] = array(
      '#type' => 'checkbox',
      '#title' => t('Use Tidy'),
      '#description' => t('The Tidy PHP extension has been detected.
Select this to clean the markup before parsing.'),
      '#default_value' => isset($source_config['exp']['tidy']) ? $source_config['exp']['tidy'] : FALSE,
    );
    $form['xpath']['exp']['tidy_encoding'] = array(
      '#type' => 'textfield',
      '#title' => t('Tidy encoding'),
      '#description' => t('Set the encoding for tidy. See the !phpdocs for possible values.', array('!phpdocs' => l(t('PHP docs'), 'http://www.php.net/manual/en/tidy.parsestring.php/'))),
      '#default_value' => isset($source_config['exp']['tidy_encoding']) ? $source_config['exp']['tidy_encoding'] : 'UTF8',
      // Only show the encoding field while "Use Tidy" is checked.
      '#states' => array(
        'visible' => array(
          ':input[name$="[tidy]"]' => array(
            'checked' => TRUE,
          ),
        ),
      ),
    );
  }
  $form['xpath']['exp']['debug'] = array(
    '#type' => 'checkboxes',
    '#title' => t('Debug query'),
    '#options' => array_merge(array('context' => 'context'), $mappings),
    '#default_value' => isset($source_config['exp']['debug']) ? $source_config['exp']['debug'] : array(),
  );
  return $form;
}
|
317
|
|
318
|
/**
 * Overrides parent::configForm().
 *
 * Reuses sourceForm() for the importer-level settings, with the context made
 * optional, the fieldset expanded and an extra "allow override" checkbox.
 */
public function configForm(&$form_state) {
  $settings = $this->getConfig();

  // The 'config' flag tells sourceForm() that it is building the importer
  // form, so it does not bail out when overriding is disabled.
  $form = $this->sourceForm(array_merge($settings, array('config' => TRUE)));

  $form['xpath']['#collapsed'] = FALSE;
  $form['xpath']['context']['#required'] = FALSE;
  $form['xpath']['allow_override'] = array(
    '#type' => 'checkbox',
    '#title' => t('Allow source configuration override'),
    '#description' => t('This setting allows feed nodes to specify their own XPath values for the context and sources.'),
    '#default_value' => $settings['allow_override'],
  );

  return $form;
}
|
336
|
|
337
|
/**
 * Overrides parent::sourceDefaults().
 *
 * Sources start out with no settings of their own; parse() and sourceForm()
 * fall back to the parser configuration when the source configuration is
 * empty.
 */
public function sourceDefaults() {
  $defaults = array();
  return $defaults;
}
|
343
|
|
344
|
/**
 * Overrides parent::configDefaults().
 *
 * @return array
 *   The default parser configuration: no queries, nothing returned as raw
 *   markup, debugging switched off and source overrides allowed.
 */
public function configDefaults() {
  // Settings shown in the "Debug Options" fieldset.
  $debug_options = array(
    'errors' => FALSE,
    'tidy' => FALSE,
    'debug' => array(),
    'tidy_encoding' => 'UTF8',
  );

  $defaults = array(
    'sources' => array(),
    'rawXML' => array(),
    'context' => '',
    'exp' => $debug_options,
    'allow_override' => TRUE,
  );

  return $defaults;
}
|
361
|
|
362
|
/**
 * Overrides parent::sourceFormValidate().
 *
 * If the values of this source are the same as the base config we set them to
 * blank so that the values will be inherited from the importer defaults. The
 * same happens when overriding is not allowed or no XPath values were
 * submitted.
 *
 * @param array $values
 *   The submitted source form values, passed by reference. On return this
 *   holds the contents of the 'xpath' element, or an empty array when the
 *   importer defaults should be used.
 */
public function sourceFormValidate(&$values) {
  $config = $this->getConfig();
  $allow_override = !empty($config['allow_override']);
  unset($config['allow_override']);

  // When overriding is disabled sourceForm() builds no form, so there is no
  // 'xpath' element to read. Decide this before touching the values to avoid
  // sorting and comparing a missing element.
  if (!$allow_override || !isset($values['xpath']) || !is_array($values['xpath'])) {
    $values = array();
    return;
  }

  $values = $values['xpath'];
  // Sort both sides by key so that the comparison does not depend on the
  // order of the top-level keys.
  ksort($values);
  ksort($config);
  if ($values === $config) {
    $values = array();
    return;
  }

  $this->configFormValidate($values);
}
|
382
|
|
383
|
/**
 * Overrides parent::configFormValidate().
 *
 * Checks the context query and every source query for XPath errors by
 * evaluating them against an empty sample document. sourceFormValidate()
 * calls this as well, with the 'xpath' values already extracted.
 *
 * @param array $values
 *   The submitted values, passed by reference. When they are wrapped in an
 *   'xpath' key they are unwrapped, and the context and source queries are
 *   trimmed in place.
 */
public function configFormValidate(&$values) {
  $mappings = $this->getOwnMappings();

  // This tests if we're validating configForm or sourceForm.
  $config_form = FALSE;
  if (isset($values['xpath'])) {
    $values = $values['xpath'];
    $config_form = TRUE;
  }
  $class = get_class($this);
  $xml = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?>' . "\n<items></items>");
  $use_errors = $this->errorStart();

  $values['context'] = isset($values['context']) ? trim($values['context']) : '';
  if (!empty($values['context'])) {
    // Start from a clean slate so that only errors raised by this query are
    // inspected.
    libxml_clear_errors();
    $xml->xpath($values['context']);
    $error = libxml_get_last_error();

    // Error code 1219 is undefined namespace prefix.
    // Our sample doc doesn't have any namespaces let alone the one they're
    // trying to use. Besides, if someone is trying to use a namespace in an
    // XPath query, they're probably right.
    if ($error && $error->code != 1219) {
      $element = $config_form ? 'xpath][context' : 'feeds][' . $class . '][xpath][context';
      form_set_error($element, t('There was an error with the XPath selector: %error', array('%error' => $error->message)));
    }
  }

  // The sources are absent when the form had no XPath mappings to show.
  if (!empty($values['sources']) && is_array($values['sources'])) {
    foreach ($values['sources'] as $key => &$query) {
      $query = trim($query);
      if (empty($query)) {
        continue;
      }

      // Forget the errors of the previous query; otherwise an error that was
      // deliberately ignored there would be reported against this one.
      libxml_clear_errors();
      $xml->xpath($query);
      $error = libxml_get_last_error();
      if (!$error || $error->code == 1219) {
        continue;
      }

      // Our variable substitution options can cause syntax errors, check
      // if we're doing that.
      $variable_present = FALSE;
      if ($error->code == 1207) {
        foreach ($mappings as $target) {
          if (strpos($query, '$' . $target) !== FALSE) {
            $variable_present = TRUE;
            break;
          }
        }
      }
      if (!$variable_present) {
        $element = $config_form ? 'xpath][sources][' . $key : 'feeds][' . $class . '][xpath][sources][' . $key;
        form_set_error($element, t('There was an error with the XPath selector: %error', array('%error' => $error->message)));
      }
    }
    // Break the reference so that a later write to $query cannot alter the
    // last source.
    unset($query);
  }

  $this->errorStop($use_errors, FALSE);
}
|
447
|
|
448
|
/**
 * Overrides parent::getMappingSources().
 *
 * Offers a single new "xpathparser:N" source, where N is one higher than the
 * highest index used by the existing XPath mappings, followed by the sources
 * of the parent class.
 *
 * @return array
 *   The available mapping sources, keyed by source name.
 */
public function getMappingSources() {
  $mappings = $this->filterMappings(feeds_importer($this->id)->processor->config['mappings']);
  $next = 0;
  // The keys follow the order of the processor's mappings, and nothing here
  // guarantees that order is ascending. Scan all of them for the highest
  // index instead of trusting the last one, so the new source never collides
  // with an existing mapping.
  foreach (array_keys($mappings) as $key) {
    $parts = explode(':', $key);
    if (isset($parts[1]) && is_numeric($parts[1]) && (int) $parts[1] >= $next) {
      $next = (int) $parts[1] + 1;
    }
  }
  return array(
    'xpathparser:' . $next => array(
      'name' => t('XPath Expression'),
      'description' => t('Allows you to configure an XPath expression that will populate this field.'),
    ),
  ) + parent::getMappingSources();
}
|
467
|
|
468
|
/**
 * Returns the processor mappings that belong to this parser.
 *
 * These are the mappings whose source begins with "xpathparser:".
 *
 * @return array
 *   An array of mappings keyed source => target.
 */
protected function getOwnMappings() {
  $importer = feeds_importer($this->id);
  $settings = $importer->getConfig();
  $processor_mappings = $settings['processor']['config']['mappings'];

  return $this->filterMappings($processor_mappings);
}
|
480
|
|
481
|
/**
 * Picks the mappings whose source starts with "xpathparser:".
 *
 * @param array $mappings
 *   A mapping array from a processor; each entry is read for its 'source'
 *   and 'target' keys.
 *
 * @return array
 *   An array of mappings keyed source => target, in their original order.
 */
protected function filterMappings($mappings) {
  $owned = array();
  foreach ($mappings as $candidate) {
    $source_key = $candidate['source'];
    // Skip everything that does not carry our prefix.
    if (strpos($source_key, 'xpathparser:') !== 0) {
      continue;
    }
    $owned[$source_key] = $candidate['target'];
  }
  return $owned;
}
|
499
|
|
500
|
/**
 * Switches libxml to internal error handling.
 *
 * @return bool
 *   The previous value of use_errors, to be handed back to errorStop().
 */
protected function errorStart() {
  $previous = libxml_use_internal_errors(TRUE);
  return $previous;
}
|
509
|
|
510
|
/**
 * Stops custom error handling.
 *
 * Optionally reports the collected libxml errors as Drupal messages, then
 * clears them and restores the previous libxml error handling mode.
 *
 * @param bool $use
 *   The previous value of use_errors.
 * @param bool $print
 *   (Optional) Whether to print errors to the screen. Defaults to TRUE.
 */
protected function errorStop($use, $print = TRUE) {
  if ($print) {
    foreach (libxml_get_errors() as $error) {
      // Fatal errors are shown as errors and every other level as a warning.
      // Deciding this per error means the type can never be undefined or be
      // left over from the previous error.
      $type = ($error->level == LIBXML_ERR_FATAL) ? 'error' : 'warning';
      $args = array(
        '%error' => trim($error->message),
        '%num' => $error->line,
        '%code' => $error->code,
      );
      $message = t('%error on line %num. Error code: %code', $args);
      // FALSE: do not repeat a message that is already queued.
      drupal_set_message($message, $type, FALSE);
    }
  }
  libxml_clear_errors();
  libxml_use_internal_errors($use);
}
|
543
|
|
544
|
}
|