Projet

Général

Profil

Paste
Télécharger (10 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / modules / feeds / libraries / ParserCSV.inc @ 41cc1b08

1
<?php
2

    
3
/**
4
 * @file
5
 * Contains CSV Parser.
6
 *
7
 * Functions in this file are independent of the Feeds specific implementation.
8
 * Thanks to jpetso http://drupal.org/user/56020 for most of the code in this
9
 * file.
10
 */
11

    
12
/**
 * Text lines from file iterator.
 *
 * Reads a file line by line with fgets() and remembers the byte offset that
 * follows the most recently read line, so that a consumer can later resume
 * reading from that offset via rewind($pos).
 *
 * The "#[\ReturnTypeWillChange]" markers below are attributes on PHP 8 and
 * plain comments on older PHP versions; they must stay on their own line.
 */
class ParserCSVIterator implements Iterator {
  // File handle returned by fopen(), or FALSE if the file could not be opened.
  private $handle;
  // The line read last (including its line ending), NULL if there is none.
  private $currentLine;
  // Byte offset in the file directly after the line read last.
  private $currentPos;

  /**
   * Opens the given file for reading.
   *
   * @param $filepath
   *   Path of the file to iterate over. If it cannot be opened the iterator
   *   simply yields no lines.
   */
  public function __construct($filepath) {
    $this->handle = fopen($filepath, 'r');
    $this->currentLine = NULL;
    $this->currentPos = NULL;
  }

  /**
   * Closes the file handle, if one was opened.
   */
  public function __destruct() {
    if ($this->handle) {
      fclose($this->handle);
    }
  }

  /**
   * Moves to the given byte offset and reads the first line from there.
   *
   * @param $pos
   *   Byte offset to seek to, 0 for the beginning of the file.
   */
  #[\ReturnTypeWillChange]
  public function rewind($pos = 0) {
    if ($this->handle) {
      fseek($this->handle, $pos);
      $this->next();
    }
  }

  /**
   * Reads the next line from the file.
   *
   * @return
   *   The line that was read, or NULL if the end of the file was reached or
   *   no file is open.
   */
  #[\ReturnTypeWillChange]
  public function next() {
    if ($this->handle) {
      $line = feof($this->handle) ? FALSE : fgets($this->handle);
      // feof() only reports TRUE after a read hit the end of the file, so the
      // fgets() call following the last line returns FALSE. Store NULL in that
      // case, otherwise valid() would report one phantom line.
      $this->currentLine = ($line === FALSE) ? NULL : $line;
      $this->currentPos = ftell($this->handle);
      return $this->currentLine;
    }
  }

  /**
   * Tells whether a current line is available.
   */
  #[\ReturnTypeWillChange]
  public function valid() {
    return isset($this->currentLine);
  }

  /**
   * Returns the current line, including its line ending.
   */
  #[\ReturnTypeWillChange]
  public function current() {
    return $this->currentLine;
  }

  /**
   * Returns the byte offset directly after the current line.
   */
  public function currentPos() {
    return $this->currentPos;
  }

  /**
   * Returns a constant key; lines are not individually keyed.
   */
  #[\ReturnTypeWillChange]
  public function key() {
    return 'line';
  }
}
63

    
64
/**
 * Functionality to parse CSV files into a two dimensional array.
 *
 * Configure the parser with the setter methods, then call parse() with a line
 * iterator. Parsing can be done in batches: limit the work per call with
 * setLineLimit() or setTimeout(), read the resume position with lastLinePos()
 * and pass it to setStartByte() before the next parse() call.
 */
class ParserCSV {
  private $delimiter;
  private $skipFirstLine;
  private $columnNames;
  private $timeout;
  private $timeoutReached;
  private $startByte;
  private $lineLimit;
  private $lastLinePos;

  /**
   * Initializes the parser with its defaults.
   *
   * Defaults: comma delimiter, first line not skipped, no column names, no
   * timeout, start at byte 0 and no line limit.
   */
  public function __construct() {
    $this->delimiter = ',';
    $this->skipFirstLine = FALSE;
    $this->columnNames = FALSE;
    $this->timeout = FALSE;
    $this->timeoutReached = FALSE;
    $this->startByte = 0;
    $this->lineLimit = 0;
    $this->lastLinePos = 0;
    // Lets PHP recognize old Macintosh ("\r") line endings when reading files.
    // NOTE(review): this ini setting is deprecated as of PHP 8.1; it is kept
    // here so that existing behavior does not change.
    ini_set('auto_detect_line_endings', TRUE);
  }

  /**
   * Set the column delimiter string.
   * By default, the comma (',') is used as delimiter.
   */
  public function setDelimiter($delimiter) {
    $this->delimiter = $delimiter;
  }

  /**
   * Set this to TRUE if the parser should skip the first line of the CSV text,
   * which might be desired if the first line contains the column names.
   * By default, this is set to FALSE and the first line is not skipped.
   */
  public function setSkipFirstLine($skipFirstLine) {
    $this->skipFirstLine = $skipFirstLine;
  }

  /**
   * Specify an array of column names if you know them in advance, or FALSE
   * (which is the default) to unset any prior column names. If no column names
   * are set, the parser will put each row into a simple numerically indexed
   * array. If column names are given, the parser will create arrays with
   * these column names as array keys instead.
   */
  public function setColumnNames($columnNames) {
    $this->columnNames = $columnNames;
  }

  /**
   * Define the time (in milliseconds) after which the parser stops parsing,
   * even if it has not yet finished processing the CSV data. If the timeout
   * has been reached before parsing is done, the parse() method will return
   * an incomplete list of rows - a single row will never be cut off in the
   * middle, though. By default, no timeout (@p $timeout == FALSE) is defined.
   *
   * You can check if the timeout has been reached by calling the
   * timeoutReached() method after parse() has been called.
   */
  public function setTimeout($timeout) {
    $this->timeout = $timeout;
  }

  /**
   * After calling the parse() method, determine if the timeout (set by the
   * setTimeout() method) has been reached.
   *
   * @deprecated Use lastLinePos() instead to determine whether a file has
   *   finished parsing.
   */
  public function timeoutReached() {
    return $this->timeoutReached;
  }

  /**
   * Define the number of lines to parse in one parsing operation.
   *
   * By default, all lines of a file are being parsed.
   */
  public function setLineLimit($lines) {
    $this->lineLimit = $lines;
  }

  /**
   * Get the byte number where the parser left off after last parse() call.
   *
   * @return
   *  0 if all lines or no line has been parsed, the byte position of where a
   *  timeout or the line limit has been reached otherwise. This position can be
   *  used to set the start byte for the next iteration after parse() has
   *  reached the timeout set with setTimeout() or the line limit set with
   *  setLineLimit().
   *
   * @see ParserCSV::setStartByte()
   */
  public function lastLinePos() {
    return $this->lastLinePos;
  }

  /**
   * Set the byte where file should be started to read.
   *
   * Useful when parsing a file in batches.
   *
   * @return
   *   The start byte that was set.
   */
  public function setStartByte($start) {
    return $this->startByte = $start;
  }

  /**
   * Parse CSV files into a two dimensional array.
   *
   * The start position, the maximum number of lines and the timeout are taken
   * from the values configured with setStartByte(), setLineLimit() and
   * setTimeout().
   *
   * @param Iterator $lineIterator
   *   An Iterator object that yields line strings, e.g. ParserCSVIterator.
   *   Its rewind() is called with the configured start byte, and its
   *   currentPos() method is called when the line limit or timeout is reached.
   *
   * @return
   *   Two dimensional array that contains the data in the CSV file. Rows are
   *   numerically indexed arrays, or arrays keyed by column name if column
   *   names have been set.
   */
  public function parse(Iterator $lineIterator) {
    $skipLine = $this->skipFirstLine;
    $rows = array();

    $this->timeoutReached = FALSE;
    $this->lastLinePos = 0;
    // microtime(TRUE) yields seconds as a float; the timeout is documented in
    // milliseconds, hence the division.
    $maxTime = empty($this->timeout) ? FALSE : (microtime(TRUE) + $this->timeout / 1000);
    $linesParsed = 0;
    $delimiterLength = strlen($this->delimiter);

    for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {

      // Make really sure we've got lines without trailing newlines.
      $line = trim($lineIterator->current(), "\r\n");

      // Skip empty lines. Compare against '' explicitly: empty() would also
      // drop a line that consists of the single value "0".
      if ($line === '') {
        continue;
      }
      // If the first line contains column names, skip it.
      if ($skipLine) {
        $skipLine = FALSE;
        continue;
      }

      // The actual parser. explode() is unfortunately not suitable because the
      // delimiter might be located inside a quoted field, and that would break
      // the field and/or require additional effort to re-join the fields.
      $quoted = FALSE;
      $currentIndex = 0;
      $currentField = '';
      $fields = array();

      // We must use strlen() as we're parsing byte by byte using strpos(), so
      // drupal_strlen() will not work properly. The "<=" lets a trailing
      // delimiter produce a final empty field.
      while ($currentIndex <= strlen($line)) {
        if ($quoted) {
          $nextQuoteIndex = strpos($line, '"', $currentIndex);

          if ($nextQuoteIndex === FALSE) {
            // There's a line break before the quote is closed, so fetch the
            // next line and start from there.
            $currentField .= substr($line, $currentIndex);
            $lineIterator->next();

            if (!$lineIterator->valid()) {
              // Whoa, an unclosed quote! Well whatever, let's just ignore
              // that shortcoming and record it nevertheless.
              $fields[] = $currentField;
              break;
            }
            // Ok, so, on with fetching the next line, as mentioned above.
            $currentField .= "\n";
            $line = trim($lineIterator->current(), "\r\n");
            $currentIndex = 0;
            continue;
          }

          // There's actually another quote in this line...
          // find out whether it's escaped or not.
          $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);

          if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {
            // Escaped quote, add a single one to the field and proceed quoted.
            $currentField .= '"';
            $currentIndex = $nextQuoteIndex + 2;
          }
          else {
            // End of the quoted section, close the quote and let the
            // $quoted == FALSE block finalize the field.
            $quoted = FALSE;
            $currentIndex = $nextQuoteIndex + 1;
          }
        }
        else { // $quoted == FALSE
          // First, let's find out where the next character of interest is.
          $nextQuoteIndex = strpos($line, '"', $currentIndex);
          $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex);

          if ($nextQuoteIndex === FALSE) {
            $nextIndex = $nextDelimiterIndex;
          }
          elseif ($nextDelimiterIndex === FALSE) {
            $nextIndex = $nextQuoteIndex;
          }
          else {
            $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
          }

          if ($nextIndex === FALSE) {
            // This line is done, add the rest of it as last field.
            $currentField .= substr($line, $currentIndex);
            $fields[] = $currentField;
            break;
          }
          elseif ($nextIndex === $nextDelimiterIndex) {
            // The field ends right before the delimiter; skip the complete
            // delimiter, which may be longer than one byte, so that none of
            // its bytes end up in the field.
            $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
            $fields[] = $currentField;
            $currentField = '';
            $currentIndex = $nextIndex + $delimiterLength;
            // Continue with the next field.
          }
          else { // $line[$nextIndex] == '"'
            $quoted = TRUE;
            $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
            $currentIndex = $nextIndex + 1;
            // Continue this field in the $quoted == TRUE block.
          }
        }
      }
      // End of CSV parser. We've now got all the fields of the line as strings
      // in the $fields array.

      if (empty($this->columnNames)) {
        $row = $fields;
      }
      else {
        // Map fields to the configured column names by position; columns
        // without a matching field get an empty string.
        $row = array();
        foreach ($this->columnNames as $columnName) {
          $field = array_shift($fields);
          $row[$columnName] = isset($field) ? $field : '';
        }
      }
      $rows[] = $row;

      // Quit parsing if timeout has been reached or requested lines have been
      // reached.
      if ($maxTime !== FALSE && microtime(TRUE) > $maxTime) {
        $this->timeoutReached = TRUE;
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
      $linesParsed++;
      if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
    }
    return $rows;
  }
}