Projet

Général

Profil

Paste
Télécharger (23,2 ko) Statistiques
| Branche: | Révision:

root / drupal7 / sites / all / libraries / fpdi-version / pdf_parser.php @ 76df55b7

1
<?php
2
//
3
//  FPDI - Version 1.4.4
4
//
5
//    Copyright 2004-2013 Setasign - Jan Slabon
6
//
7
//  Licensed under the Apache License, Version 2.0 (the "License");
8
//  you may not use this file except in compliance with the License.
9
//  You may obtain a copy of the License at
10
//
11
//      http://www.apache.org/licenses/LICENSE-2.0
12
//
13
//  Unless required by applicable law or agreed to in writing, software
14
//  distributed under the License is distributed on an "AS IS" BASIS,
15
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
//  See the License for the specific language governing permissions and
17
//  limitations under the License.
18
//
19

    
20
// Type tags: the parser stores one of these at index 0 of every value array
// it returns. Each constant is only defined when it does not exist yet, so the
// file can be loaded next to code that already declared them.
defined('PDF_TYPE_NULL')       or define('PDF_TYPE_NULL', 0);
defined('PDF_TYPE_NUMERIC')    or define('PDF_TYPE_NUMERIC', 1);
defined('PDF_TYPE_TOKEN')      or define('PDF_TYPE_TOKEN', 2);
defined('PDF_TYPE_HEX')        or define('PDF_TYPE_HEX', 3);
defined('PDF_TYPE_STRING')     or define('PDF_TYPE_STRING', 4);
defined('PDF_TYPE_DICTIONARY') or define('PDF_TYPE_DICTIONARY', 5);
defined('PDF_TYPE_ARRAY')      or define('PDF_TYPE_ARRAY', 6);
defined('PDF_TYPE_OBJDEC')     or define('PDF_TYPE_OBJDEC', 7);
defined('PDF_TYPE_OBJREF')     or define('PDF_TYPE_OBJREF', 8);
defined('PDF_TYPE_OBJECT')     or define('PDF_TYPE_OBJECT', 9);
defined('PDF_TYPE_STREAM')     or define('PDF_TYPE_STREAM', 10);
defined('PDF_TYPE_BOOLEAN')    or define('PDF_TYPE_BOOLEAN', 11);
defined('PDF_TYPE_REAL')       or define('PDF_TYPE_REAL', 12);
46
    
47
require_once('pdf_context.php');
48

    
49
if (!class_exists('pdf_parser', false)) {
50
    
51
    class pdf_parser {
52
            
53
            /**
         * Name of the PDF file handed to the constructor
         * @var string
         */
        var $filename;

        /**
         * Handle of the opened PDF file
         * @var resource
         */
        var $f;

        /**
         * Parser context created on top of the file handle
         * @var object pdf_context-Instance
         */
        var $c;

        /**
         * xref-Data as filled by pdf_read_xref(): the keys 'xref',
         * 'trailer', 'xref_location' and 'max_object'
         * @var array
         */
        var $xref;

        /**
         * root-Object, resolved from the trailer's /Root reference
         * @var array
         */
        var $root;

        /**
         * PDF version of the loaded document
         * @var string
         */
        var $pdfVersion;

        /**
         * For reading encrypted documents and xref/objectstreams are in use.
         * NOTE(review): not read anywhere inside this class; presumably
         * consulted by extensions of the parser - confirm before relying on it.
         *
         * @var boolean
         */
        var $readPlain = true;

        /**
         * Object that pdf_resolve_object() is currently assembling.
         * pdf_read_value() reads its dictionary to find the /Length of a
         * stream. Declared here so it is no longer created dynamically.
         *
         * @var array
         */
        var $actual_obj;
95
            
96
        /**
         * Constructor
         *
         * Opens the file, detects the PDF version, loads the cross-reference
         * data, rejects encrypted documents and resolves the /Root object.
         * Every failure on the way ends in error().
         *
         * @param string $filename  Source-Filename
         */
        function __construct($filename) {
            $this->filename = $filename;

            // '@' only hides fopen()'s own warning; the failure itself is
            // reported through error() right below
            $this->f = @fopen($this->filename, 'rb');

            if (!$this->f)
                $this->error(sprintf('Cannot open %s !', $filename));

            $this->getPDFVersion();

            $this->c = new pdf_context($this->f);

            // Read xref-Data
            $this->xref = array();
            $this->pdf_read_xref($this->xref, $this->pdf_find_xref());

            // Check for Encryption
            $this->getEncryption();

            // Read root
            $this->pdf_read_root();
        }

        /**
         * PHP 4 style constructor
         *
         * Class-named constructors are no longer invoked by "new" as of
         * PHP 8, so the work lives in __construct(). This method is kept so
         * that existing callers such as parent::pdf_parser($filename) keep working.
         *
         * @param string $filename  Source-Filename
         */
        function pdf_parser($filename) {
            $this->__construct($filename);
        }
123
        
124
        /**
         * Close the opened file
         *
         * Does nothing when the handle is missing or is not an open
         * resource; otherwise closes it and removes the property.
         */
        function closeFile() {
            if (!isset($this->f) || !is_resource($this->f)) {
                return;
            }

            fclose($this->f);
            unset($this->f);
        }
133
        
134
        /**
         * Print Error and die
         *
         * Ends the script; the message is printed behind a bold
         * "PDF-Parser Error:" label.
         *
         * @param string $msg  Error-Message
         */
        function error($msg) {
            $output = '<b>PDF-Parser Error:</b> ' . $msg;
            exit($output);
        }
142
        
143
        /**
         * Check Trailer for Encryption
         *
         * Reports an error as soon as the trailer dictionary carries an
         * /Encrypt entry.
         */
        function getEncryption() {
            $isEncrypted = isset($this->xref['trailer'][1]['/Encrypt']);

            if ($isEncrypted) {
                $this->error('File is encrypted!');
            }
        }
151
        
152
            /**
         * Find/Return /Root
         *
         * Takes the /Root entry from the trailer dictionary and reports an
         * error unless it is an indirect reference.
         *
         * @return array the reference as array(PDF_TYPE_OBJREF, object number, generation)
         */
        function pdf_find_root() {
            $rootEntry = $this->xref['trailer'][1]['/Root'];

            if ($rootEntry[0] != PDF_TYPE_OBJREF) {
                $this->error('Wrong Type of Root-Element! Must be an indirect reference');
            }

            return $rootEntry;
        }
164
    
165
        /**
         * Read the /Root
         *
         * Resolves the trailer's /Root reference through the main context and
         * stores the resulting object in $this->root.
         */
        function pdf_read_root() {
            $rootReference = $this->pdf_find_root();
            $this->root = $this->pdf_resolve_object($this->c, $rootReference);
        }
172
        
173
        /**
         * Get PDF-Version
         *
         * Reads the first 16 bytes of the file and stores the first
         * "digit.digit" sequence found there in $this->pdfVersion. The
         * property is left untouched when no such sequence exists.
         *
         * @return string the stored version
         */
        function getPDFVersion() {
            fseek($this->f, 0);
            $header = fread($this->f, 16);

            preg_match('/\d\.\d/', $header, $found);
            if (isset($found[0])) {
                $this->pdfVersion = $found[0];
            }

            return $this->pdfVersion;
        }
185
        
186
        /**
         * Find the xref-Table
         *
         * Looks at the last 1500 bytes of the file (or the whole file when it
         * is shorter) and returns the first number that follows the last
         * "startxref" keyword. Reports an error when no number is found.
         *
         * @return integer byte offset announced by "startxref"
         */
        function pdf_find_xref() {
            $tailSize = 1500;

            // The relative seek fails on files shorter than the window;
            // start at the beginning of the file in that case
            if (fseek($this->f, -$tailSize, SEEK_END) === -1) {
                fseek($this->f, 0);
            }
            $tail = fread($this->f, $tailSize);

            // Keep only what comes after the last occurrence of the keyword
            $keyword = 'startxref';
            $keywordPos = strrpos($tail, $keyword);
            if ($keywordPos === false) {
                $afterKeyword = '';
            } else {
                $afterKeyword = substr($tail, $keywordPos + strlen($keyword));
            }

            if (!preg_match('/\s*(\d+).*$/s', $afterKeyword, $matches)) {
                $this->error('Unable to find pointer to xref table');
            }

            return (int) $matches[1];
        }
207
    
208
        /**
         * Read xref-table
         *
         * Parses the cross-reference section found at $offset and merges it
         * into $result, then follows the trailer's /Prev entry recursively.
         * Keys written to $result:
         *  - 'xref_location': position of the first "xref" keyword found
         *  - 'max_object':    highest "first + count" value of all subsections
         *  - 'xref':          [object number][generation] => byte offset, or
         *                     null for entries not marked 'n'
         *  - 'trailer':       trailer dictionary of the first section read
         * Values that are already present are never overwritten, so the
         * section read first wins over those reached through /Prev.
         *
         * @param array $result Array of xref-table
         * @param integer $offset of xref-table
         * @return boolean always true; failures end in error()
         */
        function pdf_read_xref(&$result, $offset) {
            // Start up to 20 bytes in front of the offset so that a slightly
            // wrong pointer still has the keyword inside the window
            $o_pos = $offset - min(20, $offset);
            fseek($this->f, $o_pos);

            $data = fread($this->f, 100);

            $xrefPos = strrpos($data, 'xref');

            if ($xrefPos === false) {
                // No keyword: inspect what sits at the offset to choose the message
                fseek($this->f, $offset);
                $c = new pdf_context($this->f);
                $xrefStreamObjDec = $this->pdf_read_value($c);

                if (is_array($xrefStreamObjDec) && isset($xrefStreamObjDec[0]) && $xrefStreamObjDec[0] == PDF_TYPE_OBJDEC) {
                    $this->error(sprintf('This document (%s) probably uses a compression technique which is not supported by the free parser shipped with FPDI.', $this->filename));
                } else {
                    $this->error('Unable to find xref table.');
                }
            }

            if (!isset($result['xref_location'])) {
                $result['xref_location'] = $o_pos + $xrefPos;
                $result['max_object'] = 0;
            }

            $cycles = -1;
            $bytesPerCycle = 100;

            // Set the handle directly after the "xref"-keyword
            $o_pos = $o_pos + $xrefPos + 4;
            fseek($this->f, $o_pos);
            $data = fread($this->f, $bytesPerCycle);

            // Append chunks until "trailer" shows up. Each search starts one
            // chunk in front of the newest data, so a keyword split across
            // two chunks is still found.
            while (($trailerPos = strpos($data, 'trailer', max($bytesPerCycle * $cycles++, 0))) === false && !feof($this->f)) {
                $data .= fread($this->f, $bytesPerCycle);
            }

            if ($trailerPos === false) {
                $this->error('Trailer keyword not found after xref table');
            }

            $data = substr($data, 0, $trailerPos);

            // Detect the line ending from the first 100 bytes. explode() is
            // only used when exactly one kind of line break was seen; with
            // none or several the regular expression split is used, so the
            // separator can never be empty or undefined.
            preg_match_all("/(\r\n|\n|\r)/", substr($data, 0, 100), $m);
            $lineEndings = array_unique($m[0]);

            if (count($lineEndings) == 1) {
                $lines = explode($m[0][0], $data);
            } else {
                $lines = preg_split("/(\r\n|\n|\r)/", $data, -1, PREG_SPLIT_NO_EMPTY);
            }

            unset($data, $lineEndings, $m);

            // Object number the next entry line belongs to
            $start = 1;

            foreach ($lines as $rawLine) {
                $line = trim($rawLine);
                if (!$line) {
                    continue;
                }

                $pieces = explode(' ', $line);

                switch (count($pieces)) {
                    case 2:
                        // Subsection header: "first-object-number count"
                        $start = (int) $pieces[0];
                        $end   = $start + (int) $pieces[1];
                        if ($end > $result['max_object'])
                            $result['max_object'] = $end;
                        break;
                    case 3:
                        // Entry: "offset generation flag"
                        if (!isset($result['xref'][$start]))
                            $result['xref'][$start] = array();

                        $gen = (int) $pieces[1];
                        if (!array_key_exists($gen, $result['xref'][$start])) {
                            $result['xref'][$start][$gen] = $pieces[2] == 'n' ? (int) $pieces[0] : null;
                        }
                        $start++;
                        break;
                    default:
                        $this->error('Unexpected data in xref table');
                }
            }

            // Release the parsed lines before a possible recursion
            unset($lines, $rawLine, $line, $pieces, $start, $end, $gen);

            // 7 = strlen('trailer')
            fseek($this->f, $o_pos + $trailerPos + 7);

            $c = new pdf_context($this->f);
            $trailer = $this->pdf_read_value($c);
            unset($c);

            if (!isset($result['trailer'])) {
                $result['trailer'] = $trailer;
            }

            // Continue with the previous cross-reference section, if any
            if (isset($trailer[1]['/Prev'])) {
                $this->pdf_read_xref($result, $trailer[1]['/Prev'][1]);
            }

            unset($trailer);

            return true;
        }
323
        
324
        /**
         * Reads a value
         *
         * Converts the given token (or, when none is given, the next token of
         * the context) into a value array whose first element is one of the
         * PDF_TYPE_* constants:
         *  - array(PDF_TYPE_HEX, string)        text between '<' and '>'
         *  - array(PDF_TYPE_DICTIONARY, array)  key token => value array
         *  - array(PDF_TYPE_ARRAY, array)       list of value arrays
         *  - array(PDF_TYPE_STRING, string)     raw text between the outer parentheses
         *  - array(PDF_TYPE_STREAM, string)     stream data
         *  - array(PDF_TYPE_OBJDEC, int, int)   "number generation obj"
         *  - array(PDF_TYPE_OBJREF, int, int)   "number generation R"
         *  - array(PDF_TYPE_NUMERIC, int) or array(PDF_TYPE_REAL, float)
         *  - array(PDF_TYPE_BOOLEAN, bool)
         *  - array(PDF_TYPE_NULL)
         *  - array(PDF_TYPE_TOKEN, string)      everything else
         *
         * @param object $c pdf_context
         * @param string $token a Token
         * @return mixed the value array, or false when the data ends too early
         */
        function pdf_read_value(&$c, $token = null) {
            if (is_null($token)) {
                $token = $this->pdf_read_token($c);
            }

            if ($token === false) {
                return false;
            }

            switch ($token) {
                case '<':
                    // Hex string: everything up to the closing '>'.
                    // Load more data for as long as the terminator is missing.
                    $searchFrom = $c->offset;

                    while (($closing = strpos($c->buffer, '>', $searchFrom)) === false) {
                        if (!$c->increase_length()) {
                            return false;
                        }
                    }

                    $hex = substr($c->buffer, $c->offset, $closing - $c->offset);
                    $c->offset = $closing + 1;

                    return array(PDF_TYPE_HEX, $hex);

                case '<<':
                    // Dictionary: alternate between key token and value
                    // until the closing '>>' is read.
                    $entries = array();

                    for (;;) {
                        $key = $this->pdf_read_token($c);
                        if ($key === '>>') {
                            break;
                        }
                        if ($key === false) {
                            return false;
                        }

                        $value = $this->pdf_read_value($c);
                        if ($value === false) {
                            return false;
                        }

                        // A key directly followed by '>>' has no value:
                        // store a null value and end the dictionary
                        if ($value[0] == PDF_TYPE_TOKEN && $value[1] == '>>') {
                            $entries[$key] = array(PDF_TYPE_NULL);
                            break;
                        }

                        $entries[$key] = $value;
                    }

                    return array(PDF_TYPE_DICTIONARY, $entries);

                case '[':
                    // Array: collect values until the closing ']' is read.
                    $items = array();

                    for (;;) {
                        $token = $this->pdf_read_token($c);
                        if ($token === ']') {
                            break;
                        }
                        if ($token === false) {
                            return false;
                        }

                        $value = $this->pdf_read_value($c, $token);
                        if ($value === false) {
                            return false;
                        }

                        $items[] = $value;
                    }

                    return array(PDF_TYPE_ARRAY, $items);

                case '(':
                    // Literal string: scan for the matching ')'. Nested
                    // parentheses are counted and the character after a
                    // backslash is skipped.
                    $cursor = $c->offset;
                    $depth = 1;

                    do {
                        while ($depth != 0 && $cursor < $c->length) {
                            $ch = $c->buffer[$cursor];
                            if ($ch === '(') {
                                $depth++;
                            } elseif ($ch === ')') {
                                $depth--;
                            } elseif ($ch === '\\') {
                                $cursor++;
                            }
                            $cursor++;
                        }
                    } while ($depth != 0 && $c->increase_length());

                    // $cursor sits one past the closing parenthesis
                    $text = substr($c->buffer, $c->offset, $cursor - $c->offset - 1);
                    $c->offset = $cursor;

                    return array(PDF_TYPE_STRING, $text);

                case 'stream':
                    // File position of the first byte behind the keyword:
                    // (file pointer - buffer size) + offset inside the buffer.
                    // NOTE(review): presumes the file pointer stands at the
                    // end of the buffered data - confirm in pdf_context.
                    $bufferStart = ftell($c->file) - strlen($c->buffer);
                    $offsetInBuffer = $c->offset;
                    $streamStart = $bufferStart + $offsetInBuffer;

                    $c->reset($streamStart);

                    // Number of line break bytes in front of the stream data:
                    // one for a single CR or LF, two for CR LF
                    $eolBytes = 0;
                    if ($c->buffer[0] == "\n" || $c->buffer[0] == "\r")
                        $eolBytes++;
                    if ($c->buffer[1] == "\n" && $c->buffer[0] != "\n")
                        $eolBytes++;

                    // /Length comes from the dictionary of the object that is
                    // currently being resolved; it may itself be a reference
                    if ($this->actual_obj[1][1]['/Length'][0] == PDF_TYPE_OBJREF) {
                        $lengthContext = new pdf_context($this->f);
                        $lengthObject = $this->pdf_resolve_object($lengthContext, $this->actual_obj[1][1]['/Length']);
                        $length = $lengthObject[1][1];
                    } else {
                        $length = $this->actual_obj[1][1]['/Length'][1];
                    }

                    if ($length > 0) {
                        $c->reset($streamStart + $eolBytes, $length);
                        $streamData = $c->buffer;
                    } else {
                        $streamData = '';
                    }

                    // Continue behind the data plus 9 bytes = strlen("endstream")
                    $c->reset($streamStart + $eolBytes + $length + 9);

                    return array(PDF_TYPE_STREAM, $streamData);

                default:
                    if (!is_numeric($token)) {
                        if ($token == 'true' || $token == 'false') {
                            return array(PDF_TYPE_BOOLEAN, $token == 'true');
                        }
                        if ($token == 'null') {
                            return array(PDF_TYPE_NULL);
                        }

                        // Just a token. Return it.
                        return array(PDF_TYPE_TOKEN, $token);
                    }

                    // A number may open "N G obj" or "N G R": look ahead up to
                    // two tokens and push them back when the pattern fails.
                    $second = $this->pdf_read_token($c);
                    if ($second !== false) {
                        if (is_numeric($second)) {
                            $third = $this->pdf_read_token($c);
                            if ($third !== false) {
                                if ($third == 'obj') {
                                    return array(PDF_TYPE_OBJDEC, (int) $token, (int) $second);
                                }
                                if ($third == 'R') {
                                    return array(PDF_TYPE_OBJREF, (int) $token, (int) $second);
                                }

                                // Plain number after all: return the
                                // look-ahead tokens to the stack
                                $c->stack[] = $third;
                            }
                        }

                        $c->stack[] = $second;
                    }

                    // An integer only when the token survives an int round trip
                    if ($token === (string) ((int) $token)) {
                        return array(PDF_TYPE_NUMERIC, (int) $token);
                    }

                    return array(PDF_TYPE_REAL, (float) $token);
            }
        }
515
        
516
        /**
         * Resolve an object
         *
         * Turns an indirect reference into the object it points to:
         *  - anything that is not an array yields false
         *  - a value that is not a reference is returned unchanged
         *  - a reference without a usable xref entry yields null
         *  - otherwise the object is read from the file; the position the
         *    context had before is restored afterwards
         *
         * With $encapsulate the result is array(PDF_TYPE_OBJECT, 'obj' => number,
         * 'gen' => generation, <values...>). Element 0 becomes PDF_TYPE_STREAM
         * when the second value read is a stream.
         *
         * @param object $c pdf_context
         * @param array $obj_spec The object-data
         * @param boolean $encapsulate Must set to true, cause the parsing and fpdi use this method only without this para
         * @return mixed
         */
        function pdf_resolve_object(&$c, $obj_spec, $encapsulate = true) {
            // Exit if we get invalid data
            if (!is_array($obj_spec)) {
                return false;
            }

            // Only references need resolving
            if ($obj_spec[0] != PDF_TYPE_OBJREF) {
                return $obj_spec;
            }

            // No offset known for this object (missing or unused xref entry)
            if (!isset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]])) {
                return null;
            }

            // Save current file position
            // This is needed if you want to resolve
            // references while you're reading another object
            // (e.g.: if you need to determine the length
            // of a stream)
            $old_pos = ftell($c->file);

            // Reposition the file pointer and load the object header.
            $c->reset($this->xref['xref'][$obj_spec[1]][$obj_spec[2]]);

            $header = $this->pdf_read_value($c);

            if ($header[0] != PDF_TYPE_OBJDEC || $header[1] != $obj_spec[1] || $header[2] != $obj_spec[2]) {
                // The xref offset is off: look for the header inside the
                // buffer. The lookbehind stops "1 0 obj" from matching the
                // tail of "11 0 obj", which would load the wrong object.
                $toSearchFor = $obj_spec[1] . ' ' . $obj_spec[2] . ' obj';
                $pattern = '/(?<![0-9])' . preg_quote($toSearchFor, '/') . '/';

                if (preg_match($pattern, $c->buffer, $found, PREG_OFFSET_CAPTURE)) {
                    $c->offset = $found[0][1] + strlen($toSearchFor);
                    // reset stack
                    $c->stack = array();
                } else {
                    $this->error("Unable to find object ({$obj_spec[1]}, {$obj_spec[2]}) at expected location");
                }
            }

            // If we're being asked to store all the information
            // about the object, we add the object ID and generation
            // number for later use.
            // actual_obj is bound by reference, so pdf_read_value() sees the
            // values collected so far (needed for the /Length of a stream).
            $result = array();
            $this->actual_obj =& $result;
            if ($encapsulate) {
                $result = array(
                    PDF_TYPE_OBJECT,
                    'obj' => $obj_spec[1],
                    'gen' => $obj_spec[2]
                );
            }

            // Now simply read the object data until
            // we encounter an end-of-object marker
            while (1) {
                $value = $this->pdf_read_value($c);
                if ($value === false || count($result) > 4) {
                    // in this case the parser couldn't find an endobj so we break here
                    break;
                }

                if ($value[0] == PDF_TYPE_TOKEN && $value[1] === 'endobj') {
                    break;
                }

                $result[] = $value;
            }

            $c->reset($old_pos);

            if (isset($result[2][0]) && $result[2][0] == PDF_TYPE_STREAM) {
                $result[0] = PDF_TYPE_STREAM;
            }

            return $result;
        }
602
    
603
        
604
        
605
        /**
         * Reads a token from the file
         *
         * Tokens that were pushed back onto $c->stack are returned first.
         * Otherwise leading whitespace is skipped and one of these is returned:
         *  - a single delimiter: '[', ']', '(', ')', '<' or '>'
         *  - a doubled delimiter: '<<' or '>>'
         *  - a run of regular characters, including its first character
         *    (so a name keeps its leading '/')
         * A comment ('%' up to the end of the line) is skipped and the token
         * after it is returned.
         *
         * @param object $c pdf_context
         * @return mixed the token, or false when no more data is available
         */
        function pdf_read_token(&$c)
        {
            // If there is a token available
            // on the stack, pop it out and
            // return it.
            if (count($c->stack)) {
                return array_pop($c->stack);
            }

            // Strip away any whitespace
            do {
                if (!$c->ensure_content()) {
                    return false;
                }
                $c->offset += strspn($c->buffer, "\x20\x0A\x0C\x0D\x09\x00", $c->offset);
            } while ($c->offset >= $c->length - 1);

            // Get the first character in the stream
            $char = $c->buffer[$c->offset++];

            switch ($char) {

                case '[':
                case ']':
                case '(':
                case ')':
                    // This is either an array or literal string
                    // delimiter, Return it
                    return $char;

                case '<':
                case '>':
                    // This could either be a hex string or
                    // dictionary delimiter. Determine the
                    // appropriate case and return the token
                    if ($c->buffer[$c->offset] == $char) {
                        if (!$c->ensure_content()) {
                            return false;
                        }
                        $c->offset++;
                        return $char . $char;
                    }

                    return $char;

                case '%':
                    // This is a comment - jump over it!
                    $pos = $c->offset;
                    while (1) {
                        $match = preg_match("/(\r\n|\r|\n)/", $c->buffer, $m, PREG_OFFSET_CAPTURE, $pos);
                        if ($match === 0) {
                            // No line break buffered yet: load more or give up
                            if (!$c->increase_length()) {
                                return false;
                            }
                            continue;
                        }

                        $c->offset = $m[0][1] + strlen($m[0][0]);

                        return $this->pdf_read_token($c);
                    }

                default:
                    // This is "another" type of token (probably
                    // a dictionary entry or a numeric value)
                    // Find the end and return it.
                    if (!$c->ensure_content()) {
                        return false;
                    }

                    while (1) {
                        // Determine the length of the token
                        $pos = strcspn($c->buffer, "\x20%[]<>()/\x0A\x0C\x0D\x09\x00", $c->offset);

                        if ($c->offset + $pos <= $c->length - 1) {
                            break;
                        }

                        // The token may span beyond the end of the current
                        // buffer, so try to load more data. The other call
                        // sites in this class treat a false result as "no
                        // more data"; in that case the token ends with the
                        // buffer. Retrying here would never terminate.
                        if (!$c->increase_length()) {
                            break;
                        }
                    }

                    // offset already points behind the first character
                    $result = substr($c->buffer, $c->offset - 1, $pos + 1);

                    $c->offset += $pos;
                    return $result;
            }
        }
718
    }
719
}