1 | <?php
|
---|
2 | /**
|
---|
3 | * Html2Pdf Library - parsing Html class
|
---|
4 | *
|
---|
5 | * HTML => PDF converter
|
---|
6 | * distributed under the OSL-3.0 License
|
---|
7 | *
|
---|
8 | * @package Html2pdf
|
---|
9 | * @author Laurent MINGUET <webmaster@html2pdf.fr>
|
---|
10 | * @copyright 2017 Laurent MINGUET
|
---|
11 | */
|
---|
12 | namespace Spipu\Html2Pdf\Parsing;
|
---|
13 |
|
---|
/**
 * Class HtmlLexer
 *
 * Splits an HTML string into a flat list of tokens: tags ("code" tokens)
 * and the text found between them ("txt" tokens). HTML comments are skipped
 * and produce no token.
 */
class HtmlLexer
{
    /**
     * Tokenize the HTML code
     *
     * The input is scanned from left to right. At each position the lexer
     * either skips a complete HTML comment, emits a "code" token for a tag
     * (together with the line number on which the tag starts), or appends
     * the matched characters to a pending text buffer. The buffer is flushed
     * as a single "txt" token right before the next tag and at the end of
     * the input.
     *
     * @param string $html HTML code to tokenize
     *
     * @return Token[]
     *
     * @throws \RuntimeException if the regular expression engine fails
     */
    public function tokenize($html)
    {
        // initialise the array
        $tokens = array();

        // regexp to separate the tags from the texts:
        // group 1 = an opening or closing tag, group 2 = a run of text or a lone "<"
        $reg = '/(<\/?\w[^<>]*>)|([^<]+|<)/is';

        // text accumulated since the last tag
        $str = '';
        $offset = 0;
        $line = 1;
        $length = strlen($html);

        while ($offset < $length) {
            // Is there a comment right here? Only the 4 bytes at the current
            // offset are compared, so the rest of the document is not
            // rescanned on every iteration.
            if (substr_compare($html, '<!--', $offset, 4) === 0) {
                // the comment ends at the first "-->" found after its opening marker
                $end = strpos($html, '-->', $offset + 4);
                if ($end !== false) {
                    $end += 3;

                    // keep the line counter in sync with the skipped comment
                    $line += substr_count($html, "\n", $offset, $end - $offset);
                    $offset = $end;
                    continue;
                }
                // unterminated comment: fall through, it is handled as plain text
            }

            // Every character is either "<" or part of a text run, so a match
            // is always expected here. Without one the offset would not
            // advance and the loop would never terminate.
            if (preg_match($reg, $html, $parse, PREG_OFFSET_CAPTURE, $offset) !== 1) {
                throw new \RuntimeException(
                    sprintf(
                        'Unable to tokenize the HTML at offset %d (PCRE error code %d)',
                        $offset,
                        preg_last_error()
                    )
                );
            }

            if ($parse[1][0]) {
                // it is a tag: save the previous text if it exists
                if ($str !== '') {
                    $tokens[] = new Token('txt', $str);
                    $str = '';
                }

                // save the tag, with the line on which it starts
                $tokens[] = new Token('code', trim($parse[1][0]), $line);
                $line += substr_count($parse[1][0], "\n");
            } else {
                // it is a text: add it to the current text
                $str .= $parse[2][0];
                $line += substr_count($parse[2][0], "\n");
            }

            // move to the end of the match
            $offset = $parse[0][1] + strlen($parse[0][0]);
        }

        // if a text is present in the end, we save it
        if ($str !== '') {
            $tokens[] = new Token('txt', $str);
        }

        return $tokens;
    }
}
|
---|