[347] | 1 | <?php
|
---|
| 2 | /**
|
---|
| 3 | * Html2Pdf Library - parsing Html class
|
---|
| 4 | *
|
---|
| 5 | * HTML => PDF converter
|
---|
| 6 | * distributed under the OSL-3.0 License
|
---|
| 7 | *
|
---|
| 8 | * @package Html2pdf
|
---|
| 9 | * @author Laurent MINGUET <webmaster@html2pdf.fr>
|
---|
| 10 | * @copyright 2017 Laurent MINGUET
|
---|
| 11 | */
|
---|
| 12 | namespace Spipu\Html2Pdf\Parsing;
|
---|
| 13 |
|
---|
| 14 | /**
|
---|
| 15 | * Class HtmlLexer
|
---|
| 16 | */
|
---|
class HtmlLexer
{
    /**
     * Tokenize the HTML code
     *
     * Walks the input from left to right and produces a flat list of tokens:
     *  - a 'code' token for every opening or closing tag, holding the trimmed
     *    tag source and the line number on which the tag starts;
     *  - a 'txt' token for every run of text found between two tags (created
     *    without a line number argument).
     *
     * HTML comments (<!-- ... -->) are skipped: they produce no token and do
     * not split the text around them, but their line breaks are still counted
     * so that the line numbers of the following tags stay accurate.
     * A "<" that does not start a tag or a closed comment is kept as text.
     *
     * @param string $html HTML code to tokenize
     *
     * @return Token[]
     *
     * @throws \RuntimeException if the tokenizing regexp cannot be applied
     */
    public function tokenize($html)
    {
        // initialise the array
        $tokens = array();

        // regexp to separate the tags from the texts:
        // group 1 = a tag, group 2 = a run of text or a lone "<"
        $reg = '/(<\/?\w[^<>]*>)|([^<]+|<)/is';
        // ungreedy (U), so that a comment stops at the first "-->"
        $commentRegex = '/(<!--.*-->)/isU';

        // text accumulated since the last tag
        $str = '';
        $offset = 0;
        $line = 1;
        $length = strlen($html);

        // as long as there is something left to read
        while ($offset < $length) {
            // Look only at the 4 bytes under the cursor: searching the rest of
            // the document for "<!--" on every iteration would make the whole
            // loop quadratic on documents with few comments.
            if (substr($html, $offset, 4) === '<!--'
                && preg_match($commentRegex, $html, $match, PREG_OFFSET_CAPTURE, $offset)
            ) {
                // skip the comment, but keep the line counter in sync
                $line += substr_count($match[1][0], "\n");
                $offset = $match[0][1] + strlen($match[0][0]);
                continue;
            }

            // Without this check, a failed match would leave $parse empty,
            // reset the offset to 0 and make the loop run forever.
            if (preg_match($reg, $html, $parse, PREG_OFFSET_CAPTURE, $offset) !== 1) {
                throw new \RuntimeException(
                    sprintf(
                        'Unable to tokenize the HTML code at offset %d (PCRE error code %d)',
                        $offset,
                        preg_last_error()
                    )
                );
            }

            // if it is a tag (group 1 is an empty string when the text branch matched)
            if ($parse[1][0] !== '') {
                // save the previous text if it exists
                if ($str !== '') {
                    $tokens[] = new Token('txt', $str);
                }

                // save the tag, with the line on which it starts
                $tokens[] = new Token('code', trim($parse[1][0]), $line);
                $line += substr_count($parse[1][0], "\n");

                // init the current text
                $str = '';
            } else {
                // it is a text: add it to the current text
                $str .= $parse[2][0];
                $line += substr_count($parse[2][0], "\n");
            }

            // move the cursor to the end of the match
            $offset = $parse[0][1] + strlen($parse[0][0]);
            unset($parse);
        }

        // if a text is present in the end, we save it
        if ($str !== '') {
            $tokens[] = new Token('txt', $str);
        }

        return $tokens;
    }
}
|
---|