1 | <?php
|
---|
2 | /**
|
---|
3 | * Html2Pdf Library
|
---|
4 | *
|
---|
5 | * HTML => PDF converter
|
---|
6 | * distributed under the OSL-3.0 License
|
---|
7 | *
|
---|
8 | * @package Html2pdf
|
---|
9 | * @author Laurent MINGUET <webmaster@html2pdf.fr>
|
---|
10 | * @copyright 2017 Laurent MINGUET
|
---|
11 | */
|
---|
12 | namespace Spipu\Html2Pdf\Parsing;
|
---|
13 |
|
---|
14 | use Spipu\Html2Pdf\Exception\HtmlParsingException;
|
---|
15 |
|
---|
/**
 * HTML parser of Html2Pdf.
 *
 * Converts a list of lexer tokens into a flat list of "action" nodes (opening
 * tags, closing tags and 'write' text nodes) stored in {@see Html::$code}, and
 * offers helpers to prepare raw HTML and to extract one nesting level of the
 * parsed code.
 */
class Html
{
    /**
     * Replacement used for a tab character found inside a <pre>/<code> block.
     *
     * NOTE(review): as written a tab becomes one plain space. Confirm the
     * literal was not altered in transit (e.g. a non-breaking space or an
     * HTML entity sequence that got decoded/collapsed).
     */
    const HTML_TAB = ' ';

    /**
     * Parser used to analyze each tag token.
     *
     * @var TagParser
     */
    protected $tagParser;

    /**
     * Parser used to prepare the text of 'write' actions.
     *
     * @var TextParser
     */
    protected $textParser;

    /**
     * Are we currently inside a <pre> (or <code>) tag?
     *
     * @var boolean
     */
    protected $tagPreIn = false;

    /**
     * Parsed HTML code: the list of actions produced by parse().
     *
     * @var Node[]
     */
    public $code = array();

    /**
     * Main constructor.
     *
     * @param TextParser $textParser text parser, also handed to the internal TagParser
     */
    public function __construct(TextParser $textParser)
    {
        $this->textParser = $textParser;
        $this->tagParser = new TagParser($this->textParser);
        $this->code = array();
    }

    /**
     * Get the list of the parsed actions, each one cloned.
     *
     * Keys of the original list are preserved.
     *
     * @return Node[]
     */
    public function getCloneCodes()
    {
        $codes = array();
        foreach ($this->code as $key => $code) {
            $codes[$key] = clone $code;
        }

        return $codes;
    }

    /**
     * Parse the HTML tokens and store the resulting actions in $this->code.
     *
     * Steps:
     *  1. convert every 'code' / 'txt' token into one or more actions
     *     (tokens of any other type are ignored);
     *  2. trim the text of each 'write' action according to its neighbours,
     *     dropping the actions whose text ends up empty;
     *  3. fail if some tags were left open;
     *  4. check that every thead / tfoot contains at least one tr.
     *
     * @param Token[] $tokens A list of tokens to parse
     *
     * @throws HtmlParsingException
     */
    public function parse($tokens)
    {
        // stack of the names of the currently opened tags, maintained by getTagAction()
        $parents = array();

        // flag : are we in a <pre> Tag ?
        $this->tagPreIn = false;

        /**
         * all the actions to do
         * @var Node[] $actions
         */
        $actions = array();

        // get the actions from the html tokens
        foreach ($tokens as $token) {
            $type = $token->getType();
            if ($type === 'code') {
                $newActions = $this->getTagAction($token, $parents);
            } elseif ($type === 'txt') {
                $newActions = $this->getTextAction($token);
            } else {
                continue;
            }

            // plain append: avoids re-copying the whole list for each token
            foreach ($newActions as $newAction) {
                $actions[] = $newAction;
            }
        }

        // for each identified action, we have to clean up the beginning and the end of the text
        // based on the tags that surround it

        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        // list of the tags to move space
        $tagsToSpace = array(
            'span', 'font', 'label',
            'strong', 'b',
            'address', 'cite', 'em', 'i', 'samp',
            's',
            'ins', 'u',
            'big', 'small', 'sub', 'sup'
        );

        // foreach action
        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            $action = $actions[$k];

            // only the texts are concerned
            if ($action->getName() !== 'write') {
                continue;
            }

            // if the tag before the text is a tag to clean => ltrim on the text.
            // The previous entry may be an empty text that was already removed
            // from the list, hence the isset() guard.
            if ($k > 0
                && isset($actions[$k - 1])
                && in_array($actions[$k - 1]->getName(), $tagsToClean, true)
            ) {
                $action->setParam('txt', ltrim($action->getParam('txt')));
            }

            if ($k < $nb - 1) {
                $nextName = $actions[$k + 1]->getName();

                // if the tag after the text is a tag to clean => rtrim on the text
                if (in_array($nextName, $tagsToClean, true)) {
                    $action->setParam('txt', rtrim($action->getParam('txt')));
                }

                // if the tag after the text is a tag with space to move
                // => move the trailing space to the beginning of the next text
                if (in_array($nextName, $tagsToSpace, true)
                    && substr($action->getParam('txt'), -1) === ' '
                ) {
                    $action->setParam('txt', rtrim($action->getParam('txt')));
                    for ($subK = $k + 2; $subK < $nb; $subK++) {
                        if ($actions[$subK]->getName() === 'write') {
                            $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
                            break;
                        }
                    }
                }
            }

            // if the text is empty => remove the action
            if ((string) $action->getParam('txt') === '') {
                unset($actions[$k]);
            }
        }

        // if we are not on the level 0 => HTML validator ERROR
        if (count($parents)) {
            if (count($parents) > 1) {
                $errorMsg = 'The following tags have not been closed:';
            } else {
                $errorMsg = 'The following tag has not been closed:';
            }

            $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
            $e->setInvalidTag($parents[0]);
            throw $e;
        }

        $this->verifyMustContain($actions, 'thead', 'tr');
        $this->verifyMustContain($actions, 'tfoot', 'tr');

        // save the actions to do, re-indexed because some texts may have been removed
        $this->code = array_values($actions);
    }

    /**
     * Verify that each $mainTag element contains at least one $mustTag action.
     *
     * The check is done when the closing $mainTag is met; the "found" flag is
     * reset each time a $mainTag is opened.
     *
     * @param Node[] $actions
     * @param string $mainTag
     * @param string $mustTag
     *
     * @return bool always true when no exception is thrown
     * @throws HtmlParsingException
     */
    protected function verifyMustContain(&$actions, $mainTag, $mustTag)
    {
        $depth = 0;
        $found = false;

        foreach ($actions as $action) {
            $name = $action->getName();

            if ($name === $mainTag && !$action->isClose()) {
                $depth++;
                $found = false;
            }

            if ($name === $mustTag && $depth > 0) {
                $found = true;
            }

            if ($name === $mainTag && $action->isClose()) {
                if (!$found) {
                    $exception = new HtmlParsingException(
                        "The tag [$mainTag] must contain at least one tag [$mustTag]"
                    );
                    $exception->setInvalidTag($name);
                    $exception->setHtmlLine($action->getLine());
                    throw $exception;
                }
                $depth--;
            }
        }

        return true;
    }

    /**
     * Get the action(s) corresponding to a tag token.
     *
     * Tags that must be closed are validated against the $parents stack:
     * an opening tag pushes its name, a closing tag must match the top of the
     * stack and pops it. An auto-closed tag produces two actions (the opening
     * node, then a parameter-less closing clone). The <pre>/<code> flag is
     * updated on the way.
     *
     * TODO remove the reference on the $parents variable
     *
     * @param Token $token
     * @param array $parents stack of the opened tag names, updated in place
     *
     * @return array
     * @throws HtmlParsingException
     */
    protected function getTagAction(Token $token, &$parents)
    {
        // tags that do not need to be closed
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // analyze the HTML code
        $node = $this->tagParser->analyzeTag($token->getData());

        // save the current position in the HTML code
        $node->setLine($token->getLine());

        $actions = array();

        // if the tag must be closed
        if (!in_array($node->getName(), $tagsNotClosed, true)) {
            if ($node->isClose()) {
                // closure tag => HTML validation against the opened tags
                if (count($parents) < 1) {
                    $e = new HtmlParsingException('Too many tag closures found for ['.$node->getName().']');
                    $e->setInvalidTag($node->getName());
                    $e->setHtmlLine($token->getLine());
                    throw $e;
                }

                if (end($parents) !== $node->getName()) {
                    $e = new HtmlParsingException('Tags are closed in a wrong order for ['.$node->getName().']');
                    $e->setInvalidTag($node->getName());
                    $e->setHtmlLine($token->getLine());
                    throw $e;
                }

                array_pop($parents);
            } elseif ($node->isAutoClose()) {
                // auto-closed tag => save the opened tag...
                $actions[] = $node;

                // ...and prepare the closed tag, which is saved below
                $node = clone $node;
                $node->setParams(array());
                $node->setClose(true);
            } else {
                // opening tag => add a child for validation
                $parents[] = $node->getName();
            }

            // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
            if (($node->getName() === 'pre' || $node->getName() === 'code') && !$node->isAutoClose()) {
                $this->tagPreIn = !$node->isClose();
            }
        }

        // save the action to convert
        $actions[] = $node;

        return $actions;
    }

    /**
     * Get the action(s) corresponding to a text token.
     *
     * Outside of a <pre> tag the whole text gives a single 'write' action.
     * Inside a <pre> tag the text is split on line feeds: each line gives a
     * 'write' action, and a 'br' action is inserted between two lines.
     *
     * @param Token $token
     *
     * @return array
     */
    protected function getTextAction(Token $token)
    {
        // action to use between the lines of the content of a <pre> Tag
        $tagPreBr = new Node('br', array('style' => array(), 'num' => 0), false);

        $actions = array();

        // if we are not in a <pre> tag => a single text action
        if (!$this->tagPreIn) {
            $actions[] = new Node('write', array('txt' => $this->textParser->prepareTxt($token->getData())), false);

            return $actions;
        }

        // else (if we are in a <pre> tag) => one text action per line
        $data = str_replace("\r", '', $token->getData());
        $lines = explode("\n", $data);

        foreach ($lines as $k => $txt) {
            // transform the line
            $txt = str_replace("\t", self::HTML_TAB, $txt);
            // NOTE(review): search and replacement look identical here, which
            // makes this call a no-op. Presumably the replacement was meant to
            // be a non-breaking space - confirm against the reference source.
            $txt = str_replace(' ', ' ', $txt);

            // add a break line before every line except the first one
            if ($k > 0) {
                $actions[] = clone $tagPreBr;
            }

            // save the action
            $actions[] = new Node('write', array('txt' => $this->textParser->prepareTxt($txt, false)), false);
        }

        return $actions;
    }

    /**
     * Get a full level of HTML, between an opening tag and its corresponding closing tag.
     *
     * Starting at position $k, every action is collected until the tag found
     * at $k is balanced again (same name, depth back to 0) or until the end of
     * the code. The delimiting opening and closing tags are not part of the
     * result. If $k points to a text, only this text is returned.
     *
     * @param integer $k position of the first action in $this->code
     * @return array actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return array();
        }

        // the tag to detect
        $detect = $this->code[$k]->getName();

        // if it is a text => return
        if ($detect === 'write') {
            return array($this->code[$k]);
        }

        $level = 0;      // depth level
        $code = array(); // extracted code

        while (true) {
            /** @var Node $node */
            $node = $this->code[$k];

            $keep = true;      // do we take the current action into account ?
            $finished = false; // did we reach the corresponding closing tag ?

            // a text can never match here, as $detect is not 'write'
            if ($node->getName() === $detect) {
                // if we are just at the root level => dont take it
                if ($level === 0) {
                    $keep = false;
                }

                // update the level
                $level += ($node->isClose() ? -1 : 1);

                // if we are now at the root level => it is the end, and dont take it
                if ($level === 0) {
                    $keep = false;
                    $finished = true;
                }
            }

            if ($keep) {
                $code[] = $node;
            }

            // it continues as long as there is code to analyze
            if ($finished || !isset($this->code[$k + 1])) {
                break;
            }
            $k++;
        }

        // return the extract
        return $code;
    }

    /**
     * Prepare the HTML.
     *
     * A real HTML page (one that has a body tag) is first converted by
     * getHtmlFromRealPage(). Then the [[date_*]] constants are replaced, all
     * of them from the same instant so that they stay consistent together.
     *
     * @param string $html
     *
     * @return string
     */
    public function prepareHtml($html)
    {
        // if it is a real html page, we have to convert it
        if (stripos($html, '<body') !== false) {
            $html = $this->getHtmlFromRealPage($html);
        }

        // replace some constants, using a single snapshot of the current time
        $now = time();

        return str_replace(
            array('[[date_y]]', '[[date_m]]', '[[date_d]]', '[[date_h]]', '[[date_i]]', '[[date_s]]'),
            array(
                date('Y', $now), date('m', $now), date('d', $now),
                date('H', $now), date('i', $now), date('s', $now)
            ),
            $html
        );
    }

    /**
     * Convert the HTML of a real page, to a code adapted to Html2Pdf.
     *
     * The body tag becomes a page tag (its attributes are kept), and the link
     * and style tags found anywhere in the original HTML are added before it.
     *
     * @param string $html HTML code of a real page
     * @return string HTML adapted to Html2Pdf
     */
    protected function getHtmlFromRealPage($html)
    {
        // set the body tags to lower case, whatever their original case is
        $html = str_ireplace('</body', '</body', $html);
        $html = str_ireplace('<body', '<body', $html);

        // explode from the body tag. If no body tag => nothing to convert
        $res = explode('<body', $html);
        if (count($res) < 2) {
            return $html;
        }

        // the html content is between the opening and the closing of the body tag
        $content = explode('</body', '<page'.$res[1]);
        $content = $content[0].'</page>';

        // extract the link tags from the original html
        // and add them before the content
        preg_match_all('/<link ([^>]*)[\/]?>/isU', $html, $match);
        foreach ($match[1] as $src) {
            $content = '<link '.$src.'/>'.$content;
        }

        // extract the css style tags from the original html
        // and add them before the content
        preg_match_all('/<style[^>]*>(.*)<\/style[^>]*>/isU', $html, $match);
        foreach ($match[0] as $src) {
            $content = $src.$content;
        }

        return $content;
    }
}