source: trunk/client/inc/hpdf5/spipu/html2pdf/src/Parsing/Html.php@ 424

Last change on this file since 424 was 347, checked in by roby, 4 years ago

Aggiornamento per compatibilità con php7.4

File size: 14.3 KB
Line 
1<?php
2/**
3 * Html2Pdf Library
4 *
5 * HTML => PDF converter
6 * distributed under the OSL-3.0 License
7 *
8 * @package Html2pdf
9 * @author Laurent MINGUET <webmaster@html2pdf.fr>
10 * @copyright 2017 Laurent MINGUET
11 */
12namespace Spipu\Html2Pdf\Parsing;
13
14use Spipu\Html2Pdf\Exception\HtmlParsingException;
15
/**
 * Class Html
 *
 * Builds, from a list of tokens, the flat list of Node actions stored in
 * $code, and offers helpers to pre-process raw HTML strings.
 */
class Html
{
    // NOTE(review): the extracted listing shows a single space here, but the
    // listing's whitespace looks collapsed - confirm the intended replacement
    // width for a tab character.
    const HTML_TAB = ' ';

    /**
     * parser used to analyze each tag token
     * @var TagParser
     */
    protected $tagParser;

    /**
     * parser used to prepare each text token
     * @var TextParser
     */
    protected $textParser;

    /**
     * are we in a pre ?
     * @var boolean
     */
    protected $tagPreIn = false;

    /**
     * parsed HTML code
     * @var Node[]
     */
    public $code = array();

    /**
     * main constructor
     *
     * @param TextParser $textParser parser used for the texts, also given to the TagParser
     */
    public function __construct(TextParser $textParser)
    {
        $this->textParser = $textParser;
        $this->tagParser = new TagParser($this->textParser);
        $this->code = array();
    }

    /**
     * Get the list of the codes, but cloned
     *
     * Each node is cloned with the "clone" operator; the keys are kept.
     *
     * @return Node[]
     */
    public function getCloneCodes()
    {
        $codes = array();
        foreach ($this->code as $key => $code) {
            $codes[$key] = clone $code;
        }
        return $codes;
    }

    /**
     * parse the HTML code
     *
     * Converts the tokens to actions, trims the texts that touch some tags,
     * removes the texts that became empty, validates that every opened tag
     * has been closed, and saves the result (re-indexed) in $this->code.
     *
     * @param Token[] $tokens A list of tokens to parse
     *
     * @throws HtmlParsingException
     */
    public function parse($tokens)
    {
        // stack of the names of the tags that are currently opened
        $parents = array();

        // flag : are we in a <pre> Tag ?
        $this->tagPreIn = false;

        /**
         * all the actions to do
         * @var Node[] $actions
         */
        $actions = array();

        // get the actions from the html tokens
        foreach ($tokens as $token) {
            $type = $token->getType();
            if ($type === 'code') {
                $found = $this->getTagAction($token, $parents);
            } elseif ($type === 'txt') {
                $found = $this->getTextAction($token);
            } else {
                // other token types do not produce any action
                continue;
            }

            // plain append: same order as a merge, without copying the whole list each time
            foreach ($found as $action) {
                $actions[] = $action;
            }
        }

        // for each identified action, we have to clean up the beginning and the end of the text
        // based on the tags that surround it

        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        // list of the tags to move space
        $tagsToSpace = array(
            'span', 'font', 'label',
            'strong', 'b',
            'address', 'cite', 'em', 'i', 'samp',
            's',
            'ins', 'u',
            'big', 'small', 'sub', 'sup'
        );

        // foreach action
        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            // only the texts have to be cleaned
            if ($actions[$k]->getName() !== 'write') {
                continue;
            }

            // if the tag before the text is a tag to clean => ltrim on the text
            // (the previous slot may have been removed by an earlier iteration)
            if (isset($actions[$k - 1]) && in_array($actions[$k - 1]->getName(), $tagsToClean)) {
                $actions[$k]->setParam('txt', ltrim($actions[$k]->getParam('txt')));
            }

            if ($k < $nb - 1) {
                $nextName = $actions[$k + 1]->getName();

                // if the tag after the text is a tag to clean => rtrim on the text
                if (in_array($nextName, $tagsToClean)) {
                    $actions[$k]->setParam('txt', rtrim($actions[$k]->getParam('txt')));
                }

                // if the tag after the text is a tag with space to move => move the space to the next write
                if (in_array($nextName, $tagsToSpace) && substr($actions[$k]->getParam('txt'), -1) === ' ') {
                    $actions[$k]->setParam('txt', rtrim($actions[$k]->getParam('txt')));
                    for ($subK = $k + 2; $subK < $nb; $subK++) {
                        if ($actions[$subK]->getName() === 'write') {
                            $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
                            break;
                        }
                    }
                }
            }

            // if the text is empty => remove the action
            if (!strlen($actions[$k]->getParam('txt'))) {
                unset($actions[$k]);
            }
        }

        // if we are not on the level 0 => HTML validator ERROR
        if (count($parents)) {
            if (count($parents) > 1) {
                $errorMsg = 'The following tags have not been closed:';
            } else {
                $errorMsg = 'The following tag has not been closed:';
            }

            $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
            $e->setInvalidTag($parents[0]);
            throw $e;
        }

        $this->verifyMustContain($actions, 'thead', 'tr');
        $this->verifyMustContain($actions, 'tfoot', 'tr');

        // save the actions to do, re-indexed because some of them may have been removed
        $this->code = array_values($actions);
    }

    /**
     * Verify some tags that must contain other tags
     *
     * The check is done each time a closing $mainTag is met: at least one
     * $mustTag must have been seen since the last opening $mainTag.
     *
     * @param Node[] $actions list of the actions to verify (only read)
     * @param string $mainTag name of the container tag
     * @param string $mustTag name of the tag that the container must contain
     *
     * @return bool always true when no exception is thrown
     * @throws HtmlParsingException
     */
    protected function verifyMustContain(&$actions, $mainTag, $mustTag)
    {
        $inMainTag = 0;
        $foundMustTag = false;

        foreach ($actions as $action) {
            // opening of the main tag => one level deeper, and the search restarts
            if ($action->getName() == $mainTag && !$action->isClose()) {
                $inMainTag++;
                $foundMustTag = false;
            }

            // the expected tag counts only when we are inside a main tag
            if ($action->getName() == $mustTag && $inMainTag > 0) {
                $foundMustTag = true;
            }

            // closing of the main tag => the expected tag must have been found
            if ($action->getName() == $mainTag && $action->isClose()) {
                if (!$foundMustTag) {
                    $exception = new HtmlParsingException(
                        "The tag [$mainTag] must contain at least one tag [$mustTag]"
                    );
                    $exception->setInvalidTag($action->getName());
                    $exception->setHtmlLine($action->getLine());
                    throw $exception;
                }
                $inMainTag--;
            }
        }

        return true;
    }

    /**
     * get the actions of a Tag token, and validate the opening/closing order
     *
     * An auto-closed tag produces two actions: the opening node, then a
     * clone of it without params and flagged as closing.
     *
     * TODO remove the reference on the $parents variable
     *
     * @param Token $token   the tag token to analyze
     * @param array $parents stack of the names of the opened tags, updated by this method
     *
     * @return array list of the nodes to add to the actions
     * @throws HtmlParsingException
     */
    protected function getTagAction(Token $token, &$parents)
    {
        // tag that can be not closed
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // analyze the HTML code
        $node = $this->tagParser->analyzeTag($token->getData());

        // save the current position in the HTML code
        $node->setLine($token->getLine());

        $actions = array();
        // if the tag must be closed
        if (!in_array($node->getName(), $tagsNotClosed)) {
            // if it is a closure tag
            if ($node->isClose()) {
                // HTML validation
                if (count($parents) < 1) {
                    $e = new HtmlParsingException('Too many tag closures found for ['.$node->getName().']');
                    $e->setInvalidTag($node->getName());
                    $e->setHtmlLine($token->getLine());
                    throw $e;
                } elseif (end($parents) != $node->getName()) {
                    $e = new HtmlParsingException('Tags are closed in a wrong order for ['.$node->getName().']');
                    $e->setInvalidTag($node->getName());
                    $e->setHtmlLine($token->getLine());
                    throw $e;
                } else {
                    array_pop($parents);
                }
            } else {
                // if it is an auto-closed tag
                if ($node->isAutoClose()) {
                    // save the opened tag
                    $actions[] = $node;

                    // prepare the closed tag
                    $node = clone $node;
                    $node->setParams(array());
                    $node->setClose(true);
                } else {
                    // else: add a child for validation
                    array_push($parents, $node->getName());
                }
            }

            // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
            if (($node->getName() === 'pre' || $node->getName() === 'code') && !$node->isAutoClose()) {
                $this->tagPreIn = !$node->isClose();
            }
        }

        // save the actions to convert
        $actions[] = $node;

        return $actions;
    }

    /**
     * get the Text action
     *
     * Outside of a <pre> tag, the text gives a single "write" node.
     * Inside of a <pre> tag, each line gives its own "write" node, and a
     * "br" node is inserted between two lines.
     *
     * @param Token $token the text token
     *
     * @return array list of the nodes to add to the actions
     */
    protected function getTextAction(Token $token)
    {
        // if we are not in a <pre> tag => the whole text is one action
        if (!$this->tagPreIn) {
            return array(
                new Node('write', array('txt' => $this->textParser->prepareTxt($token->getData())), false)
            );
        }

        // else (if we are in a <pre> tag)

        // action to use between each line of the content of a <pre> Tag
        $tagPreBr = new Node('br', array('style' => array(), 'num' => 0), false);

        // prepare the text: the carriage returns are dropped, one entry per line
        $data = str_replace("\r", '', $token->getData());
        $lines = explode("\n", $data);

        $actions = array();

        // foreach line of the text
        foreach ($lines as $k => $txt) {
            // transform the line: the tabs and the spaces must be kept as they are
            $txt = str_replace("\t", self::HTML_TAB, $txt);
            $txt = str_replace(' ', '&nbsp;', $txt);

            // add a break line before each line, except the first one
            if ($k > 0) {
                $actions[] = clone $tagPreBr;
            }

            // save the action
            $actions[] = new Node('write', array('txt' => $this->textParser->prepareTxt($txt, false)), false);
        }

        return $actions;
    }

    /**
     * get a full level of HTML, between an opening and closing corresponding
     *
     * The node at the position $k and its corresponding closing node are not
     * part of the result. If the node at the position $k is a text, the
     * result contains only this text. If the end of the code is reached
     * before the corresponding closing node, the collected nodes are returned.
     *
     * @param integer $k position, in $this->code, of the opening node
     * @return array actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return array();
        }

        // the tag to detect
        $detect = $this->code[$k]->getName();

        // if it is a text => return
        if ($detect === 'write') {
            return array($this->code[$k]);
        }

        $level = 0;      // depth level
        $end = false;    // end of the search
        $code = array(); // extract code

        // while it's not ended
        while (!$end) {
            // current action
            /** @var Node $node */
            $node = $this->code[$k];

            // if 'write' => we add the text
            if ($node->getName() === 'write') {
                $code[] = $node;
            } else { // else, it is a html tag
                $not = false; // flag for not taking into account the current tag

                // if it is the searched tag
                if ($node->getName() == $detect) {
                    // if we are just at the root level => dont take it
                    if ($level == 0) {
                        $not = true;
                    }

                    // update the level
                    $level += ($node->isClose() ? -1 : 1);

                    // if we are now at the root level => it is the end, and dont take it
                    if ($level == 0) {
                        $not = true;
                        $end = true;
                    }
                }

                // if we can take into account the current tag => save it
                if (!$not) {
                    $code[] = $node;
                }
            }

            // it continues as long as there is code to analyze
            if (isset($this->code[$k + 1])) {
                $k++;
            } else {
                $end = true;
            }
        }

        // return the extract
        return $code;
    }

    /**
     * prepare the HTML
     *
     * If the HTML contains a body tag, it is first converted with
     * getHtmlFromRealPage(). Then the constants [[date_y]], [[date_m]],
     * [[date_d]], [[date_h]], [[date_i]] and [[date_s]] are replaced by the
     * corresponding part of the current date.
     *
     * @param string $html
     *
     * @return string
     */
    public function prepareHtml($html)
    {
        // if it is a real html page, we have to convert it
        if (preg_match('/<body/isU', $html)) {
            $html = $this->getHtmlFromRealPage($html);
        }

        // the clock is read only once, to get values that are consistent with each other
        $now = time();

        // replace some constants
        $constants = array(
            '[[date_y]]' => date('Y', $now),
            '[[date_m]]' => date('m', $now),
            '[[date_d]]' => date('d', $now),
            '[[date_h]]' => date('H', $now),
            '[[date_i]]' => date('i', $now),
            '[[date_s]]' => date('s', $now),
        );

        return str_replace(array_keys($constants), array_values($constants), $html);
    }

    /**
     * convert the HTML of a real page, to a code adapted to Html2Pdf
     *
     * The content of the body tag is wrapped in a page tag (the attributes
     * of the body tag are kept), and the link tags and the style tags found
     * in the original HTML are added before it.
     *
     * @param string $html HTML code of a real page
     * @return string HTML adapted to Html2Pdf (the given HTML if it has no body tag)
     */
    protected function getHtmlFromRealPage($html)
    {
        // set body tag to lower case, whatever the case used (BODY, Body, ...)
        $html = preg_replace('/<(\/?)body/i', '<$1body', $html);

        // explode from the body tag. If no body tag => end
        $res = explode('<body', $html);
        if (!isset($res[1])) {
            return $html;
        }

        // the html content is between body tag opening and closing
        $content = '<page'.$res[1];
        $content = explode('</body', $content);
        $content = $content[0].'</page>';

        // extract the link tags from the original html
        // and add them before the content
        preg_match_all('/<link ([^>]*)[\/]?>/isU', $html, $match);
        foreach ($match[1] as $src) {
            $content = '<link '.$src.'/>'.$content;
        }

        // extract the css style tags from the original html
        // and add them before the content
        preg_match_all('/<style[^>]*>(.*)<\/style[^>]*>/isU', $html, $match);
        foreach ($match[0] as $src) {
            $content = $src.$content;
        }

        return $content;
    }
}
Note: See TracBrowser for help on using the repository browser.