1 | <?php
|
---|
2 | /**
|
---|
3 | * HTML2PDF Library - parsingHtml class
|
---|
4 | *
|
---|
5 | * HTML => PDF converter
|
---|
6 | * distributed under the LGPL License
|
---|
7 | *
|
---|
8 | * @author Laurent MINGUET <webmaster@html2pdf.fr>
|
---|
9 | * @version 4.03
|
---|
10 | */
|
---|
11 |
|
---|
class HTML2PDF_parsingHtml
{
    /** @var string HTML code to parse (HTML comments are removed by setHTML) */
    protected $_html     = '';

    /** @var int counter incremented for every opening ul / ol / table tag */
    protected $_num      = 0;

    /** @var array stack holding the number of each currently open ul / ol / table */
    protected $_level    = array(0);

    /** @var string character encoding handed to html_entity_decode() */
    protected $_encoding = '';

    /** @var array list of actions produced by parse() */
    public $code         = array();

    // replacement used for a tab character inside <pre> / <code> content
    // NOTE(review): confirm the intended width of this replacement string
    const HTML_TAB = ' ';

    /**
     * main constructor
     *
     * @param  string $encoding character encoding used when decoding entities
     * @access public
     */
    public function __construct($encoding = 'UTF-8')
    {
        $this->_num   = 0;
        $this->_level = array($this->_num);
        $this->_html  = '';
        $this->code   = array();
        $this->setEncoding($encoding);
    }

    /**
     * change the encoding
     *
     * @param  string $encoding character encoding used when decoding entities
     * @access public
     */
    public function setEncoding($encoding)
    {
        $this->_encoding = $encoding;
    }

    /**
     * Define the HTML code to parse. HTML comments are removed first.
     *
     * @param  string $html HTML code
     * @access public
     */
    public function setHTML($html)
    {
        $html = (string) $html;

        // remove the HTML in comment
        $stripped = preg_replace('/<!--(.*)-->/isU', '', $html);

        // preg_replace() returns null on a PCRE failure (e.g. backtrack limit):
        // keep the unstripped HTML rather than silently parsing an empty document
        $this->_html = ($stripped === null) ? $html : $stripped;
    }

    /**
     * Parse the HTML code and store the resulting list of actions in $this->code.
     *
     * Each action is an array with at least the keys 'name', 'close' and 'param'.
     * Text is stored as an action named 'write' whose 'param' holds a 'txt' entry.
     *
     * @throws HTML2PDF_exception code 3: closing tag met while no tag is open
     * @throws HTML2PDF_exception code 4: closing tag does not match the last opened tag
     * @throws HTML2PDF_exception code 5: some tags are still open at the end of the HTML
     * @access public
     */
    public function parse()
    {
        // stack of the currently opened tags, used for validation
        $parents = array();

        // flag : are we in a <pre> (or <code>) tag ?
        $tagPreIn = false;

        // action inserted between the lines of the content of a <pre> tag
        $tagPreBr = array(
            'name'  => 'br',
            'close' => false,
            'param' => array(
                'style' => array(),
                'num'   => 0
            )
        );

        // tags that do not need to be closed (never pushed on the validation stack)
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // split the HTML into tags and texts
        $parts = array();
        $this->_searchCode($parts);

        // all the actions to do
        $actions = array();

        foreach ($parts as $part) {
            if ($part[0] === 'code') {
                $res = $this->_analiseCode($part[1]);

                if (!$res) {
                    // not a real HTML tag => it is handled as text below
                    $part[0] = 'txt';
                } else {
                    // save the position of the tag in the HTML code (for error messages)
                    $res['html_pos'] = $part[2];

                    // if the tag must be closed
                    if (!in_array($res['name'], $tagsNotClosed, true)) {
                        if ($res['close']) {
                            // HTML validation: it must close the last opened tag
                            if (count($parents) < 1) {
                                throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
                            }
                            if (end($parents) !== $res['name']) {
                                throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
                            }
                            array_pop($parents);
                        } elseif ($res['autoclose']) {
                            // autoclosed tag: save the opening action now, the same
                            // array flagged as closed is saved just below
                            $actions[] = $res;

                            // NOTE(review): 'params' (plural) is not read anywhere in this
                            // class, and 'param' stays populated on the closing action.
                            // Kept as is - confirm the intent before changing it.
                            $res['params'] = array();
                            $res['close']  = true;
                        } else {
                            // opening tag: remember it for validation
                            $parents[] = $res['name'];
                        }

                        // a not-autoclosed <pre> / <code> tag switches the preformatted mode
                        if (($res['name'] === 'pre' || $res['name'] === 'code') && !$res['autoclose']) {
                            $tagPreIn = !$res['close'];
                        }
                    }

                    // save the action
                    $actions[] = $res;
                }
            }

            if ($part[0] === 'txt') {
                if (!$tagPreIn) {
                    // normal text: whitespace runs are collapsed by _prepareTxt
                    $actions[] = array(
                        'name'  => 'write',
                        'close' => false,
                        'param' => array('txt' => $this->_prepareTxt($part[1])),
                    );
                } else {
                    // preformatted text: one 'write' action per line, separated by 'br'
                    $lines = explode("\n", str_replace("\r", '', $part[1]));

                    foreach ($lines as $k => $line) {
                        $line = str_replace("\t", self::HTML_TAB, $line);

                        // spaces become non-breaking spaces (decoded by _prepareTxt) so that
                        // the trimming pass below does not remove the indentation of the line
                        $line = str_replace(' ', '&nbsp;', $line);

                        if ($k > 0) {
                            $actions[] = $tagPreBr;
                        }

                        $actions[] = array(
                            'name'  => 'write',
                            'close' => false,
                            'param' => array('txt' => $this->_prepareTxt($line, false)),
                        );
                    }
                }
            }
        }

        // the texts that touch one of these tags are trimmed on the side of the tag
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            if ($actions[$k]['name'] !== 'write') {
                continue;
            }

            $txt = $actions[$k]['param']['txt'];

            // the previous action may already have been removed by this loop (an emptied
            // text): in that case there is no tag before, and nothing to trim on the left
            if ($k > 0 && isset($actions[$k-1]) && in_array($actions[$k-1]['name'], $tagsToClean, true)) {
                $txt = ltrim($txt);
            }

            if ($k < $nb-1 && in_array($actions[$k+1]['name'], $tagsToClean, true)) {
                $txt = rtrim($txt);
            }

            // an empty text is useless => remove the action
            if (!strlen($txt)) {
                unset($actions[$k]);
            } else {
                $actions[$k]['param']['txt'] = $txt;
            }
        }

        // if some tags are still open => HTML validator ERROR
        if (count($parents)) {
            throw new HTML2PDF_exception(5, $parents);
        }

        // save the actions to do, re-indexed from 0
        $this->code = array_values($actions);
    }

    /**
     * prepare a text: optional whitespace collapsing, then HTML entity decoding
     *
     * @param  string  $txt    text to prepare
     * @param  boolean $spaces true => replace each run of whitespace by a single space
     * @return string  prepared text
     * @access protected
     */
    protected function _prepareTxt($txt, $spaces = true)
    {
        if ($spaces) {
            $txt = preg_replace('/\s+/is', ' ', $txt);
        }

        // map the euro entity to the euro sign; a euro sign already present is left as is
        // NOTE(review): the replacement is the UTF-8 encoded sign - confirm this is what
        // the rest of the library expects when a non UTF-8 encoding is configured
        $txt = str_replace('&euro;', "\xE2\x82\xAC", $txt);

        return html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
    }

    /**
     * split the HTML code into a list of tags and texts
     *
     * Each entry of the result is either array('txt', text)
     * or array('code', tag, offset of the tag in the HTML code).
     *
     * @param  &array $tmp array's result (overwritten)
     * @return null
     */
    protected function _searchCode(&$tmp)
    {
        $tmp = array();

        // a tag, or a run of text, or a lone "<" that does not start a tag
        $reg = '/<[^>]+>|[^<]+|</s';

        if (!preg_match_all($reg, (string) $this->_html, $found, PREG_OFFSET_CAPTURE)) {
            return;
        }

        // text accumulated since the last tag
        $str = '';

        foreach ($found[0] as $chunk) {
            $content = $chunk[0];

            // only a tag match starts with "<" and is longer than one character
            if ($content[0] === '<' && strlen($content) > 1) {
                // save the previous text if it exists
                if ($str !== '') {
                    $tmp[] = array('txt', $str);
                }

                // save the tag, with its offset
                $tmp[] = array('code', $content, $chunk[1]);
                $str = '';
            } else {
                // text, or a stray "<" that is kept as text instead of being lost
                $str .= $content;
            }
        }

        // if a text is present in the end, we save it
        if ($str !== '') {
            $tmp[] = array('txt', $str);
        }
    }

    /**
     * analise a HTML tag
     *
     * Reads the name of the tag and its attributes, converts the presentation
     * attributes (width, height, align, valign, bgcolor, border, bordercolor) into
     * style declarations, and splits the style into an array property => value.
     *
     * @param  string $code HTML code of the tag to analise
     * @return array|null corresponding action ('name', 'close', 'autoclose', 'param'),
     *                    or null if the code is not a tag
     */
    protected function _analiseCode($code)
    {
        // name of the tag, opening, closure, autoclosure
        $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
        if (!preg_match('/'.$tag.'/isU', $code, $match)) {
            return null;
        }
        $close     = ($match[1] === '/');
        $autoclose = preg_match('/\/>$/isU', $code);
        $name      = strtolower($match[2]);

        // required parameters (depends on the tag name)
        $param = array();
        $param['style'] = '';
        if ($name === 'img') {
            $param['alt'] = '';
            $param['src'] = '';
        }
        if ($name === 'a') {
            $param['href'] = '';
        }

        // read the parameters, in this order: name=value, name="value", name='value'
        // (a later form overwrites an earlier one for the same name)
        $props = array(
            '([a-zA-Z0-9_]+)=([^"\'\s>]+)',
            '([a-zA-Z0-9_]+)=["]([^"]*)["]',
            "([a-zA-Z0-9_]+)=[']([^']*)[']",
        );
        foreach ($props as $prop) {
            preg_match_all('/'.$prop.'/is', $code, $match);
            foreach ($match[1] as $k => $attribute) {
                $param[trim(strtolower($attribute))] = trim($match[2][$k]);
            }
        }

        // the declarations built from the attributes are appended to the inline style:
        // make sure that it ends with ";" so that its last declaration is not merged
        // with the first appended one
        $inlineStyle = trim($param['style']);
        if ($inlineStyle !== '' && substr($inlineStyle, -1) !== ';') {
            $inlineStyle .= ';';
        }
        $param['style'] = $inlineStyle;

        // compliance of each parameter
        $color  = "#000000";
        $border = null;
        foreach ($param as $key => $val) {
            $key = strtolower($key);
            switch ($key) {
                case 'width':
                    // NOTE(review): "px" is appended whatever the value is - confirm
                    // how values such as "50%" are expected to be handled
                    unset($param[$key]);
                    $param['style'] .= 'width: '.$val.'px; ';
                    break;

                case 'align':
                    // on a table, the attribute is kept as a parameter
                    if ($name === 'img') {
                        unset($param[$key]);
                        $param['style'] .= 'float: '.$val.'; ';
                    } elseif ($name !== 'table') {
                        unset($param[$key]);
                        $param['style'] .= 'text-align: '.$val.'; ';
                    }
                    break;

                case 'valign':
                    unset($param[$key]);
                    $param['style'] .= 'vertical-align: '.$val.'; ';
                    break;

                case 'height':
                    unset($param[$key]);
                    $param['style'] .= 'height: '.$val.'px; ';
                    break;

                case 'bgcolor':
                    unset($param[$key]);
                    $param['style'] .= 'background: '.$val.'; ';
                    break;

                case 'bordercolor':
                    unset($param[$key]);
                    $color = $val;
                    break;

                case 'border':
                    unset($param[$key]);
                    if (preg_match('/^[0-9]+$/isU', $val)) {
                        $val = $val.'px';
                    }
                    $border = $val;
                    break;

                case 'cellpadding':
                case 'cellspacing':
                    if (preg_match('/^([0-9]+)$/isU', $val)) {
                        $param[$key] = $val.'px';
                    }
                    break;

                case 'colspan':
                case 'rowspan':
                    // keep only the digits, with 1 as the minimal value
                    $val = preg_replace('/[^0-9]/isU', '', $val);
                    if (!$val) {
                        $val = 1;
                    }
                    $param[$key] = $val;
                    break;
            }
        }

        // compliance of the border (built after the loop: it needs the border color)
        if ($border !== null) {
            if ($border) {
                $border = 'border: solid '.$border.' '.$color;
            } else {
                $border = 'border: none';
            }

            $param['style'] .= $border.'; ';
            $param['border'] = $border;
        }

        // reading styles: decomposition and standardization
        $styles = array();
        foreach (explode(';', $param['style']) as $declaration) {
            // split on the first ":" only, the value may contain some others
            $pair = explode(':', $declaration, 2);
            if (count($pair) === 2) {
                $styles[trim(strtolower($pair[0]))] = preg_replace('/[\s]+/isU', ' ', trim($pair[1]));
            }
        }
        $param['style'] = $styles;

        $isLevelTag = in_array($name, array('ul', 'ol', 'table'), true);

        // opening of a ul / ol / table: add a level
        if ($isLevelTag && !$close) {
            $this->_num++;
            $this->_level[] = $this->_num;
        }

        // number of the ul / ol / table containing the element
        if (!isset($param['num'])) {
            $param['num'] = count($this->_level) ? end($this->_level) : null;
        }

        // closure of a ul / ol / table: remove a level
        if ($isLevelTag && $close) {
            array_pop($this->_level);
        }

        // prepare the text parameters
        foreach (array('value', 'alt', 'title', 'class') as $key) {
            if (isset($param[$key])) {
                $param[$key] = $this->_prepareTxt($param[$key]);
            }
        }

        // return the new action to do
        return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
    }

    /**
     * get a full level of HTML, between an opening tag and its corresponding closing tag
     *
     * The opening tag at the given key and its closing tag are not part of the result.
     * If the action at the given key is a text, the result contains only this action.
     *
     * @param  integer $k key of the opening action in $this->code
     * @return array   actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return array();
        }

        // the tag to detect
        $detect = $this->code[$k]['name'];

        // if it is a text => return it alone
        if ($detect === 'write') {
            return array($this->code[$k]);
        }

        $level = 0;       // depth level, counted on the detected tag only
        $code  = array(); // extracted actions

        // it continues as long as there is some code to analise
        for (; isset($this->code[$k]); $k++) {
            $row = $this->code[$k];

            // texts are always taken
            if ($row['name'] === 'write') {
                $code[] = $row;
                continue;
            }

            if ($row['name'] === $detect) {
                // the tag that opens the root level is not taken
                $skip = ($level === 0);

                $level += ($row['close'] ? -1 : 1);

                // back at the root level: it is the end, and the closing tag is not taken
                if ($level === 0) {
                    break;
                }

                if ($skip) {
                    continue;
                }
            }

            // NOTE(review): the actions built by parse() keep their style in
            // $row['param']['style'], so this check looks like it never matches.
            // Kept as is - confirm the intent before changing it.
            if (isset($row['style']['text-align'])) {
                unset($row['style']['text-align']);
            }
            $code[] = $row;
        }

        // return the extract
        return $code;
    }

    /**
     * return a part of the HTML code around a position, for error messages
     *
     * The part is clamped to the beginning of the HTML code: a position closer to
     * the beginning than $before gives a shorter part.
     *
     * @param  integer $pos    position in the HTML code
     * @param  integer $before number of characters to take before the position
     * @param  integer $after  number of characters to take after the position
     * @return string  part of the html code
     */
    public function getHtmlErrorCode($pos, $before=30, $after=40)
    {
        $pos = max(0, (int) $pos);

        // a negative start would make substr() count from the end of the HTML code
        $start  = max(0, $pos - (int) $before);
        $length = max(0, ($pos - $start) + (int) $after);

        return (string) substr($this->_html, $start, $length);
    }
}
|
---|