source: trunk/client/inc/hpdf403/_class/parsingHtml.class.php@ 429

Last change on this file since 429 was 349, checked in by roby, 4 years ago
File size: 17.3 KB
Line 
1<?php
2/**
3 * HTML2PDF Library - parsingHtml class
4 *
5 * HTML => PDF convertor
6 * distributed under the LGPL License
7 *
8 * @author Laurent MINGUET <webmaster@html2pdf.fr>
9 * @version 4.03
10 */
11
/**
 * Splits an HTML string into a flat list of "actions" (opening tags, closing
 * tags and text chunks) stored in the public $code property.
 *
 * Typical use: setHTML($html), then parse(), then read $code or getLevel($k).
 */
class HTML2PDF_parsingHtml
{
    /** @var string HTML code to parse (HTML comments already removed by setHTML) */
    protected $_html = '';

    /** @var int counter incremented each time a table, ul or ol is opened */
    protected $_num = 0;

    /** @var array stack of table/list numbers; the last entry is the innermost one */
    protected $_level = array(0);

    /** @var string encoding handed to html_entity_decode() */
    protected $_encoding = '';

    /** @var array parsed HTML code: list of actions produced by parse() */
    public $code = array();

    // replacement used for each tab character found inside <pre> / <code> content
    const HTML_TAB = ' ';

    /**
     * main constructor
     *
     * @param string $encoding encoding used when decoding HTML entities
     * @access public
     */
    public function __construct($encoding = 'UTF-8')
    {
        $this->_num   = 0;
        $this->_level = array($this->_num);
        $this->_html  = '';
        $this->code   = array();
        $this->setEncoding($encoding);
    }

    /**
     * change the encoding
     *
     * @param string $encoding encoding used when decoding HTML entities
     * @access public
     */
    public function setEncoding($encoding)
    {
        $this->_encoding = $encoding;
    }

    /**
     * Define the HTML code to parse
     *
     * HTML comments (<!-- ... -->) are stripped before the code is stored.
     *
     * @param string $html HTML code
     * @access public
     */
    public function setHTML($html)
    {
        // remove the HTML in comment
        $html = preg_replace('/<!--(.*)-->/isU', '', $html);

        // save the HTML code
        $this->_html = $html;
    }

    /**
     * parse the HTML code previously given to setHTML() and fill $this->code
     *
     * Each action is an array with at least 'name', 'close' and 'param'.
     * Text is stored as an action named 'write' with the text in param['txt'].
     *
     * @throws HTML2PDF_exception code 3 when a closing tag is met while no tag is opened,
     *                            code 4 when a closing tag does not match the last opened tag,
     *                            code 5 when some tags are still opened at the end
     * @access public
     */
    public function parse()
    {
        // stack of the names of the currently opened tags (nesting validation)
        $parents = array();

        // flag : are we in a <pre> (or <code>) tag ?
        $tagPreIn = false;

        // action inserted between two lines of the content of a <pre> tag
        $tagPreBr = array(
            'name'  => 'br',
            'close' => false,
            'param' => array(
                'style' => array(),
                'num'   => 0
            )
        );

        // tags that do not need to be closed : they are skipped by the nesting validation
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // split the HTML code into tags and texts
        $tmp = array();
        $this->_searchCode($tmp);

        // all the actions to do
        $actions = array();

        foreach ($tmp as $part) {
            // if it is a tag code
            if ($part[0]=='code') {
                $res = $this->_analiseCode($part[1]);

                if ($res) {
                    // save the current position in the HTML code (used in error messages)
                    $res['html_pos'] = $part[2];

                    // if the tag must be closed => validate the nesting
                    if (!in_array($res['name'], $tagsNotClosed, true)) {
                        if ($res['close']) {
                            if (count($parents)<1) {
                                throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
                            } else if ($parents[count($parents)-1]!=$res['name']) {
                                throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
                            } else {
                                array_pop($parents);
                            }
                        } else if ($res['autoclose']) {
                            // autoclosed tag : save the opening action now,
                            // the same array is then reused as the closing action
                            $actions[] = $res;

                            // NOTE(review): this sets a 'params' key while the rest of the class
                            // uses 'param', so the closing action keeps the opening 'param'.
                            // Kept unchanged because consumers may rely on it - confirm before fixing.
                            $res['params'] = array();
                            $res['close'] = true;
                        } else {
                            // opening tag : remember it for the validation
                            $parents[] = $res['name'];
                        }

                        // <pre> or <code> tag, not autoclosed => update the flag
                        if (($res['name']=='pre' || $res['name']=='code') && !$res['autoclose']) {
                            $tagPreIn = !$res['close'];
                        }
                    }

                    // save the action
                    $actions[] = $res;
                } else {
                    // it is not a real HTML tag => it is handled as text just below
                    $part[0]='txt';
                }
            }

            // if it is text
            if ($part[0]=='txt') {
                if (!$tagPreIn) {
                    // normal text : whitespace is collapsed by _prepareTxt
                    $actions[] = array(
                        'name'  => 'write',
                        'close' => false,
                        'param' => array('txt' => $this->_prepareTxt($part[1])),
                    );
                } else {
                    // text inside <pre> : one 'write' per line, separated by a 'br' action
                    $lines = explode("\n", str_replace("\r", '', $part[1]));

                    foreach ($lines as $k => $txt) {
                        // keep the tabs and the spaces of the line
                        $txt = str_replace("\t", self::HTML_TAB, $txt);
                        $txt = str_replace(' ', '&nbsp;', $txt);

                        // add a break line
                        if ($k>0) $actions[] = $tagPreBr;

                        // save the action
                        $actions[] = array(
                            'name'  => 'write',
                            'close' => false,
                            'param' => array('txt' => $this->_prepareTxt($txt, false)),
                        );
                    }
                }
            }
        }

        // for each text action, clean up the begin and the end of the text
        // based on the tags that surround it

        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        $nb = count($actions);
        for ($k=0; $k<$nb; $k++) {
            if ($actions[$k]['name']!=='write') {
                continue;
            }

            $txt = $actions[$k]['param']['txt'];

            // the previous action may have been removed by this very loop (empty text) :
            // isset() avoids reading a missing entry, a removed neighbour is simply ignored
            if (isset($actions[$k-1]) && in_array($actions[$k-1]['name'], $tagsToClean, true)) {
                $txt = ltrim($txt);
            }

            // if the tag after the text is a tag to clean => rtrim on the text
            if (isset($actions[$k+1]) && in_array($actions[$k+1]['name'], $tagsToClean, true)) {
                $txt = rtrim($txt);
            }

            // if the text is empty => remove the action
            if (!strlen($txt)) {
                unset($actions[$k]);
            } else {
                $actions[$k]['param']['txt'] = $txt;
            }
        }

        // if some tags are still opened => HTML validator ERROR
        if (count($parents)) throw new HTML2PDF_exception(5, $parents);

        // save the actions to do, reindexed from 0 (some of them may have been removed)
        $this->code = array_values($actions);
    }

    /**
     * prepare the text : optional whitespace collapsing, then HTML entity decoding
     *
     * @param string  $txt    text
     * @param boolean $spaces true => replace each run of space, \t, \r, \n by a single space
     * @return string prepared text
     * @access protected
     */
    protected function _prepareTxt($txt, $spaces = true)
    {
        if ($spaces) $txt = preg_replace('/\s+/is', ' ', $txt);
        $txt = str_replace('&euro;', '€', $txt);
        $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
        return $txt;
    }

    /**
     * split the HTML code into tag parts and text parts
     *
     * The result is a list of array('code', tag, offset of the tag in the HTML)
     * and array('txt', text), in the order they appear in the HTML.
     *
     * @param array &$tmp array's result (overwritten)
     * @return null
     */
    protected function _searchCode(&$tmp)
    {
        // initialise the array
        $tmp = array();

        // regexp to separate the tags from the texts.
        // Greedy on purpose : a whole run of text is taken by one call,
        // instead of one call per byte.
        $reg = '/(<[^>]+>)|([^<]+)/is';

        // text found since the last tag
        $str = '';
        $offset = 0;

        // as long as it finds a match
        while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
            if ($parse[1][0]!=='') {
                // it is a tag : save the previous text if it exists
                if ($str!=='') $tmp[] = array('txt', $str);

                // save the tag, with its offset in the HTML
                $tmp[] = array('code', trim($parse[1][0]), $parse[1][1]);

                // init the current text
                $str = '';
            } else {
                // it is a text : a lone "<" that starts no tag is skipped by the regexp,
                // so the text around it is glued together here
                $str.= $parse[2][0];
            }

            // update the offset to the end of the match
            $offset = $parse[0][1] + strlen($parse[0][0]);
            unset($parse);
        }

        // if a text is present in the end, we save it
        if ($str!=='') $tmp[] = array('txt', $str);
    }

    /**
     * add the "px" unit to the value of a width / height attribute
     *
     * A value that already ends with "%" or with letters (px, mm, pt, ...) is returned as is.
     *
     * @param string $val value of the attribute
     * @return string value usable in a style
     */
    protected function _addPixelUnit($val)
    {
        if (preg_match('/(%|[a-z]+)$/i', $val)) {
            return $val;
        }

        return $val.'px';
    }

    /**
     * analyse a HTML tag
     *
     * Side effect : updates the table/list counter ($_num) and stack ($_level).
     *
     * @param string $code HTML code of the tag, "<" and ">" included
     * @return array|null corresponding action (name, close, autoclose, param),
     *                    or null if the code is not a tag
     */
    protected function _analiseCode($code)
    {
        // name of the tag, opening, closure, autoclosure
        $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
        if (!preg_match('/'.$tag.'/isU', $code, $match)) return null;
        $close     = ($match[1]=='/' ? true : false);
        $autoclose = preg_match('/\/>$/isU', $code);
        $name      = strtolower($match[2]);

        // required parameters (depends on the tag name)
        $param = array();
        $param['style'] = '';
        if ($name=='img') {
            $param['alt'] = '';
            $param['src'] = '';
        }
        if ($name=='a') {
            $param['href'] = '';
        }

        // read the parameters, in this order : name=value, name="value", name='value'
        // (a later form overwrites an earlier one for the same name)
        $props = array(
            '([a-zA-Z0-9_]+)=([^"\'\s>]+)',
            '([a-zA-Z0-9_]+)=["]([^"]*)["]',
            "([a-zA-Z0-9_]+)=[']([^']*)[']",
        );
        foreach ($props as $prop) {
            preg_match_all('/'.$prop.'/is', $code, $found);
            foreach ($found[1] as $k => $attribute) {
                $param[trim(strtolower($attribute))] = trim($found[2][$k]);
            }
        }

        // compliance of each parameter : the presentation attributes are moved into the style
        $color  = "#000000";
        $border = null;
        foreach ($param as $key => $val) {
            $key = strtolower($key);
            switch($key)
            {
                case 'width':
                case 'height':
                    unset($param[$key]);
                    $param['style'] .= $key.': '.$this->_addPixelUnit($val).'; ';
                    break;

                case 'align':
                    // for a table, the align attribute is left untouched
                    if ($name==='img') {
                        unset($param[$key]);
                        $param['style'] .= 'float: '.$val.'; ';
                    } elseif ($name!=='table') {
                        unset($param[$key]);
                        $param['style'] .= 'text-align: '.$val.'; ';
                    }
                    break;

                case 'valign':
                    unset($param[$key]);
                    $param['style'] .= 'vertical-align: '.$val.'; ';
                    break;

                case 'bgcolor':
                    unset($param[$key]);
                    $param['style'] .= 'background: '.$val.'; ';
                    break;

                case 'bordercolor':
                    unset($param[$key]);
                    $color = $val;
                    break;

                case 'border':
                    unset($param[$key]);
                    if (preg_match('/^[0-9]+$/isU', $val)) $val = $val.'px';
                    $border = $val;
                    break;

                case 'cellpadding':
                case 'cellspacing':
                    if (preg_match('/^([0-9]+)$/isU', $val)) $param[$key] = $val.'px';
                    break;

                case 'colspan':
                case 'rowspan':
                    // keep only the digits, with 1 as fallback
                    $val = preg_replace('/[^0-9]/isU', '', $val);
                    if (!$val) $val = 1;
                    $param[$key] = $val;
                    break;
            }
        }

        // compliance of the border : built after the loop, because bordercolor
        // can be read after border
        if ($border!==null) {
            if ($border) $border = 'border: solid '.$border.' '.$color;
            else         $border = 'border: none';

            $param['style'] .= $border.'; ';
            $param['border'] = $border;
        }

        // reading styles : "name: value; name: value" => array(name => value)
        $styles = explode(';', $param['style']);
        $param['style'] = array();
        foreach ($styles as $style) {
            // cut on the first ":" only, the value can contain some other ":"
            $tmp = explode(':', $style, 2);
            if (count($tmp)>1) {
                $param['style'][trim(strtolower($tmp[0]))] = preg_replace('/[\s]+/isU', ' ', trim($tmp[1]));
            }
        }

        $isLevelTag = in_array($name, array('ul', 'ol', 'table'), true);

        // opening of a table / list : new number, pushed on the stack
        if ($isLevelTag && !$close) {
            $this->_num++;
            $this->_level[] = $this->_num;
        }

        // number of the table / list containing the element
        // (read before the removal below : a closing tag gets the number of its own table)
        if (!isset($param['num'])) {
            $param['num'] = $this->_level[count($this->_level)-1];
        }

        // closure of a table / list : remove a level, the root level is always kept
        if ($isLevelTag && $close && count($this->_level)>1) {
            array_pop($this->_level);
        }

        // prepare the parameters that contain text
        if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
        if (isset($param['alt']))   $param['alt']   = $this->_prepareTxt($param['alt']);
        if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
        if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);

        // return the new action to do
        return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
    }

    /**
     * get a full level of HTML : the actions between the tag at position $k
     * and its matching closing tag, both excluded
     *
     * For a text action, the action itself is returned.
     *
     * @param integer $k key of the opening action in $this->code
     * @return array actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) return array();

        // the tag to detect
        $detect = $this->code[$k]['name'];

        // if it is a text => return
        if ($detect=='write') {
            return array($this->code[$k]);
        }

        $level = 0;       // depth level, counted on the tags named $detect only
        $end   = false;   // end of the search
        $code  = array(); // extract code

        while (!$end) {
            // current action
            $row = $this->code[$k];

            if ($row['name']=='write') {
                // text => always taken
                $code[] = $row;
            } else {
                $not = false; // flag for not taking into account the current tag

                if ($row['name']==$detect) {
                    // root level before the update => it is the opening tag itself
                    if ($level==0) {
                        $not = true;
                    }

                    // update the level
                    $level+= ($row['close'] ? -1 : 1);

                    // back to the root level => it is the matching closing tag : the end
                    if ($level==0) {
                        $not = true;
                        $end = true;
                    }
                }

                if (!$not) {
                    // NOTE(review): the actions built by parse() keep their styles in
                    // ['param']['style'] ; this top-level 'style' key is never set there,
                    // so this looks like a no-op - confirm the intent before changing it
                    if (isset($row['style']['text-align'])) unset($row['style']['text-align']);
                    $code[] = $row;
                }
            }

            // it continues as long as there is code to analyse
            if (isset($this->code[$k+1]))
                $k++;
            else
                $end = true;
        }

        // return the extract
        return $code;
    }

    /**
     * return a part of the HTML code, for error message
     *
     * The window starts $before bytes before $pos and stops $after bytes after it.
     * It is cut at the beginning of the HTML when $pos is smaller than $before.
     *
     * @param integer $pos    position (byte offset in the HTML)
     * @param integer $before take before
     * @param integer $after  take after
     * @return string part of the html code
     */
    public function getHtmlErrorCode($pos, $before=30, $after=40)
    {
        $start  = $pos-$before;
        $length = $before+$after;

        // a negative start would make substr() count from the END of the HTML
        if ($start<0) {
            $length = max(0, $length+$start);
            $start  = 0;
        }

        return (string) substr($this->_html, $start, $length);
    }
}
Note: See TracBrowser for help on using the repository browser.