Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

Html.php@ 394

Last change on this file since 394 was 347, checked in by roby, 3 years ago
Aggiornamento per compatibilità con php7.4
File size: 14.3 KB

Line
1	<?php
2	/**
3	* Html2Pdf Library
4	*
5	* HTML => PDF converter
6	* distributed under the OSL-3.0 License
7	*
8	* @package Html2pdf
9	* @author Laurent MINGUET <webmaster@html2pdf.fr>
10	* @copyright 2017 Laurent MINGUET
11	*/
12	namespace Spipu\Html2Pdf\Parsing;
13
14	use Spipu\Html2Pdf\Exception\HtmlParsingException;
15
/**
 * Class Html
 *
 * Turns the token stream of an HTML document into a flat list of Node
 * "actions" (opening tags, closing tags and 'write' text nodes), validates
 * the nesting of the tags, and offers helpers to prepare raw HTML before
 * it is tokenized.
 */
class Html
{
    /**
     * Replacement used for one tab character inside a <pre> / <code> block.
     *
     * NOTE(review): expressed as HTML entities on the assumption that
     * TextParser::prepareTxt() decodes entities - confirm against TextParser.
     */
    const HTML_TAB = '&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;';

    /**
     * Parser used to analyze the content of a tag token
     * @var TagParser
     */
    protected $tagParser;

    /**
     * Parser used to prepare the text of a text token
     * @var TextParser
     */
    protected $textParser;

    /**
     * Are we currently inside a <pre> (or <code>) tag ?
     * @var boolean
     */
    protected $tagPreIn = false;

    /**
     * Parsed HTML code, as a 0-indexed list of actions
     * @var Node[]
     */
    public $code = array();

    /**
     * Main constructor
     *
     * @param TextParser $textParser text parser, also handed to the internal TagParser
     */
    public function __construct(TextParser $textParser)
    {
        $this->textParser = $textParser;
        $this->tagParser = new TagParser($textParser);
        $this->code = [];
    }

    /**
     * Get the list of the parsed codes, each node being cloned
     * (keys are preserved)
     *
     * @return Node[]
     */
    public function getCloneCodes()
    {
        return array_map(
            function ($node) {
                return clone $node;
            },
            $this->code
        );
    }

    /**
     * Parse the HTML tokens and store the resulting actions in $this->code
     *
     * @param Token[] $tokens A list of tokens to parse
     *
     * @throws HtmlParsingException if tags are left open, closed in the wrong
     *                              order, or if a thead/tfoot contains no tr
     */
    public function parse($tokens)
    {
        // stack of the names of the currently opened tags
        $parents = [];

        // flag : are we in a <pre> Tag ?
        $this->tagPreIn = false;

        /** @var Node[] $actions all the actions to do */
        $actions = [];

        // get the actions from the html tokens
        // (plain appends: both helpers return 0-indexed lists)
        foreach ($tokens as $token) {
            $type = $token->getType();
            if ($type === 'code') {
                $produced = $this->getTagAction($token, $parents);
            } elseif ($type === 'txt') {
                $produced = $this->getTextAction($token);
            } else {
                continue;
            }

            foreach ($produced as $node) {
                $actions[] = $node;
            }
        }

        // clean up the begin and the end of each text, based on the surrounding tags
        $actions = $this->cleanTextActions($actions);

        // if we are not on the level 0 => HTML validator ERROR
        if (count($parents) > 0) {
            $errorMsg = count($parents) > 1
                ? 'The following tags have not been closed:'
                : 'The following tag has not been closed:';

            $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
            $e->setInvalidTag($parents[0]);
            throw $e;
        }

        $this->verifyMustContain($actions, 'thead', 'tr');
        $this->verifyMustContain($actions, 'tfoot', 'tr');

        // save the actions to do, re-indexed because empty texts were removed
        $this->code = array_values($actions);
    }

    /**
     * Trim the 'write' actions according to the tags that surround them,
     * and drop the ones that end up empty.
     *
     * @param Node[] $actions 0-indexed list of actions
     *
     * @return Node[] same actions, keys preserved (removed entries leave gaps)
     */
    private function cleanTextActions(array $actions)
    {
        // tags around which the text is trimmed
        $tagsToClean = [
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option',
        ];

        // tags before which a trailing space is moved to the next text
        // NOTE(review): the original list contained 'cite' twice; the second
        // occurrence may have been meant to be another tag - confirm.
        $tagsToSpace = [
            'span', 'font', 'label',
            'strong', 'b',
            'address', 'cite', 'em', 'i', 'samp',
            's',
            'ins', 'u',
            'big', 'small', 'sub', 'sup',
        ];

        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            $action = $actions[$k];

            // only the texts are concerned
            if ($action->getName() !== 'write') {
                continue;
            }

            $original = $action->getParam('txt');
            $txt = $original;

            // if the tag before the text is a tag to clean => ltrim on the text.
            // The previous entry may not exist (first action) or may have been
            // removed by the previous iteration, hence the isset().
            if (isset($actions[$k - 1]) && in_array($actions[$k - 1]->getName(), $tagsToClean, true)) {
                $txt = ltrim($txt);
            }

            if ($k < $nb - 1) {
                $nextName = $actions[$k + 1]->getName();

                // if the tag after the text is a tag to clean => rtrim on the text
                if (in_array($nextName, $tagsToClean, true)) {
                    $txt = rtrim($txt);
                }

                // if the tag after the text is a tag with space to move
                // => move the trailing space to the beginning of the next text
                if (in_array($nextName, $tagsToSpace, true) && substr($txt, -1) === ' ') {
                    $txt = rtrim($txt);
                    for ($subK = $k + 2; $subK < $nb; $subK++) {
                        if ($actions[$subK]->getName() === 'write') {
                            $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
                            break;
                        }
                    }
                }
            }

            if ($txt !== $original) {
                $action->setParam('txt', $txt);
            }

            // if the text is empty => remove the action
            if ((string) $txt === '') {
                unset($actions[$k]);
            }
        }

        return $actions;
    }

    /**
     * Verify that each occurrence of a tag contains at least one occurrence
     * of another tag
     *
     * @param Node[] $actions list of actions to check (not modified)
     * @param string $mainTag name of the containing tag
     * @param string $mustTag name of the tag that must be found inside
     *
     * @return bool always true when no exception is thrown
     * @throws HtmlParsingException when a $mainTag is closed without any $mustTag inside
     */
    protected function verifyMustContain(&$actions, $mainTag, $mustTag)
    {
        $depth = 0;
        $found = false;

        foreach ($actions as $action) {
            $name = $action->getName();

            // opening of the main tag => start a new search
            if ($name === $mainTag && !$action->isClose()) {
                $depth++;
                $found = false;
            }

            if ($name === $mustTag && $depth > 0) {
                $found = true;
            }

            // closing of the main tag => the required tag must have been seen
            if ($name === $mainTag && $action->isClose()) {
                if (!$found) {
                    $exception = new HtmlParsingException(
                        "The tag [$mainTag] must contain at least one tag [$mustTag]"
                    );
                    $exception->setInvalidTag($name);
                    $exception->setHtmlLine($action->getLine());
                    throw $exception;
                }
                $depth--;
            }
        }

        return true;
    }

    /**
     * Get the action(s) corresponding to a tag token, and validate the nesting
     *
     * TODO remove the reference on the $parents variable
     *
     * @param Token $token   the tag token
     * @param array $parents stack of the names of the opened tags, updated in place
     *
     * @return array one node, or two (opening + closing) for an auto-closed tag
     * @throws HtmlParsingException on an unexpected or badly ordered closing tag
     */
    protected function getTagAction(Token $token, &$parents)
    {
        // tags that do not need to be closed
        $tagsNotClosed = [
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline',
        ];

        // analyze the HTML code
        $node = $this->tagParser->analyzeTag($token->getData());

        // save the current position in the HTML code
        $node->setLine($token->getLine());

        $name = $node->getName();

        // nothing to validate for the tags that can stay opened
        if (in_array($name, $tagsNotClosed, true)) {
            return [$node];
        }

        $actions = [];

        if ($node->isClose()) {
            // HTML validation: the closing tag must match the last opened tag
            if (count($parents) < 1) {
                $e = new HtmlParsingException('Too many tag closures found for ['.$name.']');
                $e->setInvalidTag($name);
                $e->setHtmlLine($token->getLine());
                throw $e;
            }
            if (end($parents) !== $name) {
                $e = new HtmlParsingException('Tags are closed in a wrong order for ['.$name.']');
                $e->setInvalidTag($name);
                $e->setHtmlLine($token->getLine());
                throw $e;
            }
            array_pop($parents);
        } elseif ($node->isAutoClose()) {
            // auto-closed tag: save the opened tag...
            $actions[] = $node;

            // ...and prepare the matching closed tag, without parameters
            $node = clone $node;
            $node->setParams([]);
            $node->setClose(true);
        } else {
            // opening tag: add a child for validation
            $parents[] = $name;
        }

        // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
        $currentName = $node->getName();
        if (($currentName === 'pre' || $currentName === 'code') && !$node->isAutoClose()) {
            $this->tagPreIn = !$node->isClose();
        }

        // save the action to convert
        $actions[] = $node;

        return $actions;
    }

    /**
     * Get the action(s) corresponding to a text token
     *
     * Outside of a <pre> tag, a single 'write' node is returned. Inside a
     * <pre> tag, one 'write' node is returned per line, separated by 'br' nodes.
     *
     * @param Token $token the text token
     *
     * @return array
     */
    protected function getTextAction(Token $token)
    {
        // if we are not in a <pre> tag => a single text
        if (!$this->tagPreIn) {
            return [
                new Node('write', ['txt' => $this->textParser->prepareTxt($token->getData())], false),
            ];
        }

        // else (if we are in a <pre> tag) => one text per line
        $actions = [];
        $data = str_replace("\r", '', $token->getData());

        foreach (explode("\n", $data) as $index => $line) {
            // keep the tabs and the spaces of the line.
            // NOTE(review): assumes prepareTxt() decodes the &nbsp; entities - confirm.
            $line = str_replace("\t", self::HTML_TAB, $line);
            $line = str_replace(' ', '&nbsp;', $line);

            // add a break line between two lines
            if ($index > 0) {
                $actions[] = new Node('br', ['style' => [], 'num' => 0], false);
            }

            // save the action (second argument false, as for every line of a <pre>)
            $actions[] = new Node('write', ['txt' => $this->textParser->prepareTxt($line, false)], false);
        }

        return $actions;
    }

    /**
     * Get a full level of HTML, between an opening tag and its corresponding closing tag
     *
     * The opening tag at position $k and its closing tag are not part of the
     * result. If the action at $k is a text, it is returned alone.
     *
     * @param integer $k position of the opening tag in $this->code
     *
     * @return array actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return [];
        }

        // the tag to detect
        $detect = $this->code[$k]->getName();

        // if it is a text => return it
        if ($detect === 'write') {
            return [$this->code[$k]];
        }

        $level = 0;  // depth level, for the nested tags of the same name
        $code = [];  // extracted code

        // it continues as long as there is code to analyze
        for (; isset($this->code[$k]); $k++) {
            /** @var Node $node */
            $node = $this->code[$k];

            // texts and other tags are always taken
            if ($node->getName() !== $detect) {
                $code[] = $node;
                continue;
            }

            // the searched tag met at the root level is not taken
            $wasAtRoot = ($level === 0);

            // update the level
            $level += ($node->isClose() ? -1 : 1);

            // back to the root level => it is the end, and the tag is not taken
            if ($level === 0) {
                break;
            }

            if (!$wasAtRoot) {
                $code[] = $node;
            }
        }

        // return the extract
        return $code;
    }

    /**
     * Prepare the HTML: convert a real page if needed, and replace the
     * [[date_*]] constants
     *
     * @param string $html
     *
     * @return string
     */
    public function prepareHtml($html)
    {
        // if it is a real html page, we have to convert it
        if (stripos($html, '<body') !== false) {
            $html = $this->getHtmlFromRealPage($html);
        }

        // replace some constants, all computed from the same instant so that
        // the values stay coherent with each other
        $now = time();
        $constants = [
            '[[date_y]]' => date('Y', $now),
            '[[date_m]]' => date('m', $now),
            '[[date_d]]' => date('d', $now),
            '[[date_h]]' => date('H', $now),
            '[[date_i]]' => date('i', $now),
            '[[date_s]]' => date('s', $now),
        ];

        return str_replace(array_keys($constants), array_values($constants), $html);
    }

    /**
     * Convert the HTML of a real page, to a code adapted to Html2Pdf
     *
     * The content of the body becomes a <page> tag (keeping the attributes of
     * the body tag), preceded by the <style> blocks and the <link> tags found
     * in the page. The body tag is matched whatever its case.
     *
     * @param string $html HTML code of a real page
     *
     * @return string HTML adapted to Html2Pdf (the input itself if it has no body tag)
     */
    protected function getHtmlFromRealPage($html)
    {
        // split on the body tag, whatever its case. If no body tag => nothing to convert
        $parts = preg_split('/<body/i', $html);
        if ($parts === false || !isset($parts[1])) {
            return $html;
        }

        // the html content is between the body tag opening and closing
        $content = '<page'.$parts[1];
        $chunks = preg_split('/<\/body/i', $content);
        if ($chunks !== false) {
            $content = $chunks[0];
        }
        $content .= '</page>';

        // extract the link tags from the original html, and add them before the content
        // (each one is prepended, so they end up in reverse order)
        if (preg_match_all('/<link ([^>]*)[\/]?>/isU', $html, $match)) {
            foreach ($match[1] as $src) {
                $content = '<link '.$src.'/>'.$content;
            }
        }

        // extract the css style tags from the original html, and add them before the content
        if (preg_match_all('/<style[^>]*>(.*)<\/style[^>]*>/isU', $html, $match)) {
            foreach ($match[0] as $src) {
                $content = $src.$content;
            }
        }

        return $content;
    }
}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/client/inc/hpdf5/spipu/html2pdf/src/Parsing/Html.php@ 394

Download in other formats: