Context Navigation

Html.php

Last change on this file was 347, checked in by roby, 4 years ago
Aggiornamento per compatibilità con php7.4
File size: 14.3 KB

Rev	Line
[347]	1	<?php
	2	/**
	3	* Html2Pdf Library
	4	*
	5	* HTML => PDF converter
	6	* distributed under the OSL-3.0 License
	7	*
	8	* @package Html2pdf
	9	* @author Laurent MINGUET <webmaster@html2pdf.fr>
	10	* @copyright 2017 Laurent MINGUET
	11	*/
	12	namespace Spipu\Html2Pdf\Parsing;
	13
	14	use Spipu\Html2Pdf\Exception\HtmlParsingException;
	15
	/**
	 * Class Html
	 *
	 * Converts a list of HTML tokens into a flat list of actions (Node objects):
	 * one Node per opening/closing tag and one 'write' Node per piece of text.
	 * It also validates the tag nesting and offers helpers to pre-process a raw
	 * HTML string before it is tokenized.
	 */
	class Html
	{
	    // NOTE(review): this value looks like it was collapsed to a single space
	    // by an HTML rendering/extraction step; it is presumably meant to be a
	    // run of non-breaking spaces - confirm against the library's repository.
	    const HTML_TAB = ' ';

	    /**
	     * @var TagParser
	     */
	    protected $tagParser;

	    /**
	     * @var TextParser
	     */
	    protected $textParser;

	    /**
	     * are we currently inside a <pre> (or <code>) tag ?
	     * @var boolean
	     */
	    protected $tagPreIn = false;

	    /**
	     * parsed HTML code
	     * @var Node[]
	     */
	    public $code = [];

	    /**
	     * main constructor
	     *
	     * @param TextParser $textParser parser used for the texts, also given to the tag parser
	     */
	    public function __construct(TextParser $textParser)
	    {
	        $this->textParser = $textParser;
	        $this->tagParser = new TagParser($this->textParser);
	        $this->code = [];
	    }

	    /**
	     * Get the list of the codes, but cloned (same keys, each node cloned)
	     *
	     * @return Node[]
	     */
	    public function getCloneCodes()
	    {
	        $clones = [];
	        foreach ($this->code as $index => $node) {
	            $clones[$index] = clone $node;
	        }

	        return $clones;
	    }

	    /**
	     * parse the HTML code
	     *
	     * Builds the list of actions from the tokens, cleans the spaces of the
	     * texts according to the surrounding tags, validates the structure and
	     * stores the re-indexed result in $this->code.
	     *
	     * @param Token[] $tokens A list of tokens to parse
	     *
	     * @throws HtmlParsingException if tags are left open, closed in a wrong
	     *                              order, or if a thead/tfoot has no tr
	     */
	    public function parse($tokens)
	    {
	        // stack of the names of the currently opened tags
	        $parents = [];

	        // flag : are we in a <pre> Tag ?
	        $this->tagPreIn = false;

	        /**
	         * all the actions to do
	         * @var Node[] $actions
	         */
	        $actions = [];

	        // get the actions from the html tokens ('code' => tag, 'txt' => text)
	        foreach ($tokens as $token) {
	            $type = $token->getType();
	            if ($type === 'code') {
	                $newActions = $this->getTagAction($token, $parents);
	            } elseif ($type === 'txt') {
	                $newActions = $this->getTextAction($token);
	            } else {
	                continue;
	            }

	            // append one by one: avoids re-copying the whole list on every token
	            foreach ($newActions as $newAction) {
	                $actions[] = $newAction;
	            }
	        }

	        // clean up the beginning and the end of the texts, based on the tags that surround them
	        $actions = $this->cleanTextActions($actions);

	        // if we are not on the level 0 => HTML validator ERROR
	        if (count($parents) > 0) {
	            $errorMsg = count($parents) > 1
	                ? 'The following tags have not been closed:'
	                : 'The following tag has not been closed:';

	            $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
	            $e->setInvalidTag($parents[0]);
	            throw $e;
	        }

	        $this->verifyMustContain($actions, 'thead', 'tr');
	        $this->verifyMustContain($actions, 'tfoot', 'tr');

	        // save the actions to do (re-indexed, because empty texts have been removed)
	        $this->code = array_values($actions);
	    }

	    /**
	     * Clean the spaces of the 'write' actions, based on their neighbours
	     *
	     * - text after a "tag to clean"  => left trim
	     * - text before a "tag to clean" => right trim
	     * - text ending with a space before a "tag to space" => the space is
	     *   moved to the beginning of the next text
	     * - text that ends up empty => removed (the keys are NOT re-indexed here)
	     *
	     * @param Node[] $actions list of actions, indexed from 0 without holes
	     *
	     * @return Node[] the same nodes, without the empty texts
	     */
	    private function cleanTextActions(array $actions)
	    {
	        // list of the tags to clean
	        $tagsToClean = [
	            'page', 'page_header', 'page_footer', 'form',
	            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
	            'div', 'hr', 'p', 'ul', 'ol', 'li',
	            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
	            'bookmark', 'fieldset', 'legend',
	            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
	            'option',
	        ];

	        // list of the tags to move space
	        $tagsToSpace = [
	            'span', 'font', 'label',
	            'strong', 'b',
	            'address', 'cite', 'em', 'i', 'samp',
	            's',
	            'ins', 'u',
	            'big', 'small', 'sub', 'sup',
	        ];

	        $nb = count($actions);
	        for ($k = 0; $k < $nb; $k++) {
	            $action = $actions[$k];

	            // only the texts are concerned
	            if ($action->getName() !== 'write') {
	                continue;
	            }

	            // if the tag before the text is a tag to clean => ltrim on the text
	            // (isset: the previous entry may have been removed as an empty text)
	            if ($k > 0
	                && isset($actions[$k - 1])
	                && in_array($actions[$k - 1]->getName(), $tagsToClean, true)
	            ) {
	                $action->setParam('txt', ltrim($action->getParam('txt')));
	            }

	            if ($k < $nb - 1) {
	                $nextName = $actions[$k + 1]->getName();

	                // if the tag after the text is a tag to clean => rtrim on the text
	                if (in_array($nextName, $tagsToClean, true)) {
	                    $action->setParam('txt', rtrim($action->getParam('txt')));
	                }

	                // if the tag after the text is a tag with space to move => move the space to the next write
	                if (in_array($nextName, $tagsToSpace, true)
	                    && substr($action->getParam('txt'), -1) === ' '
	                ) {
	                    $action->setParam('txt', rtrim($action->getParam('txt')));
	                    for ($subK = $k + 2; $subK < $nb; $subK++) {
	                        if ($actions[$subK]->getName() === 'write') {
	                            $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
	                            break;
	                        }
	                    }
	                }
	            }

	            // if the text is empty => remove the action
	            if (strlen($action->getParam('txt')) === 0) {
	                unset($actions[$k]);
	            }
	        }

	        return $actions;
	    }

	    /**
	     * Build a parsing exception that carries the invalid tag and its line
	     *
	     * @param string $message exception message
	     * @param string $tagName name of the invalid tag
	     * @param mixed  $line    line of the tag, as given by the token / node
	     *
	     * @return HtmlParsingException
	     */
	    private function createTagException($message, $tagName, $line)
	    {
	        $exception = new HtmlParsingException($message);
	        $exception->setInvalidTag($tagName);
	        $exception->setHtmlLine($line);

	        return $exception;
	    }

	    /**
	     * Verify some tags that must contain other tags
	     *
	     * @param Node[] $actions
	     * @param string $mainTag tag that must contain the other one
	     * @param string $mustTag tag that must be found inside each $mainTag
	     *
	     * @return bool always true when no exception is thrown
	     * @throws HtmlParsingException if a $mainTag is closed without any $mustTag found since it was opened
	     */
	    protected function verifyMustContain(&$actions, $mainTag, $mustTag)
	    {
	        $inMainTag = 0;
	        $foundMustTag = false;

	        foreach ($actions as $action) {
	            $name = $action->getName();
	            $isMainTag = ($name === $mainTag);

	            // opening of the main tag => a new search begins
	            if ($isMainTag && !$action->isClose()) {
	                $inMainTag++;
	                $foundMustTag = false;
	            }

	            if ($name === $mustTag && $inMainTag > 0) {
	                $foundMustTag = true;
	            }

	            // closing of the main tag => the must tag has to be found
	            if ($isMainTag && $action->isClose()) {
	                if (!$foundMustTag) {
	                    throw $this->createTagException(
	                        "The tag [$mainTag] must contain at least one tag [$mustTag]",
	                        $name,
	                        $action->getLine()
	                    );
	                }
	                $inMainTag--;
	            }
	        }

	        return true;
	    }

	    /**
	     * get the action(s) of a tag token, and keep the stack of opened tags up to date
	     *
	     * An auto-closed tag (that is not in the "can be not closed" list)
	     * produces two actions: the opening node, and a closing clone without params.
	     *
	     * TODO remove the reference on the $parents variable
	     *
	     * @param Token $token
	     * @param array $parents names of the currently opened tags, updated by reference
	     *
	     * @return array
	     * @throws HtmlParsingException if a tag is closed too many times or in a wrong order
	     */
	    protected function getTagAction(Token $token, &$parents)
	    {
	        // tags that can be not closed
	        $tagsNotClosed = [
	            'br', 'hr', 'img', 'col',
	            'input', 'link', 'option',
	            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline',
	        ];

	        // analyze the HTML code
	        $node = $this->tagParser->analyzeTag($token->getData());

	        // save the current position in the HTML code
	        $node->setLine($token->getLine());

	        $name = $node->getName();
	        $actions = [];

	        // if the tag must be closed
	        if (!in_array($name, $tagsNotClosed, true)) {
	            if ($node->isClose()) {
	                // closure tag => HTML validation against the last opened tag
	                if (count($parents) < 1) {
	                    throw $this->createTagException(
	                        'Too many tag closures found for ['.$name.']',
	                        $name,
	                        $token->getLine()
	                    );
	                }
	                if (end($parents) !== $name) {
	                    throw $this->createTagException(
	                        'Tags are closed in a wrong order for ['.$name.']',
	                        $name,
	                        $token->getLine()
	                    );
	                }
	                array_pop($parents);
	            } elseif ($node->isAutoClose()) {
	                // auto-closed tag => save the opened tag...
	                $actions[] = $node;

	                // ...and prepare the closed tag (saved below)
	                $node = clone $node;
	                $node->setParams([]);
	                $node->setClose(true);
	            } else {
	                // opening tag => add a child for validation
	                $parents[] = $name;
	            }

	            // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
	            if (($name === 'pre' || $name === 'code') && !$node->isAutoClose()) {
	                $this->tagPreIn = !$node->isClose();
	            }
	        }

	        // save the action to convert
	        $actions[] = $node;

	        return $actions;
	    }

	    /**
	     * get the Text action
	     *
	     * Outside of a <pre> tag: a single 'write' action.
	     * Inside of a <pre> tag: one 'write' action per line, separated by 'br' actions.
	     *
	     * @param Token $token
	     *
	     * @return array
	     */
	    protected function getTextAction(Token $token)
	    {
	        // if we are not in a <pre> tag => one single text
	        if (!$this->tagPreIn) {
	            return [
	                new Node('write', ['txt' => $this->textParser->prepareTxt($token->getData())], false),
	            ];
	        }

	        // action to use between each line of the content of a <pre> Tag
	        $tagPreBr = new Node('br', ['style' => [], 'num' => 0], false);

	        // prepare the text: line endings normalized, then one entry per line
	        $lines = explode("\n", str_replace("\r", '', $token->getData()));

	        $actions = [];
	        foreach ($lines as $k => $txt) {
	            // transform the line: tabs and spaces must not be collapsed.
	            // The original replacement (' ' by ' ') did nothing; presumably
	            // prepareTxt() decodes the entity - TODO confirm.
	            $txt = str_replace("\t", self::HTML_TAB, $txt);
	            $txt = str_replace(' ', '&nbsp;', $txt);

	            // add a break line before every line but the first one
	            if ($k > 0) {
	                $actions[] = clone $tagPreBr;
	            }

	            // save the action
	            $actions[] = new Node('write', ['txt' => $this->textParser->prepareTxt($txt, false)], false);
	        }

	        return $actions;
	    }

	    /**
	     * get a full level of HTML, between an opening and closing corresponding
	     *
	     * The tag found at the position $k and its corresponding closing tag
	     * are not part of the result. If no corresponding closing tag is found,
	     * everything up to the end of the code is returned.
	     *
	     * @param integer $k position of the opening tag in $this->code
	     * @return array actions
	     */
	    public function getLevel($k)
	    {
	        // if the code does not exist => return empty
	        if (!isset($this->code[$k])) {
	            return [];
	        }

	        // the tag to detect
	        $detect = $this->code[$k]->getName();

	        // if it is a text => return it alone
	        if ($detect === 'write') {
	            return [$this->code[$k]];
	        }

	        $level = 0;  // depth level
	        $code = [];  // extracted code

	        while (true) {
	            /** @var Node $node */
	            $node = $this->code[$k];

	            $keep = true;  // do we take the current action into account ?
	            $end = false;  // is it the end of the search ?

	            // if it is the searched tag ($detect can not be 'write' here)
	            if ($node->getName() === $detect) {
	                // if we are just at the root level => dont take it
	                if ($level === 0) {
	                    $keep = false;
	                }

	                // update the level
	                $level += ($node->isClose() ? -1 : 1);

	                // if we are now at the root level => it is the end, and dont take it
	                if ($level === 0) {
	                    $keep = false;
	                    $end = true;
	                }
	            }

	            if ($keep) {
	                $code[] = $node;
	            }

	            // it continues as long as there is code to analyze
	            if ($end || !isset($this->code[$k + 1])) {
	                break;
	            }
	            $k++;
	        }

	        // return the extract
	        return $code;
	    }

	    /**
	     * prepare the HTML
	     *
	     * Converts a full HTML page (with a body tag) to the expected format,
	     * then replaces the [[date_*]] placeholders by the current date parts.
	     *
	     * @param string $html
	     *
	     * @return string
	     */
	    public function prepareHtml($html)
	    {
	        // if it is a real html page, we have to convert it
	        if (preg_match('/<body/isU', $html)) {
	            $html = $this->getHtmlFromRealPage($html);
	        }

	        // replace some constants, all based on the same instant
	        $now = time();
	        $constants = [
	            '[[date_y]]' => date('Y', $now),
	            '[[date_m]]' => date('m', $now),
	            '[[date_d]]' => date('d', $now),
	            '[[date_h]]' => date('H', $now),
	            '[[date_i]]' => date('i', $now),
	            '[[date_s]]' => date('s', $now),
	        ];

	        return str_replace(array_keys($constants), array_values($constants), $html);
	    }

	    /**
	     * convert the HTML of a real page, to a code adapted to Html2Pdf
	     *
	     * The content of the body is wrapped in a page tag (the attributes of
	     * the body tag are kept), and the link and style tags found anywhere
	     * in the original HTML are added before it.
	     *
	     * @param string $html HTML code of a real page
	     * @return string HTML adapted to Html2Pdf (unchanged if there is no body tag)
	     */
	    protected function getHtmlFromRealPage($html)
	    {
	        // set body tag to lower case, whatever its original case is (<BODY, <Body, ...)
	        $html = str_ireplace('<body', '<body', $html);
	        $html = str_ireplace('</body', '</body', $html);

	        // explode from the body tag. If no body tag => end
	        $res = explode('<body', $html);
	        if (!isset($res[1])) {
	            return $html;
	        }

	        // the html content is between body tag opening and closing
	        $content = explode('</body', '<page'.$res[1]);
	        $content = $content[0].'</page>';

	        // extract the link tags from the original html
	        // and add them before the content
	        preg_match_all('/<link ([^>]*)[\/]?>/isU', $html, $match);
	        foreach ($match[1] as $src) {
	            $content = '<link '.$src.'/>'.$content;
	        }

	        // extract the css style tags from the original html
	        // and add them before the content
	        preg_match_all('/<style[^>]*>(.*)<\/style[^>]*>/isU', $html, $match);
	        foreach ($match[0] as $src) {
	            $content = $src.$content;
	        }

	        return $content;
	    }
	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/client/inc/hpdf5/spipu/html2pdf/src/Parsing/Html.php

Download in other formats: