[347] | 1 | <?php
|
---|
| 2 | /**
|
---|
| 3 | * Html2Pdf Library
|
---|
| 4 | *
|
---|
| 5 | * HTML => PDF converter
|
---|
| 6 | * distributed under the OSL-3.0 License
|
---|
| 7 | *
|
---|
| 8 | * @package Html2pdf
|
---|
| 9 | * @author Laurent MINGUET <webmaster@html2pdf.fr>
|
---|
| 10 | * @copyright 2017 Laurent MINGUET
|
---|
| 11 | */
|
---|
| 12 | namespace Spipu\Html2Pdf\Parsing;
|
---|
| 13 |
|
---|
| 14 | use Spipu\Html2Pdf\Exception\HtmlParsingException;
|
---|
| 15 |
|
---|
/**
 * Class Html
 *
 * Turns a list of lexer tokens into a flat list of actions (Node objects),
 * stored in $code: one Node per opening/closing tag and one 'write' Node per
 * piece of text. It also offers helpers to extract a sub-level of that list
 * and to prepare raw HTML before it is tokenized.
 */
class Html
{
    /**
     * Text substituted for a tab character inside <pre>/<code> content.
     *
     * NOTE(review): this copy of the file shows a single space here; runs of
     * whitespace may have been collapsed when the file was extracted - confirm
     * the intended width against the upstream source.
     */
    const HTML_TAB = ' ';

    /**
     * Parser used to analyze the tag tokens
     * @var TagParser
     */
    protected $tagParser;

    /**
     * Parser used to prepare the text tokens
     * @var TextParser
     */
    protected $textParser;

    /**
     * are we in a pre ?
     * @var boolean
     */
    protected $tagPreIn = false;

    /**
     * parsed HTML code
     * @var Node[]
     */
    public $code = array();

    /**
     * main constructor
     *
     * @param TextParser $textParser
     */
    public function __construct(TextParser $textParser)
    {
        $this->textParser = $textParser;
        $this->tagParser  = new TagParser($this->textParser);
        $this->code       = array();
    }

    /**
     * Get the list of the codes, but cloned
     *
     * Each Node is cloned individually, so the caller can modify the returned
     * nodes without touching the ones stored in $code. Keys are preserved.
     *
     * @return Node[]
     */
    public function getCloneCodes()
    {
        // array_map keeps the keys when it is given exactly one array
        return array_map(
            function ($node) {
                return clone $node;
            },
            $this->code
        );
    }

    /**
     * parse the HTML code
     *
     * Converts the tokens into actions, cleans the white spaces of the texts
     * depending on the surrounding tags, validates that every tag has been
     * closed and that thead / tfoot contain a tr, then stores the result
     * (re-indexed from 0) in $code.
     *
     * @param Token[] $tokens A list of tokens to parse
     *
     * @throws HtmlParsingException
     */
    public function parse($tokens)
    {
        // stack of the names of the currently opened tags, updated by getTagAction
        $parents = array();

        // flag : are we in a <pre> Tag ?
        $this->tagPreIn = false;

        /**
         * all the actions to do
         * @var Node[] $actions
         */
        $actions = array();

        // get the actions from the html tokens
        // (append one by one instead of array_merge, to avoid copying the whole list for each token)
        foreach ($tokens as $token) {
            $type = $token->getType();
            if ($type === 'code') {
                $newActions = $this->getTagAction($token, $parents);
            } elseif ($type === 'txt') {
                $newActions = $this->getTextAction($token);
            } else {
                // the other token types do not produce any action
                continue;
            }

            foreach ($newActions as $newAction) {
                $actions[] = $newAction;
            }
        }

        // clean up the begin and the end of each text, based on the tags that surround it
        $actions = $this->cleanTextActions($actions);

        // if we are not on the level 0 => HTML validator ERROR
        $nbParents = count($parents);
        if ($nbParents > 0) {
            $errorMsg = ($nbParents > 1)
                ? 'The following tags have not been closed:'
                : 'The following tag has not been closed:';

            $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
            $e->setInvalidTag($parents[0]);
            throw $e;
        }

        $this->verifyMustContain($actions, 'thead', 'tr');
        $this->verifyMustContain($actions, 'tfoot', 'tr');

        // save the actions to do (array_values closes the gaps left by the removed texts)
        $this->code = array_values($actions);
    }

    /**
     * Clean the white spaces of the 'write' actions, based on the surrounding tags
     *
     * - text just after a "tag to clean"  => ltrim
     * - text just before a "tag to clean" => rtrim
     * - text ending with a space just before a "tag to space" => the space is
     *   moved to the beginning of the next text
     * - text that ends up empty => the action is removed (its key is not reused)
     *
     * @param Node[] $actions
     *
     * @return Node[] the same actions, without the empty texts (keys not re-indexed)
     */
    private function cleanTextActions(array $actions)
    {
        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        // list of the tags to move space
        $tagsToSpace = array(
            'span', 'font', 'label',
            'strong', 'b',
            'address', 'cite', 'em', 'i', 'samp',
            's',
            'ins', 'u',
            'big', 'small', 'sub', 'sup'
        );

        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            $current = $actions[$k];

            // only the texts have to be cleaned
            if ($current->getName() !== 'write') {
                continue;
            }

            $initialTxt = $current->getParam('txt');
            $txt = $initialTxt;

            // if the tag before the text is a tag to clean => ltrim on the text
            // The isset is needed because a previous empty text may already have
            // been removed by this loop, which leaves a gap at $k - 1.
            // NOTE(review): whether two texts can really be adjacent depends on the lexer - confirm.
            if (isset($actions[$k - 1]) && in_array($actions[$k - 1]->getName(), $tagsToClean, true)) {
                $txt = ltrim($txt);
            }

            if ($k < $nb - 1) {
                $nextName = $actions[$k + 1]->getName();

                // if the tag after the text is a tag to clean => rtrim on the text
                if (in_array($nextName, $tagsToClean, true)) {
                    $txt = rtrim($txt);
                }

                // if the tag after the text is a tag with space to move => move the space to the next write
                if (in_array($nextName, $tagsToSpace, true) && substr($txt, -1) === ' ') {
                    $txt = rtrim($txt);
                    for ($subK = $k + 2; $subK < $nb; $subK++) {
                        if ($actions[$subK]->getName() === 'write') {
                            $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
                            break;
                        }
                    }
                }
            }

            if ($txt !== $initialTxt) {
                $current->setParam('txt', $txt);
            }

            // if the text is empty => remove the action
            if ((string) $txt === '') {
                unset($actions[$k]);
            }
        }

        return $actions;
    }

    /**
     * Verify some tags that must contain other tags
     *
     * Each time a $mainTag is closed, at least one $mustTag must have been
     * seen since the last opening of a $mainTag.
     *
     * @param Node[] $actions
     * @param string $mainTag
     * @param string $mustTag
     *
     * @return bool always true when no exception is thrown
     * @throws HtmlParsingException
     */
    protected function verifyMustContain(&$actions, $mainTag, $mustTag)
    {
        $depth = 0;          // number of $mainTag currently opened
        $hasMustTag = false; // has a $mustTag been seen since the last opening ?

        foreach ($actions as $action) {
            $name = $action->getName();
            $isMainTag = ($name === $mainTag);

            if ($isMainTag && !$action->isClose()) {
                $depth++;
                $hasMustTag = false;
            }

            if ($name === $mustTag && $depth > 0) {
                $hasMustTag = true;
            }

            if ($isMainTag && $action->isClose()) {
                if (!$hasMustTag) {
                    $exception = new HtmlParsingException(
                        "The tag [$mainTag] must contain at least one tag [$mustTag]"
                    );
                    $exception->setInvalidTag($name);
                    $exception->setHtmlLine($action->getLine());
                    throw $exception;
                }
                $depth--;
            }
        }

        return true;
    }

    /**
     * get the actions of a tag token, and update the stack of the opened tags
     *
     * - tag that can be not closed (br, img, ...) => the tag itself, no validation
     * - closing tag     => must match the last opened tag, which is popped
     * - auto-closed tag => the opening tag + a cloned closing tag without params
     * - opening tag     => pushed on the stack of the opened tags
     *
     * TODO remove the reference on the $parents variable
     *
     * @param Token $token
     * @param array $parents names of the currently opened tags, modified in place
     *
     * @return array
     * @throws HtmlParsingException
     */
    protected function getTagAction(Token $token, &$parents)
    {
        // tag that can be not closed
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // analyze the HTML code
        $node = $this->tagParser->analyzeTag($token->getData());

        // save the current position in the HTML code
        $node->setLine($token->getLine());

        $name = $node->getName();

        // nothing to validate for the tags that can be not closed
        if (in_array($name, $tagsNotClosed, true)) {
            return array($node);
        }

        $actions = array();

        if ($node->isClose()) {
            // HTML validation of the closure
            if (count($parents) < 1) {
                $e = new HtmlParsingException('Too many tag closures found for ['.$name.']');
                $e->setInvalidTag($name);
                $e->setHtmlLine($token->getLine());
                throw $e;
            }

            if (end($parents) !== $name) {
                $e = new HtmlParsingException('Tags are closed in a wrong order for ['.$name.']');
                $e->setInvalidTag($name);
                $e->setHtmlLine($token->getLine());
                throw $e;
            }

            array_pop($parents);
        } elseif ($node->isAutoClose()) {
            // auto-closed tag: save the opened tag...
            $actions[] = $node;

            // ... and prepare the closed tag, it will be saved below
            $node = clone $node;
            $node->setParams(array());
            $node->setClose(true);
        } else {
            // opening tag: add a child for validation
            $parents[] = $name;
        }

        // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
        if (($name === 'pre' || $name === 'code') && !$node->isAutoClose()) {
            $this->tagPreIn = !$node->isClose();
        }

        // save the actions to convert
        $actions[] = $node;

        return $actions;
    }

    /**
     * get the Text action
     *
     * Outside of a <pre> tag, the text gives a single 'write' action.
     * Inside a <pre> tag, each line of the text gives its own 'write' action,
     * and the lines are separated by a 'br' action.
     *
     * @param Token $token
     *
     * @return array
     */
    protected function getTextAction(Token $token)
    {
        // if we are not in a <pre> tag => only one action
        if (!$this->tagPreIn) {
            return array(
                new Node('write', array('txt' => $this->textParser->prepareTxt($token->getData())), false)
            );
        }

        // action to use between each line of the content of a <pre> Tag
        $lineBreak = new Node('br', array('style' => array(), 'num' => 0), false);

        // prepare the text: only "\n" is used as line separator
        $lines = explode("\n", str_replace("\r", '', $token->getData()));

        $actions = array();
        foreach ($lines as $index => $line) {
            // transform the line
            $line = str_replace("\t", self::HTML_TAB, $line);
            // NOTE(review): as shown in this copy of the file, the search and the
            // replacement are the same single space, which makes this call a no-op.
            // The replacement was probably a non-breaking space that has been lost
            // when the file was extracted - confirm against the upstream source.
            $line = str_replace(' ', ' ', $line);

            // add a break line before each line, except the first one
            if ($index > 0) {
                $actions[] = clone $lineBreak;
            }

            // save the action
            $actions[] = new Node('write', array('txt' => $this->textParser->prepareTxt($line, false)), false);
        }

        return $actions;
    }

    /**
     * get a full level of HTML, between an opening and closing corresponding
     *
     * Starting from the action $k, collects the following actions until the
     * tag found at $k gets back to the depth 0. The tag at $k and its
     * corresponding closing tag are not part of the result. If $k is a text,
     * only this text is returned. If the depth never gets back to 0, everything
     * up to the end of the code is returned.
     *
     * @param integer $k
     * @return array actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return array();
        }

        // the tag to detect
        $detect = $this->code[$k]->getName();

        // if it is a text => return
        if ($detect === 'write') {
            return array($this->code[$k]);
        }

        $level = 0;       // depth level
        $code  = array(); // extract code

        // it continues as long as there has code to analyze
        for ($pos = $k; isset($this->code[$pos]); $pos++) {
            /** @var Node $node */
            $node = $this->code[$pos];

            // a text, or any tag other than the searched one => always taken
            if ($node->getName() !== $detect) {
                $code[] = $node;
                continue;
            }

            // it is the searched tag: update the level
            $wasAtRoot = ($level === 0);
            $level += ($node->isClose() ? -1 : 1);

            // if we are now at the root level => it is the end, and dont take it
            if ($level === 0) {
                break;
            }

            // if we were just at the root level => dont take it
            if (!$wasAtRoot) {
                $code[] = $node;
            }
        }

        // return the extract
        return $code;
    }

    /**
     * prepare the HTML
     *
     * - if the HTML contains a body tag, it is converted with getHtmlFromRealPage
     * - the constants [[date_y]], [[date_m]], [[date_d]], [[date_h]], [[date_i]]
     *   and [[date_s]] are replaced by the parts of the current date
     *
     * @param string $html
     *
     * @return string
     */
    public function prepareHtml($html)
    {
        // if it is a real html page, we have to convert it
        if (preg_match('/<body/i', $html)) {
            $html = $this->getHtmlFromRealPage($html);
        }

        // use one single timestamp, so that all the parts describe the same instant
        // (separate date() calls could straddle a second / minute / day boundary)
        $now = time();

        // replace some constants
        $constants = array(
            '[[date_y]]' => date('Y', $now),
            '[[date_m]]' => date('m', $now),
            '[[date_d]]' => date('d', $now),
            '[[date_h]]' => date('H', $now),
            '[[date_i]]' => date('i', $now),
            '[[date_s]]' => date('s', $now),
        );

        return str_replace(array_keys($constants), array_values($constants), $html);
    }

    /**
     * convert the HTML of a real page, to a code adapted to Html2Pdf
     *
     * The content of the body tag is wrapped in a page tag (the attributes of
     * the body tag are kept), and the link and style tags found anywhere in the
     * original HTML are added before it. The body tag is searched in a
     * case-insensitive way, like the detection made in prepareHtml.
     *
     * @param string $html HTML code of a real page
     * @return string HTML adapted to Html2Pdf, or $html unchanged if it has no body tag
     */
    protected function getHtmlFromRealPage($html)
    {
        // explode from the body tag, whatever its case. If no body tag => nothing to convert
        $parts = preg_split('/<body/i', $html);
        if (!isset($parts[1])) {
            return $html;
        }

        // the html content is between body tag opening and closing
        $bodyParts = preg_split('/<\/body/i', '<page'.$parts[1]);
        $content = $bodyParts[0].'</page>';

        // extract the link tags from the original html
        // and add them before the content
        preg_match_all('/<link ([^>]*)[\/]?>/isU', $html, $match);
        foreach ($match[1] as $src) {
            $content = '<link '.$src.'/>'.$content;
        }

        // extract the css style tags from the original html
        // and add them before the content
        preg_match_all('/<style[^>]*>(.*)<\/style[^>]*>/isU', $html, $match);
        foreach ($match[0] as $src) {
            $content = $src.$content;
        }

        return $content;
    }
}