<?php
/**
* HTML => PDF converter
* distributed under the OSL-3.0 License
*
* @package Html2pdf
* @author Laurent MINGUET
* @copyright 2017 Laurent MINGUET
*/
namespace Spipu\Html2Pdf\Parsing;
use Spipu\Html2Pdf\Exception\HtmlParsingException;
/**
* Class Html
*/
class Html
{
const HTML_TAB = ' ';
/**
* @var TagParser
*/
protected $tagParser;
/**
* @var TextParser
*/
protected $textParser;
/**
* are we in a pre ?
* @var boolean
*/
protected $tagPreIn = false;
/**
* parsed HTML code
* @var Node[]
*/
public $code = array();
/**
 * Build the HTML parser around the given text parser.
 *
 * The text parser is stored on the instance and is also handed to a newly
 * created TagParser. The list of parsed codes starts empty.
 *
 * @param TextParser $textParser
 */
public function __construct(TextParser $textParser)
{
    $this->code       = array();
    $this->textParser = $textParser;
    $this->tagParser  = new TagParser($textParser);
}
/**
 * Return a copy of the parsed codes in which every node has been cloned.
 *
 * The keys of the internal list are kept as they are.
 *
 * @return Node[]
 */
public function getCloneCodes()
{
    // array_map keeps the keys when it receives a single array
    return array_map(
        function ($node) {
            return clone $node;
        },
        $this->code
    );
}
/**
 * Parse a list of HTML tokens and store the resulting actions in $this->code.
 *
 * Steps:
 *  - each 'code' token is converted with getTagAction(), each 'txt' token
 *    with getTextAction(); tokens of any other type are ignored,
 *  - each text ('write') action is trimmed depending on the tags around it,
 *  - the text actions that end up empty are removed,
 *  - an exception is thrown if some tags are still opened,
 *  - thead and tfoot are checked to contain at least one tr.
 *
 * @param Token[] $tokens A list of tokens to parse
 *
 * @return void
 * @throws HtmlParsingException
 */
public function parse($tokens)
{
    $parents = array();

    // flag : are we in a <pre> Tag ?
    $this->tagPreIn = false;

    /**
     * all the actions to do
     * @var Node[] $actions
     */
    $actions = array();

    // get the actions from the html tokens
    // (appending one by one avoids rebuilding the whole list for each token)
    foreach ($tokens as $token) {
        if ($token->getType() === 'code') {
            $newActions = $this->getTagAction($token, $parents);
        } elseif ($token->getType() === 'txt') {
            $newActions = $this->getTextAction($token);
        } else {
            continue;
        }

        foreach ($newActions as $newAction) {
            $actions[] = $newAction;
        }
    }

    // for each identified action, we have to clean up the begin and the end of the text
    // based on tags that surround it

    // list of the tags to clean
    $tagsToClean = array(
        'page', 'page_header', 'page_footer', 'form',
        'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
        'div', 'hr', 'p', 'ul', 'ol', 'li',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'bookmark', 'fieldset', 'legend',
        'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
        'option'
    );

    // list of the tags to move space
    // NOTE(review): 'cite' was listed twice in this list; the duplicate has been
    // dropped. Maybe another tag (e.g. 'del') was intended - confirm.
    $tagsToSpace = array(
        'span', 'font', 'label',
        'strong', 'b',
        'address', 'cite', 'em', 'i', 'samp',
        's',
        'ins', 'u',
        'big', 'small', 'sub', 'sup'
    );

    // foreach action
    $nb = count($actions);
    for ($k = 0; $k < $nb; $k++) {
        // only the text actions have to be cleaned
        if ($actions[$k]->getName() !== 'write') {
            continue;
        }

        // if the tag before the text is a tag to clean => ltrim on the text.
        // The previous action may have been removed by an earlier iteration
        // (empty text), so its existence is checked before using it.
        if ($k > 0
            && isset($actions[$k - 1])
            && in_array($actions[$k - 1]->getName(), $tagsToClean, true)
        ) {
            $actions[$k]->setParam('txt', ltrim($actions[$k]->getParam('txt')));
        }

        if ($k < $nb - 1) {
            $nextName = $actions[$k + 1]->getName();

            // if the tag after the text is a tag to clean => rtrim on the text
            if (in_array($nextName, $tagsToClean, true)) {
                $actions[$k]->setParam('txt', rtrim($actions[$k]->getParam('txt')));
            }

            // if the tag after the text is a tag with space to move => move the space to the next write
            if (in_array($nextName, $tagsToSpace, true)
                && substr($actions[$k]->getParam('txt'), -1) === ' '
            ) {
                $actions[$k]->setParam('txt', rtrim($actions[$k]->getParam('txt')));
                for ($subK = $k + 2; $subK < $nb; $subK++) {
                    if ($actions[$subK]->getName() === 'write') {
                        $actions[$subK]->setParam('txt', ' '.ltrim($actions[$subK]->getParam('txt')));
                        break;
                    }
                }
            }
        }

        // if the text is empty => remove the action
        if (!strlen($actions[$k]->getParam('txt'))) {
            unset($actions[$k]);
        }
    }

    // if we are not on the level 0 => HTML validator ERROR
    if (count($parents)) {
        if (count($parents) > 1) {
            $errorMsg = 'The following tags have not been closed:';
        } else {
            $errorMsg = 'The following tag has not been closed:';
        }

        $e = new HtmlParsingException($errorMsg.' '.implode(', ', $parents));
        $e->setInvalidTag($parents[0]);
        throw $e;
    }

    $this->verifyMustContain($actions, 'thead', 'tr');
    $this->verifyMustContain($actions, 'tfoot', 'tr');

    // save the actions to do (re-indexed, because some of them may have been removed)
    $this->code = array_values($actions);
}
/**
 * Check that each $mainTag element contains at least one $mustTag tag.
 *
 * The actions are read in order. A flag is reset every time $mainTag is
 * opened, and raised when $mustTag is met while inside $mainTag. If
 * $mainTag is closed while the flag is not raised, an exception is thrown.
 *
 * @param Node[] $actions list of the actions to check
 * @param string $mainTag name of the containing tag
 * @param string $mustTag name of the tag that must be found inside
 *
 * @return bool true when no exception has been thrown
 * @throws HtmlParsingException
 */
protected function verifyMustContain(&$actions, $mainTag, $mustTag)
{
    $depth = 0;
    $seen  = false;

    foreach ($actions as $node) {
        $name   = $node->getName();
        $isMain = ($name == $mainTag);

        // opening of the main tag => one level deeper, nothing seen yet
        if ($isMain && !$node->isClose()) {
            $depth++;
            $seen = false;
        }

        // the required tag has been met inside the main tag
        if ($name == $mustTag && $depth > 0) {
            $seen = true;
        }

        // only the closing of the main tag remains to be handled
        if (!$isMain || !$node->isClose()) {
            continue;
        }

        if ($seen) {
            $depth--;
            continue;
        }

        $exception = new HtmlParsingException(
            "The tag [$mainTag] must contain at least one tag [$mustTag]"
        );
        $exception->setInvalidTag($node->getName());
        $exception->setHtmlLine($node->getLine());
        throw $exception;
    }

    return true;
}
/**
 * Convert a 'code' token (an HTML tag) into the list of actions to do.
 *
 * The list of the currently opened tags ($parents) is updated for the
 * validation: an opening tag is added at the end, a closing tag must match
 * the last opened tag and removes it. The tags that can be left not closed
 * (br, hr, img, ...) are not validated.
 *
 * An auto-closed tag gives two actions: the opening node, then a clone of
 * it without params and flagged as a closing node.
 *
 * TODO remove the reference on the $parents variable
 *
 * @param Token $token   the token to convert
 * @param array $parents names of the currently opened tags, updated by this method
 *
 * @return array the actions (Node objects) for this token
 * @throws HtmlParsingException when a tag is closed too many times or in a wrong order
 */
protected function getTagAction(Token $token, &$parents)
{
    // tag that can be not closed
    $tagsNotClosed = array(
        'br', 'hr', 'img', 'col',
        'input', 'link', 'option',
        'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
    );

    // analyze the HTML code
    $node = $this->tagParser->analyzeTag($token->getData());

    // save the current position in the HTML code
    $node->setLine($token->getLine());

    $actions = array();

    // if the tag must be closed
    if (!in_array($node->getName(), $tagsNotClosed, true)) {
        // if it is a closure tag
        if ($node->isClose()) {
            // HTML validation
            if (count($parents) < 1) {
                $e = new HtmlParsingException('Too many tag closures found for ['.$node->getName().']');
                $e->setInvalidTag($node->getName());
                $e->setHtmlLine($token->getLine());
                throw $e;
            } elseif (end($parents) !== $node->getName()) {
                $e = new HtmlParsingException('Tags are closed in a wrong order for ['.$node->getName().']');
                $e->setInvalidTag($node->getName());
                $e->setHtmlLine($token->getLine());
                throw $e;
            } else {
                array_pop($parents);
            }
        } else {
            // if it is an auto-closed tag
            if ($node->isAutoClose()) {
                // save the opened tag
                $actions[] = $node;

                // prepare the closed tag
                $node = clone $node;
                $node->setParams(array());
                $node->setClose(true);
            } else {
                // else: add a child for validation
                $parents[] = $node->getName();
            }
        }

        // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
        if (($node->getName() === 'pre' || $node->getName() === 'code') && !$node->isAutoClose()) {
            $this->tagPreIn = !$node->isClose();
        }
    }

    // save the actions to convert
    $actions[] = $node;

    return $actions;
}
/**
 * Convert a 'txt' token into the list of actions to do.
 *
 * Outside of a <pre> tag, the whole text gives a single 'write' action.
 * Inside of a <pre> tag, the text is split on the line breaks: each line
 * gives a 'write' action, and a 'br' action is inserted between two lines.
 *
 * @param Token $token the token to convert
 *
 * @return array the actions (Node objects) for this token
 */
protected function getTextAction(Token $token)
{
    // if we are not in a <pre> tag => a single text action
    if (!$this->tagPreIn) {
        return array(
            new Node('write', array('txt' => $this->textParser->prepareTxt($token->getData())), false)
        );
    }

    // else (if we are in a <pre> tag)

    // action to use for each line of the content of a <pre> Tag
    $tagPreBr = new Node('br', array('style' => array(), 'num' => 0), false);

    $actions = array();

    // prepare the text
    $data = str_replace("\r", '', $token->getData());
    $lines = explode("\n", $data);

    // foreach line of the text
    foreach ($lines as $k => $txt) {
        // transform the line: the tabs become spaces, and the spaces become
        // non-breaking space entities to keep the layout of the line
        // (replacing a space by a plain space would do nothing).
        // NOTE(review): relies on prepareTxt() decoding the HTML entities - confirm
        $txt = str_replace("\t", self::HTML_TAB, $txt);
        $txt = str_replace(' ', '&nbsp;', $txt);

        // add a break line between two lines
        if ($k > 0) {
            $actions[] = clone $tagPreBr;
        }

        // save the action
        $actions[] = new Node('write', array('txt' => $this->textParser->prepareTxt($txt, false)), false);
    }

    return $actions;
}
/**
 * Extract the actions that are between the tag at position $k and its
 * corresponding closing tag.
 *
 * - if there is no action at position $k, an empty array is returned,
 * - if the action at position $k is a text, it is returned alone,
 * - else the following actions are collected until the depth of the
 *   detected tag comes back to 0, or until the end of the code. The
 *   detected tag itself, at the root level, is not part of the result.
 *
 * @param integer $k position of the first action in $this->code
 * @return array actions
 */
public function getLevel($k)
{
    // nothing at this position => nothing to extract
    if (!isset($this->code[$k])) {
        return array();
    }

    // name of the tag to follow
    $detect = $this->code[$k]->getName();

    // a text has no level => it is returned as it is
    if ($detect === 'write') {
        return array($this->code[$k]);
    }

    $depth   = 0;       // current depth of the detected tag
    $extract = array(); // collected actions

    // read the actions as long as there is some code to analyze
    for (; isset($this->code[$k]); $k++) {
        /** @var Node $node */
        $node = $this->code[$k];

        // the texts and the other tags are always collected
        if ($node->getName() === 'write' || $node->getName() != $detect) {
            $extract[] = $node;
            continue;
        }

        // here, it is the detected tag
        $wasAtRoot = ($depth == 0);
        $depth += ($node->isClose() ? -1 : 1);

        // back to the root level => end of the search, the tag is not collected
        if ($depth == 0) {
            break;
        }

        // the tag that leaves the root level is not collected either
        if (!$wasAtRoot) {
            $extract[] = $node;
        }
    }

    return $extract;
}
/**
 * Prepare the HTML before the parsing.
 *
 * - if the HTML has a body tag, it is considered as a real page and it is
 *   converted with getHtmlFromRealPage(),
 * - the date constants ([[date_y]], [[date_m]], [[date_d]], [[date_h]],
 *   [[date_i]], [[date_s]]) are replaced by the current date values.
 *
 * @param string $html
 *
 * @return string
 */
public function prepareHtml($html)
{
    // if it is a real html page, we have to convert it
    if (preg_match('/<body/isU', $html)) {
        $html = $this->getHtmlFromRealPage($html);
    }

    // replace some constants; all the values come from the same instant,
    // so that they stay consistent if the clock ticks during the call
    $now = time();
    $constants = array(
        '[[date_y]]' => date('Y', $now),
        '[[date_m]]' => date('m', $now),
        '[[date_d]]' => date('d', $now),
        '[[date_h]]' => date('H', $now),
        '[[date_i]]' => date('i', $now),
        '[[date_s]]' => date('s', $now),
    );

    return str_replace(array_keys($constants), array_values($constants), $html);
}
/**
* convert the HTML of a real page, to a code adapted to Html2Pdf
*
* @param string $html HTML code of a real page
* @return string HTML adapted to Html2Pdf
*/
protected function getHtmlFromRealPage($html)
{
// set body tag to lower case
$html = str_replace(' end
$res = explode('';
// extract the link tags from the original html
// and add them before the content
preg_match_all('/]*)[\/]?>/isU', $html, $match);
foreach ($match[1] as $src) {
$content = ''.$content;
}
// extract the css style tags from the original html
// and add them before the content
preg_match_all('/