HtmlDom.php
Go to the documentation of this file.
1<?php
10
/**
 * DOMDocument specialisation for HTML fragments.
 *
 * HTML is parsed leniently, the children of its <body> are copied into an
 * XML document whose root element is <body>, and the result can then be
 * reduced to a whitelist of tags/attributes/classes and shortened to a
 * maximum text length.
 */
class HtmlDom extends \DOMDocument implements \Serializable
{
    /**
     * Tags that are allowed inside of html.
     *
     * An entry may carry a class restriction in the form "tag.classname".
     *
     * @var string[]
     */
    protected $allowedTags = [
        "p",
        "br",
        "h1",
        "h2",
        "ul",
        "ol",
        "li",

        "a",
        "b",
        "strong",
        "i",
        "em",
    ];

    /**
     * Attributes that survive cleanHTML() on allowed tags.
     *
     * @var string[]
     */
    protected $allowedAttributes = [
        'class',
        'href',
        'target',
        'alt',
        'title',
        'data-dbid',
    ];

    /**
     * htmldom class constructor
     *
     * @param string|null $version  XML version; null selects "1.0"
     * @param string|null $encoding document encoding; null selects ""
     */
    public function __construct($version = null, $encoding = null)
    {
        // DOMDocument::__construct() declares non-nullable string parameters;
        // forwarding null is deprecated since PHP 8.1, so the documented
        // defaults are substituted explicitly.
        parent::__construct($version ?? '1.0', $encoding ?? '');
    }

    /**
     * serializes htmldom into string
     *
     * @return string|false the XML representation of the whole document
     */
    public function serialize()
    {
        return $this->saveXML();
    }

    /**
     * unserializes htmldom-objects
     *
     * @param string $serialized XML as produced by serialize()
     */
    public function unserialize($serialized)
    {
        $this->loadXML($serialized);
    }

    /**
     * loads html from a html string
     *
     * The markup is parsed by a temporary DOMDocument; every element found
     * directly below its <body> is then imported into this document, which
     * is reset to an XML document with a single empty <body> root.
     *
     * @param string   $html    HTML markup (fragment or full page)
     * @param int|null $options accepted for signature compatibility with
     *                          DOMDocument::loadHTML(); not evaluated
     *
     * @return bool result of parsing the HTML
     */
    public function loadHTML($html, $options = null)
    {
        $tmpDOM = new \DOMDocument();

        $encoding = mb_http_input();
        if (!is_string($encoding) || $encoding === '') {
            $encoding = "utf-8";
        }

        // @todo take original content-type if available
        $markup = "<meta http-equiv=\"content-type\" content=\"text/html; charset=$encoding\">$html";

        // Parser complaints about sloppy markup are collected internally
        // instead of being silenced with the @ operator.
        $previousErrorMode = libxml_use_internal_errors(true);
        try {
            $success = $tmpDOM->loadHTML($markup);
        } finally {
            if (!$previousErrorMode) {
                // only discard the buffer when the caller was not collecting errors itself
                libxml_clear_errors();
            }
            libxml_use_internal_errors($previousErrorMode);
        }

        $xpath = new \DOMXPath($tmpDOM);
        $bodyElements = $xpath->query("//body/*");

        // the internal DTD subset declares &nbsp; so that it is a known entity
        $this->resolveExternals = true;
        $this->loadXML(
            '<?xml version="1.0" encoding="utf-8"?>' . "\n"
            . '<!DOCTYPE html [' . "\n"
            . '<!ENTITY nbsp "&#160;">' . "\n"
            . ']>' . "\n"
            . '<body></body>'
        );
        if ($tmpDOM->encoding !== null && $tmpDOM->encoding !== '') {
            $this->encoding = $tmpDOM->encoding;
        }
        $rootnode = $this->documentElement;

        foreach ($bodyElements as $element) {
            // copy all nodes inside the body tag to target document
            $rootnode->appendChild($this->importNode($element, true));
        }

        return $success;
    }

    /**
     * cleans up a htmlDOM
     *
     * - elements not in $allowedTags are replaced by their children
     * - attributes not in $allowedAttributes are removed
     * - class names are reduced to those whitelisted via "tag.classname"
     * - empty paragraphs receive a <br>
     * - <li> elements outside of a list are moved into one
     * - empty b, i, strong, span and a elements are removed
     * - a body consisting of a single <p><br/></p> is emptied
     *
     * @param string[]|null $allowedTags       tag whitelist; null uses $this->allowedTags
     * @param string[]|null $allowedAttributes attribute whitelist; null uses $this->allowedAttributes
     */
    public function cleanHTML($allowedTags = null, $allowedAttributes = null)
    {
        $xpath = new \DOMXPath($this);

        // fall back to the class defaults when no whitelist is passed
        $allowedTags = $allowedTags ?? $this->allowedTags;
        $allowedAttributes = $allowedAttributes ?? $this->allowedAttributes;

        $tags = [];
        $classByTag = [];

        foreach ($allowedTags as $t) {
            // "tag.classname" -> [1] = tag, [3] = classname
            preg_match('/([a-zA-Z0-9]*)(\.(.*))?/', $t, $matches);

            $tag = $matches[1] ?? "";
            $class = $matches[3] ?? "";
            $tags[$tag] = true;
            if (!isset($classByTag[$tag])) {
                $classByTag[$tag] = [];
            }
            if ($class !== "") {
                $classByTag[$tag][] = $class;
            }
        }

        $nodelist = $xpath->query("//body//*");

        // walk backwards so that descendants are handled before their ancestors
        for ($i = $nodelist->length - 1; $i >= 0; $i--) {
            $node = $nodelist->item($i);

            if (!isset($tags[$node->nodeName])) {
                // move child nodes before element itself
                while ($node->firstChild !== null) {
                    if ($node->parentNode->nodeName === "body" && $node->firstChild->nodeType === XML_TEXT_NODE) {
                        // put text nodes into additional p when added directly to body
                        $paragraph = $node->parentNode->insertBefore($this->createElement("p"), $node);
                        $paragraph->appendChild($node->firstChild);
                    } else {
                        $node->parentNode->insertBefore($node->firstChild, $node);
                    }
                }

                // delete empty node
                $node->parentNode->removeChild($node);
                continue;
            }

            // remove attributes that are not in allowedAttributes
            for ($j = $node->attributes->length - 1; $j >= 0; $j--) {
                $attr = $node->attributes->item($j);

                if (!in_array($attr->name, $allowedAttributes, true)) {
                    $node->removeAttribute($attr->name);
                }
            }

            // remove invalid classnames
            $classAttribute = $node->getAttribute("class");
            if ($classAttribute !== "") {
                // split on any whitespace and drop empty tokens, so repeated
                // blanks cannot survive as a whitespace-only class attribute
                $given = preg_split('/\s+/', $classAttribute, -1, PREG_SPLIT_NO_EMPTY);
                $kept = implode(" ", array_intersect($given, $classByTag[$node->nodeName]));

                if ($kept === "") {
                    $node->removeAttribute("class");
                } else {
                    $node->setAttribute("class", $kept);
                }
            }
        }

        // @todo check to use br or nbsp
        foreach ($xpath->query("//p[. = '' and count(br) = 0]") as $emptyParagraph) {
            $emptyParagraph->appendChild($this->createElement("br"));
        }

        // make sure every li lives inside a list element
        $listContainers = ["ul", "ol", "menu"];

        foreach ($xpath->query("//li") as $item) {
            if (in_array($item->parentNode->nodeName, $listContainers, true)) {
                continue;
            }

            // find previous XML-element-node
            $previous = $item->previousSibling;
            while ($previous !== null && $previous->nodeType !== XML_ELEMENT_NODE) {
                $previous = $previous->previousSibling;
            }

            if ($previous !== null && in_array($previous->nodeName, $listContainers, true)) {
                // previous element is a list -> add node to exactly that list
                $previous->appendChild($item);
            } else {
                // create a new ul-list and add element to it
                $list = $item->parentNode->insertBefore($this->createElement("ul"), $item);
                $list->appendChild($item);
            }
        }

        // drop inline elements without any content
        $nodelist = $xpath->query("//b[not(node())] | //i[not(node())] | //strong[not(node())] | //span[not(node())] | //a[not(node())]");

        for ($i = $nodelist->length - 1; $i >= 0; $i--) {
            $node = $nodelist->item($i);
            $node->parentNode->removeChild($node);
        }

        // a body holding nothing but <p><br/></p> is considered empty
        $nodes = $this->getBodyNodes();
        if ($nodes->length === 1) {
            $node = $nodes->item(0);
            if (
                $node->nodeName === "p"
                && $node->childNodes->length === 1
                && $node->childNodes->item(0)->nodeName === "br"
            ) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    /**
     * cutToMaxlength
     *
     * Removes characters from the end of the document until its text
     * content is at most $max characters long. Elements left without any
     * text are removed, the root element is always kept.
     *
     * @param int $max maximum number of characters of text content
     */
    public function cutToMaxlength($max)
    {
        $root = $this->documentElement;
        if ($root === null) {
            // nothing loaded, nothing to cut
            return;
        }

        $charsToRemove = mb_strlen($root->textContent) - $max;

        if ($charsToRemove <= 0) {
            return;
        }

        $xpath = new \DOMXPath($this);
        $textNodes = $xpath->query("//text()");

        // consume text nodes from the end of the document
        for ($i = $textNodes->length - 1; $charsToRemove > 0 && $i >= 0; $i--) {
            $n = $textNodes->item($i);
            $len = mb_strlen($n->textContent);
            $parent = $n->parentNode;

            if ($len <= $charsToRemove) {
                $parent->removeChild($n);
                $charsToRemove -= $len;
            } else {
                // keep the head of the text node, drop the surplus tail
                $restNode = $n->splitText($len - $charsToRemove);
                $parent->removeChild($restNode);
                $charsToRemove = 0;
            }

            // remove empty nodes, but never the root element and never a
            // parent that has already been detached from the tree
            if (
                mb_strlen($parent->textContent) === 0
                && $parent->parentNode !== null
                && !$parent->isSameNode($root)
            ) {
                $parent->parentNode->removeChild($parent);
            }
        }
    }

    /**
     * Renders the children of the root element as XML, one per line.
     *
     * @return string empty string when no document has been loaded
     */
    public function __toString()
    {
        $root = $this->documentElement;
        if ($root === null) {
            return "";
        }

        $html = "";
        foreach ($root->childNodes as $node) {
            $html .= $this->saveXML($node) . "\n";
        }

        return $html;
    }

    /**
     * gets a nodelist with all nodes inside the body
     *
     * @return \DOMNodeList element nodes that are direct children of <body>
     */
    public function getBodyNodes()
    {
        $xpath = new \DOMXPath($this);

        return $xpath->query("//body/*");
    }
}
321
322/* vim:set ft=php sw=4 sts=4 fdm=marker et : */
DOMDocument for html-content.
Definition HtmlDom.php:18
loadHTML($html, $options=null)
Loads HTML from an HTML string.
Definition HtmlDom.php:94
cleanHTML($allowedTags=null, $allowedAttributes=null)
cleans up a htmlDOM
Definition HtmlDom.php:138
$allowedAttributes
Attributes that are allowed on HTML elements.
Definition HtmlDom.php:41
cutToMaxlength($max)
Cuts the text content of the document to a maximum number of characters.
Definition HtmlDom.php:260
__construct($version=null, $encoding=null)
htmldom class constructor
Definition HtmlDom.php:58
serialize()
serializes htmldom into string
Definition HtmlDom.php:67
$allowedTags
Tags that are allowed inside of html.
Definition HtmlDom.php:22
getBodyNodes()
gets a nodelist with all nodes inside the body
Definition HtmlDom.php:313
unserialize($serialized)
unserializes htmldom-objects
Definition HtmlDom.php:80
Abstract element classes.
Definition Container.php:10