depage-forms v1.4.1
html forms made easy
Loading...
Searching...
No Matches
HtmlDom.php
Go to the documentation of this file.
1<?php
2
9
10namespace Depage\HtmlForm\Abstracts;
11
/**
 * DOMDocument for html-content.
 *
 * Holds an html fragment below a single <body> root element and offers
 * helpers to load html strings, strip everything that is not white-listed,
 * cut the text down to a maximum length and serialize the fragment again.
 */
class HtmlDom extends \DOMDocument
{
    /**
     * Tags that are allowed inside of html.
     *
     * An entry is either a plain tag name ("p") or a tag name followed by a
     * dot and a class name ("p.intro"); cleanHTML() keeps only the class
     * names listed this way on the respective tag.
     *
     * @var string[]
     */
    protected $allowedTags = [
        "p",
        "br",
        "h1",
        "h2",
        "ul",
        "ol",
        "li",

        "a",
        "b",
        "strong",
        "i",
        "em",
    ];

    /**
     * Attributes that are allowed on the remaining tags.
     *
     * @var string[]
     */
    protected $allowedAttributes = [
        'class',
        'href',
        'target',
        'alt',
        'title',
        'data-dbid',
    ];

    /**
     * htmldom class constructor
     *
     * @param string $version  version passed on to DOMDocument
     * @param string $encoding encoding passed on to DOMDocument
     */
    public function __construct($version = "1.0", $encoding = "")
    {
        parent::__construct($version, $encoding);
    }

    /**
     * Loads html from an html string.
     *
     * The string is parsed by a temporary DOMDocument; afterwards every
     * element found directly inside its <body> is imported below the <body>
     * root element of this document.
     *
     * @param string $html    html to parse
     * @param int    $options accepted for signature compatibility with
     *                        DOMDocument::loadHTML(), not used by this method
     *
     * @return bool result of the underlying DOMDocument::loadHTML() call
     */
    public function loadHTML($html, $options = 0): bool
    {
        $tmpDOM = new \DOMDocument();

        // mb_http_input() yields false when no input encoding is known
        $encoding = mb_http_input();
        if (!is_string($encoding) || $encoding === '') {
            $encoding = "utf-8";
        }

        // @todo take original content-type if available
        // the meta tag tells the parser the charset; "@" silences the
        // warnings the parser emits for markup it does not like
        $success = @$tmpDOM->loadHTML("<meta http-equiv=\"content-type\" content=\"text/html; charset=$encoding\">$html");

        $xpath = new \DOMXPath($tmpDOM);
        $nodelist = $xpath->query("//body/*");

        // reset this document to an empty <body> that knows the nbsp entity
        $this->resolveExternals = true;
        $this->loadXML('<?xml version="1.0" encoding="utf-8"?>
            <!DOCTYPE html [
                <!ENTITY nbsp "&#160;">
            ]>
            <body></body>');
        if ($tmpDOM->encoding != '') {
            $this->encoding = $tmpDOM->encoding;
        }
        $rootnode = $this->documentElement;

        foreach ($nodelist as $node) {
            // copy all nodes inside the body tag to target document
            $rootnode->appendChild($this->importNode($node, true));
        }

        return $success;
    }

    /**
     * Cleans up a htmlDOM.
     *
     * Removes tags, attributes and class names that are not white-listed
     * (the content of removed tags is kept), fills empty paragraphs with a
     * <br>, moves list items without a list parent into a list, drops empty
     * inline elements and finally removes a lone "<p><br></p>".
     *
     * @param string[]|null $allowedTags       tag white-list in the notation of
     *                                         $allowedTags; null uses the
     *                                         instance default
     * @param string[]|null $allowedAttributes attribute white-list; null uses
     *                                         the instance default
     *
     * @return void
     */
    public function cleanHTML($allowedTags = null, $allowedAttributes = null)
    {
        // fall back to the white-lists configured on the instance
        if (is_null($allowedTags)) {
            $allowedTags = $this->allowedTags;
        }
        if (is_null($allowedAttributes)) {
            $allowedAttributes = $this->allowedAttributes;
        }

        $xpath = new \DOMXPath($this);
        $classByTag = $this->parseAllowedTags($allowedTags);

        // walk backwards so descendants are handled before their ancestors
        $nodelist = $xpath->query("//body//*");
        for ($i = $nodelist->length - 1; $i >= 0; $i--) {
            $node = $nodelist->item($i);

            if (!isset($classByTag[$node->nodeName])) {
                $this->unwrapNode($node);
            } else {
                $this->filterAttributes($node, $allowedAttributes, $classByTag[$node->nodeName]);
            }
        }

        // give empty paragraphs a line break
        foreach ($xpath->query("//p[. = '' and count(br) = 0]") as $node) {
            $node->appendChild($this->createElement("br"));
        }

        $this->wrapOrphanedListItems($xpath);

        // remove inline elements without any content
        $nodelist = $xpath->query("//b[not(node())] | //i[not(node())] | //strong[not(node())] | //span[not(node())] | //a[not(node())] | //u[not(node())]");
        for ($i = $nodelist->length - 1; $i >= 0; $i--) {
            $node = $nodelist->item($i);
            $node->parentNode->removeChild($node);
        }

        // a single paragraph that only holds a <br> counts as empty content
        $nodes = $this->getBodyNodes();
        if ($nodes->length == 1) {
            $node = $nodes->item(0);
            if ($node->nodeName == "p" && $node->childNodes->length == 1 && $node->childNodes->item(0)->nodeName == "br") {
                $node->parentNode->removeChild($node);
            }
        }
    }

    /**
     * Turns a tag white-list into a map of tag name => allowed class names.
     *
     * @param iterable $allowedTags entries like "p" or "p.classname"
     *
     * @return array<string, string[]>
     */
    private function parseAllowedTags(iterable $allowedTags): array
    {
        $classByTag = [];

        foreach ($allowedTags as $t) {
            preg_match("/([a-zA-Z0-9]*)(\.(.*))?/", $t, $matches);

            $tag = $matches[1] ?? "";
            $class = $matches[3] ?? "";

            if (!isset($classByTag[$tag])) {
                $classByTag[$tag] = [];
            }
            // only real class names are recorded, so an empty string can
            // never survive the class filter
            if ($class !== "") {
                $classByTag[$tag][] = $class;
            }
        }

        return $classByTag;
    }

    /**
     * Removes an element but keeps its children in place.
     *
     * @param \DOMElement $node element to dissolve
     */
    private function unwrapNode(\DOMElement $node): void
    {
        $parent = $node->parentNode;

        // move child nodes before element itself
        while ($node->firstChild !== null) {
            $child = $node->firstChild;

            if ($parent->nodeName == "body" && $child->nodeType == XML_TEXT_NODE) {
                // put text nodes into additional p when added directly to body
                $paragraph = $parent->insertBefore($this->createElement("p"), $node);
                $paragraph->appendChild($child);
            } else {
                $parent->insertBefore($child, $node);
            }
        }

        // delete the now empty node
        $parent->removeChild($node);
    }

    /**
     * Strips attributes and class names that are not white-listed.
     *
     * @param \DOMElement $node              element to clean
     * @param array       $allowedAttributes attribute names to keep
     * @param string[]    $allowedClasses    class names to keep on this tag
     */
    private function filterAttributes(\DOMElement $node, array $allowedAttributes, array $allowedClasses): void
    {
        // backwards, because removing shifts the following attributes
        for ($j = $node->attributes->length - 1; $j >= 0; $j--) {
            $attr = $node->attributes->item($j);

            if (!in_array($attr->name, $allowedAttributes, true)) {
                $node->removeAttribute($attr->name);
            }
        }

        $classAttr = $node->getAttribute("class");
        if ($classAttr != "") {
            $kept = implode(" ", array_intersect(explode(" ", $classAttr), $allowedClasses));

            if ($kept === "") {
                $node->removeAttribute("class");
            } else {
                $node->setAttribute("class", $kept);
            }
        }
    }

    /**
     * Moves list items that are not inside of a list into one.
     *
     * An item directly following a list is appended to that list; otherwise
     * a new <ul> is created in front of the item.
     *
     * @param \DOMXPath $xpath xpath object bound to this document
     */
    private function wrapOrphanedListItems(\DOMXPath $xpath): void
    {
        $parentNodes = ["ul", "ol", "menu"];

        foreach ($xpath->query("//li") as $node) {
            if (in_array($node->parentNode->nodeName, $parentNodes, true)) {
                continue;
            }

            // find previous XML-element-node
            $previous = $node->previousSibling;
            while ($previous !== null && $previous->nodeType != XML_ELEMENT_NODE) {
                $previous = $previous->previousSibling;
            }

            if ($previous !== null && in_array($previous->nodeName, $parentNodes, true)) {
                // previous element is a list -> add node to exactly that list
                $previous->appendChild($node);
            } else {
                // create a new ul-list and add element to it
                $listNode = $node->parentNode->insertBefore($this->createElement("ul"), $node);
                $listNode->appendChild($node);
            }
        }
    }

    /**
     * Cuts the text content down to a maximum number of characters.
     *
     * Text is removed from the end of the document; elements whose text
     * becomes empty are removed as well, the root element always stays.
     *
     * @param int $max maximum number of characters to keep
     *
     * @return void
     */
    public function cutToMaxlength($max)
    {
        $root = $this->documentElement;
        if ($root === null) {
            // nothing loaded yet, so there is nothing to cut
            return;
        }

        $charsToRemove = mb_strlen($root->textContent) - $max;
        if ($charsToRemove <= 0) {
            return;
        }

        $xpath = new \DOMXPath($this);
        $textNodes = $xpath->query("//text()");

        // consume text nodes from the end until enough characters are gone
        for ($i = $textNodes->length - 1; $charsToRemove > 0 && $i >= 0; $i--) {
            $n = $textNodes->item($i);
            $len = mb_strlen($n->textContent);
            $parent = $n->parentNode;

            if ($len <= $charsToRemove) {
                $parent->removeChild($n);
                $charsToRemove -= $len;
            } else {
                // keep the head of the node, drop the tail
                $restNode = $n->splitText($len - $charsToRemove);
                $parent->removeChild($restNode);
                $charsToRemove = 0;
            }

            // remove empty nodes, but keep the root so the document stays usable
            if (!$parent->isSameNode($root) && mb_strlen($parent->textContent) == 0) {
                $parent->parentNode->removeChild($parent);
            }
        }
    }

    /**
     * Serializes the children of the root element as html.
     *
     * @return string one line-feed terminated chunk per child node, or an
     *                empty string when the document has no root element
     */
    public function __toString()
    {
        $html = "";
        if ($this->documentElement === null) {
            return $html;
        }

        foreach ($this->documentElement->childNodes as $node) {
            $html .= $this->saveHTML($node) . "\n";
        }

        return $html;
    }

    /**
     * Stores the document as an xml string for serialization.
     *
     * @return array{xml: string|false}
     */
    public function __serialize(): array
    {
        return [
            'xml' => $this->saveXML(),
        ];
    }

    /**
     * Restores the document from the xml string written by __serialize().
     *
     * @param array $data serialized data with the key "xml"
     */
    public function __unserialize(array $data): void
    {
        $this->loadXML($data['xml']);
    }

    /**
     * Gets a nodelist with all element nodes directly inside the body.
     *
     * @return \DOMNodeList
     */
    public function getBodyNodes()
    {
        $xpath = new \DOMXPath($this);

        return $xpath->query("//body/*");
    }
}
310
311/* vim:set ft=php sw=4 sts=4 fdm=marker et : */
DOMDocument for html-content.
Definition HtmlDom.php:19
cleanHTML($allowedTags=null, $allowedAttributes=null)
cleans up a htmlDOM
Definition HtmlDom.php:117
__construct($version="1.0", $encoding="")
htmldom class constructor
Definition HtmlDom.php:59
$allowedAttributes
Attributes that are allowed on html tags.
Definition HtmlDom.php:42
cutToMaxlength($max)
cuts the text content of the document down to a maximum number of characters
Definition HtmlDom.php:238
$allowedTags
Tags that are allowed inside of html.
Definition HtmlDom.php:23
getBodyNodes()
gets a nodelist with all nodes inside the body
Definition HtmlDom.php:302
loadHTML($html, $options=0)
loads html from an html string
Definition HtmlDom.php:73