htmldom.php
Go to the documentation of this file.
1 <?php
9 namespace depage\htmlform\abstracts;
10 
/**
 * DOMDocument for html-content.
 *
 * Wraps user supplied html fragments in an xml document whose root element
 * is <body>, and offers a whitelist based cleanup of the contained markup.
 *
 * NOTE(review): cleanHTML() only filters element names. Attributes of
 * allowed elements are kept as they are, and the content of disallowed
 * elements is kept too (only the element itself is unwrapped) -- confirm
 * that callers escape or filter further where that matters.
 */
class htmldom extends \DOMDocument implements \Serializable
{
    /**
     * @var array Tags that are allowed inside of html.
     */
    protected $allowedTags = array(
        "p",
        "br",
        "h1",
        "h2",
        "ul",
        "ol",
        "li",

        "a",
        "b",
        "strong",
        "i",
        "em",
    );

    /**
     * htmldom class constructor
     *
     * @param string|null $version  xml version, null for the DOMDocument default ("1.0")
     * @param string|null $encoding document encoding, null for the DOMDocument default
     */
    public function __construct($version = null, $encoding = null)
    {
        // DOMDocument expects strings; forwarding null is deprecated since
        // PHP 8.1 and results in an empty xml version, so map null to the
        // defaults of the parent constructor.
        parent::__construct(
            is_null($version) ? "1.0" : $version,
            is_null($encoding) ? "" : $encoding
        );
    }

    /**
     * serializes htmldom into string (Serializable interface)
     *
     * @return string xml-content saved in htmldom as string
     */
    public function serialize()
    {
        return $this->saveXML();
    }

    /**
     * unserializes htmldom-objects (Serializable interface)
     *
     * @param string $serialized xml string as produced by serialize()
     *
     * @return void
     */
    public function unserialize($serialized)
    {
        $this->loadXML($serialized);
    }

    /**
     * Array based counterpart of serialize().
     *
     * PHP 7.4+ prefers this mechanism, and PHP 8.1+ deprecates classes that
     * only implement Serializable. Payloads written through serialize() are
     * still readable through unserialize().
     *
     * @return array array with the xml source stored under the key "xml"
     */
    public function __serialize()
    {
        return array("xml" => $this->saveXML());
    }

    /**
     * Array based counterpart of unserialize().
     *
     * @param array $data array as produced by __serialize()
     *
     * @return void
     */
    public function __unserialize(array $data)
    {
        // saveXML() may have failed while serializing; only load real xml
        if (isset($data["xml"]) && is_string($data["xml"]) && $data["xml"] !== "") {
            $this->loadXML($data["xml"]);
        }
    }

    /**
     * loads html from an html string
     *
     * The string is parsed leniently as html in a temporary document. All
     * element nodes directly inside of the parsed body are then imported
     * into this document below a fresh <body> root element.
     *
     * @param string $html    html fragment to load
     * @param int    $options libxml option bitmask handed to the html parser;
     *                        present to stay compatible with
     *                        DOMDocument::loadHTML()
     *
     * @return bool result of the html parser for the temporary document
     */
    #[\ReturnTypeWillChange]
    public function loadHTML($html, $options = 0)
    {
        $tmpDOM = new \DOMDocument();

        // mb_http_input() returns false when no http input encoding is known
        $encoding = mb_http_input();
        if (!is_string($encoding) || $encoding === "") {
            $encoding = "utf-8";
        }

        // @todo take original content-type if available
        // the meta tag tells the html parser how to decode the fragment;
        // parser warnings about sloppy markup are suppressed on purpose
        $success = @$tmpDOM->loadHTML(
            "<meta http-equiv=\"content-type\" content=\"text/html; charset=$encoding\">$html",
            (int) $options
        );

        $xpath = new \DOMXPath($tmpDOM);
        $nodelist = $xpath->query("//body/*");

        // NOTE(review): resolveExternals stays enabled on this object after
        // the call and therefore also applies to later loadXML() calls --
        // confirm that this is intended.
        $this->resolveExternals = true;
        // reset this document to an empty body; the internal subset declares
        // the nbsp entity that cleanHTML() references
        $this->loadXML('<?xml version="1.0" encoding="utf-8"?>
            <!DOCTYPE html [
                <!ENTITY nbsp "&#160;">
            ]>
            <body></body>');
        if ($tmpDOM->encoding != '') {
            $this->encoding = $tmpDOM->encoding;
        }
        $rootnode = $this->documentElement;

        foreach ($nodelist as $node) {
            // copy all nodes inside the body tag to target document
            $newnode = $this->importNode($node, true);
            $rootnode->appendChild($newnode);
        }

        return $success;
    }

    /**
     * cleans up a htmlDOM
     *
     * - unwraps every element below body whose name is not allowed; its
     *   children are kept, text that would end up directly in body is
     *   wrapped into a new paragraph
     * - fills empty paragraphs with a non breaking space
     * - moves list items without a list parent into the list directly
     *   before them, or into a newly created ul
     *
     * @param array|null $allowedTags names of allowed elements, null to use
     *                                the default list of this class
     *
     * @return void
     */
    public function cleanHTML($allowedTags = null)
    {
        $xpath = new \DOMXPath($this);

        if (is_null($allowedTags)) {
            // fall back to the default whitelist of the class
            $allowedTags = $this->allowedTags;
        }

        $nodelist = $xpath->query("//body//*");

        // walk backwards through document order, so that descendants are
        // handled before the elements that contain them
        for ($i = $nodelist->length - 1; $i >= 0; $i--) {
            $node = $nodelist->item($i);

            if (!in_array($node->nodeName, $allowedTags, true)) {
                // move child nodes before element itself
                while ($node->firstChild !== null) {
                    if ($node->parentNode->nodeName == "body" && $node->firstChild->nodeType == XML_TEXT_NODE) {
                        // put text nodes into additional p when added directly to body
                        $paragraph = $node->parentNode->insertBefore($this->createElement("p"), $node);
                        $paragraph->appendChild($node->firstChild);
                    } else {
                        $node->parentNode->insertBefore($node->firstChild, $node);
                    }
                }

                // delete empty node
                $node->parentNode->removeChild($node);
            }
        }

        // give empty paragraphs a non breaking space as content
        $nodelist = $xpath->query("//p[. = '']");

        foreach ($nodelist as $node) {
            $node->appendChild($this->createEntityReference("nbsp"));
        }

        // make sure that every li is the child of a list element
        $nodelist = $xpath->query("//li");
        $parentNodes = array("ul", "ol", "menu");

        foreach ($nodelist as $node) {
            if (!in_array($node->parentNode->nodeName, $parentNodes, true)) {
                // find previous XML-element-node
                $previous = $node->previousSibling;
                while (!is_null($previous) && $previous->nodeType != XML_ELEMENT_NODE) {
                    $previous = $previous->previousSibling;
                }

                if (!is_null($previous) && in_array($previous->nodeName, $parentNodes, true)) {
                    // previous element is a list -> add node to it
                    $previous->appendChild($node);
                } else {
                    // create a new ul-list and add element to it
                    $listNode = $node->parentNode->insertBefore($this->createElement("ul"), $node);
                    $listNode->appendChild($node);
                }
            }
        }
    }

    /**
     * gets a nodelist with all nodes inside the body
     *
     * @return \DOMNodeList element nodes that are direct children of body
     */
    public function getBodyNodes()
    {
        $xpath = new \DOMXPath($this);

        return $xpath->query("//body/*");
    }
}
195 
196 /* vim:set ft=php sw=4 sts=4 fdm=marker et : */
cleanHTML($allowedTags=null)
cleans up a htmlDOM
Definition: htmldom.php:126
getBodyNodes()
gets a nodelist with all nodes inside the body
Definition: htmldom.php:187
DOMDocument for html-content.
Definition: htmldom.php:17
loadHTML($html)
loads html from an html string
Definition: htmldom.php:82
__construct($version=null, $encoding=null)
htmldom class constructor
Definition: htmldom.php:46
serialize()
serializes htmldom into string
Definition: htmldom.php:55
$allowedTags
Tags that are allowed inside of html.
Definition: htmldom.php:22
unserialize($serialized)
unserializes htmldom-objects
Definition: htmldom.php:68