From acccafe3daee1c94064202d38fa244bd5a15c2e7 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Sun, 28 Oct 2012 12:21:21 +0400 Subject: replace htmlpurifier with htmlawed --- .../library/HTMLPurifier/Lexer/DOMLex.php | 243 --------------------- 1 file changed, 243 deletions(-) delete mode 100644 lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php (limited to 'lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php') diff --git a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php deleted file mode 100644 index 82f377450..000000000 --- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php +++ /dev/null @@ -1,243 +0,0 @@ -factory = new HTMLPurifier_TokenFactory(); - } - - public function tokenizeHTML($html, $config, $context) { - - $html = $this->normalize($html, $config, $context); - - // attempt to armor stray angled brackets that cannot possibly - // form tags and thus are probably being used as emoticons - if ($config->get('Core.AggressivelyFixLt')) { - $char = '[^a-z!\/]'; - $comment = "/<!--(.*?)(-->|\z)/is"; - $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html); - do { - $old = $html; - $html = preg_replace("/<($char)/i", '&lt;\\1', $html); - } while ($html !== $old); - $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments - } - - // preprocess html, essential for UTF-8 - $html = $this->wrapHTML($html, $config, $context); - - $doc = new DOMDocument(); - $doc->encoding = 'UTF-8'; // theoretically, the above has this covered - - set_error_handler(array($this, 'muteErrorHandler')); - $doc->loadHTML($html); - restore_error_handler(); - - $tokens = array(); - $this->tokenizeDOM( - $doc->getElementsByTagName('html')->item(0)-> // <html> - getElementsByTagName('body')->item(0)-> // <body> - getElementsByTagName('div')->item(0) // <div>
- , $tokens); - return $tokens; - } - - /** - * Iterative function that tokenizes a node, putting it into an accumulator. - * To iterate is human, to recurse divine - L. Peter Deutsch - * @param $node DOMNode to be tokenized. - * @param $tokens Array-list of already tokenized tokens. - * @returns Tokens of node appended to previously passed tokens. - */ - protected function tokenizeDOM($node, &$tokens) { - - $level = 0; - $nodes = array($level => array($node)); - $closingNodes = array(); - do { - while (!empty($nodes[$level])) { - $node = array_shift($nodes[$level]); // FIFO - $collect = $level > 0 ? true : false; - $needEndingTag = $this->createStartNode($node, $tokens, $collect); - if ($needEndingTag) { - $closingNodes[$level][] = $node; - } - if ($node->childNodes && $node->childNodes->length) { - $level++; - $nodes[$level] = array(); - foreach ($node->childNodes as $childNode) { - array_push($nodes[$level], $childNode); - } - } - } - $level--; - if ($level && isset($closingNodes[$level])) { - while($node = array_pop($closingNodes[$level])) { - $this->createEndNode($node, $tokens); - } - } - } while ($level > 0); - } - - /** - * @param $node DOMNode to be tokenized. - * @param $tokens Array-list of already tokenized tokens. - * @param $collect Says whether or start and close are collected, set to - * false at first recursion because it's the implicit DIV - * tag you're dealing with. - * @returns bool if the token needs an endtoken - */ - protected function createStartNode($node, &$tokens, $collect) { - // intercept non element nodes. WE MUST catch all of them, - // but we're not getting the character reference nodes because - // those should have been preprocessed - if ($node->nodeType === XML_TEXT_NODE) { - $tokens[] = $this->factory->createText($node->data); - return false; - } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) { - // undo libxml's special treatment of