Zend_Filter_HtmlToText
<?php
/**
 * Converts HTML into plain text that is readable in a text-only e-mail.
 *
 * REQUIRES the DOM extension and reasonably valid markup to work.
 *
 * All HTML is stripped and a basic text formatting is applied
 * (h1-h6, a, b, i, u, strong, em, big, sup, legend). Block elements that have
 * a default vertical margin (p, h1-h6, ul, ...) are surrounded by two newlines,
 * block elements without a default margin by a single newline. When both kinds
 * are nested, the ones with margin win (at most two newlines between blocks).
 * ULs and OLs are rendered as bulleted/numbered lists and nesting is respected.
 *
 * = Examples =
 *
 * <code title="Example showing main capabilities of this helper">
 * $filter = new Zend_Filter_HtmlToText();
 * echo $filter->filter('
 *     <div><a href="sth.html">This is a test</a><h1>Heading</h1></div>
 *     <h2>Subheading</h2>
 *     <p>Lorem <u>ipsum</u> dolor <i>sit</i> amet <b>consectetuer</b> Vestibulum Aliquam ut <br />magna tempor.</p>
 *     Text
 *     <ul>
 *         <li>Regular list item</li>
 *         <li>
 *             Sublist
 *             <ul><li>Sublist item</li>
 *                 <li>Sublist item</li>
 *             </ul>
 *         </li>
 *         <li>Regular list item</li>
 *     </ul>
 *     <ol>
 *         <li>Regular list item</li>
 *         <li>
 *             <ol>
 *                 <li>Sublist item</li><li>Sublist item</li>
 *             </ol>
 *         </li>
 *         <li>Text prior to sublist
 *             <ol><li>Sublist item</li>
 *                 <li>Sublist item</li>
 *             </ol>
 *         </li>
 *         <li>Regular list item</li>
 *         <li>Regular list item</li>
 *     </ol>
 *     <dl><dt>Topic</dt><dd>Description</dd><dt>Topic</dt><dd>Description</dd></dl>
 * ');
 * </code>
 *
 * Approximate result (nested items are indented by the tab replacement that
 * filter() applies at the very end):
 *
 * <output>
 * [This is a test]<sth.html>
 *
 * _*Heading*_
 *
 * _*Subheading*_
 *
 * Lorem _ipsum_ dolor /sit/ amet *consectetuer* Vestibulum Aliquam ut magna tempor.
 *
 * Text
 *
 * - Regular list item
 * - Sublist
 *  - Sublist item
 *  - Sublist item
 * - Regular list item
 *
 * 1. Regular list item
 * 2. 1. Sublist item
 *  2. Sublist item
 * 3. Text prior to sublist
 *  1. Sublist item
 *  2. Sublist item
 * 4. Regular list item
 * 5. Regular list item
 *
 * Topic
 *  Description
 * Topic
 *  Description
 * </output>
 *
 * @author Christian Opitz <co@netzelf.de>
 * @category Zend
 * @package Zend_Filter
 * @copyright Copyright (c) 2011 - Christian Opitz - Netzelf (http://www.netzelf.de)
 * @license http://framework.zend.com/license/new-bsd New BSD License
 */
class Zend_Filter_HtmlToText implements Zend_Filter_Interface
{
    /**
     * Placeholder for one indentation step. Lines are trimmed while the text
     * is assembled, so indentation has to be masked until the very end.
     *
     * @var string
     */
    const TAB_MASK = '~#~TAB~#~';

    /**
     * Placeholder for the bullet of an unordered list item; replaced by the
     * real bullet at the end of filter().
     *
     * @var string
     */
    const BULLET_MASK = '~#~BULLET~#~';

    /**
     * Do the conversion
     *
     * @param string $html UTF-8 encoded HTML fragment or document
     * @return string The rendered text
     * @throws Zend_Filter_Exception When the markup could not be loaded
     */
    public function filter($html)
    {
        $html = $this->_fixNewlines($html);

        $doc = new DOMDocument();

        // Collect libxml parse errors internally instead of emitting one
        // E_WARNING per markup problem; the caller's setting is restored below.
        $previousErrorMode = libxml_use_internal_errors(true);

        // libxml assumes ISO-8859-1 unless told otherwise. Declaring the
        // charset keeps every UTF-8 character intact (the former utf8_decode()
        // call replaced everything outside Latin-1 with "?"). The meta element
        // ends up in <head>, which is skipped while rendering.
        $loaded = $doc->loadHTML(
            '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
            . '<span>' . $html . '</span>'
        );

        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        if (!$loaded) {
            throw new Zend_Filter_Exception('Could not load HTML - badly formed?');
        }

        $output = $this->_iterateOverNode($doc);

        // Trim horizontal whitespace around every newline
        $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);

        // Now that trimming is done the masks can become real characters
        return str_replace(
            array(self::TAB_MASK, self::BULLET_MASK),
            array(' ', '- '),
            trim($output)
        );
    }

    /**
     * Unify newlines: \r\n becomes \n first, then every remaining \r becomes
     * \n, so Unix, Windows and old Mac line endings all end up as \n.
     *
     * @param string $text Text with any combination of \r, \r\n and \n
     * @return string The text with \n as the only newline sequence
     */
    protected function _fixNewlines($text)
    {
        // Windows line endings first, so that \r\n does not become \n\n
        $text = str_replace("\r\n", "\n", $text);
        // Remaining lone carriage returns
        $text = str_replace("\r", "\n", $text);

        return $text;
    }

    /**
     * Renders a node and all of its descendants, organizes breaks (vertical
     * margins) and calls the pre-/post-processing hooks for each tag.
     *
     * @param DOMNode $node       The node to render
     * @param string  $prevOutput Text rendered so far for the preceding siblings
     * @return string $prevOutput with the rendering of $node appended
     */
    protected function _iterateOverNode($node, $prevOutput = '')
    {
        if ($node instanceof DOMText) {
            // Collapse every whitespace run to a single blank, like a browser
            return $prevOutput . preg_replace("/\\s+/im", " ", $node->wholeText);
        }

        if (!($node instanceof DOMElement) && !($node instanceof DOMDocument)) {
            // Doctype, comments, processing instructions...: nothing to render,
            // but the text of the preceding siblings must be kept.
            return $prevOutput;
        }

        $name = strtolower($node->nodeName);

        $this->_preProcessTag($node);

        $ignored = array('style', 'head', 'title', 'meta', 'script', 'object');
        if (in_array($name, $ignored, true)) {
            // Skip the element itself but keep what was rendered before it
            return $prevOutput;
        }

        $output = '';
        foreach ($node->childNodes as $child) {
            $output = $this->_iterateOverNode($child, $output);
        }

        if (!strlen($output)) {
            // Elements without any rendered content leave no trace
            return $prevOutput;
        }

        $before = '';
        $after = '';

        if ($this->_isSingleBreakElement($name)) {
            $prevOutput = rtrim($prevOutput);
            $before = "\n";
            // Number of trailing whitespace characters of the content: exactly
            // two means a nested block already asked for a double break.
            $trailing = strlen($output) - strlen(rtrim($output));
            $output = ltrim($output);
            $after .= ($trailing == 2) ? "\n\n" : "\n";
        } elseif ($this->_isDoubleBreakElement($name)) {
            $prevOutput = rtrim($prevOutput);
            $output = ltrim($output);
            $before = $after = "\n\n";
        }

        $this->_postProcessTag($output, $before, $after, $name, $node);

        return $prevOutput . $before . $output . $after;
    }

    /**
     * This method gets called for each node BEFORE its children are rendered.
     *
     * It stores list bookkeeping in attributes of the (temporary) DOM nodes:
     * "rel" is the nesting depth of a list, "top" the number of the last
     * ordered item and "left" the prefix (indentation plus bullet or number)
     * of a list item.
     *
     * @param DOMNode $node
     */
    protected function _preProcessTag($node)
    {
        switch ($node->nodeName) {
            case 'ul':
            case 'ol':
                if ($node->parentNode->nodeName == 'li') {
                    // Nested list: one level deeper than the item holding it
                    $node->setAttribute('rel', intval($node->parentNode->getAttribute('rel')) + 1);
                } else {
                    $node->setAttribute('rel', 0);
                }
                $node->setAttribute('top', 0);
                break;

            case 'li':
                $rel = intval($node->parentNode->getAttribute('rel'));
                $node->setAttribute('rel', $rel);
                if ($node->parentNode->nodeName == 'ol') {
                    // Advance the running number kept on the list element
                    $i = intval($node->parentNode->getAttribute('top')) + 1;
                    $node->parentNode->setAttribute('top', $i);
                    $node->setAttribute('left', str_repeat(self::TAB_MASK, $rel) . $i . '. ');
                } else {
                    $node->setAttribute('left', str_repeat(self::TAB_MASK, $rel) . self::BULLET_MASK);
                }
                break;
        }
    }

    /**
     * This method gets called for each element AFTER its children are rendered.
     * Used to decorate tags according to their name and to influence their
     * margins. $output, $before and $after are modified in place.
     *
     * @param string  $output Rendered content of the element
     * @param string  $before Break that will be put in front of the content
     * @param string  $after  Break that will be put behind the content
     * @param string  $name   Lower-cased tag name
     * @param DOMNode $node
     */
    protected function _postProcessTag(&$output, &$before, &$after, $name, $node)
    {
        switch ($name) {
            case 'a':
                $href = $node->getAttribute("href");
                if ($href !== '') {
                    if ($href == $output) {
                        // The link text already is the address
                        $output = '<' . $output . '>';
                    } else {
                        $output = '[' . $output . ']<' . $href . '>';
                    }
                }
                break;

            case 'h1':
            case 'h2':
            case 'h3':
                $output = '_*' . $output . '*_';
                break;

            case 'h4':
            case 'h5':
            case 'h6':
            case 'u':
                $output = '_' . $output . '_';
                break;

            case 'i':
            case 'em':
                $output = '/' . $output . '/';
                break;

            case 'b':
            case 'strong':
            case 'legend':
                $output = '*' . $output . '*';
                break;

            case 'big':
            case 'sup':
                // The text coming out of DOM is UTF-8, so prefer the
                // multibyte-aware variant when mbstring is installed.
                $output = function_exists('mb_strtoupper')
                    ? mb_strtoupper($output, 'UTF-8')
                    : strtoupper($output);
                break;

            case 'ol':
            case 'ul':
                if ($node->parentNode->nodeName == 'li') {
                    // Nested lists follow their item's text without a margin
                    $before = "\n";
                }
                break;

            case 'li':
                if ($node->getAttribute('left')) {
                    $parts = explode("\n", $output);
                    if (count($parts) > 1) {
                        // This item contains a nested list. When the nested
                        // list is the first thing in the item, its first entry
                        // shares the line with this item's own prefix, so the
                        // indentation on that first line has to go.
                        $parts[0] = str_replace(self::TAB_MASK, '', $parts[0]);
                        $output = implode("\n", $parts);
                    }
                }
                $output = $node->getAttribute('left') . $output;
                break;

            case 'dd':
                $output = self::TAB_MASK . $output;
                break;
        }
    }

    /**
     * If this tag is a block element without vertical margin
     *
     * @param string $name The tag name
     * @return boolean
     */
    protected function _isSingleBreakElement($name)
    {
        return in_array($name, array(
            'address', 'blockquote', 'center', 'dir', 'div', 'fieldset',
            'form', 'legend', 'isindex', 'menu', 'noscript', 'dd', 'dt',
            'li', 'pre', 'tr', 'br'
        ), true);
    }

    /**
     * If this is a block element that requires vertical margin
     *
     * @param string $name The tag name
     * @return boolean
     */
    protected function _isDoubleBreakElement($name)
    {
        return in_array($name, array(
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'p',
        ), true);
    }
}
Comments
You must login before commenting on a snippet. If you do not have an account, please register.
Snippet description
Strips all HTML and applies basic formats (email compatible) - means it does basic text formatting (h1-h6,a,b,i,u,strong,em,big,sup,legend), prefixes block elements with default vertical margin (p, h1-h6, ul...) with 2 newlines and such without default margin with single newlines. When those are nesting, those with margin succeed (means max. 2 newlines between blocks). Furthermore it parses ULs and OLs and even respects nesting.
See example in class DocBlock
Snippet details
- Created:
-
Christian Opitz
- Edited:
-
Christian Opitz
- Revision Id:
- 189
- Edit Message:
- Corrected Exception class
- ZF Version
- 1.0.4
- Tags:
- filter text html txt
- Comments:
- 0
- Views:
- 1910
- Points:
- 1 (1 votes)