Zend Framework Source Code Snippets

Zend_Filter_HtmlToText

Bookmark and Share
1
2
3
4
56
7
8
9
1011
12
13
14
1516
17
18
19
2021
22
23
24
2526
27
28
29
3031
32
33
34
3536
37
38
39
4041
42
43
44
4546
47
48
49
5051
52
53
54
5556
57
58
59
6061
62
63
64
6566
67
68
69
7071
72
73
74
7576
77
78
79
8081
82
83
84
8586
87
88
89
9091
92
93
94
9596
97
98
99
100101
102
103
104
105106
107
108
109
110111
112
113
114
115116
117
118
119
120121
122
123
124
125126
127
128
129
130131
132
133
134
135136
137
138
139
140141
142
143
144
145146
147
148
149
150151
152
153
154
155156
157
158
159
160161
162
163
164
165166
167
168
169
170171
172
173
174
175176
177
178
179
180181
182
183
184
185186
187
188
189
190191
192
193
194
195196
197
198
199
200201
202
203
204
205206
207
208
209
210211
212
213
214
215216
217
218
219
220221
222
223
224
225226
227
228
229
230231
232
233
234
235236
237
238
239
240241
242
243
244
245246
247
248
249
250251
252
253
254
255256
257
258
259
260261
262
263
264
265266
267
268
269
270271
272
273
274
275276
277
278
279
280281
282
283
284
285286
287
288
289
290291
292
293
294
295296
297
298
299
300301
302
303
304
305306
307
308
309
310311
312
313
314
315316
317
318
319
320321
322
323
324
325326
327
328
329
330331
332
333
334
335336
337
338
339
340341
342
343
344
345346
347
348
349
350351
352
<?php
/**
 * REQUIRES DOM and quite valid markup to work
 * 
 * Strips all HTML and applies basic formats (email compatible) - means it does
 * basic text formatting (h1-h6,a,b,i,u,strong,em,big,sup,legend), prefixes block
 * elements with default margin (p, h1-h6, ul...) with 2 newlines and such without
 * default margin with single newlines. When those are nesting, those with margin
 * succeed (means max. 2 newlines between blocks). Furthermore it parses ULs and 
 * OLs and even respects nesting:
 * 
 * = Examples =
 * 
 * <code title="Example showing main capabilities of this helper">
 * $filter = new Zend_Filter_HtmlToText();
 * echo $filter->filter('
 *    <div><a href="sth.html">This is a test</a><h1>Heading</h1></div>
 *     <h2>Subheading</h2>
 *     <p>Lorem <u>ipsum</u> dolor <i>sit</i> amet <b>consectetuer</b> Vestibulum Aliquam ut <br />magna tempor.</p>
 *     Text
 *     <ul>
 *         <li>Regular list item</li>
 *         <li>
 *             Sublist
 *             <ul><li>Sublist item</li>
 *                 <li>Sublist item</li>
 *             </ul>
 *         </li>
 *         <li>Regular list item</li>
 *     </ul>
 *     <ol>
 *         <li>Regular list item</li>
 *         <li>
 *             <ol>
 *                 <li>Sublist item</li><li>Sublist item</li>
 *             </ol>
 *         </li>
 *         <li>Text prior to sublist
 *             <ol><li>Sublist item</li>
 *                 <li>Sublist item</li>
 *             </ol>
 *         </li>
 *         <li>Regular list item</li>
 *         <li>Regular list item</li>
 *     </ol>
 *     <dl><dt>Topic</dt><dd>Description</dd><dt>Topic</dt><dd>Description</dd></dl>
 * ');
 * </code>
 * 
 * <output>
 * [This is a test]<sth.html>
 * 
 * _*Heading*_
 * 
 * _*Subheading*_
 * 
 * Lorem _ipsum_ dolor /sit/ amet *consectetuer* Vestibulum Aliquam ut magna tempor.
 * 
 * Text
 * 
 * -  Regular list item
 * -  Sublist
 *    -  Sublist item
 *    -  Sublist item
 * -  Regular list item
 * 
 * 1. Regular list item
 * 2. 1. Sublist item
 *    2. Sublist item
 * 3. Text prior to sublist
 *    1. Sublist item
 *    2. Sublist item
 * 4. Regular list item
 * 5. Regular list item
 * 
 * Topic
 *    Description
 * Topic
 *    Description
 * </output>
 *
 * @author     Christian Opitz <co@netzelf.de>
 * @category   Zend
 * @package    Zend_Filter
 * @copyright  Copyright (c) 2011 - Christian Opitz - Netzelf (http://www.netzelf.de)
 * @license    http://framework.zend.com/license/new-bsd     New BSD License
 */
class Zend_Filter_HtmlToText implements Zend_Filter_Interface
{
    /**
     * Internally used to deal with tabs (lines are trimmed, so that tabs
     * need to be masked while processing). Replaced by three spaces at the
     * very end of filter().
     *
     * @var string
     */
    const TAB_MASK = '~#~TAB~#~';

    /**
     * Internally used to work with bullets (this mask is the indicator for how
     * deep nested lists need to be indented). Replaced by "-  " at the very
     * end of filter().
     *
     * @var string
     */
    const BULLET_MASK = '~#~BULLET~#~';

    /**
     * Do the conversion
     *
     * The markup is expected to be UTF-8 encoded; the returned text is UTF-8.
     *
     * @param string $html
     * @return string The rendered text
     * @throws Zend_Filter_Exception When DOMDocument::loadHTML() reports failure
     */
    public function filter($html)
    {
        $html = $this->_fixNewlines($html);

        // Tell the HTML parser that the input is UTF-8 by means of a charset
        // hint instead of down-converting with utf8_decode(): that function
        // replaces every character outside ISO-8859-1 with "?" and is
        // deprecated as of PHP 8.2. The <meta> element ends up in <head>,
        // which is skipped while rendering.
        $markup = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
                . '<span>'.$html.'</span>';

        $doc = new DOMDocument();
        if (!$doc->loadHTML($markup)) {
            throw new Zend_Filter_Exception('Could not load HTML - badly formed?');
        }

        $output = $this->_iterateOverNode($doc);

        // Trim horizontal whitespace around every newline. This is the reason
        // why indentation is carried through as masks until the final step.
        $output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);

        return str_replace(
            array(
                self::TAB_MASK,
                self::BULLET_MASK,
            ),
            array(
                '   ',
                '-  ',
            ),
            trim($output)
        );
    }

    /**
     * Unify newlines; in particular, \r\n becomes \n, and
     * then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
     * all become \ns.
     *
     * @param string $text text with any number of \r, \r\n and \n combinations
     * @return string the fixed text
     */
    protected function _fixNewlines($text)
    {
        // Windows newlines first - otherwise the next step would turn every
        // \r\n into two newlines.
        $text = str_replace("\r\n", "\n", $text);
        // Remaining lone \r (old Mac style) become \n as well.
        $text = str_replace("\r", "\n", $text);

        return $text;
    }

    /**
     * Iterates over all nodes, organizes breaks (vertical margins) and calls
     * the postProcessTag for each tag that is not DOMText
     *
     * The text rendered so far is passed in as $prevOutput and is always part
     * of the return value, so that a node which renders nothing never drops
     * the output of its preceding siblings.
     *
     * @param DOMNode $node
     * @param string $prevOutput Text rendered for the preceding siblings
     * @return string $prevOutput followed by the rendering of $node
     */
    protected function _iterateOverNode($node, $prevOutput = '')
    {
        if ($node instanceof DOMText) {
            // Collapse every run of whitespace (including newlines) to one space.
            return $prevOutput.preg_replace("/\\s+/im", " ", $node->wholeText);
        }

        // Doctype, comments and processing instructions carry nothing to render.
        if ($node instanceof DOMDocumentType
            || $node instanceof DOMComment
            || $node instanceof DOMProcessingInstruction
        ) {
            return $prevOutput;
        }

        $name = strtolower($node->nodeName);
        $this->_preProcessTag($node);

        // Elements whose content must never show up in the text. Return the
        // previous output untouched (returning an empty string here would
        // erase everything rendered before this element).
        if (in_array($name, array('style', 'head', 'title', 'meta', 'script', 'object'), true)) {
            return $prevOutput;
        }

        $output = '';
        for ($i = 0; $i < $node->childNodes->length; $i++) {
            $output = $this->_iterateOverNode($node->childNodes->item($i), $output);
        }

        // Elements that rendered nothing (this includes <br>, which has no
        // children) contribute neither text nor margins.
        if (!strlen($output)) {
            return $prevOutput;
        }

        $before = '';
        $after = '';
        if ($this->_isSingleBreakElement($name)) {
            $prevOutput = rtrim($prevOutput);
            $before = "\n";

            // Measure the trailing whitespace before trimming the front: when
            // the content ends with exactly two whitespace characters (as left
            // by a nested double break element) the double break is kept.
            $len = strlen($output);
            $pos = strlen(rtrim($output));
            $output = ltrim($output);
            $after .= ($len - $pos == 2) ? "\n\n" : "\n";
        } elseif ($this->_isDoubleBreakElement($name)) {
            $prevOutput = rtrim($prevOutput);
            $output = ltrim($output);
            $before = $after = "\n\n";
        }

        $this->_postProcessTag($output, $before, $after, $name, $node);

        return $prevOutput.$before.$output.$after;
    }

    /**
     * This method gets called for each node BEFORE its children are rendered
     *
     * Stores list bookkeeping in attributes of the DOM nodes themselves:
     * "rel" is the nesting depth of a list (copied to its items), "top" is
     * the running item counter of a list and "left" is the prefix (indention
     * plus number or bullet mask) an item gets in _postProcessTag().
     *
     * @param DOMNode $node
     */
    protected function _preProcessTag($node)
    {
        switch ($node->nodeName) {
            case 'ul':
            case 'ol':
                if ($node->parentNode->nodeName == 'li') {
                    // Nested list: one level deeper than the surrounding item
                    $node->setAttribute('rel', intval($node->parentNode->getAttribute('rel')) + 1);
                } else {
                    $node->setAttribute('rel', 0);
                }
                $node->setAttribute('top', 0);
                break;

            case 'li':
                $rel = intval($node->parentNode->getAttribute('rel'));
                $node->setAttribute('rel', $rel);
                $indent = str_repeat(self::TAB_MASK, $rel);
                if ($node->parentNode->nodeName == 'ol') {
                    // Count up on the parent list and use the number as prefix
                    $i = intval($node->parentNode->getAttribute('top')) + 1;
                    $node->parentNode->setAttribute('top', $i);
                    $node->setAttribute('left', $indent.$i.'. ');
                } else {
                    $node->setAttribute('left', $indent.self::BULLET_MASK);
                }
                break;
        }
    }

    /**
     * This method gets called for each element AFTER its children are rendered.
     * Used to decorate tags according to their name and influence their margin
     *
     * @param string $output The rendered content of the element (by reference)
     * @param string $before The margin put in front of $output (by reference)
     * @param string $after  The margin put behind $output (by reference)
     * @param string $name   Lower cased tag name
     * @param DOMNode $node
     */
    protected function _postProcessTag(&$output, &$before, &$after, $name, $node)
    {
        switch ($name) {
            case 'a':
                $href = $node->getAttribute("href");
                if ($href !== '') {
                    if ($href == $output) {
                        // Link text is the URL itself - do not print it twice
                        $output = '<'.$output.'>';
                    } else {
                        $output = '['.$output.']<'.$href.'>';
                    }
                }
                break;

            case 'h1': case 'h2': case 'h3':
                $output = '_*'.$output.'*_';
                break;

            case 'h4': case 'h5': case 'h6':
            case 'u':
                $output = '_'.$output.'_';
                break;

            case 'i': case 'em':
                $output = '/'.$output.'/';
                break;

            case 'b': case 'strong':
            case 'legend':
                $output = '*'.$output.'*';
                break;

            case 'big': case 'sup':
                // NOTE(review): strtoupper() works byte-wise, so non-ASCII
                // letters are presumably left as they are - confirm whether
                // that is acceptable for the expected input.
                $output = strtoupper($output);
                break;

            case 'ol':
            case 'ul':
                if ($node->parentNode->nodeName == 'li') {
                    // Nested lists get a single instead of a double break
                    $before = "\n";
                }
                break;

            case 'li':
                if ($node->getAttribute('left')) {
                    $parts = explode("\n", $output);
                    if (count($parts) > 1) {
                        // Ok - this is a li containing a nested ul or ol:
                        // we have to strip out the previous indention
                        // Probably it's better to avoid that in preProcessing but I
                        // did not get it :(
                        $parts[0] = str_replace(self::TAB_MASK, '', $parts[0]);
                        $output = implode("\n", $parts);
                    }
                }
                $output = $node->getAttribute('left').$output;
                break;

            case 'dd':
                $output = self::TAB_MASK.$output;
                break;
        }
    }

    /**
     * If this tag is a block element without vertical margin
     *
     * @param string $name The tag name
     * @return boolean
     */
    protected function _isSingleBreakElement($name)
    {
        return in_array($name, array(
            'address',
            'blockquote',
            'center',
            'dir',
            'div',
            'fieldset',
            'form',
            'legend',
            'isindex',
            'menu',
            'noscript',
            'dd', 'dt', 'li',
            'pre',
            'tr',
            'br'
        ), true);
    }

    /**
     * If this is a block element that requires vertical margin
     *
     * @param string $name The tag name
     * @return boolean
     */
    protected function _isDoubleBreakElement($name)
    {
        return in_array($name, array(
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'ol', 'ul', 'dl',
            'p',
        ), true);
    }
}

Comments

You must login before commenting on a snippet. If you do not have an account, please register.

Snippet description

Strips all HTML and applies basic formats (email compatible) - means it does basic text formatting (h1-h6,a,b,i,u,strong,em,big,sup,legend), prefixes block elements with default vertical margin (p, h1-h6, ul...) with 2 newlines and such without default margin with single newlines. When those are nesting, those with margin succeed (means max. 2 newlines between blocks). Furthermore it parses ULs and OLs and even respects nesting.

See example in class DocBlock

Snippet details

Created:
Christian Opitz Christian Opitz
1 year ago
Edited:
Christian Opitz Christian Opitz
1 year ago
Revision Id:
189
Edit Message:
Corrected Exception class
ZF Version
1.0.4
Tags:
filter text html txt
Comments:
0
Views:
1910
Points:
1 (1 votes)

History

r189

Corrected Exception class

Christian Opitz Christian Opitz
1 year ago
diff
r188

Initial Release

Christian Opitz Christian Opitz
1 year ago