PageRenderTime 248ms CodeModel.GetById 69ms app.highlight 52ms RepoModel.GetById 122ms app.codeStats 0ms

/code/ryzom/tools/server/www/webtt/vendors/simpletest/tidy_parser.php

https://bitbucket.org/SirCotare/ryzom
PHP | 382 lines | 210 code | 27 blank | 145 comment | 30 complexity | afb2062671500784ffd97908d4ea5d5c MD5 | raw file
  1<?php
  2/**
  3 *  base include file for SimpleTest
  4 *  @package    SimpleTest
  5 *  @subpackage WebTester
  6 *  @version    $Id: php_parser.php 1911 2009-07-29 16:38:04Z lastcraft $
  7 */
  8
  9/**
 10 *    Builds the page object.
 11 *    @package SimpleTest
 12 *    @subpackage WebTester
 13 */
 14class SimpleTidyPageBuilder {
 15    private $page;
 16    private $forms = array();
 17    private $labels = array();
 18    private $widgets_by_id = array();
 19
 20    public function __destruct() {
 21        $this->free();
 22    }
 23
 24    /**
 25     *    Frees up any references so as to allow the PHP garbage
 26     *    collection from unset() to work.
 27     */
 28    private function free() {
 29        unset($this->page);
 30        $this->forms = array();
 31        $this->labels = array();
 32    }
 33
 34    /**
 35     *    This builder is only available if the 'tidy' extension is loaded.
 36     *    @return boolean       True if available.
 37     */
 38    function can() {
 39        return extension_loaded('tidy');
 40    }
 41
 42    /**
 43     *    Reads the raw content the page using HTML Tidy.
 44     *    @param $response SimpleHttpResponse  Fetched response.
 45     *    @return SimplePage                   Newly parsed page.
 46     */
 47    function parse($response) {
 48        $this->page = new SimplePage($response);
 49        $tidied = tidy_parse_string($input = $this->insertGuards($response->getContent()),
 50                                    array('output-xml' => false, 'wrap' => '0', 'indent' => 'no'),
 51                                    'latin1');
 52        $this->walkTree($tidied->html());
 53        $this->attachLabels($this->widgets_by_id, $this->labels);
 54        $this->page->setForms($this->forms);
 55        $page = $this->page;
 56        $this->free();
 57        return $page;
 58    }
 59
 60    /**
 61     *    Stops HTMLTidy stripping content that we wish to preserve.
 62     *    @param string      The raw html.
 63     *    @return string     The html with guard tags inserted.
 64     */
 65    private function insertGuards($html) {
 66        return $this->insertEmptyTagGuards($this->insertTextareaSimpleWhitespaceGuards($html));
 67    }
 68
 69    /**
 70     *    Removes the extra content added during the parse stage
 71     *    in order to preserve content we don't want stripped
 72     *    out by HTMLTidy.
 73     *    @param string      The raw html.
 74     *    @return string     The html with guard tags removed.
 75     */
 76    private function stripGuards($html) {
 77        return $this->stripTextareaWhitespaceGuards($this->stripEmptyTagGuards($html));
 78    }
 79
 80    /**
 81     *    HTML tidy strips out empty tags such as <option> which we
 82     *    need to preserve. This method inserts an additional marker.
 83     *    @param string      The raw html.
 84     *    @return string     The html with guards inserted.
 85     */
 86    private function insertEmptyTagGuards($html) {
 87        return preg_replace('#<(option|textarea)([^>]*)>(\s*)</(option|textarea)>#is',
 88                            '<\1\2>___EMPTY___\3</\4>',
 89                            $html);
 90    }
 91
 92    /**
 93     *    HTML tidy strips out empty tags such as <option> which we
 94     *    need to preserve. This method strips additional markers
 95     *    inserted by SimpleTest to the tidy output used to make the
 96     *    tags non-empty. This ensures their preservation.
 97     *    @param string      The raw html.
 98     *    @return string     The html with guards removed.
 99     */
100    private function stripEmptyTagGuards($html) {
101        return preg_replace('#(^|>)(\s*)___EMPTY___(\s*)(</|$)#i', '\2\3', $html);
102    }
103
104    /**
105     *    By parsing the XML output of tidy, we lose some whitespace
106     *    information in textarea tags. We temporarily recode this
107     *    data ourselves so as not to lose it.
108     *    @param string      The raw html.
109     *    @return string     The html with guards inserted.
110     */
111    private function insertTextareaSimpleWhitespaceGuards($html) {
112        return preg_replace_callback('#<textarea([^>]*)>(.*?)</textarea>#is',
113                                     array($this, 'insertWhitespaceGuards'),
114                                     $html);
115    }
116
117    /**
118     *  Callback for insertTextareaSimpleWhitespaceGuards().
119     *  @param array $matches       Result of preg_replace_callback().
120     *  @return string              Guard tags now replace whitespace.
121     */
122    private function insertWhitespaceGuards($matches) {
123        return '<textarea' . $matches[1] . '>' .
124                str_replace(array("\n", "\r", "\t", ' '),
125                            array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
126                            $matches[2]) .
127                '</textarea>';
128    }
129
130    /**
131     *    Removes the whitespace preserving guards we added
132     *    before parsing.
133     *    @param string      The raw html.
134     *    @return string     The html with guards removed.
135     */
136    private function stripTextareaWhitespaceGuards($html) {
137        return str_replace(array('___NEWLINE___', '___CR___', '___TAB___', '___SPACE___'),
138                           array("\n", "\r", "\t", ' '),
139                           $html);
140    }
141
142    /**
143     *  Visits the given node and all children
144     *  @param object $node      Tidy XML node.
145     */
146    private function walkTree($node) {
147        if ($node->name == 'a') {
148            $this->page->addLink($this->tags()->createTag($node->name, (array)$node->attribute)
149                                        ->addContent($this->innerHtml($node)));
150        } elseif ($node->name == 'base' and isset($node->attribute['href'])) {
151            $this->page->setBase($node->attribute['href']);
152        } elseif ($node->name == 'title') {
153            $this->page->setTitle($this->tags()->createTag($node->name, (array)$node->attribute)
154                                         ->addContent($this->innerHtml($node)));
155        } elseif ($node->name == 'frameset') {
156            $this->page->setFrames($this->collectFrames($node));
157        } elseif ($node->name == 'form') {
158            $this->forms[] = $this->walkForm($node, $this->createEmptyForm($node));
159        } elseif ($node->name == 'label') {
160            $this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
161                                           ->addContent($this->innerHtml($node));
162        } else {
163            $this->walkChildren($node);
164        }
165    }
166
167    /**
168     *  Helper method for traversing the XML tree.
169     *  @param object $node     Tidy XML node.
170     */
171    private function walkChildren($node) {
172        if ($node->hasChildren()) {
173            foreach ($node->child as $child) {
174                $this->walkTree($child);
175            }
176        }
177    }
178
179    /**
180     *  Facade for forms containing preparsed widgets.
181     *  @param object $node     Tidy XML node.
182     *  @return SimpleForm      Facade for SimpleBrowser.
183     */
184    private function createEmptyForm($node) {
185        return new SimpleForm($this->tags()->createTag($node->name, (array)$node->attribute), $this->page);
186    }
187
188    /**
189     *  Visits the given node and all children
190     *  @param object $node      Tidy XML node.
191     */
192    private function walkForm($node, $form, $enclosing_label = '') {
193        if ($node->name == 'a') {
194            $this->page->addLink($this->tags()->createTag($node->name, (array)$node->attribute)
195                                              ->addContent($this->innerHtml($node)));
196        } elseif (in_array($node->name, array('input', 'button', 'textarea', 'select'))) {
197            $this->addWidgetToForm($node, $form, $enclosing_label);
198        } elseif ($node->name == 'label') {
199            $this->labels[] = $this->tags()->createTag($node->name, (array)$node->attribute)
200                                           ->addContent($this->innerHtml($node));
201            if ($node->hasChildren()) {
202                foreach ($node->child as $child) {
203                    $this->walkForm($child, $form, SimplePage::normalise($this->innerHtml($node)));
204                }
205            }
206        } elseif ($node->hasChildren()) {
207            foreach ($node->child as $child) {
208                $this->walkForm($child, $form);
209            }
210        }
211        return $form;
212    }
213
214    /**
215     *  Tests a node for a "for" atribute. Used for
216     *  attaching labels.
217     *  @param object $node      Tidy XML node.
218     *  @return boolean          True if the "for" attribute exists.
219     */
220    private function hasFor($node) {
221        return isset($node->attribute) and $node->attribute['for'];
222    }
223
224    /**
225     *  Adds the widget into the form container.
226     *  @param object $node             Tidy XML node of widget.
227     *  @param SimpleForm $form         Form to add it to.
228     *  @param string $enclosing_label  The label of any label
229     *                                  tag we might be in.
230     */
231    private function addWidgetToForm($node, $form, $enclosing_label) {
232        $widget = $this->tags()->createTag($node->name, $this->attributes($node));
233        if (! $widget) {
234            return;
235        }
236        $widget->setLabel($enclosing_label)
237               ->addContent($this->innerHtml($node));
238        if ($node->name == 'select') {
239            $widget->addTags($this->collectSelectOptions($node));
240        }
241        $form->addWidget($widget);
242        $this->indexWidgetById($widget);
243    }
244
245    /**
246     *  Fills the widget cache to speed up searching.
247     *  @param SimpleTag $widget    Parsed widget to cache.
248     */
249    private function indexWidgetById($widget) {
250        $id = $widget->getAttribute('id');
251        if (! $id) {
252            return;
253        }
254        if (! isset($this->widgets_by_id[$id])) {
255            $this->widgets_by_id[$id] = array();
256        }
257        $this->widgets_by_id[$id][] = $widget;
258    }
259
260    /**
261     *  Parses the options from inside an XML select node.
262     *  @param object $node      Tidy XML node.
263     *  @return array            List of SimpleTag options.
264     */
265    private function collectSelectOptions($node) {
266        $options = array();
267        if ($node->name == 'option') {
268            $options[] = $this->tags()->createTag($node->name, $this->attributes($node))
269                                      ->addContent($this->innerHtml($node));
270        }
271        if ($node->hasChildren()) {
272            foreach ($node->child as $child) {
273                $options = array_merge($options, $this->collectSelectOptions($child));
274            }
275        }
276        return $options;
277    }
278
279    /**
280     *  Convenience method for collecting all the attributes
281     *  of a tag. Not sure why Tidy does not have this.
282     *  @param object $node      Tidy XML node.
283     *  @return array            Hash of attribute strings.
284     */
285    private function attributes($node) {
286        if (! preg_match('|<[^ ]+\s(.*?)/?>|s', $node->value, $first_tag_contents)) {
287            return array();
288        }
289        $attributes = array();
290        preg_match_all('/\S+\s*=\s*\'[^\']*\'|(\S+\s*=\s*"[^"]*")|([^ =]+\s*=\s*[^ "\']+?)|[^ "\']+/', $first_tag_contents[1], $matches);
291        foreach($matches[0] as $unparsed) {
292            $attributes = $this->mergeAttribute($attributes, $unparsed);
293        }
294        return $attributes;
295    }
296
297    /**
298     *  Overlay an attribute into the attributes hash.
299     *  @param array $attributes        Current attribute list.
300     *  @param string $raw              Raw attribute string with
301     *                                  both key and value.
302     *  @return array                   New attribute hash.
303     */
304    private function mergeAttribute($attributes, $raw) {
305        $parts = explode('=', $raw);
306        list($name, $value) = count($parts) == 1 ? array($parts[0], $parts[0]) : $parts;
307        $attributes[trim($name)] = html_entity_decode($this->dequote(trim($value)), ENT_QUOTES);
308        return $attributes;
309    }
310
311    /**
312     *  Remove start and end quotes.
313     *  @param string $quoted    A quoted string.
314     *  @return string           Quotes are gone.
315     */
316    private function dequote($quoted) {
317        if (preg_match('/^(\'([^\']*)\'|"([^"]*)")$/', $quoted, $matches)) {
318            return isset($matches[3]) ? $matches[3] : $matches[2];
319        }
320        return $quoted;
321    }
322
323    /**
324     *  Collects frame information inside a frameset tag.
325     *  @param object $node     Tidy XML node.
326     *  @return array           List of SimpleTag frame descriptions.
327     */
328    private function collectFrames($node) {
329        $frames = array();
330        if ($node->name == 'frame') {
331            $frames = array($this->tags()->createTag($node->name, (array)$node->attribute));
332        } else if ($node->hasChildren()) {
333            $frames = array();
334            foreach ($node->child as $child) {
335                $frames = array_merge($frames, $this->collectFrames($child));
336            }
337        }
338        return $frames;
339    }
340
341    /**
342     *  Extracts the XML node text.
343     *  @param object $node     Tidy XML node.
344     *  @return string          The text only.
345     */
346    private function innerHtml($node) {
347        $raw = '';
348        if ($node->hasChildren()) {
349            foreach ($node->child as $child) {
350                $raw .= $child->value;
351            }
352        }
353        return $this->stripGuards($raw);
354    }
355
356    /**
357     *  Factory for parsed content holders.
358     *  @return SimpleTagBuilder    Factory.
359     */
360    private function tags() {
361        return new SimpleTagBuilder();
362    }
363
364    /**
365     *  Called at the end of a parse run. Attaches any
366     *  non-wrapping labels to their form elements.
367     *  @param array $widgets_by_id     Cached SimpleTag hash.
368     *  @param array $labels            SimpleTag label elements.
369     */
370    private function attachLabels($widgets_by_id, $labels) {
371        foreach ($labels as $label) {
372            $for = $label->getFor();
373            if ($for and isset($widgets_by_id[$for])) {
374                $text = $label->getText();
375                foreach ($widgets_by_id[$for] as $widget) {
376                    $widget->setLabel($text);
377                }
378            }
379        }
380    }
381}
382?>