mediawiki_parser.class.php

/common/libraries/plugin/wiki/mediawiki_parser.class.php

Large files are truncated, but you can click here to view the full file

<?php

require_once dirname(__FILE__) . '/mediawiki/Utilities.php';

require_once dirname(__FILE__) . '/mediawiki/Sanitizer.php';

require_once dirname(__FILE__) . '/mediawiki/StringUtils.php';

require_once dirname(__FILE__) . '/mediawiki/Xml.php';

require_once dirname(__FILE__) . '/mediawiki/StripState.php';

require_once dirname(__FILE__) . '/mediawiki/Title.php';

require_once dirname(__FILE__) . '/mediawiki/LinkHolderArray.php';

require_once dirname(__FILE__) . '/mediawiki/Linker.php';

require_once dirname(__FILE__) . '/mediawiki/LinkCache.php';

require_once dirname(__FILE__) . '/mediawiki/Defines.php';

require_once dirname(__FILE__) . '/mediawiki/ParserOutput.php';

require_once dirname(__FILE__) . '/mediawiki/Namespace.php';

require_once dirname(__FILE__) . '/mediawiki_parser_context.class.php';



/**
 * Build a regex alternation matching the external URL protocols supported by
 * the parser.
 *
 * Each protocol is escaped with preg_quote() for use inside a '/'-delimited
 * pattern and the escaped protocols are joined with '|'. Typical use:
 * '/^(?:' . wfUrlProtocols() . ')/'.
 *
 * @return string Regex fragment (without delimiters or grouping)
 */
function wfUrlProtocols()
{
    // The external URL protocols
    $wgUrlProtocols = array(
        'http://',
        'https://',
        'ftp://',
        'irc://',
        'gopher://',
        'telnet://', // Well if we're going to support the above.. -Ævar
        'nntp://', // @bug 3808 RFC 1738
        'worldwind://',
        'mailto:',
        'news:',
        'svn://');

    // The list above is a local literal, so it is always an array; the former
    // fallback for old-style string settings could never be reached.
    $protocols = array();
    foreach ($wgUrlProtocols as $protocol)
    {
        $protocols[] = preg_quote($protocol, '/');
    }

    return implode('|', $protocols);
}



/**
 * URL-encode a string, but keep a few punctuation characters readable.
 *
 * The string is first passed through urlencode(); afterwards the escape
 * sequences for ; : @ $ ! * ( ) , and / are turned back into the literal
 * characters (matched case-insensitively).
 *
 * @param string $s String to encode
 * @return string Encoded string
 */
function wfUrlencode($s)
{
    $escaped = array('%3B', '%3A', '%40', '%24', '%21', '%2A', '%28', '%29', '%2C', '%2F');
    $literal = array(';', ':', '@', '$', '!', '*', '(', ')', ',', '/');

    return str_ireplace($escaped, $literal, urlencode($s));
}



/**
 * This is the logical opposite of wfArrayToCGI(): it accepts a query string as
 * its argument and returns the same data in array form. This allows
 * compatibility with legacy functions that accept raw query strings instead of
 * nice arrays. Keys and values are urldecode()d.
 *
 * A single leading '?' is ignored, empty pieces ("a=1&&b=2") are skipped, a
 * piece without '=' ("&flag") maps to an empty string, and only the first '='
 * of a piece separates key from value, so "a=b=c" yields 'a' => 'b=c'.
 *
 * @param $query string Query string
 * @return array Array version of input
 */
function wfCgiToArray($query)
{
    if (isset($query[0]) && $query[0] === '?')
    {
        $query = substr($query, 1);
    }

    $ret = array();
    foreach (explode('&', $query) as $bit)
    {
        if ($bit === '')
        {
            continue;
        }

        if (strpos($bit, '=') === false)
        {
            // A bare key carries no value; treat it as an empty string
            // instead of reading a list() element that does not exist.
            $key = $bit;
            $value = '';
        }
        else
        {
            // Limit of 2: everything after the first '=' belongs to the value.
            list($key, $value) = explode('=', $bit, 2);
        }

        $ret[urldecode($key)] = urldecode($value);
    }

    return $ret;
}



/**
 * This function takes two arrays as input, and returns a CGI-style string, e.g.
 * "days=7&limit=100". Options in the first array override options in the second.
 * Options set to "" will not be output.
 *
 * An array value is expanded to one "key[]=value" pair per element; an empty
 * array value therefore contributes nothing (and no stray '&').
 *
 * @param array $array1 Options that take precedence
 * @param array|null $array2 Optional fallback options
 * @return string URL-encoded query string without a leading '?'
 */
function wfArrayToCGI($array1, $array2 = NULL)
{
    if (! is_null($array2))
    {
        // Array union: keys already present in $array1 win.
        $array1 = $array1 + $array2;
    }

    // Collect complete "name=value" pairs and join them once at the end, so a
    // value that produces no pair cannot leave a dangling separator behind.
    $pairs = array();
    foreach ($array1 as $key => $value)
    {
        if ('' === $value)
        {
            continue;
        }

        if (is_array($value))
        {
            $name = urlencode($key . '[]');
            foreach ($value as $v)
            {
                $pairs[] = $name . '=' . urlencode((string) $v);
            }
        }
        else
        {
            $pairs[] = urlencode((string) $key) . '=' . urlencode((string) $value);
        }
    }

    return implode('&', $pairs);
}



/**
 * Attach a query string to a URL.
 *
 * When the URL contains no '?' yet, the query is appended after a '?';
 * otherwise it is appended after an '&'. An empty query leaves the URL
 * untouched.
 *
 * @param string $url
 * @param string $query Query string without leading '?' or '&'
 * @return string
 */
function wfAppendQuery($url, $query)
{
    if ($query == '')
    {
        return $url;
    }

    $separator = (false === strpos($url, '?')) ? '?' : '&';

    return $url . $separator . $query;
}



/**

 * A Mediawiki wikitext parser using the same functions

 * as used by Mediawiki's parsing engine

 *

 * @author Hans De Bisschop

 * @see Parser

 *

 */

class MediawikiParser

{

    // State constants for the definition list colon extraction
    // NOTE(review): none of these is referenced in the part of the class visible
    // here; presumably they drive the ';' / ':' definition-list splitting further
    // down the file -- confirm.

    const COLON_STATE_TEXT = 0;

    const COLON_STATE_TAG = 1;

    const COLON_STATE_TAGSTART = 2;

    const COLON_STATE_CLOSETAG = 3;

    const COLON_STATE_TAGSLASH = 4;

    const COLON_STATE_COMMENT = 5;

    const COLON_STATE_COMMENTDASH = 6;

    const COLON_STATE_COMMENTDASHDASH = 7;

    

    // NOTE(review): presumably the closing part of placeholder markers that start
    // with $mUniqPrefix -- confirm against the code that builds the markers.
    const MARKER_SUFFIX = "-QINU\x7f";

    

    // NOTE(review): version string; unclear from here whether it denotes this
    // port or the upstream parser it was taken from -- confirm.
    const VERSION = '1.6.4';

    

    // Flags for preprocessToDom

    const PTD_FOR_INCLUSION = 1;

    

    // Per-instance unique prefix; the constructor sets it to "\x7fUNIQ" followed
    // by the hexadecimal string returned by getRandomString().
    private $mUniqPrefix;

    /**

     * The context of the MediawikiParser

     *

     * @var MediawikiParserContext

     */

    private $mediawiki_parser_context;



    /**
     * Create a parser for the given context and initialise its working state:
     * a fresh unique prefix, link counter, output object, strip state and
     * link holder array.
     *
     * @param MediaWikiParserContext $mediawiki_parser_context Context whose body
     *        is later read by parse()
     */
    function __construct(MediaWikiParserContext $mediawiki_parser_context)
    {
        $this->mediawiki_parser_context = $mediawiki_parser_context;

        // "\x7fUNIQ" plus a random hex string.
        $randomPart = self :: getRandomString();
        $this->mUniqPrefix = "\x7fUNIQ" . $randomPart;

        $this->mLinkID = 0;

        $this->mOutput = new MediawikiParserOutput();
        $this->mStripState = new MediawikiStripState();

        // The link holder array receives this parser instance.
        $this->mLinkHolders = new MediawikiLinkHolderArray($this);
    }



    /**
     * Return the context object this parser was constructed with.
     *
     * @return MediawikiParserContext
     */
    function get_mediawiki_parser_context()
    {
        $context = $this->mediawiki_parser_context;

        return $context;
    }



    /**
     * Produce a short pseudo-random hexadecimal string.
     *
     * Two mt_rand() draws from the range 0..0x7fffffff are converted to
     * hexadecimal and concatenated, which gives between 2 and 16 lowercase hex
     * digits. mt_rand() is not a cryptographic generator, so the result must not
     * be used where unpredictability matters.
     *
     * Declared as a plain instance method; the constructor invokes it as
     * self :: getRandomString().
     *
     * @return string
     */
    function getRandomString()
    {
        $first = dechex(mt_rand(0, 0x7fffffff));
        $second = dechex(mt_rand(0, 0x7fffffff));

        return $first . $second;
    }



    /**
     * Convert the wikitext body of the parser context into HTML.
     *
     * Steps, in order: internalParse() (tables, rules, headings, quotes, links),
     * a clean-up of special characters (French spacing), block-level processing
     * via doBlockLevels(), and finally replacement of the link placeholders.
     *
     * @return string The rendered text
     */
    function parse()
    {
        $text = $this->mediawiki_parser_context->get_body();
        $text = $this->internalParse($text);

        // Clean up special characters, only run once, next-to-last before doBlockLevels
        $fixtags = array(
            // French spaces: a space before ? : ; ! % or a right guillemet
            // (bytes \302\273) becomes &nbsp;, but only if there is something
            // before the space. The lookahead captures nothing, so the
            // replacement uses group 1 only (the former "\2" referred to a
            // group that does not exist and always expanded to nothing).
            '/(.) (?=\\?|:|;|!|%|\\302\\273)/' => '\\1&nbsp;',
            // French spaces: the space after a left guillemet (bytes \302\253)
            '/(\\302\\253) /' => '\\1&nbsp;',
            // Beware of CSS magic word !important, bug #11874.
            '/&nbsp;(!\s*important)/' => ' \\1');

        $text = preg_replace(array_keys($fixtags), array_values($fixtags), $text);

        // The body starts at the beginning of a line. With a false/undefined
        // $linestart, doBlockLevels() copies the first line through untouched and
        // without a trailing newline, so a list or paragraph on line one would
        // not be rendered.
        $linestart = true;
        $text = $this->doBlockLevels($text, $linestart);

        $this->replaceLinkHolders($text);

        return $text;
    }



    /**
     * Replace the link placeholders in the buffer with actual links.
     *
     * The work is delegated to the link holder array created in the
     * constructor. $text is passed on by reference; parse() ignores the return
     * value and keeps using $text, so the replacement presumably happens in
     * place (confirm in MediawikiLinkHolderArray::replace()).
     *
     * @param string $text Buffer containing link placeholders
     * @param int $options Unused
     * @return mixed Whatever MediawikiLinkHolderArray::replace() returns
     */
    function replaceLinkHolders(&$text, $options = 0)
    {
        $result = $this->mLinkHolders->replace($text);

        return $result;
    }



    /**
     * Apply the transformations that precede block-level processing, in this
     * order: tables, horizontal rules, headings, bold/italic quotes, internal
     * links and heading formatting.
     *
     * Not active in this port (they existed only as commented-out calls): HTML
     * tag sanitising, double-underscore handling, dynamic date reformatting,
     * external link replacement (including the removal of the NOPARSE mask that
     * replaceInternalLinks may leave behind) and magic links.
     *
     * @param string $text Wikitext
     * @return string Partially rendered text
     */
    function internalParse($text)
    {
        // Tables need to come after variable replacement for things to work
        // properly; putting them before other transformations should keep
        // exciting things like link expansions from showing up in surprising
        // places.
        $text = $this->doTableStuff($text);

        // Four or more dashes at the start of the text or of a line become a
        // horizontal rule.
        $text = preg_replace('/(^|\n)-----*/', '\\1<hr />', $text);

        $text = $this->doHeadings($text);
        $text = $this->doAllQuotes($text);
        $text = $this->replaceInternalLinks($text);

        $isMain = true;

        return $this->formatHeadings($text, $isMain);
    }



    /**
     * Translate wiki table markup into HTML table markup.
     *
     * The text is handled line by line. Lines outside any table are copied
     * through unchanged. Inside a table the following line starts are
     * recognised (after trimming the line):
     *  - ":*{|"  opens a table; every leading ':' adds one <dl><dd> wrapper
     *  - "|}"    closes the innermost table
     *  - "|-"    starts a new row; the rest of the line holds the row attributes
     *  - "|+"    caption, "|" data cells, "!" header cells; several cells may
     *            share a line, separated by "||" (or "!!" on header lines)
     * Any other line inside a table is copied through as cell content. Tables
     * still open when the text ends are closed automatically.
     *
     * @private
     * @param string $text Wikitext
     * @return string Text with the table markup replaced by HTML
     */
    function doTableStuff($text)
    {
        $out = '';

        // Parallel stacks holding one entry per currently open (nested) table.
        $td_history = array(); // Is a cell tag currently open?
        $last_tag_history = array(); // Last cell tag opened (td, th or caption)
        $tr_history = array(); // Is a tr tag currently open?
        $tr_attributes = array(); // Attributes waiting for the next <tr>
        $has_opened_tr = array(); // Did this table open a <tr> element at all?

        // A single value, not a stack: always the indent of the table opened last.
        $indent_level = 0;

        $cell_tags = array('|' => 'td', '!' => 'th', '+' => 'caption');

        foreach (MediawikiStringUtils :: explode("\n", $text) as $outLine)
        {
            $line = trim($outLine);

            if ($line == '')
            {
                // Empty line: copy it through and go to the next one
                $out .= $outLine . "\n";
                continue;
            }

            $first_character = $line[0];
            $first_two = substr($line, 0, 2);
            $matches = array();

            if (preg_match('/^(:*)\{\|(.*)$/', $line, $matches))
            {
                // A new table starts here
                $indent_level = strlen($matches[1]);

                $attributes = $this->mStripState->unstripBoth($matches[2]);
                $attributes = MediawikiSanitizer :: fixTagAttributes($attributes, 'table');

                $outLine = str_repeat('<dl><dd>', $indent_level) . "<table{$attributes}>";

                $td_history[] = false;
                $last_tag_history[] = '';
                $tr_history[] = false;
                $tr_attributes[] = '';
                $has_opened_tr[] = false;
            }
            elseif (count($td_history) == 0)
            {
                // Not inside a table: nothing below applies
                $out .= $outLine . "\n";
                continue;
            }
            elseif ($first_two === '|}')
            {
                // End of the innermost table. The closing tags are prepended
                // from the inside out: cell, row, (placeholder row), table.
                $closing = '</table>' . substr($line, 2);
                $open_tag = array_pop($last_tag_history);

                if (! array_pop($has_opened_tr))
                {
                    // A table without any row gets one empty row
                    $closing = "<tr><td></td></tr>{$closing}";
                }
                if (array_pop($tr_history))
                {
                    $closing = "</tr>{$closing}";
                }
                if (array_pop($td_history))
                {
                    $closing = "</{$open_tag}>{$closing}";
                }
                array_pop($tr_attributes);

                $outLine = $closing . str_repeat('</dd></dl>', $indent_level);
            }
            elseif ($first_two === '|-')
            {
                // Table row: what follows the dashes is only attributes, which
                // are remembered until the first cell actually opens the <tr>
                $line = preg_replace('#^\|-+#', '', $line);

                $attributes = $this->mStripState->unstripBoth($line);
                $attributes = MediawikiSanitizer :: fixTagAttributes($attributes, 'tr');
                array_pop($tr_attributes);
                $tr_attributes[] = $attributes;

                $closing = '';
                $open_tag = array_pop($last_tag_history);
                array_pop($has_opened_tr);
                $has_opened_tr[] = true;

                if (array_pop($tr_history))
                {
                    $closing = '</tr>';
                }
                if (array_pop($td_history))
                {
                    $closing = "</{$open_tag}>{$closing}";
                }

                $outLine = $closing;
                $tr_history[] = false;
                $td_history[] = false;
                $last_tag_history[] = '';
            }
            elseif ($first_character === '|' || $first_character === '!' || $first_two === '|+')
            {
                // Cell elements: td, th or caption
                if ($first_two === '|+')
                {
                    $first_character = '+';
                    $line = substr($line, 1);
                }

                $line = substr($line, 1);

                if ($first_character === '!')
                {
                    $line = str_replace('!!', '||', $line);
                }

                // Split up multiple cells on the same line.
                // FIXME : This can result in improper nesting of tags processed
                // by earlier parser steps, but should avoid splitting up eg
                // attribute values containing literal "||".
                $cells = MediawikiStringUtils :: explodeMarkup('||', $line);

                $outLine = '';

                foreach ($cells as $cell)
                {
                    $previous = '';

                    if ($first_character !== '+')
                    {
                        // Data and header cells need an open row
                        $tr_after = array_pop($tr_attributes);
                        if (! array_pop($tr_history))
                        {
                            $previous = "<tr{$tr_after}>\n";
                        }
                        $tr_history[] = true;
                        $tr_attributes[] = '';
                        array_pop($has_opened_tr);
                        $has_opened_tr[] = true;
                    }

                    // Close the cell that is still open, if any
                    $open_tag = array_pop($last_tag_history);
                    if (array_pop($td_history))
                    {
                        $previous = "</{$open_tag}>{$previous}";
                    }

                    $last_tag = isset($cell_tags[$first_character]) ? $cell_tags[$first_character] : '';
                    $last_tag_history[] = $last_tag;

                    // A cell could contain both parameters and data
                    $cell_data = explode('|', $cell, 2);

                    if (strpos($cell_data[0], '[[') !== false)
                    {
                        // Bug 553: Note that a '|' inside an invalid link should
                        // not be mistaken as delimiting cell parameters
                        $rendered = "{$previous}<{$last_tag}>{$cell}";
                    }
                    elseif (count($cell_data) == 1)
                    {
                        $rendered = "{$previous}<{$last_tag}>{$cell_data[0]}";
                    }
                    else
                    {
                        $attributes = $this->mStripState->unstripBoth($cell_data[0]);
                        $attributes = MediawikiSanitizer :: fixTagAttributes($attributes, $last_tag);
                        $rendered = "{$previous}<{$last_tag}{$attributes}>{$cell_data[1]}";
                    }

                    $outLine .= $rendered;
                    $td_history[] = true;
                }
            }

            $out .= $outLine . "\n";
        }

        // Close whatever td, tr and table are still open
        while (count($td_history) > 0)
        {
            if (array_pop($td_history))
            {
                $out .= "</td>\n";
            }
            if (array_pop($tr_history))
            {
                $out .= "</tr>\n";
            }
            if (! array_pop($has_opened_tr))
            {
                $out .= "<tr><td></td></tr>\n";
            }

            $out .= "</table>\n";
        }

        // Remove trailing line-ending (b/c)
        if (substr($out, - 1) === "\n")
        {
            $out = substr($out, 0, - 1);
        }

        // Special case: don't return an empty table
        if ($out === "<table>\n<tr><td></td></tr>\n</table>")
        {
            $out = '';
        }

        return $out;
    }



    /**
     * Turn "= Heading =" style lines into HTML headings.
     *
     * Levels are handled from six '=' signs down to one, so that the longest
     * run of '=' on a line determines the heading level. The text between the
     * markers is kept as is, including surrounding spaces.
     *
     * @private
     * @param string $text Wikitext
     * @return string Text with <h1>..<h6> elements
     */
    function doHeadings($text)
    {
        foreach (range(6, 1) as $level)
        {
            $marks = str_repeat('=', $level);
            $pattern = '/^' . $marks . '(.+)' . $marks . '\\s*$/m';
            $replacement = "<h{$level}>\\1</h{$level}>";

            $text = preg_replace($pattern, $replacement, $text);
        }

        return $text;
    }



    /**
     * Convert apostrophe markup ('' and ''') to HTML, one line at a time.
     *
     * Every line of the text is passed through doQuotes() and the results are
     * joined with newlines again.
     *
     * @private
     * @param string $text Wikitext
     * @return string the altered text
     */
    function doAllQuotes($text)
    {
        $converted = array();
        foreach (MediawikiStringUtils :: explode("\n", $text) as $line)
        {
            $converted[] = $this->doQuotes($line);
        }

        return implode("\n", $converted);
    }



    /**
     * Helper function for doAllQuotes(): convert the apostrophe markup of a
     * single line into <i> and <b> elements.
     *
     * Two apostrophes toggle italics, three toggle bold and five toggle both.
     * Four apostrophes count as one literal apostrophe plus bold; more than five
     * count as literal apostrophes plus a five-apostrophe marker. When a line
     * holds an odd number of both bold and italic markers, one bold marker is
     * reinterpreted as an apostrophe followed by italics. Tags still open at
     * the end of the line are closed.
     *
     * @param string $text One line of wikitext
     * @return string The line with apostrophe markup replaced by HTML
     */
    public function doQuotes($text)
    {
        // Even indexes hold plain text, odd indexes hold the apostrophe runs.
        $arr = preg_split("/(''+)/", $text, - 1, PREG_SPLIT_DELIM_CAPTURE);
        if (count($arr) == 1)
        {
            return $text;
        }
        $total = count($arr);

        # First, do some preliminary work. This may shift some apostrophes from
        # being mark-up to being text. It also counts the number of occurrences
        # of bold and italics mark-ups.
        $numbold = 0;
        $numitalics = 0;
        for ($i = 1; $i < $total; $i += 2)
        {
            $length = strlen($arr[$i]);
            if ($length == 4)
            {
                # Four apostrophes: the first is text, the remaining three
                # constitute mark-up for bold text.
                $arr[$i - 1] .= "'";
                $arr[$i] = "'''";
            }
            elseif ($length > 5)
            {
                # More than five apostrophes: all are text except the last five.
                $arr[$i - 1] .= str_repeat("'", $length - 5);
                $arr[$i] = "'''''";
            }

            # A run of five counts as both one bold and one italics marker.
            $length = strlen($arr[$i]);
            if ($length == 2)
            {
                $numitalics ++;
            }
            elseif ($length == 3)
            {
                $numbold ++;
            }
            elseif ($length == 5)
            {
                $numitalics ++;
                $numbold ++;
            }
        }

        # If there is an odd number of both bold and italics, it is likely
        # that one of the bold ones was meant to be an apostrophe followed
        # by italics. Which one we cannot know for certain, but it is more
        # likely to be one that has a single-letter word before it.
        if (($numbold % 2 == 1) && ($numitalics % 2 == 1))
        {
            $firstsingleletterword = - 1;
            $firstmultiletterword = - 1;
            $firstspace = - 1;
            for ($i = 1; $i < $total; $i += 2)
            {
                if (strlen($arr[$i]) != 3)
                {
                    continue;
                }

                $x1 = substr($arr[$i - 1], - 1);
                $x2 = substr($arr[$i - 1], - 2, 1);
                if ($x1 === ' ')
                {
                    if ($firstspace == - 1)
                    {
                        $firstspace = $i;
                    }
                }
                elseif ($x2 === ' ')
                {
                    if ($firstsingleletterword == - 1)
                    {
                        $firstsingleletterword = $i;
                    }
                }
                elseif ($firstmultiletterword == - 1)
                {
                    $firstmultiletterword = $i;
                }
            }

            # Preference: after a single-letter word, then after a multi-letter
            # word, then after a space. All three can be -1 if, for example,
            # there is only one pentuple-apostrophe in the line.
            if ($firstsingleletterword > - 1)
            {
                $demote = $firstsingleletterword;
            }
            elseif ($firstmultiletterword > - 1)
            {
                $demote = $firstmultiletterword;
            }
            else
            {
                $demote = $firstspace;
            }

            if ($demote > - 1)
            {
                $arr[$demote] = "''";
                $arr[$demote - 1] .= "'";
            }
        }

        # Now let's actually convert our apostrophic mush to HTML!
        # $state names the open tags in opening order ('', 'i', 'b', 'bi', 'ib');
        # 'both' means a five-apostrophe marker was seen and the text is buffered
        # until the next marker decides the nesting order.
        # Table: marker length => current state => (markup to emit, next state).
        $transitions = array(
            2 => array(
                'i' => array('</i>', ''),
                'bi' => array('</i>', 'b'),
                'ib' => array('</b></i><b>', 'b')),
            3 => array(
                'b' => array('</b>', ''),
                'bi' => array('</i></b><i>', 'i'),
                'ib' => array('</b>', 'i')),
            5 => array(
                'b' => array('</b><i>', 'i'),
                'i' => array('</i><b>', 'b'),
                'bi' => array('</i></b>', ''),
                'ib' => array('</b></i>', '')));

        $output = '';
        $buffer = '';
        $state = '';
        foreach ($arr as $i => $r)
        {
            if (($i % 2) == 0)
            {
                // Plain text: buffered while the nesting order is undecided
                if ($state === 'both')
                {
                    $buffer .= $r;
                }
                else
                {
                    $output .= $r;
                }
                continue;
            }

            $length = strlen($r);
            if (! isset($transitions[$length]))
            {
                continue;
            }

            if (isset($transitions[$length][$state]))
            {
                list($markup, $state) = $transitions[$length][$state];
                $output .= $markup;
            }
            elseif ($state === 'both')
            {
                // The marker closing first decides which element is the inner one
                if ($length == 2)
                {
                    $output .= '<b><i>' . $buffer . '</i>';
                    $state = 'b';
                }
                elseif ($length == 3)
                {
                    $output .= '<i><b>' . $buffer . '</b>';
                    $state = 'i';
                }
                else
                {
                    $output .= '<i><b>' . $buffer . '</b></i>';
                    $state = '';
                }
            }
            elseif ($length == 2) # $state can be 'b' or ''
            {
                $output .= '<i>';
                $state .= 'i';
            }
            elseif ($length == 3) # $state can be 'i' or ''
            {
                $output .= '<b>';
                $state .= 'b';
            }
            else # five apostrophes with $state == ''
            {
                $buffer = '';
                $state = 'both';
            }
        }

        # Now close all remaining tags.  Notice that the order is important.
        $closers = array('b' => '</b>', 'ib' => '</b></i>', 'i' => '</i>', 'bi' => '</i></b>');
        if (isset($closers[$state]))
        {
            $output .= $closers[$state];
        }

        # There might be lonely ''''', so make sure we have a buffer. Compare
        # against '' explicitly: a truthiness test would drop the text "0".
        if ($state === 'both' && $buffer !== '')
        {
            $output .= '<b><i>' . $buffer . '</i></b>';
        }

        return $output;
    }



    /**
     * Make lists from lines starting with ':', '*', '#', etc. (DBL)
     *
     * Walks the text line by line. Lines that start with list prefix
     * characters are turned into list markup; all other lines go through
     * paragraph / <pre> handling. The method reads and updates
     * $this->mInPre, $this->mDTopen and $this->mLastSection while it runs.
     *
     * @private
     * @param string $text the text to process, lines separated by "\n"
     * @param bool $linestart when false, the first line is appended to the
     *        output untouched (without a trailing newline) and block
     *        processing only starts with the second line
     * @return string the lists rendered as HTML
     */
    function doBlockLevels($text, $linestart)

    {

        # Parsing through the text line by line.  The main thing

        # happening here is handling of block-level elements p, pre,

        # and making lists from lines starting with * # : etc.

        #

        $textLines = MediawikiStringUtils :: explode("\n", $text);

        

        # $lastPrefix is the list prefix of the previous list line, stored
        # with every ';' already replaced by ':' (see $prefix2 below).
        $lastPrefix = $output = '';

        $this->mDTopen = $inBlockElem = false;

        $prefixLength = 0;

        # $paragraphStack is either false (nothing pending) or a paragraph
        # tag string that has been decided on but not yet written to $output.
        $paragraphStack = false;

        

        foreach ($textLines as $oLine)

        {

            # Fix up $linestart

            if (! $linestart)

            {

                $output .= $oLine;

                $linestart = true;

                continue;

            }

            

            $lastPrefixLength = strlen($lastPrefix);

            $preCloseMatch = preg_match('/<\\/pre/i', $oLine);

            $preOpenMatch = preg_match('/<pre/i', $oLine);

            if (! $this->mInPre)

            {

                # Multiple prefixes may abut each other for nested lists.

                $prefixLength = strspn($oLine, '*#:;');

                $prefix = substr($oLine, 0, $prefixLength);

                

                # $prefix2 is $prefix with every ';' replaced by ':'. It is the
                # form compared against, and later stored in, $lastPrefix, so a
                # ';' line and a ':' line at the same depth count as the same list.

                $prefix2 = str_replace(';', ':', $prefix);

                $t = substr($oLine, $prefixLength);

                # A line containing '<pre' turns preformatted mode on; it is
                # switched off again near the end of the loop body when the
                # same line also contains '</pre'.
                $this->mInPre = (bool) $preOpenMatch;

            }

            else

            {

                # Don't interpret any other prefixes in preformatted text

                $prefixLength = 0;

                $prefix = $prefix2 = '';

                $t = $oLine;

            }

            

            # List generation

            if ($prefixLength && $lastPrefix === $prefix2)

            {

                # Same as the last item, so no need to deal with nesting or opening stuff

                $output .= $this->nextItem(substr($prefix, - 1));

                $paragraphStack = false;

                

                if (substr($prefix, - 1) === ';')

                {

                    # The one nasty exception: definition lists work like this:

                    # ; title : definition text

                    # So we check for : in the remainder text to split up the

                    # title and definition, without b0rking links.

                    $term = $t2 = '';

                    if ($this->findColonNoLinks($t, $term, $t2) !== false)

                    {

                        $t = $t2;

                        $output .= $term . $this->nextItem(':');

                    }

                }

            }

            elseif ($prefixLength || $lastPrefixLength)

            {

                # Either open or close a level...

                $commonPrefixLength = $this->getCommon($prefix, $lastPrefix);

                $paragraphStack = false;

                

                # Close the levels of the previous prefix that this line does
                # not share, innermost level first.
                while ($commonPrefixLength < $lastPrefixLength)

                {

                    $output .= $this->closeList($lastPrefix[$lastPrefixLength - 1]);

                    -- $lastPrefixLength;

                }

                # This line adds no level beyond the shared prefix, so it is a
                # new item at the deepest shared level.
                if ($prefixLength <= $commonPrefixLength && $commonPrefixLength > 0)

                {

                    $output .= $this->nextItem($prefix[$commonPrefixLength - 1]);

                }

                # Open every level this line adds beyond the shared prefix.
                while ($prefixLength > $commonPrefixLength)

                {

                    $char = substr($prefix, $commonPrefixLength, 1);

                    $output .= $this->openList($char);

                    

                    if (';' === $char)

                    {

                        # FIXME: This is dupe of code above

                        # NOTE: $term and $t2 are only assigned by
                        # findColonNoLinks() when it succeeds.
                        if ($this->findColonNoLinks($t, $term, $t2) !== false)

                        {

                            $t = $t2;

                            $output .= $term . $this->nextItem(':');

                        }

                    }

                    ++ $commonPrefixLength;

                }

                $lastPrefix = $prefix2;

            }

            if (0 == $prefixLength)

            {

                # No prefix (not in list)--go to paragraph mode

                // XXX: use a stack for nestable elements like span, table and div

                # $openmatch: the line contains a tag after which paragraph
                # wrapping is suspended ($inBlockElem becomes true).
                # $closematch: the line contains a tag after which paragraph
                # wrapping resumes; it takes precedence when both match.
                $openmatch = preg_match('/(?:<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|<\\/tr|<\\/td|<\\/th)/iS', $t);

                $closematch = preg_match('/(?:<\\/table|<\\/blockquote|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6|' . '<td|<th|<\\/?div|<hr|<\\/pre|<\\/p|' . $this->mUniqPrefix . '-pre|<\\/li|<\\/ul|<\\/ol|<\\/?center)/iS', $t);

                if ($openmatch or $closematch)

                {

                    $paragraphStack = false;

                    # TODO bug 5718: paragraph closed

                    # closeParagraph() is defined elsewhere in this class;
                    # presumably it emits the closing tag for the section
                    # named in $this->mLastSection -- confirm there.
                    $output .= $this->closeParagraph();

                    # '<pre' without '</pre' on the same line: stay in
                    # preformatted mode for the following lines.
                    if ($preOpenMatch and ! $preCloseMatch)

                    {

                        $this->mInPre = true;

                    }

                    if ($closematch)

                    {

                        $inBlockElem = false;

                    }

                    else

                    {

                        $inBlockElem = true;

                    }

                }

                else 

                    if (! $inBlockElem && ! $this->mInPre)

                    {

                        # A leading space starts or continues an indent-style
                        # <pre> section. A whitespace-only line only continues
                        # a section that is already open; it never starts one.
                        if (' ' == substr($t, 0, 1) and ($this->mLastSection === 'pre' or trim($t) != ''))

                        {

                            // pre

                            if ($this->mLastSection !== 'pre')

                            {

                                $paragraphStack = false;

                                $output .= $this->closeParagraph() . '<pre>';

                                $this->mLastSection = 'pre';

                            }

                            # Drop the leading space that marked the line as preformatted.
                            $t = substr($t, 1);

                        }

                        else

                        {

                            // paragraph

                            if ('' == trim($t))

                            {

                                # Blank line. A second blank line in a row
                                # flushes the pending tag followed by <br />;
                                # a first one only records which tag to emit
                                # once the next line is seen.
                                if ($paragraphStack)

                                {

                                    $output .= $paragraphStack . '<br />';

                                    $paragraphStack = false;

                                    $this->mLastSection = 'p';

                                }

                                else

                                {

                                    if ($this->mLastSection !== 'p')

                                    {

                                        $output .= $this->closeParagraph();

                                        $this->mLastSection = '';

                                        $paragraphStack = '<p>';

                                    }

                                    else

                                    {

                                        $paragraphStack = '</p><p>';

                                    }

                                }

                            }

                            else

                            {

                                # Non-blank text line: write the pending
                                # paragraph tag if there is one, otherwise open
                                # a paragraph unless one is already open.
                                if ($paragraphStack)

                                {

                                    $output .= $paragraphStack;

                                    $paragraphStack = false;

                                    $this->mLastSection = 'p';

                                }

                                else 

                                    if ($this->mLastSection !== 'p')

                                    {

                                        $output .= $this->closeParagraph() . '<p>';

                                        $this->mLastSection = 'p';

                                    }

                            }

                        }

                    }

            }

            // somewhere above we forget to get out of pre block (bug 785)

            if ($preCloseMatch && $this->mInPre)

            {

                $this->mInPre = false;

            }

            # The line text is only written while no paragraph tag is pending;
            # a blank line that has just set $paragraphStack produces no output.
            if ($paragraphStack === false)

            {

                $output .= $t . "\n";

            }

        }

        # Close the list levels that are still open after the last line.
        while ($prefixLength)

        {

            $output .= $this->closeList($prefix2[$prefixLength - 1]);

            -- $prefixLength;

        }

        # Close the section ('p' or 'pre') that is still open, if any.
        if ('' != $this->mLastSection)

        {

            $output .= '</' . $this->mLastSection . '>';

            $this->mLastSection = '';

        }

        

        return $output;

    }



    /**
     * Produce the markup that ends the current list item and starts the next
     * one at the same nesting level.
     *
     * For definition lists the closing tag depends on $this->mDTopen (a term
     * is open => '</dt>', otherwise '</dd>'), and the flag is updated to
     * reflect whether the newly opened element is a term (';') or not (':').
     *
     * @private
     * @param string $char list prefix character: '*', '#', ':' or ';'
     * @return string the closing and opening tags, or an HTML error comment
     *         for any other character
     */
    function nextItem($char)
    {
        // Bullet and numbered lists share the same item markup.
        if ($char === '*' || $char === '#')
        {
            return '</li><li>';
        }

        $opensTerm = ($char === ';');
        if (! $opensTerm && $char !== ':')
        {
            return '<!-- ERR 2 -->';
        }

        // Close whichever definition-list element is currently open ...
        $closing = $this->mDTopen ? '</dt>' : '</dd>';

        // ... and remember whether the element opened now is a term.
        $this->mDTopen = $opensTerm;

        return $closing . ($opensTerm ? '<dt>' : '<dd>');
    }



    /**
     * Split up a string on ':', ignoring any occurrences inside tags
     * to prevent illegal overlapping.
     *
     * A ':' only qualifies when it sits in plain text outside every element
     * opened earlier in the string. Every start tag that is not self-closed
     * counts as open until a matching number of end tags has been seen, so
     * a ':' that follows an unclosed tag is never used.
     *
     * @param string $str the string to split
     * @param string &$before set to everything before the ':' (only written when a split point is found)
     * @param string &$after set to everything after the ':' (only written when a split point is found)
     * @return int|false the byte offset of the ':' used for the split, or false if none found
     * @throws MWException if the state machine reaches an unknown state
     */
    function findColonNoLinks($str, &$before, &$after)
    {
        $pos = strpos($str, ':');
        if ($pos === false)
        {
            // Nothing to find!
            return false;
        }

        $lt = strpos($str, '<');
        if ($lt === false || $lt > $pos)
        {
            // Easy; no tag nesting to worry about
            $before = substr($str, 0, $pos);
            $after = substr($str, $pos + 1);
            return $pos;
        }

        // Ugly state machine to walk through avoiding tags.
        // $stack is the number of elements currently open.
        $state = self :: COLON_STATE_TEXT;
        $stack = 0;
        $len = strlen($str);
        for ($i = 0; $i < $len; $i ++)
        {
            // Square-bracket offset; the curly-brace form does not parse on PHP 8.
            $c = $str[$i];

            switch ($state)
            {
                // (Using the number is a performance hack for common cases)
                case 0 : // self::COLON_STATE_TEXT:
                    switch ($c)
                    {
                        case "<" :
                            // Could be either a <start> tag or an </end> tag
                            $state = self :: COLON_STATE_TAGSTART;
                            break;
                        case ":" :
                            if ($stack == 0)
                            {
                                // We found it!
                                $before = substr($str, 0, $i);
                                $after = substr($str, $i + 1);
                                return $i;
                            }
                            // Embedded in a tag; don't break it.
                            break;
                        default :
                            // Skip ahead looking for something interesting
                            $colon = strpos($str, ':', $i);
                            if ($colon === false)
                            {
                                // Nothing else interesting
                                return false;
                            }
                            $lt = strpos($str, '<', $i);
                            if ($stack === 0)
                            {
                                if ($lt === false || $colon < $lt)
                                {
                                    // We found it! Report where the colon is,
                                    // not where the scan currently stands.
                                    $before = substr($str, 0, $colon);
                                    $after = substr($str, $colon + 1);
                                    return $colon;
                                }
                            }
                            if ($lt === false)
                            {
                                // We're nested, but there are no tags left that
                                // could close the nesting, so no later ':' can
                                // ever qualify. Stop scanning right away.
                                return false;
                            }
                            // Skip ahead to next tag start; the loop increment
                            // then moves on to the character after the '<'.
                            $i = $lt;
                            $state = self :: COLON_STATE_TAGSTART;
                    }
                    break;
                case 1 : // self::COLON_STATE_TAG:
                    // In a <tag>
                    switch ($c)
                    {
                        case ">" :
                            $stack ++;
                            $state = self :: COLON_STATE_TEXT;
                            break;
                        case "/" :
                            // Slash may be followed by >?
                            $state = self :: COLON_STATE_TAGSLASH;
                            break;
                        default :
                            // ignore
                    }
                    break;
                case 2 : // self::COLON_STATE_TAGSTART:
                    switch ($c)
                    {
                        case "/" :
                            $state = self :: COLON_STATE_CLOSETAG;
                            break;
                        case "!" :
                            $state = self :: COLON_STATE_COMMENT;
                            break;
                        case ">" :
                            // Illegal early close? This shouldn't happen D:
                            $state = self :: COLON_STATE_TEXT;
                            break;
                        default :
                            $state = self :: COLON_STATE_TAG;
                    }
                    break;
                case 3 : // self::COLON_STATE_CLOSETAG:
                    // In a </tag>
                    if ($c === ">")
                    {
                        $stack --;
                        if ($stack < 0)
                        {
                            // More end tags than start tags; give up.
                            return false;
                        }
                        $state = self :: COLON_STATE_TEXT;
                    }
                    break;
                case self :: COLON_STATE_TAGSLASH :
                    if ($c === ">")
                    {
                        // Yes, a self-closed tag <blah/>
                        $state = self :: COLON_STATE_TEXT;
                    }
                    else
                    {
                        // Probably we're jumping the gun, and this is an attribute
                        $state = self :: COLON_STATE_TAG;
                    }
                    break;
                case 5 : // self::COLON_STATE_COMMENT:
                    if ($c === "-")
                    {
                        $state = self :: COLON_STATE_COMMENTDASH;
                    }
                    break;
                case self :: COLON_STATE_COMMENTDASH :
                    if ($c === "-")
                    {
                        $state = self :: COLON_STATE_COMMENTDASHDASH;
                    }
                    else
                    {
                        $state = self :: COLON_STATE_COMMENT;
                    }
                    break;
                case self :: COLON_STATE_COMMENTDASHDASH :
                    if ($c === ">")
                    {
                        $state = self :: COLON_STATE_TEXT;
                    }
                    else
                    {
                        $state = self :: COLON_STATE_COMMENT;
                    }
                    break;
                default :
                    throw new MWException("State machine error in " . __METHOD__);
            }
        }

        // Ran off the end of the string without finding a usable colon.
        return false;
    }



    /**
     * Return the length of the common prefix of two strings, i.e. the number
     * of leading bytes that are identical in both arguments.
     *
     * @param string $st1 first string
     * @param string $st2 second string
     * @return int length of the shared prefix (0 when either string is empty)
     */
    function getCommon($st1, $st2)
    {
        // Only positions that exist in both strings can match.
        $limit = min(strlen($st1), strlen($st2));

        for ($i = 0; $i < $limit; ++ $i)
        {
            // Square-bracket offsets; the curly-brace form does not parse on PHP 8.
            if ($st1[$i] !== $st2[$i])
            {
                break;
            }
        }
        return $i;
    }



    /**
     * Produce the markup that closes the innermost list level for a prefix
     * character.
     *
     * Closing a ':' level while $this->mDTopen is set ends the open <dt>
     * instead of a <dd> and clears the flag.
     *
     * @param string $char list prefix character: '*', '#' or ':'
     * @return string the closing tags followed by "\n", or an HTML error
     *         comment (without newline) for any other character
     */
    function closeList($char)
    {
        if ($char === '*')
        {
            return '</li></ul>' . "\n";
        }
        if ($char === '#')
        {
            return '</li></ol>' . "\n";
        }
        if ($char !== ':')
        {
            return '<!-- ERR 3 -->';
        }

        // Definition list: the element to close depends on whether a term is open.
        $closing = '</dd></dl>';
        if ($this->mDTopen)
        {
            $this->mDTopen = false;
            $closing = '</dt></dl>';
        }
        return $closing . "\n";
    }



    # These next three functions open, continue, and close the list

    # element appropriate to the prefix character passed into them.

    #

    function openList($char)

    {

        $result = $this->closeParagraph();

        

        if ('*' === $char)

        {

            $result .= '<ul><li>';

        }

        else 

            if ('#' === $char)

            {

                $result .= '<ol><li>';

            }

            else 

                if (':' === $char)

                {

                    $result .= '<dl><dd>';

                }

                else 

                    i…
Large files files are truncated, but you can click here to view the full file