/carl_util/tidy/tidy.php
PHP | 297 lines | 264 code | 8 blank | 25 comment | 8 complexity | 6655b3ed94ac9ee640a35e7be860dfbc MD5 | raw file
- <?php
- /**
- * Tidy HTML
- *
- * Converts HTML to XHTML in a somewhat roundabout way: it uses the W3C's tidy
- * program, plus some command-line and string-manipulation trickery,
- * to produce valid XHTML from HTML.
- *
- * @author dave hendler
- * @author nate white
- * @package carl_util
- * @subpackage tidy
- */
-
/**
 * Turn a string or array into valid, standards-compliant (X)HTML.
 *
 * Arrays are processed recursively, element by element, with keys preserved.
 * Strings are first passed through protect_string_from_tidy(), wrapped in a
 * minimal <html><body> document, and then run through the first tidy backend
 * that is available, in this order:
 *   1. the tidy 2 PHP extension (object API),
 *   2. the tidy 1 PHP extension (detected by the presence of tidy_setopt),
 *   3. the command-line executable named by the TIDY_EXE constant.
 *
 * Uses configuration options in tidy.conf (looked up under SETTINGS_INC), which
 * should minimally have show-body-only set to yes.
 *
 * If no backend is available an error is triggered and the entity-protected
 * input is returned without the temporary <html><body> wrapper.
 *
 * @param mixed $text The data to be tidied up
 * @return mixed $result Tidied data (array for array input, trimmed string otherwise)
 */
function tidy( $text )
{
	static $tidy_funcs;
	static $tidy_conf;

	// Arrays are handled purely by recursion, so deal with them before doing any other work.
	if (is_array($text))
	{
		$result = array();
		foreach ($text as $key => $value)
		{
			$result[$key] = tidy($value);
		}
		return $result;
	}

	if (!isset($tidy_conf)) $tidy_conf = SETTINGS_INC . 'tidy.conf';

	// determine what tidy libraries are available
	if (empty($tidy_funcs)) $tidy_funcs = get_extension_funcs('tidy');
	$tidy_lib_available = !empty($tidy_funcs);
	// tidy_setopt only exists in the tidy 1 API, so its presence tells the two versions apart
	$tidy_1_lib_available = $tidy_lib_available && in_array('tidy_setopt', $tidy_funcs, true);
	$tidy_2_lib_available = $tidy_lib_available && !$tidy_1_lib_available;
	// guard with defined() so a missing constant means "not available" rather than a fatal error
	$tidy_command_line_available = defined('TIDY_EXE') && TIDY_EXE && file_exists(TIDY_EXE);

	$protected = protect_string_from_tidy( $text );
	$document = '<html><body>'.$protected.'</body></html>';

	if ($tidy_2_lib_available) // Run tidy for PHP 5
	{
		$tidy = new tidy();
		$tidy->parseString($document, $tidy_conf, 'utf8');
		$tidy->cleanRepair();
		// fetch the output explicitly as a string instead of relying on object-to-string casting in trim()
		$result = tidy_get_output($tidy);
	}
	elseif ($tidy_1_lib_available) // Run tidy for PHP 4
	{
		tidy_load_config($tidy_conf);
		tidy_set_encoding('utf8');
		tidy_parse_string($document);
		tidy_clean_repair();
		$result = tidy_get_output();
	}
	elseif ($tidy_command_line_available) // attempt to run COMMAND LINE tidy
	{
		// every dynamic piece of the command is shell-escaped: the text, the executable and the config path
		$cmd = 'echo '.escapeshellarg( $document )
			.' | '.escapeshellarg( TIDY_EXE )
			.' -q -config '.escapeshellarg( $tidy_conf )
			.' 2> /dev/null'; // pipe the input to tidy and discard its diagnostics
		$result = shell_exec($cmd); // null when the command fails or prints nothing
	}
	else
	{
		trigger_error('tidy does not appear to be available within php or at the command line - no tidying is taking place.');
		// hand back the fragment itself, not the temporary document wrapper
		$result = $protected;
	}
	return trim((string) $result);
}
/**
 * Run the command-line tidy executable over a string and return its error lines.
 *
 * The text is piped to TIDY_EXE using the tidy.conf found under SETTINGS_INC,
 * with stderr merged into stdout. Only output lines containing "Error:" are kept.
 *
 * TODO: See where this is used and provide better error handling for tidylib in php 4 and 5
 *
 * @param string $text The markup to check
 * @return string The matching error lines joined by newlines; an empty string if there are none
 */
function tidy_err( $text )
{
	static $tidy_conf;
	if (!isset($tidy_conf)) $tidy_conf = SETTINGS_INC . 'tidy.conf';

	// every dynamic piece of the command is shell-escaped: the text, the executable and the config path
	$cmd = 'echo '.escapeshellarg( $text )
		.' | '.escapeshellarg( TIDY_EXE )
		.' -q -config '.escapeshellarg( $tidy_conf )
		.' 2>&1';
	$output = shell_exec( $cmd );

	// shell_exec gives a non-string (null/false) when the command fails or prints nothing
	if (!is_string($output)) return '';

	$errors = array();
	foreach( explode( "\n", $output ) AS $line )
	{
		// strict comparison: a match at offset 0 must still count as found
		if( strpos( $line, 'Error:' ) !== false )
			$errors[] = $line;
	}
	return implode("\n",$errors);
}
/**
 * Replace Latin-1 HTML entities with their UTF-8 encoded characters.
 *
 * Both the named form (e.g. &nbsp;, &eacute;) and the decimal numeric form
 * (e.g. &#160;, &#233;) of code points 160 through 255 are replaced with the
 * two-byte UTF-8 sequence for that code point. Everything else, including
 * &amp;, &lt;, &gt; and &quot;, is left untouched. Matching is case-sensitive,
 * so &Eacute; and &eacute; map to different characters.
 *
 * @param mixed $str The string (or array of strings) to process
 * @return mixed The input with the entities replaced
 */
function protect_string_from_tidy( $str )
{
	static $utf_entity_trans;

	if (!isset($utf_entity_trans))
	{
		// Entity names for code points 160..255, in code point order:
		// the name at index i belongs to code point 160 + i.
		$entity_names = array(
			'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect',
			'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr',
			'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
			'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12', 'frac34', 'iquest',
			'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml', 'Aring', 'AElig', 'Ccedil',
			'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc', 'Iuml',
			'ETH', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times',
			'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig',
			'agrave', 'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig', 'ccedil',
			'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc', 'iuml',
			'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde', 'ouml', 'divide',
			'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn', 'yuml',
		);

		$utf_entity_trans = array();
		foreach ($entity_names as $offset => $name)
		{
			$code = 160 + $offset;
			// two-byte UTF-8 encoding: 110xxxxx 10xxxxxx (194/195 lead byte for this range)
			$utf8 = chr(0xC0 | ($code >> 6)).chr(0x80 | ($code & 0x3F));
			$utf_entity_trans['&'.$name.';'] = $utf8;
			$utf_entity_trans['&#'.$code.';'] = $utf8;
		}
	}

	return str_replace( array_keys( $utf_entity_trans ), array_values( $utf_entity_trans ), $str );
}
- ?>