PageRenderTime 82ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/cms/openid/Services/Yadis/ParseHTML.php

https://github.com/swat/pragyan
PHP | 270 lines | 136 code | 35 blank | 99 comment | 14 complexity | 965f7130645e3bf29164c8bc92fd26bd MD5 | raw file
  1. <?php
  2. /**
  3. * This is the HTML pseudo-parser for the Yadis library.
  4. *
  5. * PHP versions 4 and 5
  6. *
  7. * LICENSE: See the COPYING file included in this distribution.
  8. *
  9. * @package Yadis
  10. * @author JanRain, Inc. <openid@janrain.com>
  11. * @copyright 2005 Janrain, Inc.
  12. * @license http://www.gnu.org/copyleft/lesser.html LGPL
  13. */
  14. /**
  15. * This class is responsible for scanning an HTML string to find META
  16. * tags and their attributes. This is used by the Yadis discovery
  17. * process. This class must be instantiated to be used.
  18. *
  19. * @package Yadis
  20. */
  class Services_Yadis_ParseHTML {

      /**
       * PCRE modifiers appended to every pattern built by this class:
       * "s" lets "." match newlines, "i" makes matching case-insensitive.
       *
       * @access private
       */
      var $_re_flags = "si";

      /**
       * sprintf() template (undelimited) for an opening tag, its
       * attributes and, optionally, its content up to a closing tag.
       * First %s is the tag name, second %s the closing-tag alternatives.
       *
       * @access private
       */
      var $_tag_expr = "<%s\b(?!:)([^>]*?)(?:\/>|>(.*?)(?:<\/?%s\s*>|\Z))";

      /**
       * sprintf() template (undelimited) matching a bare opening or
       * closing tag of the given name, e.g. <body> or </body>.
       *
       * @access private
       */
      var $_close_tag_expr = "<\/?%s\s*>";

      /**
       * Pattern for markup that is removed before scanning: comments,
       * CDATA sections and script elements. The constructor wraps it in
       * delimiters and appends the flags.
       *
       * @access private
       */
      var $_removed_re =
          "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

      /**
       * Pattern for a single name=value attribute (quoted or bare
       * value). The constructor wraps it in delimiters and appends the
       * flags.
       *
       * @access private
       */
      var $_attr_find = '\b([-\w]+)=("[^"]*"|\'[^\']*\'|[^\'"\s\/<>]+)';

      /**
       * Complete (delimited) pattern matching one META tag; set by the
       * constructor.
       *
       * @access private
       */
      var $_meta_find;

      /**
       * Map of named entity (without "&" and ";") to the character it
       * stands for; set by the constructor.
       *
       * @access private
       */
      var $_entity_replacements;

      /**
       * Undelimited expression matching any supported named entity;
       * set by the constructor.
       *
       * @access private
       */
      var $_ent_replace;

      /**
       * PHP 4 style constructor, kept so that existing code calling it
       * explicitly (e.g. from a subclass) keeps working. It is declared
       * before __construct() and simply forwards to it.
       */
      function Services_Yadis_ParseHTML()
      {
          $this->__construct();
      }

      /**
       * Builds the final regular expressions and the entity table.
       *
       * PHP 8 no longer treats a method named after the class as a
       * constructor, so the initialisation has to live here.
       */
      function __construct()
      {
          $this->_meta_find = sprintf("/<meta\b(?!:)([^>]*)(?!<)>/%s",
                                      $this->_re_flags);

          $this->_removed_re = sprintf("/%s/%s",
                                       $this->_removed_re,
                                       $this->_re_flags);

          $this->_attr_find = sprintf("/%s/%s",
                                      $this->_attr_find,
                                      $this->_re_flags);

          $this->_entity_replacements = array(
              'amp' => '&',
              'lt' => '<',
              'gt' => '>',
              'quot' => '"'
          );

          // The alternation must list the entity *names* (the array
          // keys), not the characters they decode to.
          $this->_ent_replace =
              sprintf("&(%s);", implode("|",
                                        array_keys($this->_entity_replacements)));
      }

      /**
       * Replace HTML entities (amp, lt, gt, and quot) as well as
       * numeric entities (e.g. &#x9f; or &#159;) with their actual
       * values and return the new string.
       *
       * NOTE(review): the replacements are applied one after another,
       * so a doubly-escaped sequence such as "&amp;lt;" ends up as "<".
       * This matches the historical behaviour; confirm whether callers
       * rely on it before changing it to a single pass.
       *
       * @access private
       * @param string $str The string in which to look for entities
       * @return string $new_str The new string with entities decoded
       */
      function replaceEntities($str)
      {
          $search = array();
          $replace = array();
          foreach ($this->_entity_replacements as $name => $char) {
              $search[] = '&' . $name . ';';
              $replace[] = $char;
          }
          $str = str_replace($search, $replace, $str);

          // Numeric entities. The "e" (eval) modifier formerly used here
          // was removed from PHP, so callbacks do the conversion.
          $str = preg_replace_callback('~&#x([0-9a-f]+);~i',
                                       array($this, '_hexEntityToChar'),
                                       $str);
          $str = preg_replace_callback('~&#([0-9]+);~',
                                       array($this, '_decEntityToChar'),
                                       $str);

          return $str;
      }

      /**
       * preg_replace_callback() handler: converts the hexadecimal code
       * captured from a "&#x..;" entity into a single byte.
       *
       * @access private
       * @param array $matches Match array; index 1 holds the hex digits
       * @return string The corresponding one-byte string
       */
      function _hexEntityToChar($matches)
      {
          // chr() yields one byte; larger codes wrap modulo 256, which
          // is made explicit here.
          return chr(intval(hexdec($matches[1])) % 256);
      }

      /**
       * preg_replace_callback() handler: converts the decimal code
       * captured from a "&#..;" entity into a single byte.
       *
       * @access private
       * @param array $matches Match array; index 1 holds the digits
       * @return string The corresponding one-byte string
       */
      function _decEntityToChar($matches)
      {
          return chr(intval($matches[1]) % 256);
      }

      /**
       * Strip single and double quotes off of a string, if they are
       * present.
       *
       * @access private
       * @param string $str The original string
       * @return string $new_str The new string with leading and
       * trailing quotes removed
       */
      function removeQuotes($str)
      {
          $matches = array();

          // The "s" flag is required because a quoted attribute value
          // captured by $_attr_find may span several lines.
          $double = '/^"(.*)"$/s';
          $single = "/^\'(.*)\'$/s";

          if (preg_match($double, $str, $matches)) {
              return $matches[1];
          }
          if (preg_match($single, $str, $matches)) {
              return $matches[1];
          }
          return $str;
      }

      /**
       * Create a regular expression that will match an opening (and
       * optional) closing tag of a given name.
       *
       * @access private
       * @param string $tag_name The tag name to match
       * @param array $close_tags An array of tag names which also
       * constitute closing of the original tag
       * @return string $regex A regular expression string to be used
       * in, say, preg_match.
       */
      function tagMatcher($tag_name, $close_tags = null)
      {
          if ($close_tags) {
              $options = implode("|", array_merge(array($tag_name), $close_tags));
              $closer = sprintf("(?:%s)", $options);
          } else {
              $closer = $tag_name;
          }

          $expr = sprintf($this->_tag_expr, $tag_name, $closer);
          return sprintf("/%s/%s", $expr, $this->_re_flags);
      }

      /**
       * Returns the pattern matching the HTML element; a BODY tag also
       * terminates it.
       *
       * @access private
       * @param mixed $str Unused; accepted only for backward
       * compatibility with existing callers
       * @return string A regular expression string
       */
      function htmlFind($str = null)
      {
          return $this->tagMatcher('html', array('body'));
      }

      /**
       * Returns the pattern matching the HEAD element; a BODY tag also
       * terminates it.
       *
       * @access private
       * @return string A regular expression string
       */
      function headFind()
      {
          return $this->tagMatcher('head', array('body'));
      }

      /**
       * Given an HTML document string, this finds all the META tags in
       * the document, provided they are found in the
       * <HTML><HEAD>...</HEAD> section of the document. The <HTML> tag
       * may be missing.
       *
       * @access private
       * @param string $html_string An HTML document string
       * @return array $tag_list Array of tags; each tag is an array of
       * attribute -> value. Attribute names are lower-cased.
       */
      function getMetaTags($html_string)
      {
          // Drop comments, CDATA sections and scripts so that markup
          // hidden inside them is never mistaken for real tags.
          $html_string = preg_replace($this->_removed_re, "", $html_string);
          if (!is_string($html_string)) {
              // preg_replace() signals a PCRE failure with null.
              return array();
          }

          // Cut the document at the first bare <body> or </body> tag.
          // The template has no delimiters of its own; they must be
          // added, otherwise PCRE treats "<" and ">" as the delimiters.
          $body_closer = sprintf("/%s/%s",
                                 sprintf($this->_close_tag_expr, 'body'),
                                 $this->_re_flags);
          $body_matches = array();
          if (preg_match($body_closer, $html_string, $body_matches,
                         PREG_OFFSET_CAPTURE)) {
              $html_string = substr($html_string, 0, $body_matches[0][1]);
          }

          // Look for an opening body tag (possibly with attributes),
          // and discard it and everything after it.
          $body_re = $this->tagMatcher('body');
          $body_matches = array();
          if (preg_match($body_re, $html_string, $body_matches,
                         PREG_OFFSET_CAPTURE)) {
              $html_string = substr($html_string, 0, $body_matches[0][1]);
          }

          // If an HTML tag is found at all, restrict the search to it;
          // else, it may be missing (which is a case we allow for).
          $html_matches = array();
          if (preg_match($this->htmlFind(), $html_string, $html_matches)) {
              $html = $html_matches[0];
          } else {
              $html = $html_string;
          }

          // Without a <HEAD> there is nothing to report.
          $head_matches = array();
          if (!preg_match($this->headFind(), $html, $head_matches)) {
              return array();
          }

          $meta_matches = array();
          if (!preg_match_all($this->_meta_find, $head_matches[0],
                              $meta_matches)) {
              return array();
          }

          $meta_data = array();
          foreach ($meta_matches[0] as $meta) {
              $attr_matches = array();
              preg_match_all($this->_attr_find, $meta, $attr_matches);

              // Group 1 holds attribute names, group 2 the raw values.
              $meta_attrs = array();
              foreach ($attr_matches[1] as $index => $name) {
                  $value = $this->replaceEntities(
                      $this->removeQuotes($attr_matches[2][$index]));
                  $meta_attrs[strtolower($name)] = $value;
              }

              $meta_data[] = $meta_attrs;
          }

          return $meta_data;
      }

      /**
       * Looks for a META tag with an "http-equiv" attribute whose value
       * is one of ("x-xrds-location", "x-yadis-location"), ignoring
       * case. If such a META tag is found, its "content" attribute
       * value is returned.
       *
       * @param string $html_string An HTML document in string format
       * @return mixed $content The "content" attribute value of the
       * META tag, if found, or null if no such tag was found.
       */
      function getHTTPEquiv($html_string)
      {
          $wanted = array('x-xrds-location', 'x-yadis-location');

          foreach ($this->getMetaTags($html_string) as $tag) {
              if (array_key_exists('http-equiv', $tag) &&
                  in_array(strtolower($tag['http-equiv']), $wanted, true) &&
                  array_key_exists('content', $tag)) {
                  return $tag['content'];
              }
          }

          return null;
      }
  }
  235. ?>