PageRenderTime 21ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/classes/ARC2/parsers/ARC2_SemHTMLParser.php

https://github.com/mterenzio/FollowThis
PHP | 339 lines | 264 code | 44 blank | 31 comment | 69 complexity | f71910499e0e6d433a5562a1edc419aa MD5 | raw file
  1. <?php
  2. /*
  3. homepage: http://arc.semsol.org/
  4. license: http://arc.semsol.org/license
  5. class: ARC2 SemHTML Parser
  6. author: Benjamin Nowack
  7. version: 2010-11-16
  8. */
  9. ARC2::inc('LegacyXMLParser');
  10. class ARC2_SemHTMLParser extends ARC2_LegacyXMLParser {
  /**
   * Constructor; forwards both arguments unchanged to the parent constructor.
   *
   * @param mixed $a      Configuration value handed to the parent.
   * @param mixed $caller Calling component (taken by reference), handed to the parent.
   */
  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }
  /**
   * Resets the parser state after running the parent initialisation.
   *
   * Three settings are read from $this->a through $this->v():
   *  - 'bnode_prefix'          (default: 'arc' + 4 hex chars + 'b')
   *  - 'auto_extract'          (default: 1)
   *  - 'keep_cdata_whitespace' (default: 0)
   */
  function __init() {
    parent::__init();
    /* formats tried by extractRDF() when none are requested or configured */
    $this->default_sem_html_formats = 'dc openid erdf rdfa posh-rdf microformats';
    /* triple collection state */
    $this->triples = array();
    $this->target_encoding = '';
    $this->t_count = 0;
    $this->added_triples = array();
    $this->skip_dupes = false;
    /* blank node naming: the fallback prefix is randomised on every init */
    $fallback_prefix = 'arc' . substr(md5(uniqid(rand())), 0, 4) . 'b';
    $this->bnode_prefix = $this->v('bnode_prefix', $fallback_prefix, $this->a);
    $this->bnode_id = 0;
    /* extraction state */
    $this->auto_extract = $this->v('auto_extract', 1, $this->a);
    $this->extracted_formats = array();
    $this->cache = array();
    $this->detected_formats = array();
    /* passed to x() by xCData() as the keep-whitespace flag */
    $this->keep_cdata_ws = $this->v('keep_cdata_whitespace', 0, $this->a);
  }
  30. /* */
  /**
   * Matches a regular expression at the start of a string, ignoring leading whitespace.
   *
   * The leading whitespace of $v is split off first; $re is then anchored at the
   * start of the remainder and followed by a catch-all group, so the last entry of
   * the returned match array always holds the unconsumed rest of the input.
   *
   * @param string $re      Pattern fragment (no delimiters), anchored with '^'.
   * @param string $v       Input string.
   * @param string $options PCRE modifiers appended after the closing delimiter.
   * @param mixed  $keep_ws If truthy, the stripped leading whitespace is prepended to match group 1.
   * @return array|false    The preg_match() groups, or false when the pattern does not match.
   */
  function x($re, $v, $options = 'si', $keep_ws = 0) {
    /* split off leading whitespace */
    $lead = '';
    if (preg_match('/^(\s*)(.*)$/s', $v, $parts)) {
      $lead = $parts[1];
      $v = $parts[2];
    }
    $pattern = '/^' . $re . '(.*)$/' . $options;
    if (!preg_match($pattern, $v, $m)) {
      return false;
    }
    if ($keep_ws) {
      $m[1] = $lead . $m[1];
    }
    return $m;
  }
  39. /* */
  /**
   * Stores the given reader in $this->reader.
   *
   * parse() only creates its own ARC2_Reader when $this->v('reader') is falsy.
   *
   * @param object $reader Reader instance (taken by reference).
   */
  function setReader(&$reader) {
    $this->reader = $reader;
  }
  /**
   * Returns a new blank node identifier.
   *
   * The internal counter is incremented first, so the first identifier after
   * __init() ends in 1.
   *
   * @return string '_:' followed by the bnode prefix and the counter value.
   */
  function createBnodeID(){
    ++$this->bnode_id;
    $local_name = $this->bnode_prefix . $this->bnode_id;
    return '_:' . $local_name;
  }
  /**
   * Appends a triple to the triple list.
   *
   * HTML entities in the object value ($t['o']) are decoded with
   * html_entity_decode() (PHP default flags/charset) when that function exists.
   * With $this->skip_dupes enabled, a triple whose serialised form was already
   * added is silently ignored.
   *
   * @param array $t Triple structure; must contain the key 'o'.
   */
  function addT($t) {
    if (function_exists('html_entity_decode')) {
      $t['o'] = html_entity_decode($t['o']);
    }
    if ($this->skip_dupes) {
      /* fingerprint of the (decoded) triple */
      $fingerprint = md5(serialize($t));
      if (isset($this->added_triples[$fingerprint])) {
        return;
      }
      $this->added_triples[$fingerprint] = true;
    }
    $this->triples[$this->t_count] = $t;
    $this->t_count++;
  }
  /**
   * Returns the collected triples.
   *
   * @return array The 'triples' value as resolved by $this->v(), with an empty array as default.
   */
  function getTriples() {
    $triples = $this->v('triples', array());
    return $triples;
  }
  /**
   * Returns the triple counter maintained by addT().
   *
   * @return int Number of triples stored so far.
   */
  function countTriples() {
    $count = $this->t_count;
    return $count;
  }
  /**
   * Builds an index from the collected triples by delegating to ARC2::getSimpleIndex().
   *
   * @param mixed $flatten_objects Forwarded unchanged to ARC2::getSimpleIndex().
   * @param mixed $vals            Forwarded unchanged to ARC2::getSimpleIndex().
   * @return mixed Whatever ARC2::getSimpleIndex() returns.
   */
  function getSimpleIndex($flatten_objects = 1, $vals = '') {
    $triples = $this->getTriples();
    return ARC2::getSimpleIndex($triples, $flatten_objects, $vals);
  }
  73. /* */
  /**
   * Reads a document through the reader and feeds it chunk-wise to processData().
   *
   * Node state is reset, a reader is created unless one is already set, and the
   * base/document URLs and target encoding are taken from the reader. Whatever
   * processData() leaves unparsed is prepended to the next chunk. Afterwards the
   * stream is closed, the reader is unset and done() is called.
   *
   * @param string $path         Location handed to the reader's activate().
   * @param string $data         Optional data handed to the reader's activate().
   * @param string $iso_fallback Not used by this method.
   * @return mixed The return value of done().
   */
  function parse($path, $data = '', $iso_fallback = 'ignore') {
    $this->nodes = array();
    $this->node_count = 0;
    $this->level = 0;
    /* reader */
    if (!$this->v('reader')) {
      ARC2::inc('Reader');
      $this->reader = new ARC2_Reader($this->a, $this);
    }
    /* NOTE(review): the registered XHTML media type is application/xhtml+xml; header kept as-is */
    $this->reader->setAcceptHeader('Accept: text/html, application/xhtml, */*; q=0.9');
    $this->reader->activate($path, $data);
    $this->target_encoding = $this->reader->getEncoding(false);
    /* an explicitly configured base wins over the reader's base */
    $this->x_base = isset($this->a['base']) && $this->a['base'] ? $this->a['base'] : $this->reader->base;
    $this->base = $this->x_base;
    $this->doc_url = $this->reader->base;
    /* parse */
    $rest = '';
    $this->cur_tag = '';
    while (true) {
      $d = $this->reader->readStream(1);
      /* the string "0" is falsy in PHP but is still a chunk of data, not the end of the stream */
      if (!$d && ($d !== '0')) {
        break;
      }
      $rest = $this->processData($rest . $d);
    }
    $this->reader->closeStream();
    unset($this->reader);
    return $this->done();
  }
  99. /* */
  /**
   * Returns the target encoding obtained from the reader during parse().
   *
   * @param string $src Ignored by this implementation.
   * @return string The stored target encoding ('' until parse() has run).
   */
  function getEncoding($src = 'ignore') {
    $encoding = $this->target_encoding;
    return $encoding;
  }
  103. /* */
  /**
   * Called by parse() once the stream is exhausted; runs the RDF extraction
   * unless the 'auto_extract' option was disabled.
   */
  function done() {
    if (!$this->auto_extract) {
      return;
    }
    $this->extractRDF();
  }
  109. /* */
  /**
   * Tokenises as much of the given markup as possible and returns the unparsed rest.
   *
   * In each round the start of the remaining input is tried, in this order, as
   * comment, doctype, whitespace (only while $this->level is non-zero), opening
   * tag, closing tag and character data. The matching open()/close()/cData()
   * handler is called for the first construct that matches. While handling
   * doctypes and opening tags, hints for RDFa, eRDF and poshRDF are recorded in
   * $this->detected_formats. The loop stops at the first position where nothing
   * matches (typically an incomplete construct at the end of a chunk).
   *
   * String results of the x* helpers are accepted even when they are falsy:
   * a text node "0" (e.g. `<td>0</td>`) or a comment "0"/"" is a successful match,
   * while the helpers' failure value is the integer 0.
   *
   * @param string $v Markup to process (previous rest + new chunk, see parse()).
   * @return string The part of $v that could not be processed yet.
   */
  function processData($v) {
    $sub_v = $v;
    do {
      $proceed = 1;
      /* comment */
      list($sub_r, $sub_v) = $this->xComment($sub_v);
      if (is_string($sub_r) || $sub_r) {
        $this->open(0, 'comment', array('value' => $sub_r));
        $this->close(0, 'comment');
        continue;
      }
      /* doctype */
      list($sub_r, $sub_v) = $this->xDoctype($sub_v);
      if (is_string($sub_r) || $sub_r) {
        $this->open(0, 'doctype', array('value' => $sub_r));
        $this->close(0, 'doctype');
        /* RDFa detection */
        if (preg_match('/rdfa /i', $sub_r)) $this->detected_formats['rdfa'] = 1;
        continue;
      }
      /* whitespace, only reported inside an element */
      if ($this->level) {
        list($sub_r, $sub_v) = $this->xWS($sub_v);
        if (is_string($sub_r) || $sub_r) {
          $this->cData(0, $sub_r);
          continue;
        }
      }
      /* opening tag */
      list($sub_r, $sub_v) = $this->xOpen($sub_v);
      if ($sub_r) {
        $this->open(0, $sub_r['tag'], $sub_r['a']);
        $this->cur_tag = $sub_r['tag'];
        if ($sub_r['empty']) {
          $this->close(0, $sub_r['tag'], 1);
          $this->cur_tag = '';
        }
        /* eRDF detection */
        if (!isset($this->detected_formats['erdf']) && isset($sub_r['a']['profile m']) && in_array('http://purl.org/NET/erdf/profile', $sub_r['a']['profile m'])) $this->detected_formats['erdf'] = 1;
        /* poshRDF detection */
        if (!isset($this->detected_formats['posh-rdf']) && isset($sub_r['a']['class m']) && in_array('rdf-p', $sub_r['a']['class m'])) $this->detected_formats['posh-rdf'] = 1;
        /* RDFa detection */
        if (!isset($this->detected_formats['rdfa']) && ($this->cur_tag == 'html') && isset($sub_r['a']['version m']) && in_array('XHTML+RDFa', $sub_r['a']['version m'])) $this->detected_formats['rdfa'] = 1;
        if (!isset($this->detected_formats['rdfa']) && isset($sub_r['a']['xmlns']) && $sub_r['a']['xmlns'] && $this->isRDFNSDecl($sub_r['a']['xmlns'])) $this->detected_formats['rdfa'] = 1;
        if (!isset($this->detected_formats['rdfa']) && array_intersect(array('about', 'typeof', 'property'), array_keys($sub_r['a']))) $this->detected_formats['rdfa'] = 1;
        continue;
      }
      /* closing tag */
      list($sub_r, $sub_v) = $this->xClose($sub_v);
      if ($sub_r) {
        /* tags in this list were already implicitly closed when they were opened */
        if (!preg_match('/^(area|base|br|col|frame|hr|input|img|link|xmeta|param)$/', $sub_r['tag'])) {
          $this->close(0, $sub_r['tag']);
          $this->cur_tag = '';
        }
        continue;
      }
      /* character data */
      list($sub_r, $sub_v) = $this->xCData($sub_v);
      if (is_string($sub_r) || $sub_r) {
        $this->cData(0, $sub_r);
        continue;
      }
      /* nothing matched: stop and hand the rest back to the caller */
      $proceed = 0;
    } while ($proceed);
    return $sub_v;
  }
  163. /* */
  /**
   * Tells whether a set of namespace declarations contains at least one entry
   * with a truthy key, i.e. a prefixed declaration rather than the default namespace.
   *
   * @param array $ns Namespace declarations keyed by prefix, as built by xAttributes().
   * @return int 1 if any key is truthy, otherwise 0.
   */
  function isRDFNSDecl($ns) {
    $has_prefixed = 0;
    foreach ($ns as $prefix => $uri) {
      if ($prefix) {
        $has_prefixed = 1;
        break;
      }
    }
    return $has_prefixed;
  }
  170. /* */
  /**
   * Tries to read a complete `<!-- ... -->` comment from the start of the input.
   *
   * @param string $v Input string.
   * @return array array(comment text, rest of input) on success, array(0, $v) otherwise
   *               (also when the closing `-->` is not in $v yet).
   */
  function xComment($v) {
    $opener = $this->x('\<\!\-\-', $v);
    if ($opener) {
      /* ungreedy: stop at the first closing marker */
      $body = $this->x('(.*)\-\-\>', $opener[1], 'Us');
      if ($body) {
        return array($body[1], $body[2]);
      }
    }
    return array(0, $v);
  }
  /**
   * Tries to read a `<!DOCTYPE ...>` declaration from the start of the input.
   *
   * @param string $v Input string.
   * @return array array(declaration content up to '>', rest of input) on success,
   *               array(0, $v) otherwise.
   */
  function xDoctype($v) {
    $opener = $this->x('\<\!DOCTYPE', $v);
    if ($opener) {
      $decl = $this->x('([^\>]+)\>', $opener[1]);
      if ($decl) {
        return array($decl[1], $decl[2]);
      }
    }
    return array(0, $v);
  }
  /**
   * Tries to read whitespace from the start of the input.
   *
   * Unlike its siblings this uses the static ARC2::x() helper rather than $this->x().
   *
   * @param string $v Input string.
   * @return array array(matched whitespace, rest of input) on success, array(0, $v) otherwise.
   */
  function xWS($v) {
    $m = ARC2::x('(\s+)', $v);
    if (!$m) {
      return array(0, $v);
    }
    return array($m[1], $m[2]);
  }
  193. /* */
  /**
   * Tries to read an opening tag from the start of the input.
   *
   * @param string $v Input string.
   * @return array On success array(array('tag' => lower-cased tag name,
   *               'a' => attributes from xAttributes(), 'empty' => result of isEmpty()),
   *               rest of input); array(0, $v) otherwise.
   */
  function xOpen($v) {
    $m = $this->x('\<([^\s\/\>]+)([^\>]*)\>', $v);
    if (!$m) {
      return array(0, $v);
    }
    /* only the attribute list is needed; the unparsed attribute rest is dropped */
    list($attrs) = $this->xAttributes($m[2]);
    $info = array(
      'tag' => strtolower($m[1]),
      'a' => $attrs,
      'empty' => $this->isEmpty($m[1], $m[2])
    );
    return array($info, $m[3]);
  }
  201. /* */
  /**
   * Reads all attributes from the attribute section of an opening tag.
   *
   * Attributes whose name starts with "xmlns" are reported through nsDecl() and
   * collected under the 'xmlns' key (indexed by the part after "xmlns:"). Every
   * other attribute is stored twice: as "<name>" with its raw value and as
   * "<name> m" with the list of values produced by xAttribute().
   *
   * @param string $v Attribute section of a tag.
   * @return array array(attribute map, unparsed rest).
   */
  function xAttributes($v) {
    $attrs = array();
    while (true) {
      list($attr, $v) = $this->xAttribute($v);
      if (!$attr) {
        break;
      }
      $ns_match = $this->x('xmlns\:?(.*)', $attr['k']);
      if ($ns_match) {
        $prefix = $ns_match[1];
        $this->nsDecl(0, $prefix, $attr['value']);
        $attrs['xmlns'][$prefix] = $attr['value'];
        continue;
      }
      $attrs[$attr['k']] = $attr['value'];
      $attrs[$attr['k'] . ' m'] = $attr['values'];
    }
    return array($attrs, $v);
  }
  216. /* */
  /**
   * Tries to read a single attribute from the start of the input.
   *
   * Three forms are handled: a bare name (value 1), an unquoted value (up to the
   * next whitespace) and a quoted value, which is scanned character by character
   * until the quote character (optionally preceded by a backslash) is found.
   * A lone "/" is consumed but not reported as an attribute.
   *
   * @param string $v Input string.
   * @return array On success array(array('k' => name, 'value' => raw value,
   *               'values' => value split at single spaces), rest of input);
   *               array(0, rest) when no attribute could be read.
   */
  function xAttribute($v) {
    $m = $this->x('([^\s\=]+)\s*(\=)?\s*([\'\"]?)', $v);
    if (!$m) {
      return array(0, $v);
    }
    $name = $m[1];
    $quote = $m[3];
    $rest = $m[4];
    if (!$m[2]) {/* no '=' */
      if ($name == '/') {
        return array(0, $rest);
      }
      return array(array('k' => $name, 'value' => 1, 'values' => array(1)), $rest);
    }
    if (!$quote) {/* no quots */
      $plain = $this->x('([^\s]+)', $rest);
      if ($plain) {
        return array(array('k' => $name, 'value' => $plain[1], 'values' => array($plain[1])), $plain[2]);
      }
      return array(array('k' => $name, 'value' => '', 'values' => array()), $rest);
    }
    /* quoted value: collect characters until the terminating quote shows up */
    $terminator = '(\x5c\\' . $quote . '|\\' . $quote . ')';
    $val = '';
    while ($rest) {
      $end = $this->x($terminator, $rest);
      if ($end) {
        break;
      }
      $val .= substr($rest, 0, 1);
      $rest = substr($rest, 1);
    }
    /* a still-truthy $rest means the loop ended on a terminator match */
    if ($rest) {
      $rest = $end[2];
    }
    $vals = preg_split('/ /', $val);
    return array(array('k' => $name, 'value' => $val, 'values' => $vals), $rest);
  }
  244. /* */
  /**
   * Tells whether an opening tag has to be treated as an empty element.
   *
   * A tag is empty when its name is in the list of implicitly closed elements or
   * when its attribute section ends with a slash (`<foo ... />`). The name is
   * compared case-insensitively because xOpen() passes the tag name in its
   * original case, while processData() ignores the (lower-cased) closing tags of
   * the same list; without this, `<BR>` or `<IMG ...>` would be opened and never closed.
   *
   * NOTE(review): the list contains 'xmeta' rather than 'meta'; this looks like a
   * deliberate way of excluding meta from the list and is left unchanged - confirm.
   *
   * @param string $t Tag name as written in the document.
   * @param string $v Attribute section of the tag (everything between name and '>').
   * @return int 1 if the element is empty, otherwise 0.
   */
  function isEmpty($t, $v) {
    if (preg_match('/^(area|base|br|col|frame|hr|input|img|link|xmeta|param)$/i', $t)) {
      return 1;
    }
    if (preg_match('/\/$/', $v)) {
      return 1;
    }
    return 0;
  }
  254. /* */
  /**
   * Tries to read a closing tag from the start of the input.
   *
   * Whitespace between the tag name and '>' is accepted (`</p >`), as HTML allows
   * it in end tags. Without that, such a tag would match none of the tokenisers
   * in processData() and parsing would stall at that position.
   *
   * @param string $v Input string.
   * @return array array(array('tag' => lower-cased tag name), rest of input) on
   *               success, array(0, $v) otherwise.
   */
  function xClose($v) {
    if ($r = $this->x('\<\/([^\s\>]+)\s*\>', $v)) {
      return array(array('tag' => strtolower($r[1])), $r[2]);
    }
    return array(0, $v);
  }
  261. /* */
  /**
   * Tries to read character data from the start of the input.
   *
   * While the current tag name contains "script" or "style", everything up to the
   * matching closing tag is returned as one block and the closing tag is left in
   * the rest. Otherwise the text up to the next '<' is returned; leading
   * whitespace is kept only if the 'keep_cdata_whitespace' option is set.
   *
   * @param string $v Input string.
   * @return array array(text, rest of input) on success, array(0, $v) otherwise.
   */
  function xCData($v) {
    $no_match = array(0, $v);
    if (preg_match('/(script|style)/i', $this->cur_tag)) {
      /* raw content: ungreedy match up to the closing tag of the current element */
      $raw = $this->x('(.+)(\<\/' . $this->cur_tag . '\>)', $v, 'Uis');
      if ($raw) {
        return array($raw[1], $raw[2] . $raw[3]);
      }
      return $no_match;
    }
    $text = $this->x('([^\<]+)', $v, 'si', $this->keep_cdata_ws);
    if ($text) {
      return array($text[1], $text[2]);
    }
    return $no_match;
  }
  273. /* */
  /**
   * Runs the extractor component of every requested format that has not been
   * extracted yet.
   *
   * The node index is rebuilt first. For each format, the component
   * "<CamelCasedFormat>Extractor" is loaded through ARC2::inc(); if that succeeds,
   * an instance of the class "ARC2_<component>" is created and its extractRDF()
   * is called. The format is marked as extracted even if its component could not
   * be loaded.
   *
   * @param string $formats Space-separated format names; if empty, the
   *                        'sem_html_formats' option or the built-in default list is used.
   */
  function extractRDF($formats = '') {
    $this->node_index = $this->getNodeIndex();
    if (!$formats) {
      $formats = $this->v('sem_html_formats', $this->default_sem_html_formats, $this->a);
    }
    foreach (preg_split('/ /', $formats) as $format) {
      if (in_array($format, $this->extracted_formats)) {
        continue;
      }
      $comp = $this->camelCase($format) . 'Extractor';
      if (ARC2::inc($comp)) {
        $cls = 'ARC2_' . $comp;
        $extractor = new $cls($this->a, $this);
        $extractor->extractRDF();
      }
      $this->extracted_formats[] = $format;
    }
  }
  /**
   * Looks up a node by its id.
   *
   * @param mixed $id Key into $this->nodes.
   * @return mixed The stored node, or 0 if no node is set for that id.
   */
  function getNode($id) {
    if (isset($this->nodes[$id])) {
      return $this->nodes[$id];
    }
    return 0;
  }
  293. /* */
  294. }