PageRenderTime 50ms CodeModel.GetById 14ms RepoModel.GetById 1ms app.codeStats 0ms

/sites/all/modules/rdf/vendor/arc/extractors/ARC2_RdfaExtractor.php

https://github.com/ntulip/tattler
PHP | 374 lines | 318 code | 23 blank | 33 comment | 76 complexity | 859a7a3d1965319457e25f6b890e7553 MD5 | raw file
  1. <?php
  2. /*
  3. homepage: http://arc.semsol.org/
  4. license: http://arc.semsol.org/license
  5. class: ARC2 RDFa Extractor
  6. author: Benjamin Nowack
  7. version: 2009-02-09 (Tweak: getRootNode returns 1st node if html tag is not found)
  8. */
  9. ARC2::inc('RDFExtractor');
  /**
   * RDFa extractor.
   *
   * Walks the parsed node tree held in $this->nodes and reports every triple it
   * finds through addT(). The numbered "step" comments in processNode() follow
   * the step numbering used by the original implementation.
   *
   * Several members used below are inherited and not defined in this file
   * ($this->nodes, $this->caller, v(), x(), addT(), calcURI(), createBnodeID(),
   * getDocBase(), getSubNodes(), getContent(), getPlainContent()); their exact
   * contracts have to be looked up in the parent classes.
   */
  class ARC2_RdfaExtractor extends ARC2_RDFExtractor {

    /* bnode ID shared by all "[_:]" (empty blank node) CURIEs, created lazily by xBlankCURIE() */
    var $empty_bnode;

    /**
     * PHP 5+ constructor; forwards to the parent constructor.
     *
     * The former default for $a was dropped: it could never be used because the
     * required $caller follows it, and PHP 8 reports such signatures as deprecated.
     *
     * @param mixed  $a       Configuration, passed through unchanged.
     * @param object $caller  Calling component, passed through by reference.
     */
    function __construct($a, &$caller) {
      parent::__construct($a, $caller);
    }

    /**
     * Legacy (PHP 4 style) constructor name, kept so existing explicit calls keep working.
     */
    function ARC2_RdfaExtractor($a, &$caller) {
      $this->__construct($a, $caller);
    }

    /**
     * Initialisation hook; nothing extractor-specific, only delegates to the parent.
     */
    function __init() {
      parent::__init();
    }

    /**
     * Entry point: extracts RDFa triples from the parsed document.
     *
     * @return int|null 0 when the caller did not detect RDFa or when there is no
     *                  node to start from; otherwise nothing is returned and the
     *                  triples are reported through addT().
     */
    function extractRDF() {
      if (!isset($this->caller->detected_formats['rdfa'])) return 0;
      $root_node = $this->getRootNode();
      /* nothing parsed => nothing to extract */
      if (!$root_node) return 0;
      /* the document base is used as-is; xml:base on the root node is not evaluated */
      $base = $this->getDocBase();
      $context = array(
        'base' => $base,
        'p_s' => $base,
        'p_o' => '',
        'ns' => array(),
        'inco_ts' => array(),
        'lang' => '',
      );
      $this->processNode($root_node, $context, 0);
    }

    /**
     * Picks the node processing starts from.
     *
     * @return array|null The first node whose tag is "html"; if there is none, the
     *                    node stored under key 0, or else the first node of the
     *                    list; null when no nodes are available.
     */
    function getRootNode() {
      $nodes = (isset($this->nodes) && is_array($this->nodes)) ? $this->nodes : array();
      foreach ($nodes as $node) {
        if (isset($node['tag']) && ($node['tag'] == 'html')) {
          return $node;
        }
      }
      if (isset($nodes[0])) return $nodes[0];
      /* the list may be empty or may not start at key 0 */
      return $nodes ? reset($nodes) : null;
    }

    /**
     * Processes one node and, unless recursion is switched off, its sub-nodes.
     *
     * @param array $n      Node; reads $n['tag'] and the attribute map $n['a'].
     * @param array $ct     Evaluation context handed down by the parent node
     *                      (keys: base, p_s, p_o, ns, inco_ts, lang).
     * @param int   $level  Nesting depth; only passed on to recursive calls.
     *
     * @return int 1 if this node or one of its descendants added triples, or if
     *             the node established a non-bnode subject; 0 otherwise.
     */
    function processNode($n, $ct, $level) {
      $ts_added = 0;
      /* set up front: steps 12 and 13 read this flag even when recursion is switched off */
      $complete_triples = 0;
      /* step 1: fresh local context (xml:base on the node is not evaluated) */
      $lct = array(
        'recurse' => 1,
        'skip' => 0,
        'new_s' => '',
        'cur_o_res' => '',
        'inco_ts' => array(),
        'base' => $ct['base'],
      );
      /* step 2: namespace declarations in scope */
      $lct['ns'] = array_merge($ct['ns'], $this->v('xmlns', array(), $n['a']));
      /* step 3: language in scope */
      $lct['lang'] = $this->v('xml:lang', $ct['lang'], $n['a']);
      /* steps 4 + 5: new subject; with @rel/@rev present, @resource/@href name the object instead */
      $rel_uris = $this->getAttributeURIs($n, $ct, $lct, 'rel');
      $rev_uris = $this->getAttributeURIs($n, $ct, $lct, 'rev');
      $has_rel_rev = ($rel_uris || $rev_uris);
      $s_attrs = $has_rel_rev ? array('about', 'src') : array('about', 'src', 'resource', 'href');
      $lct['new_s'] = $this->getFirstRdfaAttributeURI($n, $lct, $s_attrs);
      if (!$lct['new_s']) {
        /* anchored so that only head/body (optionally prefixed) match, not thead/tbody */
        if (preg_match('/^([a-z0-9\-\_]+\:)?(head|body)$/i', $n['tag'])) {
          $lct['new_s'] = $lct['base'];
        }
        elseif ($this->getAttributeURIs($n, $ct, $lct, 'typeof')) {
          $lct['new_s'] = $this->createBnodeID();
        }
        elseif ($ct['p_o']) {
          $lct['new_s'] = $ct['p_o'];
          /* without @rel/@rev the node merely inherits the parent object: children keep the parent context */
          if (!$has_rel_rev) $lct['skip'] = 1;
        }
      }
      if ($has_rel_rev) {
        $lct['cur_o_res'] = $this->getFirstRdfaAttributeURI($n, $lct, array('resource', 'href'));
      }
      if ($lct['new_s']) {
        /* step 6: rdf:type triples from @typeof */
        foreach ($this->getAttributeURIs($n, $ct, $lct, 'typeof') as $uri) {
          $this->addT(array(
            's' => $lct['new_s'],
            's_type' => $this->getRdfaTermType($lct['new_s']),
            'p' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type',
            'o' => $uri,
            'o_type' => 'uri',
            'o_lang' => '',
            'o_datatype' => '',
          ));
          $ts_added = 1;
        }
        /* step 7: @rel/@rev triples towards an explicitly given object resource */
        if ($lct['cur_o_res']) {
          foreach ($rel_uris as $uri) {
            $this->addRdfaResourceT($lct['new_s'], $uri, $lct['cur_o_res']);
            $ts_added = 1;
          }
          foreach ($rev_uris as $uri) {
            $this->addRdfaResourceT($lct['cur_o_res'], $uri, $lct['new_s']);
            $ts_added = 1;
          }
        }
      }
      /* step 8: @rel/@rev without an object => bnode object plus incomplete triples for the children */
      if (!$lct['cur_o_res'] && $has_rel_rev) {
        $lct['cur_o_res'] = $this->createBnodeID();
        foreach ($rel_uris as $uri) {
          $lct['inco_ts'][] = array('p' => $uri, 'dir' => 'fwd');
        }
        foreach ($rev_uris as $uri) {
          $lct['inco_ts'][] = array('p' => $uri, 'dir' => 'rev');
        }
      }
      /* step 10: literal triples from @property */
      if ($lct['new_s']) {
        if ($uris = $this->getAttributeURIs($n, $ct, $lct, 'property')) {
          /* the literal does not depend on the property URI, so it is built once */
          $lct['cur_o_lit'] = $this->getCurrentObjectLiteral($n, $lct, $ct);
          foreach ($uris as $uri) {
            $this->addT(array(
              's' => $lct['new_s'],
              's_type' => $this->getRdfaTermType($lct['new_s']),
              'p' => $uri,
              'o' => $lct['cur_o_lit']['value'],
              'o_type' => 'literal',
              'o_lang' => $lct['cur_o_lit']['lang'],
              'o_datatype' => $lct['cur_o_lit']['datatype'],
            ));
            $ts_added = 1;
          }
          /* the markup became the literal value, so it is not walked again */
          if ($lct['cur_o_lit']['datatype'] == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral') {
            $lct['recurse'] = 0;
          }
        }
      }
      /* step 11 (10): recurse into sub-nodes */
      if ($lct['recurse']) {
        if ($lct['skip']) {
          $new_ct = array_merge($ct, array('base' => $lct['base'], 'lang' => $lct['lang'], 'ns' => $lct['ns']));
        }
        else {
          $new_ct = array(
            'base' => $lct['base'],
            'p_s' => $lct['new_s'] ? $lct['new_s'] : $ct['p_s'],
            'p_o' => $lct['cur_o_res'] ? $lct['cur_o_res'] : ($lct['new_s'] ? $lct['new_s'] : $ct['p_s']),
            'ns' => $lct['ns'],
            'inco_ts' => $lct['inco_ts'],
            'lang' => $lct['lang']
          );
        }
        foreach ($this->getSubNodes($n) as $sub_node) {
          if ($this->processNode($sub_node, $new_ct, $level + 1)) {
            $complete_triples = 1;
          }
        }
      }
      /* step 12 (11): complete the parent's incomplete triples */
      /* NOTE(review): the last clause is an assignment, so this guard always passes and triples are */
      /* completed even when new_s is empty - kept as-is; confirm whether "==" was intended */
      $other = 0;
      if ($ts_added || $complete_triples || ($lct['new_s'] && !preg_match('/^\_\:/', $lct['new_s'])) || ($other = 1)) {
        foreach ($ct['inco_ts'] as $inco_t) {
          if ($inco_t['dir'] == 'fwd') {
            $this->addRdfaResourceT($ct['p_s'], $inco_t['p'], $lct['new_s']);
          }
          elseif ($inco_t['dir'] == 'rev') {
            $this->addRdfaResourceT($lct['new_s'], $inco_t['p'], $ct['p_s']);
          }
        }
      }
      /* step 13 (12): result flag */
      if ($ts_added || $complete_triples) return 1;
      if ($lct['new_s'] && !preg_match('/^\_\:/', $lct['new_s'])) return 1;
      return 0;
    }

    /**
     * Internal helper: classifies a subject/object term.
     *
     * @return string 'bnode' when $term starts with "_:", otherwise 'uri'.
     */
    function getRdfaTermType($term) {
      return preg_match('/^\_\:/', $term) ? 'bnode' : 'uri';
    }

    /**
     * Internal helper: adds a triple whose object is a resource (URI or bnode).
     *
     * @param string $s  Subject.
     * @param string $p  Predicate URI.
     * @param string $o  Object.
     */
    function addRdfaResourceT($s, $p, $o) {
      $this->addT(array(
        's' => $s,
        's_type' => $this->getRdfaTermType($s),
        'p' => $p,
        'o' => $o,
        'o_type' => $this->getRdfaTermType($o),
        'o_lang' => '',
        'o_datatype' => '',
      ));
    }

    /**
     * Internal helper: resolves the first attribute of $attrs that is present on
     * the node and yields a non-empty URI.
     *
     * @param array $n      Node (reads $n['a']).
     * @param array $lct    Local context (reads 'base' and 'ns').
     * @param array $attrs  Attribute names in order of precedence.
     *
     * @return string The resolved URI/bnode ID, or '' when none qualifies.
     */
    function getFirstRdfaAttributeURI($n, $lct, $attrs) {
      foreach ($attrs as $attr) {
        if (!isset($n['a'][$attr])) continue;
        list($uri, $sub_v) = $this->xURI($n['a'][$attr], $lct['base'], $lct['ns']);
        if ($uri) return $uri;
      }
      return '';
    }

    /**
     * Resolves a whitespace-separated attribute value (rel, rev, typeof, property)
     * into a list of URIs.
     *
     * Tokens are split on any run of whitespace, so tab- or newline-separated
     * values are handled like space-separated ones. Tokens that xURI() cannot
     * turn into a URI are dropped.
     *
     * @param array  $n     Node (reads $n['a']).
     * @param array  $ct    Parent context (unused here, kept for the call signature).
     * @param array  $lct   Local context (reads 'base' and 'ns').
     * @param string $attr  Attribute name.
     *
     * @return array List of URIs; empty when the attribute is missing or unresolvable.
     */
    function getAttributeURIs($n, $ct, $lct, $attr) {
      $val = $this->v($attr, '', $n['a']);
      $tokens = $val ? preg_split('/\s+/', $val) : array();
      $r = array();
      foreach ($tokens as $token) {
        $token = trim($token);
        if (!$token) continue;
        list($uri, $sub_v) = $this->xURI($token, $lct['base'], $lct['ns'], $attr);
        if ($uri) {
          $r[] = $uri;
        }
      }
      return $r;
    }

    /**
     * Builds the literal object for a node carrying @property.
     *
     * Value precedence: @content; else the plain content when the node holds no
     * markup; else the plain content when a datatype other than rdf:XMLLiteral is
     * given; else the markup itself, typed as rdf:XMLLiteral.
     *
     * @param array $n    Node (reads $n['a']).
     * @param array $lct  Local context (reads 'base', 'ns', 'lang').
     * @param array $ct   Parent context (unused here, kept for the call signature).
     *
     * @return array Map with the keys 'value', 'lang' and 'datatype'.
     */
    function getCurrentObjectLiteral($n, $lct, $ct) {
      $xml_val = $this->getContent($n);
      $plain_val = $this->getPlainContent($n);
      $xml_literal = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';
      $dt = $this->v('datatype', '', $n['a']);
      if ($dt) {
        list($dt, $sub_v) = $this->xURI($dt, $lct['base'], $lct['ns']);
      }
      $r = array('value' => '', 'lang' => $lct['lang'], 'datatype' => $dt);
      if (isset($n['a']['content'])) {
        $r['value'] = $n['a']['content'];
      }
      elseif ($xml_val == $plain_val) {
        $r['value'] = $plain_val;
      }
      elseif (!preg_match('/[\<\>]/', $xml_val)) {
        $r['value'] = $xml_val;
      }
      elseif (isset($n['a']['datatype']) && ($dt != $xml_literal)) {
        $r['value'] = $plain_val;
      }
      else {
        /* no @datatype at all, or an explicit rdf:XMLLiteral */
        $r['value'] = $this->injectXMLDeclarations($xml_val, $lct['ns'], $lct['lang']);
        $r['datatype'] = $xml_literal;
      }
      return $r;
    }

    /**
     * Adds namespace and language declarations to an XML literal by textual
     * rewriting (@@todo proper node rebuilding).
     *
     * Every unprefixed start tag gets the XHTML default namespace (plus xml:lang);
     * for each known prefix, the first start tag using it gets the matching
     * xmlns:prefix declaration. Duplicated xml:lang / xmlns attributes introduced
     * by this are removed again afterwards.
     *
     * @param string $val   Markup.
     * @param array  $ns    Map prefix => namespace URI.
     * @param string $lang  Language code, may be empty.
     *
     * @return string The rewritten markup.
     */
    function injectXMLDeclarations($val, $ns, $lang) {
      $lang_code = $lang ? ' xml:lang="' . $lang . '"' : '';
      /* "\" and "$" are escaped so that inserted text is never read as a back-reference */
      $lang_rep = addcslashes($lang_code, '\\$');
      /* ns */
      $val = preg_replace('/<([a-z0-9]+)([\>\s])/is', '<\\1 xmlns="http://www.w3.org/1999/xhtml"' . $lang_rep . '\\2', $val);
      foreach ($ns as $prefix => $uri) {
        if (!$prefix) continue;
        $pos = strpos($val, '<' . $prefix . ':');
        if ($pos === false) continue;
        $decl = addcslashes(' xmlns:' . $prefix . '="' . $uri . '"' . $lang_code, '\\$');
        /* the prefix is quoted because characters such as "." are legal in prefixes */
        $re = '/^(<' . preg_quote((string) $prefix, '/') . '\:[^\>\s]+)/';
        $val = substr($val, 0, $pos) . preg_replace($re, '\\1' . $decl, substr($val, $pos));
      }
      /* remove accidentally added xml:lang and xmlns= */
      $val = preg_replace('/(\<[^\>]*)( xml\:lang[^\s\>]+)([^\>]*)(xml\:lang[^\s\>]+)/s', '\\1\\3\\4', $val);
      $val = preg_replace('/(\<[^\>]*)( xmlns=[^\s\>]+)([^\>]*)(xmlns=[^\s\>]+)/s', '\\1\\3\\4', $val);
      return $val;
    }

    /**
     * Resolves an attribute value to a URI or bnode ID.
     *
     * Tried in order: blank-node CURIE, safe CURIE ("[prefix:ref]"), plain CURIE,
     * reserved XHTML link type (rel/rev only). A remaining bare rel/rev token is
     * rejected; anything else is resolved against $base via calcURI().
     *
     * @param string $v          Raw value.
     * @param string $base       Base URI.
     * @param array  $ns         Map prefix => namespace URI.
     * @param string $attr_type  Name of the attribute the value came from (optional).
     *
     * @return array Pair (URI or 0, unparsed rest of the value).
     */
    function xURI($v, $base, $ns, $attr_type = '') {
      foreach (array('xBlankCURIE', 'xSafeCURIE', 'xCURIE') as $method) {
        list($sub_r, $sub_v) = $this->$method($v, $base, $ns);
        if ($sub_r) {
          return array($sub_r, $sub_v);
        }
      }
      $is_link_attr = preg_match('/^(rel|rev)$/', $attr_type);
      if ($is_link_attr && preg_match('/^\s*(alternate|appendix|bookmark|cite|chapter|contents|copyright|glossary|help|icon|index|last|license|meta|next|p3pv1|prev|role|section|stylesheet|subsection|start|up)(\s|$)/s', $v, $m)) {
        return array('http://www.w3.org/1999/xhtml/vocab#' . $m[1], preg_replace('/^\s*' . $m[1]. '/s', '', $v));
      }
      /* unknown bare token in rel/rev: not a URI */
      if ($is_link_attr && preg_match('/^[a-z0-9\.]+$/i', $v)) {
        return array(0, $v);
      }
      return array($this->calcURI($v, $base), '');
    }

    /**
     * Matches blank-node CURIEs: "[_:]" yields one bnode ID shared by the whole
     * extractor instance, "_:name" / "[_:name]" yield "_:name".
     *
     * @return array Pair (bnode ID, '') on a match, otherwise (0, $v).
     */
    function xBlankCURIE($v, $base, $ns) {
      if ($this->x('\[\_\:\]', $v)) {
        if (!isset($this->empty_bnode)) {
          $this->empty_bnode = $this->createBnodeID();
        }
        return array($this->empty_bnode, '');
      }
      if ($sub_r = $this->x('\[?(\_\:[a-z0-9\_\-]+)\]?', $v)) {
        return array($sub_r[1], '');
      }
      return array(0, $v);
    }

    /**
     * Matches safe CURIEs of the form "[prefix:reference]". An empty prefix maps
     * to the XHTML vocabulary namespace; other prefixes must be declared in $ns.
     *
     * @return array Pair (URI, '') on success, otherwise (0, $v).
     */
    function xSafeCURIE($v, $base, $ns) {
      if ($sub_r = $this->x('\[([^\:]*)\:(.*)\]', $v)) {
        if (!$sub_r[1]) return array('http://www.w3.org/1999/xhtml/vocab#' . $sub_r[2], '');
        if (isset($ns[$sub_r[1]])) {
          return array($ns[$sub_r[1]] . $sub_r[2], '');
        }
      }
      return array(0, $v);
    }

    /**
     * Matches plain CURIEs of the form "prefix:reference". An empty prefix maps to
     * the XHTML vocabulary namespace; other prefixes must be declared in $ns.
     *
     * @return array Pair (URI, '') on success, otherwise (0, $v).
     */
    function xCURIE($v, $base, $ns) {
      if ($sub_r = $this->x('([a-z0-9\-\_]*)\:([a-z0-9\-\_]+)', $v)) {
        if (!$sub_r[1]) return array('http://www.w3.org/1999/xhtml/vocab#' . $sub_r[2], '');
        if (isset($ns[$sub_r[1]])) {
          return array($ns[$sub_r[1]] . $sub_r[2], '');
        }
      }
      return array(0, $v);
    }

  }