PageRenderTime 57ms CodeModel.GetById 28ms RepoModel.GetById 1ms app.codeStats 0ms

/app/arc/parsers/ARC2_LegacyXMLParser.php

https://github.com/rinie/TransFormr
PHP | 315 lines | 249 code | 33 blank | 33 comment | 49 complexity | 06a1eb16b36cb5b86d83208051f5fb15 MD5 | raw file
  1. <?php
  2. /*
  3. homepage: http://arc.semsol.org/
  4. license: http://arc.semsol.org/license
  5. class: ARC2 Legacy XML Parser
  6. author: Benjamin Nowack
  7. version: 2008-10-04 (Fix: nsDecl led to warnings when uri was an array.)
  8. */
  9. ARC2::inc('Class');
  10. class ARC2_LegacyXMLParser extends ARC2_Class {
  /**
   * Hands the configuration and the calling object on to the parent constructor.
   *
   * $a intentionally has no default value: it precedes the required $caller,
   * so it could never be omitted, and PHP 8 reports an optional parameter
   * declared before a required one as deprecated.
   *
   * @param mixed $a       configuration, passed through unchanged
   * @param mixed $caller  calling object, received by reference
   */
  function __construct($a, &$caller) {
    parent::__construct($a, $caller);
  }
  /**
   * PHP4-style constructor; simply delegates to __construct().
   *
   * $a intentionally has no default value: it precedes the required $caller,
   * so it could never be omitted, and PHP 8 reports an optional parameter
   * declared before a required one as deprecated.
   *
   * @param mixed $a       configuration, passed through unchanged
   * @param mixed $caller  calling object, received by reference
   */
  function ARC2_LegacyXMLParser($a, &$caller) {
    $this->__construct($a, $caller);
  }
  /**
   * Runs the parent initialisation, then (re)sets this parser's own fields.
   *
   * parse() calls this again before retrying a document as ISO-8859-1.
   */
  function __init() {
    parent::__init();
    /* well-known namespace URIs and the uri => prefix map seeded from them */
    $this->xml = 'http://www.w3.org/XML/1998/namespace';
    $this->rdf = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
    $this->nsp = array($this->xml => 'xml', $this->rdf => 'rdf');
    /* options looked up in $this->a via the inherited v() helper */
    $this->encoding = $this->v('encoding', false, $this->a);
    $this->keep_cdata_ws = $this->v('keep_cdata_whitespace', 0, $this->a);
    /* working state */
    $this->state = 0;
    $this->x_base = $this->base;
    $this->allowCDataNodes = 1;
    $this->target_encoding = '';
  }
  29. /* */
  /**
   * Injects the reader that parse() will use instead of creating its own.
   *
   * @param object $reader  stored by reference in $this->reader
   */
  function setReader(&$reader) {
    $this->reader =& $reader;
  }
  /**
   * Streams a document through the XML parser and fills $this->nodes.
   *
   * If the parser reports an "Invalid character" error and $iso_fallback is
   * not yet set, the parser state is reset and the document is parsed once
   * more with a forced ISO-8859-1 XML declaration.
   *
   * @param string $path          location handed to the reader's activate()
   * @param string $data          optional data handed to the reader's activate()
   * @param bool   $iso_fallback  true on the ISO-8859-1 retry pass
   * @return mixed result of done() on success, result of addError() on an XML error
   */
  function parse($path, $data = '', $iso_fallback = false) {
    $this->nodes = array();
    $this->node_count = 0;
    $this->level = 0;
    /* the index cached by getNodeIndex() belongs to the previous node list */
    unset($this->node_index);
    /* reader */
    if (!$this->v('reader')) {
      ARC2::inc('Reader');
      /* plain assignment: "= & new" is a parse error on PHP 7+ */
      $this->reader = new ARC2_Reader($this->a, $this);
    }
    $this->reader->setAcceptHeader('Accept: application/xml; q=0.9, */*; q=0.1');
    $this->reader->activate($path, $data);
    $this->x_base = isset($this->a['base']) && $this->a['base'] ? $this->a['base'] : $this->reader->base;
    $this->base = $this->x_base;
    $this->doc_url = $this->reader->base;
    /* xml parser */
    $this->initXMLParser();
    /* parse */
    $first = true;
    while ($d = $this->reader->readStream(1)) {
      if ($iso_fallback && $first) {
        /* swap any existing XML declaration for an ISO-8859-1 one */
        $d = '<?xml version="1.0" encoding="ISO-8859-1"?>' . "\n" . preg_replace('/^\<\?xml [^\>]+\?\>\s*/s', '', $d);
      }
      if (!xml_parse($this->xml_parser, $d, false)) {
        $error_str = xml_error_string(xml_get_error_code($this->xml_parser));
        $line = xml_get_current_line_number($this->xml_parser);
        if (!$iso_fallback && preg_match("/Invalid character/i", $error_str)) {
          xml_parser_free($this->xml_parser);
          unset($this->xml_parser);
          $this->reader->closeStream();
          unset($this->reader);
          $this->__init();
          $this->encoding = 'ISO-8859-1';
          $this->initXMLParser();
          return $this->parse($path, $data, true);
        }
        /* build the message first: getEncoding() may still consult the reader */
        $msg = 'XML error: "' . $error_str . '" at line ' . $line . ' (parsing as ' . $this->getEncoding() . ')';
        /* release the failed parser and the stream; the property must be
           unset too, otherwise initXMLParser() would keep the dead parser */
        xml_parser_free($this->xml_parser);
        unset($this->xml_parser);
        $this->reader->closeStream();
        return $this->addError($msg);
      }
      $first = false;
    }
    $this->target_encoding = xml_parser_get_option($this->xml_parser, XML_OPTION_TARGET_ENCODING);
    xml_parser_free($this->xml_parser);
    /* forget the freed handle so that a later parse() creates a fresh parser */
    unset($this->xml_parser);
    $this->reader->closeStream();
    unset($this->reader);
    return $this->done();
  }
  80. /* */
  /**
   * Reports an encoding from one of three sources.
   *
   * @param string $src  'parser' yields the target encoding recorded by parse();
   *                     'config' yields $this->encoding when it is set;
   *                     every other case asks the reader
   * @return mixed
   */
  function getEncoding($src = 'config') {
    if ($src == 'parser') {
      return $this->target_encoding;
    }
    $use_config = ($src == 'config') && $this->encoding;
    return $use_config ? $this->encoding : $this->reader->getEncoding();
  }
  90. /* */
  /**
   * Called by parse() once the whole stream has been consumed; its return
   * value becomes the return value of parse(). Intentionally empty here.
   */
  function done() {
  }
  93. /* */
  /**
   * Wraps the node list in an array under the key 'nodes'.
   *
   * The list is fetched through the inherited v() helper with an empty array
   * as the default value.
   *
   * @return array
   */
  function getStructure() {
    $nodes = $this->v('nodes', array());
    return array('nodes' => $nodes);
  }
  97. /* */
  /**
   * Returns the nodes grouped by parent id and keyed by position, building
   * and caching the index in $this->node_index on first use.
   *
   * While building, each node is stamped with its id, the document base and
   * (when known) the document URL, and written back via updateNode().
   *
   * @return array  p_id => (pos => node)
   */
  function getNodeIndex(){
    if (isset($this->node_index)) {
      return $this->node_index;
    }
    /* index by parent */
    $by_parent = array();
    $total = count($this->nodes);
    for ($id = 0; $id < $total; $id++) {
      $entry = $this->nodes[$id];
      $entry['id'] = $id;
      $entry['doc_base'] = $this->base;
      if (isset($this->doc_url)) {
        $entry['doc_url'] = $this->doc_url;
      }
      $this->updateNode($entry);
      /* the per-parent bucket is created implicitly on first write */
      $by_parent[$entry['p_id']][$entry['pos']] = $entry;
    }
    $this->node_index = $by_parent;
    return $this->node_index;
  }
  /**
   * Returns the flat node list in the order the nodes were pushed.
   *
   * @return array
   */
  function getNodes() {
    return $this->nodes;
  }
  /**
   * Looks up the children of a node in the parent index.
   *
   * @param array $n  node; its 'id' is used as the lookup key
   * @return mixed    index entry for that id, or an empty array as the default
   */
  function getSubNodes($n) {
    $index = $this->getNodeIndex();
    return $this->v($n['id'], array(), $index);
  }
  /**
   * Serialises a node's content (optionally including its own tag) to a string.
   *
   * cdata nodes return their stored value. Other nodes return their own
   * character data followed by the outer serialisation of every sub node.
   *
   * @param array $n      node to serialise
   * @param int   $outer  when truthy, wrap the content in the node's own tag
   * @param int   $trim   when truthy (and keep_cdata_ws is off), trim the result
   * @return string
   */
  function getNodeContent($n, $outer = 0, $trim = 1) {
    if ($n['tag'] == 'cdata') {
      $r = $n['a']['value'];
    }
    else {
      $r = '';
      if ($outer) {
        $r .= '<' . $n['tag'];
        asort($n['a']);
        /* only read the default-namespace entry when xmlns really is an array
           holding a non-empty '' key; the unguarded read raised notices for a
           missing key and failed on a plain string value */
        if (isset($n['a']['xmlns']) && is_array($n['a']['xmlns']) && !empty($n['a']['xmlns'][''])) {
          $r .= ' xmlns="' . $n['a']['xmlns'][''] . '"';
        }
        foreach ($n['a'] as $a => $val) {
          /* skip attribute names containing whitespace and array-valued attributes */
          /* NOTE(review): addslashes() is not XML attribute escaping; kept as is
             because callers may depend on the current output - confirm */
          $r .= preg_match('/^[^\s]+$/', $a) && !is_array($val) ? ' ' . $a . '="' . addslashes($val) . '"' : '';
        }
        $r .= $n['empty'] ? '/>' : '>';
      }
      if (!$n['empty']) {
        $r .= $this->v('cdata', '', $n);
        $sub_nodes = $this->getSubNodes($n);
        foreach ($sub_nodes as $sub_n) {
          /* children are always emitted with their tags and untrimmed */
          $r .= $this->getNodeContent($sub_n, 1, 0);
        }
        if ($outer) {
          $r .= '</' . $n['tag'] . '>';
        }
      }
    }
    return ($trim && !$this->keep_cdata_ws) ? trim($r) : $r;
  }
  155. /* */
  /**
   * Appends a node to the node list, assigning it the next free id.
   *
   * @param array $n  node; its 'id' entry is overwritten
   */
  function pushNode($n) {
    $id = $this->node_count;
    $n['id'] = $id;
    $this->nodes[$id] = $n;
    $this->node_count = $id + 1;
  }
  /**
   * Returns the most recently pushed node, or - when $t is given - the most
   * recent node whose 'tag' equals $t.
   *
   * The search walks backwards through the node list. If no node matches, the
   * last node examined is returned; with an empty node list the result is 0.
   *
   * @param string $t  tag to look for; empty means "latest node"
   * @return mixed
   */
  function getCurNode($t = '') {
    $pos = $this->node_count - 1;
    while (true) {
      $r = $this->node_count ? $this->nodes[$pos] : 0;
      if (!$t || ($r['tag'] == $t)) {
        break;
      }
      $pos--;
      if (!isset($this->nodes[$pos])) {
        break;
      }
    }
    return $r;
  }
  /**
   * Writes a (modified) node copy back into the node list at its own id.
   * Nodes are handled as array copies, so changes must be stored explicitly.
   *
   * @param array $node  node carrying an 'id' entry
   */
  function updateNode($node) {/* php4-save */
    $id = $node['id'];
    $this->nodes[$id] = $node;
  }
  173. /* */
  /**
   * Creates and configures the namespace-aware XML parser unless
   * $this->xml_parser is already set.
   *
   * The source encoding is taken from getEncoding() when it is one of
   * UTF-8, ISO-8859-1 or US-ASCII (case-insensitive); otherwise UTF-8 is used.
   */
  function initXMLParser() {
    if (isset($this->xml_parser)) {
      return;
    }
    $enc = preg_match('/^(utf\-8|iso\-8859\-1|us\-ascii)$/i', $this->getEncoding(), $m) ? $m[1] : 'UTF-8';
    $parser = xml_parser_create_ns($enc, '');
    /* bind the handler object first: PHP 8.4 resolves string method names
       against the object set here at registration time, so the object has
       to be known before the handlers below are registered */
    xml_set_object($parser, $this);
    xml_parser_set_option($parser, XML_OPTION_SKIP_WHITE, 0);
    xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
    xml_set_element_handler($parser, 'open', 'close');
    xml_set_character_data_handler($parser, 'cData');
    xml_set_start_namespace_decl_handler($parser, 'nsDecl');
    $this->xml_parser = $parser;
  }
  187. /* */
  /**
   * Start-element handler: records a new node for the opened tag.
   *
   * @param mixed  $p  XML parser handle (only passed on to nsDecl())
   * @param string $t  tag name as reported by the parser
   * @param array  $a  attributes of the element
   */
  function open($p, $t, $a) {
    $t_exact = $t;
    /* strpos() is used as a truth test here, so a name without a colon
       (or with one at offset 0) is lowercased */
    if (!strpos($t, ':')) {
      $t = strtolower($t);
    }
    /* base check: a <base href> element changes the base for what follows */
    $base = '';
    if (($t == 'base') && isset($a['href'])) {
      $base = $a['href'];
      $this->base = $base;
    }
    /* URIs: add a resolved "<name> uri" entry for each reference attribute */
    foreach (array('href', 'src', 'id') as $attr) {
      if (!isset($a[$attr])) {
        continue;
      }
      $ref = ($attr == 'id') ? '#' . $a[$attr] : $a[$attr];
      $a[$attr . ' uri'] = $this->calcURI($ref);
    }
    /* ns: attributes whose name starts with "xmlns" are namespace declarations */
    if ($a) {
      foreach ($a as $attr => $value) {
        if (strpos($attr, 'xmlns') !== 0) {
          continue;
        }
        $this->nsDecl($p, trim(substr($attr, 5), ':'), $value);
      }
    }
    /* node: by default a child (position 0) of the node pushed last */
    $node = array(
      'tag' => $t,
      'tag_exact' => $t_exact,
      'a' => $a,
      'level' => $this->level,
      'pos' => 0,
      'p_id' => $this->node_count - 1,
      'state' => 'open',
      'empty' => 0,
      'cdata' => ''
    );
    if ($base) {
      $node['base'] = $base;
    }
    /* parent/sibling */
    if ($this->node_count) {
      $level = $this->level;
      $prev = $this->getCurNode();
      if ($prev['level'] >= $level) {
        /* climb from a deeper previous node to the ancestor on our level;
           stop silently if the parent chain is broken */
        while ($prev['level'] > $level) {
          if (!isset($this->nodes[$prev['p_id']])) {
            break;
          }
          $prev = $this->nodes[$prev['p_id']];
        }
        /* the node found is our preceding sibling */
        $node['p_id'] = $prev['p_id'];
        $node['pos'] = $prev['pos'] + 1;
      }
    }
    $this->pushNode($node);
    $this->level++;
    /* cdata */
    $this->cur_cdata = "";
  }
  /**
   * End-element handler: marks the matching node as closed and steps one
   * nesting level up.
   *
   * @param mixed  $p      XML parser handle (unused)
   * @param string $t      tag name as reported by the parser
   * @param int    $empty  stored in the node's 'empty' entry
   */
  function close($p, $t, $empty = 0) {
    /* normalise the name exactly like open() does when it stores 'tag';
       without this a mixed-case, colon-free tag never matches and
       getCurNode() falls back to the oldest node, which then gets closed
       instead of the element that actually ended */
    $t = strpos($t, ':') ? $t : strtolower($t);
    $node = $this->getCurNode($t);
    $node['state'] = 'closed';
    $node['empty'] = $empty;
    $this->updateNode($node);
    $this->level--;
  }
  /**
   * Character-data handler.
   *
   * Text arriving while the latest node is still open is appended to that
   * node's 'cdata'. Otherwise the text follows a closed node and, if cdata
   * nodes are allowed, is stored as a separate 'cdata' node.
   *
   * @param mixed  $p  XML parser handle (passed on to open()/close())
   * @param string $d  character data chunk
   */
  function cData($p, $d) {
    $current = $this->getCurNode();
    if ($current['state'] == 'open') {
      $current['cdata'] .= $d;
      $this->updateNode($current);
      return;
    }
    /* cdata is sibling of node */
    if (!$this->allowCDataNodes) {
      return;
    }
    $this->open($p, 'cdata', array('value' => $d));
    $this->close($p, 'cdata');
  }
  /**
   * Namespace-declaration handler: records prefix => uri in $this->ns and
   * uri => prefix in $this->nsp (the first prefix seen for a uri is kept).
   *
   * @param mixed  $p    XML parser handle (unused)
   * @param string $prf  namespace prefix
   * @param mixed  $uri  namespace uri; array values are ignored
   * @return mixed 1 when $uri is an array, otherwise nothing
   */
  function nsDecl($p, $prf, $uri) {
    if (is_array($uri)) {
      return 1;
    }
    $this->ns[$prf] = $uri;
    if (!isset($this->nsp[$uri])) {
      $this->nsp[$uri] = $prf;
    }
  }
  280. /* */
  281. }