/extensions/ParserWiki/ParserEngine.php

https://github.com/ChuguluGames/mediawiki-svn · PHP · 236 lines · 24 code · 2 blank · 210 comment · 5 complexity · fede7dc38e98b02e4e8b30273dda162b MD5 · raw file

  1. <?php
  /**
   * Acts as the primary interface between the world and the grammar-driven parser.
   *
   * A grammar is an XML document read from a wiki page in the NS_GRAMMAR
   * namespace. Its root element carries the "rootTag" and "startRule"
   * attributes; its children are named rules built from the elements this
   * engine understands: Assignment, Sequence, Choice, Reference and Text.
   *
   * mGrammars - cache of loaded (and pre-processed) grammar documents, keyed by name
   */
  class ParseEngine {
      /** Maximum number of rule evaluations before a grammar loop is assumed. */
      const maxIter = 2048;

      /** @var array Loaded grammars (DOMDocument), keyed by grammar name. */
      private $mGrammars;

      public function __construct() {
          $this->mGrammars = array();
      }

      /**
       * Parses $text with the named grammar and, on success, replaces $text
       * with the serialised XML of the resulting tree.
       *
       * @param string $grammarName Title of the grammar page (in NS_GRAMMAR)
       * @param string &$text Text to parse; overwritten with XML on success and
       *   left untouched when the grammar cannot be loaded, the start rule is
       *   missing, or the start rule does not match
       * @return bool Always true, whether or not parsing succeeded
       *   (NOTE(review): presumably so it can serve as a hook handler - confirm)
       * @throws MWException When the iteration limit is hit or a Reference
       *   names a rule that does not exist
       */
      public function parse($grammarName, &$text) {
          wfDebugLog("ParseEngine", "==========Start Parse Engine==========\n");
          if (isset($this->mGrammars[$grammarName])) {
              $grammar = $this->mGrammars[$grammarName];
          } else {
              // Title::newFromText() yields null for an invalid title; do not
              // hand that to Revision::newFromTitle().
              $title = Title::newFromText($grammarName, NS_GRAMMAR);
              $revision = $title ? Revision::newFromTitle($title) : null;
              $grammar = new DOMDocument();
              if (!$revision || !$grammar->loadXML($revision->getText(), LIBXML_NOBLANKS)) {
                  return true;
              }
              // Annotate the grammar once (saveTags/childTags/pushInd) and cache it.
              $this->pushTags($grammar->documentElement, "");
              $this->mGrammars[$grammarName] = $grammar;
          }

          $grammarRoot = $grammar->documentElement;
          $startRule = $this->findRule($grammar, $grammarRoot->getAttribute("startRule"));
          if ($startRule === null) {
              wfDebugLog("ParseEngine", "Start rule not found in grammar $grammarName\n");
              return true;
          }

          $doc = new DOMDocument();
          $rootTag = $doc->createElement($grammarRoot->getAttribute("rootTag"));
          $iter = 0;
          // Work on a copy so that $text survives a failed parse.
          $refText = $text;
          if (!$this->parseRec($startRule, "", "", $iter, $refText, $rootTag)) {
              return true;
          }
          $doc->appendChild($rootTag);
          $text = $doc->saveXML();
          wfDebugLog("ParseEngine", "Parsed text - $text\n");
          return true;
      }

      /**
       * Turns a list of parsed nodes back into text: text nodes contribute
       * their data, elements contribute their "tag" attribute followed by the
       * unparsed form of their children. Any other node type (comments,
       * processing instructions, ...) is skipped.
       *
       * @param DOMNodeList|array $inNodes Nodes to serialise
       * @return string
       */
      public static function unparse($inNodes) {
          $retStr = "";
          foreach ($inNodes as $child) {
              if ($child instanceof DOMText) {
                  $retStr .= $child->data;
              } elseif ($child instanceof DOMElement) {
                  $retStr .= $child->getAttribute("tag") . self::unparse($child->childNodes);
              }
          }
          return $retStr;
      }

      /**
       * Looks up a top-level rule of the grammar by its "name" attribute.
       *
       * @param DOMDocument $grammar Grammar document to search
       * @param string $name Rule name
       * @return DOMNode|null The first matching rule, or null when there is none
       *   (including when the name makes the XPath expression malformed)
       */
      private function findRule($grammar, $name) {
          $xpath = new DOMXPath($grammar);
          $found = $xpath->query("/Grammar/*[@name='$name']");
          if ($found === false) {
              return null;
          }
          return $found->item(0);
      }

      /**
       * Joins two regex alternations with "|", leaving out an empty side so
       * that no empty alternative (which would match everywhere) is produced.
       *
       * @param string|null $left
       * @param string|null $right
       * @return string
       */
      private static function joinTags($left, $right) {
          $left = (string)$left;
          $right = (string)$right;
          if ($left === "") {
              return $right;
          }
          if ($right === "") {
              return $left;
          }
          return $left . "|" . $right;
      }

      /**
       * Applies one grammar rule to the front of $text.
       *
       * @param DOMElement $rule Rule to apply
       * @param string $replaceStr Value substituted for "~r" in tags
       * @param string $saveTags Regex alternation of tags that end an enclosing Text rule
       * @param int &$iter Running count of rule evaluations for this parse
       * @param string &$text Remaining input; consumed from the front on success
       * @param DOMNode &$outNode Node receiving the output. Taken by reference
       *   because a failed Sequence swaps in a pristine clone to roll back.
       * @return bool Whether the rule matched
       * @throws MWException On too many evaluations or an unresolvable Reference
       */
      private function parseRec($rule, $replaceStr, $saveTags, &$iter, &$text, &$outNode) {
          $ruleType = $rule->nodeName;
          wfDebugLog("ParseEngine", "Entering " . $ruleType . ", " . $rule->getAttribute("name") . "\n");
          $iter++;
          if ($iter > self::maxIter) {
              throw new MWException("Parser iterated too many times. Probable loop in grammar.");
          }
          if ($ruleType == "Assignment" || $ruleType == "Reference" || $ruleType == "Text") {
              // Resolve "~r" in the inherited tags, then add this rule's own
              // (computed by pushTags when the grammar was loaded).
              $saveTags = str_replace("~r", preg_quote($replaceStr, "/"), $saveTags);
              $saveTags = self::joinTags($saveTags, $rule->getAttribute("saveTags"));
          }
          $dom = $outNode->ownerDocument;
          $retCode = false;

          if ($ruleType == "Assignment") {
              $tag = $rule->getAttribute("tag");
              // An Assignment without a tag matches unconditionally.
              $foundTag = $tag === "";
              if (!$foundTag) {
                  if ($rule->getAttribute("regex") !== "") {
                      $tag = str_replace("~r", preg_quote($replaceStr, "/"), $tag);
                      // Group the pattern so "^" anchors every alternative, not
                      // just the first; (?:...) keeps capture numbering intact.
                      $foundTag = preg_match("/^(?:$tag)/s", $text, $matches) === 1;
                      if ($foundTag) {
                          $tag = $matches[0];
                          if (isset($matches[1])) {
                              $replaceStr = $matches[1];
                          }
                      }
                  } else {
                      $tag = str_replace("~r", $replaceStr, $tag);
                      $foundTag = strncmp($tag, $text, strlen($tag)) == 0;
                  }
              }
              if ($foundTag) {
                  // Build the element on the side; it is attached to $outNode
                  // only if the child rule (when present) matches as well.
                  $newText = $text;
                  $newElement = $dom->createElement($rule->getAttribute("tagName"));
                  if ($tag !== "") {
                      // substr() returns false at end-of-string before PHP 8.
                      $newText = (string)substr($newText, strlen($tag));
                      $newElement->setAttribute("tag", $tag);
                  }
                  $retCode = $rule->firstChild === null
                      || (bool)$this->parseRec($rule->firstChild, $replaceStr, $saveTags, $iter, $newText, $newElement);
                  if ($retCode) {
                      $outNode->appendChild($newElement);
                      $text = $newText;
                  }
              }
          } elseif ($ruleType == "Sequence") {
              // Snapshot text and output so that a failing member undoes
              // everything the earlier members produced.
              $saveText = $text;
              $saveNode = $outNode->cloneNode(true);
              $pushInd = (int)$rule->getAttribute("pushInd");
              foreach ($rule->childNodes as $i => $crrnt) {
                  // Only members from index pushInd onwards inherit the saved tags.
                  $inherited = $i >= $pushInd ? $saveTags : "";
                  $retCode = (bool)$this->parseRec($crrnt, $replaceStr, $inherited, $iter, $text, $outNode);
                  if (!$retCode) {
                      $text = $saveText;
                      $outNode = $saveNode;
                      break;
                  }
              }
          } elseif ($ruleType == "Choice") {
              foreach ($rule->childNodes as $crrnt) {
                  if ($this->parseRec($crrnt, $replaceStr, $saveTags, $iter, $text, $outNode)) {
                      $retCode = true;
                      break;
                  }
              }
              // A failSafe Choice succeeds even when no alternative matched.
              if (!$retCode) {
                  $retCode = $rule->getAttribute("failSafe") !== "";
              }
          } elseif ($ruleType == "Reference") {
              $newVar = $rule->hasAttribute("var")
                  ? str_replace("~r", $replaceStr, $rule->getAttribute("var"))
                  : $replaceStr;
              $refName = $rule->getAttribute("name");
              $refRule = $this->findRule($rule->ownerDocument, $refName);
              if ($refRule === null) {
                  throw new MWException("Grammar references unknown rule '$refName'.");
              }
              $retCode = (bool)$this->parseRec($refRule, $newVar, $saveTags, $iter, $text, $outNode);
          } elseif ($ruleType == "Text") {
              // Tags that interrupt plain text: those the child rule can start
              // with, plus those that terminate this Text rule.
              $tagSearch = self::joinTags($rule->getAttribute("childTags"), $saveTags);
              while ($text != "" && ($saveTags == "" || !preg_match("/^($saveTags)/s", $text))) {
                  $childMatched = $rule->firstChild !== null
                      && $this->parseRec($rule->firstChild, $replaceStr, "", $iter, $text, $outNode);
                  // If the child did not match here, skip one byte so that the
                  // same position is not tried again.
                  $offset = $childMatched ? 0 : 1;
                  if (preg_match("/$tagSearch/s", $text, $matches, PREG_OFFSET_CAPTURE, $offset)) {
                      $pos = $matches[0][1];
                      if ($pos > 0) {
                          $outNode->appendChild($dom->createTextNode(substr($text, 0, $pos)));
                          $text = (string)substr($text, $pos);
                      }
                  } else {
                      // No further tag: the remainder is plain text.
                      $outNode->appendChild($dom->createTextNode($text));
                      $text = "";
                  }
              }
              $retCode = true;
          }

          wfDebugLog("ParseEngine", "Exiting {$rule->nodeName}, Return Code - $retCode\n");
          wfDebugLog("ParseEngine", "Text - $text\n");
          return $retCode;
      }

      /**
       * Pre-processes a grammar subtree, storing on its rules the attributes
       * parseRec() reads: "saveTags" on every rule other than Sequence and
       * Choice, "childTags" on Text rules, and "pushInd" on Sequences.
       *
       * @param DOMElement $rule Root of the subtree to annotate
       * @param string|null $tagStr Regex alternation of tags inherited from
       *   the rules that follow $rule in enclosing Sequences
       * @throws MWException When collecting tags runs into a loop in the grammar
       */
      private function pushTags($rule, $tagStr) {
          // setAttribute() must not be given null (deprecated since PHP 8.1).
          $tagStr = (string)$tagStr;
          if ($rule->nodeName == "Sequence") {
              $pushInd = $rule->childNodes->length - 1;
              $shouldPush = true;
              // Walk backwards so each member sees the tags of what follows it.
              for ($child = $rule->lastChild; $child !== null; $child = $child->previousSibling) {
                  $this->pushTags($child, $tagStr);
                  if ($child->previousSibling !== null) {
                      // Each collection is an independent traversal, so the
                      // loop guard starts from zero every time.
                      $iter = 0;
                      if ($this->pullTags($child, $iter, $childTag)) {
                          // pullTags() reported this member as fail-safe, so
                          // the tags after it stay visible to its predecessor.
                          if ($shouldPush) {
                              $pushInd--;
                          }
                          $tagStr = self::joinTags($tagStr, $childTag);
                      } else {
                          $shouldPush = false;
                          $tagStr = $childTag;
                      }
                  }
              }
              $rule->setAttribute("pushInd", (string)$pushInd);
          } else {
              if ($rule->nodeName != "Choice") {
                  $rule->setAttribute("saveTags", $tagStr);
                  // The tags stop here; children start with a clean slate.
                  $tagStr = "";
                  if ($rule->nodeName == "Text") {
                      $childTags = "";
                      foreach ($rule->childNodes as $crrnt) {
                          $iter = 0;
                          $this->pullTags($crrnt, $iter, $childTag);
                          $childTags = self::joinTags($childTags, $childTag);
                      }
                      $rule->setAttribute("childTags", $childTags);
                  }
              }
              foreach ($rule->childNodes as $crrnt) {
                  $this->pushTags($crrnt, $tagStr);
              }
          }
      }

      /**
       * Collects the tags a rule can begin with, as a regex alternation.
       *
       * @param DOMElement|null $rule Rule to inspect; null (an unresolved
       *   Reference) contributes no tags
       * @param int &$iter Running count of rules visited, for loop detection
       * @param string &$childTags Receives the collected alternation
       * @return bool The rule's fail-safe flag: false for an Assignment, true
       *   for a Text rule or a Choice carrying "failSafe", otherwise derived
       *   from the children that were visited
       * @throws MWException When more than maxIter rules are visited
       */
      private function pullTags($rule, &$iter, &$childTags) {
          $iter++;
          if ($iter > self::maxIter) {
              throw new MWException("Collecter iterated too many times. Probable loop in grammar.");
          }
          $childTags = "";
          if ($rule === null) {
              return true;
          }
          $failSafe = true;
          $ruleType = $rule->nodeName;
          if ($ruleType == "Assignment") {
              $childTags = $rule->getAttribute("tag");
              if ($rule->getAttribute("regex") === "") {
                  // Literal tags are escaped so they can sit in a regex alternation.
                  $childTags = preg_quote($childTags, "/");
              }
              $failSafe = false;
          } elseif ($ruleType == "Choice" || $ruleType == "Sequence") {
              $isChoice = $ruleType == "Choice";
              $failSafe = !$isChoice;
              foreach ($rule->childNodes as $child) {
                  $failSafe = (bool)$this->pullTags($child, $iter, $newTags);
                  $childTags = self::joinTags($childTags, $newTags);
                  // A Choice stops at its first fail-safe alternative, a
                  // Sequence at its first member that is not fail-safe.
                  if ($failSafe === $isChoice) {
                      break;
                  }
              }
              if ($isChoice && $rule->getAttribute("failSafe") !== "") {
                  $failSafe = true;
              }
          } elseif ($ruleType == "Reference") {
              $refRule = $this->findRule($rule->ownerDocument, $rule->getAttribute("name"));
              $failSafe = (bool)$this->pullTags($refRule, $iter, $childTags);
          }
          return $failSafe;
      }
  }