PageRenderTime 26ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/lib/sergiosgc/Text/Parser/LALR.php

https://github.com/sergiosgc/Text_Parser
PHP | 219 lines | 139 code | 10 blank | 70 comment | 33 complexity | d668d97418cdd65181d13ecac8beca9b MD5 | raw file
  1. <?php
  2. /* vim: set expandtab tabstop=4 shiftwidth=4 foldmethod=marker: */
  3. namespace sergiosgc;
  4. /**
  5. * Text_Parser is the base class for parsers. In order to be useful, it must
  6. * be extended to include proper reduction functions for the grammar, as well as
  7. * action and goto tables.
  8. *
  9. * The best bet for creating a Text_Parser subclass is to use a compiler compiler
  10. * that interprets a grammar description and produces the parser class.
  11. */
  12. abstract class Text_Parser_LALR
  13. {
  14. /** The tokenizer provides a stream of Tokens (Text_Tokenizer_Token instances)*/
  15. protected $_tokenizer;
  16. /* _debugLevel {{{ */
  17. /** Debug verbosity */
  18. protected $_debugLevel = 0;
  19. public function setDebugLevel($val)
  20. {
  21. $this->_debugLevel = $val;
  22. }
  23. /* }}} */
  24. /* _stateStack field {{{ */
  25. protected $_stateStack = array();
  26. protected function pushState(&$nextState, &$token)
  27. {
  28. $this->_stateStack[] = $nextState;
  29. $this->_stateStack[] = $token;
  30. }
  31. protected function getCurrentState()
  32. {
  33. if (count($this->_stateStack) == 0) return 0;
  34. if (count($this->_stateStack) % 2 != 0) throw new Text_Parser_InvalidStateStackException('State stack is invalid (uneven count)');
  35. return $this->_stateStack[count($this->_stateStack) - 2];
  36. }
  37. protected function getTopToken()
  38. {
  39. if (count($this->_stateStack) == 0) return null;
  40. return $this->_stateStack[count($this->_stateStack) - 1];
  41. }
  42. protected function popTokens(&$tokenArray, $count = 1)
  43. {
  44. if (count($this->_stateStack) % 2 != 0) throw new Text_Parser_InvalidStateStackException('State stack is invalid (uneven count)');
  45. if (count($this->_stateStack) < (2 * $count))
  46. throw new Text_Parser_EmptyStackException(sprintf('Unable to pop %d tokens from a stack with %d tokens', $count, count($this->_stateStack) / 2));
  47. $currentStateIndex = count($this->_stateStack) - 2 * $count - 2;
  48. $currentState = $currentStateIndex >= 0 ? $this->_stateStack[$currentStateIndex] : 0;
  49. $currentStateIndex += 2;
  50. for ($i=0; $i < $count; $i++)
  51. {
  52. $tokenArray[] = $this->_stateStack[$currentStateIndex + 1];
  53. $currentStateIndex += 2;
  54. }
  55. $this->_stateStack = array_slice($this->_stateStack, 0, count($this->_stateStack) - 2 * $count);
  56. return $currentState;
  57. }
  58. protected function stateStackAsString()
  59. {
  60. $result = '[';
  61. $separator = '';
  62. for ($i=0; $i<count($this->_stateStack); $i+=2) {
  63. $result .= $separator . $this->_stateStack[$i];
  64. $separator = ' ';
  65. }
  66. $result .= ']';
  67. return $result;
  68. }
  69. /* }}} */
  70. /* _actionTable field {{{ */
  71. /**
  72. * The action table, indexed by the current state and terminal token id.
  73. *
  74. * The action table is a matrix. The first coordinate is the current parser state, an integer. The second coordinate is the current terminal, a token id.
  75. * The end of input is represented as an empty string terminal id.
  76. * Each element is an associative array:
  77. * - For an accept action, it contains an element 'action' containing the string 'accept'
  78. * - For a shift action, it contains an element 'action' containing the string 'shift' and an element 'nextState' containing the next state (an int)
  79. * - For a reduce action, it contains:
  80. * - An element 'action' containing the string 'reduce'
  81. * - An element 'function' containing the function name that will execute the reduction
  82. * - An element 'symbols' which is a numerically indexed array containing the names assigned to the symbols in the grammar rule being
  83. * reduced (non-assigned symbols should contain an empty string
  84. * - An element 'rule' containing the human-readable representation of the grammar rule (for debugging purposes)
  85. * - For a lookahead action, it contains
  86. * - An element 'action' containing the string 'lookahead'
  87. * - An element 'actionTable' containing one action table row, indexed by lookahead token IDs.
  88. * - An element 'wildcardActionTable' containing an action table row to be used if no match can be found in 'actionTable' above.
  89. * If 'wildcardActionTable' is null and no match is found, a Text_Parser_UnexpectedTokenException is thrown
  90. *
  91. * Check Text_Parser_LR_Test for an example actionTable.
  92. */
  93. protected $_actionTable = null;
  94. protected function getAction($state, $nextToken)
  95. {
  96. if (!is_array($this->_actionTable)) throw new Text_Parser_InvalidParserException('This parser has not been configured. It has no action table');
  97. if ($nextToken === false) {
  98. $nextToken = new Text_Tokenizer_Token('','');
  99. }
  100. if (!array_key_exists($state, $this->_actionTable)) throw new Text_Parser_UnexpectedTokenException($nextToken, $state);
  101. $result = $this->getActionFromRow($state, $nextToken, $this->_actionTable[$state]);
  102. if (is_null($result)) throw new Text_Parser_UnexpectedTokenException($nextToken, $state);
  103. return $this->getActionFromRow($state,$nextToken, $this->_actionTable[$state]);
  104. }
  105. protected function getActionFromRow($state, $nextToken, $row, $lookAhead = 1 )
  106. {
  107. if ($nextToken === '') $nextToken = new Text_Tokenizer_Token('', '');
  108. if (!array_key_exists($nextToken->getId(), $row)) throw new Text_Parser_UnexpectedTokenException($nextToken, $state);
  109. if ($row[$nextToken->getId()]['action'] == 'lookahead') {
  110. $lookAheadToken = $this->_tokenizer->getLookAhead($lookAhead);
  111. $result = $this->getActionFromRow($state, $lookAheadToken, $row[$nextToken->getId()]['actionTable'], $lookAhead + 1);
  112. if (!is_null($result)) return $result;
  113. if (is_null($row[$nextToken->getId()]['wildcardActionTable'])) return null;
  114. return getActionFromRow($state, $lookAheadToken, $row, $lookAhead + 1);
  115. }
  116. return $row[$nextToken->getId()];
  117. }
  118. /* }}} */
  119. /* _gotoTable field {{{ */
  120. /**
  121. * The goto table, indexed by current parser state and non-terminal
  122. *
  123. * The goto table is a matrix, whose first index is the parser state, an integer, and the second index is the non-terminal token id.
  124. * Each entry contains the next parser state.
  125. *
  126. * Check Text_Parser_LR_Test for an example gotoTable.
  127. */
  128. protected $_gotoTable = null;
  129. protected function getNextState($state, $nextToken)
  130. {
  131. if (!is_array($this->_gotoTable)) throw new Text_Parser_InvalidParserException('This parser has not been configured. It has no goto table');
  132. if (!array_key_exists($state, $this->_gotoTable) || !array_key_exists($nextToken->getId(), $this->_gotoTable[$state])) throw new Text_Parser_UnexpectedTokenException($nextToken, $state);
  133. return $this->_gotoTable[$state][$nextToken->getId()];
  134. }
  135. /* }}} */
  136. /* debugPrintf {{{ */
  137. /**
  138. * debugPrintf acts like printf, except output is conditioned by the debug level and message severity
  139. *
  140. * @param int messageLevel Severity level for the message
  141. * @param string message Message to print. Accepts printf syntax
  142. * @param ... Values to replace in message placeholders
  143. */
  144. protected function debugPrintf($messageLevel, $message)
  145. {
  146. if ($this->_debugLevel >= $messageLevel) {
  147. $sprintfArgs = func_get_args();
  148. unset($sprintfArgs[0]);
  149. $sprintfArgs = array_values($sprintfArgs);
  150. call_user_func_array('printf', $sprintfArgs);
  151. }
  152. }
  153. /* }}} */
  154. /* parse {{{ */
  155. /**
  156. * Parse the input
  157. */
  158. public function parse()
  159. {
  160. $nextToken = $this->_tokenizer->getNextToken();
  161. $this->debugPrintf(1, "Read token %s(%s) state %s\n", $nextToken->getId(), $nextToken->getValue(), $this->stateStackAsString());
  162. do {
  163. $action = $this->getAction($this->getCurrentState(), $nextToken);
  164. switch ($action['action'])
  165. {
  166. case 'accept':
  167. $this->debugPrintf(1, "-Accepting\n");
  168. return $this->getTopToken();
  169. break;
  170. case 'shift':
  171. $this->debugPrintf(1, "-Shifting to state %d\n", $action['nextState']);
  172. $this->pushState($action['nextState'], $nextToken);
  173. $nextToken = $this->_tokenizer->getNextToken();
  174. $this->debugPrintf(1, "\nRead token %s(%s) state %s\n", $nextToken ? $nextToken->getId() : '$', $nextToken ? $nextToken->getValue() : '$', $this->stateStackAsString());
  175. break;
  176. case 'reduce':
  177. $this->debugPrintf(1, "-Reducing using %s state %s ", $action['function'], $this->stateStackAsString());
  178. $values = array();
  179. // Retrieve tokens to be reduced, and the state the parser was at on the last retrieved token
  180. $nextState = $this->popTokens($values, count($action['symbols']));
  181. // Marshall arguments for the reduction function
  182. $symbols = array();
  183. foreach ($action['symbols'] as $idx => $symbol) {
  184. if ($symbol != '') $symbols[$symbol] =& $values[$idx];
  185. }
  186. // Call the reduction function, retrieving the non-terminal that results from the reduction
  187. $token = call_user_func_array(array($this, $action['function']), $symbols);
  188. if (!is_object($token) || !method_exists($token, 'getId') || !method_exists($token, 'getValue')) {
  189. $token = new Text_Tokenizer_Token($action['leftNonTerminal'], $token);
  190. }
  191. // Using the reduction non-terminal and the parser state on the last retrieved token, lookup the goto table
  192. $nextState = $this->getNextState($nextState, $token);
  193. // Move to the goto state
  194. $this->pushState($nextState, $token);
  195. $this->debugPrintf(1, "Result state %s\n", $this->stateStackAsString());
  196. break;
  197. }
  198. } while (true); // Exit happens through a return on the accept action
  199. }
  200. /* }}} */
  201. /* Constructor {{{ */
  202. public function __construct(&$tokenizer)
  203. {
  204. if (!$tokenizer instanceof Text_Tokenizer_Lookahead) $tokenizer = new Text_Tokenizer_Lookahead_Adapter($tokenizer);
  205. $this->_tokenizer = $tokenizer;
  206. }
  207. /* }}} */
  208. }
  209. ?>