PageRenderTime 45ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/lexer.php

https://github.com/nadavkav/MoodleTAO
PHP | 393 lines | 185 code | 24 blank | 184 comment | 33 complexity | c4e11fc35e8ea9f513abb871c336b892 MD5 | raw file
  1. <?php // $Id$
  2. /* PHP lexer code snarfed from the CVS tree for the lamplib project at
  3. * http://sourceforge.net/projects/lamplib
  4. * This project is administered by Marcus Baker, Harry Fuecks and Matt
  5. * Mitchell, and the project code is in the public domain.
  6. *
  7. * Thanks, guys!
  8. */
  9. define("LEXER_ENTER", 1);
  10. define("LEXER_MATCHED", 2);
  11. define("LEXER_UNMATCHED", 3);
  12. define("LEXER_EXIT", 4);
  13. define("LEXER_SPECIAL", 5);
  14. /**
  15. * Compounded regular expression. Any of
  16. * the contained patterns could match and
  17. * when one does it's label is returned.
  18. */
  19. class ParallelRegex {
  20. var $_patterns;
  21. var $_labels;
  22. var $_regex;
  23. var $_case;
  24. /**
  25. * Constructor. Starts with no patterns.
  26. * @param $case True for case sensitive, false
  27. * for insensitive.
  28. * @public
  29. */
  30. function ParallelRegex($case) {
  31. $this->_case = $case;
  32. $this->_patterns = array();
  33. $this->_labels = array();
  34. $this->_regex = null;
  35. }
  36. /**
  37. * Adds a pattern with an optional label.
  38. * @param $pattern Perl style regex, but ( and )
  39. * lose the usual meaning.
  40. * @param $label Label of regex to be returned
  41. * on a match.
  42. * @public
  43. */
  44. function addPattern($pattern, $label = true) {
  45. $count = count($this->_patterns);
  46. $this->_patterns[$count] = $pattern;
  47. $this->_labels[$count] = $label;
  48. $this->_regex = null;
  49. }
  50. /**
  51. * Attempts to match all patterns at once against
  52. * a string.
  53. * @param $subject String to match against.
  54. * @param $match First matched portion of
  55. * subject.
  56. * @return True on success.
  57. * @public
  58. */
  59. function match($subject, &$match) {
  60. if (count($this->_patterns) == 0) {
  61. return false;
  62. }
  63. if (!preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
  64. $match = "";
  65. return false;
  66. }
  67. $match = $matches[0];
  68. for ($i = 1; $i < count($matches); $i++) {
  69. if ($matches[$i]) {
  70. return $this->_labels[$i - 1];
  71. }
  72. }
  73. return true;
  74. }
  75. /**
  76. * Compounds the patterns into a single
  77. * regular expression separated with the
  78. * "or" operator. Caches the regex.
  79. * Will automatically escape (, ) and / tokens.
  80. * @param $patterns List of patterns in order.
  81. * @private
  82. */
  83. function _getCompoundedRegex() {
  84. if ($this->_regex == null) {
  85. for ($i = 0; $i < count($this->_patterns); $i++) {
  86. $this->_patterns[$i] = '(' . str_replace(
  87. array('/', '(', ')'),
  88. array('\/', '\(', '\)'),
  89. $this->_patterns[$i]) . ')';
  90. }
  91. $this->_regex = "/" . implode("|", $this->_patterns) . "/" . $this->_getPerlMatchingFlags();
  92. }
  93. return $this->_regex;
  94. }
  95. /**
  96. * Accessor for perl regex mode flags to use.
  97. * @return Flags as string.
  98. * @private
  99. */
  100. function _getPerlMatchingFlags() {
  101. return ($this->_case ? "msS" : "msSi");
  102. }
  103. }
  104. /**
  105. * States for a stack machine.
  106. */
  107. class StateStack {
  108. var $_stack;
  109. /**
  110. * Constructor. Starts in named state.
  111. * @param $start Starting state name.
  112. * @public
  113. */
  114. function StateStack($start) {
  115. $this->_stack = array($start);
  116. }
  117. /**
  118. * Accessor for current state.
  119. * @return State as string.
  120. * @public
  121. */
  122. function getCurrent() {
  123. return $this->_stack[count($this->_stack) - 1];
  124. }
  125. /**
  126. * Adds a state to the stack and sets it
  127. * to be the current state.
  128. * @param $state New state.
  129. * @public
  130. */
  131. function enter($state) {
  132. array_push($this->_stack, $state);
  133. }
  134. /**
  135. * Leaves the current state and reverts
  136. * to the previous one.
  137. * @return False if we drop off
  138. * the bottom of the list.
  139. * @public
  140. */
  141. function leave() {
  142. if (count($this->_stack) == 1) {
  143. return false;
  144. }
  145. array_pop($this->_stack);
  146. return true;
  147. }
  148. }
  149. /**
  150. * Accepts text and breaks it into tokens.
  151. * Some optimisation to make the sure the
  152. * content is only scanned by the PHP regex
  153. * parser once. Lexer modes must not start
  154. * with leading underscores.
  155. */
  156. class Lexer {
  157. var $_regexes;
  158. var $_parser;
  159. var $_mode;
  160. var $_mode_handlers;
  161. var $_case;
  162. /**
  163. * Sets up the lexer in case insensitive matching
  164. * by default.
  165. * @param $parser Handling strategy by
  166. * reference.
  167. * @param $start Starting handler.
  168. * @param $case True for case sensitive.
  169. * @public
  170. */
  171. function Lexer(&$parser, $start = "accept", $case = false) {
  172. $this->_case = $case;
  173. $this->_regexes = array();
  174. $this->_parser = &$parser;
  175. $this->_mode = new StateStack($start);
  176. $this->_mode_handlers = array();
  177. }
  178. /**
  179. * Adds a token search pattern for a particular
  180. * parsing mode. The pattern does not change the
  181. * current mode.
  182. * @param $pattern Perl style regex, but ( and )
  183. * lose the usual meaning.
  184. * @param $mode Should only apply this
  185. * pattern when dealing with
  186. * this type of input.
  187. * @public
  188. */
  189. function addPattern($pattern, $mode = "accept") {
  190. if (!isset($this->_regexes[$mode])) {
  191. $this->_regexes[$mode] = new ParallelRegex($this->_case);
  192. }
  193. $this->_regexes[$mode]->addPattern($pattern);
  194. }
  195. /**
  196. * Adds a pattern that will enter a new parsing
  197. * mode. Useful for entering parenthesis, strings,
  198. * tags, etc.
  199. * @param $pattern Perl style regex, but ( and )
  200. * lose the usual meaning.
  201. * @param $mode Should only apply this
  202. * pattern when dealing with
  203. * this type of input.
  204. * @param $new_mode Change parsing to this new
  205. * nested mode.
  206. * @public
  207. */
  208. function addEntryPattern($pattern, $mode, $new_mode) {
  209. if (!isset($this->_regexes[$mode])) {
  210. $this->_regexes[$mode] = new ParallelRegex($this->_case);
  211. }
  212. $this->_regexes[$mode]->addPattern($pattern, $new_mode);
  213. }
  214. /**
  215. * Adds a pattern that will exit the current mode
  216. * and re-enter the previous one.
  217. * @param $pattern Perl style regex, but ( and )
  218. * lose the usual meaning.
  219. * @param $mode Mode to leave.
  220. * @public
  221. */
  222. function addExitPattern($pattern, $mode) {
  223. if (!isset($this->_regexes[$mode])) {
  224. $this->_regexes[$mode] = new ParallelRegex($this->_case);
  225. }
  226. $this->_regexes[$mode]->addPattern($pattern, "__exit");
  227. }
  228. /**
  229. * Adds a pattern that has a special mode.
  230. * Acts as an entry and exit pattern in one go.
  231. * @param $pattern Perl style regex, but ( and )
  232. * lose the usual meaning.
  233. * @param $mode Should only apply this
  234. * pattern when dealing with
  235. * this type of input.
  236. * @param $special Use this mode for this one token.
  237. * @public
  238. */
  239. function addSpecialPattern($pattern, $mode, $special) {
  240. if (!isset($this->_regexes[$mode])) {
  241. $this->_regexes[$mode] = new ParallelRegex($this->_case);
  242. }
  243. $this->_regexes[$mode]->addPattern($pattern, "_$special");
  244. }
  245. /**
  246. * Adds a mapping from a mode to another handler.
  247. * @param $mode Mode to be remapped.
  248. * @param $handler New target handler.
  249. * @public
  250. */
  251. function mapHandler($mode, $handler) {
  252. $this->_mode_handlers[$mode] = $handler;
  253. }
  254. /**
  255. * Splits the page text into tokens. Will fail
  256. * if the handlers report an error or if no
  257. * content is consumed. If successful then each
  258. * unparsed and parsed token invokes a call to the
  259. * held listener.
  260. * @param $raw Raw HTML text.
  261. * @return True on success, else false.
  262. * @public
  263. */
  264. function parse($raw) {
  265. if (!isset($this->_parser)) {
  266. return false;
  267. }
  268. $length = strlen($raw);
  269. while (is_array($parsed = $this->_reduce($raw))) {
  270. list($unmatched, $matched, $mode) = $parsed;
  271. if (!$this->_dispatchTokens($unmatched, $matched, $mode)) {
  272. return false;
  273. }
  274. if (strlen($raw) == $length) {
  275. return false;
  276. }
  277. $length = strlen($raw);
  278. }
  279. if (!$parsed) {
  280. return false;
  281. }
  282. return $this->_invokeParser($raw, LEXER_UNMATCHED);
  283. }
  284. /**
  285. * Sends the matched token and any leading unmatched
  286. * text to the parser changing the lexer to a new
  287. * mode if one is listed.
  288. * @param $unmatched Unmatched leading portion.
  289. * @param $matched Actual token match.
  290. * @param $mode Mode after match. The "_exit"
  291. * mode causes a stack pop. An
  292. * false mode causes no change.
  293. * @return False if there was any error
  294. * from the parser.
  295. * @private
  296. */
  297. function _dispatchTokens($unmatched, $matched, $mode = false) {
  298. if (!$this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
  299. return false;
  300. }
  301. if ($mode === "__exit") {
  302. if (!$this->_invokeParser($matched, LEXER_EXIT)) {
  303. return false;
  304. }
  305. return $this->_mode->leave();
  306. }
  307. if (strncmp($mode, "_", 1) == 0) {
  308. $mode = substr($mode, 1);
  309. $this->_mode->enter($mode);
  310. if (!$this->_invokeParser($matched, LEXER_SPECIAL)) {
  311. return false;
  312. }
  313. return $this->_mode->leave();
  314. }
  315. if (is_string($mode)) {
  316. $this->_mode->enter($mode);
  317. return $this->_invokeParser($matched, LEXER_ENTER);
  318. }
  319. return $this->_invokeParser($matched, LEXER_MATCHED);
  320. }
  321. /**
  322. * Calls the parser method named after the current
  323. * mode. Empty content will be ignored.
  324. * @param $content Text parsed.
  325. * @param $is_match Token is recognised rather
  326. * than unparsed data.
  327. * @private
  328. */
  329. function _invokeParser($content, $is_match) {
  330. if (($content === "") || ($content === false)) {
  331. return true;
  332. }
  333. $handler = $this->_mode->getCurrent();
  334. if (isset($this->_mode_handlers[$handler])) {
  335. $handler = $this->_mode_handlers[$handler];
  336. }
  337. return $this->_parser->$handler($content, $is_match);
  338. }
  339. /**
  340. * Tries to match a chunk of text and if successful
  341. * removes the recognised chunk and any leading
  342. * unparsed data. Empty strings will not be matched.
  343. * @param $raw The subject to parse. This is the
  344. * content that will be eaten.
  345. * @return Three item list of unparsed
  346. * content followed by the
  347. * recognised token and finally the
  348. * action the parser is to take.
  349. * True if no match, false if there
  350. * is a parsing error.
  351. * @private
  352. */
  353. function _reduce(&$raw) {
  354. if (!isset($this->_regexes[$this->_mode->getCurrent()])) {
  355. return false;
  356. }
  357. if ($raw === "") {
  358. return true;
  359. }
  360. if ($action = $this->_regexes[$this->_mode->getCurrent()]->match($raw, $match)) {
  361. $count = strpos($raw, $match);
  362. $unparsed = substr($raw, 0, $count);
  363. $raw = substr($raw, $count + strlen($match));
  364. return array($unparsed, $match, $action);
  365. }
  366. return true;
  367. }
  368. }
  369. ?>