PageRenderTime 34ms CodeModel.GetById 9ms RepoModel.GetById 0ms app.codeStats 0ms

/workbench/pcre/preg_match.js

https://gitlab.com/orvi2014/phpjs
JavaScript | 208 lines | 84 code | 20 blank | 104 comment | 16 complexity | 15e8a7f72912b0d0a482c94c0dabe42b MD5 | raw file
  function preg_match(pattern, subject, matches, flags, offset) {
    // http://kevin.vanzonneveld.net
    // +   original by: Francis Lewis
    // +   improved by: Brett Zamir (http://brett-zamir.me)
    // *     example 1: matches = [];
    // *     example 1: preg_match(/(\w+)\W([\W\w]+)/, 'this is some text', matches);
    // *     matches 1: matches[1] == 'this'
    // *     returns 1: 1
    //
    // UNFINISHED: partial port of PHP's preg_match() on top of the native RegExp engine.
    //
    // Parameters:
    //   pattern - Either a RegExp object/literal, or a string. A string is read as a
    //             PHP-style delimited pattern ('/foo/i'): the first character is the
    //             delimiter and everything after its last occurrence is the modifier
    //             list. If the first character does not occur again, the whole string
    //             is used as a raw, unflagged pattern (convenience).
    //   subject - The text to search (converted to a string).
    //   matches - Optional array/object that receives the result: index 0 is the full
    //             match, 1..n are the capture groups; named groups are additionally
    //             stored under their name.
    //   flags   - 'PREG_OFFSET_CAPTURE' or 256 is recognised but NOT implemented: the
    //             function then returns 1 on a match without filling `matches`.
    //   offset  - Optional position (in UTF-16 code units, not bytes as in PHP) at which
    //             the search starts. A negative value counts from the end of subject.
    //             As in PHP this is not the same as searching a substring: '^' and
    //             look-behinds still see the text before the offset.
    //
    // Returns 1 if the pattern matched, 0 otherwise.
    //
    // Throws (as plain strings, kept for backward compatibility with existing callers):
    //   - when a modifier is unsupported (s, x, A, D, U, J, u, X, S, y) or unknown.
    // A malformed pattern makes the RegExp constructor throw a SyntaxError.
    //
    // TODO notes carried over from the original authors:
    // - Take a very serious look at Steve Levithan's XRegExp, which implements Unicode classes
    //   and two extra flags: http://blog.stevenlevithan.com/archives/xregexp-javascript-regex-constructor
    // - A script to build the Unicode expressions automatically from an SQLite database was
    //   in progress; it may still be useful as a cross-check of XRegExp's tables.
    // - Look at/integrate with Michael Grier's http://mgrier.com/te5t/preg_match_all.js ;
    //   http://mgrier.com/te5t/testpma.html ; http://mgrier.com/te5t/testpma.php
    // - Study http://php.net/manual/en/regexp.reference.php more thoroughly, e.g. internal
    //   options i, m, s, x, U, X, J; conditional subpatterns, comments, recursive subpatterns.
    // - Make flags a number and allow bitwise AND checks against it; see pathinfo().
    //
    // Research notes for a future 'X' (PCRE_EXTRA) / 'u' implementation:
    // In X mode a backslash followed by a character outside the set
    //   \ ^ $ . [ ] | ( ) ? * + { } a e f n r t d D h H s S v V w W b B A Z z G C c x k g p P X and digits
    // should raise 'You are in "X" (PCRE_EXTRA) mode, using a reserved and presently unused escape sequence'.
    // (1-3 digits may follow a backslash as one unit.)
    //   \C = single byte (useful in 'u'/UTF8 mode)
    //   CcxpPXkg are all special uses;
    //   c. (any character after 'c' for control character)
    //   x[a-fA-F\d][a-fA-F\d] (hex)
    //   "Back references to the named subpatterns can be achieved by (?P=name) or, since PHP 5.2.4, also by \k<name>, \k'name', \k{name} or \g{name}"
    //   Unicode classes (with u flag only): p{} | P{} (case insensitive does not affect)
    //   [CLMNPSZ]
    //   C|Cc|Cf|Cn|Co|Cs|L|Ll|Lm|Lo|Lt|Lu|M|Mc|Me|Mn|N|Nd|Nl|No|P|Pc|Pd|Pe|Pf|Pi|Po|Ps|S|Sc|Sk|Sm|So|Z|Zl|Zp|Zs
    //   Other, Control
    //   Cc = '[\u0000-\u001f\u007f-\u009f]';
    //   Other, Format
    //   Cf = '(?:[\u00ad\u0600-\u0603\u06dd\u070f\u17b4-\u17b5\u200b-\u200f\u202a-\u202e\u2060-\u2064\u206a-\u206f\ufeff\ufff9-\ufffb]|[\ud834][\udd73-\udd7a]|[\udb40][\udc01\udc20-\udc58]'); /* latter surrogates represent 1d173-1d17a, e0001, e0020-e0058 */
    //   Other, Unassigned
    //   Cn = TO-DO;
    //   Other, Private use
    //   Co = '(?:[\ue000-\uf8ff]|[\udb80-\udbbe][\udc00-\udfff]|[\udbff][\udc00-\udffd]|[\udbc0-\udbfe][\udc00-\udfff]|[\udbff][\udc00-\udffd])'; // f0000-ffffd, 100000-10fffd
    //   Other, Surrogate
    //   Cs = '[\ud800-\udb7f\udb80-\udbff\udc00-\udfff]';
    //   Still to be filled in (besides Cn above):
    //     Ll (Letter, Lower case), Lm (Letter, Modifier), Lo (Letter, Other), Lt (Letter, Title case),
    //     Lu (Letter, Upper case), Mc (Mark, Spacing), Me (Mark, Enclosing), Mn (Mark, Non-spacing),
    //     Nd (Number, Decimal), Nl (Number, letter), No (Number, Other), Pc (Punctuation, Connector),
    //     Pd (Punctuation, Dash), Pe (Punctuation, Close), Pf (Punctuation, Final), Pi (Punctuation, Initial),
    //     Po (Punctuation, Other), Ps (Punctuation, Open), Sc (Symbol, Currency), Sk (Symbol, Modifier)
    //   Symbol, Mathematical
    //   Sm ='\u002b\u003c-\u003e\u007c\u007e\u00ac\u00b1\u00d7\u00f7\u03f6\u0606-\u0608\u2044\u2052\u207a-\u207c\u208a-\u208c\u2140-\u2144\u214b\u2190-\u2194\u219a\u219b\u21a0\u21a3\u21a6\u21ae\u21ce\u21cf\u21d2\u21d4\u21f4-\u22ff\u2308-\u230b\u2320\u2321\u237c\u239b-\u23b3\u23dc-\u23e1\u25b7\u25c1\u25f8-\u25ff\u266f\u27c0-\u27c4\u27c7-\u27ca\u27cc\u27d0-\u27e5\u27f0-\u27ff\u2900-\u2982\u2999-\u29d7\u29dc-\u29fb\u29fe-\u2aff\u2b30-\u2b44\u2b47-\u2b4c\ufb29\ufe62\ufe64-\ufe66\uff0b\uff1c-\uff1e\uff5c\uff5e\uffe2\uffe9-\uffec
    //   (plus) 1d6c1 1d6db 1d6fb 1d715 1d735 1d74f 1d76f 1d789 1d7a9 1d7c3
    //   Symbol, Other
    //   latter alternates are surrogate pairs comprising 10102, 10137-1013f, 10179-10189, 10190-1019b, 101d0-101fc, 1d000-1d0f5, 1d100-1d126, 1d129-1d164, 1d16a-1d16c, 1d183-1d184, 1d18c-1d1a9, 1d1ae-1d1dd, 1d200-1d241, 1d245, 1d300-1d356, 1f000-1f02b, 1f030-1f093
    //   So = '(?:[\u00a6\u00a7\u00a9\u00ae\u00b0\u00b6\u0482\u060e\u060f\u06e9\u06fd\u06fe\u07f6\u09fa\u0b70\u0bf3-\u0bf8\u0bfa\u0c7f\u0cf1\u0cf2\u0d79\u0f01-\u0f03\u0f13-\u0f17\u0f1a-\u0f1f\u0f34\u0f36\u0f38\u0fbe-\u0fc5\u0fc7-\u0fcc\u0fce\u0fcf\u109e\u109f\u1360\u1390-\u1399\u1940\u19e0-\u19ff\u1b61-\u1b6a\u1b74-\u1b7c\u2100\u2101\u2103-\u2106\u2108\u2109\u2114\u2116-\u2118\u211e-\u2123\u2125\u2127\u2129\u212e\u213a\u213b\u214a\u214c\u214d\u214f\u2195-\u2199\u219c-\u219f\u21a1\u21a2\u21a4\u21a5\u21a7-\u21ad\u21af-\u21cd\u21d0\u21d1\u21d3\u21d5-\u21f3\u2300-\u2307\u230c-\u231f\u2322-\u2328\u232b-\u237b\u237d-\u239a\u23b4-\u23db\u23e2-\u23e7\u2400-\u2426\u2440-\u244a\u249c-\u24e9\u2500-\u25b6\u25b8-\u25c0\u25c2-\u25f7\u2600-\u266e\u2670-\u269d\u26a0-\u26bc\u26c0-\u26c3\u2701-\u2704\u2706-\u2709\u270c-\u2727\u2729-\u274b\u274d\u274f-\u2752\u2756\u2758-\u275e\u2761-\u2767\u2794\u2798-\u27af\u27b1-\u27be\u2800-\u28ff\u2b00-\u2b2f\u2b45\u2b46\u2b50-\u2b54\u2ce5-\u2cea\u2e80-\u2e99\u2e9b-\u2ef3\u2f00-\u2fd5\u2ff0-\u2ffb\u3004\u3012\u3013\u3020\u3036\u3037\u303e\u303f\u3190\u3191\u3196-\u319f\u31c0-\u31e3\u3200-\u321e\u322a-\u3243\u3250\u3260-\u327f\u328a-\u32b0\u32c0-\u32fe\u3300-\u33ff\u4dc0-\u4dff\ua490-\ua4c6\ua828-\ua82b\ufdfd\uffe4\uffe8\uffed\uffee\ufffc\ufffd]|(?:\ud800[\udd02\udd37-\udd3f\udd79-\udd89\udd90-\udd9b\uddd0-\uddfc])|(?:\ud834[\udc00-\udcf5\udd00-\udd26\udd29-\udd64\udd6a-\udd6c\udd83-\udd84\udd8c-\udda9\uddae-\udddd\ude00-\ude41\ude45\udf00-\udf56])|(?:\ud83c[\udc00-\udc2b\udc30-\udc93]))';
    //   Separator, Line
    //   Zl = '[\u2028]';
    //   Separator, Paragraph
    //   Zp = '[\u2029]';
    //   Separator, Space
    //   Zs = '[\u0020\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]';
    //   Form broader groups (each a non-capturing alternation, '(?:' + parts.join('|') + ')'):
    //   C = join([Cc, Cf, Cn, Co, Cs]);
    //   L = join([Ll, Lm, Lo, Lt, Lu]);
    //   M = join([Mc, Me, Mn]);
    //   N = join([Nd, Nl, No]);
    //   P = join([Pc, Pd, Pe, Pf, Pi, Po, Ps]);
    //   S = join([Sc, Sk, Sm, So]);
    //   Z = join([Zl, Zp, Zs]);
    //   \X = (?>\PM\pM*)
    //   "Extended properties such as "Greek" or "InMusicalSymbols" are not supported by PCRE."

    // Hard-coded because arguments.callee is forbidden in strict mode / ES modules.
    var funcName = 'preg_match';
    // Opening of a PCRE named group: (?<name> , (?P<name> or (?'name'
    // Look-behinds "(?<=" / "(?<!" do not match because a name cannot start with '=' or '!'.
    var namedGroupHead = /^\(\?(?:P?<([A-Za-z_]\w*)>|'([A-Za-z_]\w*)')/;
    var source = '';
    var flagPart = '';
    var flag = '';
    var ignoreCase = false;
    var multiline = false;
    var lastDelimPos = -1;
    var i = 0;
    var parsed, regex, text, start, found, groupName;

    // Rewrites named groups as plain capturing groups (so engines without native
    // named-group support can compile the pattern) and records each name under the
    // real index of its capture group. Escapes and character classes are skipped so
    // that a literal "\(" or "[(]" is not mistaken for a group.
    var stripNamedGroups = function (src) {
      var out = '';
      var names = [];
      var groupCount = 0;
      var inClass = false;
      var pos = 0;
      var ch, head;

      while (pos < src.length) {
        ch = src.charAt(pos);
        if (ch === '\\') { // escaped character: copy the pair untouched
          out += src.slice(pos, pos + 2);
          pos += 2;
          continue;
        }
        if (inClass) {
          if (ch === ']') {
            inClass = false;
          }
        } else if (ch === '[') {
          inClass = true;
        } else if (ch === '(') {
          if (src.charAt(pos + 1) !== '?') {
            groupCount++; // ordinary capturing group
          } else {
            head = namedGroupHead.exec(src.slice(pos));
            if (head) {
              groupCount++;
              names[groupCount] = head[1] || head[2];
              out += '(';
              pos += head[0].length;
              continue;
            }
            // any other "(?..." construct is non-capturing and is copied as is
          }
        }
        out += ch;
        pos++;
      }
      return { source: out, names: names };
    };

    if (typeof pattern === 'string') {
      lastDelimPos = pattern.lastIndexOf(pattern.charAt(0));
      if (lastDelimPos <= 0) {
        // Convenience: no closing delimiter found, so treat the string as a raw pattern.
        // (An empty string yields an empty expression, which matches at the search start.)
        source = pattern;
      } else {
        source = pattern.slice(1, lastDelimPos);
        flagPart = pattern.slice(lastDelimPos + 1);
        for (i = 0; i < flagPart.length; i++) {
          flag = flagPart.charAt(i);
          switch (flag) {
          case 'g': // Not used by preg_match, but presumably not an error
            break;
          case 'i':
            ignoreCase = true;
            break;
          case 'm':
            multiline = true;
            break;
          case 'e': // Only meaningful for preg_replace; safely ignorable here
            break;
          case 's': // dot matches newlines too
          case 'x': // extended: unescaped whitespace and #-comments in the pattern are ignored
          case 'A': // anchored: match only at the start of the searched string
          case 'D': // dollar matches only at the very end of the subject (ignored if 'm' is set)
          case 'U': // ungreedy: quantifiers are lazy by default and greedy when followed by "?"
          case 'J': // PCRE_DUPNAMES: allow duplicate names for subpatterns
          case 'u': // pattern and subject are treated as UTF-8
            throw 'The passed flag "' + flag + '" is presently unsupported in ' + funcName;
          case 'X': // PCRE_EXTRA: unknown backslash-letter escapes become errors (see notes above)
            throw 'X flag is unimplemented at present';
          case 'S': // extra pattern analysis to speed up subsequent matches
            throw 'The passed flag "' + flag + '" to ' + funcName + ' cannot be implemented in JavaScript'; // Could possibly optimize inefficient expressions, however
          case 'y':
            throw 'Flag "y" is a non-cross-browser, non-PHP flag, not supported in ' + funcName;
          default:
            throw 'Unrecognized flag "' + flag + '" passed to ' + funcName;
          }
        }
      }
    } else {
      // RegExp object/literal: work on its source so named subpatterns can be used as well.
      source = pattern.source;
      ignoreCase = !!pattern.ignoreCase;
      multiline = !!pattern.multiline;
    }

    parsed = stripNamedGroups(source);
    // Always compile with 'g': exec() then starts searching at lastIndex, which is how
    // the offset parameter is honoured. Booleans (instead of a concatenated flag string)
    // keep repeated modifiers such as '/a/ii' from breaking the RegExp constructor.
    regex = new RegExp(parsed.source, 'g' + (ignoreCase ? 'i' : '') + (multiline ? 'm' : ''));

    text = String(subject);
    start = parseInt(offset, 10) || 0;
    if (start < 0) {
      start = Math.max(0, text.length + start);
    }
    regex.lastIndex = start;

    found = regex.exec(text);
    if (!found) {
      return 0;
    }

    if (flags === 'PREG_OFFSET_CAPTURE' || flags === 256) {
      // Not implemented: matches would need [text, offset] pairs, so return early.
      return 1;
    }

    if (matches) {
      for (i = 0; i < found.length; i++) {
        groupName = parsed.names[i];
        if (groupName !== undefined) {
          matches[groupName] = found[i];
        }
        matches[i] = found[i];
      }
    }
    return 1;
  }