PageRenderTime 149ms CodeModel.GetById 29ms RepoModel.GetById 1ms app.codeStats 0ms

/DLR_Main/Languages/IronPython/IronPython.Modules/re.cs

https://bitbucket.org/mdavid/dlr
C# | 1139 lines | 880 code | 169 blank | 90 comment | 200 complexity | fb1c57795022386ffe6b3d6eab990b11 MD5 | raw file
  1. /* ****************************************************************************
  2. *
  3. * Copyright (c) Microsoft Corporation.
  4. *
  5. * This source code is subject to terms and conditions of the Apache License, Version 2.0. A
  6. * copy of the license can be found in the License.html file at the root of this distribution. If
  7. * you cannot locate the Apache License, Version 2.0, please send an email to
  8. * ironpy@microsoft.com. By using this source code in any fashion, you are agreeing to be bound
  9. * by the terms of the Apache License, Version 2.0.
  10. *
  11. * You must not remove this notice, or any other, from this software.
  12. *
  13. *
  14. * ***************************************************************************/
  15. using System;
  16. using System.Collections;
  17. using System.Collections.Generic;
  18. using System.Diagnostics;
  19. using System.Runtime.CompilerServices;
  20. using System.Runtime.InteropServices;
  21. using System.Text;
  22. using System.Text.RegularExpressions;
  23. using Microsoft.Scripting;
  24. using Microsoft.Scripting.Runtime;
  25. using Microsoft.Scripting.Utils;
  26. using IronPython.Runtime;
  27. using IronPython.Runtime.Exceptions;
  28. using IronPython.Runtime.Operations;
  29. using IronPython.Runtime.Types;
  30. [assembly: PythonModule("re", typeof(IronPython.Modules.PythonRegex))]
  31. namespace IronPython.Modules {
  32. /// <summary>
  33. /// Python regular expression module.
  34. /// </summary>
  35. public static class PythonRegex {
  36. private static CacheDict<PatternKey, RE_Pattern> _cachedPatterns = new CacheDict<PatternKey, RE_Pattern>(100);
  37. [SpecialName]
  38. public static void PerformModuleReload(PythonContext/*!*/ context, PythonDictionary/*!*/ dict) {
  39. context.EnsureModuleException("reerror", dict, "error", "re");
  40. PythonCopyReg.GetDispatchTable(context.SharedContext)[DynamicHelpers.GetPythonTypeFromType(typeof(RE_Pattern))] = dict["_pickle"];
  41. }
  42. private static readonly Random r = new Random(DateTime.Now.Millisecond);
  43. #region CONSTANTS
  44. // short forms
  45. public const int I = 0x02;
  46. public const int L = 0x04;
  47. public const int M = 0x08;
  48. public const int S = 0x10;
  49. public const int U = 0x20;
  50. public const int X = 0x40;
  51. // long forms
  52. public const int IGNORECASE = 0x02;
  53. public const int LOCALE = 0x04;
  54. public const int MULTILINE = 0x08;
  55. public const int DOTALL = 0x10;
  56. public const int UNICODE = 0x20;
  57. public const int VERBOSE = 0x40;
  58. #endregion
  59. #region Public API Surface
  60. public static RE_Pattern compile(CodeContext/*!*/ context, object pattern) {
  61. try {
  62. return new RE_Pattern(context, ValidatePattern(pattern), 0, true);
  63. } catch (ArgumentException e) {
  64. throw PythonExceptions.CreateThrowable(error(context), e.Message);
  65. }
  66. }
  67. public static RE_Pattern compile(CodeContext/*!*/ context, object pattern, object flags) {
  68. try {
  69. return new RE_Pattern(context, ValidatePattern(pattern), PythonContext.GetContext(context).ConvertToInt32(flags), true);
  70. } catch (ArgumentException e) {
  71. throw PythonExceptions.CreateThrowable(error(context), e.Message);
  72. }
  73. }
  74. public const string engine = "cli reg ex";
  75. public static string escape(string text) {
  76. if (text == null) throw PythonOps.TypeError("text must not be None");
  77. for (int i = 0; i < text.Length; i++) {
  78. if (!Char.IsLetterOrDigit(text[i])) {
  79. StringBuilder sb = new StringBuilder(text, 0, i, text.Length);
  80. char ch = text[i];
  81. do {
  82. sb.Append('\\');
  83. sb.Append(ch);
  84. i++;
  85. int last = i;
  86. while (i < text.Length) {
  87. ch = text[i];
  88. if (!Char.IsLetterOrDigit(ch)) {
  89. break;
  90. }
  91. i++;
  92. }
  93. sb.Append(text, last, i - last);
  94. } while (i < text.Length);
  95. return sb.ToString();
  96. }
  97. }
  98. return text;
  99. }
  100. public static List findall(CodeContext/*!*/ context, object pattern, string @string) {
  101. return findall(context, pattern, @string, 0);
  102. }
  103. public static List findall(CodeContext/*!*/ context, object pattern, string @string, int flags) {
  104. RE_Pattern pat = GetPattern(context, ValidatePattern(pattern), flags);
  105. ValidateString(@string, "string");
  106. MatchCollection mc = pat.FindAllWorker(context, @string, 0, @string.Length);
  107. return FixFindAllMatch(pat, mc);
  108. }
  109. private static List FixFindAllMatch(RE_Pattern pat, MatchCollection mc) {
  110. object[] matches = new object[mc.Count];
  111. int numgrps = pat._re.GetGroupNumbers().Length;
  112. for (int i = 0; i < mc.Count; i++) {
  113. if (numgrps > 2) { // CLR gives us a "bonus" group of 0 - the entire expression
  114. // at this point we have more than one group in the pattern;
  115. // need to return a list of tuples in this case
  116. // for each match item in the matchcollection, create a tuple representing what was matched
  117. // e.g. findall("(\d+)|(\w+)", "x = 99y") == [('', 'x'), ('99', ''), ('', 'y')]
  118. // in the example above, ('', 'x') did not match (\d+) as indicated by '' but did
  119. // match (\w+) as indicated by 'x' and so on...
  120. int k = 0;
  121. List<object> tpl = new List<object>();
  122. foreach (Group g in mc[i].Groups) {
  123. // here also the CLR gives us a "bonus" match as the first item which is the
  124. // group that was actually matched in the tuple e.g. we get 'x', '', 'x' for
  125. // the first match object...so we'll skip the first item when creating the
  126. // tuple
  127. if (k++ != 0) {
  128. tpl.Add(g.Value);
  129. }
  130. }
  131. matches[i] = PythonTuple.Make(tpl);
  132. } else if (numgrps == 2) {
  133. // at this point we have exactly one group in the pattern (including the "bonus" one given
  134. // by the CLR
  135. // skip the first match since that contains the entire match and not the group match
  136. // e.g. re.findall(r"(\w+)\s+fish\b", "green fish") will have "green fish" in the 0
  137. // index and "green" as the (\w+) group match
  138. matches[i] = mc[i].Groups[1].Value;
  139. } else {
  140. matches[i] = mc[i].Value;
  141. }
  142. }
  143. return List.FromArrayNoCopy(matches);
  144. }
  145. public static object finditer(CodeContext/*!*/ context, object pattern, object @string) {
  146. return finditer(context, pattern, @string, 0);
  147. }
  148. public static object finditer(CodeContext/*!*/ context, object pattern, object @string, int flags) {
  149. RE_Pattern pat = GetPattern(context, ValidatePattern(pattern), flags);
  150. string str = ValidateString(@string, "string");
  151. return MatchIterator(pat.FindAllWorker(context, str, 0, str.Length), pat, str);
  152. }
  153. public static RE_Match match(CodeContext/*!*/ context, object pattern, object @string) {
  154. return match(context, pattern, @string, 0);
  155. }
  156. public static RE_Match match(CodeContext/*!*/ context, object pattern, object @string, int flags) {
  157. return GetPattern(context, ValidatePattern(pattern), flags).match(ValidateString(@string, "string"));
  158. }
  159. public static RE_Match search(CodeContext/*!*/ context, object pattern, object @string) {
  160. return search(context, pattern, @string, 0);
  161. }
  162. public static RE_Match search(CodeContext/*!*/ context, object pattern, object @string, int flags) {
  163. return GetPattern(context, ValidatePattern(pattern), flags).search(ValidateString(@string, "string"));
  164. }
  165. [return: SequenceTypeInfo(typeof(string))]
  166. public static List split(CodeContext/*!*/ context, object pattern, object @string) {
  167. return split(context, ValidatePattern(pattern), ValidateString(@string, "string"), 0);
  168. }
  169. [return: SequenceTypeInfo(typeof(string))]
  170. public static List split(CodeContext/*!*/ context, object pattern, object @string, int maxsplit) {
  171. return GetPattern(context, ValidatePattern(pattern), 0).split(ValidateString(@string, "string"),
  172. maxsplit);
  173. }
  174. public static string sub(CodeContext/*!*/ context, object pattern, object repl, object @string) {
  175. return sub(context, pattern, repl, @string, Int32.MaxValue);
  176. }
  177. public static string sub(CodeContext/*!*/ context, object pattern, object repl, object @string, int count) {
  178. return GetPattern(context, ValidatePattern(pattern), 0).sub(context, repl, ValidateString(@string, "string"), count);
  179. }
  180. public static object subn(CodeContext/*!*/ context, object pattern, object repl, object @string) {
  181. return subn(context, pattern, repl, @string, Int32.MaxValue);
  182. }
  183. public static object subn(CodeContext/*!*/ context, object pattern, object repl, object @string, int count) {
  184. return GetPattern(context, ValidatePattern(pattern), 0).subn(context, repl, ValidateString(@string, "string"), count);
  185. }
  186. public static void purge() {
  187. _cachedPatterns = new CacheDict<PatternKey, RE_Pattern>(100);
  188. }
  189. #endregion
  190. #region Public classes
  191. /// <summary>
  192. /// Compiled reg-ex pattern
  193. /// </summary>
  194. [PythonType]
  195. public class RE_Pattern : IWeakReferenceable {
  196. internal Regex _re;
  197. private PythonDictionary _groups;
  198. private int _compileFlags;
  199. private WeakRefTracker _weakRefTracker;
  200. internal ParsedRegex _pre;
  201. internal RE_Pattern(CodeContext/*!*/ context, object pattern)
  202. : this(context, pattern, 0) {
  203. }
  204. internal RE_Pattern(CodeContext/*!*/ context, object pattern, int flags) :
  205. this(context, pattern, flags, false) {
  206. }
  207. internal RE_Pattern(CodeContext/*!*/ context, object pattern, int flags, bool compiled) {
  208. _pre = PreParseRegex(context, ValidatePatternAsString(pattern));
  209. try {
  210. flags |= OptionToFlags(_pre.Options);
  211. RegexOptions opts = FlagsToOption(flags);
  212. #if SILVERLIGHT
  213. this._re = new Regex(_pre.Pattern, opts);
  214. #else
  215. this._re = new Regex(_pre.Pattern, opts | (compiled ? RegexOptions.Compiled : RegexOptions.None));
  216. #endif
  217. } catch (ArgumentException e) {
  218. throw PythonExceptions.CreateThrowable(error(context), e.Message);
  219. }
  220. this._compileFlags = flags;
  221. }
  222. public RE_Match match(object text) {
  223. string input = ValidateString(text, "text");
  224. return RE_Match.makeMatch(_re.Match(input), this, input, 0, input.Length);
  225. }
  226. private static int FixPosition(string text, int position) {
  227. if (position < 0) return 0;
  228. if (position > text.Length) return text.Length;
  229. return position;
  230. }
  231. public RE_Match match(object text, int pos) {
  232. string input = ValidateString(text, "text");
  233. pos = FixPosition(input, pos);
  234. return RE_Match.makeMatch(_re.Match(input, pos), this, input, pos, input.Length);
  235. }
  236. public RE_Match match(object text, [DefaultParameterValue(0)]int pos, int endpos) {
  237. string input = ValidateString(text, "text");
  238. pos = FixPosition(input, pos);
  239. endpos = FixPosition(input, endpos);
  240. return RE_Match.makeMatch(
  241. _re.Match(input.Substring(0, endpos), pos),
  242. this,
  243. input,
  244. pos,
  245. endpos);
  246. }
  247. public RE_Match search(object text) {
  248. string input = ValidateString(text, "text");
  249. return RE_Match.make(_re.Match(input), this, input);
  250. }
  251. public RE_Match search(object text, int pos) {
  252. string input = ValidateString(text, "text");
  253. return RE_Match.make(_re.Match(input, pos, input.Length - pos), this, input);
  254. }
  255. public RE_Match search(object text, int pos, int endpos) {
  256. string input = ValidateString(text, "text");
  257. return RE_Match.make(_re.Match(input, pos, Math.Min(Math.Max(endpos - pos, 0), input.Length - pos)), this, input);
  258. }
  259. public object findall(CodeContext/*!*/ context, string @string) {
  260. return findall(context, @string, 0, null);
  261. }
  262. public object findall(CodeContext/*!*/ context, string @string, int pos) {
  263. return findall(context, @string, pos, null);
  264. }
  265. public object findall(CodeContext/*!*/ context, object @string, int pos, object endpos) {
  266. MatchCollection mc = FindAllWorker(context, ValidateString(@string, "text"), pos, endpos);
  267. return FixFindAllMatch(this, mc);
  268. }
  269. internal MatchCollection FindAllWorker(CodeContext/*!*/ context, string str, int pos, object endpos) {
  270. string against = str;
  271. if (endpos != null) {
  272. int end = PythonContext.GetContext(context).ConvertToInt32(endpos);
  273. against = against.Substring(0, Math.Max(end, 0));
  274. }
  275. return _re.Matches(against, pos);
  276. }
  277. public object finditer(CodeContext/*!*/ context, object @string) {
  278. string input = ValidateString(@string, "string");
  279. return MatchIterator(FindAllWorker(context, input, 0, input.Length), this, input);
  280. }
  281. public object finditer(CodeContext/*!*/ context, object @string, int pos) {
  282. string input = ValidateString(@string, "string");
  283. return MatchIterator(FindAllWorker(context, input, pos, input.Length), this, input);
  284. }
  285. public object finditer(CodeContext/*!*/ context, object @string, int pos, int endpos) {
  286. string input = ValidateString(@string, "string");
  287. return MatchIterator(FindAllWorker(context, input, pos, endpos), this, input);
  288. }
  289. [return: SequenceTypeInfo(typeof(string))]
  290. public List split(string @string) {
  291. return split(@string, 0);
  292. }
  293. [return: SequenceTypeInfo(typeof(string))]
  294. public List split(object @string, int maxsplit) {
  295. List result = new List();
  296. // fast path for negative maxSplit ( == "make no splits")
  297. if (maxsplit < 0) {
  298. result.AddNoLock(ValidateString(@string, "string"));
  299. } else {
  300. // iterate over all matches
  301. string theStr = ValidateString(@string, "string");
  302. MatchCollection matches = _re.Matches(theStr);
  303. int lastPos = 0; // is either start of the string, or first position *after* the last match
  304. int nSplits = 0; // how many splits have occurred?
  305. foreach (Match m in matches) {
  306. if (m.Length > 0) {
  307. // add substring from lastPos to beginning of current match
  308. result.AddNoLock(theStr.Substring(lastPos, m.Index - lastPos));
  309. // if there are subgroups of the match, add their match or None
  310. if (m.Groups.Count > 1)
  311. for (int i = 1; i < m.Groups.Count; i++)
  312. if (m.Groups[i].Success)
  313. result.AddNoLock(m.Groups[i].Value);
  314. else
  315. result.AddNoLock(null);
  316. // update lastPos, nSplits
  317. lastPos = m.Index + m.Length;
  318. nSplits++;
  319. if (nSplits == maxsplit)
  320. break;
  321. }
  322. }
  323. // add tail following last match
  324. result.AddNoLock(theStr.Substring(lastPos));
  325. }
  326. return result;
  327. }
  328. public string sub(CodeContext/*!*/ context, object repl, object @string) {
  329. return sub(context, repl, ValidateString(@string, "string"), Int32.MaxValue);
  330. }
  331. public string sub(CodeContext/*!*/ context, object repl, object @string, int count) {
  332. if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");
  333. // if 'count' is omitted or 0, all occurrences are replaced
  334. if (count == 0) count = Int32.MaxValue;
  335. string replacement = repl as string;
  336. if (replacement == null) {
  337. if (repl is ExtensibleString) {
  338. replacement = (repl as ExtensibleString).Value;
  339. }
  340. }
  341. Match prev = null;
  342. string input = ValidateString(@string, "string");
  343. return _re.Replace(
  344. input,
  345. delegate(Match match) {
  346. // from the docs: Empty matches for the pattern are replaced
  347. // only when not adjacent to a previous match
  348. if (String.IsNullOrEmpty(match.Value) && prev != null &&
  349. (prev.Index + prev.Length) == match.Index) {
  350. return "";
  351. };
  352. prev = match;
  353. if (replacement != null) return UnescapeGroups(match, replacement);
  354. return PythonCalls.Call(context, repl, RE_Match.make(match, this, input)) as string;
  355. },
  356. count);
  357. }
  358. public object subn(CodeContext/*!*/ context, object repl, string @string) {
  359. return subn(context, repl, @string, Int32.MaxValue);
  360. }
  361. public object subn(CodeContext/*!*/ context, object repl, object @string, int count) {
  362. if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");
  363. // if 'count' is omitted or 0, all occurrences are replaced
  364. if (count == 0) count = Int32.MaxValue;
  365. int totalCount = 0;
  366. string res;
  367. string replacement = repl as string;
  368. if (replacement == null) {
  369. if (repl is ExtensibleString) {
  370. replacement = (repl as ExtensibleString).Value;
  371. }
  372. }
  373. Match prev = null;
  374. string input = ValidateString(@string, "string");
  375. res = _re.Replace(
  376. input,
  377. delegate(Match match) {
  378. // from the docs: Empty matches for the pattern are replaced
  379. // only when not adjacent to a previous match
  380. if (String.IsNullOrEmpty(match.Value) && prev != null &&
  381. (prev.Index + prev.Length) == match.Index) {
  382. return "";
  383. };
  384. prev = match;
  385. totalCount++;
  386. if (replacement != null) return UnescapeGroups(match, replacement);
  387. return PythonCalls.Call(context, repl, RE_Match.make(match, this, input)) as string;
  388. },
  389. count);
  390. return PythonTuple.MakeTuple(res, totalCount);
  391. }
  392. public int flags {
  393. get {
  394. return _compileFlags;
  395. }
  396. }
  397. public PythonDictionary groupindex {
  398. get {
  399. if (_groups == null) {
  400. PythonDictionary d = new PythonDictionary();
  401. string[] names = _re.GetGroupNames();
  402. int[] nums = _re.GetGroupNumbers();
  403. for (int i = 1; i < names.Length; i++) {
  404. if (Char.IsDigit(names[i][0]) || names[i].StartsWith(_mangledNamedGroup)) {
  405. // skip numeric names and our mangling for unnamed groups mixed w/ named groups.
  406. continue;
  407. }
  408. d[names[i]] = nums[i];
  409. }
  410. _groups = d;
  411. }
  412. return _groups;
  413. }
  414. }
  415. public int groups {
  416. get {
  417. return _re.GetGroupNumbers().Length - 1;
  418. }
  419. }
  420. public string pattern {
  421. get {
  422. return _pre.UserPattern;
  423. }
  424. }
  425. public override bool Equals(object obj) {
  426. RE_Pattern other = obj as RE_Pattern;
  427. if (other == null) {
  428. return false;
  429. }
  430. return other.pattern == pattern && other.flags == flags;
  431. }
  432. public override int GetHashCode() {
  433. return pattern.GetHashCode() ^ flags;
  434. }
  435. #region IWeakReferenceable Members
  436. WeakRefTracker IWeakReferenceable.GetWeakRef() {
  437. return _weakRefTracker;
  438. }
  439. bool IWeakReferenceable.SetWeakRef(WeakRefTracker value) {
  440. _weakRefTracker = value;
  441. return true;
  442. }
  443. void IWeakReferenceable.SetFinalizer(WeakRefTracker value) {
  444. ((IWeakReferenceable)this).SetWeakRef(value);
  445. }
  446. #endregion
  447. }
  448. public static PythonTuple _pickle(CodeContext/*!*/ context, RE_Pattern pattern) {
  449. object scope = Importer.ImportModule(context, new PythonDictionary(), "re", false, 0);
  450. object compile;
  451. if (scope is PythonModule && ((PythonModule)scope).__dict__.TryGetValue("compile", out compile)) {
  452. return PythonTuple.MakeTuple(compile, PythonTuple.MakeTuple(pattern.pattern, pattern.flags));
  453. }
  454. throw new InvalidOperationException("couldn't find compile method");
  455. }
  456. [PythonType]
  457. public class RE_Match {
  458. RE_Pattern _pattern;
  459. private Match _m;
  460. private string _text;
  461. private int _lastindex = -1;
  462. private int _pos, _endPos;
  463. #region Internal makers
  464. internal static RE_Match make(Match m, RE_Pattern pattern, string input) {
  465. if (m.Success) return new RE_Match(m, pattern, input, 0, input.Length);
  466. return null;
  467. }
  468. internal static RE_Match make(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
  469. if (m.Success) return new RE_Match(m, pattern, input, offset, endpos);
  470. return null;
  471. }
  472. internal static RE_Match makeMatch(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
  473. if (m.Success && m.Index == offset) return new RE_Match(m, pattern, input, offset, endpos);
  474. return null;
  475. }
  476. #endregion
  477. #region Public ctors
  478. public RE_Match(Match m, RE_Pattern pattern, string text) {
  479. _m = m;
  480. _pattern = pattern;
  481. _text = text;
  482. }
  483. public RE_Match(Match m, RE_Pattern pattern, string text, int pos, int endpos) {
  484. _m = m;
  485. _pattern = pattern;
  486. _text = text;
  487. _pos = pos;
  488. _endPos = endpos;
  489. }
  490. #endregion
  491. // public override bool __nonzero__() {
  492. // return m.Success;
  493. // }
  494. #region Public API Surface
  495. public int end() {
  496. return _m.Index + _m.Length;
  497. }
  498. public int start() {
  499. return _m.Index;
  500. }
  501. public int start(object group) {
  502. int grpIndex = GetGroupIndex(group);
  503. if (!_m.Groups[grpIndex].Success) {
  504. return -1;
  505. }
  506. return _m.Groups[grpIndex].Index;
  507. }
  508. public int end(object group) {
  509. int grpIndex = GetGroupIndex(group);
  510. if (!_m.Groups[grpIndex].Success) {
  511. return -1;
  512. }
  513. return _m.Groups[grpIndex].Index + _m.Groups[grpIndex].Length;
  514. }
  515. public object group(object index, params object[] additional) {
  516. if (additional.Length == 0) {
  517. return group(index);
  518. }
  519. object[] res = new object[additional.Length + 1];
  520. res[0] = _m.Groups[GetGroupIndex(index)].Success ? _m.Groups[GetGroupIndex(index)].Value : null;
  521. for (int i = 1; i < res.Length; i++) {
  522. int grpIndex = GetGroupIndex(additional[i - 1]);
  523. res[i] = _m.Groups[grpIndex].Success ? _m.Groups[grpIndex].Value : null;
  524. }
  525. return PythonTuple.MakeTuple(res);
  526. }
  527. public string group(object index) {
  528. int pos = GetGroupIndex(index);
  529. Group g = _m.Groups[pos];
  530. return g.Success ? g.Value : null;
  531. }
  532. public string group() {
  533. return group(0);
  534. }
  535. [return: SequenceTypeInfo(typeof(string))]
  536. public PythonTuple groups() {
  537. return groups(null);
  538. }
  539. public PythonTuple groups(object @default) {
  540. object[] ret = new object[_m.Groups.Count - 1];
  541. for (int i = 1; i < _m.Groups.Count; i++) {
  542. if (!_m.Groups[i].Success) {
  543. ret[i - 1] = @default;
  544. } else {
  545. ret[i - 1] = _m.Groups[i].Value;
  546. }
  547. }
  548. return PythonTuple.MakeTuple(ret);
  549. }
  550. public string expand(object template) {
  551. string strTmp = ValidateString(template, "template");
  552. StringBuilder res = new StringBuilder();
  553. for (int i = 0; i < strTmp.Length; i++) {
  554. if (strTmp[i] != '\\') { res.Append(strTmp[i]); continue; }
  555. if (++i == strTmp.Length) { res.Append(strTmp[i - 1]); continue; }
  556. if (Char.IsDigit(strTmp[i])) {
  557. AppendGroup(res, (int)(strTmp[i] - '0'));
  558. } else if (strTmp[i] == 'g') {
  559. if (++i == strTmp.Length) { res.Append("\\g"); return res.ToString(); }
  560. if (strTmp[i] != '<') {
  561. res.Append("\\g<"); continue;
  562. } else { // '<'
  563. StringBuilder name = new StringBuilder();
  564. i++;
  565. while (strTmp[i] != '>' && i < strTmp.Length) {
  566. name.Append(strTmp[i++]);
  567. }
  568. AppendGroup(res, _pattern._re.GroupNumberFromName(name.ToString()));
  569. }
  570. } else {
  571. switch (strTmp[i]) {
  572. case 'n': res.Append('\n'); break;
  573. case 'r': res.Append('\r'); break;
  574. case 't': res.Append('\t'); break;
  575. case '\\': res.Append('\\'); break;
  576. }
  577. }
  578. }
  579. return res.ToString();
  580. }
  581. [return: DictionaryTypeInfo(typeof(string), typeof(string))]
  582. public PythonDictionary groupdict() {
  583. return groupdict(null);
  584. }
  585. private static bool IsGroupNumber(string name) {
  586. foreach (char c in name) {
  587. if (!Char.IsNumber(c)) return false;
  588. }
  589. return true;
  590. }
  591. [return: DictionaryTypeInfo(typeof(string), typeof(string))]
  592. public PythonDictionary groupdict([NotNull]string value) {
  593. return groupdict((object)value);
  594. }
  595. [return: DictionaryTypeInfo(typeof(string), typeof(object))]
  596. public PythonDictionary groupdict(object value) {
  597. string[] groupNames = this._pattern._re.GetGroupNames();
  598. Debug.Assert(groupNames.Length == this._m.Groups.Count);
  599. PythonDictionary d = new PythonDictionary();
  600. for (int i = 0; i < groupNames.Length; i++) {
  601. if (IsGroupNumber(groupNames[i])) continue; // python doesn't report group numbers
  602. if (_m.Groups[i].Captures.Count != 0) {
  603. d[groupNames[i]] = _m.Groups[i].Value;
  604. } else {
  605. d[groupNames[i]] = value;
  606. }
  607. }
  608. return d;
  609. }
  610. [return: SequenceTypeInfo(typeof(int))]
  611. public PythonTuple span() {
  612. return PythonTuple.MakeTuple(this.start(), this.end());
  613. }
  614. [return: SequenceTypeInfo(typeof(int))]
  615. public PythonTuple span(object group) {
  616. return PythonTuple.MakeTuple(this.start(group), this.end(group));
  617. }
  618. public int pos {
  619. get {
  620. return _pos;
  621. }
  622. }
  623. public int endpos {
  624. get {
  625. return _endPos;
  626. }
  627. }
  628. public string @string {
  629. get {
  630. return _text;
  631. }
  632. }
  633. public PythonTuple regs {
  634. get {
  635. object[] res = new object[_m.Groups.Count];
  636. for (int i = 0; i < res.Length; i++) {
  637. res[i] = PythonTuple.MakeTuple(start(i), end(i));
  638. }
  639. return PythonTuple.MakeTuple(res);
  640. }
  641. }
  642. public RE_Pattern re {
  643. get {
  644. return _pattern;
  645. }
  646. }
  647. public object lastindex {
  648. get {
  649. // -1 : initial value of lastindex
  650. // 0 : no match found
  651. //other : the true lastindex
  652. // Match.Groups contains "lower" level matched groups, which has to be removed
  653. if (_lastindex == -1) {
  654. int i = 1;
  655. while (i < _m.Groups.Count) {
  656. if (_m.Groups[i].Success) {
  657. _lastindex = i;
  658. int start = _m.Groups[i].Index;
  659. int end = start + _m.Groups[i].Length;
  660. i++;
  661. // skip any group which fall into the range [start, end],
  662. // no matter match succeed or fail
  663. while (i < _m.Groups.Count && (_m.Groups[i].Index < end)) {
  664. i++;
  665. }
  666. } else {
  667. i++;
  668. }
  669. }
  670. if (_lastindex == -1) {
  671. _lastindex = 0;
  672. }
  673. }
  674. if (_lastindex == 0) {
  675. return null;
  676. } else {
  677. return _lastindex;
  678. }
  679. }
  680. }
  681. public string lastgroup {
  682. get {
  683. if (lastindex == null) return null;
  684. // when group was not explicitly named, RegEx assigns the number as name
  685. // This is different from C-Python, which returns None in such cases
  686. return this._pattern._re.GroupNameFromNumber((int)lastindex);
  687. }
  688. }
  689. #endregion
  690. #region Private helper functions
  691. private void AppendGroup(StringBuilder sb, int index) {
  692. sb.Append(_m.Groups[index].Value);
  693. }
  694. private int GetGroupIndex(object group) {
  695. int grpIndex;
  696. if (!Converter.TryConvertToInt32(group, out grpIndex)) {
  697. grpIndex = _pattern._re.GroupNumberFromName(ValidateString(group, "group"));
  698. }
  699. if (grpIndex < 0 || grpIndex >= _m.Groups.Count) {
  700. throw PythonOps.IndexError("no such group");
  701. }
  702. return grpIndex;
  703. }
  704. #endregion
  705. }
  706. #endregion
  707. #region Private helper functions
  708. private static RE_Pattern GetPattern(CodeContext/*!*/ context, object pattern, int flags) {
  709. RE_Pattern res = pattern as RE_Pattern;
  710. if (res != null) {
  711. return res;
  712. }
  713. string strPattern = ValidatePatternAsString(pattern);
  714. PatternKey key = new PatternKey(strPattern, flags);
  715. lock (_cachedPatterns) {
  716. if (_cachedPatterns.TryGetValue(new PatternKey(strPattern, flags), out res)) {
  717. return res;
  718. }
  719. res = new RE_Pattern(context, strPattern, flags);
  720. _cachedPatterns[key] = res;
  721. return res;
  722. }
  723. }
  724. private static IEnumerator MatchIterator(MatchCollection matches, RE_Pattern pattern, string input) {
  725. for (int i = 0; i < matches.Count; i++) {
  726. yield return RE_Match.make(matches[i], pattern, input, 0, input.Length);
  727. }
  728. }
  729. private static RegexOptions FlagsToOption(int flags) {
  730. RegexOptions opts = RegexOptions.None;
  731. if ((flags & (int)IGNORECASE) != 0) opts |= RegexOptions.IgnoreCase;
  732. if ((flags & (int)MULTILINE) != 0) opts |= RegexOptions.Multiline;
  733. if (((flags & (int)LOCALE)) == 0) opts &= (~RegexOptions.CultureInvariant);
  734. if ((flags & (int)DOTALL) != 0) opts |= RegexOptions.Singleline;
  735. if ((flags & (int)VERBOSE) != 0) opts |= RegexOptions.IgnorePatternWhitespace;
  736. return opts;
  737. }
  738. private static int OptionToFlags(RegexOptions options) {
  739. int flags = 0;
  740. if ((options & RegexOptions.IgnoreCase) != 0) {
  741. flags |= IGNORECASE;
  742. }
  743. if ((options & RegexOptions.Multiline) != 0) {
  744. flags |= MULTILINE;
  745. }
  746. if ((options & RegexOptions.CultureInvariant) == 0) {
  747. flags |= LOCALE;
  748. }
  749. if ((options & RegexOptions.Singleline) != 0) {
  750. flags |= DOTALL;
  751. }
  752. if ((options & RegexOptions.IgnorePatternWhitespace) != 0) {
  753. flags |= VERBOSE;
  754. }
  755. return flags;
  756. }
  757. internal class ParsedRegex {
  758. public ParsedRegex(string pattern) {
  759. this.UserPattern = pattern;
  760. }
  761. public string UserPattern;
  762. public string Pattern;
  763. public RegexOptions Options = RegexOptions.CultureInvariant;
  764. }
  765. private static char[] _preParsedChars = new[] { '(', '{', '[', ']' };
  766. private const string _mangledNamedGroup = "___PyRegexNameMangled";
  /// <summary>
  /// Preparses a regular expression text returning a ParsedRegex class
  /// that can be used for further regular expressions.
  /// </summary>
  /// <remarks>
  /// Two passes are made over the pattern text.  The first pass looks at every
  /// unescaped '(', '{', '[' and ']' and rewrites Python-only syntax into the form
  /// the CLR regex engine expects ("{,n}" quantifiers, "(?P&lt;name&gt;...)" and
  /// "(?P=name)" groups, the "L" and "u" inline flags) while recording inline flags
  /// in the returned options.  The second pass removes the backslash from escapes of
  /// letter-like characters that are not in the list of known escape sequences.
  /// </remarks>
  /// <param name="context">Passed to <c>error(context)</c> to create the exception thrown for malformed patterns.</param>
  /// <param name="pattern">The pattern text as written by the user.</param>
  /// <returns>
  /// A ParsedRegex whose UserPattern is the original text, whose Pattern is the
  /// rewritten text and whose Options reflect the inline flags that were seen.
  /// </returns>
  private static ParsedRegex PreParseRegex(CodeContext/*!*/ context, string pattern) {
      ParsedRegex res = new ParsedRegex(pattern);

      int cur = 0, nameIndex;
      bool isCharList = false;
      bool containsNamedGroup = false;

      for (; ; ) {
          nameIndex = pattern.IndexOfAny(_preParsedChars, cur);

          if (nameIndex > 0 && pattern[nameIndex - 1] == '\\') {
              // count the run of back slashes that immediately precedes the character
              int curIndex = nameIndex - 2;
              int backslashCount = 1;
              while (curIndex >= 0 && pattern[curIndex] == '\\') {
                  backslashCount++;
                  curIndex--;
              }

              // odd number of back slashes, the character itself is escaped
              // so we should ignore it.
              if ((backslashCount & 0x01) != 0) {
                  // resume right after the escaped character; advancing by a single
                  // position would only find this same character again.
                  cur = nameIndex + 1;
                  continue;
              }
          }

          if (nameIndex == -1) break;
          // a special character in the last position has nothing after it to inspect
          if (nameIndex == pattern.Length - 1) break;

          switch (pattern[nameIndex]) {
              case '{':
                  nameIndex++;
                  // inside a character class "{," is just two literal characters and
                  // must be left alone.
                  if (!isCharList && pattern[nameIndex] == ',') {
                      // no beginning specified for the n-m quantifier, add the
                      // default 0 value.
                      pattern = pattern.Insert(nameIndex, "0");
                  }
                  break;
              case '[':
                  nameIndex++;
                  isCharList = true;
                  break;
              case ']':
                  nameIndex++;
                  isCharList = false;
                  break;
              case '(':
                  // make sure we're not dealing with [(]
                  if (!isCharList) {
                      switch (pattern[++nameIndex]) {
                          case '?':
                              // extension syntax
                              if (nameIndex == pattern.Length - 1) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");
                              switch (pattern[++nameIndex]) {
                                  case 'P':
                                      // named regex, .NET doesn't expect the P so we'll remove it;
                                      // also, once we see a named group i.e. ?P then we need to start artificially
                                      // naming all unnamed groups from then on---this is to get around the fact that
                                      // the CLR RegEx support orders all the unnamed groups before all the named
                                      // groups, even if the named groups are before the unnamed ones in the pattern;
                                      // the artificial naming preserves the order of the groups and thus the order of
                                      // the matches
                                      if (nameIndex + 1 < pattern.Length && pattern[nameIndex + 1] == '=') {
                                          // match whatever was previously matched by the named group
                                          // remove the (?P= and put \k< in its place
                                          pattern = pattern.Remove(nameIndex - 2, 4);
                                          pattern = pattern.Insert(nameIndex - 2, "\\k<");

                                          // the closing paren of the reference becomes the closing '>'
                                          int tmpIndex = nameIndex;
                                          while (tmpIndex < pattern.Length && pattern[tmpIndex] != ')')
                                              tmpIndex++;

                                          if (tmpIndex == pattern.Length) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");

                                          pattern = pattern.Substring(0, tmpIndex) + ">" + pattern.Substring(tmpIndex + 1);
                                      } else {
                                          containsNamedGroup = true;
                                          pattern = pattern.Remove(nameIndex, 1);
                                      }
                                      break;
                                  // the following inline flags are recorded in the options and
                                  // left in the pattern text as they are
                                  case 'i': res.Options |= RegexOptions.IgnoreCase; break;
                                  case 'L':
                                      res.Options &= ~(RegexOptions.CultureInvariant);
                                      RemoveOption(ref pattern, ref nameIndex);
                                      break;
                                  case 'm': res.Options |= RegexOptions.Multiline; break;
                                  case 's': res.Options |= RegexOptions.Singleline; break;
                                  case 'u':
                                      // specify unicode; not relevant and not valid under .NET as we're always unicode
                                      // -- so the option needs to be removed
                                      RemoveOption(ref pattern, ref nameIndex);
                                      break;
                                  case 'x': res.Options |= RegexOptions.IgnorePatternWhitespace; break;
                                  case ':': break; // non-capturing
                                  case '=': break; // look ahead assertion
                                  case '<': break; // look behind assertion
                                  case '!': break; // negative look ahead assertion
                                  case '#': break; // inline comment
                                  case '(':
                                      // conditional match alternation (?(id/name)yes-pattern|no-pattern)
                                      // move past ?( so we don't preparse the name.
                                      nameIndex++;
                                      break;
                                  default: throw PythonExceptions.CreateThrowable(error(context), "Unrecognized extension " + pattern[nameIndex]);
                              }
                              break;
                          default:
                              // just another group
                              if (containsNamedGroup) {
                                  // need to name this unnamed group
                                  pattern = pattern.Insert(nameIndex, "?<" + _mangledNamedGroup + GetRandomString() + ">");
                              }
                              break;
                      }
                  } else {
                      nameIndex++;
                  }
                  break;
          }

          cur = nameIndex;
      }

      // second pass: look at every escape sequence in the rewritten pattern
      cur = 0;
      for (; ; ) {
          nameIndex = pattern.IndexOf('\\', cur);

          if (nameIndex == -1 || nameIndex == pattern.Length - 1) break;

          // step onto the character that follows the back slash
          cur = ++nameIndex;
          char curChar = pattern[cur];
          switch (curChar) {
              case 'x':
              case 'u':
              case 'a':
              case 'b':
              case 'B':
              case 'e':
              case 'f':
              case 'k':
              case 'n':
              case 'r':
              case 't':
              case 'v':
              case 'c':
              case 's':
              case 'W':
              case 'w':
              case 'p':
              case 'P':
              case 'S':
              case 'd':
              case 'D':
              case 'A':
              case 'Z':
              case '\\':
                  // known escape sequences, leave escaped.
                  break;
              default:
                  System.Globalization.UnicodeCategory charClass = Char.GetUnicodeCategory(curChar);
                  switch (charClass) {
                      // recognized word characters, always unescape.
                      case System.Globalization.UnicodeCategory.ModifierLetter:
                      case System.Globalization.UnicodeCategory.LowercaseLetter:
                      case System.Globalization.UnicodeCategory.UppercaseLetter:
                      case System.Globalization.UnicodeCategory.TitlecaseLetter:
                      case System.Globalization.UnicodeCategory.OtherLetter:
                      case System.Globalization.UnicodeCategory.LetterNumber:
                      case System.Globalization.UnicodeCategory.OtherNumber:
                      case System.Globalization.UnicodeCategory.ConnectorPunctuation:
                          // drop the back slash; the character moves down one position
                          pattern = pattern.Remove(nameIndex - 1, 1);
                          cur--;
                          break;
                      case System.Globalization.UnicodeCategory.DecimalDigitNumber:
                          // actually don't want to unescape '\1', '\2' etc. which are references to groups
                          break;
                  }
                  break;
          }

          // continue after the character that was just examined
          if (++cur >= pattern.Length) {
              break;
          }
      }

      res.Pattern = pattern;
      return res;
  }
  /// <summary>
  /// Removes the inline option character found at <paramref name="nameIndex"/> from
  /// <paramref name="pattern"/>.  When that character is the only content of a
  /// "(?X)" construct the whole four character construct is removed instead.
  /// </summary>
  /// <param name="pattern">The pattern text; replaced with the shortened text.</param>
  /// <param name="nameIndex">
  /// Index of the option character.  On return it is the index where the removed
  /// construct used to start, or one less than before when only the option
  /// character was removed.
  /// </param>
  private static void RemoveOption(ref string pattern, ref int nameIndex) {
      bool isSoleOption =
          pattern[nameIndex - 1] == '?' &&
          nameIndex < (pattern.Length - 1) &&
          pattern[nameIndex + 1] == ')';

      if (!isSoleOption) {
          // other content shares the group: drop just the option character
          int optionIndex = nameIndex;
          nameIndex = optionIndex - 1;
          pattern = pattern.Remove(optionIndex, 1);
          return;
      }

      // the group holds nothing but this option: drop "(?X)" entirely
      int groupStart = nameIndex - 2;
      pattern = pattern.Remove(groupStart, 4);
      nameIndex = groupStart;
  }
  /// <summary>
  /// Returns the decimal text of a number drawn from <c>r</c> that is at least
  /// Int32.MaxValue / 2 and less than Int32.MaxValue.
  /// </summary>
  private static string GetRandomString() {
      int lowerBound = Int32.MaxValue / 2;
      int value = r.Next(lowerBound, Int32.MaxValue);
      return value.ToString();
  }
  956. private static string UnescapeGroups(Match m, string text) {
  957. for (int i = 0; i < text.Length; i++) {
  958. if (text[i] == '\\') {
  959. StringBuilder sb = new StringBuilder(text, 0, i, text.Length);
  960. do {
  961. if (text[i] == '\\') {
  962. i++;
  963. if (i == text.Length) { sb.Append('\\'); break; }
  964. switch (text[i]) {
  965. case 'n': sb.Append('\n'); break;
  966. case 'r': sb.Append('\r'); break;
  967. case 't': sb.Append('\t'); break;
  968. case '\\': sb.Append('\\'); break;
  969. case '\'': sb.Append('\''); break;
  970. case '