PageRenderTime 55ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/Languages/IronPython/IronPython.Modules/re.cs

http://github.com/IronLanguages/main
C# | 1316 lines | 1032 code | 190 blank | 94 comment | 270 complexity | 698d40f46d44baf8725815993f1c2944 MD5 | raw file
Possible License(s): CPL-1.0, BSD-3-Clause, ISC, GPL-2.0, MPL-2.0-no-copyleft-exception

Large files are truncated, but you can click here to view the full file

  1. /* ****************************************************************************
  2. *
  3. * Copyright (c) Microsoft Corporation.
  4. *
  5. * This source code is subject to terms and conditions of the Apache License, Version 2.0. A
  6. * copy of the license can be found in the License.html file at the root of this distribution. If
  7. * you cannot locate the Apache License, Version 2.0, please send an email to
  8. * ironpy@microsoft.com. By using this source code in any fashion, you are agreeing to be bound
  9. * by the terms of the Apache License, Version 2.0.
  10. *
  11. * You must not remove this notice, or any other, from this software.
  12. *
  13. *
  14. * ***************************************************************************/
  15. using System;
  16. using System.Collections;
  17. using System.Collections.Generic;
  18. using System.Diagnostics;
  19. using System.Globalization;
  20. using System.Runtime.CompilerServices;
  21. using System.Runtime.InteropServices;
  22. using System.Text;
  23. using System.Text.RegularExpressions;
  24. using Microsoft.Scripting;
  25. using Microsoft.Scripting.Runtime;
  26. using Microsoft.Scripting.Utils;
  27. using IronPython.Runtime;
  28. using IronPython.Runtime.Exceptions;
  29. using IronPython.Runtime.Operations;
  30. using IronPython.Runtime.Types;
  31. [assembly: PythonModule("re", typeof(IronPython.Modules.PythonRegex))]
  32. namespace IronPython.Modules {
  33. /// <summary>
  34. /// Python regular expression module.
  35. /// </summary>
  36. public static class PythonRegex {
  // Cache of RE_Pattern instances keyed by PatternKey (built from the pattern text and
  // flags in GetPattern). It is replaced wholesale by purge().
  // NOTE(review): the constructor argument 100 is presumably the maximum number of cached
  // entries -- confirm against CacheDict.
  37. private static CacheDict<PatternKey, RE_Pattern> _cachedPatterns = new CacheDict<PatternKey, RE_Pattern>(100);
  /// <summary>
  /// Module (re)load hook: makes sure the module's "error" exception exists in
  /// <paramref name="dict"/> and registers the module's "_pickle" entry in the copy_reg
  /// dispatch table under the Python type of <see cref="RE_Pattern"/>.
  /// </summary>
  /// <param name="context">Python context the module is being loaded into.</param>
  /// <param name="dict">The module dictionary.</param>
  [SpecialName]
  public static void PerformModuleReload(PythonContext/*!*/ context, PythonDictionary/*!*/ dict) {
      context.EnsureModuleException("reerror", dict, "error", "re");

      // Fetch the table first, then the key, then the value: same evaluation order as a
      // single indexed assignment.
      var dispatchTable = PythonCopyReg.GetDispatchTable(context.SharedContext);
      var patternType = DynamicHelpers.GetPythonTypeFromType(typeof(RE_Pattern));
      dispatchTable[patternType] = dict["_pickle"];
  }
  // Shared pseudo-random generator, seeded from the millisecond component of the local
  // time at type initialization. No use of it is visible in this part of the file.
  // NOTE(review): DateTime.Now.Millisecond yields only 1000 distinct seeds -- confirm this
  // is acceptable for whatever consumes it.
  43. private static readonly Random r = new Random(DateTime.Now.Millisecond);
  44. #region CONSTANTS
  // Flag bits exposed by the module. Each short form has the same value as the long form
  // listed in the same position below (I/IGNORECASE, L/LOCALE, M/MULTILINE, S/DOTALL,
  // U/UNICODE, X/VERBOSE). FlagsToOption/OptionToFlags translate between these bits and
  // RegexOptions; UNICODE is not referenced by either of them.
  45. // short forms
  46. public const int I = 0x02;
  47. public const int L = 0x04;
  48. public const int M = 0x08;
  49. public const int S = 0x10;
  50. public const int U = 0x20;
  51. public const int X = 0x40;
  52. // long forms
  53. public const int IGNORECASE = 0x02;
  54. public const int LOCALE = 0x04;
  55. public const int MULTILINE = 0x08;
  56. public const int DOTALL = 0x10;
  57. public const int UNICODE = 0x20;
  58. public const int VERBOSE = 0x40;
  59. #endregion
  60. #region Public API Surface
  /// <summary>
  /// Returns a pattern object for <paramref name="pattern"/>, requesting the "compiled"
  /// variant from <c>GetPattern</c>.
  /// </summary>
  /// <param name="context">Calling code context.</param>
  /// <param name="pattern">Pattern text or an existing pattern object.</param>
  /// <param name="flags">Bitwise combination of the module flag constants.</param>
  /// <returns>The pattern object.</returns>
  /// <remarks>An <see cref="ArgumentException"/> is rethrown as the module's error type.</remarks>
  public static RE_Pattern compile(CodeContext/*!*/ context, object pattern, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern;
      try {
          compiledPattern = GetPattern(context, pattern, flags, true);
      } catch (ArgumentException ex) {
          throw PythonExceptions.CreateThrowable(error(context), ex.Message);
      }
      return compiledPattern;
  }
  // Public string constant naming the underlying regular-expression engine.
  68. public const string engine = "cli reg ex";
  /// <summary>
  /// Returns <paramref name="text"/> with a backslash inserted before every character for
  /// which <see cref="Char.IsLetterOrDigit(char)"/> is false.
  /// </summary>
  /// <param name="text">Text to escape; must not be null.</param>
  /// <returns>
  /// The escaped text, or the original string instance when nothing needs escaping.
  /// </returns>
  public static string escape(string text) {
      if (text == null) throw PythonOps.TypeError("text must not be None");

      // Locate the first character that needs escaping; if there is none the input is
      // returned untouched.
      int firstSpecial = 0;
      while (firstSpecial < text.Length && Char.IsLetterOrDigit(text[firstSpecial])) {
          firstSpecial++;
      }
      if (firstSpecial == text.Length) {
          return text;
      }

      // Seed the builder with the clean prefix, then copy the rest one character at a time.
      StringBuilder escaped = new StringBuilder(text, 0, firstSpecial, text.Length);
      for (int pos = firstSpecial; pos < text.Length; pos++) {
          char current = text[pos];
          if (!Char.IsLetterOrDigit(current)) {
              escaped.Append('\\');
          }
          escaped.Append(current);
      }
      return escaped.ToString();
  }
  /// <summary>
  /// Finds all matches of <paramref name="pattern"/> in the text <paramref name="string"/>
  /// and returns them in the shape produced by <c>FixFindAllMatch</c>.
  /// </summary>
  /// <param name="context">Calling code context.</param>
  /// <param name="pattern">Pattern text or pattern object.</param>
  /// <param name="string">Text to scan.</param>
  /// <param name="flags">Module flag bits used when the pattern has to be built.</param>
  public static List findall(CodeContext/*!*/ context, object pattern, string @string, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      ValidateString(@string, "string");

      MatchCollection found = compiledPattern.FindAllWorker(context, @string, 0, @string.Length);
      return FixFindAllMatch(compiledPattern, found, null);
  }
  /// <summary>
  /// Byte-sequence variant of <c>findall</c>: scans <paramref name="string"/> and converts
  /// results with the converter chosen by <c>FindMaker</c> (if any).
  /// </summary>
  /// <param name="context">Calling code context.</param>
  /// <param name="pattern">Pattern text or pattern object.</param>
  /// <param name="string">Bytes to scan.</param>
  /// <param name="flags">Module flag bits used when the pattern has to be built.</param>
  public static List findall(CodeContext context, object pattern, IList<byte> @string, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      ValidateString(@string, "string");

      MatchCollection found = compiledPattern.FindAllWorker(context, @string, 0, @string.Count);
      return FixFindAllMatch(compiledPattern, found, FindMaker(@string));
  }
  /// <summary>
  /// Chooses the converter applied to each matched string before it is returned to the
  /// caller.
  /// </summary>
  /// <param name="input">The object that was searched.</param>
  /// <returns>
  /// A delegate wrapping the matched text in a new ByteArray when <paramref name="input"/>
  /// is a ByteArray; otherwise null (no conversion).
  /// </returns>
  private static Func<string, object> FindMaker (object input) {
      if (!(input is ByteArray)) {
          return null;
      }
      return delegate (string matched) { return new ByteArray(matched.MakeByteArray()); };
  }
  /// <summary>
  /// Converts a MatchCollection into the list returned by findall.
  /// </summary>
  /// <param name="pat">Pattern that produced the matches; its group count picks the shape.</param>
  /// <param name="mc">The matches.</param>
  /// <param name="maker">Optional converter applied to every string placed in the result.</param>
  /// <returns>
  /// With two or more user groups: a list of tuples of group values (e.g.
  /// findall("(\d+)|(\w+)", "x = 99y") == [('', 'x'), ('99', ''), ('', 'y')]);
  /// with exactly one user group: a list of that group's values;
  /// with no user group: a list of the whole-match values.
  /// </returns>
  private static List FixFindAllMatch(RE_Pattern pat, MatchCollection mc, Func<string, object> maker) {
      object[] results = new object[mc.Count];

      // The CLR always reports group 0 (the entire expression), so N user groups show up
      // as N + 1 group numbers.
      int groupCount = pat._re.GetGroupNumbers().Length;

      for (int idx = 0; idx < mc.Count; idx++) {
          Match current = mc[idx];

          if (groupCount > 2) {
              // Several user groups: build one tuple per match, leaving out group 0.
              List<object> items = new List<object>();
              bool wholeMatchSkipped = false;
              foreach (Group g in current.Groups) {
                  if (!wholeMatchSkipped) {
                      wholeMatchSkipped = true;
                      continue;
                  }
                  items.Add(maker != null ? maker(g.Value) : g.Value);
              }
              results[idx] = PythonTuple.Make(items);
          } else if (groupCount == 2) {
              // Exactly one user group: report that group's text rather than the whole
              // match, e.g. re.findall(r"(\w+)\s+fish\b", "green fish") gives "green".
              string groupText = current.Groups[1].Value;
              results[idx] = maker != null ? maker(groupText) : groupText;
          } else {
              // No user groups: report the whole match.
              results[idx] = maker != null ? maker(current.Value) : current.Value;
          }
      }

      return List.FromArrayNoCopy(results);
  }
  /// <summary>
  /// Returns an iterator over match objects for every match of <paramref name="pattern"/>
  /// in <paramref name="string"/>.
  /// </summary>
  /// <param name="context">Calling code context.</param>
  /// <param name="pattern">Pattern text or pattern object.</param>
  /// <param name="string">Text to scan.</param>
  /// <param name="flags">Module flag bits used when the pattern has to be built.</param>
  public static object finditer(CodeContext/*!*/ context, object pattern, object @string, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");

      MatchCollection found = compiledPattern.FindAllWorker(context, subject, 0, subject.Length);
      return MatchIterator(found, compiledPattern, subject);
  }
  /// <summary>
  /// Module-level match: resolves <paramref name="pattern"/> to a pattern object and
  /// delegates to its <c>match</c> method.
  /// </summary>
  /// <returns>A match object, or null when there is no match.</returns>
  public static RE_Match match(CodeContext/*!*/ context, object pattern, object @string, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiledPattern.match(subject);
  }
  /// <summary>
  /// Module-level search: resolves <paramref name="pattern"/> to a pattern object and
  /// delegates to its <c>search</c> method.
  /// </summary>
  /// <returns>A match object, or null when there is no match.</returns>
  public static RE_Match search(CodeContext/*!*/ context, object pattern, object @string, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiledPattern.search(subject);
  }
  /// <summary>
  /// Module-level split: resolves <paramref name="pattern"/> to a pattern object and
  /// delegates to its <c>split</c> method.
  /// </summary>
  /// <param name="maxsplit">Passed through unchanged to the pattern's split.</param>
  [return: SequenceTypeInfo(typeof(string))]
  public static List split(CodeContext/*!*/ context, object pattern, object @string, [DefaultParameterValue(0)]int maxsplit, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiledPattern.split(subject, maxsplit);
  }
  /// <summary>
  /// Module-level sub: resolves <paramref name="pattern"/> to a pattern object and
  /// delegates to its <c>sub</c> method.
  /// </summary>
  /// <param name="repl">Replacement text or callable.</param>
  /// <param name="count">Passed through unchanged to the pattern's sub.</param>
  public static string sub(CodeContext/*!*/ context, object pattern, object repl, object @string, [DefaultParameterValue(0)]int count, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiledPattern.sub(context, repl, subject, count);
  }
  /// <summary>
  /// Module-level subn: resolves <paramref name="pattern"/> to a pattern object and
  /// delegates to its <c>subn</c> method.
  /// </summary>
  /// <param name="repl">Replacement text or callable.</param>
  /// <param name="count">Passed through unchanged to the pattern's subn.</param>
  public static object subn(CodeContext/*!*/ context, object pattern, object repl, object @string, [DefaultParameterValue(0)]int count, [DefaultParameterValue(0)]int flags) {
      RE_Pattern compiledPattern = GetPattern(context, ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiledPattern.subn(context, repl, subject, count);
  }
  /// <summary>
  /// Drops every cached pattern by swapping in a brand-new, empty cache instance.
  /// </summary>
  public static void purge() {
      CacheDict<PatternKey, RE_Pattern> emptyCache = new CacheDict<PatternKey, RE_Pattern>(100);
      _cachedPatterns = emptyCache;
  }
  173. #endregion
  174. #region Public classes
  /// <summary>
  /// Compiled reg-ex pattern: wraps a .NET <see cref="Regex"/> built from the pre-parsed
  /// Python pattern and exposes the Python pattern-object API (match, search, findall,
  /// finditer, split, sub, subn and the flags/groupindex/groups/pattern attributes).
  /// </summary>
  [PythonType]
  public class RE_Pattern : IWeakReferenceable {
      internal Regex _re;
      // Lazily built by the groupindex property.
      private PythonDictionary _groups;
      // Caller flags OR-ed with the flags derived from the pre-parsed pattern's options.
      private int _compileFlags;
      private WeakRefTracker _weakRefTracker;
      internal ParsedRegex _pre;

      internal RE_Pattern(CodeContext/*!*/ context, object pattern)
          : this(context, pattern, 0) {
      }

      internal RE_Pattern(CodeContext/*!*/ context, object pattern, int flags)
          : this(context, pattern, flags, false) {
      }

      /// <summary>
      /// Pre-parses <paramref name="pattern"/> and builds the underlying Regex.
      /// </summary>
      /// <param name="compiled">When true (and not on Silverlight) RegexOptions.Compiled is requested.</param>
      /// <remarks>An ArgumentException from Regex construction is rethrown as the module's error type.</remarks>
      internal RE_Pattern(CodeContext/*!*/ context, object pattern, int flags, bool compiled) {
          _pre = PreParseRegex(context, ValidatePatternAsString(pattern));
          try {
              flags |= OptionToFlags(_pre.Options);
              RegexOptions opts = FlagsToOption(flags);
  #if SILVERLIGHT
              this._re = new Regex(_pre.Pattern, opts);
  #else
              this._re = new Regex(_pre.Pattern, opts | (compiled ? RegexOptions.Compiled : RegexOptions.None));
  #endif
          } catch (ArgumentException e) {
              throw PythonExceptions.CreateThrowable(error(context), e.Message);
          }
          this._compileFlags = flags;
      }

      /// <summary>Clamps <paramref name="position"/> into [0, text.Length].</summary>
      private static int FixPosition(string text, int position) {
          if (position < 0) return 0;
          if (position > text.Length) return text.Length;
          return position;
      }

      /// <summary>Matches at the very start of <paramref name="text"/>; null when no match starts there.</summary>
      public RE_Match match(object text) {
          string input = ValidateString(text, "text");
          return RE_Match.makeMatch(_re.Match(input), this, input, 0, input.Length);
      }

      /// <summary>Matches at <paramref name="pos"/> (clamped to the text); null when no match starts there.</summary>
      public RE_Match match(object text, int pos) {
          string input = ValidateString(text, "text");
          pos = FixPosition(input, pos);
          return RE_Match.makeMatch(_re.Match(input, pos), this, input, pos, input.Length);
      }

      /// <summary>
      /// Matches at <paramref name="pos"/> considering only the text before
      /// <paramref name="endpos"/> (both clamped to the text).
      /// </summary>
      /// <returns>The match, or null when none starts at pos or when endpos is before pos.</returns>
      public RE_Match match(object text, [DefaultParameterValue(0)]int pos, int endpos) {
          string input = ValidateString(text, "text");
          pos = FixPosition(input, pos);
          endpos = FixPosition(input, endpos);
          // Regex.Match throws when the start index lies beyond the (truncated) input, so an
          // empty window has to be reported as "no match" explicitly.
          if (endpos < pos) return null;
          return RE_Match.makeMatch(
              _re.Match(input.Substring(0, endpos), pos),
              this,
              input,
              pos,
              endpos);
      }

      /// <summary>Searches the whole of <paramref name="text"/> for the first match.</summary>
      public RE_Match search(object text) {
          string input = ValidateString(text, "text");
          return RE_Match.make(_re.Match(input), this, input);
      }

      /// <summary>
      /// Searches for the first match starting at or after <paramref name="pos"/>; pos is
      /// clamped to the text so an out-of-range value cannot make Regex.Match throw.
      /// </summary>
      public RE_Match search(object text, int pos) {
          string input = ValidateString(text, "text");
          pos = FixPosition(input, pos);
          return RE_Match.make(_re.Match(input, pos), this, input, pos, input.Length);
      }

      /// <summary>
      /// Searches for the first match inside [pos, endpos) (both clamped to the text).
      /// </summary>
      /// <returns>The match, or null when there is none or when endpos is before pos.</returns>
      public RE_Match search(object text, int pos, int endpos) {
          string input = ValidateString(text, "text");
          pos = FixPosition(input, pos);
          endpos = FixPosition(input, endpos);
          if (endpos < pos) return null;
          // Only the engine sees the truncated text; the match object keeps the caller's
          // full string together with the effective pos/endpos.
          string window = endpos < input.Length ? input.Substring(0, endpos) : input;
          return RE_Match.make(_re.Match(window, pos), this, input, pos, endpos);
      }

      public object findall(CodeContext/*!*/ context, string @string) {
          return findall(context, @string, 0, null);
      }

      public object findall(CodeContext/*!*/ context, string @string, int pos) {
          return findall(context, @string, pos, null);
      }

      /// <summary>
      /// Returns all matches in <paramref name="string"/> starting at <paramref name="pos"/>,
      /// optionally ignoring text at or after <paramref name="endpos"/> (null = no limit).
      /// </summary>
      public object findall(CodeContext/*!*/ context, object @string, int pos, object endpos) {
          MatchCollection mc = FindAllWorker(context, ValidateString(@string, "text"), pos, endpos);
          return FixFindAllMatch(this, mc, FindMaker(@string));
      }

      /// <summary>
      /// Runs Regex.Matches over <paramref name="str"/>, truncated at
      /// <paramref name="endpos"/> when one is given, starting at <paramref name="pos"/>.
      /// Both bounds are clamped so out-of-range values do not raise
      /// ArgumentOutOfRangeException.
      /// </summary>
      internal MatchCollection FindAllWorker(CodeContext/*!*/ context, string str, int pos, object endpos) {
          string against = str;
          if (endpos != null) {
              int end = PythonContext.GetContext(context).ConvertToInt32(endpos);
              end = Math.Min(Math.Max(end, 0), against.Length);
              against = against.Substring(0, end);
          }
          int start = Math.Min(Math.Max(pos, 0), against.Length);
          return _re.Matches(against, start);
      }

      /// <summary>Byte-sequence variant: converts the bytes to a string and defers to the string overload.</summary>
      internal MatchCollection FindAllWorker(CodeContext/*!*/ context, IList<byte> str, int pos, object endpos) {
          return FindAllWorker(context, str.MakeString(), pos, endpos);
      }

      public object finditer(CodeContext/*!*/ context, object @string) {
          string input = ValidateString(@string, "string");
          return MatchIterator(FindAllWorker(context, input, 0, input.Length), this, input);
      }

      public object finditer(CodeContext/*!*/ context, object @string, int pos) {
          string input = ValidateString(@string, "string");
          return MatchIterator(FindAllWorker(context, input, pos, input.Length), this, input);
      }

      public object finditer(CodeContext/*!*/ context, object @string, int pos, int endpos) {
          string input = ValidateString(@string, "string");
          return MatchIterator(FindAllWorker(context, input, pos, endpos), this, input);
      }

      /// <summary>
      /// Splits <paramref name="string"/> around non-empty matches of the pattern. Captured
      /// groups of each separator are inserted into the result (null for groups that did
      /// not participate).
      /// </summary>
      /// <param name="maxsplit">
      /// 0 = no limit; a positive value stops after that many splits; a negative value
      /// performs no splits at all.
      /// </param>
      [return: SequenceTypeInfo(typeof(string))]
      public List split(object @string, [DefaultParameterValue(0)]int maxsplit) {
          List result = new List();

          // fast path for negative maxsplit ( == "make no splits")
          if (maxsplit < 0) {
              result.AddNoLock(ValidateString(@string, "string"));
              return result;
          }

          string theStr = ValidateString(@string, "string");
          int lastPos = 0;   // start of the string, or first position *after* the last match
          int nSplits = 0;   // how many splits have occurred?
          foreach (Match m in _re.Matches(theStr)) {
              if (m.Length == 0) {
                  continue;  // empty matches never split
              }

              // text between the previous separator and this one
              result.AddNoLock(theStr.Substring(lastPos, m.Index - lastPos));

              // captured groups of the separator (null when a group did not participate)
              for (int i = 1; i < m.Groups.Count; i++) {
                  Group g = m.Groups[i];
                  result.AddNoLock(g.Success ? g.Value : null);
              }

              lastPos = m.Index + m.Length;
              nSplits++;
              if (nSplits == maxsplit) {
                  break;
              }
          }

          // tail following the last separator
          result.AddNoLock(theStr.Substring(lastPos));
          return result;
      }

      /// <summary>
      /// Replaces matches in <paramref name="string"/> using <paramref name="repl"/> (text
      /// with group references, or a callable receiving the match object).
      /// </summary>
      /// <param name="count">Maximum number of replacements; 0 means all.</param>
      public string sub(CodeContext/*!*/ context, object repl, object @string, [DefaultParameterValue(0)]int count) {
          int ignored;
          return Substitute(context, repl, @string, count, out ignored);
      }

      /// <summary>
      /// Same as <c>sub</c> but returns a tuple (new_string, number_of_substitutions).
      /// </summary>
      public object subn(CodeContext/*!*/ context, object repl, object @string, [DefaultParameterValue(0)]int count) {
          int totalCount;
          string res = Substitute(context, repl, @string, count, out totalCount);
          return PythonTuple.MakeTuple(res, totalCount);
      }

      /// <summary>Shared worker for sub/subn; reports how many replacements were made.</summary>
      private string Substitute(CodeContext/*!*/ context, object repl, object @string, int count, out int replacements) {
          if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");
          // if 'count' is omitted or 0, all occurrences are replaced
          if (count == 0) count = Int32.MaxValue;

          // A textual replacement may arrive as string, ExtensibleString or Bytes; anything
          // else is treated as a callable below.
          string replacement = repl as string;
          if (replacement == null) {
              if (repl is ExtensibleString) {
                  replacement = ((ExtensibleString)repl).Value;
              } else if (repl is Bytes) {
                  replacement = ((Bytes)repl).ToString();
              }
          }

          Match prev = null;
          int made = 0;
          string input = ValidateString(@string, "string");
          string result = _re.Replace(
              input,
              delegate(Match match) {
                  // from the docs: Empty matches for the pattern are replaced
                  // only when not adjacent to a previous match
                  if (String.IsNullOrEmpty(match.Value) && prev != null &&
                      (prev.Index + prev.Length) == match.Index) {
                      return "";
                  }
                  prev = match;
                  made++;

                  if (replacement != null) return UnescapeGroups(match, replacement);
                  return PythonCalls.Call(context, repl, RE_Match.make(match, this, input)) as string;
              },
              count);
          replacements = made;
          return result;
      }

      /// <summary>The flags the pattern was built with, including those derived from the pattern text.</summary>
      public int flags {
          get {
              return _compileFlags;
          }
      }

      /// <summary>
      /// Dictionary mapping each user-visible group name to its group number. Numeric names
      /// and the internally mangled names of unnamed groups are left out.
      /// </summary>
      public PythonDictionary groupindex {
          get {
              if (_groups == null) {
                  PythonDictionary d = new PythonDictionary();
                  string[] names = _re.GetGroupNames();
                  int[] nums = _re.GetGroupNumbers();
                  for (int i = 1; i < names.Length; i++) {
                      string name = names[i];
                      // Ordinal comparison: the mangling prefix is an identifier, not
                      // linguistic text.
                      if (Char.IsDigit(name[0]) || name.StartsWith(_mangledNamedGroup, StringComparison.Ordinal)) {
                          continue;
                      }
                      d[name] = nums[i];
                  }
                  _groups = d;
              }
              return _groups;
          }
      }

      /// <summary>Number of groups in the pattern, not counting group 0.</summary>
      public int groups {
          get {
              return _re.GetGroupNumbers().Length - 1;
          }
      }

      /// <summary>The pattern text as originally supplied by the user.</summary>
      public string pattern {
          get {
              return _pre.UserPattern;
          }
      }

      /// <summary>Two patterns are equal when their user pattern text and flags are equal.</summary>
      public override bool Equals(object obj) {
          RE_Pattern other = obj as RE_Pattern;
          if (other == null) {
              return false;
          }
          return other.pattern == pattern && other.flags == flags;
      }

      public override int GetHashCode() {
          return pattern.GetHashCode() ^ flags;
      }

      #region IWeakReferenceable Members

      WeakRefTracker IWeakReferenceable.GetWeakRef() {
          return _weakRefTracker;
      }

      bool IWeakReferenceable.SetWeakRef(WeakRefTracker value) {
          _weakRefTracker = value;
          return true;
      }

      void IWeakReferenceable.SetFinalizer(WeakRefTracker value) {
          ((IWeakReferenceable)this).SetWeakRef(value);
      }

      #endregion
  }
  /// <summary>
  /// Pickle support for pattern objects: returns (compile, (pattern_text, flags)), where
  /// compile is looked up in the imported "re" module.
  /// </summary>
  /// <param name="context">Calling code context.</param>
  /// <param name="pattern">The pattern being pickled.</param>
  /// <exception cref="InvalidOperationException">
  /// The import did not yield a module, or the module has no "compile" entry.
  /// </exception>
  public static PythonTuple _pickle(CodeContext/*!*/ context, RE_Pattern pattern) {
      object scope = Importer.ImportModule(context, new PythonDictionary(), "re", false, 0);

      PythonModule module = scope as PythonModule;
      object compile;
      if (module == null || !module.__dict__.TryGetValue("compile", out compile)) {
          throw new InvalidOperationException("couldn't find compile method");
      }

      PythonTuple constructorArgs = PythonTuple.MakeTuple(pattern.pattern, pattern.flags);
      return PythonTuple.MakeTuple(compile, constructorArgs);
  }
  /// <summary>
  /// Python match object: wraps a successful .NET <see cref="Match"/> together with the
  /// pattern that produced it, the searched text and the pos/endpos it was created with.
  /// </summary>
  [PythonType]
  public class RE_Match {
      RE_Pattern _pattern;
      private Match _m;
      private string _text;
      // -1 = not computed yet, 0 = no group matched, otherwise the cached lastindex.
      private int _lastindex = -1;
      private int _pos, _endPos;

      #region Internal makers

      /// <summary>Wraps <paramref name="m"/> when it succeeded (pos 0, endpos = input length); otherwise null.</summary>
      internal static RE_Match make(Match m, RE_Pattern pattern, string input) {
          if (m.Success) return new RE_Match(m, pattern, input, 0, input.Length);
          return null;
      }

      /// <summary>Wraps <paramref name="m"/> when it succeeded; otherwise null.</summary>
      internal static RE_Match make(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
          if (m.Success) return new RE_Match(m, pattern, input, offset, endpos);
          return null;
      }

      /// <summary>
      /// Wraps <paramref name="m"/> only when it succeeded AND starts exactly at
      /// <paramref name="offset"/>; otherwise null.
      /// </summary>
      internal static RE_Match makeMatch(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
          if (m.Success && m.Index == offset) return new RE_Match(m, pattern, input, offset, endpos);
          return null;
      }

      #endregion

      #region Public ctors

      public RE_Match(Match m, RE_Pattern pattern, string text) {
          _m = m;
          _pattern = pattern;
          _text = text;
      }

      public RE_Match(Match m, RE_Pattern pattern, string text, int pos, int endpos) {
          _m = m;
          _pattern = pattern;
          _text = text;
          _pos = pos;
          _endPos = endpos;
      }

      #endregion

      // public override bool __nonzero__() {
      // return m.Success;
      // }

      #region Public API Surface

      /// <summary>Index just past the end of the whole match.</summary>
      public int end() {
          return _m.Index + _m.Length;
      }

      /// <summary>Index of the start of the whole match.</summary>
      public int start() {
          return _m.Index;
      }

      /// <summary>Start index of <paramref name="group"/>, or -1 when the group did not participate.</summary>
      public int start(object group) {
          Group g = _m.Groups[GetGroupIndex(group)];
          return g.Success ? g.Index : -1;
      }

      /// <summary>End index of <paramref name="group"/>, or -1 when the group did not participate.</summary>
      public int end(object group) {
          Group g = _m.Groups[GetGroupIndex(group)];
          return g.Success ? g.Index + g.Length : -1;
      }

      /// <summary>
      /// With a single argument returns that group's text; with several returns a tuple of
      /// the groups' texts (null for groups that did not participate).
      /// </summary>
      public object group(object index, params object[] additional) {
          if (additional.Length == 0) {
              return group(index);
          }

          object[] res = new object[additional.Length + 1];
          res[0] = group(index);
          for (int i = 1; i < res.Length; i++) {
              res[i] = group(additional[i - 1]);
          }
          return PythonTuple.MakeTuple(res);
      }

      /// <summary>Text of the group identified by number or name; null when it did not participate.</summary>
      public string group(object index) {
          Group g = _m.Groups[GetGroupIndex(index)];
          return g.Success ? g.Value : null;
      }

      /// <summary>Text of the whole match.</summary>
      public string group() {
          return group(0);
      }

      [return: SequenceTypeInfo(typeof(string))]
      public PythonTuple groups() {
          return groups(null);
      }

      /// <summary>
      /// Tuple of all groups from 1 upwards; <paramref name="default"/> is used for groups
      /// that did not participate.
      /// </summary>
      public PythonTuple groups(object @default) {
          object[] ret = new object[_m.Groups.Count - 1];
          for (int i = 1; i < _m.Groups.Count; i++) {
              Group g = _m.Groups[i];
              ret[i - 1] = g.Success ? g.Value : @default;
          }
          return PythonTuple.MakeTuple(ret);
      }

      /// <summary>
      /// Expands a replacement template: \N (single digit) and \g&lt;name&gt; are replaced
      /// by the corresponding group's text; \n, \r, \t and \\ are translated; other escaped
      /// characters are dropped.
      /// </summary>
      /// <param name="template">The template text.</param>
      public string expand(object template) {
          string strTmp = ValidateString(template, "template");
          StringBuilder res = new StringBuilder();
          for (int i = 0; i < strTmp.Length; i++) {
              if (strTmp[i] != '\\') { res.Append(strTmp[i]); continue; }

              // trailing backslash: keep it literally
              if (++i == strTmp.Length) { res.Append(strTmp[i - 1]); continue; }

              if (Char.IsDigit(strTmp[i])) {
                  AppendGroup(res, (int)(strTmp[i] - '0'));
              } else if (strTmp[i] == 'g') {
                  if (++i == strTmp.Length) { res.Append("\\g"); return res.ToString(); }
                  if (strTmp[i] != '<') {
                      // NOTE(review): this emits "\g<" and drops the character that followed
                      // "\g"; kept as-is, but it looks unintended -- confirm.
                      res.Append("\\g<"); continue;
                  }

                  int nameStart = i + 1;
                  int nameEnd = strTmp.IndexOf('>', nameStart);
                  if (nameEnd == -1) {
                      // Unterminated "\g<name": previously this ran off the end of the
                      // template and threw IndexOutOfRangeException. Emit the text literally,
                      // as the other truncated forms above do.
                      res.Append("\\g<").Append(strTmp, nameStart, strTmp.Length - nameStart);
                      break;
                  }

                  string name = strTmp.Substring(nameStart, nameEnd - nameStart);
                  AppendGroup(res, _pattern._re.GroupNumberFromName(name));
                  i = nameEnd;   // the loop increment steps past '>'
              } else {
                  switch (strTmp[i]) {
                      case 'n': res.Append('\n'); break;
                      case 'r': res.Append('\r'); break;
                      case 't': res.Append('\t'); break;
                      case '\\': res.Append('\\'); break;
                  }
              }
          }
          return res.ToString();
      }

      [return: DictionaryTypeInfo(typeof(string), typeof(string))]
      public PythonDictionary groupdict() {
          return groupdict(null);
      }

      /// <summary>True when every character of <paramref name="name"/> is numeric (also true for "").</summary>
      private static bool IsGroupNumber(string name) {
          for (int i = 0; i < name.Length; i++) {
              if (!Char.IsNumber(name[i])) return false;
          }
          return true;
      }

      [return: DictionaryTypeInfo(typeof(string), typeof(string))]
      public PythonDictionary groupdict([NotNull]string value) {
          return groupdict((object)value);
      }

      /// <summary>
      /// Dictionary of named groups to their matched text; <paramref name="value"/> is used
      /// for groups without captures. Numbered groups and the internally mangled names of
      /// unnamed groups are not reported (the same filter groupindex applies).
      /// </summary>
      [return: DictionaryTypeInfo(typeof(string), typeof(object))]
      public PythonDictionary groupdict(object value) {
          string[] groupNames = this._pattern._re.GetGroupNames();
          Debug.Assert(groupNames.Length == this._m.Groups.Count);

          PythonDictionary d = new PythonDictionary();
          for (int i = 0; i < groupNames.Length; i++) {
              string name = groupNames[i];
              if (IsGroupNumber(name)) continue; // python doesn't report group numbers
              if (name.StartsWith(_mangledNamedGroup, StringComparison.Ordinal)) continue; // nor our mangled unnamed groups

              if (_m.Groups[i].Captures.Count != 0) {
                  d[name] = _m.Groups[i].Value;
              } else {
                  d[name] = value;
              }
          }
          return d;
      }

      [return: SequenceTypeInfo(typeof(int))]
      public PythonTuple span() {
          return PythonTuple.MakeTuple(this.start(), this.end());
      }

      [return: SequenceTypeInfo(typeof(int))]
      public PythonTuple span(object group) {
          return PythonTuple.MakeTuple(this.start(group), this.end(group));
      }

      /// <summary>The pos value this match object was created with.</summary>
      public int pos {
          get {
              return _pos;
          }
      }

      /// <summary>The endpos value this match object was created with.</summary>
      public int endpos {
          get {
              return _endPos;
          }
      }

      /// <summary>The text this match object was created with.</summary>
      public string @string {
          get {
              return _text;
          }
      }

      /// <summary>Tuple of (start, end) pairs for every group, including group 0.</summary>
      public PythonTuple regs {
          get {
              object[] res = new object[_m.Groups.Count];
              for (int i = 0; i < res.Length; i++) {
                  res[i] = PythonTuple.MakeTuple(start(i), end(i));
              }
              return PythonTuple.MakeTuple(res);
          }
      }

      /// <summary>The pattern object that produced this match.</summary>
      public RE_Pattern re {
          get {
              return _pattern;
          }
      }

      /// <summary>
      /// Index of the last matched group that is not nested inside an earlier matched
      /// group, or null when no group matched. The value is computed once and cached.
      /// </summary>
      public object lastindex {
          get {
              // -1 : initial value of lastindex
              // 0 : no match found
              // other : the true lastindex
              if (_lastindex == -1) {
                  int total = _m.Groups.Count;
                  int i = 1;
                  while (i < total) {
                      Group candidate = _m.Groups[i];
                      if (!candidate.Success) {
                          i++;
                          continue;
                      }

                      _lastindex = i;
                      int end = candidate.Index + candidate.Length;
                      i++;
                      // Match.Groups also contains "lower" level groups; skip any group
                      // starting before the end of this one, whether or not it matched.
                      while (i < total && _m.Groups[i].Index < end) {
                          i++;
                      }
                  }

                  if (_lastindex == -1) {
                      _lastindex = 0;
                  }
              }

              if (_lastindex == 0) {
                  return null;
              }
              return _lastindex;
          }
      }

      /// <summary>Name of the group reported by lastindex, or null when lastindex is null.</summary>
      public string lastgroup {
          get {
              if (lastindex == null) return null;

              // when group was not explicitly named, RegEx assigns the number as name
              // This is different from C-Python, which returns None in such cases
              return this._pattern._re.GroupNameFromNumber((int)lastindex);
          }
      }

      #endregion

      #region Private helper functions

      /// <summary>Appends the text of group <paramref name="index"/> to <paramref name="sb"/>.</summary>
      private void AppendGroup(StringBuilder sb, int index) {
          sb.Append(_m.Groups[index].Value);
      }

      /// <summary>
      /// Resolves a group given by number or by name to its number; raises IndexError
      /// ("no such group") when the result is outside the match's groups.
      /// </summary>
      private int GetGroupIndex(object group) {
          int grpIndex;
          if (!Converter.TryConvertToInt32(group, out grpIndex)) {
              grpIndex = _pattern._re.GroupNumberFromName(ValidateString(group, "group"));
          }

          if (grpIndex < 0 || grpIndex >= _m.Groups.Count) {
              throw PythonOps.IndexError("no such group");
          }
          return grpIndex;
      }

      #endregion
  }
  696. #endregion
  697. #region Private helper functions
  /// <summary>
  /// Convenience overload of <c>GetPattern</c> that does not request a compiled Regex.
  /// </summary>
  private static RE_Pattern GetPattern(CodeContext/*!*/ context, object pattern, int flags) {
      const bool compiled = false;
      return GetPattern(context, pattern, flags, compiled);
  }
  /// <summary>
  /// Resolves <paramref name="pattern"/> to a pattern object, consulting and updating the
  /// pattern cache.
  /// </summary>
  /// <param name="context">Calling code context.</param>
  /// <param name="pattern">An existing RE_Pattern (returned as-is) or pattern text.</param>
  /// <param name="flags">Module flag bits; part of the cache key.</param>
  /// <param name="compiled">
  /// When true (and not on Silverlight), a cached entry is reused only if its Regex was
  /// built with RegexOptions.Compiled; otherwise a new pattern replaces it.
  /// </param>
  private static RE_Pattern GetPattern(CodeContext/*!*/ context, object pattern, int flags, bool compiled) {
      RE_Pattern res = pattern as RE_Pattern;
      if (res != null) {
          return res;
      }

      string strPattern = ValidatePatternAsString(pattern);
      PatternKey key = new PatternKey(strPattern, flags);

      // purge() swaps in a new cache object, so read the field once: the object that is
      // locked must be the same one that is read and written below.
      CacheDict<PatternKey, RE_Pattern> cache = _cachedPatterns;
      lock (cache) {
          if (cache.TryGetValue(key, out res)) {
  #if SILVERLIGHT
              return res;
  #else
              if (!compiled || (res._re.Options & RegexOptions.Compiled) == RegexOptions.Compiled) {
                  return res;
              }
  #endif
          }

          res = new RE_Pattern(context, strPattern, flags, compiled);
          cache[key] = res;
          return res;
      }
  }
  /// <summary>
  /// Lazily yields a match object (pos 0, endpos = input length) for each entry of
  /// <paramref name="matches"/>, in order.
  /// </summary>
  private static IEnumerator MatchIterator(MatchCollection matches, RE_Pattern pattern, string input) {
      int index = 0;
      while (index < matches.Count) {
          Match current = matches[index];
          yield return RE_Match.make(current, pattern, input, 0, input.Length);
          index++;
      }
  }
  /// <summary>
  /// Translates module flag bits into the equivalent <see cref="RegexOptions"/>.
  /// </summary>
  /// <param name="flags">Bitwise combination of the module flag constants.</param>
  /// <returns>
  /// IgnoreCase, Multiline, Singleline and IgnorePatternWhitespace set according to
  /// IGNORECASE, MULTILINE, DOTALL and VERBOSE.
  /// </returns>
  private static RegexOptions FlagsToOption(int flags) {
      RegexOptions result = RegexOptions.None;

      if ((flags & IGNORECASE) != 0) {
          result |= RegexOptions.IgnoreCase;
      }
      if ((flags & MULTILINE) != 0) {
          result |= RegexOptions.Multiline;
      }
      if ((flags & LOCALE) == 0) {
          // NOTE(review): CultureInvariant is never set on 'result' before this point, so
          // clearing it has no effect -- confirm whether setting it was intended.
          result &= ~RegexOptions.CultureInvariant;
      }
      if ((flags & DOTALL) != 0) {
          result |= RegexOptions.Singleline;
      }
      if ((flags & VERBOSE) != 0) {
          result |= RegexOptions.IgnorePatternWhitespace;
      }

      return result;
  }
  /// <summary>
  /// Translates <see cref="RegexOptions"/> into module flag bits.
  /// </summary>
  /// <param name="options">Options to translate.</param>
  /// <returns>
  /// IGNORECASE, MULTILINE, DOTALL and VERBOSE for the corresponding options, plus LOCALE
  /// whenever CultureInvariant is NOT set.
  /// </returns>
  private static int OptionToFlags(RegexOptions options) {
      int flags = (options & RegexOptions.IgnoreCase) != 0 ? IGNORECASE : 0;
      flags |= (options & RegexOptions.Multiline) != 0 ? MULTILINE : 0;
      flags |= (options & RegexOptions.CultureInvariant) == 0 ? LOCALE : 0;
      flags |= (options & RegexOptions.Singleline) != 0 ? DOTALL : 0;
      flags |= (options & RegexOptions.IgnorePatternWhitespace) != 0 ? VERBOSE : 0;
      return flags;
  }
  /// <summary>
  /// Holder for a pre-parsed pattern: the text as the user wrote it, the text handed to
  /// the .NET Regex, and the options attached to it.
  /// </summary>
  internal class ParsedRegex {
      // Pattern exactly as supplied to the constructor.
      public string UserPattern;
      // Pattern text for the .NET engine; not assigned by the constructor.
      public string Pattern;
      // Defaults to CultureInvariant.
      public RegexOptions Options = RegexOptions.CultureInvariant;

      public ParsedRegex(string pattern) {
          UserPattern = pattern;
      }
  }
  // Characters PreParseRegex scans for (via IndexOfAny) while walking the pattern text.
  764. private static char[] _preParsedChars = new[] { '(', '{', '[', ']' };
  // Name prefix identifying groups that were given an artificial name; groupindex skips
  // group names starting with it.
  765. private const string _mangledNamedGroup = "___PyRegexNameMangled";
  766. /// <summary>
  767. /// Preparses a regular expression text returning a ParsedRegex class
  768. /// that can be used for further regular expressions.
  769. /// </summary>
  770. private static ParsedRegex PreParseRegex(CodeContext/*!*/ context, string pattern) {
  771. ParsedRegex res = new ParsedRegex(pattern);
  772. //string newPattern;
  773. int cur = 0, nameIndex;
  774. int curGroup = 0;
  775. bool isCharList = false;
  776. bool containsNamedGroup = false;
  777. for (; ; ) {
  778. nameIndex = pattern.IndexOfAny(_preParsedChars, cur);
  779. if (nameIndex > 0 && pattern[nameIndex - 1] == '\\') {
  780. int curIndex = nameIndex - 2;
  781. int backslashCount = 1;
  782. while (curIndex >= 0 && pattern[curIndex] == '\\') {
  783. backslashCount++;
  784. curIndex--;
  785. }
  786. // odd number of back slashes, this is an optional
  787. // paren that we should ignore.
  788. if ((backslashCount & 0x01) != 0) {
  789. cur++;
  790. continue;
  791. }
  792. }
  793. if (nameIndex == -1) break;
  794. if (nameIndex == pattern.Length - 1) break;
  795. switch (pattern[nameIndex]) {
  796. case '{':
  797. if (pattern[++nameIndex] == ',') {
  798. // no beginning specified for the n-m quntifier, add the
  799. // default 0 value.
  800. pattern = pattern.Insert(nameIndex, "0");
  801. }
  802. break;
  803. case '[':
  804. nameIndex++;
  805. isCharList = true;
  806. break;
  807. case ']':
  808. nameIndex++;
  809. isCharList = false;
  810. break;
  811. case '(':
  812. // make sure we're not dealing with [(]
  813. if (!isCharList) {
  814. switch (pattern[++nameIndex]) {
  815. case '?':
  816. // extension syntax
  817. if (nameIndex == pattern.Length - 1) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");
  818. switch (pattern[++nameIndex]) {
  819. case 'P':
  820. // named regex, .NET doesn't expect the P so we'll remove it;
  821. // also, once we see a named group i.e. ?P then we need to start artificially
  822. // naming all unnamed groups from then on---this is to get around the fact that
  823. // the CLR RegEx support orders all the unnamed groups before all the named
  824. // groups, even if the named groups are before the unnamed ones in the pattern;
  825. // the artificial naming preserves the order of the groups and thus the order of
  826. // the matches
  827. if (nameIndex + 1 < pattern.Length && pattern[nameIndex + 1] == '=') {
  828. // match whatever was previously matched by the named group
  829. // remove the (?P=
  830. pattern = pattern.Remove(nameIndex - 2, 4);
  831. pattern = pattern.Insert(nameIndex - 2, "\\k<");
  832. int tmpIndex = nameIndex;
  833. while (tmpIndex < pattern.Length && pattern[tmpIndex] != ')')
  834. tmpIndex++;
  835. if (tmpIndex == pattern.Length) throw PythonExceptions.CreateThrowable(error(context), "unexpected end of regex");
  836. pattern = pattern.Substring(0, tmpIndex) + ">" + pattern.Substring(tmpIndex + 1);
  837. } else {
  838. containsNamedGroup = true;
  839. pattern = pattern.Remove(nameIndex, 1);
  840. }
  841. break;
  842. case 'i':
  843. res.Options |= RegexOptions.IgnoreCase;
  844. RemoveOption(ref pattern, ref nameIndex);
  845. break;
  846. case 'L':
  847. res.Options &= ~(RegexOptions.CultureInvariant);
  848. RemoveOption(ref pattern, ref nameIndex);
  849. break;
  850. case 'm': res.Options |= RegexOptions.Multiline;
  851. RemoveOption(ref pattern, ref nameIndex);
  852. break;
  853. case 's': res.Options |= RegexOptions.Singleline;
  854. RemoveOption(ref pattern, ref nameIndex);
  855. break;
  856. case 'u':
  857. // specify unicode; not relevant and not valid under .NET as we're always unicode
  858. // -- so the option needs to be removed
  859. RemoveOption(ref pattern, ref nameIndex);
  860. break;
  861. case 'x': res.Options |= RegexOptions.IgnorePatternWhitespace;
  862. RemoveOption(ref pattern, ref nameIndex);
  863. break;
  864. case ':': break; // non-capturing
  865. case '=': break; // look ahead assertion
  866. case '<': break; // positive look behind assertion
  867. case '!': break; // negative look ahead assertion
  868. case '#': break; // inline comment
  869. case '(':
  870. // conditional match alternation (?(id/name)yes-pattern|no-pattern)
  871. // move past ?( so we don't preparse the name.
  872. nameIndex++;
  873. break;
  874. default: throw PythonExceptions.CreateThrowable(error(context), "Unrecognized extension " + pattern[nameIndex]);
  875. }
  876. break;
  877. default:
  878. // just another group
  879. curGroup++;
  880. if (containsNamedGroup) {
  881. // need to name this unnamed group
  882. pattern = pattern.Insert(nameIndex, "?<" + _mangledNamedGroup + GetRandomString() + ">");
  883. }
  884. break;
  885. }
  886. } else {
  887. nameIndex++;
  888. }
  889. break;
  890. }
  891. cur = nameIndex;
  892. }
  893. cur = 0;
  894. for (; ; ) {
  895. nameIndex = pattern.IndexOf('\\', cur);
  896. if (nameIndex == -1 || nameIndex == pattern.Length - 1) break;
  897. cur = ++nameIndex;
  898. char curChar = pattern[cur];
  899. switch (curChar) {
  900. case 'x':
  901. case 'u':
  902. case 'a':
  903. case 'b':
  904. case 'e':
  905. case 'f':
  906. case 'k':
  907. case 'n':
  908. case 'r':
  909. case 't':
  910. case 'v':
  911. case 'c':
  912. case 's':
  913. case 'W':
  914. case 'w':
  915. case 'p':
  916. case 'P':
  917. case 'S':
  918. case 'd':
  919. case 'D':
  920. case 'A':
  921. case 'B':
  922. case '\\':
  923. // known escape sequences, leave escaped.
  924. break;
  925. case 'Z':
  926. // /Z matches "end of string" in Python, replace with /z which is the .NET equivalent
  927. pattern = pattern.Remove(cur, 1).Insert(cur, "z");
  928. break;
  929. default:
  930. System.Globalization.UnicodeCategory charClass = CharUnicodeInfo.GetUnicodeCategory(curChar);
  931. switch (charClass) {
  932. // recognized word characters, always unescape.
  933. case System.Globalization.UnicodeCategory.ModifierLetter:
  934. case System.Globalization.UnicodeCategory.LowercaseLetter:
  935. case System.Globalization.UnicodeCategory.UppercaseLetter:
  936. case System.Globalization.UnicodeCategory.TitlecaseLetter:
  937. case System.Globalization.UnicodeCategory.OtherLetter:
  938. case System.Globalization.UnicodeCategory.LetterNumber:
  939. case System.Globalization.UnicodeCategory.OtherNumber:
  940. case System.Globalization.UnicodeCategory.ConnectorPunctuation:
  941. pattern = pattern.Remove(nameIndex - 1, 1);
  942. cur--;
  943. break;
  944. case System.Globalization.UnicodeCategory.DecimalDigitNumber:
  945. // actually don't want to unescape '\1', '\2' etc. which are references to groups
  946. break;
  947. }
  948. break;
  949. }
  950. if (++cur >= pattern.Length) {
  951. break;
  952. }
  953. }
  954. res.Pattern = pattern;
  955. return res;
  956. }
  /// <summary>
  /// Strips the inline option character located at <paramref name="nameIndex"/> from
  /// <paramref name="pattern"/>.
  /// </summary>
  /// <remarks>
  /// When the option character is directly preceded by '?' and directly followed by ')',
  /// the four characters starting two positions before the index (the whole "(?x)" group)
  /// are removed; otherwise only the option character itself is removed. In both cases
  /// <paramref name="nameIndex"/> is moved back by two.
  /// </remarks>
  /// <param name="pattern">The pattern being rewritten; replaced with the shortened text.</param>
  /// <param name="nameIndex">Index of the option character; decremented by two on return.</param>
  private static void RemoveOption(ref string pattern, ref int nameIndex) {
      bool isSoleOption = pattern[nameIndex - 1] == '?'
          && nameIndex + 1 < pattern.Length
          && pattern[nameIndex + 1] == ')';

      int removeFrom = isSoleOption ? nameIndex - 2 : nameIndex;
      int removeCount = isSoleOption ? 4 : 1;

      pattern = pattern.Remove(removeFrom, removeCount);
      nameIndex -= 2;
  }
  /// <summary>
  /// Produces the text form of a number drawn from the shared generator <c>r</c>,
  /// requesting a value between Int32.MaxValue / 2 and Int32.MaxValue.
  /// </summary>
  /// <returns>The drawn number converted with its default ToString().</returns>
  private static string GetRandomString() {
      const int lowerBound = Int32.MaxValue / 2;
      int drawn = r.Next(lowerBound, Int32.MaxValue);
      return drawn.ToString();
  }
  969. private static string UnescapeGroups(Match m, string text) {
  970. for (int i = 0; i < text.Length; i++) {
  971. if (text[i] == '\\') {
  972. StringBuilder sb = new StringBuilder(text, 0, i, text.Length);
  973. do {
  974. if (text[i] == '\\') {
  975. i++;
  976. if (i == text.Length) { sb.Append('\\'); break; }
  977. switch (text[i]) {
  978. case 'n': sb.Append('\n'); break;
  979. case 'r': sb.Append('\r'); break;
  980. case 't': sb.Append('\t'); break;
  981. case '\\': sb.Append('\\'); break;
  982. case '\'': sb.Append('\''); break;
  983. case 'b':

Large files are truncated, but you can click here to view the full file