PageRenderTime 64ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/CoolEngine/IronPython/Src/IronPython.Modules/re.cs

#
C# | 1081 lines | 827 code | 168 blank | 86 comment | 196 complexity | c14567b6540677dd706900fac58d4652 MD5 | raw file
  1. /* ****************************************************************************
  2. *
  3. * Copyright (c) Microsoft Corporation.
  4. *
  5. * This source code is subject to terms and conditions of the Microsoft Public License. A
  6. * copy of the license can be found in the License.html file at the root of this distribution. If
  7. * you cannot locate the Microsoft Public License, please send an email to
  8. * ironpy@microsoft.com. By using this source code in any fashion, you are agreeing to be bound
  9. * by the terms of the Microsoft Public License.
  10. *
  11. * You must not remove this notice, or any other, from this software.
  12. *
  13. *
  14. * ***************************************************************************/
  15. using System;
  16. using System.Text;
  17. using System.Collections;
  18. using System.Collections.Generic;
  19. using System.Diagnostics;
  20. using IronPython.Runtime;
  21. using IronPython.Runtime.Operations;
  22. using IronPython.Runtime.Exceptions;
  23. using System.Text.RegularExpressions;
  24. using System.Runtime.InteropServices;
  25. using Microsoft.Scripting.Utils;
  26. using IronPython.Runtime.Types;
  27. [assembly: PythonModule("re", typeof(IronPython.Modules.PythonRegex))]
  28. namespace IronPython.Modules {
  29. /// <summary>
  30. /// Python regular expression module.
  31. /// </summary>
  32. public static class PythonRegex {
  33. private static readonly Random r = new Random(DateTime.Now.Millisecond);
  34. #region CONSTANTS
  35. // short forms
  36. public const int I = 0x02;
  37. public const int L = 0x04;
  38. public const int M = 0x08;
  39. public const int S = 0x10;
  40. public const int U = 0x20;
  41. public const int X = 0x40;
  42. // long forms
  43. public const int IGNORECASE = 0x02;
  44. public const int LOCALE = 0x04;
  45. public const int MULTILINE = 0x08;
  46. public const int DOTALL = 0x10;
  47. public const int UNICODE = 0x20;
  48. public const int VERBOSE = 0x40;
  49. #endregion
  50. #region Public API Surface
  /// <summary>
  /// Compiles <paramref name="pattern"/> into an <see cref="RE_Pattern"/> with no flags set.
  /// </summary>
  /// <remarks>
  /// Any ArgumentException raised while validating or constructing the pattern is
  /// rethrown as the module's <c>error</c> exception carrying the same message.
  /// </remarks>
  public static RE_Pattern compile(object pattern) {
      RE_Pattern result;
      try {
          string source = ValidatePattern(pattern);
          result = new RE_Pattern(source, 0, true);
      } catch (ArgumentException failure) {
          throw PythonExceptions.CreateThrowable(error, failure.Message);
      }
      return result;
  }

  /// <summary>
  /// Compiles <paramref name="pattern"/> using <paramref name="flags"/>, which is
  /// converted to an Int32 before use.
  /// </summary>
  /// <remarks>
  /// Any ArgumentException raised while validating, converting or constructing is
  /// rethrown as the module's <c>error</c> exception carrying the same message.
  /// </remarks>
  public static RE_Pattern compile(object pattern, object flags) {
      RE_Pattern result;
      try {
          string source = ValidatePattern(pattern);
          int flagBits = Converter.ConvertToInt32(flags);
          result = new RE_Pattern(source, flagBits, true);
      } catch (ArgumentException failure) {
          throw PythonExceptions.CreateThrowable(error, failure.Message);
      }
      return result;
  }
  65. public const string engine = "cli reg ex";
  66. public static PythonType error = PythonExceptions.CreateSubType(PythonExceptions.Exception, "error", "re", "");
  /// <summary>
  /// Returns <paramref name="text"/> with a backslash inserted before every character
  /// for which <see cref="Char.IsLetterOrDigit(char)"/> is false.
  /// </summary>
  /// <returns>
  /// The same string instance when no character needs escaping; otherwise a new string.
  /// </returns>
  public static string escape(string text) {
      if (text == null) throw PythonOps.TypeError("text must not be None");

      // find the first character that has to be escaped
      int firstSpecial = 0;
      while (firstSpecial < text.Length && Char.IsLetterOrDigit(text[firstSpecial])) {
          firstSpecial++;
      }
      if (firstSpecial == text.Length) {
          // nothing to escape: hand back the input unchanged
          return text;
      }

      // seed the builder with the untouched prefix, then copy the rest char by char
      StringBuilder escaped = new StringBuilder(text, 0, firstSpecial, text.Length);
      for (int idx = firstSpecial; idx < text.Length; idx++) {
          char c = text[idx];
          if (!Char.IsLetterOrDigit(c)) {
              escaped.Append('\\');
          }
          escaped.Append(c);
      }
      return escaped.ToString();
  }
  /// <summary>
  /// Finds all matches of <paramref name="pattern"/> in <paramref name="string"/> with no flags.
  /// </summary>
  public static object findall(object pattern, string @string) {
      const int noFlags = 0;
      return findall(pattern, @string, noFlags);
  }

  /// <summary>
  /// Builds a pattern from <paramref name="pattern"/> and <paramref name="flags"/>, collects
  /// every match over the whole of <paramref name="string"/> and shapes the result with
  /// <c>FixFindAllMatch</c>.
  /// </summary>
  public static object findall(object pattern, string @string, int flags) {
      string patternText = ValidatePattern(pattern);
      RE_Pattern compiled = new RE_Pattern(patternText, flags);

      // a null string fails both checks in ValidateString and throws before it is dereferenced
      ValidateString(@string, "string");

      MatchCollection found = compiled.FindAllWorker(@string, 0, @string.Length);
      return FixFindAllMatch(compiled, found);
  }
  /// <summary>
  /// Converts a .NET match collection into the list shape used by findall.
  /// </summary>
  /// <remarks>
  /// The group count reported by the regex includes group 0 (the whole match):
  /// more than two groups yields a tuple of the capture groups per match, exactly two
  /// yields the single capture group's text, otherwise the whole match text is used.
  /// e.g. findall("(\d+)|(\w+)", "x = 99y") == [('', 'x'), ('99', ''), ('', 'y')]
  /// </remarks>
  private static object FixFindAllMatch(RE_Pattern pat, MatchCollection mc) {
      object[] results = new object[mc.Count];
      int groupTotal = pat._re.GetGroupNumbers().Length;

      for (int idx = 0; idx < mc.Count; idx++) {
          Match current = mc[idx];

          if (groupTotal == 2) {
              // exactly one user group: report it instead of the whole match,
              // e.g. re.findall(r"(\w+)\s+fish\b", "green fish") reports "green"
              results[idx] = current.Groups[1].Value;
              continue;
          }
          if (groupTotal < 2) {
              results[idx] = current.Value;
              continue;
          }

          // several user groups: one tuple per match, leaving out the leading
          // whole-match group that the CLR always supplies
          List<object> parts = new List<object>();
          bool isWholeMatch = true;
          foreach (Group captured in current.Groups) {
              if (isWholeMatch) {
                  isWholeMatch = false;
                  continue;
              }
              parts.Add(captured.Value);
          }
          results[idx] = PythonTuple.Make(parts);
      }
      return List.FromArrayNoCopy(results);
  }
  /// <summary>
  /// Iterates over the matches of <paramref name="pattern"/> in <paramref name="string"/> with no flags.
  /// </summary>
  public static object finditer(object pattern, object @string) {
      const int noFlags = 0;
      return finditer(pattern, @string, noFlags);
  }

  /// <summary>
  /// Builds a pattern from <paramref name="pattern"/> and <paramref name="flags"/> and returns
  /// an enumerator producing one match object per match found in the whole input.
  /// </summary>
  public static object finditer(object pattern, object @string, int flags) {
      string patternText = ValidatePattern(pattern);
      RE_Pattern compiled = new RE_Pattern(patternText, flags);
      string subject = ValidateString(@string, "string");
      MatchCollection found = compiled.FindAllWorker(subject, 0, subject.Length);
      return MatchIterator(found, compiled, subject);
  }
  /// <summary>
  /// Matches <paramref name="pattern"/> against <paramref name="string"/> with no flags.
  /// </summary>
  public static object match(object pattern, object @string) {
      const int noFlags = 0;
      return match(pattern, @string, noFlags);
  }

  /// <summary>
  /// Builds a pattern from <paramref name="pattern"/> and <paramref name="flags"/> and
  /// delegates to its single-argument <c>match</c> method.
  /// </summary>
  public static object match(object pattern, object @string, int flags) {
      // the pattern is validated and constructed before the subject string is validated
      RE_Pattern compiled = new RE_Pattern(ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiled.match(subject);
  }
  /// <summary>
  /// Searches <paramref name="string"/> for <paramref name="pattern"/> with no flags.
  /// </summary>
  public static object search(object pattern, object @string) {
      const int noFlags = 0;
      return search(pattern, @string, noFlags);
  }

  /// <summary>
  /// Builds a pattern from <paramref name="pattern"/> and <paramref name="flags"/> and
  /// delegates to its single-argument <c>search</c> method.
  /// </summary>
  public static object search(object pattern, object @string, int flags) {
      // the pattern is validated and constructed before the subject string is validated
      RE_Pattern compiled = new RE_Pattern(ValidatePattern(pattern), flags);
      string subject = ValidateString(@string, "string");
      return compiled.search(subject);
  }
  /// <summary>
  /// Splits <paramref name="string"/> by <paramref name="pattern"/> with a maxsplit of 0.
  /// Both arguments are validated here and then passed on as plain strings.
  /// </summary>
  public static object split(object pattern, object @string) {
      string patternText = ValidatePattern(pattern);
      string subject = ValidateString(@string, "string");
      return split(patternText, subject, 0);
  }

  /// <summary>
  /// Builds a flag-less pattern from <paramref name="pattern"/> and delegates to its
  /// <c>split</c> method with <paramref name="maxsplit"/>.
  /// </summary>
  public static object split(object pattern, object @string, int maxsplit) {
      RE_Pattern compiled = new RE_Pattern(ValidatePattern(pattern));
      string subject = ValidateString(@string, "string");
      return compiled.split(subject, maxsplit);
  }
  /// <summary>
  /// Replaces matches of <paramref name="pattern"/> in <paramref name="string"/> with
  /// <paramref name="repl"/>, passing Int32.MaxValue as the count.
  /// </summary>
  public static object sub(object pattern, object repl, object @string) {
      const int unlimited = Int32.MaxValue;
      return sub(pattern, repl, @string, unlimited);
  }

  /// <summary>
  /// Builds a flag-less pattern from <paramref name="pattern"/> and delegates to its
  /// <c>sub</c> method with <paramref name="count"/>.
  /// </summary>
  public static object sub(object pattern, object repl, object @string, int count) {
      RE_Pattern compiled = new RE_Pattern(ValidatePattern(pattern));
      string subject = ValidateString(@string, "string");
      return compiled.sub(repl, subject, count);
  }

  /// <summary>
  /// Like <c>sub</c> but delegates to <c>subn</c>, passing Int32.MaxValue as the count.
  /// </summary>
  public static object subn(object pattern, object repl, object @string) {
      const int unlimited = Int32.MaxValue;
      return subn(pattern, repl, @string, unlimited);
  }

  /// <summary>
  /// Builds a flag-less pattern from <paramref name="pattern"/> and delegates to its
  /// <c>subn</c> method with <paramref name="count"/>.
  /// </summary>
  public static object subn(object pattern, object repl, object @string, int count) {
      RE_Pattern compiled = new RE_Pattern(ValidatePattern(pattern));
      string subject = ValidateString(@string, "string");
      return compiled.subn(repl, subject, count);
  }
  176. #endregion
  177. #region Public classes
  178. /// <summary>
  179. /// Compiled reg-ex pattern
  180. /// </summary>
  181. [PythonSystemType]
  182. public class RE_Pattern : IWeakReferenceable {
  183. internal Regex _re;
  184. private PythonDictionary _groups;
  185. private int _compileFlags;
  186. private WeakRefTracker _weakRefTracker;
  187. internal ParsedRegex _pre;
  /// <summary>
  /// Creates an uncompiled pattern with no flags.
  /// </summary>
  internal RE_Pattern(object pattern)
      : this(pattern, 0, false) {
  }

  /// <summary>
  /// Creates an uncompiled pattern with the given flags.
  /// </summary>
  internal RE_Pattern(object pattern, int flags)
      : this(pattern, flags, false) {
  }

  /// <summary>
  /// Pre-parses the pattern text, translates <paramref name="flags"/> to RegexOptions and
  /// builds the underlying Regex. Outside Silverlight builds, <paramref name="compiled"/>
  /// adds RegexOptions.Compiled.
  /// </summary>
  /// <remarks>
  /// An ArgumentException from option translation or Regex construction is rethrown as
  /// the module's <c>error</c> exception; pre-parsing runs outside that try block.
  /// </remarks>
  internal RE_Pattern(object pattern, int flags, bool compiled) {
      _pre = PreParseRegex(ValidatePattern(pattern));

      try {
          RegexOptions options = FlagsToOption(flags);
#if !SILVERLIGHT
          if (compiled) {
              options |= RegexOptions.Compiled;
          }
#endif
          _re = new Regex(_pre.Pattern, options);
      } catch (ArgumentException failure) {
          throw PythonExceptions.CreateThrowable(error, failure.Message);
      }

      _compileFlags = flags;
  }
  /// <summary>
  /// Runs the regex over the whole of <paramref name="text"/> and returns a match object
  /// only if the first match found starts at index 0; otherwise null.
  /// </summary>
  public RE_Match match(object text) {
      string subject = ValidateString(text, "text");
      Match found = _re.Match(subject);
      return RE_Match.makeMatch(found, this, subject, 0, subject.Length);
  }
  /// <summary>
  /// Clamps <paramref name="position"/> into the range [0, text.Length].
  /// </summary>
  /// <returns>0 for negative positions, text.Length for positions past the end, otherwise the position itself.</returns>
  private static int FixPosition(string text, int position) {
      return position < 0
          ? 0
          : (position > text.Length ? text.Length : position);
  }
  /// <summary>
  /// Like the single-argument overload, but starts at <paramref name="pos"/> (clamped to
  /// the bounds of the text) and requires the first match found to begin exactly there.
  /// </summary>
  public RE_Match match(object text, int pos) {
      string subject = ValidateString(text, "text");
      int startAt = FixPosition(subject, pos);
      Match found = _re.Match(subject, startAt);
      return RE_Match.makeMatch(found, this, subject, startAt, subject.Length);
  }
  /// <summary>
  /// Matches at <paramref name="pos"/> while treating <paramref name="endpos"/> as the end
  /// of the text. Both positions are clamped to the bounds of the text.
  /// </summary>
  /// <returns>
  /// A match object when the first match found starts exactly at the clamped
  /// <paramref name="pos"/>; null otherwise, including when <paramref name="pos"/> lies
  /// beyond <paramref name="endpos"/>.
  /// </returns>
  public RE_Match match(object text, [DefaultParameterValue(0)]int pos, int endpos) {
      string input = ValidateString(text, "text");
      pos = FixPosition(input, pos);
      endpos = FixPosition(input, endpos);

      // An empty window can never match. Without this guard Regex.Match would be asked to
      // start beyond the end of the truncated string and throw ArgumentOutOfRangeException.
      if (pos > endpos) return null;

      return RE_Match.makeMatch(
          _re.Match(input.Substring(0, endpos), pos),
          this,
          input,
          pos,
          endpos);
  }
  /// <summary>
  /// Searches the whole of <paramref name="text"/> and returns the first match, or null.
  /// </summary>
  public RE_Match search(object text) {
      string input = ValidateString(text, "text");
      return RE_Match.make(_re.Match(input), this, input);
  }

  /// <summary>
  /// Searches <paramref name="text"/> from <paramref name="pos"/> to its end.
  /// </summary>
  /// <remarks>
  /// <paramref name="pos"/> is clamped to the bounds of the text so out-of-range values no
  /// longer make Regex.Match throw, and the clamped value is recorded on the match object.
  /// </remarks>
  public RE_Match search(object text, int pos) {
      string input = ValidateString(text, "text");
      pos = FixPosition(input, pos);
      return RE_Match.make(_re.Match(input, pos, input.Length - pos), this, input, pos, input.Length);
  }

  /// <summary>
  /// Searches the window [<paramref name="pos"/>, <paramref name="endpos"/>) of
  /// <paramref name="text"/>.
  /// </summary>
  /// <remarks>
  /// Both positions are clamped to the bounds of the text and recorded on the match
  /// object. When <paramref name="endpos"/> is less than <paramref name="pos"/> no match
  /// is reported, following the documented behaviour of Python's pattern.search.
  /// </remarks>
  public RE_Match search(object text, int pos, int endpos) {
      string input = ValidateString(text, "text");
      pos = FixPosition(input, pos);
      endpos = FixPosition(input, endpos);
      if (endpos < pos) return null;
      return RE_Match.make(_re.Match(input, pos, endpos - pos), this, input, pos, endpos);
  }
  /// <summary>
  /// Finds all matches in <paramref name="string"/> starting at position 0 with no end limit.
  /// </summary>
  public object findall(string @string) {
      object noEnd = null;
      return findall(@string, 0, noEnd);
  }

  /// <summary>
  /// Finds all matches in <paramref name="string"/> starting at <paramref name="pos"/> with no end limit.
  /// </summary>
  public object findall(string @string, int pos) {
      object noEnd = null;
      return findall(@string, pos, noEnd);
  }

  /// <summary>
  /// Finds all matches starting at <paramref name="pos"/>; a non-null
  /// <paramref name="endpos"/> truncates the text first. The result is shaped by
  /// <c>FixFindAllMatch</c>.
  /// </summary>
  public object findall(object @string, int pos, object endpos) {
      string subject = ValidateString(@string, "text");
      MatchCollection found = FindAllWorker(subject, pos, endpos);
      return FixFindAllMatch(this, found);
  }
  /// <summary>
  /// Returns every match of the regex in <paramref name="str"/>, starting at
  /// <paramref name="pos"/> and, when <paramref name="endpos"/> is not null, treating
  /// <paramref name="endpos"/> (converted to Int32) as the end of the text.
  /// </summary>
  /// <remarks>
  /// Both positions are clamped before use: an end beyond the text would make Substring
  /// throw, and a start outside the (possibly truncated) text would make Regex.Matches
  /// throw. A start beyond the end is therefore reduced to the end of the truncated text.
  /// </remarks>
  internal MatchCollection FindAllWorker(string str, int pos, object endpos) {
      string against = str;
      if (endpos != null) {
          int end = FixPosition(against, Converter.ConvertToInt32(endpos));
          against = against.Substring(0, end);
      }
      return _re.Matches(against, FixPosition(against, pos));
  }
  /// <summary>
  /// Returns an enumerator over all matches in the whole of <paramref name="string"/>.
  /// </summary>
  public object finditer(object @string) {
      string subject = ValidateString(@string, "string");
      MatchCollection found = FindAllWorker(subject, 0, subject.Length);
      return MatchIterator(found, this, subject);
  }

  /// <summary>
  /// Returns an enumerator over all matches found from <paramref name="pos"/> onwards.
  /// </summary>
  public object finditer(object @string, int pos) {
      string subject = ValidateString(@string, "string");
      MatchCollection found = FindAllWorker(subject, pos, subject.Length);
      return MatchIterator(found, this, subject);
  }

  /// <summary>
  /// Returns an enumerator over all matches found from <paramref name="pos"/>, with the
  /// text cut off at <paramref name="endpos"/>.
  /// </summary>
  public object finditer(object @string, int pos, int endpos) {
      string subject = ValidateString(@string, "string");
      MatchCollection found = FindAllWorker(subject, pos, endpos);
      return MatchIterator(found, this, subject);
  }
  /// <summary>
  /// Splits <paramref name="string"/> on every non-empty match (maxsplit of 0).
  /// </summary>
  public object split(object @string) {
      const int noLimit = 0;
      return split(@string, noLimit);
  }

  /// <summary>
  /// Splits <paramref name="string"/> around non-empty matches of the pattern.
  /// </summary>
  /// <remarks>
  /// A negative <paramref name="maxsplit"/> returns a one-element list holding the
  /// argument as passed, without validating it. Otherwise the list holds the text between
  /// matches and, for each split, one entry per capture group (null when the group did
  /// not take part). Splitting stops once <paramref name="maxsplit"/> splits were made;
  /// 0 never equals the running count, so it means no limit.
  /// </remarks>
  public object split(object @string, int maxsplit) {
      List pieces = new List();

      // negative maxsplit: make no splits at all
      if (maxsplit < 0) {
          pieces.AddNoLock(@string);
          return pieces;
      }

      string subject = ValidateString(@string, "string");
      int tailStart = 0;    // start of the text not yet emitted
      int splitsMade = 0;

      foreach (Match hit in _re.Matches(subject)) {
          // empty matches never split the text
          if (hit.Length <= 0) continue;

          // text between the previous split point and this match
          pieces.AddNoLock(subject.Substring(tailStart, hit.Index - tailStart));

          // capture groups are emitted too, as their text or as null
          for (int g = 1; g < hit.Groups.Count; g++) {
              Group captured = hit.Groups[g];
              if (captured.Success) {
                  pieces.AddNoLock(captured.Value);
              } else {
                  pieces.AddNoLock(null);
              }
          }

          tailStart = hit.Index + hit.Length;
          splitsMade++;
          if (splitsMade == maxsplit) break;
      }

      // whatever follows the last split point
      pieces.AddNoLock(subject.Substring(tailStart));
      return pieces;
  }
  /// <summary>
  /// Replaces matches in <paramref name="string"/>, passing Int32.MaxValue as the count.
  /// </summary>
  public string sub(object repl, object @string) {
      string subject = ValidateString(@string, "string");
      return sub(repl, subject, Int32.MaxValue);
  }

  /// <summary>
  /// Replaces up to <paramref name="count"/> matches in <paramref name="string"/>.
  /// </summary>
  /// <remarks>
  /// <paramref name="repl"/> may be a string (or ExtensibleString) template, which is
  /// expanded with <c>UnescapeGroups</c>, or any other object, which is called with the
  /// match object and whose result is used when it is a string. A count of 0 means
  /// "replace all". An empty match directly after the previous match is left alone.
  /// </remarks>
  public string sub(object repl, object @string, int count) {
      if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");

      // if 'count' is omitted or 0, all occurrences are replaced
      int limit = (count == 0) ? Int32.MaxValue : count;

      string template = repl as string;
      if (template == null) {
          ExtensibleString wrapped = repl as ExtensibleString;
          if (wrapped != null) {
              template = wrapped.Value;
          }
      }

      string input = ValidateString(@string, "string");
      Match previous = null;

      MatchEvaluator evaluator = delegate(Match current) {
          // from the docs: Empty matches for the pattern are replaced
          // only when not adjacent to a previous match
          bool adjacentEmpty = String.IsNullOrEmpty(current.Value)
              && previous != null
              && (previous.Index + previous.Length) == current.Index;
          if (adjacentEmpty) {
              return "";
          }

          previous = current;
          if (template != null) {
              return UnescapeGroups(current, template);
          }
          return PythonCalls.Call(repl, RE_Match.make(current, this, input)) as string;
      };

      return _re.Replace(input, evaluator, limit);
  }
  /// <summary>
  /// Like the three-argument overload, passing Int32.MaxValue as the count.
  /// </summary>
  public object subn(object repl, string @string) {
      const int unlimited = Int32.MaxValue;
      return subn(repl, @string, unlimited);
  }

  /// <summary>
  /// Replaces up to <paramref name="count"/> matches in <paramref name="string"/> and
  /// returns a tuple of (new string, number of replacements performed).
  /// </summary>
  /// <remarks>
  /// <paramref name="repl"/> is handled as in <c>sub</c>: a string (or ExtensibleString)
  /// template is expanded with <c>UnescapeGroups</c>, anything else is called with the
  /// match object. A count of 0 means "replace all". Empty matches directly after the
  /// previous match are left alone and are not counted.
  /// </remarks>
  public object subn(object repl, object @string, int count) {
      if (repl == null) throw PythonOps.TypeError("NoneType is not valid repl");

      // if 'count' is omitted or 0, all occurrences are replaced
      int limit = (count == 0) ? Int32.MaxValue : count;

      string template = repl as string;
      if (template == null) {
          ExtensibleString wrapped = repl as ExtensibleString;
          if (wrapped != null) {
              template = wrapped.Value;
          }
      }

      string input = ValidateString(@string, "string");
      Match previous = null;
      int replacements = 0;

      MatchEvaluator evaluator = delegate(Match current) {
          // from the docs: Empty matches for the pattern are replaced
          // only when not adjacent to a previous match
          bool adjacentEmpty = String.IsNullOrEmpty(current.Value)
              && previous != null
              && (previous.Index + previous.Length) == current.Index;
          if (adjacentEmpty) {
              return "";
          }

          previous = current;
          replacements++;
          if (template != null) {
              return UnescapeGroups(current, template);
          }
          return PythonCalls.Call(repl, RE_Match.make(current, this, input)) as string;
      };

      string replaced = _re.Replace(input, evaluator, limit);
      return PythonTuple.MakeTuple(replaced, replacements);
  }
  /// <summary>
  /// The flags value this pattern was constructed with.
  /// </summary>
  public int flags {
      get { return _compileFlags; }
  }

  /// <summary>
  /// Dictionary mapping each group name that does not start with a digit to its group
  /// number. Built on first access and cached afterwards.
  /// </summary>
  public PythonDictionary groupindex {
      get {
          if (_groups != null) {
              return _groups;
          }

          string[] groupNames = _re.GetGroupNames();
          int[] groupNumbers = _re.GetGroupNumbers();
          PythonDictionary mapping = new PythonDictionary();

          // index 0 is the whole-match group and is never reported
          for (int idx = 1; idx < groupNames.Length; idx++) {
              string name = groupNames[idx];
              if (!Char.IsDigit(name[0])) {
                  mapping[name] = groupNumbers[idx];
              }
          }

          _groups = mapping;
          return _groups;
      }
  }

  /// <summary>
  /// The pattern text as supplied by the user, before pre-parsing.
  /// </summary>
  public string pattern {
      get { return _pre.UserPattern; }
  }
  401. #region IWeakReferenceable Members
  /// <summary>
  /// Returns the tracker previously stored by SetWeakRef (null if none was stored).
  /// </summary>
  WeakRefTracker IWeakReferenceable.GetWeakRef() {
      WeakRefTracker tracker = _weakRefTracker;
      return tracker;
  }

  /// <summary>
  /// Stores <paramref name="value"/> as this pattern's tracker; always reports success.
  /// </summary>
  bool IWeakReferenceable.SetWeakRef(WeakRefTracker value) {
      _weakRefTracker = value;
      return true;
  }

  /// <summary>
  /// Stores the tracker through the interface's SetWeakRef.
  /// </summary>
  void IWeakReferenceable.SetFinalizer(WeakRefTracker value) {
      IWeakReferenceable self = this;
      self.SetWeakRef(value);
  }
  412. #endregion
  413. }
  414. [PythonSystemType]
  415. public class RE_Match {
  416. RE_Pattern _pattern;
  417. private Match _m;
  418. private string _text;
  419. private int _lastindex = -1;
  420. private int _pos, _endPos;
  421. #region Internal makers
  /// <summary>
  /// Wraps a successful match covering the whole input (pos 0, endpos input.Length);
  /// returns null when the match failed.
  /// </summary>
  internal static RE_Match make(Match m, RE_Pattern pattern, string input) {
      return m.Success
          ? new RE_Match(m, pattern, input, 0, input.Length)
          : null;
  }

  /// <summary>
  /// Wraps a successful match, recording the given search window; returns null when
  /// the match failed.
  /// </summary>
  internal static RE_Match make(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
      return m.Success
          ? new RE_Match(m, pattern, input, offset, endpos)
          : null;
  }

  /// <summary>
  /// Like <c>make</c>, but additionally requires the match to begin exactly at
  /// <paramref name="offset"/>; returns null otherwise.
  /// </summary>
  internal static RE_Match makeMatch(Match m, RE_Pattern pattern, string input, int offset, int endpos) {
      bool anchoredAtOffset = m.Success && m.Index == offset;
      return anchoredAtOffset
          ? new RE_Match(m, pattern, input, offset, endpos)
          : null;
  }
  434. #endregion
  435. #region Public ctors
  /// <summary>
  /// Creates a match object whose search window is the whole of <paramref name="text"/>.
  /// </summary>
  /// <remarks>
  /// The window is recorded as pos 0 and endpos text.Length (0 for a null text), so the
  /// <c>endpos</c> property no longer reports 0 for matches created through this overload.
  /// </remarks>
  public RE_Match(Match m, RE_Pattern pattern, string text)
      : this(m, pattern, text, 0, text != null ? text.Length : 0) {
  }

  /// <summary>
  /// Creates a match object for <paramref name="m"/>, remembering the pattern, the text
  /// searched and the search window [<paramref name="pos"/>, <paramref name="endpos"/>).
  /// </summary>
  public RE_Match(Match m, RE_Pattern pattern, string text, int pos, int endpos) {
      _m = m;
      _pattern = pattern;
      _text = text;
      _pos = pos;
      _endPos = endpos;
  }
  448. #endregion
  449. // public override bool __nonzero__() {
  450. // return m.Success;
  451. // }
  452. #region Public API Surface
  /// <summary>
  /// Index just past the last character of the whole match.
  /// </summary>
  public int end() {
      int first = _m.Index;
      return first + _m.Length;
  }

  /// <summary>
  /// Index of the first character of the whole match.
  /// </summary>
  public int start() {
      return _m.Index;
  }

  /// <summary>
  /// Start index of the given group, or -1 when the group did not take part in the match.
  /// The group is resolved by <c>GetGroupIndex</c>.
  /// </summary>
  public int start(object group) {
      Group selected = _m.Groups[GetGroupIndex(group)];
      return selected.Success ? selected.Index : -1;
  }

  /// <summary>
  /// End index (exclusive) of the given group, or -1 when the group did not take part in
  /// the match. The group is resolved by <c>GetGroupIndex</c>.
  /// </summary>
  public int end(object group) {
      Group selected = _m.Groups[GetGroupIndex(group)];
      return selected.Success ? selected.Index + selected.Length : -1;
  }
  /// <summary>
  /// With no additional groups behaves like <c>group(index)</c>; otherwise returns a tuple
  /// with the text of each requested group (null for groups that did not take part).
  /// </summary>
  public object group(object index, params object[] additional) {
      if (additional.Length == 0) {
          return group(index);
      }

      object[] values = new object[additional.Length + 1];
      values[0] = group(index);
      for (int n = 0; n < additional.Length; n++) {
          values[n + 1] = group(additional[n]);
      }
      return PythonTuple.MakeTuple(values);
  }

  /// <summary>
  /// Text of the requested group, or null when the group did not take part in the match.
  /// The group is resolved by <c>GetGroupIndex</c>.
  /// </summary>
  public object group(object index) {
      Group selected = _m.Groups[GetGroupIndex(index)];
      if (!selected.Success) {
          return null;
      }
      return selected.Value;
  }

  /// <summary>
  /// Text of the whole match (group 0).
  /// </summary>
  public object group() {
      const int wholeMatch = 0;
      return group(wholeMatch);
  }
  /// <summary>
  /// Tuple of all capture groups, with null for groups that did not take part.
  /// </summary>
  public object groups() {
      object missing = null;
      return groups(missing);
  }

  /// <summary>
  /// Tuple holding the text of every group from 1 upwards; groups that did not take part
  /// in the match are represented by <paramref name="default"/>.
  /// </summary>
  public object groups(object @default) {
      GroupCollection all = _m.Groups;
      object[] values = new object[all.Count - 1];
      for (int n = 1; n < all.Count; n++) {
          Group current = all[n];
          values[n - 1] = current.Success ? (object)current.Value : @default;
      }
      return PythonTuple.MakeTuple(values);
  }
  /// <summary>
  /// Expands backslash escapes in <paramref name="template"/> against this match.
  /// </summary>
  /// <remarks>
  /// Handled escapes: a single digit (that group's text), \g&lt;name&gt; (the text of the
  /// group that name resolves to), \n, \r, \t and \\. Any other escape is copied through
  /// unchanged instead of being dropped, a "\g" that is not followed by '&lt;' is kept
  /// literally, and a trailing lone backslash is kept.
  /// </remarks>
  /// <returns>The expanded string.</returns>
  public object expand(object template) {
      string strTmp = ValidateString(template, "template");
      StringBuilder res = new StringBuilder();

      for (int i = 0; i < strTmp.Length; i++) {
          if (strTmp[i] != '\\') { res.Append(strTmp[i]); continue; }

          // lone backslash at the very end of the template
          if (++i == strTmp.Length) { res.Append('\\'); break; }

          char escaped = strTmp[i];
          if (Char.IsDigit(escaped)) {
              AppendGroup(res, (int)(escaped - '0'));
          } else if (escaped == 'g') {
              if (++i == strTmp.Length) { res.Append("\\g"); return res.ToString(); }

              if (strTmp[i] != '<') {
                  // not a group reference: keep "\g" and step back so the character
                  // that followed it is processed normally on the next iteration
                  res.Append("\\g");
                  i--;
                  continue;
              }

              StringBuilder name = new StringBuilder();
              i++;
              // check the bound before indexing so a missing '>' cannot run off the end
              while (i < strTmp.Length && strTmp[i] != '>') {
                  name.Append(strTmp[i++]);
              }
              if (i == strTmp.Length) {
                  throw PythonExceptions.CreateThrowable(error, "unterminated group name");
              }
              AppendGroup(res, _pattern._re.GroupNumberFromName(name.ToString()));
          } else {
              switch (escaped) {
                  case 'n': res.Append('\n'); break;
                  case 'r': res.Append('\r'); break;
                  case 't': res.Append('\t'); break;
                  case '\\': res.Append('\\'); break;
                  default:
                      // unknown escapes are left alone rather than silently discarded
                      res.Append('\\');
                      res.Append(escaped);
                      break;
              }
          }
      }
      return res.ToString();
  }
  /// <summary>
  /// Dictionary of named groups, with null for groups that captured nothing.
  /// </summary>
  public object groupdict() {
      object missing = null;
      return groupdict(missing);
  }

  /// <summary>
  /// True when every character of <paramref name="name"/> satisfies Char.IsNumber
  /// (which makes the empty string count as a group number too).
  /// </summary>
  private static bool IsGroupNumber(string name) {
      for (int idx = 0; idx < name.Length; idx++) {
          if (!Char.IsNumber(name[idx])) {
              return false;
          }
      }
      return true;
  }

  /// <summary>
  /// Dictionary mapping every non-numeric group name to the group's text, or to
  /// <paramref name="value"/> when the group has no captures.
  /// </summary>
  public object groupdict(object value) {
      string[] names = _pattern._re.GetGroupNames();
      Debug.Assert(names.Length == _m.Groups.Count);

      PythonDictionary result = new PythonDictionary();
      for (int idx = 0; idx < names.Length; idx++) {
          string name = names[idx];
          // python doesn't report group numbers
          if (IsGroupNumber(name)) continue;

          Group current = _m.Groups[idx];
          if (current.Captures.Count == 0) {
              result[name] = value;
          } else {
              result[name] = current.Value;
          }
      }
      return result;
  }
  /// <summary>
  /// Tuple (start, end) of the whole match.
  /// </summary>
  public object span() {
      int first = start();
      int last = end();
      return PythonTuple.MakeTuple(first, last);
  }

  /// <summary>
  /// Tuple (start, end) of the given group, as reported by <c>start(group)</c> and
  /// <c>end(group)</c>.
  /// </summary>
  public object span(object group) {
      int first = start(group);
      int last = end(group);
      return PythonTuple.MakeTuple(first, last);
  }
  /// <summary>
  /// The start position recorded when this match object was created.
  /// </summary>
  public int pos {
      get { return _pos; }
  }

  /// <summary>
  /// The end position recorded when this match object was created.
  /// </summary>
  public int endpos {
      get { return _endPos; }
  }

  /// <summary>
  /// The text recorded when this match object was created.
  /// </summary>
  public string @string {
      get { return _text; }
  }

  /// <summary>
  /// Tuple of (start, end) tuples, one per group including group 0.
  /// </summary>
  public object regs {
      get {
          int total = _m.Groups.Count;
          object[] spans = new object[total];
          for (int n = 0; n < total; n++) {
              spans[n] = PythonTuple.MakeTuple(start(n), end(n));
          }
          return PythonTuple.MakeTuple(spans);
      }
  }

  /// <summary>
  /// The pattern object recorded when this match object was created.
  /// </summary>
  public object re {
      get { return _pattern; }
  }
  /// <summary>
  /// Index of the last matched top-level capture group, or null when no group matched.
  /// The value is computed on first access and cached in <c>_lastindex</c>
  /// (-1 = not computed yet, 0 = no group matched).
  /// </summary>
  public object lastindex {
      get {
          if (_lastindex == -1) {
              GroupCollection all = _m.Groups;
              int found = 0;

              for (int i = 1; i < all.Count; ) {
                  Group candidate = all[i];
                  i++;
                  if (!candidate.Success) continue;

                  found = i - 1;
                  int limit = candidate.Index + candidate.Length;
                  // Match.Groups also lists the "lower" level groups nested in this one;
                  // skip every following group that starts before this group's end,
                  // whether it matched or not
                  while (i < all.Count && all[i].Index < limit) {
                      i++;
                  }
              }
              _lastindex = found;
          }

          if (_lastindex == 0) {
              return null;
          }
          return _lastindex;
      }
  }

  /// <summary>
  /// Name of the group reported by <c>lastindex</c>, or null when there is none.
  /// </summary>
  public object lastgroup {
      get {
          object index = lastindex;
          if (index == null) {
              return null;
          }
          // when group was not explicitly named, RegEx assigns the number as name
          // This is different from C-Python, which returns None in such cases
          return _pattern._re.GroupNameFromNumber((int)index);
      }
  }
  636. #endregion
  637. #region Private helper functions
  /// <summary>
  /// Appends the text of group <paramref name="index"/> to <paramref name="sb"/>.
  /// </summary>
  private void AppendGroup(StringBuilder sb, int index) {
      Group selected = _m.Groups[index];
      sb.Append(selected.Value);
  }

  /// <summary>
  /// Resolves a group reference to a group number: values convertible to Int32 are used
  /// directly, anything else must be a string naming a group.
  /// </summary>
  /// <returns>A number in the range [0, number of groups).</returns>
  /// <remarks>Throws IndexError("no such group") when the number is outside that range.</remarks>
  private int GetGroupIndex(object group) {
      int number;
      bool isNumeric = Converter.TryConvertToInt32(group, out number);
      if (!isNumeric) {
          string name = ValidateString(group, "group");
          number = _pattern._re.GroupNumberFromName(name);
      }

      bool inRange = number >= 0 && number < _m.Groups.Count;
      if (!inRange) {
          throw PythonOps.IndexError("no such group");
      }
      return number;
  }
  651. #endregion
  652. }
  653. #endregion
  654. #region Private helper functions
  /// <summary>
  /// Enumerates <paramref name="matches"/> in order, yielding for each one the result of
  /// <c>RE_Match.make</c> with a window covering the whole of <paramref name="input"/>.
  /// </summary>
  private static IEnumerator MatchIterator(MatchCollection matches, RE_Pattern pattern, string input) {
      int index = 0;
      while (index < matches.Count) {
          Match current = matches[index];
          index++;
          yield return RE_Match.make(current, pattern, input, 0, input.Length);
      }
  }
  /// <summary>
  /// Translates the module's flag bits into the corresponding RegexOptions:
  /// IGNORECASE, MULTILINE, DOTALL (Singleline) and VERBOSE (IgnorePatternWhitespace).
  /// </summary>
  private static RegexOptions FlagsToOption(int flags) {
      RegexOptions options = RegexOptions.None;

      if ((flags & IGNORECASE) != 0) {
          options |= RegexOptions.IgnoreCase;
      }
      if ((flags & MULTILINE) != 0) {
          options |= RegexOptions.Multiline;
      }
      if ((flags & LOCALE) == 0) {
          // options never carries CultureInvariant at this point, so clearing it
          // leaves the value unchanged
          options &= ~RegexOptions.CultureInvariant;
      }
      if ((flags & DOTALL) != 0) {
          options |= RegexOptions.Singleline;
      }
      if ((flags & VERBOSE) != 0) {
          options |= RegexOptions.IgnorePatternWhitespace;
      }
      return options;
  }
  /// <summary>
  /// Result of pre-parsing a pattern: the text the user supplied, the rewritten text and
  /// the options collected from the pattern.
  /// </summary>
  internal class ParsedRegex {
      /// <summary>The pattern text exactly as passed to the constructor.</summary>
      public string UserPattern;

      /// <summary>The rewritten pattern; left unset by the constructor.</summary>
      public string Pattern;

      /// <summary>Regex options; starts out as CultureInvariant.</summary>
      public RegexOptions Options = RegexOptions.CultureInvariant;

      public ParsedRegex(string pattern) {
          UserPattern = pattern;
      }
  }
  /// <summary>
  /// Preparses a regular expression text returning a ParsedRegex class
  /// that can be used for further regular expressions.
  /// </summary>
  /// <remarks>
  /// Pass one visits every unescaped '(' and rewrites Python-only group syntax:
  /// "(?P&lt;name&gt;" becomes "(?&lt;name&gt;", "(?P=name)" becomes the backreference
  /// "\k&lt;name&gt;", and once a named group has been seen every later unnamed group is
  /// given a generated name. Inline option letters are recorded in the result's Options.
  /// Pass two visits every backslash and removes it when it escapes a letter-like
  /// character that is not one of the recognised escapes.
  /// Throws the module's <c>error</c> for a truncated or unrecognised "(?" extension.
  /// </remarks>
  private static ParsedRegex PreParseRegex(string pattern) {
      ParsedRegex res = new ParsedRegex(pattern);
      int cur = 0, nameIndex;
      bool containsNamedGroup = false;

      for (; ; ) {
          nameIndex = pattern.IndexOf('(', cur);
          if (nameIndex > 0 && pattern[nameIndex - 1] == '\\') {
              int curIndex = nameIndex - 2;
              int backslashCount = 1;
              while (curIndex >= 0 && pattern[curIndex] == '\\') {
                  backslashCount++;
                  curIndex--;
              }
              // odd number of back slashes: the paren itself is escaped,
              // so resume the scan right after it
              if ((backslashCount & 0x01) != 0) {
                  cur = nameIndex + 1;
                  continue;
              }
          }
          if (nameIndex == -1) break;
          if (nameIndex == pattern.Length - 1) break;

          switch (pattern[++nameIndex]) {
              case '?':
                  // extension syntax
                  if (nameIndex == pattern.Length - 1) throw PythonExceptions.CreateThrowable(error, "unexpected end of regex");
                  switch (pattern[++nameIndex]) {
                      case 'P':
                          // named regex, .NET doesn't expect the P so we'll remove it;
                          // also, once we see a named group i.e. ?P then we need to start artificially
                          // naming all unnamed groups from then on---this is to get around the fact that
                          // the CLR RegEx support orders all the unnamed groups before all the named
                          // groups, even if the named groups are before the unnamed ones in the pattern;
                          // the artificial naming preserves the order of the groups and thus the order of
                          // the matches
                          if (nameIndex + 1 < pattern.Length && pattern[nameIndex + 1] == '=') {
                              // match whatever was previously matched by the named group:
                              // replace "(?P=" with "\k<" (a single backslash, so the regex
                              // engine sees a named backreference rather than a literal '\')
                              pattern = pattern.Remove(nameIndex - 2, 4);
                              pattern = pattern.Insert(nameIndex - 2, "\\k<");
                              int tmpIndex = nameIndex;
                              while (tmpIndex < pattern.Length && pattern[tmpIndex] != ')')
                                  tmpIndex++;
                              if (tmpIndex == pattern.Length) throw PythonExceptions.CreateThrowable(error, "unexpected end of regex");
                              // the closing ')' of the extension becomes the closing '>'
                              pattern = pattern.Substring(0, tmpIndex) + ">" + pattern.Substring(tmpIndex + 1);
                          } else {
                              containsNamedGroup = true;
                              pattern = pattern.Remove(nameIndex, 1);
                          }
                          break;
                      case 'i': res.Options |= RegexOptions.IgnoreCase; break;
                      case 'L': res.Options &= ~(RegexOptions.CultureInvariant); break;
                      case 'm': res.Options |= RegexOptions.Multiline; break;
                      case 's': res.Options |= RegexOptions.Singleline; break;
                      case 'u': break;
                      case 'x': res.Options |= RegexOptions.IgnorePatternWhitespace; break;
                      case ':': break; // non-capturing
                      case '=': break; // look ahead assertion
                      case '<': break; // positive look behind assertion
                      case '!': break; // negative look ahead assertion
                      case '#': break; // inline comment
                      case '(': // yes/no if group exists, we don't support this
                      default: throw PythonExceptions.CreateThrowable(error, "Unrecognized extension " + pattern[nameIndex]);
                  }
                  break;
              default:
                  // just another group
                  if (containsNamedGroup) {
                      // need to name this unnamed group
                      pattern = pattern.Insert(nameIndex, "?<Named" + GetRandomString() + ">");
                  }
                  break;
          }
          cur = nameIndex;
      }

      cur = 0;
      for (; ; ) {
          nameIndex = pattern.IndexOf('\\', cur);
          if (nameIndex == -1 || nameIndex == pattern.Length - 1) break;

          char curChar = pattern[++nameIndex];
          // by default resume right after the escaped character, so the second half of
          // an escape pair (e.g. the second '\' of "\\") is never re-read as a new escape
          cur = nameIndex + 1;

          switch (curChar) {
              case 'x':
              case 'u':
              case 'a':
              case 'A':
              case 'b':
              case 'B':
              case 'e':
              case 'f':
              case 'k':
              case 'n':
              case 'r':
              case 't':
              case 'v':
              case 'c':
              case 's':
              case 'W':
              case 'w':
              case 'p':
              case 'P':
              case 'S':
              case 'd':
              case 'D':
              case 'Z':
                  // known escape sequences, leave escaped.
                  break;
              case '\\':
                  // escaping a \\
                  break;
              default:
                  System.Globalization.UnicodeCategory charClass = Char.GetUnicodeCategory(curChar);
                  switch (charClass) {
                      // recognized word characters, always unescape.
                      case System.Globalization.UnicodeCategory.ModifierLetter:
                      case System.Globalization.UnicodeCategory.LowercaseLetter:
                      case System.Globalization.UnicodeCategory.UppercaseLetter:
                      case System.Globalization.UnicodeCategory.TitlecaseLetter:
                      case System.Globalization.UnicodeCategory.OtherLetter:
                      case System.Globalization.UnicodeCategory.LetterNumber:
                      case System.Globalization.UnicodeCategory.OtherNumber:
                      case System.Globalization.UnicodeCategory.ConnectorPunctuation:
                          pattern = pattern.Remove(nameIndex - 1, 1);
                          // everything after the removed backslash moved down one slot
                          cur = nameIndex;
                          break;
                      case System.Globalization.UnicodeCategory.DecimalDigitNumber:
                          // actually don't want to unescape '\1', '\2' etc. which are references to groups
                          break;
                  }
                  break;
          }
      }

      res.Pattern = pattern;
      return res;
  }
  /// <summary>
  /// Returns the decimal text of a random integer drawn from
  /// [Int32.MaxValue / 2, Int32.MaxValue) using the shared generator.
  /// </summary>
  private static string GetRandomString() {
      int lowerBound = Int32.MaxValue / 2;
      int drawn = r.Next(lowerBound, Int32.MaxValue);
      return drawn.ToString();
  }
  /// <summary>
  /// Expands the backslash escapes of a replacement template against match <paramref name="m"/>.
  /// </summary>
  /// <remarks>
  /// Handled escapes: \n \r \t \\ \' \" \b; \g&lt;number&gt; and \g&lt;name&gt; (group text);
  /// a single digit 1-9 that names an existing group (group text); any other run of octal
  /// digits (the character with that code). Everything else is copied through with its
  /// backslash. A trailing "\g" is kept literally.
  /// Throws IndexError("unknown group reference") when a \g&lt;...&gt; reference names a
  /// group that did not take part in the match; a group that matched the empty string is
  /// accepted.
  /// </remarks>
  /// <returns>
  /// The same string instance when <paramref name="text"/> contains no backslash;
  /// otherwise the expanded string.
  /// </returns>
  private static string UnescapeGroups(Match m, string text) {
      for (int i = 0; i < text.Length; i++) {
          if (text[i] == '\\') {
              StringBuilder sb = new StringBuilder(text, 0, i, text.Length);

              do {
                  if (text[i] == '\\') {
                      i++;
                      if (i == text.Length) { sb.Append('\\'); break; }

                      switch (text[i]) {
                          case 'n': sb.Append('\n'); break;
                          case 'r': sb.Append('\r'); break;
                          case 't': sb.Append('\t'); break;
                          case '\\': sb.Append('\\'); break;
                          case '\'': sb.Append('\''); break;
                          case '"': sb.Append('"'); break;
                          case 'b': sb.Append('\b'); break;
                          case 'g':
                              // \g<#>, \g<name> need to be substituted by the groups they
                              // matched; the bound check keeps a trailing "\g" from
                              // indexing past the end of the template
                              if (i + 1 < text.Length && text[i + 1] == '<') {
                                  int anglebrkStart = i + 1;
                                  int anglebrkEnd = text.IndexOf('>', i + 2);
                                  if (anglebrkEnd != -1) {
                                      // grab the # or 'name' of the group between '< >'
                                      int lengrp = anglebrkEnd - (anglebrkStart + 1);
                                      string grp = text.Substring(anglebrkStart + 1, lengrp);
                                      int num;
                                      Group g = StringUtils.TryParseInt32(grp, out num)
                                          ? m.Groups[num]
                                          : m.Groups[grp];
                                      // Success (not an empty Value) tells whether the group
                                      // took part: a group may legitimately match ""
                                      if (!g.Success) {
                                          throw PythonOps.IndexError("unknown group reference");
                                      }
                                      sb.Append(g.Value);
                                      i = anglebrkEnd;
                                  }
                                  break;
                              }
                              sb.Append('\\');
                              sb.Append(text[i]);
                              break;
                          default:
                              if (Char.IsDigit(text[i]) && text[i] <= '7') {
                                  int val = 0;
                                  int digitCount = 0;
                                  while (i < text.Length && Char.IsDigit(text[i]) && text[i] <= '7') {
                                      digitCount++;
                                      // plain base-8 accumulation
                                      val = val * 8 + (text[i] - '0');
                                      i++;
                                  }
                                  i--;
                                  if (digitCount == 1 && val > 0 && val < m.Groups.Count) {
                                      sb.Append(m.Groups[val].Value);
                                  } else {
                                      sb.Append((char)val);
                                  }
                              } else if ((text[i] == '8' || text[i] == '9') && (text[i] - '0') < m.Groups.Count) {
                                  // \8 and \9 cannot be octal, but they can be group references
                                  sb.Append(m.Groups[text[i] - '0'].Value);
                              } else {
                                  sb.Append('\\');
                                  sb.Append(text[i]);
                              }
                              break;
                      }
                  } else {
                      sb.Append(text[i]);
                  }
              } while (++i < text.Length);

              return sb.ToString();
          }
      }
      return text;
  }
  /// <summary>
  /// Extracts the pattern text from a string, an ExtensibleString or an existing
  /// RE_Pattern (whose original user pattern is returned).
  /// </summary>
  /// <remarks>Throws TypeError for any other argument, including null.</remarks>
  private static string ValidatePattern(object pattern) {
      string text = pattern as string;
      if (text != null) {
          return text;
      }

      ExtensibleString wrapped = pattern as ExtensibleString;
      if (wrapped != null) {
          return wrapped.Value;
      }

      RE_Pattern compiled = pattern as RE_Pattern;
      if (compiled != null) {
          return compiled._pre.UserPattern;
      }

      throw PythonOps.TypeError("pattern must be a string or compiled pattern");
  }

  /// <summary>
  /// Extracts the text from a string or an ExtensibleString.
  /// </summary>
  /// <remarks>
  /// Throws TypeError naming <paramref name="param"/> and the argument's Python type for
  /// any other argument, including null.
  /// </remarks>
  private static string ValidateString(object str, string param) {
      string text = str as string;
      if (text != null) {
          return text;
      }

      ExtensibleString wrapped = str as ExtensibleString;
      if (wrapped != null) {
          return wrapped.Value;
      }

      throw PythonOps.TypeError("expected string for parameter '{0}' but got '{1}'", param, PythonOps.GetPythonTypeName(str));
  }
  911. #endregion
  912. }
  913. }