PageRenderTime 44ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/ExtLibs/wxWidgets/src/common/regex.cpp

https://bitbucket.org/lennonchan/cafu
C++ | 697 lines | 462 code | 118 blank | 117 comment | 60 complexity | 6e51ed3c75d14c4afce16e5013ba6014 MD5 | raw file
  1. ///////////////////////////////////////////////////////////////////////////////
  2. // Name: src/common/regex.cpp
  3. // Purpose: regular expression matching
  4. // Author: Karsten Ballueder and Vadim Zeitlin
  5. // Modified by:
  6. // Created: 13.07.01
  7. // RCS-ID: $Id$
  8. // Copyright: (c) 2000 Karsten Ballueder <ballueder@gmx.net>
  9. // 2001 Vadim Zeitlin <vadim@wxwindows.org>
  10. // Licence: wxWindows licence
  11. ///////////////////////////////////////////////////////////////////////////////
  12. // ============================================================================
  13. // declarations
  14. // ============================================================================
  15. // ----------------------------------------------------------------------------
  16. // headers
  17. // ----------------------------------------------------------------------------
  18. // For compilers that support precompilation, includes "wx.h".
  19. #include "wx/wxprec.h"
  20. #ifdef __BORLANDC__
  21. #pragma hdrstop
  22. #endif
  23. #if wxUSE_REGEX
  24. #include "wx/regex.h"
  25. #ifndef WX_PRECOMP
  26. #include "wx/object.h"
  27. #include "wx/log.h"
  28. #include "wx/intl.h"
  29. #include "wx/crt.h"
  30. #endif //WX_PRECOMP
  31. // FreeBSD, Watcom and DMars require this, CW doesn't have nor need it.
  32. // Others also don't seem to need it. If you have an error related to
  33. // (not) including <sys/types.h> please report details to
  34. // wx-dev@lists.wxwindows.org
  35. #if defined(__UNIX__) || defined(__WATCOMC__) || defined(__DIGITALMARS__)
  36. # include <sys/types.h>
  37. #endif
  38. #include <regex.h>
  39. // WXREGEX_USING_BUILTIN defined when using the built-in regex lib
  40. // WXREGEX_USING_RE_SEARCH defined when using re_search in the GNU regex lib
  41. // WXREGEX_IF_NEED_LEN() wrap the len parameter only used with the built-in
  42. // or GNU regex
  43. // WXREGEX_CONVERT_TO_MB defined when the regex lib is using chars and
  44. // wxChar is wide, so conversion must be done
  45. // WXREGEX_CHAR(x) Convert wxChar to wxRegChar
  46. //
  47. #ifdef __REG_NOFRONT
  48. # define WXREGEX_USING_BUILTIN
  49. # define WXREGEX_IF_NEED_LEN(x) ,x
  50. # if wxUSE_UNICODE
  51. # define WXREGEX_CHAR(x) (x).wc_str()
  52. # else
  53. # define WXREGEX_CHAR(x) (x).mb_str()
  54. # endif
  55. #else
  56. # ifdef HAVE_RE_SEARCH
  57. # define WXREGEX_IF_NEED_LEN(x) ,x
  58. # define WXREGEX_USING_RE_SEARCH
  59. # else
  60. # define WXREGEX_IF_NEED_LEN(x)
  61. # endif
  62. # if wxUSE_UNICODE
  63. # define WXREGEX_CONVERT_TO_MB
  64. # endif
  65. # define WXREGEX_CHAR(x) (x).mb_str()
  66. # define wx_regfree regfree
  67. # define wx_regerror regerror
  68. #endif
  69. // ----------------------------------------------------------------------------
  70. // private classes
  71. // ----------------------------------------------------------------------------
  72. #ifndef WXREGEX_USING_RE_SEARCH
  73. // the array of offsets for the matches, the usual POSIX regmatch_t array.
  74. class wxRegExMatches
  75. {
  76. public:
  77. typedef regmatch_t *match_type;
  78. wxRegExMatches(size_t n) { m_matches = new regmatch_t[n]; }
  79. ~wxRegExMatches() { delete [] m_matches; }
  80. // we just use casts here because the fields of regmatch_t struct may be 64
  81. // bit but we're limited to size_t in our public API and are not going to
  82. // change it because operating on strings longer than 4GB using it is
  83. // absolutely impractical anyhow
  84. size_t Start(size_t n) const
  85. {
  86. return wx_truncate_cast(size_t, m_matches[n].rm_so);
  87. }
  88. size_t End(size_t n) const
  89. {
  90. return wx_truncate_cast(size_t, m_matches[n].rm_eo);
  91. }
  92. regmatch_t *get() const { return m_matches; }
  93. private:
  94. regmatch_t *m_matches;
  95. };
  96. #else // WXREGEX_USING_RE_SEARCH
  97. // the array of offsets for the matches, the struct used by the GNU lib
  98. class wxRegExMatches
  99. {
  100. public:
  101. typedef re_registers *match_type;
  102. wxRegExMatches(size_t n)
  103. {
  104. m_matches.num_regs = n;
  105. m_matches.start = new regoff_t[n];
  106. m_matches.end = new regoff_t[n];
  107. }
  108. ~wxRegExMatches()
  109. {
  110. delete [] m_matches.start;
  111. delete [] m_matches.end;
  112. }
  113. size_t Start(size_t n) const { return m_matches.start[n]; }
  114. size_t End(size_t n) const { return m_matches.end[n]; }
  115. re_registers *get() { return &m_matches; }
  116. private:
  117. re_registers m_matches;
  118. };
  119. #endif // WXREGEX_USING_RE_SEARCH
  120. // the character type used by the regular expression engine
  121. #ifndef WXREGEX_CONVERT_TO_MB
  122. typedef wxChar wxRegChar;
  123. #else
  124. typedef char wxRegChar;
  125. #endif
  126. // the real implementation of wxRegEx
  127. class wxRegExImpl
  128. {
  129. public:
  130. // ctor and dtor
  131. wxRegExImpl();
  132. ~wxRegExImpl();
  133. // return true if Compile() had been called successfully
  134. bool IsValid() const { return m_isCompiled; }
  135. // RE operations
  136. bool Compile(const wxString& expr, int flags = 0);
  137. bool Matches(const wxRegChar *str, int flags
  138. WXREGEX_IF_NEED_LEN(size_t len)) const;
  139. bool GetMatch(size_t *start, size_t *len, size_t index = 0) const;
  140. size_t GetMatchCount() const;
  141. int Replace(wxString *pattern, const wxString& replacement,
  142. size_t maxMatches = 0) const;
  143. private:
  144. // return the string containing the error message for the given err code
  145. wxString GetErrorMsg(int errorcode, bool badconv) const;
  146. // init the members
  147. void Init()
  148. {
  149. m_isCompiled = false;
  150. m_Matches = NULL;
  151. m_nMatches = 0;
  152. }
  153. // free the RE if compiled
  154. void Free()
  155. {
  156. if ( IsValid() )
  157. {
  158. wx_regfree(&m_RegEx);
  159. }
  160. delete m_Matches;
  161. }
  162. // free the RE if any and reinit the members
  163. void Reinit()
  164. {
  165. Free();
  166. Init();
  167. }
  168. // compiled RE
  169. regex_t m_RegEx;
  170. // the subexpressions data
  171. wxRegExMatches *m_Matches;
  172. size_t m_nMatches;
  173. // true if m_RegEx is valid
  174. bool m_isCompiled;
  175. };
  176. // ============================================================================
  177. // implementation
  178. // ============================================================================
  179. // ----------------------------------------------------------------------------
  180. // wxRegExImpl
  181. // ----------------------------------------------------------------------------
  182. wxRegExImpl::wxRegExImpl()
  183. {
  184. Init();
  185. }
  186. wxRegExImpl::~wxRegExImpl()
  187. {
  188. Free();
  189. }
  190. wxString wxRegExImpl::GetErrorMsg(int errorcode, bool badconv) const
  191. {
  192. #ifdef WXREGEX_CONVERT_TO_MB
  193. // currently only needed when using system library in Unicode mode
  194. if ( badconv )
  195. {
  196. return _("conversion to 8-bit encoding failed");
  197. }
  198. #else
  199. // 'use' badconv to avoid a compiler warning
  200. (void)badconv;
  201. #endif
  202. wxString szError;
  203. // first get the string length needed
  204. int len = wx_regerror(errorcode, &m_RegEx, NULL, 0);
  205. if ( len > 0 )
  206. {
  207. char* szcmbError = new char[++len];
  208. (void)wx_regerror(errorcode, &m_RegEx, szcmbError, len);
  209. szError = wxConvLibc.cMB2WX(szcmbError);
  210. delete [] szcmbError;
  211. }
  212. else // regerror() returned 0
  213. {
  214. szError = _("unknown error");
  215. }
  216. return szError;
  217. }
  218. bool wxRegExImpl::Compile(const wxString& expr, int flags)
  219. {
  220. Reinit();
  221. #ifdef WX_NO_REGEX_ADVANCED
  222. # define FLAVORS wxRE_BASIC
  223. #else
  224. # define FLAVORS (wxRE_ADVANCED | wxRE_BASIC)
  225. wxASSERT_MSG( (flags & FLAVORS) != FLAVORS,
  226. wxT("incompatible flags in wxRegEx::Compile") );
  227. #endif
  228. wxASSERT_MSG( !(flags & ~(FLAVORS | wxRE_ICASE | wxRE_NOSUB | wxRE_NEWLINE)),
  229. wxT("unrecognized flags in wxRegEx::Compile") );
  230. // translate our flags to regcomp() ones
  231. int flagsRE = 0;
  232. if ( !(flags & wxRE_BASIC) )
  233. {
  234. #ifndef WX_NO_REGEX_ADVANCED
  235. if (flags & wxRE_ADVANCED)
  236. flagsRE |= REG_ADVANCED;
  237. else
  238. #endif
  239. flagsRE |= REG_EXTENDED;
  240. }
  241. if ( flags & wxRE_ICASE )
  242. flagsRE |= REG_ICASE;
  243. if ( flags & wxRE_NOSUB )
  244. flagsRE |= REG_NOSUB;
  245. if ( flags & wxRE_NEWLINE )
  246. flagsRE |= REG_NEWLINE;
  247. // compile it
  248. #ifdef WXREGEX_USING_BUILTIN
  249. bool conv = true;
  250. // FIXME-UTF8: use wc_str() after removing ANSI build
  251. int errorcode = wx_re_comp(&m_RegEx, expr.c_str(), expr.length(), flagsRE);
  252. #else
  253. // FIXME-UTF8: this is potentially broken, we shouldn't even try it
  254. // and should always use builtin regex library (or PCRE?)
  255. const wxWX2MBbuf conv = expr.mbc_str();
  256. int errorcode = conv ? regcomp(&m_RegEx, conv, flagsRE) : REG_BADPAT;
  257. #endif
  258. if ( errorcode )
  259. {
  260. wxLogError(_("Invalid regular expression '%s': %s"),
  261. expr.c_str(), GetErrorMsg(errorcode, !conv).c_str());
  262. m_isCompiled = false;
  263. }
  264. else // ok
  265. {
  266. // don't allocate the matches array now, but do it later if necessary
  267. if ( flags & wxRE_NOSUB )
  268. {
  269. // we don't need it at all
  270. m_nMatches = 0;
  271. }
  272. else
  273. {
  274. // we will alloc the array later (only if really needed) but count
  275. // the number of sub-expressions in the regex right now
  276. // there is always one for the whole expression
  277. m_nMatches = 1;
  278. // and some more for bracketed subexperessions
  279. for ( const wxChar *cptr = expr.c_str(); *cptr; cptr++ )
  280. {
  281. if ( *cptr == wxT('\\') )
  282. {
  283. // in basic RE syntax groups are inside \(...\)
  284. if ( *++cptr == wxT('(') && (flags & wxRE_BASIC) )
  285. {
  286. m_nMatches++;
  287. }
  288. }
  289. else if ( *cptr == wxT('(') && !(flags & wxRE_BASIC) )
  290. {
  291. // we know that the previous character is not an unquoted
  292. // backslash because it would have been eaten above, so we
  293. // have a bare '(' and this indicates a group start for the
  294. // extended syntax. '(?' is used for extensions by perl-
  295. // like REs (e.g. advanced), and is not valid for POSIX
  296. // extended, so ignore them always.
  297. if ( cptr[1] != wxT('?') )
  298. m_nMatches++;
  299. }
  300. }
  301. }
  302. m_isCompiled = true;
  303. }
  304. return IsValid();
  305. }
  306. #ifdef WXREGEX_USING_RE_SEARCH
  307. // On GNU, regexec is implemented as a wrapper around re_search. re_search
  308. // requires a length parameter which the POSIX regexec does not have,
  309. // therefore regexec must do a strlen on the search text each time it is
  310. // called. This can drastically affect performance when matching is done in
  311. // a loop along a string, such as during a search and replace. Therefore if
  312. // re_search is detected by configure, it is used directly.
  313. //
  314. static int ReSearch(const regex_t *preg,
  315. const char *text,
  316. size_t len,
  317. re_registers *matches,
  318. int eflags)
  319. {
  320. regex_t *pattern = const_cast<regex_t*>(preg);
  321. pattern->not_bol = (eflags & REG_NOTBOL) != 0;
  322. pattern->not_eol = (eflags & REG_NOTEOL) != 0;
  323. pattern->regs_allocated = REGS_FIXED;
  324. int ret = re_search(pattern, text, len, 0, len, matches);
  325. return ret >= 0 ? 0 : REG_NOMATCH;
  326. }
  327. #endif // WXREGEX_USING_RE_SEARCH
  328. bool wxRegExImpl::Matches(const wxRegChar *str,
  329. int flags
  330. WXREGEX_IF_NEED_LEN(size_t len)) const
  331. {
  332. wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
  333. // translate our flags to regexec() ones
  334. wxASSERT_MSG( !(flags & ~(wxRE_NOTBOL | wxRE_NOTEOL)),
  335. wxT("unrecognized flags in wxRegEx::Matches") );
  336. int flagsRE = 0;
  337. if ( flags & wxRE_NOTBOL )
  338. flagsRE |= REG_NOTBOL;
  339. if ( flags & wxRE_NOTEOL )
  340. flagsRE |= REG_NOTEOL;
  341. // allocate matches array if needed
  342. wxRegExImpl *self = wxConstCast(this, wxRegExImpl);
  343. if ( !m_Matches && m_nMatches )
  344. {
  345. self->m_Matches = new wxRegExMatches(m_nMatches);
  346. }
  347. wxRegExMatches::match_type matches = m_Matches ? m_Matches->get() : NULL;
  348. // do match it
  349. #if defined WXREGEX_USING_BUILTIN
  350. int rc = wx_re_exec(&self->m_RegEx, str, len, NULL, m_nMatches, matches, flagsRE);
  351. #elif defined WXREGEX_USING_RE_SEARCH
  352. int rc = str ? ReSearch(&self->m_RegEx, str, len, matches, flagsRE) : REG_BADPAT;
  353. #else
  354. int rc = str ? regexec(&self->m_RegEx, str, m_nMatches, matches, flagsRE) : REG_BADPAT;
  355. #endif
  356. switch ( rc )
  357. {
  358. case 0:
  359. // matched successfully
  360. return true;
  361. default:
  362. // an error occurred
  363. wxLogError(_("Failed to find match for regular expression: %s"),
  364. GetErrorMsg(rc, !str).c_str());
  365. // fall through
  366. case REG_NOMATCH:
  367. // no match
  368. return false;
  369. }
  370. }
  371. bool wxRegExImpl::GetMatch(size_t *start, size_t *len, size_t index) const
  372. {
  373. wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
  374. wxCHECK_MSG( m_nMatches, false, wxT("can't use with wxRE_NOSUB") );
  375. wxCHECK_MSG( m_Matches, false, wxT("must call Matches() first") );
  376. wxCHECK_MSG( index < m_nMatches, false, wxT("invalid match index") );
  377. if ( start )
  378. *start = m_Matches->Start(index);
  379. if ( len )
  380. *len = m_Matches->End(index) - m_Matches->Start(index);
  381. return true;
  382. }
  383. size_t wxRegExImpl::GetMatchCount() const
  384. {
  385. wxCHECK_MSG( IsValid(), 0, wxT("must successfully Compile() first") );
  386. wxCHECK_MSG( m_nMatches, 0, wxT("can't use with wxRE_NOSUB") );
  387. return m_nMatches;
  388. }
  389. int wxRegExImpl::Replace(wxString *text,
  390. const wxString& replacement,
  391. size_t maxMatches) const
  392. {
  393. wxCHECK_MSG( text, wxNOT_FOUND, wxT("NULL text in wxRegEx::Replace") );
  394. wxCHECK_MSG( IsValid(), wxNOT_FOUND, wxT("must successfully Compile() first") );
  395. // the input string
  396. #ifndef WXREGEX_CONVERT_TO_MB
  397. const wxChar *textstr = text->c_str();
  398. size_t textlen = text->length();
  399. #else
  400. const wxWX2MBbuf textstr = WXREGEX_CHAR(*text);
  401. if (!textstr)
  402. {
  403. wxLogError(_("Failed to find match for regular expression: %s"),
  404. GetErrorMsg(0, true).c_str());
  405. return 0;
  406. }
  407. size_t textlen = strlen(textstr);
  408. text->clear();
  409. #endif
  410. // the replacement text
  411. wxString textNew;
  412. // the result, allow 25% extra
  413. wxString result;
  414. result.reserve(5 * textlen / 4);
  415. // attempt at optimization: don't iterate over the string if it doesn't
  416. // contain back references at all
  417. bool mayHaveBackrefs =
  418. replacement.find_first_of(wxT("\\&")) != wxString::npos;
  419. if ( !mayHaveBackrefs )
  420. {
  421. textNew = replacement;
  422. }
  423. // the position where we start looking for the match
  424. size_t matchStart = 0;
  425. // number of replacement made: we won't make more than maxMatches of them
  426. // (unless maxMatches is 0 which doesn't limit the number of replacements)
  427. size_t countRepl = 0;
  428. // note that "^" shouldn't match after the first call to Matches() so we
  429. // use wxRE_NOTBOL to prevent it from happening
  430. while ( (!maxMatches || countRepl < maxMatches) &&
  431. Matches(
  432. #ifndef WXREGEX_CONVERT_TO_MB
  433. textstr + matchStart,
  434. #else
  435. textstr.data() + matchStart,
  436. #endif
  437. countRepl ? wxRE_NOTBOL : 0
  438. WXREGEX_IF_NEED_LEN(textlen - matchStart)) )
  439. {
  440. // the string possibly contains back references: we need to calculate
  441. // the replacement text anew after each match
  442. if ( mayHaveBackrefs )
  443. {
  444. mayHaveBackrefs = false;
  445. textNew.clear();
  446. textNew.reserve(replacement.length());
  447. for ( const wxChar *p = replacement.c_str(); *p; p++ )
  448. {
  449. size_t index = (size_t)-1;
  450. if ( *p == wxT('\\') )
  451. {
  452. if ( wxIsdigit(*++p) )
  453. {
  454. // back reference
  455. wxChar *end;
  456. index = (size_t)wxStrtoul(p, &end, 10);
  457. p = end - 1; // -1 to compensate for p++ in the loop
  458. }
  459. //else: backslash used as escape character
  460. }
  461. else if ( *p == wxT('&') )
  462. {
  463. // treat this as "\0" for compatbility with ed and such
  464. index = 0;
  465. }
  466. // do we have a back reference?
  467. if ( index != (size_t)-1 )
  468. {
  469. // yes, get its text
  470. size_t start, len;
  471. if ( !GetMatch(&start, &len, index) )
  472. {
  473. wxFAIL_MSG( wxT("invalid back reference") );
  474. // just eat it...
  475. }
  476. else
  477. {
  478. textNew += wxString(
  479. #ifndef WXREGEX_CONVERT_TO_MB
  480. textstr
  481. #else
  482. textstr.data()
  483. #endif
  484. + matchStart + start,
  485. *wxConvCurrent, len);
  486. mayHaveBackrefs = true;
  487. }
  488. }
  489. else // ordinary character
  490. {
  491. textNew += *p;
  492. }
  493. }
  494. }
  495. size_t start, len;
  496. if ( !GetMatch(&start, &len) )
  497. {
  498. // we did have match as Matches() returned true above!
  499. wxFAIL_MSG( wxT("internal logic error in wxRegEx::Replace") );
  500. return wxNOT_FOUND;
  501. }
  502. // an insurance against implementations that don't grow exponentially
  503. // to ensure building the result takes linear time
  504. if (result.capacity() < result.length() + start + textNew.length())
  505. result.reserve(2 * result.length());
  506. #ifndef WXREGEX_CONVERT_TO_MB
  507. result.append(*text, matchStart, start);
  508. #else
  509. result.append(wxString(textstr.data() + matchStart, *wxConvCurrent, start));
  510. #endif
  511. matchStart += start;
  512. result.append(textNew);
  513. countRepl++;
  514. matchStart += len;
  515. }
  516. #ifndef WXREGEX_CONVERT_TO_MB
  517. result.append(*text, matchStart, wxString::npos);
  518. #else
  519. result.append(wxString(textstr.data() + matchStart, *wxConvCurrent));
  520. #endif
  521. *text = result;
  522. return countRepl;
  523. }
  524. // ----------------------------------------------------------------------------
  525. // wxRegEx: all methods are mostly forwarded to wxRegExImpl
  526. // ----------------------------------------------------------------------------
  527. void wxRegEx::Init()
  528. {
  529. m_impl = NULL;
  530. }
  531. wxRegEx::~wxRegEx()
  532. {
  533. delete m_impl;
  534. }
  535. bool wxRegEx::Compile(const wxString& expr, int flags)
  536. {
  537. if ( !m_impl )
  538. {
  539. m_impl = new wxRegExImpl;
  540. }
  541. if ( !m_impl->Compile(expr, flags) )
  542. {
  543. // error message already given in wxRegExImpl::Compile
  544. wxDELETE(m_impl);
  545. return false;
  546. }
  547. return true;
  548. }
  549. bool wxRegEx::Matches(const wxString& str, int flags) const
  550. {
  551. wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
  552. return m_impl->Matches(WXREGEX_CHAR(str), flags
  553. WXREGEX_IF_NEED_LEN(str.length()));
  554. }
  555. bool wxRegEx::GetMatch(size_t *start, size_t *len, size_t index) const
  556. {
  557. wxCHECK_MSG( IsValid(), false, wxT("must successfully Compile() first") );
  558. return m_impl->GetMatch(start, len, index);
  559. }
  560. wxString wxRegEx::GetMatch(const wxString& text, size_t index) const
  561. {
  562. size_t start, len;
  563. if ( !GetMatch(&start, &len, index) )
  564. return wxEmptyString;
  565. return text.Mid(start, len);
  566. }
  567. size_t wxRegEx::GetMatchCount() const
  568. {
  569. wxCHECK_MSG( IsValid(), 0, wxT("must successfully Compile() first") );
  570. return m_impl->GetMatchCount();
  571. }
  572. int wxRegEx::Replace(wxString *pattern,
  573. const wxString& replacement,
  574. size_t maxMatches) const
  575. {
  576. wxCHECK_MSG( IsValid(), wxNOT_FOUND, wxT("must successfully Compile() first") );
  577. return m_impl->Replace(pattern, replacement, maxMatches);
  578. }
  579. #endif // wxUSE_REGEX