PageRenderTime 51ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/tags/harbour-2.0.0/src/rtl/hbregex.c

#
C | 569 lines | 435 code | 46 blank | 88 comment | 73 complexity | 7c3fd8959a3cddcd0834daa8784be7dd MD5 | raw file
Possible License(s): AGPL-1.0, BSD-3-Clause, CC-BY-SA-3.0, LGPL-3.0, GPL-2.0, LGPL-2.0, LGPL-2.1
  1. /*
  2. * $Id: hbregex.c 13174 2009-12-09 14:33:48Z druzus $
  3. */
  4. /*
  5. * Harbour Project source code:
  6. *
  7. *
  8. * Copyright 2007 Przemyslaw Czerpak <druzus / at / priv.onet.pl>
  9. * www - http://www.harbour-project.org
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation; either version 2, or (at your option)
  14. * any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this software; see the file COPYING. If not, write to
  23. * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
  24. * Boston, MA 02111-1307 USA (or visit the web site http://www.gnu.org/).
  25. *
  26. * As a special exception, the Harbour Project gives permission for
  27. * additional uses of the text contained in its release of Harbour.
  28. *
  29. * The exception is that, if you link the Harbour libraries with other
  30. * files to produce an executable, this does not by itself cause the
  31. * resulting executable to be covered by the GNU General Public License.
  32. * Your use of that executable is in no way restricted on account of
  33. * linking the Harbour library code into it.
  34. *
  35. * This exception does not however invalidate any other reasons why
  36. * the executable file might be covered by the GNU General Public License.
  37. *
  38. * This exception applies only to the code released by the Harbour
  39. * Project under the name Harbour. If you copy code from other
  40. * Harbour Project or Free Software Foundation releases into a copy of
  41. * Harbour, as the General Public License permits, the exception does
  42. * not apply to the code that you add in this way. To avoid misleading
  43. * anyone as to the status of such modified files, you must delete
  44. * this exception notice from them.
  45. *
  46. * If you write modifications of your own for Harbour, it is your choice
  47. * whether to permit this exception to apply to your modifications.
  48. * If you do not wish that, delete this exception notice.
  49. *
  50. */
  51. #define _HB_REGEX_INTERNAL_
  52. #include "hbregex.h"
  53. #include "hbapiitm.h"
  54. #include "hbapierr.h"
  55. #include "hbinit.h"
  56. static void hb_regfree( PHB_REGEX pRegEx )
  57. {
  58. #if defined( HB_HAS_PCRE )
  59. ( pcre_free )( pRegEx->re_pcre );
  60. #elif defined( HB_POSIX_REGEX )
  61. regfree( &pRegEx->reg );
  62. #else
  63. HB_SYMBOL_UNUSED( pRegEx );
  64. #endif
  65. }
  66. static int hb_regcomp( PHB_REGEX pRegEx, const char * szRegEx )
  67. {
  68. #if defined( HB_HAS_PCRE )
  69. const unsigned char * pCharTable = NULL;
  70. const char *szError = NULL;
  71. int iErrOffset = 0;
  72. int iCFlags = ( ( pRegEx->iFlags & HBREG_ICASE ) ? PCRE_CASELESS : 0 ) |
  73. ( ( pRegEx->iFlags & HBREG_NEWLINE ) ? PCRE_MULTILINE : 0 ) |
  74. ( ( pRegEx->iFlags & HBREG_DOTALL ) ? PCRE_DOTALL : 0 );
  75. pRegEx->iEFlags = ( ( pRegEx->iFlags & HBREG_NOTBOL ) ? PCRE_NOTBOL : 0 ) |
  76. ( ( pRegEx->iFlags & HBREG_NOTEOL ) ? PCRE_NOTEOL : 0 );
  77. pRegEx->re_pcre = pcre_compile( szRegEx, iCFlags, &szError,
  78. &iErrOffset, pCharTable );
  79. return pRegEx->re_pcre ? 0 : -1;
  80. #elif defined( HB_POSIX_REGEX )
  81. int iCFlags = REG_EXTENDED |
  82. ( ( pRegEx->iFlags & HBREG_ICASE ) ? REG_ICASE : 0 ) |
  83. ( ( pRegEx->iFlags & HBREG_NEWLINE ) ? REG_NEWLINE : 0 ) |
  84. ( ( pRegEx->iFlags & HBREG_NOSUB ) ? REG_NOSUB : 0 );
  85. pRegEx->iEFlags = ( ( pRegEx->iFlags & HBREG_NOTBOL ) ? REG_NOTBOL : 0 ) |
  86. ( ( pRegEx->iFlags & HBREG_NOTEOL ) ? REG_NOTEOL : 0 );
  87. return regcomp( &pRegEx->reg, szRegEx, iCFlags );
  88. #else
  89. HB_SYMBOL_UNUSED( pRegEx );
  90. HB_SYMBOL_UNUSED( szRegEx );
  91. return -1;
  92. #endif
  93. }
  94. static int hb_regexec( PHB_REGEX pRegEx, const char * szString, ULONG ulLen,
  95. int iMatches, HB_REGMATCH * aMatches )
  96. {
  97. #if defined( HB_HAS_PCRE )
  98. int iResult, i;
  99. iResult = pcre_exec( pRegEx->re_pcre, NULL /* pcre_extra */,
  100. szString, ulLen, 0 /* startoffset */,
  101. pRegEx->iEFlags, aMatches, HB_REGMATCH_SIZE( iMatches ) );
  102. if( iResult == 0 )
  103. {
  104. for( i = 0; i < iMatches; i++ )
  105. {
  106. if( HB_REGMATCH_EO( aMatches, i ) != -1 )
  107. iResult = i + 1;
  108. }
  109. }
  110. return iResult;
  111. #elif defined( HB_POSIX_REGEX )
  112. char * szBuffer = NULL;
  113. int iResult, i;
  114. if( szString[ ulLen ] != 0 )
  115. {
  116. szBuffer = hb_strndup( szString, ulLen );
  117. szString = szBuffer;
  118. }
  119. for( i = 0; i < iMatches; i++ )
  120. HB_REGMATCH_EO( aMatches, i ) = -1;
  121. iResult = regexec( &pRegEx->reg, szString, iMatches, aMatches, pRegEx->iEFlags );
  122. if( iResult == 0 )
  123. {
  124. for( i = 0; i < iMatches; i++ )
  125. {
  126. if( HB_REGMATCH_EO( aMatches, i ) != -1 )
  127. iResult = i + 1;
  128. }
  129. }
  130. else
  131. iResult = -1;
  132. if( szBuffer )
  133. hb_xfree( szBuffer );
  134. return iResult;
  135. #else
  136. HB_SYMBOL_UNUSED( pRegEx );
  137. HB_SYMBOL_UNUSED( szString );
  138. HB_SYMBOL_UNUSED( ulLen );
  139. HB_SYMBOL_UNUSED( iMatches );
  140. HB_SYMBOL_UNUSED( aMatches );
  141. return -1;
  142. #endif
  143. }
  144. HB_FUNC( HB_REGEXCOMP )
  145. {
  146. ULONG ulLen = hb_parclen( 1 );
  147. if( ulLen == 0 )
  148. hb_errRT_BASE_SubstR( EG_ARG, 3012, "Wrong parameter count/type",
  149. HB_ERR_FUNCNAME, HB_ERR_ARGS_BASEPARAMS );
  150. else
  151. {
  152. int iFlags = HBREG_EXTENDED;
  153. PHB_REGEX pRegEx;
  154. if( !hb_parldef( 2, 1 ) )
  155. iFlags |= HBREG_ICASE;
  156. if( hb_parl( 3 ) )
  157. iFlags |= HBREG_NEWLINE;
  158. pRegEx = hb_regexCompile( hb_parc( 1 ), ulLen, iFlags );
  159. if( pRegEx )
  160. {
  161. pRegEx->fFree = FALSE;
  162. hb_retptrGC( pRegEx );
  163. }
  164. }
  165. }
  166. HB_FUNC( HB_ISREGEX )
  167. {
  168. hb_retl( hb_regexIs( hb_param( 1, HB_IT_ANY ) ) );
  169. }
  170. HB_FUNC( HB_ATX )
  171. {
  172. const char * pszString;
  173. ULONG ulLen, ulStart, ulEnd;
  174. PHB_REGEX pRegEx;
  175. PHB_ITEM pString;
  176. int iPCount = hb_pcount();
  177. pString = hb_param( 2, HB_IT_STRING );
  178. if( !pString )
  179. {
  180. hb_errRT_BASE_SubstR( EG_ARG, 3012, "Wrong parameters",
  181. HB_ERR_FUNCNAME, HB_ERR_ARGS_BASEPARAMS );
  182. return;
  183. }
  184. pszString = hb_itemGetCPtr( pString );
  185. ulLen = hb_itemGetCLen( pString );
  186. pRegEx = hb_regexGet( hb_param( 1, HB_IT_ANY ),
  187. !hb_parldef( 3, 1 ) ? HBREG_ICASE : 0 );
  188. if( !pRegEx )
  189. return;
  190. ulStart = hb_parnl( 4 );
  191. ulEnd = hb_parnl( 5 );
  192. if( ulLen && ulStart <= ulLen && ulStart <= ulEnd )
  193. {
  194. HB_REGMATCH aMatches[ HB_REGMATCH_SIZE( 1 ) ];
  195. if( ulEnd < ulLen )
  196. ulLen = ulEnd;
  197. if( ulStart )
  198. {
  199. --ulStart;
  200. ulLen -= ulStart;
  201. }
  202. if( hb_regexec( pRegEx, pszString + ulStart, ulLen, 1, aMatches ) > 0 )
  203. {
  204. ulStart += HB_REGMATCH_SO( aMatches, 0 ) + 1;
  205. ulLen = HB_REGMATCH_EO( aMatches, 0 ) - HB_REGMATCH_SO( aMatches, 0 );
  206. hb_retclen( pszString + ulStart - 1, ulLen );
  207. }
  208. else
  209. ulStart = ulLen = 0;
  210. }
  211. else
  212. ulStart = ulLen = 0;
  213. hb_regexFree( pRegEx );
  214. if( iPCount > 3 )
  215. {
  216. hb_stornl( ulStart, 4 );
  217. if( iPCount > 4 )
  218. hb_stornl( ulLen, 5 );
  219. }
  220. }
  221. static BOOL hb_regex( int iRequest )
  222. {
  223. HB_REGMATCH aMatches[ HB_REGMATCH_SIZE( REGEX_MAX_GROUPS ) ];
  224. PHB_ITEM pRetArray, pMatch, pString;
  225. int i, iMatches, iMaxMatch;
  226. BOOL fResult = FALSE;
  227. PHB_REGEX pRegEx;
  228. const char * pszString;
  229. ULONG ulLen;
  230. pString = hb_param( 2, HB_IT_STRING );
  231. if( !pString )
  232. {
  233. hb_errRT_BASE_SubstR( EG_ARG, 3012, "Wrong parameters",
  234. HB_ERR_FUNCNAME, HB_ERR_ARGS_BASEPARAMS );
  235. return FALSE;
  236. }
  237. pRegEx = hb_regexGet( hb_param( 1, HB_IT_ANY ),
  238. ( !hb_parldef( 3, 1 ) ? HBREG_ICASE : 0 ) |
  239. ( hb_parl( 4 ) ? HBREG_NEWLINE : 0 ) );
  240. if( !pRegEx )
  241. return FALSE;
  242. pszString = hb_itemGetCPtr( pString );
  243. ulLen = hb_itemGetCLen( pString );
  244. iMaxMatch = iRequest == 0 || iRequest == 4 || iRequest == 5 ?
  245. REGEX_MAX_GROUPS : 1;
  246. iMatches = hb_regexec( pRegEx, pszString, ulLen, iMaxMatch, aMatches );
  247. if( iMatches > 0 )
  248. {
  249. switch( iRequest )
  250. {
  251. case 0:
  252. pRetArray = hb_itemArrayNew( iMatches );
  253. for( i = 0; i < iMatches; i++ )
  254. {
  255. if( HB_REGMATCH_EO( aMatches, i ) > -1 )
  256. hb_arraySetCL( pRetArray, i + 1,
  257. pszString + HB_REGMATCH_SO( aMatches, i ),
  258. HB_REGMATCH_EO( aMatches, i ) -
  259. HB_REGMATCH_SO( aMatches, i ) );
  260. else
  261. hb_arraySetCL( pRetArray, i + 1, NULL, 0 );
  262. }
  263. hb_itemReturnRelease( pRetArray );
  264. fResult = TRUE;
  265. break;
  266. case 1: /* LIKE */
  267. fResult = HB_REGMATCH_SO( aMatches, 0 ) == 0 &&
  268. ( ULONG ) HB_REGMATCH_EO( aMatches, 0 ) == ulLen;
  269. break;
  270. case 2: /* MATCH ( HAS ) */
  271. fResult = TRUE;
  272. break;
  273. case 3: /* SPLIT */
  274. iMaxMatch = hb_parni( 5 );
  275. pRetArray = hb_itemArrayNew( 0 );
  276. pMatch = hb_itemNew( NULL );
  277. iMatches = 0;
  278. do
  279. {
  280. hb_itemPutCL( pMatch, pszString, HB_REGMATCH_SO( aMatches, 0 ) );
  281. hb_arrayAddForward( pRetArray, pMatch );
  282. ulLen -= HB_REGMATCH_EO( aMatches, 0 );
  283. pszString += HB_REGMATCH_EO( aMatches, 0 );
  284. iMatches++;
  285. }
  286. while( HB_REGMATCH_EO( aMatches, 0 ) > 0 && ulLen &&
  287. ( iMaxMatch == 0 || iMatches < iMaxMatch ) &&
  288. hb_regexec( pRegEx, pszString, ulLen, 1, aMatches ) > 0 );
  289. /* last match must be done also in case that pszString is empty;
  290. this would mean an empty split field at the end of the string */
  291. /* if( ulLen ) */
  292. {
  293. hb_itemPutCL( pMatch, pszString, ulLen );
  294. hb_arrayAddForward( pRetArray, pMatch );
  295. }
  296. hb_itemRelease( pMatch );
  297. hb_itemReturnRelease( pRetArray );
  298. fResult = TRUE;
  299. break;
  300. case 4: /* results AND positions */
  301. pRetArray = hb_itemArrayNew( iMatches );
  302. for( i = 0; i < iMatches; i++ )
  303. {
  304. int iSO = HB_REGMATCH_SO( aMatches, i ),
  305. iEO = HB_REGMATCH_EO( aMatches, i );
  306. pMatch = hb_arrayGetItemPtr( pRetArray, i + 1 );
  307. hb_arrayNew( pMatch, 3 );
  308. if( iEO != -1 )
  309. {
  310. /* matched string */
  311. hb_arraySetCL( pMatch, 1, pszString + iSO, iEO - iSO );
  312. /* begin of match */
  313. hb_arraySetNI( pMatch, 2, iSO + 1 );
  314. /* End of match */
  315. hb_arraySetNI( pMatch, 3, iEO );
  316. }
  317. else
  318. {
  319. hb_arraySetCL( pMatch, 1, NULL, 0 );
  320. hb_arraySetNI( pMatch, 2, 0 );
  321. hb_arraySetNI( pMatch, 3, 0 );
  322. }
  323. }
  324. hb_itemReturnRelease( pRetArray );
  325. fResult = TRUE;
  326. break;
  327. case 5: /* _ALL_ results AND positions */
  328. {
  329. PHB_ITEM pAtxArray;
  330. int iMax = hb_parni( 5 ); /* max nuber of matches I want, 0 = unlimited */
  331. int iGetMatch = hb_parni( 6 ); /* Gets if want only one single match or a sub-match */
  332. BOOL fOnlyMatch = hb_parldef( 7, 1 ); /* if TRUE returns only matches and sub-matches, not positions */
  333. ULONG ulOffSet = 0;
  334. int iCount = 0;
  335. int iSO, iEO;
  336. /* Set new array */
  337. pRetArray = hb_itemArrayNew( 0 );
  338. do
  339. {
  340. /* If I want all matches */
  341. if( iGetMatch == 0 || /* Check boundaries */
  342. ( iGetMatch < 0 || iGetMatch > iMatches ) )
  343. {
  344. pAtxArray = hb_itemArrayNew( iMatches );
  345. for( i = 0; i < iMatches; i++ )
  346. {
  347. iSO = HB_REGMATCH_SO( aMatches, i );
  348. iEO = HB_REGMATCH_EO( aMatches, i );
  349. pMatch = hb_arrayGetItemPtr( pAtxArray, i + 1 );
  350. if( !fOnlyMatch )
  351. {
  352. hb_arrayNew( pMatch, 3 );
  353. if( iEO != -1 )
  354. {
  355. /* matched string */
  356. hb_arraySetCL( pMatch, 1, pszString + iSO, iEO - iSO );
  357. /* begin of match */
  358. hb_arraySetNI( pMatch, 2, ulOffSet + iSO + 1 );
  359. /* End of match */
  360. hb_arraySetNI( pMatch, 3, ulOffSet + iEO );
  361. }
  362. else
  363. {
  364. hb_arraySetCL( pMatch, 1, NULL, 0 );
  365. hb_arraySetNI( pMatch, 2, 0 );
  366. hb_arraySetNI( pMatch, 3, 0 );
  367. }
  368. }
  369. else
  370. {
  371. if( iEO != -1 )
  372. /* matched string */
  373. hb_itemPutCL( pMatch, pszString + iSO, iEO - iSO );
  374. else
  375. hb_itemPutC( pMatch, NULL );
  376. }
  377. }
  378. hb_arrayAddForward( pRetArray, pAtxArray );
  379. hb_itemRelease( pAtxArray );
  380. }
  381. else /* Here I get only single matches */
  382. {
  383. i = iGetMatch - 1;
  384. iSO = HB_REGMATCH_SO( aMatches, i );
  385. iEO = HB_REGMATCH_EO( aMatches, i );
  386. pMatch = hb_itemNew( NULL );
  387. if( !fOnlyMatch )
  388. {
  389. hb_arrayNew( pMatch, 3 );
  390. if( iEO != -1 )
  391. {
  392. /* matched string */
  393. hb_arraySetCL( pMatch, 1, pszString + iSO, iEO - iSO );
  394. /* begin of match */
  395. hb_arraySetNI( pMatch, 2, ulOffSet + iSO + 1 );
  396. /* End of match */
  397. hb_arraySetNI( pMatch, 3, ulOffSet + iEO );
  398. }
  399. else
  400. {
  401. hb_arraySetCL( pMatch, 1, NULL, 0 );
  402. hb_arraySetNI( pMatch, 2, 0 );
  403. hb_arraySetNI( pMatch, 3, 0 );
  404. }
  405. }
  406. else
  407. {
  408. if( iEO != -1 )
  409. /* matched string */
  410. hb_itemPutCL( pMatch, pszString + iSO, iEO - iSO );
  411. else
  412. hb_itemPutC( pMatch, NULL );
  413. }
  414. hb_arrayAddForward( pRetArray, pMatch );
  415. hb_itemRelease( pMatch );
  416. }
  417. iEO = HB_REGMATCH_EO( aMatches, 0 );
  418. if( iEO == -1 )
  419. break;
  420. ulLen -= iEO;
  421. pszString += iEO;
  422. ulOffSet += iEO;
  423. iCount++;
  424. }
  425. while( iEO && ulLen && ( iMax == 0 || iCount < iMax ) &&
  426. ( iMatches = hb_regexec( pRegEx, pszString, ulLen, iMaxMatch, aMatches ) ) > 0 );
  427. hb_itemReturnRelease( pRetArray );
  428. fResult = TRUE;
  429. break;
  430. }
  431. }
  432. }
  433. else if( iRequest == 3 )
  434. {
  435. pRetArray = hb_itemArrayNew( 1 );
  436. hb_arraySet( pRetArray, 1, pString );
  437. hb_itemReturnRelease( pRetArray );
  438. fResult = TRUE;
  439. }
  440. hb_regexFree( pRegEx );
  441. return fResult;
  442. }
  /*
   * HB_REGEX(): returns an array holding the match followed by its
   * sub-matches (request 0 of hb_regex()).
   */
  HB_FUNC( HB_REGEX )
  {
     hb_regex( 0 );
  }
  /*
   * HB_REGEXMATCH(): returns .T. when a match was found and .F. otherwise.
   * A TRUE 5th parameter selects the LIKE test, otherwise the HAS test
   * is performed.
   *
   * NOTE: Deprecated compatibility function.
   *       Please use HB_REGEXLIKE() and HB_REGEXHAS() instead.
   */
  HB_FUNC( HB_REGEXMATCH )
  {
     int iRequest;

     if( hb_parl( 5 ) )
        iRequest = 1;  /* LIKE */
     else
        iRequest = 2;  /* HAS */

     hb_retl( hb_regex( iRequest ) );
  }
  455. HB_FUNC( HB_REGEXLIKE )
  456. {
  457. hb_retl( hb_regex( 1 ) );
  458. }
  459. HB_FUNC( HB_REGEXHAS )
  460. {
  461. hb_retl( hb_regex( 2 ) );
  462. }
  /*
   * HB_REGEXSPLIT(): splits the string at every match of the expression
   * and returns the pieces as an array (request 3 of hb_regex()).
   */
  HB_FUNC( HB_REGEXSPLIT )
  {
     hb_regex( 3 );
  }
  /*
   * HB_REGEXATX(): returns an array of { Match, start, end },
   * { Sub-Match, start, end }, ... (request 4 of hb_regex()).
   */
  HB_FUNC( HB_REGEXATX )
  {
     hb_regex( 4 );
  }
  /* 2005-12-16 - Francesco Saverio Giudice
     HB_RegExAll( cRegex, cString, lCaseSensitive, lNewLine, nMaxMatches,
                  nGetMatch, lOnlyMatch ) -> aAllRegexMatches

     This function returns all matches found by a regex search.
     It is a mix of hb_RegEx() and hb_RegExAtX().

     PARAMETERS:
       cRegex         - regex pattern string or precompiled regex
       cString        - the string you want to search
       lCaseSensitive - default = TRUE
       lNewLine       - default = FALSE
       nMaxMatches    - default = 0 (unlimited); limits the number of
                        matches that are returned
       nGetMatch      - default = 0 (all); otherwise only this one entry
                        of Match + Sub-Matches is returned for each match
       lOnlyMatch     - default = TRUE; if TRUE only the matched texts are
                        returned, otherwise also start and end positions
  */
  HB_FUNC( HB_REGEXALL )
  {
     hb_regex( 5 );
  }
  #if defined( HB_HAS_PCRE )

  /* Allocation hook for PCRE: forwards the request to hb_xgrab() */
  static void * hb_pcre_grab( size_t size )
  {
     void * pBlock = hb_xgrab( size );

     return pBlock;
  }

  /* Release hook for PCRE: forwards the block to hb_xfree() */
  static void hb_pcre_free( void * ptr )
  {
     hb_xfree( ptr );
  }

  #endif
  500. HB_CALL_ON_STARTUP_BEGIN( _hb_regex_init_ )
  501. #if defined( HB_HAS_PCRE )
  502. /* Hack to force linking newer PCRE versions not the one included in BCC RTL */
  503. # if defined( __BORLANDC__ )
  504. {
  505. int iUTF8Enabled;
  506. pcre_config( PCRE_CONFIG_UTF8, &iUTF8Enabled );
  507. }
  508. # endif
  509. pcre_malloc = hb_pcre_grab;
  510. pcre_free = hb_pcre_free;
  511. pcre_stack_malloc = hb_pcre_grab;
  512. pcre_stack_free = hb_pcre_free;
  513. #endif
  514. hb_regexInit( hb_regfree, hb_regcomp, hb_regexec );
  515. HB_CALL_ON_STARTUP_END( _hb_regex_init_ )
  516. #if defined( HB_PRAGMA_STARTUP )
  517. #pragma startup _hb_regex_init_
  518. #elif defined( HB_DATASEG_STARTUP )
  519. #define HB_DATASEG_BODY HB_DATASEG_FUNC( _hb_regex_init_ )
  520. #include "hbiniseg.h"
  521. #endif