/samples/M/pcre.m
Objective C | 511 lines | 479 code | 32 blank | 0 comment | 31 complexity | 1b6a4d46b7fd2d6d85bba6f4faba9818 MD5 | raw file
- ;
- ; GT.M PCRE Extension
- ; Copyright (C) 2012 Piotr Koper <piotr.koper@gmail.com>
- ;
- ; This program is free software: you can redistribute it and/or modify
- ; it under the terms of the GNU Affero General Public License as
- ; published by the Free Software Foundation, either version 3 of the
- ; License, or (at your option) any later version.
- ;
- ; This program is distributed in the hope that it will be useful,
- ; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ; GNU Affero General Public License for more details.
- ;
- ; You should have received a copy of the GNU Affero General Public License
- ; along with this program. If not, see <http://www.gnu.org/licenses/>.
- ;
- ; GT.M™ is a trademark of Fidelity Information Services, Inc.
- ; "GT.M™ is a vetted industrial strength, transaction processing application
- ; platform consisting of a key-value database engine optimized for extreme
- ; transaction processing throughput & business continuity."
- ; -- http://sourceforge.net/projects/fis-gtm/
- ; GT.M PCRE Extension
- ;
- ; This extension tries to deliver the best possible PCRE interface for the
- ; M world by providing a support for PCRE with M arrays, stringified parameter
- ; names, simplified API, locales, exceptions and Perl5 Global Match.
- ;
- ; See pcreexamples.m for comprehensive examples on ^pcre routines usage and
- ; beginner level tips on match limits, exception handling and UTF-8 in GT.M.
- ;
- ; Try out the best known book on regular expressions: http://regex.info/
- ; For more information on PCRE see: http://pcre.org/
- ;
- ; Please feel free to contact me if you have any questions or comments,
- ; Piotr Koper <piotr.koper@gmail.com>
- ;
- pcre ;GT.M PCRE Extension
- ;1.0;Initial release;pkoper
- q
- version()
- q $&pcre.version()
- config(name)
- ; name is one of: (case insensitive)
- ; "UTF8", "NEWLINE", "LINK_SIZE", "POSIX_MALLOC_THRESHOLD",
- ; "MATCH_LIMIT", "MATCH_LIMIT_RECURSION", "STACKRECURSE",
- ; "BSR", "UNICODE_PROPERTIES", "JIT", "JITTARGET"
- ;
- d protect
- ;
- n erropt,isstring,s,n,code
- s code=$&pcre.config(.name,.erropt,.isstring,.s,.n)
- s:code $ec=",U"_(-code)_","
- q $s(isstring:s,1:n)
- compile(pattern,options,locale,mlimit,reclimit)
- ; options is case insensitive and optional string with "|" joined:
- ; "ANCHORED", "CASELESS", "DOLLAR_ENDONLY", "DOTALL", "EXTENDED",
- ; "FIRSTLINE", "MULTILINE", "NO_AUTO_CAPTURE", "DUPNAMES",
- ; "UNGREEDY", "BSR_ANYCRLF", "BSR_UNICODE", "JAVASCRIPT_COMPAT",
- ; "NL_ANY", "NL_ANYCRLF", "NL_CR", "NL_CRLF","NL_LF",
- ; "UTF8", "UCP", "NO_UTF8_CHECK"
- ;
- ; locale is an optional Unix locale name used for pcre_maketables(),
- ; cases:
- ; undefined or "":
- ; pcre_maketables() will not be called
- ; "ENV" (case insensitive):
- ; use locale in program environment defined by the
- ; environment variables LANG or LC_*
- ; specified:
- ; "pl_PL.iso88592", "pl_PL.utf8", "C", ...
- ; see locale(1), locale(2) and the output of command:
- ; $ locale -a
- ; Debian tip: use
- ; $ dpkg-reconfigure locales
- ; to enable or set system-wide locale
- ;
- ; mlimit (optional) limits the number of internal matching function
- ; calls in pcre_exec() execution, see PCRE manual for details
- ;
- ; reclimit (optional) limit for the depth of recursion when calling
- ; the internal matching function in a pcre_exec() execution,
- ; see PCRE manual for details
- ;
- d protect
- ;
- n erropt,ref,err,erroffset,code
- s code=$&pcre.compile(.pattern,$g(options),.erropt,.ref,.err,.erroffset,$g(locale),$g(mlimit,0),$g(reclimit,0))
- s:code $ec=",U"_(-code)_","
- q ref
- exec(ref,subject,options,startoffset,length)
- ; options is case insensitive and optional string with "|" joined:
- ; "ANCHORED", "BSR_ANYCRLF", "BSR_UNICODE",
- ; "NL_ANY", "NL_ANYCRLF", "NL_CR", "NL_CRLF", "NL_LF",
- ; "NOTBOL", "NOTEOL", "NOTEMPTY", "NOTEMPTY_ATSTART",
- ; "NO_START_OPTIMIZE", "NO_UTF8_CHECK",
- ; "PARTIAL_SOFT", "PARTIAL_HARD"
- ;
- ; startoffset is in octets, starts with 1 (like in M) (optional)
- ;
- ; length is subject length in octets, not chars (optional)
- ;
- d protect
- ;
- n erropt,code,start
- s start=$g(startoffset,1)-1
- s code=$&pcre.exec(.ref,.subject,$g(length,$zl(subject)),start,$g(options),.erropt)
- s:code<0 $ec=",U"_(-code)_","
- q code
- ovector(ref,i) ; return i-element from ovector
- d protect
- ;
- n n,code
- s code=$&pcre.ovector(.ref,.i,.n)
- s:code $ec=",U"_(-code)_","
- ;s $ec=",U123,"
- q n
- ovecsize(ref) ; return ovecsize
- d protect
- ;
- n n,code
- s code=$&pcre.ovecsize(.ref,.n)
- s:code $ec=",U"_(-code)_","
- q n
- fullinfo(ref,name)
- ; name is one of: (case insensitive)
- ; "OPTIONS", "SIZE", "CAPTURECOUNT", "BACKREFMAX", "FIRSTBYTE",
- ; "FIRSTTABLE", "LASTLITERAL", "NAMEENTRYSIZE", "NAMECOUNT",
- ; "STUDYSIZE", "OKPARTIAL", "JCHANGED", "HASCRORLF", "MINLENGTH",
- ; "JIT", "JITSIZE"
- ; for NAME* options see also $$nametable^pcre()
- ;
- d protect
- ;
- n erropt,isstring,s,n,code
- s code=$&pcre.fullinfo(.ref,.name,.erropt,.isstring,.s,.n)
- s:code $ec=",U"_(-code)_","
- q $s(isstring:s,1:n)
- nametable(ref,i,n) ; returns index (n) and name, or { 0, "" } for invalid i
- ; i is indexed from 1
- ;
- d protect
- ;
- n s,code
- s code=$&pcre.nametable(.ref,.i,.n,.s)
- s:code $ec=",U"_(-code)_","
- q s
- substring(ref,i,begin,end)
- s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1)
- ; ovector contains octet indexed data not UNICODE chars, so $ze is used
- q:'begin ""
- q $s($g(o,0):begin_","_end,1:$ze(subject,begin,end))
- store(ref,i,n,o,key) ; same as above but stores captured data in n array
- n begin,end
- s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1)
- q:'begin
- s key=$g(key,i)
- s:o n(key,0)=begin,n(key,1)=end
- s n(key)=$ze(subject,begin,end)
- q
- gstore(ref,i,n,round,byref,o,key) ; store for global match
- n begin,end
- s begin=$$ovector(.ref,i*2)+1,end=$$ovector(.ref,i*2+1)
- q:'begin
- s key=$g(key,i)
- i byref d
- . s:o n(key,round,0)=begin,n(key,round,1)=end
- . s n(key,round)=$ze(subject,begin,end)
- e d
- . s:o n(round,key,0)=begin,n(round,key,1)=end
- . s n(round,key)=$ze(subject,begin,end)
- q
- test(subject,pattern,options,locale,mlimit,reclimit)
- ; see $$compile^pcre for options, locale, mlimit and reclimit
- ;
- d protect
- n ref,l
- s ref=$$compile(.pattern,$g(options),$g(locale),$g(mlimit,0),$g(reclimit,0))
- s l=$$exec(.ref,.subject)
- d free(.ref)
- q l
- match(subject,pattern,match,capture,options,locale,mlimit,reclimit)
- ; see $$compile^pcre for options, locale, mlimit and reclimit
- ;
- ; capture is case insensitive and optional string with "|" joined
- ; names or indexes to be capture
- ;
- ; extended options:
- ; "NAMED_ONLY" - capture only named groups
- ; "OVECTOR" - return additional ovector data
- ;
- d protect
- ;
- n namedonly,ovector,ref,o,l,i,j,s,c,begin
- ;
- s options=$g(options),(namedonly,ovector)=0
- f i=1:1:$l(options,"|") d
- . s o=$zco($p(options,"|",i),"u")
- . i o="NAMED_ONLY" s namedonly=1,$p(options,"|",i)=""
- . i o="OVECTOR" s ovector=1,$p(options,"|",i)=""
- s:namedonly options=options_"|NO_AUTO_CAPTURE"
- ;
- k match
- s ref=$$compile(.pattern,.options,$g(locale),$g(mlimit,0),$g(reclimit,0))
- s l=$$exec(.ref,.subject)
- i $d(capture) d
- . s c="|"_capture_"|"
- . ; ovector indexed data
- . i 'namedonly f i=0:1:l-1 d:c[("|"_i_"|") store(.ref,.i,.match,.ovector)
- . ; named matches data
- . f i=1:1 s s=$$nametable(.ref,.i,.j) q:s="" d:c[("|"_s_"|") store(.ref,.j,.match,.ovector,.s)
- e d
- . i 'namedonly f i=0:1:l-1 d store(.ref,.i,.match,.ovector)
- . f i=1:1 s s=$$nametable(.ref,.i,.j) q:s="" d store(.ref,.j,.match,.ovector,.s)
- d free(.ref)
- q:$q l q
- global(subject,pattern,match,capture,options,locale,mlimit,reclimit)
- ; options is the same as for match^pcre, extended options:
- ; "OVECTOR" - return additional ovector data
- ; "GROUPED" - group the result in match array by pattern groups
- ; "NAMED_ONLY" - capture only named patterns
- ;
- ; see pcredemo.c and pcreccp.cc from PCRE for comments on procedure
- ; for Perl like global matching
- ;
- d protect
- ;
- n ref,c,o,ovector,byref,namedonly,utf8,crlf,start,end,matches,empty,skip,round,i,j,s,n,q
- k match
- ;
- ; determine additional options and remove them before calling the compile^pcre
- s options=$g(options),(ovector,byref,namedonly)=0
- f i=1:1:$l(options,"|") d
- . s o=$zco($p(options,"|",i),"u")
- . i o="NAMED_ONLY" s namedonly=1,$p(options,"|",i)=""
- . i o="GROUPED" s byref=1,$p(options,"|",i)=""
- . i o="OVECTOR" s ovector=1,$p(options,"|",i)=""
- s:namedonly options=options_"|NO_AUTO_CAPTURE"
- ;
- ; compile the pattern
- s ref=$$compile(.pattern,.options,$g(locale),$g(mlimit,0),$g(reclimit,0))
- ;
- s:$d(capture) c="|"_capture_"|"
- s byref=$g(byref,0)
- ;
- ; check pattern options for UTF8 and double char new line
- s o="|"_$$fullinfo(.ref,"OPTIONS")_"|"
- s utf8=$s(o["|UTF8|":1,1:0)
- s crlf=$s(o["|NL_CRLF|":1,o["|NL_ANY|":1,o["|NL_ANYCRLF|":1,1:0)
- ;
- ; if none check the PCRE build options
- i crlf=0 d
- . s o=$$config("NEWLINE")
- . s crlf=$s(o="NL_CRLF":1,o="NL_ANY":1,o="NL_ANYCRLF":1,1:0)
- ;
- s (start,round,i)=1,(empty,skip,q)=0
- s end=$l(subject)+1
- f d q:start>end!q
- . i empty d
- .. s matches=$$exec(.ref,.subject,"NOTEMPTY_ATSTART|ANCHORED",.start) ; unwind this call to optimize
- .. q:matches ; quit this do, leave empty=1, store the matches
- ..
- .. ; advance if no match & clear empty
- .. s start=start+1
- .. i start>end s q=1 q
- ..
- .. ; skip LF if CR was before and CRLF mode
- .. s:crlf&(($ze(subject,start-1)=$c(13))&($ze(subject,start)=$c(10))) start=start+1
- ..
- .. ; skip if in a middle of UTF char
- .. i utf8 f q:start'<end!($zbitand($c(0)_$ze(subject,start),$c(0)_$c(192))=$c(0)_$c(128)) s start=start+1
- ..
- .. ; take into account skipped chars
- .. s skip=1,empty=0
- . e d
- .. s matches=$$exec(.ref,.subject,,.start)
- .. i 'matches s q=1 q
- .
- . q:q
- . i skip s skip=0 q
- .
- . i $d(c) d
- .. ; ovector indexed data
- .. i 'namedonly f i=0:1:matches-1 d:c[("|"_i_"|") gstore(.ref,.i,.match,.round,.byref,.ovector)
- .. ; named matches data
- .. f i=1:1 s s=$$nametable(.ref,.i,.n) q:s="" d:c[("|"_s_"|") gstore(.ref,.n,.match,.round,.byref,.ovector,.s)
- . e d
- .. i 'namedonly f i=0:1:matches-1 d gstore(.ref,.i,.match,.round,.byref,.ovector)
- .. f i=1:1 s s=$$nametable(.ref,.i,.n) q:s="" d gstore(.ref,.n,.match,.round,.byref,.ovector,.s)
- . s round=round+1
- .
- . s start=$$ovector(.ref,1)+1
- . s empty=(($$ovector(.ref,0)+1)=start)
- d free(.ref)
- q:$q round-1 q
- replace(subject,pattern,subst,first,last,options,locale,mlimit,reclimit)
- ; see $$match^pcre and $$compile^pcre for options, locale, mlimit and
- ; reclimit
- ;
- ; subst is a string to replace with all occurrences of matched data
- ; \n (like \1, \2, ..) is a back ref for the n-th captured group
- ; \{name} is back ref for a named captured data
- ; \\ is replaced with \
- ;
- ; first is the n-th match in the subject where the substitution begins,
- ; 1 .. n-1 matches are not substituted
- ; defaults to 1
- ;
- ; last is the n-th match in the subject where the substitution ends,
- ; n+1 .. matches are not substituted
- ; defaults to 0 (no limit)
- ;
- n ref,o,n,i,j,begin,end,offset,backref,boffset,value,s
- s ref=$$compile(.pattern,,$g(locale),$g(mlimit,0),$g(reclimit,0))
- ;
- ; prepare back reference stack
- d global^pcre(.subst,"\\(?:(?<ref>(?:\d+|\\))|{(?<ref>[^}]+)})",.backref,,"ovector|dupnames")
- ;
- s options=$g(options)_"|ovector"
- ; silently remove "NAMED_ONLY" and "GROUPPED" options
- f i=1:1:$l(options,"|") d
- . s o=$zco($p(options,"|",i),"u")
- . s:o="NAMED_ONLY"!(o="GROUPED") $p(options,"|",i)=""
- q:'$$global(.subject,.pattern,.n,,.options,$g(locale),$g(mlimit,0),$g(reclimit,0)) subject
- ;
- ; perform the substitution on matched subject parts
- s first=$g(first,1),last=$g(last,0)
- s offset=0,i=""
- f s i=$o(n(i)) q:i="" d:i'<first q:last>0&(i'<last)
- .
- . ; replace back refs in subst (s) with captured data
- . s s=subst,boffset=0,j=""
- . f s j=$o(backref(j)) q:j="" d
- ..
- .. ; determine the back ref type and get the value
- .. ; silently ignore invalid refs
- .. s value=$s(backref(j,"ref")="\":"\\",1:$g(n(i,backref(j,"ref"))))
- ..
- .. ; replace back ref with the value
- .. s begin=backref(j,0,0)
- .. s end=backref(j,0,1)
- .. s $ze(s,begin+boffset,end+boffset)=value
- .. s boffset=boffset-(end+1-begin)+$l(value)
- .
- . ; replace matched data with prepared s
- . s begin=n(i,0,0)
- . s end=n(i,0,1)
- .
- . s $ze(subject,begin+offset,end+offset)=s
- .
- . ; substitute empty matches also (Perl style)
- . ;
- . ; perl -e '$_ = "aa"; s/(b*|a)/Xy/g; print "$_\n"'
- . ; w $$replace^pcre("aa","(b*|a)","Xy")
- . ;
- . ; perl -e '$_ = "aa"; s/(b*|aa)/Xy/g; print "$_\n"'
- . ; w $$replace^pcre("aa","(b*|aa)","Xy")
- . ;
- . ; perl -e '$_ = "aaa"; s/(b*|aa)/Xy/g; print "$_\n"'
- . ; w $$replace^pcre("aaa","(b*|aa)","Xy")
- . ;
- . s:begin>end $ze(subject,begin+offset,begin+offset+1)=s_$ze(subject,begin+offset,begin+offset+1)
- .
- . s offset=offset-(end+1-begin)+$l(s)
- q:$q subject q
- free(ref)
- d protect
- n code
- s code=$&pcre.free(.ref)
- s:code $ec=",U"_(-code)_","
- q
- stackusage()
- ; return the approximate amount of stack (in bytes) used per
- ; recursion in pcre_exec()
- q -$&pcre.stackusage()
- ; Exception Handling
- ;
- ; Error conditions are handled by setting the $zc to user codes, see labels
- ; at the end of this file. When neither $zt nor $et are set by the user,
- ; the default handler (trap^pcre) is used within $zt mechanism.
- ;
- ; The default handler will write out the details of the exception, and
- ; depending on the caller type, it will re raise the exception. This will
- ; lead to:
- ; a) writing the exception details, when called from the GT.M prompt,
- ; b) writing the exception details, the M code place when the pcre routine
- ; was called, and terminating the GT.M image.
- ;
- ; The user should define own exception handler using $zt or $et, see
- ; pcreexample.m for example exception handlers.
- ;
- protect ; try setup $zt with default handler
- ;
- ; "n protect" in the $zt is a marker for trap^pcre
- s:'$l($et)&(($zt="B")!'$l($zt)) $zt="n protect d trap zg "_($zl-2)
- q
- trap(stack)
- ; see U* labels at the bottom of this file, some lvns are mandatory
- ; all exceptions are passed through if we wasn't called from direct mode
- ;
- n zl,ref,msg,place
- ;
- ; take the $zl if in default handler setup by protect^trap
- s zl=$p($zt,"n protect d trap zg ",2)
- ;
- ; clear the $zt
- s $zt=""
- ;
- ; source location from either stack argument, zl (default handler), or $st-2
- s place=$st($g(stack,$g(zl,$st-1)-1),"PLACE")
- ;
- ; clear location if called from direct mode
- s:place["^GTM$DMOD" place=""
- ;
- s ref=$p($ec,",",$l($ec,",")-1)
- i $l($t(@ref)) d
- . u $p
- . w @$p($t(@ref),";",2)
- . ; %PCRE-E-COMPILE additional message
- . w:ref="U16392"&$g(erroffset) " in "_$e($g(pattern),1,erroffset)_" <-- HERE"
- . w !
- . ; write the location it has any meaning
- . w:$l(place) "%PCRE-I-RTSLOC, At M source location ",place,!
- e d
- . w $p($zs,",",3,4),!
- . w "%GTM-I-RTSLOC, At M source location ",$p($zs,",",2),!
- ;
- ; re raise the exception if in a default handler and not called from the direct mode
- s:$l(place)&$g(zl,0) $ec=$ec
- q
- ; XC API specific
- ;
- U16384 ;"%PCRE-E-ARGSMALL, Actual argument count is too small"
- U16385 ;"%PCRE-E-OPTNAME, Unknown option name "_$p($g(erropt),"|")
- U16386 ;"%PCRE-E-OBJLIMIT, Maximum number of objects exceeded"
- U16387 ;"%PCRE-E-INVREF, Invalid object reference"
- U16388 ;"%PCRE-E-INTBUF, Internal buffer too small"
- U16389 ;"%PCRE-E-MALLOC, Could not allocate memory"
- U16390 ;"%PCRE-E-STUDY, Pattern study failed: "_$g(err,"unknown reason")
- U16391 ;"%PCRE-E-LOCALE, Invalid locale name "_$g(locale)
- U16392 ;"%PCRE-E-COMPILE, Pattern compilation failed, "_$g(err,"unknown reason")
- U16393 ;"%PCRE-E-LENGTH, Invalid length value specified"
- ; PCRE specific
- ;
- ; NOTES:
- ;
- ; U16401 exception is never raised; when pcre_exec() returns -1
- ; (i.e. NOMATCH) the pcre.exec returns 0, so no exception will
- ; ever raise, NOMATCH is not an uncommon situation
- ;
- ; U16388 is raised when pcre_exec() returns 0, i.e. the ovector
- ; was too small; considering that ovector size is not controlled
- ; in M world, it is an exception here
- ;
- U16401 ;"%PCRE-E-NOMATCH, The subject string did not match the pattern"
- U16402 ;"%PCRE-E-NULL, Either compiled code or subject was passed as NULL, or ovector was NULL"
- U16403 ;"%PCRE-E-BADOPTION, An unrecognized bit was set in the options argument"
- U16404 ;"%PCRE-E-BADMAGIC, The magic number is not present in compiled code"
- U16405 ;"%PCRE-E-UNKNOWNOPCODE, While running the pattern match, an unknown item was encountered in the compiled pattern"
- U16406 ;"%PCRE-E-NOMEMORY, Call via pcre_malloc() or pcre_stack_malloc() failed"
- U16407 ;"%PCRE-E-NOSUBSTRING, No substring"
- U16408 ;"%PCRE-E-MATCHLIMIT, Match limit was reached"
- U16409 ;"%PCRE-E-CALLOUT, Callout function wanted to yield a distinctive error code"
- U16410 ;"%PCRE-E-BADUTF8, A string that contains an invalid UTF-8 byte sequence was passed as a subject"
- U16411 ;"%PCRE-E-BADUTF8OFFSET, The value of startoffset did not point to the beginning of a UTF-8 character or the end of the subject"
- U16412 ;"%PCRE-E-PARTIAL, The subject string did not match, but it did match partially"
- U16414 ;"%PCRE-E-INTERNAL, An unexpected internal error has occurred"
- U16415 ;"%PCRE-E-BADCOUNT, The value of the ovecsize argument is negative"
- U16416 ;"%PCRE-E-DFAUITEM, Unsupported item in the pattern, for e.g. \C o a back reference"
- U16417 ;"%PCRE-E-DFAUCOND, Unsupported condition item, for e.g. a back reference for a condition, or a test for recursion in a specific group"
- U16418 ;"%PCRE-E-DFAUMLIMIT, Match limits are unsupported for DTA matching"
- U16419 ;"%PCRE-E-DFAWSSIZE, Out of space in the workspace vector"
- U16420 ;"%PCRE-E-DFARECURSE, The output vector was not large enough while processing recursive subpattern"
- U16421 ;"%PCRE-E-RECURSIONLIMIT, The internal recursion limit was reached"
- U16423 ;"%PCRE-E-BADNEWLINE, An invalid combination of NL_xxx options was given"
- U16424 ;"%PCRE-E-BADOFFSET, The startoffset was negative or greater than the length of the value in length"
- U16425 ;"%PCRE-E-SHORTUTF8, The subject string ends with a truncated UTF-8 character and the PCRE_PARTIAL_HARD option is set"
- U16426 ;"%PCRE-E-RECURSELOOP, A recursion loop within the pattern was detected"
- U16427 ;"%PCRE-E-JITSTACKLIMIT, The memory available for the just-in-time processing stack is not large enough"