/lexer/regex.go

http://github.com/bobappleyard/bwl · Go · 285 lines · 235 code · 26 blank · 24 comment · 40 complexity · 96eb8df1554ce5c3c5d28b897429bd10 MD5 · raw file

  1. package lexer
  2. import (
  3. "bytes"
  4. "container/list"
  5. "errors"
  6. "strings"
  7. bwlerrors "github.com/bobappleyard/bwl/errors"
  8. )
  9. type RegexSet map[rune]string
  10. var defaultMeta = RegexSet{
  11. 'w': "a-zA-Z0-9_",
  12. 's': " \t\n\r",
  13. 'a': "a-zA-Z",
  14. 'd': "0-9",
  15. 'W': "^a-zA-Z0-9_",
  16. 'S': "^ \t\n\r",
  17. 'A': "^a-zA-Z",
  18. 'D': "^0-9",
  19. }
  20. func ExtendSet(base, ext RegexSet) RegexSet {
  21. res := make(RegexSet)
  22. if base == nil {
  23. base = defaultMeta
  24. }
  25. for k, v := range base {
  26. res[k] = v
  27. }
  28. for k, v := range ext {
  29. res[k] = v
  30. }
  31. return res
  32. }
  33. func (self *Lexer) Regex(re string, m RegexSet) (*BasicState, error) {
  34. return self.root.AddRegex(re, m)
  35. }
  36. func (self *Lexer) ForceRegex(re string, m RegexSet) *BasicState {
  37. res, err := self.Regex(re, m)
  38. if err != nil {
  39. println(re, err.Error())
  40. }
  41. bwlerrors.Fatal(err)
  42. return res
  43. }
  44. func (self *Lexer) Regexes(m, regexes RegexSet) {
  45. for i, x := range regexes {
  46. self.ForceRegex(x, m).SetFinal(int(i))
  47. }
  48. }
  // Regex is a single compiled expression backed by its own Lexer. NewRegex
  // registers the expression on that lexer with SetFinal(0) and a fallback "."
  // pattern with SetFinal(1).
  49. type Regex struct {
  50. l *Lexer // lexer driven by Match, Matches and Replace
  51. }
  52. func NewRegex(re string, m RegexSet) *Regex {
  53. l := New()
  54. l.ForceRegex(re, m).SetFinal(0)
  55. l.ForceRegex(".", nil).SetFinal(1)
  56. return &Regex{l}
  57. }
  58. func (self *Regex) Match(s string) bool {
  59. self.l.StartString(s)
  60. if self.l.Next() == 0 {
  61. return self.l.Len() == len(s)
  62. }
  63. return false
  64. }
  65. func (self *Regex) Matches(s string) []string {
  66. res := make([]string, 0)
  67. self.l.StartString(s)
  68. for !self.l.Eof() {
  69. if self.l.Next() == 0 {
  70. res = append(res, self.l.String())
  71. }
  72. }
  73. return res
  74. }
  75. func (self *Regex) Replace(s string, f func(string) string) string {
  76. res := make([]string, 0)
  77. buf := bytes.Runes([]byte(s))
  78. last := 0
  79. self.l.StartString(s)
  80. for !self.l.Eof() {
  81. if self.l.Next() == 0 {
  82. res = append(res, string(buf[last:self.l.Pos()]))
  83. res = append(res, f(self.l.String()))
  84. last = self.l.Pos() + self.l.Len()
  85. }
  86. }
  87. res = append(res, string(buf[last:]))
  88. return strings.Join(res, "")
  89. }
  90. func Match(re, s string) bool {
  91. expr := NewRegex(re, nil)
  92. return expr.Match(s)
  93. }
  94. func Matches(re, s string) []string {
  95. expr := NewRegex(re, nil)
  96. return expr.Matches(s)
  97. }
  98. func Replace(re, s string, f func(string) string) string {
  99. expr := NewRegex(re, nil)
  100. return expr.Replace(s, f)
  101. }
  // regexPos records the start and end states of an enclosing (sub)expression.
  // AddRegex pushes one onto its stack when it enters a group and restores its
  // working start/end states from it when the group is left.
  102. type regexPos struct {
  103. start, end *BasicState
  104. }
  105. func (self *BasicState) AddRegex(re string, m RegexSet) (*BasicState, error) {
  106. if m == nil {
  107. m = defaultMeta
  108. }
  109. // this is just the sort of horror show that lexical analysis avoids
  110. // stack machine
  111. start := self
  112. end := NewState()
  113. stack := list.New()
  114. // state flags
  115. expr, esc, cs := false, false, false
  116. setstr := ""
  117. // go into a subexpression
  118. push := func() {
  119. rp := &regexPos{start, end}
  120. stack.PushBack(rp)
  121. end = NewState()
  122. start.AddEmptyTransition(end)
  123. }
  124. // come out of a subexpression
  125. pop := func() {
  126. v := stack.Back()
  127. stack.Remove(v)
  128. rp := v.Value.(*regexPos)
  129. end.AddEmptyTransition(rp.end)
  130. start = rp.start
  131. end = rp.end
  132. }
  133. // move forward, for the purposes of concatenation
  134. move := func() {
  135. start = end
  136. end = NewState()
  137. }
  138. // the expression is inside an implicit ( ... )
  139. push()
  140. // parse the expression
  141. for _, c := range re {
  142. // escaped characters
  143. if esc {
  144. esc = false
  145. // inside a charset jobby
  146. if cs {
  147. setstr += string(c)
  148. continue
  149. }
  150. // check out the metachar action
  151. if meta, ok := m[c]; ok {
  152. move()
  153. chars, err := Charset(meta, end)
  154. if err != nil {
  155. return nil, err
  156. }
  157. start.AddEmptyTransition(chars)
  158. expr = true
  159. continue
  160. }
  161. // nothing else going on? well you escaped it for a reason
  162. goto add
  163. }
  164. // charsets
  165. if cs {
  166. if c == '\\' {
  167. esc = true
  168. } else if c == ']' {
  169. chars, err := Charset(setstr, end)
  170. if err != nil {
  171. return nil, err
  172. }
  173. start.AddEmptyTransition(chars)
  174. setstr = ""
  175. cs = false
  176. expr = true
  177. } else {
  178. setstr += string(c)
  179. }
  180. continue
  181. }
  182. // everything else
  183. switch c {
  184. // charsets
  185. case '.':
  186. move()
  187. start.AddEmptyTransition(Any(end))
  188. expr = true
  189. case '[':
  190. move()
  191. cs = true
  192. case ']':
  193. if !cs {
  194. return nil, errors.New("trying to close unopened charset")
  195. }
  196. // grouping
  197. case '(':
  198. move()
  199. push()
  200. expr = false
  201. case ')':
  202. if stack.Len() <= 1 {
  203. return nil, errors.New("trying to close unopened subexpr")
  204. }
  205. pop()
  206. expr = true
  207. // alternation
  208. case '|':
  209. pop()
  210. push()
  211. expr = false
  212. // modifiers
  213. case '?':
  214. start.AddEmptyTransition(end)
  215. goto check
  216. case '*':
  217. start.AddEmptyTransition(end)
  218. end.AddEmptyTransition(start)
  219. goto check
  220. case '+':
  221. end.AddEmptyTransition(start)
  222. goto check
  223. // escape character
  224. case '\\':
  225. esc = true
  226. expr = false
  227. // otherwise just add that char
  228. default:
  229. goto add
  230. }
  231. continue
  232. // make sure the modifier modified something
  233. check:
  234. if !expr {
  235. return nil, errors.New("nothing to modify")
  236. }
  237. expr = false
  238. continue
  239. // add a character transition
  240. add:
  241. move()
  242. start.AddTransition(c, end)
  243. expr = true
  244. continue
  245. }
  246. // some final consistency checks
  247. if cs {
  248. return nil, errors.New("unclosed charset")
  249. }
  250. if esc {
  251. return nil, errors.New("invalid escape sequence")
  252. }
  253. if stack.Len() > 1 {
  254. return nil, errors.New("unclosed subexpr")
  255. }
  256. // close the implicit brackets
  257. pop()
  258. return end, nil
  259. }