/tm-go/lex/compile.go

https://github.com/inspirer/textmapper · Go · 267 lines · 239 code · 27 blank · 1 comment · 56 complexity · 1b00f51e539822a5b989940d827713ed MD5 · raw file

  1. package lex
  2. import (
  3. "fmt"
  4. "log"
  5. "strings"
  6. "github.com/inspirer/textmapper/tm-go/status"
  7. "github.com/inspirer/textmapper/tm-go/util/container"
  8. )
  9. type Pattern struct {
  10. Name string
  11. RE *Regexp
  12. Text string // of RE
  13. Origin status.SourceNode
  14. }
  15. type inst struct {
  16. consume symlist // An empty list means we cannot advance to the next instruction.
  17. links []int // Relative offsets of other instructions that should be considered at the same time.
  18. rule *Rule // The rule to be accepted.
  19. trace trace // core() instructions only: the trace of this instruction
  20. }
  21. func (i inst) core() bool {
  22. return i.rule != nil || len(i.consume) != 0
  23. }
  24. func (i inst) String() string {
  25. var sb strings.Builder
  26. i.trace.toString(&sb)
  27. return sb.String()
  28. }
  29. type trace struct {
  30. pattern *Pattern // The pattern that produced a given instruction.
  31. offset int // The corresponding offset in pattern.Text.
  32. caller *trace // The last pattern in this chain is a Rule.
  33. }
  34. func (t *trace) toString(sb *strings.Builder) {
  35. if t.caller != nil {
  36. t.caller.toString(sb)
  37. sb.WriteString(" -> ")
  38. } else {
  39. sb.WriteString(t.pattern.Name)
  40. sb.WriteString(": ")
  41. }
  42. fmt.Fprintf(sb, "/%v<STATE>%v/", t.pattern.Text[:t.offset], t.pattern.Text[t.offset:])
  43. }
  44. // reCompiler translates a set of regular expressions into a single list of instructions.
  45. type reCompiler struct {
  46. sets []charset
  47. out []inst
  48. consume []int
  49. runes map[rune]int // index in sets
  50. inExternal map[string]bool
  51. err error
  52. }
  53. func newCompiler() *reCompiler {
  54. return &reCompiler{
  55. runes: make(map[rune]int),
  56. inExternal: make(map[string]bool),
  57. }
  58. }
  59. func (c *reCompiler) addPattern(p *Pattern, action int, rule *Rule) (int, error) {
  60. c.err = nil
  61. ret := c.next()
  62. t := trace{pattern: p}
  63. c.serialize(p.RE, rule.Resolver, t)
  64. t.offset = len(t.pattern.Text)
  65. accept := c.emit(nil, t)
  66. c.out[accept].rule = rule
  67. transitiveClosure(c.out[ret:])
  68. for _, delta := range c.out[ret].links {
  69. dst := ret + delta
  70. if c.out[dst].rule != nil {
  71. c.errorf("`%v` accepts empty text", p.Name)
  72. break
  73. }
  74. }
  75. return ret, c.err
  76. }
  77. func (c *reCompiler) compile() (ins []inst, inputMap []RangeEntry) {
  78. symlists, inputMap := compressCharsets(c.sets)
  79. for i, id := range c.consume {
  80. if id >= 0 {
  81. c.out[i].consume = symlists[id]
  82. }
  83. }
  84. for src := range c.out {
  85. nlinks := c.out[src].links[:0]
  86. for _, delta := range c.out[src].links {
  87. if c.out[src+delta].core() {
  88. nlinks = append(nlinks, delta)
  89. }
  90. }
  91. c.out[src].links = nlinks
  92. }
  93. return c.out, inputMap
  94. }
  95. func (c *reCompiler) serialize(re *Regexp, resolver Resolver, t trace) {
  96. if c.err != nil {
  97. return
  98. }
  99. switch re.op {
  100. case opLiteral:
  101. for i, r := range re.text {
  102. t.offset = re.offset + i
  103. c.emit(charset{r, r}, t)
  104. }
  105. case opCharClass:
  106. t.offset = re.offset
  107. c.emit(re.charset, t)
  108. case opExternal:
  109. t.offset = re.offset
  110. if re.text == "eoi" {
  111. eoi := c.emit(nil, t)
  112. c.out[eoi].consume = symlist{EOI}
  113. return
  114. }
  115. if _, ok := c.inExternal[re.text]; ok {
  116. c.errorf("named patterns cannot recursively depend on each other (in %s)", re.text)
  117. return
  118. }
  119. pattern := resolver.Resolve(re.text)
  120. if pattern == nil {
  121. c.errorf("cannot find named pattern: %s", re.text)
  122. return
  123. }
  124. c.inExternal[re.text] = true
  125. child := trace{pattern: pattern, caller: &t}
  126. c.serialize(pattern.RE, resolver, child)
  127. delete(c.inExternal, re.text)
  128. case opRepeat:
  129. if re.min > 16 || re.max > 16 {
  130. c.errorf("cannot expand the regexp, too many entities to repeat (max. 16)")
  131. return
  132. }
  133. barrier := c.emit(nil, trace{})
  134. c.link(barrier, c.next())
  135. var last int
  136. for i := 0; i < re.min; i++ {
  137. last = c.next()
  138. c.serialize(re.sub[0], resolver, t)
  139. }
  140. if re.max == -1 {
  141. if re.min == 0 {
  142. last = c.next()
  143. c.serialize(re.sub[0], resolver, t)
  144. }
  145. barrier := c.emit(nil, trace{})
  146. c.link(barrier, last)
  147. c.link(barrier, c.next())
  148. if re.min == 0 {
  149. c.link(last, c.next())
  150. }
  151. } else if re.max > re.min {
  152. var subs []int
  153. for i := re.max - re.min; i > 0; i-- {
  154. subs = append(subs, c.next())
  155. c.serialize(re.sub[0], resolver, t)
  156. }
  157. barrier := c.emit(nil, trace{})
  158. for _, sub := range subs {
  159. c.link(sub, c.next())
  160. }
  161. c.link(barrier, c.next())
  162. }
  163. case opAlternate:
  164. alt := c.emit(nil, trace{})
  165. var ends []int
  166. for _, s := range re.sub {
  167. c.link(alt, c.next())
  168. c.serialize(s, resolver, t)
  169. ends = append(ends, c.emit(nil, trace{}))
  170. }
  171. for _, end := range ends {
  172. c.link(end, c.next())
  173. }
  174. case opConcat:
  175. for _, s := range re.sub {
  176. c.serialize(s, resolver, t)
  177. }
  178. default:
  179. log.Fatal("unknown regexp operation")
  180. }
  181. }
  182. func (c *reCompiler) errorf(format string, a ...interface{}) {
  183. if c.err == nil {
  184. c.err = fmt.Errorf(format, a...)
  185. }
  186. }
  187. func (c *reCompiler) next() int {
  188. return len(c.out)
  189. }
  190. func (c *reCompiler) link(src, dst int) {
  191. c.out[src].links = append(c.out[src].links, dst-src)
  192. }
  193. func (c *reCompiler) emit(cs charset, t trace) int {
  194. c.out = append(c.out, inst{trace: t})
  195. id := -1
  196. if len(cs) != 0 {
  197. if len(cs) == 2 && cs[0] == cs[1] {
  198. r := cs[0]
  199. var ok bool
  200. id, ok = c.runes[r]
  201. if !ok {
  202. id = len(c.sets)
  203. c.sets = append(c.sets, []rune{r, r})
  204. c.runes[r] = id
  205. }
  206. } else {
  207. id = len(c.sets)
  208. c.sets = append(c.sets, cs)
  209. }
  210. }
  211. c.consume = append(c.consume, id)
  212. return len(c.out) - 1
  213. }
  214. func transitiveClosure(code []inst) {
  215. seen := container.NewBitSet(len(code))
  216. var visit func(int, int)
  217. visit = func(origin, src int) {
  218. for _, delta := range code[src].links {
  219. dst := src + delta
  220. if !seen.Get(dst) {
  221. code[origin].links = append(code[origin].links, dst-origin)
  222. seen.Set(dst)
  223. visit(origin, dst)
  224. }
  225. }
  226. }
  227. for src, ins := range code {
  228. if len(ins.links) == 0 {
  229. continue
  230. }
  231. seen.ClearAll(len(code))
  232. seen.Set(src)
  233. for _, delta := range ins.links {
  234. seen.Set(src + delta)
  235. }
  236. for _, delta := range ins.links {
  237. visit(src, src+delta)
  238. }
  239. }
  240. }