PageRenderTime 48ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/dpma/Pattern.hpp

https://gitlab.com/rjs/keyword-regex-research
C++ Header | 468 lines | 358 code | 71 blank | 39 comment | 65 complexity | 05c985a88845933ce6cc663134806420 MD5 | raw file
  1. /*
  2. * dpma - Research prototype of regular expression matching using a
  3. * dynamic pattern matching automaton
  4. *
  5. * Copyright 2017 Tuukka Haapasalo, Riku Saikkonen, Panu Silvasti and
  6. * Aalto University
  7. *
  8. * Permission is hereby granted, free of charge, to any person
  9. * obtaining a copy of this software and associated documentation
  10. * files (the "Software"), to deal in the Software without
  11. * restriction, including without limitation the rights to use, copy,
  12. * modify, merge, publish, distribute, sublicense, and/or sell copies
  13. * of the Software, and to permit persons to whom the Software is
  14. * furnished to do so, subject to the following conditions:
  15. *
  16. * The above copyright notice and this permission notice shall be
  17. * included in all copies or substantial portions of the Software.
  18. *
  19. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  20. * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  21. * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  22. * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
  23. * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
  24. * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  26. * DEALINGS IN THE SOFTWARE.
  27. */
  28. /*
  29. * Classes for representing parsed regular expression patterns using
  30. * character classes, keywords and gaps.
  31. */
  32. #ifndef PM_PATTERN_HPP
  33. #define PM_PATTERN_HPP
  34. #include "Reader.hpp"
  35. #include "Utils.hpp"
  36. #include "util-Log.hpp"
  37. #include <limits>
  38. #include <string>
  39. #include <iostream>
  40. #include <vector>
  41. #include <set>
  42. #include <bitset>
  43. #include <tuple>
  44. #include <utility>
  45. #include <regex.h>
  // Basic scalar types shared by the pattern classes.
  using chartype = unsigned char;    // one input character (a single byte)
  using keywordid_t = unsigned int;  // index of a keyword within a Pattern
  using gap_size_t = unsigned int;   // length of, or position within, a Gap
  49. class CharClass: public virtual Printable {
  50. private:
  51. std::bitset<1<<(8*sizeof(chartype))> chars;
  52. bool saved;
  53. static std::vector<const CharClass *> allCharClasses;
  54. // For boundaries; these are initialized by CharClass::init()
  55. static const CharClass *wordCC, *nonWordCC, *eolCC, *bothEolCC;
  56. public:
  57. // Helper char class containing all characters (like .)
  58. static const CharClass *dotCC;
  59. public:
  60. CharClass() : chars(), saved(false) { }
  61. virtual ~CharClass() { }
  62. static void init(chartype eolChar);
  63. static const CharClass *boundary_nonb(char btype, bool second);
  64. static const CharClass *boundary(char btype, bool second, chartype otherch);
  65. static const CharClass *predef_class(char ctype);
  66. void add(chartype c) {
  67. ASSERT(!saved);
  68. chars.set(c);
  69. }
  70. void add(const std::string & chars_) {
  71. ASSERT(!saved);
  72. for (chartype c: chars_)
  73. chars.set(c);
  74. }
  75. void add_range(chartype range_first, chartype range_last) {
  76. ASSERT(!saved);
  77. ASSERT(range_first <= range_last);
  78. for (chartype c = range_first; c != range_last; ++c)
  79. chars.set(c);
  80. chars.set(range_last);
  81. }
  82. void reverse() {
  83. ASSERT(!saved);
  84. chars.flip();
  85. }
  86. // Number of chars in class
  87. size_t count() const {
  88. return chars.count();
  89. }
  90. bool matches(chartype c) const {
  91. return chars[c];
  92. }
  93. bool matches(const std::string & s) const {
  94. for (chartype c : s)
  95. if (!chars[c])
  96. return false;
  97. return true;
  98. }
  99. bool operator==(const CharClass & c) const {
  100. return chars == c.chars;
  101. }
  102. bool operator!=(const CharClass & c) const {
  103. return !((*this) == c);
  104. }
  105. const CharClass *save();
  106. void matching_chars(std::string & s) const;
  107. virtual void print(std::ostream & os) const;
  108. };
  109. class Gap: public virtual Printable {
  110. private:
  111. struct CCRange {
  112. gap_size_t i; // position (from beginning or end)
  113. gap_size_t j; // length
  114. const CharClass *cc;
  115. };
  116. gap_size_t minLength;
  117. gap_size_t maxLength;
  118. bool topLevel, tooComplexMid, needFallback;
  119. std::vector<CCRange> ccrBegin, ccrEnd;
  120. CCRange ccrMid; // ccrMid.j is position from end
  121. public:
  122. // Empty gap (by itself, matches only the empty string)
  123. Gap() :
  124. minLength(0), maxLength(0),
  125. topLevel(false), tooComplexMid(false), needFallback(false),
  126. ccrBegin(0), ccrEnd(0), ccrMid({0,0,0}) { }
  127. // Fallback match-anything gap (anything within the lengths)
  128. Gap(gap_size_t minLength_, gap_size_t maxLength_,
  129. const CharClass *cc_ = nullptr)
  130. : Gap() {
  131. addAtEnd(minLength_, maxLength_, cc_);
  132. setNeedFallback();
  133. }
  134. virtual ~Gap() { }
  135. static const gap_size_t infinite = std::numeric_limits<gap_size_t>::max();
  136. static gap_size_t add(gap_size_t l1, gap_size_t l2) {
  137. if (l1 == Gap::infinite || l2 == Gap::infinite)
  138. return Gap::infinite;
  139. return l1 + l2;
  140. }
  141. gap_size_t getMinLength() const {
  142. return minLength;
  143. }
  144. gap_size_t getMaxLength() const {
  145. return maxLength;
  146. }
  147. gap_size_t getLength() const {
  148. return minLength;
  149. }
  150. void set(gap_size_t minLength_, gap_size_t maxLength_) {
  151. minLength = minLength_;
  152. maxLength = maxLength_;
  153. }
  154. void addAtEnd(gap_size_t minLength_, gap_size_t maxLength_,
  155. const CharClass * cc) {
  156. ASSERT(maxLength_ > 0);
  157. ASSERT(maxLength_ >= minLength_);
  158. ccrBegin.push_back({minLength_, maxLength_, cc});
  159. }
  160. bool anythingAdded() const {
  161. return !ccrBegin.empty();
  162. }
  163. void finishAdding();
  164. // Returns true if this gap possibly or certainly matches s.
  165. // Sets needfb = true if it matches only possibly.
  166. bool matches(const ReadString & s, bool & needfb) const {
  167. if (s.length() < minLength || s.length() > maxLength)
  168. return false;
  169. try {
  170. // Search s from end to beginning
  171. for (const CCRange & ccr : ccrEnd)
  172. for (size_t p = ccr.i; p < ccr.i + ccr.j; p++)
  173. if (!ccr.cc->matches(s.rat(p)))
  174. return false;
  175. if (ccrMid.cc && s.length() > ccrMid.i + ccrMid.j) {
  176. size_t pend = s.length() - ccrMid.i;
  177. for (size_t p = ccrMid.j; p < pend; p++)
  178. if (!ccrMid.cc->matches(s.rat(p)))
  179. return false;
  180. }
  181. for (const CCRange & ccr : ccrBegin)
  182. for (ssize_t p = ccr.i + ccr.j - 1; p >= ccr.i; p--)
  183. if (!ccr.cc->matches(s.at(p)))
  184. return false;
  185. } catch (ReadString::read_off_end &e) {
  186. needfb = true;
  187. }
  188. if (needFallback)
  189. needfb = true;
  190. if (tooComplexMid && s.length() > ccrMid.i + ccrMid.j)
  191. needfb = true;
  192. return true;
  193. }
  194. void setTopLevel() {
  195. topLevel = true;
  196. }
  197. bool isTopLevel() const {
  198. return topLevel;
  199. }
  200. void setNeedFallback() {
  201. needFallback = true;
  202. }
  203. bool needsFallback() const {
  204. return needFallback;
  205. }
  206. bool mayNeedFallback() const {
  207. return needFallback || tooComplexMid;
  208. }
  209. static std::ostream & formatLength(std::ostream & os, gap_size_t length);
  210. virtual void print(std::ostream & os) const;
  211. };
  212. class Pattern: public virtual Printable {
  213. public:
  214. class invalid_pattern: public std::exception {
  215. public:
  216. explicit invalid_pattern() : std::exception() { }
  217. };
  218. Pattern(int id_, std::string const & pattern_)
  219. : id(id_), pattern(pattern_),
  220. keywords(), gaps(), beginSet(), followSets(),
  221. preBoundaries(), postBoundaries(),
  222. found(false), lastFoundPos(0),
  223. fallbackMatcher(0), alwaysFallback(false),
  224. postFallbackAmount(0) {
  225. parse();
  226. }
  227. virtual ~Pattern() {
  228. if (fallbackMatcher != nullptr) {
  229. regfree(fallbackMatcher);
  230. delete fallbackMatcher;
  231. fallbackMatcher = nullptr;
  232. }
  233. }
  234. Pattern(Pattern & p) = delete;
  235. Pattern(Pattern && p) noexcept
  236. : id(p.id), pattern(p.pattern),
  237. keywords(p.keywords), gaps(p.gaps),
  238. beginSet(p.beginSet), followSets(p.followSets),
  239. preBoundaries(p.preBoundaries), postBoundaries(p.postBoundaries),
  240. found(p.found), lastFoundPos(p.lastFoundPos),
  241. fallbackMatcher(p.fallbackMatcher), alwaysFallback(p.alwaysFallback),
  242. postFallbackAmount(p.postFallbackAmount) {
  243. p.fallbackMatcher = nullptr;
  244. }
  245. static const keywordid_t begin_keyword_id =
  246. std::numeric_limits<keywordid_t>::max() - 1;
  247. static const keywordid_t end_keyword_id =
  248. std::numeric_limits<keywordid_t>::max();
  249. int getId() const { return id; }
  250. std::string const & getPattern() const {
  251. return pattern;
  252. }
  253. size_t getLastFoundPos() const {
  254. return lastFoundPos;
  255. }
  256. void setFoundPos(size_t lastFoundPos_) {
  257. found = true;
  258. lastFoundPos = lastFoundPos_;
  259. }
  260. unsigned int getNumKeywords() const {
  261. return keywords.size();
  262. }
  263. std::string const & getKeyword(keywordid_t i) const {
  264. return keywords[i];
  265. }
  266. Gap const & getGap(keywordid_t i) const {
  267. return gaps[i];
  268. }
  269. unsigned int getDistance(keywordid_t i) const {
  270. unsigned int dist = 0;
  271. for (keywordid_t p = 0; p < i; p++) {
  272. dist += getKeyword(p).length();
  273. dist += getGap(p).getLength();
  274. }
  275. dist += getGap(i).getLength();
  276. return dist;
  277. }
  278. unsigned int getKeywordLength(keywordid_t keyword) const {
  279. return keywords[keyword].length();
  280. }
  281. unsigned int getLength() const {
  282. return getDistance(getNumKeywords());
  283. }
  284. keywordid_t addKeyword(std::string const & keyword);
  285. void finalizeKeywords(); // call after all addKeyword()s
  286. void setGap(keywordid_t dest, Gap & gap) {
  287. ASSERT(dest < getNumKeywords());
  288. if (gap.mayNeedFallback())
  289. establishFallbackMatcher(false);
  290. gaps[dest] = gap;
  291. }
  292. void setBeginSet(std::vector<keywordid_t> ids) {
  293. ASSERT(beginSet.empty());
  294. for (keywordid_t id : ids)
  295. beginSet.insert(id);
  296. }
  297. void setFollowSet(keywordid_t dest, std::vector<keywordid_t> ids) {
  298. ASSERT(dest < getNumKeywords());
  299. ASSERT(followSets[dest].empty());
  300. for (keywordid_t id : ids)
  301. followSets[dest].insert(id);
  302. }
  303. void setPreBoundary(keywordid_t dest, char btype) {
  304. ASSERT(dest < getNumKeywords());
  305. ASSERT(keywords[dest].length() > 0 || (btype != 'b' && btype != 'B'));
  306. if (keywords[dest].length() > 0 && btype != 'b' && btype != 'B') {
  307. const CharClass *cc = CharClass::boundary_nonb(btype, true);
  308. if (cc && !cc->matches(keywords[dest][0])) {
  309. WARN("Keyword preBoundary that never matches: kw="
  310. << keywords[dest] << " btype=" << btype);
  311. throw invalid_pattern();
  312. }
  313. }
  314. preBoundaries[dest] =
  315. CharClass::boundary(btype, false, keywords[dest][0]);
  316. }
  317. void setPostBoundary(keywordid_t dest, char btype) {
  318. ASSERT(dest < getNumKeywords());
  319. ASSERT(keywords[dest].length() > 0 || (btype != 'b' && btype != 'B'));
  320. if (keywords[dest].length() > 0 && btype != 'b' && btype != 'B') {
  321. const CharClass *cc = CharClass::boundary_nonb(btype, false);
  322. if (cc && !cc->matches(keywords[dest][keywords[dest].length() - 1])) {
  323. WARN("Keyword postBoundary that never matches: kw="
  324. << keywords[dest] << " btype=" << btype);
  325. throw invalid_pattern();
  326. }
  327. }
  328. postBoundaries[dest] =
  329. CharClass::boundary(btype,
  330. true,
  331. keywords[dest][keywords[dest].length() - 1]);
  332. }
  333. bool hasPreBoundary(keywordid_t kwId) const {
  334. return preBoundaries[kwId] != nullptr;
  335. }
  336. bool matchPreBoundary(keywordid_t kwId, chartype outsideChar) const {
  337. if (preBoundaries[kwId] == nullptr)
  338. return true;
  339. return preBoundaries[kwId]->matches(outsideChar);
  340. }
  341. bool hasPostBoundary(keywordid_t kwId) const {
  342. return postBoundaries[kwId] != nullptr;
  343. }
  344. bool matchPostBoundary(keywordid_t kwId, chartype outsideChar) const {
  345. if (postBoundaries[kwId] == nullptr)
  346. return true;
  347. return postBoundaries[kwId]->matches(outsideChar);
  348. }
  349. const std::set<keywordid_t> & getBeginSet() const {
  350. return beginSet;
  351. }
  352. const std::set<keywordid_t> & getFollowSet(keywordid_t kwId) const {
  353. return followSets[kwId];
  354. }
  355. bool isBeginningKeyword(keywordid_t kwId) const {
  356. return beginSet.find(kwId) != beginSet.cend();
  357. }
  358. bool isEndingKeyword(keywordid_t kwId) const {
  359. std::set<keywordid_t> const & fs = getFollowSet(kwId);
  360. return fs.find(Pattern::end_keyword_id) != fs.end();
  361. }
  362. bool isFound() const {
  363. return found;
  364. }
  365. bool isAlwaysFallback() const {
  366. return alwaysFallback;
  367. }
  368. gap_size_t getPostFallbackAmount() const {
  369. return postFallbackAmount;
  370. }
  371. void establishPostFallback(gap_size_t amount = Gap::infinite) {
  372. if (postFallbackAmount < amount)
  373. postFallbackAmount = amount;
  374. establishFallbackMatcher(true);
  375. }
  376. void establishFallbackMatcher(bool alwaysNeeded);
  377. bool useFallbackMatcher(const char *text) const;
  378. bool useFallbackMatcher(const std::string & text) const {
  379. return useFallbackMatcher(text.c_str());
  380. }
  381. virtual void print(std::ostream & os) const;
  382. private:
  383. void parse();
  384. int id;
  385. std::string pattern;
  386. std::vector<std::string> keywords;
  387. std::vector<Gap> gaps;
  388. std::set<keywordid_t> beginSet;
  389. std::vector<std::set<keywordid_t> > followSets;
  390. std::vector<const CharClass *> preBoundaries, postBoundaries;
  391. bool found;
  392. size_t lastFoundPos;
  393. regex_t *fallbackMatcher;
  394. bool alwaysFallback;
  395. gap_size_t postFallbackAmount;
  396. };
  397. #endif // PM_PATTERN_HPP