/dpma/Pattern.hpp
C++ Header | 468 lines | 358 code | 71 blank | 39 comment | 65 complexity | 05c985a88845933ce6cc663134806420 MD5 | raw file
- /*
- * dpma - Research prototype of regular expression matching using a
- * dynamic pattern matching automaton
- *
- * Copyright 2017 Tuukka Haapasalo, Riku Saikkonen, Panu Silvasti and
- * Aalto University
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy,
- * modify, merge, publish, distribute, sublicense, and/or sell copies
- * of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
- /*
- * Classes for representing parsed regular expression patterns using
- * character classes, keywords and gaps.
- */
- #ifndef PM_PATTERN_HPP
- #define PM_PATTERN_HPP
- #include "Reader.hpp"
- #include "Utils.hpp"
- #include "util-Log.hpp"
- #include <limits>
- #include <string>
- #include <iostream>
- #include <vector>
- #include <set>
- #include <bitset>
- #include <tuple>
- #include <utility>
- #include <regex.h>
// Basic scalar types used throughout the pattern classes.
using chartype = unsigned char;     // one input character
using keywordid_t = unsigned int;   // index of a keyword within a pattern
using gap_size_t = unsigned int;    // length / position inside a gap
- class CharClass: public virtual Printable {
- private:
- std::bitset<1<<(8*sizeof(chartype))> chars;
- bool saved;
- static std::vector<const CharClass *> allCharClasses;
- // For boundaries; these are initialized by CharClass::init()
- static const CharClass *wordCC, *nonWordCC, *eolCC, *bothEolCC;
- public:
- // Helper char class containing all characters (like .)
- static const CharClass *dotCC;
- public:
- CharClass() : chars(), saved(false) { }
- virtual ~CharClass() { }
- static void init(chartype eolChar);
- static const CharClass *boundary_nonb(char btype, bool second);
- static const CharClass *boundary(char btype, bool second, chartype otherch);
- static const CharClass *predef_class(char ctype);
- void add(chartype c) {
- ASSERT(!saved);
- chars.set(c);
- }
- void add(const std::string & chars_) {
- ASSERT(!saved);
- for (chartype c: chars_)
- chars.set(c);
- }
- void add_range(chartype range_first, chartype range_last) {
- ASSERT(!saved);
- ASSERT(range_first <= range_last);
- for (chartype c = range_first; c != range_last; ++c)
- chars.set(c);
- chars.set(range_last);
- }
- void reverse() {
- ASSERT(!saved);
- chars.flip();
- }
- // Number of chars in class
- size_t count() const {
- return chars.count();
- }
- bool matches(chartype c) const {
- return chars[c];
- }
- bool matches(const std::string & s) const {
- for (chartype c : s)
- if (!chars[c])
- return false;
- return true;
- }
- bool operator==(const CharClass & c) const {
- return chars == c.chars;
- }
- bool operator!=(const CharClass & c) const {
- return !((*this) == c);
- }
- const CharClass *save();
- void matching_chars(std::string & s) const;
- virtual void print(std::ostream & os) const;
- };
- class Gap: public virtual Printable {
- private:
- struct CCRange {
- gap_size_t i; // position (from beginning or end)
- gap_size_t j; // length
- const CharClass *cc;
- };
- gap_size_t minLength;
- gap_size_t maxLength;
- bool topLevel, tooComplexMid, needFallback;
- std::vector<CCRange> ccrBegin, ccrEnd;
- CCRange ccrMid; // ccrMid.j is position from end
- public:
- // Empty gap (by itself, matches only the empty string)
- Gap() :
- minLength(0), maxLength(0),
- topLevel(false), tooComplexMid(false), needFallback(false),
- ccrBegin(0), ccrEnd(0), ccrMid({0,0,0}) { }
- // Fallback match-anything gap (anything within the lengths)
- Gap(gap_size_t minLength_, gap_size_t maxLength_,
- const CharClass *cc_ = nullptr)
- : Gap() {
- addAtEnd(minLength_, maxLength_, cc_);
- setNeedFallback();
- }
- virtual ~Gap() { }
- static const gap_size_t infinite = std::numeric_limits<gap_size_t>::max();
- static gap_size_t add(gap_size_t l1, gap_size_t l2) {
- if (l1 == Gap::infinite || l2 == Gap::infinite)
- return Gap::infinite;
- return l1 + l2;
- }
- gap_size_t getMinLength() const {
- return minLength;
- }
- gap_size_t getMaxLength() const {
- return maxLength;
- }
- gap_size_t getLength() const {
- return minLength;
- }
- void set(gap_size_t minLength_, gap_size_t maxLength_) {
- minLength = minLength_;
- maxLength = maxLength_;
- }
- void addAtEnd(gap_size_t minLength_, gap_size_t maxLength_,
- const CharClass * cc) {
- ASSERT(maxLength_ > 0);
- ASSERT(maxLength_ >= minLength_);
- ccrBegin.push_back({minLength_, maxLength_, cc});
- }
- bool anythingAdded() const {
- return !ccrBegin.empty();
- }
- void finishAdding();
- // Returns true if this gap possibly or certainly matches s.
- // Sets needfb = true if it matches only possibly.
- bool matches(const ReadString & s, bool & needfb) const {
- if (s.length() < minLength || s.length() > maxLength)
- return false;
- try {
- // Search s from end to beginning
- for (const CCRange & ccr : ccrEnd)
- for (size_t p = ccr.i; p < ccr.i + ccr.j; p++)
- if (!ccr.cc->matches(s.rat(p)))
- return false;
- if (ccrMid.cc && s.length() > ccrMid.i + ccrMid.j) {
- size_t pend = s.length() - ccrMid.i;
- for (size_t p = ccrMid.j; p < pend; p++)
- if (!ccrMid.cc->matches(s.rat(p)))
- return false;
- }
- for (const CCRange & ccr : ccrBegin)
- for (ssize_t p = ccr.i + ccr.j - 1; p >= ccr.i; p--)
- if (!ccr.cc->matches(s.at(p)))
- return false;
- } catch (ReadString::read_off_end &e) {
- needfb = true;
- }
- if (needFallback)
- needfb = true;
- if (tooComplexMid && s.length() > ccrMid.i + ccrMid.j)
- needfb = true;
- return true;
- }
- void setTopLevel() {
- topLevel = true;
- }
- bool isTopLevel() const {
- return topLevel;
- }
- void setNeedFallback() {
- needFallback = true;
- }
- bool needsFallback() const {
- return needFallback;
- }
- bool mayNeedFallback() const {
- return needFallback || tooComplexMid;
- }
- static std::ostream & formatLength(std::ostream & os, gap_size_t length);
- virtual void print(std::ostream & os) const;
- };
- class Pattern: public virtual Printable {
- public:
- class invalid_pattern: public std::exception {
- public:
- explicit invalid_pattern() : std::exception() { }
- };
- Pattern(int id_, std::string const & pattern_)
- : id(id_), pattern(pattern_),
- keywords(), gaps(), beginSet(), followSets(),
- preBoundaries(), postBoundaries(),
- found(false), lastFoundPos(0),
- fallbackMatcher(0), alwaysFallback(false),
- postFallbackAmount(0) {
- parse();
- }
- virtual ~Pattern() {
- if (fallbackMatcher != nullptr) {
- regfree(fallbackMatcher);
- delete fallbackMatcher;
- fallbackMatcher = nullptr;
- }
- }
- Pattern(Pattern & p) = delete;
- Pattern(Pattern && p) noexcept
- : id(p.id), pattern(p.pattern),
- keywords(p.keywords), gaps(p.gaps),
- beginSet(p.beginSet), followSets(p.followSets),
- preBoundaries(p.preBoundaries), postBoundaries(p.postBoundaries),
- found(p.found), lastFoundPos(p.lastFoundPos),
- fallbackMatcher(p.fallbackMatcher), alwaysFallback(p.alwaysFallback),
- postFallbackAmount(p.postFallbackAmount) {
- p.fallbackMatcher = nullptr;
- }
- static const keywordid_t begin_keyword_id =
- std::numeric_limits<keywordid_t>::max() - 1;
- static const keywordid_t end_keyword_id =
- std::numeric_limits<keywordid_t>::max();
- int getId() const { return id; }
- std::string const & getPattern() const {
- return pattern;
- }
- size_t getLastFoundPos() const {
- return lastFoundPos;
- }
- void setFoundPos(size_t lastFoundPos_) {
- found = true;
- lastFoundPos = lastFoundPos_;
- }
- unsigned int getNumKeywords() const {
- return keywords.size();
- }
- std::string const & getKeyword(keywordid_t i) const {
- return keywords[i];
- }
- Gap const & getGap(keywordid_t i) const {
- return gaps[i];
- }
- unsigned int getDistance(keywordid_t i) const {
- unsigned int dist = 0;
- for (keywordid_t p = 0; p < i; p++) {
- dist += getKeyword(p).length();
- dist += getGap(p).getLength();
- }
- dist += getGap(i).getLength();
- return dist;
- }
- unsigned int getKeywordLength(keywordid_t keyword) const {
- return keywords[keyword].length();
- }
- unsigned int getLength() const {
- return getDistance(getNumKeywords());
- }
- keywordid_t addKeyword(std::string const & keyword);
- void finalizeKeywords(); // call after all addKeyword()s
- void setGap(keywordid_t dest, Gap & gap) {
- ASSERT(dest < getNumKeywords());
- if (gap.mayNeedFallback())
- establishFallbackMatcher(false);
- gaps[dest] = gap;
- }
- void setBeginSet(std::vector<keywordid_t> ids) {
- ASSERT(beginSet.empty());
- for (keywordid_t id : ids)
- beginSet.insert(id);
- }
- void setFollowSet(keywordid_t dest, std::vector<keywordid_t> ids) {
- ASSERT(dest < getNumKeywords());
- ASSERT(followSets[dest].empty());
- for (keywordid_t id : ids)
- followSets[dest].insert(id);
- }
- void setPreBoundary(keywordid_t dest, char btype) {
- ASSERT(dest < getNumKeywords());
- ASSERT(keywords[dest].length() > 0 || (btype != 'b' && btype != 'B'));
- if (keywords[dest].length() > 0 && btype != 'b' && btype != 'B') {
- const CharClass *cc = CharClass::boundary_nonb(btype, true);
- if (cc && !cc->matches(keywords[dest][0])) {
- WARN("Keyword preBoundary that never matches: kw="
- << keywords[dest] << " btype=" << btype);
- throw invalid_pattern();
- }
- }
- preBoundaries[dest] =
- CharClass::boundary(btype, false, keywords[dest][0]);
- }
- void setPostBoundary(keywordid_t dest, char btype) {
- ASSERT(dest < getNumKeywords());
- ASSERT(keywords[dest].length() > 0 || (btype != 'b' && btype != 'B'));
- if (keywords[dest].length() > 0 && btype != 'b' && btype != 'B') {
- const CharClass *cc = CharClass::boundary_nonb(btype, false);
- if (cc && !cc->matches(keywords[dest][keywords[dest].length() - 1])) {
- WARN("Keyword postBoundary that never matches: kw="
- << keywords[dest] << " btype=" << btype);
- throw invalid_pattern();
- }
- }
- postBoundaries[dest] =
- CharClass::boundary(btype,
- true,
- keywords[dest][keywords[dest].length() - 1]);
- }
- bool hasPreBoundary(keywordid_t kwId) const {
- return preBoundaries[kwId] != nullptr;
- }
- bool matchPreBoundary(keywordid_t kwId, chartype outsideChar) const {
- if (preBoundaries[kwId] == nullptr)
- return true;
- return preBoundaries[kwId]->matches(outsideChar);
- }
- bool hasPostBoundary(keywordid_t kwId) const {
- return postBoundaries[kwId] != nullptr;
- }
- bool matchPostBoundary(keywordid_t kwId, chartype outsideChar) const {
- if (postBoundaries[kwId] == nullptr)
- return true;
- return postBoundaries[kwId]->matches(outsideChar);
- }
- const std::set<keywordid_t> & getBeginSet() const {
- return beginSet;
- }
- const std::set<keywordid_t> & getFollowSet(keywordid_t kwId) const {
- return followSets[kwId];
- }
- bool isBeginningKeyword(keywordid_t kwId) const {
- return beginSet.find(kwId) != beginSet.cend();
- }
- bool isEndingKeyword(keywordid_t kwId) const {
- std::set<keywordid_t> const & fs = getFollowSet(kwId);
- return fs.find(Pattern::end_keyword_id) != fs.end();
- }
- bool isFound() const {
- return found;
- }
- bool isAlwaysFallback() const {
- return alwaysFallback;
- }
- gap_size_t getPostFallbackAmount() const {
- return postFallbackAmount;
- }
- void establishPostFallback(gap_size_t amount = Gap::infinite) {
- if (postFallbackAmount < amount)
- postFallbackAmount = amount;
- establishFallbackMatcher(true);
- }
- void establishFallbackMatcher(bool alwaysNeeded);
- bool useFallbackMatcher(const char *text) const;
- bool useFallbackMatcher(const std::string & text) const {
- return useFallbackMatcher(text.c_str());
- }
- virtual void print(std::ostream & os) const;
- private:
- void parse();
- int id;
- std::string pattern;
- std::vector<std::string> keywords;
- std::vector<Gap> gaps;
- std::set<keywordid_t> beginSet;
- std::vector<std::set<keywordid_t> > followSets;
- std::vector<const CharClass *> preBoundaries, postBoundaries;
- bool found;
- size_t lastFoundPos;
- regex_t *fallbackMatcher;
- bool alwaysFallback;
- gap_size_t postFallbackAmount;
- };
- #endif // PM_PATTERN_HPP