/lib/parsing_nesting/tree.rb
Ruby | 443 lines | 273 code | 50 blank | 120 comment | 21 complexity | c2bdb2619117cd36104c2bfe86d05a21 MD5 | raw file
Possible License(s): Apache-2.0
- require 'parsing_nesting/grammar'
- module ParsingNesting::Tree
# Parses +string+ with ParsingNesting::Grammar and hands the parslet output
# (json-y hashes and arrays) to to_node_tree, which turns it into an abstract
# syntax tree made up of more semantic ruby objects, Node's. The top one
# will always be a List.
#
# Call #to_query on the resulting Node to transform it into a Solr query,
# optionally passing in Solr params to be used as LocalParams in nested
# dismax queries.
#
# Design note: this approach works, but every special case currently has to
# be handled inside #to_query, so the special cases tend to multiply and need
# handling at multiple levels. Ideally the object graph (abstract syntax
# tree) itself would be transformed instead. For instance, a problematic
# pure-negative clause could be rewritten to an equivalent graph without one,
# (-a AND -b) ==> (NOT (a OR b)), and (NOT NOT a) could be rewritten to (a).
# That would probably be more robust, but the current code is working for now.
#
# The #negate methods were an experiment in transforming the parse tree that
# way. They are not being used, but are left as a sign post.
#
# string::       the user-entered query to parse.
# query_parser:: passed through to to_node_tree ('dismax' by default).
def self.parse(string, query_parser = 'dismax')
  parslet_output = ParsingNesting::Grammar.new.parse(string)
  to_node_tree(parslet_output, query_parser)
end
# Recursively converts parslet output into Node objects.
#
# Theoretically Parslet's Transform could do this, but the labelled hash
# produced by the grammar isn't quite the shape Parslet Transform is set up
# to work with, and doing it 'manually' is easy enough.
#
# tree::         an Array of sub-trees, or a Hash labelled with one of :list,
#                :and_list, :or_list, :not_expression, :mandatory, :excluded,
#                :phrase or :token.
# query_parser:: handed to every List/AndList/OrList that gets built.
#
# Returns the corresponding node, or nil when +tree+ is neither an Array nor
# a Hash carrying one of the labels above.
def self.to_node_tree(tree, query_parser)
  convert = ->(subtree) { to_node_tree(subtree, query_parser) }

  case tree
  when Array
    # At one point top-level lists of one item were normalized to just that
    # item, with no list wrapper. But keeping the list wrapper at the top
    # level is actually useful for Solr output.
    List.new(tree.map(&convert), query_parser)
  when Hash
    if (items = tree[:list])
      List.new(items.map(&convert), query_parser)
    elsif tree.key?(:and_list)
      AndList.new(tree[:and_list].map(&convert), query_parser)
    elsif tree.key?(:or_list)
      OrList.new(tree[:or_list].map(&convert), query_parser)
    elsif (negated = tree[:not_expression])
      NotExpression.new(convert.call(negated))
    elsif tree.key?(:mandatory)
      MandatoryClause.new(convert.call(tree[:mandatory]))
    elsif tree.key?(:excluded)
      ExcludedClause.new(convert.call(tree[:excluded]))
    elsif (phrase_text = tree[:phrase])
      Phrase.new(phrase_text)
    elsif tree.key?(:token)
      Term.new(tree[:token].to_s)
    end
  end
end
# Base class for the nodes of the abstract syntax tree. Provides the default
# #to_query plus protected helpers for building nested Solr queries.
class Node
  # This default to_query works well for anything that is embeddable in
  # a standard way: the node is wrapped, on its own, in a nested query.
  # Non-embeddable nodes have to override it and do it differently.
  def to_query(solr_params)
    build_nested_query([self], solr_params)
  end

  protected # some utility methods

  # Builds a query from a list of Node's that have #to_embed, and some
  # solr params to embed as LocalParams.
  #
  # By default creates a nested _query_, handling escaping appropriately.
  # Pass in :always_nested => false to get the inner query back without the
  # _query_ wrapper (possibly still with LocalParams).
  #
  # LocalParams will by default have "!dismax" added to them, but set
  # :force_deftype to something else (or nil) if you want.
  #
  # Also takes care of simple "pure negative" queries like "-one -two",
  # converting them to a nested NOT query that will be handled
  # appropriately. Those simple negatives can't be handled right by dismax
  # otherwise.
  def build_nested_query(embeddables, solr_params = {}, options = {})
    options = { :always_nested => true,
                :force_deftype => "dismax" }.merge(options)

    # If it's pure negative, transform it: unwrap every ExcludedClause and
    # put a single NOT in front of the list of operands, with mm forced to 1.
    if embeddables.all? { |n| n.is_a?(ExcludedClause) }
      negated = NotExpression.new(List.new(embeddables.collect { |n| n.operand }, options[:force_deftype]))
      return negated.to_query(solr_params.merge(:mm => "1"))
    end

    inner_query = build_local_params(solr_params, options[:force_deftype]) +
                  embeddables.collect { |n| n.to_embed }.join(" ")

    if options[:always_nested]
      '_query_:"' + bs_escape(inner_query) + '"'
    else
      inner_query
    end
  end

  # Renders +hash+ as a Solr LocalParams prefix such as "{!dismax qf=title}".
  # Returns "" when there is nothing to render.
  #
  # Pass in nil as the 2nd argument if you DON'T want to embed
  # "!dismax" in your local params. Used by #to_single_query_params.
  def build_local_params(hash = {}, force_deftype = "dismax")
    # We insist on dismax for our embedded queries, or whatever
    # other defType is supplied in the 2nd argument.
    hash = hash.dup
    if force_deftype
      hash[:defType] = force_deftype
      hash.delete("defType") # avoid weird collision with hard to debug results
    end

    # no local params!
    return "" if hash.empty?

    def_type = hash.delete(:defType) || hash.delete("defType")
    pairs = hash.collect do |k, v|
      # Quote the stringified value, so values that are not Strings
      # (e.g. Symbols) are quoted too instead of raising a TypeError.
      value = v.to_s
      value = "'#{value}'" if value.include?(" ")
      "#{k}=#{value}"
    end
    "{!" + (def_type ? "#{def_type} " : "") + pairs.join(" ") + "}"
  end

  # Puts a backslash in front of every occurrence of +char+ in +val+.
  def bs_escape(val, char = '"')
    # crazy double escaping to actually get a single backslash
    # in there without triggering regexp capture reference
    val.gsub(char, '\\\\' + char)
  end
end
# A list of sibling nodes. This is also what the top of a parsed tree
# always is.
class List < Node
  attr_accessor :list
  attr_reader :query_parser

  # aList::        Array of child nodes.
  # query_parser:: defType forced onto the nested query that #to_query
  #                builds for the embeddable children.
  def initialize(aList, query_parser)
    @query_parser = query_parser
    self.list = aList
  end

  # A plain List is not embeddable.
  def can_embed?
    false
  end

  # True when every child is an ExcludedClause (vacuously true when the
  # list is empty).
  def simple_pure_negative?
    list.all? { |i| i.is_a?(ExcludedClause) }
  end

  # All embeddable children go together into one nested query; every other
  # child generates its own query. The pieces are joined with " AND ".
  def to_query(solr_params = {})
    embeddable, gen_full_query = list.partition { |i| i.respond_to?(:can_embed?) && i.can_embed? }

    queries = []
    unless embeddable.empty?
      queries << build_nested_query(embeddable, solr_params, :force_deftype => query_parser)
    end
    gen_full_query.each { |node| queries << node.to_query(solr_params) }

    queries.join(" AND ")
  end

  # Returns a Hash, assumes this will be the ONLY :q, used for
  # parsing 'simple search' to Solr. Pass in params that need to
  # be LOCAL solr params (using "{foo=bar}" embedded in query).
  # Params that should be sent to Solr separately are the caller's
  # responsibility, merge 'em into the returned hash.
  #
  # For very simple queries, this will produce an ordinary Solr q
  # much like would be produced ordinarily. But for AND/OR/NOT, will
  # sometimes include multiple nested queries instead.
  #
  # This method will still sometimes return a single nested _query_, that
  # could theoretically really be an ordinary query possibly with
  # localparams. It still works, but isn't optimizing for a simpler query,
  # because it's using much of the same code used for combining multiple
  # fields that need nested queries. Maybe we'll optimize later, but the
  # code gets tricky.
  def to_single_query_params(solr_local_params)
    # Can it be expressed in a single dismax?
    if list.all? { |i| i.respond_to?(:can_embed?) && i.can_embed? }
      {
        :q => build_nested_query(list, solr_local_params, :always_nested => false, :force_deftype => nil),
        :defType => query_parser
      }
    else
      # Can't be expressed in a single dismax, do it the normal way
      {
        :q => to_query(solr_local_params),
        :defType => "lucene"
      }
    end
  end

  # Returns a new List holding the negation of every child. The query
  # parser is carried over, since List#initialize requires it.
  def negate
    List.new(list.collect { |i| i.negate }, query_parser)
  end
end
# A list whose children are all required (logical AND).
class AndList < List
  # We make an and-list embeddable only if all its elements
  # are embeddable, then no problem we just embed them all
  # as Solr '+' mandatory, and achieve the AND.
  # For now, pure negative is considered not embeddable, although
  # theoretically it could sometimes be embedded if transformed
  # properly.
  def can_embed?
    !simple_pure_negative? && list.all? { |i| i.can_embed? }
  end

  # Only valid if all operands are embeddable.
  # Trick is if they were bare terms/phrases, we add a '+' on
  # front, but if they already were +/-, then we don't need to,
  # and leaving them alone will have the desired semantics.
  # This works even on "-", because dismax mm seems to not consider "-"
  # clauses, they are always required regardless of mm.
  def to_embed
    list.collect do |operand|
      s = operand.to_embed
      # Only the very first character matters here; a '+' or '-' later in
      # the string (e.g. after a newline inside a phrase) must not count.
      s.start_with?('+', '-') ? s : '+' + s
    end.join(" ")
  end

  # For those that aren't embeddable, or are pure negative.
  def to_query(local_params)
    if simple_pure_negative?
      # Can do it in one single nested dismax, if we're simple arguments
      # that are pure negative.
      # build_nested_query will handle negating the pure negative for us.
      build_nested_query(list, local_params)
    else
      "( " +
        list.collect { |i| i.to_query(local_params) }.join(" AND ") +
        " )"
    end
  end

  # Convenient logical property here, not(a AND b) === not(a) OR not(b).
  # The query parser is carried over, since the list constructors require it.
  def negate
    OrList.new(list.collect { |n| n.negate }, query_parser)
  end
end
# A list of which at least one child has to match (logical OR).
class OrList < List
  # never embeddable
  def can_embed?
    false
  end

  # Okay, we're never embeddable as such, but sometimes we can
  # turn our operands into one single nested dismax query with mm=1, when
  # all our operands are 'simple'; other times we need to actually do
  # separate nested queries separated by lucene OR.
  # If all our children are embeddable but _not_ an "AndList", we can
  # do the one query part. The AndList is theoretically embeddable, but
  # not in a way compatible with flattening an OR to one query.
  # Sorry, this part is one of the least clean parts of this code!
  def to_query(local_params)
    not_flattenable = list.any? { |i| !i.can_embed? || i.is_a?(AndList) }

    if not_flattenable
      to_multi_queries(local_params)
    elsif simple_pure_negative?
      to_simple_pure_negative_query(local_params)
    else
      to_one_dismax_query(local_params)
    end
  end

  # build_nested_query isn't smart enough to handle refactoring
  # a simple pure negative "OR", that needs an mm of 100%.
  # Let's just do it ourselves. What we're doing makes more sense
  # if you remember that:
  # -a OR -b === NOT (a AND b)
  def to_simple_pure_negative_query(local_params)
    # take 'em out of their ExcludedClauses
    embeddables = list.collect { |n| n.operand }
    # and insist on mm 100%
    solr_params = local_params.merge(:mm => "100%")
    # and put the NOT in front to preserve semantics.
    'NOT _query_:"' +
      bs_escape(build_local_params(solr_params) +
                embeddables.collect { |n| n.to_embed }.join(" ")) +
      '"'
  end

  # All our arguments are 'simple' (terms and phrases with +/-),
  # put 'em all in one single dismax with mm forced to 1.
  def to_one_dismax_query(local_params)
    build_nested_query(list, local_params.merge(:mm => "1"))
  end

  # One query per child, joined with lucene OR.
  def to_multi_queries(local_params)
    "( " +
      list.collect do |i|
        if i.is_a?(NotExpression) || (i.respond_to?(:simple_pure_negative?) && i.simple_pure_negative?)
          # need special handling to work around Solr 1.4.1's lack of handling
          # of pure negative in an OR
          "(*:* AND #{i.to_query(local_params)})"
        else
          i.to_query(local_params)
        end
      end.join(" OR ") +
      " )"
  end

  # Convenient logical property here, not(a OR b) === not(a) AND not(b).
  # The query parser is carried over, since the list constructors require it.
  def negate
    AndList.new(list.collect { |n| n.negate }, query_parser)
  end
end
# Logical negation of a single operand node.
class NotExpression
  attr_accessor :operand

  def initialize(exp)
    self.operand = exp
  end

  # NOT expressions are never embedded into a nested query.
  def can_embed?
    false
  end

  # Negating a negation just hands back what was being negated.
  def negate
    operand
  end

  # Prefixes the operand's query with "NOT ".
  #
  # OrList does the weird "*:* AND NOT (real thing)" wrapping around this,
  # because Solr 1.4.1 seems not to be able to handle "x OR NOT y" otherwise,
  # at least in some cases, but does fine with "x OR (*:* AND NOT y)", which
  # should mean the same thing.
  def to_query(solr_params)
    # Rescue double-nots: don't make Solr do more work than it needs to with
    # a double negative, just emit the inner expression directly.
    return operand.operand.to_query(solr_params) if operand.is_a?(NotExpression)

    "NOT " + operand.to_query(solr_params)
  end
end
# A clause marked as required with a leading '+'.
class MandatoryClause < Node
  attr_accessor :operand

  def initialize(operand)
    @operand = operand
  end

  # Right now '+' clauses only apply to terms/phrases,
  # which we can embed with a + in front.
  def can_embed?
    true
  end

  # The operand's embeddable form with '+' in front.
  def to_embed
    plus = '+'
    plus + operand.to_embed
  end

  # Negating mandatory to excluded is decent semantics, although
  # it's not strictly 'true', it's a choice.
  def negate
    ExcludedClause.new(operand)
  end
end
# A clause marked as excluded with a leading '-'.
class ExcludedClause < Node
  attr_accessor :operand

  def initialize(operand)
    @operand = operand
  end

  # Right now '-' clauses only apply to terms/phrases, which
  # we can embed with a '-' in front.
  def can_embed?
    true
  end

  # An excluded clause on its own always counts as pure negative.
  def simple_pure_negative?
    true
  end

  # The operand's embeddable form with '-' in front.
  def to_embed
    minus = '-'
    minus + operand.to_embed
  end

  # Negating excluded to mandatory is a pretty decent choice.
  def negate
    MandatoryClause.new(operand)
  end
end
# A phrase, embedded into queries wrapped in double quotes.
class Phrase < Node
  attr_accessor :value

  def initialize(string)
    @value = string
  end

  # Phrases can always be embedded.
  def can_embed?
    true
  end

  # The phrase text surrounded by double quotes.
  def to_embed
    quote = '"'
    quote + value + quote
  end

  # Negating a phrase wraps it in an ExcludedClause.
  def negate
    ExcludedClause.new(self)
  end
end
# A single bare token.
class Term < Node
  attr_accessor :value

  def initialize(string)
    @value = string
  end

  # Terms can always be embedded.
  def can_embed?
    true
  end

  # A term embeds as its value, unchanged.
  def to_embed
    value
  end

  # Negating a term wraps it in an ExcludedClause.
  def negate
    ExcludedClause.new(self)
  end
end
- end