PageRenderTime 24ms CodeModel.GetById 53ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/parsing_nesting/tree.rb

https://github.com/projectblacklight/blacklight_advanced_search
Ruby | 443 lines | 273 code | 50 blank | 120 comment | 21 complexity | c2bdb2619117cd36104c2bfe86d05a21 MD5 | raw file
Possible License(s): Apache-2.0
  1. require 'parsing_nesting/grammar'
  2. module ParsingNesting::Tree
  3. # Get parslet output for string (parslet output is json-y objects), and
  4. # transform to an actual abstract syntax tree made up of more semantic
  5. # ruby objects, Node's. The top one will always be a List.
  6. #
  7. # Call #to_query on resulting Node in order to transform to Solr query,
  8. # optionally passing in Solr params to be used as LocalParams in nested
  9. # dismax queries.
  10. #
  11. # Our approach here works, but as we have to put in special cases
  12. # it starts getting messy. Ideally we might want to actually transform
  13. # the Object graph (abstract syntax tree) instead of trying to handle
  14. # special cases in #to_query.
  15. # For instance, transform object graph for a problematic pure-negative
  16. # clause to the corresponding object graph without that (-a AND -b) ==>
  17. # (NOT (a OR b)). Transform (NOT NOT a) to (a). That would probably be
  18. # more robust. But instead we handle special cases in to_query, which
  19. # means the special cases tend to multiply and need to be handled at
  20. # multiple levels. But it's working for now.
  21. #
  22. # the #negate method was an experiment in transforming parse tree in
  23. # place, but isn't being used. But it's left as a sign post.
  # Entry point: run +string+ through ParsingNesting::Grammar and convert the
  # raw parse output into Node objects with to_node_tree.
  #
  # string       - the query text handed to the grammar.
  # query_parser - passed straight through to to_node_tree; 'dismax' when
  #                the caller does not supply one.
  #
  # Returns whatever to_node_tree builds for the grammar's output.
  def self.parse(string, query_parser = 'dismax')
    parslet_output = ParsingNesting::Grammar.new.parse(string)
    to_node_tree(parslet_output, query_parser)
  end
  # Hand-rolled conversion of the grammar's labelled hashes/arrays into Node
  # objects. (The original author notes Parslet's own Transform could in
  # theory do this, but the labelled-hash shape produced here did not fit it.)
  #
  # tree         - an Array of sub-trees, or a Hash carrying one of the keys
  #                :list, :and_list, :or_list, :not_expression, :mandatory,
  #                :excluded, :phrase or :token.
  # query_parser - handed to every List/AndList/OrList that gets built.
  #
  # Returns the matching node object, or nil when +tree+ is neither an Array
  # nor a Hash with a recognised key.
  def self.to_node_tree(tree, query_parser)
    build = ->(subtree) { to_node_tree(subtree, query_parser) }
    build_all = ->(subtrees) { subtrees.map { |subtree| build.call(subtree) } }

    case tree
    when Array
      # A top-level array always stays wrapped in a List, even with a single
      # item: the wrapper is what the Solr output code works from.
      List.new(build_all.call(tree), query_parser)
    when Hash
      # :list, :not_expression and :phrase are matched on a truthy value;
      # the remaining keys are matched on mere presence.
      if (items = tree[:list])
        List.new(build_all.call(items), query_parser)
      elsif tree.key?(:and_list)
        AndList.new(build_all.call(tree[:and_list]), query_parser)
      elsif tree.key?(:or_list)
        OrList.new(build_all.call(tree[:or_list]), query_parser)
      elsif (negated = tree[:not_expression])
        NotExpression.new(build.call(negated))
      elsif tree.key?(:mandatory)
        MandatoryClause.new(build.call(tree[:mandatory]))
      elsif tree.key?(:excluded)
        ExcludedClause.new(build.call(tree[:excluded]))
      elsif (phrase_text = tree[:phrase])
        Phrase.new(phrase_text)
      elsif tree.key?(:token)
        Term.new(tree[:token].to_s)
      end
    end
  end
  # Base class for the syntax-tree nodes. Supplies a default #to_query plus
  # protected helpers for assembling nested Solr queries with LocalParams.
  class Node
    # Default rendering: wraps this node alone in a nested query. Relies on
    # the node responding to #to_embed; nodes that cannot be embedded that
    # way must override this method.
    #
    # solr_params - Hash of params to embed as LocalParams.
    #
    # Returns the query String.
    def to_query(solr_params)
      build_nested_query([self], solr_params)
    end

    protected # utility methods shared with subclasses

    # Builds one query out of several nodes that respond to #to_embed.
    #
    # embeddables - Array of nodes to combine.
    # solr_params - Hash of params to embed as LocalParams.
    # options     - Hash:
    #               :always_nested - when true (default) the result is wrapped
    #                                as _query_:"..." with inner double quotes
    #                                escaped; when false the bare query
    #                                (possibly with LocalParams) is returned.
    #               :force_deftype - defType forced into the LocalParams
    #                                ("dismax" by default, nil for none).
    #
    # When every element is an ExcludedClause ("-one -two") the clauses are
    # unwrapped, grouped in a List, and rendered as a NOT expression with
    # mm=1 instead of being embedded directly; the original author notes
    # dismax cannot handle such pure-negative queries otherwise.
    #
    # Returns the query String.
    def build_nested_query(embeddables, solr_params = {}, options = {})
      options = { :always_nested => true, :force_deftype => "dismax" }.merge(options)

      if embeddables.all? { |n| n.is_a?(ExcludedClause) }
        operands = embeddables.map { |n| n.operand }
        negated = NotExpression.new(List.new(operands, options[:force_deftype]))
        return negated.to_query(solr_params.merge(:mm => "1"))
      end

      inner_query = build_local_params(solr_params, options[:force_deftype]) +
                    embeddables.map { |n| n.to_embed }.join(" ")

      if options[:always_nested]
        '_query_:"' + bs_escape(inner_query) + '"'
      else
        inner_query
      end
    end

    # Renders +hash+ as a Solr LocalParams prefix such as "{!dismax mm=1}".
    #
    # hash          - Hash of params; never mutated (a copy is used).
    # force_deftype - defType to force, replacing any :defType/"defType"
    #                 entry. Pass nil to leave the hash's own defType (if
    #                 any) in charge.
    #
    # Values whose string form contains a space are wrapped in single quotes.
    # The string form is used for the quoting too, so non-String values
    # (symbols, numbers, ...) no longer raise TypeError there.
    #
    # Returns the LocalParams String, or "" when there is nothing to emit.
    def build_local_params(hash = {}, force_deftype = "dismax")
      hash = hash.dup
      if force_deftype
        hash[:defType] = force_deftype
        hash.delete("defType") # avoid a symbol/string key collision
      end
      return "" if hash.empty?

      def_type = hash.delete(:defType) || hash.delete("defType")
      pairs = hash.map do |key, value|
        text = value.to_s
        "#{key}=#{text.include?(' ') ? "'" + text + "'" : text}"
      end

      "{!" + (def_type ? "#{def_type} " : "") + pairs.join(" ") + "}"
    end

    # Prefixes every occurrence of +char+ in +val+ with a single backslash.
    # The block form of gsub is used so the backslash is inserted literally
    # rather than being read as a back-reference in a replacement string.
    def bs_escape(val, char = '"')
      val.gsub(char) { |found| "\\" + found }
    end
  end
  # An ordered group of child nodes plus the query parser name used when the
  # children are rendered as embedded queries.
  class List < Node
    attr_accessor :list
    attr_reader :query_parser

    # aList        - Array of child nodes.
    # query_parser - defType used for embedded queries built by #to_query and
    #                reported by #to_single_query_params.
    def initialize(aList, query_parser)
      @query_parser = query_parser
      self.list = aList
    end

    # A plain List is never embedded inside another query.
    def can_embed?
      false
    end

    # True when every child is an ExcludedClause (also true for an empty list).
    def simple_pure_negative?
      list.all? { |i| i.is_a?(ExcludedClause) }
    end

    # Renders the list as a Solr query String. Children that report
    # can_embed? are combined into one nested query using query_parser as the
    # forced defType; every other child renders itself via its own #to_query.
    # The pieces are joined with " AND ".
    def to_query(solr_params = {})
      embeddable, gen_full_query = list.partition { |i| i.respond_to?(:can_embed?) && i.can_embed? }

      queries = []
      unless embeddable.empty?
        queries << build_nested_query(embeddable, solr_params, :force_deftype => query_parser)
      end
      gen_full_query.each { |node| queries << node.to_query(solr_params) }

      queries.join(" AND ")
    end

    # Returns a Hash with :q and :defType, on the assumption that this is the
    # ONLY :q sent to Solr (the 'simple search' case).
    #
    # solr_local_params - params that must travel as LOCAL params inside the
    #                     query. Params to be sent to Solr separately are the
    #                     caller's responsibility to merge into the result.
    #
    # When every child can be embedded, :q is built without the _query_
    # wrapper and without a forced defType in the LocalParams, and :defType
    # is query_parser. Otherwise :q is the regular #to_query output and
    # :defType is "lucene".
    def to_single_query_params(solr_local_params)
      if list.all? { |i| i.respond_to?(:can_embed?) && i.can_embed? }
        {
          :q => build_nested_query(list, solr_local_params, :always_nested => false, :force_deftype => nil),
          :defType => query_parser
        }
      else
        # Can't be expressed as one query for query_parser; do it the normal way.
        {
          :q => to_query(solr_local_params),
          :defType => "lucene"
        }
      end
    end

    # Returns a new List holding each child's negation. query_parser is
    # passed along because #initialize requires it; the previous one-argument
    # call raised ArgumentError.
    def negate
      List.new(list.map { |i| i.negate }, query_parser)
    end
  end
  # A List whose children are all required (logical AND).
  class AndList < List
    # Embeddable only when every child is embeddable and the list is not
    # purely negative; the children are then embedded as mandatory clauses.
    # Pure-negative lists are treated as not embeddable for now.
    def can_embed?
      !simple_pure_negative? && list.all? { |i| i.can_embed? }
    end

    # Embeds every child, adding a leading '+' to any child whose embedded
    # form does not already start with '+' or '-'. Children that already
    # carry a sign are left alone. (Original author's note: dismax mm appears
    # not to count '-' clauses, so they stay required regardless of mm.)
    #
    # The sign test is anchored at the start of the string (\A) so that a
    # '+'/'-' after an embedded newline is not mistaken for a leading sign.
    def to_embed
      list.map do |operand|
        s = operand.to_embed
        s =~ /\A[+-]/ ? s : '+' + s
      end.join(" ")
    end

    # Rendering for lists that cannot be embedded, or are purely negative.
    def to_query(local_params)
      if simple_pure_negative?
        # One nested query suffices; build_nested_query takes care of
        # turning the pure negative into a NOT expression.
        build_nested_query(list, local_params)
      else
        "( " + list.map { |i| i.to_query(local_params) }.join(" AND ") + " )"
      end
    end

    # Convenient logical property: not(a AND b) === not(a) OR not(b).
    # query_parser is passed along because the list constructor requires it;
    # the previous one-argument call raised ArgumentError.
    def negate
      OrList.new(list.map { |n| n.negate }, query_parser)
    end
  end
  # A List whose children are alternatives (logical OR).
  class OrList < List
    # Never embeddable as such.
    def can_embed?
      false
    end

    # Picks one of three renderings:
    # * any child that is not embeddable, or is an AndList -> separate
    #   queries joined by OR (an AndList is embeddable in principle, but not
    #   in a way compatible with flattening an OR into one query);
    # * all children are ExcludedClauses -> a single negated nested query;
    # * otherwise -> a single nested query with mm forced to 1.
    def to_query(local_params)
      not_flattenable = list.any? { |i| !i.can_embed? || i.is_a?(AndList) }

      if not_flattenable
        to_multi_queries(local_params)
      elsif simple_pure_negative?
        to_simple_pure_negative_query(local_params)
      else
        to_one_dismax_query(local_params)
      end
    end

    # Handles a pure-negative OR by hand, relying on
    #   -a OR -b === NOT (a AND b)
    # The operands are taken out of their ExcludedClauses, mm is forced to
    # "100%", and the nested query is prefixed with NOT.
    #
    # NOTE(review): build_local_params is called with its default defType
    # ("dismax") here rather than this list's query_parser - confirm whether
    # that is intended.
    def to_simple_pure_negative_query(local_params)
      embeddables = list.map { |n| n.operand }
      solr_params = local_params.merge(:mm => "100%")

      'NOT _query_:"' +
        bs_escape(build_local_params(solr_params) +
                  embeddables.map { |n| n.to_embed }.join(" ")) +
        '"'
    end

    # All children are simple embeddable clauses: put them in one nested
    # query with mm forced to "1".
    #
    # NOTE(review): uses build_nested_query's default defType ("dismax")
    # rather than this list's query_parser - confirm whether that is intended.
    def to_one_dismax_query(local_params)
      build_nested_query(list, local_params.merge(:mm => "1"))
    end

    # Renders each child separately and joins the results with " OR ".
    # NotExpression children, and children reporting simple_pure_negative?,
    # are wrapped as "(*:* AND ...)": per the original author this works
    # around Solr 1.4.1 not handling a pure negative inside an OR.
    def to_multi_queries(local_params)
      parts = list.map do |i|
        if i.is_a?(NotExpression) || (i.respond_to?(:simple_pure_negative?) && i.simple_pure_negative?)
          "(*:* AND #{i.to_query(local_params)})"
        else
          i.to_query(local_params)
        end
      end

      "( " + parts.join(" OR ") + " )"
    end

    # Convenient logical property: not(a OR b) === not(a) AND not(b).
    # query_parser is passed along because the list constructor requires it;
    # the previous one-argument call raised ArgumentError.
    def negate
      AndList.new(list.map { |n| n.negate }, query_parser)
    end
  end
  # Logical NOT around a single operand. Unlike the other node classes this
  # one does not inherit from Node.
  class NotExpression
    attr_accessor :operand

    # exp - the node being negated.
    def initialize(exp)
      @operand = exp
    end

    # Never embeddable.
    def can_embed?
      false
    end

    # Negating a NOT simply yields the operand it wraps.
    def negate
      operand
    end

    # Renders "NOT " followed by the operand's own query. A directly nested
    # NotExpression (double negative) is collapsed: the inner operand's query
    # is returned without any NOT, so Solr is not given needless work.
    #
    # No "*:* AND" guard is added here; within this file that wrapping is
    # applied by OrList#to_multi_queries.
    def to_query(solr_params)
      inner = operand
      return inner.operand.to_query(solr_params) if inner.is_a?(NotExpression)

      "NOT " + inner.to_query(solr_params)
    end
  end
  # A clause marked as required with a leading '+'.
  class MandatoryClause < Node
    attr_accessor :operand

    # v - the node that is being made mandatory.
    def initialize(v)
      @operand = v
    end

    # The operand's embedded form with '+' in front.
    def to_embed
      embedded = operand.to_embed
      '+' + embedded
    end

    # Always embeddable; per the original author '+' clauses currently only
    # wrap terms and phrases, which embed fine with a '+' in front.
    def can_embed?
      true
    end

    # Negation turns the clause into an ExcludedClause around the same
    # operand. This is a design choice rather than a strict logical identity.
    def negate
      ExcludedClause.new(operand)
    end
  end
  # A clause marked as excluded with a leading '-'.
  class ExcludedClause < Node
    attr_accessor :operand

    # v - the node that is being excluded.
    def initialize(v)
      @operand = v
    end

    # The operand's embedded form with '-' in front.
    def to_embed
      embedded = operand.to_embed
      '-' + embedded
    end

    # Always embeddable; per the original author '-' clauses currently only
    # wrap terms and phrases, which embed fine with a '-' in front.
    def can_embed?
      true
    end

    # A lone excluded clause is by definition purely negative.
    def simple_pure_negative?
      true
    end

    # Negation turns the clause into a MandatoryClause around the same operand.
    def negate
      MandatoryClause.new(operand)
    end
  end
  # A quoted phrase.
  class Phrase < Node
    attr_accessor :value

    # string - the phrase text, without the surrounding double quotes.
    def initialize(string)
      @value = string
    end

    # The phrase text wrapped in double quotes.
    def to_embed
      quote = '"'
      quote + value + quote
    end

    # Phrases can always be embedded.
    def can_embed?
      true
    end

    # Negating a phrase wraps it in an ExcludedClause.
    def negate
      ExcludedClause.new(self)
    end
  end
  # A single bare search term.
  class Term < Node
    attr_accessor :value

    # string - the term text.
    def initialize(string)
      @value = string
    end

    # A term embeds as its own text, unchanged.
    def to_embed
      value
    end

    # Terms can always be embedded.
    def can_embed?
      true
    end

    # Negating a term wraps it in an ExcludedClause.
    def negate
      ExcludedClause.new(self)
    end
  end
  384. end