
/modules/websearch/lib/search_engine_query_parser.py

https://github.com/chokribr/invenio-1
Python | 1347 lines
Possible License(s): GPL-2.0

Note: this large file is truncated; the listing below ends partway through the file.

# -*- coding: utf-8 -*-
## This file is part of Invenio.
## Copyright (C) 2008, 2010, 2011, 2012, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

# pylint: disable=C0301

"""Invenio Search Engine query parsers."""

import re
import string

from invenio.dateutils import datetime

try:
    import dateutil
    if not hasattr(dateutil, '__version__') or dateutil.__version__ != '2.0':
        from dateutil import parser as du_parser
        from dateutil.relativedelta import relativedelta as du_delta
        from dateutil import relativedelta
        GOT_DATEUTIL = True
    else:
        from warnings import warn
        warn("Not using dateutil module because the version %s is not compatible with Python-2.x" % dateutil.__version__)
        GOT_DATEUTIL = False
except ImportError:
    # Ok, no date parsing is possible, but continue anyway,
    # since this package is only recommended, not mandatory.
    GOT_DATEUTIL = False

from invenio.bibindex_tokenizers.BibIndexAuthorTokenizer import BibIndexAuthorTokenizer as FNT
from invenio.logicutils import to_cnf
from invenio.config import CFG_WEBSEARCH_SPIRES_SYNTAX
from invenio.dateutils import strptime, strftime

NameScanner = FNT()
class InvenioWebSearchMismatchedParensError(Exception):
    """Exception for parse errors caused by mismatched parentheses."""

    def __init__(self, message):
        """Initialization."""
        self.message = message

    def __str__(self):
        """String representation."""
        return repr(self.message)
class SearchQueryParenthesisedParser(object):
    """Search query parser that handles arbitrarily-nested parentheses

    Parameters:
    * substitution_dict: a dictionary mapping strings to other strings. By
      default, maps 'and', 'or' and 'not' to '+', '|', and '-'. Dictionary
      values will be treated as valid operators for output.

    A note (valkyrie 25.03.2011):
    Based on looking through the prod search logs, it is evident that users,
    when they are using parentheses to do searches, only run word characters
    up against parens when they intend the parens to be part of the word (e.g.
    U(1)), and when they are using parentheses to combine operators, they put
    a space before and after them. As of writing, this is the behavior that
    SQPP now expects, in order that it be able to handle such queries as
    e(+)e(-) that contain operators in parentheses that should be interpreted
    as words.
    """

    def __init__(self, substitution_dict={'and': '+', 'or': '|', 'not': '-'}):
        self.substitution_dict = substitution_dict
        self.specials = set(['(', ')', '+', '|', '-', '+ -'])
        self.__tl_idx = 0
        self.__tl_len = 0
    # I think my names are both concise and clear
    # pylint: disable=C0103
    def _invenio_to_python_logical(self, q):
        """Translate the + and - in invenio query strings into & and ~."""
        p = q
        p = re.sub('\+ -', '&~', p)
        p = re.sub('\+', '&', p)
        p = re.sub('-', '~', p)
        p = re.sub(' ~', ' & ~', p)
        return p

    def _python_logical_to_invenio(self, q):
        """Translate the & and ~ in logical expression strings into + and -."""
        p = q
        p = re.sub('\& ~', '-', p)
        p = re.sub('~', '-', p)
        p = re.sub('\&', '+', p)
        return p
    # pylint: enable=C0103
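    # Hypothetical round-trip examples (not in the original file), following
    # the substitution rules above, with p = SearchQueryParenthesisedParser():
    #   >>> p._invenio_to_python_logical('p0 + p1 + - p2')
    #   'p0 & p1 &~ p2'
    #   >>> p._python_logical_to_invenio('p0 & ~ p2')
    #   'p0 - p2'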
    def parse_query(self, query):
        """Make query into something suitable for search_engine.

        This is the main entry point of the class.

        Given an expression of the form:
        "expr1 or expr2 (expr3 not (expr4 or expr5))"
        produces annotated list output suitable for consumption by search_engine,
        of the form:
        ['+', 'expr1', '|', 'expr2', '+', 'expr3 - expr4 | expr5']

        parse_query() is a wrapper for self.tokenize() and self.parse().
        """
        toklist = self.tokenize(query)
        depth, balanced, dummy_d0_p = self.nesting_depth_and_balance(toklist)
        if not balanced:
            raise SyntaxError("Mismatched parentheses in " + str(toklist))
        toklist, var_subs = self.substitute_variables(toklist)
        if depth > 1:
            toklist = self.tokenize(self.logically_reduce(toklist))
        return self.parse(toklist, var_subs)
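    # Usage sketch (not in the original file; repeats the docstring's own
    # example, assuming the invenio modules imported above are available):
    #   >>> p = SearchQueryParenthesisedParser()
    #   >>> p.parse_query("expr1 or expr2 (expr3 not (expr4 or expr5))")
    #   ['+', 'expr1', '|', 'expr2', '+', 'expr3 - expr4 | expr5']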
    def substitute_variables(self, toklist):
        """Given a token list, return a copy of token list in which all free
        variables are bound with boolean variable names of the form 'pN'.
        Additionally, all the substitutable logical operators are exchanged
        for their symbolic form and implicit ands are made explicit

        e.g., ((author:'ellis, j' and title:quark) or author:stevens jones)
        becomes:
        ((p0 + p1) | p2 + p3)
        with the substitution table:
        {'p0': "author:'ellis, j'", 'p1': "title:quark",
         'p2': "author:stevens", 'p3': "jones" }

        Return value is the substituted token list and a copy of the
        substitution table.
        """
        def labels():
            i = 0
            while True:
                yield 'p' + str(i)
                i += 1

        def filter_front_ands(toklist):
            """Filter out extra logical connectives and whitespace from the front."""
            while toklist[0] == '+' or toklist[0] == '|' or toklist[0] == '':
                toklist = toklist[1:]
            return toklist

        var_subs = {}
        labeler = labels()
        new_toklist = ['']
        cannot_be_anded = self.specials.difference((')',))
        for token in toklist:
            token = token.lower()
            if token in self.substitution_dict:
                if token == 'not' and new_toklist[-1] == '+':
                    new_toklist[-1] = '-'
                else:
                    new_toklist.append(self.substitution_dict[token])
            elif token == '(':
                if new_toklist[-1] not in self.specials:
                    new_toklist.append('+')
                new_toklist.append(token)
            elif token not in self.specials:
                # apparently generators are hard for pylint to figure out
                # Turns off msg about labeler not having a 'next' method
                # pylint: disable=E1101
                label = labeler.next()
                # pylint: enable=E1101
                var_subs[label] = token
                if new_toklist[-1] not in cannot_be_anded:
                    new_toklist.append('+')
                new_toklist.append(label)
            else:
                if token == '-' and new_toklist[-1] == '+':
                    new_toklist[-1] = '-'
                else:
                    new_toklist.append(token)
        return filter_front_ands(new_toklist), var_subs
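    # Worked example (not in the original file), matching the docstring above:
    # tokenizing "((author:'ellis, j' and title:quark) or author:stevens jones)"
    # and feeding the tokens through substitute_variables should yield
    #   (['(', '(', 'p0', '+', 'p1', ')', '|', 'p2', '+', 'p3', ')'],
    #    {'p0': "author:'ellis, j'", 'p1': 'title:quark',
    #     'p2': 'author:stevens', 'p3': 'jones'})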
    def nesting_depth_and_balance(self, token_list):
        """Checks that parentheses are balanced and counts how deep they nest"""
        depth = 0
        maxdepth = 0
        depth0_pairs = 0
        good_depth = True
        for i in range(len(token_list)):
            token = token_list[i]
            if token == '(':
                if depth == 0:
                    depth0_pairs += 1
                depth += 1
                if depth > maxdepth:
                    maxdepth += 1
            elif token == ')':
                depth -= 1
                if depth == -1:         # can only happen with unmatched )
                    good_depth = False  # so force depth check to fail
                    depth = 0           # but keep maxdepth in good range
        return maxdepth, depth == 0 and good_depth, depth0_pairs
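    # Illustrative values (not in the original file), with
    # p = SearchQueryParenthesisedParser():
    #   >>> p.nesting_depth_and_balance(['(', 'a', '(', 'b', ')', ')'])
    #   (2, True, 1)
    #   >>> p.nesting_depth_and_balance([')', '('])
    #   (1, False, 1)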
    def logically_reduce(self, token_list):
        """Return token_list in conjunctive normal form as a string.

        CNF has the property that there will only ever be one level of
        parenthetical nesting, and all distributable operators (such as
        the not in -(p | q) will be fully distributed (as -p + -q).
        """
        maxdepth, dummy_balanced, d0_p = self.nesting_depth_and_balance(token_list)
        s = ' '.join(token_list)
        s = self._invenio_to_python_logical(s)
        last_maxdepth = 0
        while maxdepth != last_maxdepth:  # XXX: sometimes NaryExpr doesn't
            try:                          # fully flatten Expr; but it usually
                s = str(to_cnf(s))        # does in 2 passes FIXME: diagnose
            except SyntaxError:
                raise SyntaxError(str(s) + " couldn't be converted to a logic expression.")
            last_maxdepth = maxdepth
            maxdepth, dummy_balanced, d0_p = self.nesting_depth_and_balance(self.tokenize(s))
        if d0_p == 1 and s[0] == '(' and s[-1] == ')':  # s can come back with extra parens
            s = s[1:-1]
        s = self._python_logical_to_invenio(s)
        return s
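    # Effect sketch (not in the original file; the exact output string depends
    # on how invenio.logicutils.to_cnf prints its result): reducing the token
    # list ['-', '(', 'p0', '|', 'p1', ')'] distributes the negation and
    # returns a flat, paren-free conjunction equivalent to "- p0 + - p1",
    # per the docstring above.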
    def tokenize(self, query):
        """Given a query string, return a list of tokens from that string.

        * Isolates meaningful punctuation: ( ) + | -
        * Keeps single- and double-quoted strings together without interpretation.
        * Splits everything else on whitespace.

        i.e.:
        "expr1|expr2 (expr3-(expr4 or expr5))"
        becomes:
        ['expr1', '|', 'expr2', '(', 'expr3', '-', '(', 'expr4', 'or', 'expr5', ')', ')']

        special case:
        "e(+)e(-)" interprets '+' and '-' as word characters since they are in parens with
        word characters run up against them.
        it becomes:
        ['e(+)e(-)']
        """
        ###
        # Invariants:
        # * Query is never modified
        # * In every loop iteration, querytokens grows to the right
        # * The only return point is at the bottom of the function, and the only
        #   return value is querytokens
        ###

        def get_tokens(s):
            """
            Given string s, return a list of s's tokens.

            Adds space around special punctuation, then splits on whitespace.
            """
            s = ' ' + s
            s = s.replace('->', '####DATE###RANGE##OP#')  # XXX: Save '->'
            s = re.sub('(?P<outside>[a-zA-Z0-9_,=:]+)\((?P<inside>[a-zA-Z0-9_,+-/]*)\)',
                       '#####\g<outside>####PAREN###\g<inside>##PAREN#', s)  # XXX: Save U(1) and SL(2,Z)
            s = re.sub('####PAREN###(?P<content0>[.0-9/-]*)(?P<plus>[+])(?P<content1>[.0-9/-]*)##PAREN#',
                       '####PAREN###\g<content0>##PLUS##\g<content1>##PAREN#', s)
            s = re.sub('####PAREN###(?P<content0>([.0-9/]|##PLUS##)*)(?P<minus>[-])' +
                       '(?P<content1>([.0-9/]|##PLUS##)*)##PAREN#',
                       '####PAREN###\g<content0>##MINUS##\g<content1>##PAREN#', s)  # XXX: Save e(+)e(-)
            for char in self.specials:
                if char == '-':
                    s = s.replace(' -', ' - ')
                    s = s.replace(')-', ') - ')
                    s = s.replace('-(', ' - (')
                else:
                    s = s.replace(char, ' ' + char + ' ')
            s = re.sub('##PLUS##', '+', s)
            s = re.sub('##MINUS##', '-', s)  # XXX: Restore e(+)e(-)
            s = re.sub('#####(?P<outside>[a-zA-Z0-9_,=:]+)####PAREN###(?P<inside>[a-zA-Z0-9_,+-/]*)##PAREN#',
                       '\g<outside>(\g<inside>)', s)  # XXX: Restore U(1) and SL(2,Z)
            s = s.replace('####DATE###RANGE##OP#', '->')  # XXX: Restore '->'
            return s.split()

        querytokens = []
        current_position = 0
        re_quotes_match = re.compile(r'(?![\\])(".*?[^\\]")' + r"|(?![\\])('.*?[^\\]')")
        for match in re_quotes_match.finditer(query):
            match_start = match.start()
            quoted_region = match.group(0).strip()

            # clean the content after the previous quotes and before current quotes
            unquoted = query[current_position : match_start]
            querytokens.extend(get_tokens(unquoted))

            # XXX: In case we end up with e.g. title:, "compton scattering", make it
            # title:"compton scattering"
            if querytokens and querytokens[0] and querytokens[-1][-1] == ':':
                querytokens[-1] += quoted_region
            # XXX: In case we end up with e.g. "expr1",->,"expr2", make it
            # "expr1"->"expr2"
            elif len(querytokens) >= 2 and querytokens[-1] == '->':
                arrow = querytokens.pop()
                querytokens[-1] += arrow + quoted_region
            else:
                # add our newly tokenized content to the token list
                querytokens.extend([quoted_region])

            # move current position to the end of the tokenized content
            current_position = match.end()

        # get tokens from the last appearance of quotes until the query end
        unquoted = query[current_position : len(query)]
        querytokens.extend(get_tokens(unquoted))
        return querytokens
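    # Doctest-style sketch (not in the original file), repeating the
    # docstring's examples:
    #   >>> p.tokenize('expr1|expr2 (expr3-(expr4 or expr5))')
    #   ['expr1', '|', 'expr2', '(', 'expr3', '-', '(', 'expr4', 'or', 'expr5', ')', ')']
    #   >>> p.tokenize('e(+)e(-)')
    #   ['e(+)e(-)']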
    def parse(self, token_list, variable_substitution_dict=None):
        """Make token_list consumable by search_engine.

        Turns a list of tokens and a variable mapping into a grouped list
        of subexpressions in the format suitable for use by search_engine,
        e.g.:
        ['+', 'searchterm', '-', 'searchterm to exclude', '|', 'another term']

        Incidentally, this works recursively so parens can cause arbitrarily
        deep nestings. But since the search_engine doesn't know about nested
        structures, we need to flatten the input structure first.
        """
        ###
        # Invariants:
        # * Token list is never modified
        # * Balanced parens remain balanced; unbalanced parens are an error
        # * Individual tokens may only be exchanged for items in the variable
        #   substitution dict; otherwise they pass through unmolested
        # * Return value is built up mostly as a stack
        ###
        op_symbols = self.substitution_dict.values()
        self.__tl_idx = 0
        self.__tl_len = len(token_list)

        def inner_parse(token_list, open_parens=False):
            '''
            although it's not in the API, it seems sensible to comment
            this function a bit.

            dist_token here is a token (e.g. a second-order operator)
            which needs to be distributed across other tokens inside
            the inner parens
            '''
            if open_parens:
                parsed_values = []
            else:
                parsed_values = ['+']

            i = 0
            while i < len(token_list):
                token = token_list[i]
                if i > 0 and parsed_values[-1] not in op_symbols:
                    parsed_values.append('+')
                if token == '(':
                    # if we need to distribute something over the tokens inside the parens
                    # we will know it because... it will end in a :
                    # that part of the list will be 'px', '+', '('
                    distributing = (len(parsed_values) > 2 and parsed_values[-2].endswith(':') and parsed_values[-1] == '+')
                    if distributing:
                        # we don't need the + if we are distributing
                        parsed_values = parsed_values[:-1]
                    offset = self.__tl_len - len(token_list)
                    inner_value = inner_parse(token_list[i+1:], True)
                    inner_value = ' '.join(inner_value)
                    if distributing:
                        if len(self.tokenize(inner_value)) == 1:
                            parsed_values[-1] = parsed_values[-1] + inner_value
                        elif "'" in inner_value:
                            parsed_values[-1] = parsed_values[-1] + '"' + inner_value + '"'
                        elif '"' in inner_value:
                            parsed_values[-1] = parsed_values[-1] + "'" + inner_value + "'"
                        else:
                            parsed_values[-1] = parsed_values[-1] + '"' + inner_value + '"'
                    else:
                        parsed_values.append(inner_value)
                    self.__tl_idx += 1
                    i = self.__tl_idx - offset
                elif token == ')':
                    if parsed_values[-1] in op_symbols:
                        parsed_values = parsed_values[:-1]
                    if len(parsed_values) > 1 and parsed_values[0] == '+' and parsed_values[1] in op_symbols:
                        parsed_values = parsed_values[1:]
                    return parsed_values
                elif token in op_symbols:
                    if len(parsed_values) > 0:
                        parsed_values[-1] = token
                    else:
                        parsed_values = [token]
                else:
                    if variable_substitution_dict != None and token in variable_substitution_dict:
                        token = variable_substitution_dict[token]
                    parsed_values.append(token)
                i += 1
                self.__tl_idx += 1

            # If we have an extra start symbol, remove the default one
            if parsed_values[1] in op_symbols:
                parsed_values = parsed_values[1:]
            return parsed_values

        return inner_parse(token_list, False)
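    # Small example (not in the original file): with a substitution table from
    # substitute_variables(), parse() re-expands the placeholder variables:
    #   >>> p.parse(['p0', '|', 'p1'], {'p0': 'expr1', 'p1': 'expr2'})
    #   ['+', 'expr1', '|', 'expr2']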
class SpiresToInvenioSyntaxConverter:
    """Converts queries defined with SPIRES search syntax into queries
    that use Invenio search syntax.
    """

    # Constants defining fields
    _DATE_ADDED_FIELD = 'datecreated:'
    _DATE_UPDATED_FIELD = 'datemodified:'
    _DATE_FIELD = 'year:'

    _A_TAG = 'author:'
    _EA_TAG = 'exactauthor:'

    # Dictionary containing the matches between SPIRES keywords
    # and their corresponding Invenio keywords or fields
    # SPIRES keyword : Invenio keyword or field
    _SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS = {
        # address
        'address' : 'address:',
        # affiliation
        'affiliation' : 'affiliation:',
        'affil' : 'affiliation:',
        'aff' : 'affiliation:',
        'af' : 'affiliation:',
        'institution' : 'affiliation:',
        'inst' : 'affiliation:',
        # any field
        'any' : 'anyfield:',
        # author count
        'ac' : 'authorcount:',
        # bulletin
        'bb' : 'reportnumber:',
        'bbn' : 'reportnumber:',
        'bull' : 'reportnumber:',
        'bulletin-bd' : 'reportnumber:',
        'bulletin-bd-no' : 'reportnumber:',
        'eprint' : 'reportnumber:',
        # citation / reference
        'c' : 'reference:',
        'citation' : 'reference:',
        'cited' : 'reference:',
        'jour-vol-page' : 'reference:',
        'jvp' : 'reference:',
        # collaboration
        'collaboration' : 'collaboration:',
        'collab-name' : 'collaboration:',
        'cn' : 'collaboration:',
        # conference number
        'conf-number' : '111__g:',
        'cnum' : '773__w:',
        # country
        'cc' : '044__a:',
        'country' : '044__a:',
        # date
        'date': _DATE_FIELD,
        'd': _DATE_FIELD,
        # date added
        'date-added': _DATE_ADDED_FIELD,
        'dadd': _DATE_ADDED_FIELD,
        'da': _DATE_ADDED_FIELD,
        # date updated
        'date-updated': _DATE_UPDATED_FIELD,
        'dupd': _DATE_UPDATED_FIELD,
        'du': _DATE_UPDATED_FIELD,
        # first author
        'fa' : 'firstauthor:',
        'first-author' : 'firstauthor:',
        # author
        'a' : 'author:',
        'au' : 'author:',
        'author' : 'author:',
        'name' : 'author:',
        # exact author
        # this is not a real keyword match. It is pseudo keyword that
        # will be replaced later with author search
        'ea' : 'exactauthor:',
        'exact-author' : 'exactauthor:',
        # experiment
        'exp' : 'experiment:',
        'experiment' : 'experiment:',
        'expno' : 'experiment:',
        'sd' : 'experiment:',
        'se' : 'experiment:',
        # journal
        'journal' : 'journal:',
        'j' : 'journal:',
        'published_in' : 'journal:',
        'spicite' : 'journal:',
        'vol' : 'volume:',
        # journal page
        'journal-page' : '773__c:',
        'jp' : '773__c:',
        # journal year
        'journal-year' : '773__y:',
        'jy' : '773__y:',
        # key
        'key' : '970__a:',
        'irn' : '970__a:',
        'record' : '970__a:',
        'document' : '970__a:',
        'documents' : '970__a:',
        # keywords
        'k' : 'keyword:',
        'keywords' : 'keyword:',
        'kw' : 'keyword:',
        # note
        'note' : '500__a:',
        # old title
        'old-title' : '246__a:',
        'old-t' : '246__a:',
        'ex-ti' : '246__a:',
        'et' : '246__a:',
        # postal code
        'postalcode' : 'postalcode:',
        'zip' : 'postalcode:',
        'cc' : 'postalcode:',
        # ppf subject
        'ppf-subject' : '650__a:',
        'status' : '650__a:',
        # recid
        'recid' : 'recid:',
        # report number
        'r' : 'reportnumber:',
        'rn' : 'reportnumber:',
        'rept' : 'reportnumber:',
        'report' : 'reportnumber:',
        'report-num' : 'reportnumber:',
        # title
        't' : 'title:',
        'ti' : 'title:',
        'title' : 'title:',
        'with-language' : 'title:',
        # fulltext
        'fulltext' : 'fulltext:',
        'ft' : 'fulltext:',
        # topic
        'topic' : '695__a:',
        'tp' : '695__a:',
        'hep-topic' : '695__a:',
        'desy-keyword' : '695__a:',
        'dk' : '695__a:',
        # doi
        'doi': 'doi:',
        # topcite
        'topcit' : 'cited:',
        'topcite' : 'cited:',
        # captions
        'caption' : 'caption:',
        # category
        'arx' : '037__c:',
        'category' : '037__c:',
        # primarch
        'parx' : '037__c:',
        'primarch' : '037__c:',
        # texkey
        'texkey' : '035__%:',
        # type code
        'tc' : 'collection:',
        'ty' : 'collection:',
        'type' : 'collection:',
        'type-code' : 'collection:',
        'scl': 'collection:',
        'ps': 'collection:',
        # field code
        'f' : 'subject:',
        'fc' : 'subject:',
        'field' : 'subject:',
        'field-code' : 'subject:',
        'subject' : 'subject:',
        # coden
        'bc' : 'journal:',
        'browse-only-indx' : 'journal:',
        'coden' : 'journal:',
        'journal-coden' : 'journal:',
        # jobs specific codes
        'job' : 'title:',
        'position' : 'title:',
        'region' : 'region:',
        'continent' : 'region:',
        'deadline' : '046__a:',
        'rank' : 'rank:',
        'cat' : 'cataloguer:',
        # replace all the keywords without match with empty string
        # this will remove the noise from the unknown keywords in the search
        # and will search in all fields for the words following the keywords
        # energy
        'e' : '',
        'energy' : '',
        'energyrange-code' : '',
        # exact experiment number
        'ee' : '',
        'exact-exp' : '',
        'exact-expno' : '',
        # hidden note
        'hidden-note' : '',
        'hn' : '',
        # ppf
        'ppf' : '',
        'ppflist' : '',
        # slac topics
        'ppfa' : '',
        'slac-topics' : '',
        'special-topics' : '',
        'stp' : '',
        # test index
        'test' : '',
        'testindex' : '',
    }

    _SECOND_ORDER_KEYWORD_MATCHINGS = {
        'rawref' : 'rawref:',
        'refersto' : 'refersto:',
        'refs': 'refersto:',
        'citedby' : 'citedby:'
    }

    _INVENIO_KEYWORDS_FOR_SPIRES_PHRASE_SEARCHES = [
        'affiliation:',
        #'cited:', # topcite is technically a phrase index - this isn't necessary
        '773__y:',      # journal-year
        '773__c:',      # journal-page
        '773__w:',      # cnum
        '044__a:',      # country code
        'subject:',     # field code
        'collection:',  # type code
        '035__z:',      # texkey
        # also exact expno, corp-auth, url, abstract, doi, mycite, citing
        # but we have no invenio equivalents for these ATM
    ]
    def __init__(self):
        """Initialize the state of the converter"""
        self._months = {}
        self._month_name_to_month_number = {}
        self._init_months()
        self._compile_regular_expressions()

    def _compile_regular_expressions(self):
        """Compiles some of the regular expressions that are used in the class
        for higher performance."""

        # regular expression that matches the contents in single and double quotes
        # taking in mind if they are escaped.
        self._re_quotes_match = re.compile(r'(?![\\])(".*?[^\\]")' + r"|(?![\\])('.*?[^\\]')")

        # match cases where a keyword distributes across a conjunction
        self._re_distribute_keywords = re.compile(r'''(?ix)  # verbose, ignorecase on
                  \b(?P<keyword>\S*:)       # a keyword is anything that's not whitespace with a colon
                  (?P<content>[^:]+?)\s*    # content is the part that comes after the keyword; it should NOT
                                            # have colons in it! that implies that we might be distributing
                                            # a keyword OVER another keyword. see ticket #701
                  (?P<combination>\ and\ not\ |\ and\ |\ or\ |\ not\ )\s*
                  (?P<last_content>[^:]*?)  # oh look, content without a keyword!
                  (?=\ and\ |\ or\ |\ not\ |$)''')

        # massaging SPIRES quirks
        self._re_pattern_IRN_search = re.compile(r'970__a:(?P<irn>\d+)')
        self._re_topcite_match = re.compile(r'(?P<x>cited:\d+)\+')

        # regular expression that matches author patterns
        # and author patterns with second-order-ops on top
        # does not match names with " or ' around them, since
        # those should not be touched
        self._re_author_match = re.compile(r'''(?ix)  # verbose, ignorecase
                  \b((?P<secondorderop>[^\s]+:)?)  # do we have a second-order-op on top?
                  ((?P<first>first)?)author:(?P<name>
                              [^\'\"]  # first character not a quotemark
                              [^()]*?  # some stuff that isn't parentheses (that is dealt with in pp)
                              [^\'\"])  # last character not a quotemark
                  (?=\ and\ not\ |\ and\ |\ or\ |\ not\ |$)''')

        # regular expression that matches exact author patterns
        # the group defined in this regular expression is used in method
        # _convert_spires_exact_author_search_to_invenio_author_search(...)
        # in case of changes correct also the code in this method
        self._re_exact_author_match = re.compile(r'\b((?P<secondorderop>[^\s]+:)?)exactauthor:(?P<author_name>[^\'\"].*?[^\'\"]\b)(?= and not | and | or | not |$)', re.IGNORECASE)

        # match a second-order operator with no operator following it
        self._re_second_order_op_no_index_match = re.compile(r'''(?ix)  # ignorecase, verbose
                  (^|\b|:)(?P<second_order_op>(refersto|citedby):)
                  (?P<search_terms>[^\"\'][^:]+?)  # anything without an index should be absorbed here
                  \s*
                  (?P<conjunction_or_next_keyword>(\ and\ |\ not\ |\ or\ |\ \w+:\w+|$))
                  ''')

        # match search term, its content (words that are searched) and
        # the operator preceding the term.
        self._re_search_term_pattern_match = re.compile(r'\b(?P<combine_operator>find|and|or|not)\s+(?P<search_term>\S+:)(?P<search_content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)

        # match journal searches
        self._re_search_term_is_journal = re.compile(r'''(?ix)  # verbose, ignorecase
                  \b(?P<leading>(find|and|or|not)\s+journal:)  # first combining operator and index
                  (?P<search_content>.+?)                      # what we are searching
                  (?=\ and\ not\ |\ and\ |\ or\ |\ not\ |$)''')

        # regular expression matching date after pattern
        self._re_date_after_match = re.compile(r'\b(?P<searchop>d|date|dupd|dadd|da|date-added|du|date-updated)\b\s*(after|>)\s*(?P<search_content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)

        # regular expression matching date before pattern
        self._re_date_before_match = re.compile(r'\b(?P<searchop>d|date|dupd|dadd|da|date-added|du|date-updated)\b\s*(before|<)\s*(?P<search_content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)

        # match date searches which have been keyword-substituted
        self._re_keysubbed_date_expr = re.compile(r'\b(?P<term>(' + self._DATE_ADDED_FIELD + ')|(' + self._DATE_UPDATED_FIELD + ')|(' + self._DATE_FIELD + '))(?P<content>.+?)(?= and not | and | or | not |$)', re.IGNORECASE)

        # for finding (and changing) a variety of different SPIRES search keywords
        self._re_spires_find_keyword = re.compile('^(f|fin|find)\s+', re.IGNORECASE)

        # for finding boolean expressions
        self._re_boolean_expression = re.compile(r' and | or | not | and not ')

        # patterns for subbing out spaces within quotes temporarily
        self._re_pattern_single_quotes = re.compile("'(.*?)'")
        self._re_pattern_double_quotes = re.compile("\"(.*?)\"")
        self._re_pattern_regexp_quotes = re.compile("\/(.*?)\/")
        self._re_pattern_space = re.compile("__SPACE__")
        self._re_pattern_equals = re.compile("__EQUALS__")

        # for date math:
        self._re_datemath = re.compile(r'(?P<datestamp>.+)\s+(?P<operator>[-+])\s+(?P<units>\d+)')
    def is_applicable(self, query):
        """Is this converter applicable to this query?

        Return true if query begins with find, fin, or f, or if it contains
        a SPIRES-specific keyword (a, t, etc.), or if it contains the invenio
        author: field search.
        """
        if not CFG_WEBSEARCH_SPIRES_SYNTAX:
            # SPIRES syntax is switched off
            return False
        query = query.lower()
        if self._re_spires_find_keyword.match(query):
            # leading 'find' is present and SPIRES syntax is switched on
            return True
        if CFG_WEBSEARCH_SPIRES_SYNTAX > 1:
            query = self._re_pattern_double_quotes.sub('', query)
            for word in query.split(' '):
                if word in self._SPIRES_TO_INVENIO_KEYWORDS_MATCHINGS:
                    return True
        return False
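    # Behaviour sketch (not in the original file; the outcome depends on the
    # site's CFG_WEBSEARCH_SPIRES_SYNTAX setting):
    #   >>> converter = SpiresToInvenioSyntaxConverter()
    #   >>> converter.is_applicable('find a ellis')  # leading 'find'
    #   True
    # A bare keyword query such as 't quark' is only recognised when
    # CFG_WEBSEARCH_SPIRES_SYNTAX > 1, via the keyword table above.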
    def convert_query(self, query):
        """Convert SPIRES syntax queries to Invenio syntax.

        Do nothing to queries not in SPIRES syntax."""

        # SPIRES syntax allows searches with 'find' or 'fin'.
        if self.is_applicable(query):
            query = re.sub(self._re_spires_find_keyword, 'find ', query)
            if not query.startswith('find'):
                query = 'find ' + query

            # a holdover from SPIRES syntax is e.g. date = 2000 rather than just date 2000
            query = self._remove_extraneous_equals_signs(query)

            # these calls are before keywords replacement because when keywords
            # are replaced, date keyword is replaced by specific field search
            # and the DATE keyword is not matched in DATE BEFORE or DATE AFTER
            query = self._convert_spires_date_before_to_invenio_span_query(query)
            query = self._convert_spires_date_after_to_invenio_span_query(query)

            # call to _replace_spires_keywords_with_invenio_keywords should be at the
            # beginning because the next methods use the result of the replacement
            query = self._standardize_already_invenio_keywords(query)
            query = self._replace_spires_keywords_with_invenio_keywords(query)
            query = self._normalise_journal_page_format(query)
            query = self._distribute_keywords_across_combinations(query)
            query = self._distribute_and_quote_second_order_ops(query)

            query = self._convert_all_dates(query)
            query = self._convert_irns_to_spires_irns(query)
            query = self._convert_topcite_to_cited(query)
            query = self._convert_spires_author_search_to_invenio_author_search(query)
            query = self._convert_spires_exact_author_search_to_invenio_author_search(query)
            query = self._convert_spires_truncation_to_invenio_truncation(query)
            query = self._expand_search_patterns(query)

            # remove FIND in the beginning of the query as it is not necessary in Invenio
            query = query[4:]
            query = query.strip()

        return query
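    # End-to-end sketch (not in the original file; actual output can vary with
    # configuration and with the helper methods truncated from this listing):
    # a query such as "find t quark" should come back as "title:quark",
    # and "find da > 2001-01-01" as "datecreated:2001-01-01->9999".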
    def _init_months(self):
        """Defines a dictionary matching the name
        of the month with its corresponding number"""

        # this dictionary is used when generating match patterns for months
        self._months = {'jan':'01', 'january':'01',
                        'feb':'02', 'february':'02',
                        'mar':'03', 'march':'03',
                        'apr':'04', 'april':'04',
                        'may':'05', 'may':'05',
                        'jun':'06', 'june':'06',
                        'jul':'07', 'july':'07',
                        'aug':'08', 'august':'08',
                        'sep':'09', 'september':'09',
                        'oct':'10', 'october':'10',
                        'nov':'11', 'november':'11',
                        'dec':'12', 'december':'12'}

        # this dictionary is used to transform name of the month
        # to a number used in the date format. For this reason it
        # also contains the numbers themselves to simplify the conversion
        self._month_name_to_month_number = {'1':'01', '01':'01',
                                            '2':'02', '02':'02',
                                            '3':'03', '03':'03',
                                            '4':'04', '04':'04',
                                            '5':'05', '05':'05',
                                            '6':'06', '06':'06',
                                            '7':'07', '07':'07',
                                            '8':'08', '08':'08',
                                            '9':'09', '09':'09',
                                            '10':'10',
                                            '11':'11',
                                            '12':'12',}

        # combine it with months in order to cover all the cases
        self._month_name_to_month_number.update(self._months)

    def _get_month_names_match(self):
        """Returns part of a pattern that matches month in a date"""
        months_match = ''
        for month_name in self._months.keys():
            months_match = months_match + month_name + '|'
        months_match = r'\b(' + months_match[0:-1] + r')\b'
        return months_match
    def _convert_all_dates(self, query):
        """Tries to find dates in query and make them look like ISO-8601."""

        def mangle_with_dateutils(query):
            result = ''
            position = 0
            for match in self._re_keysubbed_date_expr.finditer(query):
                result += query[position : match.start()]
                datestamp = match.group('content')
                daterange = self.convert_date(datestamp)
                result += match.group('term') + daterange
                position = match.end()
            result += query[position : ]
            return result

        if GOT_DATEUTIL:
            query = mangle_with_dateutils(query)
        # else do nothing with the dates
        return query
    def convert_date(self, date_str):
        def parse_relative_unit(date_str):
            units = 0
            datemath = self._re_datemath.match(date_str)
            if datemath:
                date_str = datemath.group('datestamp')
                units = int(datemath.group('operator') + datemath.group('units'))
            return date_str, units

        def guess_best_year(d):
            if d.year > datetime.today().year + 10:
                return d - du_delta(years=100)
            else:
                return d

        def parse_date_unit(date_str):
            begin = date_str
            end = None

            # First split, relative time directive
            # e.g. "2012-01-01 - 3" to ("2012-01-01", -3)
            date_str, relative_units = parse_relative_unit(date_str)

            try:
                d = strptime(date_str, '%Y-%m-%d')
                d += du_delta(days=relative_units)
                return strftime('%Y-%m-%d', d), end
            except ValueError:
                pass

            try:
                d = strptime(date_str, '%y-%m-%d')
                d += du_delta(days=relative_units)
                d = guess_best_year(d)
                return strftime('%Y-%m-%d', d), end
            except ValueError:
                pass

            for date_fmt in ('%Y-%m', '%y-%m', '%m/%y', '%m/%Y'):
                try:
                    d = strptime(date_str, date_fmt)
                    d += du_delta(months=relative_units)
                    return strftime('%Y-%m', d), end
                except ValueError:
                    pass

            try:
                d = strptime(date_str, '%Y')
                d += du_delta(years=relative_units)
                return strftime('%Y', d), end
            except ValueError:
                pass

            try:
                d = strptime(date_str, '%y')
                d += du_delta(days=relative_units)
                d = guess_best_year(d)
                return strftime('%Y', d), end
            except ValueError:
                pass

            try:
                d = strptime(date_str, '%b %y')
                d = guess_best_year(d)
                return strftime('%Y-%m', d), end
            except ValueError:
                pass

            if 'this week' in date_str:
                # Past monday to today
                # This week is iffy, not sure if we should
                # start with sunday or monday
                begin = datetime.today()
                begin += du_delta(weekday=relativedelta.SU(-1))
                end = datetime.today()
                begin = strftime('%Y-%m-%d', begin)
                end = strftime('%Y-%m-%d', end)
            elif 'last week' in date_str:
                # Past monday to today
                # Same problem as last week
                begin = datetime.today()
                begin += du_delta(weekday=relativedelta.SU(-2))
                end = begin + du_delta(weekday=relativedelta.SA(1))
                begin = strftime('%Y-%m-%d', begin)
                end = strftime('%Y-%m-%d', end)
            elif 'this month' in date_str:
                d = datetime.today()
                begin = strftime('%Y-%m', d)
            elif 'last month' in date_str:
                d = datetime.today() - du_delta(months=1)
                begin = strftime('%Y-%m', d)
            elif 'yesterday' in date_str:
                d = datetime.today() - du_delta(days=1)
                begin = strftime('%Y-%m-%d', d)
            elif 'today' in date_str:
                start = datetime.today()
                start += du_delta(days=relative_units)
                begin = strftime('%Y-%m-%d', start)
            elif date_str.strip() == '0':
                begin = '0'
            else:
                default = datetime(datetime.today().year, 1, 1)
                try:
                    d = du_parser.parse(date_str, default=default)
                except (ValueError, TypeError):
                    begin = date_str
                else:
                    begin = strftime('%Y-%m-%d', d)
            return begin, end

        if '->' in date_str:
            begin_unit, end_unit = date_str.split('->', 1)
            begin, dummy = parse_date_unit(begin_unit)
            end, dummy = parse_date_unit(end_unit)
        else:
            begin, end = parse_date_unit(date_str)

        if end:
            daterange = '%s->%s' % (begin, end)
        else:
            daterange = begin

        return daterange
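    # Illustrative conversions (not in the original file; they assume dateutil
    # is installed, i.e. GOT_DATEUTIL is True):
    #   >>> converter.convert_date('2012-01-01 - 3')  # relative date math
    #   '2011-12-29'
    #   >>> converter.convert_date('2000->2005')      # explicit span
    #   '2000->2005'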
    def _convert_irns_to_spires_irns(self, query):
        """Prefix IRN numbers with SPIRES- so they match the INSPIRE format."""
        def create_replacement_pattern(match):
            """method used for replacement with regular expression"""
            return '970__a:SPIRES-' + match.group('irn')
        query = self._re_pattern_IRN_search.sub(create_replacement_pattern, query)
        return query
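    # Example (not in the original file): after 'irn' has been keyword-mapped
    # to '970__a:', a query fragment '970__a:1234' becomes '970__a:SPIRES-1234'.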
    def _convert_topcite_to_cited(self, query):
        """Replace SPIRES topcite x+ with cited:x->999999999"""
        def create_replacement_pattern(match):
            """method used for replacement with regular expression"""
            return match.group('x') + '->999999999'
        query = self._re_topcite_match.sub(create_replacement_pattern, query)
        return query
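    # Example (not in the original file): 'cited:100+' becomes
    # 'cited:100->999999999', i.e. "100 or more citations".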
    def _convert_spires_date_after_to_invenio_span_query(self, query):
        """Converts date after SPIRES search term into invenio span query"""
        def create_replacement_pattern(match):
            """method used for replacement with regular expression"""
            return match.group('searchop') + ' ' + match.group('search_content') + '->9999'
        query = self._re_date_after_match.sub(create_replacement_pattern, query)
        return query

    def _convert_spires_date_before_to_invenio_span_query(self, query):
        """Converts date before SPIRES search term into invenio span query"""
        # method used for replacement with regular expression
        def create_replacement_pattern(match):
            return match.group('searchop') + ' ' + '0->' + match.group('search_content')
        query = self._re_date_before_match.sub(create_replacement_pattern, query)
        return query
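    # Examples (not in the original file):
    #   'd after 2000'  -> 'd 2000->9999'
    #   'd before 2000' -> 'd 0->2000'
    # The 'd' keyword is later mapped to 'year:' by the keyword replacement.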
    def _expand_search_patterns(self, query):
        """Expands search queries.

        If a search term is followed by several words e.g.
        author:ellis or title:THESE THREE WORDS it is expanded to
        author:ellis or (title:THESE and title:THREE...)

        All keywords are thus expanded. XXX: this may lead to surprising
        results for any later parsing stages if we're not careful.
        """
        def create_replacements(term, content):
            result = ''
            content = content.strip()

            # replace spaces within quotes by __SPACE__ temporarily:
            content = self._re_pattern_single_quotes.sub(lambda x: "'" + string.replace(x.group(1), ' ', '__SPACE__') + "'", content)
            content = self._re_pattern_double_quotes.sub(lambda x: "\"" + string.replace(x.group(1), ' ', '__SPACE__') + "\"", content)
            content = self._re_pattern_regexp_quotes.sub(lambda x: "/" + string.replace(x.group(1), ' ', '__SPACE__') + "/", content)

            if term in self._INVENIO_KEYWORDS_FOR_SPIRES_PHRASE_SEARCHES \
                    and not self._re_boolean_expression.search(content) and ' ' in content:
                # the case of things which should be searched as phrases
                result = term + '"' + content + '"'
            else:
                words = content.split()
                if len(words) == 0:
                    # this should almost never happen, req user to say 'find a junk:'
                    result = term
                elif len(words) == 1:
                    # this is more common but still occasional
                    result = term + words[0]
                else:
                    # general case
                    result = '(' + term + words[0]
                    for word in words[1:]:
                        result += ' and ' + term + word
                    result += ')'

            # replace back __SPACE__ by spaces:
            result = self._re_pattern_space.sub(" ", result)
            return result.strip()

        result = ''
        current_position = 0
        for match in self._re_search_term_pattern_match.finditer(query):
            result += query[current_position : match.start()]
            result += ' ' + match.group('combine_operator') + ' '
            result += create_replacements(match.group('search_term'), match.group('search_content'))
            current_position = match.end()
        result += query[current_position : len(query)]
        return result.strip()
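    # Example (not in the original file), following the docstring:
    #   'find title:THESE THREE WORDS'
    # expands to
    #   'find (title:THESE and title:THREE and title:WORDS)'
    # while phrase indexes such as 'affiliation:' are quoted as one phrase.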
    def _remove_extraneous_equals_signs(self, query):
        """In SPIRES, both date = 2000 and date 2000 are acceptable. Get rid of the ="""
        query = self._re_pattern_single_quotes.sub(lambda x: "'" + string.replace(x.group(1), '=', '__EQUALS__') + "'", query)
        query = self._re_pattern_double_quotes.sub(lambda x: "\"" + string.replace(x.group(1), '=', '__EQUALS__') + '\"', query)
        query = self._re_pattern_regexp_quotes.sub(lambda x: "/" + string.replace(x.group(1), '=', '__EQUALS__') + "/", query)
        query = query.replace('=', '')
        query = self._re_pattern_equals.sub("=", query)
        return query
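    # Example (not in the original file): 'find date = 2000' loses its '=',
    # while an '=' inside quotes, e.g. t 'x = y', is protected by the
    # temporary __EQUALS__ placeholder and restored afterwards.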
    def _convert_spires_truncation_to_invenio_truncation(self, query):
        """Replace SPIRES truncation symbol # with invenio truncation symbol *"""
        return query.replace('#', '*')
    def _convert_spires_exact_author_search_to_invenio_author_search(self, query):
        """Converts SPIRES search patterns for exact author into search pattern
        for invenio"""

        # method used for replacement with regular expression
        def create_replacement_pattern(match):
            # the regular expression where this group name is defined is in
            # the method _compile_regular_expressions()
            return self._EA_TAG + '"' + match.group('author_name') + '"'

        query = self._re_exact_author_match.sub(create_replacement_pattern, query)
        return query
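    # Example (not in the original file): after keyword replacement,
    # 'exactauthor:ellis, j' becomes 'exactauthor:"ellis, j"'; names already
    # wrapped in quotes are deliberately not matched by the regex.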
    def _convert_spires_author_search_to_invenio_author_search(self, query):
        """Converts SPIRES search patterns for authors to search patterns in invenio
        that give similar results to the spires search.
        """
        # result of the replacement
        result = ''
        current_position = 0
        for match in self._re_author_match.finditer(query):
            result += query[current_position : match.start()]
            if match.group('secondorderop'):
                result += match.group('secondorderop')
            scanned_name = NameScanner.scan_string_for_phrases(match.group('name'))
            author_atoms = self._create_author_search_pattern_from_fuzzy_name_dict(scanned_name)
            if match.group('first'):
                author_atoms = author_atoms.replace('author:', 'firstauthor:')
            if author_atoms.find(' ') == -1:
                result += author_atoms + ' '
            else:
                result += '(' + author_atoms + ') '
            current_position = match.end()
        result += query[current_position : len(query)]
        return result
    def _create_author_search_pattern_from_fuzzy_name_dict(self, fuzzy_name):
        """Creates an invenio search pattern for an author from a fuzzy name dict"""
        author_name = ''
        author_middle_name = ''
        author_surname = ''
        full_search = ''
        if len(fuzzy_name['nonlastnames']) > 0:
            author_name = fuzzy_name['nonlastnames'][0]
        if len(fuzzy_name['nonlastnames']) == 2:
            author_middle_name = fuzzy_name['nonlastnames'][1]
        if len(fuzzy_name['nonlastnames']) > 2:
            author_middle_name = ' '.join(fuzzy_name['nonlastnames'][1:])
        if fuzzy_name['raw']:
            full_search = fuzzy_name['raw']
        author_surname = ' '.join(fuzzy_name['lastnames'])

        NAME_IS_INITIAL = (len(author_name) == 1)
        NAME_IS_NOT_INITIAL = not NAME_IS_INITIAL

        # we expect to have at least surname
        if author_surname == '' or author_surname == None:
            return ''

        # ellis ---> "author:ellis"
        #if author_name == '' or author_name == None:
        if not author_name:
            return self._A_TAG + author_surname

        # ellis, j ---> "ellis, j*"
        if NAME_IS_INITIAL and not author_middle_name:
            return self._A_TAG + '"' + author_surname + ', ' + author_name + '*"'

        # if there is middle name we expect to have also name and surname
        # ellis, j. r. ---> ellis, j* r*
        # j r ellis ---> ellis, j* r*
        # ellis, john r. ---> ellis, j* r* or ellis, j. r. or ellis, jo. r.
        # ellis, john r. ---> author:ellis, j* r* or exactauthor:ellis, j r or exactauthor:ellis jo r
        if author_middle_name:
            search_pattern = self._A_TAG + '"' + author_surname + ', ' + author_name + '*' + ' ' + author_middle_name.replace(" ", "* ") + '*"'
            if NAME_IS_NOT_INITIAL:
                for i in range(1, len(author_name)):
                    search_pattern += ' or ' + self._EA_TAG + "\"%s, %s %s\"" % (author_surname, author_name[0:i], author_middle_name)
            return search_pattern

        # ellis, jacqueline ---> "ellis, jacqueline" or "ellis, j.*" or "ellis, j" or "ellis, ja.*" or "ellis, ja" or "ellis, jacqueline *, ellis, j *"
        # in case we don't use SPIRES data, the ending dot is omitted.
        search_pattern = self._A_TAG + '"' + author_surname + ', ' + author_name + '*"'
        search_pattern += " or " + self

[Listing truncated here; the full file is available at the repository link above.]