
/benchmark/marc21.py

https://bitbucket.org/rayleyva/whoosh
Possible License(s): Apache-2.0
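Benchmark script for Whoosh: it reads MARC 21 bibliographic records, indexes selected fields (title, author, subjects) with an optionally multiprocessing writer, and runs queries against the resulting index from the command line.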
from __future__ import with_statement, print_function

import fnmatch, logging, os.path, re

from whoosh import analysis, fields, index, qparser, query, scoring
from whoosh.util import now

log = logging.getLogger(__name__)


# Functions for reading MARC format

LEADER = (' ' * 10) + '22' + (' ' * 8) + '4500'
LEADER_LEN = len(LEADER)
DIRECTORY_ENTRY_LEN = 12
SUBFIELD_INDICATOR = "\x1F"
END_OF_FIELD = "\x1E"
END_OF_RECORD = "\x1D"

isbn_regex = re.compile(r'[-0-9xX]+')
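
# MARC 21 record layout assumed by the parser below: a record begins with a
# 24-byte leader, whose bytes 0-4 hold the total record length and bytes
# 12-16 the offset of the field data (the "base address"). Between the leader
# and the data sits the directory: 12-byte entries made of a 3-byte tag, a
# 4-byte field length, and a 5-byte offset into the data area. Fields end
# with END_OF_FIELD, records with END_OF_RECORD, and subfields are introduced
# by SUBFIELD_INDICATOR followed by a one-character code.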

def read_file(dbfile, tags=None):
    # Generate (record, file offset) pairs for every record in the file
    while True:
        pos = dbfile.tell()
        first5 = dbfile.read(5)
        if not first5:
            return
        if len(first5) < 5:
            raise Exception("Truncated record length")
        length = int(first5)
        chunk = dbfile.read(length - 5)
        yield parse_record(first5 + chunk, tags), pos

def read_record(filename, pos, tags=None):
    # Read a single record back out of the file at a stored offset
    with open(filename, "rb") as f:
        f.seek(pos)
        first5 = f.read(5)
        length = int(first5)
        chunk = f.read(length - 5)
        return parse_record(first5 + chunk, tags)

def parse_record(data, tags=None):
    leader = data[:LEADER_LEN]
    assert len(leader) == LEADER_LEN

    # Offset of the start of the field data within the record
    dataoffset = int(data[12:17])
    assert dataoffset > 0
    assert dataoffset < len(data)

    # dataoffset - 1 to avoid END-OF-FIELD byte
    dirstart = LEADER_LEN
    dirend = dataoffset - 1

    # Number of fields in record
    assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
    field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN

    result = {}
    for i in xrange(field_count):
        start = dirstart + i * DIRECTORY_ENTRY_LEN
        end = start + DIRECTORY_ENTRY_LEN
        tag = data[start:start + 3]
        if tags and tag not in tags:
            continue

        entry = data[start:end]
        elen = int(entry[3:7])
        offset = dataoffset + int(entry[7:12])
        edata = data[offset:offset + elen - 1]

        if not (tag < "010" and tag.isdigit()):
            # Data field: split into subfield strings, dropping the empty
            # string before the first subfield indicator
            edata = edata.split(SUBFIELD_INDICATOR)[1:]
            if tag in result:
                result[tag].extend(edata)
            else:
                result[tag] = edata
        else:
            # Control field (tags 001-009): keep the raw data
            result[tag] = edata
    return result
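
# Shape of a parsed record, with hypothetical data: control fields map to
# raw strings, data fields map to lists of subfield strings whose first
# character is the subfield code, e.g.
#   {"001": "ocm00012345",
#    "245": ["aMoby Dick /", "cHerman Melville."]}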

def subfield(vs, code):
    # Return the value of the first subfield with the given code
    for v in vs:
        if v.startswith(code):
            return v[1:]
    return None


def joinsubfields(vs):
    # Join subfield values, skipping $6 (linkage) subfields
    return " ".join(v[1:] for v in vs if v and v[0] != "6")


def getfields(d, *tags):
    return (d[tag] for tag in tags if tag in d)


def title(d):
    # 245 $a (title), plus $b (remainder of title) if present
    title = None
    if "245" in d:
        svs = d["245"]
        title = subfield(svs, "a")
        if title:
            t2 = subfield(svs, "b")
            if t2:
                title += t2
    return title


def isbn(d):
    if "020" in d:
        num = subfield(d["020"], "a")
        if num:
            match = isbn_regex.search(num)
            if match:
                return match.group(0).replace('-', '')


def author(d):
    # 100 = personal, 110 = corporate, 111 = meeting name
    if "100" in d:
        return joinsubfields(d["100"])
    elif "110" in d:
        return joinsubfields(d["110"])
    elif "111" in d:
        return joinsubfields(d["111"])


def uniform_title(d):
    if "130" in d:
        return joinsubfields(d["130"])
    elif "240" in d:
        return joinsubfields(d["240"])


subjectfields = ("600 610 611 630 648 650 651 653 654 655 656 657 658 662 "
                 "690 691 696 697 698 699").split()


def subjects(d):
    return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields))


def physical(d):
    return joinsubfields(d["300"])


def location(d):
    return joinsubfields(d["852"])


def publisher(d):
    if "260" in d:
        return subfield(d["260"], "b")


def pubyear(d):
    if "260" in d:
        return subfield(d["260"], "c")


def uni(v):
    return u"" if v is None else v.decode("utf-8", "replace")
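
# Minimal sketch of using the readers and helpers above (file name is
# hypothetical):
#   with open("data/HLOM/sample.mrc", "rb") as f:
#       for record, pos in read_file(f):
#           print(title(record), "|", author(record), "|", isbn(record))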

# Indexing and searching

def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))

    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           author=fields.TEXT(phrase=False),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED, pos=fields.STORED,
                           )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title

    print("Indexing with %d processor(s) and %d MB per processor"
          % (procs, limitmb))
    c = 0
    t = now()
    ix = index.create_in(ixdir, schema)
    with ix.writer(procs=procs, limitmb=limitmb,
                   multisegment=multisegment) as w:
        filenames = [filename for filename in os.listdir(basedir)
                     if fnmatch.fnmatch(filename, glob)]
        for filename in filenames:
            path = os.path.join(basedir, filename)
            print("Indexing", path)
            f = open(path, 'rb')
            for x, pos in read_file(f, mfields):
                w.add_document(title=uni(title(x)), author=uni(author(x)),
                               subject=uni(subjects(x)),
                               file=filename, pos=pos)
                c += 1
            f.close()
        print("Committing...")
    print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))

def print_record(no, basedir, filename, pos):
    path = os.path.join(basedir, filename)
    record = read_record(path, pos)
    print("% 5d. %s" % (no + 1, title(record)))
    print(" ", author(record))
    print(" ", subjects(record))
    isbn_num = isbn(record)
    if isbn_num:
        print(" ISBN:", isbn_num)
    print()

def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            i = -1
            for i, docnum in enumerate(s.docs_for_query(q)):
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
            # i + 1 = total matching documents (enumerate is zero-based)
            print("Found %d records in %0.06f seconds" % (i + 1, now() - t))

if __name__ == "__main__":
    from optparse import OptionParser

    p = OptionParser(usage="usage: %prog [options] query")
    # Common options
    p.add_option("-f", "--filedir", metavar="DIR", dest="basedir",
                 help="Directory containing the .mrc files to index",
                 default="data/HLOM")
    p.add_option("-d", "--dir", metavar="DIR", dest="ixdir",
                 help="Directory containing the index", default="marc_index")
    # Indexing options
    p.add_option("-i", "--index", dest="index",
                 help="Index the records", action="store_true", default=False)
    p.add_option("-p", "--procs", metavar="NPROCS", dest="procs",
                 help="Number of processors to use", default="1")
    p.add_option("-m", "--mb", metavar="MB", dest="limitmb",
                 help="Limit the indexer to this many MB of memory per writer",
                 default="128")
    p.add_option("-M", "--merge-segments", dest="multisegment",
                 help="If indexing with multiproc, merge the segments after"
                      " indexing", action="store_false", default=True)
    p.add_option("-g", "--match", metavar="GLOB", dest="glob",
                 help="Only index file names matching the given pattern",
                 default="*.mrc")
    # Search options
    p.add_option("-l", "--limit", metavar="NHITS", dest="limit",
                 help="Maximum number of search results to print (0=no limit)",
                 default="10")
    p.add_option("-O", "--no-optimize", dest="optimize",
                 help="Turn off searcher optimization (for debugging)",
                 action="store_false", default=True)
    p.add_option("-s", "--scoring", dest="scores",
                 help="Score the results", action="store_true", default=False)

    options, args = p.parse_args()

    if options.index:
        make_index(options.basedir, options.ixdir,
                   procs=int(options.procs),
                   limitmb=int(options.limitmb),
                   multisegment=options.multisegment,
                   glob=options.glob)

    if args:
        qstring = " ".join(args).decode("utf-8")
        limit = int(options.limit)
        if limit < 1:
            limit = None
        search(qstring, options.ixdir, options.basedir, limit=limit,
               optimize=options.optimize, scores=options.scores)
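
# Example invocations (paths hypothetical; the script targets Python 2, as
# the str.decode() calls and xrange above imply):
#   python marc21.py -i -f data/HLOM -p 4 -m 256     # build the index
#   python marc21.py -s -l 20 "moby dick"            # run a scored search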