# benchmark/marc21.py
# Possible license(s): Apache-2.0
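
"""Benchmark for Whoosh: index MARC 21 bibliographic records from .mrc
files and search the resulting index from the command line.

The reader below follows the MARC 21 exchange structure: each record
starts with a 5-digit record length, followed by a 24-byte leader, a
directory of 12-byte entries (3-digit tag, 4-digit field length,
5-digit field offset), and then the field data itself.
"""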

from __future__ import with_statement, print_function

import fnmatch
import os.path
import re

from whoosh import analysis, fields, index, qparser, scoring
from whoosh.util import now


# Functions for reading the MARC 21 format

LEADER = (' ' * 10) + '22' + (' ' * 8) + '4500'
LEADER_LEN = len(LEADER)
DIRECTORY_ENTRY_LEN = 12
SUBFIELD_INDICATOR = "\x1F"
END_OF_FIELD = "\x1E"
END_OF_RECORD = "\x1D"
isbn_regex = re.compile(r'[-0-9xX]+')


def read_file(dbfile, tags=None):
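    """Yield ``(record_dict, position)`` pairs for every MARC record in
    the open binary file ``dbfile``. If ``tags`` is given, only fields
    with those tags are parsed into the dictionary.
    """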
    while True:
        pos = dbfile.tell()
        first5 = dbfile.read(5)
        if not first5:
            return
        if len(first5) < 5:
            raise Exception("Truncated record length: %r" % first5)
        length = int(first5)
        chunk = dbfile.read(length - 5)
        yield parse_record(first5 + chunk, tags), pos


def read_record(filename, pos, tags=None):
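    """Read and parse the single MARC record stored at byte offset
    ``pos`` in ``filename``.
    """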
    with open(filename, "rb") as f:
        f.seek(pos)
        first5 = f.read(5)
        length = int(first5)
        chunk = f.read(length - 5)
        return parse_record(first5 + chunk, tags)


def parse_record(data, tags=None):
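    """Parse one raw MARC record into a dict mapping tags to field data.
    Control fields (tags below "010") map to a raw string; data fields
    map to a list of subfield strings, each prefixed by its subfield
    code.
    """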
    leader = data[:LEADER_LEN]
    assert len(leader) == LEADER_LEN

    # Leader positions 12-16 hold the base address of the field data
    dataoffset = int(data[12:17])
    assert dataoffset > 0
    assert dataoffset < len(data)

    # dataoffset - 1 to avoid the END_OF_FIELD byte that terminates the
    # directory
    dirstart = LEADER_LEN
    dirend = dataoffset - 1

    # Number of fields in record
    assert (dirend - dirstart) % DIRECTORY_ENTRY_LEN == 0
    field_count = (dirend - dirstart) // DIRECTORY_ENTRY_LEN

    result = {}
    for i in xrange(field_count):
        start = dirstart + i * DIRECTORY_ENTRY_LEN
        end = start + DIRECTORY_ENTRY_LEN
        tag = data[start:start + 3]
        if tags and tag not in tags:
            continue

        entry = data[start:end]
        elen = int(entry[3:7])
        offset = dataoffset + int(entry[7:12])
        # elen - 1 drops the END_OF_FIELD byte at the end of the field
        edata = data[offset:offset + elen - 1]

        if not (tag < "010" and tag.isdigit()):
            # Data field: split into subfields, dropping the two
            # indicator characters before the first subfield delimiter
            edata = edata.split(SUBFIELD_INDICATOR)[1:]
            if tag in result:
                result[tag].extend(edata)
            else:
                result[tag] = edata
        else:
            # Control field (001-009): keep the raw data
            result[tag] = edata
    return result


def subfield(vs, code):
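    """Return the value of the first subfield with the given code, or
    None if the field has no such subfield.
    """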
    for v in vs:
        if v.startswith(code):
            return v[1:]
    return None


def joinsubfields(vs):
    # Join the subfield values, skipping "6" (linkage) subfields
    return " ".join(v[1:] for v in vs if v and v[0] != "6")


def getfields(d, *tags):
    return (d[tag] for tag in tags if tag in d)


def title(d):
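    """Return the title from field 245 (subfield a, with subfield b
    appended if present), or None.
    """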
    title = None
    if "245" in d:
        svs = d["245"]
        title = subfield(svs, "a")
        if title:
            t2 = subfield(svs, "b")
            if t2:
                title += t2
    return title


def isbn(d):
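    """Return the first ISBN-like token from field 020 subfield a, with
    hyphens removed, or None.
    """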
- if "020" in d:
- num = subfield(d["020"], "a")
- if num:
- match = isbn_regex.search(num)
- if match:
- return match.group(0).replace('-', '')


def author(d):
    # Personal (100), corporate (110), or meeting (111) name
    if "100" in d:
        return joinsubfields(d["100"])
    elif "110" in d:
        return joinsubfields(d["110"])
    elif "111" in d:
        return joinsubfields(d["111"])


def uniform_title(d):
    if "130" in d:
        return joinsubfields(d["130"])
    elif "240" in d:
        return joinsubfields(d["240"])


# Tags of the 6xx subject heading fields
subjectfields = ("600 610 611 630 648 650 651 653 654 655 656 657 658 662 "
                 "690 691 696 697 698 699").split()


def subjects(d):
    return " ".join(joinsubfields(vs) for vs in getfields(d, *subjectfields))


def physical(d):
    return joinsubfields(d["300"])


def location(d):
    return joinsubfields(d["852"])


def publisher(d):
    if "260" in d:
        return subfield(d["260"], "b")


def pubyear(d):
    if "260" in d:
        return subfield(d["260"], "c")


def uni(v):
    # Decode UTF-8 bytes to unicode, mapping None to an empty string
    return u"" if v is None else v.decode("utf-8", "replace")


# Indexing and searching

def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True,
               glob="*.mrc"):
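    """Index every MARC file in ``basedir`` whose name matches ``glob``
    into a new Whoosh index in ``ixdir``.
    """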
    if not os.path.exists(ixdir):
        os.mkdir(ixdir)

    # Multi-lingual stop words
    stoplist = (analysis.STOP_WORDS
                | set("de la der und le die et en al no von di du da "
                      "del zur ein".split()))

    # Schema
    ana = analysis.StemmingAnalyzer(stoplist=stoplist)
    schema = fields.Schema(title=fields.TEXT(analyzer=ana),
                           author=fields.TEXT(phrase=False),
                           subject=fields.TEXT(analyzer=ana, phrase=False),
                           file=fields.STORED, pos=fields.STORED,
                           )

    # MARC fields to extract
    mfields = set(subjectfields)  # Subjects
    mfields.update("100 110 111".split())  # Author
    mfields.add("245")  # Title
- print("Indexing with %d processor(s) and %d MB per processor"
- % (procs, limitmb))
- c = 0
- t = now()
- ix = index.create_in(ixdir, schema)
- with ix.writer(procs=procs, limitmb=limitmb,
- multisegment=multisegment) as w:
- filenames = [filename for filename in os.listdir(basedir)
- if fnmatch.fnmatch(filename, glob)]
- for filename in filenames:
- path = os.path.join(basedir, filename)
- print("Indexing", path)
- f = open(path, 'rb')
- for x, pos in read_file(f, mfields):
- w.add_document(title=uni(title(x)), author=uni(author(x)),
- subject=uni(subjects(x)),
- file=filename, pos=pos)
- c += 1
- f.close()
- print("Committing...")
- print("Indexed %d records in %0.02f minutes" % (c, (now() - t) / 60.0))


def print_record(no, basedir, filename, pos):
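    """Re-read the record at byte offset ``pos`` in ``filename`` and
    print a numbered summary of its title, author, subjects, and ISBN.
    """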
    path = os.path.join(basedir, filename)
    record = read_record(path, pos)
    print("% 5d. %s" % (no + 1, title(record)))
    print(" ", author(record))
    print(" ", subjects(record))
    isbn_num = isbn(record)
    if isbn_num:
        print(" ISBN:", isbn_num)
    print()


def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
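    """Parse ``qstring`` against the title field and print the matching
    records. With ``scores`` the hits are ranked; without it the
    matching documents are simply enumerated and timed.
    """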
    ix = index.open_dir(ixdir)
    qp = qparser.QueryParser("title", ix.schema)
    q = qp.parse(qstring)

    with ix.searcher(weighting=scoring.PL2()) as s:
        if scores:
            r = s.search(q, limit=limit, optimize=optimize)
            for hit in r:
                print_record(hit.rank, basedir, hit["file"], hit["pos"])
            print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
        else:
            t = now()
            count = 0
            for i, docnum in enumerate(s.docs_for_query(q)):
                count += 1
                if not limit or i < limit:
                    fields = s.stored_fields(docnum)
                    print_record(i, basedir, fields["file"], fields["pos"])
            # Use a separate counter so the total is correct even when
            # there are no matches (the loop variable would be unbound)
            print("Found %d records in %0.06f seconds" % (count, now() - t))


if __name__ == "__main__":
    from optparse import OptionParser

    p = OptionParser(usage="usage: %prog [options] query")
    # Common options
    p.add_option("-f", "--filedir", metavar="DIR", dest="basedir",
                 help="Directory containing the .mrc files to index",
                 default="data/HLOM")
    p.add_option("-d", "--dir", metavar="DIR", dest="ixdir",
                 help="Directory containing the index", default="marc_index")
    # Indexing options
    p.add_option("-i", "--index", dest="index",
                 help="Index the records", action="store_true", default=False)
    p.add_option("-p", "--procs", metavar="NPROCS", dest="procs",
                 help="Number of processors to use", default="1")
    p.add_option("-m", "--mb", metavar="MB", dest="limitmb",
                 help="Limit the indexer to this many MB of memory per writer",
                 default="128")
    p.add_option("-M", "--merge-segments", dest="multisegment",
                 help="If indexing with multiproc, merge the segments after"
                      " indexing", action="store_false", default=True)
    p.add_option("-g", "--match", metavar="GLOB", dest="glob",
                 help="Only index file names matching the given pattern",
                 default="*.mrc")
    # Search options
    p.add_option("-l", "--limit", metavar="NHITS", dest="limit",
                 help="Maximum number of search results to print (0=no limit)",
                 default="10")
    p.add_option("-O", "--no-optimize", dest="optimize",
                 help="Turn off searcher optimization (for debugging)",
                 action="store_false", default=True)
    p.add_option("-s", "--scoring", dest="scores",
                 help="Score the results", action="store_true", default=False)

    options, args = p.parse_args()
    if options.index:
        make_index(options.basedir, options.ixdir,
                   procs=int(options.procs),
                   limitmb=int(options.limitmb),
                   multisegment=options.multisegment,
                   glob=options.glob)

    if args:
        qstring = " ".join(args).decode("utf-8")
        limit = int(options.limit)
        if limit < 1:
            limit = None
        search(qstring, options.ixdir, options.basedir, limit=limit,
               optimize=options.optimize, scores=options.scores)
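
# Example usage (a sketch; assumes .mrc files exist in the default
# data/HLOM directory):
#
#   python marc21.py -i -p 4 -m 256    # build the index with 4 processes
#   python marc21.py -l 5 moby dick    # print the top 5 title matches
#   python marc21.py -s whales         # ranked search path (with scores)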