/citotron.py
Python | 265 lines | 252 code | 7 blank | 6 comment | 9 complexity | bfb91a53a3e0a3bddc81ebadd2af4bac MD5 | raw file
Possible License(s): AGPL-3.0
- #!/usr/bin/python3
- # Wikipedia citation tools
- # citotron.py
- # Main file
- __version__ = '3.0.0'
- from args import args
- from bs4 import BeautifulSoup as bs
- from collections import Counter as counter
- from itertools import repeat
- from joblib import Parallel, delayed
- from poliplot import *
- from pprint import pprint as pprint
- from requests import get, post
- from resolve import *
- from time import sleep
- import utils as u
- import csv, json, sys, re, os, resolve
- import settings as s
-
- # ---- HELPERS ----
def counter(name):
    """Build a tally function that counts the truthy values passed to it.

    ``name`` is accepted for interface compatibility; it is not used.

    Returns a one-argument callable. Each call increments the tally when its
    argument is truthy and returns the running total, so the closure is
    usable by the caller instead of being built and thrown away.

    NOTE(review): this definition rebinds the module-level name ``counter``
    and therefore hides the ``collections.Counter`` alias imported under the
    same name at the top of the file.
    """
    tally = 0

    def count(x):
        nonlocal tally
        if x:
            tally += 1
        return tally

    return count
def print_fails():
    """Print a summary of failures per kind, per resolver, and overall.

    Reads the module-level tallies ``kindsfails``, ``resolversfails``,
    ``total``, ``fails`` and ``successes``. When no rows were processed
    (``total`` is 0) the fail rate is reported as "n/a" instead of raising
    ZeroDivisionError.
    """
    print("=" * 80)
    print("Fails by kind:")
    for kind, count in kindsfails.items():
        print(" %s: %s" % (kind, count))
    print("Fails by resolver:")
    for resolver, count in resolversfails.items():
        print(" %s: %s" % (resolver, count))
    print("Processed rows: %s" % total)
    print("Total fails: %s" % fails)
    print("Total successes: %s" % successes)
    if total:
        # Keep the historical display: the percentage cut to four characters.
        rate = str((fails / total) * 100)[:4] + "%"
    else:
        rate = "n/a"
    print("Fail rate:", rate)
def progress(character):
    """Emit one progress marker without a newline, but only in debug mode.

    The marker is flushed immediately so that it shows up while work is
    still running. Nothing is printed unless ``args.debug`` is truthy.
    """
    if not args.debug:
        return
    print(character, end='', flush=True)
def safeprint(*args):
    """Print every argument back to back, with no separator.

    Each argument is rendered with ``str``; the one exception is the object
    ``False`` itself, which is shown as "ERROR". Other falsy values such as
    ``0`` or ``None`` are printed normally.
    """
    pieces = []
    for item in args:
        pieces.append('ERROR' if item is False else str(item))
    print(*pieces, sep="")
def scientific_publishers():
    """Return a list of scientific publishers read from file.

    Every line of the file ``s.scientific_publishers`` inside ``s.datadir``
    is stripped and passed through ``u.canonical``. Abbreviations that stand
    alone as a word next to other words (``SOC``, ``UNIV``, ...) are then
    replaced by their long forms.

    Diagnostic output is printed for every replacement and once more for the
    final list.
    """
    path = os.path.join(s.datadir, s.scientific_publishers)
    with open(path, mode='rt', encoding='utf-8') as f:
        publishers = [u.canonical(line.strip()) for line in f]
    # NOTE(review): some expansions carry a leading or trailing space; they
    # are kept verbatim here - confirm whether that is intentional.
    abbrevs = {'SOC': ' SOCIETY OF',
               'ACAD': 'ACADEMY OF ',
               'UNIV': 'UNIVERSITY OF',
               'NATL': 'NATIONAL',
               'PUBL': 'PUBLISHING',
               'LTD': 'LIMITED',
               'CORP': 'CORPORATION',
               'INC': 'INCORPORATED',
               'COLL': 'COLLEGE',
               'CO': ' COMPANY',
               'INST': 'INSTITUTE',
               'INT': 'INTERNATIONAL',
               'M I T': 'MIT',
               'MIT': 'MIT',
               'DIV': 'DIVISION',
               'LLC': 'LIMITED',
               'PUBS': 'PUBLISHERS'}
    # Compile every pattern once, outside the publisher loop. An abbreviation
    # matches when it is a whole word with whitespace on at least one side:
    # at the start followed by whitespace, at the end preceded by whitespace,
    # or surrounded by whitespace. Lookarounds keep the neighbouring
    # whitespace out of the match, so only the abbreviation is substituted.
    patterns = []
    for abbrev, expansion in abbrevs.items():
        word = re.escape(abbrev)
        pattern = re.compile(r'(?<!\S)' + word + r'(?=\s)'
                             r'|(?<=\s)' + word + r'(?!\S)')
        patterns.append((abbrev, pattern, expansion))
    for n in range(len(publishers)):
        for abbrev, pattern, expansion in patterns:
            # search(), not match(): the abbreviation may occur anywhere in
            # the name, not only at its beginning.
            if pattern.search(publishers[n]):
                print('******')
                print(abbrev)
                print(publishers[n])
                print('++++++')
                # Work on the current value so that earlier expansions of the
                # same name are kept, and only whole words are rewritten.
                publishers[n] = pattern.sub(expansion, publishers[n])
                print(publishers[n])
                print('......')
    print(publishers)
    return publishers
- # ---- GENERAL FUNCTIONS ----
def resolve_row(row, serial, n=0):
    '''
    Resolve title and publisher for row, where uid = row['id'] and kind = row['type'].
    - Calls the resolver function named <kind><n>, looked up in this module's globals.
    - If the resolver returns a truthy first element, the canonicalised result
      is logged, counted as a success and returned.
    - Else, if the resolver returns a falsy first element, tries <kind><n+1>, etc.
    - Finally, if the resolver does not exist, counts a fail and returns False, False.

    serial is the 1-based position of the row; it is only used for the
    progress line. The module-level counters successes, fails, kindsfails and
    resolversfails are updated as a side effect.
    '''
    global total, successes, fails
    kind = row['type']
    resolver = kind + str(n)
    try:
        # NOTE(review): the KeyError handler below is meant for a missing
        # resolver name, but it also swallows any KeyError raised inside the
        # resolver or the helpers called here.
        resolved = globals()[resolver](row['id'])
        safeprint("-" * 80)
        safeprint("[", str((serial / total) * 100)[:4], "%] ",
                  serial, "/", total,
                  " (", successes, " successes / ", fails, " fails) ",
                  str((fails / serial) * 100)[:4], "% error rate."
                  " Cc: ", resolver, ":", "\n",
                  " ", resolved[0], ", [", resolved[1], "]")
        sleep(args.sleep)
        # resolved[0] is False if title not found, otherwise string
        # resolved[1] is True if the row was not isbn,
        #                False if it was isbn and publisher was not found,
        #                string if it was isbn and publisher was found.
        if resolved[0]:
            title = u.expand(u.canonical(resolved[0]))
            publisher = resolved[1]
            if isinstance(publisher, str):
                publisher = u.canonical(publisher)
            resolved = title, publisher
            log.writerow([title])
            if isinstance(publisher, str):
                print(" ", title + ", [" + publisher + "]")
            elif publisher:
                # safeprint() would render False as ERROR, so use print()
                print(" ", title + ", [ True ]")
            else:
                print(" ", title + ", [ False ]")
            successes += 1
            return resolved
        else:
            # .get() keeps the tally working for resolver names that were not
            # pre-registered in resolversfails.
            resolversfails[resolver] = resolversfails.get(resolver, 0) + 1
            return resolve_row(row, serial, n + 1)
    except KeyError:
        fails += 1
        # A kind that has no resolver at all is not pre-registered either;
        # indexing it directly would raise a second, uncaught KeyError here.
        kindsfails[kind] = kindsfails.get(kind, 0) + 1
        return False, False
def resolve_rows(rows, sci=False):
    """Resolve rows to titles in parallel.

    Every row is handed to ``resolve_row`` together with its 1-based serial
    number. With ``sci`` set, resolution starts at resolver index 8 instead
    of 0. The number of workers comes from ``args.jobs``.

    Returns whatever ``joblib.Parallel`` returns for the submitted calls.
    """
    n = 8 if sci else 0
    # enumerate(..., start=1) numbers every row; pairing the rows with
    # range(1, len(rows)) was one short and silently dropped the last row.
    jobs = (delayed(resolve_row)(row, serial, n)
            for serial, row in enumerate(rows, start=1))
    return Parallel(n_jobs=args.jobs, verbose=51)(jobs)
-
- # ---- MISC. FUNCTIONS ----
def isbns_in_libthing(rows):
    '''
    Tell how many ISBNs of rows are in Library Thing.

    Reads 'AllLibraryThingISBNs.csv' from s.datadir, prints a one-line
    summary and returns the number of rows of type 'isbn' whose id occurs in
    that file. Duplicate ISBNs in rows are counted once per occurrence.
    '''
    # List of all ISBNs in Library Thing:
    # http://www.librarything.com/feeds/AllLibraryThingISBNs.csv
    path = os.path.join(s.datadir, 'AllLibraryThingISBNs.csv')
    # The context manager closes the file; a set makes each lookup O(1)
    # instead of scanning the whole list once per ISBN.
    with open(path, 'r') as f:
        libthing = {line.rstrip() for line in f}
    isbns = [row['id'] for row in rows if row['type'] == 'isbn']
    hits = sum(1 for isbn in isbns if isbn in libthing)
    print("We have %s ISBNS and only %s of them are in Library Thing." % (str(len(isbns)), str(hits)))
    return hits
def count_titles(titles):
    '''
    Accept a list of titles, writing their frequencies to the module-level
    csv writer out and returning them.

    Returns a list of (title, count) pairs, most frequent first.
    '''
    # Imported locally under its real name: the module-level name `counter`
    # is rebound by a helper function of the same name, so it no longer
    # refers to collections.Counter.
    from collections import Counter
    freq = Counter(titles).most_common()
    for title, count in freq:
        out.writerow([title, count])
    return freq
def count_types(rows):
    """Count how many rows of each kind (isbn, doi, etc.) are in the input.

    Returns a list of (kind, count) pairs, most frequent first, built from
    each row's 'type' entry.
    """
    # Imported locally under its real name: the module-level name `counter`
    # is rebound by a helper function of the same name, so it no longer
    # refers to collections.Counter.
    from collections import Counter
    return Counter(row['type'] for row in rows).most_common()
def filter_kind(kind, rows):
    """Return the rows whose 'type' entry equals kind, in their original order."""
    matching = []
    for record in rows:
        if record['type'] == kind:
            matching.append(record)
    return matching
def scisbn(rows):
    """Return statistics about which ISBNs are scientific.

    Resolves rows, keeps the results with a truthy publisher entry and
    checks that entry against the list from ``scientific_publishers()``.

    Returns a dict of counts plus the share of scientific publishers among
    the results with a known publisher. That share is 0.0 when no result has
    a known publisher, instead of raising ZeroDivisionError.
    """
    list_of_scientific_publishers = scientific_publishers()
    results = resolve_rows(rows)
    nr_all_results = len(results)
    # Remove cases where we could not find the publisher:
    with_publisher = [x for x in results if x[1]]
    nr_results_with_publishers = len(with_publisher)
    scientific = [x for x in with_publisher
                  if x[1] in list_of_scientific_publishers]
    nr_results_from_scientific_publishers = len(scientific)
    nr_results_not_from_scientific_publishers = (
        nr_results_with_publishers - nr_results_from_scientific_publishers)
    if nr_results_with_publishers:
        share = (nr_results_from_scientific_publishers
                 / float(nr_results_with_publishers))
    else:
        share = 0.0
    return {'All books resolved': nr_all_results,
            'Books with known publishers': nr_results_with_publishers,
            'Books with scientific publishers': nr_results_from_scientific_publishers,
            'Books with non-scientific publishers': nr_results_not_from_scientific_publishers,
            'Percent of books with scientific publishers from all books with known publishers': share}
- # ---- INIT ----
# Materialise the reader into a list so that the file is only read once, and
# close the input file as soon as all rows are in memory.
# newline='' is what the csv module asks for on files it reads or writes; it
# lets csv handle line endings itself.
with open(os.path.join(s.datadir, args.inputfile), 'rt', newline='') as infile:
    rows = list(csv.DictReader(infile, delimiter="\t"))
rows = u.correct_isbn_rows(rows)
# The log and output writers stay open for the rest of the script.
log = csv.writer(open(s.logfile, 'w', encoding='utf-8', newline=''),
                 delimiter='|',
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL)
out = csv.writer(open(args.outputfile, 'w', encoding='utf-8', newline=''),
                 delimiter='|',
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL)
- # ---- GLOBAL STATE ----
# Row counters shared by the resolving and reporting functions.
total = len(rows)
fails = 0
successes = 0
# Every name exported by the resolve module gets a fail tally starting at 0.
resolvers = dir(resolve)
resolversfails = dict.fromkeys(resolvers, 0)
# A kind is any resolve name containing '0', minus its last character.
kinds = [name[:-1] for name in dir(resolve) if '0' in name]
kindsfails = dict.fromkeys(kinds, 0)
- # ---- CALLS ----
# Restrict the run to a single kind of row unless 'all' was requested.
if args.kind != 'all':
    rows = filter_kind(args.kind, rows)
    total = len(rows)
m = args.mode
if m == "count":
    pprint(count_titles(resolve_rows(rows)))
    print_fails()
    # NOTE(review): `out` wraps a file that is never flushed or closed before
    # the same path is handed over here - confirm plot_occurrences does not
    # read incomplete content.
    plot_occurrences(args.outputfile, args.discriminant)
elif m == "resolve":
    resolve_rows(rows)
    print_fails()
elif m == "types":
    pprint(count_types(rows))
elif m == "scisbn":
    pprint(scisbn(filter_kind('isbn', rows)))
elif m == "journaltitles":
    # Everything that is not an ISBN row. Testing the type directly selects
    # the same rows as comparing each row against the list of ISBN rows, but
    # in a single pass.
    otherrows = [row for row in rows if row['type'] != 'isbn']
    pprint(count_titles(resolve_rows(otherrows)))
    print_fails()
else:
    print("mode was: '" + str(m) + "'")
    print("="*80)
    print("You have to choose a supported mode!")
    print("Try to run the script without arguments to get a help message.")