citotron.py - Wikipedia citation tools Main file

/citotron.py

https://gitlab.com/maxigas/citotron · Python · 265 lines · 197 code · 37 blank · 31 comment · 51 complexity · bfb91a53a3e0a3bddc81ebadd2af4bac MD5 · raw file


#!/usr/bin/python3
# Wikipedia citation tools
# citotron.py
# Main file

__version__ = '3.0.0'

from args import args
from bs4 import BeautifulSoup as bs
from collections import Counter as counter
from itertools import repeat
from joblib import Parallel, delayed
from poliplot import *
from pprint import pprint as pprint
from requests import get, post
from resolve import *
from time import sleep
import utils as u
import csv, json, sys, re, os, resolve
import settings as s
        
# ---- HELPERS ----
def counter(name):
    """Return a closure that counts the truthy values it is called with.

    name -- label for the counter; accepted but not used by the counting
            logic itself.

    The returned function takes a single value, increments the running count
    when that value is truthy, and returns the count so far.

    NOTE(review): this definition rebinds the module-level name ``counter``,
    which the import section binds to ``collections.Counter``; any later
    ``counter(iterable)`` call in this module reaches this helper instead.
    """
    i = 0

    def count(x):
        nonlocal i
        if x:
            i += 1
        return i

    # Hand the closure back to the caller; without this the helper is a no-op.
    return count


def print_fails():
    """Print a summary of failures per kind and per resolver, plus totals.

    Reads the module-level ``kindsfails``, ``resolversfails``, ``total``,
    ``fails`` and ``successes``; returns nothing.
    """
    print("=" * 80)
    print("Fails by kind:")
    for k, v in kindsfails.items():
        print("    %s: %s" % (k, v))
    print("Fails by resolver:")
    for k, v in resolversfails.items():
        print("    %s: %s" % (k, v))
    print("Processed rows: %s" % total)
    print("Total fails: %s" % fails)
    print("Total successes: %s" % successes)
    # With no processed rows the rate is undefined; say so instead of
    # dividing by zero.
    if total:
        print("Fail rate:", str((fails / total) * 100)[:4] + "%")
    else:
        print("Fail rate: n/a")


def progress(character):
    """Emit one progress mark on stdout, but only when debugging is enabled.

    character -- the text to print; it is written without a trailing newline
                 and flushed immediately.
    """
    if not args.debug:
        return
    print(character, end='', flush=True)


def safeprint(*args):
    """Print all arguments glued together, showing the object False as "ERROR".

    Every other argument is rendered with str(); nothing is inserted between
    the pieces and a single newline ends the output.
    """
    pieces = []
    for arg in args:
        # Identity test on purpose: only the False object itself is an error
        # marker, other falsy values (0, None, "") are printed as they are.
        pieces.append('ERROR' if arg is False else str(arg))
    print("".join(pieces))


def scientific_publishers():
    """Return a list of scientific publishers read from file.

    Each line of the file ``s.scientific_publishers`` inside ``s.datadir`` is
    stripped and passed through ``u.canonical``.  Afterwards every abbreviation
    from the table below that appears as a whole whitespace-delimited token is
    replaced by its long form; all expansions accumulate on the same name.
    """
    with open(os.path.join(s.datadir, s.scientific_publishers),
              mode='rt', encoding='utf-8') as f:
        publishers = [u.canonical(line.strip()) for line in f]
    # NOTE(review): a few expansions carry a leading or trailing space; they
    # are kept exactly as they were -- confirm whether that is intentional.
    abbrevs = {'SOC': ' SOCIETY OF',
               'ACAD': 'ACADEMY OF ',
               'UNIV': 'UNIVERSITY OF',
               'NATL': 'NATIONAL',
               'PUBL': 'PUBLISHING',
               'LTD': 'LIMITED',
               'CORP': 'CORPORATION',
               'INC': 'INCORPORATED',
               'COLL': 'COLLEGE',
               'CO': ' COMPANY',
               'INST': 'INSTITUTE',
               'INT': 'INTERNATIONAL',
               'M I T': 'MIT',
               'MIT': 'MIT',
               'DIV': 'DIVISION',
               'LLC': 'LIMITED',
               'PUBS': 'PUBLISHERS'}
    # One pattern per abbreviation, compiled once outside the publisher loop.
    # The lookarounds demand that the abbreviation is not glued to any
    # non-whitespace character, so 'CO' cannot fire inside 'CORP'.
    patterns = [(re.compile(r'(?<!\S)' + re.escape(abbrev) + r'(?!\S)'),
                 expansion)
                for abbrev, expansion in abbrevs.items()]
    for n, publisher in enumerate(publishers):
        # Feed each result into the next substitution so that several
        # abbreviations in one name are all expanded.
        for pattern, expansion in patterns:
            publisher = pattern.sub(expansion, publisher)
        publishers[n] = publisher
    return publishers



# ---- GENERAL FUNCTIONS ----

def resolve_row(row, serial, n=0):
    '''
    Resolves title and publisher for row, where uid = row['id'] and kind = row['type'].
    - Looks up the resolver function named <kind><n> in this module's globals
      and calls it with the uid.
    - If resolver returns a truthy first element, it returns the result
      (title canonicalised and expanded, string publisher canonicalised).
    - Else tries <kind><n+1>, etc.
    - Finally, if resolver does not exist, it returns False, False.

    serial is the row's position as supplied by the caller; it only feeds the
    progress line and must not be zero because it is used as a divisor.

    Side effects: prints progress, sleeps for args.sleep after every resolver
    call, writes each resolved title to ``log`` and updates the module-level
    total/successes/fails counters and the kindsfails/resolversfails tallies.
    '''
    global total, successes, fails
    kind = row['type']
    resolver = kind + str(n)
    try:
        # A missing resolver surfaces as KeyError here and ends the chain.
        # NOTE(review): a KeyError raised inside a resolver is caught by the
        # same handler and therefore also counts as "no more resolvers".
        resolved = globals()[resolver](row['id'])
        safeprint("-" * 80)
        safeprint("[", str((serial / total) * 100)[:4],"%] ",
                  serial, "/", total,
                  " (", successes, " successes / ", fails, " fails) ",
                  str((fails / serial) * 100)[:4],"% error rate."
                  " Cc: ", resolver, ":", "\n",
                  "    ", resolved[0], ", [", resolved[1], "]")
        sleep(args.sleep)
        # resolved[0] is False if title not found, otherwise string
        # resolved[1] is True if the row was not isbn,
        #                False if it was isbn and publisher was not found,
        #                string if it was isbn and publisher was found.
        if resolved[0]:
            title = u.expand(u.canonical(resolved[0]))
            publisher = resolved[1]
            if isinstance(publisher, str):
                publisher = u.canonical(publisher)
            resolved = title, publisher
            log.writerow([title])
            if isinstance(publisher, str):
                print("   ", title + ", [" + publisher + "]")
            else:
                # safeprint() would render False as "ERROR", so use print()
                # and show the publisher flag as a plain boolean.
                print("   ", title + ", [ %s ]" % bool(publisher))
            successes += 1
            return resolved
        # .get() keeps the tally working for resolver names that were not
        # pre-registered in the table.
        resolversfails[resolver] = resolversfails.get(resolver, 0) + 1
        return resolve_row(row, serial, n + 1)
    except KeyError:
        fails += 1
        # A kind without any resolver has no pre-made tally entry; create it
        # on the fly so the promised (False, False) is actually returned
        # instead of a second KeyError escaping from this handler.
        kindsfails[kind] = kindsfails.get(kind, 0) + 1
        return False, False


def resolve_rows(rows, sci=False):
    """Resolve rows to titles in parallel.

    rows -- sequence of row dicts, each handed to resolve_row().
    sci  -- when true, every row starts at resolver index 8 instead of 0.

    Returns whatever joblib's Parallel call yields for the generated
    resolve_row() tasks, one task per row.
    """
    n = 8 if sci else 0
    # Number the rows from 1 with enumerate so that every row, including the
    # last one, is dispatched; pairing rows with range(1, len(rows)) is one
    # element short and silently loses the final row.
    return (Parallel(n_jobs=args.jobs,
                     verbose=51)
            (delayed(resolve_row)(row, serial, n)
             for serial, row in enumerate(rows, 1)))
    
# ---- MISC. FUNCTIONS ----

def isbns_in_libthing(rows):
    '''
    Tells how many ISBNS of rows are in Library Thing.

    Only rows whose 'type' is 'isbn' are considered; their 'id' values are
    looked up in the file AllLibraryThingISBNs.csv inside s.datadir, which is
    read as one ISBN per line.  Prints a one-line summary and returns the
    number of hits (repeated ISBNs in rows are counted each time).
    '''
    # List of all ISBNs in Library Thing:
    # http://www.librarything.com/feeds/AllLibraryThingISBNs.csv
    with open(os.path.join(s.datadir, 'AllLibraryThingISBNs.csv'), 'r') as f:
        # A set gives constant-time membership tests for the loop below.
        libthing = {line.rstrip() for line in f}
    isbns = [row['id'] for row in rows if row['type'] == 'isbn']
    hits = sum(1 for isbn in isbns if isbn in libthing)
    print("We have %s ISBNS and only %s of them are in Library Thing." % (str(len(isbns)), str(hits)))
    return hits


def count_titles(titles):
    '''
    Accepts a list of titles, logging and returning their frequencies.

    Returns a list of (title, count) pairs, most frequent first, and writes
    each pair as one row to the module-level ``out`` csv writer.
    '''
    # Imported locally under its real name: the module-level alias ``counter``
    # is rebound by the helper function of the same name defined in this
    # file, so calling ``counter(titles)`` would not reach collections.Counter.
    from collections import Counter
    freq = Counter(titles).most_common()
    for k, v in freq:
        out.writerow([k, v])
    return freq


def count_types(rows):
    """Counts how many rows of each kind (isbn, doi, etc.) are in the input csv.

    rows -- iterable of mappings that each have a 'type' key.

    Returns a list of (type, count) pairs, most frequent first.
    """
    # Imported locally under its real name: the module-level alias ``counter``
    # is rebound by the helper function of the same name defined in this
    # file, so calling ``counter(types)`` would not reach collections.Counter.
    from collections import Counter
    return Counter(row['type'] for row in rows).most_common()


def filter_kind(kind, rows):
    """Return, in their original order, the rows whose 'type' equals kind."""
    matching = []
    for candidate in rows:
        if candidate['type'] == kind:
            matching.append(candidate)
    return matching


def scisbn(rows):
    """Returns statistics about which ISBNs are scientific

    rows are resolved with resolve_rows(); each result is indexed as
    (title, publisher).  A result counts as "known publisher" when its second
    element is truthy and as "scientific" when that element is among the names
    returned by scientific_publishers().

    Returns a dict of counts plus the share of scientific publishers among
    the results with a known publisher.  Despite the word "Percent" in its
    key, that share is a fraction between 0 and 1; it is 0.0 when no result
    has a known publisher.
    """
    # A set makes the per-result publisher lookup constant-time.
    known_scientific = set(scientific_publishers())
    results = resolve_rows(rows)
    nr_all_results = len(results)
    # Remove cases where we could not find the publisher:
    results = [x for x in results if x[1]]
    nr_results_with_publishers = len(results)
    results = [x for x in results if x[1] in known_scientific]
    nr_results_from_scientific_publishers = len(results)
    nr_results_not_from_scientific_publishers = (
        nr_results_with_publishers - nr_results_from_scientific_publishers)
    # Guard the ratio: with no known publishers there is nothing to divide by.
    if nr_results_with_publishers:
        scientific_share = (nr_results_from_scientific_publishers
                            / float(nr_results_with_publishers))
    else:
        scientific_share = 0.0
    return {'All books resolved': nr_all_results,
            'Books with known publishers': nr_results_with_publishers,
            'Books with scientific publishers': nr_results_from_scientific_publishers,
            'Books with non-scientific publishers': nr_results_not_from_scientific_publishers,
            'Percent of books with scientific publishers from all books with known publishers': scientific_share}

# ---- INIT ----

# Read the whole input up front: the file is closed as soon as the rows are
# in memory, and the resulting list can be traversed more than once.
# newline='' leaves line-ending handling to the csv module.
with open(os.path.join(s.datadir, args.inputfile), 'rt', newline='') as _infile:
    rows = list(csv.DictReader(_infile, delimiter="\t"))
rows = u.correct_isbn_rows(rows)
# The two writers below stay open for the rest of the script because
# functions further down write to them; newline='' stops the text layer from
# translating the row terminators that the csv writer emits.
log = csv.writer(open(s.logfile, 'w', encoding='utf-8', newline=''),
                 delimiter='|',
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL)
out = csv.writer(open(args.outputfile, 'w', encoding='utf-8', newline=''),
                 delimiter='|',
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL)


# ---- GLOBAL STATE ----

# Run-wide counters, updated while rows are being resolved.
total = len(rows)
fails = 0
successes = 0

# Every name in the resolve module that contains a '0', minus its last
# character, is treated as a kind; each kind starts with zero failures.
kinds = [name[:-1] for name in dir(resolve) if '0' in name]
kindsfails = dict.fromkeys(kinds, 0)

# One failure tally per name found in the resolve module.
resolvers = dir(resolve)
resolversfails = dict.fromkeys(resolvers, 0)


# ---- CALLS ----

# Optionally restrict the run to a single kind of row and keep the global
# row count in step with what will actually be processed.
if args.kind != 'all':
    rows = filter_kind(args.kind, rows)
    total = len(rows)

# Dispatch on the requested mode.
m = args.mode
if m == "count":
    pprint(count_titles(resolve_rows(rows)))
    print_fails()
    plot_occurrences(args.outputfile, args.discriminant)
elif m == "resolve":
    resolve_rows(rows)
    print_fails()
elif m == "types":
    pprint(count_types(rows))
elif m == "scisbn":
    pprint(scisbn(filter_kind('isbn', rows)))
elif m == "journaltitles":
    # Everything that is not an ISBN row.  Testing the type directly selects
    # the same rows as checking each row against the list of ISBN rows, but
    # in a single pass instead of a quadratic scan.
    otherrows = [row for row in rows if row['type'] != 'isbn']
    pprint(count_titles(resolve_rows(otherrows)))
    print_fails()
else:
    print("mode was: '" + str(m) + "'")
    print("="*80)
    print("You have to choose a supported mode!")
    print("Try to run the script without arguments to get a help message.")

Tech Fingerprint

Alerts (51)

'import *' Avoid wildcard imports to prevent namespace pollution; import specific names or use aliases
13 16
'def' Ensure functions have docstrings for documentation
23 25 31 191
'print(' Use logging module for better control and configurability
32 33 35 36 38 39 40 41 42 48 51 53 82 83 84 85 87 88 89 108 109 127 129 131 133 169 246 253 255 259 262 263 264 265
Complexity hotspot; line 53 (total complexity: 3)
53
'global' Avoid global variables; use function parameters or class attributes for better scope management
104
'try:' Ensure try blocks have corresponding except or finally blocks
106
'isinstance(' Overuse may indicate design issues; consider polymorphism
121 126
'open(' Use 'with open()' to ensure files are properly closed
161
Complexity hotspot; lines 163 to 164 (total complexity: 3)
163 164
'list(' Avoid unnecessary list conversions; use generators where possible
215
Complexity hotspot; lines 231 to 232 (total complexity: 3)
231 232