institution_test_suite.py - Find the match file between old…

/institution_test_suite.py

https://github.com/adsabs/scripts-affiliation-disambiguation · Python · 187 lines · 140 code · 45 blank · 2 comment · 48 complexity · 24b55f8c024b0bcdc901eecc5b3ed5f9 MD5 · raw file


from collections import defaultdict
import Levenshtein
import marshal
import os
import re
import time

import institution_searcher as s

RE_SPACES = re.compile('\s+')

def get_icns(reextract=False):
    if reextract:
        os.chdir('desy_affiliations')
        import desy_affs
        icns = desy_affs.get_icns()
        os.chdir('..')

        # Find the match file between old and new ICNs.
        match = dict(line.strip().split('\t') for line in open('old_new.txt').readlines())

        for icn in icns.keys():
            if '; 'in icn or icn.endswith(' to be removed'):
                # Delete pairs of affiliations.
                del icns[icn]
            else:
                if icn in match:
                    new_icn = RE_SPACES.sub(' ', match[icn].strip())
                else:
                    new_icn = icn

                try:
                    new_icn = new_icn.decode('utf-8')
                except:
                    pass

                new_icn = RE_SPACES.sub(' ', new_icn.strip())

                if new_icn != icn:
                    icns[new_icn] = sorted(list(set(icns.get(new_icn, []) + icns.pop(icn, []))))

        return icns
    else:
        return marshal.load(open('icns.marshal'))

PROCESS_NUMBER = 20

def extend_icns(icns):
    out = []
    for icn, institutions in icns.items():
        for institution in institutions:
            out.append((icn, institution))
    return out

def test_icns_only(icns):
    return test([(icn, icn) for icn in icns])

def analyse_icns(res, icns):
    out = []
    for original, _, matched in res:
        if matched is not None:
            if not isinstance(original, str):
                original = original.encode('utf-8')
            if not isinstance(matched, str):
                matched = matched.encode('utf-8')

            if original != matched:
                out.append((
                    Levenshtein.distance(original, matched),
                    1. / len(icns[original]),
                    original,
                    matched
                    ))
    return sorted(out)

def test(icns):
    if isinstance(icns, dict):
        icns = extend_icns(icns)

    results = []
    chunk_size = len(icns) / PROCESS_NUMBER + 1

    while icns:
        chunk = icns[:chunk_size]
        icns = icns[chunk_size:]
        results.append(s.match_institutions.delay(chunk))

    while not all([r.ready() for r in results]):
        time.sleep(0.1)

    out = []
    for r in results:
        out += r.result

    print_statistics(out)

    return out

def test_ratio(icns):
    res = test(icns)
    correct_matches, incorrect_matches = separate_results(res)

    correct_ratios = compute_ratios(correct_matches)
    incorrect_ratios = compute_ratios(incorrect_matches)

    return correct_ratios, incorrect_ratios

def compute_ratios(matches):
    out = []

    for inst, results in get_two_first_results(matches):
        if results and len(results) >= 2:
            out.append((inst, float(int(results[1][0] / results[0][0] * 20)) / 20))

    return out

def display_ratios(ratios):
    clustered = defaultdict(list)
    for inst, ratio in ratios:
        clustered[ratio].append(clustered)

    for i in [float(i) / 20 for i in range(0, 21)]:
        print len(clustered[i])

def get_two_first_results(institutions):
    results = []
    chunk_size = len(institutions) / PROCESS_NUMBER + 1

    first = []

    while institutions:
        chunk = institutions[:chunk_size]
        institutions = institutions[chunk_size:]
        results.append(s.get_match_ratio.delay(chunk))

    while not all([r.ready() for r in results]):
        time.sleep(0.1)

    out = []
    for r in results:
        out += r.result

    return out

def separate_results(res):
    correct, error = [], []
    for r in res:
        if r[0] == r[2]:
            correct.append(r[1])
        else:
            error.append(r[1])

    return correct, error

def get_sorted_errors(res):
    errors = [r for r in res if r[0] != r[2]]
    sorted_errors = defaultdict(list)
    for r in errors:
        sorted_errors[r[0]].append((r[1], r[2]))

    return sorted(((len(v), k) for k, v in sorted_errors.items()), reverse=True)

PREVIOUS_SCORE = 0

def print_statistics(results):
    correct = [r for r in results if r[0] == r[2]]
    score = float(len(correct)) / len(results) * 100
    print '%s/%s (%.2f%%)' % (len(correct), len(results), score)
    global PREVIOUS_SCORE
    print 'Previous score: %.2f%%' % PREVIOUS_SCORE
    PREVIOUS_SCORE = score

def format_results(results):

    from BeautifulSoup import UnicodeDammit

    new_results = []
    for line in results:
        new_line = []
        for elem in line:
            if elem is None:
                new_line.append('')
            else:
                new_line.append(UnicodeDammit(elem).unicode)
        new_results.append('\t'.join(new_line))

    return '\n'.join(new_results)

Tech Fingerprint

Alerts (30)

'def' Ensure functions have docstrings for documentation
12 48 55 58 76 99 108 117 125 145 155 165 173
'open(' Use 'with open()' to ensure Files are properly closed
20 44
Complexity hotspot; lines 22 to 23 (total complexity: 3)
22 23
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
25
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
34
Complexity hotspot; lines 60 to 62 (total complexity: 3)
60 61 62
'isinstance(' Overuse may indicate design issues; consider polymorphism
62 64 77
'time.sleep(' Avoid blocking; use threading.Timer or asyncio.sleep for non-blocking delays
89 137
Complexity hotspot; lines 111 to 112 (total complexity: 3)
111 112
'global' Avoid global variables; use function parameters or class attributes for better scope management
169