PageRenderTime 256ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/citotron.py

https://gitlab.com/maxigas/citotron
Python | 265 lines | 252 code | 7 blank | 6 comment | 9 complexity | bfb91a53a3e0a3bddc81ebadd2af4bac MD5 | raw file
Possible License(s): AGPL-3.0
  1. #!/usr/bin/python3
  2. # Wikipedia citation tools
  3. # citotron.py
  4. # Main file
  5. __version__ = '3.0.0'
  6. from args import args
  7. from bs4 import BeautifulSoup as bs
  8. from collections import Counter as counter
  9. from itertools import repeat
  10. from joblib import Parallel, delayed
  11. from poliplot import *
  12. from pprint import pprint as pprint
  13. from requests import get, post
  14. from resolve import *
  15. from time import sleep
  16. import utils as u
  17. import csv, json, sys, re, os, resolve
  18. import settings as s
  19. # ---- HELPERS ----
  20. def counter(name):
  21. i = 0
  22. def counter(x):
  23. nonlocal i
  24. if x:
  25. i +=1
  26. def print_fails():
  27. print("=" * 80)
  28. print("Fails by kind:")
  29. for k,v in kindsfails.items():
  30. print(" %s: %s" % (k,v))
  31. print("Fails by resolver:")
  32. for k,v in resolversfails.items():
  33. print(" %s: %s" % (k,v))
  34. print("Processed rows: %s" % total)
  35. print("Total fails: %s" % fails)
  36. print("Total successes: %s" % successes)
  37. print("Fail rate:", str((fails / total) * 100)[:4] + "%")
  38. def progress(character):
  39. """Convenient dot machine type printer."""
  40. if args.debug:
  41. print(character, end='', flush=True)
  42. def safeprint(*args):
  43. """Convert arguments to strings, False to "ERROR", print the result."""
  44. print(*[str(arg) if arg is not False else 'ERROR' for arg in args], sep="")
  45. def scientific_publishers():
  46. """Return a list of scientific publishers read from file."""
  47. with open(os.path.join(s.datadir, s.scientific_publishers),
  48. mode='rt', encoding='utf-8') as f:
  49. publishers = [u.canonical(l.strip()) for l in f.readlines()]
  50. abbrevs = {'SOC': ' SOCIETY OF',
  51. 'ACAD': 'ACADEMY OF ',
  52. 'UNIV': 'UNIVERSITY OF',
  53. 'NATL': 'NATIONAL',
  54. 'PUBL': 'PUBLISHING',
  55. 'LTD': 'LIMITED',
  56. 'CORP': 'CORPORATION',
  57. 'INC': 'INCORPORATED',
  58. 'COLL': 'COLLEGE',
  59. 'CO': ' COMPANY',
  60. 'INST': 'INSTITUTE',
  61. 'INT': 'INTERNATIONAL',
  62. 'M I T': 'MIT',
  63. 'MIT': 'MIT',
  64. 'DIV': 'DIVISION',
  65. 'LLC': 'LIMITED',
  66. 'PUBS': 'PUBLISHERS'}
  67. for n, publisher in enumerate(publishers):
  68. for abbrev in abbrevs:
  69. r = re.compile('^abbrev\s|\sabbrev$|\sabbrev\s'.replace('abbrev', abbrev))
  70. if re.match(r, publisher):
  71. print('******')
  72. print(abbrev)
  73. print(publisher)
  74. print('++++++')
  75. publishers[n] = publisher.replace(abbrev, abbrevs[abbrev])
  76. print(publishers[n])
  77. print('......')
  78. print(publishers)
  79. return publishers
  80. # ---- GENERAL FUNCTIONS ----
  81. def resolve_row(row, serial, n=0):
  82. '''
  83. Resolves title and publisher for row, where uid = row['id'] and kind = row['type'].
  84. - Calls resolver function resolve.<kind><n>.
  85. - If resolver returns string as first element, it returns the result.
  86. - Else, if resolver returns False as first element, tries resolve.<kind><n+1>, etc.
  87. - Finally, if resolver does not exist, it returns False, False.
  88. '''
  89. global total, successes, fails
  90. resolver = row['type'] + str(n)
  91. try:
  92. resolved = globals()[resolver](row['id'])
  93. safeprint("-" * 80)
  94. safeprint("[", str((serial / total) * 100)[:4],"%] ",
  95. serial, "/", total,
  96. " (", successes, " successes / ", fails, " fails) ",
  97. str((fails / serial) * 100)[:4],"% error rate."
  98. " Cc: ", resolver, ":", "\n",
  99. " ", resolved[0], ", [", resolved[1], "]")
  100. sleep(args.sleep)
  101. # resolved[0] is False if title not found, otherwise string
  102. # resolved[1] is True if the row was not isbn,
  103. # False if it was isbn and publisher was not found,
  104. # string if it was isbn and publisher was found.
  105. if resolved[0]:
  106. if isinstance(resolved[1], str):
  107. resolved = u.expand(u.canonical(resolved[0])), u.canonical(resolved[1])
  108. else:
  109. resolved = u.expand(u.canonical(resolved[0])), resolved[1]
  110. log.writerow([resolved[0]])
  111. if isinstance(resolved[1], str):
  112. print(" ", resolved[0] + ", [" + resolved[1] + "]")
  113. else:
  114. # safeprint() can only print string or boolean False so use print()
  115. if resolved[1]:
  116. print(" ", resolved[0] + ", [ True ]")
  117. else:
  118. print(" ", resolved[0] + ", [ False ]")
  119. successes += 1
  120. return resolved
  121. else:
  122. resolversfails[resolver] += 1
  123. return resolve_row(row, serial, n + 1)
  124. except KeyError:
  125. fails += 1
  126. kindsfails[row['type']] += 1
  127. return False, False
  128. def resolve_rows(rows, sci=False):
  129. """Resolve rows to titles in parallel."""
  130. n = 8 if sci else 0
  131. return (Parallel(n_jobs=args.jobs,
  132. verbose=51)
  133. (delayed(resolve_row)(row, serial, n) for row, serial, n in
  134. zip(rows, range(1, len(rows)), repeat(n))))
  135. # ---- MISC. FUNCTIONS ----
  136. def isbns_in_libthing(rows):
  137. '''
  138. Tells how many ISBNS of rows are in Library Thing.
  139. '''
  140. # List of all ISBNs in Library Thing:
  141. # http://www.librarything.com/feeds/AllLibraryThingISBNs.csv
  142. libthing = open(os.path.join(s.datadir,
  143. 'AllLibraryThingISBNs.csv'), 'r').readlines()
  144. libthing = [x.rstrip() for x in libthing]
  145. isbns = [row['id'] for row in rows if row['type'] == 'isbn']
  146. hits = 0
  147. for isbn in isbns:
  148. if isbn in libthing:
  149. hits += 1
  150. print("We have %s ISBNS and only %s of them are in Library Thing." % (str(len(isbns)), str(hits)))
  151. return hits
  152. def count_titles(titles):
  153. '''
  154. Accepts a list of titles, logging and returning their frequencies.
  155. '''
  156. freq = counter(titles).most_common()
  157. for k, v in freq:
  158. out.writerow([k, v])
  159. return freq
  160. def count_types(rows):
  161. """Counts how many rows of each kind (isbn, doi, etc.) are in the input csv."""
  162. types = []
  163. for row in rows:
  164. types.append(row['type'])
  165. return counter(types).most_common()
  166. def filter_kind(kind, rows):
  167. return [row for row in rows if row['type'] == kind]
  168. def scisbn(rows):
  169. """Returns statistics about which ISBNs are scientific"""
  170. list_of_scientific_publishers = scientific_publishers()
  171. results = resolve_rows(rows)
  172. # Remove cases where we could not find the publisher:
  173. nr_all_results = len(results)
  174. results = [ x for x in results if x[1] ]
  175. nr_results_with_publishers = len(results)
  176. results = [ x for x in results if x[1] in list_of_scientific_publishers ]
  177. nr_results_from_scientific_publishers = len(results)
  178. nr_results_not_from_scientific_publishers = nr_results_with_publishers - nr_results_from_scientific_publishers
  179. return {'All books resolved': nr_all_results,
  180. 'Books with known publishers': nr_results_with_publishers,
  181. 'Books with scientific publishers': nr_results_from_scientific_publishers,
  182. 'Books with non-scientific publishers': nr_results_not_from_scientific_publishers,
  183. 'Percent of books with scientific publishers from all books with known publishers': nr_results_from_scientific_publishers / float(nr_results_with_publishers)}
  184. # ---- INIT ----
  185. # Enumerate generator so that we only read the file once
  186. rows = list(csv.DictReader(open(os.path.join(s.datadir, args.inputfile), 'rt'),
  187. delimiter="\t"))
  188. rows = u.correct_isbn_rows(rows)
  189. log = csv.writer(open(s.logfile, 'w', encoding='utf-8'),
  190. delimiter='|',
  191. quotechar='"',
  192. quoting=csv.QUOTE_MINIMAL)
  193. out = csv.writer(open(args.outputfile, 'w', encoding='utf-8'),
  194. delimiter='|',
  195. quotechar='"',
  196. quoting=csv.QUOTE_MINIMAL)
  197. # ---- GLOBAL STATE ----
  198. total, fails, successes = len(rows), 0, 0
  199. kinds = [x[:-1] for x in dir(resolve) if '0' in x]
  200. kindsfails = {k:v for k, v in zip(kinds, repeat(0))}
  201. resolvers = dir(resolve)
  202. resolversfails = {k:v for k, v in zip(resolvers, repeat(0))}
  203. # ---- CALLS ----
  204. if args.kind != 'all':
  205. rows = filter_kind(args.kind, rows)
  206. total = len(rows)
  207. m = args.mode
  208. if m == "count":
  209. pprint(count_titles(resolve_rows(rows)))
  210. print_fails()
  211. plot_occurrences(args.outputfile, args.discriminant)
  212. elif m == "resolve":
  213. resolve_rows(rows)
  214. print_fails()
  215. elif m == "types":
  216. pprint(count_types(rows))
  217. elif m == "scisbn":
  218. pprint(scisbn(filter_kind('isbn', rows)))
  219. elif m == "journaltitles":
  220. isbnrows = filter_kind('isbn', rows)
  221. otherrows = [ row for row in rows if row not in isbnrows ]
  222. pprint(count_titles(resolve_rows(otherrows)))
  223. print_fails()
  224. else:
  225. print("mode was: '" + str(m) + "'")
  226. print("="*80)
  227. print("You have to choose a supported mode!")
  228. print("Try to run the script without arguments to get a help message.")