PageRenderTime 46ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/resolve.py

https://gitlab.com/maxigas/citotron3
Python | 232 lines | 186 code | 12 blank | 34 comment | 23 complexity | aa8db09e72ddc4708d8005e276c92f14 MD5 | raw file
  1. # Wikipedia citation tools
  2. # resolve.py
  3. # Resolvers
import json

from bs4 import BeautifulSoup as bs
from isbnlib import mask

import utils as u
  7. def doi0(doi):
  8. """
  9. Doi to title and publisher, if either is not found then None is returned in place.
  10. """
  11. url = "http://su8bj7jh4j.search.serialssolutions.com/?id=DOI"
  12. doi = doi.replace('/', '%2F')
  13. url = url.replace('DOI', doi)
  14. # print('URL: ' + url)
  15. try:
  16. html = u.safeget(url).text
  17. soup = bs(html, 'html5lib')
  18. citation_data = dict(zip([x.text.strip() for x in soup.select(".section .citation-data .span3")],
  19. [x.text.strip() for x in soup.select(".section .citation-data .span9")]))
  20. return citation_data['Journal:'], None
  21. except:
  22. return None, None
  23. def doi1(doi):
  24. """
  25. Doi to title and False, or (False, False) if no title found.
  26. """
  27. url = "http://search.crossref.org/dois?q=DOI"
  28. doi = doi.replace('/', '%2F')
  29. url = url.replace('DOI', doi)
  30. # print('URL: ' + url)
  31. try:
  32. content = u.safeget(url)
  33. j = json.loads(content)
  34. if j:
  35. return bs(j[0]['fullCitation'], 'html5lib').select('i')[0].text, None
  36. else:
  37. return None, None
  38. except:
  39. return None, None
  40. # WorldCat Advanced search lookup
  41. def isbn0(isbn):
  42. """
  43. Isbn to title and publisher, if either is not found then None is returned in place.
  44. """
  45. url = "https://www.worldcat.org/search?q=bn%3AISBN&qt=advanced&dblist=638"
  46. url = url.replace('ISBN', isbn)
  47. # print('URL: ' + url)
  48. try:
  49. html = u.safeget(url).text
  50. soup = bs(html, 'html5lib')
  51. title = soup.select(".name a strong")[0].text
  52. if not "ISBN" in title:
  53. publisher = soup.find("span", class_="itemPublisher").text.split(': ')[1].split(',')[0]
  54. if publisher:
  55. return title, publisher
  56. else:
  57. return title, None
  58. else:
  59. return None, None
  60. except:
  61. return None, None
  62. # Bookfinder API
  63. # http://www.bookfinder.com/search/?isbn=ISBN&st=xl&ac=qr
  64. def isbn1(isbn):
  65. """
  66. Isbn to title and publisher, if either is not found then None is returned in place.
  67. """
  68. url = "http://www.bookfinder.com/search/?isbn=ISBN&st=xl&ac=qr"
  69. url = url.replace('ISBN', isbn)
  70. # print('URL: ' + url)
  71. try:
  72. html = safeget(url).text
  73. soup = bs(html, 'html5lib')
  74. title = soup.title.text.strip()
  75. title = title.split(' (')[0]
  76. if title and (('Search error' not in title) and ('ISBN' not in title) and ('BookFinder.com: Forbidden' not in title)):
  77. publisher = soup.find(itemprop="publisher").text.split(',')[0]
  78. return title.split(' by ')[0], publisher or None
  79. else:
  80. return None, None
  81. except:
  82. return None, None
  83. # Isbnsearch API
  84. # http://www.isbnsearch.org/isbn/ISBN
  85. def isbn2(isbn):
  86. """
  87. Isbn to title and False, or (False, False) if no title found.
  88. """
  89. url = "http://www.isbnsearch.org/isbn/ISBN"
  90. url = url.replace('ISBN', isbn)
  91. # print('URL: ' + url)
  92. try:
  93. html = u.safeget(url).text
  94. soup = bs(html, 'html5lib')
  95. title = soup.title.text
  96. publisher = [p.text.split(': ')[1] for p in soup.select(".bookinfo")[0].find_all('p') if "Publisher" in p.text][0]
  97. return title.split('|')[1].split('(')[0].strip(), publisher or None
  98. except:
  99. return None, None
  100. # Open Library API
  101. # http://openlibrary.org/api/books?bibkeys=ISBN:ISBN&details=true
  102. def isbn3(isbn):
  103. """
  104. Isbn to title and publisher, if either is not found then None is returned in place.
  105. """
  106. url = "http://openlibrary.org/api/books?bibkeys=ISBN:ISBN&details=true"
  107. url = url.replace('ISBN', isbn)
  108. # print('URL: ' + url)
  109. try:
  110. content = u.safeget(url).text
  111. j = json.loads(content.replace('var _OLBookInfo = ', '')[:-1])
  112. return j['ISBN:'+isbn]['details']['title'], j['ISBN:'+isbn]['details']['publishers']
  113. except:
  114. return None, None
  115. # Amazon Advanced Search lookup
  116. def isbn4(isbn):
  117. """
  118. Isbn to title and publisher, if either is not found then None is returned in place.
  119. """
  120. url = "http://www.amazon.com/gp/search/ref=sr_adv_b/?search-alias=stripbooks&unfiltered=1&field-isbn=ISBN&field-dateop=During&sort=relevanceexprank&Adv-Srch-Books-Submit.x=22&Adv-Srch-Books-Submit.y=5"
  121. url = url.replace('ISBN', isbn)
  122. # print('URL: ' + url)
  123. try:
  124. html = u.safeget(url).text
  125. soup = bs(html, 'html5lib')
  126. title = soup.select('.s-access-detail-page h2')[0].text
  127. # API required for looking at the details of the book like this:
  128. # publisher_page_link = soup.find("a", class_="a-link-normal a-text-normal")['href']
  129. # publisher_page_soup = bs(u.safeget(publisher_page_link).text, 'html5lib')
  130. return title, None
  131. except:
  132. return False, None
  133. def pmid0(pmid):
  134. """
  135. PMID to title. None is also returned in place of publisher.
  136. """
  137. url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=PMID"
  138. url = url.replace('PMID', pmid)
  139. # print('URL: ' + url)
  140. try:
  141. html = u.safeget(url).text
  142. soup = bs(html, 'html5lib')
  143. return soup.find('item', {'name':"Source"}).text, None
  144. except:
  145. return None, None
  146. def pmc0(pmc):
  147. """
  148. PMC to title. None is also returned in place of publisher.
  149. """
  150. url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id=PMC"
  151. url = url.replace('PMC', pmc)
  152. # print('URL: ' + url)
  153. try:
  154. html = u.safeget(url).text
  155. soup = bs(html, 'html5lib')
  156. return soup.find('item', {'name':"Source"}).text, None
  157. except:
  158. return None, None
  159. def arxiv0(arxiv):
  160. """
  161. Arxiv ID to title. None is also returned in place of publisher.
  162. """
  163. url = "http://arxiv.org/abs/ARXIV"
  164. url = url.replace('ARXIV', arxiv)
  165. # print('URL: ' + url)
  166. try:
  167. html = u.safeget(url).text
  168. soup = bs(html, 'html5lib')
  169. doi = soup.find('meta', {'name':"citation_doi"})['content']
  170. resolved = doi0(doi)
  171. if resolved[0]:
  172. return resolved
  173. else:
  174. return doi1(doi)
  175. except:
  176. return None, None
  177. def arxiv1(arxiv):
  178. """
  179. Arxiv ID to title. None is also returned in place of publisher.
  180. """
  181. url = "http://export.arxiv.org/api/query?id_list=ARXIV"
  182. url = url.replace('ARXIV', arxiv)
  183. # print('URL: ' + url)
  184. try:
  185. html = u.safeget(url).text
  186. soup = bs(html, 'html5lib')
  187. # print('Found doi:', soup.find('arxiv:doi').text)
  188. return doi0(soup.find('arxiv:doi').text)
  189. except:
  190. return None, None
  191. def is_academic(isbn):
  192. """
  193. Thomson Reuters Web of Knowledge Book Master List.
  194. For checking if an ISBN is "scientific" or not.
  195. http://wokinfo.com/cgi-bin/bkci/search.cgi
  196. Example matching ISBN: 978-0-415-59181-2
  197. Note: Probably needs minimum -s 5 to work!
  198. """
  199. try:
  200. html = u.safepost("http://wokinfo.com/cgi-bin/bkci/search.cgi", data={'search': mask(isbn), 'searchtype': 'and'}).text
  201. soup = bs(html, 'html5lib')
  202. title = soup.select("tbody")[1].select("td")[0].text.rstrip()
  203. if 'No matches to your query.' in title:
  204. return False
  205. else:
  206. return True
  207. except:
  208. return None