PageRenderTime 46ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/resolve.py

https://gitlab.com/maxigas/citotron3
Python | 232 lines | 186 code | 12 blank | 34 comment | 23 complexity | aa8db09e72ddc4708d8005e276c92f14 MD5 | raw file
  1. # Wikipedia citation tools
  2. # resolve.py
  3. # Resolvers
import json

from bs4 import BeautifulSoup as bs
from isbnlib import mask

import utils as u
  7. def doi0(doi):
  8. """
  9. Doi to title and publisher, if either is not found then None is returned in place.
  10. """
  11. url = "http://su8bj7jh4j.search.serialssolutions.com/?id=DOI"
  12. doi = doi.replace('/', '%2F')
  13. url = url.replace('DOI', doi)
  14. # print('URL: ' + url)
  15. try:
  16. html = u.safeget(url).text
  17. soup = bs(html, 'html5lib')
  18. citation_data = dict(zip([x.text.strip() for x in soup.select(".section .citation-data .span3")],
  19. [x.text.strip() for x in soup.select(".section .citation-data .span9")]))
  20. return citation_data['Journal:'], None
  21. except:
  22. return None, None
  23. def doi1(doi):
  24. """
  25. Doi to title and False, or (False, False) if no title found.
  26. """
  27. url = "http://search.crossref.org/dois?q=DOI"
  28. doi = doi.replace('/', '%2F')
  29. url = url.replace('DOI', doi)
  30. # print('URL: ' + url)
  31. try:
  32. content = u.safeget(url)
  33. j = json.loads(content)
  34. if j:
  35. return bs(j[0]['fullCitation'], 'html5lib').select('i')[0].text, None
  36. else:
  37. return None, None
  38. except:
  39. return None, None
  40. # WorldCat Advanced search lookup
  41. def isbn0(isbn):
  42. """
  43. Isbn to title and publisher, if either is not found then None is returned in place.
  44. """
  45. url = "https://www.worldcat.org/search?q=bn%3AISBN&qt=advanced&dblist=638"
  46. url = url.replace('ISBN', isbn)
  47. # print('URL: ' + url)
  48. try:
  49. html = u.safeget(url).text
  50. soup = bs(html, 'html5lib')
  51. title = soup.select(".name a strong")[0].text
  52. if not "ISBN" in title:
  53. publisher = soup.find("span", class_="itemPublisher").text.split(': ')[1].split(',')[0]
  54. if publisher:
  55. return title, publisher
  56. else:
  57. return title, None
  58. else:
  59. return None, None
  60. except:
  61. return None, None
  62. # Bookfinder API
  63. # http://www.bookfinder.com/search/?isbn=ISBN&st=xl&ac=qr
  64. def isbn1(isbn):
  65. """
  66. Isbn to title and publisher, if either is not found then None is returned in place.
  67. """
  68. url = "http://www.bookfinder.com/search/?isbn=ISBN&st=xl&ac=qr"
  69. url = url.replace('ISBN', isbn)
  70. # print('URL: ' + url)
  71. try:
  72. html = safeget(url).text
  73. soup = bs(html, 'html5lib')
  74. title = soup.title.text.strip()
  75. title = title.split(' (')[0]
  76. if title and (('Search error' not in title) and ('ISBN' not in title) and ('BookFinder.com: Forbidden' not in title)):
  77. publisher = soup.find(itemprop="publisher").text.split(',')[0]
  78. return title.split(' by ')[0], publisher or None
  79. else:
  80. return None, None
  81. except:
  82. return None, None
  83. # Isbnsearch API
  84. # http://www.isbnsearch.org/isbn/ISBN
  85. def isbn2(isbn):
  86. """
  87. Isbn to title and False, or (False, False) if no title found.
  88. """
  89. url = "http://www.isbnsearch.org/isbn/ISBN"
  90. url = url.replace('ISBN', isbn)
  91. # print('URL: ' + url)
  92. try:
  93. html = u.safeget(url).text
  94. soup = bs(html, 'html5lib')
  95. title = soup.title.text
  96. publisher = [p.text.split(': ')[1] for p in soup.select(".bookinfo")[0].find_all('p') if "Publisher" in p.text][0]
  97. return title.split('|')[1].split('(')[0].strip(), publisher or None
  98. except:
  99. return None, None
  100. # Open Library API
  101. # http://openlibrary.org/api/books?bibkeys=ISBN:ISBN&details=true
  102. def isbn3(isbn):
  103. """
  104. Isbn to title and publisher, if either is not found then None is returned in place.
  105. """
  106. url = "http://openlibrary.org/api/books?bibkeys=ISBN:ISBN&details=true"
  107. url = url.replace('ISBN', isbn)
  108. # print('URL: ' + url)
  109. try:
  110. content = u.safeget(url).text
  111. j = json.loads(content.replace('var _OLBookInfo = ', '')[:-1])
  112. return j['ISBN:'+isbn]['details']['title'], j['ISBN:'+isbn]['details']['publishers']
  113. except:
  114. return None, None
  115. # Amazon Advanced Search lookup
  116. def isbn4(isbn):
  117. """
  118. Isbn to title and publisher, if either is not found then None is returned in place.
  119. """
  120. url = "http://www.amazon.com/gp/search/ref=sr_adv_b/?search-alias=stripbooks&unfiltered=1&field-isbn=ISBN&field-dateop=During&sort=relevanceexprank&Adv-Srch-Books-Submit.x=22&Adv-Srch-Books-Submit.y=5"
  121. url = url.replace('ISBN', isbn)
  122. # print('URL: ' + url)
  123. try:
  124. html = u.safeget(url).text
  125. soup = bs(html, 'html5lib')
  126. title = soup.select('.s-access-detail-page h2')[0].text
  127. # API required for looking at the details of the book like this:
  128. # publisher_page_link = soup.find("a", class_="a-link-normal a-text-normal")['href']
  129. # publisher_page_soup = bs(u.safeget(publisher_page_link).text, 'html5lib')
  130. return title, None
  131. except:
  132. return False, None
  133. def pmid0(pmid):
  134. """
  135. PMID to title. None is also returned in place of publisher.
  136. """
  137. url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=PMID"
  138. url = url.replace('PMID', pmid)
  139. # print('URL: ' + url)
  140. try:
  141. html = u.safeget(url).text
  142. soup = bs(html, 'html5lib')
  143. return soup.find('item', {'name':"Source"}).text, None
  144. except:
  145. return None, None
  146. def pmc0(pmc):
  147. """
  148. PMC to title. None is also returned in place of publisher.
  149. """
  150. url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id=PMC"
  151. url = url.replace('PMC', pmc)
  152. # print('URL: ' + url)
  153. try:
  154. html = u.safeget(url).text
  155. soup = bs(html, 'html5lib')
  156. return soup.find('item', {'name':"Source"}).text, None
  157. except:
  158. return None, None
  159. def arxiv0(arxiv):
  160. """
  161. Arxiv ID to title. None is also returned in place of publisher.
  162. """
  163. url = "http://arxiv.org/abs/ARXIV"
  164. url = url.replace('ARXIV', arxiv)
  165. # print('URL: ' + url)
  166. try:
  167. html = u.safeget(url).text
  168. soup = bs(html, 'html5lib')
  169. doi = soup.find('meta', {'name':"citation_doi"})['content']
  170. resolved = doi0(doi)
  171. if resolved[0]:
  172. return resolved
  173. else:
  174. return doi1(doi)
  175. except:
  176. return None, None
  177. def arxiv1(arxiv):
  178. """
  179. Arxiv ID to title. None is also returned in place of publisher.
  180. """
  181. url = "http://export.arxiv.org/api/query?id_list=ARXIV"
  182. url = url.replace('ARXIV', arxiv)
  183. # print('URL: ' + url)
  184. try:
  185. html = u.safeget(url).text
  186. soup = bs(html, 'html5lib')
  187. # print('Found doi:', soup.find('arxiv:doi').text)
  188. return doi0(soup.find('arxiv:doi').text)
  189. except:
  190. return None, None
  191. def is_academic(isbn):
  192. """
  193. Thomson Reuters Web of Knowledge Book Master List.
  194. For checking if an ISBN is "scientific" or not.
  195. http://wokinfo.com/cgi-bin/bkci/search.cgi
  196. Example matching ISBN: 978-0-415-59181-2
  197. Note: Probably needs minimum -s 5 to work!
  198. """
  199. try:
  200. html = u.safepost("http://wokinfo.com/cgi-bin/bkci/search.cgi", data={'search': mask(isbn), 'searchtype': 'and'}).text
  201. soup = bs(html, 'html5lib')
  202. title = soup.select("tbody")[1].select("td")[0].text.rstrip()
  203. if 'No matches to your query.' in title:
  204. return False
  205. else:
  206. return True
  207. except:
  208. return None