regurgitator.py | searchcode

/regurgitator.py

https://gitlab.com/crossref/hairball · Python · 376 lines · 249 code · 102 blank · 25 comment · 43 complexity · 8a3daa8857770283313a2379654c5700 MD5 · raw file

import logging
import os
import sys
import time
import json
import concurrent.futures
from collections import OrderedDict

import click
import pathlib
import requests
from bs4 import BeautifulSoup as bs
from jinja2 import Template
from lxml import etree
from rich.logging import RichHandler

import cr_schema

# Identity of this tool and the address Crossref can use to reach its operator.
APP_NAME = "Regurgitator"
CONTACT = "labs@crossref.org"
# Headers sent with every outgoing HTTP request. The key must be the real
# HTTP header name "User-Agent"; "UserAgent" would be transmitted as an
# unrelated custom header and the identification would be lost.
USER_AGENT = {"User-Agent": f"{APP_NAME};  mailto:{CONTACT}"}
# Base URLs of the two services that can supply metadata for a DOI.
CRAPI_URI = "https://api.crossref.org"
XML_API = "http://doi.crossref.org/search/doi"
# Directories (relative to the working directory) holding the jinja2 deposit
# templates and the XSD schema files.
DEPOSIT_TEMPLATE_PATH = "deposit_templates"
SCHEMA_PATH = "schemas"

# Send all log output through rich's handler. The root level is WARNING, so
# INFO messages are only emitted once the level has been lowered.
logging.basicConfig(
    handlers=[RichHandler()],
    level=logging.WARNING,
)
logger = logging.getLogger("rich")

logger.info(f"starting: {APP_NAME}")


def get_template(fn):
    """Load the file at ``fn`` and wrap its text in a jinja2 ``Template``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.
    """
    source = pathlib.Path(fn).read_text(encoding="utf-8")
    return Template(source)


def get_header_name_element(schema_version):
    """Pick the header element name to use for ``schema_version``.

    Versions positioned after "4.3.4" in ``cr_schema.standard`` get "name";
    "4.3.4" itself and everything positioned before it get "depositor_name".
    Raises ``ValueError`` if either version is absent from that list.
    """
    version_changed = "4.3.4"
    cutoff = cr_schema.standard.index(version_changed)
    position = cr_schema.standard.index(schema_version)

    if position > cutoff:
        return "name"
    return "depositor_name"


def standard_template():
    """Return the template loaded from the standard deposit template file."""
    template_file = os.path.join(
        DEPOSIT_TEMPLATE_PATH, "standard_deposit_template.xml"
    )
    return get_template(template_file)


def grant_template():
    """Return the template loaded from the grant deposit template file."""
    template_file = os.path.join(
        DEPOSIT_TEMPLATE_PATH, "grant_deposit_template.xml"
    )
    return get_template(template_file)


def cn(doi, accept="application/vnd.crossref.unixsd+xml", timeout=30):
    """Fetch ``doi`` from the Crossref REST API's ``transform`` route.

    Args:
        doi: the DOI to look up; interpolated into the request path as-is.
        accept: the representation to request (defaults to UNIXSD XML).
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The response body as text. The HTTP status is not checked, so an
        error page is returned just like a successful document.
    """
    response = requests.get(
        f"{CRAPI_URI}/works/{doi}/transform/{accept}",
        headers=USER_AGENT,
        timeout=timeout,
    )
    return response.text


def xml_api(doi, timeout=30):
    """Fetch the UNIXSD record for ``doi`` from the Crossref XML API.

    Args:
        doi: the DOI to look up.
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The response body as text. The HTTP status is not checked.
    """
    # Let requests build the query string so that characters which are legal
    # in a DOI but special in a URL (&, #, +, ...) are percent-encoded
    # instead of silently truncating or splitting the query.
    query = {"pid": CONTACT, "format": "unixsd", "doi": doi}
    response = requests.get(
        XML_API,
        params=query,
        headers=USER_AGENT,
        timeout=timeout,
    )
    return response.text


def get_unixsd(doi):
    """Return the UNIXSD XML text for ``doi``.

    This is the single place where the metadata source is chosen. ``cn`` is
    the preferred source; ``xml_api`` is the alternative that talks to the
    XML API directly. Point ``fetch`` at the other function to switch.
    """
    fetch = cn
    return fetch(doi)


def remove_uneeded_namespaces_from_elements(record):
    """Clear the prefix of every element whose prefix is not a Crossref one.

    An element keeps its prefix only when that prefix is listed in
    ``cr_schema.crossref_namespaces``. ``record`` is modified in place.
    """
    foreign_elements = (
        element
        for element in record.find_all()
        if element.prefix not in cr_schema.crossref_namespaces
    )
    for element in foreign_elements:
        element.prefix = None


def is_namespace(attr: str) -> bool:
    """
    Tell whether an attribute name is a prefixed namespace declaration,
    i.e. whether it begins with "xmlns:".
    """
    declaration_prefix = "xmlns:"
    return attr[: len(declaration_prefix)] == declaration_prefix


def ns_name(attr: str) -> str:
    """
    Given a namespace attribute such as "xmlns:ai", return the part after
    the first colon ("ai").

    Raises ``ValueError`` when ``attr`` contains no colon at all.
    """
    # Split only on the first colon so that a value with further colons does
    # not blow up the two-name unpacking.
    _, ns = attr.split(":", 1)
    return ns


def remove_uneeded_namespaces_decalrations(record):
    """Drop non-Crossref ``xmlns:*`` declarations from the content element.

    The content element is the one named by ``detect_content_type``. Its
    attributes are rebuilt in place: first the namespace declarations whose
    name is in ``cr_schema.crossref_namespaces``, then every attribute that
    is not a namespace declaration. All other declarations are discarded.
    """
    root = record.find(detect_content_type(record))

    kept_declarations = {}
    kept_plain_attrs = {}
    for attr, value in root.attrs.items():
        if not is_namespace(attr):
            kept_plain_attrs[attr] = value
        elif ns_name(attr) in cr_schema.crossref_namespaces:
            kept_declarations[attr] = value

    # Declarations go first, ordinary attributes after them.
    root.attrs = {**kept_declarations, **kept_plain_attrs}


def remove_non_crossref_namespaces(record):
    """Strip non-Crossref namespaces from ``record`` in place.

    Element prefixes are cleaned first, then the namespace declarations on
    the content element.
    """
    cleanup_steps = (
        remove_uneeded_namespaces_from_elements,
        remove_uneeded_namespaces_decalrations,
    )
    for cleanup in cleanup_steps:
        cleanup(record)


def extract_doi_record(xml):
    """Parse ``xml`` and return the first ``crossref`` element found in it."""
    # NB: the parser has to be "lxml-xml". If you specify just "lxml", bs4
    # case-folds element names and the XML will no longer validate.
    soup = bs(xml, "lxml-xml")
    crossref_element = soup.find("crossref")
    return crossref_element


def pp_xml(xml):
    """Re-serialize an XML string with lxml, indented one space per level.

    Returns the pretty-printed document as ``str``.
    """
    raw = xml.encode(encoding="utf-8")
    tree = etree.fromstring(raw)
    etree.indent(tree, space=" ", level=0)
    pretty = etree.tostring(tree, pretty_print=True, encoding=str)
    return pretty


def prettify_xml(xml):
    """Round-trip ``xml`` through BeautifulSoup's XML parser.

    Despite the name, the document is serialized with ``str()``; bs4's
    ``prettify()`` is intentionally not applied.
    """
    soup = bs(xml, features="xml")
    return str(soup)


def detect_content_type(record):
    """Return the tag name of the first element found inside ``record``."""
    first_element = record.find()
    return first_element.name


def validate(xml: str, xsd_path: str) -> bool:
    """Check an XML string against the XSD stored at ``xsd_path``.

    Returns the result of lxml's schema validation. Errors raised by lxml
    while parsing the schema file or the document are not caught here.
    """
    schema = etree.XMLSchema(etree.parse(xsd_path))
    document = etree.fromstring(xml.encode(encoding="utf-8"))
    return schema.validate(document)


def schema_path(content_type, schema_version):
    """Build the path of the XSD file for a content type and schema version.

    Grants use the "grant_id" schema family; every other content type uses
    the "crossref" family.
    """
    if content_type == "grant":
        schema_type = "grant_id"
    else:
        schema_type = "crossref"

    filename = f"{schema_type}{schema_version}.xsd"
    return os.path.join(SCHEMA_PATH, filename)


def move_element_after(record, element_to_move_name, target_element_name):
    """Move the first matching element to just after the first target.

    Nothing happens unless both the element to move and the target element
    are found in ``record``.
    """
    element_to_move = record.find(element_to_move_name)
    if not element_to_move:
        return

    target_element = record.find(target_element_name)
    if not target_element:
        return

    target_element.insert_after(element_to_move)


def remove_attribute_from_elements(record, attribute_name, element_name):
    """Delete ``attribute_name`` from every ``element_name`` element.

    Elements that do not carry the attribute are left untouched. ``record``
    is modified in place.
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    for element in record.find_all(element_name):
        # pop with a default removes the attribute if present and is a
        # no-op otherwise, replacing the separate membership test + del.
        element.attrs.pop(attribute_name, None)


def rename_element(record, old_name, new_name):
    """Rename the first ``old_name`` element in ``record`` to ``new_name``.

    Does nothing when no such element exists.
    """
    element = record.find(old_name)
    if not element:
        return
    element.name = new_name


def copy_to_new_date(date_part, old_pub_date, new_pub_date):
    """Copy one date component from the old date element to the new one.

    If ``old_pub_date`` contains a ``date_part`` element, a fresh element of
    the same name carrying the same text is appended to ``new_pub_date``.
    Otherwise ``new_pub_date`` is left unchanged.
    """
    source_part = old_pub_date.find(date_part)
    if not source_part:
        return

    copied_part = bs(features="lxml-xml").new_tag(date_part)
    copied_part.string = source_part.text
    new_pub_date.append(copied_part)


def canonicalize_date(old_pub_date):
    """Build a new ``publication_date`` element with canonical child order.

    The new element takes over the attributes of ``old_pub_date`` and
    receives whichever of month, day and year the old element contains,
    always in that order.
    """
    soup = bs(features="lxml-xml")
    new_pub_date = soup.new_tag("publication_date")
    new_pub_date.attrs = old_pub_date.attrs

    for date_part in ("month", "day", "year"):
        copy_to_new_date(date_part, old_pub_date, new_pub_date)

    return new_pub_date


def fix_dates(record):
    """Replace every ``publication_date`` in ``record`` with its canonical form.

    Each element is swapped in place for the element built by
    ``canonicalize_date``.
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    for publication_date in record.find_all("publication_date"):
        publication_date.replace_with(canonicalize_date(publication_date))


def degunk_book_chapter(record):
    """Apply the book-specific clean-ups to ``record`` in place.

    Strips a fixed set of attributes, renames the first ``volume`` element
    to ``edition_number`` and canonicalizes the publication dates.
    """
    # (attribute to strip, element it is stripped from)
    attributes_to_strip = (
        ("provider", "doi"),
        ("provider", "rel:intra_work_relation"),
        ("setbyID", "collection"),
    )
    for attribute_name, element_name in attributes_to_strip:
        remove_attribute_from_elements(record, attribute_name, element_name)

    rename_element(record, "volume", "edition_number")

    fix_dates(record)


def degunk_journal_article(record):
    """Apply the journal-specific clean-ups to ``record`` in place.

    Strips the ``provider`` attribute from two element types and then
    relocates several elements. The relocations run in the listed order.
    """
    for element_name in ("doi", "rel:intra_work_relation"):
        remove_attribute_from_elements(record, "provider", element_name)

    # (element to move, element it is placed after)
    # NOTE: moving journal_volume after contributors is currently disabled.
    relocations = (
        ("ai:program", "pages"),
        ("publisher_item", "pages"),
        ("abstract", "contributors"),
    )
    for element_to_move, target in relocations:
        move_element_after(record, element_to_move, target)


def degunk(record, content_type) -> None:
    """
    Remove output schema anomalies.

    The Crossref output schema will often put things in a slightly different
    order from the Crossref deposit schema. It may also include elements that
    are not in the deposit schema. This *needs to be fixed in the output
    schema*; the clean-up here is the current workaround.

    Non-Crossref namespaces are always removed. Content-type specific fixes
    exist for "book" and "journal"; other content types get none.
    """
    remove_non_crossref_namespaces(record)

    type_specific_fixes = {
        "book": degunk_book_chapter,
        "journal": degunk_journal_article,
    }
    fix = type_specific_fixes.get(content_type)
    if fix is not None:
        fix(record)


def validate_all(content_type: str, all_schemas: dict) -> OrderedDict:
    """Validate every rendered document against its own schema version.

    Args:
        content_type: used to pick the schema family via ``schema_path``.
        all_schemas: mapping of schema version -> rendered XML string.

    Returns:
        An ``OrderedDict`` of schema version -> validation result, sorted by
        version key in descending order. A version whose validation raised
        is logged and left out of the result.
    """
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        future_to_version = {
            executor.submit(
                validate, xml, schema_path(content_type, schema_version)
            ): schema_version
            for schema_version, xml in all_schemas.items()
        }

        for future in concurrent.futures.as_completed(future_to_version):
            version = future_to_version[future]

            try:
                results[version] = future.result()
            except Exception as exc:
                # One broken schema or document must not abort the other
                # validations. Report the version that failed; the handler
                # previously referenced an undefined name here and raised
                # NameError instead of logging.
                logger.error("%r generated an exception: %s", version, exc)

    return OrderedDict(sorted(results.items(), reverse=True))


def try_schema_versions(
    record,
    doi,
    content_type,
    doi_batch_id,
    timestamp,
    depositor_name,
    email_address,
    registrant,
):
    """Render ``record`` for every schema version and pick one that validates.

    The deposit template is rendered once per schema version (grant
    versions for content type "grant", standard versions otherwise) and all
    renderings are validated.

    Returns:
        ``(schema_version, xml)`` for the first valid entry in the ordering
        produced by ``validate_all``. When nothing validates, returns
        ``(None, xml)`` where ``xml`` is the rendering for the first listed
        schema version.

    ``doi`` is accepted for interface compatibility but is not used here.
    """
    is_grant = content_type == "grant"

    template = grant_template() if is_grant else standard_template()
    schema_versions = cr_schema.grants if is_grant else cr_schema.standard

    all_schemas = {}
    for schema_version in schema_versions:
        # Grant deposits always use "depositor_name"; for standard deposits
        # the element name depends on the schema version.
        header_name_element = (
            "depositor_name"
            if is_grant
            else get_header_name_element(schema_version)
        )
        all_schemas[schema_version] = template.render(
            body=record,
            schema_version=schema_version,
            doi_batch_id=doi_batch_id,
            timestamp=timestamp,
            depositor_name=depositor_name,
            email_address=email_address,
            registrant=registrant,
            header_name_element=header_name_element,
        )

    validates_against = validate_all(content_type, all_schemas)
    all_valid = [
        version for version, is_valid in validates_against.items() if is_valid
    ]
    # Test the list itself for emptiness, not the truthiness of the keys.
    if not all_valid:
        return None, all_schemas[schema_versions[0]]

    most_recent_to_validate = all_valid[0]
    return most_recent_to_validate, all_schemas[most_recent_to_validate]


def regurgitate(
    doi, doi_batch_id, timestamp, depositor_name, email_address, registrant
):
    """Turn the published metadata of ``doi`` back into a deposit document.

    Fetches the UNIXSD record, re-labels its ``crossref`` element as the
    deposit ``body``, cleans it up and renders it against the available
    schema versions.

    Returns:
        ``(schema_version, xml)`` where ``xml`` is pretty-printed.
        ``schema_version`` is ``None`` when no schema version validated.

    Raises:
        ValueError: if the fetched document contains no ``crossref``
            element (for example because the service returned an error
            page instead of metadata).
    """
    xml = get_unixsd(doi)
    record = extract_doi_record(xml)
    if record is None:
        # Fail with a message that names the DOI instead of an opaque
        # AttributeError on the next line.
        raise ValueError(
            f"no <crossref> element found in the metadata returned for {doi!r}"
        )

    # The crossref element becomes the deposit body; its own attributes are
    # not carried over.
    record.name = "body"
    record.attrs = {}

    content_type = detect_content_type(record)

    degunk(record, content_type)

    schema_version, xml = try_schema_versions(
        record=record,
        doi=doi,
        content_type=content_type,
        doi_batch_id=doi_batch_id,
        timestamp=timestamp,
        depositor_name=depositor_name,
        email_address=email_address,
        registrant=registrant,
    )

    return schema_version, pp_xml(xml)


if __name__ == "__main__":

    @click.command()
    @click.argument("input", type=click.File("rb"), nargs=-1)
    @click.option("-v", "--verbose", default=False, show_default=True, is_flag=True)
    def cli(input, verbose):
        """Regurgitate every DOI listed (one per line) in the INPUT files.

        Each result is written to results/valid-N.xml when some schema
        version validated, or results/invalid-N.xml otherwise.
        """
        # `input` shadows the builtin, but click binds the argument by this
        # name, so it is kept as-is.
        if verbose:
            logging.getLogger().setLevel(logging.INFO)
            logger.info("verbose mode")

        # Collect DOIs, skipping blank lines so they are not looked up as
        # empty DOIs.
        dois = []
        for source in input:
            for raw_line in source:
                doi = raw_line.decode("utf-8").strip()
                if doi:
                    dois.append(doi)

        # Make sure the output directory exists before the first write.
        pathlib.Path("results").mkdir(parents=True, exist_ok=True)

        for index, doi in enumerate(dois):
            ts = int(time.time())
            schema_version, new_xml = regurgitate(
                doi=doi,
                timestamp=ts,
                doi_batch_id=ts,
                email_address="gbilder@crossref.org",
                registrant="crossref",
                depositor_name="gbilder",
            )
            if schema_version:
                fn = f"results/valid-{index}.xml"

            else:
                logger.error(f"failed to regurgitate {doi}")
                fn = f"results/invalid-{index}.xml"

            # Write as UTF-8 explicitly rather than relying on the locale
            # default.
            with open(fn, "w", encoding="utf-8") as out:
                out.write(prettify_xml(new_xml))
            logger.info(f"saved: {fn}")

    cli()
Tech Fingerprint

Alerts (34)

'def' Ensure functions have docstrings for documentation
33 50 56 62 68 75 84 105 121 127 133 139 144 148 157 162 168 174 179 187 200 205 216 243 267 314 346
Complexity hotspot; lines 111 to 112 (total complexity: 3)
111 112
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
171
Complexity hotspot; line 307 (total complexity: 3)
307
Complexity hotspot; lines 353 to 355 (total complexity: 3)
353 354 355