/regurgitator.py
Python | 376 lines | 249 code | 102 blank | 25 comment | 43 complexity | 8a3daa8857770283313a2379654c5700 MD5 | raw file
- import logging
- import os
- import sys
- import time
- import json
- import concurrent.futures
- from collections import OrderedDict
- import click
- import pathlib
- import requests
- from bs4 import BeautifulSoup as bs
- from jinja2 import Template
- from lxml import etree
- from rich.logging import RichHandler
- import cr_schema
# Name and contact address of this tool; both are embedded in the user agent.
APP_NAME = "Regurgitator"
CONTACT = "labs@crossref.org"
# Headers sent with every HTTP request made by this module. The standard
# header name is "User-Agent" (hyphenated); any other spelling is transmitted
# as an unrelated custom header and the server never sees the contact details
# as the user agent.
USER_AGENT = {"User-Agent": f"{APP_NAME}; mailto:{CONTACT}"}
# Base URL of the Crossref REST API.
CRAPI_URI = "https://api.crossref.org"
# Endpoint of the Crossref XML (DOI search) API.
XML_API = "http://doi.crossref.org/search/doi"
# Directories (relative to the working directory) holding the Jinja deposit
# templates and the XSD schema files.
DEPOSIT_TEMPLATE_PATH = "deposit_templates"
SCHEMA_PATH = "schemas"
# Send all log output through rich's handler. The root level starts at
# WARNING, so the start-up message below is only visible if the level has
# been lowered beforehand.
logging.basicConfig(handlers=[RichHandler()], level=logging.WARNING)
logger = logging.getLogger("rich")
logger.info(f"starting: {APP_NAME}")
def get_template(fn):
    """Read the file at ``fn`` and return its contents as a Jinja2 ``Template``."""
    return Template(pathlib.Path(fn).read_text())
def get_header_name_element(schema_version):
    """Return the header element name to use for ``schema_version``.

    Versions listed after "4.3.4" in ``cr_schema.standard`` get ``"name"``;
    "4.3.4" itself and every version listed before it get
    ``"depositor_name"``.
    """
    known_versions = cr_schema.standard
    cutoff = known_versions.index("4.3.4")
    position = known_versions.index(schema_version)
    if position <= cutoff:
        return "depositor_name"
    return "name"
def standard_template():
    """Load the standard deposit template from the deposit template directory."""
    template_file = os.path.join(
        DEPOSIT_TEMPLATE_PATH, "standard_deposit_template.xml"
    )
    return get_template(template_file)
def grant_template():
    """Load the grant deposit template from the deposit template directory."""
    template_file = os.path.join(
        DEPOSIT_TEMPLATE_PATH, "grant_deposit_template.xml"
    )
    return get_template(template_file)
def cn(doi, accept="application/vnd.crossref.unixsd+xml", timeout=30):
    """Fetch the record for ``doi`` through the REST API ``transform`` route.

    Args:
        doi: the DOI to look up (raw, not already percent-encoded).
        accept: media type appended after ``/transform/`` in the URL.
        timeout: seconds to wait for the server; without one, ``requests``
            waits indefinitely on an unresponsive server.

    Returns:
        The response body as text. The HTTP status is not checked, so an
        error page is returned as-is.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    from urllib.parse import quote

    # Percent-encode the DOI so characters such as "?" and "#" stay part of
    # the path instead of starting a query string or fragment. "/" is kept
    # because it separates the DOI prefix from its suffix.
    encoded_doi = quote(doi, safe="/")
    response = requests.get(
        f"{CRAPI_URI}/works/{encoded_doi}/transform/{accept}",
        headers=USER_AGENT,
        timeout=timeout,
    )
    return response.text
def xml_api(doi, timeout=30):
    """Fetch the UNIXSD record for ``doi`` directly from the XML API.

    Args:
        doi: the DOI to look up (raw, not already percent-encoded).
        timeout: seconds to wait for the server; without one, ``requests``
            waits indefinitely on an unresponsive server.

    Returns:
        The response body as text. The HTTP status is not checked.

    Raises:
        requests.exceptions.Timeout: if the server does not answer in time.
    """
    # Let requests build the query string so that "&", "+", "#" and similar
    # characters inside a DOI are encoded rather than corrupting the query.
    query = {"pid": CONTACT, "format": "unixsd", "doi": doi}
    response = requests.get(
        XML_API,
        params=query,
        headers=USER_AGENT,
        timeout=timeout,
    )
    return response.text
def get_unixsd(doi):
    """Return the metadata XML for ``doi``.

    This is the single switch point between the two sources: ``cn`` is the
    preferred one; to query the XML API directly, call ``xml_api(doi)``
    here instead.
    """
    unixsd = cn(doi)
    return unixsd
def remove_uneeded_namespaces_from_elements(record):
    """Clear the prefix of each element whose prefix is not a Crossref namespace.

    Elements are modified in place; nothing is returned.
    """
    for element in record.find_all():
        is_crossref = element.prefix in cr_schema.crossref_namespaces
        if is_crossref:
            continue
        element.prefix = None
def is_namespace(attr: str) -> bool:
    """Report whether the attribute name ``attr`` begins with ``xmlns:``."""
    declaration_prefix = "xmlns:"
    return attr.startswith(declaration_prefix)
def ns_name(attr: str) -> str:
    """Given a namespace attribute such as ``xmlns:rel``, return its name (``rel``).

    Raises:
        ValueError: if ``attr`` does not contain exactly one ``:``.
    """
    _, ns = attr.split(":")
    return ns
def remove_uneeded_namespaces_decalrations(record):
    """Drop non-Crossref ``xmlns:`` declarations from the content-type element.

    The element is looked up by the name ``detect_content_type`` reports.
    Its attributes are rebuilt so that the Crossref namespace declarations
    come first, followed by every attribute that is not a namespace
    declaration; all other namespace declarations are discarded.
    """
    content_type = detect_content_type(record)
    root = record.find(content_type)
    kept_declarations = {}
    kept_plain_attributes = {}
    for attr, value in root.attrs.items():
        if not is_namespace(attr):
            kept_plain_attributes[attr] = value
        elif ns_name(attr) in cr_schema.crossref_namespaces:
            kept_declarations[attr] = value
    root.attrs = {**kept_declarations, **kept_plain_attributes}
def remove_non_crossref_namespaces(record):
    """Strip non-Crossref namespaces: element prefixes first, then declarations."""
    cleanup_steps = (
        remove_uneeded_namespaces_from_elements,
        remove_uneeded_namespaces_decalrations,
    )
    for step in cleanup_steps:
        step(record)
def extract_doi_record(xml):
    """Parse ``xml`` and return the first ``crossref`` element found in it."""
    # NB: the parser must be "lxml-xml". With just "lxml", bs4 case-folds
    # element names and the XML will no longer validate.
    soup = bs(xml, "lxml-xml")
    crossref_element = soup.find("crossref")
    return crossref_element
def pp_xml(xml):
    """Parse the XML string ``xml`` with lxml, re-indent it, and return it as ``str``."""
    encoded = xml.encode(encoding="utf-8")
    tree = etree.fromstring(encoded)
    etree.indent(tree, space=" ", level=0)
    pretty = etree.tostring(tree, pretty_print=True, encoding=str)
    return pretty
def prettify_xml(xml):
    """Parse ``xml`` with BeautifulSoup's XML parser and return it as a string."""
    # NOTE: bs(xml, features="lxml-xml").prettify() is the alternative
    # rendering; it is not used here.
    soup = bs(xml, features="xml")
    return str(soup)
def detect_content_type(record):
    """Return the name of the first element that ``record.find()`` yields."""
    first_element = record.find()
    return first_element.name
def validate(xml: str, xsd_path: str) -> bool:
    """Check the XML string ``xml`` against the XSD file at ``xsd_path``.

    Returns the result of lxml's ``XMLSchema.validate`` for the document.
    """
    schema = etree.XMLSchema(etree.parse(xsd_path))
    document = etree.fromstring(xml.encode(encoding="utf-8"))
    return schema.validate(document)
def schema_path(content_type, schema_version):
    """Build the path of the XSD file for a content type and schema version.

    Grants use the ``grant_id`` schema family; every other content type
    uses ``crossref``.
    """
    if content_type == "grant":
        schema_type = "grant_id"
    else:
        schema_type = "crossref"
    filename = f"{schema_type}{schema_version}.xsd"
    return os.path.join(SCHEMA_PATH, filename)
def move_element_after(record, element_to_move_name, target_element_name):
    """Re-insert the first matching element right after the first target element.

    Does nothing when either lookup comes back empty. The target is only
    looked up once the element to move has been found.
    """
    element_to_move = record.find(element_to_move_name)
    if not element_to_move:
        return
    target_element = record.find(target_element_name)
    if not target_element:
        return
    target_element.insert_after(element_to_move)
def remove_attribute_from_elements(record, attribute_name, element_name):
    """Delete ``attribute_name`` from every ``element_name`` element in ``record``.

    Elements that do not carry the attribute are left untouched. Nothing is
    returned; the elements are modified in place.
    """
    # find_all is the current bs4 spelling; findAll is a deprecated alias.
    for element in record.find_all(element_name):
        element.attrs.pop(attribute_name, None)
def rename_element(record, old_name, new_name):
    """Rename the first ``old_name`` element in ``record`` to ``new_name``, if any."""
    element = record.find(old_name)
    if not element:
        return
    element.name = new_name
def copy_to_new_date(date_part, old_pub_date, new_pub_date):
    """Copy one date component from ``old_pub_date`` onto ``new_pub_date``.

    ``date_part`` is the element name to copy (the caller in this file passes
    "month", "day" and "year"). If ``old_pub_date`` has such an element, a
    new element with the same name and text is appended to ``new_pub_date``;
    otherwise nothing happens.
    """
    source_part = old_pub_date.find(date_part)
    if not source_part:
        return
    factory = bs(features="lxml-xml")
    copied_part = factory.new_tag(date_part)
    copied_part.string = source_part.text
    new_pub_date.append(copied_part)
def canonicalize_date(old_pub_date):
    """Build a fresh ``publication_date`` element with parts in a fixed order.

    The new element reuses the attributes of ``old_pub_date`` and receives
    whichever of month, day and year the old element has, always in that
    order.
    """
    factory = bs(features="lxml-xml")
    rebuilt = factory.new_tag("publication_date")
    rebuilt.attrs = old_pub_date.attrs
    for date_part in ("month", "day", "year"):
        copy_to_new_date(date_part, old_pub_date, rebuilt)
    return rebuilt
def fix_dates(record):
    """Replace every ``publication_date`` in ``record`` with its canonical form.

    Each element is swapped in place for the element that
    ``canonicalize_date`` builds from it. Nothing is returned.
    """
    # find_all is the current bs4 spelling; findAll is a deprecated alias.
    for publication_date in record.find_all("publication_date"):
        publication_date.replace_with(canonicalize_date(publication_date))
def degunk_book_chapter(record):
    """Apply the book-specific clean-ups to ``record`` in place.

    Removes three attributes, renames the first ``volume`` element to
    ``edition_number``, and rewrites the publication dates.
    """
    attribute_removals = (
        ("provider", "doi"),
        ("provider", "rel:intra_work_relation"),
        ("setbyID", "collection"),
    )
    for attribute_name, element_name in attribute_removals:
        remove_attribute_from_elements(record, attribute_name, element_name)
    rename_element(record, "volume", "edition_number")
    fix_dates(record)
def degunk_journal_article(record):
    """Apply the journal-specific clean-ups to ``record`` in place.

    Removes the ``provider`` attribute from two element types, then
    repositions three elements relative to their anchors.
    """
    for element_name in ("doi", "rel:intra_work_relation"):
        remove_attribute_from_elements(record, "provider", element_name)
    relocations = (
        ("ai:program", "pages"),
        ("publisher_item", "pages"),
        ("abstract", "contributors"),
    )
    for element_to_move_name, target_element_name in relocations:
        move_element_after(record, element_to_move_name, target_element_name)
    # NOTE: moving "journal_volume" after "contributors" is left disabled.
def degunk(record, content_type) -> None:
    """
    Remove output schema anomalies.

    The Crossref output schema often orders things slightly differently from
    the Crossref deposit schema, and it may include elements that are not in
    the deposit schema at all. This *needs to be fixed in the output schema*;
    until then this function is the workaround.

    Non-Crossref namespaces are always removed. After that, "book" and
    "journal" records get their own extra clean-up; other content types get
    none.
    """
    remove_non_crossref_namespaces(record)
    if content_type == "book":
        degunk_book_chapter(record)
        return
    if content_type == "journal":
        degunk_journal_article(record)
def validate_all(content_type: str, all_schemas: dict) -> OrderedDict:
    """Validate each rendered deposit against its own schema version.

    Args:
        content_type: content type used to pick the schema family.
        all_schemas: mapping of schema version -> rendered XML string.

    Returns:
        An ``OrderedDict`` of schema version -> validation result, sorted by
        version string in descending order. The sort is a plain string
        comparison. A version whose validation raised is logged and left out
        of the result.
    """
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        # Map each pending validation back to the version it belongs to.
        future_to_version = {
            executor.submit(
                validate, xml, schema_path(content_type, schema_version)
            ): schema_version
            for schema_version, xml in all_schemas.items()
        }
        for future in concurrent.futures.as_completed(future_to_version):
            version = future_to_version[future]
            try:
                results[version] = future.result()
            except Exception as exc:
                # One failing schema must not abort the others, so log it
                # and carry on with the remaining versions.
                logger.error("%r generated an exception: %s", version, exc)
    return OrderedDict(sorted(results.items(), reverse=True))
def try_schema_versions(
    record,
    doi,
    content_type,
    doi_batch_id,
    timestamp,
    depositor_name,
    email_address,
    registrant,
):
    """Render ``record`` for every known schema version and pick a valid one.

    Args:
        record: the record body passed to the template as ``body``.
        doi: accepted for interface compatibility; not used here.
        content_type: "grant" selects the grant template and grant schema
            versions; anything else selects the standard ones.
        doi_batch_id, timestamp, depositor_name, email_address, registrant:
            values passed straight through to the template.

    Returns:
        ``(schema_version, xml)``. ``schema_version`` is the first valid
        version in the order ``validate_all`` returns them. When nothing
        validates it is ``None`` and ``xml`` is the rendering for the first
        entry of the schema version list.
    """
    is_grant = content_type == "grant"
    template = grant_template() if is_grant else standard_template()
    schema_versions = cr_schema.grants if is_grant else cr_schema.standard

    all_schemas = {}
    for schema_version in schema_versions:
        # Grant deposits always use "depositor_name"; for standard deposits
        # the element name depends on the schema version.
        header_name_element = (
            "depositor_name"
            if is_grant
            else get_header_name_element(schema_version)
        )
        all_schemas[schema_version] = template.render(
            body=record,
            schema_version=schema_version,
            doi_batch_id=doi_batch_id,
            timestamp=timestamp,
            depositor_name=depositor_name,
            email_address=email_address,
            registrant=registrant,
            header_name_element=header_name_element,
        )

    validates_against = validate_all(content_type, all_schemas)
    all_valid = [
        version for version, is_valid in validates_against.items() if is_valid
    ]
    # Test for "no valid versions" directly instead of the truthiness of the
    # version keys themselves.
    if not all_valid:
        return None, all_schemas[schema_versions[0]]
    most_recent_to_validate = all_valid[0]
    return most_recent_to_validate, all_schemas[most_recent_to_validate]
def regurgitate(
    doi, doi_batch_id, timestamp, depositor_name, email_address, registrant
):
    """Fetch the record for ``doi`` and turn it back into deposit XML.

    The fetched ``crossref`` element is renamed to ``body``, stripped of its
    attributes, cleaned up for its content type, and then rendered against
    the available schema versions.

    Args:
        doi: the DOI to fetch.
        doi_batch_id, timestamp, depositor_name, email_address, registrant:
            header values passed on to the deposit template.

    Returns:
        ``(schema_version, xml)`` where ``schema_version`` is the version
        chosen by ``try_schema_versions`` (``None`` if none validated) and
        ``xml`` is the pretty-printed deposit.

    Raises:
        ValueError: if the fetched document contains no ``crossref`` element.
    """
    xml = get_unixsd(doi)
    record = extract_doi_record(xml)
    if record is None:
        # Fail with a clear message instead of an AttributeError on None
        # further down.
        raise ValueError(f"no crossref record found in the response for {doi!r}")
    record.name = "body"
    record.attrs = {}
    content_type = detect_content_type(record)
    degunk(record, content_type)
    schema_version, xml = try_schema_versions(
        record=record,
        doi=doi,
        content_type=content_type,
        doi_batch_id=doi_batch_id,
        timestamp=timestamp,
        depositor_name=depositor_name,
        email_address=email_address,
        registrant=registrant,
    )
    return schema_version, pp_xml(xml)
if __name__ == "__main__":

    @click.command()
    @click.argument("input", type=click.File("rb"), nargs=-1)
    @click.option("-v", "--verbose", default=False, show_default=True, is_flag=True)
    def cli(input, verbose):
        """Regurgitate every DOI listed (one per line) in the INPUT files.

        Each result is written to results/valid-N.xml when it validated
        against some schema version, or results/invalid-N.xml when it did
        not, where N is the position of the DOI in the combined input.
        """
        if verbose:
            logging.getLogger().setLevel(logging.INFO)
            logger.info("verbose mode")

        dois = []
        for doi_file in input:
            for line in doi_file:
                doi = line.decode("utf-8").rstrip()
                # Skip blank lines; an empty DOI cannot be looked up.
                if doi:
                    dois.append(doi)

        # The output directory is not guaranteed to exist yet.
        os.makedirs("results", exist_ok=True)

        for index, doi in enumerate(dois):
            ts = int(time.time())
            schema_version, new_xml = regurgitate(
                doi=doi,
                timestamp=ts,
                doi_batch_id=ts,
                email_address="gbilder@crossref.org",
                registrant="crossref",
                depositor_name="gbilder",
            )
            if schema_version:
                fn = f"results/valid-{index}.xml"
            else:
                logger.error(f"failed to regurgitate {doi}")
                fn = f"results/invalid-{index}.xml"
            # Write UTF-8 explicitly so the output does not depend on the
            # platform's default encoding.
            with open(fn, "w", encoding="utf-8") as out:
                out.write(prettify_xml(new_xml))
            logger.info(f"saved: {fn}")

    cli()