bibrecord.py - Some values used for the RXP parsing. Find o…

/modules/bibrecord/lib/bibrecord.py

https://github.com/chokribr/invenio-1 · Python · 1898 lines · 1686 code · 64 blank · 148 comment · 124 complexity · 71aa8a445c3535b820ce0b586bb83959 MD5 · raw file
Large files are truncated click here to view the full file

# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""BibRecord - XML MARC processing library for Invenio.

For API, see create_record(), record_get_field_instances() and friends
in the source code of this file in the section entitled INTERFACE.

Note: Does not access the database, the input is MARCXML only."""

### IMPORT INTERESTING MODULES AND XML PARSERS

import re
import sys
from cStringIO import StringIO

if sys.hexversion < 0x2040000:
    # pylint: disable=W0622
    from sets import Set as set
    # pylint: enable=W0622

from invenio.bibrecord_config import CFG_MARC21_DTD, \
    CFG_BIBRECORD_WARNING_MSGS, CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL, \
    CFG_BIBRECORD_DEFAULT_CORRECT, CFG_BIBRECORD_PARSERS_AVAILABLE, \
    InvenioBibRecordParserError, InvenioBibRecordFieldError
from invenio.config import CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
from invenio.textutils import encode_for_xml

# Some values used for the RXP parsing.
# NOTE(review): presumably the positions of the tag name, the attributes and
# the children inside an RXP node tuple -- confirm against the RXP parsing
# code, which is not part of this section.
TAG, ATTRS, CHILDREN = 0, 1, 2

# Find out about the best usable parser: every parser below whose import
# succeeds AND which is listed in CFG_BIBRECORD_PARSERS_AVAILABLE gets
# appended, in the order pyrxp, lxml, 4suite, minidom.
AVAILABLE_PARSERS = []

# Do we keep singletons (empty tags)?  This is the default value of the
# `keep_singletons` argument of create_record() and create_records().
# NOTE: this is currently set to True as there are some external workflows
# exploiting singletons, e.g. bibupload -c used to delete fields, and
# bibdocfile --fix-marc called on a record where the latest document
# has been deleted.
CFG_BIBRECORD_KEEP_SINGLETONS = True

# pyRXP: optional, silently skipped when not installed.
try:
    import pyRXP
    if 'pyrxp' in CFG_BIBRECORD_PARSERS_AVAILABLE:
        AVAILABLE_PARSERS.append('pyrxp')
except ImportError:
    pass

# lxml: optional, silently skipped when not installed.
try:
    from lxml import etree
    if 'lxml' in CFG_BIBRECORD_PARSERS_AVAILABLE:
        AVAILABLE_PARSERS.append('lxml')
except ImportError:
    pass

# 4suite: optional.  Unlike the other parsers, any non-ImportError failure
# raised while importing it is reported as a warning instead of aborting the
# import of this module.
try:
    import Ft.Xml.Domlette
    if '4suite' in CFG_BIBRECORD_PARSERS_AVAILABLE:
        AVAILABLE_PARSERS.append('4suite')
except ImportError:
    pass
except Exception, err:
    from warnings import warn
    warn("Error when importing 4suite: %s" % err)
    pass

# minidom (with expat): optional, silently skipped when the import fails.
try:
    import xml.dom.minidom
    import xml.parsers.expat
    if 'minidom' in CFG_BIBRECORD_PARSERS_AVAILABLE:
        AVAILABLE_PARSERS.append('minidom')
except ImportError:
    pass

### INTERFACE / VISIBLE FUNCTIONS

def create_field(subfields=None, ind1=' ', ind2=' ', controlfield_value='',
    global_position=-1):
    """
    Builds a field tuple from the provided elements.

    @param subfields: a list of (code, value) tuples; a fresh empty list is
        used when omitted
    @param ind1: the first indicator (passed through _wash_indicators())
    @param ind2: the second indicator (passed through _wash_indicators())
    @param controlfield_value: the value of the controlfield
    @param global_position: the global field position, -1 by default
    @return: the tuple (subfields, ind1, ind2, controlfield_value,
        global_position), after _check_field_validity() has been run on it
    """
    washed_ind1, washed_ind2 = _wash_indicators(ind1, ind2)

    # Never share one default list between calls.
    if subfields is None:
        subfields = []

    new_field = (subfields, washed_ind1, washed_ind2, controlfield_value,
        global_position)
    _check_field_validity(new_field)
    return new_field

def create_records(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
    correct=CFG_BIBRECORD_DEFAULT_CORRECT, parser='',
    keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
    """Creates one record per <record>...</record> chunk found in marcxml.

    Every chunk is handed to create_record() together with the verbose,
    correct, parser and keep_singletons arguments; please see that
    function's docstring for their meaning.

    @param marcxml: an XML string holding zero or more records
    @return: the list of values returned by create_record(), in document
        order"""
    # DOTALL makes '.' match newlines too, so a record may span many lines.
    record_pattern = re.compile('<record.*?>.*?</record>', re.DOTALL)

    records = []
    for record_xml in record_pattern.findall(marcxml):
        records.append(create_record(record_xml, verbose=verbose,
            correct=correct, parser=parser,
            keep_singletons=keep_singletons))
    return records

def create_record(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
    correct=CFG_BIBRECORD_DEFAULT_CORRECT, parser='',
    sort_fields_by_indicators=False,
    keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
    """Creates a record object from the marcxml description.

    Uses the best parser available in CFG_BIBRECORD_PARSERS_AVAILABLE or
    the parser specified; the actual choice is delegated to
    _select_parser().

    The returned object is a tuple (record, status_code, list_of_errors),
    where status_code is 0 when there are errors, 1 when no errors.

    The return record structure is as follows:
    Record := {tag : [Field]}
    Field := (Subfields, ind1, ind2, value, field_position_global)
    Subfields := [(code, value)]

    For example:
                                ______
                               |record|
                                ------
        __________________________|_______________________________________
       |record['001']             |record['909']           |record['520'] |
       |                          |                        |              |
[list of fields]             [list of fields]       [list of fields]     ...
       |                    ______|______________          |
       |[0]                |[0]          |[1]    |         |[0]
    ___|_____         _____|___       ___|_____ ...    ____|____
   |Field 001|       |Field 909|     |Field 909|      |Field 520|
    ---------         ---------       ---------        ---------
     |     _______________|_________________    |             |
    ...   |[0]            |[1]    |[2]      |  ...           ...
          |               |       |         |
    [list of subfields]  'C'     '4'
       ___|__________________________________________
       |                    |                        |
('a', 'value') ('b', 'value for subfield b') ('a', 'value for another a')

    @param marcxml: an XML string representation of the record to create
    @param verbose: the level of verbosity: 0 (silent), 1-2 (warnings),
        3(strict:stop when errors). Only forwarded to the pyrxp and lxml
        backends.
    @param correct: 1 to enable correction of marcxml syntax. Else 0.
        Forwarded to the pyrxp and lxml backends; when true,
        _correct_record() is also run on the parsed record and its result
        becomes the error list.
    @param parser: the name of the parser to use, handed to
        _select_parser()
    @param sort_fields_by_indicators: when true,
        _record_sort_by_indicators() is applied to the parsed record
    @param keep_singletons: forwarded to the parser backend
    @return: a tuple (record, status_code, list_of_errors), where status
        code is 0 where there are errors, 1 when no errors. When the backend
        raises InvenioBibRecordParserError the result is
        (None, 0, error_message), i.e. the third element is then a string
        rather than a list."""
    # Select the appropriate parser.
    parser = _select_parser(parser)

    # NOTE(review): `rec` is only bound for the four names below; this relies
    # on _select_parser() never returning anything else -- confirm.
    try:
        if parser == 'pyrxp':
            rec = _create_record_rxp(marcxml, verbose, correct,
                keep_singletons=keep_singletons)
        elif parser == 'lxml':
            rec = _create_record_lxml(marcxml, verbose, correct,
                keep_singletons=keep_singletons)
        elif parser == '4suite':
            rec = _create_record_4suite(marcxml,
                keep_singletons=keep_singletons)
        elif parser == 'minidom':
            rec = _create_record_minidom(marcxml,
                keep_singletons=keep_singletons)
    except InvenioBibRecordParserError, ex1:
        # Parsing failed: report the failure through the status code.
        return (None, 0, str(ex1))

    # NOTE: a dict-based dispatch (parser name -> function) used to be
    # sketched here in commented-out form; it only passed (marcxml, verbose)
    # and therefore could not replace the explicit chain above as is.

    if sort_fields_by_indicators:
        _record_sort_by_indicators(rec)

    errs = []
    if correct:
        # Correct the structure of the record.
        errs = _correct_record(rec)

    # status_code is 1 exactly when no error has been collected.
    return (rec, int(not errs), errs)

def filter_field_instances(field_instances, filter_subcode, filter_value, filter_mode = 'e'):
    """ Selects, among the given field instances, those having a subfield
        filter_subcode whose value satisfies filter_value.

        The input is typically the output of record_get_field_instances.
        Three modes are supported:
        'e' - the field contains the exact subfield (code, value)
        's' - filter_value is a substring of the subfield value
        'r' - filter_value is a regular expression matched (with re.match,
              i.e. anchored at the beginning) against the subfield value
        Any other mode selects nothing.

        Example:
        filter_field_instances(record_get_field_instances(rec, '999', '%', '%'), 'y', '2001')
        In this case filter_subcode is 'y' and
        filter_value is '2001'.
        @param field_instances: output from record_get_field_instances
        @param filter_subcode: name of the subfield
        @type filter_subcode: string
        @param filter_value: value of the subfield
        @type filter_value: string
        @param filter_mode: 'e','s' or 'r'
        @return: the list of matching field instances, in input order
    """
    if filter_mode == 'e':
        wanted = (filter_subcode, filter_value)
        return [field for field in field_instances if wanted in field[0]]

    # Pick the test applied to the value of each candidate subfield.
    if filter_mode == 's':
        value_matches = lambda value: value.find(filter_value) > -1
    elif filter_mode == 'r':
        pattern = re.compile(filter_value)
        value_matches = lambda value: pattern.match(value) is not None
    else:
        return []

    selected = []
    for field in field_instances:
        for subfield in field[0]:
            if subfield[0] == filter_subcode and value_matches(subfield[1]):
                # One hit is enough; never list the same field twice.
                selected.append(field)
                break
    return selected

def record_drop_duplicate_fields(record):
    """
    Builds a copy of the record without duplicated fields.

    Two fields of the same tag are duplicates when their subfields (order
    included), indicators and controlfield value are all equal; the first
    occurrence is the one kept. Global field positions are renumbered from
    1, walking the tags in sorted order. The input record is not modified.

    @param record: a record structure ({tag: [field, ...]})
    @return: a new record structure
    """
    deduplicated = {}
    next_position = 1
    for tag in sorted(record.keys()):
        kept = []
        deduplicated[tag] = kept
        seen = set()
        for candidate in record[tag]:
            # The subfield list is turned into a tuple to make it hashable.
            signature = (tuple(candidate[0]),) + candidate[1:4]
            if signature in seen:
                continue
            seen.add(signature)
            kept.append(candidate[:4] + (next_position,))
            next_position += 1
    return deduplicated

def records_identical(rec1, rec2, skip_005=True, ignore_field_order=False, ignore_subfield_order=False, ignore_duplicate_subfields=False, ignore_duplicate_controlfields=False):
    """
    Return True if rec1 is identical to rec2, regardless of a difference
    in the 005 tag (i.e. the timestamp).

    Global field positions are never part of the comparison itself; they
    are only used to order the fields when ignore_field_order is false.

    @param rec1: a record structure
    @param rec2: a record structure
    @param skip_005: when true, the 005 tag is left out of the comparison
    @param ignore_field_order: when true, the order of the fields sharing
        a tag does not matter
    @param ignore_subfield_order: when true, the order of the subfields
        inside a field does not matter
    @param ignore_duplicate_subfields: when true, subfields are compared
        as sets (neither repetition nor order matters)
    @param ignore_duplicate_controlfields: when true, the tags starting
        with '00' are compared as sets of values
    @return: a boolean
    """
    # The subfields take part in the sort key used to pair the fields of
    # both records.  They have to be normalised the same way they are
    # compared, otherwise two equivalent fields could be sorted to different
    # ranks and paired with the wrong partner (false negative).
    if ignore_duplicate_subfields:
        def subfields_key(subfields):
            return sorted(set(subfields))
    elif ignore_subfield_order:
        subfields_key = sorted
    else:
        def subfields_key(subfields):
            return subfields

    if ignore_field_order:
        ## We sort the fields, first by indicators and then by anything else
        def sort_key(elem):
            return (elem[1], elem[2], elem[3], subfields_key(elem[0]))
    else:
        ## We sort the fields, first by indicators, then by global position
        ## and then by anything else
        def sort_key(elem):
            return (elem[1], elem[2], elem[4], elem[3],
                    subfields_key(elem[0]))

    rec1_keys = set(rec1.keys())
    rec2_keys = set(rec2.keys())
    if skip_005:
        rec1_keys.discard("005")
        rec2_keys.discard("005")
    if rec1_keys != rec2_keys:
        return False

    for key in rec1_keys:
        if ignore_duplicate_controlfields and key.startswith('00'):
            # Controlfields: only the set of distinct values matters.
            if set(field[3] for field in rec1[key]) != set(field[3] for field in rec2[key]):
                return False
            continue

        rec1_fields = rec1[key]
        rec2_fields = rec2[key]
        if len(rec1_fields) != len(rec2_fields):
            ## They already differs in length...
            return False

        rec1_fields = sorted(rec1_fields, key=sort_key)
        rec2_fields = sorted(rec2_fields, key=sort_key)

        for field1, field2 in zip(rec1_fields, rec2_fields):
            if ignore_duplicate_subfields:
                if field1[1:4] != field2[1:4] or set(field1[0]) != set(field2[0]):
                    return False
            elif ignore_subfield_order:
                if field1[1:4] != field2[1:4] or sorted(field1[0]) != sorted(field2[0]):
                    return False
            elif field1[:4] != field2[:4]:
                return False
    return True

def record_get_field_instances(rec, tag="", ind1=" ", ind2=" "):
    """Returns the list of field instances for the specified tag and
    indicators of the record (rec).

    Returns empty list if the record is empty or nothing matches.
    If tag is empty string, returns rec.items(), i.e. (tag, fields) pairs
    rather than a flat list of field instances.

    Parameters (tag, ind1, ind2) can contain wildcard %.

    @param rec: a record structure as returned by create_record()
    @param tag: a 3 characters long string
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @return: a list of field tuples (Subfields, ind1, ind2, value,
        field_position_global) where subfields is list of (code, value)"""
    if not rec:
        return []
    if not tag:
        return rec.items()

    ind1, ind2 = _wash_indicators(ind1, ind2)

    if '%' in tag:
        # Wildcard in tag: every tag of the record has to be tried.
        candidate_lists = [rec[field_tag] for field_tag in rec
                           if _tag_matches_pattern(field_tag, tag)]
    else:
        # Completely defined tag: a direct dictionary lookup is enough.
        candidate_lists = [rec.get(tag, [])]

    # An indicator matches when it is the wildcard or equals the field's.
    return [instance
            for instances in candidate_lists
            for instance in instances
            if ind1 in ('%', instance[1]) and ind2 in ('%', instance[2])]


def record_add_field(rec, tag, ind1=' ', ind2=' ', controlfield_value='',
    subfields=None, field_position_global=None, field_position_local=None):
    """
    Adds a new field into the record.
    If field_position_global or field_position_local is specified then
    this method will insert the new field at the desired position.
    Otherwise a global field position will be computed in order to
    insert the field at the best position (first we try to keep the
    order of the tags and then we insert the field at the end of the
    fields with the same tag).

    If both field_position_global and field_position_local are present,
    then field_position_local takes precedence.

    The record is modified in place.

    @param rec: the record data structure
    @param tag: the tag of the field to be added
    @param ind1: the first indicator
    @param ind2: the second indicator
    @param controlfield_value: the value of the controlfield
    @param subfields: the subfields (a list of tuples (code, value))
    @param field_position_global: the global field position (record wise)
    @param field_position_local: the local field position (tag wise)
    @return: the global field position of the newly inserted field or -1 if the
        operation failed (a controlfield value was given together with
        non-blank indicators or with subfields)
    """
    error = _validate_record_field_positions_global(rec)
    if error:
        # FIXME one should write a message here
        pass

    # Clean the parameters.
    if subfields is None:
        subfields = []
    ind1, ind2 = _wash_indicators(ind1, ind2)

    # A controlfield only carries a value: refuse indicators and subfields.
    if controlfield_value and (ind1 != ' ' or ind2 != ' ' or subfields):
        return -1

    # Detect field number to be used for insertion:
    # Dictionaries for uniqueness.
    tag_field_positions_global = {}.fromkeys([field[4]
                                              for field in rec.get(tag, [])])
    all_field_positions_global = {}.fromkeys([field[4]
                                              for fields in rec.values()
                                              for field in fields])

    if field_position_global is None and field_position_local is None:
        # Let's determine the global field position of the new field.
        if tag in rec:
            try:
                field_position_global = max([field[4] for field in rec[tag]]) \
                    + 1
            except (IndexError, ValueError):
                # max() raises ValueError (not IndexError) on an empty
                # list, i.e. when the tag is present without any field.
                # Fall back on the other known positions.
                if tag_field_positions_global:
                    field_position_global = max(tag_field_positions_global) + 1
                elif all_field_positions_global:
                    field_position_global = max(all_field_positions_global) + 1
                else:
                    field_position_global = 1
        else:
            if tag in ('FMT', 'FFT', 'BDR', 'BDM'):
                # Add the new tag to the end of the record.
                if tag_field_positions_global:
                    field_position_global = max(tag_field_positions_global) + 1
                elif all_field_positions_global:
                    field_position_global = max(all_field_positions_global) + 1
                else:
                    field_position_global = 1
            else:
                # Insert the tag in an ordered way by selecting the
                # right global field position.
                immediate_lower_tag = '000'
                for rec_tag in rec:
                    if (tag not in ('FMT', 'FFT', 'BDR', 'BDM') and
                        immediate_lower_tag < rec_tag < tag):
                        immediate_lower_tag = rec_tag

                if immediate_lower_tag == '000':
                    field_position_global = 1
                else:
                    # Right after the last field of the closest lower tag.
                    field_position_global = rec[immediate_lower_tag][-1][4] + 1

        # The new field goes after the existing fields of the same tag.
        field_position_local = len(rec.get(tag, []))
        _shift_field_positions_global(rec, field_position_global, 1)
    elif field_position_local is not None:
        if tag in rec:
            if field_position_local >= len(rec[tag]):
                # Past the end: continue after the last field of the tag.
                field_position_global = rec[tag][-1][4] + 1
            else:
                # Take over the global position of the field currently there.
                field_position_global = rec[tag][field_position_local][4]
            _shift_field_positions_global(rec, field_position_global, 1)
        else:
            if all_field_positions_global:
                field_position_global = max(all_field_positions_global) + 1
            else:
                # Empty record.
                field_position_global = 1
    elif field_position_global is not None:
        # If the user chose an existing global field position, shift all the
        # global field positions greater than the input global field position.
        if tag not in rec:
            if all_field_positions_global:
                field_position_global = max(all_field_positions_global) + 1
            else:
                field_position_global = 1
            field_position_local = 0
        elif field_position_global < min(tag_field_positions_global):
            # Before every field of the tag: clamp to its first position.
            field_position_global = min(tag_field_positions_global)
            _shift_field_positions_global(rec, min(tag_field_positions_global),
                1)
            field_position_local = 0
        elif field_position_global > max(tag_field_positions_global):
            # After every field of the tag: clamp to just past its last one.
            field_position_global = max(tag_field_positions_global) + 1
            _shift_field_positions_global(rec,
                max(tag_field_positions_global) + 1, 1)
            field_position_local = len(rec.get(tag, []))
        else:
            if field_position_global in tag_field_positions_global:
                _shift_field_positions_global(rec, field_position_global, 1)

            # Insert right before the field that now holds the next global
            # position, or at the front when there is none.
            field_position_local = 0
            for position, field in enumerate(rec[tag]):
                if field[4] == field_position_global + 1:
                    field_position_local = position

    # Create the new field.
    newfield = (subfields, ind1, ind2, str(controlfield_value),
        field_position_global)
    rec.setdefault(tag, []).insert(field_position_local, newfield)

    # Return new field number:
    return field_position_global

def record_has_field(rec, tag):
    """
    Tells whether the record holds an entry for the given tag.

    @param rec: the record data structure
    @param tag: the tag to look for
    @return: a boolean
    """
    tag_is_present = tag in rec
    return tag_is_present

def record_delete_field(rec, tag, ind1=' ', ind2=' ',
    field_position_global=None, field_position_local=None):
    """
    If global field position is specified, deletes the field with the
    corresponding global field position.
    If field_position_local is specified, deletes the field with the
    corresponding local field position and tag.
    Else deletes all the fields of the tag whose indicators are equal to
    ind1 and ind2.

    If both field_position_global and field_position_local are present,
    then field_position_global takes precedence.

    The record is modified in place; a tag left without fields is removed.

    @param rec: the record data structure
    @param tag: the tag of the field to be deleted
    @param ind1: the first indicator of the field to be deleted
    @param ind2: the second indicator of the field to be deleted
    @param field_position_global: the global field position (record wise)
    @param field_position_local: the local field position (tag wise)
    @return: the list of deleted fields (empty when the tag is absent or
        when field_position_local is out of range)
    """
    error = _validate_record_field_positions_global(rec)
    if error:
        # FIXME one should write a message here.
        pass

    if tag not in rec:
        # Nothing to delete: still answer with a list, as documented.
        return []

    ind1, ind2 = _wash_indicators(ind1, ind2)

    deleted = []
    newfields = []

    if field_position_global is None and field_position_local is None:
        # Remove all fields with tag 'tag' and matching indicators.
        for field in rec[tag]:
            if field[1] != ind1 or field[2] != ind2:
                newfields.append(field)
            else:
                deleted.append(field)
        rec[tag] = newfields
    elif field_position_global is not None:
        # Remove the field with 'field_position_global'.
        # NOTE(review): `and` binds tighter than `or`, so a field is kept
        # when its position differs or when BOTH indicators differ; confirm
        # that this is the intended indicator check.
        for field in rec[tag]:
            if (field[1] != ind1 and field[2] != ind2 or
                field[4] != field_position_global):
                newfields.append(field)
            else:
                deleted.append(field)
        rec[tag] = newfields
    elif field_position_local is not None:
        # Remove the field with 'field_position_local' and report it.
        try:
            deleted.append(rec[tag].pop(field_position_local))
        except IndexError:
            return []

    if not rec[tag]:
        # Tag is now empty, remove it.
        del rec[tag]

    return deleted

def record_delete_fields(rec, tag, field_positions_local=None):
    """
    Delete all/some fields defined with MARC tag 'tag' from record 'rec'.

    @param rec: a record structure.
    @type rec: tuple
    @param tag: three letter field.
    @type tag: string
    @param field_positions_local: if set, it is the list of local positions
        within all the fields with the specified tag, that should be deleted.
        If not set all the fields with the specified tag will be deleted.
    @type field_positions_local: sequence
    @return: the list of deleted fields (empty when the tag is absent).
    @rtype: list
    @note: the record is modified in place; the tag itself is removed when
        none of its fields is left.
    """
    if tag not in rec:
        return []

    kept, removed = [], []
    delete_everything = field_positions_local is None

    for index, field in enumerate(rec[tag]):
        if delete_everything or index in field_positions_local:
            bucket = removed
        else:
            bucket = kept
        bucket.append(field)

    if kept:
        rec[tag] = kept
    else:
        del rec[tag]

    return removed

def record_add_fields(rec, tag, fields, field_position_local=None,
    field_position_global=None):
    """
    Adds the fields into the record at the required position. The
    position is specified by the tag and the field_position_local in
    the list of fields.

    Every field is added through record_add_field(); the value returned
    by that function is not checked. The 'fields' argument itself is left
    untouched.

    @param rec: a record structure
    @param tag: the tag of the fields to be added
    @param fields: the sequence of field tuples to be added
    @param field_position_local: the field_position_local to which the
    field will be inserted. If not specified, appends the fields to
    the tag.
    @param field_position_global: the global field position handed to
    record_add_field() for every field
    @return: the field_position_local that was given (None when it was
    not specified)
    """
    if field_position_local is None and field_position_global is None:
        for field in fields:
            record_add_field(rec, tag, ind1=field[1],
                ind2=field[2], subfields=field[0],
                controlfield_value=field[3])
    else:
        # All the fields are inserted at the same requested position, so
        # they are walked backwards to end up in their original relative
        # order.  reversed() is used instead of fields.reverse() in order
        # not to reorder the caller's list as a side effect.
        for field in reversed(fields):
            record_add_field(rec, tag, ind1=field[1], ind2=field[2],
                subfields=field[0], controlfield_value=field[3],
                field_position_local=field_position_local,
                field_position_global=field_position_global)

    return field_position_local

def record_move_fields(rec, tag, field_positions_local,
    field_position_local=None):
    """
    Takes some fields of a tag out of the record and adds them back at
    the position specified by 'field_position_local'.

    @param rec: a record structure as returned by create_record()
    @param tag: the tag of the fields to be moved
    @param field_positions_local: the local positions of the fields to move
    @param field_position_local: insert the fields before that
    field_position_local. If unspecified, appends the fields
    @return: the value returned by record_add_fields(), i.e. the given
    field_position_local
    """
    # Step 1: detach the selected fields from the record.
    detached = record_delete_fields(rec, tag,
        field_positions_local=field_positions_local)

    # Step 2: put them back at the destination.
    destination = record_add_fields(rec, tag, detached,
        field_position_local=field_position_local)
    return destination

def record_delete_subfield(rec, tag, subfield_code, ind1=' ', ind2=' '):
    """Removes, in place, every subfield with code subfield_code from the
    fields of the tag whose indicators are equal to ind1 and ind2.

    @param rec: the record data structure
    @param tag: the tag of the fields to clean
    @param subfield_code: the code of the subfields to drop
    @param ind1: the first indicator of the fields to clean
    @param ind2: the second indicator of the fields to clean"""
    ind1, ind2 = _wash_indicators(ind1, ind2)

    for candidate in rec.get(tag, []):
        if candidate[1] != ind1 or candidate[2] != ind2:
            continue

        remaining = []
        for subfield in candidate[0]:
            if subfield_code != subfield[0]:
                remaining.append(subfield)
        # Slice assignment keeps the very same list object inside the field.
        candidate[0][:] = remaining

def record_get_field(rec, tag, field_position_global=None,
    field_position_local=None):
    """
    Returns the matching field. One has to enter either a global
    field position or a local field position, but not both.

    @param rec: the record data structure
    @param tag: the tag of the field
    @param field_position_global: the global field position (record wise)
    @param field_position_local: the local field position (tag wise)
    @return: the field, as stored in rec[tag]
    @raise InvenioBibRecordFieldError: when no position or both positions
        are given, when the tag is not in the record, or when no field of
        the tag has the requested position
    """
    if field_position_global is None and field_position_local is None:
        raise InvenioBibRecordFieldError("A field position is required to "
            "complete this operation.")
    elif field_position_global is not None and field_position_local is not None:
        raise InvenioBibRecordFieldError("Only one field position is required "
            "to complete this operation.")
    elif field_position_global is not None:
        # Compare against None, not truthiness: 0 is a legitimate position
        # and must not fall through to the local lookup below.
        if tag not in rec:
            raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)

        for field in rec[tag]:
            if field[4] == field_position_global:
                return field
        raise InvenioBibRecordFieldError("No field has the tag '%s' and the "
            "global field position '%d'." % (tag, field_position_global))
    else:
        try:
            return rec[tag][field_position_local]
        except KeyError:
            raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
        except IndexError:
            raise InvenioBibRecordFieldError("No field has the tag '%s' and "
                "the local field position '%d'." % (tag, field_position_local))

def record_replace_field(rec, tag, new_field, field_position_global=None,
    field_position_local=None):
    """
    Replaces a field with a new field, in place. One has to enter either
    a global field position or a local field position, but not both.

    @param rec: the record data structure
    @param tag: the tag of the field to be replaced
    @param new_field: the field stored instead of the old one
    @param field_position_global: the global field position (record wise)
    @param field_position_local: the local field position (tag wise)
    @raise InvenioBibRecordFieldError: when no position or both positions
        are given, when the tag is not in the record, or when no field of
        the tag has the requested position
    """
    if field_position_global is None and field_position_local is None:
        raise InvenioBibRecordFieldError("A field position is required to "
            "complete this operation.")
    elif field_position_global is not None and field_position_local is not None:
        raise InvenioBibRecordFieldError("Only one field position is required "
            "to complete this operation.")
    elif field_position_global is not None:
        # Compare against None, not truthiness: 0 is a legitimate position
        # and must not fall through to the local lookup below.
        if tag not in rec:
            raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)

        replaced = False
        for position, field in enumerate(rec[tag]):
            if field[4] == field_position_global:
                rec[tag][position] = new_field
                replaced = True

        if not replaced:
            raise InvenioBibRecordFieldError("No field has the tag '%s' and "
                "the global field position '%d'." %
                (tag, field_position_global))
    else:
        try:
            rec[tag][field_position_local] = new_field
        except KeyError:
            raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
        except IndexError:
            raise InvenioBibRecordFieldError("No field has the tag '%s' and "
                "the local field position '%d'." % (tag, field_position_local))

def record_get_subfields(rec, tag, field_position_global=None,
    field_position_local=None):
    """
    Returns the subfields of the matching field, i.e. the first element
    of the field found by record_get_field(). One has to enter either a
    global field position or a local field position.

    @return: a list of subfield tuples (subfield code, value).
    @rtype:  list
    """
    return record_get_field(rec, tag,
        field_position_global=field_position_global,
        field_position_local=field_position_local)[0]

def record_delete_subfield_from(rec, tag, subfield_position,
    field_position_global=None, field_position_local=None):
    """Delete subfield from position specified by tag, field number and
    subfield position.

    When the field is left without any subfield it is removed from the
    record, and so is the tag once it has no field left.

    @param rec: the record data structure
    @param tag: the tag of the field
    @param subfield_position: the position of the subfield inside the field
    @param field_position_global: the global field position (record wise)
    @param field_position_local: the local field position (tag wise)
    @raise InvenioBibRecordFieldError: when there is no subfield at
        subfield_position (or when the field cannot be found, see
        record_get_field())"""
    subfields = record_get_subfields(rec, tag,
        field_position_global=field_position_global,
        field_position_local=field_position_local)

    try:
        del subfields[subfield_position]
    except IndexError:
        # Imported here since it is only needed to build the error message.
        from invenio.xmlmarc2textmarc import create_marc_record
        recordMarc = create_marc_record(rec, 0, {"text-marc": 1, "aleph-marc": 0})
        raise InvenioBibRecordFieldError("The record : %(recordCode)s does not contain the subfield "
            "'%(subfieldIndex)s' inside the field (local: '%(fieldIndexLocal)s, global: '%(fieldIndexGlobal)s' ) of tag '%(tag)s'." %
            {"subfieldIndex" : subfield_position,
             "fieldIndexLocal" : str(field_position_local),
             "fieldIndexGlobal" : str(field_position_global),
             "tag" : tag,
             "recordCode" : recordMarc})

    if not subfields:
        # The field has no subfield left: drop the field itself.
        if field_position_global is not None:
            for position, field in enumerate(rec[tag]):
                if field[4] == field_position_global:
                    del rec[tag][position]
                    # Stop right away: the list must not be walked any
                    # further once it has been modified, and only the first
                    # match (the field emptied above) is to be removed.
                    break
        else:
            del rec[tag][field_position_local]

        if not rec[tag]:
            del rec[tag]

def record_add_subfield_into(rec, tag, subfield_code, value,
    subfield_position=None, field_position_global=None,
    field_position_local=None):
    """Add the subfield (subfield_code, value) to the field specified by
    tag and field number.

    The subfield is inserted at subfield_position when one is given and
    appended after the existing subfields otherwise."""
    new_subfield = (subfield_code, value)
    target_subfields = record_get_subfields(rec, tag,
        field_position_global=field_position_global,
        field_position_local=field_position_local)

    if subfield_position is not None:
        target_subfields.insert(subfield_position, new_subfield)
        return
    target_subfields.append(new_subfield)

def record_modify_controlfield(rec, tag, controlfield_value,
    field_position_global=None, field_position_local=None):
    """Set the value of the controlfield specified by tag and field number.

    The field is replaced by a new tuple with the same subfields,
    indicators and global position, the value being the only change."""
    current = record_get_field(rec, tag,
        field_position_global=field_position_global,
        field_position_local=field_position_local)

    # Field layout: (subfields, ind1, ind2, value, global position).
    updated = tuple(current[:3]) + (controlfield_value, current[4])

    record_replace_field(rec, tag, updated,
        field_position_global=field_position_global,
        field_position_local=field_position_local)

def record_modify_subfield(rec, tag, subfield_code, value, subfield_position,
    field_position_global=None, field_position_local=None):
    """Overwrite the subfield found at a given position inside a field.

    @param rec: a record structure
    @param tag: tag of the field holding the subfield
    @param subfield_code: code to store at that position
    @param value: value to store at that position
    @param subfield_position: index of the subfield to overwrite
    @param field_position_global: forwarded to record_get_subfields()
    @param field_position_local: forwarded to record_get_subfields()
    @raise InvenioBibRecordFieldError: if there is no subfield at
        subfield_position
    """
    target_subfields = record_get_subfields(rec, tag,
        field_position_global=field_position_global,
        field_position_local=field_position_local)
    replacement = (subfield_code, value)

    try:
        target_subfields[subfield_position] = replacement
    except IndexError:
        message = ("There is no subfield with position "
                   "'%d'." % subfield_position)
        raise InvenioBibRecordFieldError(message)

def record_move_subfield(rec, tag, subfield_position, new_subfield_position,
    field_position_global=None, field_position_local=None):
    """Move one subfield of a field to another position in that field.

    @param rec: a record structure
    @param tag: tag of the field holding the subfield
    @param subfield_position: current index of the subfield
    @param new_subfield_position: index at which the subfield is
        re-inserted (after it has been removed from its old position)
    @param field_position_global: forwarded to record_get_subfields()
    @param field_position_local: forwarded to record_get_subfields()
    @raise InvenioBibRecordFieldError: if there is no subfield at
        subfield_position
    """
    target_subfields = record_get_subfields(rec, tag,
        field_position_global=field_position_global,
        field_position_local=field_position_local)

    try:
        moved = target_subfields.pop(subfield_position)
    except IndexError:
        message = ("There is no subfield with position "
                   "'%d'." % subfield_position)
        raise InvenioBibRecordFieldError(message)
    else:
        # list.insert() clamps out-of-range indexes instead of raising.
        target_subfields.insert(new_subfield_position, moved)

def record_get_field_value(rec, tag, ind1=" ", ind2=" ", code=""):
    """Return the first (string) value matching (tag, ind1, ind2, code)
    in the record, or the empty string when nothing matches.

    Each of tag, ind1, ind2 and code may use the wildcard '%'.

    Wildcard '%' versus empty '':

    - An empty value means we are not interested in fields that carry
      the corresponding indicator/subfield.

    - A wildcard means we want the value whatever the corresponding
      indicator/subfield is.

    For e.g. consider the following record in MARC:
      100C5  $$a val1
      555AB  $$a val2
      555AB      val3
      555    $$a val4
      555A       val5

      >> record_get_field_value(record, '555', 'A', '', '')
      >> "val5"
      >> record_get_field_value(record, '555', 'A', '%', '')
      >> "val3"
      >> record_get_field_value(record, '555', 'A', '%', '%')
      >> "val2"
      >> record_get_field_value(record, '555', 'A', 'B', '')
      >> "val3"
      >> record_get_field_value(record, '555', '', 'B', 'a')
      >> ""
      >> record_get_field_value(record, '555', '', '', 'a')
      >> "val4"
      >> record_get_field_value(record, '555', '', '', '')
      >> ""
      >> record_get_field_value(record, '%%%', '%', '%', '%')
      >> "val1"

    @param rec: a record structure as returned by create_record()
    @param tag: a 3 characters long string
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param code: a 1 character long string
    @return: string value (empty if nothing found)"""
    ind1, ind2 = _wash_indicators(ind1, ind2)

    # Work out which lists of fields have to be scanned.
    if '%' in tag:
        # Wildcard in tag: lazily walk every tag matching the pattern.
        field_lists = (fields for field_tag, fields in rec.items()
                       if _tag_matches_pattern(field_tag, tag))
    elif tag in rec:
        # Fully specified tag: it is directly a key of the record.
        field_lists = (rec[tag],)
    else:
        return ""

    # The test on 'code' is deliberately kept outside of the loops so
    # that nothing but the indicator comparison runs per field.
    if code == '':
        # No code: the value stored on the field itself, if not empty.
        for fields in field_lists:
            for field in fields:
                if (ind1 in ('%', field[1]) and ind2 in ('%', field[2])
                        and field[3]):
                    return field[3]
    elif code == '%':
        # Wildcard code: first subfield of the first matching field
        # that has subfields at all.
        for fields in field_lists:
            for field in fields:
                if (ind1 in ('%', field[1]) and ind2 in ('%', field[2])
                        and field[0]):
                    return field[0][0][1]
    else:
        # Explicit code: first subfield carrying exactly that code.
        for fields in field_lists:
            for field in fields:
                if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
                    for subfield in field[0]:
                        if subfield[0] == code:
                            return subfield[1]

    # Nothing was found
    return ""


def _field_passes_subfield_filter(subfields, filter_code, filter_value,
                                  filter_mode, reg_exp=None):
    """Tell whether the subfields of one field satisfy a subfield filter.

    Every occurrence of filter_code is considered, so a repeated subfield
    is matched consistently in all modes. When the field has no subfield
    with filter_code, the substring and regexp modes are evaluated
    against the empty string.

    @param subfields: list of (code, value) tuples
    @param filter_code: subfield code the filter applies to
    @param filter_value: value, substring or pattern source to look for
    @param filter_mode: 'e' (exact), 's' (substring) or 'r' (regexp);
        any other mode never matches
    @param reg_exp: compiled pattern, required for mode 'r'
    @return: True if the field passes the filter, False otherwise
    """
    if filter_mode == "e":
        return (filter_code, filter_value) in subfields
    if filter_mode not in ("s", "r"):
        return False

    candidates = [value for code, value in subfields if code == filter_code]
    if not candidates:
        candidates = ['']

    for candidate in candidates:
        if filter_mode == "s":
            if filter_value in candidate:
                return True
        elif reg_exp.match(candidate):
            return True
    return False


def record_get_field_values(rec, tag, ind1=" ", ind2=" ", code="",
                            filter_subfield_code="",
                            filter_subfield_value="",
                            filter_subfield_mode="e"):
    """Returns the list of (string) values for the specified field
    (tag, ind1, ind2, code) of the record (rec).

    List can be filtered. Use filter_subfield_code
    and filter_subfield_value to search
    only in fields that have these values inside them as a subfield.
    filter_subfield_mode can have 3 different values:
    'e' for exact search
    's' for substring search
    'r' for regexp search (pattern applied with re.match, i.e. anchored
        at the beginning of the subfield value)

    The filter looks at every occurrence of filter_subfield_code in a
    field. It is not applied when code is '' (in that case the values
    stored on the fields themselves are returned, not subfield values).

    Returns empty list if nothing was found.

    Parameters (tag, ind1, ind2, code) can contain wildcard %.

    @param rec: a record structure as returned by create_record()
    @param tag: a 3 characters long string
    @param ind1: a 1 character long string
    @param ind2: a 1 character long string
    @param code: a 1 character long string
    @param filter_subfield_code: code of the subfield used for filtering
        (empty string disables filtering)
    @param filter_subfield_value: value/substring/pattern to filter on
    @param filter_subfield_mode: 'e', 's' or 'r' (see above)
    @return: a list of strings"""
    ind1, ind2 = _wash_indicators(ind1, ind2)

    reg_exp = None
    if filter_subfield_code and filter_subfield_mode == "r":
        reg_exp = re.compile(filter_subfield_value)

    if '%' in tag:
        # Wild card in tag. Must find all corresponding tags and fields
        tags = [k for k in rec if _tag_matches_pattern(k, tag)]
    elif rec and tag in rec:
        tags = [tag]
    else:
        tags = []

    values = []
    for matching_tag in tags:
        for field in rec[matching_tag]:
            if ind1 not in ('%', field[1]) or ind2 not in ('%', field[2]):
                continue

            if code == '':
                # Code not specified. Consider field value (without
                # subfields); the subfield filter does not apply here.
                if field[3]:
                    values.append(field[3])
                continue

            if filter_subfield_code and not _field_passes_subfield_filter(
                    field[0], filter_subfield_code, filter_subfield_value,
                    filter_subfield_mode, reg_exp):
                continue

            # Wildcard code keeps every subfield, otherwise only the
            # subfields carrying the requested code.
            for subfield in field[0]:
                if code == '%' or subfield[0] == code:
                    values.append(subfield[1])

    return values


def record_xml_output(rec, tags=None, order_fn=None):
    """Generates the XML for record 'rec' and returns it as a string.

    The list passed as 'tags' is never modified: when tag '001' has to be
    added to a non-empty selection, a new list is built instead.

    @param rec: record structure (mapping tag -> list of fields), or None
        to get an empty <record> element
    @param tags: list of tags to be printed (a single tag may be given as
        a string); empty or None means all tags. When not empty, tag
        '001' is always printed as well.
    @param order_fn: optional ordering function specification handed over
        to record_order_fields(); when None its default is used
    @return: the XML, one element per line, as a string"""
    if tags is None:
        tags = []
    if isinstance(tags, str):
        tags = [tags]
    if tags and '001' not in tags:
        # Add the missing controlfield on a fresh list so that the
        # caller's list is left untouched.
        tags = tags + ['001']

    marcxml = ['<record>']

    if rec is not None:
        # Pair each selected field with its tag
        fields = [(tag, field)
                  for tag in rec
                  if not tags or tag in tags
                  for field in rec[tag]]
        if order_fn is None:
            record_order_fields(fields)
        else:
            record_order_fields(fields, order_fn)
        marcxml.extend([field_xml_output(field, tag)
                        for tag, field in fields])
    marcxml.append('</record>')
    return '\n'.join(marcxml)

def field_get_subfield_instances(field):
    """Return the subfields of a field.

    @param field: a field structure whose first element holds its
        subfields
    @return: that first element itself (not a copy)
    """
    subfields = field[0]
    return subfields

def field_get_subfield_values(field_instance, code):
    """Return the values of all subfields of a field having a given code.

    @param field_instance: a field structure whose first element is a
        list of (code, value) subfield tuples
    @param code: the subfield code to look for
    @return: list of matching values, in subfield order (possibly empty)
    """
    values = []
    for subfield_code, subfield_value in field_instance[0]:
        if subfield_code == code:
            values.append(subfield_value)
    return values

def field_get_subfield_codes(field_instance):
    """Return the codes of all subfields of a field.

    @param field_instance: a field structure whose first element is a
        list of (code, value) subfield tuples
    @return: list of codes, in subfield order, duplicates included
    """
    codes = []
    for subfield_code, _subfield_value in field_instance[0]:
        codes.append(subfield_code)
    return codes

def field_add_subfield(field, code, value):
    """Append a (code, value) subfield at the end of a field.

    The subfield list held in the first element of 'field' is modified
    in place.

    @param field: a field structure whose first element is its list of
        subfields
    @param code: code of the new subfield
    @param value: value of the new subfield
    """
    subfields = field[0]
    new_subfield = (code, value)
    subfields.append(new_subfield)

def record_order_fields(rec, fun="_order_by_ord"):
    """Orders, in place, the fields of 'rec' according to a comparison
    function.

    @param rec: the list to sort (record_xml_output() passes a list of
        (tag, field) tuples)
    @param fun: the comparison to use, either as a callable taking two
        items and returning a negative, zero or positive number, or as a
        string evaluated to obtain such a callable (for instance the
        name of a function of this module)
    """
    if callable(fun):
        comparator = fun
    else:
        # SECURITY: eval() executes arbitrary code. A string given here
        # must never come from untrusted input; callers holding a
        # function object should pass it directly instead.
        comparator = eval(fun)

    try:
        from functools import cmp_to_key
    except ImportError:
        # Old interpreters without cmp_to_key accept the comparison
        # function as the first argument of list.sort().
        rec.sort(comparator)
    else:
        rec.sort(key=cmp_to_key(comparator))

def field_xml_output(field, tag):
    """Build the XML of one field and return it as a string.

    A field with a non-empty value in slot 3 is written as a single
    <controlfield> element. Any other field is written as a <datafield>
    element carrying both indicators, with one line per subfield.

    @param field: the field structure to serialize
    @param tag: the tag under which the field is stored
    @return: the XML of the field; lines are joined with newlines
    """
    controlfield_value = field[3]
    if controlfield_value:
        return ('  <controlfield tag="%s">%s</controlfield>' %
                (tag, encode_for_xml(controlfield_value)))

    lines = ['  <datafield tag="%s" ind1="%s" ind2="%s">' %
             (tag, field[1], field[2])]
    for subfield in field[0]:
        lines.append(_subfield_xml_output(subfield))
    lines.append('  </datafield>')
    return '\n'.join(lines)

def record_extract_oai_id(record):
    """Return the OAI identifier stored in the record.

    CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG tells where to look: its first
    three characters are the tag, followed by one character each for
    the first indicator, the second indicator and the subfield code.

    @param record: a record structure
    @return: the first value found there that, once stripped of
        surrounding whitespace, starts with 'oai' followed by at least
        one of the characters [a-zA-Z0-9/.:]; the empty string if there
        is no such value
    """
    oai_tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
    values = record_get_field_values(record, oai_tag[0:3], oai_tag[3],
                                     oai_tag[4], oai_tag[5])
    oai_id_pattern = re.compile("oai[a-zA-Z0-9/.:]+")

    candidates = [raw_value.strip() for raw_value in values]
    for candidate in candidates:
        if oai_id_pattern.match(candidate):
            return candidate
    return ""

def record_extract_dois(record):
    """Return the list of DOIs stored in the record.

    Looks at the instances of field 024 with indicators '7' and '_'.
    Each instance having a subfield '2' equal to 'doi' (compared case
    insensitively) contributes all the values of its subfields 'a'.

    @param record: a record structure
    @return: list of DOI strings (possibly empty)
    """
    dois = []
    for identifier_field in record_get_field_instances(record, "024",
                                                       "7", "_"):
        sources = [source.lower() for source in
                   field_get_subfield_values(identifier_field, "2")]
        if 'doi' in sources:
            dois.extend(field_get_subfield_values(identifier_field, "a"))
    return dois

def print_rec(rec, format=1, tags=None):
    """
    Returns the textual representation of a record.

    @param rec: the record to output
    @param format: 1 for XML; any other value (e.g. 2 for HTML, which is
        not implemented) yields an empty string
    @param tags: list of tags to be printed (None stands for an empty
        list)
    @return: the record as a string
    """
    if tags is None:
        tags = []
    if format == 1:
        return record_xml_output(rec, tags)
    return ''

def print_recs(listofrec, format=1, tags=None):
    """
    Returns the textual representation of a list of records.

    Each record is output with print_rec() and preceded by a newline.

    @param listofrec: list of records; if it is not a list (or an
        instance of a subclass of list) an empty string is returned
    @param format: 1 XML, 2 HTML (not implemented)
    @param tags: list of tags to be printed
    @return: the concatenated output as a string
    """
    if not isinstance(listofrec, list):
        return ""
    if tags is None:
        tags = []

    # Build all the pieces first and join them once, instead of
    # re-formatting the whole accumulated text for every record.
    pieces = ["\n%s" % print_rec(rec, format, tags) for rec in listofrec]
    return "".join(pieces)

def concat(alist):
    """Flatten a list of lists by one level.

    @param alist: an iterable of iterables
    @return: a new list holding the items of every inner iterable, in
        order
    """
    return [item for inner in alist for item in inner]

def record_find_field(rec, tag, field, strict=False):
    """
    Looks for the first field stored under a tag that compares equal to
    a given field, and returns its positions.

    @param rec:    A record dictionary structure
    @type  rec:    dictionary
    @param tag:    The tag of the field to search for
    @type  tag:    string
    @param field:  A field tuple as returned by create_field()
    @type  field:  tuple
    @param strict: Search method, handed over to _compare_fields(). If
                   strict is False (the default), the order of the
                   subfields doesn't matter.
    @type  strict: boolean
    @return:       A tuple of (global_position, local_position) or a
                   tuple (None, None) if the field is not present.
    @rtype:        tuple
    @raise InvenioBibRecordFieldError: If the provided field is invalid.
    """
    # Any error raised by the validity check is left to propagate.
    _check_field_validity(field)

    candidates = rec.get(tag, [])
    for local_position, candidate in enumerate(candidates):
        if _compare_fields(field, candidate, strict):
            global_position = candidate[4]
            return (global_position, local_position)

    return (None, None)

def record_match_subfields(rec, tag, ind1=" ", ind2=" ", sub_key=None,
                           sub_value='', sub_key2=None, sub_value2='',
                           case_sensitive=True):
    """ Finds subfield instances in a particular field and tests
    values in 1 of 3 possible ways:
     - Does a subfield code exist? (ie does 773__a exist?)
     - Does a subfield have a particular value? (ie 773__a == 'PhysX')
     - Do a pair of subfields have particular values?
        (ie 035__2 == 'CDS' and 035__a == '123456')

    Only one value per subfield code is tested in each field: when a
    code is repeated inside a field, the last occurrence is used.

    Parameters:
     * rec - dictionary: a bibrecord structure
     * tag - string: the tag of the field (ie '773')
     * ind1, ind2 - char: a single characters for the MARC indicators
     * sub_key - char: subfield key to find
     * sub_value - string: subfield value of that key; when empty, the
                   mere presence of sub_key is enough to match
     * sub_key2 - char: key of subfield to compare against
     * sub_value2 - string: expected value of second subfield
     * case_sensitive - bool: be case sensitive when matching values

    Raises: TypeError if sub_key is None, or if sub_key2 is given
            together with an empty sub_value2

    Returns: false if no match found, else provides the field position (int) """
    if sub_key is None:
        raise TypeError("None object passed for parameter sub_key.")

    # Empty strings are compared by value: an identity test against ''
    # depends on string interning and never holds for a unicode ''.
    if sub_key2 is not None and sub_value2 == '':
        raise TypeError("Parameter sub_key2 defined but sub_value2 is None, "
                        + "function requires a value for comparrison.")
    ind1, ind2 = _wash_indicators(ind1, ind2)

    if not case_sensitive:
        sub_value = sub_value.lower()
        sub_value2 = sub_value2.lower()

    for field in record_get_field_instances(rec, tag, ind1, ind2):
        subfields = dict(field_get_subfield_instances(field))
        if not case_sensitive:
            subfields = dict([(key, value.lower())
                              for key, value in subfields.items()])

        if sub_key not in subfields:
            continue
        if sub_value == '':
            # Only the existence of the subfield was asked for.
            return field[4]
        if sub_value != subfields[sub_key]:
            continue
        if sub_key2 is None:
            return field[4]
        if sub_key2 in subfields and sub_value2 == subfields[sub_key2]:
            return field[4]
    return False

def record_strip_empty_volatile_subfields(rec):
    """
    Removes from every field of the record the subfields whose value
    starts with "VOLATILE:".

    The subfield lists are filtered in place, so the list objects held
    by the fields stay the same.

    @param rec: a record structure (mapping tag -> list of fields)
    """
    marker = "VOLATILE:"
    for fields in rec.values():
        for field in fields:
            subfields = field[0]
            kept = []
            for subfield in subfields:
                if subfield[1][:9] != marker:
                    kept.append(subfield)
            subfields[:] = kept

def record_strip_empty_fields(rec, tag=None):
    """
    Removes empty subfields and fields from…
Tech Fingerprint

Alerts (69)

'def' Ensure functions have docstrings for documentation
93 106 119 352 495 593 625 653 686 718 733 764 778 791 805 928 1193
Complexity hotspot; lines 210 to 212 (total complexity: 4)
210 211 212
'def' Avoid long function definitions; keep signatures concise for readability
269
'global' Avoid global variables; use function parameters or class attributes for better scope management
297 392 394 398 399 402 406 408 410 415 417 419 422 430 432 439 441 445 448 449 450 451 454 456 458 459 463 464 469 474 529 537 610 662 665 689 692 754
'del' Avoid unless necessary; Python's garbage collector typically handles object deletion
549 555 589 742 757 759 762
'isinstance(' Overuse may indicate design issues; consider polymorphism
1034
'eval(' Avoid due to security risks; use ast.literal_eval for safer evaluation of literals
1079
'type(' Use isinstance() for type checking instead of type()
1148