PageRenderTime 78ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 1ms

/modules/bibrecord/lib/bibrecord.py

https://github.com/chokribr/invenio-1
Python | 1898 lines | 1837 code | 14 blank | 47 comment | 24 complexity | 71aa8a445c3535b820ce0b586bb83959 MD5 | raw file
Possible License(s): GPL-2.0

Large files are truncated, but you can click here to view the full file

  1. # -*- coding: utf-8 -*-
  2. ##
  3. ## This file is part of Invenio.
  4. ## Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 CERN.
  5. ##
  6. ## Invenio is free software; you can redistribute it and/or
  7. ## modify it under the terms of the GNU General Public License as
  8. ## published by the Free Software Foundation; either version 2 of the
  9. ## License, or (at your option) any later version.
  10. ##
  11. ## Invenio is distributed in the hope that it will be useful, but
  12. ## WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ## General Public License for more details.
  15. ##
  16. ## You should have received a copy of the GNU General Public License
  17. ## along with Invenio; if not, write to the Free Software Foundation, Inc.,
  18. ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
  19. """BibRecord - XML MARC processing library for Invenio.
  20. For API, see create_record(), record_get_field_instances() and friends
  21. in the source code of this file in the section entitled INTERFACE.
  22. Note: Does not access the database, the input is MARCXML only."""
  23. ### IMPORT INTERESTING MODULES AND XML PARSERS
  24. import re
  25. import sys
  26. from cStringIO import StringIO
  27. if sys.hexversion < 0x2040000:
  28. # pylint: disable=W0622
  29. from sets import Set as set
  30. # pylint: enable=W0622
  31. from invenio.bibrecord_config import CFG_MARC21_DTD, \
  32. CFG_BIBRECORD_WARNING_MSGS, CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL, \
  33. CFG_BIBRECORD_DEFAULT_CORRECT, CFG_BIBRECORD_PARSERS_AVAILABLE, \
  34. InvenioBibRecordParserError, InvenioBibRecordFieldError
  35. from invenio.config import CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG
  36. from invenio.textutils import encode_for_xml
  37. # Some values used for the RXP parsing.
  38. TAG, ATTRS, CHILDREN = 0, 1, 2
  39. # Find out about the best usable parser:
  40. AVAILABLE_PARSERS = []
  41. # Do we keep singletons (empty tags)?
  42. # NOTE: this is currently set to True as there are some external workflows
  43. # exploiting singletons, e.g. bibupload -c used to delete fields, and
  44. # bibdocfile --fix-marc called on a record where the latest document
  45. # has been deleted.
  46. CFG_BIBRECORD_KEEP_SINGLETONS = True
  47. try:
  48. import pyRXP
  49. if 'pyrxp' in CFG_BIBRECORD_PARSERS_AVAILABLE:
  50. AVAILABLE_PARSERS.append('pyrxp')
  51. except ImportError:
  52. pass
  53. try:
  54. from lxml import etree
  55. if 'lxml' in CFG_BIBRECORD_PARSERS_AVAILABLE:
  56. AVAILABLE_PARSERS.append('lxml')
  57. except ImportError:
  58. pass
  59. try:
  60. import Ft.Xml.Domlette
  61. if '4suite' in CFG_BIBRECORD_PARSERS_AVAILABLE:
  62. AVAILABLE_PARSERS.append('4suite')
  63. except ImportError:
  64. pass
  65. except Exception, err:
  66. from warnings import warn
  67. warn("Error when importing 4suite: %s" % err)
  68. pass
  69. try:
  70. import xml.dom.minidom
  71. import xml.parsers.expat
  72. if 'minidom' in CFG_BIBRECORD_PARSERS_AVAILABLE:
  73. AVAILABLE_PARSERS.append('minidom')
  74. except ImportError:
  75. pass
  76. ### INTERFACE / VISIBLE FUNCTIONS
  77. def create_field(subfields=None, ind1=' ', ind2=' ', controlfield_value='',
  78. global_position=-1):
  79. """
  80. Returns a field created with the provided elements. Global position is
  81. set arbitrary to -1."""
  82. if subfields is None:
  83. subfields = []
  84. ind1, ind2 = _wash_indicators(ind1, ind2)
  85. field = (subfields, ind1, ind2, controlfield_value, global_position)
  86. _check_field_validity(field)
  87. return field
  88. def create_records(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
  89. correct=CFG_BIBRECORD_DEFAULT_CORRECT, parser='',
  90. keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
  91. """Creates a list of records from the marcxml description. Returns a
  92. list of objects initiated by the function create_record(). Please
  93. see that function's docstring."""
  94. # Use the DOTALL flag to include newlines.
  95. regex = re.compile('<record.*?>.*?</record>', re.DOTALL)
  96. record_xmls = regex.findall(marcxml)
  97. return [create_record(record_xml, verbose=verbose, correct=correct,
  98. parser=parser, keep_singletons=keep_singletons) for record_xml in record_xmls]
  99. def create_record(marcxml, verbose=CFG_BIBRECORD_DEFAULT_VERBOSE_LEVEL,
  100. correct=CFG_BIBRECORD_DEFAULT_CORRECT, parser='',
  101. sort_fields_by_indicators=False,
  102. keep_singletons=CFG_BIBRECORD_KEEP_SINGLETONS):
  103. """Creates a record object from the marcxml description.
  104. Uses the best parser available in CFG_BIBRECORD_PARSERS_AVAILABLE or
  105. the parser specified.
  106. The returned object is a tuple (record, status_code, list_of_errors),
  107. where status_code is 0 when there are errors, 1 when no errors.
  108. The return record structure is as follows:
  109. Record := {tag : [Field]}
  110. Field := (Subfields, ind1, ind2, value)
  111. Subfields := [(code, value)]
  112. For example:
  113. ______
  114. |record|
  115. ------
  116. __________________________|_______________________________________
  117. |record['001'] |record['909'] |record['520'] |
  118. | | | |
  119. [list of fields] [list of fields] [list of fields] ...
  120. | ______|______________ |
  121. |[0] |[0] |[1] | |[0]
  122. ___|_____ _____|___ ___|_____ ... ____|____
  123. |Field 001| |Field 909| |Field 909| |Field 520|
  124. --------- --------- --------- ---------
  125. | _______________|_________________ | |
  126. ... |[0] |[1] |[2] | ... ...
  127. | | | |
  128. [list of subfields] 'C' '4'
  129. ___|__________________________________________
  130. | | |
  131. ('a', 'value') ('b', 'value for subfield b') ('a', 'value for another a')
  132. @param marcxml: an XML string representation of the record to create
  133. @param verbose: the level of verbosity: 0 (silent), 1-2 (warnings),
  134. 3(strict:stop when errors)
  135. @param correct: 1 to enable correction of marcxml syntax. Else 0.
  136. @return: a tuple (record, status_code, list_of_errors), where status
  137. code is 0 where there are errors, 1 when no errors"""
  138. # Select the appropriate parser.
  139. parser = _select_parser(parser)
  140. try:
  141. if parser == 'pyrxp':
  142. rec = _create_record_rxp(marcxml, verbose, correct,
  143. keep_singletons=keep_singletons)
  144. elif parser == 'lxml':
  145. rec = _create_record_lxml(marcxml, verbose, correct,
  146. keep_singletons=keep_singletons)
  147. elif parser == '4suite':
  148. rec = _create_record_4suite(marcxml,
  149. keep_singletons=keep_singletons)
  150. elif parser == 'minidom':
  151. rec = _create_record_minidom(marcxml,
  152. keep_singletons=keep_singletons)
  153. except InvenioBibRecordParserError, ex1:
  154. return (None, 0, str(ex1))
  155. # _create_record = {
  156. # 'pyrxp': _create_record_rxp,
  157. # 'lxml': _create_record_lxml,
  158. # '4suite': _create_record_4suite,
  159. # 'minidom': _create_record_minidom,
  160. # }
  161. # try:
  162. # rec = _create_record[parser](marcxml, verbose)
  163. # except InvenioBibRecordParserError, ex1:
  164. # return (None, 0, str(ex1))
  165. if sort_fields_by_indicators:
  166. _record_sort_by_indicators(rec)
  167. errs = []
  168. if correct:
  169. # Correct the structure of the record.
  170. errs = _correct_record(rec)
  171. return (rec, int(not errs), errs)
  172. def filter_field_instances(field_instances, filter_subcode, filter_value, filter_mode = 'e'):
  173. """ Filters given field and returns only that field instances
  174. that contain filter_subcode with given filter_value.
  175. As an input for search function accepts output from
  176. record_get_field_instances function.
  177. Function can be run in three modes:
  178. 'e' - looking for exact match in subfield value
  179. 's' - looking for substring in subfield value
  180. 'r' - looking for regular expression in subfield value
  181. Example:
  182. record_filter_field(record_get_field_instances(rec, '999', '%', '%'), 'y', '2001')
  183. In this case filter_subcode is 'y' and
  184. filter_value is '2001'.
  185. @param field_instances: output from record_get_field_instances
  186. @param filter_subcode: name of the subfield
  187. @type filter_subcode: string
  188. @param filter_value: value of the subfield
  189. @type filter_value: string
  190. @param filter_mode: 'e','s' or 'r'
  191. """
  192. matched = []
  193. if filter_mode == 'e':
  194. to_match = (filter_subcode, filter_value)
  195. for instance in field_instances:
  196. if to_match in instance[0]:
  197. matched.append(instance)
  198. elif filter_mode == 's':
  199. for instance in field_instances:
  200. for subfield in instance[0]:
  201. if subfield[0] == filter_subcode and \
  202. subfield[1].find(filter_value) > -1:
  203. matched.append(instance)
  204. break
  205. elif filter_mode == 'r':
  206. reg_exp = re.compile(filter_value)
  207. for instance in field_instances:
  208. for subfield in instance[0]:
  209. if subfield[0] == filter_subcode and \
  210. reg_exp.match(subfield[1]) is not None:
  211. matched.append(instance)
  212. break
  213. return matched
  214. def record_drop_duplicate_fields(record):
  215. """
  216. Return a record where all the duplicate fields have been removed.
  217. Fields are considered identical considering also the order of their
  218. subfields.
  219. """
  220. out = {}
  221. position = 0
  222. tags = sorted(record.keys())
  223. for tag in tags:
  224. fields = record[tag]
  225. out[tag] = []
  226. current_fields = set()
  227. for full_field in fields:
  228. field = (tuple(full_field[0]),) + full_field[1:4]
  229. if field not in current_fields:
  230. current_fields.add(field)
  231. position += 1
  232. out[tag].append(full_field[:4] + (position,))
  233. return out
  234. def records_identical(rec1, rec2, skip_005=True, ignore_field_order=False, ignore_subfield_order=False, ignore_duplicate_subfields=False, ignore_duplicate_controlfields=False):
  235. """
  236. Return True if rec1 is identical to rec2, regardless of a difference
  237. in the 005 tag (i.e. the timestamp).
  238. """
  239. rec1_keys = set(rec1.keys())
  240. rec2_keys = set(rec2.keys())
  241. if skip_005:
  242. rec1_keys.discard("005")
  243. rec2_keys.discard("005")
  244. if rec1_keys != rec2_keys:
  245. return False
  246. for key in rec1_keys:
  247. if ignore_duplicate_controlfields and key.startswith('00'):
  248. if set(field[3] for field in rec1[key]) != set(field[3] for field in rec2[key]):
  249. return False
  250. continue
  251. rec1_fields = rec1[key]
  252. rec2_fields = rec2[key]
  253. if len(rec1_fields) != len(rec2_fields):
  254. ## They already differs in length...
  255. return False
  256. if ignore_field_order:
  257. ## We sort the fields, first by indicators and then by anything else
  258. rec1_fields = sorted(rec1_fields, key=lambda elem: (elem[1], elem[2], elem[3], elem[0]))
  259. rec2_fields = sorted(rec2_fields, key=lambda elem: (elem[1], elem[2], elem[3], elem[0]))
  260. else:
  261. ## We sort the fields, first by indicators, then by global position and then by anything else
  262. rec1_fields = sorted(rec1_fields, key=lambda elem: (elem[1], elem[2], elem[4], elem[3], elem[0]))
  263. rec2_fields = sorted(rec2_fields, key=lambda elem: (elem[1], elem[2], elem[4], elem[3], elem[0]))
  264. for field1, field2 in zip(rec1_fields, rec2_fields):
  265. if ignore_duplicate_subfields:
  266. if field1[1:4] != field2[1:4] or set(field1[0]) != set(field2[0]):
  267. return False
  268. elif ignore_subfield_order:
  269. if field1[1:4] != field2[1:4] or sorted(field1[0]) != sorted(field2[0]):
  270. return False
  271. elif field1[:4] != field2[:4]:
  272. return False
  273. return True
  274. def record_get_field_instances(rec, tag="", ind1=" ", ind2=" "):
  275. """Returns the list of field instances for the specified tag and
  276. indicators of the record (rec).
  277. Returns empty list if not found.
  278. If tag is empty string, returns all fields
  279. Parameters (tag, ind1, ind2) can contain wildcard %.
  280. @param rec: a record structure as returned by create_record()
  281. @param tag: a 3 characters long string
  282. @param ind1: a 1 character long string
  283. @param ind2: a 1 character long string
  284. @param code: a 1 character long string
  285. @return: a list of field tuples (Subfields, ind1, ind2, value,
  286. field_position_global) where subfields is list of (code, value)"""
  287. if not rec:
  288. return []
  289. if not tag:
  290. return rec.items()
  291. else:
  292. out = []
  293. ind1, ind2 = _wash_indicators(ind1, ind2)
  294. if '%' in tag:
  295. # Wildcard in tag. Check all possible
  296. for field_tag in rec:
  297. if _tag_matches_pattern(field_tag, tag):
  298. for possible_field_instance in rec[field_tag]:
  299. if (ind1 in ('%', possible_field_instance[1]) and
  300. ind2 in ('%', possible_field_instance[2])):
  301. out.append(possible_field_instance)
  302. else:
  303. # Completely defined tag. Use dict
  304. for possible_field_instance in rec.get(tag, []):
  305. if (ind1 in ('%', possible_field_instance[1]) and
  306. ind2 in ('%', possible_field_instance[2])):
  307. out.append(possible_field_instance)
  308. return out
def record_add_field(rec, tag, ind1=' ', ind2=' ', controlfield_value='',
    subfields=None, field_position_global=None, field_position_local=None):
    """
    Adds a new field into the record.

    If field_position_global or field_position_local is specified then
    this method will insert the new field at the desired position.
    Otherwise a global field position will be computed in order to
    insert the field at the best position (first we try to keep the
    order of the tags and then we insert the field at the end of the
    fields with the same tag).

    If both field_position_global and field_position_local are present,
    then field_position_local takes precedence.

    The record is modified in place: existing global positions are shifted
    through _shift_field_positions_global() to make room where needed.

    @param rec: the record data structure
    @param tag: the tag of the field to be added
    @param ind1: the first indicator
    @param ind2: the second indicator
    @param controlfield_value: the value of the controlfield
    @param subfields: the subfields (a list of tuples (code, value))
    @param field_position_global: the global field position (record wise)
    @param field_position_local: the local field position (tag wise)
    @return: the global field position of the newly inserted field, or -1
        when a controlfield value is combined with non-blank indicators or
        with subfields
    """
    # The validation result is currently ignored (see FIXME below).
    error = _validate_record_field_positions_global(rec)
    if error:
        # FIXME one should write a message here
        pass

    # Clean the parameters.
    if subfields is None:
        subfields = []
    ind1, ind2 = _wash_indicators(ind1, ind2)

    # A controlfield can carry neither indicators nor subfields.
    if controlfield_value and (ind1 != ' ' or ind2 != ' ' or subfields):
        return -1

    # Detect field number to be used for insertion:
    # Dictionaries for uniqueness (only their keys are used below, so
    # min()/max()/"in" operate on the global positions).
    tag_field_positions_global = {}.fromkeys([field[4]
        for field in rec.get(tag, [])])
    all_field_positions_global = {}.fromkeys([field[4]
        for fields in rec.values()
        for field in fields])

    if field_position_global is None and field_position_local is None:
        # Let's determine the global field position of the new field.
        if tag in rec:
            # Append right after the last existing field of this tag.
            # NOTE(review): max() of an empty list raises ValueError, not
            # IndexError, so this handler looks unreachable for an empty
            # rec[tag] -- confirm and adjust the exception type.
            try:
                field_position_global = max([field[4] for field in rec[tag]]) \
                    + 1
            except IndexError:
                if tag_field_positions_global:
                    field_position_global = max(tag_field_positions_global) + 1
                elif all_field_positions_global:
                    field_position_global = max(all_field_positions_global) + 1
                else:
                    field_position_global = 1
        else:
            if tag in ('FMT', 'FFT', 'BDR', 'BDM'):
                # Add the new tag to the end of the record.
                if tag_field_positions_global:
                    field_position_global = max(tag_field_positions_global) + 1
                elif all_field_positions_global:
                    field_position_global = max(all_field_positions_global) + 1
                else:
                    field_position_global = 1
            else:
                # Insert the tag in an ordered way by selecting the
                # right global field position.
                # Find the greatest existing tag that sorts strictly
                # between '000' and the new tag.
                immediate_lower_tag = '000'
                for rec_tag in rec:
                    if (tag not in ('FMT', 'FFT', 'BDR', 'BDM') and
                        immediate_lower_tag < rec_tag < tag):
                        immediate_lower_tag = rec_tag

                if immediate_lower_tag == '000':
                    # No lower tag found: the new field comes first.
                    field_position_global = 1
                else:
                    # Place it right after the last field of that tag.
                    field_position_global = rec[immediate_lower_tag][-1][4] + 1

        # The new field becomes the last one of its tag; make room for it
        # in the global numbering.
        field_position_local = len(rec.get(tag, []))
        _shift_field_positions_global(rec, field_position_global, 1)
    elif field_position_local is not None:
        # A local position was given; it wins over any global position.
        if tag in rec:
            if field_position_local >= len(rec[tag]):
                # Past the end: goes after the last field of the tag.
                field_position_global = rec[tag][-1][4] + 1
            else:
                # Takes over the global position of the field currently
                # sitting at that local position.
                field_position_global = rec[tag][field_position_local][4]
            _shift_field_positions_global(rec, field_position_global, 1)
        else:
            if all_field_positions_global:
                field_position_global = max(all_field_positions_global) + 1
            else:
                # Empty record.
                field_position_global = 1
    elif field_position_global is not None:
        # If the user chose an existing global field position, shift all the
        # global field positions greater than the input global field position.
        if tag not in rec:
            # New tag: the requested position is replaced by the end of
            # the record.
            if all_field_positions_global:
                field_position_global = max(all_field_positions_global) + 1
            else:
                field_position_global = 1
            field_position_local = 0
        elif field_position_global < min(tag_field_positions_global):
            # Before the first field of the tag: clamp to that position.
            field_position_global = min(tag_field_positions_global)
            _shift_field_positions_global(rec, min(tag_field_positions_global),
                1)
            field_position_local = 0
        elif field_position_global > max(tag_field_positions_global):
            # After the last field of the tag: clamp to just after it.
            field_position_global = max(tag_field_positions_global) + 1
            _shift_field_positions_global(rec,
                max(tag_field_positions_global) + 1, 1)
            field_position_local = len(rec.get(tag, []))
        else:
            # Within the range of the tag's positions; shift only when the
            # requested position is already taken by a field of this tag.
            if field_position_global in tag_field_positions_global:
                _shift_field_positions_global(rec, field_position_global, 1)

            # Use the local index of the field now numbered
            # field_position_global + 1, falling back to 0 if there is none.
            field_position_local = 0
            for position, field in enumerate(rec[tag]):
                if field[4] == field_position_global + 1:
                    field_position_local = position

    # Create the new field.
    newfield = (subfields, ind1, ind2, str(controlfield_value),
        field_position_global)
    rec.setdefault(tag, []).insert(field_position_local, newfield)

    # Return new field number:
    return field_position_global
  430. def record_has_field(rec, tag):
  431. """
  432. Checks if the tag exists in the record.
  433. @param rec: the record data structure
  434. @param the: field
  435. @return: a boolean
  436. """
  437. return tag in rec
  438. def record_delete_field(rec, tag, ind1=' ', ind2=' ',
  439. field_position_global=None, field_position_local=None):
  440. """
  441. If global field position is specified, deletes the field with the
  442. corresponding global field position.
  443. If field_position_local is specified, deletes the field with the
  444. corresponding local field position and tag.
  445. Else deletes all the fields matching tag and optionally ind1 and
  446. ind2.
  447. If both field_position_global and field_position_local are present,
  448. then field_position_local takes precedence.
  449. @param rec: the record data structure
  450. @param tag: the tag of the field to be deleted
  451. @param ind1: the first indicator of the field to be deleted
  452. @param ind2: the second indicator of the field to be deleted
  453. @param field_position_global: the global field position (record wise)
  454. @param field_position_local: the local field position (tag wise)
  455. @return: the list of deleted fields
  456. """
  457. error = _validate_record_field_positions_global(rec)
  458. if error:
  459. # FIXME one should write a message here.
  460. pass
  461. if tag not in rec:
  462. return False
  463. ind1, ind2 = _wash_indicators(ind1, ind2)
  464. deleted = []
  465. newfields = []
  466. if field_position_global is None and field_position_local is None:
  467. # Remove all fields with tag 'tag'.
  468. for field in rec[tag]:
  469. if field[1] != ind1 or field[2] != ind2:
  470. newfields.append(field)
  471. else:
  472. deleted.append(field)
  473. rec[tag] = newfields
  474. elif field_position_global is not None:
  475. # Remove the field with 'field_position_global'.
  476. for field in rec[tag]:
  477. if (field[1] != ind1 and field[2] != ind2 or
  478. field[4] != field_position_global):
  479. newfields.append(field)
  480. else:
  481. deleted.append(field)
  482. rec[tag] = newfields
  483. elif field_position_local is not None:
  484. # Remove the field with 'field_position_local'.
  485. try:
  486. del rec[tag][field_position_local]
  487. except IndexError:
  488. return []
  489. if not rec[tag]:
  490. # Tag is now empty, remove it.
  491. del rec[tag]
  492. return deleted
  493. def record_delete_fields(rec, tag, field_positions_local=None):
  494. """
  495. Delete all/some fields defined with MARC tag 'tag' from record 'rec'.
  496. @param rec: a record structure.
  497. @type rec: tuple
  498. @param tag: three letter field.
  499. @type tag: string
  500. @param field_position_local: if set, it is the list of local positions
  501. within all the fields with the specified tag, that should be deleted.
  502. If not set all the fields with the specified tag will be deleted.
  503. @type field_position_local: sequence
  504. @return: the list of deleted fields.
  505. @rtype: list
  506. @note: the record is modified in place.
  507. """
  508. if tag not in rec:
  509. return []
  510. new_fields, deleted_fields = [], []
  511. for position, field in enumerate(rec.get(tag, [])):
  512. if field_positions_local is None or position in field_positions_local:
  513. deleted_fields.append(field)
  514. else:
  515. new_fields.append(field)
  516. if new_fields:
  517. rec[tag] = new_fields
  518. else:
  519. del rec[tag]
  520. return deleted_fields
  521. def record_add_fields(rec, tag, fields, field_position_local=None,
  522. field_position_global=None):
  523. """
  524. Adds the fields into the record at the required position. The
  525. position is specified by the tag and the field_position_local in
  526. the list of fields.
  527. @param rec: a record structure
  528. @param tag: the tag of the fields
  529. to be moved
  530. @param field_position_local: the field_position_local to which the
  531. field will be inserted. If not specified, appends the fields to
  532. the tag.
  533. @param a: list of fields to be added
  534. @return: -1 if the operation failed, or the field_position_local
  535. if it was successful
  536. """
  537. if field_position_local is None and field_position_global is None:
  538. for field in fields:
  539. record_add_field(rec, tag, ind1=field[1],
  540. ind2=field[2], subfields=field[0],
  541. controlfield_value=field[3])
  542. else:
  543. fields.reverse()
  544. for field in fields:
  545. record_add_field(rec, tag, ind1=field[1], ind2=field[2],
  546. subfields=field[0], controlfield_value=field[3],
  547. field_position_local=field_position_local,
  548. field_position_global=field_position_global)
  549. return field_position_local
  550. def record_move_fields(rec, tag, field_positions_local,
  551. field_position_local=None):
  552. """
  553. Moves some fields to the position specified by
  554. 'field_position_local'.
  555. @param rec: a record structure as returned by create_record()
  556. @param tag: the tag of the fields to be moved
  557. @param field_positions_local: the positions of the
  558. fields to move
  559. @param field_position_local: insert the field before that
  560. field_position_local. If unspecified, appends the fields
  561. @return: the field_position_local is the operation was successful
  562. """
  563. fields = record_delete_fields(rec, tag,
  564. field_positions_local=field_positions_local)
  565. return record_add_fields(rec, tag, fields,
  566. field_position_local=field_position_local)
  567. def record_delete_subfield(rec, tag, subfield_code, ind1=' ', ind2=' '):
  568. """Deletes all subfields with subfield_code in the record."""
  569. ind1, ind2 = _wash_indicators(ind1, ind2)
  570. for field in rec.get(tag, []):
  571. if field[1] == ind1 and field[2] == ind2:
  572. field[0][:] = [subfield for subfield in field[0]
  573. if subfield_code != subfield[0]]
  574. def record_get_field(rec, tag, field_position_global=None,
  575. field_position_local=None):
  576. """
  577. Returns the the matching field. One has to enter either a global
  578. field position or a local field position.
  579. @return: a list of subfield tuples (subfield code, value).
  580. @rtype: list
  581. """
  582. if field_position_global is None and field_position_local is None:
  583. raise InvenioBibRecordFieldError("A field position is required to "
  584. "complete this operation.")
  585. elif field_position_global is not None and field_position_local is not None:
  586. raise InvenioBibRecordFieldError("Only one field position is required "
  587. "to complete this operation.")
  588. elif field_position_global:
  589. if not tag in rec:
  590. raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
  591. for field in rec[tag]:
  592. if field[4] == field_position_global:
  593. return field
  594. raise InvenioBibRecordFieldError("No field has the tag '%s' and the "
  595. "global field position '%d'." % (tag, field_position_global))
  596. else:
  597. try:
  598. return rec[tag][field_position_local]
  599. except KeyError:
  600. raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
  601. except IndexError:
  602. raise InvenioBibRecordFieldError("No field has the tag '%s' and "
  603. "the local field position '%d'." % (tag, field_position_local))
  604. def record_replace_field(rec, tag, new_field, field_position_global=None,
  605. field_position_local=None):
  606. """Replaces a field with a new field."""
  607. if field_position_global is None and field_position_local is None:
  608. raise InvenioBibRecordFieldError("A field position is required to "
  609. "complete this operation.")
  610. elif field_position_global is not None and field_position_local is not None:
  611. raise InvenioBibRecordFieldError("Only one field position is required "
  612. "to complete this operation.")
  613. elif field_position_global:
  614. if not tag in rec:
  615. raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
  616. replaced = False
  617. for position, field in enumerate(rec[tag]):
  618. if field[4] == field_position_global:
  619. rec[tag][position] = new_field
  620. replaced = True
  621. if not replaced:
  622. raise InvenioBibRecordFieldError("No field has the tag '%s' and "
  623. "the global field position '%d'." %
  624. (tag, field_position_global))
  625. else:
  626. try:
  627. rec[tag][field_position_local] = new_field
  628. except KeyError:
  629. raise InvenioBibRecordFieldError("No tag '%s' in record." % tag)
  630. except IndexError:
  631. raise InvenioBibRecordFieldError("No field has the tag '%s' and "
  632. "the local field position '%d'." % (tag, field_position_local))
  633. def record_get_subfields(rec, tag, field_position_global=None,
  634. field_position_local=None):
  635. """
  636. Returns the subfield of the matching field. One has to enter either a
  637. global field position or a local field position.
  638. @return: a list of subfield tuples (subfield code, value).
  639. @rtype: list
  640. """
  641. field = record_get_field(rec, tag,
  642. field_position_global=field_position_global,
  643. field_position_local=field_position_local)
  644. return field[0]
  645. def record_delete_subfield_from(rec, tag, subfield_position,
  646. field_position_global=None, field_position_local=None):
  647. """Delete subfield from position specified by tag, field number and
  648. subfield position."""
  649. subfields = record_get_subfields(rec, tag,
  650. field_position_global=field_position_global,
  651. field_position_local=field_position_local)
  652. try:
  653. del subfields[subfield_position]
  654. except IndexError:
  655. from invenio.xmlmarc2textmarc import create_marc_record
  656. recordMarc = create_marc_record(rec, 0, {"text-marc": 1, "aleph-marc": 0})
  657. raise InvenioBibRecordFieldError("The record : %(recordCode)s does not contain the subfield "
  658. "'%(subfieldIndex)s' inside the field (local: '%(fieldIndexLocal)s, global: '%(fieldIndexGlobal)s' ) of tag '%(tag)s'." % \
  659. {"subfieldIndex" : subfield_position, \
  660. "fieldIndexLocal" : str(field_position_local), \
  661. "fieldIndexGlobal" : str(field_position_global), \
  662. "tag" : tag, \
  663. "recordCode" : recordMarc})
  664. if not subfields:
  665. if field_position_global is not None:
  666. for position, field in enumerate(rec[tag]):
  667. if field[4] == field_position_global:
  668. del rec[tag][position]
  669. else:
  670. del rec[tag][field_position_local]
  671. if not rec[tag]:
  672. del rec[tag]
  673. def record_add_subfield_into(rec, tag, subfield_code, value,
  674. subfield_position=None, field_position_global=None,
  675. field_position_local=None):
  676. """Add subfield into position specified by tag, field number and
  677. optionally by subfield position."""
  678. subfields = record_get_subfields(rec, tag,
  679. field_position_global=field_position_global,
  680. field_position_local=field_position_local)
  681. if subfield_position is None:
  682. subfields.append((subfield_code, value))
  683. else:
  684. subfields.insert(subfield_position, (subfield_code, value))
  685. def record_modify_controlfield(rec, tag, controlfield_value,
  686. field_position_global=None, field_position_local=None):
  687. """Modify controlfield at position specified by tag and field number."""
  688. field = record_get_field(rec, tag,
  689. field_position_global=field_position_global,
  690. field_position_local=field_position_local)
  691. new_field = (field[0], field[1], field[2], controlfield_value, field[4])
  692. record_replace_field(rec, tag, new_field,
  693. field_position_global=field_position_global,
  694. field_position_local=field_position_local)
  695. def record_modify_subfield(rec, tag, subfield_code, value, subfield_position,
  696. field_position_global=None, field_position_local=None):
  697. """Modify subfield at position specified by tag, field number and
  698. subfield position."""
  699. subfields = record_get_subfields(rec, tag,
  700. field_position_global=field_position_global,
  701. field_position_local=field_position_local)
  702. try:
  703. subfields[subfield_position] = (subfield_code, value)
  704. except IndexError:
  705. raise InvenioBibRecordFieldError("There is no subfield with position "
  706. "'%d'." % subfield_position)
  707. def record_move_subfield(rec, tag, subfield_position, new_subfield_position,
  708. field_position_global=None, field_position_local=None):
  709. """Move subfield at position specified by tag, field number and
  710. subfield position to new subfield position."""
  711. subfields = record_get_subfields(rec, tag,
  712. field_position_global=field_position_global,
  713. field_position_local=field_position_local)
  714. try:
  715. subfield = subfields.pop(subfield_position)
  716. subfields.insert(new_subfield_position, subfield)
  717. except IndexError:
  718. raise InvenioBibRecordFieldError("There is no subfield with position "
  719. "'%d'." % subfield_position)
  720. def record_get_field_value(rec, tag, ind1=" ", ind2=" ", code=""):
  721. """Returns first (string) value that matches specified field
  722. (tag, ind1, ind2, code) of the record (rec).
  723. Returns empty string if not found.
  724. Parameters (tag, ind1, ind2, code) can contain wildcard %.
  725. Difference between wildcard % and empty '':
  726. - Empty char specifies that we are not interested in a field which
  727. has one of the indicator(s)/subfield specified.
  728. - Wildcard specifies that we are interested in getting the value
  729. of the field whatever the indicator(s)/subfield is.
  730. For e.g. consider the following record in MARC:
  731. 100C5 $$a val1
  732. 555AB $$a val2
  733. 555AB val3
  734. 555 $$a val4
  735. 555A val5
  736. >> record_get_field_value(record, '555', 'A', '', '')
  737. >> "val5"
  738. >> record_get_field_value(record, '555', 'A', '%', '')
  739. >> "val3"
  740. >> record_get_field_value(record, '555', 'A', '%', '%')
  741. >> "val2"
  742. >> record_get_field_value(record, '555', 'A', 'B', '')
  743. >> "val3"
  744. >> record_get_field_value(record, '555', '', 'B', 'a')
  745. >> ""
  746. >> record_get_field_value(record, '555', '', '', 'a')
  747. >> "val4"
  748. >> record_get_field_value(record, '555', '', '', '')
  749. >> ""
  750. >> record_get_field_value(record, '%%%', '%', '%', '%')
  751. >> "val1"
  752. @param rec: a record structure as returned by create_record()
  753. @param tag: a 3 characters long string
  754. @param ind1: a 1 character long string
  755. @param ind2: a 1 character long string
  756. @param code: a 1 character long string
  757. @return: string value (empty if nothing found)"""
  758. # Note: the code is quite redundant for speed reasons (avoid calling
  759. # functions or doing tests inside loops)
  760. ind1, ind2 = _wash_indicators(ind1, ind2)
  761. if '%' in tag:
  762. # Wild card in tag. Must find all corresponding fields
  763. if code == '':
  764. # Code not specified.
  765. for field_tag, fields in rec.items():
  766. if _tag_matches_pattern(field_tag, tag):
  767. for field in fields:
  768. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  769. # Return matching field value if not empty
  770. if field[3]:
  771. return field[3]
  772. elif code == '%':
  773. # Code is wildcard. Take first subfield of first matching field
  774. for field_tag, fields in rec.items():
  775. if _tag_matches_pattern(field_tag, tag):
  776. for field in fields:
  777. if (ind1 in ('%', field[1]) and ind2 in ('%', field[2])
  778. and field[0]):
  779. return field[0][0][1]
  780. else:
  781. # Code is specified. Take corresponding one
  782. for field_tag, fields in rec.items():
  783. if _tag_matches_pattern(field_tag, tag):
  784. for field in fields:
  785. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  786. for subfield in field[0]:
  787. if subfield[0] == code:
  788. return subfield[1]
  789. else:
  790. # Tag is completely specified. Use tag as dict key
  791. if tag in rec:
  792. if code == '':
  793. # Code not specified.
  794. for field in rec[tag]:
  795. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  796. # Return matching field value if not empty
  797. # or return "" empty if not exist.
  798. if field[3]:
  799. return field[3]
  800. elif code == '%':
  801. # Code is wildcard. Take first subfield of first matching field
  802. for field in rec[tag]:
  803. if (ind1 in ('%', field[1]) and ind2 in ('%', field[2]) and
  804. field[0]):
  805. return field[0][0][1]
  806. else:
  807. # Code is specified. Take corresponding one
  808. for field in rec[tag]:
  809. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  810. for subfield in field[0]:
  811. if subfield[0] == code:
  812. return subfield[1]
  813. # Nothing was found
  814. return ""
  815. def record_get_field_values(rec, tag, ind1=" ", ind2=" ", code="",
  816. filter_subfield_code="",
  817. filter_subfield_value="",
  818. filter_subfield_mode="e"):
  819. """Returns the list of (string) values for the specified field
  820. (tag, ind1, ind2, code) of the record (rec).
  821. List can be filtered. Use filter_subfield_code
  822. and filter_subfield_value to search
  823. only in fields that have these values inside them as a subfield.
  824. filter_subfield_mode can have 3 different values:
  825. 'e' for exact search
  826. 's' for substring search
  827. 'r' for regexp search
  828. Returns empty list if nothing was found.
  829. Parameters (tag, ind1, ind2, code) can contain wildcard %.
  830. @param rec: a record structure as returned by create_record()
  831. @param tag: a 3 characters long string
  832. @param ind1: a 1 character long string
  833. @param ind2: a 1 character long string
  834. @param code: a 1 character long string
  835. @return: a list of strings"""
  836. tmp = []
  837. ind1, ind2 = _wash_indicators(ind1, ind2)
  838. if filter_subfield_code and filter_subfield_mode == "r":
  839. reg_exp = re.compile(filter_subfield_value)
  840. tags = []
  841. if '%' in tag:
  842. # Wild card in tag. Must find all corresponding tags and fields
  843. tags = [k for k in rec if _tag_matches_pattern(k, tag)]
  844. elif rec and tag in rec:
  845. tags = [tag]
  846. if code == '':
  847. # Code not specified. Consider field value (without subfields)
  848. for tag in tags:
  849. for field in rec[tag]:
  850. if (ind1 in ('%', field[1]) and ind2 in ('%', field[2]) and
  851. field[3]):
  852. tmp.append(field[3])
  853. elif code == '%':
  854. # Code is wildcard. Consider all subfields
  855. for tag in tags:
  856. for field in rec[tag]:
  857. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  858. if filter_subfield_code:
  859. if filter_subfield_mode == "e":
  860. subfield_to_match = (filter_subfield_code, filter_subfield_value)
  861. if subfield_to_match in field[0]:
  862. for subfield in field[0]:
  863. tmp.append(subfield[1])
  864. elif filter_subfield_mode == "s":
  865. if (dict(field[0]).get(filter_subfield_code, '')).find(filter_subfield_value) > -1:
  866. for subfield in field[0]:
  867. tmp.append(subfield[1])
  868. elif filter_subfield_mode == "r":
  869. if reg_exp.match(dict(field[0]).get(filter_subfield_code, '')):
  870. for subfield in field[0]:
  871. tmp.append(subfield[1])
  872. else:
  873. for subfield in field[0]:
  874. tmp.append(subfield[1])
  875. else:
  876. # Code is specified. Consider all corresponding subfields
  877. for tag in tags:
  878. for field in rec[tag]:
  879. if ind1 in ('%', field[1]) and ind2 in ('%', field[2]):
  880. if filter_subfield_code:
  881. if filter_subfield_mode == "e":
  882. subfield_to_match = (filter_subfield_code, filter_subfield_value)
  883. if subfield_to_match in field[0]:
  884. for subfield in field[0]:
  885. if subfield[0] == code:
  886. tmp.append(subfield[1])
  887. elif filter_subfield_mode == "s":
  888. if (dict(field[0]).get(filter_subfield_code, '')).find(filter_subfield_value) > -1:
  889. for subfield in field[0]:
  890. if subfield[0] == code:
  891. tmp.append(subfield[1])
  892. elif filter_subfield_mode == "r":
  893. if reg_exp.match(dict(field[0]).get(filter_subfield_code, '')):
  894. for subfield in field[0]:
  895. if subfield[0] == code:
  896. tmp.append(subfield[1])
  897. else:
  898. for subfield in field[0]:
  899. if subfield[0] == code:
  900. tmp.append(subfield[1])
  901. # If tmp was not set, nothing was found
  902. return tmp
  903. def record_xml_output(rec, tags=None, order_fn=None):
  904. """Generates the XML for record 'rec' and returns it as a string
  905. @rec: record
  906. @tags: list of tags to be printed"""
  907. if tags is None:
  908. tags = []
  909. if isinstance(tags, str):
  910. tags = [tags]
  911. if tags and '001' not in tags:
  912. # Add the missing controlfield.
  913. tags.append('001')
  914. marcxml = ['<record>']
  915. # Add the tag 'tag' to each field in rec[tag]
  916. fields = []
  917. if rec is not None:
  918. for tag in rec:
  919. if not tags or tag in tags:
  920. for field in rec[tag]:
  921. fields.append((tag, field))
  922. if order_fn is None:
  923. record_order_fields(fields)
  924. else:
  925. record_order_fields(fields, order_fn)
  926. for field in fields:
  927. marcxml.append(field_xml_output(field[1], field[0]))
  928. marcxml.append('</record>')
  929. return '\n'.join(marcxml)
  930. def field_get_subfield_instances(field):
  931. """Returns the list of subfields associated with field 'field'"""
  932. return field[0]
  933. def field_get_subfield_values(field_instance, code):
  934. """Return subfield CODE values of the field instance FIELD."""
  935. return [subfield_value
  936. for subfield_code, subfield_value in field_instance[0]
  937. if subfield_code == code]
  938. def field_get_subfield_codes(field_instance):
  939. """Return subfield codes of the field instance FIELD."""
  940. return [subfield_code
  941. for subfield_code, subfield_value in field_instance[0]]
  942. def field_add_subfield(field, code, value):
  943. """Adds a subfield to field 'field'"""
  944. field[0].append((code, value))
  945. def record_order_fields(rec, fun="_order_by_ord"):
  946. """Orders field inside record 'rec' according to a function"""
  947. rec.sort(eval(fun))
  948. def field_xml_output(field, tag):
  949. """Generates the XML for field 'field' and returns it as a string."""
  950. marcxml = []
  951. if field[3]:
  952. marcxml.append(' <controlfield tag="%s">%s</controlfield>' %
  953. (tag, encode_for_xml(field[3])))
  954. else:
  955. marcxml.append(' <datafield tag="%s" ind1="%s" ind2="%s">' %
  956. (tag, field[1], field[2]))
  957. marcxml += [_subfield_xml_output(subfield) for subfield in field[0]]
  958. marcxml.append(' </datafield>')
  959. return '\n'.join(marcxml)
  960. def record_extract_oai_id(record):
  961. """Returns the OAI ID of the record."""
  962. tag = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[0:3]
  963. ind1 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[3]
  964. ind2 = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[4]
  965. subfield = CFG_BIBUPLOAD_EXTERNAL_OAIID_TAG[5]
  966. values = record_get_field_values(record, tag, ind1, ind2, subfield)
  967. oai_id_regex = re.compile("oai[a-zA-Z0-9/.:]+")
  968. for value in [value.strip() for value in values]:
  969. if oai_id_regex.match(value):
  970. return value
  971. return ""
  972. def record_extract_dois(record):
  973. """Returns the DOI(s) of the record."""
  974. record_dois = []
  975. tag = "024"
  976. ind1 = "7"
  977. ind2 = "_"
  978. subfield_source_code = "2"
  979. subfield_value_code = "a"
  980. identifiers_fields = record_get_field_instances(record, tag, ind1, ind2)
  981. for identifer_field in identifiers_fields:
  982. if 'doi' in [val.lower() for val in field_get_subfield_values(identifer_field, subfield_source_code)]:
  983. record_dois.extend(field_get_subfield_values(identifer_field, subfield_value_code))
  984. return record_dois
  985. def print_rec(rec, format=1, tags=None):
  986. """
  987. prints a record
  988. format = 1 -- XML
  989. format = 2 -- HTML (not implemented)
  990. @param tags: list of tags to be printed
  991. """
  992. if tags is None:
  993. tags = []
  994. if format == 1:
  995. text = record_xml_output(rec, tags)
  996. else:
  997. return ''
  998. return text
  999. def print_recs(listofrec, format=1, tags=None):
  1000. """
  1001. prints a list of records
  1002. @param format: 1 XML, 2 HTML (not implemented)
  1003. @param tags: list of tags to be printed
  1004. if 'listofrec' is not a list it returns empty string
  1005. """
  1006. if tags is None:
  1007. tags = []
  1008. text = ""
  1009. if type(listofrec).__name__ !='list':
  1010. return ""
  1011. else:
  1012. for rec in listofrec:
  1013. text = "%s\n%s" % (text, print_rec(rec, format, tags))
  1014. return text
  1015. def concat(alist):
  1016. """Concats a list of lists"""
  1017. newl = []
  1018. for l in alist:
  1019. newl.extend(l)
  1020. return newl
  1021. def record_find_field(rec, tag, field, strict=False):
  1022. """
  1023. Returns the global and local positions of the first occurrence
  1024. of the field in a record.
  1025. @param rec: A record dictionary structure
  1026. @type rec: dictionary
  1027. @param tag: The tag of the field to search for
  1028. @type tag: string
  1029. @param field: A field tuple as returned by create_field()
  1030. @type field: tuple
  1031. @param strict: A boolean describing the search method. If strict
  1032. is False, then the order of the subfields doesn't
  1033. matter. Default search method is strict.
  1034. @type strict: boolean
  1035. @return: A tuple of (global_position, local_position) or a
  1036. tuple (None, None) if the field is not present.
  1037. @rtype: tuple
  1038. @raise InvenioBibRecordFieldError: If the provided field is invalid.
  1039. """
  1040. try:
  1041. _check_field_validity(field)
  1042. except InvenioBibRecordFieldError:
  1043. raise
  1044. for local_position, field1 in enumerate(rec.get(tag, [])):
  1045. if _compare_fields(field, field1, strict):
  1046. return (field1[4], local_position)
  1047. return (None, None)
  1048. def record_match_subfields(rec, tag, ind1=" ", ind2=" ", sub_key=None,
  1049. sub_value='', sub_key2=None, sub_value2='',
  1050. case_sensitive=True):
  1051. """ Finds subfield instances in a particular field and tests
  1052. values in 1 of 3 possible ways:
  1053. - Does a subfield code exist? (ie does 773__a exist?)
  1054. - Does a subfield have a particular value? (ie 773__a == 'PhysX')
  1055. - Do a pair of subfields have particular values?
  1056. (ie 035__2 == 'CDS' and 035__a == '123456')
  1057. Parameters:
  1058. * rec - dictionary: a bibrecord structure
  1059. * tag - string: the tag of the field (ie '773')
  1060. * ind1, ind2 - char: a single characters for the MARC indicators
  1061. * sub_key - char: subfield key to find
  1062. * sub_value - string: subfield value of that key
  1063. * sub_key2 - char: key of subfield to compare against
  1064. * sub_value2 - string: expected value of second subfield
  1065. * case_sensitive - bool: be case sensitive when matching values
  1066. Returns: false if no match found, else provides the field position (int) """
  1067. if sub_key is None:
  1068. raise TypeError("None object passed for parameter sub_key.")
  1069. if sub_key2 is not None and sub_value2 is '':
  1070. raise TypeError("Parameter sub_key2 defined but sub_value2 is None, "
  1071. + "function requires a value for comparrison.")
  1072. ind1, ind2 = _wash_indicators(ind1, ind2)
  1073. if not case_sensitive:
  1074. sub_value = sub_value.lower()
  1075. sub_value2 = sub_value2.lower()
  1076. for field in record_get_field_instances(rec, tag, ind1, ind2):
  1077. subfields = dict(field_get_subfield_instances(field))
  1078. if not case_sensitive:
  1079. for k, v in subfields.iteritems():
  1080. subfields[k] = v.lower()
  1081. if sub_key in subfields:
  1082. if sub_value is '':
  1083. return field[4]
  1084. else:
  1085. if sub_value == subfields[sub_key]:
  1086. if sub_key2 is None:
  1087. return field[4]
  1088. else:
  1089. if sub_key2 in subfields:
  1090. if sub_value2 == subfields[sub_key2]:
  1091. return field[4]
  1092. return False
  1093. def record_strip_empty_volatile_subfields(rec):
  1094. """
  1095. Removes unchanged volatile subfields from the record
  1096. """
  1097. for tag in rec.keys():
  1098. for field in rec[tag]:
  1099. field[0][:] = [subfield for subfield in field[0] if subfield[1][:9] != "VOLATILE:"]
  1100. def record_strip_empty_fields(rec, tag=None):
  1101. """
  1102. Removes empty subfields and fields from

Large files files are truncated, but you can click here to view the full file