/arbre/datafeeding.py

https://github.com/hausdorf/vn
Python | 97 lines | 39 code | 16 blank | 42 comment | 6 complexity | c37ca034ce65719c7264864735b522aa MD5 | raw file
  1. """This module is for moshing data. It aims to provide helpful functions for
  2. common interactions with filesystems, json feeds, web feeds, etc.
  3. """
  4. import os
  5. import fnmatch
  6. import simplejson as json
  7. import copy
  8. ###
  9. ### Filesystem Helpers
  10. ###
  11. def locate_by_pattern(pattern, root_dir):
  12. """This is a generator for recursively detecting files with names that
  13. match the given pattern in a directory.
  14. The pattern can be any regular expression identifying files.
  15. The root_dir is the directory from which to recurse.
  16. """
  17. for path, dirs, files in os.walk(os.path.abspath(root_dir)):
  18. for filename in fnmatch.filter(files, pattern):
  19. yield os.path.join(path, filename)
  20. ###
  21. ### Feed Handling
  22. ###
  23. def process_json_feed(fun, filename):
  24. """Loops across each line in `filename` and converts the data from JSON to
  25. a Python dictionary.
  26. It then calls `fun` which is a function provided by the caller that takes
  27. one argument, the line of data.
  28. No return value is generated to avoid overhead, but `fun` should be a
  29. closure for aggregating values.
  30. """
  31. fd = open(filename, 'r')
  32. for line in fd:
  33. data = json.loads(line)
  34. fun(data)
  35. fd.close()
  36. ###
  37. ### Feed Conversions
  38. ###
  39. """
  40. The garbage key is used for fields we know to be uninteresting. An example
  41. conversion map might look like below.
  42. #
  43. The conversion map is intended for use with `convert_keys`.
  44. #
  45. gc_key_map = {
  46. 'to': 'toAddress',
  47. 'time': 'sendTime'
  48. '---': GARBAGE_KEY,
  49. 'date': GARBAGE_KEY, # redundant with 'time'
  50. }
  51. """
  52. GARBAGE_KEY = '__garbage__'
  53. def convert_keys(data_dict, alternate_key_map, deepcopy=False):
  54. """Convert keys takes a python dictionary, representing a lucene document,
  55. and creates a new document with the keys mapped to their maildir equivalent.
  56. Uses `obcene_key_map` for the mapping. If a mapping isn't present, it
  57. leaves the in the map.
  58. Deepcopy is supported, but off by default to favor speed.
  59. """
  60. if deepcopy:
  61. data_dict = copy.deepcopy(data_dict)
  62. else:
  63. data_dict = copy.copy(data_dict) # don't mutate the input
  64. for key, value in data_dict.items():
  65. if key in alternate_key_map:
  66. new_key = alternate_key_map[key]
  67. # handle unicode representations of < and >
  68. # TODO investigate if more work is necessary here
  69. value = value.replace('\u003c', '<')
  70. value = value.replace('\u003e', '>')
  71. data_dict[new_key] = value
  72. del data_dict[key]
  73. # dictionary based garbage collection ;)
  74. if GARBAGE_KEY in data_dict:
  75. del data_dict[GARBAGE_KEY]
  76. return data_dict