PageRenderTime 60ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/ioio.py

https://gitlab.com/solstag/abstractology
Python | 310 lines | 261 code | 11 blank | 38 comment | 25 complexity | 95222a98eb16613a9b67d808c43715fd MD5 | raw file
  1. # coding: utf-8
  2. # Abstractology - Study of the organisation and evolution of a corpus
  3. #
  4. # Author(s):
  5. # * Ale Abdo <abdo@member.fsf.org>
  6. #
  7. # License:
  8. # [GNU-GPLv3+](https://www.gnu.org/licenses/gpl-3.0.html)
  9. #
  10. # Project:
  11. # <https://en.wikiversity.org/wiki/The_dynamics_and_social_organization_of
  12. # _innovation_in_the_field_of_oncology>
  13. #
  14. # Reference repository for this file:
  15. # <https://gitlab.com/solstag/abstractology>
  16. #
  17. # Contributions are welcome, get in touch with the author(s).
  18. import pickle
  19. import json
  20. import gzip
  21. import lzma
  22. import pandas
  23. from pathlib import Path
  24. from itertools import chain
  25. from tempfile import NamedTemporaryFile
  26. ################
  27. # I/O utilities #
  28. ################
  29. class ioio:
  30. compressors = {
  31. None: {
  32. "module": None,
  33. "pandas_arg": None,
  34. },
  35. ".gz": {
  36. "module": gzip,
  37. "pandas_arg": "gzip",
  38. },
  39. ".gzip": {
  40. "module": gzip,
  41. "pandas_arg": "gzip",
  42. },
  43. ".xz": {
  44. "module": lzma,
  45. "pandas_arg": "xz",
  46. },
  47. ".lzma": {
  48. "module": lzma,
  49. "pandas_arg": "xz",
  50. },
  51. ".bz2": {
  52. "module": None,
  53. "pandas_arg": "bz2",
  54. },
  55. ".zip": {
  56. "module": None,
  57. "pandas_arg": "zip",
  58. },
  59. }
  60. formatters = {
  61. "pickle": {
  62. "module": pickle,
  63. "rmode": "rb",
  64. "wmode": "wb",
  65. "rmethod": "read_pickle",
  66. "wmethod": "to_pickle",
  67. "r_extra_args": {},
  68. "w_extra_args": {},
  69. },
  70. "json": {
  71. "module": json,
  72. "rmode": "rt",
  73. "wmode": "wt",
  74. "rmethod": "read_json",
  75. "wmethod": "to_json",
  76. "r_extra_args": {"orient": "split", "convert_dates": False},
  77. "w_extra_args": {"orient": "split", "date_format": "iso"},
  78. },
  79. "hdf5": {
  80. "module": None,
  81. "rmode": "rb",
  82. "wmode": "wb",
  83. "rmethod": "read_hdf",
  84. "wmethod": "to_hdf",
  85. "r_extra_args": {"key": "singleton"},
  86. "w_extra_args": {"key": "singleton"},
  87. },
  88. }
  89. @classmethod
  90. def uncompressed_suffix(cls, fpath):
  91. try:
  92. return next(x for x in reversed(fpath.suffixes) if x not in cls.compressors)
  93. except StopIteration:
  94. return None
  95. @classmethod
  96. def get_format(cls, fpath, fmt=None):
  97. cmp = fpath.suffix if fpath.suffix in cls.compressors else None
  98. suffix = cls.uncompressed_suffix(fpath)
  99. suffix = None if suffix is None else suffix[1:]
  100. if fmt is None:
  101. fmt = suffix if suffix in cls.formatters else "json"
  102. else:
  103. if fmt in cls.formatters:
  104. if suffix is not None and fmt != suffix:
  105. print(f"Warning: format {fmt} differs from path suffix {suffix}!")
  106. else:
  107. raise ValueError(f"Invalid format: {fmt}")
  108. return fmt, cmp
  109. @classmethod
  110. def load(cls, fpath, fmt=None, formatter_args={}):
  111. """
  112. Reads an object from the disk, decompressing xz and gzip files.
  113. Parameters
  114. ----------
  115. fpath: string
  116. Path to load from.
  117. fmt: string
  118. One of 'pickle' or 'json'.
  119. If `None`, tries to guess from extension, defaulting to 'json'.
  120. formatter_args: dict
  121. Parameters passed to reading function.
  122. Returns
  123. -------
  124. The object read
  125. """
  126. fpath = Path(fpath)
  127. fmt, cmp = cls.get_format(fpath, fmt)
  128. compressor = cls.compressors[cmp]["module"]
  129. formatter = cls.formatters[fmt]["module"]
  130. mode = cls.formatters[fmt]["rmode"]
  131. if cmp is not None and compressor is None:
  132. raise ValueError(f"Usupported compression: {cmp}")
  133. if cmp is not None and compressor is not None:
  134. with compressor.open(fpath, mode) as f:
  135. return formatter.load(f, **formatter_args)
  136. for fopen in lzma.open, gzip.open, open:
  137. try:
  138. with fopen(fpath, mode) as f:
  139. return formatter.load(f, **formatter_args)
  140. except (lzma.LZMAError, gzip.BadGzipFile):
  141. pass
  142. @classmethod
  143. def store(cls, obj, fpath, fmt=None, formatter_args={}):
  144. """
  145. Stores an object to the disk.
  146. Output is compressed if file suffix is '.xz' or '.gz'
  147. Parameters
  148. ----------
  149. obj: object
  150. The object to be stored.
  151. fpath: string
  152. Path to save to.
  153. fmt: string
  154. One of 'pickle' or 'json'.
  155. If `None`, tries to guess from extension, defaulting to 'json'.
  156. formatter_args: dict
  157. Parameters passed to writing function.
  158. """
  159. fpath = Path(fpath)
  160. fmt, cmp = cls.get_format(fpath, fmt)
  161. compressor = cls.compressors[cmp]["module"]
  162. formatter = cls.formatters[fmt]["module"]
  163. mode = cls.formatters[fmt]["wmode"]
  164. if cmp is not None and compressor is None:
  165. raise ValueError(f"Usupported compression: {cmp}")
  166. fopen = compressor.open if compressor else open
  167. fpath.parent.mkdir(parents=True, exist_ok=True)
  168. with fopen(fpath, mode) as f:
  169. formatter.dump(obj, f, **formatter_args)
  170. @classmethod
  171. def load_pandas(cls, fpath, fmt=None, formatter_args={}):
  172. """
  173. Reads a pandas object from the disk, decompressing if needed.
  174. Parameters
  175. ----------
  176. fpath: string
  177. Path to load from.
  178. fmt: string
  179. One of 'pickle' or 'json'.
  180. If `None`, tries to guess from extension, defaulting to 'json'.
  181. formatter_args: dict
  182. Parameters passed to reading function.
  183. Returns
  184. -------
  185. A pandas object, usually a Series or Dataframe
  186. """
  187. fpath = Path(fpath)
  188. if not fpath.exists():
  189. raise FileNotFoundError
  190. fmt, cmp = cls.get_format(fpath)
  191. compression = cls.compressors[cmp]["pandas_arg"]
  192. method = cls.formatters[fmt]["rmethod"]
  193. extra_args = cls.formatters[fmt]["r_extra_args"]
  194. # compression in `to_hdf` seems useless, we might have used lzma on top
  195. if fmt == "hdf5":
  196. with NamedTemporaryFile() as tf:
  197. for fopen in lzma.open, open:
  198. try:
  199. with fopen(fpath, "rb") as f:
  200. tf.write(f.read())
  201. except lzma.LZMAError:
  202. pass
  203. return getattr(pandas, method)(tf.name, **formatter_args)
  204. if compression:
  205. df = getattr(pandas, method)(
  206. fpath,
  207. **dict(
  208. chain(
  209. extra_args.items(),
  210. formatter_args.items(),
  211. [("compression", compression)],
  212. )
  213. ),
  214. )
  215. else:
  216. for compression in "xz", "gzip", None:
  217. try:
  218. df = getattr(pandas, method)(
  219. fpath,
  220. **dict(
  221. chain(
  222. extra_args.items(),
  223. formatter_args.items(),
  224. [("compression", compression)],
  225. )
  226. ),
  227. )
  228. break
  229. except (lzma.LZMAError, gzip.BadGzipFile):
  230. pass
  231. # json doesn't handle tuples, so we must find and convert our tokens
  232. if fmt != "json":
  233. return df
  234. df = df.transform(
  235. lambda x: x.map(lambda y: tuple(map(tuple, y)), na_action="ignore")
  236. if all(
  237. type(y) is list
  238. and all(type(z) is list and all(type(w) is str for w in z) for z in y)
  239. for y in x.loc[x.notna()]
  240. )
  241. else x
  242. )
  243. return df
  244. @classmethod
  245. def store_pandas(cls, obj, fpath, fmt=None, formatter_args={}):
  246. """
  247. Stores a pandas object to the disk.
  248. Parameters
  249. ----------
  250. obj: object
  251. The object to be stored.
  252. fpath: string
  253. Path to save to.
  254. fmt: string
  255. One of 'pickle' or 'json'.
  256. If `None`, tries to guess from extension, defaulting to 'json'.
  257. formatter_args: dict
  258. Parameters passed to writing function.
  259. """
  260. fpath = Path(fpath)
  261. fmt, cmp = cls.get_format(fpath)
  262. compression = cls.compressors[cmp]["pandas_arg"]
  263. method = cls.formatters[fmt]["wmethod"]
  264. extra_args = cls.formatters[fmt]["w_extra_args"]
  265. method_args = dict(
  266. chain(
  267. extra_args.items(),
  268. formatter_args.items(),
  269. [("compression", compression)] if fmt != "hdf5" else [],
  270. )
  271. )
  272. fpath.parent.mkdir(exist_ok=True)
  273. getattr(obj, method)(fpath, **method_args)
  274. # compression in `to_hdf` seems useless, so we apply it afterwards
  275. if fmt == "hdf5" and cls.compressors[cmp]["module"] is not None:
  276. with open(fpath, "rb") as f:
  277. data = f.read()
  278. with cls.compressors[cmp]["module"].open(fpath, "wb") as f:
  279. f.write(data)