PageRenderTime 45ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/blockology/zmethods.py

https://gitlab.com/solstag/abstractology
Python | 253 lines | 228 code | 5 blank | 20 comment | 0 complexity | 117934471511b45aa3a43a32c389028a MD5 | raw file
  1. # coding: utf-8
  2. # Abstractology - Study of the organisation and evolution of a corpus
  3. #
  4. # Author(s):
  5. # * Ale Abdo <abdo@member.fsf.org>
  6. #
  7. # License:
  8. # [GNU-GPLv3+](https://www.gnu.org/licenses/gpl-3.0.html)
  9. #
  10. # Project:
  11. # <https://en.wikiversity.org/wiki/The_dynamics_and_social_organization_of
  12. # _innovation_in_the_field_of_oncology>
  13. #
  14. # Reference repository for this file:
  15. # <https://gitlab.com/solstag/abstractology>
  16. #
  17. # Contributions are welcome, get in touch with the author(s).
  18. import pandas
  19. from pathlib import Path
  20. from collections import Counter
  21. from tqdm import tqdm
  22. from ..ioio import ioio
  23. """
  24. Methods to be used with plot_block_level_map.
  25. 'zmethods' should be defined as:
  26. def example(self, blocks, level, index)
  27. and should return a pandas.Series of a scalar dtype and indexed by 'index'.
  28. """
  29. def __init__():
  30. return
  31. def count(self, blocks, level, index):
  32. count = blocks.groupby(blocks.loc[:, level]).size()
  33. count = count.reindex(index)
  34. count = count.where(count.notnull(), 0)
  35. return count
  36. def density(self, blocks, level, index):
  37. count = blocks.groupby(blocks.loc[:, level]).size()
  38. count = count.reindex(index)
  39. count = count.where(count.notnull(), 0)
  40. dens = count / count.sum()
  41. return dens
  42. def x_doc_density_gen(btype):
  43. def x_doc_density(self, xblocks, xlevel, index):
  44. x_documents = getattr(self, f"{btype}_documents")
  45. x_groups = xblocks[xlevel].groupby(xblocks[xlevel])
  46. count = pandas.Series(index=xblocks[xlevel].unique())
  47. for n, g in tqdm(x_groups, desc=f"Level {xlevel}"):
  48. s = set()
  49. for x in g.index:
  50. s.update(x_documents[x])
  51. count.loc[n] = len(self.data.index.intersection(s))
  52. count = count.reindex(index)
  53. count = count.where(count.notnull(), 0)
  54. value = count / len(self.data)
  55. return value
  56. x_doc_density.__name__ = f"{btype}_doc_density"
  57. return x_doc_density
  58. def x_link_density_gen(btype):
  59. def x_doc_density(self, xblocks, xlevel, index):
  60. x_documents = getattr(self, f"{btype}_documents")
  61. x_groups = xblocks[xlevel].groupby(xblocks[xlevel])
  62. count = pandas.Series(0, index=xblocks[xlevel].unique())
  63. data_index = set(self.data.index)
  64. for n, g in tqdm(x_groups, desc=f"Level {xlevel}"):
  65. for x in g.index:
  66. count.loc[n] += sum(
  67. v for k, v in x_documents[x].items() if k in data_index
  68. )
  69. count = count.reindex(index)
  70. count = count.where(count.notnull(), 0)
  71. docs = self.get_doc_terms() if btype == "ter" else self.get_doc_exts()
  72. value = count / docs.transform(len).sum()
  73. return value
  74. x_doc_density.__name__ = "{}_doc_density".format(btype)
  75. return x_doc_density
  76. # Auxiliary methods, used in zmethods or to generate them
  77. def density_pair_gen(idx0, idx1, func):
  78. def density_pair(self, blocks, level, index):
  79. count0 = blocks.loc[idx0].groupby(blocks[level]).size()
  80. count0 = count0.reindex(index)
  81. count0 = count0.where(count0.notnull(), 0)
  82. dens0 = count0 / count0.sum()
  83. count1 = blocks.loc[idx1].groupby(blocks[level]).size()
  84. count1 = count1.reindex(index)
  85. count1 = count1.where(count1.notnull(), 0)
  86. dens1 = count1 / count1.sum()
  87. value = func(dens0, dens1)
  88. return value.where(value.notnull(), 1) # 0/0 => 1
  89. density_pair.__name__ = "density_{}_{}_{}".format(
  90. func.__name__, idx0.name, idx1.name
  91. )
  92. return density_pair
  93. def x_doc_density_pair_gen(idx0, idx1, func, btype):
  94. def x_doc_density_pair(self, xblocks, xlevel, index):
  95. x_documents = getattr(self, f"{btype}_documents")
  96. x_groups = xblocks[xlevel].groupby(xblocks[xlevel])
  97. count0 = pandas.Series(index=xblocks[xlevel].unique())
  98. count1 = pandas.Series(index=xblocks[xlevel].unique())
  99. index0 = self.data.index.intersection(idx0)
  100. index1 = self.data.index.intersection(idx1)
  101. for n, g in tqdm(x_groups):
  102. s = set()
  103. for x in g.index:
  104. s.update(x_documents[x])
  105. count0.loc[n] = len(index0.intersection(s))
  106. count1.loc[n] = len(index1.intersection(s))
  107. count0 = count0.reindex(index)
  108. count0 = count0.where(count0.notnull(), 0)
  109. count1 = count1.reindex(index)
  110. count1 = count1.where(count1.notnull(), 0)
  111. value = func(count0 / index0.size, count1 / index1.size)
  112. return value.where(value.notnull(), 1)
  113. x_doc_density_pair.__name__ = "{}_doc_density_{}_{}_{}".format(
  114. btype, func.__name__, idx0.name, idx1.name
  115. )
  116. return x_doc_density_pair
  117. def x_link_density_pair_gen(idx0, idx1, func, btype):
  118. def x_doc_density_pair(self, xblocks, xlevel, index):
  119. x_documents = getattr(self, f"{btype}_documents")
  120. x_groups = xblocks[xlevel].groupby(xblocks[xlevel])
  121. count0 = pandas.Series(index=xblocks[xlevel].unique())
  122. count1 = pandas.Series(index=xblocks[xlevel].unique())
  123. index0 = set(self.data.index.intersection(idx0))
  124. index1 = set(self.data.index.intersection(idx1))
  125. for n, g in tqdm(x_groups):
  126. for x in g.index:
  127. count0.loc[n] = sum(v for k, v in x_documents[x].items() if k in index0)
  128. count1.loc[n] = sum(v for k, v in x_documents[x].items() if k in index1)
  129. count0 = count0.reindex(index)
  130. count0 = count0.where(count0.notnull(), 0)
  131. count1 = count1.reindex(index)
  132. count1 = count1.where(count1.notnull(), 0)
  133. docs = self.get_doc_terms() if btype == "ter" else self.get_doc_exts()
  134. value = func(
  135. count0 / docs.loc[docs.index.intersection(idx0)].transform(len).sum(),
  136. count1 / docs.loc[docs.index.intersection(idx1)].transform(len).sum(),
  137. )
  138. return value.where(value.notnull(), 1)
  139. x_doc_density_pair.__name__ = "{}_doc_density_{}_{}_{}".format(
  140. btype, func.__name__, idx0.name, idx1.name
  141. )
  142. return x_doc_density_pair
  143. def load_cross_count_all(self, ybtype, ltype, store=True, from_file=True):
  144. """
  145. Pairs every domain from every level with every cross block from every
  146. level (topics or extended blocks) and counts the number of links or documents
  147. connected to each cross block.
  148. Parameters
  149. ----------
  150. self: an instance of `Graphology`
  151. btype: str
  152. The block type to cross. Either 'ter' or 'ext'.
  153. ltype: str
  154. Either 'link' or 'doc'. Whether to count links or documents.
  155. store: bool
  156. Store the result to a file.
  157. from_file: bool
  158. Attempt to read from a stored result.
  159. Result
  160. ------
  161. Sets self.'values_{btype}_{ltype}_count_all', to a pandas.Series with MultiIndex
  162. (domain level, domain, cross level, cross block)
  163. """
  164. blocks = self.blocks
  165. yblocks, yblocks_levels, _ = self.get_blocks_levels_sample(ybtype)
  166. y_documents = self.get_xelement_yelements(ybtype, "doc")
  167. attr_name = f"values_{ybtype}_{ltype}_count_all"
  168. if ybtype == "ter":
  169. fdir = self.blocks_dir
  170. elif ybtype == "ext":
  171. fdir = self.chained_dir
  172. fpath = Path(fdir, f"{attr_name}{self.ext_data}")
  173. if from_file:
  174. try:
  175. values = ioio.load(fpath)
  176. values = pandas.Series(
  177. index=pandas.MultiIndex.from_tuples(values["index"]),
  178. data=(x for x in values["data"]),
  179. )
  180. setattr(self, attr_name, values)
  181. print("Loaded cross link count from file")
  182. return
  183. except FileNotFoundError:
  184. pass
  185. keys, vals = [], []
  186. for ylevel in tqdm(yblocks_levels, desc="Cross level"):
  187. y_groups = yblocks[ylevel].groupby(yblocks[ylevel])
  188. for yb, yg in tqdm(y_groups, desc=" Term block"):
  189. yb_docs = Counter() if ltype == "link" else set()
  190. for ye in yg.index:
  191. yb_docs.update(y_documents[ye])
  192. for level in tqdm(self.dblocks_levels, desc=" Doc level"):
  193. doc_groups = blocks[level].groupby(blocks[level])
  194. for b, g in doc_groups:
  195. keys.append((level, b, ylevel, yb))
  196. if ltype == "link":
  197. g_index = set(g.index)
  198. vals.append(sum(v for k, v in yb_docs.items() if k in g_index))
  199. else: # ltype == 'doc'
  200. vals.append(len(g.index.intersection(yb_docs)))
  201. values = pandas.Series(vals, index=pandas.MultiIndex.from_tuples(keys))
  202. setattr(self, attr_name, values)
  203. if store:
  204. print("Storing values")
  205. ioio.store_pandas(values, fpath)
  206. def p_diff(a, b):
  207. return a - b
  208. def p_rel(a, b):
  209. return a / b