PageRenderTime 439ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/testing/misc.py

https://gitlab.com/pschuprikov/lpm
Python | 430 lines | 355 code | 72 blank | 3 comment | 39 complexity | 646c11dc4201d13ea89c9bd47d643e24 MD5 | raw file
  1. #!/usr/bin/env python3
  2. from textwrap import dedent
  3. import math
  4. from typing import List, Optional, Tuple, Union, Sequence, NamedTuple
  5. from sys import stderr
  6. import os
  7. import os.path as path
  8. import click
  9. import pandas as pd
  10. import numpy as np
  11. HEURISTICS = (
  12. 'lpm_inc_act_postoi',
  13. 'oi_act',
  14. 'oi_lpm_joint_act',
  15. 'oi_exact_act'
  16. )
  17. PLOT_HEURISTICS = (
  18. 'lpm_bounded_inc_act_postoi',
  19. 'lpm_bounded_max_step_act_postoi',
  20. 'lpm_bounded_max_oi_act_postoi',
  21. 'lpm_bounded_max_lpm_act_postoi',
  22. 'oi_bound_32_64',
  23. 'oi',
  24. )
  25. HEURISTICS_NOT_SUPPORTING_FULL_WIDTH = (
  26. 'oi_bound_32_64',
  27. 'oi',
  28. )
  29. FULL_BIT_LENGTH = 104
  30. CHOSEN_L: List[int] = [32]
  31. BETA_VALUES = [5, 10, 20]
  32. PERC_VALUES = [95, 99, 100]
  33. @click.command()
  34. @click.option('--output-dir', type=str, default='output',
  35. help='Output directory')
  36. def main(output_dir):
  37. do_main(output_dir)
  38. def do_main(output_dir: str):
  39. if not path.exists(output_dir):
  40. print(f'creating "{output_dir}" directory')
  41. os.makedirs(output_dir, exist_ok=True)
  42. data = load_data()
  43. classifiers: Sequence[str] = sorted(set(data.data_with_beta['cls_file']))
  44. generate_perc_table(classifiers, data.data_with_perc, output_dir, [
  45. 'lpm_inc_act_postoi',
  46. 'oi_act',
  47. 'oi_lpm_joint_act',
  48. 'oi_exact_act',
  49. ])
  50. generate_perc_plots(classifiers, data.data_with_perc, output_dir, [
  51. 'lpm_inc_act_postoi',
  52. 'oi_act',
  53. 'oi_lpm_joint_act',
  54. 'oi_exact_act',
  55. 'lpm_bounded'
  56. ])
  57. #generate_tables(classifiers, data.data_with_beta, output_dir)
  58. #generate_plots(classifiers, data.data_with_beta, output_dir)
  59. def generate_plots(classifiers, data: pd.DataFrame, output_dir):
  60. for k in classifiers:
  61. generate_plot_for(data, k, output_dir, bit_widths=[32, FULL_BIT_LENGTH])
  62. def generate_plot_for(data, k, output_dir, bit_widths: Sequence[int]):
  63. plot_data = pd.DataFrame({'beta': np.arange(0, max(BETA_VALUES) + 1)})
  64. plot_path = path.join(output_dir, f'plot_{drop_extension(k)}.tsv')
  65. print(f'generating "{plot_path}"')
  66. for heuristic in PLOT_HEURISTICS:
  67. for bit_width in bit_widths:
  68. if not is_valid_bit_width(heuristic, bit_width):
  69. continue
  70. heuristic_data = extract_cdf_data(data, bit_width, heuristic, k)
  71. plot_data = merge_cdf_data(heuristic_data, plot_data)
  72. plot_data.to_csv(plot_path, sep='\t', index=False)
  73. def is_valid_bit_width(heuristic: str, bit_width: int) -> bool:
  74. return bit_width != FULL_BIT_LENGTH \
  75. or heuristic not in HEURISTICS_NOT_SUPPORTING_FULL_WIDTH
  76. def merge_cdf_data(
  77. heuristic_data: pd.DataFrame, plot_data: pd.DataFrame
  78. ) -> pd.DataFrame:
  79. return plot_data.merge(
  80. heuristic_data, how='left', on='beta', validate='1:1')
  81. def extract_cdf_data(
  82. data: pd.DataFrame, bit_width: int, heuristic: str, k: str
  83. ) -> pd.DataFrame:
  84. return query_data_for(
  85. data, heuristic, k, is_full_length=False
  86. ).query(f'num_bits == {bit_width}').filter(
  87. ['beta', 'num_rules_perc'], axis='columns'
  88. ).rename(columns={'num_rules_perc': f'{heuristic}_{bit_width}'})
  89. def drop_extension(filepath: str) -> str:
  90. return path.splitext(filepath)[0]
  91. def generate_perc_table(
  92. classifiers: Sequence[str],
  93. data: pd.DataFrame,
  94. output_dir: str,
  95. heuristics: Sequence[str],
  96. ):
  97. table_path = path.join(output_dir, f'perctable.tsv')
  98. print(f'generating "{table_path}')
  99. result = []
  100. for i, cls in enumerate(classifiers):
  101. entry = pd.DataFrame({'classifier': [drop_extension(cls)]}, index=[i])
  102. for heuristic in heuristics:
  103. for perc in PERC_VALUES:
  104. sub_data = data.query(f'num_bits == 32 and perc == {perc} and mode == "{heuristic}" and cls_file == "{cls}"')
  105. if sub_data.empty:
  106. raise RuntimeError(f"Couldn't find data for {heuristic}")
  107. entry[f'{heuristic}_{perc}'] = sub_data['num_groups'].item()
  108. result.append(entry)
  109. data = pd.concat(result, axis='index')
  110. data.to_csv(table_path, sep='\t', index=False)
  111. def generate_perc_plots(
  112. classifiers: Sequence[str],
  113. data: pd.DataFrame,
  114. output_dir: str,
  115. heuristics: Sequence[str],
  116. ):
  117. for perc in PERC_VALUES:
  118. table_path = path.join(output_dir, f'perc_{perc}.tsv')
  119. print(f'generating "{table_path}')
  120. df = pd.DataFrame({'cls_file': classifiers})
  121. for heuristic in heuristics:
  122. for num_bits in CHOSEN_L + [FULL_BIT_LENGTH]:
  123. df = df.merge(
  124. data.query(
  125. f'num_bits == {num_bits} and mode == "{heuristic}"'
  126. f' and perc == "{perc}"'
  127. ).filter(
  128. ['num_groups', 'cls_file'], axis='columns'
  129. ).rename(columns={'num_groups': f'{heuristic}_{num_bits}'}),
  130. how='left',
  131. on='cls_file'
  132. )
  133. df = df.assign(cls_file=lambda x: [drop_extension(c) for c in x['cls_file']])
  134. df.fillna(float('inf'), inplace=True)
  135. df.to_csv(table_path, sep='\t', index=False)
  136. with pd.option_context(
  137. 'display.max_rows', None, 'display.max_columns', None):
  138. print(df.describe())
  139. for cls in classifiers:
  140. cls_name = drop_extension(cls)
  141. table_path = path.join(output_dir, f'perc_{cls_name}.tsv')
  142. print(f'generating "{table_path}')
  143. df = pd.DataFrame({'perc': PERC_VALUES})
  144. for heuristic in heuristics:
  145. for num_bits in CHOSEN_L + [FULL_BIT_LENGTH]:
  146. df = df.merge(
  147. data.query(
  148. f'num_bits == {num_bits} and mode == "{heuristic}"'
  149. f' and cls_file == "{cls}"'
  150. ).filter(
  151. ['num_groups', 'perc'], axis='columns'
  152. ).rename(columns={'num_groups': f'{heuristic}_{num_bits}'}),
  153. how='left',
  154. on='perc'
  155. )
  156. df.fillna(float('inf'), inplace=True)
  157. df.to_csv(table_path, sep='\t', index=False)
  158. def generate_tables(classifiers, data: pd.DataFrame, output_dir):
  159. for heuristic in HEURISTICS:
  160. generate_table_for(classifiers, data, heuristic, output_dir)
  161. def generate_table_for(classifiers, data, heuristic, output_dir):
  162. table_path = path.join(output_dir, f'bigtable_{heuristic}.tex')
  163. print(f'generating "{table_path}"')
  164. with open(table_path, 'w') as outf:
  165. outf.write(print_table_header_23h() + '\n')
  166. for k in classifiers:
  167. outf.write(print_line_23h(heuristic, k, data) + '\n')
  168. outf.write(print_table_footer() + '\n')
  169. class Data(NamedTuple):
  170. data_with_beta: pd.DataFrame
  171. data_with_perc: pd.DataFrame
  172. def load_data():
  173. raw_input = process_file_raw('data.tsv')
  174. data_exploded = explode_groups_for_last(raw_input)
  175. data_with_beta = replace_groups_with_beta(data_exploded, max(BETA_VALUES))
  176. data_with_perc = replace_groups_with_perc(data_exploded)
  177. lpm_bounded_perc = infer_group_perc(raw_input, 'lpm_bounded', 'lpm')
  178. data_with_beta = data_with_beta[data_with_beta['mode'].isin(HEURISTICS + PLOT_HEURISTICS)]
  179. data_with_perc = data_with_perc[data_with_perc['mode'].isin(HEURISTICS)]
  180. data_with_perc = pd.concat([data_with_perc, lpm_bounded_perc], axis='index')
  181. data_with_beta = add_is_best(data_with_beta)
  182. assert (data_with_beta['num_groups'] == data_with_beta['num_groups_real']).all()
  183. return Data(data_with_beta=data_with_beta, data_with_perc=data_with_perc)
  184. def process_file_raw(fname: str, exclude=()):
  185. data = pd.read_csv(
  186. fname, header=None, sep='\t',
  187. names=['mode', 'cls_file', 'A?', 'num_rules', 'oi_algo', 'num_bits',
  188. 'B?', 'num_groups', 'leftover', 'groups', 'C?', 'D?'],
  189. converters={
  190. 'groups': lambda x: [int(x) for x in x[1:-1].split(',')]
  191. })
  192. data = data.drop(columns=['A?', 'B?', 'C?', 'oi_algo', 'D?'])
  193. return data[~data['mode'].isin(exclude)]
  194. def explode_groups_for_last(data: pd.DataFrame):
  195. return data.groupby(
  196. by=['mode', 'cls_file', 'num_bits'], as_index=False
  197. ).last().explode(
  198. 'groups'
  199. ).rename(columns={'groups': 'group_size'}).assign(
  200. group_size=lambda x: pd.to_numeric(x['group_size'])
  201. )
  202. def replace_groups_with_beta(
  203. data: pd.DataFrame, max_beta: int
  204. ) -> pd.DataFrame:
  205. def convert_to_beta(x: pd.DataFrame):
  206. groups = x['group_size'].sort_values(ascending=False)
  207. cur_max_beta = max(max_beta, len(groups))
  208. groups_padded = np.concatenate(
  209. [np.zeros(1), groups, np.zeros(cur_max_beta - len(groups))])
  210. converted = pd.DataFrame({
  211. 'beta': np.arange(start=0, stop=len(groups_padded)),
  212. 'num_rules': groups_padded.cumsum(),
  213. }).assign(
  214. num_rules_total=x['num_rules'].head(1).item(),
  215. num_groups_real=len(x),
  216. num_groups=x['num_groups'].head(1).item(),
  217. ).assign(
  218. num_rules_perc=lambda x:
  219. (100 * x['num_rules'] / x['num_rules_total']).round(1)
  220. )
  221. return converted
  222. return data.groupby(
  223. ['mode', 'cls_file', 'num_bits'],
  224. ).apply(convert_to_beta).reset_index(3, drop=True).reset_index()
  225. def calculate_inferred_num_groups(x: pd.DataFrame, p):
  226. num_rules_total = x['num_rules'].head(1).item()
  227. min_groups_enough = x['num_groups'][x['leftover'] <= num_rules_total * (1 - 0.01 * p)].min()
  228. max_groups_not_enough = x['num_groups'][x['leftover'] > num_rules_total * (1 - 0.01 * p)].max()
  229. if math.isnan(min_groups_enough):
  230. print(f'Upper bound is not reached for {x["cls_file"].iloc[0]} and p = {p}', file=stderr)
  231. return None
  232. if math.isnan(max_groups_not_enough):
  233. print(f'Lower bound is not reached for {x["cls_file"].iloc[0]} and p = {p}', file=stderr)
  234. return None
  235. if min_groups_enough == max_groups_not_enough + 1 or p == 100:
  236. return min_groups_enough
  237. else:
  238. print(f'Exact minimum between {max_groups_not_enough} and {min_groups_enough} is not found for {x["cls_file"].iloc[0]} and p = {p}, ', file=stderr)
  239. return None
  240. def infer_group_perc(
  241. data: pd.DataFrame, mode: str, full_mode: str
  242. ) -> pd.DataFrame:
  243. def convert_to_percentiles(x: pd.DataFrame):
  244. converted = pd.DataFrame({
  245. 'perc': PERC_VALUES,
  246. 'num_groups': [
  247. calculate_inferred_num_groups(x, p)
  248. for p in PERC_VALUES
  249. ]
  250. })
  251. return converted
  252. return data.query(
  253. f'mode == "{mode}" or mode == "{full_mode}"'
  254. ).assign(mode=mode).groupby(
  255. ['mode', 'cls_file', 'num_bits'],
  256. ).apply(convert_to_percentiles).reset_index(3, drop=True).reset_index()
  257. def replace_groups_with_perc(
  258. data: pd.DataFrame
  259. ) -> pd.DataFrame:
  260. def convert_to_percentiles(x: pd.DataFrame):
  261. groups = x['group_size'].sort_values(ascending=False).cumsum()
  262. num_rules_total = x['num_rules'].head(1).item()
  263. converted = pd.DataFrame({
  264. 'perc': PERC_VALUES,
  265. 'num_groups': [
  266. (groups < num_rules_total * (0.01 * p)).sum() + 1
  267. for p in PERC_VALUES
  268. ]
  269. })
  270. return converted
  271. return data.groupby(
  272. ['mode', 'cls_file', 'num_bits'],
  273. ).apply(convert_to_percentiles).reset_index(3, drop=True).reset_index()
  274. def add_is_best(data: pd.DataFrame) -> pd.DataFrame:
  275. return data.groupby(by=['cls_file', 'num_bits', 'beta']).apply(
  276. lambda x: x.assign(
  277. is_best=(x['num_rules_perc'] == x['num_rules_perc'].max()))
  278. ).reset_index(drop=True)
  279. def to_string(x: Optional[Union[float, int, Tuple[float, bool]]]) -> str:
  280. if x is None:
  281. return '---'
  282. if isinstance(x, float):
  283. return f'{x:.1f}'
  284. if len(x) == 2:
  285. value, best = x
  286. if best:
  287. return f'\\textbf{{{value:.1f}}}'
  288. return f'{value:.1f}'
  289. if isinstance(x, int):
  290. return str(int(x))
  291. raise AssertionError
  292. def share_beta_num_groups(total, groups) -> float:
  293. return 100.0 * groups / float(total)
  294. def share_beta(
  295. num_bits: int, beta: int, data: pd.DataFrame
  296. ) -> Optional[Tuple[float, bool]]:
  297. data_for_length = data.query(f'num_bits == {num_bits} and beta == {beta}')
  298. if data_for_length.empty:
  299. return None
  300. return (share_beta_num_groups(data_for_length['num_rules_total'].item(),
  301. data_for_length['num_rules'].item()), data_for_length['is_best'].item())
  302. def share_betafix(
  303. num_bits: int, beta: int, data: pd.DataFrame
  304. ) -> Optional[Tuple[float, bool]]:
  305. length_data = data.query(f'num_bits == {num_bits} and beta == {beta}')
  306. if length_data.empty:
  307. return None
  308. return (share_beta_num_groups(length_data['num_rules_total'].item(),
  309. length_data['num_rules'].item()), length_data['is_best'].item())
  310. def print_line_23h(mode: str, k: str, data: pd.DataFrame):
  311. my_data = query_data_for(data, mode, k, is_full_length=False)
  312. my_data_104 = query_data_for(data, mode, k, is_full_length=True)
  313. return ' & '.join(
  314. [k.split('.')[0], str(my_data['num_rules'].head(1).item())] +
  315. [to_string(share_beta(num_bits, bt, my_data))
  316. for num_bits in CHOSEN_L for bt in BETA_VALUES]
  317. + [
  318. to_string(share_betafix(104, bt, my_data_104)) for bt in BETA_VALUES]
  319. ) + '\\\\'
  320. def query_data_for(
  321. data: pd.DataFrame, mode: str, k: str, is_full_length: bool
  322. ) -> pd.DataFrame:
  323. if is_full_length:
  324. postoi = f'{mode}_postoi' if not mode.endswith('_postoi') else mode
  325. return data.query(f'cls_file == \"{k}\" & mode == \"{postoi}\"')
  326. return data.query(f'cls_file == \"{k}\" & mode == \"{mode}\"')
  327. def print_table_header_23h():
  328. return dedent('''
  329. \\begin{{tabular}}{{|ll|{alignment}|}}
  330. \\hline
  331. & & {bit_widths} \\\\
  332. \\hline
  333. & Rules & {beta_values} \\\\
  334. \\hline
  335. ''').strip().format(
  336. alignment='|'.join(['c' * len(BETA_VALUES)] * (len(CHOSEN_L)+1)),
  337. bit_widths=' & '.join(
  338. f'\\multicolumn{{{len(BETA_VALUES)}}}{{c|}}{{$l={y}$}}'
  339. for y in CHOSEN_L + [104]),
  340. beta_values=' & '.join(
  341. ['$\\beta=$' + ' & '.join([f'{bt:d}' for bt in BETA_VALUES])]
  342. * (len(CHOSEN_L) + 1))
  343. )
  344. def print_table_footer():
  345. return dedent('''
  346. \\hline
  347. \\end{tabular}
  348. ''').strip()
  349. if __name__ == '__main__':
  350. main()