/testing/misc.py
#!/usr/bin/env python3
from textwrap import dedent
import math
from typing import List, Optional, Tuple, Union, Sequence, NamedTuple
from sys import stderr
import os
import os.path as path

import click
import pandas as pd
import numpy as np

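# Overview: load 'data.tsv', convert the logged group sizes into beta-indexed
# and percentile-indexed summaries, and write TSV tables and plot data into
# the output directory. The LaTeX table generation (generate_tables) exists
# but is currently commented out in do_main.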
# Heuristics kept for the percentile summaries and the LaTeX tables.
HEURISTICS = (
    'lpm_inc_act_postoi',
    'oi_act',
    'oi_lpm_joint_act',
    'oi_exact_act',
)
# Heuristics included in the per-classifier beta plot data.
PLOT_HEURISTICS = (
    'lpm_bounded_inc_act_postoi',
    'lpm_bounded_max_step_act_postoi',
    'lpm_bounded_max_oi_act_postoi',
    'lpm_bounded_max_lpm_act_postoi',
    'oi_bound_32_64',
    'oi',
)
HEURISTICS_NOT_SUPPORTING_FULL_WIDTH = (
    'oi_bound_32_64',
    'oi',
)
# Full key width in bits (presumably the classic 5-tuple: two 32-bit
# addresses, two 16-bit ports and an 8-bit protocol).
FULL_BIT_LENGTH = 104
CHOSEN_L: List[int] = [32]
BETA_VALUES = [5, 10, 20]
PERC_VALUES = [95, 99, 100]

@click.command()
@click.option('--output-dir', type=str, default='output',
              help='Output directory')
def main(output_dir):
    do_main(output_dir)


def do_main(output_dir: str):
    if not path.exists(output_dir):
        print(f'creating "{output_dir}" directory')
        os.makedirs(output_dir, exist_ok=True)
    data = load_data()
    classifiers: Sequence[str] = sorted(set(data.data_with_beta['cls_file']))
    generate_perc_table(classifiers, data.data_with_perc, output_dir, [
        'lpm_inc_act_postoi',
        'oi_act',
        'oi_lpm_joint_act',
        'oi_exact_act',
    ])
    generate_perc_plots(classifiers, data.data_with_perc, output_dir, [
        'lpm_inc_act_postoi',
        'oi_act',
        'oi_lpm_joint_act',
        'oi_exact_act',
        'lpm_bounded',
    ])
    # generate_tables(classifiers, data.data_with_beta, output_dir)
    # generate_plots(classifiers, data.data_with_beta, output_dir)

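# Plot generation: for each classifier, build a TSV with one column per
# (heuristic, bit width) giving the share of rules covered as a function of
# beta, the number of groups kept.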
def generate_plots(classifiers, data: pd.DataFrame, output_dir):
    for k in classifiers:
        generate_plot_for(
            data, k, output_dir, bit_widths=[32, FULL_BIT_LENGTH])


def generate_plot_for(data, k, output_dir, bit_widths: Sequence[int]):
    plot_data = pd.DataFrame({'beta': np.arange(0, max(BETA_VALUES) + 1)})
    plot_path = path.join(output_dir, f'plot_{drop_extension(k)}.tsv')
    print(f'generating "{plot_path}"')
    for heuristic in PLOT_HEURISTICS:
        for bit_width in bit_widths:
            if not is_valid_bit_width(heuristic, bit_width):
                continue
            heuristic_data = extract_cdf_data(data, bit_width, heuristic, k)
            plot_data = merge_cdf_data(heuristic_data, plot_data)
    plot_data.to_csv(plot_path, sep='\t', index=False)


def is_valid_bit_width(heuristic: str, bit_width: int) -> bool:
    return bit_width != FULL_BIT_LENGTH \
        or heuristic not in HEURISTICS_NOT_SUPPORTING_FULL_WIDTH


def merge_cdf_data(
    heuristic_data: pd.DataFrame, plot_data: pd.DataFrame
) -> pd.DataFrame:
    return plot_data.merge(
        heuristic_data, how='left', on='beta', validate='1:1')


def extract_cdf_data(
    data: pd.DataFrame, bit_width: int, heuristic: str, k: str
) -> pd.DataFrame:
    return query_data_for(
        data, heuristic, k, is_full_length=False
    ).query(f'num_bits == {bit_width}').filter(
        ['beta', 'num_rules_perc'], axis='columns'
    ).rename(columns={'num_rules_perc': f'{heuristic}_{bit_width}'})


def drop_extension(filepath: str) -> str:
    return path.splitext(filepath)[0]

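# Percentile table: for every classifier and heuristic, the number of groups
# needed to cover 95/99/100 percent of the rules at l = 32 bits.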
def generate_perc_table(
    classifiers: Sequence[str],
    data: pd.DataFrame,
    output_dir: str,
    heuristics: Sequence[str],
):
    table_path = path.join(output_dir, 'perctable.tsv')
    print(f'generating "{table_path}"')
    result = []
    for i, cls in enumerate(classifiers):
        entry = pd.DataFrame({'classifier': [drop_extension(cls)]}, index=[i])
        for heuristic in heuristics:
            for perc in PERC_VALUES:
                sub_data = data.query(
                    f'num_bits == 32 and perc == {perc}'
                    f' and mode == "{heuristic}" and cls_file == "{cls}"')
                if sub_data.empty:
                    raise RuntimeError(f"Couldn't find data for {heuristic}")
                entry[f'{heuristic}_{perc}'] = sub_data['num_groups'].item()
        result.append(entry)
    data = pd.concat(result, axis='index')
    data.to_csv(table_path, sep='\t', index=False)

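# Percentile plots: one TSV per percentile (heuristic columns, classifier
# rows) and one TSV per classifier (heuristic columns, percentile rows),
# with missing combinations filled with inf.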
def generate_perc_plots(
    classifiers: Sequence[str],
    data: pd.DataFrame,
    output_dir: str,
    heuristics: Sequence[str],
):
    for perc in PERC_VALUES:
        table_path = path.join(output_dir, f'perc_{perc}.tsv')
        print(f'generating "{table_path}"')
        df = pd.DataFrame({'cls_file': classifiers})
        for heuristic in heuristics:
            for num_bits in CHOSEN_L + [FULL_BIT_LENGTH]:
                # 'perc' holds ints (PERC_VALUES), so it is compared
                # numerically in the query, not against a quoted string.
                df = df.merge(
                    data.query(
                        f'num_bits == {num_bits} and mode == "{heuristic}"'
                        f' and perc == {perc}'
                    ).filter(
                        ['num_groups', 'cls_file'], axis='columns'
                    ).rename(columns={'num_groups': f'{heuristic}_{num_bits}'}),
                    how='left',
                    on='cls_file'
                )
        df = df.assign(
            cls_file=lambda x: [drop_extension(c) for c in x['cls_file']])
        df.fillna(float('inf'), inplace=True)
        df.to_csv(table_path, sep='\t', index=False)
        with pd.option_context(
                'display.max_rows', None, 'display.max_columns', None):
            print(df.describe())
    for cls in classifiers:
        cls_name = drop_extension(cls)
        table_path = path.join(output_dir, f'perc_{cls_name}.tsv')
        print(f'generating "{table_path}"')
        df = pd.DataFrame({'perc': PERC_VALUES})
        for heuristic in heuristics:
            for num_bits in CHOSEN_L + [FULL_BIT_LENGTH]:
                df = df.merge(
                    data.query(
                        f'num_bits == {num_bits} and mode == "{heuristic}"'
                        f' and cls_file == "{cls}"'
                    ).filter(
                        ['num_groups', 'perc'], axis='columns'
                    ).rename(columns={'num_groups': f'{heuristic}_{num_bits}'}),
                    how='left',
                    on='perc'
                )
        df.fillna(float('inf'), inplace=True)
        df.to_csv(table_path, sep='\t', index=False)

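# LaTeX output: one "big table" per heuristic with the share of rules covered
# by the first beta groups for each classifier and bit width (currently not
# called from do_main).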
def generate_tables(classifiers, data: pd.DataFrame, output_dir):
    for heuristic in HEURISTICS:
        generate_table_for(classifiers, data, heuristic, output_dir)


def generate_table_for(classifiers, data, heuristic, output_dir):
    table_path = path.join(output_dir, f'bigtable_{heuristic}.tex')
    print(f'generating "{table_path}"')
    with open(table_path, 'w') as outf:
        outf.write(print_table_header_23h() + '\n')
        for k in classifiers:
            outf.write(print_line_23h(heuristic, k, data) + '\n')
        outf.write(print_table_footer() + '\n')

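# Data loading: parse the raw TSV, then derive two views of the group sizes:
# one indexed by beta (rules covered by the first beta groups) and one indexed
# by percentile (groups needed to cover a given share of the rules).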
class Data(NamedTuple):
    data_with_beta: pd.DataFrame
    data_with_perc: pd.DataFrame


def load_data():
    raw_input = process_file_raw('data.tsv')
    data_exploded = explode_groups_for_last(raw_input)
    data_with_beta = replace_groups_with_beta(data_exploded, max(BETA_VALUES))
    data_with_perc = replace_groups_with_perc(data_exploded)
    lpm_bounded_perc = infer_group_perc(raw_input, 'lpm_bounded', 'lpm')
    data_with_beta = data_with_beta[
        data_with_beta['mode'].isin(HEURISTICS + PLOT_HEURISTICS)]
    data_with_perc = data_with_perc[data_with_perc['mode'].isin(HEURISTICS)]
    data_with_perc = pd.concat(
        [data_with_perc, lpm_bounded_perc], axis='index')
    data_with_beta = add_is_best(data_with_beta)
    assert (data_with_beta['num_groups']
            == data_with_beta['num_groups_real']).all()
    return Data(data_with_beta=data_with_beta, data_with_perc=data_with_perc)

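# The raw TSV has no header; column names are assigned on load and the
# placeholder columns ('A?', 'B?', ...) are dropped. The 'groups' column is a
# delimiter-wrapped, comma-separated list of group sizes.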
def process_file_raw(fname: str, exclude=()):
    data = pd.read_csv(
        fname, header=None, sep='\t',
        names=['mode', 'cls_file', 'A?', 'num_rules', 'oi_algo', 'num_bits',
               'B?', 'num_groups', 'leftover', 'groups', 'C?', 'D?'],
        converters={
            'groups': lambda cell: [int(v) for v in cell[1:-1].split(',')]
        })
    data = data.drop(columns=['A?', 'B?', 'C?', 'oi_algo', 'D?'])
    return data[~data['mode'].isin(exclude)]

def explode_groups_for_last(data: pd.DataFrame):
    return data.groupby(
        by=['mode', 'cls_file', 'num_bits'], as_index=False
    ).last().explode(
        'groups'
    ).rename(columns={'groups': 'group_size'}).assign(
        group_size=lambda x: pd.to_numeric(x['group_size'])
    )

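# Beta view: sort group sizes in descending order and take the running sum, so
# that row beta gives the number (and share) of rules covered by the beta
# largest groups; beta = 0 corresponds to zero covered rules.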
def replace_groups_with_beta(
    data: pd.DataFrame, max_beta: int
) -> pd.DataFrame:
    def convert_to_beta(x: pd.DataFrame):
        groups = x['group_size'].sort_values(ascending=False)
        cur_max_beta = max(max_beta, len(groups))
        groups_padded = np.concatenate(
            [np.zeros(1), groups, np.zeros(cur_max_beta - len(groups))])
        converted = pd.DataFrame({
            'beta': np.arange(start=0, stop=len(groups_padded)),
            'num_rules': groups_padded.cumsum(),
        }).assign(
            num_rules_total=x['num_rules'].head(1).item(),
            num_groups_real=len(x),
            num_groups=x['num_groups'].head(1).item(),
        ).assign(
            num_rules_perc=lambda d:
                (100 * d['num_rules'] / d['num_rules_total']).round(1)
        )
        return converted

    return data.groupby(
        ['mode', 'cls_file', 'num_bits'],
    ).apply(convert_to_beta).reset_index(3, drop=True).reset_index()

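# Percentile inference for 'lpm_bounded': instead of exploding explicit group
# sizes, look across the logged runs and take the smallest num_groups whose
# 'leftover' rules fit within the allowed (100 - p)% of the total; the value
# is reported only when neighbouring runs bracket that minimum exactly
# (or when p == 100).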
def calculate_inferred_num_groups(x: pd.DataFrame, p):
    num_rules_total = x['num_rules'].head(1).item()
    allowed_leftover = num_rules_total * (1 - 0.01 * p)
    min_groups_enough = x['num_groups'][x['leftover'] <= allowed_leftover].min()
    max_groups_not_enough = x['num_groups'][x['leftover'] > allowed_leftover].max()
    if math.isnan(min_groups_enough):
        print(f'Upper bound is not reached for {x["cls_file"].iloc[0]}'
              f' and p = {p}', file=stderr)
        return None
    if math.isnan(max_groups_not_enough):
        print(f'Lower bound is not reached for {x["cls_file"].iloc[0]}'
              f' and p = {p}', file=stderr)
        return None
    if min_groups_enough == max_groups_not_enough + 1 or p == 100:
        return min_groups_enough
    else:
        print(f'Exact minimum between {max_groups_not_enough} and'
              f' {min_groups_enough} is not found for'
              f' {x["cls_file"].iloc[0]} and p = {p}', file=stderr)
        return None

def infer_group_perc(
    data: pd.DataFrame, mode: str, full_mode: str
) -> pd.DataFrame:
    def convert_to_percentiles(x: pd.DataFrame):
        converted = pd.DataFrame({
            'perc': PERC_VALUES,
            'num_groups': [
                calculate_inferred_num_groups(x, p)
                for p in PERC_VALUES
            ]
        })
        return converted

    return data.query(
        f'mode == "{mode}" or mode == "{full_mode}"'
    ).assign(mode=mode).groupby(
        ['mode', 'cls_file', 'num_bits'],
    ).apply(convert_to_percentiles).reset_index(3, drop=True).reset_index()

def replace_groups_with_perc(
    data: pd.DataFrame
) -> pd.DataFrame:
    def convert_to_percentiles(x: pd.DataFrame):
        groups = x['group_size'].sort_values(ascending=False).cumsum()
        num_rules_total = x['num_rules'].head(1).item()
        converted = pd.DataFrame({
            'perc': PERC_VALUES,
            'num_groups': [
                (groups < num_rules_total * (0.01 * p)).sum() + 1
                for p in PERC_VALUES
            ]
        })
        return converted

    return data.groupby(
        ['mode', 'cls_file', 'num_bits'],
    ).apply(convert_to_percentiles).reset_index(3, drop=True).reset_index()

def add_is_best(data: pd.DataFrame) -> pd.DataFrame:
    return data.groupby(by=['cls_file', 'num_bits', 'beta']).apply(
        lambda x: x.assign(
            is_best=(x['num_rules_perc'] == x['num_rules_perc'].max()))
    ).reset_index(drop=True)

def to_string(x: Optional[Union[float, int, Tuple[float, bool]]]) -> str:
    if x is None:
        return '---'
    if isinstance(x, float):
        return f'{x:.1f}'
    # Check int before the tuple case: calling len() on an int would raise.
    if isinstance(x, int):
        return str(x)
    if isinstance(x, tuple) and len(x) == 2:
        value, best = x
        if best:
            return f'\\textbf{{{value:.1f}}}'
        return f'{value:.1f}'
    raise AssertionError

def share_beta_num_groups(total, groups) -> float:
    return 100.0 * groups / float(total)


def share_beta(
    num_bits: int, beta: int, data: pd.DataFrame
) -> Optional[Tuple[float, bool]]:
    data_for_length = data.query(f'num_bits == {num_bits} and beta == {beta}')
    if data_for_length.empty:
        return None
    return (share_beta_num_groups(data_for_length['num_rules_total'].item(),
                                  data_for_length['num_rules'].item()),
            data_for_length['is_best'].item())


def share_betafix(
    num_bits: int, beta: int, data: pd.DataFrame
) -> Optional[Tuple[float, bool]]:
    # Same lookup as share_beta; used for the full-width (104-bit) columns.
    length_data = data.query(f'num_bits == {num_bits} and beta == {beta}')
    if length_data.empty:
        return None
    return (share_beta_num_groups(length_data['num_rules_total'].item(),
                                  length_data['num_rules'].item()),
            length_data['is_best'].item())

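# LaTeX helpers: print_line_23h emits one table row per classifier (total rule
# count plus the covered share for each chosen bit width and beta), while
# print_table_header_23h / print_table_footer emit the surrounding tabular
# markup.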
def print_line_23h(mode: str, k: str, data: pd.DataFrame):
    my_data = query_data_for(data, mode, k, is_full_length=False)
    my_data_104 = query_data_for(data, mode, k, is_full_length=True)
    return ' & '.join(
        [k.split('.')[0], str(my_data['num_rules'].head(1).item())] +
        [to_string(share_beta(num_bits, bt, my_data))
         for num_bits in CHOSEN_L for bt in BETA_VALUES] +
        [to_string(share_betafix(FULL_BIT_LENGTH, bt, my_data_104))
         for bt in BETA_VALUES]
    ) + '\\\\'


def query_data_for(
    data: pd.DataFrame, mode: str, k: str, is_full_length: bool
) -> pd.DataFrame:
    if is_full_length:
        postoi = f'{mode}_postoi' if not mode.endswith('_postoi') else mode
        return data.query(f'cls_file == "{k}" & mode == "{postoi}"')
    return data.query(f'cls_file == "{k}" & mode == "{mode}"')

def print_table_header_23h():
    return dedent('''
        \\begin{{tabular}}{{|ll|{alignment}|}}
        \\hline
        & & {bit_widths} \\\\
        \\hline
        & Rules & {beta_values} \\\\
        \\hline
    ''').strip().format(
        alignment='|'.join(['c' * len(BETA_VALUES)] * (len(CHOSEN_L) + 1)),
        bit_widths=' & '.join(
            f'\\multicolumn{{{len(BETA_VALUES)}}}{{c|}}{{$l={y}$}}'
            for y in CHOSEN_L + [FULL_BIT_LENGTH]),
        beta_values=' & '.join(
            ['$\\beta=$' + ' & '.join([f'{bt:d}' for bt in BETA_VALUES])]
            * (len(CHOSEN_L) + 1))
    )


def print_table_footer():
    return dedent('''
        \\hline
        \\end{tabular}
    ''').strip()

if __name__ == '__main__':
    main()