
/testing/misc.py

https://gitlab.com/pschuprikov/lpm
#!/usr/bin/env python3
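"""Post-process experiment results: read data.tsv from the working
directory and write TSV summaries (perctable.tsv, perc_<perc>.tsv,
perc_<classifier>.tsv) into --output-dir (default: "output")."""
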
from textwrap import dedent
import math
from typing import List, Optional, Tuple, Union, Sequence, NamedTuple
from sys import stderr
import os
import os.path as path

import click
import pandas as pd
import numpy as np

HEURISTICS = (
    'lpm_inc_act_postoi',
    'oi_act',
    'oi_lpm_joint_act',
    'oi_exact_act',
)

PLOT_HEURISTICS = (
    'lpm_bounded_inc_act_postoi',
    'lpm_bounded_max_step_act_postoi',
    'lpm_bounded_max_oi_act_postoi',
    'lpm_bounded_max_lpm_act_postoi',
    'oi_bound_32_64',
    'oi',
)

HEURISTICS_NOT_SUPPORTING_FULL_WIDTH = (
    'oi_bound_32_64',
    'oi',
)

FULL_BIT_LENGTH = 104
CHOSEN_L: List[int] = [32]
BETA_VALUES = [5, 10, 20]
PERC_VALUES = [95, 99, 100]

@click.command()
@click.option('--output-dir', type=str, default='output',
              help='Output directory')
def main(output_dir):
    do_main(output_dir)

def do_main(output_dir: str):
    if not path.exists(output_dir):
        print(f'creating "{output_dir}" directory')
        os.makedirs(output_dir, exist_ok=True)
    data = load_data()
    classifiers: Sequence[str] = sorted(set(data.data_with_beta['cls_file']))
    generate_perc_table(classifiers, data.data_with_perc, output_dir, [
        'lpm_inc_act_postoi',
        'oi_act',
        'oi_lpm_joint_act',
        'oi_exact_act',
    ])
    generate_perc_plots(classifiers, data.data_with_perc, output_dir, [
        'lpm_inc_act_postoi',
        'oi_act',
        'oi_lpm_joint_act',
        'oi_exact_act',
        'lpm_bounded',
    ])
    # generate_tables(classifiers, data.data_with_beta, output_dir)
    # generate_plots(classifiers, data.data_with_beta, output_dir)

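# Beta-CDF plot tables, one per classifier (currently disabled; see the
# commented-out calls at the end of do_main).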
def generate_plots(classifiers, data: pd.DataFrame, output_dir):
    for k in classifiers:
        generate_plot_for(data, k, output_dir,
                          bit_widths=[32, FULL_BIT_LENGTH])

def generate_plot_for(data, k, output_dir, bit_widths: Sequence[int]):
    plot_data = pd.DataFrame({'beta': np.arange(0, max(BETA_VALUES) + 1)})
    plot_path = path.join(output_dir, f'plot_{drop_extension(k)}.tsv')
    print(f'generating "{plot_path}"')
    for heuristic in PLOT_HEURISTICS:
        for bit_width in bit_widths:
            if not is_valid_bit_width(heuristic, bit_width):
                continue
            heuristic_data = extract_cdf_data(data, bit_width, heuristic, k)
            plot_data = merge_cdf_data(heuristic_data, plot_data)
    plot_data.to_csv(plot_path, sep='\t', index=False)

def is_valid_bit_width(heuristic: str, bit_width: int) -> bool:
    return (bit_width != FULL_BIT_LENGTH
            or heuristic not in HEURISTICS_NOT_SUPPORTING_FULL_WIDTH)

def merge_cdf_data(
        heuristic_data: pd.DataFrame, plot_data: pd.DataFrame
) -> pd.DataFrame:
    return plot_data.merge(
        heuristic_data, how='left', on='beta', validate='1:1')

def extract_cdf_data(
        data: pd.DataFrame, bit_width: int, heuristic: str, k: str
) -> pd.DataFrame:
    return query_data_for(
        data, heuristic, k, is_full_length=False
    ).query(f'num_bits == {bit_width}').filter(
        ['beta', 'num_rules_perc'], axis='columns'
    ).rename(columns={'num_rules_perc': f'{heuristic}_{bit_width}'})

def drop_extension(filepath: str) -> str:
    return path.splitext(filepath)[0]

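# One row per classifier; each <heuristic>_<perc> column holds the number of
# groups recorded for that percentile at num_bits == 32.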
def generate_perc_table(
        classifiers: Sequence[str],
        data: pd.DataFrame,
        output_dir: str,
        heuristics: Sequence[str],
):
    table_path = path.join(output_dir, 'perctable.tsv')
    print(f'generating "{table_path}"')
    result = []
    for i, cls in enumerate(classifiers):
        entry = pd.DataFrame({'classifier': [drop_extension(cls)]}, index=[i])
        for heuristic in heuristics:
            for perc in PERC_VALUES:
                sub_data = data.query(
                    f'num_bits == 32 and perc == {perc}'
                    f' and mode == "{heuristic}" and cls_file == "{cls}"')
                if sub_data.empty:
                    raise RuntimeError(f"Couldn't find data for {heuristic}")
                entry[f'{heuristic}_{perc}'] = sub_data['num_groups'].item()
        result.append(entry)
    data = pd.concat(result, axis='index')
    data.to_csv(table_path, sep='\t', index=False)

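# Two families of TSVs: per-percentile tables (one row per classifier) and
# per-classifier tables (one row per percentile); combinations missing from
# the data end up as inf.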
def generate_perc_plots(
        classifiers: Sequence[str],
        data: pd.DataFrame,
        output_dir: str,
        heuristics: Sequence[str],
):
    for perc in PERC_VALUES:
        table_path = path.join(output_dir, f'perc_{perc}.tsv')
        print(f'generating "{table_path}"')
        df = pd.DataFrame({'cls_file': classifiers})
        for heuristic in heuristics:
            for num_bits in CHOSEN_L + [FULL_BIT_LENGTH]:
                df = df.merge(
                    data.query(
                        f'num_bits == {num_bits} and mode == "{heuristic}"'
                        f' and perc == {perc}'
                    ).filter(
                        ['num_groups', 'cls_file'], axis='columns'
                    ).rename(
                        columns={'num_groups': f'{heuristic}_{num_bits}'}),
                    how='left',
                    on='cls_file'
                )
        df = df.assign(
            cls_file=lambda x: [drop_extension(c) for c in x['cls_file']])
        df.fillna(float('inf'), inplace=True)
        df.to_csv(table_path, sep='\t', index=False)
        with pd.option_context(
                'display.max_rows', None, 'display.max_columns', None):
            print(df.describe())
    for cls in classifiers:
        cls_name = drop_extension(cls)
        table_path = path.join(output_dir, f'perc_{cls_name}.tsv')
        print(f'generating "{table_path}"')
        df = pd.DataFrame({'perc': PERC_VALUES})
        for heuristic in heuristics:
            for num_bits in CHOSEN_L + [FULL_BIT_LENGTH]:
                df = df.merge(
                    data.query(
                        f'num_bits == {num_bits} and mode == "{heuristic}"'
                        f' and cls_file == "{cls}"'
                    ).filter(
                        ['num_groups', 'perc'], axis='columns'
                    ).rename(
                        columns={'num_groups': f'{heuristic}_{num_bits}'}),
                    how='left',
                    on='perc'
                )
        df.fillna(float('inf'), inplace=True)
        df.to_csv(table_path, sep='\t', index=False)

def generate_tables(classifiers, data: pd.DataFrame, output_dir):
    for heuristic in HEURISTICS:
        generate_table_for(classifiers, data, heuristic, output_dir)

def generate_table_for(classifiers, data, heuristic, output_dir):
    table_path = path.join(output_dir, f'bigtable_{heuristic}.tex')
    print(f'generating "{table_path}"')
    with open(table_path, 'w') as outf:
        outf.write(print_table_header_23h() + '\n')
        for k in classifiers:
            outf.write(print_line_23h(heuristic, k, data) + '\n')
        outf.write(print_table_footer() + '\n')

class Data(NamedTuple):
    data_with_beta: pd.DataFrame
    data_with_perc: pd.DataFrame

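# Build the two data views used by do_main: a beta-indexed view (rules covered
# by the largest beta groups) and a percentile-indexed view (groups needed to
# reach each percentile), plus inferred percentiles for the lpm_bounded mode.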
def load_data():
    raw_input = process_file_raw('data.tsv')
    data_exploded = explode_groups_for_last(raw_input)
    data_with_beta = replace_groups_with_beta(data_exploded, max(BETA_VALUES))
    data_with_perc = replace_groups_with_perc(data_exploded)
    lpm_bounded_perc = infer_group_perc(raw_input, 'lpm_bounded', 'lpm')
    data_with_beta = data_with_beta[
        data_with_beta['mode'].isin(HEURISTICS + PLOT_HEURISTICS)]
    data_with_perc = data_with_perc[data_with_perc['mode'].isin(HEURISTICS)]
    data_with_perc = pd.concat(
        [data_with_perc, lpm_bounded_perc], axis='index')
    data_with_beta = add_is_best(data_with_beta)
    assert (data_with_beta['num_groups']
            == data_with_beta['num_groups_real']).all()
    return Data(data_with_beta=data_with_beta, data_with_perc=data_with_perc)

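# Parse the headerless, tab-separated results file; the 'groups' field is a
# bracketed, comma-separated list of group sizes, and the placeholder columns
# ('A?', 'B?', ...) are dropped.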
def process_file_raw(fname: str, exclude=()):
    data = pd.read_csv(
        fname, header=None, sep='\t',
        names=['mode', 'cls_file', 'A?', 'num_rules', 'oi_algo', 'num_bits',
               'B?', 'num_groups', 'leftover', 'groups', 'C?', 'D?'],
        converters={
            'groups': lambda x: [int(x) for x in x[1:-1].split(',')]
        })
    data = data.drop(columns=['A?', 'B?', 'C?', 'oi_algo', 'D?'])
    return data[~data['mode'].isin(exclude)]

def explode_groups_for_last(data: pd.DataFrame):
    return data.groupby(
        by=['mode', 'cls_file', 'num_bits'], as_index=False
    ).last().explode(
        'groups'
    ).rename(columns={'groups': 'group_size'}).assign(
        group_size=lambda x: pd.to_numeric(x['group_size'])
    )

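# For each (mode, cls_file, num_bits), sort group sizes in descending order
# and compute, for every beta, the cumulative number (and percentage) of rules
# covered by the beta largest groups.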
def replace_groups_with_beta(
        data: pd.DataFrame, max_beta: int
) -> pd.DataFrame:
    def convert_to_beta(x: pd.DataFrame):
        groups = x['group_size'].sort_values(ascending=False)
        cur_max_beta = max(max_beta, len(groups))
        groups_padded = np.concatenate(
            [np.zeros(1), groups, np.zeros(cur_max_beta - len(groups))])
        converted = pd.DataFrame({
            'beta': np.arange(start=0, stop=len(groups_padded)),
            'num_rules': groups_padded.cumsum(),
        }).assign(
            num_rules_total=x['num_rules'].head(1).item(),
            num_groups_real=len(x),
            num_groups=x['num_groups'].head(1).item(),
        ).assign(
            num_rules_perc=lambda x:
                (100 * x['num_rules'] / x['num_rules_total']).round(1)
        )
        return converted

    return data.groupby(
        ['mode', 'cls_file', 'num_bits'],
    ).apply(convert_to_beta).reset_index(3, drop=True).reset_index()

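# Smallest number of groups whose leftover drops to at most (100 - p)% of the
# rules; returns None (with a warning on stderr) when the runs do not bracket
# that minimum exactly.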
def calculate_inferred_num_groups(x: pd.DataFrame, p):
    num_rules_total = x['num_rules'].head(1).item()
    min_groups_enough = x['num_groups'][
        x['leftover'] <= num_rules_total * (1 - 0.01 * p)].min()
    max_groups_not_enough = x['num_groups'][
        x['leftover'] > num_rules_total * (1 - 0.01 * p)].max()
    if math.isnan(min_groups_enough):
        print(f'Upper bound is not reached for {x["cls_file"].iloc[0]}'
              f' and p = {p}', file=stderr)
        return None
    if math.isnan(max_groups_not_enough):
        print(f'Lower bound is not reached for {x["cls_file"].iloc[0]}'
              f' and p = {p}', file=stderr)
        return None
    if min_groups_enough == max_groups_not_enough + 1 or p == 100:
        return min_groups_enough
    print(f'Exact minimum between {max_groups_not_enough} and'
          f' {min_groups_enough} is not found for'
          f' {x["cls_file"].iloc[0]} and p = {p}', file=stderr)
    return None

def infer_group_perc(
        data: pd.DataFrame, mode: str, full_mode: str
) -> pd.DataFrame:
    def convert_to_percentiles(x: pd.DataFrame):
        converted = pd.DataFrame({
            'perc': PERC_VALUES,
            'num_groups': [
                calculate_inferred_num_groups(x, p)
                for p in PERC_VALUES
            ]
        })
        return converted

    return data.query(
        f'mode == "{mode}" or mode == "{full_mode}"'
    ).assign(mode=mode).groupby(
        ['mode', 'cls_file', 'num_bits'],
    ).apply(convert_to_percentiles).reset_index(3, drop=True).reset_index()

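# For each (mode, cls_file, num_bits), count how many of the largest groups
# are needed before their cumulative size reaches p% of the rules.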
def replace_groups_with_perc(
        data: pd.DataFrame
) -> pd.DataFrame:
    def convert_to_percentiles(x: pd.DataFrame):
        groups = x['group_size'].sort_values(ascending=False).cumsum()
        num_rules_total = x['num_rules'].head(1).item()
        converted = pd.DataFrame({
            'perc': PERC_VALUES,
            'num_groups': [
                (groups < num_rules_total * (0.01 * p)).sum() + 1
                for p in PERC_VALUES
            ]
        })
        return converted

    return data.groupby(
        ['mode', 'cls_file', 'num_bits'],
    ).apply(convert_to_percentiles).reset_index(3, drop=True).reset_index()

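# Within each (cls_file, num_bits, beta) group, flag the rows (i.e. modes)
# whose num_rules_perc attains the maximum.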
def add_is_best(data: pd.DataFrame) -> pd.DataFrame:
    return data.groupby(by=['cls_file', 'num_bits', 'beta']).apply(
        lambda x: x.assign(
            is_best=(x['num_rules_perc'] == x['num_rules_perc'].max()))
    ).reset_index(drop=True)

def to_string(x: Optional[Union[float, int, Tuple[float, bool]]]) -> str:
    if x is None:
        return '---'
    if isinstance(x, float):
        return f'{x:.1f}'
    # Handle plain ints before taking len(), which would fail on an int.
    if isinstance(x, int):
        return str(int(x))
    if len(x) == 2:
        value, best = x
        if best:
            return f'\\textbf{{{value:.1f}}}'
        return f'{value:.1f}'
    raise AssertionError

def share_beta_num_groups(total, groups) -> float:
    return 100.0 * groups / float(total)

def share_beta(
        num_bits: int, beta: int, data: pd.DataFrame
) -> Optional[Tuple[float, bool]]:
    data_for_length = data.query(f'num_bits == {num_bits} and beta == {beta}')
    if data_for_length.empty:
        return None
    return (share_beta_num_groups(data_for_length['num_rules_total'].item(),
                                  data_for_length['num_rules'].item()),
            data_for_length['is_best'].item())

def share_betafix(
        num_bits: int, beta: int, data: pd.DataFrame
) -> Optional[Tuple[float, bool]]:
    length_data = data.query(f'num_bits == {num_bits} and beta == {beta}')
    if length_data.empty:
        return None
    return (share_beta_num_groups(length_data['num_rules_total'].item(),
                                  length_data['num_rules'].item()),
            length_data['is_best'].item())

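# One LaTeX table row: classifier name, total rule count, then the coverage
# share for every (bit width, beta) pair, set in bold where is_best holds.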
def print_line_23h(mode: str, k: str, data: pd.DataFrame):
    my_data = query_data_for(data, mode, k, is_full_length=False)
    my_data_104 = query_data_for(data, mode, k, is_full_length=True)
    return ' & '.join(
        [k.split('.')[0], str(my_data['num_rules'].head(1).item())] +
        [to_string(share_beta(num_bits, bt, my_data))
         for num_bits in CHOSEN_L for bt in BETA_VALUES] +
        [to_string(share_betafix(104, bt, my_data_104))
         for bt in BETA_VALUES]
    ) + '\\\\'

def query_data_for(
        data: pd.DataFrame, mode: str, k: str, is_full_length: bool
) -> pd.DataFrame:
    if is_full_length:
        postoi = mode if mode.endswith('_postoi') else f'{mode}_postoi'
        return data.query(f'cls_file == "{k}" & mode == "{postoi}"')
    return data.query(f'cls_file == "{k}" & mode == "{mode}"')

def print_table_header_23h():
    return dedent('''
        \\begin{{tabular}}{{|ll|{alignment}|}}
        \\hline
        & & {bit_widths} \\\\
        \\hline
        & Rules & {beta_values} \\\\
        \\hline
    ''').strip().format(
        alignment='|'.join(['c' * len(BETA_VALUES)] * (len(CHOSEN_L) + 1)),
        bit_widths=' & '.join(
            f'\\multicolumn{{{len(BETA_VALUES)}}}{{c|}}{{$l={y}$}}'
            for y in CHOSEN_L + [104]),
        beta_values=' & '.join(
            ['$\\beta=$' + ' & '.join([f'{bt:d}' for bt in BETA_VALUES])]
            * (len(CHOSEN_L) + 1))
    )

def print_table_footer():
    return dedent('''
        \\hline
        \\end{tabular}
    ''').strip()

if __name__ == '__main__':
    main()