# misc stuff for shortening notebooks

import numpy as np
import pandas as pd


def infer_length_column(base_col_name, dataframe, args=None):
    """Return the name of the best available token-length column for `base_col_name`.

    In order of preference: the count computed at detection time is ideal,
    denoted `_num_tokens_scored`; otherwise, for the outputs it's the
    generation-time token count, and for the baseline it's the initial count
    based on tokenization and slicing, both now called `_length`.
    """
    if args is not None and args.ignore_repeated_ngrams:
        # If we're ignoring repeated ngrams, we need to use the length column,
        # since the num_tokens_scored column will be wrong/short. This isn't a
        # perfect solution, because there can be retokenization differences.
        col_suffixes = ["_length"]
    else:
        col_suffixes = ["_num_tokens_scored", "_length"]
    for suf in col_suffixes:
        length_column_name = f"{base_col_name}{suf}"
        if length_column_name in dataframe.columns:
            return length_column_name
    raise ValueError(
        f"Could not find length column for {base_col_name}. "
        f"Note, the `_num_tokens_generated` suffix is deprecated in favor of `_length`."
    )


def filter_text_col_length(
    df, text_col_name=None, count_suffix="_num_tokens_scored", upper_T=205, lower_T=195
):
    """Keep only rows whose token count for `text_col_name` lies in [lower_T, upper_T]."""
    assert text_col_name is not None
    text_col_prefix = text_col_name
    text_col_name = text_col_prefix + count_suffix

    # length filtering
    orig_len = len(df)
    df = df[df[text_col_name] >= lower_T]
    df = df[df[text_col_name] <= upper_T]
    print(f"Dropped {orig_len - len(df)} rows filtering {text_col_prefix}, new len {len(df)}")
    return df


def mega_filter(df):
    """Apply the full filtering/renaming pipeline used to clean a raw results dataframe."""
    # Report rows that are hard-blacklisted and measurable, but still have a
    # non-100% whitelist fraction (a symptom of retokenization issues).
    retok_problematic_rows = df[
        (df["w_bl_whitelist_fraction"] != -1.0)
        & (df["w_bl_whitelist_fraction"] != 1.0)
        & (df["bl_type"] == "hard")
    ]
    print(
        f"Num rows that are hard-blacklisted, and measurable, but still have a "
        f"non-100% WL fraction: {len(retok_problematic_rows)} "
        f"out of {len(df[df['bl_type'] == 'hard'])}"
    )

    # drop special rows marked as -1.0
    orig_len = len(df)
    df = df[df["no_bl_whitelist_fraction"] != -1.0]
    df = df[df["w_bl_whitelist_fraction"] != -1.0]
    print(f"Dropped {orig_len - len(df)} rows, new len {len(df)}")

    # drop rows with too few tokens, i.e. rows where no perplexity was recorded
    orig_len = len(df)
    df = df[~(df["no_bl_ppl"].isna() | df["w_bl_ppl"].isna())]
    print(f"Dropped {orig_len - len(df)} rows, new len {len(df)}")

    # drop huge biases
    orig_len = len(df)
    df = df[df["bl_logit_bias"] <= 100.0]
    print(f"Dropped {orig_len - len(df)} rows, new len {len(df)}")

    # keep sampling runs with a single beam, plus all non-sampling runs
    orig_len = len(df)
    df = df[
        ((df["use_sampling"] == True) & (df["num_beams"] == 1))
        | (df["use_sampling"] == False)
    ]
    print(f"Dropped {orig_len - len(df)} rows, new len {len(df)}")

    # correct the sampling temp: 0.0 for greedy/beam search, 1.0 for sampling
    df.loc[df["use_sampling"] == False, "sampling_temp"] = df.loc[
        df["use_sampling"] == False, "sampling_temp"
    ].fillna(0.0)
    df.loc[df["use_sampling"] == True, "sampling_temp"] = df.loc[
        df["use_sampling"] == True, "sampling_temp"
    ].fillna(1.0)

    # set the bias to inf for the hard blacklist; a finite stand-in (e.g. 10000)
    # would need to crosscheck with what's hardcoded in the bl processor
    df.loc[df["bl_type"] == "hard", "bl_logit_bias"] = np.inf

    # rename some stuff
    df["delta"] = df["bl_logit_bias"].values
    df["gamma"] = 1 - df["bl_proportion"].values
    df["gamma"] = df["gamma"].round(3)

    df["no_bl_act_num_wl_tokens"] = np.round(
        df["no_bl_whitelist_fraction"].values * df["no_bl_num_tokens_generated"], 1
    )  # round to 1 decimal for sanity
    df["w_bl_act_num_wl_tokens"] = np.round(
        df["w_bl_whitelist_fraction"].values * df["w_bl_num_tokens_generated"], 1
    )  # round to 1 decimal for sanity
    df["w_bl_std_num_wl_tokens"] = np.sqrt(df["w_bl_var_num_wl_tokens"].values)

    if "real_completion_length" in df.columns:
        df["baseline_num_tokens_generated"] = df["real_completion_length"].values
    if "actual_attacked_ratio" in df.columns:
        df["actual_attacked_fraction"] = (
            df["actual_attacked_ratio"].values * df["replace_ratio"].values
        )
    if "meta" in df.columns:
        df["pile_set_name"] = df["meta"].apply(lambda d: d["pile_set_name"])

    df["baseline_hit_list_length"] = df["baseline_hit_list"].apply(len)
    df["no_bl_hit_list_length"] = df["no_bl_hit_list"].apply(len)
    df["w_bl_hit_list_length"] = df["w_bl_hit_list"].apply(len)

    # space fractions, for pile outlier filtering
    df["w_bl_space_count"] = df["w_bl_output"].apply(lambda string: string.count(" "))
    df["no_bl_space_count"] = df["no_bl_output"].apply(lambda string: string.count(" "))
    df["baseline_space_count"] = df["baseline_completion"].apply(lambda string: string.count(" "))

    df["w_bl_space_frac"] = df["w_bl_space_count"].values / df["w_bl_hit_list_length"]
    df["no_bl_space_frac"] = df["no_bl_space_count"].values / df["no_bl_hit_list_length"]
    df["baseline_space_frac"] = df["baseline_space_count"].values / df["baseline_hit_list_length"]

    # final length filtering
    orig_len = len(df)
    upper_T = 205
    lower_T = 195
    df = df[
        (df["baseline_hit_list_length"] >= lower_T)
        & (df["no_bl_hit_list_length"] >= lower_T)
        & (df["w_bl_hit_list_length"] >= lower_T)
    ]  # now also applies to the truncated version
    df = df[
        (df["baseline_hit_list_length"] <= upper_T)
        & (df["no_bl_hit_list_length"] <= upper_T)
        & (df["w_bl_hit_list_length"] <= upper_T)
    ]  # now also applies to the truncated version
    print(f"Dropped {orig_len - len(df)} rows, new len {len(df)}")

    return df
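if __name__ == "__main__":
    # Minimal smoke test / usage sketch for infer_length_column. The toy frame,
    # its column values, and the use of SimpleNamespace as a stand-in for the
    # real `args` object are illustrative assumptions, not part of the pipeline.
    from types import SimpleNamespace

    toy = pd.DataFrame(
        {
            "w_bl_output_num_tokens_scored": [200],
            "w_bl_output_length": [201],
        }
    )
    # The detection-time count is preferred when present...
    assert infer_length_column("w_bl_output", toy) == "w_bl_output_num_tokens_scored"
    # ...but `_length` is used when repeated ngrams are ignored.
    args = SimpleNamespace(ignore_repeated_ngrams=True)
    assert infer_length_column("w_bl_output", toy, args=args) == "w_bl_output_length"
    print("infer_length_column smoke test passed")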