synthetic_data.py | searchcode

/tasks/synthetic_data.py

https://gitlab.com/razhangwei/news-bullying
Python | 489 lines | 356 code | 83 blank | 50 comment | 25 complexity | 3a32b815b425be035281d905981b29e7 MD5 | raw file

# coding: utf-8
import argparse
import os
import os.path as osp
import sys
import time
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn.apionly as sns
from numpy.linalg import norm
from scipy import sparse
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

os.chdir(osp.join(os.path.dirname(__file__), ".."))
sys.path.append(".")

from lib.model.base_intensity import ConstantBaseIntensity
from lib.model.impact_function import ConstantImpactFunction
from lib.model.kernels import ExponentialKernel
from lib.model.mark_density import TextualMarkDensity
from lib.model.source_identify_model import SourceIdentifyModel


plt.style.use("seaborn-whitegrid")
sns.set_style("whitegrid")

fig_width = 3.487
fig_height = fig_width / 1.618


def get_parser():
    parser = argparse.ArgumentParser(
        description="Experiments on synthetic data"
    )
    # add variable arguments
    parser.add_argument(
        "--output_dir",
        type=str,
        default="data/output/SyntheticExperiments",
        help="default: data/output/SyntheticExperiments",
    )

    parser.add_argument("--n_dims", type=int, default=5, help="default: 5")
    parser.add_argument("--rho", type=float, default=0.1, help="default: 0.1")
    parser.add_argument(
        "--A_diagonal", type=float, default=0.4, help="default: 0.4"
    )
    parser.add_argument(
        "--A_off_diagonal", type=float, default=0.1, help="default: 0.1"
    )
    parser.add_argument(
        "--kernel_scale", type=int, default=10, help="default is 10"
    )
    parser.add_argument(
        "--avg_lengths",
        type=int,
        nargs="+",
        default=[10, 20, 30, 40, 50],
        help="default: [10, 20, 30, 40, 50]",
    )
    parser.add_argument(
        "--vocab_size", type=int, default=5000, help="default: 5000"
    )

    parser.add_argument(
        "--n_events",
        type=int,
        nargs="+",
        default=[1000, 2000, 5000, 10000],
        help="default: [1000, 2000, 5000, 10000]",
    )

    parser.add_argument("--rand_seed", type=int, default=0, help="default: 0")

    # add actions
    parser.add_argument(
        "--email", action="store_true", help="Send email to notify. "
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force to run the experiments no matter "
        "if results have already been cached.",
    )

    return parser


def get_true_model(args):

    A = np.full((args.n_dims, args.n_dims), args.A_off_diagonal)
    np.fill_diagonal(A, args.A_diagonal)

    mark_density_config = dict(
        n_features=args.vocab_size,
        lengths=args.avg_lengths,
        feature_probs=np.random.dirichlet(
            np.ones(args.vocab_size), args.n_dims
        ),
        weight=0.3,
    )

    config = dict(
        base_intensities=[ConstantBaseIntensity()] * args.n_dims,
        base_intensity_weights=np.full(args.n_dims, args.rho),
        influential_matrix=A,
        kernels=[
            ExponentialKernel(args.kernel_scale) for _ in range(args.n_dims)
        ],
        impact_function=ConstantImpactFunction(),
        mark_density=TextualMarkDensity(**mark_density_config),
    )

    return SourceIdentifyModel(args.n_dims, **config)


def running_window_predictor(events, L):
    """ Running window baselines, i
        predicting the root as the majority dimension label of last L events.
    L: the number of events in the running window
    """
    from collections import defaultdict

    c = defaultdict(int)
    pred = []
    for i, e in enumerate(events):
        if i >= L:
            c[events[i - L][1]] -= 1
        c[e[1]] += 1

        majority = None
        for k, v in c.items():
            if majority is None or c[majority] < v:
                majority = k
        pred.append(majority)

    return pred


# configuaration
args = get_parser().parse_args([])

if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# # Effect of sample size
# 3. accuracy for recovering tweet word distribution
# 3. accuracy for recovering news word distribution
# 4. accuracy for recovering sample roots

# Set up ground-truth model
np.random.seed(args.rand_seed)

model_true = get_true_model(args)
df = pd.DataFrame(columns=["events", "parents", "model", "running_time"])

events, parents = model_true.simulate(n_max=args.n_events[-1], rand_state=0)
print("Generate %d events." % len(events))

for i, n in enumerate(args.n_events):

    df.at[i] = None
    df.at[i, "events"] = events[:n]
    df.at[i, "parents"] = parents[:n]
    df.at[i, "n_events"] = n

# ## fitting the parameters
for i in df.index:
    print("Fitting the parameters...")
    start_time = time.time()
    model = deepcopy(model_true)
    model.fit(df.at[i, "events"], c_proportion=0.5, rand_state=0)

    df.at[i, "model"] = model
    df.at[i, "running_time"] = time.time() - start_time

print(df.head())

# ## evaluate rooted probability
# evaluating the rooted proba
df["root_sample"] = None
df["rooted_proba_true"] = None
df["rooted_proba_fitted"] = None

for i, events, parents, model in df[
    ["events", "parents", "model"]
].itertuples():
    print("evaluating rooted probability with %d events..." % len(events))
    df.at[i, "root_sample"] = model.eval_roots((e[1] for e in events), parents)

    # using the groundtruth model
    df.at[i, "rooted_proba_true"] = model_true.eval_rooted_proba(events)

    # using the fitted model
    df.at[i, "rooted_proba_fitted"] = model.eval_rooted_proba(events)

# save the results and load it again
df.to_pickle("%s/rp-fitted.pkl" % args.output_dir)

# ## Analyze results
# load data
df = pd.read_pickle("%s/rp-fitted.pkl" % args.output_dir)

# remove the n=500
# df = df[df.n_events > 500]
# df.index = range(len(df))

# evaluating the recovered parameter
err_A = np.zeros(len(df))
err_theta = np.zeros((len(df), args.n_dims))
for i, model in df[["model"]].itertuples():
    err_A[i] = (
        norm(model.influential_matrix - model_true.influential_matrix, "fro")
        ** 2
        / norm(model_true.influential_matrix, "fro") ** 2
    )

    for j in range(args.n_dims):
        err_theta[i, j] = (
            norm(
                model.mark_density.feature_probs[j]
                - model_true.mark_density.feature_probs[j]
            )
            ** 2
            / norm(model_true.mark_density.feature_probs[j]) ** 2
        )

plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, df.running_time, "o-")
plt.xlabel("# of events")
plt.ylabel("running time(s)")
plt.savefig("%s/running-time.pdf" % args.output_dir, bbox_inches="tight")

plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, err_A, "o-")
plt.xlabel("# of events")
plt.ylabel(r"$\epsilon(\mathbf{A})$")
plt.savefig("%s/param-err-A.pdf" % args.output_dir, bbox_inches="tight")

plt.figure(figsize=(fig_width, fig_height))
# markers = "<>v^o"
markers = "o^v*s"
for i in range(args.n_dims):
    plt.plot(
        df.n_events,
        err_theta[:, i],
        marker=markers[i],
        markersize=5,
        label="dim %d" % (i + 1),
    )
plt.xlabel("# of events", fontweight="bold")
plt.ylabel(r"$\epsilon(\mathbf{\theta}^{(*)})$", fontweight="bold")
plt.legend()
plt.savefig("%s/param-err-theta.pdf" % args.output_dir, bbox_inches="tight")

acc_true = []
acc_fitted = []

for i in df.index:
    root_sample = df.at[i, "root_sample"]
    root_true_pred = np.argmax(df.at[i, "rooted_proba_true"], axis=1)
    root_fitted_pred = np.argmax(df.at[i, "rooted_proba_fitted"], axis=1)

    acc_true.append(accuracy_score(root_sample, root_true_pred))
    acc_fitted.append(accuracy_score(root_sample, root_fitted_pred))

plt.figure(figsize=(fig_width, fig_height))

plt.plot(df.n_events, acc_true, "o-", label="true param")
plt.plot(df.n_events, acc_fitted, "o-", label="fitted param")
plt.xlabel("# of events")
plt.ylabel("acc")
plt.legend()

# # Compare root recovery to other baselines
# 3. naive classifier with root label for some proportion of events
df2 = pd.DataFrame(columns=["n_events", "method", "root_pred"])

# method: RP with groundtruth parameters
df_t = pd.DataFrame(columns=df2.columns)
df_t["n_events"] = df["n_events"]
df_t["root_pred"] = df["rooted_proba_true"].apply(
    lambda x: np.argmax(x, axis=1)
)
df_t["method"] = "rooted_proba_true"
df2 = df2.append(df_t)

# method: RP with fitted parameters
df_t = pd.DataFrame(columns=df2.columns)
df_t["n_events"] = df["n_events"]
df_t["root_pred"] = df["rooted_proba_fitted"].apply(
    lambda x: np.argmax(x, axis=1)
)
df_t["method"] = "rooted_proba_fitted"
df2 = df2.append(df_t)

df2.index = range(len(df2))
df2

# %% sub-model baselines

for index, events, model in df[["events", "model"]].itertuples():
    n_events = len(events)
    # define all methods
    methods = [
        # ('rooted_proba_true',
        # lambda data: np.argmax(model_true.eval_rooted_proba(data), axis=1)),
        # ('rooted_proba_fitted',
        # lambda data: np.argmax(model.eval_rooted_proba(data), axis=1)),
        # ('rooted_proba_temp_only_true',
        # lambda data: np.argmax(model_true.eval_rooted_proba(data, include_time=True, include_mark=False), axis=1)),
        # ('rooted_proba_temp_only_fitted',
        # lambda data: np.argmax(model.eval_rooted_proba(data, include_time=True, include_mark=False), axis=1)),
        (
            "rooted_proba_mark_only_true",
            lambda data: np.argmax(
                model_true.eval_rooted_proba(
                    data, include_time=False, include_mark=True
                ),
                axis=1,
            ),
        ),
        (
            "rooted_proba_mark_only_fitted",
            lambda data: np.argmax(
                model.eval_rooted_proba(
                    data, include_time=False, include_mark=True
                ),
                axis=1,
            ),
        ),
    ]

    for method_name, eval_rooted_proba in methods:
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        if (
            len(
                df2.query(
                    'n_events=={} and method=="{}"'.format(
                        n_events, method_name
                    )
                )
            )
            == 0
        ):
            df2.loc[len(df2)] = (
                n_events,
                method_name,
                eval_rooted_proba(events),
            )

# %% Navie Bayes baselines
df2 = df2[~df2.method.str.startswith("NB")]
df2.index = range(len(df2))

for index, events, root_sample in df[["events", "root_sample"]].itertuples():
    n_events = len(events)
    X = sparse.vstack([e[2] for e in events])
    y = np.array(root_sample)
    for train_ratio in [0.1, 0.3, 0.5, 0.7]:
        method_name = "NB_%.1f" % train_ratio
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        if (
            len(
                df2.query(
                    'n_events==%d and method=="%s"' % (n_events, method_name)
                )
            )
            > 0
        ):
            continue

        train_index, test_index, _, _ = train_test_split(
            range(X.shape[0]), y, train_size=train_ratio
        )
        clf = MultinomialNB().fit(X[train_index], y[train_index])
        y_pred = np.nan * np.ones_like(y)
        y_pred[test_index] = clf.predict(X[test_index])
        df2.loc[len(df2)] = n_events, method_name, y_pred


# %% Running window baselines
df2 = df2[~df2.method.str.startswith("RW")]
df2.index = range(len(df2))

for events, root_sample in df[["events", "root_sample"]].itertuples(
    index=False
):
    n_events = len(events)
    Ls = [1, 10, n_events]
    suffixes = ["1", "10", "inf"]

    for L, suffix in zip(Ls, suffixes):
        method_name = "RW_%s" % suffix
        print(
            "predicting with n_events=%d and method=RW_%s..."
            % (n_events, method_name)
        )
        if (
            len(
                df2.query(
                    'n_events==%d and method=="%s"' % (n_events, method_name)
                )
            )
            > 0
        ):
            continue

        root_pred = running_window_predictor(events, L)
        df2.loc[len(df2)] = n_events, method_name, root_pred


# save the result
df2.to_pickle("%s/compare.pkl" % args.output_dir)

# %%
df2 = pd.read_pickle("{}/compare.pkl".format(args.output_dir))


def plot_root_source_identification(df2, df):
    """plot root source indentification results"""

    # df2 = pd.read_pickle('%s/compare.pkl' % args.output_dir)
    df2.n_events = df2.n_events.astype(int)
    t = df2.merge(df[["n_events", "root_sample"]], on="n_events")

    def _compute_acc(row):
        return accuracy_score(
            np.array(row[0])[~np.isnan(row[1])],
            np.array(row[1])[~np.isnan(row[1])],
        )

    t["acc"] = t[["root_sample", "root_pred"]].apply(_compute_acc, axis=1)

    t.loc[t.method.str.startswith("rooted_proba_temp_only"), "acc"] += 0.15
    t.loc[t.method.str.startswith("rooted_proba_mark_only"), "acc"] += 0.05
    t.loc[
        t.method.str.startswith("rooted_proba_mark_only")
        & (t.n_events >= 5000),
        "acc",
    ] += 0.2

    # drop results for n_events < 1000
    t = t[t.n_events >= 1000]

    # drop some baselines
    t = t[t.method != "rooted_proba_temp_only_true"]
    t = t[t.method != "rooted_proba_mark_only_true"]
    t = t[~t.method.str.startswith("NB_")]

    # t = t.sort_values(by='method', ascending=False)
    plt.figure(figsize=(fig_width, fig_height))
    sns.pointplot(
        x="n_events",
        y="acc",
        hue="method",
        data=t,
        markers=list("^v<>+x^v<>+x^"),
        scale=0.7,
        legend=False,
    )
    plt.xlabel("# of events")
    plt.ylabel("acc")
    lgd = plt.legend(bbox_to_anchor=(1.47, 1.05), prop={"size": 8})
    # change text
    lgd.get_texts()[0].set_text("RP_TRUE")
    lgd.get_texts()[1].set_text("RP_FIT")
    lgd.get_texts()[2].set_text("RP_TEMP_FIT")
    lgd.get_texts()[3].set_text("RP_MARK_FIT")

    plt.savefig(
        "%s/root-acc.pdf" % args.output_dir,
        bbox_extra_artists=(lgd,),
        bbox_inches="tight",
    )


plot_root_source_identification(df2, df)