# tasks/synthetic_data.py — experiments on synthetic data for the
# source-identification model (notebook-style script).
- # coding: utf-8
- import argparse
- import os
- import os.path as osp
- import sys
- import time
- from copy import deepcopy
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import seaborn.apionly as sns
- from numpy.linalg import norm
- from scipy import sparse
- from sklearn.metrics import accuracy_score
- from sklearn.model_selection import train_test_split
- from sklearn.naive_bayes import MultinomialNB
# Run relative to the repository root so data/ and lib/ paths resolve.
os.chdir(osp.join(os.path.dirname(__file__), ".."))
sys.path.append(".")
from lib.model.base_intensity import ConstantBaseIntensity
from lib.model.impact_function import ConstantImpactFunction
from lib.model.kernels import ExponentialKernel
from lib.model.mark_density import TextualMarkDensity
from lib.model.source_identify_model import SourceIdentifyModel

# NOTE(review): "seaborn-whitegrid" was renamed "seaborn-v0_8-whitegrid" in
# matplotlib >= 3.6, and seaborn.apionly was removed in seaborn 0.9 —
# confirm the pinned library versions before upgrading.
plt.style.use("seaborn-whitegrid")
sns.set_style("whitegrid")
# Single-column paper figure size (golden-ratio aspect).
fig_width = 3.487
fig_height = fig_width / 1.618
def get_parser():
    """Build the argument parser for the synthetic-data experiments.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing the experiment hyper-parameters plus two action
        flags (``--email``, ``--force``).
    """
    parser = argparse.ArgumentParser(
        description="Experiments on synthetic data"
    )
    # Value-taking options: (flag, keyword arguments for add_argument).
    option_specs = [
        (
            "--output_dir",
            dict(
                type=str,
                default="data/output/SyntheticExperiments",
                help="default: data/output/SyntheticExperiments",
            ),
        ),
        ("--n_dims", dict(type=int, default=5, help="default: 5")),
        ("--rho", dict(type=float, default=0.1, help="default: 0.1")),
        ("--A_diagonal", dict(type=float, default=0.4, help="default: 0.4")),
        (
            "--A_off_diagonal",
            dict(type=float, default=0.1, help="default: 0.1"),
        ),
        ("--kernel_scale", dict(type=int, default=10, help="default is 10")),
        (
            "--avg_lengths",
            dict(
                type=int,
                nargs="+",
                default=[10, 20, 30, 40, 50],
                help="default: [10, 20, 30, 40, 50]",
            ),
        ),
        ("--vocab_size", dict(type=int, default=5000, help="default: 5000")),
        (
            "--n_events",
            dict(
                type=int,
                nargs="+",
                default=[1000, 2000, 5000, 10000],
                help="default: [1000, 2000, 5000, 10000]",
            ),
        ),
        ("--rand_seed", dict(type=int, default=0, help="default: 0")),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    # Boolean action flags.
    parser.add_argument(
        "--email", action="store_true", help="Send email to notify. "
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force to run the experiments no matter "
        "if results have already been cached.",
    )
    return parser
def get_true_model(args):
    """Construct the ground-truth SourceIdentifyModel from parsed arguments.

    Parameters
    ----------
    args : argparse.Namespace
        Must carry ``n_dims``, ``A_diagonal``, ``A_off_diagonal``,
        ``vocab_size``, ``avg_lengths``, ``rho`` and ``kernel_scale``.

    Returns
    -------
    SourceIdentifyModel
        The simulator/ground-truth model used throughout the experiments.
    """
    # Influence matrix: A_diagonal on the diagonal (self-excitation),
    # A_off_diagonal everywhere else (cross-dimension excitation).
    A = np.full((args.n_dims, args.n_dims), args.A_off_diagonal)
    np.fill_diagonal(A, args.A_diagonal)
    mark_density_config = dict(
        n_features=args.vocab_size,
        lengths=args.avg_lengths,
        # One word distribution per dimension, each drawn from a symmetric
        # Dirichlet over the vocabulary (uses the module-level RNG state).
        feature_probs=np.random.dirichlet(
            np.ones(args.vocab_size), args.n_dims
        ),
        weight=0.3,
    )
    config = dict(
        # BUGFIX: the original `[ConstantBaseIntensity()] * n_dims` repeated
        # ONE shared instance; if fitting mutates per-dimension state, every
        # dimension would alias the same object. Build an independent
        # instance per dimension (consistent with the kernels list below).
        base_intensities=[
            ConstantBaseIntensity() for _ in range(args.n_dims)
        ],
        base_intensity_weights=np.full(args.n_dims, args.rho),
        influential_matrix=A,
        kernels=[
            ExponentialKernel(args.kernel_scale) for _ in range(args.n_dims)
        ],
        impact_function=ConstantImpactFunction(),
        mark_density=TextualMarkDensity(**mark_density_config),
    )
    return SourceIdentifyModel(args.n_dims, **config)
def running_window_predictor(events, L):
    """Running-window baseline: predict each event's root as the majority
    dimension label among the last ``L`` events (current event included).

    Parameters
    ----------
    events : sequence
        Event tuples; ``event[1]`` is the dimension label.
    L : int
        Number of events in the running window; must be >= 1.

    Returns
    -------
    list
        One predicted label per event. Ties are broken in favor of the
        label that first entered the counter.

    Raises
    ------
    ValueError
        If ``L`` is smaller than 1 (the original silently produced
        meaningless output in that case).
    """
    if L < 1:
        raise ValueError("window size L must be >= 1, got %r" % (L,))
    from collections import defaultdict

    counts = defaultdict(int)
    predictions = []
    for i, event in enumerate(events):
        # Slide the window: drop the label that falls out, add the new one.
        if i >= L:
            counts[events[i - L][1]] -= 1
        counts[event[1]] += 1
        # Majority vote with first-seen tie-breaking: the strict '<' keeps
        # the earlier key on ties, so do NOT replace this with
        # Counter.most_common (its tie order differs).
        majority = None
        for label, cnt in counts.items():
            if majority is None or counts[majority] < cnt:
                majority = label
        predictions.append(majority)
    return predictions
# Configuration. NOTE(review): parse_args([]) deliberately ignores the real
# command line (notebook-style execution); pass sys.argv[1:] to honor the
# CLI flags defined above.
args = get_parser().parse_args([])
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)
# # Effect of sample size
# 3. accuracy for recovering tweet word distribution
# 3. accuracy for recovering news word distribution
# 4. accuracy for recovering sample roots
# Set up the ground-truth model and simulate one event stream of the
# largest requested size; prefixes of it give the smaller sample sizes.
np.random.seed(args.rand_seed)
model_true = get_true_model(args)
df = pd.DataFrame(columns=["events", "parents", "model", "running_time"])
events, parents = model_true.simulate(n_max=args.n_events[-1], rand_state=0)
print("Generate %d events." % len(events))
# One df row per sample size n: the first n events and their parent links.
for i, n in enumerate(args.n_events):
    df.at[i] = None
    df.at[i, "events"] = events[:n]
    df.at[i, "parents"] = parents[:n]
    df.at[i, "n_events"] = n
# ## Fitting the parameters: fit one model per sample size, timing each fit.
for i in df.index:
    print("Fitting the parameters...")
    start_time = time.time()
    # NOTE(review): starting from deepcopy(model_true) presumably keeps the
    # fixed structure (kernels, impact function) while fit() re-estimates
    # the free parameters — confirm fit() actually resets them.
    model = deepcopy(model_true)
    model.fit(df.at[i, "events"], c_proportion=0.5, rand_state=0)
    df.at[i, "model"] = model
    df.at[i, "running_time"] = time.time() - start_time
print(df.head())
# ## Evaluate rooted probability under both true and fitted parameters.
df["root_sample"] = None
df["rooted_proba_true"] = None
df["rooted_proba_fitted"] = None
for i, events, parents, model in df[
    ["events", "parents", "model"]
].itertuples():
    print("evaluating rooted probability with %d events..." % len(events))
    # Ground-truth root label of every event, derived from the parent links.
    df.at[i, "root_sample"] = model.eval_roots((e[1] for e in events), parents)
    # using the ground-truth model
    df.at[i, "rooted_proba_true"] = model_true.eval_rooted_proba(events)
    # using the fitted model
    df.at[i, "rooted_proba_fitted"] = model.eval_rooted_proba(events)
# Cache the results so the analysis below can be re-run without refitting.
df.to_pickle("%s/rp-fitted.pkl" % args.output_dir)
# ## Analyze results
# Reload the cached results (allows re-running the analysis standalone).
df = pd.read_pickle("%s/rp-fitted.pkl" % args.output_dir)
# remove the n=500
# df = df[df.n_events > 500]
# df.index = range(len(df))
# Relative squared recovery errors:
#   err_A     — influence matrix, Frobenius norm
#   err_theta — per-dimension word distributions, Euclidean norm
err_A = np.zeros(len(df))
err_theta = np.zeros((len(df), args.n_dims))
for i, model in df[["model"]].itertuples():
    err_A[i] = (
        norm(model.influential_matrix - model_true.influential_matrix, "fro")
        ** 2
        / norm(model_true.influential_matrix, "fro") ** 2
    )
    for j in range(args.n_dims):
        err_theta[i, j] = (
            norm(
                model.mark_density.feature_probs[j]
                - model_true.mark_density.feature_probs[j]
            )
            ** 2
            / norm(model_true.mark_density.feature_probs[j]) ** 2
        )
# Running time vs. sample size.
plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, df.running_time, "o-")
plt.xlabel("# of events")
plt.ylabel("running time(s)")
plt.savefig("%s/running-time.pdf" % args.output_dir, bbox_inches="tight")
# Influence-matrix recovery error vs. sample size.
plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, err_A, "o-")
plt.xlabel("# of events")
plt.ylabel(r"$\epsilon(\mathbf{A})$")
plt.savefig("%s/param-err-A.pdf" % args.output_dir, bbox_inches="tight")
# Word-distribution recovery error, one curve per dimension.
plt.figure(figsize=(fig_width, fig_height))
# markers = "<>v^o"
# NOTE(review): only 5 markers — markers[i] raises IndexError if n_dims > 5.
markers = "o^v*s"
for i in range(args.n_dims):
    plt.plot(
        df.n_events,
        err_theta[:, i],
        marker=markers[i],
        markersize=5,
        label="dim %d" % (i + 1),
    )
plt.xlabel("# of events", fontweight="bold")
plt.ylabel(r"$\epsilon(\mathbf{\theta}^{(*)})$", fontweight="bold")
plt.legend()
plt.savefig("%s/param-err-theta.pdf" % args.output_dir, bbox_inches="tight")
# Root-identification accuracy: MAP prediction (argmax over the rooted
# probabilities) scored against the sampled ground-truth roots.
acc_true = []
acc_fitted = []
for i in df.index:
    root_sample = df.at[i, "root_sample"]
    root_true_pred = np.argmax(df.at[i, "rooted_proba_true"], axis=1)
    root_fitted_pred = np.argmax(df.at[i, "rooted_proba_fitted"], axis=1)
    acc_true.append(accuracy_score(root_sample, root_true_pred))
    acc_fitted.append(accuracy_score(root_sample, root_fitted_pred))
plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, acc_true, "o-", label="true param")
plt.plot(df.n_events, acc_fitted, "o-", label="fitted param")
plt.xlabel("# of events")
plt.ylabel("acc")
plt.legend()
# # Compare root recovery to other baselines
# 3. naive classifier with root label for some proportion of events
# df2 accumulates one row per (sample size, method) holding the predicted
# root label of every event under that method.
df2 = pd.DataFrame(columns=["n_events", "method", "root_pred"])
# method: RP with ground-truth parameters
df_t = pd.DataFrame(columns=df2.columns)
df_t["n_events"] = df["n_events"]
df_t["root_pred"] = df["rooted_proba_true"].apply(
    lambda x: np.argmax(x, axis=1)
)
df_t["method"] = "rooted_proba_true"
# BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported, behavior-equivalent replacement.
df2 = pd.concat([df2, df_t])
# method: RP with fitted parameters
df_t = pd.DataFrame(columns=df2.columns)
df_t["n_events"] = df["n_events"]
df_t["root_pred"] = df["rooted_proba_fitted"].apply(
    lambda x: np.argmax(x, axis=1)
)
df_t["method"] = "rooted_proba_fitted"
df2 = pd.concat([df2, df_t])
df2.index = range(len(df2))
df2  # notebook-style echo; has no effect when run as a script
# %% Sub-model baselines: rooted probability using only part of the model
# (mark/text component only; the time-only variants are disabled below).
for index, events, model in df[["events", "model"]].itertuples():
    n_events = len(events)
    # define all methods
    methods = [
        # ('rooted_proba_true',
        # lambda data: np.argmax(model_true.eval_rooted_proba(data), axis=1)),
        # ('rooted_proba_fitted',
        # lambda data: np.argmax(model.eval_rooted_proba(data), axis=1)),
        # ('rooted_proba_temp_only_true',
        # lambda data: np.argmax(model_true.eval_rooted_proba(data, include_time=True, include_mark=False), axis=1)),
        # ('rooted_proba_temp_only_fitted',
        # lambda data: np.argmax(model.eval_rooted_proba(data, include_time=True, include_mark=False), axis=1)),
        (
            "rooted_proba_mark_only_true",
            lambda data: np.argmax(
                model_true.eval_rooted_proba(
                    data, include_time=False, include_mark=True
                ),
                axis=1,
            ),
        ),
        (
            "rooted_proba_mark_only_fitted",
            lambda data: np.argmax(
                model.eval_rooted_proba(
                    data, include_time=False, include_mark=True
                ),
                axis=1,
            ),
        ),
    ]
    for method_name, eval_rooted_proba in methods:
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        # Only compute (method, n_events) pairs not already cached in df2.
        if (
            len(
                df2.query(
                    'n_events=={} and method=="{}"'.format(
                        n_events, method_name
                    )
                )
            )
            == 0
        ):
            df2.loc[len(df2)] = (
                n_events,
                method_name,
                eval_rooted_proba(events),
            )
# %% Naive Bayes baselines: train a multinomial NB on a labeled fraction of
# the events' word features and predict the roots of the remaining events.
df2 = df2[~df2.method.str.startswith("NB")]
df2.index = range(len(df2))
for index, events, root_sample in df[["events", "root_sample"]].itertuples():
    n_events = len(events)
    # e[2] is assumed to be the sparse bag-of-words vector of event e —
    # TODO(review): confirm the event tuple layout against simulate().
    X = sparse.vstack([e[2] for e in events])
    y = np.array(root_sample)
    for train_ratio in [0.1, 0.3, 0.5, 0.7]:
        method_name = "NB_%.1f" % train_ratio
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        # Skip combinations already cached in df2.
        if (
            len(
                df2.query(
                    'n_events==%d and method=="%s"' % (n_events, method_name)
                )
            )
            > 0
        ):
            continue
        train_index, test_index, _, _ = train_test_split(
            range(X.shape[0]), y, train_size=train_ratio
        )
        clf = MultinomialNB().fit(X[train_index], y[train_index])
        # Training-split events keep NaN predictions; the accuracy
        # computation later masks them out via np.isnan.
        y_pred = np.nan * np.ones_like(y)
        y_pred[test_index] = clf.predict(X[test_index])
        df2.loc[len(df2)] = n_events, method_name, y_pred
# %% Running-window baselines: majority label of the last L events,
# for L = 1, 10, and "inf" (the whole history so far).
df2 = df2[~df2.method.str.startswith("RW")]
df2.index = range(len(df2))
for events, root_sample in df[["events", "root_sample"]].itertuples(
    index=False
):
    n_events = len(events)
    Ls = [1, 10, n_events]
    suffixes = ["1", "10", "inf"]
    for L, suffix in zip(Ls, suffixes):
        method_name = "RW_%s" % suffix
        # BUGFIX: the original interpolated method_name (already
        # "RW_<suffix>") into "method=RW_%s", logging "RW_RW_1" etc.;
        # use the same message format as the other baseline loops.
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        # Skip combinations already cached in df2.
        if (
            len(
                df2.query(
                    'n_events==%d and method=="%s"' % (n_events, method_name)
                )
            )
            > 0
        ):
            continue
        root_pred = running_window_predictor(events, L)
        df2.loc[len(df2)] = n_events, method_name, root_pred
# save the result
df2.to_pickle("%s/compare.pkl" % args.output_dir)
# %% Reload the cached comparison results for the plotting step below.
df2 = pd.read_pickle("{}/compare.pkl".format(args.output_dir))
def plot_root_source_identification(df2, df):
    """Plot root-source identification accuracy per method vs. sample size.

    Parameters
    ----------
    df2 : pd.DataFrame
        Columns ``n_events``, ``method``, ``root_pred`` — one row per
        (sample size, method) holding per-event root predictions.
    df : pd.DataFrame
        Must provide ``n_events`` and the ground-truth ``root_sample``.

    Side effects: writes ``root-acc.pdf`` into ``args.output_dir``.
    """
    # df2 = pd.read_pickle('%s/compare.pkl' % args.output_dir)
    df2.n_events = df2.n_events.astype(int)
    # Attach the ground-truth roots to every prediction row.
    t = df2.merge(df[["n_events", "root_sample"]], on="n_events")

    def _compute_acc(row):
        # BUGFIX: the original indexed the row positionally (row[0]/row[1]),
        # which is deprecated/ambiguous on labeled Series in modern pandas;
        # use the column labels instead.
        truth = np.array(row["root_sample"])
        pred = np.array(row["root_pred"])
        # Mask out events without a prediction (NaN, e.g. the NB train
        # split) before scoring.
        mask = ~np.isnan(pred)
        return accuracy_score(truth[mask], pred[mask])

    t["acc"] = t[["root_sample", "root_pred"]].apply(_compute_acc, axis=1)
    # NOTE(review): the next three statements add constant offsets to the
    # reported accuracies of some methods. Kept to preserve the existing
    # output, but this manipulates measured results — verify whether these
    # offsets are a deliberate, documented correction.
    t.loc[t.method.str.startswith("rooted_proba_temp_only"), "acc"] += 0.15
    t.loc[t.method.str.startswith("rooted_proba_mark_only"), "acc"] += 0.05
    t.loc[
        t.method.str.startswith("rooted_proba_mark_only")
        & (t.n_events >= 5000),
        "acc",
    ] += 0.2
    # drop results for n_events < 1000
    t = t[t.n_events >= 1000]
    # drop some baselines
    t = t[t.method != "rooted_proba_temp_only_true"]
    t = t[t.method != "rooted_proba_mark_only_true"]
    t = t[~t.method.str.startswith("NB_")]
    # t = t.sort_values(by='method', ascending=False)
    plt.figure(figsize=(fig_width, fig_height))
    sns.pointplot(
        x="n_events",
        y="acc",
        hue="method",
        data=t,
        markers=list("^v<>+x^v<>+x^"),
        scale=0.7,
        legend=False,
    )
    plt.xlabel("# of events")
    plt.ylabel("acc")
    lgd = plt.legend(bbox_to_anchor=(1.47, 1.05), prop={"size": 8})
    # Rename legend entries to the paper's method names.
    # NOTE(review): relies on the legend order matching the hue order and
    # raises IndexError if fewer than 4 methods survive the filters above.
    lgd.get_texts()[0].set_text("RP_TRUE")
    lgd.get_texts()[1].set_text("RP_FIT")
    lgd.get_texts()[2].set_text("RP_TEMP_FIT")
    lgd.get_texts()[3].set_text("RP_MARK_FIT")
    plt.savefig(
        "%s/root-acc.pdf" % args.output_dir,
        bbox_extra_artists=(lgd,),
        bbox_inches="tight",
    )


plot_root_source_identification(df2, df)