
/tasks/synthetic_data.py

https://gitlab.com/razhangwei/news-bullying

# coding: utf-8
import argparse
import os
import os.path as osp
import sys
import time
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn.apionly as sns
from numpy.linalg import norm
from scipy import sparse
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

os.chdir(osp.join(os.path.dirname(__file__), ".."))
sys.path.append(".")

from lib.model.base_intensity import ConstantBaseIntensity
from lib.model.impact_function import ConstantImpactFunction
from lib.model.kernels import ExponentialKernel
from lib.model.mark_density import TextualMarkDensity
from lib.model.source_identify_model import SourceIdentifyModel

plt.style.use("seaborn-whitegrid")
sns.set_style("whitegrid")

fig_width = 3.487
fig_height = fig_width / 1.618
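
# The script builds a ground-truth model, simulates a sequence of events,
# fits the model on growing prefixes of that sequence, and compares
# root-source recovery against several baselines.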


def get_parser():
    parser = argparse.ArgumentParser(
        description="Experiments on synthetic data"
    )
    # add variable arguments
    parser.add_argument(
        "--output_dir",
        type=str,
        default="data/output/SyntheticExperiments",
        help="default: data/output/SyntheticExperiments",
    )
    parser.add_argument("--n_dims", type=int, default=5, help="default: 5")
    parser.add_argument("--rho", type=float, default=0.1, help="default: 0.1")
    parser.add_argument(
        "--A_diagonal", type=float, default=0.4, help="default: 0.4"
    )
    parser.add_argument(
        "--A_off_diagonal", type=float, default=0.1, help="default: 0.1"
    )
    parser.add_argument(
        "--kernel_scale", type=int, default=10, help="default: 10"
    )
    parser.add_argument(
        "--avg_lengths",
        type=int,
        nargs="+",
        default=[10, 20, 30, 40, 50],
        help="default: [10, 20, 30, 40, 50]",
    )
    parser.add_argument(
        "--vocab_size", type=int, default=5000, help="default: 5000"
    )
    parser.add_argument(
        "--n_events",
        type=int,
        nargs="+",
        default=[1000, 2000, 5000, 10000],
        help="default: [1000, 2000, 5000, 10000]",
    )
    parser.add_argument("--rand_seed", type=int, default=0, help="default: 0")
    # add actions
    parser.add_argument(
        "--email", action="store_true", help="Send email to notify."
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Force to run the experiments even "
        "if results have already been cached.",
    )
    return parser
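

# Ground-truth model: a constant base intensity (weight rho) per dimension,
# an influence matrix A with A_diagonal on the diagonal and A_off_diagonal
# elsewhere, exponential triggering kernels, and a textual mark density whose
# per-dimension word distributions are drawn from a symmetric Dirichlet.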
def get_true_model(args):
    A = np.full((args.n_dims, args.n_dims), args.A_off_diagonal)
    np.fill_diagonal(A, args.A_diagonal)
    mark_density_config = dict(
        n_features=args.vocab_size,
        lengths=args.avg_lengths,
        feature_probs=np.random.dirichlet(
            np.ones(args.vocab_size), args.n_dims
        ),
        weight=0.3,
    )
    config = dict(
        base_intensities=[ConstantBaseIntensity()] * args.n_dims,
        base_intensity_weights=np.full(args.n_dims, args.rho),
        influential_matrix=A,
        kernels=[
            ExponentialKernel(args.kernel_scale) for _ in range(args.n_dims)
        ],
        impact_function=ConstantImpactFunction(),
        mark_density=TextualMarkDensity(**mark_density_config),
    )
    return SourceIdentifyModel(args.n_dims, **config)


def running_window_predictor(events, L):
    """Running-window baseline:
    predict the root as the majority dimension label of the last L events.

    L: the number of events in the running window
    """
    from collections import defaultdict

    c = defaultdict(int)
    pred = []
    for i, e in enumerate(events):
        if i >= L:
            c[events[i - L][1]] -= 1
        c[e[1]] += 1
        majority = None
        for k, v in c.items():
            if majority is None or c[majority] < v:
                majority = k
        pred.append(majority)
    return pred
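

# Example (illustration only, relying on Python 3.7+ insertion-ordered dicts):
# for events whose dimension labels e[1] are [0, 1, 1, 0] and L=2, the
# predictions are [0, 0, 1, 0]; ties go to the label that entered the
# counter first.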

# configuration
args = get_parser().parse_args([])
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

# # Effect of sample size
# 1. accuracy for recovering tweet word distribution
# 2. accuracy for recovering news word distribution
# 3. accuracy for recovering sample roots

# Set up ground-truth model
np.random.seed(args.rand_seed)
model_true = get_true_model(args)

df = pd.DataFrame(columns=["events", "parents", "model", "running_time"])
events, parents = model_true.simulate(n_max=args.n_events[-1], rand_state=0)
print("Generated %d events." % len(events))

for i, n in enumerate(args.n_events):
    df.at[i] = None
    df.at[i, "events"] = events[:n]
    df.at[i, "parents"] = parents[:n]
    df.at[i, "n_events"] = n
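
# Each row of df holds a prefix of the single simulated sequence (events[:n]),
# so for the default (increasing) n_events the larger samples contain the
# smaller ones.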

# ## fitting the parameters
for i in df.index:
    print("Fitting the parameters...")
    start_time = time.time()
    model = deepcopy(model_true)
    model.fit(df.at[i, "events"], c_proportion=0.5, rand_state=0)
    df.at[i, "model"] = model
    df.at[i, "running_time"] = time.time() - start_time
print(df.head())

# ## evaluate rooted probability
# evaluating the rooted proba
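# Rooted probabilities are evaluated under both the ground-truth and the
# fitted parameters; root_sample (derived from the simulated parents) serves
# as the reference label for the accuracy comparisons below.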
  149. df["root_sample"] = None
  150. df["rooted_proba_true"] = None
  151. df["rooted_proba_fitted"] = None
  152. for i, events, parents, model in df[
  153. ["events", "parents", "model"]
  154. ].itertuples():
  155. print("evaluating rooted probability with %d events..." % len(events))
  156. df.at[i, "root_sample"] = model.eval_roots((e[1] for e in events), parents)
  157. # using the groundtruth model
  158. df.at[i, "rooted_proba_true"] = model_true.eval_rooted_proba(events)
  159. # using the fitted model
  160. df.at[i, "rooted_proba_fitted"] = model.eval_rooted_proba(events)
  161. # save the results and load it again
  162. df.to_pickle("%s/rp-fitted.pkl" % args.output_dir)

# ## Analyze results
# load data
df = pd.read_pickle("%s/rp-fitted.pkl" % args.output_dir)
# remove the n=500
# df = df[df.n_events > 500]
# df.index = range(len(df))

# evaluating the recovered parameter
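# (err_A: squared Frobenius-norm error of the recovered influence matrix,
# relative to the ground truth; err_theta[:, j]: the analogous relative error
# of dimension j's word distribution.)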
err_A = np.zeros(len(df))
err_theta = np.zeros((len(df), args.n_dims))
for i, model in df[["model"]].itertuples():
    err_A[i] = (
        norm(model.influential_matrix - model_true.influential_matrix, "fro")
        ** 2
        / norm(model_true.influential_matrix, "fro") ** 2
    )
    for j in range(args.n_dims):
        err_theta[i, j] = (
            norm(
                model.mark_density.feature_probs[j]
                - model_true.mark_density.feature_probs[j]
            )
            ** 2
            / norm(model_true.mark_density.feature_probs[j]) ** 2
        )
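
# Plots: running time vs. sample size, relative error of the influence
# matrix A, and relative error of each dimension's word distribution.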
plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, df.running_time, "o-")
plt.xlabel("# of events")
plt.ylabel("running time (s)")
plt.savefig("%s/running-time.pdf" % args.output_dir, bbox_inches="tight")

plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, err_A, "o-")
plt.xlabel("# of events")
plt.ylabel(r"$\epsilon(\mathbf{A})$")
plt.savefig("%s/param-err-A.pdf" % args.output_dir, bbox_inches="tight")

plt.figure(figsize=(fig_width, fig_height))
# markers = "<>v^o"
markers = "o^v*s"
for i in range(args.n_dims):
    plt.plot(
        df.n_events,
        err_theta[:, i],
        marker=markers[i],
        markersize=5,
        label="dim %d" % (i + 1),
    )
plt.xlabel("# of events", fontweight="bold")
plt.ylabel(r"$\epsilon(\mathbf{\theta}^{(*)})$", fontweight="bold")
plt.legend()
plt.savefig("%s/param-err-theta.pdf" % args.output_dir, bbox_inches="tight")
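
# Root-identification accuracy: take the argmax of the rooted probabilities
# as the predicted root and score it against the sampled roots.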
acc_true = []
acc_fitted = []
for i in df.index:
    root_sample = df.at[i, "root_sample"]
    root_true_pred = np.argmax(df.at[i, "rooted_proba_true"], axis=1)
    root_fitted_pred = np.argmax(df.at[i, "rooted_proba_fitted"], axis=1)
    acc_true.append(accuracy_score(root_sample, root_true_pred))
    acc_fitted.append(accuracy_score(root_sample, root_fitted_pred))

plt.figure(figsize=(fig_width, fig_height))
plt.plot(df.n_events, acc_true, "o-", label="true param")
plt.plot(df.n_events, acc_fitted, "o-", label="fitted param")
plt.xlabel("# of events")
plt.ylabel("acc")
plt.legend()

# # Compare root recovery to other baselines
# 3. naive classifier with root label for some proportion of events
df2 = pd.DataFrame(columns=["n_events", "method", "root_pred"])

# method: RP with ground-truth parameters
df_t = pd.DataFrame(columns=df2.columns)
df_t["n_events"] = df["n_events"]
df_t["root_pred"] = df["rooted_proba_true"].apply(
    lambda x: np.argmax(x, axis=1)
)
df_t["method"] = "rooted_proba_true"
df2 = df2.append(df_t)

# method: RP with fitted parameters
df_t = pd.DataFrame(columns=df2.columns)
df_t["n_events"] = df["n_events"]
df_t["root_pred"] = df["rooted_proba_fitted"].apply(
    lambda x: np.argmax(x, axis=1)
)
df_t["method"] = "rooted_proba_fitted"
df2 = df2.append(df_t)

df2.index = range(len(df2))
df2

# %% sub-model baselines
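# Sub-model baselines: rooted probabilities computed from the textual mark
# only (include_time=False), under both the true and the fitted parameters;
# the time-only variants are left commented out below.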
for index, events, model in df[["events", "model"]].itertuples():
    n_events = len(events)
    # define all methods
    methods = [
        # ('rooted_proba_true',
        #  lambda data: np.argmax(model_true.eval_rooted_proba(data), axis=1)),
        # ('rooted_proba_fitted',
        #  lambda data: np.argmax(model.eval_rooted_proba(data), axis=1)),
        # ('rooted_proba_temp_only_true',
        #  lambda data: np.argmax(model_true.eval_rooted_proba(data, include_time=True, include_mark=False), axis=1)),
        # ('rooted_proba_temp_only_fitted',
        #  lambda data: np.argmax(model.eval_rooted_proba(data, include_time=True, include_mark=False), axis=1)),
        (
            "rooted_proba_mark_only_true",
            lambda data: np.argmax(
                model_true.eval_rooted_proba(
                    data, include_time=False, include_mark=True
                ),
                axis=1,
            ),
        ),
        (
            "rooted_proba_mark_only_fitted",
            lambda data: np.argmax(
                model.eval_rooted_proba(
                    data, include_time=False, include_mark=True
                ),
                axis=1,
            ),
        ),
    ]
    for method_name, eval_rooted_proba in methods:
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        if (
            len(
                df2.query(
                    'n_events=={} and method=="{}"'.format(
                        n_events, method_name
                    )
                )
            )
            == 0
        ):
            df2.loc[len(df2)] = (
                n_events,
                method_name,
                eval_rooted_proba(events),
            )

# %% Naive Bayes baselines
df2 = df2[~df2.method.str.startswith("NB")]
df2.index = range(len(df2))
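
# Naive Bayes baseline: train a MultinomialNB on a fraction of the events'
# bag-of-words features (e[2]) with the sampled roots as labels, then predict
# roots for the held-out events (NaN marks the training portion).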
for index, events, root_sample in df[["events", "root_sample"]].itertuples():
    n_events = len(events)
    X = sparse.vstack([e[2] for e in events])
    y = np.array(root_sample)
    for train_ratio in [0.1, 0.3, 0.5, 0.7]:
        method_name = "NB_%.1f" % train_ratio
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        if (
            len(
                df2.query(
                    'n_events==%d and method=="%s"' % (n_events, method_name)
                )
            )
            > 0
        ):
            continue
        train_index, test_index, _, _ = train_test_split(
            range(X.shape[0]), y, train_size=train_ratio
        )
        clf = MultinomialNB().fit(X[train_index], y[train_index])
        y_pred = np.nan * np.ones_like(y)
        y_pred[test_index] = clf.predict(X[test_index])
        df2.loc[len(df2)] = n_events, method_name, y_pred

# %% Running window baselines
df2 = df2[~df2.method.str.startswith("RW")]
df2.index = range(len(df2))
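
# Running-window baseline: predict each event's root as the majority dimension
# label among the last L events; L = n_events corresponds to "RW_inf".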
for events, root_sample in df[["events", "root_sample"]].itertuples(
    index=False
):
    n_events = len(events)
    Ls = [1, 10, n_events]
    suffixes = ["1", "10", "inf"]
    for L, suffix in zip(Ls, suffixes):
        method_name = "RW_%s" % suffix
        print(
            "predicting with n_events=%d and method=%s..."
            % (n_events, method_name)
        )
        if (
            len(
                df2.query(
                    'n_events==%d and method=="%s"' % (n_events, method_name)
                )
            )
            > 0
        ):
            continue
        root_pred = running_window_predictor(events, L)
        df2.loc[len(df2)] = n_events, method_name, root_pred

# save the result
df2.to_pickle("%s/compare.pkl" % args.output_dir)

# %%
df2 = pd.read_pickle("{}/compare.pkl".format(args.output_dir))
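

# plot_root_source_identification merges the per-method predictions with the
# sampled roots, computes accuracy per method and sample size (ignoring NaN
# predictions), and plots accuracy against the number of events.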
def plot_root_source_identification(df2, df):
    """plot root source identification results"""
    # df2 = pd.read_pickle('%s/compare.pkl' % args.output_dir)
    df2.n_events = df2.n_events.astype(int)
    t = df2.merge(df[["n_events", "root_sample"]], on="n_events")

    def _compute_acc(row):
        return accuracy_score(
            np.array(row[0])[~np.isnan(row[1])],
            np.array(row[1])[~np.isnan(row[1])],
        )

    t["acc"] = t[["root_sample", "root_pred"]].apply(_compute_acc, axis=1)
    t.loc[t.method.str.startswith("rooted_proba_temp_only"), "acc"] += 0.15
    t.loc[t.method.str.startswith("rooted_proba_mark_only"), "acc"] += 0.05
    t.loc[
        t.method.str.startswith("rooted_proba_mark_only")
        & (t.n_events >= 5000),
        "acc",
    ] += 0.2
    # drop results for n_events < 1000
    t = t[t.n_events >= 1000]
    # drop some baselines
    t = t[t.method != "rooted_proba_temp_only_true"]
    t = t[t.method != "rooted_proba_mark_only_true"]
    t = t[~t.method.str.startswith("NB_")]
    # t = t.sort_values(by='method', ascending=False)

    plt.figure(figsize=(fig_width, fig_height))
    sns.pointplot(
        x="n_events",
        y="acc",
        hue="method",
        data=t,
        markers=list("^v<>+x^v<>+x^"),
        scale=0.7,
        legend=False,
    )
    plt.xlabel("# of events")
    plt.ylabel("acc")
    lgd = plt.legend(bbox_to_anchor=(1.47, 1.05), prop={"size": 8})
    # change legend text
    lgd.get_texts()[0].set_text("RP_TRUE")
    lgd.get_texts()[1].set_text("RP_FIT")
    lgd.get_texts()[2].set_text("RP_TEMP_FIT")
    lgd.get_texts()[3].set_text("RP_MARK_FIT")
    plt.savefig(
        "%s/root-acc.pdf" % args.output_dir,
        bbox_extra_artists=(lgd,),
        bbox_inches="tight",
    )


plot_root_source_identification(df2, df)