PageRenderTime 111ms CodeModel.GetById 19ms RepoModel.GetById 1ms app.codeStats 0ms

/soggle/competition_utilities.py

https://bitbucket.org/dnouri/soggle
Python | 83 lines | 67 code | 15 blank | 1 comment | 10 complexity | 0c1cab8a504f25483900864c64aefa1d MD5 | raw file
from __future__ import division

from collections import Counter
import csv
import os

import dateutil
import dateutil.parser
import numpy as np
import pandas as pd
  8. data_path = '.'
  9. submissions_path = '.'
  10. if not data_path or not submissions_path:
  11. raise Exception("Set the data and submission paths in competition_utilities.py!")
  12. def parse_date_maybe_null(date):
  13. if date:
  14. return dateutil.parser.parse(date)
  15. return None
  16. df_converters = {"PostCreationDate": dateutil.parser.parse,
  17. "OwnerCreationDate": dateutil.parser.parse}
  18. # "PostClosedDate": parse_date_maybe_null}
  19. def get_reader(file_name="train-sample.csv"):
  20. reader = csv.reader(open(os.path.join(data_path, file_name)))
  21. header = reader.next()
  22. return reader
  23. def get_header(file_name="train-sample.csv"):
  24. reader = csv.reader(open(os.path.join(data_path, file_name)))
  25. header = reader.next()
  26. return header
  27. def get_closed_count(file_name):
  28. return sum(1 for q in iter_closed_questions(file_name))
  29. def iter_closed_questions(file_name):
  30. df_iter = pd.io.parsers.read_csv(os.path.join(data_path, file_name), iterator=True, chunksize=1000)
  31. return (question[1] for df in df_iter for question in df[df["OpenStatus"] != "open"].iterrows())
  32. def iter_open_questions(file_name):
  33. df_iter = pd.io.parsers.read_csv(os.path.join(data_path, file_name), iterator=True, chunksize=1000)
  34. return (question[1] for df in df_iter for question in df[df["OpenStatus"] == "open"].iterrows())
  35. def get_dataframe(file_name="train-sample.csv"):
  36. return pd.io.parsers.read_csv(os.path.join(data_path, file_name), converters = df_converters)
  37. def get_priors(file_name):
  38. closed_reasons = [r[14] for r in get_reader(file_name)]
  39. closed_reason_counts = Counter(closed_reasons)
  40. reasons = sorted(closed_reason_counts.keys())
  41. total = len(closed_reasons)
  42. priors = [closed_reason_counts[reason]/total for reason in reasons]
  43. return priors
  44. def write_sample(file_name, header, sample):
  45. writer = csv.writer(open(os.path.join(data_path, file_name), "w"), lineterminator="\n")
  46. writer.writerow(header)
  47. writer.writerows(sample)
  48. def update_prior(old_prior, old_posterior, new_prior):
  49. evidence_ratio = (old_prior*(1-old_posterior)) / (old_posterior*(1-old_prior))
  50. new_posterior = new_prior / (new_prior + (1-new_prior)*evidence_ratio)
  51. return new_posterior
  52. def cap_and_update_priors(old_priors, old_posteriors, new_priors, epsilon):
  53. old_posteriors = cap_predictions(old_posteriors, epsilon)
  54. old_priors = np.kron(np.ones((np.size(old_posteriors, 0), 1)), old_priors)
  55. new_priors = np.kron(np.ones((np.size(old_posteriors, 0), 1)), new_priors)
  56. evidence_ratio = (old_priors*(1-old_posteriors)) / (old_posteriors*(1-old_priors))
  57. new_posteriors = new_priors / (new_priors + (1-new_priors)*evidence_ratio)
  58. new_posteriors = cap_predictions(new_posteriors, epsilon)
  59. return new_posteriors
  60. def cap_predictions(probs, epsilon):
  61. probs[probs>1-epsilon] = 1-epsilon
  62. probs[probs<epsilon] = epsilon
  63. row_sums = probs.sum(axis=1)
  64. probs = probs / row_sums[:, np.newaxis]
  65. return probs
  66. def write_submission(file_name, predictions):
  67. writer = csv.writer(open(os.path.join(submissions_path, file_name), "w"), lineterminator="\n")
  68. writer.writerows(predictions)