PageRenderTime 41ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/Transform/create_competition_data.py

https://github.com/jeppe/Kdd2013AuthorPaperIdentification
Python | 83 lines | 69 code | 14 blank | 0 comment | 4 complexity | 0d56c855baa38df15f32e89617f8bace MD5 | raw file
Possible License(s): BSD-2-Clause
  1. import os
  2. import numpy as np
  3. import pandas as pd
  4. import re
  5. def KDDCup2013Track1SplitDataSeed():
  6. raise NotImplementedError
  7. def split_data(data, train_frac, valid_frac):
  8. index = np.arange(len(data))
  9. np.random.seed(KDDCup2013Track1SplitDataSeed())
  10. np.random.shuffle(index)
  11. train_end = int(train_frac*len(data))
  12. valid_end = int((train_frac+valid_frac)*len(data))
  13. train = data.ix[index[:train_end]]
  14. valid = data.ix[index[train_end:valid_end]]
  15. test = data.ix[index[valid_end:]]
  16. return train, valid, test
  17. def parse_paper_ids(id_string):
  18. id_string = id_string.strip()
  19. id_string = re.sub(r"\s+", " ", id_string)
  20. if id_string:
  21. return [int(x) for x in id_string.split(" ")]
  22. return []
  23. def paper_ids_to_string(ids):
  24. return " ".join([str(x) for x in ids])
  25. def sort_ids(row, column_name):
  26. ids = parse_paper_ids(row[column_name])
  27. ids = sorted(ids)
  28. return paper_ids_to_string(ids)
  29. def convert_to_train_format(train):
  30. train["ConfirmedPaperIds"] = train.apply(sort_ids, axis=1, args=("ConfirmedPaperId",))
  31. train["DeletedPaperIds"] = train.apply(sort_ids, axis=1, args=("DeletedPaperId",))
  32. train = train[["AuthorId", "DeletedPaperIds", "ConfirmedPaperIds"]]
  33. train = train.sort("AuthorId")
  34. return train
  35. def combine_id_columns(row):
  36. return row["DeletedPaperId"] + " " + row["ConfirmedPaperId"]
  37. def convert_to_test_format(test, usage="PrivateTest"):
  38. test["PaperIds"] = test.apply(combine_id_columns, axis=1)
  39. test["PaperIds"] = test.apply(sort_ids, axis=1, args=("PaperIds",))
  40. test["DeletedPaperIds"] = test.apply(sort_ids, axis=1, args=("DeletedPaperId",))
  41. test_set = test[["AuthorId", "PaperIds"]]
  42. test_set = test_set.sort("AuthorId")
  43. solution = test[["AuthorId", "DeletedPaperIds"]]
  44. solution = solution.rename(columns={"DeletedPaperIds": "PaperIds"})
  45. solution = solution.sort("AuthorId")
  46. solution["Usage"]=usage
  47. return test_set, solution
  48. def create_competition_data():
  49. data_path = os.path.join(os.environ["DataPath"],
  50. "Kdd2013AuthorPaperIdentification")
  51. raw_path = os.path.join(data_path, "LabeledObfuscated")
  52. labels_path = os.path.join(raw_path, "Task1LabeledDataSet.csv")
  53. out_path = os.path.join(data_path, "Release 1")
  54. converters = {
  55. "ConfirmedPaperId": lambda x: x,
  56. "DeletedPaperId": lambda x: x
  57. }
  58. data = pd.read_csv(labels_path, converters=converters)
  59. train, valid, test = split_data(data, 0.5, 0.2)
  60. train = convert_to_train_format(train)
  61. valid, valid_solution = convert_to_test_format(valid, "PublicTest")
  62. test, test_solution = convert_to_test_format(test, "PrivateTest")
  63. train.to_csv(os.path.join(out_path, "Train.csv"), index=False)
  64. valid.to_csv(os.path.join(out_path, "Valid.csv"), index=False)
  65. valid_solution.to_csv(os.path.join(out_path, "ValidSolution.csv"), index=False)
  66. test.to_csv(os.path.join(out_path, "Test.csv"), index=False)
  67. test_solution.to_csv(os.path.join(out_path, "TestSolution.csv"), index=False)
  68. if __name__=="__main__":
  69. create_competition_data()