PageRenderTime 41ms CodeModel.GetById 15ms RepoModel.GetById 1ms app.codeStats 0ms

/python/projects/pandas/parser.py

https://github.com/bashwork/common
Python | 63 lines | 40 code | 7 blank | 16 comment | 1 complexity | f32be63ddfee0097871965ce34c508ac MD5 | raw file
Possible License(s): GPL-2.0
  1. import pandas as pd
  2. # ------------------------------------------------------------
  3. # Constants
  4. # ------------------------------------------------------------
  5. COLUMNS = [
  6. 'age',
  7. 'workclass',
  8. 'weight',
  9. 'education',
  10. 'education-num',
  11. 'marital-status',
  12. 'occupation',
  13. 'relationship',
  14. 'race',
  15. 'sex',
  16. 'capital-gain',
  17. 'capital-loss',
  18. 'hours-per-week',
  19. 'native-country',
  20. 'label'
  21. ]
  22. PATHS = {
  23. 'training' : 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
  24. 'testing' : 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
  25. }
  26. # ------------------------------------------------------------
  27. # Helper Utilities
  28. # ------------------------------------------------------------
  29. def read_dataset(path, **kwargs):
  30. ''' Given a path to the dataset, return a cleaned and
  31. processed pandas data frame.
  32. :param path: The path (file or url) to the input data
  33. :returns: The cleaned pandas data frame
  34. '''
  35. params = {
  36. 'names' : COLUMNS,
  37. 'sep' : ',',
  38. 'skipinitialspace' : True,
  39. 'na_values' : ['?'],
  40. 'skiprows' : 0,
  41. 'skipfooter' : 1
  42. }
  43. params.update(kwargs)
  44. frame = pd.read_csv(path, **params)
  45. frame = frame.drop('weight', axis=1)
  46. frame['label'] = frame['label'].str.contains('>50K').astype(int)
  47. #frame['occupation'] = frame['occupation'].value_counts()[0]
  48. return frame
  49. # ------------------------------------------------------------
  50. # Main
  51. # ------------------------------------------------------------
  52. if __name__ == "__main__":
  53. frame = read_dataset('adult.test', skiprows=1)
  54. print frame.head()
  55. print frame.describe()