parser.py - Helper Utilities

/python/projects/pandas/parser.py

https://github.com/bashwork/common · Python · 63 lines · 40 code · 7 blank · 16 comment · 1 complexity · f32be63ddfee0097871965ce34c508ac MD5 · raw file


import pandas as pd

# ------------------------------------------------------------
# Constants
# ------------------------------------------------------------

COLUMNS = [
    'age',
    'workclass',
    'weight',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'label'
]

PATHS = {
    'training' : 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    'testing'  : 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
}

# ------------------------------------------------------------
# Helper Utilities 
# ------------------------------------------------------------

def read_dataset(path, **kwargs):
    ''' Given a path to the dataset, return a cleaned and
    processed pandas data frame.

    :param path: The path (file or url) to the input data
    :returns: The cleaned pandas data frame
    '''
    params = {
        'names'            : COLUMNS,
        'sep'              : ',',
        'skipinitialspace' : True,
        'na_values'        : ['?'],
        'skiprows'         : 0,
        'skipfooter'       : 1
    }
    params.update(kwargs)
    frame = pd.read_csv(path, **params)
    frame = frame.drop('weight', axis=1)
    frame['label'] = frame['label'].str.contains('>50K').astype(int)
    #frame['occupation'] = frame['occupation'].value_counts()[0]
    return frame

# ------------------------------------------------------------
# Main 
# ------------------------------------------------------------
if __name__ == "__main__":

    frame = read_dataset('adult.test', skiprows=1)  
    print frame.head()
    print frame.describe()