PageRenderTime 50ms CodeModel.GetById 18ms RepoModel.GetById 1ms app.codeStats 0ms

/code/05_pandas_reference.py

https://gitlab.com/varunkothamachu/DAT3
Python | 243 lines | 100 code | 68 blank | 75 comment | 2 complexity | 61cb0e0541d83ed33b0b6f0a52c4df6e MD5 | raw file
  1. '''
  2. Pandas Reference Guide
  3. Source:
  4. http://fonnesbeck.github.io/Bios366/lectures.html
  5. Files used:
  6. ../data/microbiome.csv
  7. ../data/microbiome_missing.csv
  8. ../data/baseball.csv
  9. '''
  10. import pandas as pd
  11. import numpy as np
  12. ### SERIES ###
  13. # create Series with default index (0, 1, 2, 3)
  14. counts = pd.Series([632, 1638, 569, 115])
  15. counts.values # numpy array
  16. counts.index # index object
  17. # create Series and specify index
  18. bacteria = pd.Series([632, 1638, 569, 115],
  19. index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'])
  20. # filter a Series
  21. bacteria['Actinobacteria'] # by label
  22. bacteria[2] # by position
  23. bacteria[[name.endswith('bacteria') for name in bacteria.index]]
  24. bacteria[bacteria > 1000]
  25. # give name to index and to values
  26. bacteria.index.name = 'phylum'
  27. bacteria.name = 'counts'
  28. # vectorized operations on a Series
  29. np.log(bacteria) # return a Series
  30. np.log(bacteria.values) # return an array
  31. # create Series from a dict (creates in key-sorted order)
  32. bacteria_dict = {'Firmicutes': 632, 'Proteobacteria': 1638,
  33. 'Actinobacteria': 569, 'Bacteroidetes': 115}
  34. pd.Series(bacteria_dict)
  35. # pass a custom index to Series
  36. # indices without values will be treated as missing (NaN)
  37. bacteria2 = pd.Series(bacteria_dict,
  38. index=['Cyanobacteria','Firmicutes','Proteobacteria','Actinobacteria'])
  39. bacteria2.isnull()
  40. # labels are used to align data when used in operations with other Series
  41. # note that result is NaN if either Series has NaN for that index
  42. bacteria + bacteria2
  43. ### DATAFRAME ###
  44. # create DataFrame from dictionary of lists
  45. data = pd.DataFrame({'value':[632, 1638, 569, 115, 433, 1130, 754, 555],
  46. 'patient':[1, 1, 1, 1, 2, 2, 2, 2],
  47. 'phylum':['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes',
  48. 'Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes']})
  49. # DataFrame has second index representing the columns
  50. data.columns
  51. data.dtypes
  52. # select column from DataFrame
  53. data.value
  54. data['value'] # returns a Series
  55. data[['value']] # returns a DataFrame
  56. # select column, then filter by index (or "label")
  57. data.value[3]
  58. data.loc[3, 'value'] # less ambiguous
  59. # select column, then filter by position
  60. data.value[0:2]
  61. data.iloc[0:2, 2] # less ambiguous, but requires column position
  62. # filter rows by boolean
  63. data[data.value>1000]
  64. # Series returned when selecting columns is a view by default, not a copy
  65. vals = data.value # view
  66. vals = data.value.copy() # copy
  67. # create DataFrame columns by assignment
  68. data['year'] = 2013
  69. data['month'] = ['Jan'] * len(data)
  70. # remove DataFrame columns
  71. del data['month']
  72. # extract data as ndarray
  73. # dtype of array is "object" due to mixed data types
  74. data.values
  75. ### IMPORTING DATA ###
  76. # read from CSV
  77. mb = pd.read_csv('../data/microbiome.csv')
  78. # use header=0 to overwrite column names, or header=None to add column names
  79. mb = pd.read_csv('../data/microbiome.csv', header=0, names = ['a','b','c','d','e'])
  80. # limit which rows are read in
  81. pd.read_csv('../data/microbiome.csv', skiprows=[3,4,6])
  82. pd.read_csv('../data/microbiome.csv', nrows=4)
  83. # read_table is a more general function
  84. # can use regular expression to define variable amount of whitespace
  85. mb = pd.read_table('../data/microbiome.csv', sep='\s+')
  86. # use chunksize to return iterable object
  87. data_chunks = pd.read_csv('../data/microbiome.csv', chunksize=15)
  88. mean_tissue = {chunk.Taxon[0]: chunk.Tissue.mean() for chunk in data_chunks}
  89. # missing data, NA, and NULL will automatically be replaced with NaN
  90. # specify additional symbols with na_values
  91. pd.read_csv('../data/microbiome_missing.csv')
  92. pd.isnull(pd.read_csv('../data/microbiome_missing.csv'))
  93. mb2 = pd.read_csv('../data/microbiome_missing.csv', na_values=['?', -99999])
  94. ### MISSING VALUES ###
  95. # drop any rows with missing values
  96. mb2.dropna()
  97. # only drop a row if every field is a missing value
  98. mb2.dropna(how='all')
  99. # fill missing values with specific values
  100. mb2.fillna({'Tissue':500, 'Stool':1000})
  101. ### INDEXING ###
  102. # specify which column contains an index
  103. baseball = pd.read_csv('../data/baseball.csv', index_col='id')
  104. # can also set index after reading it in
  105. baseball = pd.read_csv('../data/baseball.csv')
  106. baseball.set_index('id', inplace=True)
  107. # create our own index
  108. player_id = baseball.player + baseball.year.astype(str)
  109. baseball_new = baseball.copy()
  110. baseball_new.index = player_id
  111. # our new index is not unique (which is not illegal)
  112. # indexing by label will return multiple values for some labels
  113. baseball_new.index.is_unique
  114. pd.Series(baseball_new.index).value_counts()
  115. baseball_new.ix['wickmbo012007']
  116. # reindex to maniulpate the data indices in a DataFrame
  117. baseball.reindex(baseball.index[::-1])
  118. ### SLICING ###
  119. # select columns
  120. baseball_new[['h','ab']]
  121. baseball_new[baseball_new.ab>500]
  122. # select column, then filter rows by index
  123. baseball_new.h['womacto012006']
  124. baseball_new.h[['womacto012006', 'schilcu012006']]
  125. baseball_new.h['womacto012006':'myersmi012006']
  126. # select column, then filter rows by position (works because index is not an integer)
  127. baseball_new.h[0]
  128. baseball_new.h[0:3]
  129. # alternatively: filter rows then select columns
  130. baseball_new.ix['womacto012006', 'h']
  131. baseball_new.ix['gonzalu012006', ['h','X2b', 'X3b', 'hr']]
  132. baseball_new.ix[['gonzalu012006','finlest012006'], 5:8]
  133. # remove rows or columns
  134. baseball.drop([89525, 89526])
  135. baseball.drop(['ibb', 'hbp'], axis=1)
  136. ### APPLYING FUNCTIONS ###
  137. # apply
  138. stats = baseball[['h','X2b', 'X3b', 'hr']]
  139. stats.apply(np.median) # median of each column
  140. stats.apply(np.sum, axis=1) # sum of each row
  141. stats.apply(lambda x: x.max() - x.min())
  142. # built-in functions
  143. baseball.mean() # ignores NaN by defalut
  144. mb2.mean(skipna=False)
  145. baseball.describe()
  146. baseball.player.describe() # works on non-numeric data also
  147. baseball.head()
  148. baseball.tail()
  149. ### SORTING AND ORDERING ###
  150. # sorting by index
  151. baseball_new.sort_index()
  152. baseball_new.sort_index(ascending=False)
  153. baseball_new.sort_index(axis=1)
  154. # sorting by value
  155. baseball.hr.order(ascending=False)
  156. baseball.sort_index(ascending=[False,True], by=['sb','cs'])
  157. # rank each value relative to others in the Series
  158. # high rank means high value
  159. baseball.hr.rank()
  160. baseball.rank()
  161. ### HIERARCHICAL INDEXING ###
  162. # specify multiple columns to create hierarchical index
  163. mb = pd.read_csv('../data/microbiome.csv', index_col=['Taxon','Patient'])
  164. # use tuple to subset
  165. mb.ix[('Other',1)]
  166. # subset based on partial index
  167. mb.ix['Other']
  168. ### WRITING TO FILES ###
  169. # write to CSV, useful arguments are sep, na_rep, index, header
  170. mb.to_csv('mb.csv')
  171. # write to disk in binary format
  172. baseball.to_pickle('baseball_pickle')
  173. pd.read_pickle('baseball_pickle')