PageRenderTime 51ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/lessons/06 - Lesson.ipynb

https://bitbucket.org/timoguin/learn-pandas
Jupyter | 382 lines | 382 code | 0 blank | 0 comment | 0 complexity | 323edbf736a962d798670ce1169cf0b3 MD5 | raw file
  1. {
  2. "metadata": {
  3. "name": ""
  4. },
  5. "nbformat": 3,
  6. "nbformat_minor": 0,
  7. "worksheets": [
  8. {
  9. "cells": [
  10. {
  11. "cell_type": "markdown",
  12. "metadata": {},
  13. "source": [
  14. "# Lesson 6 \n",
  15. "\n",
  16. "Lets take a look at the ***groupby*** function."
  17. ]
  18. },
  19. {
  20. "cell_type": "code",
  21. "collapsed": false,
  22. "input": [
  23. "# Import libraries\n",
  24. "from pandas import DataFrame\n",
  25. "import pandas as pd"
  26. ],
  27. "language": "python",
  28. "metadata": {},
  29. "outputs": [],
  30. "prompt_number": 1
  31. },
  32. {
  33. "cell_type": "code",
  34. "collapsed": false,
  35. "input": [
  36. "print 'Pandas version: ' + pd.__version__"
  37. ],
  38. "language": "python",
  39. "metadata": {},
  40. "outputs": [
  41. {
  42. "output_type": "stream",
  43. "stream": "stdout",
  44. "text": [
  45. "Pandas version: 0.13.0\n"
  46. ]
  47. }
  48. ],
  49. "prompt_number": 2
  50. },
  51. {
  52. "cell_type": "code",
  53. "collapsed": false,
  54. "input": [
  55. "# Our small data set\n",
  56. "d = {'one':[1,1,1,1,1],\n",
  57. " 'two':[2,2,2,2,2],\n",
  58. " 'letter':['a','a','b','b','c']}\n",
  59. "\n",
  60. "# Create dataframe\n",
  61. "df = DataFrame(d)\n",
  62. "df"
  63. ],
  64. "language": "python",
  65. "metadata": {},
  66. "outputs": [
  67. {
  68. "html": [
  69. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  70. "<table border=\"1\" class=\"dataframe\">\n",
  71. " <thead>\n",
  72. " <tr style=\"text-align: right;\">\n",
  73. " <th></th>\n",
  74. " <th>letter</th>\n",
  75. " <th>one</th>\n",
  76. " <th>two</th>\n",
  77. " </tr>\n",
  78. " </thead>\n",
  79. " <tbody>\n",
  80. " <tr>\n",
  81. " <th>0</th>\n",
  82. " <td> a</td>\n",
  83. " <td> 1</td>\n",
  84. " <td> 2</td>\n",
  85. " </tr>\n",
  86. " <tr>\n",
  87. " <th>1</th>\n",
  88. " <td> a</td>\n",
  89. " <td> 1</td>\n",
  90. " <td> 2</td>\n",
  91. " </tr>\n",
  92. " <tr>\n",
  93. " <th>2</th>\n",
  94. " <td> b</td>\n",
  95. " <td> 1</td>\n",
  96. " <td> 2</td>\n",
  97. " </tr>\n",
  98. " <tr>\n",
  99. " <th>3</th>\n",
  100. " <td> b</td>\n",
  101. " <td> 1</td>\n",
  102. " <td> 2</td>\n",
  103. " </tr>\n",
  104. " <tr>\n",
  105. " <th>4</th>\n",
  106. " <td> c</td>\n",
  107. " <td> 1</td>\n",
  108. " <td> 2</td>\n",
  109. " </tr>\n",
  110. " </tbody>\n",
  111. "</table>\n",
  112. "<p>5 rows \u00d7 3 columns</p>\n",
  113. "</div>"
  114. ],
  115. "metadata": {},
  116. "output_type": "pyout",
  117. "prompt_number": 3,
  118. "text": [
  119. " letter one two\n",
  120. "0 a 1 2\n",
  121. "1 a 1 2\n",
  122. "2 b 1 2\n",
  123. "3 b 1 2\n",
  124. "4 c 1 2\n",
  125. "\n",
  126. "[5 rows x 3 columns]"
  127. ]
  128. }
  129. ],
  130. "prompt_number": 3
  131. },
  132. {
  133. "cell_type": "code",
  134. "collapsed": false,
  135. "input": [
  136. "# Create group object\n",
  137. "one = df.groupby('letter')\n",
  138. "\n",
  139. "# Apply sum function\n",
  140. "one.sum()"
  141. ],
  142. "language": "python",
  143. "metadata": {},
  144. "outputs": [
  145. {
  146. "html": [
  147. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  148. "<table border=\"1\" class=\"dataframe\">\n",
  149. " <thead>\n",
  150. " <tr style=\"text-align: right;\">\n",
  151. " <th></th>\n",
  152. " <th>one</th>\n",
  153. " <th>two</th>\n",
  154. " </tr>\n",
  155. " <tr>\n",
  156. " <th>letter</th>\n",
  157. " <th></th>\n",
  158. " <th></th>\n",
  159. " </tr>\n",
  160. " </thead>\n",
  161. " <tbody>\n",
  162. " <tr>\n",
  163. " <th>a</th>\n",
  164. " <td> 2</td>\n",
  165. " <td> 4</td>\n",
  166. " </tr>\n",
  167. " <tr>\n",
  168. " <th>b</th>\n",
  169. " <td> 2</td>\n",
  170. " <td> 4</td>\n",
  171. " </tr>\n",
  172. " <tr>\n",
  173. " <th>c</th>\n",
  174. " <td> 1</td>\n",
  175. " <td> 2</td>\n",
  176. " </tr>\n",
  177. " </tbody>\n",
  178. "</table>\n",
  179. "<p>3 rows \u00d7 2 columns</p>\n",
  180. "</div>"
  181. ],
  182. "metadata": {},
  183. "output_type": "pyout",
  184. "prompt_number": 4,
  185. "text": [
  186. " one two\n",
  187. "letter \n",
  188. "a 2 4\n",
  189. "b 2 4\n",
  190. "c 1 2\n",
  191. "\n",
  192. "[3 rows x 2 columns]"
  193. ]
  194. }
  195. ],
  196. "prompt_number": 4
  197. },
  198. {
  199. "cell_type": "code",
  200. "collapsed": false,
  201. "input": [
  202. "letterone = df.groupby(['letter','one']).sum()\n",
  203. "letterone"
  204. ],
  205. "language": "python",
  206. "metadata": {},
  207. "outputs": [
  208. {
  209. "html": [
  210. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  211. "<table border=\"1\" class=\"dataframe\">\n",
  212. " <thead>\n",
  213. " <tr style=\"text-align: right;\">\n",
  214. " <th></th>\n",
  215. " <th></th>\n",
  216. " <th>two</th>\n",
  217. " </tr>\n",
  218. " <tr>\n",
  219. " <th>letter</th>\n",
  220. " <th>one</th>\n",
  221. " <th></th>\n",
  222. " </tr>\n",
  223. " </thead>\n",
  224. " <tbody>\n",
  225. " <tr>\n",
  226. " <th>a</th>\n",
  227. " <th>1</th>\n",
  228. " <td> 4</td>\n",
  229. " </tr>\n",
  230. " <tr>\n",
  231. " <th>b</th>\n",
  232. " <th>1</th>\n",
  233. " <td> 4</td>\n",
  234. " </tr>\n",
  235. " <tr>\n",
  236. " <th>c</th>\n",
  237. " <th>1</th>\n",
  238. " <td> 2</td>\n",
  239. " </tr>\n",
  240. " </tbody>\n",
  241. "</table>\n",
  242. "<p>3 rows \u00d7 1 columns</p>\n",
  243. "</div>"
  244. ],
  245. "metadata": {},
  246. "output_type": "pyout",
  247. "prompt_number": 5,
  248. "text": [
  249. " two\n",
  250. "letter one \n",
  251. "a 1 4\n",
  252. "b 1 4\n",
  253. "c 1 2\n",
  254. "\n",
  255. "[3 rows x 1 columns]"
  256. ]
  257. }
  258. ],
  259. "prompt_number": 5
  260. },
  261. {
  262. "cell_type": "code",
  263. "collapsed": false,
  264. "input": [
  265. "letterone.index"
  266. ],
  267. "language": "python",
  268. "metadata": {},
  269. "outputs": [
  270. {
  271. "metadata": {},
  272. "output_type": "pyout",
  273. "prompt_number": 6,
  274. "text": [
  275. "MultiIndex(levels=[[u'a', u'b', u'c'], [1]],\n",
  276. " labels=[[0, 1, 2], [0, 0, 0]],\n",
  277. " names=[u'letter', u'one'])"
  278. ]
  279. }
  280. ],
  281. "prompt_number": 6
  282. },
  283. {
  284. "cell_type": "markdown",
  285. "metadata": {},
  286. "source": [
  287. "You may want to ***not*** have the columns you are grouping by become your index, this can be easily achieved as shown below."
  288. ]
  289. },
  290. {
  291. "cell_type": "code",
  292. "collapsed": false,
  293. "input": [
  294. "letterone = df.groupby(['letter','one'], as_index=False).sum()\n",
  295. "letterone"
  296. ],
  297. "language": "python",
  298. "metadata": {},
  299. "outputs": [
  300. {
  301. "html": [
  302. "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
  303. "<table border=\"1\" class=\"dataframe\">\n",
  304. " <thead>\n",
  305. " <tr style=\"text-align: right;\">\n",
  306. " <th></th>\n",
  307. " <th>letter</th>\n",
  308. " <th>one</th>\n",
  309. " <th>two</th>\n",
  310. " </tr>\n",
  311. " </thead>\n",
  312. " <tbody>\n",
  313. " <tr>\n",
  314. " <th>0</th>\n",
  315. " <td> a</td>\n",
  316. " <td> 1</td>\n",
  317. " <td> 4</td>\n",
  318. " </tr>\n",
  319. " <tr>\n",
  320. " <th>1</th>\n",
  321. " <td> b</td>\n",
  322. " <td> 1</td>\n",
  323. " <td> 4</td>\n",
  324. " </tr>\n",
  325. " <tr>\n",
  326. " <th>2</th>\n",
  327. " <td> c</td>\n",
  328. " <td> 1</td>\n",
  329. " <td> 2</td>\n",
  330. " </tr>\n",
  331. " </tbody>\n",
  332. "</table>\n",
  333. "<p>3 rows \u00d7 3 columns</p>\n",
  334. "</div>"
  335. ],
  336. "metadata": {},
  337. "output_type": "pyout",
  338. "prompt_number": 7,
  339. "text": [
  340. " letter one two\n",
  341. "0 a 1 4\n",
  342. "1 b 1 4\n",
  343. "2 c 1 2\n",
  344. "\n",
  345. "[3 rows x 3 columns]"
  346. ]
  347. }
  348. ],
  349. "prompt_number": 7
  350. },
  351. {
  352. "cell_type": "code",
  353. "collapsed": false,
  354. "input": [
  355. "letterone.index"
  356. ],
  357. "language": "python",
  358. "metadata": {},
  359. "outputs": [
  360. {
  361. "metadata": {},
  362. "output_type": "pyout",
  363. "prompt_number": 8,
  364. "text": [
  365. "Int64Index([0, 1, 2], dtype='int64')"
  366. ]
  367. }
  368. ],
  369. "prompt_number": 8
  370. },
  371. {
  372. "cell_type": "markdown",
  373. "metadata": {},
  374. "source": [
  375. "**Author:** [David Rojas LLC](http://hdrojas.pythonanywhere.com/) "
  376. ]
  377. }
  378. ],
  379. "metadata": {}
  380. }
  381. ]
  382. }