/lessons/06 - Lesson.ipynb
Jupyter | 382 lines | 382 code | 0 blank | 0 comment | 0 complexity | 323edbf736a962d798670ce1169cf0b3 MD5 | raw file
- {
- "metadata": {
- "name": ""
- },
- "nbformat": 3,
- "nbformat_minor": 0,
- "worksheets": [
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Lesson 6 \n",
- "\n",
- "Let's take a look at the ***groupby*** function."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Import libraries\n",
- "from pandas import DataFrame\n",
- "import pandas as pd"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [],
- "prompt_number": 1
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "print('Pandas version: ' + pd.__version__)"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "output_type": "stream",
- "stream": "stdout",
- "text": [
- "Pandas version: 0.13.0\n"
- ]
- }
- ],
- "prompt_number": 2
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Our small data set\n",
- "d = {'one':[1,1,1,1,1],\n",
- " 'two':[2,2,2,2,2],\n",
- " 'letter':['a','a','b','b','c']}\n",
- "\n",
- "# Create dataframe\n",
- "df = DataFrame(d)\n",
- "df"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>letter</th>\n",
- " <th>one</th>\n",
- " <th>two</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td> a</td>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td> a</td>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td> b</td>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>3</th>\n",
- " <td> b</td>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>4</th>\n",
- " <td> c</td>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>5 rows \u00d7 3 columns</p>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 3,
- "text": [
- " letter one two\n",
- "0 a 1 2\n",
- "1 a 1 2\n",
- "2 b 1 2\n",
- "3 b 1 2\n",
- "4 c 1 2\n",
- "\n",
- "[5 rows x 3 columns]"
- ]
- }
- ],
- "prompt_number": 3
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "# Create group object\n",
- "one = df.groupby('letter')\n",
- "\n",
- "# Apply sum function\n",
- "one.sum()"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>one</th>\n",
- " <th>two</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>letter</th>\n",
- " <th></th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>a</th>\n",
- " <td> 2</td>\n",
- " <td> 4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>b</th>\n",
- " <td> 2</td>\n",
- " <td> 4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>c</th>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>3 rows \u00d7 2 columns</p>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 4,
- "text": [
- " one two\n",
- "letter \n",
- "a 2 4\n",
- "b 2 4\n",
- "c 1 2\n",
- "\n",
- "[3 rows x 2 columns]"
- ]
- }
- ],
- "prompt_number": 4
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "letterone = df.groupby(['letter','one']).sum()\n",
- "letterone"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th></th>\n",
- " <th>two</th>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>letter</th>\n",
- " <th>one</th>\n",
- " <th></th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>a</th>\n",
- " <th>1</th>\n",
- " <td> 4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>b</th>\n",
- " <th>1</th>\n",
- " <td> 4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>c</th>\n",
- " <th>1</th>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>3 rows \u00d7 1 columns</p>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 5,
- "text": [
- " two\n",
- "letter one \n",
- "a 1 4\n",
- "b 1 4\n",
- "c 1 2\n",
- "\n",
- "[3 rows x 1 columns]"
- ]
- }
- ],
- "prompt_number": 5
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "letterone.index"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 6,
- "text": [
- "MultiIndex(levels=[[u'a', u'b', u'c'], [1]],\n",
- " labels=[[0, 1, 2], [0, 0, 0]],\n",
- " names=[u'letter', u'one'])"
- ]
- }
- ],
- "prompt_number": 6
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You may ***not*** want the columns you are grouping by to become the index. This is easily achieved by passing `as_index=False` to ***groupby***, as shown below."
- ]
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "letterone = df.groupby(['letter','one'], as_index=False).sum()\n",
- "letterone"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "html": [
- "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
- "<table border=\"1\" class=\"dataframe\">\n",
- " <thead>\n",
- " <tr style=\"text-align: right;\">\n",
- " <th></th>\n",
- " <th>letter</th>\n",
- " <th>one</th>\n",
- " <th>two</th>\n",
- " </tr>\n",
- " </thead>\n",
- " <tbody>\n",
- " <tr>\n",
- " <th>0</th>\n",
- " <td> a</td>\n",
- " <td> 1</td>\n",
- " <td> 4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>1</th>\n",
- " <td> b</td>\n",
- " <td> 1</td>\n",
- " <td> 4</td>\n",
- " </tr>\n",
- " <tr>\n",
- " <th>2</th>\n",
- " <td> c</td>\n",
- " <td> 1</td>\n",
- " <td> 2</td>\n",
- " </tr>\n",
- " </tbody>\n",
- "</table>\n",
- "<p>3 rows \u00d7 3 columns</p>\n",
- "</div>"
- ],
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 7,
- "text": [
- " letter one two\n",
- "0 a 1 4\n",
- "1 b 1 4\n",
- "2 c 1 2\n",
- "\n",
- "[3 rows x 3 columns]"
- ]
- }
- ],
- "prompt_number": 7
- },
- {
- "cell_type": "code",
- "collapsed": false,
- "input": [
- "letterone.index"
- ],
- "language": "python",
- "metadata": {},
- "outputs": [
- {
- "metadata": {},
- "output_type": "pyout",
- "prompt_number": 8,
- "text": [
- "Int64Index([0, 1, 2], dtype='int64')"
- ]
- }
- ],
- "prompt_number": 8
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Author:** [David Rojas LLC](http://hdrojas.pythonanywhere.com/) "
- ]
- }
- ],
- "metadata": {}
- }
- ]
- }