PageRenderTime 45ms CodeModel.GetById 18ms RepoModel.GetById 0ms app.codeStats 0ms

/python/projects/pandas/movie-dataset.py

https://github.com/bashwork/common
Python | 62 lines | 31 code | 11 blank | 20 comment | 0 complexity | 672cee2e13091d04878deaaaa460e41e MD5 | raw file
Possible License(s): GPL-2.0
  1. import pandas as pd
  2. import numpy as np
  3. #------------------------------------------------------------
  4. # Initialize the data lens
  5. #------------------------------------------------------------
  6. u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
  7. users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols)
  8. r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
  9. ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols)
  10. m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
  11. movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5))
  12. movie_ratings = pd.merge(movies, ratings)
  13. lens = pd.merge(movie_ratings, users)
  14. #------------------------------------------------------------
  15. # find the top 25 movies as rated by users
  16. #------------------------------------------------------------
  17. #most_rated = lens.title.value_counts()
  18. most_rated = lens.groupby('title').size().order(ascending=False)
  19. print most_rated[:25]
  20. #------------------------------------------------------------
  21. # find the highest rated movies
  22. #------------------------------------------------------------
  23. best_rated = lens.groupby('title').agg({'rating': [np.size, np.mean]})
  24. at_least_100 = best_rated['rating'].size >= 100
  25. print best_rated[at_least_100].sort([('rating', 'mean')], ascending=False)[:25]
  26. #------------------------------------------------------------
  27. # bin our ratins based on user age
  28. #------------------------------------------------------------
  29. most_50 = lens.groupby('movie_id').size().order(ascending=False)[:50]
  30. #plot = users.age.hist(bins=30)
  31. #plot.title("Distribution of user ages")
  32. #plot.ylabel("count of users")
  33. #plot.xlabel("age")
  34. labels = ['0-9', '10-19' ,'20-29', '30-39', '40-49', '50-59', '60-69', '70-79']
  35. lens['age_group'] = pd.cut(lens.age, range(0, 81, 10), right=False, labels=lables)
  36. print lens.groupby('age_group').agg({'rating': [np.size, np.mean]})
  37. lens.set_index('movie_id', inplace=True)
  38. by_age = lens.ix[most_50.ix].groupby(['title', 'age_group'])
  39. print by_age.rating.mean().unstack(1).fillna(0)[10:20]
  40. #------------------------------------------------------------
  41. # difference in ratings by men and woment
  42. #------------------------------------------------------------
  43. lens.reset_index('movie_id', inplace=True)
  44. pivot = lens.pivot_table(rows=['movie_id', 'title'], cols=['sex'], values='rating', fill_value=0)
  45. pivot['diff'] = pivot.M - pivot.F
  46. pivot.reset_index('movie_id', inplace=True)
  47. disagree = pivot[pivot.movie_id.isin(most_50.index)]['diff']
  48. plot = disagree.order().plot(kind='barh', figsize=[9,15])
  49. plot.title('Male vs Female Average Ratings (>0 male)')
  50. plot.ylable('title')
  51. plot.xlable('Average Rating Difference')