/all-gists/6606487/snippet.py

https://github.com/gistable/gistable · Python

'''
Created on Sep 7, 2013
@author: anuvrat
'''
import pickle
from datetime import datetime
import urllib.request
import urllib.error  # needed for urllib.error.HTTPError in getPageAsSoup
from bs4 import BeautifulSoup
import re
import string
import json
import codecs

def loadState():
    try:
        state_file = open( "itunes_store_state_dump.pba", "rb" )
        apps_discovered = pickle.load( state_file )
        apps_pending = pickle.load( state_file )
        state_file.close()
        print( "Pending = ", len( apps_pending ), " Discovered = ", len( apps_discovered ) )
        return apps_discovered, apps_pending
    except IOError:
        print( "A fresh start ..." )
        return [], []

character_encoding = 'utf-8'
apps_discovered, apps_pending = loadState()
count_offset = len( apps_discovered )
apps_categories = {}
start_time = datetime.now()

def getPageAsSoup( url ):
    try:
        response = urllib.request.urlopen( url )
    except urllib.error.HTTPError as e:
        print( "HTTPError with: ", url, e )
        return None
    the_page = response.read()
    soup = BeautifulSoup( the_page )
    return soup

def reportProgress():
    current_time = datetime.now()
    elapsed = current_time - start_time
    # Avoid a ZeroDivisionError during the first second of a run.
    elapsed_seconds = max( elapsed.seconds, 1 )
    v = ( ( len( apps_discovered ) - count_offset ) / elapsed_seconds ) * 60
    t = len( apps_pending ) / v if v > 0 else 0
    print( "Pending = ", len( apps_pending ), " Discovered = ", len( apps_discovered ),
           " Velocity = ", str( v ), " parsed per min and Time remaining in min = ", str( t ) )
    print( json.dumps( apps_categories ) )

def saveState():
    state_file = open( "itunes_store_state_dump.pba", "wb" )
    pickle.dump( apps_discovered, state_file )
    pickle.dump( apps_pending, state_file )
    state_file.close()
    reportProgress()

def getApps( categoryUrl ):
    previous_apps = []
    start_idx = 1
    while True:
        url = categoryUrl + "&page=" + str( start_idx )
        print( url )
        categoryPage = getPageAsSoup( url )
        if not categoryPage:
            break
        allAppLinks = [aDiv.get( 'href' ) for aDiv in categoryPage.findAll( 'a', href = re.compile( '^https://itunes.apple.com/us/app' ) )]
        # Stop paging once the store repeats the previous page's results.
        if allAppLinks == previous_apps:
            break
        apps_pending.extend( [appLink for appLink in allAppLinks if appLink not in apps_pending] )
        previous_apps = allAppLinks
        start_idx += 1
    saveState()

def getAppDetails( appUrl ):
    if appUrl in apps_discovered: return None
    soup = getPageAsSoup( appUrl )
    if not soup: return None
    pTitleDiv = soup.find( 'p', {'class' : 'title'} )
    if pTitleDiv and pTitleDiv.getText() == 'One Moment Please.': return None
    appDetails = {}
    appDetails['app_url'] = appUrl
    titleDiv = soup.find( 'div', {'id' : 'title'} )
    appDetails['title'] = titleDiv.find( 'h1' ).getText()
    appDetails['developer'] = titleDiv.find( 'h2' ).getText()
    detailsDiv = soup.find( 'div', {'id' : 'left-stack'} )
    if not detailsDiv: return None
    priceDiv = detailsDiv.find( 'div', {'class' : 'price'} )
    if priceDiv: appDetails['price'] = priceDiv.getText()
    categoryDiv = detailsDiv.find( 'li', {'class' : 'genre'} )
    if categoryDiv: appDetails['category'] = categoryDiv.find( 'a' ).getText()
    releaseDateDiv = detailsDiv.find( 'li', {'class' : 'release-date'} )
    if releaseDateDiv: appDetails['release_date'] = releaseDateDiv.getText()
    languageDiv = detailsDiv.find( 'li', {'class' : 'language'} )
    if languageDiv: appDetails['language'] = languageDiv.getText().split()
    contentRatingDiv = detailsDiv.find( 'div', {'class' : 'app-rating'} )
    if contentRatingDiv: appDetails['content_rating'] = contentRatingDiv.getText()
    # Match the rating-reasons list by class; the original searched for a tag literally named 'list app-rating-reasons'.
    contentRatingReasonDiv = detailsDiv.find( attrs = {'class' : 'app-rating-reasons'} )
    if contentRatingReasonDiv: appDetails['content_rating_reason'] = [li.getText() for li in contentRatingReasonDiv.findAll( 'li' )]
    compatibilityDiv = detailsDiv.find( 'p' )
    if compatibilityDiv: appDetails['compatibility'] = compatibilityDiv.getText()
    customerRatingDivs = detailsDiv.findAll( 'div', {'class' : 'rating', 'role': 'img'} )
    if customerRatingDivs:
        customerRating = customerRatingDivs[-1].get( 'aria-label' ).split( ',' )
        appDetails['rating'] = customerRating[0].strip()
        appDetails['reviewers'] = customerRating[1].strip()
    appLinksDiv = soup.find( 'div', {'class' : 'app-links'} )
    if appLinksDiv:
        for link in appLinksDiv.findAll( 'a', {'class' : 'see-all'} ):
            text = link.getText()
            href = link.get( 'href' )
            if text.endswith( 'Web Site' ): appDetails['developer_website'] = href
            elif text.endswith( 'Support' ): appDetails['support'] = href
            elif text.endswith( 'Agreement' ): appDetails['license'] = href
    apps_discovered.append( appUrl )
    return appDetails

def closeFileHandlers( fileHandlers ):
    for v in fileHandlers.values():
        v.close()

if __name__ == '__main__':
    itunesStoreUrl = 'https://itunes.apple.com/us/genre/ios/id36?mt=8'
    mainPage = getPageAsSoup( itunesStoreUrl )
    allCategories = []
    for column in ['list column first', 'list column', 'list column last']:
        columnDiv = mainPage.find( 'ul', {'class' : column} )
        allCategories.extend( aDiv.get( 'href' ) for aDiv in columnDiv.findAll( 'a', href = re.compile( '^https://itunes.apple.com/us/genre' ) ) )
    # Crawl every category page once per letter, collecting app URLs into apps_pending.
    for category, alphabet in [( x, y ) for x in allCategories for y in string.ascii_uppercase]:
        getApps( category + '&letter=' + alphabet )
    fileHandlers = {}
    count = 100
    while apps_pending:
        # Checkpoint the crawl state to disk every 100 apps.
        if count == 0:
            saveState()
            count = 100
        count = count - 1
        app = apps_pending.pop()
        if not app: continue
        try:
            app_data = getAppDetails( app )
        except Exception as e:
            print( app, e )
            exit( 1 )
        if not app_data: continue
        # Apps whose page carried no genre go into an 'uncategorized' bucket.
        if not app_data.get( 'category' ): app_data['category'] = 'uncategorized'
        category_key = app_data['category'].lower()
        if category_key not in fileHandlers:
            fileHandlers[category_key] = codecs.open( '_'.join( ["apple_appstore", category_key] ), 'ab', character_encoding, buffering = 0 )
            apps_categories[category_key] = 0
        apps_categories[category_key] = apps_categories[category_key] + 1
        fileHandler = fileHandlers[category_key]
        try:
            fileHandler.write( json.dumps( app_data ) + "\n" )
        except Exception as e:
            print( e )
    saveState()
    closeFileHandlers( fileHandlers )
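
The script checkpoints its progress by pickling the two URL lists into itunes_store_state_dump.pba (see saveState above). A minimal sketch, not part of the original gist, of how that dump can be read back for inspection:

import pickle

# Read the two objects in the same order saveState() wrote them.
with open( "itunes_store_state_dump.pba", "rb" ) as state_file:
    discovered = pickle.load( state_file )   # app URLs already parsed
    pending = pickle.load( state_file )      # app URLs still queued
print( "Discovered:", len( discovered ), " Pending:", len( pending ) )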