
/python/demo/main.py

https://github.com/mikea/appengine-mapreduce
#!/usr/bin/env python
#
# Copyright 2011 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This is a sample application that tests the MapReduce API.

It does so by allowing users to upload a zip file containing plaintext files
and perform some kind of analysis upon it. Currently three types of MapReduce
jobs can be run over user-supplied input data: a WordCount MR that reports the
number of occurrences of each word, an Index MR that reports which file(s) each
word in the input corpus comes from, and a Phrases MR that finds statistically
improbable phrases for a given input file (this requires many input files in
the zip file to attain higher accuracy)."""

__author__ = """aizatsky@google.com (Mike Aizatsky), cbunch@google.com (Chris
Bunch)"""

import datetime
import logging
import re
import urllib

from google.appengine.api import taskqueue
from google.appengine.api import users
from google.appengine.ext import blobstore
from google.appengine.ext import db
from google.appengine.ext import webapp
from google.appengine.ext.webapp import blobstore_handlers
from google.appengine.ext.webapp import template
from google.appengine.ext.webapp import util

from mapreduce import base_handler
from mapreduce import mapreduce_pipeline
from mapreduce import operation as op
from mapreduce import shuffler
from mapreduce.lib import files


class FileMetadata(db.Model):
  """A helper class that will hold metadata for the user's blobs.

  Specifically, we want to keep track of who uploaded it, where they uploaded
  it from (right now they can only upload from their computer, but in the
  future urlfetch would be nice to add), and links to the results of their MR
  jobs. To enable our querying to scan over our input data, we store keys in
  the form 'user/date/blob_key', where 'user' is the given user's e-mail
  address, 'date' is the date and time that they uploaded the item on, and
  'blob_key' indicates the location in the Blobstore that the item can be
  found at. '/' is not the actual separator between these values - we use '..'
  since it is an illegal set of characters for an e-mail address to contain.
  """

  __SEP = ".."
  __NEXT = "./"

  owner = db.UserProperty()
  filename = db.StringProperty()
  uploadedOn = db.DateTimeProperty()
  source = db.StringProperty()
  blobkey = db.StringProperty()
  wordcount_link = db.StringProperty()
  index_link = db.StringProperty()
  phrases_link = db.StringProperty()

  @staticmethod
  def getFirstKeyForUser(username):
    """Helper function that returns the first possible key a user could own.

    This is useful for table scanning, in conjunction with getLastKeyForUser.

    Args:
      username: The given user's e-mail address.

    Returns:
      The internal key representing the earliest possible key that a user
      could own (although the value of this key is not able to be used for
      actual user data).
    """
    return db.Key.from_path("FileMetadata", username + FileMetadata.__SEP)

  @staticmethod
  def getLastKeyForUser(username):
    """Helper function that returns the last possible key a user could own.

    This is useful for table scanning, in conjunction with getFirstKeyForUser.

    Args:
      username: The given user's e-mail address.

    Returns:
      The internal key representing the last possible key that a user could
      own (although the value of this key is not able to be used for actual
      user data).
    """
    return db.Key.from_path("FileMetadata", username + FileMetadata.__NEXT)

  @staticmethod
  def getKeyName(username, date, blob_key):
    """Returns the internal key for a particular item in the database.

    Our items are stored with keys of the form 'user/date/blob_key' ('/' is
    not the real separator, but __SEP is).

    Args:
      username: The given user's e-mail address.
      date: A datetime object representing the date and time that an input
        file was uploaded to this app.
      blob_key: The blob key corresponding to the location of the input file
        in the Blobstore.

    Returns:
      The internal key for the item specified by (username, date, blob_key).
    """
    sep = FileMetadata.__SEP
    return str(username + sep + str(date) + sep + blob_key)
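
# Illustrative note (not in the original source): with this scheme, a file
# uploaded by "alice" at 2011-01-01 12:00:00 with blob key "abc123" is stored
# under the key name "alice..2011-01-01 12:00:00..abc123". Since '/' follows
# '.' in ASCII, "./" sorts just after "..", so every key a user owns falls
# strictly between getFirstKeyForUser(u) and getLastKeyForUser(u) - exactly
# the key range IndexHandler queries below.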


class IndexHandler(webapp.RequestHandler):
  """The main page that users will interact with, which presents users with
  the ability to upload new data or run MapReduce jobs on their existing data.
  """

  def get(self):
    user = users.get_current_user()
    username = user.nickname()

    first = FileMetadata.getFirstKeyForUser(username)
    last = FileMetadata.getLastKeyForUser(username)

    q = FileMetadata.all()
    q.filter("__key__ >", first)
    q.filter("__key__ <", last)
    results = q.fetch(10)

    items = [result for result in results]
    length = len(items)

    upload_url = blobstore.create_upload_url("/upload")

    self.response.out.write(template.render("templates/index.html",
                                            {"username": username,
                                             "items": items,
                                             "length": length,
                                             "upload_url": upload_url}))

  def post(self):
    filekey = self.request.get("filekey")
    blob_key = self.request.get("blobkey")

    if self.request.get("word_count"):
      pipeline = WordCountPipeline(filekey, blob_key)
    elif self.request.get("index"):
      pipeline = IndexPipeline(filekey, blob_key)
    else:
      pipeline = PhrasesPipeline(filekey, blob_key)

    pipeline.start()
    self.redirect(pipeline.base_path + "/status?root=" + pipeline.pipeline_id)
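
# (Observation, not original commentary: base_path + "/status?root=<id>" is
# the status UI route served by the underlying pipeline library, so this
# redirect lands the user on a page where they can watch the job progress.)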


def split_into_sentences(s):
  """Split text into list of sentences."""
  s = re.sub(r"\s+", " ", s)
  s = re.sub(r"[\.\?\!]", "\n", s)
  return s.split("\n")


def split_into_words(s):
  """Split a sentence into list of words."""
  s = re.sub(r"\W+", " ", s)
  s = re.sub(r"[_0-9]+", " ", s)
  return s.split()
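
# A quick illustration of the helpers above (comment only, not in the
# original source):
#   split_into_sentences("Hi there! How are you?") -> ["Hi there", " How are you", ""]
#   split_into_words("it's 2011, folks")           -> ["it", "s", "folks"]
# i.e. punctuation splits sentences, while apostrophes break words apart and
# digits/underscores are dropped entirely.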


def word_count_map(data):
  """Word count map function."""
  (entry, text_fn) = data
  text = text_fn()

  logging.debug("Got %s", entry.filename)
  for s in split_into_sentences(text):
    for w in split_into_words(s.lower()):
      yield (w, "")


def word_count_reduce(key, values):
  """Word count reduce function."""
  yield "%s: %d\n" % (key, len(values))


def index_map(data):
  """Index demo map function."""
  (entry, text_fn) = data
  text = text_fn()

  logging.debug("Got %s", entry.filename)
  for s in split_into_sentences(text):
    for w in split_into_words(s.lower()):
      yield (w, entry.filename)


def index_reduce(key, values):
  """Index demo reduce function."""
  yield "%s: %s\n" % (key, list(set(values)))


PHRASE_LENGTH = 4


def phrases_map(data):
  """Phrases demo map function."""
  (entry, text_fn) = data
  text = text_fn()
  filename = entry.filename

  logging.debug("Got %s", filename)
  for s in split_into_sentences(text):
    words = split_into_words(s.lower())
    if len(words) < PHRASE_LENGTH:
      yield (":".join(words), filename)
      continue
    for i in range(0, len(words) - PHRASE_LENGTH):
      yield (":".join(words[i:i+PHRASE_LENGTH]), filename)


def phrases_reduce(key, values):
  """Phrases demo reduce function."""
  if len(values) < 10:
    return
  counts = {}
  for filename in values:
    counts[filename] = counts.get(filename, 0) + 1

  words = re.sub(r":", " ", key)
  threshold = len(values) / 2
  for filename, count in counts.items():
    if count > threshold:
      yield "%s:%s\n" % (words, filename)


class WordCountPipeline(base_handler.PipelineBase):
  """A pipeline to run Word count demo.

  Args:
    filekey: the encoded db key (as a string) of the FileMetadata entity
      whose result link should be updated.
    blobkey: blobkey to process as string. Should be a zip archive with
      text files inside.
  """

  def run(self, filekey, blobkey):
    logging.debug("filename is %s" % filekey)
    output = yield mapreduce_pipeline.MapreducePipeline(
        "word_count",
        "main.word_count_map",
        "main.word_count_reduce",
        "mapreduce.input_readers.BlobstoreZipInputReader",
        "mapreduce.output_writers.BlobstoreOutputWriter",
        mapper_params={
            "blob_key": blobkey,
        },
        reducer_params={
            "mime_type": "text/plain",
        },
        shards=16)
    yield StoreOutput("WordCount", filekey, output)


class IndexPipeline(base_handler.PipelineBase):
  """A pipeline to run Index demo.

  Args:
    filekey: the encoded db key (as a string) of the FileMetadata entity
      whose result link should be updated.
    blobkey: blobkey to process as string. Should be a zip archive with
      text files inside.
  """

  def run(self, filekey, blobkey):
    output = yield mapreduce_pipeline.MapreducePipeline(
        "index",
        "main.index_map",
        "main.index_reduce",
        "mapreduce.input_readers.BlobstoreZipInputReader",
        "mapreduce.output_writers.BlobstoreOutputWriter",
        mapper_params={
            "blob_key": blobkey,
        },
        reducer_params={
            "mime_type": "text/plain",
        },
        shards=16)
    yield StoreOutput("Index", filekey, output)


class PhrasesPipeline(base_handler.PipelineBase):
  """A pipeline to run Phrases demo.

  Args:
    filekey: the encoded db key (as a string) of the FileMetadata entity
      whose result link should be updated.
    blobkey: blobkey to process as string. Should be a zip archive with
      text files inside.
  """

  def run(self, filekey, blobkey):
    output = yield mapreduce_pipeline.MapreducePipeline(
        "phrases",
        "main.phrases_map",
        "main.phrases_reduce",
        "mapreduce.input_readers.BlobstoreZipInputReader",
        "mapreduce.output_writers.BlobstoreOutputWriter",
        mapper_params={
            "blob_key": blobkey,
        },
        reducer_params={
            "mime_type": "text/plain",
        },
        shards=16)
    yield StoreOutput("Phrases", filekey, output)


class StoreOutput(base_handler.PipelineBase):
  """A pipeline to store the result of the MapReduce job in the database.

  Args:
    mr_type: the type of mapreduce job run (e.g., WordCount, Index)
    encoded_key: the DB key corresponding to the metadata of this job
    output: the blobstore location where the output of the job is stored
  """

  def run(self, mr_type, encoded_key, output):
    logging.debug("output is %s" % str(output))
    key = db.Key(encoded=encoded_key)
    m = FileMetadata.get(key)

    if mr_type == "WordCount":
      m.wordcount_link = output[0]
    elif mr_type == "Index":
      m.index_link = output[0]
    elif mr_type == "Phrases":
      m.phrases_link = output[0]

    m.put()
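
# Note (observation on the code above): output arrives as a list of
# blobstore file paths produced by the output writer; the demo stores only
# the first entry, output[0], as the link for the job's results.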


class UploadHandler(blobstore_handlers.BlobstoreUploadHandler):
  """Handler to upload data to blobstore."""

  def post(self):
    source = "uploaded by user"
    upload_files = self.get_uploads("file")
    blob_key = upload_files[0].key()
    name = self.request.get("name")

    user = users.get_current_user()

    username = user.nickname()
    date = datetime.datetime.now()
    str_blob_key = str(blob_key)
    key = FileMetadata.getKeyName(username, date, str_blob_key)

    m = FileMetadata(key_name=key)
    m.owner = user
    m.filename = name
    m.uploadedOn = date
    m.source = source
    m.blobkey = str_blob_key
    m.put()

    self.redirect("/")


class DownloadHandler(blobstore_handlers.BlobstoreDownloadHandler):
  """Handler to download blob by blobkey."""

  def get(self, key):
    key = str(urllib.unquote(key)).strip()
    logging.debug("key is %s" % key)
    blob_info = blobstore.BlobInfo.get(key)
    self.send_blob(blob_info)


APP = webapp.WSGIApplication(
    [
        ('/', IndexHandler),
        ('/upload', UploadHandler),
        (r'/blobstore/(.*)', DownloadHandler),
    ],
    debug=True)


def main():
  util.run_wsgi_app(APP)


if __name__ == '__main__':
  main()