- #!/usr/bin/env python
- #
- # Copyright 2011 Google Inc.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """ This is a sample application that tests the MapReduce API.
- It does so by allowing users to upload a zip file containing plaintext files
- and perform some kind of analysis upon it. Currently three types of MapReduce
- jobs can be run over user-supplied input data: a WordCount MR that reports the
- number of occurrences of each word, an Index MR that reports which file(s) each
- word in the input corpus comes from, and a Phrase MR that finds statistically
- improbable phrases for a given input file (this requires many input files in the
- zip file to attain higher accuracies)."""
- __author__ = """aizatsky@google.com (Mike Aizatsky), cbunch@google.com (Chris
- Bunch)"""
- import datetime
- import logging
- import re
- import urllib
- from google.appengine.ext import blobstore
- from google.appengine.ext import db
- from google.appengine.ext import webapp
- from google.appengine.ext.webapp import blobstore_handlers
- from google.appengine.ext.webapp import util
- from google.appengine.ext.webapp import template
- from mapreduce.lib import files
- from google.appengine.api import taskqueue
- from google.appengine.api import users
- from mapreduce import base_handler
- from mapreduce import mapreduce_pipeline
- from mapreduce import operation as op
- from mapreduce import shuffler
class FileMetadata(db.Model):
    """Datastore record describing one uploaded blob and its MapReduce results.

    Each entity stores who uploaded the file, the file's name, when it was
    uploaded, a free-form description of where it came from, the Blobstore
    key of the upload, and one link per MapReduce job type pointing at that
    job's output.

    Key names have the form 'user<SEP>date<SEP>blob_key', where <SEP> is '..'.
    Because every key of a user shares the prefix 'user..', all of a user's
    entities can be found with a key-range scan bounded by getFirstKeyForUser
    and getLastKeyForUser.  The original author chose '..' because it cannot
    appear inside an e-mail address; note that the handlers in this module
    pass user.nickname() as the user component.
    """

    # Separator placed between the components of a key name.
    __SEP = ".."
    # '/' is the character immediately after '.', so 'user./' sorts after
    # every key name that starts with 'user..'.
    __NEXT = "./"

    owner = db.UserProperty()
    filename = db.StringProperty()
    uploadedOn = db.DateTimeProperty()
    source = db.StringProperty()
    blobkey = db.StringProperty()

    # Links to the output of each kind of MapReduce job run on this upload.
    wordcount_link = db.StringProperty()
    index_link = db.StringProperty()
    phrases_link = db.StringProperty()

    @staticmethod
    def getFirstKeyForUser(username):
        """Return the exclusive lower bound of a user's key range.

        Intended to be paired with getLastKeyForUser for range scans.

        Args:
          username: The user component used when the key names were built.

        Returns:
          A db.Key of kind FileMetadata named 'username..'.  It sorts before
          every real key of that user and is not itself a valid data key.
        """
        lower_name = username + FileMetadata.__SEP
        return db.Key.from_path("FileMetadata", lower_name)

    @staticmethod
    def getLastKeyForUser(username):
        """Return the exclusive upper bound of a user's key range.

        Intended to be paired with getFirstKeyForUser for range scans.

        Args:
          username: The user component used when the key names were built.

        Returns:
          A db.Key of kind FileMetadata named 'username./'.  It sorts after
          every real key of that user and is not itself a valid data key.
        """
        upper_name = username + FileMetadata.__NEXT
        return db.Key.from_path("FileMetadata", upper_name)

    @staticmethod
    def getKeyName(username, date, blob_key):
        """Build the key name for one uploaded item.

        Args:
          username: The user component of the key name.
          date: A datetime object giving when the file was uploaded; it is
            converted with str().
          blob_key: The Blobstore key of the uploaded file, as a string.

        Returns:
          The string 'username..date..blob_key'.
        """
        components = (username, str(date), blob_key)
        return str(FileMetadata.__SEP.join(components))
class IndexHandler(webapp.RequestHandler):
    """Main page: lists the user's uploads and starts MapReduce jobs on them."""

    def get(self):
        """Render the index page with up to ten of the current user's files."""
        username = users.get_current_user().nickname()

        # Key-range scan: every key of this user lies strictly between the
        # two bounds returned by FileMetadata.
        lower_bound = FileMetadata.getFirstKeyForUser(username)
        upper_bound = FileMetadata.getLastKeyForUser(username)
        query = FileMetadata.all()
        query.filter("__key__ >", lower_bound)
        query.filter("__key__ < ", upper_bound)
        items = list(query.fetch(10))

        context = {
            "username": username,
            "items": items,
            "length": len(items),
            "upload_url": blobstore.create_upload_url("/upload"),
        }
        page = template.render("templates/index.html", context)
        self.response.out.write(page)

    def post(self):
        """Start the requested pipeline and redirect to its status page.

        The form fields 'word_count' and 'index' select the job type; when
        neither is set the phrases job is run.
        """
        filekey = self.request.get("filekey")
        blob_key = self.request.get("blobkey")

        if self.request.get("word_count"):
            pipeline_class = WordCountPipeline
        elif self.request.get("index"):
            pipeline_class = IndexPipeline
        else:
            pipeline_class = PhrasesPipeline

        pipeline = pipeline_class(filekey, blob_key)
        pipeline.start()

        status_url = pipeline.base_path + "/status?root=" + pipeline.pipeline_id
        self.redirect(status_url)
def split_into_sentences(s):
    """Split text into a list of sentences.

    Runs of whitespace (including newlines) are collapsed to a single space,
    then the text is cut at every '.', '?' or '!'.  The terminators are
    dropped.  Pieces are not stripped, so a piece may start with a space, and
    the final piece is empty when the text ends with a terminator.

    Args:
      s: The text to split.

    Returns:
      A list of strings, one per sentence-like piece.  An empty input gives
      [''].
    """
    collapsed = re.sub(r"\s+", " ", s)
    # The earlier pattern r"[\\.\\?\\!]" was over-escaped: in a raw string it
    # also placed a literal backslash in the character class, so backslashes
    # were wrongly treated as sentence terminators.
    return re.split(r"[.?!]", collapsed)
def split_into_words(s):
    """Split a sentence into a list of words.

    A word is a maximal run of word characters that contains no underscore
    and no ASCII digit; everything else acts as a separator.

    Args:
      s: The sentence to split.

    Returns:
      The list of words, in order of appearance (empty if there are none).
    """
    # [^\W_0-9] == "a word character that is neither '_' nor a digit 0-9".
    return re.findall(r"[^\W_0-9]+", s)
def word_count_map(data):
    """Map step of the word count job.

    Args:
      data: An (entry, text_fn) pair.  entry exposes a filename attribute and
        text_fn() returns the text of that file.

    Yields:
      One (word, "") pair for every word occurrence in the lower-cased text.
    """
    entry, text_fn = data
    contents = text_fn()
    logging.debug("Got %s", entry.filename)

    for sentence in split_into_sentences(contents):
        lowered = sentence.lower()
        for word in split_into_words(lowered):
            # The value is unused; the reducer only counts occurrences.
            yield (word, "")
def word_count_reduce(key, values):
    """Reduce step of the word count job.

    Args:
      key: The word.
      values: The values emitted for that word; only their number matters.

    Yields:
      A single line of the form 'word: count' terminated by a newline.
    """
    occurrences = len(values)
    line = "%s: %d\n" % (key, occurrences)
    yield line
def index_map(data):
    """Map step of the index job.

    Args:
      data: An (entry, text_fn) pair.  entry exposes a filename attribute and
        text_fn() returns the text of that file.

    Yields:
      One (word, filename) pair for every word occurrence in the lower-cased
      text.
    """
    entry, text_fn = data
    contents = text_fn()
    logging.debug("Got %s", entry.filename)

    for sentence in split_into_sentences(contents):
        lowered = sentence.lower()
        for word in split_into_words(lowered):
            yield (word, entry.filename)
def index_reduce(key, values):
    """Reduce step of the index job.

    Args:
      key: The word.
      values: The file names emitted for that word, possibly with repeats.

    Yields:
      A single line 'word: [files]' terminated by a newline, where [files] is
      the list of distinct file names.
    """
    # Sort the distinct names: a bare list(set(...)) has an arbitrary order,
    # which made the output differ from run to run.
    distinct_files = sorted(set(values))
    yield "%s: %s\n" % (key, distinct_files)
# Number of consecutive words that make up one phrase.
# NOTE(review): phrases_map reads this name, but no definition of it is
# visible in this module, so the function raised NameError.  4 is an assumed
# value -- confirm against the intended phrase length.
PHRASE_LENGTH = 4


def phrases_map(data):
    """Map step of the phrases job.

    Every sentence of the lower-cased text is cut into overlapping windows of
    PHRASE_LENGTH consecutive words.  A sentence shorter than PHRASE_LENGTH
    is emitted whole as a single phrase; a sentence without any word emits
    nothing.

    Args:
      data: An (entry, text_fn) pair.  entry exposes a filename attribute and
        text_fn() returns the text of that file.

    Yields:
      (phrase, filename) pairs, where phrase is the words joined with ':'.
    """
    (entry, text_fn) = data
    text = text_fn()
    filename = entry.filename
    logging.debug("Got %s", filename)

    for sentence in split_into_sentences(text):
        words = split_into_words(sentence.lower())
        if not words:
            # Do not emit an empty-string phrase for blank sentences.
            continue
        if len(words) < PHRASE_LENGTH:
            yield (":".join(words), filename)
            continue
        # "+ 1" includes the final window; without it the last phrase of each
        # sentence was dropped, and a sentence of exactly PHRASE_LENGTH words
        # produced nothing at all.
        for start in range(len(words) - PHRASE_LENGTH + 1):
            yield (":".join(words[start:start + PHRASE_LENGTH]), filename)
def phrases_reduce(key, values):
    """Reduce step of the phrases job.

    A phrase is reported for a file when the phrase occurs at least ten times
    overall and more than half of those occurrences come from that file.

    Args:
      key: The phrase, with its words joined by ':'.
      values: The file name of every occurrence of the phrase.

    Yields:
      Lines of the form 'phrase words:filename' terminated by a newline.
    """
    total = len(values)
    if total < 10:
        return

    # Tally occurrences per file.
    per_file = {}
    for name in values:
        if name in per_file:
            per_file[name] += 1
        else:
            per_file[name] = 1

    phrase = key.replace(":", " ")
    for name, hits in per_file.items():
        # Integer form of "hits > total / 2".
        if 2 * hits > total:
            yield "%s:%s\n" % (phrase, name)
class WordCountPipeline(base_handler.PipelineBase):
    """Pipeline that runs the word count MapReduce and records its output.

    Args:
      filekey: encoded datastore key, as a string, of the FileMetadata entity
        that should receive the link to the output.
      blobkey: blobkey to process as string. Should be a zip archive with
        text files inside.
    """

    def run(self, filekey, blobkey):
        logging.debug("filename is %s" % filekey)

        mapper_settings = {"blob_key": blobkey}
        reducer_settings = {"mime_type": "text/plain"}

        # The yielded child pipeline's result is handed to StoreOutput below.
        output = yield mapreduce_pipeline.MapreducePipeline(
            "word_count",
            "main.word_count_map",
            "main.word_count_reduce",
            "mapreduce.input_readers.BlobstoreZipInputReader",
            "mapreduce.output_writers.BlobstoreOutputWriter",
            mapper_params=mapper_settings,
            reducer_params=reducer_settings,
            shards=16)
        yield StoreOutput("WordCount", filekey, output)
class IndexPipeline(base_handler.PipelineBase):
    """Pipeline that runs the index MapReduce and records its output.

    Args:
      filekey: encoded datastore key, as a string, of the FileMetadata entity
        that should receive the link to the output.
      blobkey: blobkey to process as string. Should be a zip archive with
        text files inside.
    """

    def run(self, filekey, blobkey):
        mapper_settings = {"blob_key": blobkey}
        reducer_settings = {"mime_type": "text/plain"}

        # The yielded child pipeline's result is handed to StoreOutput below.
        output = yield mapreduce_pipeline.MapreducePipeline(
            "index",
            "main.index_map",
            "main.index_reduce",
            "mapreduce.input_readers.BlobstoreZipInputReader",
            "mapreduce.output_writers.BlobstoreOutputWriter",
            mapper_params=mapper_settings,
            reducer_params=reducer_settings,
            shards=16)
        yield StoreOutput("Index", filekey, output)
class PhrasesPipeline(base_handler.PipelineBase):
    """Pipeline that runs the phrases MapReduce and records its output.

    Args:
      filekey: encoded datastore key, as a string, of the FileMetadata entity
        that should receive the link to the output.
      blobkey: blobkey to process as string. Should be a zip archive with
        text files inside.
    """

    def run(self, filekey, blobkey):
        mapper_settings = {"blob_key": blobkey}
        reducer_settings = {"mime_type": "text/plain"}

        # The yielded child pipeline's result is handed to StoreOutput below.
        output = yield mapreduce_pipeline.MapreducePipeline(
            "phrases",
            "main.phrases_map",
            "main.phrases_reduce",
            "mapreduce.input_readers.BlobstoreZipInputReader",
            "mapreduce.output_writers.BlobstoreOutputWriter",
            mapper_params=mapper_settings,
            reducer_params=reducer_settings,
            shards=16)
        yield StoreOutput("Phrases", filekey, output)
class StoreOutput(base_handler.PipelineBase):
    """Pipeline that saves a MapReduce job's output link on its FileMetadata.

    Args:
      mr_type: the type of mapreduce job run ("WordCount", "Index" or
        "Phrases"); any other value leaves the links untouched.
      encoded_key: the encoded DB key of the FileMetadata entity for this job
      output: the job output; its first element is stored as the link
    """

    def run(self, mr_type, encoded_key, output):
        logging.debug("output is %s" % str(output))

        metadata = FileMetadata.get(db.Key(encoded=encoded_key))

        # Job type -> name of the FileMetadata property holding its link.
        link_property = {
            "WordCount": "wordcount_link",
            "Index": "index_link",
            "Phrases": "phrases_link",
        }.get(mr_type)
        if link_property is not None:
            setattr(metadata, link_property, output[0])

        metadata.put()
class UploadHandler(blobstore_handlers.BlobstoreUploadHandler):
    """Handler to upload data to blobstore."""

    def post(self):
        """Record metadata for the uploaded blob and return to the index page.

        Reads the upload from the "file" form field and the display name from
        the "name" field, stores a FileMetadata entity keyed by
        (nickname, upload time, blob key) and redirects to "/".  When the
        request carries no upload, nothing is stored.
        """
        source = "uploaded by user"
        upload_files = self.get_uploads("file")
        if not upload_files:
            # The form was submitted without a file; indexing [0] below would
            # raise IndexError.  Go back to the index page instead.
            self.redirect("/")
            return
        blob_key = upload_files[0].key()
        name = self.request.get("name")

        user = users.get_current_user()
        username = user.nickname()
        date = datetime.datetime.now()
        str_blob_key = str(blob_key)
        key = FileMetadata.getKeyName(username, date, str_blob_key)

        m = FileMetadata(key_name = key)
        m.owner = user
        m.filename = name
        m.uploadedOn = date
        m.source = source
        m.blobkey = str_blob_key
        m.put()

        self.redirect("/")
class DownloadHandler(blobstore_handlers.BlobstoreDownloadHandler):
    """Handler to download blob by blobkey."""

    def get(self, key):
        """Serve the blob identified by the URL-quoted blob key.

        Args:
          key: The blob key captured from the request path, URL-quoted.

        Responds with 404 when no blob exists for the key.
        """
        key = str(urllib.unquote(key)).strip()
        logging.debug("key is %s" % key)
        blob_info = blobstore.BlobInfo.get(key)
        if blob_info is None:
            # Unknown key: answer "not found" rather than passing None on to
            # send_blob.
            self.error(404)
            return
        self.send_blob(blob_info)
# WSGI application: maps URL patterns to their request handlers.  The
# capture group in the download route is passed to DownloadHandler.get as
# the blob key.
# NOTE(review): debug=True is presumably meant for development only --
# confirm before deploying.
APP = webapp.WSGIApplication([('/', IndexHandler),
                              ('/upload', UploadHandler),
                              (r'/blobstore/(.*)', DownloadHandler)],
                             debug=True)
def main():
    """Run the module's WSGI application (APP)."""
    util.run_wsgi_app(APP)


# Script entry point.
if __name__ == '__main__':
    main()