/server/crawlers/crawler_util.py
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities used by crawlers."""

__author__ = 'alexto@google.com (Alexis O. Torres)'

import logging
from os import environ

import gdata
import gdata.client
import gdata.projecthosting
import gdata.projecthosting.client

from google.appengine.ext import deferred
from google.appengine.runtime import DeadlineExceededError

from models import bugs
from models import bugs_util
from models import screenshots
from models import test_cycle
from models import test_cycle_user
from models import url_bug_map
from utils import target_element_util
from utils import screenshots_util
from utils import url_util


# Maximum number of times a task should be retried before giving up.
_MAX_RETRIES = 5

# Maximum length of an issue summary to store.
SUMMARY_LIMIT = 150


class BugCrawlerError(Exception):
  """Generic error thrown when something goes wrong while crawling bugs."""
  pass


def ExtractIssueTrackerBugId(issue):
  """Extract the bug id from a GData bug object."""
  return issue.id.text.split('/')[-1]
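

# Example (illustrative only, not from the original module): for a Project
# Hosting issue whose Atom id text ends in '.../issues/full/12345',
# ExtractIssueTrackerBugId(issue) returns '12345'.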


def SpawnDetailsCrawlersIssueTracker(recent_issues, project_name,
                                     skip_recent_check=False):
  """Queues the tasks to do the actual crawling for recent updates."""
  count = 0
  try:
    for issue in recent_issues:
      bug_id = issue['id']
      logging.info('Adding crawler to the queue for issue_id %s, project: %s.',
                   bug_id, project_name)
      end = bug_id.find('/')
      if end > 0:
        bug_id = bug_id[0:end]
      bug = bugs.GetBug(bug_id=bug_id, project=project_name,
                        provider=bugs_util.Provider.ISSUETRACKER)
      if bug:
        if not skip_recent_check and bug.last_update == issue['updated']:
          logging.info('Bug %s is up-to-date.', bug.key().id_or_name())
          count += 1
          continue
        else:
          logging.info('Bug %s needs to be updated.', bug.key().id_or_name())
      else:
        logging.info('Bug %s seems to be a new issue.', bug_id)
      deferred.defer(ExtractDetailsCrawlerIssueTracker, project_name, bug_id,
                     _queue='find-bugs-queue')
      count += 1
  except DeadlineExceededError:
    remaining = recent_issues[count:]
    deferred.defer(SpawnDetailsCrawlersIssueTracker, remaining, project_name)
    # Raise the failure (the original only constructed it) so this task is not
    # retried; the remaining issues were already re-queued above.
    raise deferred.PermanentTaskFailure(
        'Deadline exceeded, started a new SpawnDetailsCrawler'
        ' for the remaining %d urls.' % len(remaining))
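

# Example usage (a hedged sketch, not part of the original module): a feed
# crawler might queue detail crawlers for recently updated issues roughly as
# follows. The dict keys 'id' and 'updated' match what this function reads
# from each issue; the values are made up.
#
#   recent_issues = [
#       {'id': '12345', 'updated': '2011-06-01T10:20:30.000Z'},
#       {'id': '67890/extra', 'updated': '2011-06-02T11:21:31.000Z'},
#   ]
#   SpawnDetailsCrawlersIssueTracker(recent_issues, 'chromium')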


def ExtractDetailsCrawlerIssueTracker(project_name, bug_id):
  """Extract useful information for a given bug."""
  logging.debug('Scraping details for bug %s in project %s.',
                bug_id, project_name)
  phclient = gdata.projecthosting.client.ProjectHostingClient()
  try:
    query = gdata.projecthosting.client.Query(issue_id=bug_id)
    feed = phclient.get_issues(project_name, query=query)
  except gdata.client.RequestError, e:
    if ('HTTP_X_APPENGINE_TASKRETRYCOUNT' in environ and
        int(environ['HTTP_X_APPENGINE_TASKRETRYCOUNT']) < _MAX_RETRIES):
      if e.status == 403:  # Skip 403 (unauthorized) errors.
        logging.info('Unauthorized to access this issue, skipping: %s, %s',
                     bug_id, project_name)
        # Nuke cached data for private bugs.
        url_bug_map.DeleteBugAndMappings(
            bug_id, project_name, bugs_util.Provider.ISSUETRACKER)
        return
      else:
        raise BugCrawlerError(
            'Error while trying to get details for %s. Error %s' %
            (str(bug_id), str(e)))
    else:
      raise deferred.PermanentTaskFailure(
          'Error hit too many times, aborting '
          'extracting details for bug %s on project %s. Error: %s' %
          (str(bug_id), str(project_name), str(e)))

  if not feed or not feed.entry:
    raise deferred.PermanentTaskFailure(
        'Failed to fetch full details for bug %s' % bug_id)

  entry = feed.entry[0]
  urls = []
  if entry.title.text:
    urls = [(u, url_bug_map.UrlPosition.TITLE)
            for u in url_util.ExtractUrls(entry.title.text)]
  if entry.content.text:
    urls.extend([(u, url_bug_map.UrlPosition.MAIN)
                 for u in url_util.ExtractUrls(entry.content.text)])

  comments = GetComments(project_name, bug_id, phclient)
  comments_text = GetTextInComments(comments)
  if comments_text:
    urls.extend([(u, url_bug_map.UrlPosition.COMMENTS)
                 for u in url_util.ExtractUrls(comments_text)])
  last_updater = GetLastUpdater(comments, FindAuthor(entry))

  if not urls:
    logging.info('Nothing to do, no URLs found for bug %s in project %s.',
                 bug_id, project_name)
    return
  logging.debug('URLs found: %s', str(urls))

  target = (target_element_util.ExtractTargetElement(comments_text) or
            target_element_util.ExtractTargetElement(entry.content.text))
  logging.debug('Target information extracted for bug: %s, '
                'target_element: %s', bug_id, target)

  if entry.status and entry.status.text:  # Status is None sometimes.
    status = entry.status.text
  else:
    logging.warning('Status was not found, setting to unknown.')
    status = 'unknown'

  QueueStoreBug(bug_id=bug_id,
                title=entry.title.text,
                # Guard against a missing description; content.text can be None.
                summary=(entry.content.text or '')[:SUMMARY_LIMIT],
                priority=FindPriority(entry),
                project_name=project_name,
                provider=bugs_util.Provider.ISSUETRACKER,
                # Special case status since it can be None.
                status=status,
                author=FindAuthor(entry),
                details_link=entry.GetAlternateLink().href,
                reported_on=entry.published.text,
                last_update=entry.updated.text,
                last_updater=last_updater,
                target_element=target,
                urls=urls)
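

# Example (illustrative): detail crawling is normally queued from
# SpawnDetailsCrawlersIssueTracker above, but the task body can also be run
# directly for a single issue, e.g. ExtractDetailsCrawlerIssueTracker(
# 'chromium', '12345'). When run as a task, the HTTP_X_APPENGINE_TASKRETRYCOUNT
# value in os.environ bounds how many times RequestErrors are retried before
# the task is aborted.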


def GetComments(project_name, bug_id, phclient=None):
  """Fetches the comments for a specified issue.

  Args:
    project_name: The name of the project (e.g. chromium).
    bug_id: The ID of the bug to fetch comments for.
    phclient: Project Hosting client to use.

  Returns:
    A list of CommentEntry instances.
  """
  # Comments need to be fetched separately from the issue itself.
  if not phclient:
    phclient = gdata.projecthosting.client.ProjectHostingClient()
  comments = []
  try:
    comments = phclient.get_comments(project_name, bug_id)
    comments = comments.entry
  except gdata.client.RequestError, e:
    logging.exception('Error while getting the comments for %s. Error %s',
                      bug_id, e)
  return comments


def GetTextInComments(comments):
  """Gets the text of the given issue's comments as a single string.

  Args:
    comments: A list of CommentEntry instances.

  Returns:
    A string containing the text of all comments, joined by spaces.
  """
  comments_text = [c.content.text for c in comments if c.content.text]
  return ' '.join(comments_text)


def GetLastUpdater(comments, author):
  """Get the last author to update this bug.

  Args:
    comments: A list of CommentEntry instances.
    author: The default last_updater if one isn't found.

  Returns:
    A string containing the alias of the last updater for this bug.
  """
  last_updater = author
  for comment in comments:
    if comment.author:
      last_updater = comment.author[0].name.text
  return last_updater
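

# Example (a hedged sketch, values are made up): the comment helpers are
# typically used together to collect comment text and the latest commenter:
#
#   comments = GetComments('chromium', '12345')
#   text = GetTextInComments(comments)
#   last_updater = GetLastUpdater(comments, author='reporter@example.com')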


def FindPriority(bug_entry):
  """Finds and returns the priority of a provided bug entry.

  Args:
    bug_entry: The provided bug, an IssueEntry instance.

  Returns:
    A string containing the priority of the bug ("1", "2", etc.).
  """
  priority = ''
  for label in bug_entry.label:
    if label.text.lower().startswith('pri-'):
      priority = label.text[4:]
  return priority
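

# Example (illustrative): Issue Tracker encodes priority as a label such as
# 'Pri-2'; for an entry carrying that label, FindPriority(entry) returns '2'.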


def FindAuthor(bug_entry):
  """Finds and returns the author of a provided bug entry."""
  author = ''
  if bug_entry.author:
    author = bug_entry.author[0].name.text
  return author


def QueueStoreBug(bug_id, title, summary, priority,
                  project_name, provider, status, author,
                  details_link, reported_on, last_update,
                  last_updater, target_element, urls, recording_link='',
                  cycle_id=None, expected=None, result=None, author_id='',
                  screenshot=None):
  """Adds a task to update or create a Bug."""
  deferred.defer(StoreBug,
                 bug_id=bug_id,
                 title=title,
                 summary=summary,
                 priority=priority,
                 project_name=project_name,
                 provider=provider,
                 status=status,
                 author=author,
                 author_id=author_id,
                 details_link=details_link,
                 reported_on=reported_on,
                 last_update=last_update,
                 last_updater=last_updater,
                 target_element=target_element,
                 urls=urls,
                 recording_link=recording_link,
                 cycle_id=cycle_id,
                 expected=expected,
                 result=result,
                 screenshot=screenshot,
                 _queue='store-bug-queue')
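

# Example (a hedged sketch, all values are made up): a crawler that has
# already extracted an issue's fields would queue it for storage like this:
#
#   QueueStoreBug(bug_id='12345',
#                 title='Crash when saving http://example.com/form',
#                 summary='Steps to reproduce...',
#                 priority='2',
#                 project_name='chromium',
#                 provider=bugs_util.Provider.ISSUETRACKER,
#                 status='Assigned',
#                 author='reporter@example.com',
#                 details_link='http://example.com/issues/12345',
#                 reported_on='2011-06-01T10:20:30.000Z',
#                 last_update='2011-06-02T11:21:31.000Z',
#                 last_updater='dev@example.com',
#                 target_element='',
#                 urls=[('http://example.com/form',
#                        url_bug_map.UrlPosition.TITLE)])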


def StoreBug(bug_id, title, summary, priority, project_name, provider,
             status, author, details_link, reported_on, last_update,
             last_updater, target_element='', screenshot=None, urls=None,
             recording_link='', cycle_id=None, expected=None, result=None,
             author_id=''):
  """Updates or creates a Bug."""
  screenshot_link = ''
  if screenshot:
    # Store the screenshot data and get the link.
    new_screenshot = screenshots.Add(
        data=screenshots_util.DecodeBase64PNG(screenshot),
        source=provider, project=project_name)
    # NOTE: The original code built this URL from self.request.url, which is
    # undefined in a module-level deferred task; details_link is used here as
    # an assumed stand-in for the serving base URL.
    screenshot_link = screenshots_util.RetrievalUrl(
        details_link, new_screenshot.key().id())

  # Initialize cycle so the references below are safe when no cycle_id is given.
  cycle = None
  if cycle_id:
    cycle = test_cycle.AddTestCycle(provider, project_name, cycle_id)

  if not urls:
    urls = [(u, url_bug_map.UrlPosition.TITLE)
            for u in url_util.ExtractUrls(title)]
    expected = expected or ''
    result = result or ''
    # The summary can be None for bugs without a description.
    text = (summary or '') + ' ' + expected + ' ' + result
    urls.extend([(u, url_bug_map.UrlPosition.TITLE)
                 for u in url_util.ExtractUrls(text)])
    logging.info('URLs extracted: %s', urls)
  urls = urls or []  # Fall back to an empty list if no URLs were found.

  bug = bugs.Store(
      bug_id=str(bug_id),
      title=title,
      summary=summary,
      priority=priority,
      project=project_name,
      provider=provider,
      status=status,
      author=author,
      author_id=author_id,
      details_link=details_link,
      reported_on=reported_on,
      last_update=last_update,
      last_updater=last_updater,
      target_element=target_element,
      screenshot=screenshot_link,
      recording_link=recording_link,
      cycle=cycle,
      expected=expected,
      result=result)
  if cycle:
    test_cycle_user.AddTestCycleUser(author, cycle)

  # TODO(alexto): Do the deletion first in a separate queue, then
  # add the bug-URL mappings to avoid timeouts. For now, this works
  # since the timeout causes the task to re-execute.
  logging.debug('Deleting all existing bug mappings.')
  deleted = url_bug_map.DeleteAllMappingsForBug(bug)
  logging.debug('Mappings deleted: %d', deleted)

  # Queue one mapping task per URL; a plain for loop is clearer than the
  # original side-effect list comprehension and behaves the same way.
  for (url, position) in urls:
    deferred.defer(UpdateUrlBugMappings,
                   bug_key=bug.key().id(),
                   url=url,
                   position=position,
                   _queue='urls-map-queue')


def UpdateUrlBugMappings(bug_key, url, position):
  """Updates or creates a Bug-URL mapping."""
  url_bug_map.StoreUrlBugMapping(target_url=url,
                                 bug=bugs.GetBugByKey(bug_key),
                                 position=position)