
/server/crawlers/crawler_util.py

https://code.google.com/

# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities used by crawlers."""
__author__ = 'alexto@google.com (Alexis O. Torres)'

import logging
from os import environ

import gdata
import gdata.client
import gdata.projecthosting
import gdata.projecthosting.client
from google.appengine.ext import deferred
from google.appengine.runtime import DeadlineExceededError

from models import bugs
from models import bugs_util
from models import screenshots
from models import test_cycle
from models import test_cycle_user
from models import url_bug_map
from utils import screenshots_util
from utils import target_element_util
from utils import url_util

# Maximum number of times a task should be retried.
_MAX_RETRIES = 5

# Maximum length of an issue summary to store.
SUMMARY_LIMIT = 150


class BugCrawlerError(Exception):
  """Generic error thrown when something goes wrong while crawling bugs."""
  pass


def ExtractIssueTrackerBugId(issue):
  """Extracts the bug id from a GData issue object."""
  return issue.id.text.split('/')[-1]
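
# Illustrative only: in a GData issue feed, issue.id.text is a URL-like
# string whose last path segment is the numeric id. For a hypothetical value
# 'http://code.google.com/feeds/issues/p/chromium/issues/full/12345',
# ExtractIssueTrackerBugId returns '12345'.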


def SpawnDetailsCrawlersIssueTracker(recent_issues, project_name,
                                     skip_recent_check=False):
  """Queues the tasks that do the actual crawling of recent updates."""
  count = 0
  try:
    for issue in recent_issues:
      bug_id = issue['id']
      logging.info('Adding crawler to the queue for issue_id %s, project: %s.',
                   bug_id, project_name)
      end = bug_id.find('/')
      if end > 0:
        bug_id = bug_id[0:end]
      bug = bugs.GetBug(bug_id=bug_id, project=project_name,
                        provider=bugs_util.Provider.ISSUETRACKER)
      if bug:
        if not skip_recent_check and bug.last_update == issue['updated']:
          logging.info('Bug %s is up-to-date.', bug.key().id_or_name())
          count += 1
          continue
        else:
          logging.info('Bug %s needs to be updated.', bug.key().id_or_name())
      else:
        logging.info('Bug %s seems to be a new issue.', bug_id)
      deferred.defer(ExtractDetailsCrawlerIssueTracker, project_name, bug_id,
                     _queue='find-bugs-queue')
      count += 1
  except DeadlineExceededError:
    remaining = recent_issues[count:]
    deferred.defer(SpawnDetailsCrawlersIssueTracker, remaining, project_name)
    # Raise, rather than just construct, PermanentTaskFailure so the deferred
    # library marks this task as done instead of retrying it.
    raise deferred.PermanentTaskFailure(
        'Deadline exceeded, started a new SpawnDetailsCrawler'
        ' for the remaining %d issues.' % len(remaining))
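
# Illustrative only: a feed poller (hypothetical) would typically enqueue this
# spawner with the recently updated issues of a project, e.g.:
#
#   recent = [{'id': '12345', 'updated': '2011-01-01T00:00:00Z'}]
#   deferred.defer(SpawnDetailsCrawlersIssueTracker, recent, 'chromium')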


def ExtractDetailsCrawlerIssueTracker(project_name, bug_id):
  """Extracts useful information for a given bug."""
  logging.debug('Scraping details for bug %s in project %s.',
                bug_id, project_name)
  phclient = gdata.projecthosting.client.ProjectHostingClient()
  try:
    query = gdata.projecthosting.client.Query(issue_id=bug_id)
    feed = phclient.get_issues(project_name, query=query)
  except gdata.client.RequestError, e:
    if ('HTTP_X_APPENGINE_TASKRETRYCOUNT' in environ and
        int(environ['HTTP_X_APPENGINE_TASKRETRYCOUNT']) < _MAX_RETRIES):
      if e.status == 403:  # Skip 403 (Forbidden) errors.
        logging.info('Not authorized to access this issue, skipping: %s, %s',
                     bug_id, project_name)
        # Nuke cached data for private bugs.
        url_bug_map.DeleteBugAndMappings(
            bug_id, project_name, bugs_util.Provider.ISSUETRACKER)
        return
      else:
        raise BugCrawlerError(
            'Error while trying to get details for %s. Error: %s' %
            (str(bug_id), str(e)))
    else:
      raise deferred.PermanentTaskFailure(
          'Error hit too many times, aborting '
          'extracting details for bug %s on project %s. Error: %s' %
          (str(bug_id), str(project_name), str(e)))

  if not feed or not feed.entry:
    raise deferred.PermanentTaskFailure(
        'Failed to fetch full details for bug %s.' % bug_id)

  entry = feed.entry[0]
  urls = []
  if entry.title.text:
    urls = [(u, url_bug_map.UrlPosition.TITLE)
            for u in url_util.ExtractUrls(entry.title.text)]
  if entry.content.text:
    urls.extend([(u, url_bug_map.UrlPosition.MAIN)
                 for u in url_util.ExtractUrls(entry.content.text)])
  comments = GetComments(project_name, bug_id, phclient)
  comments_text = GetTextInComments(comments)
  if comments_text:
    urls.extend([(u, url_bug_map.UrlPosition.COMMENTS)
                 for u in url_util.ExtractUrls(comments_text)])
  last_updater = GetLastUpdater(comments, FindAuthor(entry))

  if not urls:
    logging.info('Nothing to do, no URLs found for bug %s in project %s.',
                 bug_id, project_name)
    return
  logging.debug('URLs found: %s', str(urls))

  target = (target_element_util.ExtractTargetElement(comments_text) or
            target_element_util.ExtractTargetElement(entry.content.text))
  logging.debug('Target information extracted for bug: %s, '
                'target_element: %s', bug_id, target)

  if entry.status and entry.status.text:  # Status is None sometimes.
    status = entry.status.text
  else:
    logging.warning('Status was not found, setting it to unknown.')
    status = 'unknown'

  QueueStoreBug(bug_id=bug_id,
                title=entry.title.text,
                # content.text can be None, so guard before slicing.
                summary=(entry.content.text or '')[:SUMMARY_LIMIT],
                priority=FindPriority(entry),
                project_name=project_name,
                provider=bugs_util.Provider.ISSUETRACKER,
                # Special case status since it can be None.
                status=status,
                author=FindAuthor(entry),
                details_link=entry.GetAlternateLink().href,
                reported_on=entry.published.text,
                last_update=entry.updated.text,
                last_updater=last_updater,
                target_element=target,
                urls=urls)


def GetComments(project_name, bug_id, phclient=None):
  """Fetches the comments of the specified issue.

  Args:
    project_name: The name of the project (e.g. chromium).
    bug_id: The ID of the bug to fetch comments for.
    phclient: Project Hosting client to use.

  Returns:
    A list of CommentEntry instances.
  """
  # Comments need to be fetched separately from the issue itself.
  if not phclient:
    phclient = gdata.projecthosting.client.ProjectHostingClient()
  comments = []
  try:
    comments = phclient.get_comments(project_name, bug_id).entry
  except gdata.client.RequestError, e:
    logging.exception('Error while getting the comments for %s. Error: %s',
                      bug_id, e)
  return comments


def GetTextInComments(comments):
  """Joins the text of the given comments into a single string.

  Args:
    comments: A list of CommentEntry instances.

  Returns:
    A string containing the text of all comments, separated by spaces.
  """
  comments_text = [c.content.text for c in comments if c.content.text]
  return ' '.join(comments_text)


def GetLastUpdater(comments, author):
  """Gets the last author to update this bug.

  Args:
    comments: A list of CommentEntry instances.
    author: The default last_updater if one isn't found.

  Returns:
    A string containing the alias of the last updater of this bug.
  """
  last_updater = author
  for comment in comments:
    if comment.author:
      last_updater = comment.author[0].name.text
  return last_updater


def FindPriority(bug_entry):
  """Finds and returns the priority of the provided bug entry.

  Args:
    bug_entry: The provided bug, an IssueEntry instance.

  Returns:
    A string containing the priority of the bug ('1', '2', etc.).
  """
  priority = ''
  for label in bug_entry.label:
    if label.text.lower().startswith('pri-'):
      priority = label.text[4:]
  return priority
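
# Illustrative only: Issue Tracker encodes priority as a label, so an entry
# carrying a (hypothetical) label 'Pri-2' makes FindPriority return '2'.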


def FindAuthor(bug_entry):
  """Finds and returns the author of the provided bug entry."""
  author = ''
  if bug_entry.author:
    author = bug_entry.author[0].name.text
  return author


def QueueStoreBug(bug_id, title, summary, priority,
                  project_name, provider, status, author,
                  details_link, reported_on, last_update,
                  last_updater, target_element, urls, recording_link='',
                  cycle_id=None, expected=None, result=None, author_id='',
                  screenshot=None):
  """Adds a task to update or create a bug."""
  deferred.defer(StoreBug,
                 bug_id=bug_id,
                 title=title,
                 summary=summary,
                 priority=priority,
                 project_name=project_name,
                 provider=provider,
                 status=status,
                 author=author,
                 author_id=author_id,
                 details_link=details_link,
                 reported_on=reported_on,
                 last_update=last_update,
                 last_updater=last_updater,
                 target_element=target_element,
                 urls=urls,
                 recording_link=recording_link,
                 cycle_id=cycle_id,
                 expected=expected,
                 result=result,
                 screenshot=screenshot,
                 _queue='store-bug-queue')


def StoreBug(bug_id, title, summary, priority, project_name, provider,
             status, author, details_link, reported_on, last_update,
             last_updater, target_element='', screenshot=None, urls=None,
             recording_link='', cycle_id=None, expected=None, result=None,
             author_id=''):
  """Updates or creates a bug."""
  screenshot_link = ''
  if screenshot:
    # Store the screenshot data and get the link.
    new_screenshot = screenshots.Add(
        data=screenshots_util.DecodeBase64PNG(screenshot),
        source=provider, project=project_name)
    # Module-level tasks have no request object, so derive the current
    # request URL from the CGI environment.
    request_url = 'http://%s%s' % (environ.get('HTTP_HOST', ''),
                                   environ.get('PATH_INFO', ''))
    screenshot_link = screenshots_util.RetrievalUrl(
        request_url, new_screenshot.key().id())

  cycle = None
  if cycle_id:
    cycle = test_cycle.AddTestCycle(provider, project_name, cycle_id)

  if not urls:
    urls = [(u, url_bug_map.UrlPosition.TITLE)
            for u in url_util.ExtractUrls(title)]
  expected = expected or ''
  result = result or ''
  # summary can be None, so guard before joining.
  text = ' '.join([summary or '', expected, result])
  urls.extend([(u, url_bug_map.UrlPosition.TITLE)
               for u in url_util.ExtractUrls(text)])
  logging.info('URLs to map: %s', urls)

  bug = bugs.Store(
      bug_id=str(bug_id),
      title=title,
      summary=summary,
      priority=priority,
      project=project_name,
      provider=provider,
      status=status,
      author=author,
      author_id=author_id,
      details_link=details_link,
      reported_on=reported_on,
      last_update=last_update,
      last_updater=last_updater,
      target_element=target_element,
      screenshot=screenshot_link,
      recording_link=recording_link,
      cycle=cycle,
      expected=expected,
      result=result)
  if cycle:
    test_cycle_user.AddTestCycleUser(author, cycle)

  # TODO(alexto): Do the deletion first in a separate queue, then
  # add the bug-URL mappings to avoid timeouts. For now, this works
  # since the timeout causes the task to re-execute.
  logging.debug('Deleting all existing bug mappings.')
  deleted = url_bug_map.DeleteAllMappingsForBug(bug)
  logging.debug('Mappings deleted: %d', deleted)
  for (url, position) in urls:
    deferred.defer(UpdateUrlBugMappings,
                   bug_key=bug.key().id(),
                   url=url,
                   position=position,
                   _queue='urls-map-queue')


def UpdateUrlBugMappings(bug_key, url, position):
  """Updates or creates a Bug-URL mapping."""
  url_bug_map.StoreUrlBugMapping(target_url=url,
                                 bug=bugs.GetBugByKey(bug_key),
                                 position=position)