
/udacity_crawler.py

https://gitlab.com/fnaticshank/crawler
# coding=utf8
import urllib
import robotexclusionrulesparser as rerp
from bs4 import BeautifulSoup
from urlparse import urlparse, urljoin
import csv
from index_pdfs import index_pdfs
from add_videos import add_videos_to_index
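
# DaveDaveFind crawler: crawls the Udacity CS101 course pages, the CS101
# forums, and the DaveDaveFind app, builds a keyword -> URL index plus
# per-page title/text data, folds in PDF and video-subtitle entries,
# computes link-based ranks, and writes the results out as CSV files.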

def crawl_web(seed, max_pages, max_depth):  # returns index, graph of inlinks
    tocrawl = []
    for url in seed:
        if is_udacity(url):
            tocrawl.append([url, 0])
        else:
            print "[crawl_web()] This seed is not a Udacity site!"
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    pagedata = {}
    while tocrawl:
        page, depth = tocrawl.pop(0)
        print "[crawl_web()] Depth: ", depth
        print "[crawl_web()] Pages crawled: ", len(crawled)
        if page not in crawled and len(crawled) < max_pages and depth <= max_depth:
            soup, url = get_page(page)
            cache[url] = soup
            get_page_data(soup, url, pagedata)
            add_page_to_index(index, page, soup)
            outlinks = get_all_links(soup, url)
            graph[page] = outlinks
            add_new_links(tocrawl, outlinks, depth)
            #print tocrawl
            crawled.append(page)
            #print crawled
    index, pagedata = index_pdfs(index, pagedata)
    index = add_videos_to_index('subtitle_index.csv', '/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/video_info.csv', index)
    index = undupe_index(index)
    return index, graph, pagedata

def get_page_data(page, url, pagedata):
    # Store the page title and visible body text, keyed by URL.
    try:
        title = page.title.string
    except:
        title = url
    try:
        page.body.style.decompose()
        page.body.script.decompose()
        text = page.body.get_text()
    except:
        text = ''
    pagedata[url] = [title, text]

def get_all_links(page, url):
    links = []
    page_url = urlparse(url)
    if page_url[0]:
        base = page_url[0] + '://' + page_url[1]
        robots_url = urljoin(base, '/robots.txt')
    else:
        # Fall back to the forums host so relative links can still be resolved.
        base = "http://www.udacity-forums.com"
        robots_url = "http://www.udacity-forums.com/robots.txt"
    rp = rerp.RobotFileParserLookalike()
    rp.set_url(robots_url)
    try:
        rp.read()
    except:
        pass
    #print rp
    for link in page.find_all('a'):
        link_url = link.get('href')
        print "[get_all_links()] Found a link: ", link_url
        #Ignore links that are 'None'.
        if link_url is None:
            pass
        elif not rp.can_fetch('*', link_url):
            print "[get_all_links()] Page off limits!"
        #Ignore links that are internal page anchors.
        #Urlparse considers internal anchors 'fragment identifiers', at index 5.
        elif urlparse(link_url)[5] and not urlparse(link_url)[2]:
            pass
        elif urlparse(link_url)[1]:
            links.append(link_url)
        else:
            newlink = urljoin(base, link_url)
            links.append(newlink)
    return links

def add_new_links(tocrawl, outlinks, depth):
    # Queue Udacity links that are not already waiting to be crawled.
    # (tocrawl holds [url, depth] pairs, so compare against the url field.)
    queued = [entry[0] for entry in tocrawl]
    for link in outlinks:
        if link:
            if link not in queued:
                if is_udacity(link):
                    link = str(link)
                    tocrawl.append([link, depth + 1])
                    queued.append(link)

def add_page_to_index(index, url, content):
    try:
        text = content.body.get_text()
    except:
        return
    words = text.split()
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    stopwords = ['']
    with open('/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/stopwords.csv', 'rb') as f:
        wordlist = csv.reader(f)
        for stopword in wordlist:
            stopwords.append(stopword[0])
    for word in words:
        word = word.lstrip(punctuation)
        word = word.rstrip(punctuation)
        word = word.lower()
        if word not in stopwords:
            add_to_index(index, word, url)

def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None

def get_page(url):
    page_url = urlparse(url)
    base = page_url[0] + '://' + page_url[1]
    robots_url = base + '/robots.txt'
    rp = rerp.RobotFileParserLookalike()
    rp.set_url(robots_url)
    try:
        rp.read()
    except:
        pass
    if not rp.can_fetch('*', url):
        print "[get_page()] Page off limits!"
        return BeautifulSoup(""), ""
    if url in cache:
        return cache[url], url
    else:
        print "[get_page()] Page not in cache: " + url
        try:
            content = urllib.urlopen(url).read()
            return BeautifulSoup(content), url
        except:
            return BeautifulSoup(""), ""

def compute_ranks(graph):
    d = 0.8  # damping factor
    numloops = 10
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
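
# compute_ranks() is a simplified PageRank iteration: every page starts at
# 1/N, then for numloops rounds each page p is re-scored as
#     rank(p) = (1 - d)/N + d * sum(rank(q) / outdegree(q))
# summed over every page q in the graph that links to p, with d = 0.8.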

def is_udacity(url):
    udacity_urls = ['www.udacity.com', 'www.udacity-forums.com', 'davedavefind.appspot.com']
    parsed_url = urlparse(url)
    if parsed_url[1] in udacity_urls:
        return True
    elif url == 'http://davedavefind.appspot.com/':
        return True
    else:
        return False

def write_search_terms(filename, index):
    f = open(filename, 'wt')
    try:
        writer = csv.writer(f)
        writer.writerow(['term', 'urls'])
        for term in index:
            if len(term) > 500:
                pass
            else:
                ascii_term = term.encode('ascii', 'ignore')
                url_list = index[term]
                urlstring = ",".join(url_list)
                writer.writerow([ascii_term, urlstring])
    finally:
        f.close()
    print "[write_search_terms()] Finished writing SearchTerm CSV file."

def write_url_info(filename, index, ranks, pagedata):
    f = open(filename, 'wt')
    try:
        writer = csv.writer(f)
        writer.writerow(['url', 'title', 'text', 'dave_rank', 'doc'])
        all_urls = set()
        for term in index:
            url_list = index[term]
            for url in url_list:
                if url.find('youtube') == -1:
                    all_urls.add(url)
        for url in all_urls:
            ascii_url = url.encode('ascii', 'ignore')
            if url in ranks:
                dave_rank = ranks[url]
                doc = False
            else:
                dave_rank = 0.01
                doc = True
            title = pagedata[url][0]
            text = pagedata[url][1]
            ascii_title = title.encode('ascii', 'ignore')
            ascii_text = text.encode('ascii', 'ignore')
            writer.writerow([ascii_url, ascii_title, ascii_text, dave_rank, doc])
    finally:
        f.close()
    print "[write_url_info()] Finished writing PageUrl CSV file."

def undupe_csv(filename, newfilename):
    oldfile = csv.reader(open(filename, 'rb'))
    newfile = open(newfilename, 'wb')
    try:
        writer = csv.writer(newfile)
        unique_rows = []
        for row in oldfile:
            if row not in unique_rows:
                unique_rows.append(row)
        writer.writerows(unique_rows)
    finally:
        newfile.close()
    print "[undupe_csv()] Index un-duped."

def undupe_index(index):
    for key in index.keys():
        index[key] = list(set(index[key]))
    print "[undupe_index()] Index un-duped."
    return index

cache = {}
max_pages = 1000
max_depth = 10
crawl_list = ['http://www.udacity.com/overview/Course/cs101/', 'http://www.udacity-forums.com/cs101/', 'http://davedavefind.appspot.com/']

def start_crawl():
    index, graph, pagedata = crawl_web(crawl_list, max_pages, max_depth)
    ranks = compute_ranks(graph)
    write_search_terms('/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/search_terms.csv', index)
    write_url_info('/Users/connormendenhall/Python/DaveDaveFind/DaveDaveFind/data/url_info.csv', index, ranks, pagedata)

if __name__ == "__main__":
    start_crawl()
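
# Example: run a crawl from the command line (assumes a Python 2 environment
# with bs4 and robotexclusionrulesparser installed, plus the index_pdfs and
# add_videos helper modules and the hard-coded data paths above):
#
#     python udacity_crawler.py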