/ny_times_pre_1981_scraper.py

https://github.com/slifty/rdiscraper · Python

# For processing HTML
from BeautifulSoup import BeautifulSoup
# Fetch the Times's Advanced Search results. urllib encodes the query arguments
# and urllib2 fetches the page.
# The Times has several searches, including a Beta, an Archive from 1851-1980,
# and an archive since 1981. This script targets the pre-1981 archive search.
import urllib
import urllib2
import sys

def fetch_page(date, page):
    # Fetch one page of Advanced Search results.
    # (date is currently unused; the date range comes from the globals below.)
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    # Query arguments unique to this search
    query_args = {
        'query': 'redskins',
        'mon1': start_month,
        'day1': start_day,
        'year1': start_year,
        'mon2': end_month,
        'day2': end_day,
        'year2': end_year,
        'srcht': 's',
        'srchst': 'p',
        'daterange': 'period',
        # Offset into the result set, so successive calls return successive pages
        'frow': page * int(results_per_page),
        'n': results_per_page
    }
    headers = {'User-Agent': user_agent}
    # Turn the dictionary into query arguments for the URL
    url_args = urllib.urlencode(query_args)
    # Base URL unique to this search
    base_url = 'http://query.nytimes.com/search/query'
    full_url = base_url + '?' + url_args
    # Send the User-Agent header along with the request
    request = urllib2.Request(full_url, headers=headers)
    page_handle = urllib2.urlopen(request)
    print(full_url)
    # Get access to the HTML we're looking to parse
    html_data = page_handle.read()
    # print html_data
    # Have BeautifulSoup parse the HTML into a tree of objects we can use
    soup = BeautifulSoup(html_data)
    return soup

def get_links(soup):
    # Find all the story URLs we care about.
    # Get the unique section of the document that holds the search results
    result_ols = soup.findAll('ol', attrs={'class': 'srchSearchResult'})
    # Set up storage for our result URLs
    urls = []
    # We only expect to have one section here but will loop just in case
    for result_ol in result_ols:
        # Get the individual search result items and iterate through them
        result_lis = result_ol.findAll('li')
        for result_li in result_lis:
            # Find the first link in each item
            result_as = result_li.findAll('a', limit=1)
            for result_a in result_as:
                # Store the href attribute (i.e. the actual URL) in our list of result URLs
                urls.append(result_a['href'])
    return urls

current_page_number = 0
start_month = '01'
start_day = '01'
start_year = '1979'
end_month = '01'
end_day = '30'
end_year = '1979'
results_per_page = '50'
# Placeholder; still needs a real definition
current_date = 0
# need to loop this?
# results_loop = '0' + results_per_page

all_urls = []
more_pages = True
while more_pages:
    # Keep fetching pages until one comes back with no results.
    # (An alternative stop condition: check whether the page still shows a "Next »" link.)
    page = fetch_page(current_date, current_page_number)
    page_links = get_links(page)
    if len(page_links) == 0:
        more_pages = False
    else:
        all_urls = all_urls + page_links
        current_page_number = current_page_number + 1
print(all_urls)
## NEXT STEPS
# plug it into mediacloud via an API wrapper http://webpy.org/
# these results include non-NYT content, such as AP
# update paging to go through multiple pages of search results
# click through to preview and grab first paragraph
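
# ---------------------------------------------------------------------------
# Rough sketches (not part of the original script) for two of the next-steps
# notes above: filtering out non-NYT results and grabbing the first paragraph
# of a story preview. They reuse the same urllib2 / BeautifulSoup style as the
# rest of the file. The assumptions that wire content (AP etc.) lives on
# non-nytimes.com hosts and that a preview keeps its body in plain <p> tags
# are guesses and would need checking against the real result pages.
# ---------------------------------------------------------------------------
from urlparse import urlparse

def is_nytimes_url(url):
    # Keep only results hosted on nytimes.com (drops some wire-service content)
    return urlparse(url).netloc.endswith('nytimes.com')

def fetch_first_paragraph(url):
    # Download the story preview page, sending the same User-Agent as fetch_page
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, headers=headers)
    html_data = urllib2.urlopen(request).read()
    soup = BeautifulSoup(html_data)
    # Return the text of the first <p> that actually contains something
    for p in soup.findAll('p'):
        text = ''.join(p.findAll(text=True)).strip()
        if text:
            return text
    return None

# Example of how the sketches might be used on the collected URLs:
# nyt_urls = [url for url in all_urls if is_nytimes_url(url)]
# first_paragraphs = [fetch_first_paragraph(url) for url in nyt_urls]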