/getAWSdocs.py

https://github.com/richarvey/getAWSdocs · Python

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import os
import argparse
from urllib.parse import urlsplit
from urllib.request import urlopen
import json
def get_options():
    parser = argparse.ArgumentParser(description='AWS Documentation Downloader')
    parser.add_argument('-d', '--documentation', help='Download the Documentation', action='store_true', required=False)
    parser.add_argument('-w', '--whitepapers', help='Download White Papers', action='store_true', required=False)
    parser.add_argument('-f', '--force', help='Overwrite old files', action='store_true', required=False)
    args = vars(parser.parse_args())
    return args
# Build a list of the Amazon whitepaper PDFs
def list_whitepaper_pdfs(start_page):
    html_page = urlopen(start_page)
    # Parse the HTML page
    soup = BeautifulSoup(html_page, 'html.parser')
    pdfs = set()
    print("Generating PDF list (this may take some time)")
    for link in soup.find_all('a'):
        try:
            uri = link.get('href')
            print('URI: ', uri)
            # Only collect whitepaper links
            if "whitepapers" in start_page:
                if uri.endswith("pdf"):
                    if "whitepapers" in uri or "enterprise-marketing" in uri:
                        pdfs.add(uri)
        except AttributeError:
            # Skip anchors without an href attribute
            continue
    return pdfs
def find_pdfs_in_html(url):
    html_page_doc = urlopen(url)
    soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
    # Collect every <a> tag on the parsed page that points at a PDF
    pdfs = set()
    for link in soup_doc.find_all('a'):
        try:
            sub_url = link.get('href')
            if sub_url.endswith("pdf"):
                pdfs.add(sub_url)
        except AttributeError:
            # Skip anchors without an href attribute
            continue
    return pdfs
def list_docs_pdfs(start_page):
    locale_path = "en_us/"
    base_url = "http://docs.aws.amazon.com"
    page = urlopen(start_page)
    soup = BeautifulSoup(page, "xml")
    pdfs = set()
    print("Generating PDF list (this may take some time)")
    for link in soup.find_all('service'):
        try:
            uri = link.get('href')
            print('URI: ', uri)
            # If the service URI is an .html page, parse it as HTML
            if '.html' in uri:
                url = base_url + uri
                pdfs = pdfs.union(find_pdfs_in_html(url))
                continue
            # If the service URI is relative, find and parse its XML landing page
            if not uri.startswith('http'):
                url = base_url + uri.split("?")[0] + locale_path + "landing-page.xml"
                # Fetch the XML sub page (this is where the links to the PDFs live)
                sub_page_doc = urlopen(url)
                soup_doc = BeautifulSoup(sub_page_doc, 'xml')
                # Get the "tile" tags from the parsed page
                for sublink in soup_doc.find_all('tile'):
                    try:
                        sub_url = sublink.get('href')
                        directory = base_url + "/".join(urlsplit(sub_url).path.split('/')[:-1])
                        guide_info_url = directory + "/meta-inf/guide-info.json"
                        print("Guide info url:", guide_info_url)
                        guide_info_doc = urlopen(guide_info_url).read()
                        guide_info = json.loads(guide_info_doc)
                        if "pdf" in guide_info:
                            pdf_url = directory + "/" + guide_info["pdf"]
                            pdfs.add(pdf_url)
                    except Exception:
                        # Skip tiles whose guide-info.json is missing or malformed
                        continue
        except Exception:
            # Skip services whose landing page cannot be fetched or parsed
            continue
    return pdfs
def save_pdf(full_dir, filename, uri, force):
    if not os.path.exists(full_dir):
        os.makedirs(full_dir)
    file_loc = full_dir + filename
    if not os.path.exists(file_loc) or force:
        # Protocol-relative links need a scheme before urlopen can fetch them
        if uri.startswith("//"):
            uri = "http:" + uri
        print("Downloading : " + uri)
        # Open the URL and retrieve the data
        web = urlopen(uri)
        print("Saving to : " + file_loc)
        # Save the data to disk
        with open(file_loc, 'wb') as output:
            output.write(web.read())
    else:
        print("Skipping " + uri + " - file exists, use './getAWSdocs.py --force' to overwrite")
def get_pdfs(pdf_list, force):
    for i in pdf_list:
        doc = i.split('/')
        doc_location = doc[3]
        filename = urlsplit(i).path.split('/')[-1]
        # Set the download dir for whitepapers
        if "whitepapers" in doc_location:
            full_dir = "whitepapers/"
        else:
            # Set the download dir and sub-directories for documentation
            full_dir = "documentation/"
            directory = urlsplit(i).path.split('/')[:-1]
            for path in directory:
                if path != "":
                    full_dir = full_dir + path + "/"
        try:
            save_pdf(full_dir, filename, i, force)
        except Exception:
            # Skip documents that fail to download
            continue
# Main
args = get_options()
# Allow the user to overwrite existing files
force = args['force']
pdf_list = set()
if args['documentation']:
    print("Downloading Docs")
    pdf_list = list_docs_pdfs("https://docs.aws.amazon.com/en_us/main-landing-page.xml")
    get_pdfs(pdf_list, force)
if args['whitepapers']:
    print("Downloading Whitepapers")
    pdf_list = list_whitepaper_pdfs("http://aws.amazon.com/whitepapers/")
    get_pdfs(pdf_list, force)
    print("Downloading SAP Whitepapers")
    pdf_list = list_whitepaper_pdfs("https://aws.amazon.com/sap/whitepapers/")
    get_pdfs(pdf_list, force)
for p in pdf_list:
    print(p)
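
Typical invocations, based on the flags defined in get_options above (downloads land under ./documentation/ and ./whitepapers/ relative to the working directory):

    ./getAWSdocs.py --documentation
    ./getAWSdocs.py --whitepapers --force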