/getAWSdocs.py

https://github.com/richarvey/getAWSdocs · Python

#!/usr/bin/env python3
from bs4 import BeautifulSoup
import os
import argparse
from urllib.parse import urlsplit
from urllib.request import urlopen
import json
def get_options():
    parser = argparse.ArgumentParser(description='AWS Documentation Downloader')
    parser.add_argument('-d', '--documentation', help='Download the Documentation', action='store_true', required=False)
    parser.add_argument('-w', '--whitepapers', help='Download White Papers', action='store_true', required=False)
    parser.add_argument('-f', '--force', help='Overwrite old files', action='store_true', required=False)
    args = vars(parser.parse_args())
    return args
# Build a list of the Amazon whitepaper PDFs
def list_whitepaper_pdfs(start_page):
    html_page = urlopen(start_page)
    # Parse the HTML page
    soup = BeautifulSoup(html_page, 'html.parser')
    pdfs = set()
    print("Generating PDF list (this may take some time)")
    for link in soup.find_all('a'):
        try:
            uri = link.get('href')
            print('URI: ', uri)
            # Only collect whitepaper links
            if "whitepapers" in start_page:
                if uri.endswith("pdf"):
                    if "whitepapers" in uri or "enterprise-marketing" in uri:
                        pdfs.add(uri)
        except AttributeError:
            # Skip anchors without an href attribute
            continue
    return pdfs
def find_pdfs_in_html(url):
    html_page_doc = urlopen(url)
    soup_doc = BeautifulSoup(html_page_doc, 'html.parser')
    # Collect every <a> tag on the parsed page that points at a PDF
    pdfs = set()
    for link in soup_doc.find_all('a'):
        try:
            sub_url = link.get('href')
            if sub_url.endswith("pdf"):
                pdfs.add(sub_url)
        except AttributeError:
            # Skip anchors without an href attribute
            continue
    return pdfs
def list_docs_pdfs(start_page):
    locale_path = "en_us/"
    base_url = "http://docs.aws.amazon.com"
    page = urlopen(start_page)
    soup = BeautifulSoup(page, "xml")
    pdfs = set()
    print("Generating PDF list (this may take some time)")
    for link in soup.find_all('service'):
        try:
            uri = link.get('href')
            print('URI: ', uri)
            # If the service URI is an .html page, parse it as HTML
            if '.html' in uri:
                url = base_url + uri
                pdfs = pdfs.union(find_pdfs_in_html(url))
                continue
            # If the service URI is relative, find and parse its XML landing page
            if not uri.startswith('http'):
                url = base_url + uri.split("?")[0] + locale_path + "landing-page.xml"
                # Fetch the XML sub page (this is where the links to the PDFs live)
                sub_page_doc = urlopen(url)
                soup_doc = BeautifulSoup(sub_page_doc, 'xml')
                # Get the "tile" tags from the parsed page
                for sublink in soup_doc.find_all('tile'):
                    try:
                        sub_url = sublink.get('href')
                        directory = base_url + "/".join(urlsplit(sub_url).path.split('/')[:-1])
                        guide_info_url = directory + "/meta-inf/guide-info.json"
                        print("Guide info url:", guide_info_url)
                        guide_info_doc = urlopen(guide_info_url).read()
                        guide_info = json.loads(guide_info_doc)
                        if "pdf" in guide_info:
                            pdf_url = directory + "/" + guide_info["pdf"]
                            pdfs.add(pdf_url)
                    except Exception:
                        # Skip tiles whose guide-info.json is missing or malformed
                        continue
        except Exception:
            # Skip services whose landing page cannot be fetched or parsed
            continue
    return pdfs
def save_pdf(full_dir, filename, uri, force):
    if not os.path.exists(full_dir):
        os.makedirs(full_dir)
    file_loc = full_dir + filename
    if not os.path.exists(file_loc) or force:
        # Protocol-relative links need a scheme before urlopen can fetch them
        if uri.startswith("//"):
            uri = "http:" + uri
        print("Downloading : " + uri)
        # Open the URL and retrieve the data
        web = urlopen(uri)
        print("Saving to : " + file_loc)
        # Save the data to disk
        with open(file_loc, 'wb') as output:
            output.write(web.read())
    else:
        print("Skipping " + uri + " - file exists, use './getAWSdocs.py --force' to overwrite")
def get_pdfs(pdf_list, force):
    for i in pdf_list:
        doc = i.split('/')
        doc_location = doc[3]
        filename = urlsplit(i).path.split('/')[-1]
        # Set the download dir for whitepapers
        if "whitepapers" in doc_location:
            full_dir = "whitepapers/"
        else:
            # Set the download dir and sub-directories for documentation
            full_dir = "documentation/"
            directory = urlsplit(i).path.split('/')[:-1]
            for path in directory:
                if path != "":
                    full_dir = full_dir + path + "/"
        try:
            save_pdf(full_dir, filename, i, force)
        except Exception:
            # Skip documents that fail to download
            continue
# Main
args = get_options()
# Allow the user to overwrite existing files
force = args['force']
pdf_list = set()
if args['documentation']:
    print("Downloading Docs")
    pdf_list = list_docs_pdfs("https://docs.aws.amazon.com/en_us/main-landing-page.xml")
    get_pdfs(pdf_list, force)
if args['whitepapers']:
    print("Downloading Whitepapers")
    pdf_list = list_whitepaper_pdfs("http://aws.amazon.com/whitepapers/")
    get_pdfs(pdf_list, force)
    print("Downloading SAP Whitepapers")
    pdf_list = list_whitepaper_pdfs("https://aws.amazon.com/sap/whitepapers/")
    get_pdfs(pdf_list, force)
for p in pdf_list:
    print(p)
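
Typical invocations, based on the flags defined in get_options above (downloads land under ./documentation/ and ./whitepapers/ relative to the working directory):

    ./getAWSdocs.py --documentation
    ./getAWSdocs.py --whitepapers --force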