
/lib/fathead/apple_discussions/parse.py

http://github.com/duckduckgo/zeroclickinfo-fathead
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Parser for Apple Discussion articles.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# For further information see https://duck.co/ia/view/apple_discussions

__MODULE__ = "apple_discussions"
__AUTHOR__ = "DuckDuckGo [https://duckduckgo.com]"
__SOURCE__ = "https://discussions.apple.com/"
__OUTPUT__ = "output.txt"
__DEBUG__ = False

import argparse
import glob
import os
import re
from multiprocessing import cpu_count, Pool

from bs4 import BeautifulSoup

CPU_COUNT = cpu_count()

OUTPUT = """\
{title}\t
{entry_type}\t
{redirect_title}\t
{empty_field}\t
{categories}\t
{empty_field}\t
{related_topics}\t
{empty_field}\t
{external_links}\t
{disambiguation}\t
{image}\t
{abstract}\t
{url}
""".replace("\n", "").replace("\\n", "")

FHTEMPLATE = """\
<p><b>Answered by {username} ({date})</b></p>
{information}
""".replace("\n", "").replace("\\n", "")

def parse_file(filename):
    """
    The pipeline to process a single downloaded file
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            contents = f.read()
        parsed_document = parse_html(contents, filename)
        if parsed_document:
            output = format_output(parsed_document)
            write_to_output(output)
        # If we can't parse and __DEBUG__ is switched on, we log to "errors.txt"
        elif __DEBUG__:
            write_to_output(filename, output="errors.txt")
    except Exception:
        pass  # parser can't parse this file

def parse_html(doc, filename):
    """
    Parses an HTML doc for a title, an answer and the source URL.
    Returns a dict on success, or False if the thread should be skipped.
    """
    soup = BeautifulSoup(doc, "html.parser")
    parsed_doc = {}
    # Builds the original URL from the filename
    pop_url_extension = filename.split(".")[-2]
    url_id = pop_url_extension.split("/")[-1]
    parsed_doc["url"] = "https://discussions.apple.com/thread/" + url_id
    # Gets the title
    title = soup.h1.text
    temp_title = title.strip().replace("Q: ", "")
    parsed_doc["title"] = temp_title[0].capitalize() + temp_title[1:]  # Capitalizes the first word
    print("Parsing", parsed_doc["title"])
    # Adds the category to the title (if we can)
    try:
        topic = soup.find("a", {"class": "jive-breadcrumb-last"}).text
        topic = topic.replace("Using ", "")
        parsed_doc["title"] += " ({topic})".format(topic=topic)
    except AttributeError:  # no breadcrumb on this page
        pass
    # Gets the most 'Recommended Answer'
    recommended = soup.find("div", {"class": "recommended-answers"})
    if recommended:
        for p in soup.findAll('p'):
            if p.string:
                p.string.replace_with(p.string.strip())
        # Ditch the span tags
        for span in soup.findAll('span'):
            span.decompose()
        username = recommended.find("a", {"class": "username"}).text
        username = username.strip()
        posted = recommended.find("p", {"class": "meta-posted"}).text
        posted = posted.strip().replace("Posted on ", "")
        posted = " ".join(posted.split(" ")[:3])  # keep just "Mon DD, YYYY"
        content = recommended.find("section").find("div", {"class": "jive-rendered-content"})
        for tag in content.findAll(True):
            tag.attrs = {}  # strip all attributes
        # Does some regex replacements that the parser just won't handle gracefully
        contents = FHTEMPLATE.format(information=str(content), username=username, date=posted)
        contents = re.sub(r"<p></p>", "", contents)
        contents = re.sub(r"<div ((.|\n)+?)>", "", contents)
        contents = re.sub(r"</div>", "", contents)
        contents = re.sub(r"<a>", "", contents)
        contents = re.sub(r"</a>", "", contents)
        # str.replace, not re.sub: the output format needs a literal "\n",
        # and re.sub would turn the "\\n" replacement back into a newline
        contents = contents.replace("\n", "\\n")
        parsed_doc["body"] = contents
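        # e.g. a (hypothetical) '<div class="jive-rendered-content"><p>Reboot.</p></div>'
        # is reduced to '<p>Reboot.</p>' by the substitutions above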
        # Some last-minute validation
        if "CodeBlockStart" in parsed_doc["body"]:  # contains code
            return False
        if "<code" in parsed_doc["body"]:  # contains code
            return False
        if "<pre" in parsed_doc["body"]:
            return False
        if "blockquote" in parsed_doc["body"]:
            return False
        if len(parsed_doc["body"].split(" ")) > 500:  # too long for an abstract
            return False
        if len(parsed_doc["body"].split(" ")) < 30:  # too short to be useful
            return False
        if parsed_doc["url"] == "" or parsed_doc["url"] is None:
            return False
    else:
        return False
    return parsed_doc

def format_output(doc):
    """
    Takes the dict doc and formats it for the final output
    """
    document = OUTPUT.format(
        title=doc["title"],
        entry_type="A",
        redirect_title="",
        categories="",
        empty_field="",
        related_topics="",
        external_links="",
        disambiguation="",
        image="",
        abstract=doc["body"],
        url=doc["url"],
    )
    return document
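# A successful parse becomes one line shaped roughly like this (values
# hypothetical, the run of empty fields elided):
#   Frozen screen (iPhone)\tA\t...\t<p><b>Answered by some_user (Apr 20, 2016)</b></p>...\thttps://discussions.apple.com/thread/7912345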

def write_to_output(article, output=__OUTPUT__):
    """
    Appends the parsed article to the output file (`output.txt` by default).
    """
    with open(output, 'a', encoding="utf-8") as f:
        f.write(article + "\n")
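# Note: all worker processes append to the same file. Short appends rarely
# interleave on local filesystems, but that isn't guaranteed; funnelling
# results through a single writer process (or guarding the file with a
# multiprocessing.Lock) would be the more robust design.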

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("data_folder",
                           help="The folder containing the data. Maybe run parse.sh instead?")
    argparser.add_argument("-p", "--processes", type=int, default=CPU_COUNT,
                           help="Number of parallel processes to parse the corpus")
    args = argparser.parse_args()
    folder = args.data_folder
    files_to_parse = glob.glob(os.path.join(folder, "*.txt"))
    pool = Pool(args.processes)
    pool.map(parse_file, files_to_parse)
    pool.close()
    pool.join()
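
# Usage sketch (the folder name is hypothetical):
#
#   python3 parse.py download --processes 4
#
# Each successfully parsed thread is appended to output.txt as one line.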