PageRenderTime 43ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/lib/fathead/mdnjs/parse.py

http://github.com/duckduckgo/zeroclickinfo-fathead
Python | 586 lines | 567 code | 10 blank | 9 comment | 31 complexity | 0a9582e51a16f2b9280906973d2e3223 MD5 | raw file
Possible License(s): Apache-2.0
import codecs
import re
import sys
from collections import Counter

from lxml import html, etree
  5. class Standardizer(object):
  6. """ Standardize the titles of each entry.
  7. MDN uses a wiki for its documentation, so titles aren't consistent.
  8. For example, you might have:
  9. Array reverse
  10. Array.reverse
  11. Array.prototype.reverse
  12. """
  13. TITLE_FORMATTING = {
  14. 'class_property': '%s.%s',
  15. 'class_function': '%s.%s',
  16. 'instance_method': '%s.prototype.%s',
  17. 'instance_property': '%s.prototype.%s',
  18. }
  19. def __init__(self, specfile):
  20. """
  21. Args:
  22. specfile: A filesystem path to a csv file containing language
  23. definitions. It should have the format:
  24. BaseObject,property,{class_property,class_function,instance_method,instance_property}
  25. """
  26. self.inverted_index = {}
  27. self.objects = set()
  28. with codecs.open(specfile, 'r', 'utf-8') as f:
  29. for line in f:
  30. line = line.strip()
  31. index = line.split('(')[0]
  32. if index.count('.') > 1:
  33. index = index.split('prototype')[-1]
  34. index = index.split('.')[-1].lower().strip()
  35. if index not in self.inverted_index:
  36. self.inverted_index[index] = []
  37. self.inverted_index[index].append(line)
  38. obj = line.split('.')[0]
  39. self.objects.add(obj)
  40. def standardize(self, mdn):
  41. """ Standardize and clean the fields within an MDN object. """
  42. if 'Global' in mdn.obj:
  43. mdn.obj = 'Global'
  44. if mdn.obj not in self.objects and 'Global' in mdn.url:
  45. return None
  46. if mdn.prop.lower() not in self.inverted_index:
  47. return mdn
  48. for signature in self.inverted_index[mdn.prop.lower()]:
  49. if signature.startswith(mdn.obj):
  50. mdn.codesnippet = signature
  51. mdn.title = signature.split('(')[0].strip()
  52. break
  53. return mdn
  54. class FatWriter(object):
  55. """ File writer for DDG Fathead files. Field orders and output format
  56. comply with the documentation at https://github.com/duckduckgo/
  57. zeroclickinfo-fathead."""
  58. FIELDS = [
  59. 'title',
  60. 'type',
  61. 'redirect',
  62. 'otheruses',
  63. 'categories',
  64. 'references',
  65. 'see_also',
  66. 'further_reading',
  67. 'external_links',
  68. 'disambiguation',
  69. 'images',
  70. 'abstract',
  71. 'source_url'
  72. ]
  73. def __init__(self, outfile):
  74. self.outfile = outfile
  75. self.articles_index = []
  76. def writerow(self, outdict):
  77. """ Write the dict row. """
  78. row = []
  79. for field in FatWriter.FIELDS:
  80. col = outdict.get(field, '')
  81. col = col.replace('\t', ' ')
  82. col = col.replace('\n', '\\n')
  83. row.append(col)
  84. self.outfile.write('\t'.join(row) + '\n')
  85. class MDNWriter(FatWriter):
  86. """ An implementation of FatWriter that knows how to convert between MDN
  87. objects and the FatWriter spec. """
  88. def writemdn(self, mdn):
  89. code = ''
  90. abstract = ''
  91. if mdn.codesnippet:
  92. code = '<pre><code>%s</code></pre>' % mdn.codesnippet
  93. if mdn.summary:
  94. abstract += '<p>' + mdn.summary + '</p>'
  95. if mdn.exampledesc:
  96. abstract += '<p>' + mdn.exampledesc + '</p>'
  97. if mdn.example:
  98. temp = ''
  99. if type(mdn.example) == str:
  100. temp = '<pre><code>%s</code></pre>' % mdn.example
  101. if type(mdn.example) == dict:
  102. for key, value in mdn.example.items():
  103. temp += key + '<pre><code>%s</code></pre>' %value
  104. temp += '<br/>'
  105. code = temp
  106. fatheadTemplate = '<section class="prog__container">' + abstract + code + "</section>"
  107. d = {
  108. 'title': mdn.title,
  109. 'type': 'A',
  110. 'source_url': mdn.url,
  111. 'abstract': fatheadTemplate
  112. }
  113. self.writerow(d)
  114. self.articles_index.append(mdn.title.lower())
  115. class MDN(object):
  116. """ A container object for an MDN article.
  117. For example, given http://developer.mozilla.org/en-US/docs/
  118. JavaScript/Reference/Global_Objects/Array/pop, the object would have these
  119. properties:
  120. title Array.pop
  121. url http://developer.mozilla.org ...
  122. summary Removes the last element from an array and returns that
  123. element.
  124. codesnippet array.pop()
  125. obj Array
  126. prop pop
  127. Args:
  128. title: The article's title.
  129. url: The articles full URL.
  130. summary: A couple-sentence overview of the article.
  131. codesnippet: A couple lines of code showing the syntax. Multiple lines
  132. should be delimited with \n.
  133. obj: The calling object.
  134. prop: The calling object's property.
  135. articletype: Article categories (eg. Functions, Error)
  136. exampledesc: Example Description
  137. example: Example Code
  138. redirected: To control the addition of repeatitive redirections from the same object to output.txt
  139. """
  140. def __init__(self, title=None, url=None, summary=None, codesnippet=None,
  141. obj=None, prop=None, articletype=None):
  142. self.title = title
  143. self.url = url
  144. self.summary = summary
  145. self.codesnippet = codesnippet
  146. self.obj = obj
  147. self.prop = prop
  148. self.articletype = articletype
  149. self.exampledesc = None
  150. self.example = None
  151. self.redirected = False
  152. class MDNParser(object):
  153. """ A parser that takes an MDN wiki page and returns an MDN object. If
  154. pages change causing this Fathead to break, then the queries in this class
  155. should be checked. """
  156. def _is_obsolete(self, tree):
  157. obsolete = tree.xpath("//*[contains(@class, 'obsoleteHeader')]")
  158. return obsolete
  159. def parse(self, htmlfile):
  160. """ Parse an html file and return an mdn object.
  161. Args:
  162. htmlfile: A file-like object that should parse with lxml html parser.
  163. """
  164. page = htmlfile.read()
  165. tree = html.fromstring(page)
  166. if self._is_obsolete(tree):
  167. return None
  168. title = tree.xpath("//meta[@property='og:title']/@content")[0]
  169. article = tree.xpath("//article[contains(@id,'wikiArticle')]")
  170. summary = ""
  171. if article:
  172. summary_nodes = tree.xpath("//h2[contains(@id,'Summary')]/following-sibling::p[1]")
  173. for summary_el in summary_nodes :
  174. for tag in summary_el.xpath('//*[@class]'):
  175. tag.attrib.pop('class')
  176. summary += re.sub('<[^<]+?>', '', etree.tostring(summary_el).strip())
  177. # if there's no summary, getting description section
  178. if not summary:
  179. summary_el = tree.xpath("//meta[@property='og:description']/@content")
  180. if summary_el:
  181. summary = summary_el[0]
  182. # if there's no summary or description, getting see also section title
  183. if not summary:
  184. see_also_el = tree.xpath("//h3[contains(@id,'See_also')]")
  185. if see_also_el:
  186. elements = tree.xpath("//h3[contains(@id,'See_also')]/following-sibling::ul[1]")
  187. for element in elements:
  188. for tag in element.xpath('//*[@class]'):
  189. tag.attrib.pop('class')
  190. summary = re.findall('title="([^"]*)"', etree.tostring(element).strip())
  191. summary = summary[0].strip()
  192. codesnippet = ""
  193. syntax_header = tree.xpath("//h2[contains(@id,'Syntax')]")
  194. if syntax_header:
  195. elements = tree.xpath("//h2[contains(@id,'Syntax')]/following-sibling::pre[1]")
  196. for element in elements:
  197. for tag in element.xpath('//*[@class]'):
  198. tag.attrib.pop('class')
  199. codesnippet += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  200. articletype = ""
  201. exampledesc = ""
  202. example = ""
  203. # Web/API pages. In Web/API pages we take the first example. If the example contains both html content and JS
  204. # content, we take them both, if not we simply take the first example.
  205. url=tree.xpath("//meta[@property='og:url']/@content")[0]
  206. if "Web/API" in url:
  207. example_header = tree.xpath("//h2[contains(@id,'Example')][position()=1]")
  208. if example_header:
  209. html_example_header = tree.xpath("//h2[contains(@id,'Example')][position()=1]/following-sibling::h3[contains(@id,'HTML_Content')]")
  210. js_example_header = tree.xpath("//h2[contains(@id,'Example')][position()=1]/following-sibling::h3[contains(@id,'JavaScript_Content')]")
  211. if html_example_header and js_example_header:
  212. example = {}
  213. example['HTML Content']=''
  214. elements = tree.xpath("//h2[contains(@id,'Example')][position()=1]/following-sibling::h3[contains(@id,'HTML_Content')]/following-sibling::pre[1]")
  215. for element in elements:
  216. for tag in element.xpath('//*[@class]'):
  217. tag.attrib.pop('class')
  218. example['HTML Content'] += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  219. example['JavaScript Content']=''
  220. elements = tree.xpath("//h2[contains(@id,'Example')][position()=1]/following-sibling::h3[contains(@id,'JavaScript_Content')]/following-sibling::pre[1]")
  221. for element in elements:
  222. for tag in element.xpath('//*[@class]'):
  223. tag.attrib.pop('class')
  224. example['JavaScript Content'] += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  225. else:
  226. example=''
  227. example_header = tree.xpath("//h2[contains(@id,'Example')]")
  228. if example_header:
  229. elements = tree.xpath("//h2[contains(@id,'Example')]/following-sibling::pre[1]")
  230. for element in elements:
  231. for tag in element.xpath('//*[@class]'):
  232. tag.attrib.pop('class')
  233. example += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  234. elements = tree.xpath("//h2[contains(@id,'Example')]/following-sibling::p[1]")
  235. for element in elements:
  236. for tag in element.xpath('//*[@class]'):
  237. tag.attrib.pop('class')
  238. exampledesc += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  239. # Error pages
  240. if "Error" in htmlfile.name:
  241. articletype = "Error"
  242. # What went wrong?
  243. whatWentWrong_summary = ""
  244. whatWentWrong = tree.xpath("//h2[contains(@id,'What_went_wrong')]/following-sibling::p[1]")
  245. for element in whatWentWrong:
  246. for tag in element.xpath('//*[@class]'):
  247. tag.attrib.pop('class')
  248. whatWentWrong_summary += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  249. if whatWentWrong_summary:
  250. summary = whatWentWrong_summary
  251. # Examples
  252. exampleGood = ''.join(tree.xpath("//h3[contains(@id,'Valid_cases')]/following-sibling::pre/text()"))
  253. exampleBad = ''.join(tree.xpath("//h3[contains(@id,'Invalid_cases')]/following-sibling::pre/text()"))
  254. exampleGood = re.sub('<[^<]+?>', '', exampleGood)
  255. exampleBad = re.sub('<[^<]+?>', '', exampleBad)
  256. if exampleGood:
  257. exampleGood = "Valid Cases:\n" + exampleGood
  258. if exampleBad:
  259. exampleBad = "Invalid Cases:\n" + exampleGood
  260. if exampleBad or exampleGood:
  261. codesnippet = exampleBad + "\n" + exampleGood
  262. if any(wiki in htmlfile.name for wiki in ["Functions.", "Classes.", "Statements.", "Operators."]):
  263. articletype = htmlfile.name.split('.')[0].split('/')[1]
  264. desc_header = tree.xpath("//h2[contains(@id,'Description')]")
  265. if desc_header:
  266. elements = tree.xpath("//h2[contains(@id,'Description')]/following-sibling::p[1]")
  267. for element in elements:
  268. for tag in element.xpath('//*[@class]'):
  269. tag.attrib.pop('class')
  270. exampledesc += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  271. elements = tree.xpath("//h2[contains(@id,'Description')]/following-sibling::pre[1]")
  272. for element in elements:
  273. for tag in element.xpath('//*[@class]'):
  274. tag.attrib.pop('class')
  275. example += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  276. else:
  277. elements = tree.xpath("//h2[contains(@id,'Examples')]/following-sibling::p[1]")
  278. for element in elements:
  279. for tag in element.xpath('//*[@class]'):
  280. tag.attrib.pop('class')
  281. exampledesc += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  282. elements = tree.xpath("//h2[contains(@id,'Examples')]/following-sibling::pre[1]")
  283. for element in elements:
  284. for tag in element.xpath('//*[@class]'):
  285. tag.attrib.pop('class')
  286. example += re.sub('<[^<]+?>', '', etree.tostring(element).strip())
  287. print title + (' ' * 30) + '\r',
  288. mdn = MDN()
  289. mdn.title = title
  290. mdn.summary = summary
  291. mdn.codesnippet = codesnippet
  292. mdn.exampledesc = exampledesc
  293. mdn.example = example
  294. mdn.articletype = articletype
  295. return mdn
class MDNIndexer(object):
    """Collects parsed MDN articles keyed by property name, then emits
    disambiguation ('D') and redirect ('R') rows through the shared writer."""

    def __init__(self, writer):
        # Shared MDNWriter; its articles_index is consulted to avoid
        # emitting a redirect/disambiguation that would shadow an article.
        self._writer = writer
        # keyword -> number of articles sharing that property name.
        self.counter = Counter()
        # keyword -> list of MDN objects sharing that property name.
        self.inverted_index = {}
        # for Web/Api pages
        self.WEBAPI_CLASS_WORDS = ['Window', 'Navigator', 'MouseEvent',
            'KeyboardEvent', 'GlobalEventHandlers', 'Element',
            'Node', 'Event', 'Selection']
        # for Error pages
        self.ERROR_SYNONYMS = [ ["bad", "not legal", "invalid", "not a valid"] ]
        # for syntax, example redirections
        self.SYTAX_EXAMPLE_REDIR = ["Functions", "Classes", "Statements", "Operators"]
        # for special redirects to if, else, each and method
        self.CONDITIONALS_REDIR = ["if", "else", "each", "method"]
        # for special redirects to data type primitives
        self.DATA_TYPES = ["boolean", "null", "undefined", "number", "string", "symbol", "bool8x16",
            "bool16x8", "bool32x4", "bool64x2", "int8x16", "int16x8", "int32x4", "uint8x16",
            "uint16x8", "uint32x4", "float32x4", "float64x2"]

    def add(self, mdn):
        """Register an article under its lowercased property name."""
        keyword = mdn.prop.lower()
        self.counter[keyword] += 1
        if keyword not in self.inverted_index:
            self.inverted_index[keyword] = []
        self.inverted_index[keyword].append(mdn)

    def writestrippedredirect(self, mdn):
        #lowercase title and strip quotation marks ("), color(:) and period(.) between two words
        strippedRedirect = re.sub('(?<=[a-z])([.])(?!\d)', ' ', mdn.title.replace(',', ' ').replace(':',' ').replace('"', ' ').lower())
        strippedRedirect = re.sub( '\s+', ' ',strippedRedirect ).strip()
        # Only write the redirect if stripping actually changed the title.
        if (strippedRedirect != mdn.title.lower()):
            self._writer.writerow({
                'title': strippedRedirect,
                'type': 'R',
                'redirect': mdn.title
            })

    def writeredirect(self, title, mdn):
        """Write redirect rows for *title* pointing at mdn.title, plus the
        category-specific variants (error synonyms, data types, syntax/example
        suffixes, conditional keywords)."""
        # Normalize: underscores and '...' ellipses become spaces.
        title = title.replace('_', ' ')
        title = title.split('...')
        title = ' '.join(title)
        # write redirects with synonyms for error pages
        if mdn.articletype == "Error":
            for synonyms_list in self.ERROR_SYNONYMS:
                if any(word in title.lower() for word in synonyms_list):
                    # The synonym actually present in the title...
                    word = set(synonyms_list).intersection(title.lower().split()).pop()
                    # ...gets replaced by each synonym in turn.
                    for synonym in synonyms_list:
                        self._writer.writerow({
                            'title': title.replace(word, synonym),
                            'type': 'R',
                            'redirect': mdn.title
                        })
            # Error pages get only the synonym redirects.
            return;
        # write redirects for data-types
        split_title = title.split(' ')
        if split_title[0] == "global" and len(split_title) > 1:
            if any(split_title[1] == data_type for data_type in self.DATA_TYPES):
                self._writer.writerow({
                    'title': split_title[1] + " data type",
                    'type': 'R',
                    'redirect': mdn.title
                })
                self._writer.writerow({
                    'title': split_title[1] + " type",
                    'type': 'R',
                    'redirect': mdn.title
                })
        # write redirects with `syntax` and `example` for functions pages
        # (mdn.redirected guards against writing these more than once per article)
        if any(wiki == mdn.articletype for wiki in self.SYTAX_EXAMPLE_REDIR) and not mdn.redirected:
            mdn.redirected = True
            self._writer.writerow({
                'title': title + " syntax",
                'type': 'R',
                'redirect': mdn.title
            })
            self._writer.writerow({
                'title': title + " example",
                'type': 'R',
                'redirect': mdn.title
            })
            split_title = title.split(' ')
            # If the title starts with the category word (e.g. "functions foo"),
            # also redirect from the remainder without that prefix.
            if any(split_title[0].lower() == wiki.lower() for wiki in self.SYTAX_EXAMPLE_REDIR) and len(split_title) > 1:
                new_title = split_title[1:]
                new_title = ' '.join(new_title).lower()
                if new_title != mdn.title.lower():
                    self._writer.writerow({
                        'title': new_title,
                        'type': 'R',
                        'redirect': mdn.title
                    })
                    self._writer.writerow({
                        'title': new_title + " syntax",
                        'type': 'R',
                        'redirect': mdn.title
                    })
                    self._writer.writerow({
                        'title': new_title + " example",
                        'type': 'R',
                        'redirect': mdn.title
                    })
                    new_title = new_title.split(' ')
                    # Single-word redirects for special keywords (if/else/each/method).
                    for split_title in new_title:
                        if any(exceptions == split_title for exceptions in self.CONDITIONALS_REDIR):
                            self._writer.writerow({
                                'title': split_title,
                                'type': 'R',
                                'redirect': mdn.title
                            })
                            self._writer.writerow({
                                'title': split_title + " syntax",
                                'type': 'R',
                                'redirect': mdn.title
                            })
                            self._writer.writerow({
                                'title': split_title + " example",
                                'type': 'R',
                                'redirect': mdn.title
                            })
        # To avoid redirections like "default parameters" -> "Default Parameters"
        if title.lower() != mdn.title.lower():
            self._writer.writerow({
                'title': title,
                'type': 'R',
                'redirect': mdn.title
            })

    def writedisambiguation(self, keyword, disambig):
        """Write a 'D' row for *keyword* and record it as an article."""
        self._writer.writerow({
            'title': keyword,
            'type': 'D',
            'disambiguation': disambig
        })
        self._writer.articles_index.append(keyword.lower())

    def writerows(self):
        """Flush disambiguations and redirects for everything added so far."""
        for keyword, count in self.counter.most_common():
            if count > 1:
                # Several articles share this keyword: build a disambiguation
                # listing each title with the first sentence of its summary.
                disambig = ''
                for mdn in self.inverted_index[keyword]:
                    if disambig:
                        disambig += '\\n'
                    if '.' in mdn.summary:
                        summary = mdn.summary[:mdn.summary.find('.') + 1]
                    else:
                        summary = mdn.summary
                    disambig += '*[[%s]] %s' % (mdn.title, summary)
                # Write a disambiguation
                # skips D if already an article
                if keyword.lower() not in self._writer.articles_index:
                    self.writedisambiguation(keyword, disambig)
            for mdn in self.inverted_index[keyword]:
                # add redirect for Web/Api pages
                strip_title = ""
                if any(word in mdn.title for word in self.WEBAPI_CLASS_WORDS) and '.' in mdn.title:
                    # original title: Window.getAnimationFrame()
                    match = re.search('(?:.*\.)([^\(]+)(?:\(\))?', mdn.title)
                    # remove class_word: getAnimationFrame
                    strip_title = match.group(1)
                    # skips redirect if already an article
                    if strip_title.lower() not in self._writer.articles_index:
                        self.writeredirect(strip_title, mdn)
                # for all entries in the inverted index, write a redirect of
                # of the form <object><space><property>
                obj_prop_entry = mdn.obj.lower() + ' ' + mdn.prop.lower()
                if obj_prop_entry not in self._writer.articles_index:
                    self.writeredirect(obj_prop_entry, mdn)
                # If this is the only item in the inverted index,
                # write a primary redirect on the keyword.
                if count == 1:
                    # check if not an Article
                    if not all(x in [keyword, strip_title] for x in self._writer.articles_index):
                        if keyword.lower() not in self._writer.articles_index:
                            self.writeredirect(keyword, mdn)
  465. def run(cachedir, cachejournal, langdefs, outfname):
  466. """
  467. Args:
  468. cachedir: Directory used to cache downloaded HTML files.
  469. cachejournal: A csv of fname,url pairs for the cache dir.
  470. langdefs: A filepath to a language definition for JavaScript. See
  471. the Standardizer class for info on this spec.
  472. outname: The output filename.
  473. """
  474. standardizer = Standardizer(langdefs)
  475. parser = MDNParser()
  476. journal = [l.strip().split(',') for l in
  477. open(cachejournal).read().splitlines()]
  478. with codecs.open(outfname, 'w', 'utf-8') as outfile:
  479. writer = MDNWriter(outfile)
  480. indexer = MDNIndexer(writer)
  481. # Iterate over URLs in the sitemap ...
  482. for fname, url in journal:
  483. # ... and parse each to generate an mdn object.
  484. mdn = parser.parse(codecs.open(fname, 'r', 'utf-8'))
  485. if not mdn or not mdn.summary:
  486. continue
  487. # WARNING WARNING
  488. #
  489. # If MDN updates their URL structure, this will break. This
  490. # assumes that the URL ends with /obj/property
  491. #
  492. # An improvement would be to supply this as a regex pattern
  493. # to the CL
  494. #
  495. # WARNING WARNING
  496. _, obj, prop = url.rsplit('/', 2)
  497. mdn.url = url
  498. mdn.obj = obj
  499. mdn.prop = prop
  500. mdn = standardizer.standardize(mdn)
  501. if mdn is None:
  502. continue
  503. # Here we require that outputs have either a summary
  504. # or a code sample.
  505. if mdn.summary or mdn.codesnippet:
  506. writer.writemdn(mdn)
  507. indexer.add(mdn)
  508. # For the error articles, we write a redirect
  509. # with a stripped version of the article
  510. if mdn.articletype == "Error":
  511. indexer.writestrippedredirect(mdn)
  512. indexer.writerows()
if __name__ == '__main__':
    # Command-line entry point: wire the four paths into run().
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--out')            # output fathead file path
    parser.add_argument('--cachedir')       # directory of cached MDN HTML pages
    parser.add_argument('--cachejournal')   # csv journal of fname,url pairs
    parser.add_argument('--langdefs')       # JavaScript spec file for Standardizer
    args = parser.parse_args()
    run(args.cachedir, args.cachejournal, args.langdefs, args.out)