PageRenderTime 52ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/app/js/parsertest.py

https://github.com/antimatter15/js-wikireader
Python | 41 lines | 40 code | 1 blank | 0 comment | 0 complexity | 1ee0749b19d89d8c05e206b39c7f26b2 MD5 | raw file
  1. import sys, re
  2. page = sys.stdin.read()
  3. page = re.sub('<ref[^\0]*?\/(ref)?\>', '', page)
  4. page = re.sub('<gallery>[^\0]*?<\/gallery>', '', page)
  5. page = re.sub('<!--.*?-->', '', page)
  6. page = re.sub('<!--[^\0]*?-->', '', page)
  7. page = re.sub(re.compile('\[\[\s*(File|Image)\s*:[^\]\[]*\[\[[^\[\]]*\]\]', re.IGNORECASE), '[[File:', page)
  8. page = re.sub(re.compile('\[\[\s*(File|Image)\s*:[^\]\[]*\[\[[^\[\]]*\]\]', re.IGNORECASE), '[[File:', page)
  9. page = re.sub(re.compile('\[\[\s*(File|Image)\s*:[^\]\[]*\[\[[^\[\]]*\]\]', re.IGNORECASE), '[[File:', page)
  10. page = re.sub(re.compile('\[\[\s*(File|Image)\s*:[^\]\[]*\[\[[^\[\]]*\]\]', re.IGNORECASE), '[[File:', page)
  11. page = re.sub(re.compile('\[\[\s*(File|Image)\s*:[^\]\[]*\[\[[^\[\]]*\]\]', re.IGNORECASE), '[[File:', page)
  12. page = re.sub(re.compile('\[\[\s*(File|Image)\s*:[^\]\[]*\[\[[^\[\]]*\]\]', re.IGNORECASE), '[[File:', page)
  13. page = re.sub('\[\[\s*(File|Image)\s*:[^\]]*\]\]', '', page)
  14. page = re.sub('\{\{As of\|(\d*)(\|.*?)?\}\}', 'As of \g<1>', page)
  15. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  16. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  17. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  18. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  19. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  20. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  21. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  22. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  23. page = re.sub('\{\{[^\}\{]*\{\{[^\{\}]*\}\}', '{{', page)
  24. page = re.sub('\s*\{\{[^\}\{]*\}\}', '', page)
  25. page = re.sub('\[\[\s*[a-z\-]+\s*:[^\n\|]*?\]\]', '', page)
  26. page = re.sub(re.compile('\[\[Category\s*:\s*.*?\]\]', re.IGNORECASE), '', page)
  27. page = re.sub('\n(\s?\n)+', '\n\n', page)
  28. page = re.sub('[\n\s]+(={2,6})\s*(.*?)\s*(={2,6})[\n\s]+', '\n\g<1>\g<2>\g<3>\n', page)
  29. page = re.sub(re.compile('={2,6}\s*(References|Sources|Other Websites|External Links|Notes|Footnotes|Further Reading)\s*={2,6}[^\0]+', re.IGNORECASE), '', page)
  30. page = page.strip()
  31. print page