PageRenderTime 45ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/CrossMgrVideo/HelpIndex.py

https://github.com/esitarski/CrossMgr
Python | 98 lines | 87 code | 9 blank | 2 comment | 5 complexity | e72f27b389a3e2fdc11c6186f91e7930 MD5 | raw file
  1. from whoosh.index import create_in, open_dir
  2. from whoosh.analysis import StemmingAnalyzer
  3. from whoosh.fields import *
  4. import os
  5. import shutil
  6. import glob
  7. import re
  8. from bs4 import BeautifulSoup
  9. htmlDocDir = 'CrossMgrHtmlDoc'
  10. indexDir = 'CrossMgrHelpIndex'
  11. def BuildHelpIndex():
  12. if os.path.exists( indexDir ):
  13. shutil.rmtree( indexDir, ignore_errors = True )
  14. os.mkdir( indexDir )
  15. stemmingAnalyzer = StemmingAnalyzer()
  16. schema = Schema( path=ID(stored=True, unique=True), section=TEXT(stored=True), title=TEXT(stored=True, analyzer=stemmingAnalyzer),
  17. level=NUMERIC(stored=True), content=TEXT(stored=True, analyzer=stemmingAnalyzer) )
  18. ix = create_in( indexDir, schema )
  19. writer = ix.writer()
  20. titleTags = set(['h1', 'h2', 'h3', 'h4', 'h5'])
  21. newLines = re.compile( '\n+' )
  22. nonNumeric = re.compile( r'[^\d]' )
  23. def addDocument( fname, section, lastTitle, textCur ):
  24. # print 'addDocument: lastTitle={}'.format(lastTitle)
  25. if lastTitle and textCur:
  26. section = '|'.join( section ) if section else lastTitle.get_text()
  27. # print 'Indexing: {}: {}'.format(os.path.basename(fname), section)
  28. content = newLines.sub( '\n', '\n'.join(textCur) )
  29. writer.add_document( path = os.path.basename(fname) + '#' + lastTitle['id'],
  30. title = lastTitle.get_text(),
  31. section = section,
  32. level = int(nonNumeric.sub('', lastTitle.name)),
  33. content = content )
  34. # Extract content sections from the html pages.
  35. for f in glob.iglob( os.path.join(htmlDocDir, '*.html') ):
  36. doc = BeautifulSoup( open(f).read(), 'html.parser' )
  37. div = doc.find('div', class_='content')
  38. if not div:
  39. continue
  40. lastTitle = None
  41. textCur = []
  42. section = []
  43. for child in div.contents:
  44. try:
  45. tag = child.name
  46. except Exception as e:
  47. tag = None
  48. if tag not in titleTags:
  49. try:
  50. textCur.append( child.get_text() )
  51. except Exception as e:
  52. pass
  53. continue
  54. addDocument( f, section, lastTitle, textCur )
  55. iSection = int(int(nonNumeric.sub('', tag))) - 1
  56. section = section[:iSection]
  57. section.append( child.get_text() )
  58. lastTitle = child
  59. textCur = []
  60. addDocument( f, section, lastTitle, textCur )
  61. writer.commit()
  62. #---------------------------------------------------------------------------------------------
  63. if __name__ == '__main__':
  64. BuildHelpIndex()
  65. from whoosh.qparser import QueryParser
  66. ix = open_dir( indexDir, readonly=True )
  67. with ix.searcher() as searcher, open('search.html', 'w') as f:
  68. query = QueryParser('content', ix.schema).parse('fastest lap')
  69. results = searcher.search(query, limit=20)
  70. f.write( '<table><tr><th></th><th align="left">Section</th><th align="left">Match</th></tr>\n' )
  71. for i, hit in enumerate(results):
  72. f.write( '<tr><td align="left">%d.</td><td><a href="%s">%s</a></td><td>%s</td></tr>\n' % ((i+1), hit['path'], hit['section'], hit.highlights('content')) )
  73. f.write( '</table>\n' )
  74. ix.close()