PageRenderTime 47ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/hkforums.search.py

https://github.com/JMSCHKU/Social
Python | 183 lines | 178 code | 3 blank | 2 comment | 0 complexity | 9c993aebc510e874f83539fa926dcc02 MD5 | raw file
  1. #!/usr/bin/env python
  2. import sys, os
  3. import time, datetime
  4. import csv
  5. import pg
  6. import re
  7. import lucene
  8. import mypass, sinaweibooauth
  9. class SearchForums(object):
  10. """Usage: hkforums.search.py [-ds|-de DATE] terms <forum name>"""
  11. pgconn = None
  12. STORE_BASE_DIR = "/var/data/lucene/"
  13. STORE_DIR = ""
  14. supported_forums = ["uwants", "discuss", "hkreporter"]
  15. analysers = list()
  16. searcher = None
  17. MAX_ITEMS = 1000
  18. forum = ""
  19. def __init__(self, forumname):
  20. if not forumname in self.supported_forums:
  21. sys.exit()
  22. else:
  23. self.forum = forumname
  24. self.STORE_DIR = self.STORE_BASE_DIR + forumname
  25. smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
  26. self.analyzers = { "smartcn": smartcn }
  27. directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
  28. self.searcher = lucene.IndexSearcher(directory, True)
  29. self.pgconn = mypass.getConn()
  30. def prepareDates(self, datestring):
  31. if datestring is None:
  32. return None
  33. try:
  34. mydate = time.strptime(datestring, "%Y-%m-%d")
  35. except ValueError:
  36. try:
  37. mydate = time.strptime(datestring, "%Y-%m-%d %H:%M")
  38. except ValueError, TypeError:
  39. return None
  40. return int(time.mktime(mydate))
  41. def searchForums(self, q, time_start_secs, time_end_secs, uids=list(), offset=None, floor=None):
  42. if offset <> None:
  43. try:
  44. offset = int(offset)
  45. if offset > self.MAX_ITEMS:
  46. self.MAX_ITEMS = offset + 100
  47. except:
  48. pass
  49. page_start = page_end = None
  50. if floor <> None and len(floor) > 0:
  51. m = re.match(r"(\d+)-?(\d*)", floor)
  52. if m <> None:
  53. page_start = int(m.group(1))
  54. try:
  55. page_end = int(m.group(2))
  56. except:
  57. page_end = page_start
  58. startexec = datetime.datetime.now()
  59. first = True
  60. query = lucene.BooleanQuery()
  61. query.setMaxClauseCount(2097152)
  62. sorter = lucene.Sort(lucene.SortField("time", lucene.SortField.INT, True))
  63. pageFilter = None
  64. if len(q) > 0:
  65. query.add(lucene.QueryParser(lucene.Version.LUCENE_33, "content", self.analyzers["smartcn"]).parse(q), lucene.BooleanClause.Occur.MUST)
  66. dateFilter = lucene.NumericRangeFilter.newIntRange("time", time_start_secs, time_end_secs, True, True)
  67. else:
  68. query.add(lucene.NumericRangeQuery.newIntRange("time", time_start_secs, time_end_secs, True, True), lucene.BooleanClause.Occur.MUST)
  69. if page_start <> None and page_end <> None:
  70. pageFilter = lucene.NumericRangeFilter.newIntRange("floor", page_start, page_end, True, True)
  71. topScoreCollector = lucene.TopScoreDocCollector
  72. if len(uids) > 0:
  73. uids_str = list()
  74. numfilters = list()
  75. count = 0
  76. for x in uids:
  77. count += 1
  78. uids_str.append(str(x))
  79. numfilter = lucene.NumericRangeFilter.newIntRange("uid", x, x, True, True)
  80. numfilters.append(numfilter)
  81. #if count > 1000:
  82. # break
  83. chainedNumFilters = lucene.ChainedFilter(numfilters, lucene.ChainedFilter.OR)
  84. cachingChainedNumFilters = lucene.CachingWrapperFilter(chainedNumFilters)
  85. if len(q) > 0:
  86. chain = lucene.ChainedFilter([cachingChainedNumFilters,dateFilter, pageFilter], lucene.ChainedFilter.AND)
  87. else:
  88. chain = cachingChainedNumFilters
  89. topDocs = self.searcher.search(query, chain, sorter)
  90. else:
  91. if len(q) > 0 and time_start_secs is not None and time_end_secs is not None:
  92. if pageFilter is not None:
  93. filters = [dateFilter, pageFilter]
  94. chainedFilters = lucene.ChainedFilter(filters, lucene.ChainedFilter.AND)
  95. topDocs = self.searcher.search(query, chainedFilters, self.MAX_ITEMS, sorter)
  96. else:
  97. topDocs = self.searcher.search(query, dateFilter, self.MAX_ITEMS, sorter)
  98. else:
  99. if pageFilter is not None:
  100. topDocs = self.searcher.search(query, pageFilter, self.MAX_ITEMS, sorter)
  101. else:
  102. topDocs = self.searcher.search(query, self.MAX_ITEMS, sorter)
  103. #return "%(nb)d results found in %(secs)f seconds" %
  104. ids = list()
  105. ids_str = list()
  106. hits = list()
  107. count = 0
  108. for scoreDoc in topDocs.scoreDocs:
  109. count += 1
  110. doc = self.searcher.doc(scoreDoc.doc)
  111. id = doc.get("pid")
  112. uid = doc.get("uid")
  113. tid = doc.get("tid")
  114. #ids.append(id)
  115. hit = { "pid": id, "uid": uid, "tid": tid }
  116. hits.append(hit)
  117. #ids_str.append(str(id))
  118. #if count > self.MAX_ITEMS:
  119. #break
  120. out = { "totalhits": topDocs.totalHits, "nb_users": len(uids), "ids": ids, "q": q, "hits": hits }
  121. out["lucene_query_finished"] = long(time.mktime(datetime.datetime.now().timetuple())) * 1000
  122. if len(uids) > 0:
  123. out["user_ids"] = uids_str
  124. # Logging
  125. f = open("/var/data/hkforums/searchlog/%(forum)s.log" % {"forum": self.forum},"a")
  126. f.write(datetime.datetime.strftime(datetime.datetime.now(),"%Y-%m-%d %H:%M:%S") + "\t" + q + "\n")
  127. f.close()
  128. endexec = datetime.datetime.now()
  129. td = endexec - startexec
  130. microtime = td.microseconds + (td.seconds + td.days * 86400) * 1000000
  131. secondstime = microtime / 1000000.0
  132. out["secs"] = secondstime
  133. print out
  134. return out
  135. if __name__ == '__main__':
  136. if len(sys.argv) <= 1:
  137. print SearchSinaWeibo.__doc__
  138. sys.exit(1)
  139. inargs = False
  140. datestart_str = None
  141. dateend_str = None
  142. for i in range(1, len(sys.argv)):
  143. if sys.argv[i].find("-") != 0 and not inargs:
  144. i -= 1
  145. break
  146. else:
  147. inargs = False
  148. if sys.argv[i] == "-ds":
  149. if len(sys.argv) > i + 1:
  150. inargs = True
  151. datestart_str = sys.argv[i+1]
  152. elif sys.argv[i] == "-de":
  153. if len(sys.argv) > i + 1:
  154. inargs = True
  155. dateend_str = sys.argv[i+1]
  156. terms = sys.argv[i+1:len(sys.argv)+1]
  157. if inargs or len(terms) == 0:# or datestart_str is None:
  158. print SearchSinaWeibo.__doc__
  159. sys.exit(1)
  160. if dateend_str is None:
  161. dateend_str = datetime.datetime.strftime(datetime.datetime.now(),"%Y-%m-%d %H:%M")
  162. print terms
  163. print "date start: " + str(datestart_str)
  164. print "date end: " + str(dateend_str)
  165. # Start Lucene
  166. lucene.initVM(lucene.CLASSPATH)
  167. print 'lucene', lucene.VERSION
  168. search = SearchSinaWeibo()
  169. if datestart_str is None and dateend_str is None:
  170. search.searchWeibos(terms)
  171. elif datestart_str is not None:
  172. search.searchWeibos(terms, search.prepareDates(datestart_str))
  173. elif dateend_str is not None:
  174. search.searchWeibos(terms, 0, search.prepareDates(dateend_str))
  175. else:
  176. search.searchWeibos(terms, search.prepareDates(datestart_str), search.prepareDates(dateend_str))