/crawler.py

https://github.com/xinzhengzhang/ssdut_news_server · Python · 154 lines

#!/usr/bin/env python
#encoding=utf-8
from models import *
import traceback
from sqlalchemy import func
import db
import parser as par
import config
from utils import TornadoFormatter
import time
import logging
from urllib2 import urlopen
import string

SITE_URL = 'http://ssdut.dlut.edu.cn'


class SSdutSiteCrawler(object):
    def __init__(self):
        ''' use tornado LogFormatter '''
        self._news_url_template = string.Template(
            SITE_URL + "/index.php/News/student/p/$p/")
        self._init_going = False

    def page_url(self, p):
        url = self._news_url_template.substitute(p=p)
        logging.debug("page url = %r" % url)
        return url
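
    # Illustrative example (not in the original source): with the template above,
    # page_url(2) would produce
    # "http://ssdut.dlut.edu.cn/index.php/News/student/p/2/".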

    def get_page_result(self, p):
        src = urlopen(self.page_url(p)).read()
        return par.ssdut_news_list(src)

    def update_db(self, p=1):
        # TODO fix hole, update
        db_max_id = db.ses.query(func.max(New.id)).one()[0]
        site_res = self.get_page_result(1)
        logging.info("records on site = %r, max_id in db = %r" %
                     (site_res.total_records, db_max_id))
        news_id = site_res.total_records
        if db_max_id < site_res.total_records:
            n = site_res.total_records - db_max_id
            logging.info("will update %r news" % n)
            # update the news here
            # assume that n <= 12
            for new in site_res.news_list:
                if n <= 0:
                    break
                n -= 1
                print n
                # do the update
                src = urlopen(SITE_URL + new['link']).read()
                detail = par.ssdut_news_parse(src)
                r = New(
                    id=news_id,
                    raw=detail.raw,
                    title=detail.title,
                    link=new['link'],
                    body=detail.body,
                    clean_body=detail.clean_body,
                    date=detail.date,
                    publisher=detail.publisher,
                    source=detail.source,
                    source_link=new['source_link'],
                    sha1=detail.sha1,
                    search_text=detail.search_text)
                logging.info("%r added to db, id = %r" % (r, r.id))
                db.ses.add(r)
                db.ses.commit()
                news_id -= 1
        else:
            logging.info("no news to update")
        logging.debug("update finished")
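
    # Hedged sketch (illustrative, not in the original file): the "TODO fix hole"
    # note above suggests that comparing max(New.id) against total_records can miss
    # articles once the numbering drifts. Assuming New.sha1 uniquely identifies an
    # article, a per-item duplicate check could look like:
    #
    #     def news_already_stored(self, sha1):
    #         return db.ses.query(New).filter(New.sha1 == sha1).count() > 0
    #
    # update_db() could then skip items whose sha1 is already in the table instead
    # of relying only on the id gap.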

    def reset_news_db(self):
        ''' crawl the news list pages (1-219) and store them in the db '''
        # delete all records in db
        for r in New.query.all():
            db.ses.delete(r)
        db.ses.commit()
        logging.debug("deleted all news records in db")

        # get all the news links
        res_list = []
        for p in xrange(1, 220):
            res_list.append(self.get_page_result(p))

        # get news detail and store in db
        news_id = res_list[0].total_records
        for page in res_list:
            for new in page.news_list:
                # try:
                src = urlopen(SITE_URL + new['link']).read()
                detail = par.ssdut_news_parse(src)
                r = New(
                    id=news_id,
                    raw=detail.raw,
                    title=detail.title,
                    link=new['link'],
                    body=detail.body,
                    clean_body=detail.clean_body,
                    date=detail.date,
                    publisher=detail.publisher,
                    source=detail.source,
                    source_link=new['source_link'],
                    sha1=detail.sha1,
                    search_text=detail.search_text)
                db.ses.add(r)
                db.ses.commit()
                logging.info("%r, added, link=%r, page_no = %r" %
                             (r, r.link, page.page_no))
                news_id -= 1
                # except:
                #     traceback.print_exc()
                #     logging.error("error, r = %r" % r)
                #     logging.error("page no = %r" % page.page_no)
                # finally:
                #     news_id -= 1
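
    # Note (editorial, not in the original file): the commented-out try/except/finally
    # above is the only place the "traceback" import would be used; as the code stands,
    # a single failed fetch or parse aborts the whole reset_news_db() run.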


if __name__ == "__main__":
    updater = SSdutSiteCrawler()

    # set up the log format
    lg = logging.getLogger()
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)
    # console_handler.setFormatter(TornadoFormatter(color=True))
    file_handler = logging.FileHandler('crawler.log')
    file_handler.setLevel(logging.DEBUG)
    # file_handler.setFormatter(TornadoFormatter(color=False))
    lg.addHandler(console_handler)
    lg.addHandler(file_handler)
    lg.setLevel(logging.DEBUG)

    if kv.db_inited:
        logging.info("Initial data already loaded, begin updating")
    else:
        logging.info("begin crawling initial data...")
        updater.reset_news_db()
        kv.db_inited = 'true'
        logging.info("db init finished")

    while True:
        updater.update_db()
        time.sleep(config.update_interval)
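
# Behavior summary (editorial comment, not in the original file): run with
# "python crawler.py" under Python 2 (urllib2, xrange, print statement). On the
# first run (kv.db_inited unset) it crawls every list page via reset_news_db();
# afterwards it loops forever, calling update_db() and sleeping
# config.update_interval seconds between passes, logging to the console and to
# crawler.log.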