
/achewood2/utils/importstrips.py

https://github.com/fish2000/achewood
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import with_statement
import sys
from django.core.management import setup_environ
from achewood2 import settings
setup_environ(settings)

# Prefer the faster re2 bindings when available; otherwise fall back to the stdlib re module.
try:
    import re2 as re
except ImportError:
    import re
else:
    re.set_fallback_notification(re.FALLBACK_WARNING)

import os, urllib2, urlparse, datetime
from django.db.models import Q
from django.utils.html import strip_tags, strip_entities
from django.core.exceptions import ObjectDoesNotExist
from django.core.files import File
from django.core.files.temp import NamedTemporaryFile
from django.template.loader import render_to_string
from BeautifulSoup import BeautifulSoup, UnicodeDammit
from achewood2.utils.monkeypatch import memoize
from achewood2.localache.models import AWComic, AWImage, AWCalendarMonth
def soup(url):
    """Fetch a URL and return its markup wrapped in a BeautifulSoup tree."""
    uh = urllib2.urlopen(url)
    u = uh.read()
    uh.close()
    return BeautifulSoup(u)
title_re = re.compile(r'<h2>(.*?)&nbsp;')
monthly_re = re.compile(r"archive\?start=\d\d\d\d\d\d\d\d")
sre = re.compile(r'(\w\w\w\w)|(1st)')
monthnames = ('Never', "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december")
monthabbrevs = [m[0:3] for m in monthnames]
def monthindex(monthname):
    """Return the 1-based month number for a full or abbreviated month name, or -1 if unknown."""
    try:
        return monthnames.index(monthname)
    except ValueError:
        try:
            return monthabbrevs.index(monthname)
        except ValueError:
            return -1
def AWAchewoodDate(yyyy, mm, dd):
    """Format a date as mmddyyyy, as achewood.com expects; two-digit years are treated as 20yy."""
    return "%02d%02d%04d" % (int(mm), int(dd), (int(yyyy) < 100 and (2000 + int(yyyy)) or int(yyyy)))

def AWAssetbarDate(yyyy, mm, dd):
    """Format a date as yyyymmdd, as m.assetbar.com expects; two-digit years are treated as 20yy."""
    return "%04d%02d%02d" % ((int(yyyy) < 100 and (2000 + int(yyyy)) or int(yyyy)), int(mm), int(dd))

def AWAchewoodURL(yyyy, mm, dd):
    return "http://achewood.com/index.php?date=%s" % AWAchewoodDate(yyyy, mm, dd)
@memoize
def AWAssetbarURLStringsForMonth(yyyy=None, mm=None, data=None):
    """
    Returns a dict of assetbar.com archive URL strings,
    keyed by Achewood dates (not AWAssetbarDate dates!)
    """
    if not data:
        archurl = "http://m.assetbar.com/achewood/archive?start=%s" % AWAssetbarDate(yyyy, mm, 1)
        records = soup(archurl).findAll('div', {'class': "one_record"})
    else:
        records = BeautifulSoup(data).findAll('div', {'class': "one_record"})
    return dict(zip(
        map(lambda r: unicode(r.find('div', {'class': "title"}).contents[2]).replace('/', ''), records),
        map(lambda r: r.find('a')['href'], records),
    ))
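# The mapping returned above pairs Achewood-style mmddyyyy keys with the relative
# hrefs scraped from each "one_record" div -- roughly {"06032008": "one_strip?...", ...}.
# The exact href format is whatever m.assetbar.com serves, so treat that shape as
# an illustrative sketch rather than a guarantee.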
@memoize
def AWAssetbarURL(yyyy=None, mm=None, dd=None, urlstring=None):
    if not urlstring:
        archdate = AWAchewoodDate(yyyy, mm, dd)
        archmonthstrings = AWAssetbarURLStringsForMonth(yyyy, mm)
        if archdate in archmonthstrings:
            urlstring = archmonthstrings[archdate]
        else:
            return None
    return "http://m.assetbar.com/achewood/%s" % urlstring
@memoize
def AWGetStripAssetbarData(yyyy=None, mm=None, dd=None, urlstring=None):
    """
    Get a dict of AssetBar.com data for a given date.
    Returns something like this:
    {
        'imgurl':       # URL of the strip image data
        'title':        # title string (from the page's <h2> heading)
        'alttxt':       # alt text from the strip <img> tag
        'prev':         # 'prev' assetbar URL string
        'next':         # 'next' assetbar URL string
        'urlstring':    # assetbar URL string for this strip
        'month', 'day', 'year':  # date components as strings
    }
    ... any of whose members might be None if it couldn't be sorted out.
    """
    if not urlstring:
        assetbarurl = AWAssetbarURL(yyyy, mm, dd)
        nodes = soup(assetbarurl)
        #urlstring = AWAssetbarURLStringsForMonth(yyyy, mm)[AWAchewoodDate(yyyy, mm, dd)]
        urlstring = assetbarurl.replace('http://m.assetbar.com/achewood/', '')
    else:
        nodes = soup(AWAssetbarURL(urlstring=urlstring))
    content = nodes.find('div', {'id': 'content'})
    date = content.find('h2').find('span', {'class': "date"}).contents[0].split('/')
    img = content.find('img')
    m = title_re.search(unicode(content.find('h2')))
    title = m.group(1)
    img_url = urlparse.urljoin('http://m.assetbar.com/', img['src'], allow_fragments=False)
    alt_text = img['title']
    month, day, year = date[0], date[1], date[2]
    prevnext = dict(map(lambda a: (sre.search(a.string).group(), a['href']), nodes.find('span', {'class': "prevnext"}).findAll('a')))
    prevnext.update({
        'urlstring': urlstring,
        'month': month,
        'day': day,
        'year': year,
        'imgurl': img_url,
        'title': title,
        'alttxt': alt_text,
    })
    return prevnext
@memoize
def AWGetStripAchewoodData(yyyy=None, mm=None, dd=None, urlstring=None):
    """
    Get a dict of Achewood.com data for a given date.
    Returns this:
    {
        'alttxt':   # alt text from the strip <img> tag
        'url':      # URL the strip points to when clicked
                    # (usually an m.assetbar.com URL but sometimes different)
    }
    """
    if urlstring:
        bar = AWGetStripAssetbarData(urlstring=urlstring)
        yyyy, mm, dd = bar['year'], bar['month'], bar['day']
    nodes = soup(AWAchewoodURL(yyyy, mm, dd))
    try:
        alttxt = nodes.find('p', {'id': "comic_body"}).find('img')['title']
    except TypeError:
        alttxt = ""
    return {
        'alttxt': alttxt,
        'url': nodes.find('p', {'id': "comic_body"}).find('a')['href'],
    }
#@memoize
def AWGetStripDialogue(yyyy=None, mm=None, dd=None, urlstring=None):
    """
    Get a strip's dialogue from ohnorobot.com for a given date.
    This works by fetching a search URL like this:
        "http://www.ohnorobot.com/index.pl?s=%s+%s+%s&Search=Search&comic=636&e=0&n=0&b=0&m=0&d=0&t=0" % (
            monthnames[mm], dd, yyyy
        )
    ... and looking up the specific AWAchewoodDate in the mess of returned data to find the dialogue.
    """
    if urlstring:
        bar = AWGetStripAssetbarData(urlstring=urlstring)
        yyyy, mm, dd = bar['year'], bar['month'], bar['day']
    dsurl = "http://www.ohnorobot.com/index.pl?s=%s+%s+%s&Search=Search&comic=636&e=0&n=0&b=0&m=0&d=0&t=0" % (
        monthnames[int(mm)], dd, yyyy
    )
    dsearch = soup(dsurl)
    dlg = filter(lambda li: li.find('a', {'class': "searchlink", 'href': re.compile("%s$" % AWAchewoodDate(yyyy, mm, dd))}), dsearch.findAll('li'))
    if len(dlg) == 1:
        #return strip_entities(strip_tags(dlg.pop()))
        return strip_tags(dlg.pop())
    return u""
@memoize
def AWGetStripData(yyyy=None, mm=None, dd=None, urlstring=None):
    """Combine the AssetBar data for a strip with its Achewood click-through URL and ohnorobot dialogue."""
    if urlstring:
        bar = AWGetStripAssetbarData(urlstring=urlstring)
        yyyy, mm, dd = bar['year'], bar['month'], bar['day']
    else:
        bar = AWGetStripAssetbarData(yyyy, mm, dd)
        urlstring = bar['urlstring']
    out = bar
    out.update({
        'url': AWGetStripAchewoodData(yyyy, mm, dd).get('url'),
        'dialogue': AWGetStripDialogue(yyyy, mm, dd),
    })
    return out
@memoize
def AWGetAssetbarMonths(url="http://m.assetbar.com/achewood/archive"):
    """Scrape the archive index for the (yyyy, mm) months it links to."""
    pageText = urllib2.urlopen(url).read()
    index_pages = [u.split('=')[1] for u in monthly_re.findall(pageText)]
    return map(lambda m: (m[0:4], m[4:6]), index_pages)
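# The archive links matched by monthly_re look like "archive?start=yyyymmdd",
# so the list returned above is a sequence of (year, month) string pairs --
# e.g. [("2008", "06"), ...] for a hypothetical June 2008 entry.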
def AWGetFilenameForURL(url, default=None):
    urlpieces = urlparse.urlparse(str(url))
    try:
        return os.path.basename(urlpieces[2])
    except IndexError:
        return default

def AWGetSuffixForURL(url, default=None):
    fn = AWGetFilenameForURL(url, default)
    if fn:
        if not fn.rfind('.') == -1:
            try:
                return fn.rsplit('.', 1)[1].lower()
            except IndexError:
                return default
    return default
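# For a typical strip image URL these helpers behave like so (the URL here is
# hypothetical; only the path's basename and extension matter):
#   AWGetFilenameForURL("http://m.assetbar.com/img/achewood/strip.gif") -> "strip.gif"
#   AWGetSuffixForURL("http://m.assetbar.com/img/achewood/strip.gif")   -> "gif"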
def AWGetTemporaryFileForURL(url, **kwargs):
    """Download a URL into a NamedTemporaryFile; returns None on a non-HTTP URL or a download error."""
    if str(url).startswith('http'):
        suffix = "gif"
        if 'suffix' in kwargs:
            suffix = kwargs['suffix']
            del kwargs['suffix']
        itemp = NamedTemporaryFile(suffix=(".%s" % suffix), **kwargs)
        try:
            itemp.write(urllib2.urlopen(url).read())
        except urllib2.URLError, urlerr:
            itemp.close()
            itemp = None
        else:
            itemp.flush()
        return itemp
    else:
        return None
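# Sketch of how this download helper is used further down in get_images(),
# where the temporary file gets wrapped in a Django File object:
#   ff = File(AWGetTemporaryFileForURL(c.imageurl, suffix='gif'))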
# backported from original script
def repairEntities(brokenText):
    """Fix mis-encoded HTML entities that show up in scraped titles and alt text."""
    fixedText = brokenText
    replacements = [
        (r'&Atilde;&reg;', '&icirc;'),
        (r'&Atilde;&copy;', '&eacute;'),
        (r'&acirc;&euro;&trade;', '&rsquo;'),
        (r'&#147;', '&lsquo;'),
        (r'&#148;', '&rsquo;'),
        (r'&Acirc;&cent;', '&cent;'),
        (r'&Acirc;&rsquo;', '&rsquo;'),
        (r' & ', ' &amp; '),
    ]
    for subSearch, subReplace in replacements:
        fixedText = re.subn(subSearch, subReplace, fixedText)[0]
    return fixedText
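# For example, a scraped title like "Ray&Atilde;&copy;" comes back from
# repairEntities() as "Ray&eacute;" -- the word is hypothetical, but the
# substitution is exactly the first pair in the table above.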
def main(argv=None):
    mths = get_months()
    get_data(mths)
    cmx = AWComic.objects.filter(visible=True).order_by('postdate')
    get_alturls(cmx)
    get_dialogue(cmx)
    get_images(cmx)
    generate_pages(cmx)
    return 0
def get_months():
    print "Getting archive months from main URL..."
    mths = AWGetAssetbarMonths()
    print "Got %s total months" % len(mths)
    for yyyy, mm in mths:
        mth = AWCalendarMonth.objects.month(yyyy, mm)
        if not mth.id:
            print "Caching month: %s %s" % (monthnames[int(mm)].capitalize(), yyyy)
            mth.url = "http://m.assetbar.com/achewood/archive?start=%s" % AWAssetbarDate(yyyy, mm, 1)
            mth.data = urllib2.urlopen(mth.url).read()
            mth.title = "%s %s" % (monthnames[int(mm)], yyyy)
            mth.save()
    return mths
def get_data(mths=None):
    if not mths:
        print "Getting archive months from main URL..."
        mths = AWGetAssetbarMonths()
        print "Got %s total months" % len(mths)
    for yyyy, mm in mths:
        mth = AWCalendarMonth.objects.month(yyyy, mm)
        if not mth.id:
            print "Caching month: %s %s" % (monthnames[int(mm)].capitalize(), yyyy)
            mth.url = "http://m.assetbar.com/achewood/archive?start=%s" % AWAssetbarDate(yyyy, mm, 1)
            mth.data = urllib2.urlopen(mth.url).read()
            mth.title = "%s %s" % (monthnames[int(mm)], yyyy)
            mth.save()
        bar = AWAssetbarURLStringsForMonth(data=mth.data)
        print "%s %s: %s strips" % (monthnames[int(mm)].capitalize(), yyyy, len(bar))
        for d, strip in bar.items():
            try:
                c = AWComic.objects.get(asseturlstring=strip)
            except ObjectDoesNotExist:
                #data = AWGetStripData(yyyy, int(mm), dd)
                try:
                    dd = int(d[2:4])
                    data = AWGetStripAssetbarData(yyyy, int(mm), dd)
                except ValueError:
                    data = AWGetStripAssetbarData(urlstring=strip)
                print u">>>\t %s\t %s" % (
                    d, data['title']
                )
                c = AWComic()
                c.postdate = datetime.date(
                    int(data['year']),
                    int(data['month']),
                    int(data['day']),
                )
                c.title = repairEntities(data['title'])
                c.alttext = repairEntities(data['alttxt'])
                c.asseturlstring = data['urlstring']
                c.imageurl = data['imgurl']
                #c.alturl = data['url']
                #c.dialogue = data['dialogue']
                c.alturl = 'later'
                c.dialogue = 'later'
                c.save()
            else:
                print "---\t %s\t %s" % (d, c.title,)
        print ""
    print ""
def get_alturls(comix=None):
    if comix is None:
        comix = AWComic.objects.filter(visible=True).order_by('postdate')
    print ""
    print "Getting alternative URLs for %s cached comics..." % comix.count()
    for c in comix.filter(Q(alturl__istartswith="later") | Q(alturl__isnull=True) | Q(alturl__exact="")):
        yyyy, mm, dd = (c.postdate.year, c.postdate.month, c.postdate.day)
        c.alturl = AWGetStripAchewoodData(yyyy, mm, dd).get('url')
        c.save()
        print "---\t %s\t %s" % (c.assetbardate, c.alturl)
def get_dialogue(comix=None):
    if comix is None:
        comix = AWComic.objects.filter(visible=True).order_by('postdate')
    print ""
    print "Getting dialogue for %s cached comics..." % comix.count()
    for c in comix.filter(Q(dialogue__istartswith="later") | Q(alturl__isnull=True)):
        yyyy, mm, dd = (c.postdate.year, c.postdate.month, c.postdate.day)
        c.dialogue = AWGetStripDialogue(yyyy, mm, dd)
        c.save()
        print "---\t %s\t %s ..." % (c.assetbardate, c.dialogue[0:100])
def get_images(comix=None):
    if comix is None:
        comix = AWComic.objects.filter(visible=True).order_by('postdate')
    print ""
    print "Getting images for %s cached comics..." % comix.count()
    for c in comix:
        print ""
        try:
            im = c.awimage
            if im is None:
                print "WTF: NoneType found for c.awimage (comic %s)" % c.id
                raise ObjectDoesNotExist
        except ObjectDoesNotExist:
            suffix = AWGetSuffixForURL(c.imageurl, default='gif')
            tn = "%s.%s" % (c.assetbardate, suffix)
            where = os.path.join(AWImage.gettargetpath(), tn)
            if os.path.exists(where):
                # reuse a previously downloaded file rather than re-fetching;
                # open in binary mode since these are image files
                ff = File(open(where, 'rb'))
                print "---\t Existing image file: %s" % tn
            else:
                ff = File(AWGetTemporaryFileForURL(c.imageurl, suffix=suffix))
                print ">>>\t New image file: %s" % tn
            if ff:
                cim = AWImage()
                cim.comic = c
                cim.image.save(tn, ff)
                cim.save()
                print ">>>\t New image object: %s" % os.path.basename(cim.image.name)
        else:
            print "---\t Existing image object: %s" % os.path.basename(im.image.name)
    print ""
def generate_pages(comix=None):
    if comix is None:
        comix = AWComic.objects.filter(visible=True).order_by('postdate')
    for i in xrange(comix.count()):
        this_comic = comix[i]
        print "Processing HTML for %s" % this_comic.assetbardate
        if i < comix.count() - 1:
            nextpage = '%s.html' % comix[i + 1].assetbardate
        else:
            nextpage = "#"
        if i > 0:
            previouspage = '%s.html' % comix[i - 1].assetbardate
        else:
            previouspage = "#"
        if not this_comic.alturl.startswith('http://m.assetbar.com/achewood/one_strip'):
            alturl = this_comic.alturl
        else:
            alturl = None
        tplout = os.path.join(settings.MEDIA_ROOT, "%s.html" % this_comic.assetbardate)
        if os.path.exists(tplout):
            os.remove(tplout)
        with open(tplout, 'w') as f:
            f.write(render_to_string("localache-django.html", {
                'nextpage': nextpage,
                'previouspage': previouspage,
                'title': this_comic.title,
                'postdate': "%02d %s %04d" % (
                    this_comic.postdate.day,
                    monthabbrevs[int(this_comic.postdate.month)].capitalize(),
                    this_comic.postdate.year,
                ),
                'comic': this_comic.imagename,
                'alttxt': this_comic.alttext,
                #'alturl': alturl,
            }).encode('utf-8'))
if __name__ == "__main__":
    sys.exit(main(argv=sys.argv))