/scripts/baicai/b_bantang.py
Python | 183 lines | 164 code | 18 blank | 1 comment | 27 complexity | 9b48bd6f912cd0b54b3a49017e476df6 MD5 | raw file
- #coding:utf-8
- import tornado
- from tornado import template, httpclient
- import hashlib, urllib
- import sqlalchemy, models
- from sqlalchemy.sql import and_, or_, not_
- from mixin import staff_user
- from decimal import Decimal
- import settings
- import re, logging, random
- import time,datetime
- from datetime import datetime, timedelta
- import json
- from uuid import uuid1
- from celerytask import tasks as async_tasks
- from contrib import alimama
# Source id written to ``deal.source`` for every deal saved by this crawler.
CURRENT_SOURCE = 2

# User-Agent header sent with every crawl request.
IOS_UA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_1 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Mobile/11D201'

# Category url_name -> numeric category id.  The id is substituted for
# ``$cate`` in the topic-list URL and compared against a product's
# ``category`` field when saving.
#
# NOTE(review): the original literal listed "taojujia" twice (first with 1,
# then with 4).  Python keeps only the last value for a duplicated key, so
# id 1 was never effective; the dead entry is dropped here and the resulting
# dict is unchanged.  TODO confirm whether id 1 was meant to belong to a
# different url_name.
CATE_MAP = {
    "taojujia": 4,
    "taomeishi": 6,
    "taodianqi": 14,
    "taomeizhuang": 5,
    "taoqita": 8,
}

# Reverse lookup: numeric category id -> url_name.
MAP_CATE = {cate_id: url_name for url_name, cate_id in CATE_MAP.items()}
def random_datetime():
    """Return a datetime on today's date with a randomised minute and second.

    The hour is the current hour plus two, unless that would reach 22 or
    later, in which case the current hour is kept.  Minute and second are
    each drawn uniformly from 1..59; microseconds are zero.
    """
    current = datetime.now()
    shifted_hour = current.hour + 2
    if shifted_hour >= 22:
        shifted_hour = current.hour
    return current.replace(
        hour=shifted_hour,
        minute=random.randint(1, 59),
        second=random.randint(1, 59),
        microsecond=0,
    )
def fetch_url(url, callback, method='GET', data=None, headers=None,
              follow_redirects=True, use_proxy=False):
    """Issue an asynchronous HTTP request through Tornado's curl client.

    Args:
        url: Target URL.
        callback: Callable passed to ``AsyncHTTPClient.fetch``; it is also
            called with ``None`` if ``fetch`` raises ``HTTPError``.
        method: HTTP method name.
        data: Optional mapping, url-encoded into the request body.
        headers: Optional mapping of request headers.
        follow_redirects: Forwarded to ``HTTPRequest``.
        use_proxy: When true, route the request through the fixed proxy
            106.187.52.236:8080.

    Returns:
        Whatever ``callback(None)`` returns when ``fetch`` raised
        ``HTTPError``; otherwise ``None``.
    """
    httpclient.AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
    http_client = httpclient.AsyncHTTPClient()

    # ``None`` defaults replace the shared mutable ``{}`` defaults.
    body = urllib.urlencode(data) if data else None
    request_kwargs = dict(
        method=method,
        connect_timeout=2.0,
        request_timeout=5.0,
        body=body,
        headers=headers if headers is not None else {},
        follow_redirects=follow_redirects,
    )
    if use_proxy:
        request_kwargs.update(proxy_host='106.187.52.236', proxy_port=8080)
    req = httpclient.HTTPRequest(url, **request_kwargs)

    try:
        http_client.fetch(req, callback)
    except httpclient.HTTPError as e:
        # The original called ``logging.log(msg)`` without a level, which
        # raises TypeError; use an explicit level instead.
        if e.code == 599:
            logging.warning('request timeout|%s', url)
        else:
            logging.warning('request failed|%s|%s', url, e)
        return callback(None)
def crawl_list(url):
    """Request a topic-list URL; the response goes to ``_callback_crawl_list``."""
    request_headers = {
        'User-Agent': IOS_UA,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    fetch_url(url, _callback_crawl_list, headers=request_headers, use_proxy=False)
- def _callback_crawl_list(resp):
- if resp.code < 400:
- data = json.loads(resp.body)
- list_data = data["data"]["topic"]
- for topic in list_data:
- list_url = 'http://open3.bantangapp.com/topic/info?app_installtime=1436362830.922099&app_versions=4.1&channel_name=appStore&client_id=bt_app_ios&client_secret=9c1e6634ce1c5098e056628cd66a17a5&id=$cate&oauth_token=13248832ebfb692ab92d7a60d7051da0&os_versions=8.4.1&screensize=1242&statistics_uv=1&track_device_info=iPhone&track_deviceid=98D698CD-874B-40AF-919E-B346ECA78A47&track_user_id=1503&v=6'
- list_url = list_url.replace("$cate", str(topic["id"]))
- do_crawl(list_url)
def do_crawl(url):
    """Print ``url`` and request it; the response goes to ``_callback_crawl``."""
    print(url)
    request_headers = {
        'User-Agent': IOS_UA,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    fetch_url(url, _callback_crawl, headers=request_headers, use_proxy=False)
- def _callback_crawl(resp):
- if resp.code < 400:
- zhe_data = json.loads(resp.body)
- print 'bantang success'
- for good in zhe_data['data']["product"]:
- save_item(good)
- else:
- print 'ERROR:', resp.code
def add_to_topic(deal, url_name, session):
    """Link ``deal`` to a topic when it qualifies.

    Topic 1 is used for url_name 'baoyou', topic 2 for everything else.
    Nothing happens when the deal's price exceeds the topic's cap (9.9 for
    topic 1, 20 for topic 2), when the deal is not on sale, or when
    ``url_name`` is neither 'baoyou' nor 'fengding'.  Otherwise a
    ``models.DealInTopic`` row is created and committed unless one with an
    id already exists for this (topic, deal) pair.

    Args:
        deal: Object exposing ``id``, ``now_price`` and ``is_onsale``.
        url_name: Category url name.
        session: Database session used for the lookup, add and commit.
    """
    topic_id = 1 if url_name == 'baoyou' else 2
    if deal.id == 0:
        print('deal id is 0--------------')

    # Guard clauses: price cap per topic, then sale state, then category.
    price_cap = 9.9 if topic_id == 1 else 20
    if deal.now_price > price_cap:
        return
    if not deal.is_onsale:
        return
    if url_name not in ('baoyou', 'fengding'):
        return

    link = session.query(models.DealInTopic).filter_by(topic_id=topic_id, deal_id=deal.id).first()
    if not link:
        link = models.DealInTopic()
        link.topic_id = topic_id
        link.deal_id = deal.id
    if not link.id:
        session.add(link)
        session.commit()
        print('add to topic success')
# NOTE(review): account credentials are hard-coded in source; consider
# loading them from configuration instead.
_ALIMAMA_USERNAME = 'zhangxiaolei1982'
_ALIMAMA_PASSWORD = 'zxl234567'

# Shared client: ``main`` logs it in and ``save_item`` uses it to convert
# item ids into taoke URLs.
taobao_union = alimama.TaobaoUnion(_ALIMAMA_USERNAME, _ALIMAMA_PASSWORD)

# Module-level counter, initialised to zero.
FROM_TAOBAO_COUNT = 0
def save_item(good):
    """Persist one crawled product as a ``models.DealItem``.

    Products without an ``item_id`` are skipped.  If no deal with the same
    ``num_iid`` exists, a new one is inserted; otherwise the existing deal's
    prices, time window and state are refreshed.  In both cases the deal is
    offered to ``add_to_topic`` and the ``promotion_detail`` task is queued.

    Args:
        good: Product dict from the topic-detail response.  Reads the keys
            ``item_id``, ``title``, ``category``, ``pic`` (list of dicts with
            a ``pic`` key) and ``price``.
    """
    num_iid = good['item_id']
    if not num_iid:
        # Skip before opening a session: the original opened one here and
        # never closed it.
        return

    session = models.DefaultSession()
    try:
        deal = models.DealItem()
        # NOTE(review): the literals stripped from the title and the price
        # look mis-encoded in this copy of the file (probably CJK text);
        # they are kept byte-for-byte -- verify against the original source.
        deal.title = good['title'].replace(u'?~@~P?~J?~W??~J?~V??~@~Q', '').encode('utf-8').replace('?~@~P?~J?~W??~J?~V??~@~Q', '')
        # Unknown categories fall back to "taojujia".
        cate_url_name = MAP_CATE.get(int(good["category"])) or "taojujia"
        deal.cate_id = models.StaticCategory_Dict.get(cate_url_name).get('id')
        deal.pic_url = good['pic'][0]['pic']
        price = Decimal(good['price'].replace(u'?~E~C', ''))
        deal.now_price = price
        deal.origin_price = price
        deal.begin_time = datetime.now()
        deal.expire_time = datetime.now() + timedelta(days=3)
        deal.source = CURRENT_SOURCE
        deal.is_onsale = False
        deal.date_created = random_datetime()

        print('num_iid %s' % num_iid)
        existing = session.query(models.DealItem).filter_by(num_iid=num_iid).first()
        if not existing:
            deal.num_iid = num_iid
            try:
                deal.taoke_url = taobao_union.convert_url(num_iid)
            except alimama.LoginErrorException:
                # Best effort: the deal is still stored, just not on sale.
                logging.warning('taoke url conversion failed (login)|%s', num_iid)
            if deal.taoke_url:
                deal.is_onsale = True
            session.add(deal)
            session.commit()
            add_to_topic(deal, cate_url_name, session)
        else:
            existing.now_price = deal.now_price
            existing.origin_price = deal.origin_price
            existing.expire_time = deal.expire_time
            existing.begin_time = deal.begin_time
            existing.state = 0
            if existing.is_onsale == False:  # noqa: E712 -- a None value must not take this branch
                try:
                    existing.taoke_url = taobao_union.convert_url(num_iid)
                except alimama.LoginErrorException:
                    # Guarded like the new-deal path above; previously this
                    # call could raise and abort the whole update.
                    logging.warning('taoke url conversion failed (login)|%s', num_iid)
                if existing.taoke_url:
                    existing.is_onsale = True
            else:
                print('%s %s' % ('已?~X?~\?', str(num_iid)))
            add_to_topic(existing, cate_url_name, session)

        # Queue the promotion_detail task for this item.
        async_tasks.promotion_detail.apply_async(args=[num_iid], kwargs={}, timeout=30, soft_timeout=10)
        session.commit()
    finally:
        # Always release the session, including when an exception escapes.
        session.close()
def main():
    """Log in, crawl the topic list of every category, then reschedule.

    Each category id in ``CATE_MAP`` is substituted for ``$cate`` in the
    topic-list URL and passed to ``crawl_list``.  Afterwards ``main`` is
    registered on the IOLoop to run again 24 hours later.
    """
    one_day_seconds = 24 * 60 * 60
    topic_list_template = 'http://open3.bantangapp.com/topic/list?app_installtime=1436362830.922099&app_versions=4.1&category=$cate&channel_name=appStore&client_id=bt_app_ios&client_secret=9c1e6634ce1c5098e056628cd66a17a5&oauth_token=13248832ebfb692ab92d7a60d7051da0&os_versions=8.4.1&page=0&pagesize=20&screensize=1242&track_device_info=iPhone&track_deviceid=98D698CD-874B-40AF-919E-B346ECA78A47&track_user_id=1503&v=6'

    taobao_union.login()
    for url_name, cate_id in CATE_MAP.items():
        if url_name != 'all':
            print(url_name)
            crawl_list(topic_list_template.replace('$cate', str(cate_id)))

    io_loop = tornado.ioloop.IOLoop.instance()
    io_loop.add_timeout(timedelta(seconds=one_day_seconds), main)
if __name__ == "__main__":
    # Kick off the first crawl (which schedules the following ones), then
    # hand control to the Tornado event loop.
    main()
    event_loop = tornado.ioloop.IOLoop.instance()
    event_loop.start()