脚本的主要工作是模拟浏览器登录,然后解析“已买到的宝贝”页面,以获得指定的订单及宝贝信息。
使用方法见代码,或在执行命令时加参数 -h 查看;另外需要 BeautifulSoup4 支持,BeautifulSoup 的官方下载页:https://www.crummy.com/software/beautifulsoup/bs4/download/
首先来说一下代码使用方法:
python taobao.py -u username -p password -s start-date -e end-date --verbose
所有参数均可选,如:
python taobao.py -u jinnlynn
统计用户jinnlynn所有订单的情况
python taobao.py -s 2014-12-12 -e 2014-12-12
统计用户(用户名在命令执行时会要求输入)在2014-12-12当天的订单情况
python taobao.py --verbose
这样就可以统计并输出订单明细。
好了,说了这么多我们就来看代码吧:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Summarise Taobao orders by scraping the "bought items" page (Python 2).

The script logs in through the Taobao login endpoint, walks every page of
the bought-items list, and prints spending statistics for the requested
date range.  It depends on ``beautifulsoup4``.

NOTE(review): this listing was recovered from a copy in which all text had
been lower-cased and two spans had been dropped.  Python keywords, builtin
names, library class names, ``platform.system()`` values and ``strftime``
directives have been restored to their required spelling.  Service-specific
strings (form field names, element ids, query keys, order status codes) are
kept exactly as they appeared because their original case cannot be
recovered from the text -- confirm them against the live site before use.
"""

from __future__ import unicode_literals, print_function, absolute_import, division

import urllib
import urllib2
import urlparse
import cookielib
import re
import sys
import os
import json
import subprocess
import argparse
import platform
from getpass import getpass
from datetime import datetime
from pprint import pprint

try:
    from bs4 import BeautifulSoup
except ImportError:
    sys.exit('beautifulsoup4 missing.')

__version__ = '1.0.0'
__author__ = 'jinnlynn'
__copyright__ = 'copyright (c) 2014 jinnlynn'
__license__ = 'the mit license'

# Request headers sent with every call made through ``_request``.
# NOTE(review): 'x-requestted-with', 'contenttype' and 'chartset' look like
# misspellings of 'X-Requested-With', 'Content-Type' and 'charset'.  They are
# kept as found because changing them alters what the server receives.
headers = {
    'x-requestted-with': 'XMLHttpRequest',
    'accept-language': 'zh-CN',
    'accept-encoding': 'gzip, deflate',
    'contenttype': 'application/x-www-form-urlencoded; chartset=utf-8',
    'cache-control': 'no-cache',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/40.0.2214.38 Safari/537.36',
    'connection': 'keep-alive',
}

# Base form fields for the login POST; ``login_post`` overlays the real values.
default_post_data = {
    'tpl_username': '',         # user name
    'tpl_password': '',         # password
    'tpl_checkcode': '',
    'need_check_code': 'false',
    'callback': '0',            # any value makes the endpoint answer in JSON
}

# Order states that are excluded from the totals.
invalid_order_states = [
    'create_closed_of_taobao',  # cancelled
    'trade_closed',             # order closed
]

login_url = 'https://login.taobao.com/member/login.jhtml'

# Encoding used for console prompts.
raw_imput_encoding = 'gbk' if platform.system() == 'Windows' else 'utf-8'


def _request(url, data, method='post'):
    """Send an HTTP request with the shared ``headers`` and return the response.

    ``data`` is an optional mapping.  For ``method='get'`` it is appended to
    ``url`` as a query string; otherwise it is sent as the url-encoded body.
    """
    if data:
        data = urllib.urlencode(data)
    if method == 'get':
        if data:
            url = '{}?{}'.format(url, data)
        data = None
    req = urllib2.Request(url, data, headers)
    return urllib2.urlopen(req)


def stdout_cr(msg=''):
    """Overwrite the current console line with ``msg`` (blank clears it)."""
    sys.stdout.write('\r{:10}'.format(' '))
    sys.stdout.write('\r{}'.format(msg))
    sys.stdout.flush()


def get(url, data=None):
    """Issue a GET request; ``data`` becomes the query string."""
    return _request(url, data, method='get')


def post(url, data=None):
    """Issue a POST request; ``data`` becomes the form body."""
    return _request(url, data, method='post')


def login_post(data):
    """POST the login form and return the decoded JSON reply.

    ``data`` is overlaid on a *copy* of ``default_post_data`` so the
    module-level defaults are never mutated between attempts.
    """
    login_data = dict(default_post_data)
    login_data.update(data)
    res = post(login_url, login_data)
    return json.load(res, encoding='gbk')


def login(usr, pwd):
    """Log in as ``usr``/``pwd``, prompting for a captcha when one is demanded.

    Exits the process with an error message when the login is rejected or
    the follow-up ``st`` value cannot be found in the reply.
    """
    data = {
        'tpl_username': usr.encode(
            'utf-8' if platform.system() == 'Windows' else 'gb18030'),
        'tpl_password': pwd,
    }

    # 1. Try to log in, retrying for as long as a captcha is requested.
    ret = login_post(data)
    while not ret.get('state', False):
        code = ret.get('data', {}).get('code', 0)
        if code == 3425 or code == 1000:
            print('info: {}'.format(ret.get('message')))
            check_code = checkcode(ret.get('data', {}).get('ccurl'))
            data.update({'tpl_checkcode': check_code,
                         'need_check_code': 'true'})
            ret = login_post(data)
        else:
            sys.exit('error. code: {}, message:{}'.format(
                code, ret.get('message', '')))

    token = ret.get('data', {}).get('token')
    print('login success. token: {}'.format(token))

    # 2. Redirect.
    # 2.1 Exchange the token for an ``st`` value.
    res = get('https://passport.alipay.com/mini_apply_st.js', {
        'site': '0',
        'token': token,
        'callback': 'stcallback4'})
    content = res.read()
    # The recovered pattern was r'st:(\s*)( |})', whose group can only ever
    # capture whitespace.  Capture the value itself instead, tolerating
    # optional quotes around both the key and the value.
    # NOTE(review): the exact reply format is not visible here -- confirm.
    match = re.search(r'"?st"?\s*:\s*"?([^"\s,}]+)', content)
    if match is None:
        sys.exit('error. st value not found in response')
    st = match.group(1)

    # 2.2 Follow the redirect that establishes the session.
    # NOTE(review): 'tpl_uesrname' looks like a misspelling of the user-name
    # field; kept as found.
    get('http://login.taobao.com/member/vst.htm',
        {'st': st, 'tpl_uesrname': usr.encode('gb18030')})


def checkcode(url):
    """Download the captcha image at ``url``, show it, and return typed text."""
    filename, _ = urllib.urlretrieve(url)
    if not filename.endswith('.jpg'):
        old_fn = filename
        filename = '{}.jpg'.format(filename)
        os.rename(old_fn, filename)
    if platform.system() == 'Darwin':
        # macOS: open the image with the default viewer.
        subprocess.call(['open', filename])
    elif platform.system() == 'Windows':
        # Windows: running the file launches its associated program.
        subprocess.call(filename, shell=True)
    else:
        # Elsewhere: print the path and let the user open it.
        print('打开该文件获取验证码: {}'.format(filename))
    return raw_input('输入验证码: '.encode(raw_imput_encoding))


def parse_bought_list(start_date=None, end_date=None):
    """Scrape the bought-items pages and return a list of order dicts.

    Orders dated after ``end_date`` are skipped.  Scanning stops at the
    first order dated before ``start_date``, which assumes the list is
    shown newest first.  Each order dict holds ``pos``, ``orderid``,
    ``status``, ``shop_name``, ``shop_url``, ``date``, ``baobei`` (a list
    of item dicts) and, when found on the page, ``amount``.
    """
    url = 'http://buyer.trade.taobao.com/trade/itemlist/list_bought_items.htm'
    # Row classes for freight insurance, value-added services and staged
    # payment (deposit / balance) rather than real goods.
    extra_service = ['freight-info', 'service-info', 'stage-item']

    stdout_cr('working... {:.0%}'.format(0))

    # 1. Fetch and parse the first page.
    res = urllib2.urlopen(url)
    soup = BeautifulSoup(res.read().decode('gbk'))

    # 2. Read the page count from the "jump to page" widget.
    page_jump = soup.find('span', id='j_jumpto')
    jump_url = page_jump.attrs['data-url']
    url_parts = urlparse.urlparse(jump_url)
    query_data = dict(urlparse.parse_qsl(url_parts.query))
    total_pages = int(query_data['tpage'])

    # 3. Walk the pages.
    orders = []
    cur_page = 1
    out_date = False
    errors = []

    while True:
        bought_items = soup.find_all('tbody', attrs={'data-orderid': True})
        for count, item in enumerate(bought_items, 1):
            try:
                info = {}
                # Position on the site: "<page>.<index on page>".
                info['pos'] = '{}.{}'.format(cur_page, count)
                info['orderid'] = item.attrs['data-orderid']
                info['status'] = item.attrs['data-status']

                # Shop.
                node = item.select('tr.order-hd a.shopname')
                if not node:
                    # No shop: possibly a gifted-lottery order; ignore it.
                    continue
                info['shop_name'] = node[0].attrs['title'].strip()
                info['shop_url'] = node[0].attrs['href']

                # Date.
                node = item.select('tr.order-hd span.dealtime')[0]
                info['date'] = datetime.strptime(node.attrs['title'],
                                                 '%Y-%m-%d %H:%M')

                if end_date and info['date'].toordinal() > end_date.toordinal():
                    continue

                # NOTE(review): the recovered text is cut off from the
                # comparison operator on the next line up to the
                # "> 1: bb['snapshot'] = ..." fragment further down.  The
                # cut-off handling, the row loop, the extra-service branch
                # and the name selector are reconstructed from how
                # ``out_date``, ``extra_service``, ``name_node``, ``n`` and
                # ``is_goods`` are used in the surviving code.  Verify
                # against the page markup.
                if start_date and info['date'].toordinal() < start_date.toordinal():
                    out_date = True
                    break

                # Items.
                baobei = []
                for n in item.find_all('tr'):
                    row_classes = n.attrs.get('class', [])
                    if 'order-hd' in row_classes:
                        # Header row, already handled above.
                        continue
                    try:
                        bb = {}
                        if any(ex in row_classes for ex in extra_service):
                            # Service row: recorded but not counted as goods.
                            bb['name'] = n.text.strip()
                            bb['url'] = ''
                            bb['snapshot'] = ''
                            bb['spec'] = ''
                            bb['price'] = 0.0
                            bb['quantity'] = 0
                            bb['is_goods'] = False
                        else:
                            # NOTE(review): selector is a guess -- confirm.
                            name_node = n.select('p.baobei-name a')
                            bb['name'] = name_node[0].text.strip()
                            bb['url'] = name_node[0].attrs['href']
                            # Item snapshot link, when present.
                            bb['snapshot'] = ''
                            if len(name_node) > 1:
                                bb['snapshot'] = name_node[1].attrs['href']
                            # Item spec.
                            bb['spec'] = n.select('.spec')[0].text.strip()
                            # Item price.
                            bb['price'] = float(
                                n.find('td', class_='price').attrs['title'])
                            # Item quantity.
                            bb['quantity'] = int(
                                n.find('td', class_='quantity').attrs['title'])
                            bb['is_goods'] = True
                        baobei.append(bb)

                        # Try to read the amount actually paid; its cell may
                        # span the rows of several items.
                        amount_node = n.select('td.amount em.real-price')
                        if amount_node:
                            info['amount'] = float(amount_node[0].text)
                    except Exception as e:
                        errors.append({
                            'type': 'baobei',
                            'id': '{}.{}'.format(cur_page, count),
                            'node': '{}'.format(n),
                            'error': '{}'.format(e),
                        })
            except Exception as e:
                errors.append({
                    'type': 'order',
                    'id': '{}.{}'.format(cur_page, count),
                    'node': '{}'.format(item),
                    'error': '{}'.format(e),
                })
            else:
                # Store the order only when it parsed completely.  Appending
                # after a failure would reuse the previous order's ``baobei``
                # (or hit an undefined name) and store a partial ``info``.
                info['baobei'] = baobei
                orders.append(info)

        stdout_cr('working... {:.0%}'.format(cur_page / total_pages))

        # Next page.
        cur_page += 1
        if cur_page > total_pages or out_date:
            break
        query_data.update({'pagenum': cur_page})
        page_url = '{}?{}'.format(url, urllib.urlencode(query_data))
        res = urllib2.urlopen(page_url)
        soup = BeautifulSoup(res.read().decode('gbk'))

    stdout_cr()
    if errors:
        print('info. 有错误发生,统计结果可能不准确。')
    return orders


def output(orders, start_date, end_date):
    """Print summary statistics for ``orders``.

    Orders whose status is in ``invalid_order_states`` are counted
    separately and excluded from every total.
    """
    amount = 0.0
    org_amount = 0
    baobei_count = 0
    order_count = 0
    invaild_order_count = 0

    for order in orders:
        if order['status'] in invalid_order_states:
            invaild_order_count += 1
            continue
        # An order whose paid amount was not found contributes nothing
        # instead of raising KeyError.
        amount += order.get('amount', 0.0)
        order_count += 1
        for baobei in order.get('baobei', []):
            if not baobei['is_goods']:
                continue
            org_amount += baobei['price'] * baobei['quantity']
            baobei_count += baobei['quantity']

    print('{:<9} {}'.format('累计消费:', amount))
    print('{:<9} {}/{}'.format('订单/宝贝:', order_count, baobei_count))
    if invaild_order_count:
        print('{:<9} {} (退货或取消等, 不在上述订单之内)'.format(
            '无效订单:', invaild_order_count))
    print('{:<7} {}'.format('宝贝原始总价:', org_amount))
    print('{:<7} {:.2f}'.format(
        '宝贝平均单价:',
        0 if baobei_count == 0 else org_amount / baobei_count))
    print('{:<9} {} ({:.2%})'.format(
        '节约了(?):',
        org_amount - amount,
        0 if org_amount == 0 else (org_amount - amount) / org_amount))

    to_date = end_date if end_date else datetime.now()
    if start_date:
        from_date = start_date
    elif orders:
        # The last order in the list supplies the start of the range.
        from_date = orders[-1]['date']
    else:
        # No orders at all: avoid IndexError on ``orders[-1]``.
        from_date = to_date
    print('{:<9} {:%Y-%m-%d} - {:%Y-%m-%d}'.format(
        '统计区间:', from_date, to_date))
    # NOTE(review): the recovered text shows one more statement here,
    # "if not start_date: print('{:...", whose content was lost.  It is
    # omitted rather than invented.


def ouput_orders(orders):
    """Print one line per order followed by one line per item.

    NOTE(review): the original body of this function is missing from the
    recovered text; only the call ``ouput_orders(orders)`` survives.  This
    is a minimal stand-in built from the fields ``parse_bought_list`` stores.
    """
    for order in orders:
        print('{} {:%Y-%m-%d %H:%M} {} {} {}'.format(
            order['pos'], order['date'], order['orderid'],
            order['shop_name'], order.get('amount', '')))
        for baobei in order.get('baobei', []):
            print('    {} x{} {}'.format(
                baobei['price'], baobei['quantity'], baobei['name']))


def main():
    """Parse the command line, log in, scrape the orders and print the report.

    NOTE(review): the argument-parsing half of this function is missing from
    the recovered text.  It is rebuilt from the usage shown alongside the
    script (``-u``, ``-p``, ``-s``, ``-e``, ``--verbose``; all optional, the
    user name is prompted for when absent).  The prompt texts are new.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', dest='username', default=None)
    parser.add_argument('-p', dest='password', default=None)
    parser.add_argument('-s', dest='start_date', default=None)
    parser.add_argument('-e', dest='end_date', default=None)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    usr = args.username
    if not usr:
        usr = raw_input('输入用户名: '.encode(raw_imput_encoding))
    if isinstance(usr, bytes):
        # ``login`` calls ``usr.encode(...)``, so work with text from here on.
        usr = usr.decode(raw_imput_encoding)
    pwd = args.password
    if not pwd:
        pwd = getpass('输入密码: '.encode(raw_imput_encoding))
    verbose = args.verbose

    start_date = None
    if args.start_date:
        start_date = datetime.strptime(args.start_date, '%Y-%m-%d')
    end_date = None
    if args.end_date:
        end_date = datetime.strptime(args.end_date, '%Y-%m-%d')
    if start_date and end_date and start_date > end_date:
        sys.exit('error, 结束日期必须晚于或等于开始日期')

    # Reuse cookies from an earlier run when a cookie file exists.
    cj_file = './{}.tmp'.format(usr)
    cj = cookielib.LWPCookieJar()
    try:
        cj.load(cj_file)
    except IOError:
        # Missing or unreadable cookie file: start with an empty jar.
        # (``cookielib.LoadError`` is a subclass of ``IOError``.)
        pass

    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    login(usr, pwd)

    try:
        cj.save(cj_file)
    except IOError:
        # Saving cookies is best-effort; the report does not depend on it.
        pass

    orders = parse_bought_list(start_date, end_date)
    output(orders, start_date, end_date)

    # Print the per-order details.
    if verbose:
        ouput_orders(orders)


if __name__ == '__main__':
    main()
