
Python crawler (beta): scraping a single Zhihu page

Published 2024/12/6 22:30:46
I have previously written Python crawlers to help our operations staff scrape product brands and categories from JD.com. This time I am again using Python, for a simple single-page scraper; I may extend it later.
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

# ------ Zhihu answer collector ----------

# Fetch the <body> of a page
def get_content(url, data=None):
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.235'
    }
    req = requests.get(url, headers=header)
    req.encoding = 'utf-8'
    bs = BeautifulSoup(req.text, 'html.parser')  # build the BeautifulSoup object
    body = bs.body  # keep only the <body> part
    return body

# Get the question title
def get_title(html_text):
    data = html_text.find('span', {'class': 'zm-editable-content'})
    return data.string

# Get the question content
def get_question_content(html_text):
    data = html_text.find('div', {'class': 'zm-editable-content'})
    if data.string is None:
        out = ''
        for datastring in data.strings:
            out = out + datastring
        print('Content:\n' + out)
    else:
        print('Content:\n' + data.string)

# Get the upvote count
def get_answer_agree(body):
    agree = body.find('span', {'class': 'count'})
    print('Upvotes: ' + agree.string + '\n')

# Get the answers
def get_response(html_text):
    response = html_text.find_all('div', {'class': 'zh-summary summary clearfix'})
    for index in range(len(response)):
        # get the "expand" link of each answer
        answerhref = response[index].find('a', {'class': 'toggle-expand'})
        if not answerhref['href'].startswith('javascript'):
            url = 'http://www.zhihu.com/' + answerhref['href']
            print(url)
            body = get_content(url)
            get_answer_agree(body)
            answer = body.find('div', {'class': 'zm-editable-content clearfix'})
            if answer.string is None:
                out = ''
                for datastring in answer.strings:
                    out = out + '\n' + datastring
                print(out)
            else:
                print(answer.string)

html_text = get_content('https://www.zhihu.com/question/43879769')
title = get_title(html_text)
print('Title:\n' + title + '\n')
get_question_content(html_text)
print('\n')
get_response(html_text)
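As a side note, the manual loop over .strings above (needed whenever a node has mixed children and .string is None) can be replaced by BeautifulSoup's get_text(), which concatenates all descendant text in one call. A minimal sketch; the HTML snippet here is made up for illustration, not taken from Zhihu:

```python
from bs4 import BeautifulSoup

# Made-up snippet standing in for a question/answer node
html = '<div class="zm-editable-content">Hello <b>world</b>, line two</div>'
node = BeautifulSoup(html, 'html.parser').find('div', {'class': 'zm-editable-content'})

# get_text() joins every descendant string, so no is-None check is needed
text = node.get_text()
print(text)  # Hello world, line two
```

This also sidesteps the Python 2 .encode('utf-8') calls in the original, since Python 3 strings are already Unicode.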
Output: