python+requests+xpath爬取贴吧评论总是报错NoneType
本人新人,刚学爬虫。看了个例子仿照着写了个爬取贴吧回复的,但是运行总是报错无类型......我估计是我的xpath定位问题,但我不知道该怎么改,或者是其他地方错了。哪位大神帮忙指导下!!谢谢!!以下是代码
import requests
from lxml import etree
import json
import time
from requests.exceptions import RequestException
from pymongo import MongoClient
client=MongoClient()
db=client.tieba #连接tieba数据库,没有则自动创建
my_set=db.comment #使用comment集合,没有则自动创建
def get_html(url):
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
return None
except RequestException:
return None
def parse_html(html):
selector=etree.HTML(html)
data=selector.xpath('//div[@class="l_post l_post_bright j_l_post clearfix "]')
for each in data:
rs=each.xpath('@data-field')[0]
rs=json.loads(rs)
author=rs.get('author').get('user_name')
author_id=rs.get('content').get('post_id')
content=each.xpath('div/div/cc/div[@id="post_content_%s"]/text()'% author_id)[0].strip()
date=rs.get('content').get('date')
yield {
'author':author,
'content':content,
'date':date
}
def save_to_txt(result):
print('正在存储:',result)
with open('tieba.txt','a',encoding='utf-8') as f:
f.write('回帖作者:'+result['author']+'\n')
f.write('回帖内容:' + result['content'] + '\n')
f.write('回帖时间:' + result['date'] + '\n')
f.write('\n')
def main(url):
html=get_html(url)
if html:
for result in parse_html(html):
save_to_txt(result)
if __name__=='__main__':
for i in range(1,21):
url = 'https://tieba.baidu.com/p/5501399200?fid=59099&pn='+str(i)
print('正在爬取第%s页'% str(i))
main(url)
time.sleep(10)
爬取的网址是https://tieba.baidu.com/p/5501399200?fid=59099&pn=1