python爬虫问题求教
馄饨领主 2018-04-22 09:40:45 作为初学者,本来想按照教程写一个工作需要的爬虫,想要爬取[穷游结伴论坛](http://bbs.qyer.com/forum-2-1.html "")每个帖子的大致信息,但是最后运行时,却发现没反应,也没报错,真是一头雾水,所以想请教一下大家。
import requests
from bs4 import BeautifulSoup
import traceback
import re
def getHTMLText(url):
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""
def getQyerList(lst, qyerURL):
html = getHTMLText(qyerURL)
soup = BeautifulSoup(html, 'html.parser')
for a in soup.find_all('a'):
try:
lst.append(a['href'])
except:
continue
def getQyerInfo(lst, qyerURL, fpath):
count = 0
for qyer in lst:
html = getHTMLText(qyer)
try:
if html == "":
continue
infoDict = {}
soup = BeautifulSoup(html, 'html.parser')
title_tag = soup.find('div', attrs={'class': 'signs'})
titleInfo = title_tag.find('a')
des_tag = soup.find('div', attrs={'class': 'xdest'})
desInfo = des_tag.find('a')
timeInfo = soup.find('em', attrs={'class': 'xmr13'}) + '-' + soup.find('em', attrs={'class': 'etd'})
contactInfo = soup.find('div', attrs={'class': 'xpan-contact'})
contentInfo = soup.find('li', attrs={'class': 'xlast xmt20'})
# keyList = ['标题', '目的地', '出行日期', '联系方式', '结伴详情']
# valueList = []
infoDict.update({'标题': titleInfo, '目的地': desInfo, '出行日期': timeInfo, '联系方式': contactInfo, '结伴详情': contentInfo})
with open(fpath, 'a', encoding='utf-8') as f:
f.write(str(infoDict) + '\n')
count = count + 1
print('\r当前速度:{:2f}%'.format(count*100/len(lst)), end='')
except:
count = count + 1
print('\r当前速度:{:2f}%'.format(count * 100 / len(lst)), end='')
continue
def main():
qyer_list_url = 'http://bbs.qyer.com/forum-2-1.html'
qyer_info_url = 'http://bbs.qyer.com/forum-2-1.html'
output_file = 'E://qunar.text'
slist = []
getQyerList(slist, qyer_info_url)
getQyerInfo(slist, qyer_info_url, output_file)
main()