3,423
社区成员
发帖
与我相关
我的任务
分享
# -*- coding:utf-8 -*-
import urllib
import urllib.request
import urllib.error
from lxml import etree
# Connect to the target site, retrying several times on failure
def geturl(url):
    """Download ``url`` and return the raw response body.

    The request is sent with a desktop-browser ``User-Agent`` header and a
    5-second timeout per attempt. At most three attempts are made; the first
    successful one ends the loop.

    Args:
        url: Address to fetch.

    Returns:
        The response body as ``bytes``, or ``None`` when every attempt
        failed (a failure message is printed in that case).

    Raises:
        ValueError: If ``url`` is malformed and no request can be built
            from it; retrying cannot help, so this is not swallowed.
    """
    # Imported locally so the function does not depend on the file-level
    # import list; needed only to name the HTTP protocol error class below.
    import http.client

    user_agent = 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3373.400 QQBrowser/9.6.11866.400'
    headers = {'User-Agent': user_agent}
    maxtry = 3

    # The request is identical for every attempt, so build it once.
    request = urllib.request.Request(url, headers=headers)

    for _ in range(maxtry):
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request, timeout=5) as response:
                return response.read()
        except (OSError, http.client.HTTPException):
            # URLError/HTTPError, timeouts and connection resets are all
            # OSError subclasses; HTTPException covers truncated responses.
            continue

    print("Has tried %d times to access url %s,all failed!" % (maxtry, url))
    return None
# Request the forum listing with pn=0 and pn=1 and, for each response, print
# the href of every <a class="j_th_tit "> found directly under a <div>
# (presumably the thread-title links -- confirm against the page markup).
for j in range(2):
    # NOTE(review): pn receives 0 and 1 here; if the site treats pn as a
    # thread offset rather than a page index, both requests return nearly
    # the same listing -- confirm the intended paging step.
    url = 'https://tieba.baidu.com/f?kw=%E7%BE%8E%E5%A5%B3&ie=utf-8&pn=' + str(j)
    html = geturl(url)
    if not html:
        # Nothing was downloaded for this page, so there is no document to
        # parse or query; move on to the next page instead of failing.
        continue
    selector = etree.HTML(html)
    links = selector.xpath('//div/a[@class ="j_th_tit "]/@href')
    print(links)