# --- forum-page paste residue (kept as comments so the file is runnable) ---
# 37,720
# 社区成员
# 发帖
# 与我相关
# 我的任务
# 分享
import re,requests,time
from urllib import request
from urllib import response
# url="http://31f.cn/"
'''
1.抓取网页
2.正则提取信息,保存进字典
3.对服务器地址进行校验
4.写入文本
'''
url="http://www.xicidaili.com"
def get_proxy(url):
    """Fetch a xicidaili-style proxy-list page and parse it into dicts.

    :param url: page to scrape (expects xicidaili HTML table markup)
    :return: list of dicts with keys "地区" (region), "IP",
             "端口" (port) and "协议" (protocol); empty list if
             nothing matched.
    :raises requests.RequestException: on network failure/timeout.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"}
    # BUG FIX: original had no timeout, so a stalled server hung forever.
    content = requests.get(url, headers=headers, timeout=10).text
    iplist = re.findall(
        r'<tr class=".*?">.*?<td>([0-9.]*?)</td>\s*?<td>([0-9]*?)</td>\s*?<td>(\w*?)</td>\s*?<td class="country">.*?</td>\s*?<td>(.*?)</td>.*?</tr>',
        content,
        flags=re.S,
    )
    # groups come back as (ip, port, region, protocol)
    return [
        {"地区": region, "IP": ip, "端口": port, "协议": proto}
        for ip, port, region, proto in iplist
    ]
# (removed) get_proxy(url) — this was a redundant network fetch whose
# return value was discarded; the result is fetched again at the bottom
# of the script where it is actually used.
class Serveragent_check():
    """Validate a list of proxy servers and keep the ones that work.

    ``iplist`` entries are dicts with keys "地区", "IP", "端口", "协议"
    (the shape produced by ``get_proxy``). Working proxies are collected
    in ``checkedlist`` together with their measured response time.
    """

    def __init__(self, iplist):
        self.iplist = iplist
        self.timeout = 10                      # seconds allowed per proxy attempt
        self.testurl = "http://www.baidu.com"  # page fetched through each proxy
        self.testinfo = "柳絮"                  # marker string expected in the page
        self.checkedlist = []                  # proxies that passed, with "time" added

    def check(self):
        """Try each proxy against ``testurl``; record working ones with elapsed time."""
        # Invariant header, hoisted out of the loop.
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36"}
        for info in self.iplist:
            proxy = {"http": "http://%s:%s" % (info["IP"], info["端口"])}
            t1 = time.time()
            try:
                f = requests.get(self.testurl, headers=headers,
                                 proxies=proxy, timeout=self.timeout)
                result = f.text
            except Exception as e:
                # Proxy unreachable / timed out — report and move on.
                print(e)
                continue
            timeused = time.time() - t1
            # BUG FIX: original used `pos > 1`, which silently dropped matches
            # at index 0 or 1; any hit counts.  Also removed the time.sleep(5)
            # that sat inside the timing window and inflated every measurement.
            if result.find(self.testinfo) != -1:
                # BUG FIX: original indexed the dict with integers (i[2], i[0]...)
                # which raised KeyError — entries are string-keyed dicts.
                self.checkedlist.append({
                    "地区": info["地区"],
                    "IP": info["IP"],
                    "端口": info["端口"],
                    "协议": info["协议"],
                    "time": timeused,
                })

    def sorting(self):
        """Sort checked proxies in place by response time (fastest first) and print."""
        # BUG FIX: original called sorted(..., key=lambda x: x[4]) — the sorted
        # result was discarded, and the dicts have no integer key 4 (KeyError).
        self.checkedlist.sort(key=lambda x: x["time"])
        print(self.checkedlist)
if __name__ == "__main__":
    # Scrape the proxy list, validate every entry, then print the survivors
    # sorted by response time. Guarded so importing this module does no I/O.
    ipcheck = get_proxy(url)
    c = Serveragent_check(ipcheck)
    c.check()
    c.sorting()