# Stray page text captured along with the scraped source (a view counter and
# navigation links), commented out so the file parses as Python:
# 37,719 / 社区成员 / 发帖 / 与我相关 / 我的任务 / 分享
#coding=utf-8
from gevent import monkey,pool
monkey.patch_socket()
import os
import gevent
from lxml import etree
import requests
import re
import pandas as pd
class Gaokao():
    """Builds the list of page URLs for the gaokao score-query API at
    data-gkcx.eol.cn; the actual downloading is done elsewhere."""

    def __init__(self):
        # Browser-like User-Agent; kept here so callers can reuse it.
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
        # Accumulated page URLs; filled in by urls().
        self.lists = []

    def urls(self, first_page=1, last_page=2000):
        """Append one query URL per page in [first_page, last_page] to
        self.lists and return self.lists.

        The defaults reproduce the original hard-coded range(1, 2001)
        (a test subset; the full site reportedly has ~23,416 pages).
        Note the query leaves fsyear= empty, i.e. all years.
        """
        base = ('https://data-gkcx.eol.cn/soudaxue/queryProvinceScore.html'
                '?messtype=jsonp&provinceforschool=&schooltype=&page={}'
                '&size=10&keyWord=&schoolproperty=&schoolflag=&province='
                '&fstype=&zhaoshengpici=&fsyear=')
        for page in range(first_page, last_page + 1):
            self.lists.append(base.format(page))
        return self.lists
def get_html(url):
    """Download one result page, extract every score record with the global
    compiled regex ``re_``, and append the rows to the CSV at ``outputfile``.

    Relies on module globals set up in ``__main__``:
      k          -- running row id shared by all greenlets
      header     -- HTTP request headers (browser-like User-Agent)
      re_        -- compiled pattern, one tuple of fields per record
      outputfile -- CSV path, opened in append mode

    On any fetch/parse error the URL itself is stored under the current
    row id so failed pages can be identified and retried later.
    """
    global k
    gevent.sleep(3)  # throttle between requests; be polite to the server
    print('downloading page:>>> {}'.format(url))
    rows = {}
    try:
        # BUG FIX: the original called requests.get(url, header), which
        # passes the header dict as the *params* positional argument; it
        # must go through the headers= keyword. Also add a timeout so a
        # stuck socket cannot hang the greenlet forever.
        res = requests.get(url, headers=header, timeout=30)
        res.encoding = 'utf-8'
        html = res.text
        for record in re_.findall(html):
            # gb2312 with 'ignore' drops characters the codec cannot
            # represent -- matches the original output encoding.
            rows[k] = [field.encode('gb2312', 'ignore') for field in record]
            k += 1
    except Exception:
        # Record the failed URL so it can be re-crawled later.
        # BUG FIX: advance k here too, otherwise the next successful page
        # reuses the same row id and the CSV gets duplicate indices.
        rows[k] = url
        k += 1
    finally:
        df = pd.DataFrame.from_dict(rows, orient='index')
        df.to_csv(outputfile, mode='a', header=0, index=0)
if __name__ == '__main__':
    # Module-level state read by get_html() inside the greenlet pool.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
    re_ = re.compile(r'"schoolname": "(.*?)",.*?"localprovince": "(.*?)",.*?"province": "(.*?)",.*?"studenttype": "(.*?)",.*?"year": "(.*?)",.*?"batch": "(.*?)",.*?"var": "(.*?)",.*?"var_score": "(.*?)",.*?"max": (.*?),.*?"min": (.*?),.*?"num": "(.*?)",.*?"fencha": "(.*?)",.*?"provincescore": "(.*?)"', re.S)
    outputfile = 'd:\\gaokao_every_10.csv'
    k = 0  # running row id, incremented by the workers

    # Build the URL list, then fan the downloads out over a bounded pool
    # (at most 20 concurrent greenlets) and wait for all of them.
    gk = Gaokao()
    gk.urls()
    workers = pool.Pool(20)
    tasks = [workers.spawn(get_html, page_url) for page_url in gk.lists]
    gevent.joinall(tasks)