34
社区成员




import urllib.request
import xlwt
import re
from bs4 import BeautifulSoup
def jiexi(url):
    """Fetch *url* and return the page body decoded as UTF-8.

    Sends a desktop-browser User-Agent so the site does not reject the
    request. On any HTTP/URL error the code/reason is printed and an
    empty string is returned (callers treat "" as "no page").
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # `with` closes the connection even if read/decode raises.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    # Original caught `urllib.error.URError` — a typo that raised
    # NameError on every fetch failure. HTTPError is a URLError
    # subclass, so this handles both.
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getdata(baseural):
    """Scrape ranking pages 1-50 and extract one row per novel.

    baseural -- ranking URL complete except for the trailing page number.

    Returns a tuple ``(datalist, titles)``:
      datalist -- list of [title, author, category, link, image] rows
                  (matches the 5 columns savedata writes),
      titles   -- flat list of just the title strings (used for counting).

    Relies on the module-level compiled patterns
    ``name``/``aut``/``cate``/``link``/``photo``.
    """
    datalist = []
    titles = []
    for page in range(1, 51):
        html = jiexi(baseural + str(page))
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="rank_d_list borderB_c_dsh clearfix"):
            item = str(item)
            title = re.findall(name, item)[0]
            zuoz = re.findall(aut, item)[0]
            leix = re.findall(cate, item)[0]
            lianjie = re.findall(link, item)[0]
            img = re.findall(photo, item)[0]
            # Bug fix: `img` was extracted but never appended, leaving
            # 4-element rows while savedata writes 5 columns.
            datalist.append([title, zuoz, leix, lianjie, img])
            titles.append(title)
    return (datalist, titles)
def savedata(datalist, filename):
    """Write the scraped rows to an .xls workbook at *filename*.

    datalist -- list of rows, each a sequence of up to 5 cell values
                ordered as (title, author, category, link, image).
    Row 0 holds the column headers; data rows start at row 1.
    """
    # Original had a stray line-continuation `\` here that glued this
    # statement to the next one — a syntax error.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('小说', cell_overwrite_ok=True)
    col = ("小说名", "作者", "类型", "小说链接", "图片链接")
    for i, header in enumerate(col):
        sheet.write(0, i, header)
    # Iterate the actual rows instead of a hard-coded range(0, 1000),
    # which indexed past the end of datalist and raised IndexError.
    for i, data in enumerate(datalist):
        for j, value in enumerate(data):
            sheet.write(i + 1, j, value)
    book.save(filename)
def crawler(books):
    """Crawl the ranking, save it to an .xls file, then print how many
    times each requested title appears among the scraped titles.

    books -- iterable of book-name strings to count.
    Returns None (the count dict is printed, as before).
    """
    baseural = "http://www.zongheng.com/rank/details.html?rt=1&d=1&p="
    filename = "小说排行榜.xls"
    # getdata returns (rows, titles). The original indexed [1] and [2]:
    # [2] raised IndexError, and [1] fed the titles list — not the row
    # data — to savedata. It also called getdata twice (100 fetches).
    datalist, titles = getdata(baseural)
    savedata(datalist, filename)
    # Count each requested title independently. The original kept one
    # running counter `d` across all names, so later names inherited
    # earlier names' counts. Loop variable renamed so it no longer
    # shadows the module-level `name` regex.
    dic = {book: titles.count(book) for book in books}
    return print(dic)
# Pre-compiled patterns for pulling fields out of one ranking <div>.
# Each captures a single group via a non-greedy (.*?).
name = re.compile(r'<div class="rank_d_b_name" title="(.*?)">')   # novel title
aut = re.compile(r'<div class="rank_d_b_cate" title="(.*?)">')    # author
cate = re.compile(r'<a target="_blank">(.*?)<')                   # category
link = re.compile(r'href="(.*?)" target="_blank">')               # detail-page URL
photo = re.compile(r'src="(.*?)"/>' , re.S)                       # cover image (DOTALL: src may span lines)
if __name__ == "__main__":
    # input() returns one string; iterating it yields characters, so
    # crawler would have compared single characters against full titles.
    # Split on whitespace to get a list of book names instead.
    books = input("books:").split()
    crawler(books)