37,720
社区成员
发帖
与我相关
我的任务
分享
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import re
import os
# Goal: download every image from the "qiutu" (funny pictures) board of
# qiushibaike.com into a local folder.

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# Folder that receives all downloaded images.
IMAGE_DIR = './qiutuLibs'
# Generic page URL; %d is replaced by the page number.
PAGE_URL_TEMPLATE = 'https://www.qiushibaike.com/pic/page/%d/?s=5184961'
# Captures the src attribute of the <img> inside each <div class="thumb">.
# Compiled once here instead of once per page; re.S lets '.' span newlines.
IMG_SRC_PATTERN = re.compile('<div class="thumb">.*?<img src="(.*?)" alt.*?</div>', re.S)
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def build_page_url(page_num):
    """Return the listing URL for page number *page_num*."""
    # The % operator already yields the final string; no format() wrapper needed.
    return PAGE_URL_TEMPLATE % page_num


def extract_image_urls(page_text):
    """Return the absolute URLs of all thumbnail images found in *page_text*.

    Protocol-relative sources ("//host/path") get an "https:" prefix; sources
    that do not start with "//" are returned unchanged.
    """
    return [
        'https:' + src if src.startswith('//') else src
        for src in IMG_SRC_PATTERN.findall(page_text)
    ]


def image_name_from_url(src):
    """Return the file name for *src*: its last path segment, minus any query string."""
    return src.split('/')[-1].split('?', 1)[0]


def download_image(src):
    """Download *src* into IMAGE_DIR and return the file name that was written.

    Raises requests.RequestException on network/HTTP errors and OSError when
    the file cannot be written.
    """
    response = requests.get(url=src, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail on 4xx/5xx instead of saving an error page as if it were an image.
    response.raise_for_status()
    img_name = image_name_from_url(src)
    with open(os.path.join(IMAGE_DIR, img_name), 'wb') as fp:
        fp.write(response.content)
    return img_name


if __name__ == "__main__":
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(IMAGE_DIR, exist_ok=True)
    for pageNum in range(1, 3):
        new_url = build_page_url(pageNum)
        try:
            page_response = requests.get(url=new_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            page_response.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining pages.
            print(new_url, '下载失败:', exc)
            continue
        for src in extract_image_urls(page_response.text):
            try:
                img_name = download_image(src)
            except (requests.RequestException, OSError) as exc:
                # Skip the broken image and keep going with the others.
                print(src, '下载失败:', exc)
                continue
            print(img_name, '下载成功!!!')
# Original form from the question: the % operator already produces the final
# string, so the surrounding built-in format() call (no format spec) just
# returns it unchanged.
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5184961'
for pageNum in range(1,3):
    new_url = format(url%pageNum)

# Alternative 1: use a {} placeholder and fill it with str.format().
url = 'https://www.qiushibaike.com/pic/page/{}/?s=5184961'
for pageNum in range(1,3):
    new_url = url.format(pageNum)

# Alternative 2: keep the %d placeholder and use the % operator on its own.
url = 'https://www.qiushibaike.com/pic/page/%d/?s=5184961'
for pageNum in range(1,3):
    new_url = url % pageNum
"{}".format(xxx)
"%s" % xxx
# Purpose: demonstrate the ways of formatting a value in Python, written for:
# Newbie question: how to use Python format - CSDN forum
# https://bbs.csdn.net/topics/397523895
# Author: Crifan Li
# Update: 20200904

################################################################################
# Sample values shared by all three demos
################################################################################

curYearInt = 2020
lastYearInt = 2019
piFloat = 3.1415
genderStr = "Male"
nameTuple = ("Crifan", "Li")

_BANNER_EDGE = "=" * 20


def _printBanner(title):
    """Print a blank line, then *title* framed by two runs of 20 '=' characters."""
    print("\n%s %s %s" % (_BANNER_EDGE, title, _BANNER_EDGE))


################################################################################
# Method 1: the % operator
################################################################################

_printBanner('Method 1: "%s" % someValue')

# --- single value ---

# %s works for any value, integers included
print("Current year is: %s" % curYearInt)  # Current year is: 2020
# %d is the integer-specific conversion
print("Current year is: %d" % curYearInt)  # Current year is: 2020

print("PI=%s" % piFloat)  # PI=3.1415
print("PI=%.2f" % piFloat)  # PI=3.14

print("Your gender: %s" % genderStr)  # Your gender: Male

# A bare tuple on the right-hand side is unpacked into separate arguments, so
# passing nameTuple directly to a single %s raises
# "TypeError: not all arguments converted during string formatting".
# Wrapping it in a one-element tuple formats the tuple itself:
print("Your full name is: %s" % (nameTuple, ))  # Your full name is: ('Crifan', 'Li')

# --- several values: pass them as one tuple ---
yearAndName = (curYearInt, nameTuple)
print("current year=%d and your full name=%s" % yearAndName)  # current year=2020 and your full name=('Crifan', 'Li')

################################################################################
# Method 2: str.format()
################################################################################

_printBanner("Method 2: someValue.format({})")

# --- single value ---

# placeholder without an index
formattedYear_noIndex = "Current year is: {}".format(curYearInt)
print(formattedYear_noIndex)  # Current year is: 2020

# placeholder with an explicit positional index
formattedYear_withIndex = "Current year is: {0}".format(curYearInt)
print(formattedYear_withIndex)  # Current year is: 2020

print("PI={}".format(piFloat))  # PI=3.1415
# a format spec goes after the colon
print("PI={:.2f}".format(piFloat))  # PI=3.14

# --- several values: address them by index inside {} ---
formattedTwoYear_withIndex = "Current year is: {0}, last year is: {1}".format(curYearInt, lastYearInt)
print(formattedTwoYear_withIndex)  # Current year is: 2020, last year is: 2019

# named placeholders: keyword arguments may be given in any order
namedYearsTemplate = "last year is: {lastYear}, Current year is: {curYear}"
print(namedYearsTemplate.format(curYear=curYearInt, lastYear=lastYearInt))  # last year is: 2019, Current year is: 2020

################################################################################
# Method 3: the built-in format()
################################################################################

_printBanner("Method 3: format(someValue)")

print(format(curYearInt))  # 2020
print(format(piFloat))  # 3.1415
print(format(piFloat, ".2f"))  # 3.14
print(format(genderStr))  # Male
print(format(nameTuple))  # ('Crifan', 'Li')

# 80-dash separator before the reference list
print("\n" + "-" * 80)

referUrl = """
7. 输入输出 — Python 3.8.5 文档
https://docs.python.org/zh-cn/3/tutorial/inputoutput.html
内置类型 — Python 3.8.5 文档
https://docs.python.org/zh-cn/3/library/stdtypes.html#str.format
6.1. string — Common string operations — Python 3.4.10 documentation
https://docs.python.org/3.4/library/string.html
2. Built-in Functions — Python 3.4.10 documentation
https://docs.python.org/3.4/library/functions.html#format
7.1. string — Common string operations — Python 2.7.18 documentation
https://docs.python.org/2/library/string.html#formatspec
"""
print("\n参考资料:%s" % referUrl)
# URL template with a {} placeholder that str.format() fills in.
# NOTE(review): pageNum is not defined in this snippet - presumably it is the
# loop variable of the page loop shown earlier; confirm before running it alone.
url = 'https://www.qiushibaike.com/pic/page/{}/?s=5184961'
new_url = url.format(pageNum)
个人看法是尽量只用一种写法, 建议用 {} 的方式。