char_table = {ord(key): ord(value) for key, value in char_table.items()}
def deCode(url):
for key, value in str_table.items():
url = url.replace(key, value)
d=url.translate(trantab)
return d
def getMoreURL(word):
word = urllib.quote(word)
url = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}" \
r"&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2nc=1&pn={pn}&rn=30"
urls = (url.format(word=word, pn=x) for x in itertools.count(start=0, step=30))
#itertools.count 0开始,步长30,迭代
return urls
def getHtml(url):
page=urllib.urlopen(url)
html=page.read()
return html
for image in imageList:
imgUrls.append(deCode(image))
l=len(imgUrls)
print l
return imgUrls
def downLoad(urls,path):
global index
for url in urls:
print("Downloading:", url)
res = urllib2.Request(url)
# res = urllib.request(url)
try:
response = urllib2.urlopen(res ,data=None, timeout=5) #超时处理
except urllib2.URLError, e:
if hasattr(e,'code'):
error_status = e.code
print(error_status, "未下载成功:", url)
continue
elif hasattr(e,'reason'):
print( "time out", url)
continue
continue
filename = os.path.join(path, str(index) + ".jpg")
# urllib.urlretrieve(url,filename)
auto_down(url,filename)
index += 1
if index-1==10000:
break
def auto_down(url,filename):
try:
urllib.urlretrieve(url,filename)
except urllib.ContentTooShortError:
print 'Network conditions is not good.Reloading.'
auto_down(url,filename)
if __name__ == '__main__':
keyWord="XXX"
index = 1
Savepath = "./img_warship/"
urls=getMoreURL(keyWord)
for url in urls:
downLoad(getImg(getHtml(url)),Savepath)
if index-1==10000:
break
...全文
302512打赏收藏
请问一个python爬取百度图片卡死的问题
我在按关键字爬取百度图片的时候总是在爬取到特定图片的时候卡死或出现socket.error: [Errno 104] Connection reset by peer的问题,不同的关键字就是不同的特定图片处卡死,但是图片本身按道理来说应该是没什么问题的。不知道问题出在哪,请大神指教一下,谢谢! #coding: utf-8 import os import re import urllib import urllib2 import itertools from string import ma