# Python crawler: a very basic scraper that downloads photo-gallery images.
# -- Setup: imports, target directory, and a global network timeout. --------
import os

# Directory where downloaded images should be stored -- fill in before
# running.  (The original called os.chdir('') unconditionally, which raises
# OSError before the script can do anything.)
PIC_DIR = ''
if PIC_DIR:
    os.chdir(PIC_DIR)

import urllib
import re
import urllib2
import urlparse
import time
import itertools
import zipfile
import socket

# Abort any hung HTTP request after 10 seconds instead of blocking forever.
socket.setdefaulttimeout(10)
def uo(url):
    """Open *url* over HTTP and return the raw response body."""
    response = urllib2.urlopen(url)
    return response.read()
def ud(url,index):
picdir = os.path.join(os.getcwd(),'%d.jpg'%index)
urllib.urlretrieve(url,picdir)
if os.path.getsize(picdir) < 100*1024:
os.remove(picdir)
else:
print 'Downloaded:%d.jpg'%index
index += 1
return index
def fre(html, reg):
    """Return all non-overlapping matches of pattern *reg* in *html*.

    Bug fix: the original called ``re.find_all``, which does not exist and
    raises AttributeError; the correct function is ``re.findall``.
    """
    return re.findall(reg, html)
def rec(url1, url2):
    """Resolve *url2* against base URL *url1* (thin wrapper over urljoin)."""
    joined = urlparse.urljoin(url1, url2)
    return joined
def cgn(url, num):
    """Build the URL of page *num* by joining ``<num>.html`` onto *url*."""
    page = '%d.html' % num
    return urlparse.urljoin(url, page)
def zipp(dirnum,ydir):
os.chdir('')
#这里是Zip压缩包要在哪创建
print "Create zip file in "+os.getcwd()
z = zipfile.ZipFile('%d.zip'%dirnum,mode='w')
dirfiles = os.listdir(ydir)
for fp in dirfiles:
z.write(os.path.join(ydir,fp),os.sep+fp)
z.close()
print 'Zip OK!'
for fp in dirfiles:
os.remove(os.path.join(ydir,fp))
print 'Remove OK!'
os.chdir(ydir)
return 1
index = 1
dirnum = 0
imgstr = r"<img.*?src=\'([^\']*?\/[0-9]*\.jpg)\'.*?>"
imgre = re.compile(imgstr)
for xx in range(1,702):
try:
url = cgn('http://m.zngirls.com/gallery/',xx)
html = urllib2.urlopen(url).read()
secstr = r"<a.*?href='(.*?)'.*?</a>"
secre = re.compile(secstr)
picsetsurl = secre.findall(html)
for picseturl in picsetsurl:
try:
# print 'Page:%d Getset:%d'%(xx,dirnum)
# print 'Now start to get %d set'%dirnum
#这里可以选择要不要压缩包格式不要就注掉
if index >= 1000:
dirnum += 1
index = zipp(dirnum,os.getcwd())
# index = 1
# os.chdir('/sdcard/apic/')
# if not os.path.exists('Girl'+str(dirnum)):
# os.mkdir('Girl'+str(dirnum))
# os.chdir(os.path.join(os.getcwd(),'Girl'+str(dirnum)))
print 'Save to:'+os.getcwd()
emptynum = 0
for num in itertools.count(1):
try:
curl = cgn(picseturl,num)
print curl
picsethtml = uo(curl)
imglist = imgre.findall(picsethtml)
empty = index
for img in imglist:
try:
index = ud(img,index)
except:
for iii in range(1,2):
print 'Download Failed,Next img'
time.sleep(5)
if index == empty:
emptynum += 1
else:
emptynum = 0
if emptynum >= 5:
break
except:
print 'Download Failed,Next setpage'
time.sleep(5)
except:
print 'Download Failed,Next set'
time.sleep(5)
except:
print 'Download Failed,Next page'
time.sleep(5)
# This script is purely for entertainment/idle browsing. Not for commercial use.