Python爬取百度图片的问题

ZJ729286823 2016-08-01 05:01:14

import os

import urllib

import urllib2

import re

# Baidu image search results page that getHtml() fetches. The ``word`` query
# parameter holds a percent-encoded string (the request declares ie=utf-8).
url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1470037956784_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E4%B8%AD%E5%B1%B1%E9%99%B5%E9%9F%B3%E4%B9%90%E5%8F%B0"

# Alternative source page, currently disabled.
#url = u"https://lvyou.baidu.com/zhongshanlingyinyuetai/fengjing/"

# Destination used by download() when building each saved file's path.
outpath = "d:\\g"





def getHtml(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to request.

    Returns:
        Whatever ``read()`` on the opened response yields (the undecoded
        page content).

    Raises:
        urllib2.URLError: Propagated unchanged when the request fails.
    """
    request = urllib2.Request(url)
    webfile = urllib2.urlopen(request)
    try:
        return webfile.read()
    finally:
        # Always release the underlying connection, even if read() fails;
        # the response object is otherwise left open until garbage collected.
        webfile.close()





# File extensions treated as images, in the order the alternatives are tried.
_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif', 'bmp')

# One alternative per (scheme, extension) pair.  Generating the alternatives
# guarantees every extension is covered for both http and https; a
# hand-written list previously repeated "jpeg" and missed https ".jpg" links.
_IMAGE_URL_RE = re.compile(
    '|'.join(
        r'%s://[^\s,"]*\.%s' % (scheme, ext)
        for scheme in ('http', 'https')
        for ext in _IMAGE_EXTENSIONS
    )
)


def getImageList(html):
    """Extract image URLs from a page's text.

    A URL is recognised when it starts with ``http://`` or ``https://``,
    contains no whitespace, comma or double quote, and ends with one of the
    extensions in ``_IMAGE_EXTENSIONS``.

    Args:
        html: Page content to scan.

    Returns:
        List of matched URL strings in order of appearance (duplicates are
        kept).  Empty list when nothing matches.
    """
    imgList = _IMAGE_URL_RE.findall(html)

    # Debug output; a single parenthesised argument prints identically under
    # the Python 2 print statement and the Python 3 print function.
    print(imgList)

    return imgList





def download(imgList, page):
    """Download every URL in ``imgList`` into the ``outpath`` directory.

    Files are named ``pic_<page>_<index><ext>`` where ``page`` is zero-padded
    to 9 digits, ``index`` is the 1-based position in ``imgList`` zero-padded
    to 10 digits, and ``ext`` is the lower-cased extension taken from the
    last path segment of the (unquoted) URL.

    Args:
        imgList: Iterable of image URLs to fetch.
        page: Integer page number embedded in each file name.

    Raises:
        IOError: Propagated unchanged from ``urllib.urlretrieve`` when a
            download fails; remaining URLs are not attempted.
    """
    # Make sure the target directory exists before writing into it.
    if not os.path.isdir(outpath):
        os.makedirs(outpath)

    for x, imgurl in enumerate(imgList, 1):
        # Last path segment of the decoded URL, used only for its extension.
        basename = urllib2.unquote(imgurl).decode('utf8').split('/')[-1]
        # Lower-case only the extension; the directory part must keep its case.
        extension = os.path.splitext(basename)[1].lower()

        # os.path.join places the file inside outpath; plain concatenation
        # would glue the directory name onto the file name instead.
        filepathname = str(
            os.path.join(outpath, 'pic_%09d_%010d' % (page, x) + extension))

        print('[Debug] Download file :' + imgurl + ' >> ' + filepathname)

        urllib.urlretrieve(imgurl, filepathname)





def downImageNum(pagenum):
    """Run the fetch / extract / download cycle ``pagenum`` times.

    Pass numbers start at 1 and each one is forwarded to ``download`` as its
    ``page`` argument.

    NOTE(review): every pass requests the same module-level ``url``; the pass
    number is not used to select a different results page -- confirm whether
    pagination was intended.

    Args:
        pagenum: Number of passes to perform.
    """
    current = 1
    while current <= pagenum:
        # Fetch the page content, collect every image address found in it,
        # then download all of those images.
        download(getImageList(getHtml(url)), current)
        current += 1





if __name__ == '__main__':

    # Script entry point: perform a single fetch-and-download pass.
    downImageNum(1)