37,719
社区成员
发帖
与我相关
我的任务
分享
import urllib
import urllib2
import re
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to request; passed unchanged to ``urllib2.urlopen``.

    Returns:
        The complete body as returned by ``read()`` on the response.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or ``read()``
        (for example ``urllib2.URLError``); nothing is caught here.
    """
    page = urllib2.urlopen(url)
    try:
        return page.read()
    finally:
        # try/finally rather than ``with``: urllib2 responses do not
        # implement the context-manager protocol. Closing here releases
        # the connection even if read() raises.
        page.close()
def getImg(html, dest_pattern=r'D:\E\%s.jpg'):
    """Download every Baidu Baike ``.jpg`` image URL found in *html*.

    The page text is scanned for absolute ``http://`` URLs hosted on a
    ``*.hiphotos.baidu.com`` server under ``/baike/`` and ending in
    ``.jpg``. Each match is saved with ``urllib.urlretrieve`` to a file
    numbered 0, 1, 2, ... in match order.

    Args:
        html: Page source to search.
        dest_pattern: ``%``-format template for the destination path. It
            must contain exactly one placeholder, which receives the
            zero-based index of the image. Defaults to the original
            hard-coded location ``D:\\E\\<index>.jpg``.

    Returns:
        None.

    Raises:
        Any exception raised by ``urllib.urlretrieve`` (network or file
        errors); nothing is caught here.
    """
    # Example of a URL this pattern is meant to capture in full:
    # http://d.hiphotos.baidu.com/baike/s%3D235/sign=b3e17b2497cad1c8d4bbfb244a3f67c4/962bd40735fae6cdf826c06a0db30f2442a70f2e.jpg
    #
    # - the host dots are escaped so they match literal dots only;
    # - the path part excludes whitespace, quotes and angle brackets so a
    #   match cannot run past the end of the attribute holding the URL;
    # - the non-greedy quantifier stops at the first ".jpg".
    imgre = re.compile(
        r'http://[\w.-]+\.hiphotos\.baidu\.com/baike/[^\s"\'<>]+?\.jpg'
    )
    for x, imgurl in enumerate(imgre.findall(html)):
        urllib.urlretrieve(imgurl, dest_pattern % x)
if __name__ == '__main__':
    # Guarded so that importing this module does not trigger network
    # access or file writes; running it as a script fetches the Baike
    # page below and downloads the images found in it.
    html = getHtml('http://baike.baidu.com/view/367981.htm')
    getImg(html)