开发四年只会写业务代码,分布式高并发都不会还做程序员?->>>
练习写爬虫,这个页面的图片路径爬不下来了,其他页面倒是能爬取下来,求大神指教,代码如下:
#-*- coding: utf-8 -*-
import urllib2
import re
def html_get(url):
#获取网页内容
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360EE'
headers = {'User-Agent':user_agent}
req = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(req)
html = response.read()
return html
if __name__ == '__main__':
content_url = 'http://tieba.baidu.com/p/3428007979?pn=2'
content_page_html = html_get(content_url)
print content_page_html
pattern = re.compile(r'<img.*?class="BDE_Image".*?src="(.*?)".*?pic_ext="jpeg".*?',re.S)#获取图片路径
images_url = pattern.findall(content_page_html)
print(images_url)