37,718
社区成员
发帖
与我相关
我的任务
分享
import urllib
import urllib2
import re
# Fetch one page of the qiushibaike "hot" listing and print the first regex
# capture group of every match found in the HTML.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Some sites respond with an error when the request has no User-Agent header.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
headers = {'User-Agent': user_agent}
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
try:
    # Decode the raw bytes into a unicode string before matching.
    content = response.read().decode('utf-8')
finally:
    # Release the connection even if reading or decoding fails.
    response.close()
# re.S lets '.' match newlines, so a single match may span several HTML lines.
# NOTE(review): the literal double quotes around both capture groups look like
# they were copied from a browser inspector (which displays text nodes in
# quotes). Confirm they really occur in the page source; if not, this pattern
# never matches and `items` stays empty.
pattern = re.compile(r'<div.*?class="author".*?>.*?<a.*?>.*?<img.*?>"(.*?)"</a><div.*?class="content".*?>"(.*?)".*?</div>', re.S)
items = pattern.findall(content)
for item in items:
    # Each item is a (group 1, group 2) tuple; group 1 is presumably the
    # author name -- verify against the live markup.
    print(item[0])
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
# Fetch one page of the qiushibaike "hot" listing and print every
# <div class="content"> element found by BeautifulSoup.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Some sites respond with an error when the request has no User-Agent header.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
headers = {'User-Agent': user_agent}
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
try:
    # decode() converts the byte string received from the server (UTF-8 here)
    # into a unicode string.
    content = response.read().decode('utf-8')
finally:
    # Release the connection even if reading or decoding fails.
    response.close()
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so results could differ between machines.
soup = BeautifulSoup(content, 'html.parser')
items = soup.find_all('div', class_='content')
print(items)