python 爬取糗事百科遇到问题

帘项小竹 2014-12-12 08:17:03
爬取糗事百科时出错,但不清楚是什么原因,log里的error信息如下:
Traceback (most recent call last):
File "E:\javafunc\Project1\spider3.py", line 88, in <module>
my_spider.start()
File "E:\javafunc\Project1\spider3.py", line 77, in start
self.LoadNewPages()
File "E:\javafunc\Project1\spider3.py", line 46, in LoadNewPages
self.download_page.append(self.GetPageInfo(self.page))
File "E:\javafunc\Project1\spider3.py", line 31, in GetPageInfo
initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8')
File "D:\Python27\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "D:\Python27\lib\urllib2.py", line 404, in open
response = self._open(req, data)
File "D:\Python27\lib\urllib2.py", line 422, in _open
'_open', req)
File "D:\Python27\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "D:\Python27\lib\urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "D:\Python27\lib\urllib2.py", line 1181, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "D:\Python27\lib\httplib.py", line 973, in request
self._send_request(method, url, body, headers)
File "D:\Python27\lib\httplib.py", line 1007, in _send_request
self.endheaders(body)
File "D:\Python27\lib\httplib.py", line 969, in endheaders
self._send_output(message_body)
File "D:\Python27\lib\httplib.py", line 833, in _send_output
self.send(message_body)
File "D:\Python27\lib\httplib.py", line 805, in send
self.sock.sendall(data)
File "D:\Python27\lib\socket.py", line 224, in meth
return getattr(self._sock,name)(*args)
TypeError: must be string or buffer, not dict

以下为源码

#coding:utf-8
'''
Created on 2014-12-12

@author: cl282907
'''
import urllib2
import re
import time
import datetime
import thread
from urllib2 import URLError

class my_spider_of_qiushibaike:

def __init__(self):
self.page=1
self.download_page=[]
self.obtain=False
self.NowPrintPage=0

######得到每一页的文字信息###########
def GetPageInfo(self,page):

items_of_1page=[]

RequestUrl='http://www.qiushibaike.com/hot/page/%s' %page
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers={'User-Agent':user_agent}
#socket.settimeout(3)
request=urllib2.Request(RequestUrl,headers)
initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8')

my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL)
for item in my_re.findall(initial_downloadInfo):
with open(r'E:\spider','a') as file:
file.write(item)
items_of_1page.append([page,item[0].replace('\n',''),item[1].replace('\n','')])

return items_of_1page

#######当在list中的信息不超过5页时自动加载下一页##################
def LoadNewPages(self):
while self.obtain:
try:
if len(self.download_page)<5:
self.download_page.append(self.GetPageInfo(self.page))
self.page+=1
except URLError,e:
if hasattr(e, 'code'):
print 'this is a URLError and We failed to reach a server,',e.code
if hasattr(e, 'reason'):
print 'this is an HTTPError',e.reason
else:
time.sleep(1)

############显示已经爬取的网页信息#########
def showPageContent(self):

while(self.NowPrintPage+1<=self.page): #self.page是当前已经爬取到的页数,self.NowPrintPage是当前显示到的页数

for items in self.download_page[self.NowPrintPage]:
print '第%s页'%items[0],items[1],items[2]
my_input=raw_input('请输入回车,停止程序请输入quit')
if my_input=='quit':
self.obtain=False
break
self.NowPrintPage+=1


def start(self):


self.obtain=True
# page=self.page

print "正在拼命加载中。。。"
#thread.start_new_thread(self.LoadNewPages,())
self.LoadNewPages()
while self.obtain:

if self.download_page:
self.showPageContent()



print u'请按下回车来浏览今日糗百内容'
raw_input()
my_spider=my_spider_of_qiushibaike()
my_spider.start()



...全文
485 7 打赏 收藏 转发到动态 举报
写回复
用AI写文章
7 条回复
切换为时间正序
请发表友善的回复…
发表回复
robotcan 2015-03-11
  • 打赏
  • 举报
回复
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers={'User-Agent':user_agent} #socket.settimeout(3) request=urllib2.Request(RequestUrl,headers) 换成 request = urllib2.Request(RequestUrl) request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
thomashtq 2015-02-13
  • 打赏
  • 举报
回复
mark下先
裸奔的蜗牛 2015-02-12
  • 打赏
  • 举报
回复
编码的问题吧,# -*- coding: UTF-8 -*-
jiht594 2015-02-09
  • 打赏
  • 举报
回复
#coding:utf-8 ''' Created on 2014-12-12 @author: cl282907 ''' import urllib2 import re import time import datetime import thread from urllib2 import URLError class my_spider_of_qiushibaike: def __init__(self): self.page=1 self.download_page={} self.obtain=False self.NowPrintPage=1 ######得到每一页的文字信息########### def GetPageInfo(self,page): items_of_1page=[] RequestUrl='http://www.qiushibaike.com/8hr/page/%s' %page user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers={'User-Agent':user_agent} #socket.settimeout(3) request=urllib2.Request(RequestUrl, None, headers) initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8') my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL) for item in my_re.findall(initial_downloadInfo): with open(r'E:\spider','a') as file: file.write(str(item)) items_of_1page.append([page,item[0].replace('\n',''),item[1].replace('\n','')]) return items_of_1page #######当在list中的信息不超过5页时自动加载下一页################## def LoadNewPages(self): while self.obtain: try: if len(self.download_page)<5: #self.download_page.append(self.GetPageInfo(self.page)) self.download_page[self.page] = self.GetPageInfo(self.page) self.page+=1 else : break except URLError,e: if hasattr(e, 'code'): print 'this is a URLError and We failed to reach a server,',e.code if hasattr(e, 'reason'): print 'this is an HTTPError',e.reason else: time.sleep(1) ############显示已经爬取的网页信息######### def showPageContent(self): while(self.NowPrintPage < self.page): #self.page是当前已经爬取到的页数,self.NowPrintPage是当前显示到的页数 for items in self.download_page[self.NowPrintPage]: print u'第%s页'%items[0],items[1],items[2] my_input=raw_input('input \"enter\", to stop, input \"quit\"') if my_input=='quit': self.obtain=False break self.NowPrintPage+=1 if self.NowPrintPage >= self.page : self.download_page = {} def start(self): self.obtain=True # page=self.page print u"正在拼命加载中。。。" #thread.start_new_thread(self.LoadNewPages,()) self.LoadNewPages() while self.obtain: if self.download_page: self.showPageContent() else : print u"正在拼命加载中。。。" self.LoadNewPages() print u'请按下回车来浏览今日糗百内容' raw_input() my_spider=my_spider_of_qiushibaike() my_spider.start()
albertwb951 2015-02-09
  • 打赏
  • 举报
回复
#coding:utf-8 ''' Created on 2014-12-12 @author: cl282907 ''' import urllib2 import re import time import datetime import thread from urllib2 import URLError class my_spider_of_qiushibaike: def __init__(self): self.page=1 self.download_page=[] self.obtain=False self.NowPrintPage=0 ######得到每一页的文字信息########### def GetPageInfo(self,page): items_of_1page=[] RequestUrl='http://www.qiushibaike.com/hot/page/%s' %page user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers={'User-Agent':user_agent} #socket.settimeout(3) request=urllib2.Request(RequestUrl,headers = headers) initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8') my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL) for item in my_re.findall(initial_downloadInfo): items_of_1page.append([item[0].replace('\n',''),item[1].replace('\n','')]) return items_of_1page #######当在list中的信息不超过5页时自动加载下一页################## def LoadNewPages(self): while self.obtain: try: if len(self.download_page)<5: self.download_page.append(self.GetPageInfo(self.page)) self.page+=1 except URLError,e: if hasattr(e, 'code'): print 'this is a URLError and We failed to reach a server,',e.code if hasattr(e, 'reason'): print 'this is an HTTPError',e.reason else: time.sleep(1) ############显示已经爬取的网页信息######### def showPageContent(self): while(self.NowPrintPage+1<=self.page): #self.page是当前已经爬取到的页数,self.NowPrintPage是当前显示到的页数 for items in self.download_page[self.NowPrintPage]: print '第%s页'%items[0],items[1],items[2] my_input=raw_input('请输入回车,停止程序请输入quit') if my_input=='quit': self.obtain=False break self.NowPrintPage+=1 def start(self): self.obtain=True # page=self.page print "正在拼命加载中。。。" #thread.start_new_thread(self.LoadNewPages,()) self.LoadNewPages() while self.obtain: if self.download_page: self.showPageContent() print u'请按下回车来浏览今日糗百内容' raw_input() my_spider=my_spider_of_qiushibaike() my_spider.start()
xiaoju2009 2014-12-12
  • 打赏
  • 举报
回复
顶下。。。。。。

37,720

社区成员

发帖
与我相关
我的任务
社区描述
JavaScript,VBScript,AngleScript,ActionScript,Shell,Perl,Ruby,Lua,Tcl,Scala,MaxScript 等脚本语言交流。
社区管理员
  • 脚本语言(Perl/Python)社区
  • IT.BOB
加入社区
  • 近7日
  • 近30日
  • 至今

试试用AI创作助手写篇文章吧