python 爬取糗事百科遇到问题

帘项小竹 2014-12-12 08:17:03

爬取糗事百科时出错，但不清楚是什么原因，log里的error信息如下：
Traceback (most recent call last):
File "E:\javafunc\Project1\spider3.py", line 88, in <module>
my_spider.start()
File "E:\javafunc\Project1\spider3.py", line 77, in start
self.LoadNewPages()
File "E:\javafunc\Project1\spider3.py", line 46, in LoadNewPages
self.download_page.append(self.GetPageInfo(self.page))
File "E:\javafunc\Project1\spider3.py", line 31, in GetPageInfo
initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8')
File "D:\Python27\lib\urllib2.py", line 127, in urlopen
return _opener.open(url, data, timeout)
File "D:\Python27\lib\urllib2.py", line 404, in open
response = self._open(req, data)
File "D:\Python27\lib\urllib2.py", line 422, in _open
'_open', req)
File "D:\Python27\lib\urllib2.py", line 382, in _call_chain
result = func(*args)
File "D:\Python27\lib\urllib2.py", line 1214, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "D:\Python27\lib\urllib2.py", line 1181, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "D:\Python27\lib\httplib.py", line 973, in request
self._send_request(method, url, body, headers)
File "D:\Python27\lib\httplib.py", line 1007, in _send_request
self.endheaders(body)
File "D:\Python27\lib\httplib.py", line 969, in endheaders
self._send_output(message_body)
File "D:\Python27\lib\httplib.py", line 833, in _send_output
self.send(message_body)
File "D:\Python27\lib\httplib.py", line 805, in send
self.sock.sendall(data)
File "D:\Python27\lib\socket.py", line 224, in meth
return getattr(self._sock,name)(*args)
TypeError: must be string or buffer, not dict

以下为源码



#coding:utf-8

'''

Created on 2014-12-12



@author: cl282907

'''

import urllib2

import re

import time

import datetime

import thread

from urllib2 import URLError



class my_spider_of_qiushibaike:

    

    def __init__(self):

        self.page=1

        self.download_page=[]

        self.obtain=False

        self.NowPrintPage=0

    

    ######得到每一页的文字信息###########

    def GetPageInfo(self,page):

        

        items_of_1page=[]

        

        RequestUrl='http://www.qiushibaike.com/hot/page/%s' %page

        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

        headers={'User-Agent':user_agent}

        #socket.settimeout(3)

        request=urllib2.Request(RequestUrl,headers)

        initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8')

        

        my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL)

        for item in my_re.findall(initial_downloadInfo):

            with open(r'E:\spider','a') as file:

                file.write(item)

            items_of_1page.append([page,item[0].replace('\n',''),item[1].replace('\n','')])

            

        return items_of_1page

     

    #######当在list中的信息不超过5页时自动加载下一页##################

    def LoadNewPages(self):

        while self.obtain:

            try:

                if len(self.download_page)<5:

                    self.download_page.append(self.GetPageInfo(self.page))

                    self.page+=1

            except URLError,e:

                if hasattr(e, 'code'):

                    print 'this is a URLError and We failed to reach a server,',e.code

                if hasattr(e, 'reason'):

                    print 'this is an HTTPError',e.reason

            else:

                time.sleep(1)

    

    ############显示已经爬取的网页信息#########            

    def showPageContent(self):

        

        while(self.NowPrintPage+1<=self.page):     #self.page是当前已经爬取到的页数，self.NowPrintPage是当前显示到的页数

            

            for items in self.download_page[self.NowPrintPage]:

                print '第%s页'%items[0],items[1],items[2]

                my_input=raw_input('请输入回车，停止程序请输入quit')

                if my_input=='quit':

                    self.obtain=False

                    break

            self.NowPrintPage+=1

                

            

    def start(self):

        

            

        self.obtain=True

    #    page=self.page

        

        print "正在拼命加载中。。。"

        #thread.start_new_thread(self.LoadNewPages,())

        self.LoadNewPages()

        while self.obtain:

            

            if self.download_page:

                self.showPageContent()

                

                

    

print u'请按下回车来浏览今日糗百内容'

raw_input()

my_spider=my_spider_of_qiushibaike()

my_spider.start()

...全文

491 7 打赏收藏转发到动态举报

写回复

用AI写文章

7 条回复

切换为时间正序

请发表友善的回复…

发表回复

robotcan 2015-03-11

打赏
举报

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers={'User-Agent':user_agent} #socket.settimeout(3) request=urllib2.Request(RequestUrl,headers) 换成 request = urllib2.Request(RequestUrl) request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')

thomashtq 2015-02-13

打赏
举报

mark下先

裸奔的蜗牛 2015-02-12

打赏
举报

编码的问题吧，# -*- coding: UTF-8 -*-

jiht594 2015-02-09

打赏
举报

#coding:utf-8 ''' Created on 2014-12-12 @author: cl282907 ''' import urllib2 import re import time import datetime import thread from urllib2 import URLError class my_spider_of_qiushibaike: def __init__(self): self.page=1 self.download_page={} self.obtain=False self.NowPrintPage=1 ######得到每一页的文字信息########### def GetPageInfo(self,page): items_of_1page=[] RequestUrl='http://www.qiushibaike.com/8hr/page/%s' %page user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers={'User-Agent':user_agent} #socket.settimeout(3) request=urllib2.Request(RequestUrl, None, headers) initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8') my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL) for item in my_re.findall(initial_downloadInfo): with open(r'E:\spider','a') as file: file.write(str(item)) items_of_1page.append([page,item[0].replace('\n',''),item[1].replace('\n','')]) return items_of_1page #######当在list中的信息不超过5页时自动加载下一页################## def LoadNewPages(self): while self.obtain: try: if len(self.download_page)<5: #self.download_page.append(self.GetPageInfo(self.page)) self.download_page[self.page] = self.GetPageInfo(self.page) self.page+=1 else : break except URLError,e: if hasattr(e, 'code'): print 'this is a URLError and We failed to reach a server,',e.code if hasattr(e, 'reason'): print 'this is an HTTPError',e.reason else: time.sleep(1) ############显示已经爬取的网页信息######### def showPageContent(self): while(self.NowPrintPage < self.page): #self.page是当前已经爬取到的页数，self.NowPrintPage是当前显示到的页数 for items in self.download_page[self.NowPrintPage]: print u'第%s页'%items[0],items[1],items[2] my_input=raw_input('input \"enter\", to stop, input \"quit\"') if my_input=='quit': self.obtain=False break self.NowPrintPage+=1 if self.NowPrintPage >= self.page : self.download_page = {} def start(self): self.obtain=True # page=self.page print u"正在拼命加载中。。。" #thread.start_new_thread(self.LoadNewPages,()) self.LoadNewPages() while self.obtain: if self.download_page: self.showPageContent() else : print u"正在拼命加载中。。。" self.LoadNewPages() print u'请按下回车来浏览今日糗百内容' raw_input() my_spider=my_spider_of_qiushibaike() my_spider.start()

albertwb951 2015-02-09

打赏
举报

#coding:utf-8 ''' Created on 2014-12-12 @author: cl282907 ''' import urllib2 import re import time import datetime import thread from urllib2 import URLError class my_spider_of_qiushibaike: def __init__(self): self.page=1 self.download_page=[] self.obtain=False self.NowPrintPage=0 ######得到每一页的文字信息########### def GetPageInfo(self,page): items_of_1page=[] RequestUrl='http://www.qiushibaike.com/hot/page/%s' %page user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers={'User-Agent':user_agent} #socket.settimeout(3) request=urllib2.Request(RequestUrl,headers = headers) initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8') my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL) for item in my_re.findall(initial_downloadInfo): items_of_1page.append([item[0].replace('\n',''),item[1].replace('\n','')]) return items_of_1page #######当在list中的信息不超过5页时自动加载下一页################## def LoadNewPages(self): while self.obtain: try: if len(self.download_page)<5: self.download_page.append(self.GetPageInfo(self.page)) self.page+=1 except URLError,e: if hasattr(e, 'code'): print 'this is a URLError and We failed to reach a server,',e.code if hasattr(e, 'reason'): print 'this is an HTTPError',e.reason else: time.sleep(1) ############显示已经爬取的网页信息######### def showPageContent(self): while(self.NowPrintPage+1<=self.page): #self.page是当前已经爬取到的页数，self.NowPrintPage是当前显示到的页数 for items in self.download_page[self.NowPrintPage]: print '第%s页'%items[0],items[1],items[2] my_input=raw_input('请输入回车，停止程序请输入quit') if my_input=='quit': self.obtain=False break self.NowPrintPage+=1 def start(self): self.obtain=True # page=self.page print "正在拼命加载中。。。" #thread.start_new_thread(self.LoadNewPages,()) self.LoadNewPages() while self.obtain: if self.download_page: self.showPageContent() print u'请按下回车来浏览今日糗百内容' raw_input() my_spider=my_spider_of_qiushibaike() my_spider.start()

xiaoju2009 2014-12-12