# NOTE: the following fragments are navigation-bar text accidentally captured
# from the web page this script was copied from; kept here as a comment so the
# file remains valid Python:
#   37,720 / 社区成员 / 发帖 / 与我相关 / 我的任务 / 分享
#coding:utf-8
'''
Created on 2014-12-12
@author: cl282907
'''
import urllib2
import re
import time
import datetime
import thread
from urllib2 import URLError
class my_spider_of_qiushibaike:
def __init__(self):
self.page=1
self.download_page=[]
self.obtain=False
self.NowPrintPage=0
######得到每一页的文字信息###########
def GetPageInfo(self,page):
items_of_1page=[]
RequestUrl='http://www.qiushibaike.com/hot/page/%s' %page
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers={'User-Agent':user_agent}
#socket.settimeout(3)
request=urllib2.Request(RequestUrl,headers)
initial_downloadInfo=urllib2.urlopen(request).read().decode('utf-8')
my_re=re.compile('<div class="content" title="(.*?)">(.*?)</div>',re.DOTALL)
for item in my_re.findall(initial_downloadInfo):
with open(r'E:\spider','a') as file:
file.write(item)
items_of_1page.append([page,item[0].replace('\n',''),item[1].replace('\n','')])
return items_of_1page
#######当在list中的信息不超过5页时自动加载下一页##################
def LoadNewPages(self):
while self.obtain:
try:
if len(self.download_page)<5:
self.download_page.append(self.GetPageInfo(self.page))
self.page+=1
except URLError,e:
if hasattr(e, 'code'):
print 'this is a URLError and We failed to reach a server,',e.code
if hasattr(e, 'reason'):
print 'this is an HTTPError',e.reason
else:
time.sleep(1)
############显示已经爬取的网页信息#########
def showPageContent(self):
while(self.NowPrintPage+1<=self.page): #self.page是当前已经爬取到的页数,self.NowPrintPage是当前显示到的页数
for items in self.download_page[self.NowPrintPage]:
print '第%s页'%items[0],items[1],items[2]
my_input=raw_input('请输入回车,停止程序请输入quit')
if my_input=='quit':
self.obtain=False
break
self.NowPrintPage+=1
def start(self):
self.obtain=True
# page=self.page
print "正在拼命加载中。。。"
#thread.start_new_thread(self.LoadNewPages,())
self.LoadNewPages()
while self.obtain:
if self.download_page:
self.showPageContent()
print u'请按下回车来浏览今日糗百内容'
raw_input()
# Guard the entry point so importing this module does not start the crawler.
if __name__ == '__main__':
    my_spider = my_spider_of_qiushibaike()
    my_spider.start()