37,720
社区成员
发帖
与我相关
我的任务
分享
#! /usr/bin/python
#coding=utf-8
import urllib2
import urllib
from sys import argv # get data from command line
import HTMLParser # parse the data which spider get
import re
import redis
import string
import datetime
class SpiderRegular(object):
    """Find the "next page" link in a piece of HTML and cache it in Redis.

    ``pattern`` is a regular expression that selects the HTML element holding
    the link; ``string`` is the text that is searched.
    """

    # Redis connection settings and the key under which the link is cached.
    REDIS_HOST = '127.0.0.1'
    REDIS_PORT = 6379
    REDIS_KEY = 'nexturl'

    def __init__(self, pattern, string):
        self.pattern = pattern
        self.string = string

    def _connect(self):
        """Return a Redis client for the configured host and port."""
        return redis.Redis(host=self.REDIS_HOST, port=self.REDIS_PORT)

    def getnexturl(self):
        """Return the quoted ``"....html"`` target found in the matched element.

        The surrounding double quotes are part of the returned value, so
        callers are expected to strip them.  Returns ``None`` when ``pattern``
        does not match ``string`` or when the matched text contains no quoted
        ``.html`` reference.
        """
        element = re.search(self.pattern, self.string)
        if not element:
            return None
        # The element may match without holding a quoted .html target; report
        # that as "no link" instead of failing on a None match object.
        target = re.search(r'".+\.html"', element.group())
        if not target:
            return None
        return target.group()

    def inredis(self):
        """Store the current next-page link in Redis.

        When there is no link the key is deleted, so a stale value from an
        earlier run is never read back and ``None`` is never handed to
        ``set`` as a value.  Any ``redis.RedisError`` propagates unchanged.
        """
        nexturl = self.getnexturl()
        conn = self._connect()
        if nexturl is None:
            conn.delete(self.REDIS_KEY)
        else:
            conn.set(self.REDIS_KEY, nexturl)

    def outredis(self):
        """Refresh the cached link via ``inredis`` and return what Redis holds.

        Returns ``None`` when no link is stored.  Any ``redis.RedisError``
        propagates unchanged.
        """
        self.inredis()
        return self._connect().get(self.REDIS_KEY)
class Spider(object):
    """Follow a chain of "next page" links and download every linked page.

    Starting from ``baseurl``, each fetched page is scanned for a
    ``<LINK HREF="..." REL=next>`` element.  The referenced file is resolved
    against the directory of the current page and becomes the next page to
    fetch.
    """

    def __init__(self, baseurl):
        # URL of the page that will be fetched next; advanced by geturls().
        self.nexturl = baseurl

    @staticmethod
    def _timestamp():
        """Return the current local time formatted for log lines."""
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def getindexpage(self):
        """Fetch ``self.nexturl`` and return its body.

        The body is also kept in ``self.html``.  Errors raised while opening
        or reading the URL propagate to the caller.
        """
        self.html = ''
        htmlobj = urllib2.urlopen(self.nexturl, timeout=350)
        try:
            self.html = htmlobj.read()
        finally:
            # Close the response even when read() fails.
            htmlobj.close()
        return self.html

    def geturls(self):
        """Walk the "next" links and return ``{file name: absolute URL}``.

        The walk stops at the first page that is empty or has no next link.
        The starting page itself is not part of the result.  On return
        ``self.nexturl`` points at the last page that was reached.
        """
        urls = {}
        # Pages already visited, so a cycle of "next" links ends the walk
        # instead of looping forever.
        visited = set([self.nexturl])
        while True:
            self.indexpage = self.getindexpage()
            if not self.indexpage:
                break
            nextpath = SpiderRegular(r'<LINK HREF=".+" REL=next>', self.indexpage).getnexturl()
            if not nextpath:
                break
            filename = nextpath.strip('"')
            # Resolve the link against the directory of the current page.
            candidate = self.nexturl[:self.nexturl.rfind('/') + 1] + filename
            if candidate in visited:
                break
            visited.add(candidate)
            self.nexturl = candidate
            print('get url: %s' % self.nexturl)
            urls[filename] = self.nexturl
            print('[%s]: set url {%s} to dict' % (self._timestamp(), self.nexturl))
        return urls

    def action(self, path):
        """Download every page found by ``geturls`` into ``path``.

        ``path`` is used as a plain string prefix for the target file names,
        so it should end with a path separator.
        """
        urls = self.geturls()
        print('[%s]: get urls success . . .' % self._timestamp())
        for filename, url in urls.items():
            # urlretrieve takes the source URL first, then the local target.
            urllib.urlretrieve(url, path + filename)
        print('[%s]: Download success' % self._timestamp())
def main(baseurl='http://www.tldp.org/HOWTO/Bash-Prog-Intro-HOWTO.html',
         path='/home/yfsuse/pyproject/pyeah/pyspider/howtobash/'):
    """Crawl the pages linked from ``baseurl`` and save them under ``path``.

    Both arguments default to the values the script has always used, so a
    bare ``main()`` call behaves as before.  ``path`` is used as a string
    prefix for the saved files and should end with a path separator.
    """
    spider = Spider(baseurl)
    spider.action(path)


if __name__ == '__main__':
    main()