37,743
社区成员




#! /usr/bin/env python
#coding=utf-8
from sgmllib import SGMLParser
import urllib,re
class UrlList(SGMLParser):
def reset(self):
self.urls=[]
SGMLParser.reset(self)
def start_a(self,attrs):
href=[v for k,v in attrs if k=='href']
if href:
self.urls.extend(href)
def getUrls(url, pattern=r'^http://'):
    """Download ``url`` and return the links on that page matching ``pattern``.

    Args:
        url: Address of the page to download.
        pattern: Regular expression a link must match (``re.search``) to be
            kept. The default keeps only links that start with ``http://``.

    Returns:
        A list of the matching ``href`` values collected by ``UrlList``, in
        the order they were collected. An empty list is returned (after
        printing a message) when the page cannot be opened.
    """
    # Compile up front so an invalid pattern fails before any network access.
    link_filter = re.compile(pattern)

    try:
        usock = urllib.urlopen(url)
    except Exception:
        # Best effort: a page that cannot be opened simply contributes no
        # links. Catching Exception rather than using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate.
        print("get url except" + url)
        return []

    try:
        html = usock.read()
    finally:
        # Release the connection even if the read fails part-way.
        usock.close()

    parser = UrlList()
    parser.feed(html)
    parser.close()

    return [link for link in parser.urls if link_filter.search(link)]
def _crawl(url, depth, out):
    """Write the links reachable from ``url`` within ``depth`` hops to ``out``."""
    if depth < 0:
        return
    for link in getUrls(url):
        # Each link is written after its own sub-crawl, so deeper links
        # appear in the file before the link that led to them.
        _crawl(link, depth - 1, out)
        out.write(link + "\n")


def spider(startURL, depth, outPath="D:\\url.txt"):
    """Crawl from ``startURL`` and write every discovered link to a file.

    Args:
        startURL: Page whose links are collected first.
        depth: Number of additional levels of links to follow. ``0`` records
            only the links on ``startURL``; a negative value records nothing.
        outPath: File that receives one link per line. It is created or
            truncated on every call, including when ``depth`` is negative.

    The output file is opened once for the whole crawl and closed when the
    crawl finishes or fails. Opening it again in ``"w"`` mode in each
    recursive call would truncate what the outer levels had already written.
    """
    with open(outPath, "w") as out:
        _crawl(startURL, depth, out)
if __name__ == "__main__":
    # Page to process. With depth 0 the links found on this page are not
    # followed any further.
    startPage = "http://www.pcuseman.com/?p=102"
    spider(startPage, 0)
import re
import urllib2

# Download the Baidu home page.
url = 'http://www.baidu.com'
request = urllib2.Request(url)
response = urllib2.urlopen(request)
try:
    page = response.read()
finally:
    # Close the connection even if the read fails.
    response.close()

# Capture the href of the anchor whose link text is the ICP licence number.
# Raw string so that ``\S`` reaches the regex engine unchanged.
rc = r'<a href="(\S*?)" target="_blank">京ICP证030173号</a>'
# Keep the result: a bare expression statement discards it when run as a script.
matches = re.findall(rc, page)