37,743
Community members




import urllib
import HTMLParser
class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that collects the href of every <a> start tag it is fed."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Raw href values, in the order they appear in the document.
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; keep every href found.
        if tag != 'a':
            return
        for attr_name, attr_value in attrs:
            if attr_name == 'href':
                self.links.append(attr_value)
def parseLinks(url):
    """Fetch *url* and return the list of raw href values found on the page."""
    page = urllib.urlopen(url)
    collector = LinkParser()
    collector.feed(page.read())
    return collector.links
def completeLink(url, root="http://127.0.0.1"):
    """Return an absolute URL for *url* relative to *root*.

    Already-absolute links pass through unchanged; site-rooted paths
    ('/...') are joined directly to *root*; any other relative path is
    joined with a separating '/'.

    *root* generalizes the previously hard-coded host; its default keeps
    every existing call site behaving exactly as before.
    """
    # Bug fix: https URLs used to fall through to the relative-path
    # branch and come back mangled as 'root/https://...'.
    if url.startswith(('http://', 'https://')):
        return url
    if url.startswith('/'):
        return root + url
    return root + '/' + url
if __name__=='__main__':
links = parseLinks("http://127.0.0.1")
i = 0
while i < len(links):
links[i] = completeLink(links[i])
try:
morelinks = parseLinks(links[i])
except Exception, e:
print e
j = 0
while j < len(morelinks):
item = completeLink(morelinks[j])
j = j + 1
if item not in links:
print item
links.append(item)
print len(links)
i = i + 1
print links
raw_input()
def completeLink(url):
    """Absolutize *url* against the hard-coded local root."""
    root = "http://127.0.0.1"
    if url.startswith('http://'):
        return url
    # Site-rooted paths already carry their leading slash.
    separator = '' if url.startswith('/') else '/'
    return root + separator + url
from BeautifulSoup import BeautifulSoup
import urllib
def completeLink(url):
    """Resolve *url* to an absolute URL on the chem960.com site."""
    root = "http://www.chem960.com"
    if url.startswith('http://'):
        return url
    if url.startswith('/'):
        return root + url
    return root + '/' + url
if __name__=='__main__':
links = []
try:
soup = BeautifulSoup(urllib.urlopen("http://www.chem960.com/productSupplyShow_582388.shtml").read())
test = soup.findAll('a')
for temp in test:
links.append(completeLink(temp['href']))
print completeLink(temp['href'])
except Exception,e:
print e,22
raw_input()
import urllib
import HTMLParser
from BeautifulSoup import BeautifulSoup
import sys
import socket
# Fail any socket operation (the HTTP fetches below) after 2 seconds so
# one dead host cannot hang the whole crawl.
socket.setdefaulttimeout(2)
class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that records the href of every <a> start tag."""
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Raw href values in the order they appear in the document.
        self.links= []
    def handle_starttag(self,tag,attrs):
        # attrs: list of (name, value) attribute pairs from HTMLParser.
        if tag == 'a':
            for name,value in attrs:
                if name == 'href':
                    self.links.append(value)
def parseLinks(url):
    """Download *url* and return every raw href found on the page."""
    parser = LinkParser()
    parser.feed(urllib.urlopen(url).read())
    return parser.links
def completeLink(url, domain):
    """Resolve *url* to an absolute URL under *domain*.

    Absolute http links pass through untouched; rooted and bare relative
    paths are prefixed with *domain*.
    """
    if url.startswith('http://'):
        return url
    if url.startswith('/'):
        return domain + url
    return domain + '/' + url
if __name__=='__main__':
host = sys.argv[1]
links = []
try:
soup = BeautifulSoup(urllib.urlopen(host).read())
test = soup.findAll('a')
for temp in test:
links.append(completeLink(temp['href'],host))
i = 0
while i < len(links):
item=links[i]
if item.startswith(host):
if item.find(":",5)>0:
links.remove(item)
i = i -1
else:
print item
links.remove(item)
i = i - 1
i = i + 1
except Exception,e:
print e
i = 0
while i < len(links):
try:
soup = BeautifulSoup(urllib.urlopen(links[i]).read())
test = soup.findAll('a')
for temp in test:
link = completeLink(temp['href'],host)
if link.startswith(host):
if link not in links:
if link.find(":",5)>0:
print link
else:
print link
links.append(link)
print len(links)
except Exception,e:
print "***********"
print links[i],e
print "***********"
links.remove(links[i])
i = i - 1
i = i + 1
print len(links)
raw_input('end')
import urllib
import HTMLParser
class LinkParser(HTMLParser.HTMLParser):
    """Parser that harvests the target of every anchor tag it sees."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.links = []  # href values, in page order

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            # Keep every href attribute carried by the anchor.
            self.links.extend(value for key, value in attrs if key == 'href')
def parseLinks(url):
    """Return the list of hrefs harvested from the page at *url*."""
    harvester = LinkParser()
    harvester.feed(urllib.urlopen(url).read())
    return harvester.links
def completeLink(url):
    """Make *url* absolute relative to the yxlink.com site root."""
    root = "http://www.yxlink.com"
    if url.startswith('http://'):
        return url
    if url.startswith('/'):
        return root + url
    return root + '/' + url
if __name__=='__main__':
links = parseLinks("http://www.yxlink.com")
i = 0
while i < len(links):
links[i] = completeLink(links[i])
i=i+1
i = 0
while i < len(links):
item=links[i]
if item.startswith("http://www.yxlink.com"):
print len(links)
else:
print item
links.remove(item)
i = i - 1
i = i + 1
i = 0
while i < len(links):
try:
print urllib.urlopen(links[i]).read()
morelinks = parseLinks(links[i])
except Exception, e:
print e
for item in morelinks:
item = completeLink(item)
print item
if item.startswith("http://www.yxlink.com"):
if item not in links:
links.append(item)
print item
print len(links)
i = i + 1
print "*****************"
for item in links:
print item
print len(links)
raw_input()