37,719
社区成员
发帖
与我相关
我的任务
分享
import urllib
import HTMLParser
class parselinks(HTMLParser.HTMLParser):
    """HTML parser that collects the ``href`` value of every ``<a>`` tag.

    The values are stored, in document order, in ``self.urlText``. Feeding
    several documents to the same instance keeps appending to that list.
    """

    def __init__(self):
        # This method was previously spelled ``__int__`` and therefore never
        # ran as the constructor.
        HTMLParser.HTMLParser.__init__(self)
        # Per-instance list; a class-level list would be shared by all parsers.
        self.urlText = []

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor start tag.

        tag   -- name of the tag that was opened
        attrs -- list of (name, value) pairs; value is None for an attribute
                 written without a value
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # A valueless ``href`` yields None, which cannot be joined to a URL.
            if name == 'href' and value is not None:
                self.urlText.append(value)
if __name__ == '__main__':
    # Crawl starting from the index page. ``urlText1`` is both the result and
    # the work list: entries are processed in index order while newly found
    # links are appended at the end.
    root = "http://www.yxlink.com/"

    def rooted(link):
        """Return *link* prefixed with the site root, at most once."""
        # Entries that already carry the root must not get it a second time;
        # unconditional prefixing produced "root + root + path" URLs.
        if link.startswith(root):
            return link
        return root + link

    my = parselinks()
    # Start from a list owned by this instance.
    my.urlText = []
    my.feed(urllib.urlopen(root + "index.html").read())
    # Normalise the first batch up front so that the membership tests below
    # compare prefixed URLs with prefixed URLs.
    urlText1 = [rooted(link) for link in my.urlText]

    i = 0
    while i < len(urlText1):
        print(len(urlText1))
        # Fresh result list for the page about to be parsed.
        my.urlText = []
        my.feed(urllib.urlopen(urlText1[i]).read())
        print(len(my.urlText))
        for j, found in enumerate(my.urlText, 1):
            found = rooted(found)
            print(found)
            seen = found in urlText1
            print(seen)
            if not seen:
                urlText1.append(str(found))
            print("%d %d" % (i, j))
        i += 1
    print(urlText1)
# Isolated statement: rewrites links[i] in place by prepending the fixed site
# root, whatever links[i] already holds.
# NOTE(review): `links` has no earlier assignment in the visible file; this
# looks like a line quoted out of a loop -- confirm before relying on it.
links[i] = "http://www.yxlink.com/"+links[i]
import urllib
import HTMLParser
class LinkParser(HTMLParser.HTMLParser):
    """HTML parser that gathers the ``href`` values of ``<a>`` tags.

    The values are kept, in document order, in ``self.links``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Store the href of an anchor start tag.

        tag   -- name of the tag that was opened
        attrs -- list of (name, value) pairs; value is None for an attribute
                 written without a value
        """
        # The debugging print of the whole list on every start tag was
        # removed: it produced output for each tag of each parsed page.
        if tag != 'a':
            return
        for name, value in attrs:
            # Skip valueless hrefs; None would break later string handling.
            if name == 'href' and value is not None:
                self.links.append(value)
def parseLinks(url):
    """Download *url* and return the href values of its ``<a>`` tags.

    url -- address handed unchanged to ``urllib.urlopen``

    Returns the ``links`` list of a fresh ``LinkParser`` fed with the whole
    response body. Errors raised while opening or reading propagate.
    """
    response = urllib.urlopen(url)
    try:
        body = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    parser = LinkParser()
    parser.feed(body)
    return parser.links
def completeLink(url, root="http://www.yxlink.com"):
    """Turn a link found in a page into an absolute URL.

    url  -- href value as it appears in the page
    root -- site root used for relative links; a trailing '/' is ignored

    Links starting with ``http://`` or ``https://`` are returned unchanged.
    Protocol-relative links (``//host/path``) get the scheme of *root*.
    Anything else is treated as a path below *root*.
    """
    if url.startswith(('http://', 'https://')):
        return url
    if url.startswith('//'):
        # Reuse the scheme of root; fall back to http when root has none.
        scheme = root.split('://', 1)[0] if '://' in root else 'http'
        return scheme + ':' + url
    base = root.rstrip('/')
    if url.startswith('/'):
        return base + url
    return base + '/' + url
if __name__ == '__main__':
    # ``links`` is the result and the work list at once: entries are handled
    # in index order and every link not seen before is appended at the end,
    # so the outer loop ends once no page yields anything new.
    start_page = "http://www.yxlink.com/index.html"
    links = [completeLink(href) for href in parseLinks(start_page)]

    i = 0
    while i < len(links):
        moreLinks = [completeLink(href) for href in parseLinks(links[i])]
        print(len(moreLinks))
        # j counts from 1 because it is reported after being advanced.
        for j, candidate in enumerate(moreLinks, 1):
            print(candidate)
            already_known = candidate in links
            print(already_known)
            if not already_known:
                links.append(str(candidate))
            print("%d %d" % (i, j))
        i += 1
    print(links)
# Quoted fragment (indentation lost). It tests whether moreLinks[j] is already
# in links; the else part stores str(moreLinks[j]) in links. Both parts then
# increment j and print i and j.
# NOTE(review): i, j, links and moreLinks are expected to come from an
# enclosing loop that is not part of this fragment -- confirm.
if moreLinks[j] in links :
j+=1
print i,j
else :
links.append(str(moreLinks[j]))
j+=1
print i,j
# Quoted fragment (indentation lost): stores str(moreLinks[j]) in links when
# moreLinks[j] is not already present, then increments j and prints i and j.
# NOTE(review): presumably only the append is conditional and the last two
# statements run on every pass -- confirm against the enclosing loop.
if moreLinks[j] not in links :
links.append(str(moreLinks[j]))
j += 1
print i, j
import urllib
import HTMLParser
class LinkParser(HTMLParser.HTMLParser):
    """Collect the ``href`` attribute of each ``<a>`` tag that is parsed.

    Results accumulate, in document order, in ``self.links``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        """Append the href of an anchor start tag to ``self.links``.

        tag   -- name of the tag that was opened
        attrs -- list of (name, value) pairs; value is None for an attribute
                 written without a value
        """
        # No per-tag printing of the accumulated list: that was debugging
        # output and grew with every tag encountered.
        if tag == 'a':
            for name, value in attrs:
                # Valueless hrefs (None) are not usable as links; skip them.
                if name == 'href' and value is not None:
                    self.links.append(value)
def parseLinks(url):
    """Fetch *url* and return the list of hrefs found in its ``<a>`` tags.

    url -- address passed as-is to ``urllib.urlopen``

    The whole response body is read and fed to a new ``LinkParser``; its
    ``links`` list is returned. Errors from opening or reading propagate.
    """
    page = urllib.urlopen(url)
    try:
        content = page.read()
    finally:
        # Close the response whether or not the read succeeded.
        page.close()
    parser = LinkParser()
    parser.feed(content)
    return parser.links
if __name__ == '__main__':
    # Crawl starting from the index page. ``links`` doubles as the work list:
    # entries are processed in index order and unseen links are appended.
    root = "http://www.yxlink.com/"

    def rooted(link):
        """Return *link* prefixed with the site root, at most once."""
        # Appended entries already carry the root; prefixing them again when
        # the outer loop reaches them produced "root + root + path" URLs.
        if link.startswith(root):
            return link
        return root + link

    # Prefix the first batch immediately so that membership tests compare
    # prefixed URLs with prefixed URLs and do not append duplicates.
    links = [rooted(link) for link in parseLinks(root + "index.html")]

    i = 0
    while i < len(links):
        print(len(links))
        moreLinks = parseLinks(links[i])
        print(len(moreLinks))
        for j, found in enumerate(moreLinks, 1):
            found = rooted(found)
            print(found)
            seen = found in links
            print(seen)
            if not seen:
                links.append(str(found))
            print("%d %d" % (i, j))
        i += 1
    print(links)
import urllib
import HTMLParser
import sys
class parselinks(HTMLParser.HTMLParser):
    """Parser that records the ``href`` of every ``<a>`` tag it is fed.

    Hrefs are appended to ``self.urlText`` in document order. The list is
    not cleared between calls to ``feed`` on the same instance.
    """

    def __init__(self):
        # Was misspelled ``__int__``, so it was never used as the constructor.
        HTMLParser.HTMLParser.__init__(self)
        # One list per parser instead of a single list shared by the class.
        self.urlText = []

    def handle_starttag(self, tag, attrs):
        """Append the href of an anchor start tag to ``self.urlText``.

        tag   -- name of the tag that was opened
        attrs -- list of (name, value) pairs; value is None for an attribute
                 written without a value
        """
        if tag == 'a':
            for name, value in attrs:
                # Ignore an ``href`` that has no value (reported as None).
                if name == 'href' and value is not None:
                    self.urlText.append(value)
if __name__ == '__main__':
    # Feed two pages to a single parser and print its link list after each.
    my = parselinks()
    pages = (
        "http://www.yxlink.com/index.html",
        "http://www.yxlink.com/support-download.html",
    )
    for page in pages:
        a = urllib.urlopen(page)
        my.feed(a.read())
        # The parser is reused, so the second printout also contains the
        # links collected from the first page.
        print(my.urlText)
# Opens the URL stored at urlText1[i], reads the whole response and feeds it
# to the parser `my`.
# NOTE(review): urlText1 and i are not assigned by the script directly above;
# this looks like a line quoted from another version of the loop -- confirm.
my.feed(urllib.urlopen(urlText1[i]).read())