37,743
社区成员




import chilkat
def spider_main(url, max_attempts=1000):
    """Crawl the site at *url* with a Chilkat spider and print what was fetched.

    Each successfully crawled URL is printed as it is fetched.  When a crawl
    attempt fails and the spider reports no unspidered URLs left, the loop
    stops; when it fails with URLs still queued, the spider's last error text
    is printed and crawling continues.  The number of successfully crawled
    URLs is printed at the end.

    Args:
        url: Host name without a scheme (e.g. ``'www.example.com'``); it is
            passed to ``Initialize`` as-is and prefixed with ``http://`` to
            form the first URL to crawl.
        max_attempts: Upper bound on the number of ``CrawlNext`` calls.
    """
    spider = chilkat.CkSpider()
    # Short timeouts so an unresponsive host does not stall the crawl
    # (presumably seconds -- confirm against the Chilkat documentation).
    spider.put_ConnectTimeout(2)
    spider.put_ReadTimeout(3)
    spider.Initialize(url)
    # Add the 1st URL:
    spider.AddUnspidered("http://" + url)

    # Keep the page count separate from the loop variable: reassigning a
    # ``for`` variable is overwritten on the next iteration, so it would end
    # up counting loop passes (including failed attempts) instead of pages.
    crawled = 0
    for _ in range(max_attempts):
        if spider.CrawlNext():
            print(spider.lastUrl())
            crawled += 1
        elif spider.get_NumUnspidered() == 0:
            print("No more URLs to spider")
            break
        else:
            # The attempt failed but URLs remain queued: report and move on.
            print(spider.lastErrorText())
    print(crawled)
if __name__ == "__main__":
    # Crawl the demo host, then block on stdin so the console window stays
    # open until the user presses Enter.
    spider_main('www.yxlink.com')
    raw_input()
<a href="test.asp?id=123d&id2=test.exe">combination+mixed</a>不可以抓取
<a href="test.asp?id=123d&id2=testexe">combination+mixed</a>可以抓取
<a href="test.asp?id2=test.exe&id=123d">combination+mixed</a>可以抓取
<p><a href="test.asp?id=1">number</a><br />
<a href="test.asp?id=a">alpha</a><br />
<a href="test.asp?id=a3">combination</a><br />
<a href="test.asp?id=test.exe">mixed</a><br />
<a href="test.asp?id=1&id2=a">number+alpha</a><br />
<a href="test.asp?id=1&id2=a3">number+combination</a><br />
<a href="test.asp?id2=test.pdf&id=1">number+mixed</a><br />
<a href="test.asp?id=a&id2=a4">alpha+combination</a><br />
<a href="test.asp?id=a&id2=test.exe">alpha+mixed</a><br />
<a href="test.asp?id=123d&id2=test.exe">combination+mixed</a><br />
<a href="test.asp?id=1&id2=a&id3=du2">number+alpha+combination</a><br />
<a href="test.asp?id=1&id2=a&id3=test.exe">number+alpha+mixed</a><br />
<a href="test.asp?id=234&id2=af2de&id3=test.exe">number+combination+mixed</a><br />
<a href="test.asp?id=a&id2=af2de&id3=test.exe">alpha+combination+mixed</a><br />
<a href="test.asp?id=1&id2=sd&id3=ds2&id4=test.exe">number+alpha+combination+mixed</a><br />
</p>
<p> </p>
<p><a href="vnc-E4_5-x86_x64_win32.zip">vnc-E4_5-x86_x64_win32.zip</a></p>
<p> </p>
<p><a href="wireshark-win32-1.2.9.exe">exe</a></p>
http://127.0.0.1
http://127.0.0.1/test.asp?id=1
http://127.0.0.1/test.asp?id=a
http://127.0.0.1/test.asp?id=a3
http://127.0.0.1/test.asp?id=1&id2=a
http://127.0.0.1/test.asp?id=1&id2=a3
http://127.0.0.1/test.asp?id2=test.pdf&id=1
http://127.0.0.1/test.asp?id=a&id2=a4
http://127.0.0.1/test.asp?id=1&id2=a&id3=du2
No more URLs to spider
9