37,719
社区成员
发帖
与我相关
我的任务
分享
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import urllib2
import re
import os
# Search-box query. It can be set to a different number on each run so that
# every search returns different results.
search = '123'

# Search URL that gets fetched: the query is appended as the ``wd`` parameter.
url = "http://www.baidu.com/s?wd=" + search

# Output file for the collected links, opened in binary append/read mode so
# repeated runs keep adding to it. According to the original author's note,
# one page of search results contains roughly 200 URLs.
file_url = open('url.txt', 'ab+')
def setUrlToFile():
    """Fetch the search page and append the links it contains to the output file.

    Downloads the module-level ``url``, extracts every double-quoted
    http(s)/ftp(s) link from the returned HTML, prints each link, and writes
    those shorter than 256 characters to the module-level ``file_url``
    handle, one per line terminated by CRLF.

    Returns:
        None.

    Raises:
        Errors raised by ``urllib2.urlopen`` or ``read`` (for example
        ``urllib2.URLError``) are not caught here and propagate to the caller.
    """
    # Links of this length or longer are printed but not stored.
    max_url_length = 256

    website = urllib2.urlopen(url)
    try:
        # Read the complete HTML document.
        html = website.read()
    finally:
        # Close the response explicitly so the connection is released even
        # when the read fails; the function is called many times in a row.
        website.close()

    # The pattern has two capture groups, so findall yields
    # (full_link, scheme) tuples; only the full link is used.
    links = re.findall('"((http|ftp)s?://.*?)"', html)
    for link, _scheme in links:
        print(link)
        if len(link) < max_url_length:
            file_url.write(link + '\r\n')
# Collect experimental data: run the scrape 50 times, appending to url.txt.
try:
    for _ in range(50):
        setUrlToFile()
finally:
    # Always flush and close the append handle, even if a fetch fails midway,
    # so the links gathered so far are not lost.
    file_url.close()

# The file has to be reopened in read mode to count what was written.
with open('url.txt', 'r') as file_url:
    # Count lines lazily instead of loading the whole file into a list.
    file_lines = sum(1 for _ in file_url)

# Report the file's name; formatting the file object itself would print its
# repr (including a memory address) rather than something readable.
print("there are %d url in %s" % (file_lines, file_url.name))