# Python 2: download every chapter of the tutorial with 10 threads.
import time
import urllib
import lxml.html
import os
import Queue
import threading

time1 = time.time()
if not os.path.isdir('/tmp/python'):
    os.mkdir('/tmp/python')

# Fetch the index page and pull out the chapter links.
down = 'http://www.network-theory.co.uk/docs/pytut/'
page = urllib.urlopen(down).read()
root = lxml.html.fromstring(page)
tnodes = root.xpath("//div[@class='main']//ul/li/a")

jobs = Queue.Queue()
for x in tnodes:
    jobs.put(x)

def worker():
    while True:
        # get_nowait() avoids the race between empty() and get():
        # another thread may drain the queue between the two calls.
        try:
            x = jobs.get_nowait()
        except Queue.Empty:
            break
        url = 'http://www.network-theory.co.uk/docs/pytut/' + x.get('href')
        name = x.text  # list(x.text) was a list of characters, not a filename
        myfile = open('/tmp/python/' + name, 'w')  # the original was missing the '/'
        myfile.write(urllib.urlopen(url).read())
        myfile.close()
        jobs.task_done()

for i in range(10):
    threading.Thread(target=worker).start()

jobs.join()
time2 = time.time()
print time2 - time1
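
On Python 3 the same job collapses into a few lines with concurrent.futures. This is a minimal sketch rather than a drop-in replacement: urllib and Queue were renamed in Python 3, and ThreadPoolExecutor takes over the hand-rolled thread/queue plumbing; the URL and XPath are the ones from the snippet above.

# Sketch: Python 3 version of the threaded downloader above.
import os
import time
from urllib.request import urlopen
from concurrent.futures import ThreadPoolExecutor

import lxml.html

BASE = 'http://www.network-theory.co.uk/docs/pytut/'
OUT = '/tmp/python'

def fetch(link):
    # Download one chapter and save it under its link text.
    url = BASE + link.get('href')
    with open(os.path.join(OUT, link.text), 'wb') as f:
        f.write(urlopen(url).read())

def main():
    os.makedirs(OUT, exist_ok=True)
    start = time.time()
    root = lxml.html.fromstring(urlopen(BASE).read())
    links = root.xpath("//div[@class='main']//ul/li/a")
    # map() blocks until every download finishes; 10 threads, as above.
    with ThreadPoolExecutor(max_workers=10) as pool:
        list(pool.map(fetch, links))
    print(time.time() - start)

if __name__ == '__main__':
    main()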
# Pseudocode for a multiprocessing crawler: keep a shared pool of URLs
# and never run more than WORKER_LIMIT downloader processes at once.
from multiprocessing import Process, Lock
from time import sleep

url_pool = set(['first_url'])  # set('first_url') would be a set of single characters
workerCounter = 0
WORKER_LIMIT = 10
l = Lock()

def worker(url):
    # download url
    # parse the content to find all links that still need to be downloaded
    # add the links to the global pool; it must be locked first
    l.acquire()                         # was l.require(), which does not exist
    url_pool.update(new_url_link_list)  # placeholder: links found by the parse step
    l.release()

def main():
    while True:
        l.acquire()
        if len(url_pool) == 0:
            if workerCounter == 0:  # nothing queued and nothing running: done
                l.release()
                return
        else:
            while workerCounter <= WORKER_LIMIT and len(url_pool) > 0:
                Process(target=worker, args=(url_pool.pop(),)).start()
                workerCounter += 1
        l.release()
        sleep(1)
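
The pseudocode above has a deeper problem than the typos: with multiprocessing, every Process gets its own copy of the global url_pool, so the links a worker collects never reach the parent or the other workers, and workerCounter is never decremented when a worker finishes. Below is a minimal runnable sketch of the same idea; extract_links() is a hypothetical stand-in for the download-and-parse step, a JoinableQueue replaces the shared set, a Manager dict does the "already seen" bookkeeping, and jobs.join() replaces the polling loop.

# Sketch only: extract_links() is a placeholder, not a real API.
from multiprocessing import Process, JoinableQueue, Lock, Manager

WORKER_LIMIT = 10

def extract_links(url):
    # Hypothetical: download `url` and return the links found on the page.
    return []

def worker(jobs, seen, lock):
    while True:
        url = jobs.get()              # blocks until a URL is queued
        try:
            for link in extract_links(url):
                with lock:            # same role as l.acquire()/l.release() above
                    if link not in seen:
                        seen[link] = True
                        jobs.put(link)
        finally:
            jobs.task_done()

def main():
    manager = Manager()
    seen = manager.dict()             # shared across processes, unlike a plain set
    lock = Lock()
    jobs = JoinableQueue()
    seen['first_url'] = True
    jobs.put('first_url')
    for _ in range(WORKER_LIMIT):
        p = Process(target=worker, args=(jobs, seen, lock))
        p.daemon = True               # workers are killed when main() returns
        p.start()
    jobs.join()                       # returns once every queued URL is processed

if __name__ == '__main__':
    main()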