from threading import Thread
import time

a = True

class A:
    def __init__(self):
        t = Thread(target=self.run)
        t.start()

    def run(self):
        print 'im starting'
        while a:
            print a

if __name__ == "__main__":
    aa = A()
    time.sleep(1)
    a = False
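This stop-flag pattern works here because the loop re-reads the global on every iteration. The same idea is often written with threading.Event instead of a bare global; a minimal sketch (the names are mine, not from the code above):

from threading import Thread, Event
import time

stop = Event()

def worker():
    while not stop.is_set():  # re-checked on every pass, like the global above
        time.sleep(0.1)

if __name__ == "__main__":
    t = Thread(target=worker)
    t.start()
    time.sleep(1)
    stop.set()  # the worker sees this on its next loop pass
    t.join()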
#coding=utf-8
from BeautifulSoup import BeautifulSoup
import urllib2
from threading import Thread, Lock
from Queue import Queue
import time
import socket

socket.setdefaulttimeout(5)
flag = True  # stop flag shared with the worker threads

class Fetcher:  # wraps the page-fetching threads and the result-processing thread
    def __init__(self, th_num):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.q_req = Queue()  # task queue
        self.q_ans = Queue()  # queue of fetched results to process
        self.urls = []        # URLs to crawl at the next depth
        for i in range(th_num):  # start the fetcher threads
            t = Thread(target=self.thread_get)
            t.start()
        for i in range(0, 1):    # start the result-processing thread
            t = Thread(target=self.thread_put)
            t.start()

    def join(self):  # wait for both queues to drain
        self.q_req.join()
        self.q_ans.join()
        print '=====================im done'  # this does get printed at runtime

    def push(self, req):  # add a task to the task queue
        self.q_req.put(req)

    def thread_put(self):  # result processing: collect each page's URLs into self.urls
        while flag:
            print flag  # debug: if this thread were still looping, flag would keep printing
            try:
                url = self.q_ans.get()
                self.urls.extend(url)
            except Exception, e:
                print e, 'other,excp========in=put'
            finally:
                self.q_ans.task_done()

    def thread_get(self):
        while flag:
            print flag
            try:
                req = self.q_req.get()
                urls = []
                ans = self.opener.open(req).read()  # fetch the page
                soup = BeautifulSoup(ans)           # parse it
                for a in soup.findAll('a'):         # extract every link on the page
                    try:
                        if a['href'].startswith('http'):
                            urls.append(a['href'])
                    except Exception, ex:
                        print ex, '========================Exception=in=soup=findAll'
                self.q_ans.put(urls)  # hand the links to the put thread for processing
            except Exception, e:
                print e, 'other--exception----------in- threadget----'
            finally:
                print '--------------------'
                self.q_req.task_done()
        print "----------get---quiting"  # this never gets printed

def run(f, links):  # clear urls before the next round, load the next round's tasks, wait for them
    f.urls = []
    for url in links:
        f.push(url)
    f.join()
    return f.urls

if __name__ == "__main__":
    links = ['http://www.kingdowin.com/', ]
    deep = 2  # crawl depth
    f = Fetcher(10)
    while deep > 0:  # this loop plus run() controls the crawl depth
        urls = run(f, links)
        deep -= 1
        links = urls
        print len(links)
        time.sleep(1)
    flag = False  # ask the threads to stop
    time.sleep(1)
    print "Exiting Main Thread"  # this line does get printed
# Every run reaches this point and then the cursor just sits there; Task Manager
# shows 13 Python threads. The threads still exist, but they print nothing.
The threads hang because the default Queue.get() blocks forever once the queue is empty, so the while flag test is never reached again. Here is Queue.get from the standard library:

def get(self, block=True, timeout=None):
    """Remove and return an item from the queue.

    If optional args 'block' is true and 'timeout' is None (the default),
    block if necessary until an item is available. If 'timeout' is
    a positive number, it blocks at most 'timeout' seconds and raises
    the Empty exception if no item was available within that time.
    Otherwise ('block' is false), return an item if one is immediately
    available, else raise the Empty exception ('timeout' is ignored
    in that case).
    """
    self.not_empty.acquire()
    try:
        if not block:
            if not self._qsize():
                raise Empty
        elif timeout is None:
            while not self._qsize():
                self.not_empty.wait()
        elif timeout < 0:
            raise ValueError("'timeout' must be a positive number")
        else:
            endtime = _time() + timeout
            while not self._qsize():
                remaining = endtime - _time()
                if remaining <= 0.0:
                    raise Empty
                self.not_empty.wait(remaining)
        item = self._get()
        self.not_full.notify()
        return item
    finally:
        self.not_empty.release()
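With block=True and timeout=None (the defaults), the call parks on not_empty.wait() until an item arrives. Passing a timeout makes get() raise Queue.Empty instead, which gives the loop a chance to re-check its stop flag; a minimal sketch:

from Queue import Queue, Empty

q = Queue()
try:
    item = q.get(timeout=1)  # waits at most 1 second on an empty queue
except Empty:
    print 'queue empty; the loop can re-check its stop flag now'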
#!/usr/bin/env python
#coding=utf-8
from BeautifulSoup import BeautifulSoup
import urllib2
from threading import Thread, Lock
from Queue import Queue
import time
import socket

socket.setdefaulttimeout(5)
flag = True  # stop flag shared with the worker threads

class Fetcher:  # wraps the page-fetching threads and the result-processing thread
    def __init__(self, th_num):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.q_req = Queue()  # task queue
        self.q_ans = Queue()  # queue of fetched results to process
        self.urls = []        # URLs to crawl at the next depth
        for i in range(th_num):  # start the fetcher threads
            t = Thread(target=self.thread_get)
            t.start()
        for i in range(0, 1):    # start the result-processing thread
            t = Thread(target=self.thread_put)
            t.start()

    def join(self):  # wait for both queues to drain
        self.q_req.join()
        self.q_ans.join()
        print '=====================im done'  # this does get printed at runtime

    def push(self, req):  # add a task to the task queue
        self.q_req.put(req)

    def thread_put(self):  # result processing: collect each page's URLs into self.urls
        while flag:
            print flag  # debug: if this thread were still looping, flag would keep printing
            if self.q_ans.qsize() <= 0:  # only call get() when something appears to be queued
                time.sleep(1)
                continue
            try:
                url = self.q_ans.get()
                self.urls.extend(url)
            except Exception, e:
                print e, 'other,excp========in=put'
            finally:
                self.q_ans.task_done()

    def thread_get(self):
        while flag:
            print flag
            if self.q_req.qsize() <= 0:  # only call get() when something appears to be queued
                time.sleep(1)
                continue
            try:
                req = self.q_req.get()
                urls = []
                ans = self.opener.open(req).read()  # fetch the page
                soup = BeautifulSoup(ans)           # parse it
                for a in soup.findAll('a'):         # extract every link on the page
                    try:
                        if a['href'].startswith('http'):
                            urls.append(a['href'])
                    except Exception, ex:
                        print ex, '========================Exception=in=soup=findAll'
                self.q_ans.put(urls)  # hand the links to the put thread for processing
            except Exception, e:
                print e, 'other--exception----------in- threadget----'
            finally:
                print '--------------------'
                self.q_req.task_done()
        print "----------get---quiting"  # this never gets printed

def run(f, links):  # clear urls before the next round, load the next round's tasks, wait for them
    f.urls = []
    for url in links:
        f.push(url)
    f.join()
    return f.urls

if __name__ == "__main__":
    links = ['http://www.kingdowin.com/', ]
    deep = 2  # crawl depth
    f = Fetcher(10)
    while deep > 0:  # this loop plus run() controls the crawl depth
        urls = run(f, links)
        outputinfo = 'deep [%d] ok\n' % deep
        print outputinfo
        deep -= 1
        links = urls
        outputinfo = 'run returned links length:%d\n' % len(links)
        print outputinfo
        print links
        time.sleep(1)
    flag = False  # ask the threads to stop
    time.sleep(3)
    print "Exiting Main Thread"  # this line does get printed
# Every run reaches this point and then the cursor just sits there; Task Manager
# shows 13 Python threads. The threads still exist, but they print nothing.
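Checking qsize() before a blocking get() narrows the window but does not close it: qsize() is only a snapshot, and with ten fetcher threads another one can drain the queue between the check and the get(), leaving this thread blocked forever anyway. A minimal single-threaded simulation of that interleaving (my own illustration, not from the post):

from Queue import Queue, Empty

q = Queue()
q.put('only item')

assert q.qsize() > 0     # thread A's check passes
q.get()                  # ...but thread B takes the item in the gap...
try:
    q.get(block=False)   # ...so thread A's get() finds nothing
except Empty:
    print 'qsize() said an item was there, but it was already gone'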
#!/usr/bin/env python
#coding=utf-8
from BeautifulSoup import BeautifulSoup
import urllib2
from threading import Thread, Lock
from Queue import Queue
import time
import socket

socket.setdefaulttimeout(5)
lock = Lock()
flag = True  # stop flag shared with the worker threads

class Fetcher:  # wraps the page-fetching threads and the result-processing thread
    def __init__(self, th_num):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.q_req = Queue()  # task queue
        self.q_ans = Queue()  # queue of fetched results to process
        self.urls = []        # URLs to crawl at the next depth
        for i in range(th_num):  # start the fetcher threads
            t = Thread(target=self.thread_get)
            t.start()
        for i in range(0, 1):    # start the result-processing thread
            t = Thread(target=self.thread_put)
            t.start()

    def join(self):  # wait for both queues to drain
        self.q_req.join()
        self.q_ans.join()
        print '=====================im done'  # this does get printed at runtime

    def push(self, req):  # add a task to the task queue
        self.q_req.put(req)

    def thread_put(self):  # result processing: collect each page's URLs into self.urls
        while flag:
            #print flag  # debug: if this thread were still looping, flag would keep printing
            lock.acquire()  # hold the lock across the qsize() check and the get()
            if self.q_ans.qsize() <= 0:
                lock.release()
                time.sleep(1)
                continue
            try:
                url = self.q_ans.get()
                lock.release()  # note: if get() raised, this release would never run
                self.urls.extend(url)
            except Exception, e:
                print e, 'other,excp========in=put'
            finally:
                self.q_ans.task_done()

    def thread_get(self):
        while flag:
            #print flag
            lock.acquire()  # hold the lock across the qsize() check and the get()
            if self.q_req.qsize() <= 0:
                lock.release()
                time.sleep(1)
                continue
            try:
                req = self.q_req.get()
                lock.release()  # note: if get() raised, this release would never run
                urls = []
                ans = self.opener.open(req).read()  # fetch the page
                soup = BeautifulSoup(ans)           # parse it
                for a in soup.findAll('a'):         # extract every link on the page
                    try:
                        if a['href'].startswith('http'):
                            urls.append(a['href'])
                    except Exception, ex:
                        print ex, '========================Exception=in=soup=findAll'
                self.q_ans.put(urls)  # hand the links to the put thread for processing
            except Exception, e:
                print e, 'other--exception----------in- threadget----'
            finally:
                print '--------------------'
                self.q_req.task_done()
        print "----------get---quiting"  # this never gets printed

def run(f, links):  # clear urls before the next round, load the next round's tasks, wait for them
    f.urls = []
    for url in links:
        f.push(url)
    f.join()
    return f.urls

if __name__ == "__main__":
    links = ['http://www.kingdowin.com/', ]
    deep = 2  # crawl depth
    f = Fetcher(10)
    while deep > 0:  # this loop plus run() controls the crawl depth
        urls = run(f, links)
        outputinfo = 'deep [%d] ok\n' % deep
        print outputinfo
        deep -= 1
        links = urls
        outputinfo = 'run returned links length:%d\n' % len(links)
        print outputinfo
        print links
        time.sleep(1)
    flag = False  # ask the threads to stop
    time.sleep(2)
    print "Exiting Main Thread"  # this line does get printed
# Every run reaches this point and then the cursor just sits there; Task Manager
# shows 13 Python threads. The threads still exist, but they print nothing.
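Releasing the lock by hand on every path is fragile: an exception between acquire() and release() would leave the lock held forever and stall every other worker. Lock objects support the with statement, which releases on any exit; a minimal sketch (my own, not from the post):

from threading import Lock

lock = Lock()
shared = []

def append_safely(item):
    with lock:               # acquired on entry, released even if the body raises
        shared.append(item)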
#coding=utf-8
from BeautifulSoup import BeautifulSoup
import urllib2
from threading import Thread, Lock
from Queue import Queue, Empty
import time
import socket

socket.setdefaulttimeout(5)
lock = Lock()
flag = True  # stop flag shared with the worker threads

class Fetcher:  # wraps the page-fetching threads and the result-processing threads
    def __init__(self, th_num):
        self.opener = urllib2.build_opener(urllib2.HTTPHandler)
        self.q_req = Queue()  # task queue
        self.q_ans = Queue()  # queue of fetched results to process
        self.urls = []        # URLs to crawl at the next depth
        for i in range(th_num):  # start the fetcher threads
            t = Thread(target=self.thread_get)
            t.start()
        for i in range(0, 3):    # start the result-processing threads
            t = Thread(target=self.thread_put)
            t.start()

    def join(self):  # wait for both queues to drain
        self.q_req.join()
        self.q_ans.join()
        print '=====================im done'  # this does get printed at runtime

    def push(self, req):  # add a task to the task queue
        self.q_req.put(req)

    def thread_put(self):  # result processing: collect each page's URLs into self.urls
        while flag:
            #~ print flag  # debug: would print steadily while this thread loops
            try:
                url = self.q_ans.get(timeout=1)  # time out so flag is re-checked
            except Empty:
                continue
            except Exception, e:
                print e, 'other,excp========in=put'
                break
            lock.acquire()  # self.urls is now shared by several put threads
            self.urls.extend(url)
            lock.release()
            self.q_ans.task_done()
        print "----------put---quiting"

    def thread_get(self):
        while flag:
            #~ print flag
            try:
                req = self.q_req.get(timeout=1)  # time out so flag is re-checked
            except Empty:
                continue
            except Exception, e:
                print e, 'other--exception----------in- threadget----'
                continue  # nothing was dequeued, so there is no req to work on
            urls = []
            try:
                ans = self.opener.open(req).read()  # fetch the page
                soup = BeautifulSoup(ans)           # parse it
                for a in soup.findAll('a'):         # extract every link on the page
                    if a['href'].startswith('http'):
                        urls.append(a['href'])
            except Exception, ex:
                print ex, '========================Exception=in=ans/soup'
            self.q_ans.put(urls)  # hand the links to the put threads for processing
            print '--------------------'
            self.q_req.task_done()
        print "----------get---quiting"  # this never gets printed

def run(f, links):  # clear urls before the next round, load the next round's tasks, wait for them
    f.urls = []
    for url in links:
        f.push(url)
    f.join()
    return f.urls

if __name__ == "__main__":
    links = ['http://www.kingdowin.com/', ]
    deep = 2  # crawl depth
    f = Fetcher(10)
    while deep > 0:  # this loop plus run() controls the crawl depth
        urls = run(f, links)
        deep -= 1
        links = urls
        print len(links)
        time.sleep(1)
    flag = False  # ask the threads to stop
    time.sleep(1)
    print "Exiting Main Thread"  # this line does get printed
# Every run reaches this point and then the cursor just sits there; Task Manager
# shows 13 Python threads. The threads still exist, but they print nothing.
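A further option, independent of the flag-and-timeout approach above, is to mark the workers as daemon threads before starting them, so the process can exit without waiting for them; a minimal sketch (my suggestion, not from the thread):

from threading import Thread

def start_worker(target):
    t = Thread(target=target)
    t.setDaemon(True)  # a daemon thread does not keep the interpreter alive at exit
    t.start()
    return t

The q_req.join()/q_ans.join() calls still guarantee each round's work finishes before the main loop continues; the daemon flag only changes what happens once the main thread falls off the end.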