谁能帮我用requests 跟BS爬下这个网站?

hellsb 2016-05-07 11:33:29
爬这个网站:http://www.souutu.com/mnmm/index.html

我一直都没搞出来,求帮助。

...全文
154 6 打赏 收藏 转发到动态 举报
写回复
用AI写文章
6 条回复
切换为时间正序
请发表友善的回复…
发表回复
屎克螂 2016-05-09
  • 打赏
  • 举报
回复
# Fetch the index page and pull out every image URL referenced by a src="..."
# attribute.  (Python 2: urllib2 was merged into urllib.request in Python 3.)
import re, urllib2

html = urllib2.urlopen('http://www.souutu.com/mnmm/index.html').read()
# BUG FIX: the original pattern said "pgn" -- a typo for "png" -- so every
# .png image was silently skipped.  Raw string avoids escape surprises.
imgUrl = re.findall(r'src="(.+?\.(?:jpg|png))"', html)  # these are the image addresses you asked for
hellsb 2016-05-09
  • 打赏
  • 举报
回复
引用 2 楼 ligengyong2010 的回复:
links_parser.py import urllib import HTMLParser import urlparse class PicLinks(HTMLParser.HTMLParser): """ 从HTMLParser类中继承,用来爬虫url里的图片链接 """ def __init__(self, pattern, url): """ 初始化的时候调用,将links设置为空。这里的links为字典结构 Args: pattern: 抓取的类型 url: 要抓取的url """ HTMLParser.HTMLParser.__init__(self) self.links = [] self.re = pattern self.url = url def handle_starttag(self, tag, attrs): """ 处理图片连接 Args: tag: 标签 attrs: 属性 """ if len(attrs) == 0: pass else: for (variable, value) in attrs: tmp = None if variable == "src": if value[:4] == "http": tmp = value else: tmp = urlparse.urljoin(self.url, value) if tmp is not None and self.re.match(tmp): self.links.append(tmp) def __enter__(self): """ 重写上下文对象管理 """ return self def __exit__(self, exc_type, exc_val, exc_tb): """ 重写上下文对象管理 """ if exc_tb is not None: return False else: return True class SubLinks(HTMLParser.HTMLParser): """ 从HTMLParser类中继承 """ def __init__(self, url): """ 初始化的时候调用,将links设置为空。这里的links为字典结构 Args: url: 要处理的url """ HTMLParser.HTMLParser.__init__(self) self.url = url self.links = set() def handle_starttag(self, tag, attrs): """ execution function Args: tag: 标签 attrs: 属性 """ for (variable, value) in attrs: if tag == "a": for (variable, value) in attrs: if variable == "href": link = None if value.startswith("http"): link = value.strip() elif value.startswith("javascript"): links = value.split("=") if len(links) < 2: continue tmp = links[1].strip("\"\'").strip() if tmp.startswith("http") or tmp.startswith("www"): link = tmp else: link = urlparse.urljoin(self.url, tmp) else: link = urlparse.urljoin(self.url, value) if link is not None: self.links.add(link) def handle_endtag(self, tag): """ end handler Args: tag: 标签 """ pass def handle_data(self, data): """ data information Args: data: 数据 """ pass def __enter__(self): """ 重写上下文对象管理 """ return self def __exit__(self, exc_type, exc_val, exc_tb): """ 重写上下文对象管理 """ if exc_tb is not None: return False else: return True if __name__ == "__main__": """ a test case """ url = 
"http://pycm.baidu.com:8081//3/page3_4.html" fp = urllib.urlopen(url) target = ".*\.(gif|png|jpg|bmp)$" data = fp.read() fp.close() link_demo = PicLinks(target, url) link_demo.feed(data) link_demo.close() for link in link_demo.links: print link url = "http://pycm.baidu.com:8081/" fp = urllib.urlopen(url) data = fp.read() fp.close() link_demo = SubLinks(url) link_demo.feed(data) link_demo.close() for link in link_demo.links: print link
我研究研究,谢谢。
hellsb 2016-05-09
  • 打赏
  • 举报
回复
引用 1 楼 uiuiy1 的回复:
# Fetch the index page and pull out every image URL referenced by a src="..."
# attribute.  (Python 2: urllib2 was merged into urllib.request in Python 3.)
import re, urllib2

html = urllib2.urlopen('http://www.souutu.com/mnmm/index.html').read()
# BUG FIX: the original pattern said "pgn" -- a typo for "png" -- so every
# .png image was silently skipped.  Raw string avoids escape surprises.
imgUrl = re.findall(r'src="(.+?\.(?:jpg|png))"', html)  # these are the image addresses you asked for
这种只能获取首页的图片,没意思的。
关山路遥 2016-05-09
  • 打赏
  • 举报
回复
# logger_util.py
import logging
import os

# Map this module's numeric levels onto the logging module's levels.
LEVEL = {
    0: logging.DEBUG,
    1: logging.INFO,
    2: logging.ERROR
}
DEBUG = 0
INFO = 1
ERROR = 2

# Module-wide logger instance, created by init_logger().
spider_logger = None


def init_logger(file, level):
    """Initialise the module-wide log file.

    Args:
        file: log file written to
        level: log level (DEBUG / INFO / ERROR)

    Returns:
        1: success
        0: fail
    """
    global spider_logger
    # BUG FIX: the original checked "spider_logger is None" after the
    # constructor, which can never be true (a constructor either returns an
    # instance or raises), so the documented failure path was unreachable.
    # Catch the errors Logger() can actually raise instead.
    try:
        spider_logger = Logger(file, level)
    except (OSError, IOError, KeyError):
        return 0
    return 1


class Logger(object):
    """Thin wrapper around the standard logging module for this spider."""

    def __init__(self, file="log", level=DEBUG, path='log'):
        """Set up a logger writing both to a file and to the console.

        Args:
            file: log file name
            level: log level (a key of LEVEL)
            path: directory the log file is placed in (created if missing)
        """
        if not os.path.exists(path):
            os.mkdir(path)
        self.file = os.path.join(path, file)
        self.logger = logging.getLogger(self.file)
        self.logger.setLevel(LEVEL[level])
        # BUG FIX: logging.getLogger() returns the SAME object for the same
        # name, and the original added fresh handlers on every construction --
        # a second Logger(...) for the same file duplicated every log line.
        # Only attach handlers the first time this named logger is set up.
        if not self.logger.handlers:
            fmt = '%(asctime)s - %(levelname)s - %(message)s'
            formatter = logging.Formatter(fmt)
            file_handler = logging.FileHandler(self.file, 'a')
            file_handler.setLevel(LEVEL[level])
            file_handler.setFormatter(formatter)
            stream_handler = logging.StreamHandler()
            stream_handler.setLevel(LEVEL[level])
            stream_handler.setFormatter(formatter)
            self.logger.addHandler(file_handler)
            self.logger.addHandler(stream_handler)

    def debug(self, debug):
        """Log at debug level.

        Args:
            debug: message to record (None is silently ignored)
        """
        if debug is not None:
            self.logger.debug(debug)

    def info(self, info):
        """Log at info level.

        Args:
            info: message to record (None is silently ignored)
        """
        if info is not None:
            self.logger.info(info)

    def error(self, error):
        """Log at error level.

        Args:
            error: message to record (None is silently ignored)
        """
        if error is not None:
            self.logger.error(error)
关山路遥 2016-05-09
  • 打赏
  • 举报
回复
# mini_spider.py
import signal
import threading
import sys
import getopt
import ConfigParser
import Queue
import time
import urllib
import urllib2
import re
import os
import socket

import logger_util
import links_parser


class CountDownLatch(object):
    """Synchronisation counter: counts a fixed number of events down to zero."""

    def __init__(self, num):
        """Initialise the latch.

        Args:
            num: number of events to wait for
        """
        self.num = num
        self.lock = threading.Lock()

    def down(self):
        """Decrement the counter once (thread-safe)."""
        with self.lock:
            self.num -= 1

    def status(self):
        """Check the current state.

        Returns:
            1: every event has fired
            0: events are still outstanding
        """
        with self.lock:
            if self.num <= 0:
                return 1
            return 0


class SafeUrlSet(object):
    """Thread-safe record of parsed URLs, kept as a url -> depth dict
    (depth is the level at which the url was discovered)."""

    def __init__(self):
        """Initialise the empty dict and its lock."""
        self.dict = {}
        self.lock = threading.Lock()

    def add(self, url, depth):
        """Record a url and its discovery depth.  A url already seen at a
        shallower level is not reprocessed when rediscovered deeper.

        Args:
            url: url to add
            depth: level it was discovered at

        Returns:
            1: needs parsing
            0: does not need parsing
        """
        if depth < 0:
            return 0
        with self.lock:
            # BUG FIX: the original tested "url in self.dict.keys()", which
            # builds a full key list on every call (O(n) per lookup);
            # membership on the dict itself is O(1) and means the same thing.
            if url in self.dict:
                concurrent_depth = self.dict[url]
                if depth < concurrent_depth:
                    self.dict[url] = depth
                    return 1
                return 0
            self.dict[url] = depth
            return 1

    def pop(self):
        """Remove one element from the dict and return it.

        Returns:
            url: the removed url
            None: the dict is empty
        """
        with self.lock:
            if len(self.dict) > 0:
                # BUG FIX: the original used self.dict.keys()[0], which
                # materialises every key just to take one (and raises
                # TypeError on Python 3, where keys() is a view).
                url = next(iter(self.dict))
                self.dict.pop(url)
                return url
            return None

    def empty(self):
        """Check whether the dict is empty.

        Returns:
            0: the dict is not empty
            1: the dict is empty
        """
        with self.lock:
            if len(self.dict) > 0:
                return 0
            return 1


class SafeQueue(object):
    """Queue wrapper whose check-then-get / check-then-put pairs are atomic
    with respect to other SafeQueue callers."""

    def __init__(self, num):
        """Initialise the bounded queue.

        Args:
            num: maximum number of queued items
        """
        self.queue = Queue.Queue(num)
        self.lock = threading.Lock()

    def get(self):
        """Take one item from the queue.

        Returns:
            item: the removed item, or None when the queue is empty
        """
        with self.lock:
            if self.queue.empty():
                return None
            return self.queue.get()

    def put(self, item):
        """Put an item on the queue.

        Args:
            item: item to enqueue

        Returns:
            True: success
            False: the queue is full
        """
        with self.lock:
            if self.queue.full():
                return False
            self.queue.put(item)
            return True

    def empty(self):
        """Check whether the queue is empty.

        Returns:
            True: queue empty
            False: queue not empty
        """
        with self.lock:
            return self.queue.empty()
关山路遥 2016-05-09
  • 打赏
  • 举报
回复
links_parser.py import urllib import HTMLParser import urlparse class PicLinks(HTMLParser.HTMLParser): """ 从HTMLParser类中继承,用来爬虫url里的图片链接 """ def __init__(self, pattern, url): """ 初始化的时候调用,将links设置为空。这里的links为字典结构 Args: pattern: 抓取的类型 url: 要抓取的url """ HTMLParser.HTMLParser.__init__(self) self.links = [] self.re = pattern self.url = url def handle_starttag(self, tag, attrs): """ 处理图片连接 Args: tag: 标签 attrs: 属性 """ if len(attrs) == 0: pass else: for (variable, value) in attrs: tmp = None if variable == "src": if value[:4] == "http": tmp = value else: tmp = urlparse.urljoin(self.url, value) if tmp is not None and self.re.match(tmp): self.links.append(tmp) def __enter__(self): """ 重写上下文对象管理 """ return self def __exit__(self, exc_type, exc_val, exc_tb): """ 重写上下文对象管理 """ if exc_tb is not None: return False else: return True class SubLinks(HTMLParser.HTMLParser): """ 从HTMLParser类中继承 """ def __init__(self, url): """ 初始化的时候调用,将links设置为空。这里的links为字典结构 Args: url: 要处理的url """ HTMLParser.HTMLParser.__init__(self) self.url = url self.links = set() def handle_starttag(self, tag, attrs): """ execution function Args: tag: 标签 attrs: 属性 """ for (variable, value) in attrs: if tag == "a": for (variable, value) in attrs: if variable == "href": link = None if value.startswith("http"): link = value.strip() elif value.startswith("javascript"): links = value.split("=") if len(links) < 2: continue tmp = links[1].strip("\"\'").strip() if tmp.startswith("http") or tmp.startswith("www"): link = tmp else: link = urlparse.urljoin(self.url, tmp) else: link = urlparse.urljoin(self.url, value) if link is not None: self.links.add(link) def handle_endtag(self, tag): """ end handler Args: tag: 标签 """ pass def handle_data(self, data): """ data information Args: data: 数据 """ pass def __enter__(self): """ 重写上下文对象管理 """ return self def __exit__(self, exc_type, exc_val, exc_tb): """ 重写上下文对象管理 """ if exc_tb is not None: return False else: return True if __name__ == "__main__": """ a test case """ url = 
"http://pycm.baidu.com:8081//3/page3_4.html" fp = urllib.urlopen(url) target = ".*\.(gif|png|jpg|bmp)$" data = fp.read() fp.close() link_demo = PicLinks(target, url) link_demo.feed(data) link_demo.close() for link in link_demo.links: print link url = "http://pycm.baidu.com:8081/" fp = urllib.urlopen(url) data = fp.read() fp.close() link_demo = SubLinks(url) link_demo.feed(data) link_demo.close() for link in link_demo.links: print link

37,721

社区成员

发帖
与我相关
我的任务
社区描述
JavaScript,VBScript,AngelScript,ActionScript,Shell,Perl,Ruby,Lua,Tcl,Scala,MaxScript 等脚本语言交流。
社区管理员
  • 脚本语言(Perl/Python)社区
  • IT.BOB
加入社区
  • 近7日
  • 近30日
  • 至今

试试用AI创作助手写篇文章吧