# links_parser.py
"""HTML link extractors built on the standard-library ``HTMLParser``.

``PicLinks`` collects ``src`` attribute URLs that match a pattern (for
example image files); ``SubLinks`` collects the targets of ``<a href=...>``
anchors. Relative URLs are resolved against the URL of the parsed page.
"""
import re
from contextlib import closing

try:  # Python 3 module layout
    from html.parser import HTMLParser
    from urllib.parse import urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2 module layout (what this file was written for)
    from HTMLParser import HTMLParser
    from urlparse import urljoin
    from urllib import urlopen


class PicLinks(HTMLParser):
    """Collect ``src`` attribute URLs that match a pattern.

    Matches are appended to ``self.links`` (a list) in document order;
    duplicates are kept.
    """

    def __init__(self, pattern, url):
        """Create a parser with an empty ``links`` list.

        Args:
            pattern: regular expression the absolute URL must match; either
                a pattern string or an already compiled pattern object.
            url: URL of the page being parsed, used to resolve relative
                links.
        """
        # Explicit base-class call (rather than super()) keeps this working
        # with the Python 2 HTMLParser as well.
        HTMLParser.__init__(self)
        self.links = []
        # A plain string has no .match() method, so compile it up front;
        # anything that already offers .match() is used unchanged.
        self.re = pattern if hasattr(pattern, "match") else re.compile(pattern)
        self.url = url

    def handle_starttag(self, tag, attrs):
        """Record every matching ``src`` URL found on a start tag.

        Args:
            tag: tag name (not inspected; any tag carrying ``src`` counts).
            attrs: list of ``(name, value)`` attribute pairs.
        """
        for name, value in attrs:
            # Valueless attributes such as ``<img src>`` arrive with value
            # None; there is nothing to resolve for them.
            if name != "src" or value is None:
                continue
            if value[:4] == "http":
                candidate = value
            else:
                candidate = urljoin(self.url, value)
            if self.re.match(candidate):
                self.links.append(candidate)

    def __enter__(self):
        """Return the parser itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block without suppressing exceptions.

        Returns:
            False when a traceback is present (the exception propagates),
            True otherwise.
        """
        return exc_tb is None


class SubLinks(HTMLParser):
    """Collect the link targets of ``<a>`` tags into the set ``self.links``."""

    def __init__(self, url):
        """Create a parser with an empty ``links`` set.

        Args:
            url: URL of the page being parsed, used to resolve relative
                links.
        """
        HTMLParser.__init__(self)
        self.url = url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` targets of an ``<a>`` start tag.

        Args:
            tag: tag name; everything except ``a`` is ignored.
            attrs: list of ``(name, value)`` attribute pairs.
        """
        if tag != "a":
            return
        for name, value in attrs:
            # ``<a href>`` without a value is reported as None: skip it.
            if name != "href" or value is None:
                continue
            if value.startswith("http"):
                link = value.strip()
            elif value.startswith("javascript"):
                # e.g. javascript:location.href='page.html' -- the segment
                # after the first "=" is treated as the target.
                parts = value.split("=")
                if len(parts) < 2:
                    continue
                target = parts[1].strip("\"'").strip()
                if target.startswith(("http", "www")):
                    link = target
                else:
                    link = urljoin(self.url, target)
            else:
                link = urljoin(self.url, value)
            self.links.add(link)

    def handle_endtag(self, tag):
        """Ignore end tags.

        Args:
            tag: tag name.
        """
        pass

    def handle_data(self, data):
        """Ignore text content.

        Args:
            data: text between tags.
        """
        pass

    def __enter__(self):
        """Return the parser itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leave the ``with`` block without suppressing exceptions.

        Returns:
            False when a traceback is present (the exception propagates),
            True otherwise.
        """
        return exc_tb is None


def _fetch(url):
    """Download ``url`` and return the response body as text."""
    # closing() releases the connection even if read() raises.
    with closing(urlopen(url)) as response:
        data = response.read()
    # Python 3 returns bytes, which HTMLParser.feed() does not accept.
    # NOTE(review): UTF-8 is assumed here; the page encoding is not checked.
    if not isinstance(data, str):
        data = data.decode("utf-8", "replace")
    return data


def _demo():
    """Print the picture links of one page and the sub-links of another."""
    url = "http://pycm.baidu.com:8081//3/page3_4.html"
    target = r".*\.(gif|png|jpg|bmp)$"
    link_demo = PicLinks(target, url)
    link_demo.feed(_fetch(url))
    link_demo.close()
    for link in link_demo.links:
        print(link)

    url = "http://pycm.baidu.com:8081/"
    link_demo = SubLinks(url)
    link_demo.feed(_fetch(url))
    link_demo.close()
    for link in link_demo.links:
        print(link)


if __name__ == "__main__":
    _demo()
import re
from contextlib import closing

try:  # Python 2, which this snippet was written for
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

# Download the index page; closing() releases the connection even if
# read() raises.
with closing(urlopen('http://www.souutu.com/mnmm/index.html')) as response:
    html = response.read()
# Python 3 hands back bytes; decode so the text pattern below can be applied.
# NOTE(review): UTF-8 is assumed; undecodable bytes are replaced.
if not isinstance(html, str):
    html = html.decode('utf-8', 'replace')
# These are the image URLs you asked for: every src="..." value ending in
# .jpg or .png. The second extension was previously spelled "pgn", so PNG
# images were never matched.
imgUrl = re.findall(r'src="(.+?\.(?:jpg|png))"', html)
37,721
社区成员
34,239
社区内容
加载中
试试用AI创作助手写篇文章吧