3.7w+
社区成员
# Regex alternative: capture the aria-label value of every <div ...> tag in
# `content` whose first attribute is aria-label.
# (?i) makes the match case-insensitive; (?s) lets '.' span newlines.
arrs = re.findall(r'(?is)<div aria-label="(.*?)".*?>', content)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Function:
【整理】用BeautifulSoup查找属性值未知的标签
http://www.crifan.com/python_use_beautifulsoup_find_tag_with_unknown_attribute_value/
Author: Crifan Li
Version: 2013-07-17
Contact: http://www.crifan.com/about/me/
"""
from BeautifulSoup import BeautifulSoup;
def beautifulsoup_tag_attr_unknown():
    """Demo: find a tag by an attribute whose value is not known in advance.

    Parses a hard-coded HTML snippet, looks up the first ``<div>`` that
    carries an ``aria-label`` attribute (whatever its value is) and prints
    that value.  Nothing is returned.
    """
    html = """<div aria-label="5星, 747 份评分" class="rating" role="img" tabindex="-1">
<div>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
</div>
<span class="rating-count">
747 份评分
</span>
</div>"""
    soup = BeautifulSoup(html)
    # attrs={name: True} matches any tag that has the attribute at all,
    # regardless of the attribute's value.
    found_div = soup.find(name="div", attrs={"aria-label": True})
    if found_div is None:
        # find() yields None when nothing matches; indexing it would raise.
        print("no <div> with an aria-label attribute found")
        return
    attr_val = found_div['aria-label']
    # One pre-formatted argument prints identically with the Python 2 print
    # statement and the Python 3 print function.
    print("attrVal= %s" % attr_val)  # attrVal= 5星, 747 份评分
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script.
    beautifulsoup_tag_attr_unknown()
In [8]: from BeautifulSoup import BeautifulSoup
In [9]: root = BeautifulSoup(u"""<div aria-label="5星, 747 份评分" class="rating" role="img" tabindex="-1">
...: <div>
...: <span class="rating-star">
...: </span>
...: <span class="rating-star">
...: </span>
...: <span class="rating-star">
...: </span>
...: <span class="rating-star">
...: </span>
...: <span class="rating-star">
...: </span>
...: </div>
...: <span class="rating-count">
...: 747 份评分
...: </span>
...: </div>""")
In [10]: rating = root.findAll(attrs={"aria-lable": True})
In [11]: rating
Out[11]: []
In [12]: rating = root.findAll(attrs={"aria-label": True})
In [13]: rating
Out[13]:
[<div aria-label="5星, 747 份评分" class="rating" role="img" tabindex="-1">
<div>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
<span class="rating-star">
</span>
</div>
<span class="rating-count">
747 份评分
</span>
</div>]
In [14]: rating[0]['aria-label']
Out[14]: u'5\u661f, 747 \u4efd\u8bc4\u5206'
In [15]: print rating[0]['aria-label']
5星, 747 份评分