37,744
社区成员




#coding:utf-8
import urllib
import urllib2
from lxml import etree as etree
if __name__ == "__main__":
#此段代码的目的是为了爬取下边网页上的“更新时间”
req_url = 'http://www.mumayi.com/android-81548.html'
try:
headers = {'User-Agent':'"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0"'}
req = urllib2.Request(req_url, headers = headers)
content = urllib2.urlopen(req, timeout=60).read()
if isinstance(content, unicode):
pass
else:
content = content.decode('utf-8')
#print content
htmlSource = etree.HTML(content)
names = htmlSource.find('.//ul[@class="istyle fl"]//li[4]') #问题出在节点“li”的序列号[4]上,只要加上li[4],结果就是None
print names.text, type(names)
<ul class="menu fl hidden" id="menu">
<li class="conBox"><strong>应用:</strong><a href="http://www.mumayi.com/android/xitonggongju"
如果我没搞错的话,class 属性的值要写完整;不写完整行不行我没试过,我自己也一直是这么用的!哈哈
# Suggested query from the reply: the predicate spells out the complete class
# attribute value of the <ul> ("menu fl hidden").
names = htmlSource.find('.//ul[@class="menu fl hidden"]//li[4]')
更新时间:
2014-06-19
# -*-coding: utf-8 -*-
import urllib
import urllib2
import lxml.html as HTML
if __name__ == "__main__":
#此段代码的目的是为了爬取下边网页上的“更新时间”
req_url = 'http://www.mumayi.com/android-81548.html'
headers = {'User-Agent':'"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0"'}
req = urllib2.Request(req_url, headers = headers)
content = urllib2.urlopen(req, timeout=60).read()
if isinstance(content, unicode):
pass
else:
content = content.decode('utf-8')
htmlSource = HTML.fromstring(content)
retrans_content_tags = htmlSource.xpath(u'//div[@class="c"][4]/child::text()|//div[@class="c"][$_i]/a[position()>1]/child::text()') #
names = htmlSource.xpath(u'//ul[@class="istyle fl"]/li[3]/span')
print names[0].text
time = htmlSource.xpath(u'//ul[@class="istyle fl"]/li[3]/child::text()')
print time[0]
加分加分!哈哈
#coding:utf-8
import urllib
import urllib2
import lxml.html as HTML
if __name__ == "__main__":
#此段代码的目的是为了爬取下边网页上的“更新时间”
req_url = 'http://www.mumayi.com/android-81548.html'
headers = {'User-Agent':'"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0"'}
req = urllib2.Request(req_url, headers = headers)
content = urllib2.urlopen(req, timeout=60).read()
if isinstance(content, unicode):
pass
else:
content = content.decode('utf-8')
# print content
htmlSource = HTML.fromstring(content)
print htmlSource
names = htmlSource.xpath(r'//ul[@class="menu fl hidden"]/li/strong') #问题出在节点“li”的序列号[4]上,只要加上li[4],结果就是None
for name in names:
print name.text
>>> ================================ RESTART ================================
>>>
<Element html at 0x2ab44e0>
应用:
游戏:
应用:
游戏:
>>>
我不知道你具体要爬哪一层,写了一个给你参考!