37,719
社区成员
发帖
与我相关
我的任务
分享
class TestSpider(CrawlSpider):
    """Crawl the type=4 question listing on wz.sun0769.com and scrape posts.

    Listing/pagination links are followed without a callback; links that
    look like ``/html/question/<digits>/<digits>.shtml`` are handed to
    :meth:`parse_content`, which builds one ``TutorialItem`` per page.
    """

    name = 'testSpider'
    num = 0
    # Kept for backward compatibility only: Scrapy does not read an
    # attribute with this name, so it never restricted the crawl.
    allow_domain = ['http://wz.sun0769.com/']
    # The attribute Scrapy actually consults for offsite filtering; entries
    # are bare domain names, not URLs.
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4']

    # A tuple (not a set): when several rules match the same link, Scrapy
    # applies the first one in definition order, so the order must be stable.
    # Patterns are raw strings so that ``\.``, ``\?`` and ``\d`` reach the
    # regex engine without relying on invalid string escapes.
    rules = (
        Rule(LxmlLinkExtractor(allow='page')),
        Rule(LxmlLinkExtractor(allow=r'/index\.php/question/questionType\?type=4$')),
        Rule(
            LxmlLinkExtractor(allow=r'/html/question/\d+/\d+\.shtml$'),
            callback='parse_content',
        ),
    )

    # XPath expressions used by parse_content, keyed by the field they feed.
    _x_query = {
        'title': '''//div[contains(@class, 'pagecenter p3')]/div/div/div[contains(@class,'cleft')]/strong/text()''',
        'content': '''//div[contains(@class, 'c1 text14_2')]/text()''',
        'content_first': '''//div[contains(@class, 'contentext')]/text()''',
    }

    def parse_content(self, response):
        """Build an item (url, number, title, content) from a question page.

        The body text is taken from the ``content_first`` XPath when it
        matches anything, otherwise from the ``content`` XPath. ``number`` is
        the part after the last ``:`` in the last space-separated token of
        the title.

        Returns the loaded item, or ``None`` (after logging) when the page
        has no title or no body text, instead of raising ``IndexError``.
        """
        content_nodes = response.xpath(self._x_query['content_first']).extract()
        if not content_nodes:
            content_nodes = response.xpath(self._x_query['content']).extract()
        title_nodes = response.xpath(self._x_query['title']).extract()

        if not content_nodes or not title_nodes:
            # The page does not have the expected layout; skip it rather
            # than crash on an empty result list.
            self.log('parse_content: title or content not found on %s' % response.url)
            return None

        # NOTE(review): str(<text>.encode('utf-8')) yields a UTF-8 byte
        # string on Python 2; on Python 3 it would yield the "b'...'" repr.
        # Kept as in the original -- confirm the target interpreter.
        content = str(content_nodes[0].encode('utf-8'))
        title = str(title_nodes[0].encode('utf-8'))

        last_token = title.split(' ')[-1]
        number = last_token.split(':')[-1]
        url = str(response.url)

        bbs_item_loader = ItemLoader(item=TutorialItem(), response=response)
        bbs_item_loader.add_value('url', url)
        bbs_item_loader.add_value('number', number)
        bbs_item_loader.add_value('title', title)
        bbs_item_loader.add_value('content', content)
        return bbs_item_loader.load_item()
...href="http://wz.sun0769.com/html/question/201506/279788.shtml" title=...
"http://wz.sun0769.com/html/question/201506/279788.shtml"是你要的部分, 但它不在html源码的行尾,后面还有title等字符。加了$后这个链接就匹配不出来。
----------------------------------------------------------
说错误, 说真正错误, 拷错误信息说真正错误,
贴代码, 贴完整代码, 用语法高亮贴完整代码.
[/quote]
原来是这样的啊……太谢谢了
...href="http://wz.sun0769.com/html/question/201506/279788.shtml" title=...
"http://wz.sun0769.com/html/question/201506/279788.shtml"是你要的部分, 但它不在html源码的行尾,后面还有title等字符。加了$后这个链接就匹配不出来。
----------------------------------------------------------
说错误, 说真正错误, 拷错误信息说真正错误,
贴代码, 贴完整代码, 用语法高亮贴完整代码.
Rule(LxmlLinkExtractor(allow='/index\.php/question/questionType\?type=4$')),
这个应该是用来获取页面链接的. 正则表达式中的$应去掉.