大神求答!!scrapy框架爬取搜房网爬不到数据

王大皓 2017-11-09 02:17:47
写了一个爬虫,爬取搜房网的新房数据,遇到一个问题,以重庆房源为例,当进入到房源详情的时候 获取到的数据(红色部分)都为空。 请问这是什么原因,Xpath地址肯定没错 。


# -*- coding: utf-8 -*-
import scrapy
import re
import sys
import urllib
import urllib2
import json

from scrapy.spiders import Spider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from sfw_data.items import SfwDataItem

class SfSpider(Spider):

name = "soufang"
allowed_domains = ["fang.com"]

#start_urls = ['http://esf.fang.com/housing/0__0_0_0_0_1_0_0/']
start_urls = []
for i in range(1,2):
start_urls.append('http://newhouse.cq.fang.com/house/s/b9'+str(i)+'/')
handle_httpstatus_list = [404,403]

def parse(self,response):
reload(sys)
sys.setdefaultencoding('utf8')

print '__________'
if response.status == 403:
print 'meet 403, sleep 600 sconds'
import time
time.sleep(1200)
yield Request(response.url,callback=self.parse)
#404,页面不存在,直接范围即可
elif response.status == 404:
print 'meet 404,return'
else:

hxs = scrapy.Selector(response)

for i in range(1,22):


name_ = hxs.xpath('/html/body/div[7]/div/div[1]/div[1]/div/div/ul/li['+str(i)+']/div/div[2]/div[1]/div[1]/a/text()').extract()
name = ''.join(name_)
strinfo = re.compile('\t')
name = strinfo.sub('',name)
strinfo = re.compile('\n')
name = strinfo.sub('',name)
if name!='':
item = SfwDataItem()
else:
continue
http = hxs.xpath('/html/body/div[7]/div/div[1]/div[1]/div/div/ul/li['+str(i)+']/div[1]/div[2]/div[1]/div[1]/a/@href').extract()
href = ''.join(http)

item['name'] = name.encode('utf-8')

item['link'] = href.encode('utf-8')

yield Request(href,callback=self.parse_detail,meta={'item':item})

print name
print '__________'

def parse_detail(self,response):

loc_hxs = scrapy.Selector(response)
href_other = loc_hxs.xpath('//*[@id="orginalNaviBox"]/a[2]/@href').extract()
href_other = ''.join(href_other)

item = response.meta['item']
yield Request(href_other,callback=self.parse_detail_other,meta={'item':item})

def parse_detail_other(self,response):

loc_hxs_other = scrapy.Selector(response)
item = response.meta['item']

area_covered = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[1]/div[2]/text()').extract() #占地面积
area_covered = ''.join(area_covered)
item['area_covered'] = area_covered.encode('utf-8')

volume_Rate = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[3]/div[2]/text()').extract() #容积率
volume_Rate = ''.join(volume_Rate)
item['volume_Rate'] = volume_Rate.encode('utf-8')

parking_Space = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[5]/div[2]/text()').extract() #停车位
parking_Space = ''.join(parking_Space)
item['parking_Space'] = parking_Space.encode('utf-8')

total_House= loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[7]/div[2]/text()').extract() #总户数
total_House = ''.join(total_House)
item['total_House'] = total_House.encode('utf-8')

property_Cost = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[9]/div[2]/text()').extract() #物业费
property_Cost = ''.join(property_Cost)
item['property_Cost'] = property_Cost.encode('utf-8')

floor_Condition = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[11]/div[2]/text()').extract() #楼层状况
floor_Condition = ''.join(floor_Condition)
item['floor_Condition'] = floor_Condition.encode('utf-8')

builtup_Area = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[2]/div[2]/text()').extract() #建筑面积
builtup_Area = ''.join(builtup_Area)
item['builtup_Area'] = builtup_Area.encode('utf-8')

greening_Rate = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[4]/div[2]/text()').extract() #绿化率
greening_Rate = ''.join(greening_Rate)
item['greening_Rate'] = greening_Rate.encode('utf-8')

total_Buildings = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[6]/div[2]/text()').extract() #楼栋总数
total_Buildings = ''.join(total_Buildings)
item['total_Buildings'] = total_Buildings.encode('utf-8')

property_Company = loc_hxs_other.xpath('/html/body/div[7]/div/div[1]/div[4]/ul/li[8]/div[2]/a/text()').extract() #物业公司

property_Company = ''.join(property_Company)
item['property_Company'] = property_Company.encode('utf-8')

yield item
print address
...全文
123 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

10,606

社区成员

发帖
与我相关
我的任务
社区描述
Web 开发 其他
社区管理员
  • 其他
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧