37,719
社区成员
发帖
与我相关
我的任务
分享
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from hbzc.items import HbzcItem
class HbzcsSpider(scrapy.Spider):
    """Spider for announcements published on www.ccgp-hebei.gov.cn.

    Request flow:

    1. ``parse`` reads the pre-award announcement list (``start_urls``) and
       requests the detail page of every listed announcement.
    2. ``parse_item`` scrapes ``fs``/``dl``/``zb``/``mc`` from a pre-award
       detail page into a new ``HbzcItem`` and requests the tender list page.
    3. ``parse_item0`` reads the tender list page and requests every listed
       tender announcement detail page, carrying the item along in ``meta``.
    4. ``parse_item1`` adds ``ys``/``kb`` from the tender detail page and
       emits the finished item.

    NOTE(review): step 3 pairs each pre-award item with *every* tender
    announcement on the list page, exactly as the original code did; whether
    the two should instead be matched per project needs to be confirmed.
    """

    name = "hbzcs"
    city = '?citycode=130000000-130700000-130728000&cityname=河北省张家口市怀安县'
    allowed_domains = ["www.ccgp-hebei.gov.cn"]
    start_urls = (
        # Pre-award announcement list.
        'http://www.ccgp-hebei.gov.cn/zfcg/web/getPreWinAnncList_1.html' + city,
    )

    @staticmethod
    def _announcement_ids(response, table_id):
        """Return the first quoted argument of each row's ``onclick`` handler.

        :param response: response of a list page.
        :param table_id: ``id`` attribute of the table whose rows are read.
        :returns: list of strings, one per row that has a quoted argument.
        """
        onclicks = response.xpath(
            '//*[@id="%s"]/tr/@onclick' % table_id).extract()
        ids = []
        for onclick in onclicks:
            parts = onclick.split("'")
            # Rows whose handler carries no quoted argument are skipped
            # instead of aborting the whole page with an IndexError.
            if len(parts) > 1:
                ids.append(parts[1])
        return ids

    def parse(self, response):
        """Yield one detail-page request per pre-award announcement row."""
        prefix = 'http://www.ccgp-hebei.gov.cn/zfcg/preBidingAnncDetail_'
        suffix = '.html'
        # Pre-award announcement detail pages.
        for annc_id in self._announcement_ids(response, 'moreprewinannctable'):
            yield Request(prefix + annc_id + suffix, callback=self.parse_item)

    def parse_item0(self, response):
        """Yield one tender-detail request per row of the tender list page.

        Expects the partially filled item under ``response.meta['item']``.
        """
        prefix = 'http://www.ccgp-hebei.gov.cn/zfcg/1/bidingAnncDetail_'
        suffix = '.html'
        item = response.meta['item']
        # Tender announcement detail pages.
        for annc_id in self._announcement_ids(response, 'moredingannctable'):
            yield Request(
                url=prefix + annc_id + suffix,
                callback=self.parse_item1,
                # Each request gets its own copy; sharing one object would let
                # every parse_item1 call overwrite the others' fields.
                # Assumes HbzcItem is a scrapy Item (provides ``copy()``).
                meta={'item': item.copy()},
                # The same detail URLs are requested once per pre-award item,
                # so the duplicate filter must be bypassed.
                dont_filter=True,
            )

    def parse_item(self, response):
        """Scrape a pre-award detail page and request the tender list page.

        :returns: a request for the tender list page carrying the new item
            in ``meta['item']``.
        """
        sel = response.xpath
        item = HbzcItem()
        item['fs'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[1]/td/table/tr[1]/td[4]/text()').extract()
        item['dl'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[1]/td/table/tr[3]/td[6]/text()').extract()
        item['zb'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[7]/td/span/span[1]/text()').extract()
        item['mc'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[1]/td/table/tr[2]/td[2]/text()').extract()
        # Every item requests this same URL. Without dont_filter the
        # duplicate filter drops all but the first request, so only one
        # project's values ever reached the output.
        return Request(
            'http://www.ccgp-hebei.gov.cn/zfcg/web/getBidingList_1.html?citycode=130000000-130700000-130728000',
            callback=self.parse_item0,
            meta={'item': item},
            dont_filter=True,
        )

    def parse_item1(self, response):
        """Add ``ys`` and ``kb`` from a tender detail page and return the item."""
        sel = response.xpath
        item = response.meta['item']
        item['ys'] = sel('/html/body/table/tr/td/table/tr[4]/td/table/tr[7]/td/span/span[8]/text()').extract()
        item['kb'] = sel('//*[@id="bidopentime2"]/text()').extract()
        # Parenthesised form prints the same on Python 2 and parses on Python 3.
        print(item)
        return item