from scrapy.http import Request, FormRequest
from scrapy.selector import Selector
from scrapy.utils.response import get_base_url
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from itzhaopin.items import *
from itzhaopin.misc.log import *
class TencentSpider(CrawlSpider):
    name = "login_dingdian"
    allowed_domains = ["23us.com"]  # restrict crawling to this domain
    start_urls = [
        "http://www.23us.com/"
    ]
    rules = [  # rules defining which URLs to follow and parse
        Rule(SgmlLinkExtractor(allow=(r"/modules/article/bookcase\.php\?",)),
             follow=True, callback='parse_item')
    ]
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://www.23us.com/login.php?do=submit"
    }
    def start_requests(self):
        # !!!! This is the line in question: if I change 'cookiejar' to
        # 'cookie', or remove the meta argument entirely, the program runs fine
        return [Request("http://www.23us.com/login.php?do=submit",
                        meta={'cookiejar': 1},
                        headers=self.headers,
                        callback=self.post_login)]
    def post_login(self, response):
        print '---------------Preparing login------------'
        # after a successful login, the after_login callback is invoked
        return [FormRequest.from_response(response,
            # !!!! Same here: changing 'cookiejar' to 'cookie' or removing
            # this line makes the program run successfully
            meta={'cookiejar': response.meta['cookiejar']},
            formdata={
                'username': 'whtest',
                'password': '******',
                'usecookie': '2592000',
                'action': 'login',
                'submit': ' 登录 '  # the login button label (pasted above as GBK mojibake " µÇÂ¼ ")
            },
            callback=self.after_login,
            dont_filter=True,
        )]
    def after_login(self, response):
        # re-issue the start URLs carrying the logged-in cookie jar;
        # yield (not return) so every start URL is requested, not just the first
        for url in self.start_urls:
            req = Request(url)
            # !!!! Same here: changing 'cookiejar' to 'cookie' or removing
            # this line makes the program run successfully
            req.meta['cookiejar'] = response.meta['cookiejar']
            yield req
    def parse_item(self, response):
        # extract the data into Items, mainly with XPath and CSS selectors
        print '----------------------------entering parse_item----------'
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.grid')
        for site in sites_even:
            print "-----------------------in the loop, picking out fields---------------------"
            item = TencentItem()
            item['name'] = site.css('td.even a[target="_blank"]').xpath('text()').extract()
            item['updatechapter'] = site.css('td.odd a[target="_blank"]').xpath('text()').extract()
            items.append(item)
            print repr(item).decode("unicode-escape") + '\n'
        return items
    def _process_request(self, request):
        return request
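
For reference: in Scrapy, 'cookiejar' is not an arbitrary meta key. It is consumed by the built-in CookiesMiddleware to keep multiple cookie sessions per spider, whereas a made-up key such as 'cookie' is opaque data that no middleware touches. That would be consistent with the behaviour described in the comments above: renaming or removing the key means the cookie-handling code path is never exercised, so whatever breaks inside it never runs (worth checking that COOKIES_ENABLED is not set to False and that CookiesMiddleware is not disabled in settings.py). Below is a minimal sketch of the usage pattern from the Scrapy docs, against hypothetical example.com URLs, not a fix for the spider above:

import scrapy

class CookieDemoSpider(scrapy.Spider):
    name = "cookie_demo"

    def start_requests(self):
        # one cookie session per start URL, keyed by an integer jar id
        urls = ["http://www.example.com/a", "http://www.example.com/b"]
        for i, url in enumerate(urls):
            yield scrapy.Request(url, meta={'cookiejar': i},
                                 callback=self.parse_page)

    def parse_page(self, response):
        # the 'cookiejar' key is not "sticky": pass it along explicitly on
        # every follow-up request that should reuse the same session
        yield scrapy.Request("http://www.example.com/otherpage",
                             meta={'cookiejar': response.meta['cookiejar']},
                             callback=self.parse_other_page)

    def parse_other_page(self, response):
        pass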