求助各位大神,python爬虫出错

accesszhishen 2020-01-16 01:53:49
爬虫代码如下:
import scrapy
from career.items import CareerItem
import re


class A51jobSpider(scrapy.Spider):
    """Scrape 51job.com Python job listings.

    Walks the paginated search results, follows every job-detail link found
    in the result list, and yields one populated ``CareerItem`` per posting.
    """

    name = 'a51job'
    allowed_domains = ['www.51job.com', 'jobs.51job.com']
    start_urls = ['http://www.51job.com/']
    # Browser-like headers sent with detail-page requests so jobs.51job.com
    # serves the full HTML instead of rejecting a bare crawler request.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'guid=7f99087a0a6340b0e50b7674ff5d3ef7; 51job=cenglish%3D0%26%7C%26; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; search=jobarea%7E%60020000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60020000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60020000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAapple%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60020000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C7%E9%B1%A8%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21',
        'Host': 'jobs.51job.com',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
    }

    # Strips any HTML tag. Raw string so '\w' is not treated as an invalid
    # string escape (DeprecationWarning on Python 3.6+).
    reg = re.compile(r'</?\w+[^>]*>')

    def start_requests(self):
        """Request the paginated search-result pages (currently page 1 only)."""
        for page in range(1, 50):
            # NOTE: the pasted source had '&degreefrom' mojibake'd into the
            # degree sign ('°reefrom' — '&deg' was HTML-entity decoded by the
            # forum); restored here to a valid query-string parameter.
            url = ('https://search.51job.com/list/020000,000000,0000,00,9,99,python,2,'
                   + str(page) +
                   '.html?lang=c&postchannel=0000&workyear=99&cotype=99'
                   '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
                   '&dibiaoid=0&line=&welfare=')
            yield scrapy.Request(url, callback=self.parse)
            break  # debugging aid: stop after the first page for now

    def parse(self, response):
        """Extract job-detail links from one search-result page and follow them."""
        for i in range(4, 55):
            url = response.xpath(
                '//*[@id="resultList"]/div[' + str(i) + ']/p/span/a/@href'
            ).extract_first()
            # BUG FIX: extract_first() returns None when result row i does not
            # exist (pages can have fewer than 51 rows); passing None to
            # scrapy.Request raised the reported
            # "TypeError: Request url must be str or unicode, got NoneType".
            if not url:
                continue
            yield scrapy.Request(url, callback=self.parsDetail, headers=self.header)

    def parsDetail(self, response):
        """Parse a single job-detail page into a ``CareerItem``."""
        jobTitle = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()').extract_first()
        company = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/@title').extract_first()
        tempStr = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/@title').extract_first()
        print(jobTitle)  # debug output
        # Robustness: the @title attribute can be absent on non-standard
        # detail pages; skip such pages instead of crashing on None.split().
        if tempStr is None:
            return
        # Field order observed on the page: area | experience | education |
        # headcount | publish date. Pad so a short title string yields empty
        # fields instead of an IndexError.
        arr = tempStr.split('|')
        arr += [''] * (5 - len(arr))
        area = arr[0].strip()
        exp = arr[1].strip()
        edu = arr[2].strip()
        num = arr[3].strip()       # parsed but not yet stored on the item
        pub_date = arr[4].strip()  # parsed but not yet stored on the item
        salary = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/strong/text()').extract_first()
        job_info = response.xpath('/html/body/div[3]/div[2]/div[3]/div[1]/div').extract_first()

        item = CareerItem()
        item['job_name'] = jobTitle
        item['company_name'] = company
        item['company_city'] = area
        item['job_exp'] = exp
        item['job_degree'] = edu
        item['job_salary'] = salary
        # Turn the sp4 tag boundaries into '|' separators, then strip all
        # remaining HTML tags. Guard against a missing tips block (None).
        job_tips = response.xpath('/html/body/div[3]/div[2]/div[2]/div/div[1]/div/div').extract_first()
        if job_tips:
            job_tips = job_tips.replace('</span><span class="sp4">', '|')
            job_tips = self.reg.sub('', job_tips)
        item['job_tips'] = job_tips
        item['job_info'] = job_info
        item['company_contact'] = response.xpath('/html/body/div[3]/div[2]/div[3]/div[2]/div/p/text()').extract_first()
        item['company_info'] = response.xpath('/html/body/div[3]/div[2]/div[3]/div[3]/div').extract_first()
        item['company_type'] = response.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[1]/@title').extract_first()
        item['company_people_num'] = response.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[2]/@title').extract_first()
        item['company_industry'] = response.xpath('/html/body/div[3]/div[2]/div[4]/div[1]/div[2]/p[3]/@title').extract_first()
        print(item)  # debug output
        yield item

然后执行的时候提示:
2020-01-16 12:17:21 [scrapy.core.scraper] ERROR: Spider error processing <GET https://search.51job.com/list/020000,000000,0000,00,9,99,python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=> (referer: None)
Traceback (most recent call last):
File "C:\Users\YQL\AppData\Local\Programs\Python\Python37-32\lib\site-packages\scrapy\http\request\__init__.py", line 64, in _set_url
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

有大神知道是哪里错了吗?烦请各位指教,万分感谢呢
...全文
225 2 打赏 收藏 转发到动态 举报
写回复
用AI写文章
2 条回复
切换为时间正序
请发表友善的回复…
发表回复
accesszhishen 2020-01-19
  • 打赏
  • 举报
回复
引用 1 楼 冰风漫天 的回复:

    def parse(self, response):
        for i in range(4,55):
            url = response.xpath('//*[@id="resultList"]/div['+str(i)+']/p/span/a/@href')\
                .extract_first()
            yield scrapy.Request(url,callback=self.parsDetail,headers=self.header)
是因为这个url会是None,加个判断处理吧
好的谢谢
冰风漫天 2020-01-18
  • 打赏
  • 举报
回复

    def parse(self, response):
        for i in range(4,55):
            url = response.xpath('//*[@id="resultList"]/div['+str(i)+']/p/span/a/@href')\
                .extract_first()
            yield scrapy.Request(url,callback=self.parsDetail,headers=self.header)
是因为这个url会是None,加个判断处理吧

37,719

社区成员

发帖
与我相关
我的任务
社区描述
JavaScript,VBScript,AngleScript,ActionScript,Shell,Perl,Ruby,Lua,Tcl,Scala,MaxScript 等脚本语言交流。
社区管理员
  • 脚本语言(Perl/Python)社区
  • IT.BOB
加入社区
  • 近7日
  • 近30日
  • 至今

试试用AI创作助手写篇文章吧