from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from dmoz.items import DmozItem
class DmozSpider(BaseSpider):
    """Spider whose crawl starts at the pizzahut.com.cn index page.

    Only the class-level configuration attributes are defined in this
    block; no response-handling callback is visible here.
    """

    # Identifier assigned to this spider.
    name = "dmoz.org"

    # NOTE(review): the start URL below is on pizzahut.com.cn, which is not
    # listed here -- confirm whether that domain should be added.
    allowed_domains = ["dmoz.org"]

    # NOTE(review): everything after '#' is a URL fragment, which HTTP
    # clients normally do not send to the server -- confirm that the data
    # needed is present in the plain index.aspx response.
    start_urls = [
        "http://www.pizzahut.com.cn/phdi/index.aspx#!handler/home.ashx?&tagid=&proid="
    ]
以下是错误信息:
--- <exception caught here> ---
File "C:\Python26\Lib\site-packages\twisted\internet\defer.py", line 542, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "C:\Python26\Lib\site-packages\scrapy\contrib\pipeline\media.py", line 41, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "C:\Python26\Lib\site-packages\scrapy\contrib\pipeline\images.py", line 308, in get_media_requests
return [Request(x) for x in item.get('image_urls', [])]
File "C:\Python26\Lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "C:\Python26\Lib\site-packages\scrapy\http\request\__init__.py", line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
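非常感谢你提供的网址,我按照其中的方法成功获得了图片的绝对路径。但是又出现了另一个问题:我调用 urljoin_rfc(base_url, relative_url) 函数之后,运行时抛出异常:Missing scheme in request url。(从上面的回溯看,异常实际是在图片管道的 get_media_requests 对 item 的 image_urls 逐个创建 Request 时抛出的,而且出错的 url 只有一个字符 "h",所以很可能是 image_urls 被赋成了单个字符串而不是列表。)然后我翻查了源代码: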
if ':' not in self._url:
raise ValueError('Missing scheme in request url: %s' % self._url)