python写爬虫遇到的一些问题

MIMImmm 2016-04-12 07:41:44
for html in htmlList:
temp += 1
print temp, html
n = random.randint(-1, 19)
headers = {
'User-Agent': user_agent,
}
try:
request = urllib2.Request(html, headers=headers)
content = urllib2.urlopen(request)
except urllib2.URLError as e:
print e.message
except urllib2.HTTPError as h:
print h.message

bs_obj = BeautifulSoup(content.read(), "html.parser")

a_list = bs_obj.find_all("a", href=re.compile("v\.youku\.com\w?"))

for a in a_list:
if a.attrs.get('href') and a.attrs['href'].strip('\n\t') not in totalHtmlPage:
if a.attrs['href'] != re.compile("index\w?"):
nextHtmlList.append(a.attrs['href'].strip('\n\t'))


每次都会报这个错误
Traceback (most recent call last):
File "/Users/wanghe/Desktop/MusicPopularTrendPrediction/CollectData.py", line 81, in <module>
htmlList = Crawler(htmlList)
File "/Users/wanghe/Desktop/MusicPopularTrendPrediction/CollectData.py", line 56, in Crawler
content = urllib2.urlopen(request)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 431, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 449, in _open
'_open', req)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1200, in do_open
r = h.getresponse(buffering=True)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 1132, in getresponse
response.begin()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/httplib.py", line 409, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
socket.error: [Errno 54] Connection reset by peer
...全文
391 1 打赏 收藏 转发到动态 举报
写回复
用AI写文章
1 条回复
切换为时间正序
请发表友善的回复…
发表回复
屎克螂 2016-04-13
  • 打赏
  • 举报
回复
socket.error: [Errno 54] Connection reset by peer 我遇到过 用2g上网卡上网 网络不好的情况经常出现

37,721

社区成员

发帖
与我相关
我的任务
社区描述
JavaScript,VBScript,AngleScript,ActionScript,Shell,Perl,Ruby,Lua,Tcl,Scala,MaxScript 等脚本语言交流。
社区管理员
  • 脚本语言(Perl/Python)社区
  • IT.BOB
加入社区
  • 近7日
  • 近30日
  • 至今

试试用AI创作助手写篇文章吧