Python3 谷歌翻译爬虫 urllib.error.HTTPError: HTTP Error 403: Forbidden
爬虫小白
在编写谷歌翻译爬虫时,程序一直报如下错误:
File "E:\MyDownloads\Download\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "E:\MyDownloads\Download\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "E:\MyDownloads\Download\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "E:\MyDownloads\Download\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden
但是我已经添加了user-agent了。还是提示该错误。
现在主要怀疑两点:
1. Google 翻译的机制是否已经完全改变,导致这段代码失效?
2. User-Agent 的设置是否有误?
代码部分参考了http://blog.csdn.net/u013289188/article/details/38360477
import re
import urllib.request
import urllib.parse
import random
def Gtranslate(text):
    """Translate an English sentence to Simplified Chinese via Google Translate.

    The text is POSTed as form data to ``http://translate.google.cn/`` with a
    randomly chosen browser User-Agent, and the translation is cut out of the
    returned HTML by looking for the ``TRANSLATED_TEXT=`` marker.

    Args:
        text: The English sentence to translate.

    Returns:
        The text found between ``TRANSLATED_TEXT=`` and the next ``;`` in the
        response, with the ``;`` removed. Surrounding quote characters, if any,
        are left in place for the caller to strip.

    Raises:
        urllib.error.HTTPError: If the server answers with an error status
            (for example ``403 Forbidden``).
        ValueError: If the response does not contain ``TRANSLATED_TEXT=``.
    """
    # Form fields sent to the service:
    #   hl       - browser / operating-system language (zh-CN here)
    #   ie       - input encoding (UTF-8 here)
    #   text     - the string to translate
    #   langpair - language pair; 'en'|'zh-CN' means English -> Simplified Chinese
    values = {'hl': 'zh-CN', 'ie': 'UTF-8', 'text': text, 'langpair': "'en'|'zh-CN'"}
    url = 'http://translate.google.cn/'

    # URL-encode the form fields; passing a body makes this a POST request.
    data = urllib.parse.urlencode(values).encode("utf-8")
    req = urllib.request.Request(url, data)

    # Pretend to be an ordinary browser by picking one User-Agent at random.
    # NOTE(review): the request itself is unchanged from the original code, so
    # a 403 returned by the server for this URL/form may still occur -- confirm
    # against the current behaviour of the service.
    user_agents = (
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )
    req.add_header('User-Agent', random.choice(user_agents))

    # Send the request; the context manager closes the connection afterwards.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"

    # read() returns bytes; decode first so the str regex below can be applied
    # (a str pattern cannot be used on a bytes object in Python 3).
    html = raw.decode(charset, errors="replace")

    # The translated text is whatever follows 'TRANSLATED_TEXT=' up to the
    # next ';' in the returned page.
    match = re.search(r"(?<=TRANSLATED_TEXT=).*?;", html)
    if match is None:
        raise ValueError("TRANSLATED_TEXT= marker not found in the response")
    return match.group(0).strip(';')
if __name__ == "__main__":
    # Demo run: translate one sample string and print input and result.
    source_text = 'Hello'
    print('The input text: %s' % (source_text,))
    raw_result = Gtranslate(source_text)
    # The extracted value is wrapped in single quotes; drop them for display.
    translated = raw_result.strip("'")
    print('Translated End,The output text: %s' % (translated,))
万分感谢!