37,719
社区成员
发帖
与我相关
我的任务
分享
import urllib
import re
def fetch(id=90938184, debug=False):
    """Download a Baidu Zhidao question page and extract its text fragments.

    The page ``http://zhidao.baidu.com/question/<id>.html`` is read as a raw
    byte string and searched with regular expressions for the question
    title, the question body, the question supplement, the best answer and
    the remaining answers.

    Args:
        id: Question id used to build the page URL.  The name shadows the
            builtin ``id``; it is kept so keyword callers keep working.
        debug: When true, print the five lists of matches.  The matches are
            not decoded, so each list is printed as-is.

    Returns:
        None.  The extracted fragments are only printed when ``debug`` is
        set; they are not returned.
    """
    url = 'http://zhidao.baidu.com/question/' + str(id) + '.html'

    # Close the connection explicitly rather than relying on the temporary
    # response object being garbage-collected.
    response = urllib.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    # re.DOTALL lets ``.`` span newlines, since a fragment may cover
    # several lines of markup.
    title_re = re.compile(r'span class="question-title">(.*?)</span>', re.DOTALL)
    question_re = re.compile(r'<pre id="question-content">(.*?)</pre>', re.DOTALL)
    supplement_re = re.compile(r'<pre id="question-suply">(.*?)</pre>', re.DOTALL)
    best_re = re.compile(r'<pre id="best-answer-content".*?>(.*?)</pre>', re.DOTALL)
    others_re = re.compile(r'<pre class="reply-text mb10".*?>(.*?)</pre>', re.DOTALL)

    title = title_re.findall(page)
    qus = question_re.findall(page)
    qus2 = supplement_re.findall(page)
    best = best_re.findall(page)
    nbest = others_re.findall(page)

    if debug:
        # Single-argument print(...) behaves exactly like the print statement.
        print(title)
        print(qus)
        print(qus2)
        print(best)
        print(nbest)
# Run the scraper once for the sample question and print what it finds.
fetch(90938184, debug=True)
>>> s = '\xd4\xa1\xb0\xd4\xd1\xa1\xd4\xf1' #从百度网页得到的字符串,他的编码是gbk
>>> s2 = s.decode('gbk') # s的编码是gbk,decode把它解码为unicode字符串s2(注意:s2是unicode对象,不是utf-8编码的字节串)
>>> print s2 #我的系统编码是utf-8,所以可以显示s2
浴霸选择
>>> print s2.encode('gb2312') #encode可以把unicode字符串编码为指定编码的字节串(我的终端编码是utf-8而不是gb2312,所以显示乱码)
ԡ��ѡ��
import urllib
import re
import binascii
def fetch(id=90938184, debug=False):
    """Download a Baidu Zhidao question page and print its text fragments.

    The page ``http://zhidao.baidu.com/question/<id>.html`` is read as a raw
    byte string and searched for the question title, the question body, the
    question supplement, the best answer and the remaining answers.  Every
    match is decoded from GBK and printed on its own line, in that order.

    Args:
        id: Question id used to build the page URL.  The name shadows the
            builtin ``id``; it is kept so keyword callers keep working.
        debug: Accepted for call compatibility.  This version does not
            consult it: the fragments are always printed.

    Returns:
        None.

    Raises:
        UnicodeDecodeError: If a matched fragment is not valid GBK.
    """
    url = 'http://zhidao.baidu.com/question/' + str(id) + '.html'

    # Close the connection explicitly rather than relying on the temporary
    # response object being garbage-collected.
    response = urllib.urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    # Listed in output order.  re.DOTALL lets ``.`` span newlines, since a
    # fragment may cover several lines of markup.
    patterns = (
        re.compile(r'span class="question-title">(.*?)</span>', re.DOTALL),
        re.compile(r'<pre id="question-content">(.*?)</pre>', re.DOTALL),
        re.compile(r'<pre id="question-suply">(.*?)</pre>', re.DOTALL),
        re.compile(r'<pre id="best-answer-content".*?>(.*?)</pre>', re.DOTALL),
        re.compile(r'<pre class="reply-text mb10".*?>(.*?)</pre>', re.DOTALL),
    )

    # Run every search before printing anything, as the original did.
    found = [pattern.findall(page) for pattern in patterns]

    for fragments in found:
        for fragment in fragments:
            # Decode to unicode so print can re-encode for the terminal.
            print(fragment.decode('gbk'))
# Run the scraper once for the sample question and print the decoded text.
fetch(90938184, debug=True)