初学python，请教一个编码的问题

yoyo1631 2011-08-11 08:00:09

抓取一个百度知道的，返回的编码是形如：
['\xd4\xa1\xb0\xd4\xd1\xa1\xd4\xf1']的东西，我想问下，为什么不是汉字啊？
我是菜鸟一个，高手表喷哦！

下面是代码



import urllib

import re



def fetch(id=90938184,debug=False):

    urlbase = 'http://zhidao.baidu.com/question/'

    url = urlbase + str(id) +'.html'

    res = urllib.urlopen(url).read()



    tre = re.compile(r'span class="question-title">(.*?)</span>',re.DOTALL)

    qusre = re.compile(r'<pre id="question-content">(.*?)</pre>',re.DOTALL)

    qusre2 = re.compile(r'<pre id="question-suply">(.*?)</pre>',re.DOTALL)

    bestre = re.compile(r'<pre id="best-answer-content".*?>(.*?)</pre>',re.DOTALL)

    nbestre = re.compile(r'<pre class="reply-text mb10".*?>(.*?)</pre>',re.DOTALL)



    

    title = re.findall(tre,res)

    qus = re.findall(qusre,res)

    qus2 = re.findall(qusre2,res)

    best = re.findall(bestre,res)

    nbest = re.findall(nbestre,res)

    









    if debug:

        print title

        print qus

        print qus2

        print best

        print nbest



# Run the scraper once for question 90938184 with debug printing enabled.
fetch(90938184,debug=True)

...全文

42 3 打赏收藏转发到动态举报

写回复

用AI写文章

3 条回复

切换为时间正序

请发表友善的回复…

发表回复

yoyo1631 2011-08-11

打赏
举报

谢谢各位，问题已经解决
两位都有道理！

panghuhu250 2011-08-11

打赏
举报

[Quote=引用楼主 yoyo1631 的回复:]
抓取一个百度知道的，返回的编码是形如：
['\xd4\xa1\xb0\xd4\xd1\xa1\xd4\xf1']的东西，我想问下，为什么不是汉字啊？
[/Quote]
'\xd4\xa1\xb0\xd4\xd1\xa1\xd4\xf1'是汉字的编码。你要把它转化为和你的python环境匹配的编码，才能输出正确的汉字字符。例如：

>>> s = '\xd4\xa1\xb0\xd4\xd1\xa1\xd4\xf1' #从百度网页得到的字符串，他的编码是gbk

>>> s2 = s.decode('gbk') # s的编码是gbk，把它解码为unicode字符串s2

>>> print s2 #我的系统编码是utf-8，所以可以显示s2

浴霸选择

>>> print s2.encode('gb2312') #encode可以把unicode字符串转化为其它的编码（我的系统不支持gb2312，所以显示乱码)

ԡ��ѡ��

我姓区不姓区 2011-08-11

打赏
举报



import urllib

import re

import binascii



def fetch(id=90938184,debug=False):

    urlbase = 'http://zhidao.baidu.com/question/'

    url = urlbase + str(id) +'.html'

    res = urllib.urlopen(url).read()



    tre = re.compile(r'span class="question-title">(.*?)</span>',re.DOTALL)

    qusre = re.compile(r'<pre id="question-content">(.*?)</pre>',re.DOTALL)

    qusre2 = re.compile(r'<pre id="question-suply">(.*?)</pre>',re.DOTALL)

    bestre = re.compile(r'<pre id="best-answer-content".*?>(.*?)</pre>',re.DOTALL)

    nbestre = re.compile(r'<pre class="reply-text mb10".*?>(.*?)</pre>',re.DOTALL)



    

    title = re.findall(tre,res)

    qus = re.findall(qusre,res)

    qus2 = re.findall(qusre2,res)

    best = re.findall(bestre,res)

    nbest = re.findall(nbestre,res)

    





    for t in title:

        print t.decode('gbk')



    for q in qus:

        print q.decode('gbk')



    for q2 in qus2:

        print q2.decode('gbk')



    for b in best:

        print b.decode('gbk')



    for n in nbest:

        print n.decode('gbk')



        

# Run the scraper once for question 90938184; debug=True is passed, although
# this version of fetch prints its results whatever the flag says.
fetch(90938184,debug=True)