使用 BeautifulSoup4 抓取网页链接时出现乱码
#coding:UTF-8
import urllib2
import urllib
import cookielib
import re
from bs4 import BeautifulSoup
def getcontent(url):
    """Log in to chinacoalchem.com and return the link targets found on a page.

    The function POSTs the login form, saves the resulting session cookies
    to ``cookie.txt`` (relative to the current working directory), fetches
    ``url`` with the same cookie-aware opener and parses the response with
    BeautifulSoup.

    :param url: address of the page whose links should be collected.
    :returns: list with the ``href`` value of every ``<a>`` tag that has
        one, in document order. Values are returned exactly as written in
        the page, so relative links stay relative.
    :raises urllib2.URLError: if the login request or the page request fails.
    """
    filename = 'cookie.txt'
    cookie = cookielib.MozillaCookieJar(filename)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    # NOTE(review): credentials are hard-coded in source; they should be
    # moved to configuration or the environment before this is shared.
    postdata = urllib.urlencode({
        'userid': 'liwan123',
        'password': 'lishichao1',
        'UserLogin': 'True',
        'Submit.x': '25',
        'Submit.y': '14'
    })
    loginurl = "http://www.chinacoalchem.com/loginchk.asp?action=pw"
    # Only the cookies set by the login response matter, so the body is
    # discarded and the connection released immediately.
    opener.open(loginurl, postdata).close()
    cookie.save(ignore_discard=True, ignore_expires=True)

    response = opener.open(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()

    # GB18030 is a superset of GB2312 (and GBK), so pages labelled gb2312
    # that contain characters outside the strict GB2312 set still decode.
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # pin one if the result must be reproducible across machines.
    soup = BeautifulSoup(html, from_encoding='gb18030')
    # href=True skips anchors without an href attribute, which would
    # otherwise show up as None entries in the result.
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]
# Script entry: fetch page 1 of the site's news listing and print the
# list of link targets that getcontent() collected from it.
print(getcontent(
    'http://www.chinacoalchem.com/more.asp?lm=%D7%EE%D0%C2%D0%C5%CF%A2&page=1'
))
代码如上,想截取http://www.chinacoalchem.com/more.asp?lm=%D7%EE%D0%C2%D0%C5%CF%A2&page=1列表页的新闻链接地址,但是获得的结果是出现乱码,而且无缘无故多了个u,结果这样:[u'http://www.chinacoalchem.com/events/2015CTL/index.htm', u'index.asp', u'more.asp?lm=\u653f\u7b56\u89c4\u5212', u'more.asp?lm=\u516c\u53f8\u52a8\u6001', u'more.asp?lm=\u5de5\u7a0b\u9879\u76ee', u'more.asp?lm=\u6280\u672f\u8fdb\u5c55', u'more.asp?lm=\u5e02\u573a\u884c\u60c5', u'more.asp?lm=\u7532\u9187', u'more.asp?lm=%C3%BA%D6%C6%CC%EC%C8%BB%C6%F8', u'more.asp?lm=\u7164\u5236\u6cb9', u'more.asp?lm=MTO/MTP', u'more.asp?lm=\u5176\u4ed6\u7164\u5316\u5de5', u'yuekan.asp', u'http://www.shenhuagroup.com.cn', u'http://www.mbchem.com', u'http://www.ykjt.cn/', u'http://www.ctdmto.com/', u'http://www.praxair.com.cn/', u'http://www.wison.com', u'http://www.gsp-cn.com', u'http://www.mcwongtech.com', u'http://www.uop.com/processing-solutions/gas-processing', u'http://www.blogcn.com/zlzj/tougao.\xb2\xa9\xce\xc4\xd4\xad\xb4\xb4.html', u'#', u'news.asp?id=60447', u'news.asp?id=60448', u'news.asp?id=60449', u'news.asp?id=60443', u'news.asp?id=60444', u'news.asp?id=60445', u'news.asp?id=60446', u'news.asp?id=60440', u'news.asp?id=60441', u'news.asp?id=60442', u'news.asp?id=60439', u'news.asp?id=60436', u'news.asp?id=60437', u'news.asp?id=60438', u'news.asp?id=60431', u'more.asp?lm=\u6700\u65b0\u4fe1\u606f&page=2', u'more.asp?lm=\u6700\u65b0\u4fe1\u606f&page=622', u'manage.asp', u'loginchk.asp?userlogout=logout', u'http://www.airproducts.com/', u'http://www.cn.airliquide.com/', u'http://www.sulzerchemtech.com ', u'http://www.choren.com/', u'http://www.horizon-fluid.com', u'http://www.linde.com', u'http://www.highchem.co.jp/', u'http://www.antiwearvalve.com/', u'http://www.cbi.com/technologies/gasification-technology', u'http://www.PrattWhitneyRocketdyne.com', u'http://www.chinacoalchem.com/events/2015CTO/index.htm', None, u"javascript:window.external.AddFavorite('http://www.Chinacoalchem.com/','\u4e2d\u56fd\u7164\u5316\u7f51')", u'service.asp', 
u'connect_to_us.asp', u'about_us.asp', u'http://www.miibeian.gov.cn']。此外,结果中还出现了类似 \xb2\xa9\xce\xc4\xd4 这样的乱码,不知道这个问题该如何解决?