37,743
社区成员




import urllib
import urllib.request as reqlib
def GetProvinceUrl():
    """Download the weather.com.cn ``textFC/hb.shtml`` page and decode it.

    This is the abbreviated form of the function; the code that consumes
    the decoded text is elided after this block.

    Raises:
        simpleException: if the page cannot be downloaded or decoded. The
            underlying error is chained as ``__cause__``.
    """
    try:
        print('Getting province main page...')
        # The context manager closes the connection even when read() or
        # decode() raises, which a trailing req.close() would not.
        with reqlib.urlopen('http://www.weather.com.cn/textFC/hb.shtml') as req:
            # NOTE: the original author flagged this decode step as suspect.
            resp = req.read().decode('utf-8')
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a fetch failure.
        raise simpleException('Fail to get province info.') from err
# ......
b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n
............
<script type="text/javascript" src="http://c.wrating.com/a1.js">\n</script>\n<script type="text/javascript">\nvar vjAcc="860010-2099040100";\r\nvar wrUrl="http://c.wrating.com/";\r\nvjTrack("");\n</script>\n<noscript>\n<img src="http://c.wrating.com/a.gif?a=&c=860010-2099040100" width="1" height="1"/>\n</noscript>\n</body>\n</html>
.js">
</script>
<script type="text/javascript">
var vjAcc="860010-2099040100";
var wrUrl="http://c.wrating.com/";
vjTrack("");
</script>
<noscript>
<img src="http://c.wrating.com/a.gif?a=&c=860010-2099040100" width="1" height="1"/>
</noscript>
</body>
</html>
import urllib
import urllib.request as reqlib
import re
class simpleException(Exception):
    """Error raised by the scraping helpers in this script.

    Args:
        msg: Human-readable description of the failure. It is available
            both as ``exc.msg`` and through ``str(exc)`` / ``exc.args``.
    """

    def __init__(self, msg):
        # Initialise the base class with the message so that str(exc),
        # exc.args and tracebacks all carry it.
        super().__init__(msg)
        self.msg = msg
def GetProvinceUrl():
    """Fetch the weather.com.cn ``textFC/hb.shtml`` page and extract its links.

    The page is downloaded, the first ``<div>`` whose class is
    ``lqcontentBoxheader`` is located, and every ``<li>`` inside it is
    scanned for an ``<a href=...>`` link.

    Returns:
        dict: maps each link text to an absolute URL. When the same link
        text occurs more than once, the first occurrence wins.

    Raises:
        simpleException: if the page cannot be downloaded/decoded, or if the
            expected ``<div>`` / ``<a>`` markup is not found. The underlying
            error is chained as ``__cause__``.
    """
    try:
        print('Getting province main page...')
        # The context manager closes the connection even when read() or
        # decode() raises.
        with reqlib.urlopen('http://www.weather.com.cn/textFC/hb.shtml') as req:
            # Use the charset announced in the HTTP headers when there is
            # one; otherwise keep the previous hard-coded UTF-8.
            charset = req.headers.get_content_charset() or 'utf-8'
            resp = req.read().decode(charset)
    except Exception as err:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
        raise simpleException('Fail to get province info.') from err

    try:
        print('Parsing province HTML...')
        header = re.search(
            r'''(?is)<div.*?class=["']lqcontentBoxheader["'].*?>(.*?)</div>''',
            resp)
        if header is None:
            raise ValueError('lqcontentBoxheader <div> not found')
        prvDict = {}
        for li in re.findall(r'(?is)<li.*?>(.*?)</li>', header.group(1)):
            # Extract the hyperlink target and its display name.
            info = re.search(r'''<a.*?href=['"](.*?)['"].*?>(.*?)</a>''', li)
            if info is None:
                raise ValueError('<li> without a hyperlink: %r' % li)
            href, name = info.group(1), info.group(2)
            prvDict.setdefault(name, href)
            print(name, href)
    except Exception as err:
        raise simpleException('Fail to parse HTML about provinces.') from err

    # Complete short links. Only values are reassigned, so the dict size is
    # stable and it is safe to do this while iterating.
    for prv, href in prvDict.items():
        if not href:
            # An empty href has nothing to complete; leave it untouched
            # rather than indexing into it.
            continue
        if href.startswith('/'):
            prvDict[prv] = 'http://www.weather.com.cn' + href
        elif re.match(r'https?://', href) is None:
            # No scheme present: treat the value as host/path.
            prvDict[prv] = 'http://' + href
    return prvDict
def GetCityCodes():
    """Placeholder that is not implemented yet; it does nothing and returns None."""
    return None
# Script entry: fetch the province links and print them, or print the
# failure message carried by simpleException.
try:
    province_urls = GetProvinceUrl()
except simpleException as err:
    print(err.msg)
else:
    print(province_urls)
# ......
>>> print(type(prvDiv))
# <class 'str'>
>>> print(prvDiv)
#
>>> for ch in prvDiv: print(ch, end='')
# blablablablabla...
# 有毒。
是不是我哪里设置错了……
用的是Python3.4,因为os用的是xp……难道是传说中的官方bug?py3.6
文件头加个#encoding:utf-8
测试一切正常