37,709
社区成员




Traceback (most recent call last):
File "F:\Python25\Lib\site-packages\pythonwin\pywin\framework\scriptutils.py", line 310, in RunScript
exec codeObject in __main__.__dict__
File "F:\Python25\CTR_my_way.py", line 190, in <module>
print cn2juhao(body(i,j))
File "F:\Python25\CTR_my_way.py", line 131, in body
parser.feed(html)
File "F:\Python25\lib\sgmllib.py", line 99, in feed
self.goahead(0)
File "F:\Python25\lib\sgmllib.py", line 169, in goahead
k = self.parse_declaration(i)
File "F:\Python25\lib\markupbase.py", line 98, in parse_declaration
decltype, j = self._scan_name(j, i)
File "F:\Python25\lib\markupbase.py", line 388, in _scan_name
% rawdata[declstartpos:declstartpos+20])
File "F:\Python25\lib\sgmllib.py", line 106, in error
raise SGMLParseError(message)
SGMLParseError: expected name token at '<!!---["+bb+"]-start'
# Presumably the original attribute stripping, shown before the change:
html = re.sub('onload=\"\s*[^\"]*\"','',html)
html = re.sub('onmouseover=\"\s*[^\"]*\"','',html)
# Changed to:
# NOTE(review): in the four patterns below `.*` is greedy, so a match runs to
# the last `"` on the same line rather than to the attribute's closing quote.
# `b=` and `e=` also carry no word boundary, so they can match the tail of a
# longer attribute name (e.g. the `e="..."` inside `name="..."`).
html = re.sub(r'b2="[^"].*"', '', html)
html = re.sub(r'e2="[^"].*"', '', html)
html = re.sub(r'b="[^"].*"', '', html)
html = re.sub(r'e="[^"].*"', '', html)
html = re.sub('onload=\"\s*[^\"]*\"','',html)
html = re.sub('onmouseover=\"\s*[^\"]*\"','',html)
# Changed to:
# NOTE(review): `[^-->]` is a single-character class, not a "not followed by
# -->" test, and the greedy `.*` extends the match to the last `-->` on the
# line, so unrelated markup between two comments on one line is removed too.
html = re.sub(r'<![^-->].*-->','',html)
def _fetch(url):
    """Download ``url`` and return the raw response body."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection right away instead of waiting for it to be
        # garbage-collected.
        response.close()


def _sanitize(html):
    """Remove markup that is useless for text extraction or breaks parsing."""
    # Inline event handlers contribute no text.
    html = re.sub(r'onload="\s*[^"]*"', '', html)
    html = re.sub(r'onmouseover="\s*[^"]*"', '', html)
    # "<!" followed by something that is neither a name nor a comment opener
    # (e.g. '<!!---[') makes the SGML parser raise
    # "SGMLParseError: expected name token".  Drop such constructs up to the
    # next '>' (or to the end of input when there is none), which is how
    # browsers discard a bogus comment.  Well-formed "<!DOCTYPE ...>" and
    # "<!-- ... -->" are left for the parser.
    html = re.sub(r'<!(?![A-Za-z-])[^>]*>?', '', html)
    return html


def body(url1,url2):
    """Fetch a page and return the text extracted from it by ``html2txt``.

    ``url1`` is tried first; if fetching it raises any ``Exception`` the page
    is fetched from ``url2`` instead.  An error while fetching ``url2``
    propagates to the caller.

    The downloaded markup is cleaned with ``_sanitize`` before being fed to
    the parser, and the parser's ``text`` attribute is returned.

    NOTE(review): the markup is processed as the raw string returned by
    ``read()``; no character-set decoding is done here -- confirm that
    ``html2txt`` copes with the pages' encoding.
    """
    try:
        html = _fetch(url1)
    except Exception:
        # Deliberate best-effort fallback: any failure on the first URL
        # means "try the alternative address".
        html = _fetch(url2)

    # html2txt is defined elsewhere in this file (presumably an SGMLParser
    # subclass, since feed() ends up in sgmllib -- verify).
    parser = html2txt()
    parser.feed(_sanitize(html))
    parser.close()
    return parser.text