37,708
社区成员




def _fetch(url):
    """Download *url* and return the raw response body."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() raises.
        response.close()


def _decode_page(raw):
    """Decode raw page bytes to text, trying UTF-8 first and GBK second."""
    try:
        # Strict UTF-8 decoding almost never succeeds on real GBK text, so a
        # successful decode is taken to mean the page really is UTF-8.
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        return raw.decode("gbk")


def body(url1, url2):
    """Fetch a page and return its text as extracted by ``html2txt``.

    Args:
        url1: Primary URL to download.
        url2: Fallback URL, used when fetching ``url1`` raises.

    Returns:
        The ``text`` attribute of the ``html2txt`` parser after the decoded
        page has been fed to it (``html2txt`` is defined elsewhere in this
        file; the original author noted that ``text`` is unicode).

    Raises:
        Whatever the fallback download raises, and ``UnicodeDecodeError``
        when the page is neither valid UTF-8 nor valid GBK.
    """
    try:
        raw = _fetch(url1)
    except Exception:
        # Deliberate best-effort: any failure on the primary URL falls back
        # to the secondary one.
        raw = _fetch(url2)
    parser = html2txt()
    parser.feed(_decode_page(raw))
    parser.close()
    return parser.text
def cr2n(txt):
    """Break *txt* into lines at sentence-ending punctuation.

    Each occurrence of ``。``, ``!``, ``……`` or ``】`` is replaced by ``。``
    followed by a newline.

    The substitution is done on the unicode text and the result is encoded
    once at the end. Matching on already-encoded bytes only works when the
    source file happens to be saved in the same encoding as the data, and it
    can match across multibyte character boundaries.

    Args:
        txt: Unicode text to split.

    Returns:
        The rewritten text as a GBK-encoded byte string.

    Raises:
        UnicodeEncodeError: If the text contains characters that GBK cannot
            represent.
    """
    terminators = re.compile(u'(。|!|……|】)')
    return terminators.sub(u'。\n', txt).encode("gbk")
def getcontent(ar, txt):
    """Print every match of the regular expression *ar* in *txt*.

    Args:
        ar: Regular-expression pattern passed to ``re.findall``.
        txt: Text to search. It must be the same string type, and for byte
            strings the same encoding, as ``ar``; otherwise nothing matches.

    Returns:
        None. Each match is written to standard output on its own line.
    """
    for match in re.findall(ar, txt):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(match)
# Document to download.
url1 = 'http://docs.google.com/Doc?id=d8b2mxm_107cc3mgqgr'
# Fallback URL; temporarily the same as url1.
url2 = url1

# cr2n() returns a GBK-encoded byte string, so the pattern and the sample text
# are encoded as GBK too. Plain byte literals would be in the source file's
# encoding instead, and a mismatch makes the search silently find nothing.
test = u'需要匹配这个。'.encode("gbk")
ar = u'匹配'.encode("gbk")

k = cr2n(body(url1, url2))  # k is a GBK-encoded byte string
getcontent(ar, k)  # originally printed nothing because of the encoding mismatch
getcontent(ar, test)  # sanity check against a known sample
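# Fix suggested in the thread (apparently because the fetched page is UTF-8,
# not GBK): the decode step
#     txt = unicode(txt,"gbk")
# should instead read
#     txt = unicode(txt,"utf-8")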