# Removing markup noise from the scraped `text` content, and how to call a method defined in a class.
# Fetch pages 2..99 of the paginated JSON feed and print, for every post on
# each page, its creation time, its mid and its text.
# `url` and `header` are read from the surrounding module.
# Every print uses the single-argument parenthesised form, which produces the
# same output under Python 2's print statement and Python 3's print function.
for page in range(2, 100):
    urls = url + str(page)
    request = urllib2.Request(url=urls, headers=header)
    content = urllib2.urlopen(request)
    try:
        contents = content.read()
    finally:
        # Always release the connection, even when the read fails.
        content.close()
    # Throttle between requests; sleeping after the body has been read
    # avoids holding the connection open while idle.
    time.sleep(5)
    dict_data = json.loads(contents)
    print(dict_data)
    print("---------------" + str(page))
    cards = dict_data['cards'][0]['card_group']
    # A separate loop variable keeps the page counter from being overwritten.
    for card in cards:
        # NOTE(review): presumably not every card carries an 'mblog' payload;
        # such cards are skipped instead of aborting the crawl with KeyError.
        a = card.get('mblog')
        if a is None:
            continue
        print(u"%s %s" % (U"时间:", a['created_at']))
        print(u"%s %s" % (u'mid:', a['mid']))
        # Characters that GBK cannot represent are dropped.
        text = a['text'].encode('gbk', 'ignore')
        print(text)
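# TODO: clean `text` by calling the replace() method of the Tool class below, e.g. `text = Tool().replace(text)`, so that the markup noise is stripped from it (the goal is to keep only the Chinese text). The class is defined after this loop runs, so it must be moved above the loop (or imported) before it can be called here.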
class Tool(object):
    """Strip HTML markup from a text fragment while keeping its line layout.

    The compiled patterns are class attributes, so they are built once and
    shared by every instance. ``replace`` applies them in a fixed order:
    specific tags are removed or turned into whitespace first, and whatever
    tags are left are deleted last.
    """

    # <img ...> tags, and each run of seven consecutive spaces.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening <span ...> tags (plus the same seven-space runs).
    removeSpan = re.compile(r'<span.*?>| {7}')
    # Hyperlink tags: the opening <a ...> and the closing </a>.
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a row/block/paragraph; they become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells; they become "\t".
    replaceTD = re.compile(r'<td>')
    # Paragraph openers; they become "\n" followed by one space. The \b keeps
    # tags that merely start with "p" (<pre>, <param>, ...) out of this rule.
    replacePara = re.compile(r'<p\b.*?>')
    # One line break, or two in a row, becomes a single "\n". The
    # self-closing spellings <br/> and <br /> are recognised as well.
    replaceBR = re.compile(r'(?:<br\s*/?>){1,2}')
    # Any tag that survived the rules above.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, text):
        """Return ``text`` with its HTML tags removed.

        Args:
            text: the markup-laden string to clean.

        Returns:
            The cleaned string with leading and trailing whitespace
            stripped. Row, block, paragraph and line-break tags are turned
            into newlines and table cells into tabs; all other tags are
            deleted.
        """
        text = self.removeImg.sub("", text)
        text = self.removeSpan.sub("", text)
        text = self.removeAddr.sub("", text)
        text = self.replaceLine.sub("\n", text)
        text = self.replaceTD.sub("\t", text)
        text = self.replacePara.sub("\n ", text)
        text = self.replaceBR.sub("\n", text)
        text = self.removeExtraTag.sub("", text)
        # Drop the whitespace left over at both ends by the substitutions.
        return text.strip()