用Python写了个去标签的小程序，出了个错TypeError: expected string or buffer，不知道错在哪了，求大神们帮帮忙

xibei5602 2016-12-02 09:31:49

我是Python初学者，现在写了一个从百度搜索一个东西，将得到的一二级页面的内容去除标签的程序，出了

这样一个错，看不太懂错在哪儿了，求大神帮帮忙

#coding:utf-8
import urllib
import urllib2
import nltk
import re
from bs4 import BeautifulSoup
localdir = "E:\\"
#

[code=python]

[/code]
#函数名：remove_html
#函数功能：清除html标签，去空行
#函数接口：
#输入：html文本
#输出：去标签后的html文本
#
def remove_html(html):
reg1 = re.compile("<[^>]*>")
content = reg1.sub('',html)
# content.replace('\n',' ')
part_content = content.split()
new_content = ""
for i in part_content:
i=i.strip()
new_content+=i+' '
return new_content

#
#函数名：get_all_url
#函数功能：获取所有URL
#函数接口：
#输入：html文本,格式为.txt
#输出：返回一个包含页面上所有URL连接的list
#

def get_all_url(html):
links = re.findall('"((http|ftp)s?://.*?)"', html)
return links

#
#函数名：open_save
#函数功能：打开一个URL并将其内容入库
#函数接口：
#输入：URL
#输出：将HTML内容入库,返回一个网页结构，可以用BS解析
#
def open_save(url):
f = urllib.urlopen(url)
raw = BeautifulSoup(f,"html.parser")
[script.extract() for script in raw.findAll('script')]
[style.extract() for style in raw.findAll('style')]
raw.prettify()
file = open(localdir + '1.txt','w')
print >> file,raw
file.close()
return raw

##################################################################################################

#1.给出url

my_text = raw_input("input:")
url = 'http://www.baidu.com/s?wd=%s' % my_text
addr = url
#result = urllib.requests.urlopen(url).read() # 搜索结果页的内容
#html = urllib.urlopen(addr)
#print html.readline()
#soup = BeautifulSoup(open("C:\\Users\\TT-pc\\Desktop\\1.html"),"html.parser")
html = open_save(addr)

#2.清洗标签
f = open(localdir + '1.txt').read()
new = remove_html(f)
f1 = open(localdir + '2.txt','w')
print >> f1,new
f1.close()

#3.获取所有URL
new1 = get_all_url(f)
f2 = open(localdir + '3.txt','w')
for a in new1:
print >> f2,a[0]
f2.close()

#爬取二级页面
f4 = open(localdir + '4.txt','w')
f3 = open(localdir + '3.txt').read()
url = f3
for b in url:
tzy = open_save(url)
new2 = remove_html(tzy)
print >> f4, new2
f4.close()

...全文