求助,python27改到python36的问题
# coding=utf-8
import re
import time
import json
from urllib import request
import pandas as pd
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Charset':'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Connection':'close',
'Referer':'https://www.jd.com/'
}
file = open('F:\\spiderTest\\jd_phone.txt','w')
productName1 = []
commentTime1 = []
content1 = []
def crawlProductComment(url,page):
req = request.Request(url, headers=headers)
html = request.urlopen(req).read()
html = html.decode('gbk')
reg0 = re.compile('^fetchJSON_comment98vv22312\(')
reg1 = re.compile('\);')
reg2 = re.compile('&[a-zA-Z]dquo')
reg3 = re.compile('&hellip')
reg4 = re.compile('\r\n')
data = reg0.sub('',html)
data = reg1.sub('', data)
data = reg2.sub('',data)
data = reg3.sub('',data)
data = reg4.sub('',data)
data = json.loads(data)#data1的内容为一个字典,用{}括起来的内容
for i in data['comments']:
content = i['content'].encode('gbk')
content = content.replace("\n", "")
content1.append(i['content'].replace("\n", ""))
file.write(content +'\n')
for i in range(0,2):
url = 'https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv22312&productId=4538887&score=0&sortType=5&page='+ str(i) +'&pageSize=10&isShadowSku=0&rid=0&fold=1'
crawlProductComment(url,i)
time.sleep(1)
#print content1
#将前面提取的各字段信息汇总为table数据表,以便后面分析
table=pd.DataFrame({'productName':productName1,'commentTime':commentTime1,'content':content1})
#print table
#将creationTime字段更改为时间格式
table['commentTime']=pd.to_datetime(table['commentTime'])
#设置creationTime字段为索引列
#table = table.set_index('commentTime')
table.to_csv('f://spiderTest//jd_table_phone.csv',mode='w', encoding='utf-8',header = None)
#table.to_csv('f://spiderTest//jd_table_phone1.csv',mode='w', encoding='gbk',header = None)
file.close()
为什么使用python27程序可以运行,换了python36就一直报错,为什么老是字符串和byte的问题