# -*- coding: utf-8 -*-
import MySQLdb
import MySQLdb.cursors
import urllib2
import requests
import json
import re
import base64
import sys
page=1
count=1
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
for page in range(1,50):
url='http://club.jd.com/productpage/p-2829294-s-0-t-0-p-'+str(page)+'.html?'
cont=urllib2.urlopen(url).read()
rex=re.compile(r'\"content\":\"(.*?)\"\,\"creationTime\":\"(.*?)\"\,\"isTop\".*?\"referenceName\":\"(.*?)\"\,\"referenceTime\".*?\"score\":(.*?)\,\"status\".*?\"userProvince\":\"(.*?)\"\,\"userRegisterTime\".*?\"nickname\":\"(.*?)\"\,\"userClient\"',re.S)
#(r'\"tmall_vip_level\":(.*?)\,\"spuId\".*?\"sku\":(.*?)\,\"enableTime\".*?\"displayUserNick\":(.*?)\,\"displayUserNumId\".*?\"rateContent\":(.*?)\,\"rateDate\":(.*?)\,\"reply\"',re.S)
contents=rex.findall(cont)
for content in contents:
haveImg = re.search("img",content[0])
if not haveImg:
print count,content[0].decode('gbk').encode('utf-8'),content[1],content[2].decode('gbk').encode('utf-8'),content[3],content[4].decode('gbk').encode('utf-8'),content[5].decode('gbk').encode('utf-8')
count=count+1
print '页码:' ,page
page=page+1
这是主要代码,请大家看一下对不对~谢谢~