# spiderComments.py
import csv
import os
import random
import time
from datetime import datetime

import requests

from clearData import clearData
from globalVariable import *
from utils.wordCloudPicture import get_img

max_id = 0
articleId = ''
commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
text = ''


def init(articleCommentsFilePath):
    # Create the comments CSV with a header row if it does not exist yet
    if not os.path.exists(articleCommentsFilePath):
        with open(articleCommentsFilePath, 'w', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow([
                'articleId', 'created_at', 'likes_counts', 'region',
                'content', 'authorName', 'authorGender', 'authorAddress'
            ])


def writerRow(row, articleCommentsFilePath):
    # Append one comment row; the with-block flushes and closes the file
    with open(articleCommentsFilePath, 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)


def get_data(url, params):
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        response_json = response.json()
        # max_id is the pagination cursor for the next page of comments
        global max_id
        max_id = response_json['max_id']
        return response_json['data']
    else:
        return None


def getAllArticleList(articleDataFilePath):
    articleList = []
    with open(articleDataFilePath, 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        for row in readerCsv:
            articleList.append(row)
    return articleList


def parse_json(response, articleId, articleCommentsFilePath):
    if response is None:
        return
    for comment in response:
        created_at = datetime.strptime(
            comment['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        likes_counts = comment['like_counts']
        try:
            # strip the "来自" ("from") prefix Weibo puts in the source field
            region = comment['source'].replace('来自', '')
        except (KeyError, AttributeError):
            region = '无'  # "无" = no region information
        content = clearData(comment['text_raw'])  # clean the raw comment text
        if content == '':
            continue  # skip comments that are empty after cleaning
        global text
        text += content  # accumulate text for the per-article word cloud
        authorName = comment['user']['screen_name']
        authorGender = comment['user']['gender']
        authorAddress = comment['user']['location']
        writerRow([
            articleId, created_at, likes_counts, region,
            content, authorName, authorGender, authorAddress
        ], articleCommentsFilePath)


def start(articleDataFilePath, articleCommentsFilePath):
    global articleId, text
    init(articleCommentsFilePath)
    articleList = getAllArticleList(articleDataFilePath)
    for article in articleList[1:]:  # skip the CSV header row
        articleId = article[0]
        text = ''
        start_time = time.time()
        print('Crawling comments for article id %s' % articleId)
        time.sleep(random.uniform(0, 1))
        params = {
            'id': int(articleId),
            'is_show_bulletin': 2
        }
        response = get_data(commentUrl, params)
        parse_json(response, articleId, articleCommentsFilePath)
        max_page = 100  # hard cap on pages per article
        max_id_last = 0
        while max_id != 0:
            max_page -= 1
            if max_page < 0:
                break
            print(max_id)
            if max_id_last == max_id:
                break  # cursor did not advance, stop paging
            max_id_last = max_id
            params = {
                'id': int(articleId),
                'is_show_bulletin': 2,
                'max_id': int(max_id)
            }
            response = get_data(commentUrl, params)
            parse_json(response, articleId, articleCommentsFilePath)
            time.sleep(random.uniform(0, 0.5))
        end_time = time.time()
        print('Elapsed: ' + str(end_time - start_time))
        print()
        get_img(articleId, text)  # render a word cloud for this article


# if __name__ == '__main__':
#     start()
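For reference, the '%a %b %d %H:%M:%S %z %Y' pattern used in parse_json matches the timestamp format Weibo returns in created_at. A quick standalone check (the sample string below is made up, not real API output):

# A minimal check of the created_at parsing used in parse_json above.
from datetime import datetime

raw = 'Mon Sep 23 12:00:00 +0800 2024'  # hypothetical sample in Weibo's format
print(datetime.strptime(raw, '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d'))
# prints: 2024-09-23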
# globalVariable.py
from datetime import datetime

headers = {
    'Cookie': 'SINAGLOBAL=8240438899311.014.1696773183143; SCF=AmrnNqo7P2iXmQ3NUrBALgX7jeCXIpFT4Gs5T0suCF-JPfazBZLqNgNqZCrLFQXeez5g0PPSmlpDEpo-rHrOwoE.; UOR=,,www.baidu.com; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZsKabTbTcpwExYBKrSd265JpX5KMhUgL.Foe0eKM4eo-0Sh52dJLoI0YLxK.L1KMLB--LxK-L1K5L1-zLxK-L1KeLB-2LxKnL1heLB.BLxKqLBonL1h-LxK-LBKqL1hzLxKqLBo5LBoBt; ULV=1727100176835:15:3:2:6609349842637.535.1727100176829:1727011678443; XSRF-TOKEN=Dehifh4IfSPCO6o5UGYoxSEp; ALF=1730468107; SUB=_2A25L-TxbDeRhGeVN6lUY8ivPzzyIHXVpdzGTrDV8PUJbkNANLWmskW1NTI12JCXuXFvGbMlyYJ5emHm8bMhRhqPw; WBPSESS=d1RoWYRMvf7R2BqLPDL0saG9CHbOSvtzfEfEtDn2JIFGsdDPcOHfjE_yYuPiwp41wF80Yw-6jCdWfADzzomAhNIfmto_wzKqj-YV8et1NMDXibmZxTkF1zEUTqjYCzdkoOTx2eV4fTd4hjRNAFunkw==',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}


def initGlobalVariable():
    # Build timestamped output paths so each run writes to fresh CSV files
    current_time = datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d_%H-%M-%S")
    articleDataFilePath = r'D:\weibo\spiders\data\articleContent_' + formatted_time + '.csv'
    articleCommentsFilePath = r'D:\weibo\spiders\data\articleComments_' + formatted_time + '.csv'
    articleCategoryFilePath = r'D:\weibo\spiders\data\articleCategory.csv'
    return articleCategoryFilePath, articleDataFilePath, articleCommentsFilePath


if __name__ == '__main__':
    initGlobalVariable()
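A minimal sketch of how these modules could be wired together, assuming both files sit in the same directory; run_spider.py is a hypothetical name, not part of the original post. Note that initGlobalVariable() generates fresh timestamped paths, so the article CSV passed to start() must first be written by the content spider in the same run:

# run_spider.py (hypothetical driver, not part of the original post)
from globalVariable import initGlobalVariable
import spiderComments

if __name__ == '__main__':
    # initGlobalVariable() returns (category, article, comments) CSV paths
    articleCategoryFilePath, articleDataFilePath, articleCommentsFilePath = initGlobalVariable()
    # start() reads article ids from the first column of the article CSV,
    # which must already exist at this point.
    spiderComments.start(articleDataFilePath, articleCommentsFilePath)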