# spiderComments.py
import csv
import os
import random
import time
from datetime import datetime

import requests

from clearData import clearData
from globalVariable import *
from utils.wordCloudPicture import get_img

max_id = 0
articleId = ''
commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
text = ''


def init(articleCommentsFilePath):
    # Create the comments CSV with a header row if it does not exist yet
    if not os.path.exists(articleCommentsFilePath):
        with open(articleCommentsFilePath, 'w', encoding='utf-8', newline='') as csvFile:
            writer = csv.writer(csvFile)
            writer.writerow([
                'articleId', 'created_at', 'likes_counts', 'region',
                'content', 'authorName', 'authorGender', 'authorAddress'
            ])


def writerRow(row, articleCommentsFilePath):
    # Append one comment row; the with-block flushes and closes the file
    with open(articleCommentsFilePath, 'a', encoding='utf-8', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(row)


def get_data(url, params):
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 200:
        response_json = response.json()
        # max_id is the pagination cursor for the next page of comments
        global max_id
        max_id = response_json['max_id']
        return response_json['data']
    else:
        return None


def getAllArticleList(articleDataFilePath):
    articleList = []
    with open(articleDataFilePath, 'r', encoding='utf-8') as reader:
        readerCsv = csv.reader(reader)
        for row in readerCsv:
            articleList.append(row)
    return articleList


def parse_json(response, articleId, articleCommentsFilePath):
    if response is None:
        return
    for comment in response:
        created_at = datetime.strptime(
            comment['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        likes_counts = comment['like_counts']
        try:
            # strip the "来自" ("from") prefix Weibo puts in the source field
            region = comment['source'].replace('来自', '')
        except (KeyError, AttributeError):
            region = '无'  # "无" = no region information
        content = clearData(comment['text_raw'])  # clean the raw comment text
        if content == '':
            continue  # skip comments that are empty after cleaning
        global text
        text += content  # accumulate text for the per-article word cloud
        authorName = comment['user']['screen_name']
        authorGender = comment['user']['gender']
        authorAddress = comment['user']['location']
        writerRow([
            articleId, created_at, likes_counts, region,
            content, authorName, authorGender, authorAddress
        ], articleCommentsFilePath)


def start(articleDataFilePath, articleCommentsFilePath):
    global articleId, text
    init(articleCommentsFilePath)
    articleList = getAllArticleList(articleDataFilePath)
    for article in articleList[1:]:  # skip the CSV header row
        articleId = article[0]
        text = ''
        start_time = time.time()
        print('Crawling comments for article id %s' % articleId)
        time.sleep(random.uniform(0, 1))
        params = {
            'id': int(articleId),
            'is_show_bulletin': 2
        }
        response = get_data(commentUrl, params)
        parse_json(response, articleId, articleCommentsFilePath)
        max_page = 100  # hard cap on pages per article
        max_id_last = 0
        while max_id != 0:
            max_page -= 1
            if max_page < 0:
                break
            print(max_id)
            if max_id_last == max_id:
                break  # cursor did not advance, stop paging
            max_id_last = max_id
            params = {
                'id': int(articleId),
                'is_show_bulletin': 2,
                'max_id': int(max_id)
            }
            response = get_data(commentUrl, params)
            parse_json(response, articleId, articleCommentsFilePath)
            time.sleep(random.uniform(0, 0.5))
        end_time = time.time()
        print('Elapsed: ' + str(end_time - start_time))
        print()
        get_img(articleId, text)  # render a word cloud for this article


# if __name__ == '__main__':
#     start()
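For reference, the '%a %b %d %H:%M:%S %z %Y' pattern used in parse_json matches the timestamp format Weibo returns in created_at. A quick standalone check (the sample string below is made up, not real API output):

# A minimal check of the created_at parsing used in parse_json above.
from datetime import datetime

raw = 'Mon Sep 23 12:00:00 +0800 2024'  # hypothetical sample in Weibo's format
print(datetime.strptime(raw, '%a %b %d %H:%M:%S %z %Y').strftime('%Y-%m-%d'))
# prints: 2024-09-23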
# globalVariable.py
from datetime import datetime

headers = {
    'Cookie': 'SINAGLOBAL=8240438899311.014.1696773183143; SCF=AmrnNqo7P2iXmQ3NUrBALgX7jeCXIpFT4Gs5T0suCF-JPfazBZLqNgNqZCrLFQXeez5g0PPSmlpDEpo-rHrOwoE.; UOR=,,www.baidu.com; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZsKabTbTcpwExYBKrSd265JpX5KMhUgL.Foe0eKM4eo-0Sh52dJLoI0YLxK.L1KMLB--LxK-L1K5L1-zLxK-L1KeLB-2LxKnL1heLB.BLxKqLBonL1h-LxK-LBKqL1hzLxKqLBo5LBoBt; ULV=1727100176835:15:3:2:6609349842637.535.1727100176829:1727011678443; XSRF-TOKEN=Dehifh4IfSPCO6o5UGYoxSEp; ALF=1730468107; SUB=_2A25L-TxbDeRhGeVN6lUY8ivPzzyIHXVpdzGTrDV8PUJbkNANLWmskW1NTI12JCXuXFvGbMlyYJ5emHm8bMhRhqPw; WBPSESS=d1RoWYRMvf7R2BqLPDL0saG9CHbOSvtzfEfEtDn2JIFGsdDPcOHfjE_yYuPiwp41wF80Yw-6jCdWfADzzomAhNIfmto_wzKqj-YV8et1NMDXibmZxTkF1zEUTqjYCzdkoOTx2eV4fTd4hjRNAFunkw==',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}


def initGlobalVariable():
    # Build timestamped output paths so each run writes to fresh CSV files
    current_time = datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d_%H-%M-%S")
    articleDataFilePath = r'D:\weibo\spiders\data\articleContent_' + formatted_time + '.csv'
    articleCommentsFilePath = r'D:\weibo\spiders\data\articleComments_' + formatted_time + '.csv'
    articleCategoryFilePath = r'D:\weibo\spiders\data\articleCategory.csv'
    return articleCategoryFilePath, articleDataFilePath, articleCommentsFilePath


if __name__ == '__main__':
    initGlobalVariable()
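A minimal sketch of how these modules could be wired together, assuming both files sit in the same directory; run_spider.py is a hypothetical name, not part of the original post. Note that initGlobalVariable() generates fresh timestamped paths, so the article CSV passed to start() must first be written by the content spider in the same run:

# run_spider.py (hypothetical driver, not part of the original post)
from globalVariable import initGlobalVariable
import spiderComments

if __name__ == '__main__':
    # initGlobalVariable() returns (category, article, comments) CSV paths
    articleCategoryFilePath, articleDataFilePath, articleCommentsFilePath = initGlobalVariable()
    # start() reads article ids from the first column of the article CSV,
    # which must already exist at this point.
    spiderComments.start(articleDataFilePath, articleCommentsFilePath)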