求助怎样才能运行成功并生成csv文件

m0_68052012 2025-02-23 20:35:58
#spiderContent.py

import random
import time
import requests
import csv
from clearData import clearData
from globalVariable import *
from datetime import datetime
import os
from utils.wordCloudPicture import get_img

max_id = 0
articleId = ''
commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
text = ''


def init(articleCommentsFilePath):
    """Create the comments CSV with its header row, unless it already exists."""
    if os.path.exists(articleCommentsFilePath):
        return  # never clobber an existing file; rows are appended later
    header = [
        'articleId',
        'created_at',
        'likes_counts',
        'region',
        'content',
        'authorName',
        'authorGender',
        'authorAddress',
    ]
    with open(articleCommentsFilePath, 'w', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(header)


def writerRow(row, articleCommentsFilePath):
    """Append a single CSV row to the comments file.

    Fix: the original called ``csvFile.flush()`` and ``csvFile.close()``
    inside the ``with`` block — the context manager already flushes and
    closes on exit, and an explicit close() inside ``with`` is a smell.
    """
    with open(articleCommentsFilePath, 'a', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(row)


def get_data(url, params):
    """GET the Weibo comments endpoint and return its ``data`` payload.

    Side effect: updates the module-level ``max_id`` pagination cursor
    that ``start()`` uses to decide whether to fetch the next page.

    Robustness fix: the original indexed ``response_json['max_id']`` and
    ``['data']`` directly and crashed with KeyError when the endpoint
    returned an error object (e.g. expired cookie). Use ``.get()`` so a
    malformed body behaves like "no data, stop paging" instead.
    Returns None on a non-200 response or missing payload.
    """
    global max_id
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        return None
    response_json = response.json()
    max_id = response_json.get('max_id', 0)  # 0 terminates the paging loop in start()
    return response_json.get('data')


def getAllArticleList(articleDataFilePath):
    """Read the article CSV and return every row (header included) as a list.

    Fixes: the original called ``reader.flush()`` on a file opened for
    reading (meaningless for an input stream), and built the list with a
    manual append loop. Also pass ``newline=''`` as the csv module
    recommends so quoted embedded newlines parse correctly.
    """
    with open(articleDataFilePath, 'r', encoding='utf-8', newline='') as reader:
        return list(csv.reader(reader))


def parse_json(response, articleId, articleCommentsFilePath):
    """Extract fields from each comment dict and append them to the CSV.

    Also accumulates the cleaned comment text into the module-level
    ``text`` buffer, which ``start()`` feeds to the word-cloud renderer.

    Fixes: removed the needless ``global region`` (region is purely local
    to each iteration; the global leaked state across calls) and replaced
    the bare ``except:`` with the specific exceptions that line can raise.
    """
    global text
    if response is None:
        return
    for comment in response:
        created_at = datetime.strptime(
            comment['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        likes_counts = comment['like_counts']
        try:
            # 'source' may be absent or None — fall back to a placeholder.
            region = comment['source'].replace('来自', '')
        except (KeyError, AttributeError):
            region = '无'
        content = clearData(comment['text_raw'])  # data cleaning
        if content == "":
            continue  # skip comments that are empty after cleaning
        text += content

        user = comment['user']
        writerRow([
            articleId,
            created_at,
            likes_counts,
            region,
            content,
            user['screen_name'],
            user['gender'],
            user['location'],
        ], articleCommentsFilePath)


def start(articleDataFilePath, articleCommentsFilePath):
    """Crawl the comments of every article listed in the article CSV.

    For each article id, pages through the Weibo comments endpoint
    (pagination driven by the module-level ``max_id`` that ``get_data``
    updates), persists each comment via ``parse_json``, and finally
    renders a word-cloud image from the accumulated comment text.
    """
    init(articleCommentsFilePath)
    articleList = getAllArticleList(articleDataFilePath)
    # Skip the CSV header row.
    for article in articleList[1:]:
        global articleId
        articleId = article[0]  # assumes the id is in column 0 — TODO confirm against the CSV layout

        global text
        text = ''  # reset the word-cloud text buffer for this article

        start_time = time.time()
        print('正在爬取id值为%s的文章评论' % articleId)
        time.sleep(random.uniform(0, 1))  # random delay to reduce rate-limit risk
        params = {
            'id': int(articleId),
            'is_show_bulletin': 2
        }
        response = get_data(commentUrl, params)
        parse_json(response, articleId, articleCommentsFilePath)

        max_page = 100  # hard cap on pages fetched per article
        max_id_last = 0
        # get_data() sets max_id to 0 on the last page, which ends this loop.
        while max_id != 0:
            max_page -= 1
            if max_page < 0:
                break
            print(max_id)
            if max_id_last == max_id: break  # cursor stopped advancing — bail out
            max_id_last = max_id
            params = {
                'id': int(articleId),
                'is_show_bulletin': 2,
                'max_id': int(max_id)  # pagination cursor from the previous response
            }
            response = get_data(commentUrl, params)
            parse_json(response, articleId, articleCommentsFilePath)
            time.sleep(random.uniform(0, 0.5))

        end_time = time.time()
        print('耗时:' + str(end_time - start_time))
        print()
        get_img(articleId,text)  # render the word cloud for this article's comments


# if __name__ == '__main__':
#     start()

#spiderComments.py

import random
import time
import requests
import csv
from clearData import clearData
from globalVariable import *
from datetime import datetime
import os
from utils.wordCloudPicture import get_img

max_id = 0
articleId = ''
commentUrl = 'https://weibo.com/ajax/statuses/buildComments'
text = ''


def init(articleCommentsFilePath):
    """Create the comments CSV with its header row, unless it already exists."""
    if os.path.exists(articleCommentsFilePath):
        return  # never clobber an existing file; rows are appended later
    header = [
        'articleId',
        'created_at',
        'likes_counts',
        'region',
        'content',
        'authorName',
        'authorGender',
        'authorAddress',
    ]
    with open(articleCommentsFilePath, 'w', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(header)


def writerRow(row, articleCommentsFilePath):
    """Append a single CSV row to the comments file.

    Fix: the original called ``csvFile.flush()`` and ``csvFile.close()``
    inside the ``with`` block — the context manager already flushes and
    closes on exit, and an explicit close() inside ``with`` is a smell.
    """
    with open(articleCommentsFilePath, 'a', encoding='utf-8', newline='') as csvFile:
        csv.writer(csvFile).writerow(row)


def get_data(url, params):
    """GET the Weibo comments endpoint and return its ``data`` payload.

    Side effect: updates the module-level ``max_id`` pagination cursor
    that ``start()`` uses to decide whether to fetch the next page.

    Robustness fix: the original indexed ``response_json['max_id']`` and
    ``['data']`` directly and crashed with KeyError when the endpoint
    returned an error object (e.g. expired cookie). Use ``.get()`` so a
    malformed body behaves like "no data, stop paging" instead.
    Returns None on a non-200 response or missing payload.
    """
    global max_id
    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        return None
    response_json = response.json()
    max_id = response_json.get('max_id', 0)  # 0 terminates the paging loop in start()
    return response_json.get('data')


def getAllArticleList(articleDataFilePath):
    """Read the article CSV and return every row (header included) as a list.

    Fixes: the original called ``reader.flush()`` on a file opened for
    reading (meaningless for an input stream), and built the list with a
    manual append loop. Also pass ``newline=''`` as the csv module
    recommends so quoted embedded newlines parse correctly.
    """
    with open(articleDataFilePath, 'r', encoding='utf-8', newline='') as reader:
        return list(csv.reader(reader))


def parse_json(response, articleId, articleCommentsFilePath):
    """Extract fields from each comment dict and append them to the CSV.

    Also accumulates the cleaned comment text into the module-level
    ``text`` buffer, which ``start()`` feeds to the word-cloud renderer.

    Fixes: removed the needless ``global region`` (region is purely local
    to each iteration; the global leaked state across calls) and replaced
    the bare ``except:`` with the specific exceptions that line can raise.
    """
    global text
    if response is None:
        return
    for comment in response:
        created_at = datetime.strptime(
            comment['created_at'], '%a %b %d %H:%M:%S %z %Y'
        ).strftime('%Y-%m-%d')
        likes_counts = comment['like_counts']
        try:
            # 'source' may be absent or None — fall back to a placeholder.
            region = comment['source'].replace('来自', '')
        except (KeyError, AttributeError):
            region = '无'
        content = clearData(comment['text_raw'])  # data cleaning
        if content == "":
            continue  # skip comments that are empty after cleaning
        text += content

        user = comment['user']
        writerRow([
            articleId,
            created_at,
            likes_counts,
            region,
            content,
            user['screen_name'],
            user['gender'],
            user['location'],
        ], articleCommentsFilePath)


def start(articleDataFilePath, articleCommentsFilePath):
    """Crawl the comments of every article listed in the article CSV.

    For each article id, pages through the Weibo comments endpoint
    (pagination driven by the module-level ``max_id`` that ``get_data``
    updates), persists each comment via ``parse_json``, and finally
    renders a word-cloud image from the accumulated comment text.
    """
    init(articleCommentsFilePath)
    articleList = getAllArticleList(articleDataFilePath)
    # Skip the CSV header row.
    for article in articleList[1:]:
        global articleId
        articleId = article[0]  # assumes the id is in column 0 — TODO confirm against the CSV layout

        global text
        text = ''  # reset the word-cloud text buffer for this article

        start_time = time.time()
        print('正在爬取id值为%s的文章评论' % articleId)
        time.sleep(random.uniform(0, 1))  # random delay to reduce rate-limit risk
        params = {
            'id': int(articleId),
            'is_show_bulletin': 2
        }
        response = get_data(commentUrl, params)
        parse_json(response, articleId, articleCommentsFilePath)

        max_page = 100  # hard cap on pages fetched per article
        max_id_last = 0
        # get_data() sets max_id to 0 on the last page, which ends this loop.
        while max_id != 0:
            max_page -= 1
            if max_page < 0:
                break
            print(max_id)
            if max_id_last == max_id: break  # cursor stopped advancing — bail out
            max_id_last = max_id
            params = {
                'id': int(articleId),
                'is_show_bulletin': 2,
                'max_id': int(max_id)  # pagination cursor from the previous response
            }
            response = get_data(commentUrl, params)
            parse_json(response, articleId, articleCommentsFilePath)
            time.sleep(random.uniform(0, 0.5))

        end_time = time.time()
        print('耗时:' + str(end_time - start_time))
        print()
        get_img(articleId,text)  # render the word cloud for this article's comments


#if __name__ == '__main__':
#   start()

#globalVariable.py

import os
from datetime import datetime

# Shared HTTP headers for every Weibo API request.
# NOTE(review): the Cookie is a hard-coded, account-specific session token —
# it expires and will break the crawler when it does, and it should not be
# committed to source control. Move it to a config file or env variable.
headers = {
    'Cookie': 'SINAGLOBAL=8240438899311.014.1696773183143; SCF=AmrnNqo7P2iXmQ3NUrBALgX7jeCXIpFT4Gs5T0suCF-JPfazBZLqNgNqZCrLFQXeez5g0PPSmlpDEpo-rHrOwoE.; UOR=,,www.baidu.com; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWZsKabTbTcpwExYBKrSd265JpX5KMhUgL.Foe0eKM4eo-0Sh52dJLoI0YLxK.L1KMLB--LxK-L1K5L1-zLxK-L1KeLB-2LxKnL1heLB.BLxKqLBonL1h-LxK-LBKqL1hzLxKqLBo5LBoBt; ULV=1727100176835:15:3:2:6609349842637.535.1727100176829:1727011678443; XSRF-TOKEN=Dehifh4IfSPCO6o5UGYoxSEp; ALF=1730468107; SUB=_2A25L-TxbDeRhGeVN6lUY8ivPzzyIHXVpdzGTrDV8PUJbkNANLWmskW1NTI12JCXuXFvGbMlyYJ5emHm8bMhRhqPw; WBPSESS=d1RoWYRMvf7R2BqLPDL0saG9CHbOSvtzfEfEtDn2JIFGsdDPcOHfjE_yYuPiwp41wF80Yw-6jCdWfADzzomAhNIfmto_wzKqj-YV8et1NMDXibmZxTkF1zEUTqjYCzdkoOTx2eV4fTd4hjRNAFunkw==',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
}


def initGlobalVariable():
    """Build the output file paths for one crawl run.

    The content and comments paths carry a per-run timestamp so runs never
    overwrite each other; the category path is fixed.

    Returns:
        (articleCategoryFilePath, articleDataFilePath, articleCommentsFilePath)
    """
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    base = 'D:\\weibo\\spiders\\data\\'
    articleDataFilePath = base + 'articleContent_' + stamp + '.csv'
    articleCommentsFilePath = base + 'articleComments_' + stamp + '.csv'
    articleCategoryFilePath = base + 'articleCategory.csv'
    return articleCategoryFilePath, articleDataFilePath, articleCommentsFilePath


# Smoke test: building the paths has no side effects (nothing is created on disk).
if __name__ == '__main__':
    initGlobalVariable()


...全文
124 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

121,059

社区成员

发帖
与我相关
我的任务
社区描述
欢迎云计算、网络、云原生、大数据、服务器、Devops、python等领域工程师,一起互相学习交流,提升技术!
云原生云计算devops 个人社区 湖北省·武汉市
社区管理员
  • 江湖有缘
  • 奇零才子
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告

社区公告

欢迎辞与社区指南

亲爱的朋友们,

欢迎来到我们的技术交流社区!这里是一个充满热情和技术交流的地方,我们诚挚地邀请每一位对运维技术感兴趣的朋友加入。无论您是经验丰富的专家,还是初出茅庐的新手,我们都期待您的参与。

社区规则与期望

  1. 友好讨论:我们鼓励大家在这里友好讨论各种技术相关问题,分享知识和见解。
  2. 多多发帖:积极参与讨论,每天收获一点,相信您的技术会不断进步!

加入我们

我们期待与您一起携手共创一个更加精彩的技术世界!立即扫码或点击链接加入我们吧!

  • 主群(已满):942602415
  • 1群:906554602
  • 2群:863915594

由于主群人数已达上限,建议新朋友加入【QQ - IT运维技术交流群】:906554602 或选择其他可用的分群。

感谢您对我们社区的关注和支持!如果您有任何疑问或需要帮助,请随时联系管理员。让我们共同维护一个积极向上、互帮互助的技术交流环境。

祝您在本社区中学习愉快,收获满满!

注:请确保遵守所有社交平台的相关规定,保持良好的网络行为。

 

试试用AI创作助手写篇文章吧