python3.6 利用多线程 beautifulshop爬取网页

_Death__Knight 2018-11-13 09:47:07
get_text等获取text函数 报错 nonetype
...全文
208 4 打赏 收藏 转发到动态 举报
写回复
用AI写文章
4 条回复
切换为时间正序
请发表友善的回复…
发表回复
_Death__Knight 2018-11-13
  • 打赏
  • 举报
回复
从doit里创建线程,线程执行downloading get_txt函数内的content = text_content.text.replace('  ', '\n\n  ').replace(' ', '\n\n  ').replace('\n\n', '\n') 报错AttributeError: 'NoneType' object has no attribute 'text'
_Death__Knight 2018-11-13
  • 打赏
  • 举报
回复
import requests
from bs4 import BeautifulSoup
import urllib
from urllib import request
import  time
import threading
from tkinter import *
import pymysql
from datetime import datetime
from tkinter.scrolledtext import  ScrolledText
'''
数据库参数
'''
host = 'localhost'
user = 'root'
password = '123456'
database = 'txt'
db = pymysql.connect(host, user, password, database)
cursor = db.cursor()

'''
申请http头
'''
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh,zh-TW;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
    'cache-control': 'max-age=0',
    'cookie': 'UM_distinctid=1669ab46d60cf-0b7dbd5850197f-b79193d-1fa400-1669ab46d6176b; width=85%25; CNZZDATA1259955010=2006150613-1540191850-https%253A%252F%252Fwww.baidu.com%252F%7C1541933446; Hm_lvt_17077e8d2ddd3bade67fd92a3bcbbc9f=1541930887,1541931811,1541933115,1541933565; Hm_lpvt_17077e8d2ddd3bade67fd92a3bcbbc9f=1541933565',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}

"""
爬取小说类
"""
class Downloader(object):
    """
    构造函数
    """

    def __init__(self, target):
        self._target = target

    """
    获取每一章的url
    """

    def get_url(self):
        req = requests.get(url=self._target)
        bf = BeautifulSoup(req.text, 'html5lib')
        content_url = bf.find('div', id='list')
        bf_a = BeautifulSoup(str(content_url), 'html5lib')
        a = bf_a.find_all('a')
        return a

    """
    获取每一章的内容
    """

    def get_txt(self, content_url, title):
        req = requests.get(url=content_url)
        bf = BeautifulSoup(req.text, 'html.parser')
        text_content = bf.find('div', id='content')
        content = text_content.text.replace('  ', '\n\n  ').replace('    ', '\n\n  ').replace('\n\n', '\n')
        txt = title + '\n' + content
        return txt

    """
    下载文件
    """

    def write_txt(self, filename, txt):
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(txt)

"""
根据名字搜索小说
"""
def search_novel(content):
    content_code = urllib.request.quote(content)  # 解决中文编码的问题
    url = 'https://www.biquge5200.cc/modules/article/search.php?searchkey=' + content_code
    r = requests.get(url, params=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    for table in soup.findAll('table')[0:1]:
        for trr in table.findAll('tr'):
            for tdd in trr.findAll('td')[0:1]:
                if tdd.a.string == content:
                    return tdd.a['href']

def downloading(content,d,a,i,filename):
    txt = d.get_txt(a[i].get('href'), a[i].string)
    d.write_txt(filename, txt)

"""
command事件函数
"""
def doit(var):
    content = var
    tk = Tk()
    tk.title("下载进度")
    tk.geometry('300x200')
    text = ScrolledText(tk, font=('微软雅黑', 10), fg='blue')
    text.grid()
    tk.update()
    now = datetime.now()
    time_begin = datetime.strftime(now, '%Y-%m-%d %H:%M:%S')
    target = search_novel(content)
    filename = content + '.txt'
    d = Downloader(target)
    a = d.get_url()
    length = len(a)
    for i in range(9, length):
        th = threading.Thread(target=downloading, args=(content,d,a,i,filename,))
        th.setDaemon(True)  # 守护线程
        th.start()
        text.insert(END, a[i].string+'ok' + '\n')
        time.sleep(0.001)
    now = datetime.now()
    time_end = datetime.strftime(now, '%Y-%m-%d %H:%M:%S')
    database(content,time_begin,time_end)




'''
数据库操作函数
'''
def database(name,time_begin,time_end):
    cursor.execute("insert into text values ('%s','%s','%s')" % (name, time_begin, time_end))
    cursor.connection.commit()
    cursor.close()
    db.close()

"""
主函数
"""


def main():
    root = Tk()
    root.title("TXT")
    root.geometry('300x100')
    Label(root, text="请输入小说名字:").grid(row=0, column=0)
    var = StringVar()
    e1 = Entry(root, textvariable=var)
    e1.grid(row=0, column=1)
    theButton1 = Button(root, text="开始", width=10, command=lambda: doit(e1.get())).grid(row=2, column=1)
    root.mainloop()


"""
开始入口
"""
if __name__ == '__main__':
    main()
_Death__Knight 2018-11-13
  • 打赏
  • 举报
回复
引用 1 楼 ruancan 的回复:
你这个信息有点儿少啊
过会儿贴代码
ruancan 2018-11-13
  • 打赏
  • 举报
回复
你这个信息有点儿少啊

37,719

社区成员

发帖
与我相关
我的任务
社区描述
JavaScript,VBScript,AngleScript,ActionScript,Shell,Perl,Ruby,Lua,Tcl,Scala,MaxScript 等脚本语言交流。
社区管理员
  • 脚本语言(Perl/Python)社区
  • IT.BOB
加入社区
  • 近7日
  • 近30日
  • 至今

试试用AI创作助手写篇文章吧