Why does my Python crawler stop partway through its run?

有为少年 2017-08-13 01:02:25

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 11 16:31:42 2017
@author: lart
"""

import urllib.request
import re, time


def req_open_html(url):
    print('req_open_html begin')
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'}
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read().decode('GBK')
    print('req_open_html end')
    return html


def re_find_match(re_string, operation, html):
    print('re_find_match begin')
    pattern = re.compile(re_string, re.I)
    if operation == 'findall':
        result = pattern.findall(html)
    elif operation == 'match':
        result = pattern.match(html)
    else:
        print('this operation is invalid')
        exit(-1)
    print('re_find_match end')
    return result


if __name__ == '__main__':
    url_base = 'http://www.7kankan.la/book/1/'
    html = req_open_html(url_base)
    findall_title = re_find_match(r'<title>(.+?)</title>', 'findall', html)
    findall_article = re_find_match(r'<dd class="col-md-3"><a href=[\',"](.+?)[\',"] title=[\',"](.+?)[\',"]>', 'findall', html)

    with open(findall_title[0] + '.txt', 'w+', encoding='utf-8') as open_file:
        print('chapter list fetched:', findall_article)
        for i in range(len(findall_article)):
            print(i)
            open_file.write(findall_article[i][1] + '\n ---------------------------------------------- \n')
            url_arctile = url_base + findall_article[i][0]
            html_article = req_open_html(url_arctile)
            findall_article_txet = re_find_match(r'    (.+?)<br />', 'findall', html_article)
            findall_article_next = findall_article[i][0].replace('.html', '_2.html')
            url_arctile_next = url_base + findall_article_next
            html_article_next = req_open_html(url_arctile_next)
            if html_article_next:
                findall_article_txet.extend(re_find_match(r'    (.+?)<br />', 'findall', html_article_next))
                for text in findall_article_txet:
                    open_file.write(text + '\n')
            time.sleep(1)

    print('done writing file')


Python: 3.5
IDE: wingide 6.0
Sys: WIN 8.1


Problem:
When I run this on my machine, the output always stops partway through. No error is reported; the program just hangs there, and the task manager shows no obviously abnormal CPU or memory usage.
What I'd like to know is whether this is caused by the target site or by my own code. (Screenshots of the task manager and the IDE output followed here.)
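Before blaming the site, it helps to confirm where the process is actually stuck. Below is a minimal diagnostic sketch (my addition, not from the original post) using the standard-library faulthandler module, which periodically dumps every thread's stack; if the hang is in the network layer, the dump shows a frame blocked inside urlopen / socket reads rather than in the regex or file I/O.

# Diagnostic sketch: dump all thread stacks every 30 s while the script runs,
# so a silent hang reveals which call is blocking (e.g. urlopen with no timeout).
import faulthandler
import sys

faulthandler.dump_traceback_later(30, repeat=True, file=sys.stderr)

# ... run the crawler's main loop here ...

faulthandler.cancel_dump_traceback_later()  # stop the periodic dumps once done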


有为少年 2017-08-13
Quoting reply #1 from chuifengde:
When pages are opened frequently, a request sometimes hangs; set a timeout, catch the timeout error, and reopen the page.

Exactly right, thanks. Below is the revised code, which now runs to completion.

# -*- coding: utf-8 -*-
"""
Created on Fri Aug 11 16:31:42 2017
@author: lart
"""

import urllib.request
import re, time
import socket


def req_open_html(url):
    print('req_open_html begin')
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'}
    request = urllib.request.Request(url, headers=headers)
    # Retry until the fetch succeeds: with timeout=3, urlopen() raises
    # socket.timeout instead of blocking forever on a stalled connection.
    NET_STATUS = False
    while not NET_STATUS:
        try:
            html = urllib.request.urlopen(request, data=None, timeout=3).read().decode('GBK')
            print('NET_STATUS is good')
            print('req_open_html end')
            return html
        except socket.timeout:
            print('NET_STATUS is not good')
            NET_STATUS = False

def re_find_match(re_string, operation, html):
    print('re_find_match begin')
    pattern = re.compile(re_string, re.I)
    if operation == 'findall':
        result = pattern.findall(html)
    elif operation == 'match':
        result = pattern.match(html)
    else:
        print('this operation is invalid')
        exit(-1)
    print('re_find_match end')
    return result


if __name__ == '__main__':
    url_base = 'http://www.7kankan.la/book/1/'
    html = req_open_html(url_base)
    findall_title = re_find_match(r'<title>(.+?)</title>', 'findall', html)
    findall_article = re_find_match(r'<dd class="col-md-3"><a href=[\',"](.+?)[\',"] title=[\',"](.+?)[\',"]>', 'findall', html)

    with open(findall_title[0] + '.txt', 'w+', encoding='utf-8') as open_file:
        print('chapter list fetched:', findall_article)
        for i in range(len(findall_article)):
            print(i)
            open_file.write(findall_article[i][1] + '\n ---------------------------------------------- \n')
            url_arctile = url_base + findall_article[i][0]
            html_article = req_open_html(url_arctile)
            findall_article_txet = re_find_match(r'    (.+?)<br />', 'findall', html_article)
            findall_article_next = findall_article[i][0].replace('.html', '_2.html')
            url_arctile_next = url_base + findall_article_next
            html_article_next = req_open_html(url_arctile_next)
            if html_article_next:
                findall_article_txet.extend(re_find_match(r'    (.+?)<br />', 'findall', html_article_next))
                for text in findall_article_txet:
                    open_file.write(text + '\n')
            time.sleep(1)

    print('done writing file')
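One caveat about the revised req_open_html: if a chapter URL is permanently unreachable, the while loop retries forever and the script hangs again, just more politely. A sketch of a bounded-retry variant (max_retries, the growing backoff delay, and the None return are my own assumptions, not from the thread):

import socket
import time
import urllib.error
import urllib.request

def req_open_html_bounded(url, max_retries=5, timeout=3):
    """Fetch a page like req_open_html, but give up after max_retries attempts."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0'}
    request = urllib.request.Request(url, headers=headers)
    for attempt in range(1, max_retries + 1):
        try:
            return urllib.request.urlopen(request, timeout=timeout).read().decode('GBK')
        except (socket.timeout, urllib.error.URLError) as err:
            # Wait a little longer after each failure before retrying.
            print('attempt %d failed (%s), retrying' % (attempt, err))
            time.sleep(attempt)
    return None  # caller must treat the page as unreachable

With this version, the main loop can skip a chapter whose pages come back as None instead of spinning on it indefinitely.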
chuifengde 2017-08-13
When pages are opened frequently, a request sometimes hangs ("plays dead"); set a timeout, catch the timeout error, and reopen the page.
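For completeness: instead of passing timeout= to every urlopen() call, the same advice can be applied process-wide. A minimal sketch (my addition, not part of the reply) using the standard library's socket.setdefaulttimeout:

import socket

# Every socket created afterwards, including those urllib opens internally,
# times out after 3 seconds instead of blocking indefinitely.
socket.setdefaulttimeout(3)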
