sub_images在网页的哪里可以找到？

微电子学与固体电子学-俞驰 2017-04-29 04:02:16

# -*-coding:utf-8-*-

from requests.exceptions import RequestException

from bs4 import BeautifulSoup

from hashlib import md5

from multiprocessing import Pool

from urllib.parse import urlencode

import requests, re, os, json, threading





def get_index_page(offset, keyword='科技', timeout=10):
    """Fetch one page of Toutiao search results as raw text.

    The query parameters mirror the "Query String Parameters" the search
    page sends while it is scrolled; only ``offset`` changes between
    requests.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword. Defaults to the previously hard-coded value.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request raises
        ``RequestException``.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    print("url = ", url)
    try:
        # Without a timeout a stalled connection blocks forever, which would
        # hang the pool worker running this request.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('爬虫异常')
        return None
    if response.status_code == 200:
        return response.text
    return None





def parser_index_page(html):
    """Yield the article URLs listed in a search-result JSON payload.

    Args:
        html: JSON text whose top-level object is expected to hold a
            ``data`` list of entries, each optionally carrying an
            ``article_url`` key. ``None`` or an empty string is accepted.

    Yields:
        Each non-empty ``article_url`` value, in payload order. Nothing is
        yielded when the input is missing, is not valid JSON, or does not
        have the expected shape.
    """
    if not html:
        # The index fetch returns None on failure; there is nothing to parse.
        return
    try:
        payload = json.loads(html)
    except (TypeError, ValueError):
        # Not JSON (e.g. an HTML error page): treat as an empty result page.
        return
    if not isinstance(payload, dict):
        return
    entries = payload.get('data')
    if not isinstance(entries, list):
        # Covers both a missing key and an explicit JSON null.
        return
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        article_url = entry.get('article_url')
        if article_url:
            # Entries without an article URL cannot be fetched, so skip them
            # instead of handing None to the caller.
            yield article_url





def get_image_page(url, timeout=10):
    """Download the page at ``url`` and return its source text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request raises
        ``RequestException``.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole crawl.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('爬虫异常')
        return None
    if response.status_code == 200:
        return response.text
    return None





def parser_image_page(html):
    """Extract the gallery images from an article page and download them.

    The page is searched for a ``var gallery = ...;`` assignment whose value
    is parsed as JSON; the ``url`` of every entry in its ``sub_images`` list
    is passed to ``down_html`` together with the page title.

    Args:
        html: Source text of the article page.

    Returns:
        ``{'title': ..., 'images': [...]}`` when a gallery with a
        ``sub_images`` key was found, otherwise ``None``.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    if not title_tags:
        # The title names the download folder; without one there is nowhere
        # to store the images.
        return None
    title = title_tags[0].get_text()

    match = re.search('var gallery = (.*?);', html, re.S)
    if not match:
        return None
    try:
        gallery = json.loads(match.group(1))
    except ValueError:
        # The non-greedy match stops at the first ';', so a gallery whose
        # JSON itself contains ';' is truncated and cannot be parsed.
        return None
    if not isinstance(gallery, dict) or 'sub_images' not in gallery:
        return None

    sub_images = gallery.get('sub_images') or []
    # Keep only entries that actually carry a URL to download.
    images = [
        item.get('url')
        for item in sub_images
        if isinstance(item, dict) and item.get('url')
    ]
    for url in images:
        down_html(url, title)
    return {
        'title': title,
        'images': images,
    }





def down_html(url, title, timeout=10):
    """Download the resource at ``url`` and save its raw bytes under ``title``.

    Args:
        url: Address of the resource to download.
        title: Passed through to ``save_image`` to name the target folder.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``. The content is saved only when the status code is
        200; request failures are reported on stdout and otherwise ignored.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole crawl.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('爬虫异常')
        return None
    if response.status_code == 200:
        save_image(response.content, title)
    return None





def save_image(content, title, base_dir='D:/pic/'):
    """Write ``content`` to ``<base_dir>/<title>/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so saving the same
    bytes twice into the same folder is a no-op.

    Args:
        content: Raw bytes to store.
        title: Name of the sub-folder; converted with ``str`` and sanitised
            so it is usable as a directory name.
        base_dir: Root folder for all downloads. Defaults to the previously
            hard-coded location.

    Returns:
        None.
    """
    # Path separators and characters that Windows forbids in file names
    # would otherwise make directory creation fail or nest unexpectedly.
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(title))
    safe_title = safe_title.strip().rstrip('. ')
    path = os.path.join(base_dir, safe_title)
    # makedirs also creates a missing base_dir, and exist_ok avoids a race
    # when several pool workers create the same folder at once.
    os.makedirs(path, exist_ok=True)
    # The extension is appended exactly once (previously "<hash>..jpg").
    file_name = '{0}/{1}.jpg'.format(path, md5(content).hexdigest())
    if os.path.exists(file_name):
        return
    with open(file_name, 'wb') as f:
        f.write(content)
    print('保存成功', title, path)





def main(offset):
    """Crawl one search-result page and download the galleries it links to.

    Args:
        offset: Value forwarded to ``get_index_page`` as the search offset.

    Returns:
        None.
    """
    index_html = get_index_page(offset)
    print("html = ", index_html)
    if not index_html:
        # The index fetch returns None on failure; parsing None as JSON
        # would raise and kill this worker.
        return
    for url in parser_index_page(index_html):
        if not url:
            # Entries without an article URL cannot be fetched.
            continue
        # Example: url=http://toutiao.com/FAKE/resubmit?id=6407543743541838081
        # Fetch the page source (what the browser dev tools show for the URL).
        page_html = get_image_page(url)
        if page_html:
            # Downloads and saves the gallery images.
            parser_image_page(page_html)



if __name__ == '__main__':
    # get_index_page asks for 'count': 20 results per request. Assuming
    # `offset` is an item index, consecutive pages must be 20 apart; the
    # previous offsets 1..19 would request almost the same window each time.
    offsets = [page * 20 for page in range(20)]
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        pool.map(main, offsets)