Python3爬虫爬取文章标题和发布时间
螺丝鸽安全 2019-03-18 04:03:39
准备:Windows 7、PyCharm、Python 3.6
#coding:utf-8
#爬取 www.daqianduan.com网页的前10页内容,并以CSV文件格式保存在磁盘
import requests
from lxml import etree
import csv
import time
def spider(url, timeout=10):
    """Scrape article titles and publication times from one listing page.

    Downloads ``url``, parses the response as HTML and, for up to the first
    10 ``article`` elements under ``/html/body/section/div[1]/div``, passes a
    ``[title, pub_time]`` list to ``csv_write``.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    # Wait 2 seconds after each page fetch (presumably to throttle requests).
    time.sleep(2)
    selector = etree.HTML(response.text)
    for index in range(1, 11):
        articles = selector.xpath(
            '/html/body/section/div[1]/div/article[{}]'.format(index))
        if not articles:
            # The page holds fewer than 10 articles; later indexes cannot
            # match either, so stop here.
            break
        article = articles[0]
        titles = article.xpath('header/h2/a/text()')
        pub_times = article.xpath('p[1]/time/text()')
        if not titles or not pub_times:
            # Entry without a title link or a timestamp: nothing to record.
            continue
        # One CSV row per article: [title, publication time].
        csv_write([titles[0], pub_times[0]])
def csv_write(item, path='csv_spider'):
    """Append one scraped record to the CSV output file and report progress.

    Args:
        item: Sequence of field values for one row; ``item[0]`` is echoed in
            the progress message printed after the row is written.
        path: Destination file. Defaults to ``csv_spider`` in the current
            working directory.
    """
    # An explicit encoding keeps the output independent of the platform's
    # locale; a legacy default code page may be unable to encode scraped text.
    # newline='' lets the csv module control line endings itself.
    with open(path, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(item)
    print('正在下载:', item[0])
# Walk listing pages 1 through 10 in order and scrape each one.
base_url = 'http://www.daqianduan.com/page/'
for page_number in range(1, 11):
    spider(base_url + str(page_number))  # one spider() call per page URL