37,743
社区成员




import time
from selenium import webdriver
import pymysql
import uuid
class mainAll(object):
def __init__(self):
self.conn = pymysql.connect(host='localhost', user='root', passwd='123', db='tianyan', port=3306, charset='utf8')
self.cur = self.conn.cursor() # 获取一个游标
self.main()
self.cur.close()
self.conn.close()
def main(self):
# 获取当前年月日
y = time.strftime('%Y', time.localtime(time.time())) # 年
m = time.strftime('%m', time.localtime(time.time())) # 月
d = time.strftime('%d', time.localtime(time.time())) # 日
data_time = time.strftime('%Y-%m-%d', time.localtime(time.time())) # 抓取时间
data_time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
website = '海丝商报'
# 创建相应时间的url地址
url = 'http://fjrb.fjsen.com/nasb/html/%s-%s/%s/node_122.htm' % (y, m, d)
driver = webdriver.Chrome()
driver.get(url)
# 找到版面数
sheets = driver.find_element_by_xpath("//table[@cellpadding='2']")
sheets_len = len(sheets.find_elements_by_tag_name('tr'))
# 找到每个版面的标题数量
for sheet in range(sheets_len):
titles = driver.find_element_by_xpath("//table[@cellpadding='1']")
titles_len = int(len(titles.find_elements_by_tag_name('tr')) / 2)
content_type = driver.find_element_by_xpath("//table[@cellpadding='2']").find_elements_by_tag_name('tr')
content_type = content_type[sheet].text.split(':')[-1] # 以冒号为分隔符切开版面的文字
# 点击版面的第一篇文章
title_button = driver.find_element_by_xpath("//*[@id='demo']/table[1]/tbody/tr[3]/td[2]/table/tbody/"
"tr[4]/td/table/tbody/tr/td[2]/table/tbody/tr[1]/td/table/"
"tbody/tr[4]/td/div/table/tbody/tr[1]/td[2]/a")
title_button.click()
for title in range(titles_len):
# 找到主标题和子标题的table表
title_table = driver.find_element_by_xpath(
"//*[@id='demo']/table/tbody/tr[3]/td[2]/table/tbody/tr[4]//tr")
content_title = title_table.find_elements_by_tag_name('p')[0].text
content_subtitle = title_table.find_elements_by_tag_name('p')[1].text
content = driver.find_element_by_xpath("//table[@class='content_tt']").text
# 获取左下角每一版的所有标题的链接
content_id = driver.find_elements_by_xpath("//*[@id='demo']/table/tbody/tr[3]/td[1]/table/tbody/tr[3]/"
"td/table//a")
content_id = content_id[title].get_attribute('href')
content_id = content_id.split('content_')[-1].split('.')[0] # 正则表达式没有处理成功!!!!!
# content_id = driver.current_url
# 'http://fjrb.fjsen.com/nasb/html/2017-09/21/content_1055929.htm?div=-1'
idd = str(uuid.uuid1())
idd.replace('-', '')
# 新闻时间和爬取时间是一个时候 sentiment_source 和sentiment_website是同一处理的
lists = (idd, content_title, content_subtitle, website, data_time, url, website, data_time_now, content,
content_id, content_type)
self.conn(lists)
driver.find_elements_by_xpath("//a[@class='preart']")[-1].click() # 点击下一篇章
# 当把一版的所有标题都走完以后,点击下一版,回到外层循环的页面
if title == titles_len - 1 and sheet == 0:
driver.find_elements_by_xpath("//a[@class='preart']")[0].click()
elif title == titles_len - 1:
driver.find_elements_by_xpath("//a[@class='preart']")[1].click()
elif title == 0 and sheet == 0:
flag = self.judge(content_id)
if flag > 0:
break
# 我这里的break会不会让定时程序都停止了
driver.close()
def conn(self, table):
# 名称 职位 公司名称 entuid
sql = "INSERT INTO sentiment_info (sentiment_id, sentiment_title, sentiment_subtitle, sentiment_source," \
"sentiment_time, sentiment_url,sentiment_website,sentiment_create_time,sentiment_content," \
"sentiment_source_id,sentiment_type) VALUES ( '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s'," \
" '%s','%s')"
self.cur.execute(sql % table)
self.conn.commit()
# 第一页点击 driver.find_element_by_xpath("//a[@class='preart']").click() 即可到下一页
def judge(self, content_id):
sql = "SELECT COUNT(*) FROM sentiment_info WHERE sentiment_source='海丝商报' AND sentiment_type='要闻'" \
" AND sentiment_source_id=%s", content_id
self.cur.execute(sql)
a = self.cur.fetchall()
a = max(max(a))
self.conn.commit()
return a
if __name__ == '__main__':
mainAll()
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe "D:\pycharm\PyCharm 2016.3\helpers\pydev\pydevd.py" --multiproc --qt-support --client 127.0.0.1 --port 53782 --file D:/pyworkpeace/HaiSi
warning: Debugger speedups using cython not found. Run '"C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe" "D:\pycharm\PyCharm 2016.3\helpers\pydev\setup_cython.py" build_ext --inplace' to build.
pydev debugger: process 7620 is connecting
Connected to pydev debugger (build 163.8233.8)
Traceback (most recent call last):
File "D:\pycharm\PyCharm 2016.3\helpers\pydev\pydevd.py", line 1596, in <module>
globals = debugger.run(setup['file'], None, None, is_module)
File "D:\pycharm\PyCharm 2016.3\helpers\pydev\pydevd.py", line 974, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "D:\pycharm\PyCharm 2016.3\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "D:/pyworkpeace/HaiSi", line 110, in <module>
mainAll()
File "D:/pyworkpeace/HaiSi", line 20, in __init__
self.main()
File "D:/pyworkpeace/HaiSi", line 75, in main
self.conn(lists)
TypeError: 'Connection' object is not callable
Process finished with exit code 1