爬CSDN博客文章的代码,为什么登录失败了

乐百川 2017-04-08 12:18:54

import requests
from bs4 import BeautifulSoup
import re


class CsdnHelper:
csdn_login_url = 'https://passport.csdn.net/account/login?ref=toolbar'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'DNT': '1',
'Referer': 'https://passport.csdn.net/account/login?ref=toolbar',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4',
}

def __init__(self):
self._session = requests.session()
self._session.headers = CsdnHelper.headers

def login(self, username, password):
form_data = self._prepare_login_form_data(username, password)
response = self._session.post(CsdnHelper.csdn_login_url, data=form_data)
print(response.text)
if 'UserNick' in response.cookies:
print(response.cookies['UserNick'])
else:
raise Exception('登录失败')

def _prepare_login_form_data(self, username, password):
response = self._session.get(CsdnHelper.csdn_login_url)
login_page = BeautifulSoup(response.text, 'lxml')
login_form = login_page.find('form', id='fm1')

lt = login_form.find('input', attrs={'name': 'lt'})['value']
execution = login_form.find('input', attrs={'name': 'lt'})['value']
eventId = login_form.find('input', attrs={'name': 'lt'})['value']
form = {
'username': username,
'password': password,
'lt': lt,
'execution': execution,
'_eventId': eventId
}

return form

def _get_blog_count(self):

blog_base_url = 'http://write.blog.csdn.net/postlist/'
response = self._session.get(blog_base_url)
blog_page = BeautifulSoup(response.text, 'lxml')
span = blog_page.find('div', class_='page_nav').span
pattern = re.compile(r'(\d+)条 共(\d*)页')
result = pattern.match(span.string)
blog_count = int(result.group(1))
page_count = int(result.group(2))
return (blog_count, page_count)

def print_blogs(self):
blog_count, page_count = self._get_blog_count()
for index in range(1, page_count + 1):
url = f'http://write.blog.csdn.net/postlist/0/0/enabled/{index}'
response = self._session.get(url)
page = BeautifulSoup(response.text, 'lxml')
links = page.find_all('a', href=re.compile(r'http://blog.csdn.net/u011054333/article/details/(\d+)'))
print(f'----------第{index}页----------')
for link in links:
blog_name = link.string
blog_url = link['href']
print(f'文章名称:{blog_name} 文章链接:{blog_url}')


if __name__ == '__main__':
csdn_helper = CsdnHelper()
username = XXX
password = XXX
csdn_helper.login(username, password)



我感觉几个部分写的都挺对的,结果登录URL返回的是一个错误页面。但是我浏览器登录好好的,请问我这代码哪里不对?
...全文
140 回复 打赏 收藏 转发到动态 举报
AI 作业
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

37,743

社区成员

发帖
与我相关
我的任务
社区描述
JavaScript,VBScript,AngleScript,ActionScript,Shell,Perl,Ruby,Lua,Tcl,Scala,MaxScript 等脚本语言交流。
社区管理员
  • 脚本语言(Perl/Python)社区
  • WuKongSecurity@BOB
加入社区
  • 近7日
  • 近30日
  • 至今

试试用AI创作助手写篇文章吧