# Stray page text accidentally pasted from the scraped site (not code); kept
# as a comment so the module still parses:
# "37,720 / 社区成员 / 发帖 / 与我相关 / 我的任务 / 分享"
# (member count / community members / post / related to me / my tasks / share)
# NOTE(review): this definition is a duplicate — it is shadowed by the second
# `get_soup` defined later in this file, and it precedes the imports it uses.
def get_soup(url, user_agent, funname):
    """Download `url` with the given User-Agent and return a BeautifulSoup tree.

    funname is the caller's name, used only to tag log lines in content.txt.
    Raises urllib2.URLError if the download fails (after logging the error).
    """
    filename = "content.txt"
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    try:
        write_log(filename, funname + ": Start down page" + "\n")
        response = urllib2.urlopen(req)
        write_log(filename, funname + ": End down page" + "\n")
    except urllib2.URLError as e:
        # %s-format instead of `+`: e.errno may be an int or None and
        # e.reason need not be a str, so concatenation raised TypeError.
        write_log(filename, "%s: %s\n" % (e.reason, e.errno))
        # Re-raise: `response` is undefined here, so falling through would
        # crash with a NameError on response.read() below.
        raise
    write_log(filename, funname + ": Start read page" + "\n")
    html = response.read()
    write_log(filename, funname + ": End read page" + "\n")
    return BeautifulSoup(html, 'lxml')
import urllib
import urllib2
import re
from bs4 import BeautifulSoup
import time
import datetime
def write_log(filename, log_text):
    """Append log_text to the file at filename.

    I/O errors are reported but deliberately swallowed so that a failed
    log write never aborts the crawl.
    """
    try:
        with open(filename, 'a') as f:
            f.write(log_text)
    except IOError as e:
        # The original `print e.errno + e.message` raised TypeError
        # (int + str); printing the exception itself is always safe.
        print(e)
def get_soup(url, user_agent, funname):
    """Download `url` with the given User-Agent and return a BeautifulSoup tree.

    funname is the caller's name, used only to tag log lines in content.txt.
    Raises urllib2.URLError if the download fails (after logging the error).
    """
    filename = "content.txt"
    headers = {"User-Agent": user_agent}
    req = urllib2.Request(url, headers=headers)
    try:
        write_log(filename, funname + ": Start down page" + "\n")
        response = urllib2.urlopen(req)
        write_log(filename, funname + ": End down page" + "\n")
    except urllib2.URLError as e:
        # %s-format instead of `+`: e.errno may be an int or None and
        # e.reason need not be a str, so concatenation raised TypeError.
        write_log(filename, "%s: %s\n" % (e.reason, e.errno))
        # Re-raise: `response` is undefined here, so falling through would
        # crash with a NameError on response.read() below.
        raise
    write_log(filename, funname + ": Start read page" + "\n")
    html = response.read()
    write_log(filename, funname + ": End read page" + "\n")
    return BeautifulSoup(html, 'lxml')
def get_post_next_url_list(url, agent=None):
    """Return the absolute URL of the next listing page, or '' on the last page.

    agent: optional User-Agent string. Defaults to the module-level
    `user_agent` global (the original, hidden dependency — kept for
    backward compatibility); pass it explicitly in new code.
    """
    if agent is None:
        agent = user_agent  # backward-compatible fallback to the module global
    soup = get_soup(url, agent, "get_post_next_url_list")
    next_page = soup.find_all(class_=re.compile("next.*pagination-item"))
    if len(next_page) > 0:
        # The href is scheme-relative ("//tieba.baidu.com/..."),
        # so prepend "http:" to make it absolute.
        return "http:" + next_page[0]["href"]
    return ''
# NOTE: the triple-quoted string below is get_post_contents(), deliberately
# disabled (see the commented call in get_tieba_content). It fetched each
# post page and appended "author:content" lines to content.txt, sleeping 2s
# between posts. Kept verbatim for reference.
'''
def get_post_contents(urls):
headers = {"User-Agent": user_agent}
for url in urls:
url = 'http://tieba.baidu.com' + url.a['href']
filename = "content.txt"
write_log(filename, url + "\n")
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)
html = response.read()
soup = BeautifulSoup(html, 'lxml')
divs = soup.find_all(class_=re.compile("l_post j_l_post l_post_bright"))
for div in divs:
name = div.find_all(class_="d_name")
if len(name) > 0:
name = name[0].find(class_=re.compile("p_author_name")).string
content = div.find_all(class_=re.compile("d_post_content.*clearfix"))
if len(content) > 0:
content = content[0].get_text().strip()
user_content = name + ":" + content + "\n"
write_log(filename, user_content.encode("utf-8"))
time.sleep(2)
write_log(filename, "\n" + "\n")
'''
def get_post_url_list(url, agent=None):
    """Fetch a forum listing page and return its post-title anchor tags.

    Side effect: appends the absolute URL of every post to content.txt.
    agent: optional User-Agent string; defaults to the module-level
    `user_agent` global (the original, hidden dependency — kept for
    backward compatibility).
    """
    if agent is None:
        agent = user_agent  # backward-compatible fallback to the module global
    soup = get_soup(url, agent, "get_post_url_list")
    posts = soup.find_all(class_="threadlist_title pull_left j_th_tit ")
    filename = "content.txt"
    # Renamed loop variable: the original reused `url`, shadowing the parameter.
    for post in posts:
        write_log(filename, 'http://tieba.baidu.com' + post.a['href'] + "\n")
    return posts
def get_tieba_content(url):
    """Walk every listing page of the forum starting at `url`.

    Each page's URL and the URLs of its posts are appended to content.txt;
    iteration stops when no next page is found.
    """
    log_file = "content.txt"
    current = url
    while current.strip():
        write_log(log_file, current + "\n")
        post_links = get_post_url_list(current)
        #get_post_contents(urls)
        current = get_post_next_url_list(current)
if __name__ == '__main__':
    # `user_agent` must keep this exact name: the scraping helpers read it
    # as a module-level global.
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
    # Search-result listing page of the target tieba (URL-encoded keyword).
    start_url = "http://tieba.baidu.com/f?ie=utf-8&kw=%E8%8B%B1%E5%9B%BD%E6%96%97%E7%89%9B%E7%8A%AC&fr=search"
    get_tieba_content(start_url)