#coding:utf-8
import re
import urllib2
import chardet
from BeautifulSoup import BeautifulSoup
# Extract the main text of a web page and write it to a txt file.
def remove_js_css(content):
    """Remove javascript, stylesheet, comment, meta and ins content
    (<script>...</script>, <style>...</style>, <!-- ... -->, <meta ...>, <ins>...</ins>)."""
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    s = r.sub('', s)
    return s
def remove_empty_line(content):
    """Drop whitespace-only lines and collapse runs of newlines."""
    r = re.compile(r'''^\s+$''', re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''\n+''', re.M | re.S)
    s = r.sub('\n', s)
    return s
def remove_any_tag(s):
    """Strip every HTML tag, leaving only the text."""
    s = re.sub(r'''<[^>]+>''', '', s)
    return s.strip()
def remove_any_tag_but_a(s):
    """Return (length of anchor text, length of all text) for a block."""
    # [^r] apparently keeps <area ...> tags from matching as anchors.
    text = re.findall(r'''<a[^r][^>]*>(.*?)</a>''', s, re.I | re.S)
    text_b = remove_any_tag(s)
    return len(''.join(text)), len(text_b)
def remove_image(s, n=50):
    """Replace every <img> tag with n filler characters, so an image
    still contributes some weight to the text-density score."""
    image = 'a' * n
    r = re.compile(r'''<img.*?>''', re.I | re.M | re.S)
    s = r.sub(image, s)
    return s
def remove_video(s, n=1000):
    """Replace every <embed> tag with n filler characters (videos weigh more)."""
    video = 'a' * n
    r = re.compile(r'''<embed.*?>''', re.I | re.M | re.S)
    s = r.sub(video, s)
    return s
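The filler substitution is what keeps image- or video-only blocks from scoring as empty markup. A quick hypothetical check (not part of the crawler) of what it does:

# The <img> tag becomes 50 'a' filler characters.
print remove_image('before <img src="x.jpg"> after')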
def sum_max(values):
    """Maximum-sum contiguous subsequence (Kadane's algorithm);
    returns the (left, right+1) bounds of the best-scoring block."""
    cur_max = 0  # fixed: starting from values[0] double-counted the first element
    glo_max = -999999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if cur_max > glo_max:
            glo_max = cur_max
            right = index
        elif cur_max < 0:
            cur_max = 0
    # Walk back from the right edge, subtracting scores until the
    # running sum hits zero; that index is the left edge of the block.
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.00001:  # fixed: was abs(glo_max < 0.00001)
            left = i
            break
    return left, right + 1
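sum_max is the heart of the extractor: given one score per line group, it finds the contiguous run with the largest total, i.e. the densest text region. A hypothetical sanity check:

# The middle run 4 + (-1) + 5 = 8 is the best block, so the bounds are (2, 5).
print sum_max([-3, -2, 4, -1, 5, -4])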
def method_1(content, k=1):
    if not content:
        return None, None, None, None
    tmp = content.split('\n')
    group_value = []
    # Score each group of k lines: plain-text length minus anchor-text length,
    # minus a constant 8 so short, link-heavy groups go negative.
    for i in range(0, len(tmp), k):
        group = '\n'.join(tmp[i:i + k])
        group = remove_image(group)
        group = remove_video(group)
        text_a, text_b = remove_any_tag_but_a(group)
        temp = (text_b - text_a) - 8
        group_value.append(temp)
    left, right = sum_max(group_value)
    return left, right, len('\n'.join(tmp[:left])), len('\n'.join(tmp[:right]))
def extract(content):
    content = remove_empty_line(remove_js_css(content))
    left, right, x, y = method_1(content)
    return '\n'.join(content.split('\n')[left:right])
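Putting the pieces together, extract keeps only the lines inside the highest-scoring block. A hypothetical smoke test on a toy page (it should print just the two article lines, dropping the script, nav and footer):

page = ('<html><head><script>var x=1;</script></head><body>\n'
        '<a href="/nav">nav nav nav</a>\n'
        'This is a long paragraph of real article text, line one.\n'
        'This is a long paragraph of real article text, line two.\n'
        '<a href="/foot">footer</a>\n'
        '</body></html>')
print extract(page)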
# Given a url, write the main text of its news page to a txt file.
def extract_news_content(web_url, file_name):
    request = urllib2.Request(web_url)
    # Add a User-Agent header so the request looks like a browser.
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    html = opener.open(request).read()
    infoencode = chardet.detect(html)['encoding']  # detect the page encoding with a third-party module
    if html != None and infoencode != None:  # skip empty content or unknown encoding
        html = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(html)
        content = soup.renderContents()
        content_text = extract(content)  # pull the main text out of the news page as one unbroken passage
        # Clean up leftover HTML entities and tags.
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '"', content_text)
        content_text = re.sub("<[^>]+>", "", content_text)
        content_text = re.sub("\n", "", content_text)
        file = open(file_name, 'a')  # append
        file.write(content_text)
        file.close()
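A hypothetical one-off call (the url is a placeholder and the target directory must already exist):

extract_news_content('http://news.example.com/story.html', r'D:\Python27\newscn\demo.txt')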
# Crawl Baidu News search results: Chinese search, first 10 pages, url: key_word=keyword.
def search(key_word):
    search_url = 'http://news.baidu.com/ns?word=key_word&tn=news&from=news&cl=2&rn=20&ct=1'
    req = urllib2.urlopen(search_url.replace('key_word', key_word))
    real_visited = 0
    for count in range(10):  # first 10 pages
        html = req.read()
        soup = BeautifulSoup(html)
        content = soup.findAll("li", {"class": "result"})  # ResultSet object
        num = len(content)
        for i in range(num):
            # Parse out each news item's title, source, time and url.
            p_str = content[i].find('a')  # NoneType if there is no result
            contenttitle = p_str.renderContents()
            contenttitle = contenttitle.decode('utf-8', 'ignore')  # needed
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(p_str.get("href"))
            # Compare against the urls that were already crawled.
            visited_url = open(r'D:\Python27\visited-cn.txt', 'r')  # already visited?
            visited_url_list = visited_url.readlines()
            visited_url.close()  # close promptly
            exist = 0
            for item in visited_url_list:
                if contentlink == item.strip():  # fixed: readlines() keeps the trailing '\n'
                    exist = 1
            if exist != 1:  # url not visited yet
                p_str2 = content[i].find('p').renderContents()
                # The <p> snippet holds "source&nbsp;time".
                contentauthor = p_str2[:p_str2.find("&nbsp;")]  # source
                contentauthor = contentauthor.decode('utf-8', 'ignore')
                contenttime = p_str2[p_str2.find("&nbsp;") + len("&nbsp;") + 1:]  # time
                contenttime = contenttime.decode('utf-8', 'ignore')
                # One txt file per news item,
                # e.g. filename = "D:\\Python27\\newscn\\%d.txt" % (i)
                real_visited += 1
                file_name = r"D:\Python27\newscn\%d.txt" % (real_visited)
                file = open(file_name, 'w')
                file.write(contenttitle.encode('utf-8'))
                file.write(u'\n')
                file.write(contentauthor.encode('utf-8'))
                file.write(u'\n')
                file.write(contenttime.encode('utf-8'))
                file.write(u'\n' + contentlink + u'\n')
                file.close()
                extract_news_content(contentlink, file_name)  # also write the main text to the file
                visited_url_list.append(contentlink)
                visited_url = open(r'D:\Python27\visited-cn.txt', 'a')  # persist as visited so it survives restarts
                visited_url.write(contentlink + u'\n')
                visited_url.close()
            if len(visited_url_list) >= 120:
                break
        # Move on to the next result page.
        if len(visited_url_list) >= 120:
            break
        if count == 0:
            next_num = 0
        else:
            next_num = 1
        next_page = 'http://news.baidu.com' + soup('a', {'href': True, 'class': 'n'})[next_num]['href']  # search for the next page
        print next_page
        req = urllib2.urlopen(next_page)

if __name__ == '__main__':
    key_word = raw_input('input key word:')
    search(key_word)
import urllib
from bs4 import BeautifulSoup

url = 'http://www.baidu.com/s'
values = {'wd': 'wang'}
encoded_param = urllib.urlencode(values)
full_url = url + '?' + encoded_param
response = urllib.urlopen(full_url)
soup = BeautifulSoup(response)
soup.find_all('a')
But this does not seem to have any effect.
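Two likely reasons it looks like nothing happens: find_all returns a list that is simply discarded here, and Baidu may serve a stripped-down page (or refuse the request) to the default urllib client. A minimal sketch, assuming Python 2 with bs4 installed, that prints the links and sends a browser User-Agent:

import urllib
import urllib2
from bs4 import BeautifulSoup

url = 'http://www.baidu.com/s?' + urllib.urlencode({'wd': 'wang'})
# Send a browser-like User-Agent header with the request.
request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(request).read()
soup = BeautifulSoup(html)
for a in soup.find_all('a'):
    print a.get('href')  # find_all returns a list; it does nothing unless you use it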