
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
# import sqlite3
def main():
    """Run the scraper: fetch the feed page, extract its links, then save.

    The feed URL and the output path are hard-coded here.
    """
    baseurl = "http://www.prnasia.com/m/mediafeed/rss?id=2303&t=240"
    # NOTE: saveData() currently accepts only a path, so the scraped result
    # is fetched (and printed by getData) but not yet handed to it.
    datalist = getData(baseurl)
    savepath = ".\\wenjian"
    saveData(savepath)
# Matches an anchor opening tag of the exact form
# <a href="..." target="_blank"> and captures the href value (non-greedy).
findLink = re.compile(
    r'<a href="(.*?)" target="_blank">'
)
def getData(baseurl):
    """Download ``baseurl`` and collect the links found in its press columns.

    The page is fetched with ``askUrl`` and parsed with BeautifulSoup. For
    every ``<div class="presscolumn">`` element, the ``findLink`` pattern is
    applied to the element's markup; the matches are printed and added to the
    result.

    Args:
        baseurl: URL of the page to scrape.

    Returns:
        A flat list of the href strings captured by ``findLink``, in document
        order. Empty when the page has no matching elements (including when
        ``askUrl`` returned an empty string).
    """
    datalist = []
    html = askUrl(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for presscolumn in soup.find_all('div', class_="presscolumn"):
        # The regex operates on text, so serialise the tag back to markup.
        item = str(presscolumn)
        link = re.findall(findLink, item)
        print(link)
        # Previously the matches were only printed and the function always
        # returned an empty list; keep them so callers can use the result.
        datalist.extend(link)
    return datalist
def askUrl(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a browser-like ``User-Agent`` header.

    Args:
        url: Address to request.

    Returns:
        The decoded response text, or an empty string when the request fails
        with ``urllib.error.URLError`` (the error's ``code`` and/or ``reason``
        are printed in that case).
    """
    head = {
        "User-Agent": "Mozilla/5.0(Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response (and its socket) even if
        # reading or decoding fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError carries a status code; plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def saveData(savepath):
    """Placeholder for persisting scraped data.

    Currently this only prints a progress message; nothing is written and
    ``savepath`` is not used yet.

    Args:
        savepath: Intended output location.
    """
    print("save....")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()