37,720
社区成员
发帖
与我相关
我的任务
分享
import re
def phraseDomainName(namestr):
    """Split a host name into a ``(hostname, domain, suffix)`` tuple.

    A name containing exactly one dot gets ``"www."`` prepended before it is
    matched.  The returned pieces keep their separating dots, e.g.
    ``"www.baidu.com"`` gives ``("www.", "baidu", ".com")``.

    Only lower-case ASCII letters, digits, ``_`` and ``-`` are accepted in a
    label, and the pattern is searched for (not anchored), so trailing text
    such as a path is ignored.

    Known limitation: the leading label group is greedy and only its last
    repetition is captured, so names with more than three labels are not
    split reliably (``"abc.baidu.com.cn"`` gives ``("baidu.", "com", ".cn")``).

    Args:
        namestr: the host name to split.

    Returns:
        A 3-tuple of strings on success, or ``False`` when ``namestr``
        contains no dot or does not match the pattern.
    """
    if '.' not in namestr:
        return False
    if namestr.count('.') == 1:
        namestr = "www." + namestr

    # Raw string so the backslashes reach the regex engine untouched; the dot
    # that introduces the optional second suffix level is escaped so it only
    # matches a literal '.'.
    pattern = (r"([a-z0-9_-]{1,32}\.)+([a-z0-9_-]{1,32})"
               r"((\.[a-z]{2,4})(\.[a-z]{1,2})?)")
    match = re.search(pattern, namestr)
    if match is None:
        # No match used to crash with AttributeError on match.group().
        return False

    # Groups 1-3 are all mandatory parts of the pattern, so none of them can
    # be None once the search has succeeded.
    return (match.group(1), match.group(2), match.group(3))
# Interactive driver: prompt for a URL and print whatever phraseDomainName
# returns for it.
# NOTE(review): the indentation of this snippet was lost, so it is unclear
# whether input("end") belongs inside the loop body. `while True` has no
# break, so if it sits after the loop it is never reached -- confirm against
# the original script.
while True:
domain=input("input url:")
print(phraseDomainName(domain))
input("end")
#! /usr/bin/env python
def parseURL(url):
    """Split a host name into a ``(host, domain, suffix)`` tuple.

    The host is the text before the first dot, the domain is the text between
    the first and second dots, and the suffix is everything after the second
    dot.  A known two-level suffix at the END of the name (``.com.cn``,
    ``.js.cn``) counts as a single label, so ``baidu.com.cn`` is treated like
    ``baidu.com``: when only one separating dot remains and the name does not
    already start with ``www.``, ``www.`` is prepended.

    Args:
        url: host name such as ``'baidu.com'`` or ``'abc.baidu.com.cn'``.

    Returns:
        A 3-tuple of strings ``(host, domain, suffix)``.

    Raises:
        ValueError: if the name has too few labels to be split three ways
            (for example ``'localhost'`` or ``'www.com'``).
    """
    lookList = ('.com.cn', '.js.cn')  # expand this tuple if necessary

    # Only a suffix at the very end is special.  Matching it anywhere in the
    # string (and later "recovering" it by text replacement) mangled names
    # such as 'x.com.cn.evil.org' or 'www.foo.comcn'.
    tail = ''
    for known in lookList:
        if url.endswith(known):
            tail = known
            break

    head = url[:len(url) - len(tail)]
    # Number of separating dots when the two-level suffix counts as one label.
    dots = head.count('.') + (1 if tail else 0)
    if dots == 1 and not url.startswith('www.'):
        url = 'www.' + url
        dots += 1
    if dots < 2:
        raise ValueError(
            'cannot split %r into host, domain and suffix' % (url,))

    # The second dot is in the same place whether or not the suffix is
    # treated as one label, so a plain two-way split gives the three parts.
    host, domain, suffix = url.split('.', 2)
    return (host, domain, suffix)
def main():
    """Print each sample host name followed by the tuple parseURL returns."""
    testurl = ['www.baidu.com', 'abc.baidu.com.cn', 'baidu.com',
               'baidu.com.cn', 'www.wx.js.cn', 'wx.js.cn']
    for url in testurl:
        # One pre-formatted argument: the Python 2 print statement used here
        # before is a syntax error on Python 3, while this form prints the
        # same "name (tuple)" text on both.
        print('%s %s' % (url, parseURL(url)))


if __name__ == '__main__':
    main()
>>> ================================ RESTART ================================
>>>
www.baidu.com ('www', 'baidu', 'com')
abc.baidu.com.cn ('abc', 'baidu', 'com.cn')
baidu.com ('www', 'baidu', 'com')
baidu.com.cn ('www', 'baidu', 'com.cn')
www.wx.js.cn ('www', 'wx', 'js.cn')
wx.js.cn ('www', 'wx', 'js.cn')
>>>
#! /usr/bin/env python
def parseURL(url):
    """Split a host name into a ``(host, domain, suffix)`` tuple.

    The host is the text before the first dot, the domain is the text between
    the first and second dots, and the suffix is everything after the second
    dot.  The two-level suffix ``.com.cn`` at the END of the name counts as a
    single label, so ``baidu.com.cn`` is treated like ``baidu.com``: when only
    one separating dot remains and the name does not already start with
    ``www.``, ``www.`` is prepended.  Other two-level suffixes are not known
    to this variant, so ``wx.js.cn`` splits as ``('wx', 'js', 'cn')``.

    Args:
        url: host name such as ``'baidu.com'`` or ``'abc.baidu.com.cn'``.

    Returns:
        A 3-tuple of strings ``(host, domain, suffix)``.

    Raises:
        ValueError: if the name has too few labels to be split three ways
            (for example ``'localhost'`` or ``'www.com'``).
    """
    lookList = ('.com.cn',)  # expand this tuple if necessary

    # Only a suffix at the very end is special.  Masking it anywhere in the
    # string and restoring it by text replacement corrupted names that
    # contain '.com.cn' in the middle or '.comcn' literally.
    tail = ''
    for known in lookList:
        if url.endswith(known):
            tail = known
            break

    head = url[:len(url) - len(tail)]
    # Number of separating dots when the two-level suffix counts as one label.
    dots = head.count('.') + (1 if tail else 0)
    if dots == 1 and not url.startswith('www.'):
        url = 'www.' + url
        dots += 1
    if dots < 2:
        raise ValueError(
            'cannot split %r into host, domain and suffix' % (url,))

    # The second dot is in the same place whether or not the suffix is
    # treated as one label, so a plain two-way split gives the three parts.
    host, domain, suffix = url.split('.', 2)
    return (host, domain, suffix)
def main():
    """Print each sample host name followed by the tuple parseURL returns."""
    testurl = ['www.baidu.com', 'abc.baidu.com.cn', 'baidu.com',
               'baidu.com.cn', 'www.wx.js.cn', 'wx.js.cn']
    for url in testurl:
        # One pre-formatted argument: the Python 2 print statement used here
        # before is a syntax error on Python 3, while this form prints the
        # same "name (tuple)" text on both.
        print('%s %s' % (url, parseURL(url)))


# unit test
if __name__ == '__main__':
    main()
www.baidu.com ('www', 'baidu', 'com')
abc.baidu.com.cn ('abc', 'baidu', 'com.cn')
baidu.com ('www', 'baidu', 'com')
baidu.com.cn ('www', 'baidu', 'com.cn')
www.wx.js.cn ('www', 'wx', 'js.cn')
wx.js.cn ('wx', 'js', 'cn')