37,721
社区成员
发帖
与我相关
我的任务
分享
import re
# Snippet 1: collect the cell texts of an HTML table, grouped by row.
# `s` is the HTML text to scan; it must be defined before this runs.

# First find the data between "<tr" and "</tr" and store it as the list result1.
result1 = re.findall(r'<tr.+?</tr', s, re.S)
# Matches a <td ...> opening tag, skips any tags that immediately follow it,
# and captures the first run of characters that are neither whitespace nor
# angle brackets.
rule = re.compile(r'<\s*td[^<>]*>(?:\s*<[^<>]*>)*\s*([^<>\s]+)', re.S)
# Iterate over result1 (the row fragments). Looping over the still-empty
# `result` list would never collect anything.
result = [rule.findall(row) for row in result1]
# result is now a two-dimensional list (one list of cells per row); print it.
for cells in result:
    print('-------------------------------')
    for cell in cells:
        print(cell)
import re
# Snippet 2: print every cell text in document order, without row grouping.
# `s` is the HTML text to scan; it must be defined before this runs.

# A <td ...> opening tag, any tags that immediately follow it, then the first
# run of characters that are neither whitespace nor angle brackets (captured).
CELL_TEXT = r'<\s*td[^<>]*>(?:\s*<[^<>]*>)*\s*([^<>\s]+)'

rule = re.compile(CELL_TEXT, re.S)
result = rule.findall(s)
for info in result:
    print(info)
import re
# Snippet 3: same extraction as a manual scan, resuming each search where the
# previous match ended. `s` is the HTML text to scan.
rule = re.compile(
    r'<\s*td[^<>]*>(?:\s*<[^<>]*>)*\s*(?P<INFO>[^<>\s]+)',
    re.S,
)
pos = 0
while True:
    m = rule.search(s, pos)
    if m is None:
        # No further cell text after `pos`.
        break
    print(m.group('INFO'))
    pos = m.end()
# Usage of html2table(); `html` must already hold the page text.
# NOTE(review): the original indentation was lost; the blank-line print is
# assumed to run once per row -- confirm.

# Ignore link ids: every cell is just its text.
rows = html2table(html)
for row in rows:
    for text in row:
        print(text)
    print('')

# Get link ids: every cell is an [id, text] pair.
rows = html2table(html, True)
for row in rows:
    for link_id, text in row:
        print('%s %s' % (link_id, text))
    print('')
#!/usr/bin/env python
import re
import urllib
def html2table(html, useid=False):
    """Extract the cell texts of the matching table rows from an HTML string.

    Only rows whose opening tag is exactly
    ``<tr align="center" bgcolor="FFFBEF">`` are considered.

    Args:
        html: HTML text to scan.
        useid: When false, each cell is its stripped text. When true, each
            cell is a ``[link_id, text]`` pair, where ``link_id`` is the
            ``id=`` value found inside the cell's ``<a ...>`` tag, or ``''``
            when the cell has no such link.

    Returns:
        A list with one list of cells per matching row.
    """
    trs = re.findall(r'<tr align="center" bgcolor="FFFBEF">.*?</tr>', html, re.DOTALL)
    rows = []
    for tr in trs:
        # List comprehensions (not map) so every row is a real, indexable and
        # re-iterable list on Python 3 as well as Python 2.
        if useid:
            cells = re.findall(r'>(?:<a [^<>]*id=(\w+)[^<>]*>)?([^<>]*)(?:</a>)?</td>', tr, re.DOTALL)
            cells = [[link_id.strip(), text.strip()] for link_id, text in cells]
        else:
            cells = re.findall(r'>([^<>]*)(?:</a>)?</td>', tr, re.DOTALL)
            cells = [text.strip() for text in cells]
        rows.append(cells)
    return rows
# Demo: download one result page and print its table twice.
# NOTE(review): the original indentation was lost; the blank-line print is
# assumed to run once per row -- confirm.
url = r'http://www.szfcweb.com/szwsfc/house_select_bottom.asp?action=sumb&pp=20&yt=b&qy=6&page=1'
response = urllib.urlopen(url)
try:
    html = response.read()
finally:
    # Close the connection explicitly instead of leaving it to the
    # garbage collector.
    response.close()

# Ignore link ids: every cell is just its text.
rows = html2table(html)
for row in rows:
    for text in row:
        print(text)
    print('')

# Get link ids: every cell is an [id, text] pair.
rows = html2table(html, True)
for row in rows:
    for link_id, text in row:
        print('%s %s' % (link_id, text))
    print('')