37,719
社区成员
发帖
与我相关
我的任务
分享
其中有四百张图片，我只想要其中60张商品的图片。
其实不知道你对图片的具体要求是什么，你还是得自己理解正则表达式如何编写，才能找到最好的方案。
import re sketchyStrList = re.findall( r'''(?<=<img).+?(?=/>)''' , urlStr ) resultStrList = [] for sketchyStr in sketchyStrList: resultStrList += re.findall( r'''(?<=").+?(?=")''' , sketchyStr ) print resultStrList
import re

# Extract every non-empty double-quoted attribute value found inside
# self-closing <img ... /> tags of urlStr (urlStr is expected to be defined
# by the surrounding code and to hold the HTML text).

# Step 1: grab the raw attribute text between "<img" and the closing "/>".
sketchyStrList = re.findall(r'''(?<=<img).+?(?=/>)''', urlStr)

# Step 2: pull the quoted values out of each tag.  The quotes are consumed by
# the pattern so that the text *between* two attributes (e.g. ' alt=' in
# 'src="a" alt="b"') is never mistaken for a value, which the earlier
# lookaround-only pattern did.
resultStrList = []
for sketchyStr in sketchyStrList:
    resultStrList.extend(
        value
        for value in re.findall(r'''"([^"]*)"''', sketchyStr)
        if value  # skip empty attributes such as alt=""
    )

# Parenthesised form prints the same text on Python 2 and Python 3.
print(resultStrList)
不用正则还可以锻炼字符串处理算法# -*- coding: cp936 -*- urlStr = '''<img data-ks-lazyload= "http://gi1.mlist.alicdn.com/bao/uploaded/i1/T1sk3BFtRgXXXXXXXX_!!0-item_pic.jpg_b.jpg" /> <img data-ks-lazyload= "testtest123" />''' resultStrList = [] resultStr = "" isInItem = False isInHttp = False for i in xrange(len(urlStr)): if urlStr[i] == '<': isInItem = True continue if urlStr[i] == '>': isInItem = False continue if isInItem: if urlStr[i] == '"': if isInHttp: resultStrList.append(resultStr) resultStr = "" isInHttp = not isInHttp continue if isInHttp: resultStr+=urlStr[i] continue print resultStrList result: >>> ['http://gi1.mlist.alicdn.com/bao/uploaded/i1/T1sk3BFtRgXXXXXXXX_!!0-item_pic.jpg_b.jpg', 'testtest123'] >>>
# -*- coding: cp936 -*-
# Regex-free extraction: scan urlStr one character at a time and collect
# every double-quoted value that appears inside a tag (between '<' and '>').
urlStr = '''<img data-ks-lazyload= "http://gi1.mlist.alicdn.com/bao/uploaded/i1/T1sk3BFtRgXXXXXXXX_!!0-item_pic.jpg_b.jpg" /> <img data-ks-lazyload= "testtest123" />'''
resultStrList = []  # completed quoted values, in order of appearance
resultStr = ""      # characters of the value currently being read
isInItem = False    # True while the scanner is between '<' and '>'
isInHttp = False    # True while inside a quoted value (any value, not only URLs)
for ch in urlStr:
    if isInHttp:
        # The quote state is checked first so that '<' or '>' occurring
        # inside a quoted value is kept as data instead of ending the tag.
        if ch == '"':
            resultStrList.append(resultStr)
            resultStr = ""
            isInHttp = False
        else:
            resultStr += ch
        continue
    if ch == '<':
        isInItem = True
    elif ch == '>':
        isInItem = False
    elif isInItem and ch == '"':
        # Opening quote of an attribute value; quotes outside tags are ignored.
        isInHttp = True
# Parenthesised form prints the same text on Python 2 and Python 3.
print(resultStrList)
result:
>>>
['http://gi1.mlist.alicdn.com/bao/uploaded/i1/T1sk3BFtRgXXXXXXXX_!!0-item_pic.jpg_b.jpg', 'testtest123']
>>>