python关于检索目录中子目录的问题，求高人指点。

oFangLiang1 2013-12-23 03:38:47

#filename Seek.py

import unicodedata

import sys

import os

class Seek():

    """

    功能:查找中文,并替换成指定字符或字符串

    使用方法:python脚本用法

    参数说明:

        -d    : 文件目录(绝对或相对路径)(默认为脚本所在目录)

        -t    : 文件类型(文件名后缀,如.jsp;.txt)(默认为所有文件)

        -sf   : 是否包括子目录(Y|N)(option,默认不包括子目录)

        -r    : 输出文件名(option,默认为'ChineseCharacter.txt'),位于脚本目录下

        -encoding:文件字符编码(默认为utf-8)

    """

        

    def __init__(self):

        """

        初始化查找程序

        参数解析

        """

        #TODO:

        self.d = '.'

        self.sf = 'N'

        self.t = 'ALL'

        self.r = 'ChineseCharacter.txt'

        self.encoding = 'utf-8'

        varL = 0

        for ar in sys.argv:

            if ar == '-d':

                self.d = sys.argv[varL+1]

                continue

            if ar =='-sf':

                sf = sys.argv[varL+1].upper()

                if (sf == 'Y') | (sf == 'N'):

                    self.sf = sf

                else:

                    print('input error with sf parameter')

                continue

            if ar=='-r':

                self.r = sys.argv[varL+1]

                continue

            if ar=='-t':

                self.t = sys.argv[varL+1]

                continue

                

            if ar == '-encoding':

                self.encoding = sys.argv[varL+1]

                continue

            varL+=1

    def seeking(self):

        """"

        开始查找字符

        

        """

        try:

            #output file

            self.rfile = open(self.r,'w',encoding=self.encoding)

            #start seek

            for f in os.listdir(self.d):

                path = os.path.join(self.d,f)

                if self.__isFile(path):

                    if self.t != 'ALL':

                        if f.endswith(self.t):

                            self.__seek(path)

                    else:

                        self.__seek(path)

                elif self.__isDir(path) and self.sf == 'Y' :

                    #seek the sub folder when the self.sf equals 'Y'

                    self.start(path)

        except Exception as error:

            print('seek error %s' % error)

        finally:

            self.__close()

       

                

    def __close(self):

        """

        关闭文件及输入流和输出流

        """

        #close the stream and file

        self.rfile.close()

    def __isFile(self,file):

        #

        return os.path.isfile(file)

    

    def __isDir(self,path):

        #

        return os.path.isdir(path)

    

    def __openFile(self,file):

        pass

    

    def __closeFile(self,file):

        file.close()

    def __seek(self,file):

        """

        查找

        

        """

        #seek character

        fileObj = open(file,'r',encoding=self.encoding)

        lineList = fileObj.readlines()

        #块注释标记

        blockComment = 'finish'

        try:

            isC = False

            for line in lineList:

                #查找出注释部分,并跳过

                #

                #跳过'/*'和'*/'中的内容,处理剩余的内容

                if blockComment == 'start':

                    #块注释内容

                    index = line.find('*/')

                    if index != -1:

                        blockComment = 'finish'

                        #块注释结束

                        #处理当前行'*/'后的内容

                        line = line[index+2:]

                    else:

                        #仍处于块注释内容中,跳过

                        continue

                if line.startswith('//'):

                    #行注释

                    #跳过行

                    continue

                if line.startswith('/*'):

                    #块注释开始

                    blockComment = 'start'

                    continue

                #查找字符

                indexTag = 0;

                for s in line:

                    sIndex = line.index(s)

                    try:

                        #将不是LATIN开头的字符都找出来

                        if unicodedata.name(s).startswith('CJK') == True:

                            #TODO

                            #content = lineList.index(line)+1+s

                            isC = True

                            #如果两个字符间隔大于1,表示为不连续的中文

                            if (sIndex - indexTag) > 1 :

                                self.__writeFile('\t'+s)

                            else:

                                self.__writeFile(s)

                            indexTag = sIndex

                    except Exception as error:

                        print('seek character error : %s in %s' % (error,fileObj.name))

                        continue

                if isC:

                    for t in range(8):

                        self.__writeFile('\t')

                    self.__writeFile('line:')

                    self.__writeFile('%d' % (lineList.index(line)+1))

                    self.__writeFile('\n')

                isC = False

                

        finally:

            self.__writeFile('\n')

            self.__writeFile('------------'+fileObj.name)

            self.__writeFile('\n')

            fileObj.close()

        

    def __writeFile(self,content):

        self.rfile.write(content)

        

        

if __name__ == '__main__':

    seek=Seek()

    seek.seeking()

这是在网上看到的一篇关于检索文件中中文的小工具，可是当把包括子目录设置能Y的时候会出错。提示没有start方法。我自己试着写了start那个地方。但刚学习python，还是没有找到合适的方法。所以恳请各位高人，谁帮我改一下程序，能让这个可以检索子目录。
还有一个问题，就是自己想写一个替换工具，根据检索出来的文件，将这些中文改成英文，替换回去。试了两种方案，但实在是调不出来。如果谁能帮忙写一个，那就太感谢了。
我是一个实习生，这是项目经理给的题目，明天就交工了。各位帮帮忙，谢谢了。

...全文

113 2 打赏收藏转发到动态举报

写回复

用AI写文章

2 条回复

切换为时间正序

请发表友善的回复…

发表回复

oFangLiang1 2013-12-23

打赏
举报

引用 1 楼 snmr_com 的回复:

这个类看着很累要回家了，没空给你写，给你思路，能否写出来就自求多福吧用os.walk（含子目录）或glob（不含子目录）读取文件名列表依次用字节方式打开，用chardet.detect判断编码按行处理，用re.sub替换然后合并所有的行，写入原文件（测试时注意不要用正式的文件来测试）

思路我也有，但是对python实在是不熟悉。刚才自己研究了下，把检索子目录的功能弄出来了。替换工具，我的思路是在提取时记录一下文件名，文件名下边记录提取内容。然后读取这个记录文件，打开记录的源文件，然后依次对每一行进行replace。试着写了一下，但是出现各种各样的错误，比如写回源文件时候，每一行都是往后添加，而不是替换。跟经理说了，不弄替换工具了。现在让我把提取的东西生成xml文件，按3目录结构对字符串分组存储。我连xml是啥都不知道。我的天，这就是我梦寐以求得工作。好累。心累。