java 过滤敏感词替换

化石 2013-01-10 06:04:04
求 Java 敏感词过滤代码:把文本中的敏感词替换成 *。
例如敏感词为:TMD、TM、变态
"你TMD,也太缺德了,太变态了吧TM" 替换成:"你***,也太缺德了,太**了吧**"
注意:敏感词中的每个字符(一个汉字或一个字母)都对应一个 *。

有代码的话发我新浪邮箱:xtay_myet@sina.com 谢谢!
...全文
1124 7 打赏 收藏 转发到动态 举报
写回复
用AI写文章
7 条回复
切换为时间正序
请发表友善的回复…
发表回复
哈小皮 2013-01-11
  • 打赏
  • 举报
回复
jiajianhui2009 2013-01-11
  • 打赏
  • 举报
回复

/**
 * Sensitive-word filter that masks every registered keyword found in a text with
 * one {@code *} per keyword character.
 *
 * <p>Keywords are registered with {@link #addFilterKeyWord(String, int)} until the first call to
 * {@link #parse(String, Vector)}, which freezes the keyword set and builds two lookup tables
 * indexed by character value: a shift table (how far the scan may advance from a character)
 * and a hash table (the keywords that end with a character). The class name suggests the
 * Wu-Manber multi-pattern algorithm; the tables here are keyed on a single character.
 *
 * <p>A shared instance, pre-loaded from the class-path resource {@code bad_words_zh_CN.txt}
 * (one keyword per line, level 1), is available through {@link #getInstance()}.
 *
 * <p>NOTE(review): the word list is decoded with the platform default charset; confirm the
 * encoding of the resource file and pass it explicitly if it is fixed.
 */
public class WmParser {
	private static final Log log = LogFactory.getLog(WmParser.class);
	/** Number of table slots: one per possible {@code char} value. */
	private static final int MAX_INDEX = 1 << 16;
	public static WmParser wmParser;

	static {
		InputStream in = null;
		BufferedReader reader = null;
		try {
			log.debug("Instantiating WmParser....");
			wmParser = new WmParser();
			in = WmParser.class.getResourceAsStream("bad_words_zh_CN.txt");
			if (in == null) {
				// Report the real cause instead of failing later with a NullPointerException.
				log.error("Exception in Instantiating WmParser: resource bad_words_zh_CN.txt not found");
			} else {
				reader = new BufferedReader(new InputStreamReader(in));
				String line;
				while ((line = reader.readLine()) != null) {
					wmParser.addFilterKeyWord(line, 1);
				}
			}
		} catch (Exception e) {
			log.error("Exception in Instantiating WmParser:" + e, e);
		} finally {
			// Always release the stream, including when reading failed half-way.
			try {
				if (reader != null) {
					reader.close();
				} else if (in != null) {
					in.close();
				}
			} catch (Exception closeError) {
				log.debug("Failed to close bad_words_zh_CN.txt: " + closeError);
			}
		}
	}

	protected WmParser() {
	}

	/**
	 * Returns the shared instance created by the static initializer.
	 *
	 * @return the shared parser
	 */
	public static WmParser getInstance() {
		return wmParser;
	}

	/** True once the lookup tables have been built; keywords can no longer be added. */
	private boolean initFlag = false;
	/** Per character: 0 = some keyword ends with it, otherwise how many characters to skip. */
	private int shiftTable[] = new int[MAX_INDEX];
	/** Per character: the keywords whose last character it is. Filled by {@link #init()}. */
	@SuppressWarnings("unchecked")
	public Vector<AtomicPattern> hashTable[] = new Vector[MAX_INDEX];
	/** Keywords registered so far. */
	private UnionPatternSet tmpUnionPatternSet = new UnionPatternSet();

	/**
	 * Small console demo: filters a sample sentence and prints timing and match counts.
	 *
	 * @param args unused
	 */
	public static void main(String args[]) {
		try {
			WmParser filterEngine = WmParser.getInstance();
			Vector<Integer> levelSet = new Vector<Integer>();
			String str = "单个的政治,政治运动和强奸和shit";
			SimpleDateFormat sf = new SimpleDateFormat("HH:mm:ss.SSS");
			System.out.println("文本长度:" + str.length());
			System.out.println("敏感词汇总数:" + filterEngine.tmpUnionPatternSet.getSet().size());
			Date start = new Date(System.currentTimeMillis());
			System.out.println("过滤开始:" + sf.format(start));

			System.out.println(str);
			// Pass the text as-is: the keywords are loaded as ordinary Strings, so re-decoding
			// the text's bytes as ISO-8859-1 would turn it into mojibake that can never match.
			System.out.println(filterEngine.parse(str, levelSet));

			Date end = new Date(System.currentTimeMillis());
			System.out.println("过滤完毕:" + sf.format(end));
			System.out.println("文本中出现敏感词个数:" + levelSet.size());
			System.out.println("耗时:" + (end.getTime() - start.getTime()) + "ms");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Registers a keyword to be masked.
	 *
	 * @param keyWord the keyword; {@code null} and empty keywords are rejected
	 * @param level the level reported through {@code levelSet} when the keyword is found
	 * @return {@code true} if the keyword was added; {@code false} if the tables are already
	 *         built (see {@link #clear()}) or the keyword is {@code null}/empty
	 */
	public boolean addFilterKeyWord(String keyWord, int level) {
		if (initFlag || keyWord == null || keyWord.length() == 0) {
			return false;
		}
		UnionPattern unionPattern = new UnionPattern();
		AtomicPattern atomicPattern = new AtomicPattern(new Pattern(keyWord));
		unionPattern.addNewAtomicPattrn(atomicPattern);
		unionPattern.setLevel(level);
		atomicPattern.setBelongUnionPattern(unionPattern);
		tmpUnionPatternSet.addNewUnionPattrn(unionPattern);
		return true;
	}

	/**
	 * Returns a copy of {@code content} in which every keyword occurrence is replaced by stars.
	 *
	 * <p>Builds the lookup tables on first use. The masked region is always the
	 * keyword-length run of characters ending at the match position. Table lookups are
	 * case-sensitive on the keyword's last character.
	 *
	 * @param content the text to filter
	 * @param levelSet receives the level of each reported match
	 * @return the masked text, or an empty string if any exception occurs (it is logged)
	 */
	public String parse(String content, Vector<Integer> levelSet) {
		try {
			if (!initFlag) {
				init();
			}
			Vector<AtomicPattern> aps = new Vector<AtomicPattern>();
			StringBuilder sb = new StringBuilder();
			int i = 0;
			// Invariant at the top of each iteration: sb holds min(i, content.length()) chars.
			while (i < content.length()) {
				char checkChar = content.charAt(i);
				int shift = shiftTable[checkChar];
				if (shift != 0) {
					// No keyword can end here: copy the skipped characters verbatim.
					int end = Math.min(i + shift, content.length());
					sb.append(content.substring(i, end));
					i += shift;
					continue;
				}
				// Some keyword ends with checkChar: test each candidate against the text so far.
				String scanned = content.substring(0, i + 1);
				Vector<AtomicPattern> candidates = hashTable[checkChar];
				int match = 0;
				for (int j = 0; j < candidates.size(); j++) {
					AtomicPattern ap = candidates.get(j);
					if (!ap.findMatchInString(scanned)) {
						continue;
					}
					String patternStr = ap.getPattern().str;
					// On the first match sb does not yet contain checkChar (hence +1);
					// on later matches at the same position it already holds its star.
					int keep = sb.length() - patternStr.length() + (match == 0 ? 1 : 0);
					sb.setLength(keep);
					appendStar(patternStr, sb);
					aps.add(ap);
					match++;
				}
				if (match == 0) {
					sb.append(checkChar);
				}
				i++;
			}
			parseAtomicPatternSet(aps, levelSet);
			return sb.toString();
		} catch (Exception e) {
			log.error(e, e);
		}
		return "";
	}

	/** Appends one {@code *} to {@code sb} for every character of {@code patternStr}. */
	private static void appendStar(String patternStr, StringBuilder sb) {
		for (int c = 0; c < patternStr.length(); c++) {
			sb.append("*");
		}
	}

	/**
	 * Drains {@code aps} front to back; for each matched pattern whose union pattern is fully
	 * contained in the not-yet-drained matches, adds that union pattern's level to
	 * {@code levelSet}.
	 */
	private void parseAtomicPatternSet(Vector<AtomicPattern> aps,
			Vector<Integer> levelSet) {
		while (aps.size() > 0) {
			AtomicPattern ap = aps.get(0);
			UnionPattern up = ap.belongUnionPattern;
			if (up.isIncludeAllAp(aps)) {
				levelSet.add(Integer.valueOf(up.getLevel()));
			}
			aps.remove(0);
		}
	}

	/** Freezes the keyword set and (re)builds the shift table and the hash table. */
	private void init() {
		initFlag = true;
		for (int i = 0; i < MAX_INDEX; i++) {
			hashTable[i] = new Vector<AtomicPattern>();
		}
		shiftTableInit();
		hashTableInit();
	}

	/** Removes all keywords and re-opens the parser for {@link #addFilterKeyWord}. */
	public void clear() {
		tmpUnionPatternSet.clear();
		initFlag = false;
	}

	/**
	 * Fills the shift table: 0 for a keyword's last character, 1 for a second-to-last
	 * character, and the default shift for everything else.
	 */
	private void shiftTableInit() {
		Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();

		// The scan may never jump further than the shortest keyword is long, otherwise a
		// one-character keyword sitting right after a "skip 2" character would be missed.
		int defaultShift = 2;
		for (int i = 0; i < upSet.size() && defaultShift > 1; i++) {
			Vector<AtomicPattern> apSet = upSet.get(i).getSet();
			for (int j = 0; j < apSet.size(); j++) {
				if (apSet.get(j).getPattern().str.length() == 1) {
					defaultShift = 1;
					break;
				}
			}
		}
		for (int i = 0; i < MAX_INDEX; i++) {
			shiftTable[i] = defaultShift;
		}

		for (int i = 0; i < upSet.size(); i++) {
			Vector<AtomicPattern> apSet = upSet.get(i).getSet();
			for (int j = 0; j < apSet.size(); j++) {
				Pattern pattern = apSet.get(j).getPattern();
				int length = pattern.str.length();
				if (length == 0) {
					continue;
				}
				// A "last character" entry (0) must never be overwritten by a 1.
				if (length > 1 && shiftTable[pattern.charAtEnd(1)] != 0) {
					shiftTable[pattern.charAtEnd(1)] = 1;
				}
				shiftTable[pattern.charAtEnd(0)] = 0;
			}
		}
	}

	/** Files every keyword under its last character (keywords ending in {@code '\0'} are skipped). */
	private void hashTableInit() {
		Vector<UnionPattern> upSet = tmpUnionPatternSet.getSet();
		for (int i = 0; i < upSet.size(); i++) {
			Vector<AtomicPattern> apSet = upSet.get(i).getSet();
			for (int j = 0; j < apSet.size(); j++) {
				AtomicPattern ap = apSet.get(j);
				char last = ap.getPattern().charAtEnd(0);
				if (last != 0) {
					hashTable[last].add(ap);
				}
			}
		}
	}
}

class Pattern { // string
	/** The keyword text. Public because other classes in this file read it directly. */
	public String str;

	Pattern(String str) {
		this.str = str;
	}

	/**
	 * Returns the character located {@code index} positions before the end of the keyword.
	 *
	 * @param index 0 for the last character, 1 for the second-to-last, and so on
	 * @return that character, or the NUL character when the keyword has {@code index} or
	 *         fewer characters
	 */
	public char charAtEnd(int index) {
		return index < str.length() ? str.charAt(str.length() - 1 - index) : 0;
	}

	/** Returns the keyword text. */
	public String getStr() {
		return this.str;
	}
}

/**
 * A single keyword together with the {@link UnionPattern} it belongs to, plus the check that
 * decides whether a piece of text ends with that keyword.
 */
class AtomicPattern {
	private Pattern pattern;
	public UnionPattern belongUnionPattern;

	AtomicPattern(Pattern pattern) {
		this.pattern = pattern;
	}

	/**
	 * Tells whether {@code str} ends with this keyword.
	 *
	 * <p>A candidate is the tail of {@code str} starting at an occurrence of the keyword's
	 * first character. Characters other than ASCII letters/digits and the CJK ranges accepted
	 * by {@link #isValidChar(char)} are dropped from the candidate before it is compared with
	 * the keyword, ignoring case.
	 *
	 * <p>Every occurrence of the first character is tried, nearest to the end first. Trying
	 * only the last occurrence would make keywords that repeat their first character
	 * (for example "haha") impossible to match.
	 *
	 * @param str the text scanned so far
	 * @return {@code true} if some tail of {@code str} reduces to the keyword
	 */
	public boolean findMatchInString(String str) {
		String keyword = this.pattern.str;
		if (keyword.length() == 0 || keyword.length() > str.length()) {
			return false;
		}
		char first = keyword.charAt(0);
		int beginIndex = str.lastIndexOf(first);
		while (beginIndex != -1) {
			String candidate = preConvert(str.substring(beginIndex));
			if (keyword.equalsIgnoreCase(candidate)) {
				return true;
			}
			if (candidate.length() > keyword.length()) {
				// Starting even earlier can only make the candidate longer.
				return false;
			}
			beginIndex = str.lastIndexOf(first, beginIndex - 1);
		}
		return false;
	}

	/** Returns {@code content} with every character rejected by {@link #isValidChar(char)} removed. */
	private String preConvert(String content) {
		StringBuilder kept = new StringBuilder(content.length());
		for (int i = 0; i < content.length(); i++) {
			char ch = content.charAt(i);
			if (this.isValidChar(ch)) {
				kept.append(ch);
			}
		}
		return kept.toString();
	}

	/** Accepts ASCII digits and letters and the ranges U+4E00..U+9FA5 and U+F900..U+FA2D. */
	private boolean isValidChar(char ch) {
		boolean asciiAlphanumeric = (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'Z')
				|| (ch >= 'a' && ch <= 'z');
		boolean cjk = (ch >= 0x4e00 && ch <= 0x9FA5) || (ch >= 0xF900 && ch <= 0xFA2D);
		return asciiAlphanumeric || cjk;
	}

	/** Returns the union pattern this keyword belongs to. */
	public UnionPattern getBelongUnionPattern() {
		return belongUnionPattern;
	}

	/** Sets the union pattern this keyword belongs to. */
	public void setBelongUnionPattern(UnionPattern belongUnionPattern) {
		this.belongUnionPattern = belongUnionPattern;
	}

	/** Returns the keyword. */
	public Pattern getPattern() {
		return pattern;
	}

	/** Replaces the keyword. */
	public void setPattern(Pattern pattern) {
		this.pattern = pattern;
	}
}

/** Plain holder for a list of {@link AtomicPattern}s; the list starts out empty. */
class SameAtomicPatternSet {
	/** The contained patterns. */
	public Vector<AtomicPattern> SAPS = new Vector<AtomicPattern>();

	SameAtomicPatternSet() {
	}
}

class UnionPattern { // union string
	/** The keywords that together make up this union pattern. */
	public Vector<AtomicPattern> apSet;

	/** Level attached to this union pattern. */
	private int level;

	UnionPattern() {
		this.apSet = new Vector<AtomicPattern>();
	}

	/** Adds one keyword to this union pattern. */
	public void addNewAtomicPattrn(AtomicPattern ap) {
		this.apSet.add(ap);
	}

	/** Returns the keywords of this union pattern (the live list, not a copy). */
	public Vector<AtomicPattern> getSet() {
		return apSet;
	}

	/**
	 * Tells whether every keyword of this union pattern also occurs in {@code inAps},
	 * comparing keyword text case-insensitively.
	 *
	 * @param inAps the patterns to look in
	 * @return {@code false} as soon as {@code inAps} is smaller than this set or one keyword
	 *         is missing; {@code true} otherwise
	 */
	public boolean isIncludeAllAp(Vector<AtomicPattern> inAps) {
		if (inAps.size() < apSet.size()) {
			return false;
		}
		for (AtomicPattern member : apSet) {
			if (!isInAps(member, inAps)) {
				return false;
			}
		}
		return true;
	}

	/** Tells whether {@code inAps} holds a pattern with the same text as {@code ap}, ignoring case. */
	private boolean isInAps(AtomicPattern ap, Vector<AtomicPattern> inAps) {
		for (AtomicPattern candidate : inAps) {
			String wanted = ap.getPattern().str;
			if (wanted.equalsIgnoreCase(candidate.getPattern().str)) {
				return true;
			}
		}
		return false;
	}

	/** Returns the level of this union pattern. */
	public int getLevel() {
		return level;
	}

	/** Sets the level of this union pattern. */
	public void setLevel(int level) {
		this.level = level;
	}
}

class UnionPatternSet { // union string set
	/** The union patterns held by this set, in insertion order. */
	public Vector<UnionPattern> unionPatternSet = new Vector<UnionPattern>();

	UnionPatternSet() {
	}

	/** Returns the contained union patterns (the live list, not a copy). */
	public Vector<UnionPattern> getSet() {
		return this.unionPatternSet;
	}

	/** Appends one union pattern. */
	public void addNewUnionPattrn(UnionPattern up) {
		unionPatternSet.add(up);
	}

	/** Removes every union pattern. */
	public void clear() {
		this.unionPatternSet.clear();
	}
}
附上我之前改写的一个吧。
jiajianhui2009 2013-01-11
  • 打赏
  • 举报
回复
这是个字符串匹配问题。实现并不难,只是要看你对效率的要求高不高;看样子你这个也是在聊天服务器里处理敏感词。可以参考下面这个算法,再结合自己的需求写出自己的程序:http://blog.csdn.net/jiajianhui2009/article/details/6229740
lisongajava 2013-01-11
  • 打赏
  • 举报
回复
怎么觉得是在骂人
  • 打赏
  • 举报
回复
很简单,用正则表达式即可判断,代码如下 邮箱就不发了 public static void main(String[] args) { String str = "你TMD,也太缺德了,太变态了吧TM "; String regex = ".*[TMD,TM].*"; Pattern pat = Pattern.compile(regex); Matcher mat = pat.matcher(str); String s = ""; if (mat.matches()) { s = mat.group().replace("TMD", "*").replace("TM", "*"); } System.out.println(s); }
  • 打赏
  • 举报
回复
嗯嗯,写个过滤器,用 replaceAll() 处理即可。
zhangjingtao6100 2013-01-10
  • 打赏
  • 举报
回复
filter= =

67,513

社区成员

发帖
与我相关
我的任务
社区描述
J2EE只是Java企业应用。我们需要一个跨J2SE/WEB/EJB的微容器,保护我们的业务核心组件(中间件),以延续它的生命力,而不是依赖J2SE/J2EE版本。
社区管理员
  • Java EE
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧