62,614
社区成员
发帖
与我相关
我的任务
分享
// Load the OCR records and compare every unordered pair (i, j) with i < j.
// Records that SimilarityUtils.levenshtein reports as similar are collected in
// setList; the set removes repeats when one record matches several others.
List<AnswerOcr> answerOcrList = service.queryForList();
Set<AnswerOcr> setList = new HashSet<>();
for (int i = 0; i < answerOcrList.size() - 1; i++) {
    AnswerOcr left = answerOcrList.get(i);
    // A record without OCR text cannot match anything, so skip its whole row
    // instead of re-checking it for every j.
    if (StrUtil.isEmpty(left.getOcr())) {
        continue;
    }
    for (int j = i + 1; j < answerOcrList.size(); j++) {
        AnswerOcr right = answerOcrList.get(j);
        if (StrUtil.isEmpty(right.getOcr())) {
            continue;
        }
        setList.addAll(SimilarityUtils.levenshtein(left, right));
    }
}
// Duplicate rate = records that took part in at least one similar pair / all
// records, rounded to 5 decimal places. An empty result list would make the
// division throw ArithmeticException, so it is reported as a rate of zero.
BigDecimal bigDecimal = answerOcrList.isEmpty()
        ? BigDecimal.ZERO.setScale(5)
        : new BigDecimal(setList.size())
                .divide(new BigDecimal(answerOcrList.size()), 5, java.math.RoundingMode.HALF_DOWN);
System.out.println("重复率"+bigDecimal);
// Create the writer through the utility class.
ExcelWriter writer = ExcelUtil.getWriter("/home/noah/demo/similarity.xls");
public static Set<AnswerOcr> levenshtein(AnswerOcr answerOcr1, AnswerOcr answerOcr2) {
Set<AnswerOcr> answerOcrList = new HashSet<>();
//计算两个字符串的长度。
int len1 = answerOcr1.getOcr().length();
int len2 = answerOcr2.getOcr().length();
//建立上面说的数组,比字符长度大一个空间
int[][] dif = new int[len1 + 1][len2 + 1];
//赋初值,步骤B。
for (int a = 0; a <= len1; a++) {
dif[a][0] = a;
}
for (int a = 0; a <= len2; a++) {
dif[0][a] = a;
}
//计算两个字符是否一样,计算左上的值
int temp;
for (int i = 1; i <= len1; i++) {
for (int j = 1; j <= len2; j++) {
if (answerOcr1.getOcr().charAt(i - 1) == answerOcr2.getOcr().charAt(j - 1)) {
temp = 0;
} else {
temp = 1;
}
//取三个值中最小的
dif[i][j] = min(dif[i - 1][j - 1] + temp, dif[i][j - 1] + 1,
dif[i - 1][j] + 1);
}
}
// 取数组右下角的值,同样不同位置代表不同字符串的比较
//计算相似度
float similarity =1 - (float) dif[len1][len2] / Math.max(answerOcr1.getOcr().length(), answerOcr2.getOcr().length());
if (similarity > 0.8){
answerOcrList.add(answerOcr1);
answerOcr1.setIds(answerOcr1.getIds()+","+answerOcr2.getQuestionid());
// DownloadPicFromURL.downloadPicture(answerOcr1.getPath(),"d:/ocr/similarity/"+answerOcr1.getXueduanname()+"-"+answerOcr1.getGradename()+"-"+answerOcr1.getSubjectname()+"-"+answerOcr1.getQuestionid()+".jpg");
answerOcrList.add(answerOcr2);
answerOcr2.setIds(answerOcr2.getIds()+","+answerOcr1.getQuestionid());
// DownloadPicFromURL.downloadPicture(answerOcr2.getPath(),"d:/ocr/similarity/"+answerOcr2.getXueduanname()+"-"+answerOcr2.getGradename()+"-"+answerOcr2.getSubjectname()+"-"+answerOcr2.getQuestionid()+".jpg");
}
return answerOcrList;
看不懂。我用的是 fuzzywuzzy 对比相似度,但是对比量很大,
每次大概 2 万到十几万条,还在不断增加,我也在找优化方案。