文本匹配算法LCS源码(java实现)
最近无聊写了个复读机程序,其中有一块要用到文本匹配, 于是有了以下代码. 在这里共享出来,也许有人用的到. 第一个实现(也就是代码中的 LCS 函数)是用的经典的矩阵实现(动态规划), 在一般的数据结构或算法课本上都可以找到对此算法的描述, 故不多言. 可是这个算法的时间和空间复杂度太高,都是N^2级别的. 在我的程序中有点不可忍受. 于是有了 LCS_DN 这个算法,它把时间和空间复杂度缩小到D*N(D是两个文本的差异), 所以当文本相差不大时,效率会挺高不少. LCS_DN_refined是改进版,空间复杂度虽然还是D*N,但减少了几个系数量.
本来还要解释一下LCS_DN,但有点太罗嗦了,或许另开帖子比较方便.
这些代码都可以直接拿过来用的, 界面也挺简单, 呵呵, 下面贴代码( 因为版式问题,代码很乱,可以考到eclipse或其他编辑器里再看
public final class TextUtil {
public static class MatchPair {
public int x;
public int y;
public int len;
MatchPair(int x, int y) {
this.x = x;
this.y = y;
}
}
public static MatchPair[] LCS(String[] txt1, String[] txt2) {
int[][] matchCount = null;
int[][] direction = null;
matchCount = new int[txt1.length + 1][txt2.length + 1];
direction = new int[txt1.length + 1][txt2.length + 1];
for (int i = 0; i <= txt1.length; ++i) {
matchCount[i][0] = 0;
direction[i][0] = 0;
}
for (int i = 0; i <= txt2.length; ++i) {
matchCount[0][i] = 0;
direction[0][i] = 0;
}
for (int plus = 2; plus <= txt1.length + txt2.length; ++plus) {
for (int x = Math.max(1, plus - txt2.length); x <= Math.min(
plus - 1, txt1.length); ++x) {
int y = plus - x;
int count = 0;
int dir = 0;
if (txt1[x - 1].equals(txt2[y - 1])) {
count = matchCount[x - 1][y - 1] + 1;
dir = 2;
} else if (matchCount[x - 1][y] > matchCount[x][y - 1]) {
count = matchCount[x - 1][y];
dir = 1;
} else {
count = matchCount[x][y - 1];
dir = 3;
}
matchCount[x][y] = count;
direction[x][y] = dir;
}
}
int x = txt1.length, y = txt2.length;
ArrayList ret = new ArrayList();
int dir = direction[x][y];
while (dir != 0) {
if (dir == 2) {
ret.add(new MatchPair(x - 1, y - 1));
--x;
--y;
} else if (dir == 1) {
--x;
} else {
--y;
}
dir = direction[x][y];
}
Collections.reverse(ret);
return (MatchPair[]) ret.toArray(new MatchPair[0]);
}
public static MatchPair[] LCS_DN(String[] txt1, String[] txt2) {
ArrayList list = new ArrayList();
int M = txt1.length;
int N = txt2.length;
int MAX = M + N;
int[] cur = null;
int[] pre = new int[2 * MAX + 1];
pre[1 + MAX] = 0;
int x, y;
for (int D = 0; D <= MAX; ++D) {
list.add(pre);
cur = new int[2 * MAX + 1];
for (int k = -1 * D; k <= D; k += 2) {
if (k == -1 * D || k != D
&& pre[k - 1 + MAX] < pre[k + 1 + MAX]) {
x = pre[k + 1 + MAX];
} else {
x = pre[k - 1 + MAX] + 1;
}
y = x - k;
while (x < txt1.length && y < txt2.length
&& txt1[x].equals(txt2[y])) {
++x;
++y;
}
cur[k + MAX] = x;
if (x == txt1.length && y == txt2.length) {
int x1, k1;
ArrayList ret = new ArrayList();
do {
pre = (int[]) list.get(D);
if (k == -1 * D || k != D
&& pre[k - 1 + MAX] < pre[k + 1 + MAX]) {
x1 = pre[k + 1 + MAX];
k1 = k + 1;
} else {
x1 = pre[k - 1 + MAX];
k1 = k - 1;
}
while (x > x1 && (x - k) > (x1 - k1)) {
--x;
System.out.println("(" + x + "," + (x - k) + ")");
ret.add(new MatchPair(x, x - k));
}
k = k1;
x = x1;
} while (--D >= 0);
Collections.reverse(ret);
return (MatchPair[]) ret.toArray(new MatchPair[0]);
}
}
pre = cur;
}
return null;
}
public static MatchPair[] LCS_DN_refined(String[] txt1, String[] txt2) {
ArrayList list = new ArrayList();
int M = txt1.length;
int N = txt2.length;
int MAX = M + N;
int[] cur = null;
int[] pre = new int[1];
pre[0] = 0;
int x, y;
for (int D = 0; D <= MAX; ++D) {
list.add(pre);
cur = new int[D + 1];
for (int k = -1 * D; k <= D; k += 2) {
if (k == -1 * D
|| k != D
&& pre[(k - 1 + D - 1) >> 1] < pre[(k + 1 + D - 1) >> 1]) {
x = pre[(k + 1 + D - 1) >> 1];
} else {
x = pre[(k - 1 + D - 1) >> 1] + 1;
}
y = x - k;
while (x < txt1.length && y < txt2.length
&& txt1[x].equals(txt2[y])) {
++x;
++y;
}
cur[(k + D) >> 1] = x;
if (x == txt1.length && y == txt2.length) {
int x1, k1;
ArrayList ret = new ArrayList();
do {
pre = (int[]) list.get(D);
if (k == -1 * D
|| k != D
&& pre[(k - 1 + D - 1) >> 1] < pre[(k + 1 + D - 1) >> 1]) {
x1 = pre[(k + 1 + D - 1) >> 1];
k1 = k + 1;
} else {
x1 = pre[(k - 1 + D - 1) >> 1];
k1 = k - 1;
}
while (x > x1 && (x - k) > (x1 - k1)) {
--x;
System.out.println("(" + x + "," + (x - k) + ")");
ret.add(new MatchPair(x, x - k));
}
k = k1;
x = x1;
} while (--D >= 0);
Collections.reverse(ret);
return (MatchPair[]) ret.toArray(new MatchPair[0]);
}
}
pre = cur;
}
return null;
}
}