Tuesday, April 27, 2010

Dynamic Programming: Longest Common Substring

The idea how to solve Longest Common Substring problem by Dynamic Programming could be found here:
My implementation is published below.

Next ideas additionally could be used to reduce the memory usage:
  • Keep only the last and current row
  • Store only non-zero values in the rows (hash tables)

/**
* Longest Common Substring.<BR>
* <pre>
* | ls(i-1,j-1) + 1, if a[i]=b[j]
* ls(i,j) = |
* | 0, else
* Boundary conditions: let think that ls[i][-1] = 0, ls[-1][j] = 0
* </pre>
*/

public class LongestCommonSubstring {
public LongestCommonSubstring(String a, String b) {
this.a = a;
this.b = b;

initDp();
}

// strings a, b
private String a;
private String b;

/**
* ls(i,j) - maximum length of common strings that end at a[i] and b[j].
*/

private int ls[][];

// max ls - stores max length
private int maxLs;
private int maxI;
private int maxJ;

private void updateMaxLs(int ls, int i, int j) {
if (maxLs < ls) {
maxLs = ls;
maxI = i;
maxJ = j;
}
}

private void initMaxLs() {
maxLs = 0;
maxI = -1;
maxJ = -1;
}

private void initDp() {
initMaxLs();

// init with 0 by default
ls = new int[a.length() + 1][b.length() + 1];

// so this could be skipped
for (int i = 0; i <= a.length(); i++)
ls[i][0] = 0;
for (int j = 0; j <= b.length(); j++)
ls[0][j] = 0;

for (int i = 1; i <= a.length(); i++) {
for (int j = 1; j <= b.length(); j++) {
ls[i][j] = (a.charAt(i - 1) != b.charAt(j - 1)) ? 0 : ls[i - 1][j - 1] + 1;
updateMaxLs(ls[i][j], i, j);
}
}
}

public void printLs() {
System.out.printf("%1$c ", ' ');
for (int j = 0; j <= b.length(); j++)
System.out.printf("%1$c ", (j == 0) ? ' ' : b.charAt(j - 1));
System.out.printf("\n");

for (int i = 0; i <= a.length(); i++) {
System.out.printf("%1$c ", (i == 0) ? ' ' : a.charAt(i - 1));

for (int j = 0; j <= b.length(); j++) {
System.out.printf("%1$d ", ls[i][j]);
}
System.out.printf("\n");
}
}

public String getLcs() {
int size = ls[maxI][maxJ];
char sb[] = new char[size];

for(int l=0; l<size; l++)
sb[l] = a.charAt(maxI-size+l);
return new String(sb);
}

public static void main(String[] args) {
LongestCommonSubstring lcs = new LongestCommonSubstring("hello", "aloha");
lcs.printLs();
System.out.printf("LCS=%1$s\n\n", lcs.getLcs());

lcs = new LongestCommonSubstring("baba", "abba");
lcs.printLs();
System.out.printf("LCS=%1$s\n\n", lcs.getLcs());

lcs = new LongestCommonSubstring("Quick Search algorithm", "Tuned Boyer-Moore algorithm");
lcs.printLs();
System.out.printf("LCS=%1$s\n\n", lcs.getLcs());
}
}

Program output could be found below:
    a l o h a 
0 0 0 0 0 0
h 0 0 0 0 1 0
e 0 0 0 0 0 0
l 0 0 1 0 0 0
l 0 0 1 0 0 0
o 0 0 0 2 0 0
LCS=lo

a b b a
0 0 0 0 0
b 0 0 1 1 0
a 0 1 0 0 2
b 0 0 2 1 0
a 0 1 0 0 2
LCS=ba

T u n e d B o y e r - M o o r e a l g o r i t h m
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Q 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
u 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
i 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
c 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
k 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
e 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
a 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
r 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
c 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
h 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
a 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0
l 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0
g 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0
o 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 5 0 0 0 0 0
r 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 0 6 0 0 0 0
i 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0
t 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 0 0
h 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 0
m 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10
LCS= algorithm

No comments: