..

工作上的LCS算法

帮之前同事解决他公司的一个bug,顺带优化了一个feature。 这个优化的功能类似于git的diff,要找到一行文字有变化的部分。 其实就是LCS算法。


在复习下LCS算法,参考了geeksforgeeks的代码1


  • 定义

LCS Problem Statement: Given two sequences, find the length of longest subsequence present in both of them. A subsequence is a sequence that appears in the same relative order, but not necessarily contiguous. For example, “abc”, “abg”, “bdf”, “aeg”, ‘”acefg”, .. etc are subsequences of “abcdefg”.

  • 分析

假设有两个字符串
X = ‘AGGTAB’ 、 Y = ‘GXTXAYB’
长度用 m,n 表示
用 L(X, Y) 表示 lcs 的长度。 有:
1)如果S1、S2最后一个字符相同,则:

L(X[0..m-1], Y[0..n-1]) = 1 + L(X[0..m-2], Y[0..n-2])

2)如果S1、S2最后一个字符不相同,则:

L(X[0..m-1], Y[0..n-1]) = MAX ( L(X[0..m-2], Y[0..n-1]), L(X[0..m-1], Y[0..n-2]) )

可以实现代码如下:

/* A Naive recursive implementation of LCS problem */
#include <bits/stdc++.h>
using namespace std;
  
int max(int a, int b); 
  
/* Returns length of LCS for X[0..m-1], Y[0..n-1] */
int lcs( char *X, char *Y, int m, int n ) 
{ 
    if (m == 0 || n == 0) 
        return 0; 
    if (X[m-1] == Y[n-1]) {
        return 1 + lcs(X, Y, m-1, n-1); 
    }
    else
        return max(lcs(X, Y, m, n-1), lcs(X, Y, m-1, n)); 
} 
  
/* Utility function to get max of 2 integers */
int max(int a, int b) 
{ 
    return (a > b)? a : b; 
} 
  
/* Driver code */
int main() 
{ 
    char X[] = "AGGTAB"; 
    char Y[] = "GXTXAYB"; 
      
    int m = strlen(X); 
    int n = strlen(Y); 
      
    cout<<"Length of LCS is "<< lcs( X, Y, m, n ) ; 
      
    return 0; 
} 

时间复杂度: 2^n


  • 优化

在上述实现中,有重复计算的情况,

                         lcs("AXYT", "AYZX")
                       /                 
         lcs("AXY", "AYZX")            lcs("AXYT", "AYZ")
         /                              /               
lcs("AX", "AYZX") lcs("AXY", "AYZ")   lcs("AXY", "AYZ") lcs("AXYT", "AY")

可以优化如下代码:

/* Dynamic Programming C++ implementation of LCS problem */
#include<bits/stdc++.h> 
using namespace std;
  
int max(int a, int b); 
  
/* Returns length of LCS for X[0..m-1], Y[0..n-1] */
int lcs( char *X, char *Y, int m, int n ) 
{ 
    int L[m + 1][n + 1]; 
    int i, j; 
      
    /* Following steps build L[m+1][n+1] in 
       bottom up fashion. Note that L[i][j] 
       contains length of LCS of X[0..i-1]
       and Y[0..j-1] */
    for (i = 0; i <= m; i++) 
    { 
        for (j = 0; j <= n; j++) 
        { 
        if (i == 0 || j == 0) 
            L[i][j] = 0; 
      
        else if (X[i - 1] == Y[j - 1]) 
            L[i][j] = L[i - 1][j - 1] + 1; 
      
        else
            L[i][j] = max(L[i - 1][j], L[i][j - 1]); 
        } 
    } 
          
    /* L[m][n] contains length of LCS 
    for X[0..n-1] and Y[0..m-1] */
    return L[m][n]; 
} 
  
/* Utility function to get max of 2 integers */
int max(int a, int b) 
{ 
    return (a > b)? a : b; 
} 
  
// Driver Code
int main() 
{ 
    char X[] = "AGGTAB"; 
    char Y[] = "GXTXAYB"; 
      
    int m = strlen(X); 
    int n = strlen(Y); 
      
    cout << "Length of LCS is " 
         << lcs( X, Y, m, n ); 
      
    return 0; 
} 
  
// This code is contributed by rathbhupendra

时间复杂度为:O(mn)


下面打印出来 lcs、长度、及index

/* Dynamic Programming implementation of LCS problem */
#include<iostream>
#include<cstring>
#include<cstdlib>
using namespace std;
  
/* Returns length of LCS for X[0..m-1], Y[0..n-1] */
void lcs( char *X, char *Y, int m, int n )
{
   int L[m+1][n+1];
  
   /* Following steps build L[m+1][n+1] in bottom up fashion. Note
      that L[i][j] contains length of LCS of X[0..i-1] and Y[0..j-1] */
   for (int i=0; i<=m; i++)
   {
     for (int j=0; j<=n; j++)
     {
       if (i == 0 || j == 0)
         L[i][j] = 0;
       else if (X[i-1] == Y[j-1])
         L[i][j] = L[i-1][j-1] + 1;
       else
         L[i][j] = max(L[i-1][j], L[i][j-1]);
     }
   }
  
   // Following code is used to print LCS
   int index = L[m][n];
  
   // Create a character array to store the lcs string
   char lcs[index+1];
   lcs[index] = '\0'; // Set the terminating character
  
   // Start from the right-most-bottom-most corner and
   // one by one store characters in lcs[]
   int i = m, j = n;
   while (i > 0 && j > 0)
   {
      // If current character in X[] and Y are same, then
      // current character is part of LCS
      if (X[i-1] == Y[j-1])
      {
          lcs[index-1] = X[i-1]; // Put current character in result
          i--; j--; index--;     // reduce values of i, j and index
          
          // 分别打印 index
          std::cout << "i: " << i << std::endl;
          std::cout << "j: " << j << std::endl;
      }
  
      // If not same, then find the larger of two and
      // go in the direction of larger value
      else if (L[i-1][j] > L[i][j-1])
         i--;
      else
         j--;
   }
  
   // Print the lcs
   cout << "LCS of " << X << " and " << Y << " is " << lcs;
}
  
/* Driver program to test above function */
int main()
{
  char X[] = "AGGTAB";
  char Y[] = "GXTXAYB";
  int m = strlen(X);
  int n = strlen(Y);
  lcs(X, Y, m, n);
  return 0;
}

到这里,功能完成了。 更详细的可以看网上其他人的讲解2