`
biansutao
  • 浏览: 53093 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

字符串相似度算法 levenshtein distance 编辑距离算法

阅读更多
参考:http://www.merriampark.com/ld.htm#WHATIS
http://en.wikipedia.org/wiki/Levenshtein_distance
 
    * Java
    * C++
    * Visual Basic
    * Python

Java

public class Distance {

  //****************************
  // Get minimum of three values
  //****************************

  private int Minimum (int a, int b, int c) {
  int mi;

    mi = a;
    if (b < mi) {
      mi = b;
    }
    if (c < mi) {
      mi = c;
    }
    return mi;

  }

  //*****************************
  // Compute Levenshtein distance
  //*****************************

  public int LD (String s, String t) {
  int d[][]; // matrix
  int n; // length of s
  int m; // length of t
  int i; // iterates through s
  int j; // iterates through t
  char s_i; // ith character of s
  char t_j; // jth character of t
  int cost; // cost

    // Step 1

    n = s.length ();
    m = t.length ();
    if (n == 0) {
      return m;
    }
    if (m == 0) {
      return n;
    }
    d = new int[n+1][m+1];

    // Step 2

    for (i = 0; i <= n; i++) {
      d[i][0] = i;
    }

    for (j = 0; j <= m; j++) {
      d[0][j] = j;
    }

    // Step 3

    for (i = 1; i <= n; i++) {

      s_i = s.charAt (i - 1);

      // Step 4

      for (j = 1; j <= m; j++) {

        t_j = t.charAt (j - 1);

        // Step 5

        if (s_i == t_j) {
          cost = 0;
        }
        else {
          cost = 1;
        }

        // Step 6

        d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost);

      }

    }

    // Step 7

    return d[n][m];

  }

}

C++

In C++, the size of an array must be a constant, and this code fragment causes an error at compile time:

int sz = 5;
int arr[sz];

This limitation makes the following C++ code slightly more complicated than it would be if the matrix could simply be declared as a two-dimensional array, with a size determined at run-time.

In C++ it's more idiomatic to use the System Template Library's vector class, as Anders Sewerin Johansen has done in an alternative C++ implementation.

Here is the definition of the class (distance.h):

class Distance
{
  public:
    int LD (char const *s, char const *t);
  private:
    int Minimum (int a, int b, int c);
    int *GetCellPointer (int *pOrigin, int col, int row, int nCols);
    int GetAt (int *pOrigin, int col, int row, int nCols);
    void PutAt (int *pOrigin, int col, int row, int nCols, int x);
}; 

Here is the implementation of the class (distance.cpp):

#include "distance.h"
#include <string.h>
#include <malloc.h>

//****************************
// Get minimum of three values
//****************************

int Distance::Minimum (int a, int b, int c)
{
int mi;

  mi = a;
  if (b < mi) {
    mi = b;
  }
  if (c < mi) {
    mi = c;
  }
  return mi;

}

//**************************************************
// Get a pointer to the specified cell of the matrix
//************************************************** 

int *Distance::GetCellPointer (int *pOrigin, int col, int row, int nCols)
{
  return pOrigin + col + (row * (nCols + 1));
}

//*****************************************************
// Get the contents of the specified cell in the matrix 
//*****************************************************

int Distance::GetAt (int *pOrigin, int col, int row, int nCols)
{
int *pCell;

  pCell = GetCellPointer (pOrigin, col, row, nCols);
  return *pCell;

}

//*******************************************************
// Fill the specified cell in the matrix with the value x
//*******************************************************

void Distance::PutAt (int *pOrigin, int col, int row, int nCols, int x)
{
int *pCell;

  pCell = GetCellPointer (pOrigin, col, row, nCols);
  *pCell = x;

}

//*****************************
// Compute Levenshtein distance
//*****************************

int Distance::LD (char const *s, char const *t)
{
int *d; // pointer to matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
int cost; // cost
int result; // result
int cell; // contents of target cell
int above; // contents of cell immediately above
int left; // contents of cell immediately to left
int diag; // contents of cell immediately above and to left
int sz; // number of cells in matrix

  // Step 1	

  n = strlen (s);
  m = strlen (t);
  if (n == 0) {
    return m;
  }
  if (m == 0) {
    return n;
  }
  sz = (n+1) * (m+1) * sizeof (int);
  d = (int *) malloc (sz);

  // Step 2

  for (i = 0; i <= n; i++) {
    PutAt (d, i, 0, n, i);
  }

  for (j = 0; j <= m; j++) {
    PutAt (d, 0, j, n, j);
  }

  // Step 3

  for (i = 1; i <= n; i++) {

    s_i = s[i-1];

    // Step 4

    for (j = 1; j <= m; j++) {

      t_j = t[j-1];

      // Step 5

      if (s_i == t_j) {
        cost = 0;
      }
      else {
        cost = 1;
      }

      // Step 6 

      above = GetAt (d,i-1,j, n);
      left = GetAt (d,i, j-1, n);
      diag = GetAt (d, i-1,j-1, n);
      cell = Minimum (above + 1, left + 1, diag + cost);
      PutAt (d, i, j, n, cell);
    }
  }

  // Step 7

  result = GetAt (d, n, m, n);
  free (d);
  return result;
	
}

Visual Basic

'*******************************
'*** Get minimum of three values
'*******************************

Private Function Minimum(ByVal a As Integer, _
                         ByVal b As Integer, _
                         ByVal c As Integer) As Integer
Dim mi As Integer
                          
  mi = a
  If b < mi Then
    mi = b
  End If
  If c < mi Then
    mi = c
  End If
  
  Minimum = mi
                          
End Function

'********************************
'*** Compute Levenshtein Distance
'********************************

Public Function LD(ByVal s As String, ByVal t As String) As Integer
Dim d() As Integer ' matrix
Dim m As Integer ' length of t
Dim n As Integer ' length of s
Dim i As Integer ' iterates through s
Dim j As Integer ' iterates through t
Dim s_i As String ' ith character of s
Dim t_j As String ' jth character of t
Dim cost As Integer ' cost
  
  ' Step 1
  
  n = Len(s)
  m = Len(t)
  If n = 0 Then
    LD = m
    Exit Function
  End If 
  If m = 0 Then
    LD = n
    Exit Function
  End If 
  ReDim d(0 To n, 0 To m) As Integer
  
  ' Step 2
  
  For i = 0 To n
    d(i, 0) = i
  Next i
  
  For j = 0 To m
    d(0, j) = j
  Next j

  ' Step 3

  For i = 1 To n
    
    s_i = Mid$(s, i, 1)
    
    ' Step 4
    
    For j = 1 To m
      
      t_j = Mid$(t, j, 1)
      
      ' Step 5
      
      If s_i = t_j Then
        cost = 0
      Else
        cost = 1
      End If
      
      ' Step 6
      
      d(i, j) = Minimum(d(i - 1, j) + 1, d(i, j - 1) + 1, d(i - 1, j - 1) + cost)
    
    Next j
    
  Next i
  
  ' Step 7
  
  LD = d(n, m)
  Erase d

End Function





#!/user/bin/env python
# -*- coding: utf-8 -*-

class arithmetic():
	
	def __init__(self):
		pass
	''' 【编辑距离算法】 【levenshtein distance】 【字符串相似度算法】 '''
	def levenshtein(self,first,second):
		if len(first) > len(second):
			first,second = second,first
		if len(first) == 0:
			return len(second)
		if len(second) == 0:
			return len(first)
		first_length = len(first) + 1
		second_length = len(second) + 1
		distance_matrix = [range(second_length) for x in range(first_length)] 
		#print distance_matrix
		for i in range(1,first_length):
			for j in range(1,second_length):
				deletion = distance_matrix[i-1][j] + 1
				insertion = distance_matrix[i][j-1] + 1
				substitution = distance_matrix[i-1][j-1]
				if first[i-1] != second[j-1]:
					substitution += 1
				distance_matrix[i][j] = min(insertion,deletion,substitution)
		print distance_matrix
		return distance_matrix[first_length-1][second_length-1]
	
if __name__ == "__main__":
	arith = arithmetic()
	print arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs','GAMBOL00000000000dfasfasfdafsafasfasdfdsa'

  • ld.rar (6 KB)
  • 下载次数: 47
分享到:
评论
1 楼 junfeng_feng 2012-12-10  
这个Pytrhon代码不知道起源在在哪?

distance_matrix 没有初始化正确。
应该初始化为:
0,1,2,3,4
1,
2,
3,

相关推荐

Global site tag (gtag.js) - Google Analytics