Sql-server – Find rows with similar string values

data-cleansingfull-text-searchsql serversql-server-2012string manipulation

I have a Microsoft SQL Server 2012 database table with around 7 million crowd-sourced records, primarily containing a string name value with some related details. For nearly every record it seems there are a dozen similar typo records and I am trying to do some fuzzy matching to identify record groups such as "Apple", "Aple", "Apples", "Spple", etc. These names can also contain multiple words with spaces between them.

I've come up with a solution of using an edit-distance scalar function that returns number of keystrokes required for transformation from string1 to string2 and using that function to join the table to itself. As you can imagine, this doesn't perform that well since its having to execute the function millions of times to evaluate a join.

So I put that in a cursor so at least only one string1 is being evaluated at a time, this at least gets results coming out but after letting it run for weeks it has only made it through evaluating 150,000 records. With 7 million to evaluate, I don't think I have the kind of time my method is going to take.

I put full text indexes on the string names, but couldn't really find a way to use the full text predicates when I didn't have a static value I was searching.

Any ideas how I could do something like the following in a way that wouldn't take months to run?

  SELECT t1.name, t2.name
  FROM names AS t1
  INNER JOIN names AS t2
       ON EditDistance(t1.name,t2.name) = 1
       AND t1.id != t2.id

I've tried soundex, but since the names can contain spaces and multiple words per value I get too many false positives to use it reliably.

Best Answer

Having dealt this issue, the most effective and performant way is to create a CLR function that calculates the Levenshtein Distance. You will be able to mark the assembly as SAFE (if you're at all concerned about security), and it runs much quicker than SOUNDEX() or any in-built SQL Server functions.

Here's the code to set up the assembly and function in the database, as well as a basic version of the Levenshtein Distance algorithm implemented in C# from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C#

C#:

using System;
using System.Security.Cryptography;

namespace LevenshteinDistance
{
    public class LevenshteinDistance
    {
        private int LevenshteinDistance(string a, string b)
        {
            if (string.IsNullOrEmpty(a))
            {
                if (!string.IsNullOrEmpty(b))
                {
                    return b.Length;
                }
                return 0;
            }

            if (string.IsNullOrEmpty(b))
            {
                if (!string.IsNullOrEmpty(a))
                {
                    return a.Length;
                }
                return 0;
            }

            int cost;
            int[,] d = new int[a.Length + 1, b.Length + 1];
            int min1;
            int min2;
            int min3;

            for (int i = 0; i <= d.GetUpperBound(0); i += 1)
            {
                d[i, 0] = i;
            }

            for (int i = 0; i <= d.GetUpperBound(1); i += 1)
            {
                d[0, i] = i;
            }

            for (int i = 1; i <= d.GetUpperBound(0); i += 1)
            {
                for (int j = 1; j <= d.GetUpperBound(1); j += 1)
                {
                    cost = (a[i-1] != b[j-1])? 1 : 0; 

                    min1 = d[i - 1, j] + 1;
                    min2 = d[i, j - 1] + 1;
                    min3 = d[i - 1, j - 1] + cost;
                    d[i, j] = Math.Min(Math.Min(min1, min2), min3);
                }
            }
            return d[d.GetUpperBound(0), d.GetUpperBound(1)];
        }        
    }
}

T-SQL:

use [master];
go

exec sp_configure 'clr enabled', 1;
go
reconfigure with override;
go

use [database_name];
go

-- Drop the function...
if exists (select 1 from sys.objects so where so.[name] = 'LevenshteinDistance')
    drop function dbo.LevenshteinDistance;
go

-- ...then the assembly
if exists (select 1 from sys.assemblies sa where sa.[name] = 'LevenshteinDistance')
    drop assembly [LevenshteinDistance];
go

-- Now load the assembly from an appropriately accessible location
create assembly [LevenshteinDistance]
from
    'd:\LevenshteinDistance.dll'
with
    permission_set = safe;
go

-- Create an asymmetric key from the assembly file
use [master];
go

if not exists (select 1 from sys.asymmetric_keys ak where ak.[name] = 'LevenshteinDistanceKey')
begin
    create asymmetric key LevenshteinDistanceKey
    from executable file = 'd:\LevenshteinDistance.dll';
end
go

-- Create a user to associate with the assembly from the asymmetric key, and then
-- revoke connect access. The login is used to execute the assembly.
use [master];
go

if not exists (select 1 from sys.server_principals sp where sp.[name] = 'LevenshteinDistanceKeyUser')
begin
    create login LevenshteinDistanceKeyUser from asymmetric key LevenshteinDistanceKey;
    revoke connect sql from LevenshteinDistanceKeyUser;
end
go

grant external access assembly to LevenshteinDistanceKeyUser;
go

use [database_name];
go
alter assembly [LevenshteinDistance] with permission_set = safe;
go

-- Create the SQL function which will be called
create function [dbo].LevenshteinDistance
(
    @string1 nvarchar(2048)
    ,@string2 nvarchar(2048)
)
returns nvarchar(max)
as
    external name LevenshteinDistance.[LevenshteinDistance.LevenshteinDistance].LevenshteinDistance;
go