在T-SQL中的Levenshtein距离
我对计算Levenshtein距离的T-SQLalgorithm感兴趣。
阿诺德·布里布尔提出了这一个:
SET QUOTED_IDENTIFIER ON GO SET ANSI_NULLS ON GO CREATE FUNCTION edit_distance_within(@s nvarchar(4000), @t nvarchar(4000), @d int) RETURNS int AS BEGIN DECLARE @sl int, @tl int, @i int, @j int, @sc nchar, @c int, @c1 int, @cv0 nvarchar(4000), @cv1 nvarchar(4000), @cmin int SELECT @sl = LEN(@s), @tl = LEN(@t), @cv1 = '', @j = 1, @i = 1, @c = 0 WHILE @j <= @tl SELECT @cv1 = @cv1 + NCHAR(@j), @j = @j + 1 WHILE @i <= @sl BEGIN SELECT @sc = SUBSTRING(@s, @i, 1), @c1 = @i, @c = @i, @cv0 = '', @j = 1, @cmin = 4000 WHILE @j <= @tl BEGIN SET @c = @c + 1 SET @c1 = @c1 - CASE WHEN @sc = SUBSTRING(@t, @j, 1) THEN 1 ELSE 0 END IF @c > @c1 SET @c = @c1 SET @c1 = UNICODE(SUBSTRING(@cv1, @j, 1)) + 1 IF @c > @c1 SET @c = @c1 IF @c < @cmin SET @cmin = @c SELECT @cv0 = @cv0 + NCHAR(@c), @j = @j + 1 END IF @cmin > @d BREAK SELECT @cv1 = @cv0, @i = @i + 1 END RETURN CASE WHEN @cmin <= @d AND @c <= @d THEN @c ELSE -1 END END GO
我在TSQL中实现了标准的Levenshtein编辑距离函数,并进行了几次优化,这些优化提高了我意识到的其他版本的速度。 在两个string在起始处具有共同字符的情况下(共享前缀),在它们的末尾共享字符(共享后缀),并且当string较大并且提供最大编辑距离时,速度的提高是显着的。 例如,当input是两个非常相似的4000个字符的string,并且指定了最大编辑距离为2时,这比接受的答案中的edit_distance_within
函数快了三个数量级,在0.073秒(73毫秒)vs 55秒。 这也是有效的内存,使用空间等于两个inputstring中的较大的一个,加上一些恒定的空间。 它使用一个表示列的nvarchar“数组”,并且在那里进行所有的计算,加上一些helper intvariables。
优化:
- 跳过共享前缀和/或后缀的处理
- 如果较大的string以较小的string开始或结束,则提前返回
- 如果尺寸不同,可以保证最大距离
- 仅使用表示matrix中的列的单个数组(实现为nvarchar)
- 当给定最大距离时,时间复杂度从(len1 * len2)到(min(len1,len2)),即线性
- 当给出最大距离时,一旦知道最大距离界限,尽早返回是不可实现的
在TSQL的Levenshtein 博客文章中对优化进行了更详细的描述,并将其链接到另一个与Damerau-Levenshtein类似的文章。 但是这里是代码(2014年1月20日更新,以加速更多):
-- ============================================= -- Computes and returns the Levenshtein edit distance between two strings, ie the -- number of insertion, deletion, and sustitution edits required to transform one -- string to the other, or NULL if @max is exceeded. Comparisons use the case- -- sensitivity configured in SQL Server (case-insensitive by default). -- http://blog.softwx.net/2014/12/optimizing-levenshtein-algorithm-in-tsql.html -- -- Based on Sten Hjelmqvist's "Fast, memory efficient" algorithm, described -- at http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm, -- with some additional optimizations. -- ============================================= CREATE FUNCTION [dbo].[Levenshtein]( @s nvarchar(4000) , @t nvarchar(4000) , @max int ) RETURNS int WITH SCHEMABINDING AS BEGIN DECLARE @distance int = 0 -- return variable , @v0 nvarchar(4000)-- running scratchpad for storing computed distances , @start int = 1 -- index (1 based) of first non-matching character between the two string , @i int, @j int -- loop counters: i for s string and j for t string , @diag int -- distance in cell diagonally above and left if we were using an m by n matrix , @left int -- distance in cell to the left if we were using an m by n matrix , @sChar nchar -- character at index i from s string , @thisJ int -- temporary storage of @j to allow SELECT combining , @jOffset int -- offset used to calculate starting value for j loop , @jEnd int -- ending value for j loop (stopping point for processing a column) -- get input string lengths including any trailing spaces (which SQL Server would otherwise ignore) , @sLen int = datalength(@s) / datalength(left(left(@s, 1) + '.', 1)) -- length of smaller string , @tLen int = datalength(@t) / datalength(left(left(@t, 1) + '.', 1)) -- length of larger string , @lenDiff int -- difference in length between the two strings -- if strings of different lengths, ensure shorter string is in s. This can result in a little -- faster speed by spending more time spinning just the inner loop during the main processing. IF (@sLen > @tLen) BEGIN SELECT @v0 = @s, @i = @sLen -- temporarily use v0 for swap SELECT @s = @t, @sLen = @tLen SELECT @t = @v0, @tLen = @i END SELECT @max = ISNULL(@max, @tLen) , @lenDiff = @tLen - @sLen IF @lenDiff > @max RETURN NULL -- suffix common to both strings can be ignored WHILE(@sLen > 0 AND SUBSTRING(@s, @sLen, 1) = SUBSTRING(@t, @tLen, 1)) SELECT @sLen = @sLen - 1, @tLen = @tLen - 1 IF (@sLen = 0) RETURN @tLen -- prefix common to both strings can be ignored WHILE (@start < @sLen AND SUBSTRING(@s, @start, 1) = SUBSTRING(@t, @start, 1)) SELECT @start = @start + 1 IF (@start > 1) BEGIN SELECT @sLen = @sLen - (@start - 1) , @tLen = @tLen - (@start - 1) -- if all of shorter string matches prefix and/or suffix of longer string, then -- edit distance is just the delete of additional characters present in longer string IF (@sLen <= 0) RETURN @tLen SELECT @s = SUBSTRING(@s, @start, @sLen) , @t = SUBSTRING(@t, @start, @tLen) END -- initialize v0 array of distances SELECT @v0 = '', @j = 1 WHILE (@j <= @tLen) BEGIN SELECT @v0 = @v0 + NCHAR(CASE WHEN @j > @max THEN @max ELSE @j END) SELECT @j = @j + 1 END SELECT @jOffset = @max - @lenDiff , @i = 1 WHILE (@i <= @sLen) BEGIN SELECT @distance = @i , @diag = @i - 1 , @sChar = SUBSTRING(@s, @i, 1) -- no need to look beyond window of upper left diagonal (@i) + @max cells -- and the lower right diagonal (@i - @lenDiff) - @max cells , @j = CASE WHEN @i <= @jOffset THEN 1 ELSE @i - @jOffset END , @jEnd = CASE WHEN @i + @max >= @tLen THEN @tLen ELSE @i + @max END WHILE (@j <= @jEnd) BEGIN -- at this point, @distance holds the previous value (the cell above if we were using an m by n matrix) SELECT @left = UNICODE(SUBSTRING(@v0, @j, 1)) , @thisJ = @j SELECT @distance = CASE WHEN (@sChar = SUBSTRING(@t, @j, 1)) THEN @diag --match, no change ELSE 1 + CASE WHEN @diag < @left AND @diag < @distance THEN @diag --substitution WHEN @left < @distance THEN @left -- insertion ELSE @distance -- deletion END END SELECT @v0 = STUFF(@v0, @thisJ, 1, NCHAR(@distance)) , @diag = @left , @j = case when (@distance > @max) AND (@thisJ = @i + @lenDiff) then @jEnd + 2 else @thisJ + 1 end END SELECT @i = CASE WHEN @j > @jEnd + 1 THEN @sLen + 1 ELSE @i + 1 END END RETURN CASE WHEN @distance <= @max THEN @distance ELSE NULL END END
IIRC,使用SQL Server 2005和更高版本,您可以使用任何.NET语言编写存储过程: 在SQL Server 2005中使用CLR集成 。 因此,编写计算莱文斯坦距离的程序不应该很困难。
一个简单的你好,世界! 从帮助中提取:
using System; using System.Data; using Microsoft.SqlServer.Server; using System.Data.SqlTypes; public class HelloWorldProc { [Microsoft.SqlServer.Server.SqlProcedure] public static void HelloWorld(out string text) { SqlContext.Pipe.Send("Hello world!" + Environment.NewLine); text = "Hello world!"; } }
然后在你的SQL Server中运行以下命令:
CREATE ASSEMBLY helloworld from 'c:\helloworld.dll' WITH PERMISSION_SET = SAFE CREATE PROCEDURE hello @i nchar(25) OUTPUT AS EXTERNAL NAME helloworld.HelloWorldProc.HelloWorld
现在你可以testing运行它:
DECLARE @J nchar(25) EXEC hello @J out PRINT @J
希望这可以帮助。
您可以使用Levenshtein距离algorithm来比较string
CREATE FUNCTION edit_distance(@s1 nvarchar(3999), @s2 nvarchar(3999)) RETURNS int AS BEGIN DECLARE @s1_len int, @s2_len int DECLARE @i int, @j int, @s1_char nchar, @c int, @c_temp int DECLARE @cv0 varbinary(8000), @cv1 varbinary(8000) SELECT @s1_len = LEN(@s1), @s2_len = LEN(@s2), @cv1 = 0x0000, @j = 1, @i = 1, @c = 0 WHILE @j <= @s2_len SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1 WHILE @i <= @s1_len BEGIN SELECT @s1_char = SUBSTRING(@s1, @i, 1), @c = @i, @cv0 = CAST(@i AS binary(2)), @j = 1 WHILE @j <= @s2_len BEGIN SET @c = @c + 1 SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j-1, 2) AS int) + CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END IF @c > @c_temp SET @c = @c_temp SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j+1, 2) AS int)+1 IF @c > @c_temp SET @c = @c_temp SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1 END SELECT @cv1 = @cv0, @i = @i + 1 END RETURN @c END
(function由Joseph Gama开发)
用法:
select dbo.edit_distance('Fuzzy String Match','fuzzy string match'), dbo.edit_distance('fuzzy','fuzy'), dbo.edit_distance('Fuzzy String Match','fuzy string match'), dbo.edit_distance('levenshtein distance sql','levenshtein sql server'), dbo.edit_distance('distance','server')
该algorithm简单地返回stpe计数,通过在一个步骤replace不同的字符来将一个string改变为另一个string
我正在寻找Levenshteinalgorithm的代码示例,并很高兴在这里find它。 当然,我想知道这个algorithm是如何工作的,而且我正在玩一个上面的例子,我正在玩一些Veve发布的例子 。 为了更好地理解代码,我使用Matrix创build了一个EXCEL。
距离FUZZY与FUZY相比
图片说超过1000字。
有了这个EXCEL,我发现有额外的性能优化的潜力。 右上方红色区域中的所有值都不需要计算。 每个红色单元格的值将导致左侧单元格的值加1.这是因为第二个string在该区域中的长度总是比第一个string长,这样每个字符的距离就会增加1。
您可以使用IF @j <= @i语句并增加@i的值来反映这一点。
CREATE FUNCTION [dbo].[f_LevenshteinDistance](@s1 nvarchar(3999), @s2 nvarchar(3999)) RETURNS int AS BEGIN DECLARE @s1_len int; DECLARE @s2_len int; DECLARE @i int; DECLARE @j int; DECLARE @s1_char nchar; DECLARE @c int; DECLARE @c_temp int; DECLARE @cv0 varbinary(8000); DECLARE @cv1 varbinary(8000); SELECT @s1_len = LEN(@s1), @s2_len = LEN(@s2), @cv1 = 0x0000 , @j = 1 , @i = 1 , @c = 0 WHILE @j <= @s2_len SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1; WHILE @i <= @s1_len BEGIN SELECT @s1_char = SUBSTRING(@s1, @i, 1), @c = @i , @cv0 = CAST(@i AS binary(2)), @j = 1; SET @i = @i + 1; WHILE @j <= @s2_len BEGIN SET @c = @c + 1; IF @j <= @i BEGIN SET @c_temp = CAST(SUBSTRING(@cv1, @j + @j - 1, 2) AS int) + CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END; IF @c > @c_temp SET @c = @c_temp SET @c_temp = CAST(SUBSTRING(@cv1, @j + @j + 1, 2) AS int) + 1; IF @c > @c_temp SET @c = @c_temp; END; SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1; END; SET @cv1 = @cv0; END; RETURN @c; END;
让我先说我知道这是可怕的。 然而,我正在使用HIVE QL,但是对于udf还不够了解所以我创build了这个安迪 – 施特恩的怪物…这绝对不是很漂亮,但是我觉得这听起来不错。 你怎么看?
DECLARE @A VARCHAR(20),@B VARCHAR(20) SET @A = 'AAIRAA' SET @B = 'ALASKA AIR' SELECT CASE WHEN RUNME = 0 THEN 0 ELSE (SUM(CASE WHEN A13 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11+A12+A13 WHEN A12 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11+A12 WHEN A11 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7+A8+A9+A10+A11 WHEN A10 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7+A8+A9+A10 WHEN A9 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7+A8+A9 WHEN A8 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7+A8 WHEN A7 IS NOT NULL THEN A1+A2+A3+A4+A5+A6+A7 WHEN A6 IS NOT NULL THEN A1+A2+A3+A4+A5+A6 WHEN A5 IS NOT NULL THEN A1+A2+A3+A4+A5 WHEN A4 IS NOT NULL THEN A1+A2+A3+A4 WHEN A3 IS NOT NULL THEN A1+A2+A3 WHEN A2 IS NOT NULL THEN A1+A2 WHEN A1 IS NOT NULL THEN A1 ELSE 0 END)*1.0)/ ((13-SUM(CASE WHEN A13 IS NULL THEN 1 ELSE 0 END + CASE WHEN A12 IS NULL THEN 1 ELSE 0 END + CASE WHEN A11 IS NULL THEN 1 ELSE 0 END + CASE WHEN A10 IS NULL THEN 1 ELSE 0 END + CASE WHEN A9 IS NULL THEN 1 ELSE 0 END + CASE WHEN A8 IS NULL THEN 1 ELSE 0 END + CASE WHEN A7 IS NULL THEN 1 ELSE 0 END + CASE WHEN A6 IS NULL THEN 1 ELSE 0 END + CASE WHEN A5 IS NULL THEN 1 ELSE 0 END + CASE WHEN A4 IS NULL THEN 1 ELSE 0 END + CASE WHEN A3 IS NULL THEN 1 ELSE 0 END + CASE WHEN A2 IS NULL THEN 1 ELSE 0 END + CASE WHEN A1 IS NULL THEN 1 ELSE 0 END))*1.0) END AS MATCHY FROM ( SELECT CASE WHEN LEN(@A) < 6 THEN 0 WHEN LEN(@B) < 6 THEN 0 ELSE 1 END AS RUNME, CASE WHEN SUBSTRING(@A, 1, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 1, 3), '%') THEN 1 ELSE 0 END AS A1, CASE WHEN SUBSTRING(@A, 2, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 2, 3), '%') THEN 1 ELSE 0 END AS A2, CASE WHEN SUBSTRING(@A, 3, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 3, 3), '%') THEN 1 ELSE 0 END AS A3, CASE WHEN SUBSTRING(@A, 4, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 4, 3), '%') THEN 1 ELSE 0 END AS A4, CASE WHEN SUBSTRING(@A, 5, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 5, 3), '%') THEN 1 ELSE 0 END AS A5, CASE WHEN SUBSTRING(@A, 6, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 6, 3), '%') THEN 1 ELSE 0 END AS A6, CASE WHEN SUBSTRING(@A, 7, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 7, 3), '%') THEN 1 ELSE 0 END AS A7, CASE WHEN SUBSTRING(@A, 8, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 8, 3), '%') THEN 1 ELSE 0 END AS A8, CASE WHEN SUBSTRING(@A, 9, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 9, 3), '%') THEN 1 ELSE 0 END AS A9, CASE WHEN SUBSTRING(@A, 10, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 10, 3), '%') THEN 1 ELSE 0 END AS A10, CASE WHEN SUBSTRING(@A, 11, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 11, 3), '%') THEN 1 ELSE 0 END AS A11, CASE WHEN SUBSTRING(@A, 12, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 12, 3), '%') THEN 1 ELSE 0 END AS A12, CASE WHEN SUBSTRING(@A, 13, 3) ='' THEN NULL WHEN @B LIKE CONCAT('%', SUBSTRING(@A, 13, 3), '%') THEN 1 ELSE 0 END AS A13 )SUB GROUP BY RUNME