183 lines
4.8 KiB
C++
Raw Permalink Normal View History

2024-01-08 23:37:00 +08:00
2024-04-02 15:36:52 +08:00
/**
 * Compute the edit distance between two wide-character strings.
 *
 * Insertions and deletions always cost 1.  The cost of a substitution is
 * selected by @p xcost.  Only a single row of the cost matrix is kept in
 * memory at any time.
 *
 * @param len1     Number of characters in @p string1.
 * @param string1  First string; only the first @p len1 characters are read.
 * @param len2     Number of characters in @p string2.
 * @param string2  Second string; only the first @p len2 characters are read.
 * @param xcost    If nonzero, a substitution costs 2 (the same as a deletion
 *                 followed by an insertion); if zero, it costs 1.
 * @return The edit distance, or (size_t)(-1) if the working row could not be
 *         allocated.
 */
size_t lev_u_edit_distance(size_t len1, const wchar_t *string1,
                           size_t len2, const wchar_t *string2,
                           int xcost)
{
    size_t i;
    size_t *row; /* we only need to keep one row of costs */
    size_t *end;
    size_t half;

    /* strip common prefix */
    while (len1 > 0 && len2 > 0 && *string1 == *string2)
    {
        len1--;
        len2--;
        string1++;
        string2++;
    }

    /* strip common suffix */
    while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
    {
        len1--;
        len2--;
    }

    /* catch trivial cases */
    if (len1 == 0)
        return len2;
    if (len2 == 0)
        return len1;

    /* make the inner cycle (i.e. string2) the longer one */
    if (len1 > len2)
    {
        size_t nx = len1;
        const wchar_t *sx = string1;
        len1 = len2;
        len2 = nx;
        string1 = string2;
        string2 = sx;
    }

    /* check len1 == 1 separately */
    if (len1 == 1)
    {
        wchar_t z = *string1;
        const wchar_t *p = string2;
        for (i = len2; i; i--)
        {
            /* the single character occurs in string2: drop all the others */
            if (*(p++) == z)
                return len2 - 1;
        }
        /* not present: one substitution (cost 1 or 2) plus len2 - 1 indels */
        return len2 + (xcost != 0);
    }

    /* from here on len1/len2 are matrix dimensions (string length + 1) */
    len1++;
    len2++;
    half = len1 >> 1;

    /* initialize first row; refuse sizes whose byte count would overflow */
    if (len2 > (size_t)(-1) / sizeof(size_t))
        return (size_t)(-1);
    row = (size_t *)malloc(len2 * sizeof(size_t));
    if (!row)
        return (size_t)(-1);
    end = row + len2 - 1;
    /* when xcost == 0 the last `half` cells are left unset here; the loop
     * below moves `end` so that each of them is written before it is read */
    for (i = 0; i < len2 - (xcost ? 0 : half); i++)
        row[i] = i;

    /* go through the matrix and compute the costs. yes, this is an extremely
     * obfuscated version, but also extremely memory-conservative and relatively
     * fast. */
    if (xcost)
    {
        for (i = 1; i < len1; i++)
        {
            size_t *p = row + 1;
            const wchar_t char1 = string1[i - 1];
            const wchar_t *char2p = string2;
            size_t D = i - 1; /* diagonal (upper-left) neighbour */
            size_t x = i;     /* left neighbour */
            while (p <= end)
            {
                if (char1 == *(char2p++))
                    x = D;
                else
                    x++;
                D = *p; /* value from the previous row, i.e. the cell above */
                if (x > D + 1)
                    x = D + 1;
                *(p++) = x;
            }
        }
    }
    else
    {
        /* in this case we don't have to scan two corner triangles (of size
         * len1/2) in the matrix because no best path can go through them.
         * note this breaks when len1 == len2 == 2 so the len1 == 1 special
         * case above is necessary */
        row[0] = len1 - half - 1;
        for (i = 1; i < len1; i++)
        {
            size_t *p;
            const wchar_t char1 = string1[i - 1];
            const wchar_t *char2p;
            size_t D, x;

            /* skip the upper triangle */
            if (i >= len1 - half)
            {
                size_t offset = i - (len1 - half);
                size_t c3;

                char2p = string2 + offset;
                p = row + offset;
                c3 = *(p++) + (char1 != *(char2p++));
                x = *p;
                x++;
                D = x;
                if (x > c3)
                    x = c3;
                *(p++) = x;
            }
            else
            {
                p = row + 1;
                char2p = string2;
                D = x = i;
            }

            /* skip the lower triangle */
            if (i <= half + 1)
                end = row + len2 + i - half - 2;

            /* main */
            while (p <= end)
            {
                size_t c3 = --D + (char1 != *(char2p++));
                x++;
                if (x > c3)
                    x = c3;
                D = *p;
                D++;
                if (x > D)
                    x = D;
                *(p++) = x;
            }

            /* lower triangle sentinel */
            if (i <= half)
            {
                size_t c3 = --D + (char1 != *char2p);
                x++;
                if (x > c3)
                    x = c3;
                *p = x;
            }
        }
    }

    i = *end;
    free(row);
    return i;
}
2024-11-15 03:40:55 +08:00
#ifndef WINXP
#include <rapidfuzz/distance.hpp>
#endif
2024-11-05 15:46:45 +08:00
/**
 * Exported entry point: edit distance between two wide-character strings.
 *
 * When WINXP is not defined the work is delegated to
 * rapidfuzz::levenshtein_distance, called with its default arguments.  When
 * WINXP is defined the in-file lev_u_edit_distance() is used with xcost = 0,
 * i.e. a substitution costs 1.
 *
 * @param len1     Number of characters in @p string1.
 * @param string1  First string.
 * @param len2     Number of characters in @p string2.
 * @param string2  Second string.
 * @return The edit distance.  In WINXP builds the fallback may return
 *         (size_t)(-1) when it cannot allocate its working buffer.
 */
DECLARE_API size_t levenshtein_distance(size_t len1, const wchar_t *string1,
                                        size_t len2, const wchar_t *string2)
{
#ifndef WINXP
    return rapidfuzz::levenshtein_distance(std::wstring_view(string1, len1),
                                           std::wstring_view(string2, len2));
#else
    return lev_u_edit_distance(len1, string1, len2, string2, 0);
#endif
}
2024-11-05 15:46:45 +08:00
/**
 * Exported entry point: similarity ratio of two wide-character strings,
 * computed as (len1 + len2 - distance) / (len1 + len2).
 *
 * @param len1     Number of characters in @p string1.
 * @param string1  First string.
 * @param len2     Number of characters in @p string2.
 * @param string2  Second string.
 * @return The ratio as a double.  Returns 0 when both lengths are zero, and
 *         0 when the distance could not be computed.
 */
DECLARE_API double levenshtein_ratio(size_t len1, const wchar_t *string1,
                                     size_t len2, const wchar_t *string2)
{
    /* NOTE(review): the two configurations do not use the same metric.  The
     * default build goes through levenshtein_distance(), while the WINXP
     * build passes xcost = 1 so a substitution costs 2.  For example "a" vs
     * "b" yields 0.5 in one build and 0 in the other.  Confirm which one is
     * intended before unifying them. */
#ifndef WINXP
    const size_t ldist = levenshtein_distance(len1, string1, len2, string2);
#else
    const size_t ldist = lev_u_edit_distance(len1, string1, len2, string2, 1);
#endif
    const size_t lensum = len1 + len2;
    if (lensum == 0)
        return 0;
    /* A genuine distance never exceeds the combined length.  A larger value
     * is the (size_t)(-1) allocation-failure sentinel of
     * lev_u_edit_distance(); without this check the unsigned subtraction
     * below would wrap around and produce a ratio greater than 1. */
    if (ldist > lensum)
        return 0;
    return static_cast<double>(lensum - ldist) / lensum;
}