upgrade repetition detection algorithm

This commit is contained in:
Akash Mozumdar 2019-08-07 14:05:50 -04:00
parent f1e1257f21
commit c38551905d
5 changed files with 91 additions and 26 deletions

View File

@ -41,6 +41,7 @@ foreach ($language in @{
"Regex Filter.dll", "Regex Filter.dll",
"Remove Repeated Characters.dll", "Remove Repeated Characters.dll",
"Remove Repeated Phrases.dll", "Remove Repeated Phrases.dll",
"Remove Repeated Phrases 2.dll",
"Remove 30 Repeated Sentences.dll", "Remove 30 Repeated Sentences.dll",
"Replacer.dll", "Replacer.dll",
"Thread Linker.dll" "Thread Linker.dll"

View File

@ -13,6 +13,7 @@ add_library(Lua MODULE lua.cpp extensionimpl.cpp)
add_library(Regex\ Filter MODULE regexfilter.cpp extensionimpl.cpp) add_library(Regex\ Filter MODULE regexfilter.cpp extensionimpl.cpp)
add_library(Remove\ Repeated\ Characters MODULE removerepeatchar.cpp extensionimpl.cpp) add_library(Remove\ Repeated\ Characters MODULE removerepeatchar.cpp extensionimpl.cpp)
add_library(Remove\ Repeated\ Phrases MODULE removerepeatphrase.cpp extensionimpl.cpp) add_library(Remove\ Repeated\ Phrases MODULE removerepeatphrase.cpp extensionimpl.cpp)
add_library(Remove\ Repeated\ Phrases\ 2 MODULE removerepeatphrase2.cpp extensionimpl.cpp)
add_library(Remove\ 30\ Repeated\ Sentences MODULE removerepeatsentence.cpp extensionimpl.cpp) add_library(Remove\ 30\ Repeated\ Sentences MODULE removerepeatsentence.cpp extensionimpl.cpp)
add_library(Replacer MODULE replacer.cpp extensionimpl.cpp) add_library(Replacer MODULE replacer.cpp extensionimpl.cpp)
add_library(Thread\ Linker MODULE threadlinker.cpp extensionimpl.cpp) add_library(Thread\ Linker MODULE threadlinker.cpp extensionimpl.cpp)

View File

@ -1,44 +1,55 @@
#include "extension.h" #include "extension.h"
constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{ {
if (sentenceInfo["text number"] == 0) return false; if (sentenceInfo["text number"] == 0) return false;
if (sentence.size() > 15000) return false; // this algorithm is O(N^3) so if N > 15000 it's extremely slow // This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1); // It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str()); // If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string
wchar_t* dataEnd = data.get() + sentence.size(); std::vector<int> suffixArray(sentence.size());
int skip = 0, count = 0; for (int i = 0; i < sentence.size(); ++i) suffixArray[i] = i;
for (wchar_t* end = dataEnd; end - data.get() > skip; --end) std::sort(suffixArray.begin(), suffixArray.end(), [&](int a, int b) { return wcsncmp(sentence.c_str() + a, sentence.c_str() + b, 5000) > 0; });
for (int i = 0; i + 1 < sentence.size(); ++i)
{ {
std::swap(*end, *dataEnd); int commonPrefixLength = 0;
int junkLength = end - data.get() - skip; for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
auto junkFound = wcsstr(sentence.c_str() + skip + junkLength, data.get() + skip); if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
std::swap(*end, *dataEnd); else break;
if (junkFound)
if (commonPrefixLength > 6)
{ {
if (count && junkLength < min(skip / count, 4)) break; std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength);
skip += junkLength; std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end());
count += 1;
end = dataEnd; for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1;
else if (regionSize >= commonPrefixLength * 2)
while (regionSize > 0)
sentence[j - regionSize--] = ERASED;
else regionSize = 0;
if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str()))
std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
} }
} }
if (count && skip / count >= 3) sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());
{
sentence = data.get() + skip;
return true; return true;
} }
return false;
}
TEST( TEST(
{ {
std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg"; std::wstring cyclicRepeats = L"Name: '_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg'";
std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg"; std::wstring buildupRepeats = L"Name: '__a_ab_abc_abcd_abcde_abcdef_abcdefg'";
std::wstring breakdownRepeats = L"Name: '_abcdefg_abcdef_abcde_abcd_abc_ab_a_'";
ProcessSentence(cyclicRepeats, { SentenceInfo::DUMMY }); ProcessSentence(cyclicRepeats, { SentenceInfo::DUMMY });
ProcessSentence(buildupRepeats, { SentenceInfo::DUMMY }); ProcessSentence(buildupRepeats, { SentenceInfo::DUMMY });
assert(cyclicRepeats == L"_abcdefg"); ProcessSentence(breakdownRepeats, { SentenceInfo::DUMMY });
assert(buildupRepeats == L"_abcdefg"); assert(cyclicRepeats == L"Name: '_abcdefg'");
assert(buildupRepeats == L"Name: '_abcdefg'");
assert(breakdownRepeats == L"Name: '_abcdefg'");
std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい"; std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
ProcessSentence(empty, { SentenceInfo::DUMMY }); ProcessSentence(empty, { SentenceInfo::DUMMY });

View File

@ -0,0 +1,52 @@
#include "extension.h"
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
	if (sentenceInfo["text number"] == 0) return false;

	// Strategy: take every prefix of the not-yet-removed text, longest first.
	// Whenever a prefix also occurs later in the sentence, it is treated as repeated junk,
	// dropped from the front, and the scan starts over on what remains.
	// The nested searching is roughly cubic in the sentence length, so very long input is left alone.
	if (sentence.size() > 10000) return false;

	// Mutable copy of the sentence: a terminator is temporarily planted inside it
	// so that wcsstr can use "the current prefix" as its needle.
	const auto length = sentence.size();
	auto buffer = std::make_unique<wchar_t[]>(length + 1);
	wcscpy_s(buffer.get(), length + 1, sentence.c_str());
	wchar_t* const begin = buffer.get();
	wchar_t* const terminator = begin + length;

	int removed = 0; // characters dropped from the front so far
	int rounds = 0;  // how many prefixes have been dropped
	wchar_t* cut = terminator;
	while (cut - begin > removed)
	{
		// Exchange the character at cut with the trailing terminator: the buffer now ends at cut.
		std::swap(*cut, *terminator);
		int prefixLength = cut - begin - removed;
		// Look for the prefix in the untouched sentence, starting right after the prefix itself.
		const bool repeatedLater = wcsstr(sentence.c_str() + removed + prefixLength, begin + removed) != nullptr;
		// Put the buffer back exactly as it was.
		std::swap(*cut, *terminator);

		if (repeatedLater)
		{
			// Once something has been removed, stop if this match is short compared with
			// the average removed length (the threshold is capped at 4).
			if (rounds && prefixLength < min(removed / rounds, 4)) break;
			removed += prefixLength;
			rounds += 1;
			cut = terminator; // restart from the longest possible prefix of the remainder
		}
		--cut;
	}

	// Keep the result only if the removed pieces averaged at least 3 characters.
	if (rounds == 0 || removed / rounds < 3) return false;
	sentence = begin + removed;
	return true;
}
// Self-checks for ProcessSentence. TEST is defined outside this file (presumably via extension.h - confirm how and when it runs).
TEST(
{
// Repeated leading phrases: each input must collapse to the single trailing "_abcdefg".
std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg";
std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg";
ProcessSentence(cyclicRepeats, { SentenceInfo::DUMMY });
ProcessSentence(buildupRepeats, { SentenceInfo::DUMMY });
assert(cyclicRepeats == L"_abcdefg");
assert(buildupRepeats == L"_abcdefg");
// Inputs without repetition (empty, a single space, an ordinary sentence) must come back unchanged.
std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
ProcessSentence(empty, { SentenceInfo::DUMMY });
ProcessSentence(one, { SentenceInfo::DUMMY });
ProcessSentence(normal, { SentenceInfo::DUMMY });
assert(empty == L"" && one == L" " && normal == L"This is a normal sentence. はい");
}
);

View File

@ -62,7 +62,7 @@ private:
struct Node struct Node
{ {
std::unordered_map<wchar_t, std::unique_ptr<Node>> next; std::unordered_map<wchar_t, std::unique_ptr<Node>, Identity<wchar_t>> next;
std::optional<std::wstring> value; std::optional<std::wstring> value;
} root; } root;
} trie = { {} }; } trie = { {} };