From c38551905d80694795d951605dc5d6d310508d9d Mon Sep 17 00:00:00 2001
From: Akash Mozumdar
Date: Wed, 7 Aug 2019 14:05:50 -0400
Subject: [PATCH] upgrade repetition detection algorithm

---
Review notes (this region is ignored when the patch is applied and is not part
of the commit message):
- This patch was recovered from a copy in which newlines and indentation had
  been collapsed and every <...> span had been stripped. The line structure
  below was rebuilt so that each hunk matches its @@ header and the diffstat
  (91 insertions, 26 deletions).
- Template arguments were reconstructed from how the values are used:
  std::make_unique<wchar_t[]> (result assigned to wchar_t*), std::vector<int>
  (elements are int indices), std::unordered_set<wchar_t, ...> (built from
  std::wstring iterators). The Identity<wchar_t> hasher and the types in the
  replacer.cpp hunk (std::unique_ptr<Node>, std::optional<std::wstring>) are
  inferred -- TODO confirm against upstream commit c385519 before applying.
- The sender address on the From: line was stripped and has not been
  restored; mail-based tooling may reject the header until it is filled in.
- Leading whitespace is reconstructed (tabs assumed); if context lines fail
  to match, apply with `git apply --ignore-whitespace`.
- NOTE(review): the rewritten ProcessSentence in removerepeatphrase.cpp no
  longer has the `sentence.size() > 15000` early-out that the removed code
  had, and it returns true unconditionally after the "text number" check.

 deploy.ps1                         |  1 +
 extensions/CMakeLists.txt          |  1 +
 extensions/removerepeatphrase.cpp  | 61 ++++++++++++++++++------------
 extensions/removerepeatphrase2.cpp | 52 +++++++++++++++++++++++++
 extensions/replacer.cpp            |  2 +-
 5 files changed, 91 insertions(+), 26 deletions(-)
 create mode 100644 extensions/removerepeatphrase2.cpp

diff --git a/deploy.ps1 b/deploy.ps1
index 3c6ed7b..1ca8021 100644
--- a/deploy.ps1
+++ b/deploy.ps1
@@ -41,6 +41,7 @@ foreach ($language in @{
 		"Regex Filter.dll",
 		"Remove Repeated Characters.dll",
 		"Remove Repeated Phrases.dll",
+		"Remove Repeated Phrases 2.dll",
 		"Remove 30 Repeated Sentences.dll",
 		"Replacer.dll",
 		"Thread Linker.dll"
diff --git a/extensions/CMakeLists.txt b/extensions/CMakeLists.txt
index af0577b..29e8652 100644
--- a/extensions/CMakeLists.txt
+++ b/extensions/CMakeLists.txt
@@ -13,6 +13,7 @@ add_library(Lua MODULE lua.cpp extensionimpl.cpp)
 add_library(Regex\ Filter MODULE regexfilter.cpp extensionimpl.cpp)
 add_library(Remove\ Repeated\ Characters MODULE removerepeatchar.cpp extensionimpl.cpp)
 add_library(Remove\ Repeated\ Phrases MODULE removerepeatphrase.cpp extensionimpl.cpp)
+add_library(Remove\ Repeated\ Phrases\ 2 MODULE removerepeatphrase2.cpp extensionimpl.cpp)
 add_library(Remove\ 30\ Repeated\ Sentences MODULE removerepeatsentence.cpp extensionimpl.cpp)
 add_library(Replacer MODULE replacer.cpp extensionimpl.cpp)
 add_library(Thread\ Linker MODULE threadlinker.cpp extensionimpl.cpp)
diff --git a/extensions/removerepeatphrase.cpp b/extensions/removerepeatphrase.cpp
index 38f2153..0b16c41 100644
--- a/extensions/removerepeatphrase.cpp
+++ b/extensions/removerepeatphrase.cpp
@@ -1,44 +1,55 @@
 #include "extension.h"
 
+constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
+
 bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
 {
 	if (sentenceInfo["text number"] == 0) return false;
 
-	if (sentence.size() > 15000) return false; // this algorithm is O(N^3) so if N > 15000 it's extremely slow
-	auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
-	wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str());
-	wchar_t* dataEnd = data.get() + sentence.size();
-	int skip = 0, count = 0;
-	for (wchar_t* end = dataEnd; end - data.get() > skip; --end)
+	// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
+	// It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
+	// If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string
+	std::vector<int> suffixArray(sentence.size());
+	for (int i = 0; i < sentence.size(); ++i) suffixArray[i] = i;
+	std::sort(suffixArray.begin(), suffixArray.end(), [&](int a, int b) { return wcsncmp(sentence.c_str() + a, sentence.c_str() + b, 5000) > 0; });
+	for (int i = 0; i + 1 < sentence.size(); ++i)
 	{
-		std::swap(*end, *dataEnd);
-		int junkLength = end - data.get() - skip;
-		auto junkFound = wcsstr(sentence.c_str() + skip + junkLength, data.get() + skip);
-		std::swap(*end, *dataEnd);
-		if (junkFound)
+		int commonPrefixLength = 0;
+		for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
+			if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
+			else break;
+
+		if (commonPrefixLength > 6)
 		{
-			if (count && junkLength < min(skip / count, 4)) break;
-			skip += junkLength;
-			count += 1;
-			end = dataEnd;
+			std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength);
+			std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end());
+
+			for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
+				if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1;
+				else if (regionSize >= commonPrefixLength * 2)
+					while (regionSize > 0)
+						sentence[j - regionSize--] = ERASED;
+				else regionSize = 0;
+
+			if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str()))
+				std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
 		}
 	}
-	if (count && skip / count >= 3)
-	{
-		sentence = data.get() + skip;
-		return true;
-	}
-	return false;
+	sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());
+	return true;
 }
 
 TEST(
 	{
-		std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg";
-		std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg";
+		std::wstring cyclicRepeats = L"Name: '_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg'";
+		std::wstring buildupRepeats = L"Name: '__a_ab_abc_abcd_abcde_abcdef_abcdefg'";
+		std::wstring breakdownRepeats = L"Name: '_abcdefg_abcdef_abcde_abcd_abc_ab_a_'";
 		ProcessSentence(cyclicRepeats, { SentenceInfo::DUMMY });
 		ProcessSentence(buildupRepeats, { SentenceInfo::DUMMY });
-		assert(cyclicRepeats == L"_abcdefg");
-		assert(buildupRepeats == L"_abcdefg");
+		ProcessSentence(breakdownRepeats, { SentenceInfo::DUMMY });
+		assert(cyclicRepeats == L"Name: '_abcdefg'");
+		assert(buildupRepeats == L"Name: '_abcdefg'");
+		assert(breakdownRepeats == L"Name: '_abcdefg'");
 
 		std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
 		ProcessSentence(empty, { SentenceInfo::DUMMY });
diff --git a/extensions/removerepeatphrase2.cpp b/extensions/removerepeatphrase2.cpp
new file mode 100644
index 0000000..aadf851
--- /dev/null
+++ b/extensions/removerepeatphrase2.cpp
@@ -0,0 +1,52 @@
+#include "extension.h"
+
+bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
+{
+	if (sentenceInfo["text number"] == 0) return false;
+
+	// This algorithm looks at all substrings which start at the beginning of the sentence
+	// If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated
+	// Complexity O(N^3) so executing for N > 10,000 dangerous
+	if (sentence.size() > 10000) return false;
+	auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
+	wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str());
+	wchar_t* dataEnd = data.get() + sentence.size();
+	int skip = 0, count = 0;
+	for (wchar_t* end = dataEnd; end - data.get() > skip; --end)
+	{
+		std::swap(*end, *dataEnd);
+		int junkLength = end - data.get() - skip;
+		auto junkFound = wcsstr(sentence.c_str() + skip + junkLength, data.get() + skip);
+		std::swap(*end, *dataEnd);
+		if (junkFound)
+		{
+			if (count && junkLength < min(skip / count, 4)) break;
+			skip += junkLength;
+			count += 1;
+			end = dataEnd;
+		}
+	}
+	if (count && skip / count >= 3)
+	{
+		sentence = data.get() + skip;
+		return true;
+	}
+	return false;
+}
+
+TEST(
+	{
+		std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg";
+		std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg";
+		ProcessSentence(cyclicRepeats, { SentenceInfo::DUMMY });
+		ProcessSentence(buildupRepeats, { SentenceInfo::DUMMY });
+		assert(cyclicRepeats == L"_abcdefg");
+		assert(buildupRepeats == L"_abcdefg");
+
+		std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
+		ProcessSentence(empty, { SentenceInfo::DUMMY });
+		ProcessSentence(one, { SentenceInfo::DUMMY });
+		ProcessSentence(normal, { SentenceInfo::DUMMY });
+		assert(empty == L"" && one == L" " && normal == L"This is a normal sentence. はい");
+	}
+);
diff --git a/extensions/replacer.cpp b/extensions/replacer.cpp
index 2d41d58..5dbe6e4 100644
--- a/extensions/replacer.cpp
+++ b/extensions/replacer.cpp
@@ -62,7 +62,7 @@ private:
 
 	struct Node
 	{
-		std::unordered_map<wchar_t, std::unique_ptr<Node>> next;
+		std::unordered_map<wchar_t, std::unique_ptr<Node>, Identity<wchar_t>> next;
 		std::optional<std::wstring> value;
 	} root;
 } trie = { {} };