From 60372a6b8a070e392e958264ab0600e8f1b26197 Mon Sep 17 00:00:00 2001 From: Akash Mozumdar Date: Fri, 9 Aug 2019 22:11:34 -0400 Subject: [PATCH] further optimize repetition detection --- extensions/removerepeatphrase.cpp | 24 ++++++++++++------------ extensions/removerepeatphrase2.cpp | 3 +-- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/extensions/removerepeatphrase.cpp b/extensions/removerepeatphrase.cpp index d298244..15e20b5 100644 --- a/extensions/removerepeatphrase.cpp +++ b/extensions/removerepeatphrase.cpp @@ -1,7 +1,5 @@ #include "extension.h" -constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area - std::vector GenerateSuffixArray(const std::wstring& text) { std::vector identity(text.size()); @@ -13,7 +11,7 @@ std::vector GenerateSuffixArray(const std::wstring& text) std::vector classes(text.begin(), text.end()); for (int length = 1; length < text.size(); length *= 2) { - // Determine equivalence class up to length, by checking length/2 equivalence of suffixes and their following length/2 suffixes + // Determine equivalence class up to length, by checking length / 2 equivalence of suffixes and their following length / 2 suffixes std::vector oldClasses = classes; classes[suffixArray[0]] = 0; for (int i = 1; i < text.size(); ++i) @@ -26,8 +24,7 @@ std::vector GenerateSuffixArray(const std::wstring& text) else classes[currentSuffix] = i; } - // Sort within equivalence class based on order of following suffix after length - // Orders up to length*2 + // Sort within equivalence class based on order of following suffix after length (orders up to length * 2) std::vector count = identity; for (auto suffix : std::vector(suffixArray)) { @@ -40,35 +37,38 @@ std::vector GenerateSuffixArray(const std::wstring& text) return suffixArray; } +constexpr wchar_t ERASED = 0xf246; // inside Unicode private use area + bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) { if (sentenceInfo["text number"] == 0) return false; // This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6 // It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them - // If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string + // If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string std::vector suffixArray = GenerateSuffixArray(sentence); for (int i = 0; i + 1 < sentence.size(); ++i) { int commonPrefixLength = 0; for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k) - if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1; + if (sentence[j] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1; else break; if (commonPrefixLength > 6) { - std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength); - std::unordered_set> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end()); + std::wstring substring(sentence, suffixArray[i], commonPrefixLength); + bool substringCharMap[0x10000] = {}; + for (auto ch : substring) + substringCharMap[ch] = true; for (int regionSize = 0, j = 0; j <= sentence.size(); ++j) - if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1; + if (substringCharMap[sentence[j]]) regionSize += 1; else if (regionSize >= commonPrefixLength * 2) while (regionSize > 0) sentence[j - regionSize--] = ERASED; else regionSize = 0; - if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str())) - std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1])); + if (!wcsstr(sentence.c_str(), substring.c_str())) std::copy(substring.begin(), substring.end(), sentence.begin() + max(suffixArray[i], suffixArray[i + 1])); } } sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end()); diff --git a/extensions/removerepeatphrase2.cpp b/extensions/removerepeatphrase2.cpp index aadf851..ce0a42f 100644 --- a/extensions/removerepeatphrase2.cpp +++ b/extensions/removerepeatphrase2.cpp @@ -4,8 +4,7 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) { if (sentenceInfo["text number"] == 0) return false; - // This algorithm looks at all substrings which start at the beginning of the sentence - // If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated + // This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated // Complexity O(N^3) so executing for N > 10,000 dangerous if (sentence.size() > 10000) return false; auto data = std::make_unique(sentence.size() + 1);