further optimize repetition detection

This commit is contained in:
Akash Mozumdar 2019-08-09 22:11:34 -04:00
parent 488a19134a
commit 60372a6b8a
2 changed files with 13 additions and 14 deletions

View File

@ -1,7 +1,5 @@
#include "extension.h" #include "extension.h"
constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
std::vector<int> GenerateSuffixArray(const std::wstring& text) std::vector<int> GenerateSuffixArray(const std::wstring& text)
{ {
std::vector<int> identity(text.size()); std::vector<int> identity(text.size());
@ -13,7 +11,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
std::vector<int> classes(text.begin(), text.end()); std::vector<int> classes(text.begin(), text.end());
for (int length = 1; length < text.size(); length *= 2) for (int length = 1; length < text.size(); length *= 2)
{ {
// Determine equivalence class up to length, by checking length/2 equivalence of suffixes and their following length/2 suffixes // Determine equivalence class up to length, by checking length / 2 equivalence of suffixes and their following length / 2 suffixes
std::vector<int> oldClasses = classes; std::vector<int> oldClasses = classes;
classes[suffixArray[0]] = 0; classes[suffixArray[0]] = 0;
for (int i = 1; i < text.size(); ++i) for (int i = 1; i < text.size(); ++i)
@ -26,8 +24,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
else classes[currentSuffix] = i; else classes[currentSuffix] = i;
} }
// Sort within equivalence class based on order of following suffix after length // Sort within equivalence class based on order of following suffix after length (orders up to length * 2)
// Orders up to length*2
std::vector<int> count = identity; std::vector<int> count = identity;
for (auto suffix : std::vector(suffixArray)) for (auto suffix : std::vector(suffixArray))
{ {
@ -40,35 +37,38 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
return suffixArray; return suffixArray;
} }
constexpr wchar_t ERASED = 0xf246; // inside Unicode private use area
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{ {
if (sentenceInfo["text number"] == 0) return false; if (sentenceInfo["text number"] == 0) return false;
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6 // This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
// It then looks for any region, at least twice as long as the substring, that consists only of characters appearing in the substring, and erases it // It then looks for any region, at least twice as long as the substring, that consists only of characters appearing in the substring, and erases it
// If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string // If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
std::vector<int> suffixArray = GenerateSuffixArray(sentence); std::vector<int> suffixArray = GenerateSuffixArray(sentence);
for (int i = 0; i + 1 < sentence.size(); ++i) for (int i = 0; i + 1 < sentence.size(); ++i)
{ {
int commonPrefixLength = 0; int commonPrefixLength = 0;
for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k) for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1; if (sentence[j] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
else break; else break;
if (commonPrefixLength > 6) if (commonPrefixLength > 6)
{ {
std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength); std::wstring substring(sentence, suffixArray[i], commonPrefixLength);
std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end()); bool substringCharMap[0x10000] = {};
for (auto ch : substring)
substringCharMap[ch] = true;
for (int regionSize = 0, j = 0; j <= sentence.size(); ++j) for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1; if (substringCharMap[sentence[j]]) regionSize += 1;
else if (regionSize >= commonPrefixLength * 2) else if (regionSize >= commonPrefixLength * 2)
while (regionSize > 0) while (regionSize > 0)
sentence[j - regionSize--] = ERASED; sentence[j - regionSize--] = ERASED;
else regionSize = 0; else regionSize = 0;
if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str())) if (!wcsstr(sentence.c_str(), substring.c_str())) std::copy(substring.begin(), substring.end(), sentence.begin() + max(suffixArray[i], suffixArray[i + 1]));
std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
} }
} }
sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end()); sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());

View File

@ -4,8 +4,7 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{ {
if (sentenceInfo["text number"] == 0) return false; if (sentenceInfo["text number"] == 0) return false;
// This algorithm looks at all substrings which start at the beginning of the sentence // This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
// If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated
// Complexity is O(N^3), so running this for N > 10,000 is dangerous // Complexity is O(N^3), so running this for N > 10,000 is dangerous
if (sentence.size() > 10000) return false; if (sentence.size() > 10000) return false;
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1); auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);