further optimize repetition detection

This commit is contained in:
Akash Mozumdar 2019-08-09 22:11:34 -04:00
parent 488a19134a
commit 60372a6b8a
2 changed files with 13 additions and 14 deletions

View File

@ -1,7 +1,5 @@
#include "extension.h"
constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
std::vector<int> GenerateSuffixArray(const std::wstring& text)
{
std::vector<int> identity(text.size());
@ -13,7 +11,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
std::vector<int> classes(text.begin(), text.end());
for (int length = 1; length < text.size(); length *= 2)
{
// Determine equivalence class up to length, by checking length/2 equivalence of suffixes and their following length/2 suffixes
// Determine equivalence class up to length, by checking length / 2 equivalence of suffixes and their following length / 2 suffixes
std::vector<int> oldClasses = classes;
classes[suffixArray[0]] = 0;
for (int i = 1; i < text.size(); ++i)
@ -26,8 +24,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
else classes[currentSuffix] = i;
}
// Sort within equivalence class based on order of following suffix after length
// Orders up to length*2
// Sort within equivalence class based on order of following suffix after length (orders up to length * 2)
std::vector<int> count = identity;
for (auto suffix : std::vector(suffixArray))
{
@ -40,35 +37,38 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
return suffixArray;
}
constexpr wchar_t ERASED = 0xf246; // inside Unicode private use area
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
if (sentenceInfo["text number"] == 0) return false;
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
// It then looks for any regions that are at least twice as long as the substring and made up only of characters found in the substring, and erases them
// If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string
// If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
std::vector<int> suffixArray = GenerateSuffixArray(sentence);
for (int i = 0; i + 1 < sentence.size(); ++i)
{
int commonPrefixLength = 0;
for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
if (sentence[j] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
else break;
if (commonPrefixLength > 6)
{
std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength);
std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end());
std::wstring substring(sentence, suffixArray[i], commonPrefixLength);
bool substringCharMap[0x10000] = {};
for (auto ch : substring)
substringCharMap[ch] = true;
for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1;
if (substringCharMap[sentence[j]]) regionSize += 1;
else if (regionSize >= commonPrefixLength * 2)
while (regionSize > 0)
sentence[j - regionSize--] = ERASED;
else regionSize = 0;
if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str()))
std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
if (!wcsstr(sentence.c_str(), substring.c_str())) std::copy(substring.begin(), substring.end(), sentence.begin() + max(suffixArray[i], suffixArray[i + 1]));
}
}
sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());

View File

@ -4,8 +4,7 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
if (sentenceInfo["text number"] == 0) return false;
// This algorithm looks at all substrings which start at the beginning of the sentence
// If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated
// This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
// Complexity is O(N^3), so executing for N > 10,000 is dangerous
if (sentence.size() > 10000) return false;
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);