forked from Public-Mirror/Textractor
further optimize repetition detection
This commit is contained in:
parent
488a19134a
commit
60372a6b8a
@ -1,7 +1,5 @@
|
|||||||
#include "extension.h"
|
#include "extension.h"
|
||||||
|
|
||||||
constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
|
|
||||||
|
|
||||||
std::vector<int> GenerateSuffixArray(const std::wstring& text)
|
std::vector<int> GenerateSuffixArray(const std::wstring& text)
|
||||||
{
|
{
|
||||||
std::vector<int> identity(text.size());
|
std::vector<int> identity(text.size());
|
||||||
@ -13,7 +11,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
|
|||||||
std::vector<int> classes(text.begin(), text.end());
|
std::vector<int> classes(text.begin(), text.end());
|
||||||
for (int length = 1; length < text.size(); length *= 2)
|
for (int length = 1; length < text.size(); length *= 2)
|
||||||
{
|
{
|
||||||
// Determine equivalence class up to length, by checking length/2 equivalence of suffixes and their following length/2 suffixes
|
// Determine equivalence class up to length, by checking length / 2 equivalence of suffixes and their following length / 2 suffixes
|
||||||
std::vector<int> oldClasses = classes;
|
std::vector<int> oldClasses = classes;
|
||||||
classes[suffixArray[0]] = 0;
|
classes[suffixArray[0]] = 0;
|
||||||
for (int i = 1; i < text.size(); ++i)
|
for (int i = 1; i < text.size(); ++i)
|
||||||
@ -26,8 +24,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
|
|||||||
else classes[currentSuffix] = i;
|
else classes[currentSuffix] = i;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort within equivalence class based on order of following suffix after length
|
// Sort within equivalence class based on order of following suffix after length (orders up to length * 2)
|
||||||
// Orders up to length*2
|
|
||||||
std::vector<int> count = identity;
|
std::vector<int> count = identity;
|
||||||
for (auto suffix : std::vector(suffixArray))
|
for (auto suffix : std::vector(suffixArray))
|
||||||
{
|
{
|
||||||
@ -40,35 +37,38 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
|
|||||||
return suffixArray;
|
return suffixArray;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr wchar_t ERASED = 0xf246; // inside Unicode private use area
|
||||||
|
|
||||||
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
||||||
{
|
{
|
||||||
if (sentenceInfo["text number"] == 0) return false;
|
if (sentenceInfo["text number"] == 0) return false;
|
||||||
|
|
||||||
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
|
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
|
||||||
// It then looks for any regions at least twice as long as the substring that consist only of characters found in the substring, and erases them
|
// It then looks for any regions at least twice as long as the substring that consist only of characters found in the substring, and erases them
|
||||||
// If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string
|
// If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
|
||||||
std::vector<int> suffixArray = GenerateSuffixArray(sentence);
|
std::vector<int> suffixArray = GenerateSuffixArray(sentence);
|
||||||
for (int i = 0; i + 1 < sentence.size(); ++i)
|
for (int i = 0; i + 1 < sentence.size(); ++i)
|
||||||
{
|
{
|
||||||
int commonPrefixLength = 0;
|
int commonPrefixLength = 0;
|
||||||
for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
|
for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
|
||||||
if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
|
if (sentence[j] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
|
||||||
else break;
|
else break;
|
||||||
|
|
||||||
if (commonPrefixLength > 6)
|
if (commonPrefixLength > 6)
|
||||||
{
|
{
|
||||||
std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength);
|
std::wstring substring(sentence, suffixArray[i], commonPrefixLength);
|
||||||
std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end());
|
bool substringCharMap[0x10000] = {};
|
||||||
|
for (auto ch : substring)
|
||||||
|
substringCharMap[ch] = true;
|
||||||
|
|
||||||
for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
|
for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
|
||||||
if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1;
|
if (substringCharMap[sentence[j]]) regionSize += 1;
|
||||||
else if (regionSize >= commonPrefixLength * 2)
|
else if (regionSize >= commonPrefixLength * 2)
|
||||||
while (regionSize > 0)
|
while (regionSize > 0)
|
||||||
sentence[j - regionSize--] = ERASED;
|
sentence[j - regionSize--] = ERASED;
|
||||||
else regionSize = 0;
|
else regionSize = 0;
|
||||||
|
|
||||||
if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str()))
|
if (!wcsstr(sentence.c_str(), substring.c_str())) std::copy(substring.begin(), substring.end(), sentence.begin() + max(suffixArray[i], suffixArray[i + 1]));
|
||||||
std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());
|
sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());
|
||||||
|
@ -4,8 +4,7 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
|||||||
{
|
{
|
||||||
if (sentenceInfo["text number"] == 0) return false;
|
if (sentenceInfo["text number"] == 0) return false;
|
||||||
|
|
||||||
// This algorithm looks at all substrings which start at the beginning of the sentence
|
// This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
|
||||||
// If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated
|
|
||||||
// Complexity is O(N^3), so executing for N > 10,000 is dangerous
|
// Complexity is O(N^3), so executing for N > 10,000 is dangerous
|
||||||
if (sentence.size() > 10000) return false;
|
if (sentence.size() > 10000) return false;
|
||||||
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
|
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user