forked from Public-Mirror/Textractor
repetition filters bail after 30 seconds
This commit is contained in:
parent
9e86ecb987
commit
6a6c208c20
@ -46,8 +46,9 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
|||||||
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
|
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
|
||||||
// It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
|
// It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
|
||||||
// If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
|
// If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
|
||||||
|
auto timeout = GetTickCount64() + 30'000; // give up if taking over 30 seconds
|
||||||
std::vector<int> suffixArray = GenerateSuffixArray(sentence);
|
std::vector<int> suffixArray = GenerateSuffixArray(sentence);
|
||||||
for (int i = 0; i + 1 < sentence.size(); ++i)
|
for (int i = 0; i + 1 < sentence.size() && GetTickCount64() < timeout; ++i)
|
||||||
{
|
{
|
||||||
int commonPrefixLength = 0;
|
int commonPrefixLength = 0;
|
||||||
for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
|
for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
|
||||||
|
@ -5,13 +5,12 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
|||||||
if (sentenceInfo["text number"] == 0) return false;
|
if (sentenceInfo["text number"] == 0) return false;
|
||||||
|
|
||||||
// This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
|
// This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
|
||||||
// Complexity O(N^3) so executing for N > 10,000 dangerous
|
auto timeout = GetTickCount64() + 30'000; // give up if taking over 30 seconds
|
||||||
if (sentence.size() > 10000) return false;
|
|
||||||
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
|
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
|
||||||
wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str());
|
wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str());
|
||||||
wchar_t* dataEnd = data.get() + sentence.size();
|
wchar_t* dataEnd = data.get() + sentence.size();
|
||||||
int skip = 0, count = 0;
|
int skip = 0, count = 0;
|
||||||
for (wchar_t* end = dataEnd; end - data.get() > skip; --end)
|
for (wchar_t* end = dataEnd; end - data.get() > skip && GetTickCount64() < timeout; --end)
|
||||||
{
|
{
|
||||||
std::swap(*end, *dataEnd);
|
std::swap(*end, *dataEnd);
|
||||||
int junkLength = end - data.get() - skip;
|
int junkLength = end - data.get() - skip;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user