forked from Public-Mirror/Textractor
upgrade repetition remover algorithm
This commit is contained in:
parent
241d5906e4
commit
40e62a13d6
@ -6,11 +6,11 @@ void RemoveRepeatedChars(std::wstring& sentence)
|
|||||||
int repeatNumber = 1;
|
int repeatNumber = 1;
|
||||||
wchar_t prevChar = L'\0';
|
wchar_t prevChar = L'\0';
|
||||||
for (auto nextChar : sentence)
|
for (auto nextChar : sentence)
|
||||||
if (nextChar == prevChar) repeatNumber++;
|
if (nextChar == prevChar) repeatNumber += 1;
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
prevChar = nextChar;
|
prevChar = nextChar;
|
||||||
++repeatNumbers.at(repeatNumber);
|
repeatNumbers.at(repeatNumber) += 1;
|
||||||
repeatNumber = 1;
|
repeatNumber = 1;
|
||||||
}
|
}
|
||||||
if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return;
|
if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return;
|
||||||
@ -33,13 +33,24 @@ void RemoveRepeatedChars(std::wstring& sentence)
|
|||||||
|
|
||||||
void RemoveCyclicRepeats(std::wstring& sentence)
|
void RemoveCyclicRepeats(std::wstring& sentence)
|
||||||
{
|
{
|
||||||
remove:
|
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
|
||||||
for (std::wstring junk = sentence; junk.size() > 4; junk.pop_back())
|
wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str());
|
||||||
if (sentence.rfind(junk) > 0)
|
wchar_t* dataEnd = data.get() + sentence.size();
|
||||||
|
int skip = 0, count = 0;
|
||||||
|
for (wchar_t* end = dataEnd; end - data.get() > skip; --end)
|
||||||
{
|
{
|
||||||
sentence.erase(0, junk.size());
|
std::swap(*end, *dataEnd);
|
||||||
goto remove;
|
int junkLength = end - data.get() - skip;
|
||||||
|
auto junkFound = wcsstr(sentence.c_str() + skip + junkLength, data.get() + skip);
|
||||||
|
std::swap(*end, *dataEnd);
|
||||||
|
if (junkFound)
|
||||||
|
{
|
||||||
|
skip += junkLength;
|
||||||
|
count += 1;
|
||||||
|
end = dataEnd;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (count && skip / count >= 3) sentence = data.get() + skip;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
||||||
@ -57,8 +68,11 @@ TEST(
|
|||||||
assert(repeatedChars.find(L"aaaabbcd") == 0);
|
assert(repeatedChars.find(L"aaaabbcd") == 0);
|
||||||
|
|
||||||
std::wstring cyclicRepeats = L"abcdeabcdefabcdefgabcdefgabcdefgabcdefgabcdefg";
|
std::wstring cyclicRepeats = L"abcdeabcdefabcdefgabcdefgabcdefgabcdefgabcdefg";
|
||||||
|
std::wstring buildupRepeats = L"aababcabcdabcdeabcdefabcdefg";
|
||||||
RemoveCyclicRepeats(cyclicRepeats);
|
RemoveCyclicRepeats(cyclicRepeats);
|
||||||
|
RemoveCyclicRepeats(buildupRepeats);
|
||||||
assert(cyclicRepeats == L"abcdefg");
|
assert(cyclicRepeats == L"abcdefg");
|
||||||
|
assert(buildupRepeats == L"abcdefg");
|
||||||
|
|
||||||
std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
|
std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
|
||||||
ProcessSentence(empty, { SentenceInfo::DUMMY });
|
ProcessSentence(empty, { SentenceInfo::DUMMY });
|
||||||
|
@ -10237,7 +10237,7 @@ void SpecialHookV8String(DWORD dwDatabase, HookParam* hp, BYTE, DWORD* data, DWO
|
|||||||
DWORD strPtr = *(DWORD*)ecx;
|
DWORD strPtr = *(DWORD*)ecx;
|
||||||
*data = strPtr + 0xb;
|
*data = strPtr + 0xb;
|
||||||
*len = *(short*)(strPtr + 7);
|
*len = *(short*)(strPtr + 7);
|
||||||
if (*len < 12) *split = 1; // To ensure this is caught by cyclic repetition detection, split if there's 6+ wide chars
|
//if (*len < 12) *split = 1; // To ensure this is caught by cyclic repetition detection, split if there's 6+ wide chars
|
||||||
//*split = *(DWORD*)((BYTE*)hp->split + dwDatabase);
|
//*split = *(DWORD*)((BYTE*)hp->split + dwDatabase);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user