upgrade repetition remover algorithm

This commit is contained in:
Akash Mozumdar 2019-02-20 22:12:26 -05:00
parent 241d5906e4
commit 40e62a13d6
2 changed files with 22 additions and 8 deletions

View File

@ -6,11 +6,11 @@ void RemoveRepeatedChars(std::wstring& sentence)
int repeatNumber = 1; int repeatNumber = 1;
wchar_t prevChar = L'\0'; wchar_t prevChar = L'\0';
for (auto nextChar : sentence) for (auto nextChar : sentence)
if (nextChar == prevChar) repeatNumber++; if (nextChar == prevChar) repeatNumber += 1;
else else
{ {
prevChar = nextChar; prevChar = nextChar;
++repeatNumbers.at(repeatNumber); repeatNumbers.at(repeatNumber) += 1;
repeatNumber = 1; repeatNumber = 1;
} }
if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return; if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return;
@ -33,14 +33,25 @@ void RemoveRepeatedChars(std::wstring& sentence)
// Strips leading "junk" from a sentence whose beginning is made of chunks that
// show up again later in the same sentence, e.g.
//   "abcdeabcdefabcdefgabcdefg" -> "abcdefg"
//   "aababcabcdabcde"           -> "abcde"
//
// Algorithm: starting at the front, repeatedly look for the longest chunk of the
// not-yet-skipped text that occurs again (non-overlapping) further on. Every such
// chunk is counted as junk and skipped; the search then restarts right after it.
// The skipped prefix is only removed if the junk chunks average at least 3
// characters (integer division) - presumably so that incidental short repeats in
// ordinary sentences do not get trimmed.
//
// sentence: modified in place; left untouched if no qualifying repetition is found.
void RemoveCyclicRepeats(std::wstring& sentence)
{
	const size_t size = sentence.size();
	size_t skip = 0;  // number of leading characters identified as junk so far
	size_t count = 0; // number of junk chunks that make up those characters

	// A chunk of length L starting at `skip` can only recur without overlapping itself
	// if skip + 2 * L <= size, so longer candidates never need to be tested.
	for (size_t junkLength = size / 2; junkLength > 0; )
	{
		// Length-aware search: unlike wcsstr, this treats embedded L'\0' as ordinary
		// characters, so a NUL can never produce an empty needle that "matches" everything.
		const bool junkFound = sentence.find(sentence.c_str() + skip, skip + junkLength, junkLength) != std::wstring::npos;
		if (junkFound)
		{
			skip += junkLength;
			count += 1;
			junkLength = (size - skip) / 2; // restart with the longest feasible chunk of what remains
		}
		else
		{
			--junkLength;
		}
	}

	if (count && skip / count >= 3) sentence.erase(0, skip);
}
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{ {
@ -57,8 +68,11 @@ TEST(
assert(repeatedChars.find(L"aaaabbcd") == 0); assert(repeatedChars.find(L"aaaabbcd") == 0);
std::wstring cyclicRepeats = L"abcdeabcdefabcdefgabcdefgabcdefgabcdefgabcdefg"; std::wstring cyclicRepeats = L"abcdeabcdefabcdefgabcdefgabcdefgabcdefgabcdefg";
std::wstring buildupRepeats = L"aababcabcdabcdeabcdefabcdefg";
RemoveCyclicRepeats(cyclicRepeats); RemoveCyclicRepeats(cyclicRepeats);
RemoveCyclicRepeats(buildupRepeats);
assert(cyclicRepeats == L"abcdefg"); assert(cyclicRepeats == L"abcdefg");
assert(buildupRepeats == L"abcdefg");
std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい"; std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
ProcessSentence(empty, { SentenceInfo::DUMMY }); ProcessSentence(empty, { SentenceInfo::DUMMY });

View File

@ -10237,7 +10237,7 @@ void SpecialHookV8String(DWORD dwDatabase, HookParam* hp, BYTE, DWORD* data, DWO
DWORD strPtr = *(DWORD*)ecx; DWORD strPtr = *(DWORD*)ecx;
*data = strPtr + 0xb; *data = strPtr + 0xb;
*len = *(short*)(strPtr + 7); *len = *(short*)(strPtr + 7);
if (*len < 12) *split = 1; // To ensure this is caught by cyclic repetition detection, split if there's 6+ wide chars //if (*len < 12) *split = 1; // To ensure this is caught by cyclic repetition detection, split if there's 6+ wide chars
//*split = *(DWORD*)((BYTE*)hp->split + dwDatabase); //*split = *(DWORD*)((BYTE*)hp->split + dwDatabase);
} }