// Textractor_test/extensions/removerepeat.cpp
// (web view metadata: 63 lines, 1.8 KiB, C++)

#include "extensions.h"
#include <set>
#include <mutex>
/**
 * Collapses a sentence in which every character was emitted several times in
 * a row, e.g. L"aabbcc" becomes L"abc".
 *
 * The repeat count is the length of the leading run of identical characters.
 * Every run that is followed by a different character must have a length that
 * is a multiple of that count; otherwise the sentence is left untouched.
 * A run that reaches the end of the sentence is not validated (behaviour kept
 * from the original implementation).
 *
 * @param sentence Text to collapse in place.
 * @return true if the sentence was rewritten; false if it was left unchanged
 *         (empty input, leading run of length 1, or an inconsistent run).
 */
bool RemoveRepeatedChars(std::wstring& sentence)
{
    // Nothing to collapse. Without this guard an empty sentence fell through
    // every loop below and was reported as modified.
    if (sentence.empty()) return false;

    const std::size_t length = sentence.size();

    // Length of the leading run of identical characters (always >= 1 here).
    std::size_t repeatNumber = 0;
    while (repeatNumber < length && sentence[repeatNumber] == sentence[0]) ++repeatNumber;
    if (repeatNumber == 1) return false;

    // Walk the sentence in steps of repeatNumber. From each step position,
    // the distance to the next different character must be a multiple of
    // repeatNumber, otherwise the text is not uniformly repeated.
    for (std::size_t i = 0; i < length; i += repeatNumber)
    {
        const std::size_t runEnd = sentence.find_first_not_of(sentence[i], i);
        if (runEnd != std::wstring::npos && (runEnd - i) % repeatNumber != 0) return false;
    }

    // Keep one character out of every repeatNumber.
    std::wstring newSentence;
    newSentence.reserve(length / repeatNumber + 1);
    for (std::size_t i = 0; i < length; i += repeatNumber) newSentence.push_back(sentence[i]);
    sentence.swap(newSentence);
    return true;
}
/**
 * Strips leading "junk" from a sentence whose beginning is repeated later in
 * the same sentence (e.g. L"abcdeabcde" becomes L"abcde").
 *
 * The prefix of the sentence is grown one character at a time for as long as
 * it can still be found (via wcsstr) at or after its own end. If the last
 * prefix that was still found is at least 5 characters long, that many
 * leading characters are dropped and the process repeats on the remainder.
 *
 * @param sentence Text to trim in place.
 * @return true if at least one leading repetition was removed, false otherwise
 *         (including when the prefix reaches the 2000-character scan limit on
 *         the first pass).
 * @throws InvalidSentence if the sentence is empty.
 */
bool RemoveCyclicRepeats(std::wstring& sentence)
{
    constexpr std::size_t MAX_JUNK_LENGTH = 2000; // capacity of the prefix buffer / scan limit
    constexpr std::size_t MIN_JUNK_LENGTH = 5;    // shorter repeats are treated as coincidence

    bool trimmed = false;
    // Iterative form of the original self-recursion: each pass works on the
    // already-trimmed sentence with a fresh prefix buffer, without stacking a
    // 2000-wchar_t array per level.
    for (;;)
    {
        if (sentence.empty()) throw InvalidSentence();

        wchar_t junk[MAX_JUNK_LENGTH] = {}; // zero-filled, so always a terminated C string
        std::size_t junkLength = 0;
        // The size() bound keeps the scan inside the string even when the
        // sentence contains L'\0' (junk then looks empty to wcsstr, which would
        // otherwise keep succeeding and index past the end of the sentence).
        while (junkLength < sentence.size() && wcsstr(sentence.c_str() + junkLength, junk))
        {
            junk[junkLength] = sentence[junkLength];
            if (++junkLength >= MAX_JUNK_LENGTH) return trimmed;
        }

        // The first pass of the scan always succeeds (an empty needle matches),
        // so junkLength >= 1 here. Step back to the last prefix that was found.
        --junkLength;

        // If the first 5 characters appear later on, there's probably a repetition issue.
        if (junkLength < MIN_JUNK_LENGTH) return trimmed;

        sentence = std::wstring(sentence.c_str() + junkLength);
        trimmed = true;
    }
}
// 2018-09-22 15:45:54 -04:00 (blame timestamp left over from the web view)
/**
 * Rejects a sentence that has already been seen for the same handle.
 *
 * Every (handle, sentence) pair is remembered in a function-local static set
 * for the lifetime of the process; access is serialized with a static mutex.
 *
 * @param sentence Text to check; never modified by this function.
 * @param handle   Identifier the sentence is recorded under.
 * @return Always false (the sentence itself is not altered).
 * @throws InvalidSentence if the same (handle, sentence) pair was seen before.
 */
bool RemoveRepeatedSentences(std::wstring& sentence, int64_t handle)
{
    static std::set<std::pair<int64_t, std::wstring>> seenSentences;
    static std::mutex m;
    std::lock_guard<std::mutex> l(m);
    // A single emplace both records the pair and reports whether it was
    // already present, instead of a count() lookup followed by insert().
    if (!seenSentences.emplace(handle, sentence).second) throw InvalidSentence();
    return false;
}
// 2018-09-22 15:08:31 -04:00 (blame timestamp left over from the web view)
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
2018-09-22 15:08:31 -04:00
if (sentenceInfo["hook address"] == -1) return false;
2018-09-13 02:50:03 -04:00
bool ret = false;
ret |= RemoveRepeatedChars(sentence);
ret |= RemoveCyclicRepeats(sentence);
2018-09-22 15:08:31 -04:00
ret |= RemoveRepeatedSentences(sentence, sentenceInfo["text handle"]);
2018-09-13 02:50:03 -04:00
return ret;
}