ok, finally have a good repetition filter

This commit is contained in:
Akash Mozumdar 2018-11-16 08:34:15 -05:00
parent 59869dc45a
commit 523008d7e0
3 changed files with 14 additions and 4 deletions

View File

@ -34,7 +34,7 @@ namespace Host
inline std::wstring StringToWideString(const std::string& text, UINT encoding = CP_UTF8)
{
std::wstring ret(text.size() + 1, 0);
if (int len = MultiByteToWideChar(encoding, 0, text.c_str(), -1, ret.data(), ret.capacity()))
if (int len = MultiByteToWideChar(encoding, 0, text.c_str(), -1, ret.data(), ret.size()))
{
ret.resize(len - 1);
return ret;

View File

@ -42,6 +42,15 @@ void TextThread::Push(const BYTE* data, int len)
lastPushTime = GetTickCount();
}
// Strips trailing triple repetition from `sentence` in place.
//
// If the sentence ends with the same chunk of more than 6 characters repeated
// three times in a row (...XXX), everything except the final copy of that chunk
// is discarded. The check is then repeated on the shortened sentence, so nested
// repetition is collapsed as well. Longer chunks are tried before shorter ones.
//
// Returns true if the sentence was shortened at least once, false if it was
// left untouched.
// see https://github.com/Artikash/Textractor/issues/40
bool TextThread::FilterRepetition(std::wstring& sentence)
{
	bool filtered = false;
	for (bool shortened = true; shortened; )
	{
		shortened = false;
		const size_t size = sentence.size();
		// len * 3 <= size is guaranteed by the starting value, so the offsets below never underflow.
		for (size_t len = size / 3; len > 6; --len)
		{
			const size_t first = size - len * 3;
			const size_t second = size - len * 2;
			const size_t third = size - len;
			// std::wstring::compare checks the full len characters; unlike wcsncmp it does not stop at an embedded L'\0'.
			if (sentence.compare(first, len, sentence, second, len) == 0 &&
				sentence.compare(first, len, sentence, third, len) == 0)
			{
				// Keep only the last copy of the repeated chunk. erase() avoids assigning the string
				// from a pointer into its own buffer and does not truncate at an embedded L'\0'.
				sentence.erase(0, third);
				filtered = true;
				shortened = true;
				break;
			}
		}
	}
	return filtered;
}
void TextThread::Flush()
{
std::unique_lock locker(threadMutex);
@ -50,10 +59,9 @@ void TextThread::Flush()
{
std::wstring sentence = buffer;
buffer.clear();
repeatingChars.clear();
for (std::wsmatch results; std::regex_search(sentence, results, std::wregex(L"^([^]{6,})\\1\\1")); sentence = results[1])
repeatingChars = std::unordered_set(sentence.begin(), sentence.end());
if (FilterRepetition(sentence)) repeatingChars = std::unordered_set(sentence.begin(), sentence.end());
else repeatingChars.clear();
locker.unlock();
AddSentence(sentence);

View File

@ -32,6 +32,8 @@ public:
const HookParam hp;
private:
// see https://github.com/Artikash/Textractor/issues/40
static bool FilterRepetition(std::wstring& sentence);
void Flush();
std::wstring buffer;