ok, finally have a good repetition filter

This commit is contained in:
Akash Mozumdar 2018-11-16 08:34:15 -05:00
parent 59869dc45a
commit 523008d7e0
3 changed files with 14 additions and 4 deletions

View File

@ -34,7 +34,7 @@ namespace Host
inline std::wstring StringToWideString(const std::string& text, UINT encoding = CP_UTF8) inline std::wstring StringToWideString(const std::string& text, UINT encoding = CP_UTF8)
{ {
std::wstring ret(text.size() + 1, 0); std::wstring ret(text.size() + 1, 0);
if (int len = MultiByteToWideChar(encoding, 0, text.c_str(), -1, ret.data(), ret.capacity())) if (int len = MultiByteToWideChar(encoding, 0, text.c_str(), -1, ret.data(), ret.size()))
{ {
ret.resize(len - 1); ret.resize(len - 1);
return ret; return ret;

View File

@ -42,6 +42,15 @@ void TextThread::Push(const BYTE* data, int len)
lastPushTime = GetTickCount(); lastPushTime = GetTickCount();
} }
// Strips a trailing triple repetition from `sentence`, in place.
//
// Looks for the longest unit (at least 7 characters, at most a third of the
// sentence) such that the last three unit-sized chunks of the sentence compare
// equal. When one is found, the sentence is replaced by its final chunk and the
// filter is applied again to that shortened text.
//
// Returns true if a repetition was found (and `sentence` was modified),
// false if the sentence was left untouched.
bool TextThread::FilterRepetition(std::wstring& sentence)
{
	const wchar_t* const tail = sentence.data() + sentence.size();
	int unit = sentence.size() / 3;
	while (unit > 6)
	{
		const wchar_t* const first = tail - unit * 3;
		const wchar_t* const second = tail - unit * 2;
		const wchar_t* const third = tail - unit;
		const bool tripled = wcsncmp(first, second, unit) == 0 && wcsncmp(first, third, unit) == 0;
		if (tripled)
		{
			// `third` points into sentence's own buffer; the assignment keeps
			// only the last chunk (as a null-terminated string).
			sentence = third;
			// The shortened sentence may itself end in a shorter repetition.
			FilterRepetition(sentence);
			return true;
		}
		--unit;
	}
	return false;
}
void TextThread::Flush() void TextThread::Flush()
{ {
std::unique_lock locker(threadMutex); std::unique_lock locker(threadMutex);
@ -50,10 +59,9 @@ void TextThread::Flush()
{ {
std::wstring sentence = buffer; std::wstring sentence = buffer;
buffer.clear(); buffer.clear();
repeatingChars.clear();
for (std::wsmatch results; std::regex_search(sentence, results, std::wregex(L"^([^]{6,})\\1\\1")); sentence = results[1]) if (FilterRepetition(sentence)) repeatingChars = std::unordered_set(sentence.begin(), sentence.end());
repeatingChars = std::unordered_set(sentence.begin(), sentence.end()); else repeatingChars.clear();
locker.unlock(); locker.unlock();
AddSentence(sentence); AddSentence(sentence);

View File

@ -32,6 +32,8 @@ public:
const HookParam hp; const HookParam hp;
private: private:
// see https://github.com/Artikash/Textractor/issues/40
static bool FilterRepetition(std::wstring& sentence);
void Flush(); void Flush();
std::wstring buffer; std::wstring buffer;