mirror of
https://github.com/Artikash/Textractor.git
synced 2025-01-11 01:59:14 +08:00
ok, finally have a good repetition filter
This commit is contained in:
parent
59869dc45a
commit
523008d7e0
@ -34,7 +34,7 @@ namespace Host
|
|||||||
inline std::wstring StringToWideString(const std::string& text, UINT encoding = CP_UTF8)
|
inline std::wstring StringToWideString(const std::string& text, UINT encoding = CP_UTF8)
|
||||||
{
|
{
|
||||||
std::wstring ret(text.size() + 1, 0);
|
std::wstring ret(text.size() + 1, 0);
|
||||||
if (int len = MultiByteToWideChar(encoding, 0, text.c_str(), -1, ret.data(), ret.capacity()))
|
if (int len = MultiByteToWideChar(encoding, 0, text.c_str(), -1, ret.data(), ret.size()))
|
||||||
{
|
{
|
||||||
ret.resize(len - 1);
|
ret.resize(len - 1);
|
||||||
return ret;
|
return ret;
|
||||||
|
@ -42,6 +42,15 @@ void TextThread::Push(const BYTE* data, int len)
|
|||||||
lastPushTime = GetTickCount();
|
lastPushTime = GetTickCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Collapses a sentence whose tail is the same run of characters three times in a row.
// Candidate run lengths are tried from sentence.size() / 3 down to 7; when the last
// three runs of that length compare equal, the sentence is replaced by the final run
// and the search restarts on the shortened text.
// Returns true if the sentence was shortened at least once, false if it was left untouched.
bool TextThread::FilterRepetition(std::wstring& sentence)
{
	bool shortened = false;
	for (bool matched = true; matched; )
	{
		matched = false;
		const wchar_t* tail = sentence.data() + sentence.size();
		for (int unit = sentence.size() / 3; unit > 6; --unit)
		{
			const wchar_t* first = tail - unit * 3;
			if (wcsncmp(first, first + unit, unit) != 0 || wcsncmp(first, first + unit * 2, unit) != 0) continue;
			// Build the replacement as a separate string first: the source pointer refers into
			// sentence's own buffer. It is read as a C string, so it ends at the first null character.
			sentence = std::wstring(tail - unit);
			shortened = matched = true;
			break;
		}
	}
	return shortened;
}
|
||||||
|
|
||||||
void TextThread::Flush()
|
void TextThread::Flush()
|
||||||
{
|
{
|
||||||
std::unique_lock locker(threadMutex);
|
std::unique_lock locker(threadMutex);
|
||||||
@ -50,10 +59,9 @@ void TextThread::Flush()
|
|||||||
{
|
{
|
||||||
std::wstring sentence = buffer;
|
std::wstring sentence = buffer;
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
repeatingChars.clear();
|
|
||||||
|
|
||||||
for (std::wsmatch results; std::regex_search(sentence, results, std::wregex(L"^([^]{6,})\\1\\1")); sentence = results[1])
|
if (FilterRepetition(sentence)) repeatingChars = std::unordered_set(sentence.begin(), sentence.end());
|
||||||
repeatingChars = std::unordered_set(sentence.begin(), sentence.end());
|
else repeatingChars.clear();
|
||||||
|
|
||||||
locker.unlock();
|
locker.unlock();
|
||||||
AddSentence(sentence);
|
AddSentence(sentence);
|
||||||
|
@ -32,6 +32,8 @@ public:
|
|||||||
const HookParam hp;
|
const HookParam hp;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
// see https://github.com/Artikash/Textractor/issues/40
|
||||||
|
static bool FilterRepetition(std::wstring& sentence);
|
||||||
void Flush();
|
void Flush();
|
||||||
|
|
||||||
std::wstring buffer;
|
std::wstring buffer;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user