From 67a5f42c5196ca96bb6596c19be7a4191b386582 Mon Sep 17 00:00:00 2001 From: Akash Mozumdar Date: Thu, 13 Jun 2019 13:06:34 -0400 Subject: [PATCH] break up remove repetition extension --- deploy.ps1 | 4 ++- extensions/CMakeLists.txt | 4 ++- extensions/removerepeatchar.cpp | 54 +++++++++++++++++++++++++++++ extensions/removerepeatphrase.cpp | 49 ++++++++++++++++++++++++++ extensions/removerepeatsentence.cpp | 44 +++++++++++++++++++++++ include/defs.h | 2 +- 6 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 extensions/removerepeatchar.cpp create mode 100644 extensions/removerepeatphrase.cpp create mode 100644 extensions/removerepeatsentence.cpp diff --git a/deploy.ps1 b/deploy.ps1 index 9c0353a..a51be7f 100644 --- a/deploy.ps1 +++ b/deploy.ps1 @@ -43,7 +43,9 @@ foreach ($language in @{ "Google Translate.dll", "Lua.dll", "Regex Filter.dll", - "Remove Repetition.dll", + "Remove Repeated Characters.dll", + "Remove Repeated Phrases.dll", + "Remove 30 Repeated Sentences.dll", "Replacer.dll", "Thread Linker.dll", "platforms", diff --git a/extensions/CMakeLists.txt b/extensions/CMakeLists.txt index a6d2e3b..af0577b 100644 --- a/extensions/CMakeLists.txt +++ b/extensions/CMakeLists.txt @@ -11,7 +11,9 @@ add_library(Extra\ Window MODULE extrawindow.cpp extensionimpl.cpp) add_library(Google\ Translate MODULE googletranslate.cpp translatewrapper.cpp network.cpp extensionimpl.cpp) add_library(Lua MODULE lua.cpp extensionimpl.cpp) add_library(Regex\ Filter MODULE regexfilter.cpp extensionimpl.cpp) -add_library(Remove\ Repetition MODULE removerepeat.cpp extensionimpl.cpp) +add_library(Remove\ Repeated\ Characters MODULE removerepeatchar.cpp extensionimpl.cpp) +add_library(Remove\ Repeated\ Phrases MODULE removerepeatphrase.cpp extensionimpl.cpp) +add_library(Remove\ 30\ Repeated\ Sentences MODULE removerepeatsentence.cpp extensionimpl.cpp) add_library(Replacer MODULE replacer.cpp extensionimpl.cpp) add_library(Thread\ Linker MODULE threadlinker.cpp extensionimpl.cpp) diff --git a/extensions/removerepeatchar.cpp b/extensions/removerepeatchar.cpp new file mode 100644 index 0000000..b1ef8ac --- /dev/null +++ b/extensions/removerepeatchar.cpp @@ -0,0 +1,54 @@ +#include "extension.h" + +bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) +{ + if (sentenceInfo["text number"] == 0) return false; + + std::vector repeatNumbers(sentence.size() + 1, 0); + int repeatNumber = 1; + wchar_t prevChar = L'\0'; + for (auto nextChar : sentence) + { + if (nextChar == prevChar) + { + repeatNumber += 1; + } + else + { + prevChar = nextChar; + repeatNumbers.at(repeatNumber) += 1; + repeatNumber = 1; + } + } + if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return false; + + std::wstring newSentence; + for (int i = 0; i < sentence.size();) + { + newSentence.push_back(sentence.at(i)); + for (int j = i; j <= sentence.size(); ++j) + { + if (j == sentence.size() || sentence.at(i) != sentence.at(j)) + { + i += (j - i) % repeatNumber == 0 ? repeatNumber : 1; + break; + } + } + } + sentence = newSentence; + return true; +} + +TEST( + { + std::wstring repeatedChars = L"aaaaaaaaaaaabbbbbbcccdddaabbbcccddd"; + ProcessSentence(repeatedChars, { SentenceInfo::DUMMY }); + assert(repeatedChars.find(L"aaaabbcd") == 0); + + std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい"; + ProcessSentence(empty, { SentenceInfo::DUMMY }); + ProcessSentence(one, { SentenceInfo::DUMMY }); + ProcessSentence(normal, { SentenceInfo::DUMMY }); + assert(empty == L"" && one == L" " && normal == L"This is a normal sentence. はい"); + } +); diff --git a/extensions/removerepeatphrase.cpp b/extensions/removerepeatphrase.cpp new file mode 100644 index 0000000..38f2153 --- /dev/null +++ b/extensions/removerepeatphrase.cpp @@ -0,0 +1,49 @@ +#include "extension.h" + +bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) +{ + if (sentenceInfo["text number"] == 0) return false; + + if (sentence.size() > 15000) return false; // this algorithm is O(N^3) so if N > 15000 it's extremely slow + auto data = std::make_unique(sentence.size() + 1); + wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str()); + wchar_t* dataEnd = data.get() + sentence.size(); + int skip = 0, count = 0; + for (wchar_t* end = dataEnd; end - data.get() > skip; --end) + { + std::swap(*end, *dataEnd); + int junkLength = end - data.get() - skip; + auto junkFound = wcsstr(sentence.c_str() + skip + junkLength, data.get() + skip); + std::swap(*end, *dataEnd); + if (junkFound) + { + if (count && junkLength < min(skip / count, 4)) break; + skip += junkLength; + count += 1; + end = dataEnd; + } + } + if (count && skip / count >= 3) + { + sentence = data.get() + skip; + return true; + } + return false; +} + +TEST( + { + std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg"; + std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg"; + ProcessSentence(cyclicRepeats, { SentenceInfo::DUMMY }); + ProcessSentence(buildupRepeats, { SentenceInfo::DUMMY }); + assert(cyclicRepeats == L"_abcdefg"); + assert(buildupRepeats == L"_abcdefg"); + + std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい"; + ProcessSentence(empty, { SentenceInfo::DUMMY }); + ProcessSentence(one, { SentenceInfo::DUMMY }); + ProcessSentence(normal, { SentenceInfo::DUMMY }); + assert(empty == L"" && one == L" " && normal == L"This is a normal sentence. はい"); + } +); diff --git a/extensions/removerepeatsentence.cpp b/extensions/removerepeatsentence.cpp new file mode 100644 index 0000000..b07cc4d --- /dev/null +++ b/extensions/removerepeatsentence.cpp @@ -0,0 +1,44 @@ +#include "extension.h" + +int sentenceCacheSize = 30; + +BOOL WINAPI DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +{ + switch (ul_reason_for_call) + { + case DLL_PROCESS_ATTACH: + { + wchar_t filePath[MAX_PATH]; + GetModuleFileNameW(hModule, filePath, MAX_PATH); + if (wchar_t* fileName = wcsrchr(filePath, L'\\')) swscanf_s(fileName, L"\\Remove %d Repeated Sentences.dll", &sentenceCacheSize); + } + break; + case DLL_PROCESS_DETACH: + { + } + break; + } + return TRUE; +} + +bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) +{ + uint64_t textNumber = sentenceInfo["text number"]; + if (textNumber == 0) return false; + + static std::deque>> cache; + static std::mutex m; + m.lock(); + if (textNumber + 1 > cache.size()) cache.resize(textNumber + 1); + auto prevSentences = cache.at(textNumber).Acquire(); + m.unlock(); + auto& inserted = prevSentences->emplace_back(sentence); + auto firstLocation = std::find(prevSentences->begin(), prevSentences->end(), sentence); + if (&*firstLocation != &inserted) + { + prevSentences->erase(firstLocation); + sentence.clear(); + } + if (prevSentences->size() > sentenceCacheSize) prevSentences->erase(prevSentences->begin()); + return sentence.empty(); +} diff --git a/include/defs.h b/include/defs.h index 875128f..5ffb349 100644 --- a/include/defs.h +++ b/include/defs.h @@ -28,7 +28,7 @@ constexpr auto CONFIG_FILE = u8"Textractor.ini"; // Misc -constexpr auto DEFAULT_EXTENSIONS = u8"Remove Repetition>Regex Filter>Copy to Clipboard>Bing Translate>Extra Window>Extra Newlines"; +constexpr auto DEFAULT_EXTENSIONS = u8"Remove Repeated Characters>Remove Repeated Phrases>Regex Filter>Copy to Clipboard>Bing Translate>Extra Window>Extra Newlines"; constexpr auto WINDOW = u8"Window"; // EOF