break up remove repetition extension

This commit is contained in:
Akash Mozumdar 2019-06-13 13:06:34 -04:00
parent b5c319ee46
commit 67a5f42c51
6 changed files with 154 additions and 3 deletions

View File

@ -43,7 +43,9 @@ foreach ($language in @{
"Google Translate.dll", "Google Translate.dll",
"Lua.dll", "Lua.dll",
"Regex Filter.dll", "Regex Filter.dll",
"Remove Repetition.dll", "Remove Repeated Characters.dll",
"Remove Repeated Phrases.dll",
"Remove 30 Repeated Sentences.dll",
"Replacer.dll", "Replacer.dll",
"Thread Linker.dll", "Thread Linker.dll",
"platforms", "platforms",

View File

@ -11,7 +11,9 @@ add_library(Extra\ Window MODULE extrawindow.cpp extensionimpl.cpp)
add_library(Google\ Translate MODULE googletranslate.cpp translatewrapper.cpp network.cpp extensionimpl.cpp) add_library(Google\ Translate MODULE googletranslate.cpp translatewrapper.cpp network.cpp extensionimpl.cpp)
add_library(Lua MODULE lua.cpp extensionimpl.cpp) add_library(Lua MODULE lua.cpp extensionimpl.cpp)
add_library(Regex\ Filter MODULE regexfilter.cpp extensionimpl.cpp) add_library(Regex\ Filter MODULE regexfilter.cpp extensionimpl.cpp)
add_library(Remove\ Repetition MODULE removerepeat.cpp extensionimpl.cpp) add_library(Remove\ Repeated\ Characters MODULE removerepeatchar.cpp extensionimpl.cpp)
add_library(Remove\ Repeated\ Phrases MODULE removerepeatphrase.cpp extensionimpl.cpp)
add_library(Remove\ 30\ Repeated\ Sentences MODULE removerepeatsentence.cpp extensionimpl.cpp)
add_library(Replacer MODULE replacer.cpp extensionimpl.cpp) add_library(Replacer MODULE replacer.cpp extensionimpl.cpp)
add_library(Thread\ Linker MODULE threadlinker.cpp extensionimpl.cpp) add_library(Thread\ Linker MODULE threadlinker.cpp extensionimpl.cpp)

View File

@ -0,0 +1,54 @@
#include "extension.h"
// Removes character-level repetition from sentence (e.g. L"aabbcc" becomes L"abc").
// The most common length N of a run of identical consecutive characters is taken as the repeat
// factor (on a tie the smallest length wins). Each run is then rewritten left to right: while the
// remaining part of the run is a multiple of N long, one character is kept per N characters;
// otherwise characters are copied one at a time.
// Sentences whose "text number" is 0 are skipped (presumably a special thread - confirm against the host).
// Returns true if sentence was rewritten, false if it was left untouched.
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
	if (sentenceInfo["text number"] == 0) return false;
	if (sentence.empty()) return false; // nothing to analyse; also guarantees the repeat factor below is never 0

	// runLengthCounts[n] = number of runs made of exactly n identical consecutive characters.
	// Every run is counted exactly once, including the final one, and nothing is counted
	// before the first character, so the histogram reflects the real text.
	std::vector<int> runLengthCounts(sentence.size() + 1, 0);
	for (size_t runStart = 0; runStart < sentence.size();)
	{
		size_t runEnd = runStart + 1;
		while (runEnd < sentence.size() && sentence[runEnd] == sentence[runStart]) ++runEnd;
		runLengthCounts[runEnd - runStart] += 1;
		runStart = runEnd;
	}

	// max_element returns the first maximum, so ties resolve to the shortest run length.
	const size_t repeatNumber = std::distance(runLengthCounts.begin(), std::max_element(runLengthCounts.begin(), runLengthCounts.end()));
	if (repeatNumber < 2) return false; // characters are not repeated

	std::wstring newSentence;
	for (size_t i = 0; i < sentence.size();)
	{
		newSentence.push_back(sentence[i]);
		// Find where the run that contains position i stops.
		size_t runEnd = i + 1;
		while (runEnd < sentence.size() && sentence[runEnd] == sentence[i]) ++runEnd;
		// Skip a whole group of repeats only when the rest of the run divides evenly into groups.
		i += (runEnd - i) % repeatNumber == 0 ? repeatNumber : 1;
	}
	sentence = newSentence;
	return true;
}
TEST(
	{
		// Most runs in this input are three characters long, so groups of three collapse to one.
		std::wstring stuttered = L"aaaaaaaaaaaabbbbbbcccdddaabbbcccddd";
		ProcessSentence(stuttered, { SentenceInfo::DUMMY });
		assert(stuttered.find(L"aaaabbcd") == 0);

		// Text without repeated characters has to come back unchanged.
		std::wstring blank = L"";
		std::wstring space = L" ";
		std::wstring plain = L"This is a normal sentence. はい";
		ProcessSentence(blank, { SentenceInfo::DUMMY });
		ProcessSentence(space, { SentenceInfo::DUMMY });
		ProcessSentence(plain, { SentenceInfo::DUMMY });
		assert(blank == L"");
		assert(space == L" ");
		assert(plain == L"This is a normal sentence. はい");
	}
);

View File

@ -0,0 +1,49 @@
#include "extension.h"
// Drops leading phrases of sentence that show up again later in the same sentence.
// Starting at the current skip offset, the longest candidate is tried first and shortened one
// character at a time; a candidate counts as junk when the same text occurs again at or after
// the candidate's own end. Junk is skipped and the search restarts behind it.
// The sentence is replaced by what follows the skipped junk only when the skipped phrases
// average at least 3 characters. Returns true if sentence was replaced, false otherwise.
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
	if (sentenceInfo["text number"] == 0) return false;
	// The algorithm is rated O(N^3) by its author, so anything above 15000 characters is left alone
	// rather than processed extremely slowly.
	if (sentence.size() > 15000) return false;

	// Private, writable copy of the text: a terminator is temporarily moved into it below so
	// that wcsstr sees only the candidate phrase as its search pattern.
	const auto length = sentence.size();
	auto data = std::make_unique<wchar_t[]>(length + 1);
	wcscpy_s(data.get(), length + 1, sentence.c_str());
	wchar_t* const buffer = data.get();
	wchar_t* const terminator = buffer + length;

	int skip = 0;  // characters of junk skipped so far
	int count = 0; // number of junk phrases skipped so far
	wchar_t* cut = terminator;
	while (cut - buffer > skip)
	{
		const int candidateLength = cut - buffer - skip;
		// Swapping with the final L'\0' ends the copy at cut; swapping again restores it.
		std::swap(*cut, *terminator);
		const bool repeatedLater = wcsstr(sentence.c_str() + skip + candidateLength, buffer + skip) != nullptr;
		std::swap(*cut, *terminator);

		if (repeatedLater)
		{
			// Stop once a match is short compared to the average phrase skipped so far (capped at 4).
			if (count != 0 && candidateLength < min(skip / count, 4)) break;
			skip += candidateLength;
			++count;
			cut = terminator; // restart from the longest candidate; the decrement below makes it one shorter than the full remainder
		}
		--cut;
	}

	if (count == 0 || skip / count < 3) return false;
	sentence = buffer + skip;
	return true;
}
TEST(
	{
		// Both inputs repeat ever longer versions of the same phrase; only the last copy should survive.
		std::wstring looping = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg";
		ProcessSentence(looping, { SentenceInfo::DUMMY });
		assert(looping == L"_abcdefg");

		std::wstring growing = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg";
		ProcessSentence(growing, { SentenceInfo::DUMMY });
		assert(growing == L"_abcdefg");

		// Text without repeated phrases has to come back unchanged.
		std::wstring blank = L"";
		std::wstring space = L" ";
		std::wstring plain = L"This is a normal sentence. はい";
		ProcessSentence(blank, { SentenceInfo::DUMMY });
		ProcessSentence(space, { SentenceInfo::DUMMY });
		ProcessSentence(plain, { SentenceInfo::DUMMY });
		assert(blank == L"");
		assert(space == L" ");
		assert(plain == L"This is a normal sentence. はい");
	}
);

View File

@ -0,0 +1,44 @@
#include "extension.h"
// Maximum number of previous sentences ProcessSentence remembers for each text number.
// Defaults to 30; DllMain replaces it with the number found in this DLL's own file name
// ("Remove <N> Repeated Sentences.dll"), so renaming the file changes the cache size.
int sentenceCacheSize = 30;

// DLL entry point. On process attach, reads the cache size out of the module's file name.
// Always returns TRUE; when the name cannot be obtained or parsed the default size is kept.
BOOL WINAPI DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
{
	if (ul_reason_for_call == DLL_PROCESS_ATTACH)
	{
		// Zero-initialised and checked: if GetModuleFileNameW fails the buffer must not be searched as if it held a path.
		wchar_t filePath[MAX_PATH] = {};
		if (GetModuleFileNameW(hModule, filePath, MAX_PATH) != 0)
		{
			if (const wchar_t* fileName = wcsrchr(filePath, L'\\'))
			{
				int requestedSize = 0;
				const bool parsed = swscanf_s(fileName, L"\\Remove %d Repeated Sentences.dll", &requestedSize) == 1;
				// The size is compared against an unsigned container size in ProcessSentence, where a
				// negative value would convert to a huge number and the cache would never be trimmed.
				if (parsed && requestedSize >= 0) sentenceCacheSize = requestedSize;
			}
		}
	}
	// Nothing to do for DLL_PROCESS_DETACH or the thread notifications.
	return TRUE;
}
// Blanks out sentence if the same text was seen recently for the same "text number".
// Each text number has its own list of up to sentenceCacheSize previous sentences, oldest first.
// A repeated sentence is moved to the newest position in the list and sentence is cleared;
// a new sentence is appended and left as is. Sentences with text number 0 are ignored.
// Returns true if sentence is empty on exit (which includes input that was already empty).
bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
{
	const uint64_t textNumber = sentenceInfo["text number"];
	if (textNumber == 0) return false;

	static std::deque<Synchronized<std::vector<std::wstring>>> cache;
	static std::mutex m;

	// m guards growing and indexing the deque. unique_lock releases it even if resize()/at() throws,
	// whereas a bare lock()/unlock() pair would leave it locked for every later call.
	std::unique_lock<std::mutex> cacheLock(m);
	if (textNumber + 1 > cache.size()) cache.resize(textNumber + 1);
	// Acquire() presumably returns a handle that keeps this entry locked while it is alive - see Synchronized.
	auto prevSentences = cache.at(textNumber).Acquire();
	cacheLock.unlock(); // the per-entry handle is held now; other text numbers may proceed

	// Look for an earlier copy before inserting, then put this sentence at the newest position.
	const auto earlier = std::find(prevSentences->begin(), prevSentences->end(), sentence);
	const bool repeated = earlier != prevSentences->end();
	if (repeated) prevSentences->erase(earlier);
	prevSentences->push_back(sentence);
	if (repeated) sentence.clear();

	// At most one element is added per call, so dropping the oldest entry once is enough.
	// The cast spells out the conversion the comparison performs anyway.
	if (prevSentences->size() > static_cast<size_t>(sentenceCacheSize)) prevSentences->erase(prevSentences->begin());
	return sentence.empty();
}

View File

@ -28,7 +28,7 @@ constexpr auto CONFIG_FILE = u8"Textractor.ini";
// Misc // Misc
constexpr auto DEFAULT_EXTENSIONS = u8"Remove Repetition>Regex Filter>Copy to Clipboard>Bing Translate>Extra Window>Extra Newlines"; constexpr auto DEFAULT_EXTENSIONS = u8"Remove Repeated Characters>Remove Repeated Phrases>Regex Filter>Copy to Clipboard>Bing Translate>Extra Window>Extra Newlines";
constexpr auto WINDOW = u8"Window"; constexpr auto WINDOW = u8"Window";
// EOF // EOF