diff --git a/extensions/removerepeat.cpp b/extensions/removerepeat.cpp deleted file mode 100644 index 61d2ca1..0000000 --- a/extensions/removerepeat.cpp +++ /dev/null @@ -1,109 +0,0 @@ -#include "extension.h" - -void RemoveRepeatedSentences(std::wstring& sentence, uint64_t textNumber) -{ - static std::deque>> cache; - static std::mutex m; - m.lock(); - if (textNumber + 1 > cache.size()) cache.resize(textNumber + 1); - auto prevSentences = cache.at(textNumber).Acquire(); - m.unlock(); - auto& inserted = prevSentences->emplace_back(sentence); - auto firstLocation = std::find(prevSentences->begin(), prevSentences->end(), sentence); - if (&*firstLocation != &inserted) - { - prevSentences->erase(firstLocation); - sentence.clear(); - } - if (prevSentences->size() > 50) prevSentences->erase(prevSentences->begin()); -} - -void RemoveRepeatedChars(std::wstring& sentence) -{ - std::vector repeatNumbers(sentence.size() + 1, 0); - int repeatNumber = 1; - wchar_t prevChar = L'\0'; - for (auto nextChar : sentence) - { - if (nextChar == prevChar) - { - repeatNumber += 1; - } - else - { - prevChar = nextChar; - repeatNumbers.at(repeatNumber) += 1; - repeatNumber = 1; - } - } - if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return; - - std::wstring newSentence; - for (int i = 0; i < sentence.size();) - { - newSentence.push_back(sentence.at(i)); - for (int j = i; j <= sentence.size(); ++j) - { - if (j == sentence.size() || sentence.at(i) != sentence.at(j)) - { - i += (j - i) % repeatNumber == 0 ? repeatNumber : 1; - break; - } - } - } - sentence = newSentence; -} - -void RemoveCyclicRepeats(std::wstring& sentence) -{ - if (sentence.size() > 15000) return; // this algorithm is O(N^3) so if N > 15000 it's extremely slow - auto data = std::make_unique(sentence.size() + 1); - wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str()); - wchar_t* dataEnd = data.get() + sentence.size(); - int skip = 0, count = 0; - for (wchar_t* end = dataEnd; end - data.get() > skip; --end) - { - std::swap(*end, *dataEnd); - int junkLength = end - data.get() - skip; - auto junkFound = wcsstr(sentence.c_str() + skip + junkLength, data.get() + skip); - std::swap(*end, *dataEnd); - if (junkFound) - { - if (count && junkLength < min(skip / count, 4)) break; - skip += junkLength; - count += 1; - end = dataEnd; - } - } - if (count && skip / count >= 3) sentence = data.get() + skip; -} - -bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo) -{ - if (sentenceInfo["text number"] == 0) return false; - RemoveRepeatedSentences(sentence, sentenceInfo["text number"]); - RemoveRepeatedChars(sentence); - RemoveCyclicRepeats(sentence); - return true; -} - -TEST( - { - std::wstring repeatedChars = L"aaaaaaaaaaaabbbbbbcccdddaabbbcccddd"; - RemoveRepeatedChars(repeatedChars); - assert(repeatedChars.find(L"aaaabbcd") == 0); - - std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg"; - std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg"; - RemoveCyclicRepeats(cyclicRepeats); - RemoveCyclicRepeats(buildupRepeats); - assert(cyclicRepeats == L"_abcdefg"); - assert(buildupRepeats == L"_abcdefg"); - - std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい"; - ProcessSentence(empty, { SentenceInfo::DUMMY }); - ProcessSentence(one, { SentenceInfo::DUMMY }); - ProcessSentence(normal, { SentenceInfo::DUMMY }); - assert(empty == L"" && one == L" " && normal == L"This is a normal sentence. はい"); - } -);