remove some false positives from repetition detection

This commit is contained in:
Akash Mozumdar 2019-02-25 00:47:18 -05:00
parent 9900c09483
commit ec0b9c077c

View File

@@ -6,13 +6,18 @@ void RemoveRepeatedChars(std::wstring& sentence)
int repeatNumber = 1; int repeatNumber = 1;
wchar_t prevChar = L'\0'; wchar_t prevChar = L'\0';
for (auto nextChar : sentence) for (auto nextChar : sentence)
if (nextChar == prevChar) repeatNumber += 1; {
if (nextChar == prevChar)
{
repeatNumber += 1;
}
else else
{ {
prevChar = nextChar; prevChar = nextChar;
repeatNumbers.at(repeatNumber) += 1; repeatNumbers.at(repeatNumber) += 1;
repeatNumber = 1; repeatNumber = 1;
} }
}
if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return; if ((repeatNumber = std::distance(repeatNumbers.begin(), std::max_element(repeatNumbers.begin(), repeatNumbers.end()))) == 1) return;
std::wstring newSentence; std::wstring newSentence;
@@ -33,6 +38,7 @@ void RemoveRepeatedChars(std::wstring& sentence)
void RemoveCyclicRepeats(std::wstring& sentence) void RemoveCyclicRepeats(std::wstring& sentence)
{ {
if (sentence.size() > 15000) return; // this algorithm is O(N^3) so if N > 15000 it's extremely slow
auto data = std::make_unique<wchar_t[]>(sentence.size() + 1); auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);
wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str()); wcscpy_s(data.get(), sentence.size() + 1, sentence.c_str());
wchar_t* dataEnd = data.get() + sentence.size(); wchar_t* dataEnd = data.get() + sentence.size();
@@ -45,6 +51,7 @@ void RemoveCyclicRepeats(std::wstring& sentence)
std::swap(*end, *dataEnd); std::swap(*end, *dataEnd);
if (junkFound) if (junkFound)
{ {
if (count && junkLength < min(skip / count, 4)) break;
skip += junkLength; skip += junkLength;
count += 1; count += 1;
end = dataEnd; end = dataEnd;
@@ -67,12 +74,12 @@ TEST(
RemoveRepeatedChars(repeatedChars); RemoveRepeatedChars(repeatedChars);
assert(repeatedChars.find(L"aaaabbcd") == 0); assert(repeatedChars.find(L"aaaabbcd") == 0);
std::wstring cyclicRepeats = L"abcdeabcdefabcdefgabcdefgabcdefgabcdefgabcdefg"; std::wstring cyclicRepeats = L"_abcde_abcdef_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg";
std::wstring buildupRepeats = L"aababcabcdabcdeabcdefabcdefg"; std::wstring buildupRepeats = L"__a_ab_abc_abcd_abcde_abcdef_abcdefg";
RemoveCyclicRepeats(cyclicRepeats); RemoveCyclicRepeats(cyclicRepeats);
RemoveCyclicRepeats(buildupRepeats); RemoveCyclicRepeats(buildupRepeats);
assert(cyclicRepeats == L"abcdefg"); assert(cyclicRepeats == L"_abcdefg");
assert(buildupRepeats == L"abcdefg"); assert(buildupRepeats == L"_abcdefg");
std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい"; std::wstring empty = L"", one = L" ", normal = L"This is a normal sentence. はい";
ProcessSentence(empty, { SentenceInfo::DUMMY }); ProcessSentence(empty, { SentenceInfo::DUMMY });