further optimize repetition detection

2019-08-09 22:11:34 -04:00 · 2019-08-09 22:11:34 -04:00 · 60372a6b8a
commit 60372a6b8a
parent 488a19134a
2 changed files with 13 additions and 14 deletions
--- a/extensions/removerepeatphrase.cpp
+++ b/extensions/removerepeatphrase.cpp
@ -1,7 +1,5 @@
 #include "extension.h"

-constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
-
 std::vector<int> GenerateSuffixArray(const std::wstring& text)
 {
 	std::vector<int> identity(text.size());
@ -26,8 +24,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
 			else classes[currentSuffix] = i;
 		}

-		// Sort within equivalence class based on order of following suffix after length
-		// Orders up to length*2
+		// Sort within equivalence class based on order of following suffix after length (orders up to length * 2)
 		std::vector<int> count = identity;
 		for (auto suffix : std::vector(suffixArray))
 		{
@ -40,35 +37,38 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
 	return suffixArray;
 }

+constexpr wchar_t ERASED = 0xf246; // inside Unicode private use area
+
 bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
 {
 	if (sentenceInfo["text number"] == 0) return false;

 	// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
 	// It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
-	// If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string
+	// If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
 	std::vector<int> suffixArray = GenerateSuffixArray(sentence);
 	for (int i = 0; i + 1 < sentence.size(); ++i)
 	{
 		int commonPrefixLength = 0;
 		for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
-			if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
+			if (sentence[j] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
 			else break;

 		if (commonPrefixLength > 6)
 		{
-			std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength);
-			std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end());
+			std::wstring substring(sentence, suffixArray[i], commonPrefixLength);
+			bool substringCharMap[0x10000] = {};
+			for (auto ch : substring)
+				substringCharMap[ch] = true;

 			for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
-				if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1;
+				if (substringCharMap[sentence[j]]) regionSize += 1;
 				else if (regionSize >= commonPrefixLength * 2)
 					while (regionSize > 0)
 						sentence[j - regionSize--] = ERASED;
 				else regionSize = 0;

-			if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str()))
-				std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
+			if (!wcsstr(sentence.c_str(), substring.c_str())) std::copy(substring.begin(), substring.end(), sentence.begin() + max(suffixArray[i], suffixArray[i + 1]));
 		}
 	}
 	sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());
--- a/extensions/removerepeatphrase2.cpp
+++ b/extensions/removerepeatphrase2.cpp
@ -4,8 +4,7 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
 {
 	if (sentenceInfo["text number"] == 0) return false;

-	// This algorithm looks at all substrings which start at the beginning of the sentence
-	// If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated
+	// This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
 	// Complexity O(N^3) so executing for N > 10,000 dangerous
 	if (sentence.size() > 10000) return false;
 	auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);