From 60372a6b8a070e392e958264ab0600e8f1b26197 Mon Sep 17 00:00:00 2001
From: Akash Mozumdar <akashmozumdar@gmail.com>
Date: Fri, 9 Aug 2019 22:11:34 -0400
Subject: [PATCH] further optimize repetition detection

---
 extensions/removerepeatphrase.cpp  | 24 ++++++++++++------------
 extensions/removerepeatphrase2.cpp |  3 +--
 2 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/extensions/removerepeatphrase.cpp b/extensions/removerepeatphrase.cpp
index d298244..15e20b5 100644
--- a/extensions/removerepeatphrase.cpp
+++ b/extensions/removerepeatphrase.cpp
@@ -1,7 +1,5 @@
 ﻿#include "extension.h"
 
-constexpr wchar_t ERASED = 0xe012; // inside Unicode private use area
-
 std::vector<int> GenerateSuffixArray(const std::wstring& text)
 {
 	std::vector<int> identity(text.size());
@@ -13,7 +11,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
 	std::vector<int> classes(text.begin(), text.end());
 	for (int length = 1; length < text.size(); length *= 2)
 	{
-		// Determine equivalence class up to length, by checking length/2 equivalence of suffixes and their following length/2 suffixes
+		// Determine equivalence class up to length, by checking length / 2 equivalence of suffixes and their following length / 2 suffixes
 		std::vector<int> oldClasses = classes;
 		classes[suffixArray[0]] = 0;
 		for (int i = 1; i < text.size(); ++i)
@@ -26,8 +24,7 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
 			else classes[currentSuffix] = i;
 		}
 
-		// Sort within equivalence class based on order of following suffix after length
-		// Orders up to length*2
+		// Sort within equivalence class based on order of following suffix after length (orders up to length * 2)
 		std::vector<int> count = identity;
 		for (auto suffix : std::vector(suffixArray))
 		{
@@ -40,35 +37,38 @@ std::vector<int> GenerateSuffixArray(const std::wstring& text)
 	return suffixArray;
 }
 
+constexpr wchar_t ERASED = 0xf246; // inside Unicode private use area
+
 bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
 {
 	if (sentenceInfo["text number"] == 0) return false;
 
 	// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
 	// It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
-	// If this results in the common prefix being completely erased from the string, the common prefix is copied to the last location where it was located in the original string
+	// If this results in the substring being completely erased from the string, the substring is written back at the later of the two positions where it was found in the original string
 	std::vector<int> suffixArray = GenerateSuffixArray(sentence);
 	for (int i = 0; i + 1 < sentence.size(); ++i)
 	{
 		int commonPrefixLength = 0;
 		for (int j = suffixArray[i], k = suffixArray[i + 1]; j < sentence.size() && k < sentence.size(); ++j, ++k)
-			if (sentence[j] != ERASED && sentence[k] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
+			if (sentence[j] != ERASED && sentence[j] == sentence[k]) commonPrefixLength += 1;
 			else break;
 
 		if (commonPrefixLength > 6)
 		{
-			std::wstring commonPrefixCopy(sentence.c_str() + suffixArray[i], commonPrefixLength);
-			std::unordered_set<wchar_t, Identity<wchar_t>> commonPrefixChars(commonPrefixCopy.begin(), commonPrefixCopy.end());
+			std::wstring substring(sentence, suffixArray[i], commonPrefixLength);
+			bool substringCharMap[0x10000] = {};
+			for (auto ch : substring)
+				substringCharMap[ch] = true;
 
 			for (int regionSize = 0, j = 0; j <= sentence.size(); ++j)
-				if (commonPrefixChars.find(sentence[j]) != commonPrefixChars.end()) regionSize += 1;
+				if (substringCharMap[sentence[j]]) regionSize += 1;
 				else if (regionSize >= commonPrefixLength * 2)
 					while (regionSize > 0)
 						sentence[j - regionSize--] = ERASED;
 				else regionSize = 0;
 
-			if (!wcsstr(sentence.c_str(), commonPrefixCopy.c_str()))
-				std::copy(commonPrefixCopy.begin(), commonPrefixCopy.end(), sentence.data() + max(suffixArray[i], suffixArray[i + 1]));
+			if (!wcsstr(sentence.c_str(), substring.c_str())) std::copy(substring.begin(), substring.end(), sentence.begin() + max(suffixArray[i], suffixArray[i + 1]));
 		}
 	}
 	sentence.erase(std::remove(sentence.begin(), sentence.end(), ERASED), sentence.end());
diff --git a/extensions/removerepeatphrase2.cpp b/extensions/removerepeatphrase2.cpp
index aadf851..ce0a42f 100644
--- a/extensions/removerepeatphrase2.cpp
+++ b/extensions/removerepeatphrase2.cpp
@@ -4,8 +4,7 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
 {
 	if (sentenceInfo["text number"] == 0) return false;
 
-	// This algorithm looks at all substrings which start at the beginning of the sentence
-	// If that substring is found later in the sentence, it is removed from the beginning of the sentence and the process is repeated
+	// This algorithm looks at all the prefixes of the sentence: if a prefix is found later in the sentence, it is removed from the beginning and the process is repeated
 	// Complexity O(N^3) so executing for N > 10,000 dangerous
 	if (sentence.size() > 10000) return false;
 	auto data = std::make_unique<wchar_t[]>(sentence.size() + 1);