2019-06-14 01:06:34 +08:00
# include " extension.h "
2019-08-08 18:23:15 +08:00
std : : vector < int > GenerateSuffixArray ( const std : : wstring & text )
{
2020-02-28 15:34:34 +08:00
std : : vector < int > suffixArray ( text . size ( ) ) ;
for ( int i = 0 ; i < text . size ( ) ; + + i ) suffixArray [ i ] = i ;
2019-08-08 18:23:15 +08:00
// The below code is a more efficient way of doing this:
// std::sort(suffixArray.begin(), suffixArray.end(), [&](int a, int b) { return wcscmp(text.c_str() + a, text.c_str() + b) > 0; });
std : : stable_sort ( suffixArray . begin ( ) , suffixArray . end ( ) , [ & ] ( int a , int b ) { return text [ a ] > text [ b ] ; } ) ;
2020-02-28 15:34:34 +08:00
std : : vector < int > eqClasses ( text . begin ( ) , text . end ( ) ) ;
std : : vector < int > count ( text . size ( ) ) ;
2019-08-08 18:23:15 +08:00
for ( int length = 1 ; length < text . size ( ) ; length * = 2 )
{
2019-08-10 10:11:34 +08:00
// Determine equivalence class up to length, by checking length / 2 equivalence of suffixes and their following length / 2 suffixes
2020-02-28 15:34:34 +08:00
std : : vector < int > prevEqClasses = eqClasses ;
eqClasses [ suffixArray [ 0 ] ] = 0 ;
2019-08-08 18:23:15 +08:00
for ( int i = 1 ; i < text . size ( ) ; + + i )
{
2021-06-05 16:20:04 +08:00
int currentSuffix = suffixArray [ i ] , lastSuffix = suffixArray [ i - 1 ] ;
2020-02-28 15:34:34 +08:00
if ( currentSuffix + length < text . size ( ) & & prevEqClasses [ currentSuffix ] = = prevEqClasses [ lastSuffix ] & &
2021-06-05 16:20:04 +08:00
prevEqClasses [ currentSuffix + length / 2 ] = = prevEqClasses [ lastSuffix + length / 2 ]
)
2020-02-28 15:34:34 +08:00
eqClasses [ currentSuffix ] = eqClasses [ lastSuffix ] ;
else eqClasses [ currentSuffix ] = i ;
2019-08-08 18:23:15 +08:00
}
2019-08-10 10:11:34 +08:00
// Sort within equivalence class based on order of following suffix after length (orders up to length * 2)
2020-02-28 15:34:34 +08:00
for ( int i = 0 ; i < text . size ( ) ; + + i ) count [ i ] = i ;
2019-08-08 18:23:15 +08:00
for ( auto suffix : std : : vector ( suffixArray ) )
{
int precedingSuffix = suffix - length ;
2020-02-28 15:34:34 +08:00
if ( precedingSuffix > = 0 ) suffixArray [ count [ eqClasses [ precedingSuffix ] ] + + ] = precedingSuffix ;
2019-08-08 18:23:15 +08:00
}
}
for ( int i = 0 ; i + 1 < text . size ( ) ; + + i )
assert ( wcscmp ( text . c_str ( ) + suffixArray [ i ] , text . c_str ( ) + suffixArray [ i + 1 ] ) > 0 ) ;
return suffixArray ;
}
2019-08-10 10:11:34 +08:00
constexpr wchar_t ERASED = 0xf246 ; // inside Unicode private use area
2019-06-14 01:06:34 +08:00
bool ProcessSentence ( std : : wstring & sentence , SentenceInfo sentenceInfo )
{
if ( sentenceInfo [ " text number " ] = = 0 ) return false ;
2019-08-08 02:05:50 +08:00
// This algorithm looks for repeating substrings (in other words, common prefixes among the set of suffixes) of the sentence with length > 6
// It then looks for any regions of characters at least twice as long as the substring made up only of characters in the substring, and erases them
2019-08-10 10:11:34 +08:00
// If this results in the substring being completely erased from the string, the substring is copied to the last location where it was located in the original string
2019-10-07 13:00:54 +08:00
auto timeout = GetTickCount64 ( ) + 30'000 ; // give up if taking over 30 seconds
2019-08-08 18:23:15 +08:00
std : : vector < int > suffixArray = GenerateSuffixArray ( sentence ) ;
2019-10-07 13:00:54 +08:00
for ( int i = 0 ; i + 1 < sentence . size ( ) & & GetTickCount64 ( ) < timeout ; + + i )
2019-06-14 01:06:34 +08:00
{
2019-08-08 02:05:50 +08:00
int commonPrefixLength = 0 ;
for ( int j = suffixArray [ i ] , k = suffixArray [ i + 1 ] ; j < sentence . size ( ) & & k < sentence . size ( ) ; + + j , + + k )
2019-08-10 10:11:34 +08:00
if ( sentence [ j ] ! = ERASED & & sentence [ j ] = = sentence [ k ] ) commonPrefixLength + = 1 ;
2019-08-08 02:05:50 +08:00
else break ;
if ( commonPrefixLength > 6 )
2019-06-14 01:06:34 +08:00
{
2019-08-10 10:11:34 +08:00
std : : wstring substring ( sentence , suffixArray [ i ] , commonPrefixLength ) ;
bool substringCharMap [ 0x10000 ] = { } ;
2021-01-15 21:07:23 +08:00
for ( auto ch : substring ) substringCharMap [ ch ] = true ;
2019-08-08 02:05:50 +08:00
for ( int regionSize = 0 , j = 0 ; j < = sentence . size ( ) ; + + j )
2019-08-10 10:11:34 +08:00
if ( substringCharMap [ sentence [ j ] ] ) regionSize + = 1 ;
2019-08-08 02:05:50 +08:00
else if ( regionSize > = commonPrefixLength * 2 )
2021-01-15 21:07:23 +08:00
while ( regionSize > 0 ) sentence [ j - regionSize - - ] = ERASED ;
2019-08-08 02:05:50 +08:00
else regionSize = 0 ;
2019-08-10 10:11:34 +08:00
if ( ! wcsstr ( sentence . c_str ( ) , substring . c_str ( ) ) ) std : : copy ( substring . begin ( ) , substring . end ( ) , sentence . begin ( ) + max ( suffixArray [ i ] , suffixArray [ i + 1 ] ) ) ;
2019-06-14 01:06:34 +08:00
}
}
2019-08-08 02:05:50 +08:00
sentence . erase ( std : : remove ( sentence . begin ( ) , sentence . end ( ) , ERASED ) , sentence . end ( ) ) ;
return true ;
2019-06-14 01:06:34 +08:00
}
TEST (
{
2019-10-02 17:30:14 +08:00
InfoForExtension nonConsole [ ] = { { " text number " , 1 } , { } } ;
2019-08-08 02:05:50 +08:00
std : : wstring cyclicRepeats = L " Name: '_abcdefg_abcdefg_abcdefg_abcdefg_abcdefg' " ;
std : : wstring buildupRepeats = L " Name: '__a_ab_abc_abcd_abcde_abcdef_abcdefg' " ;
std : : wstring breakdownRepeats = L " Name: '_abcdefg_abcdef_abcde_abcd_abc_ab_a_' " ;
2019-10-02 17:30:14 +08:00
ProcessSentence ( cyclicRepeats , { nonConsole } ) ;
ProcessSentence ( buildupRepeats , { nonConsole } ) ;
ProcessSentence ( breakdownRepeats , { nonConsole } ) ;
2019-08-08 02:05:50 +08:00
assert ( cyclicRepeats = = L " Name: '_abcdefg' " ) ;
assert ( buildupRepeats = = L " Name: '_abcdefg' " ) ;
assert ( breakdownRepeats = = L " Name: '_abcdefg' " ) ;
2019-06-14 01:06:34 +08:00
std : : wstring empty = L " " , one = L " " , normal = L " This is a normal sentence. はい " ;
2019-10-02 17:30:14 +08:00
ProcessSentence ( empty , { nonConsole } ) ;
ProcessSentence ( one , { nonConsole } ) ;
ProcessSentence ( normal , { nonConsole } ) ;
2019-06-14 01:06:34 +08:00
assert ( empty = = L " " & & one = = L " " & & normal = = L " This is a normal sentence. はい " ) ;
}
) ;