From bd51193464fa99585459d7f881bfd86dfa01e9af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=81=8D=E5=85=AE=E6=83=9A=E5=85=AE?= <1173718158@qq.com>
Date: Mon, 4 Nov 2024 09:46:14 +0800
Subject: [PATCH] UTF8

Add handling for the CODEC_UTF8 flag:
- hookcode.cpp: parse the hook-code letter 'C' as CODEC_UTF8 and emit 'C'
  when generating a code for a hook that has CODEC_UTF8 set. The 'W' case
  ends with its own break so that it does not fall through into 'C' and
  set CODEC_UTF16 and CODEC_UTF8 together.
- texthook.cc: Send() copies the low 32 bits of the value for CODEC_UTF8,
  as it already does for CODEC_UTF32; GetLength() takes the length from
  utf8charlen().
- stringutils: add utf8charlen().

---
 LunaHook/texthook.cc    |  4 +++-
 include/hookcode.cpp    |  7 ++++++-
 include/stringutils.cpp | 44 +++++++++++++++++++++++++++++++++++++++++
 include/stringutils.h   |  1 +
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/LunaHook/texthook.cc b/LunaHook/texthook.cc
index 5f28856..fce298a 100644
--- a/LunaHook/texthook.cc
+++ b/LunaHook/texthook.cc
@@ -347,7 +347,7 @@ void TextHook::Send(uintptr_t lpDataBase)
         }
         else
         {
-            if (hp.type & CODEC_UTF32)
+            if ((hp.type & CODEC_UTF32) || (hp.type & CODEC_UTF8))
             {
                 *(uint32_t *)pbData = lpDataIn & 0xffffffff;
             }
@@ -595,6 +595,8 @@ int TextHook::GetLength(hook_stack *stack, uintptr_t in)
             len = 2;
         else if (hp.type & CODEC_UTF32)
             len = 4;
+        else if (hp.type & CODEC_UTF8)
+            len = utf8charlen((char*)&in);
         else
         { // CODEC_ANSI_BE,CHAR_LITTLE_ENDIAN
             if (hp.type & CODEC_ANSI_BE)
diff --git a/include/hookcode.cpp b/include/hookcode.cpp
index a561930..a95cb45 100644
--- a/include/hookcode.cpp
+++ b/include/hookcode.cpp
@@ -83,6 +83,9 @@ namespace
             break;
         case L'W':
             hp.type |= CODEC_UTF16;
+            break;
+        case L'C':
+            hp.type |= CODEC_UTF8;
             break;
         case L'I':
             hp.type |= CODEC_UTF32;
@@ -94,7 +97,7 @@ namespace
             hp.type |= USING_STRING | CODEC_UTF16;
             break;
         case L'M':
-            hp.type |= SPECIAL_JIT_STRING | USING_STRING | CODEC_UTF16;
+            hp.type |= USING_STRING | CODEC_UTF16 | SPECIAL_JIT_STRING;
             break;
         case L'U':
             hp.type |= USING_STRING | CODEC_UTF32;
@@ -333,6 +336,8 @@ namespace
         {
             if (hp.type & CODEC_UTF16)
                 HCode += L'W';
+            else if (hp.type & CODEC_UTF8)
+                HCode += L'C';
             else if (hp.type & CODEC_UTF32)
                 HCode += L'I';
             else if (hp.type & CODEC_ANSI_BE)
diff --git a/include/stringutils.cpp b/include/stringutils.cpp
index 4330fa8..6d53264 100644
--- a/include/stringutils.cpp
+++ b/include/stringutils.cpp
@@ -214,7 +214,51 @@ size_t u32strlen(uint32_t *data)
s++;
     return s;
 }
+// Reports whether `byte` matches the UTF-8 continuation-byte pattern
+// 10xxxxxx (1 if it does, 0 otherwise).
+int is_valid_following_byte(unsigned char byte)
+{
+    return (byte & 0xC0) == 0x80;
+}
+
+// Measures the UTF-8 character that starts at `str`.
+//
+// str: pointer to the first byte; may be null. Bytes after the first are
+//      read only while each preceding byte matched its expected pattern.
+// Returns the character length in bytes (1-4), or 0 when `str` is null,
+//      points at a NUL byte, or is not a lead byte followed by the required
+//      number of continuation bytes.
+// Only bit patterns are checked; overlong and out-of-range encodings are
+// not rejected.
+int utf8charlen(char *str)
+{
+    if (str == nullptr || *str == '\0')
+    {
+        return 0;
+    }
+    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(str);
+    const unsigned char lead = bytes[0];
+    int expected = 0;
+    if (lead < 0x80)
+        expected = 1; // 0xxxxxxx
+    else if ((lead & 0xE0) == 0xC0)
+        expected = 2; // 110xxxxx
+    else if ((lead & 0xF0) == 0xE0)
+        expected = 3; // 1110xxxx
+    else if ((lead & 0xF8) == 0xF0)
+        expected = 4; // 11110xxx
+    else
+        return 0; // stray continuation byte (10xxxxxx) or 11111xxx
+    // Inspect the trailing bytes in order and stop at the first mismatch:
+    // a NUL terminator fails the test, so the scan never reads past it.
+    for (int i = 1; i < expected; ++i)
+    {
+        if (!is_valid_following_byte(bytes[i]))
+            return 0;
+    }
+    return expected;
+}
 std::string wcasta(const std::wstring &x)
 {
     std::string xx;
diff --git a/include/stringutils.h b/include/stringutils.h
index e8bd909..b0128f4 100644
--- a/include/stringutils.h
+++ b/include/stringutils.h
@@ -36,6 +36,7 @@ std::optional StringToWideString(const std::string &text, UINT enc
 std::string wcasta(const std::wstring& x);
 std::wstring acastw(const std::string& x);
 size_t u32strlen(uint32_t *data);
+int utf8charlen(char *data);
 inline bool disable_mbwc = false;
 inline bool disable_wcmb = false;
 template