improve reliability of translations

This commit is contained in:
Akash Mozumdar 2020-12-14 06:26:01 -07:00
parent 1b71b3ee86
commit 268f40771d
9 changed files with 234 additions and 135 deletions

View File

@ -147,7 +147,7 @@ namespace
{
QMultiHash<QString, DWORD> allProcesses;
for (auto [processId, processName] : GetAllProcesses())
if (processName && (showSystemProcesses || processName->find(L":\\Windows\\") == std::wstring::npos))
if (processName && (showSystemProcesses || processName->find(L":\\Windows\\") == std::string::npos))
allProcesses.insert(QFileInfo(S(processName.value())).fileName(), processId);
QStringList processList(allProcesses.uniqueKeys());
@ -202,7 +202,7 @@ namespace
{
if (auto processName = GetModuleFilename(selectedProcessId)) if (int last = processName->rfind(L'\\') + 1)
{
std::wstring configFile = std::wstring(processName.value()).replace(last, std::wstring::npos, GAME_CONFIG_FILE);
std::wstring configFile = std::wstring(processName.value()).replace(last, std::string::npos, GAME_CONFIG_FILE);
if (!std::filesystem::exists(configFile)) QTextFile(S(configFile), QFile::WriteOnly).write("see https://github.com/Artikash/Textractor/wiki/Game-configuration-file");
if (std::filesystem::exists(configFile)) _wspawnlp(_P_DETACH, L"notepad", L"notepad", configFile.c_str(), NULL);
else QMessageBox::critical(This, GAME_CONFIG, QString(FAILED_TO_CREATE_CONFIG_FILE).arg(S(configFile)));
@ -649,7 +649,7 @@ MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent)
if (arg[1] == L'p' || arg[1] == L'P')
if (DWORD processId = _wtoi(arg.substr(2).c_str())) Host::InjectProcess(processId);
else for (auto [processId, processName] : processes)
if (processName.value_or(L"").find(L"\\" + arg.substr(2)) != std::wstring::npos) Host::InjectProcess(processId);
if (processName.value_or(L"").find(L"\\" + arg.substr(2)) != std::string::npos) Host::InjectProcess(processId);
std::thread([]
{

View File

@ -91,12 +91,11 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text)
L"api.cognitive.microsofttranslator.com",
L"POST",
FormatString(L"/translate?api-version=3.0&to=%s", translateTo.Copy()).c_str(),
FormatString(R"([{"text":"%s"}])", JSON::Escape(text)),
FormatString(R"([{"text":"%s"}])", JSON::Escape(WideStringToString(text))),
FormatString(L"Content-Type: application/json; charset=UTF-8\r\nOcp-Apim-Subscription-Key:%s", apiKey.Copy()).c_str()
})
{
// Response formatted as JSON: translation starts with text":" and ends with ","to
if (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"text\":\"(.+?)\",\""))) return { true, results[1] };
if (auto translation = Copy(JSON::Parse(httpRequest.response)[0][L"translations"][0][L"text"].String())) return { true, translation.value() };
else return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
}
else return { false, FormatString(L"%s (code=%u)", TRANSLATION_ERROR, httpRequest.errorCode) };
@ -107,8 +106,7 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text)
L"POST",
FormatString(L"/ttranslatev3?fromLang=auto-detect&to=%s&text=%s", translateTo.Copy(), Escape(text)).c_str()
})
// Response formatted as JSON: translation starts with text":" and ends with ","to
if (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"text\":\"(.+?)\",\""))) return { true, results[1] };
if (auto translation = Copy(JSON::Parse(httpRequest.response)[0][L"translations"][0][L"text"].String())) return { true, translation.value() };
else return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
else return { false, FormatString(L"%s (code=%u)", TRANSLATION_ERROR, httpRequest.errorCode) };
}

View File

@ -50,14 +50,14 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text)
L"Content-Type: application/x-www-form-urlencoded"
})))
// Response formatted as JSON: translation starts with text":" and ends with "}]
if (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"text\":\"(.+?)\"\\}\\]"))) return { true, results[1] };
if (auto translation = Copy(JSON::Parse(httpRequest.response)[L"translations"][0][L"text"].String())) return { true, translation.value() };
else return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
else return { false, FormatString(L"%s (code=%u)", TRANSLATION_ERROR, httpRequest.errorCode) };
// the following code was reverse engineered from the DeepL website; it's as close as I could make it but I'm not sure what parts of this could be removed and still have it work
int64_t r = _time64(nullptr), n = std::count(text.begin(), text.end(), L'i') + 1;
thread_local auto generator = std::mt19937(std::random_device()());
int id = 10000 * std::uniform_int_distribution(0, 9999)(generator);
int id = 10000 * std::uniform_int_distribution(0, 9999)(generator) + 1;
// user_preferred_langs? what should priority be? does timestamp do anything? other translation quality options?
auto body = FormatString(R"(
{
@ -81,7 +81,7 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text)
}]
}
}
)", id + 1, r + (n - r % n), translateTo.Copy(), JSON::Escape(text));
)", id, r + (n - r % n), translateTo.Copy(), JSON::Escape(WideStringToString(text)));
// missing accept-encoding header since it fucks up HttpRequest
if (HttpRequest httpRequest{
L"Mozilla/5.0 Textractor",
@ -93,8 +93,7 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text)
L"https://www.deepl.com/translator",
WINHTTP_FLAG_SECURE
})
// Response formatted as JSON: translation starts with preprocessed_sentence":" and ends with ","
if (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"postprocessed_sentence\":\"(.+?)\",\""))) return { true, results[1] };
if (auto translation = Copy(JSON::Parse(httpRequest.response)[L"result"][L"translations"][0][L"beams"][0][L"postprocessed_sentence"].String())) return { true, translation.value() };
else return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
else return { false, FormatString(L"%s (code=%u)", TRANSLATION_ERROR, httpRequest.errorCode) };
}

View File

@ -197,7 +197,7 @@ public:
void AddSentence(QString sentence)
{
if (sentence.size() > maxSentenceSize) sentence = SENTENCE_TOO_BIG;
if (!showOriginal) sentence = sentence.section('\n', sentence.count('\n') / 2 + 1);
if (!showOriginal) sentence = sentence.section("\n----\n", sentence.count("\n----\n") / 2 + 1);
sanitize(sentence);
sentence.chop(std::distance(std::remove(sentence.begin(), sentence.end(), QChar::Tabulation), sentence.end()));
sentenceHistory.push_back(sentence);

View File

@ -125,37 +125,6 @@ QStringList languages
bool translateSelectedOnly = false, rateLimitAll = true, rateLimitSelected = false, useCache = true;
int tokenCount = 30, tokenRestoreDelay = 60000, maxSentenceSize = 500;
unsigned TKK = 0;
std::wstring GetTranslationUri(const std::wstring& text)
{
// If no TKK available, use this uri. Can't use too much or google will detect unauthorized access
if (!TKK) return FormatString(L"/translate_a/single?client=gtx&dt=ld&dt=rm&dt=t&tl=%s&q=%s", translateTo.Copy(), Escape(text));
// reverse engineered from translate.google.com
std::wstring escapedText;
unsigned a = time(NULL) / 3600, b = a; // the first part of TKK
for (unsigned char ch : WideStringToString(text))
{
escapedText += FormatString(L"%%%02X", (int)ch);
a += ch;
a += a << 10;
a ^= a >> 6;
}
a += a << 3;
a ^= a >> 11;
a += a << 15;
a ^= TKK;
a %= 1000000;
return FormatString(L"/translate_a/single?client=webapp&dt=ld&dt=rm&dt=t&sl=auto&tl=%s&tk=%u.%u&q=%s", translateTo.Copy(), a, a ^ b, escapedText);
}
bool IsHash(const std::wstring& result)
{
return result.size() == 32 && std::all_of(result.begin(), result.end(), [](char ch) { return (ch >= L'0' && ch <= L'9') || (ch >= L'a' && ch <= L'z'); });
}
std::pair<bool, std::wstring> Translate(const std::wstring& text)
{
if (!apiKey->empty())
@ -164,31 +133,40 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text)
L"translation.googleapis.com",
L"POST",
FormatString(L"/language/translate/v2?format=text&target=%s&key=%s", translateTo.Copy(), apiKey.Copy()).c_str(),
FormatString(R"({"q":["%s"]})", JSON::Escape(text))
FormatString(R"({"q":["%s"]})", JSON::Escape(WideStringToString(text)))
})
{
// Response formatted as JSON: starts with "translatedText": " and translation is enclosed in quotes followed by a comma
if (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"\"translatedText\": \"(.+?)\","))) return { true, results[1] };
return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
}
if (auto translation = Copy(JSON::Parse(httpRequest.response)[L"data"][L"translations"][0][L"translatedText"].String())) return { true, translation.value() };
else return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
else return { false, FormatString(L"%s (code=%u)", TRANSLATION_ERROR, httpRequest.errorCode) };
if (!TKK)
if (HttpRequest httpRequest{ L"Mozilla/5.0 Textractor", L"translate.google.com", L"GET", L"/" })
if (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"(\\d{7,})'")))
_InterlockedCompareExchange(&TKK, stoll(results[1]), 0);
if (HttpRequest httpRequest{ L"Mozilla/5.0 Textractor", L"translate.googleapis.com", L"GET", GetTranslationUri(text).c_str() })
if (HttpRequest httpRequest{
L"Mozilla/5.0 Textractor",
L"translate.google.com",
L"POST",
L"/_/TranslateWebserverUi/data/batchexecute?rpcids=MkEWBc",
"f.req=" + Escape(WideStringToString(FormatString(LR"([[["MkEWBc","[[\"%s\",\"auto\",\"%s\",true],[null]]",null,"generic"]]])", JSON::Escape((JSON::Escape(text))), translateTo.Copy()))),
L"Content-Type: application/x-www-form-urlencoded"
})
{
// Response formatted as JSON: starts with "[[[" and translation is enclosed in quotes followed by a comma
if (httpRequest.response[0] == L'[')
if (auto start = httpRequest.response.find(L"[["); start != std::string::npos)
{
std::wstring translation;
for (std::wsmatch results; std::regex_search(httpRequest.response, results, std::wregex(L"\\[\"(.*?)\",[n\"]")); httpRequest.response = results.suffix())
if (!IsHash(results[1])) translation += std::wstring(results[1]) + L" ";
if (!translation.empty()) return { true, translation };
if (auto blob = Copy(JSON::Parse(httpRequest.response.substr(start))[0][2].String())) if (auto translations = Copy(JSON::Parse(blob.value())[1][0].Array()))
{
std::wstring translation;
if (translations->size() == 1 && (translations = Copy(translations.value()[0][5].Array())))
{
for (const auto& sentence : translations.value()) if (sentence[0].String()) (translation += *sentence[0].String()) += L" ";
}
else
{
for (const auto& conjugation : translations.value())
if (auto sentence = conjugation[0].String()) if (auto gender = conjugation[2].String()) translation += FormatString(L"%s %s\n", *sentence, *gender);
}
if (!translation.empty()) return { true, translation };
return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, blob.value()) };
}
}
return { false, FormatString(L"%s (TKK=%u): %s", TRANSLATION_ERROR, _InterlockedExchange(&TKK, 0), httpRequest.response) };
return { false, FormatString(L"%s: %s", TRANSLATION_ERROR, httpRequest.response) };
}
else return { false, FormatString(L"%s (code=%u)", TRANSLATION_ERROR, httpRequest.errorCode) };
}

View File

@ -54,44 +54,11 @@ std::wstring Escape(const std::wstring& text)
return escaped;
}
namespace JSON
std::string Escape(const std::string& text)
{
void Unescape(std::wstring& text)
{
for (int i = 0; i < text.size(); ++i)
{
if (text[i] == L'\\')
{
text[i] = 0;
if (text[i + 1] == L'r') text[i + 1] = 0; // for some reason \r gets displayed as a newline
if (text[i + 1] == L'n') text[i + 1] = L'\n';
if (text[i + 1] == L't') text[i + 1] = L'\t';
if (text[i + 1] == L'\\') ++i;
}
}
text.erase(std::remove(text.begin(), text.end(), 0), text.end());
}
std::string Escape(const std::wstring& text)
{
std::string escaped = WideStringToString(text);
int oldSize = escaped.size();
escaped.resize(escaped.size() + std::count_if(escaped.begin(), escaped.end(), [](char ch) { return ch == '\n' || ch == '\r' || ch == '\t' || ch == '\\' || ch == '"'; }));
auto out = escaped.rbegin();
for (int i = oldSize - 1; i >= 0; --i)
{
if (escaped[i] == '\n') *out++ = 'n';
else if (escaped[i] == '\t') *out++ = 't';
else if (escaped[i] == '\r') *out++ = 'r';
else if (escaped[i] == '\\' || escaped[i] == '"') *out++ = escaped[i];
else
{
*out++ = escaped[i];
continue;
}
*out++ = '\\';
}
escaped.erase(std::remove_if(escaped.begin(), escaped.end(), [](unsigned char ch) { return ch < 0x20; }), escaped.end());
return escaped;
}
std::string escaped;
for (unsigned char ch : text) escaped += FormatString("%%%02X", (int)ch);
return escaped;
}
//TEST(assert(JSON::Parse(LR"([{"string":"hello world","boolean":false,"number": 1.67e+4,"null": null,"array":[]},"hello world"])")))

View File

@ -1,6 +1,7 @@
#pragma once
#include <winhttp.h>
#include <variant>
using InternetHandle = AutoHandle<Functor<WinHttpCloseHandle>>;
@ -28,9 +29,179 @@ struct HttpRequest
};
std::wstring Escape(const std::wstring& text);
std::string Escape(const std::string& text);
namespace JSON
{
void Unescape(std::wstring& text);
std::string Escape(const std::wstring& text);
inline std::wstring UTF(int charCode)
{
return { (wchar_t)charCode };
}
template <typename C>
std::pair<std::basic_string<C>, int> Unescape(std::basic_string_view<C> text)
{
std::basic_string<C> unescaped;
int i = 0;
for (; i < text.size(); ++i)
{
char ch = text[i];
if (ch == '"') return { unescaped, i + 1 };
if (ch == '\\')
{
ch = text[i + 1];
if (ch == 'u' && std::isxdigit(text[i + 2]) && std::isxdigit(text[i + 3]) && std::isxdigit(text[i + 4]) && std::isxdigit(text[i + 5]))
{
char charCode[] = { text[i + 2], text[i + 3], text[i + 4], text[i + 5], 0 };
unescaped += UTF(strtol(charCode, nullptr, 16));
i += 5;
continue;
}
for (auto [original, value] : Array<char, char>{ { 'b', '\b' }, {'f', '\f'}, {'n', '\n'}, {'r', '\r'}, {'t', '\t'} }) if (ch == original)
{
unescaped.push_back(value);
goto replaced;
}
unescaped.push_back(ch);
replaced: i += 1;
}
else unescaped.push_back(ch);
}
return { unescaped, i };
}
template <typename C>
std::basic_string<C> Escape(std::basic_string<C> text)
{
int oldSize = text.size();
text.resize(text.size() + std::count_if(text.begin(), text.end(), [](auto ch) { return ch == '\n' || ch == '\r' || ch == '\t' || ch == '\\' || ch == '"'; }));
auto out = text.rbegin();
for (int i = oldSize - 1; i >= 0; --i)
{
if (text[i] == '\n') *out++ = 'n';
else if (text[i] == '\t') *out++ = 't';
else if (text[i] == '\r') *out++ = 'r';
else if (text[i] == '\\' || text[i] == '"') *out++ = text[i];
else
{
*out++ = text[i];
continue;
}
*out++ = '\\';
}
text.erase(std::remove_if(text.begin(), text.end(), [](uint64_t ch) { return ch < 0x20 || ch == 0x7f; }), text.end());
return text;
}
template <typename C>
struct Value : private std::variant<std::monostate, std::nullopt_t, bool, double, std::basic_string<C>, std::vector<Value<C>>, std::unordered_map<std::basic_string<C>, Value<C>>>
{
using std::variant<std::monostate, std::nullopt_t, bool, double, std::basic_string<C>, std::vector<Value<C>>, std::unordered_map<std::basic_string<C>, Value<C>>>::variant;
explicit operator bool() const { return index(); }
bool IsNull() const { return index() == 1; }
auto Boolean() const { return std::get_if<bool>(this); }
auto String() const { return std::get_if<std::basic_string<C>>(this); }
auto Array() const { return std::get_if<std::vector<Value<C>>>(this); }
auto Object() const { return std::get_if<std::unordered_map<std::basic_string<C>, Value<C>>>(this); }
const Value<C>& operator[](std::basic_string<C> key) const
{
static const Value<C> failure;
if (auto object = Object()) if (auto it = object->find(key); it != object->end()) return it->second;
return failure;
}
const Value<C>& operator[](int i) const
{
static const Value<C> failure;
if (auto array = Array()) if (i < array->size()) return array->at(i);
return failure;
}
};
template <typename C>
Value<C> Parse(std::basic_string_view<C> text, int64_t& i, int depth)
{
if (depth > 25) return {};
C ch;
auto SkipWhitespace = [&]
{
while (i < text.size() && (text[i] == ' ' || text[i] == '\n' || text[i] == '\r' || text[i] == '\t')) ++i;
if (i >= text.size()) return true;
ch = text[i];
return false;
};
auto ExtractString = [&]
{
i += 1;
auto [string, length] = Unescape(text.substr(i));
i += length;
return string;
};
if (SkipWhitespace()) return {};
static C nullStr[] = { 'n', 'u', 'l', 'l' }, trueStr[] = { 't', 'r', 'u', 'e' }, falseStr[] = { 'f', 'a', 'l', 's', 'e' };
if (ch == nullStr[0])
if (std::char_traits<C>::compare(text.data() + i, nullStr, std::size(nullStr)) == 0) return i += std::size(nullStr), std::nullopt;
else return {};
if (ch == trueStr[0])
if (std::char_traits<C>::compare(text.data() + i, trueStr, std::size(trueStr)) == 0) return i += std::size(trueStr), true;
else return {};
if (ch == falseStr[0])
if (std::char_traits<C>::compare(text.data() + i, falseStr, std::size(falseStr)) == 0) return i += std::size(falseStr), false;
else return {};
if (ch == '-' || (ch >= '0' && ch <= '9'))
{
// no numbers currently used, add a actual parser when needed
while (i < text.size() && ((text[i] >= '0' && text[i] <= '9') || text[i] == '-' || text[i] == '+' || text[i] == 'e' || text[i] == 'E' || text[i] == '.')) ++i;
return 0.0;
}
if (ch == '"') return ExtractString();
if (ch == '[')
{
std::vector<Value<C>> array;
while (true)
{
i += 1;
if (SkipWhitespace()) return {};
if (ch == ']') return i += 1, Value<C>(array);
if (!array.emplace_back(Parse(text, i, depth + 1))) return {};
if (SkipWhitespace()) return {};
if (ch == ']') return i += 1, Value<C>(array);
if (ch != ',') return {};
}
}
if (ch == '{')
{
std::unordered_map<std::basic_string<C>, Value<C>> object;
while (true)
{
i += 1;
if (SkipWhitespace()) return {};
if (ch == '}') return i += 1, Value<C>(object);
if (ch != '"') return {};
auto key = ExtractString();
if (SkipWhitespace() || ch != ':') return {};
i += 1;
if (!(object[std::move(key)] = Parse(text, i, depth + 1))) return {};
if (SkipWhitespace()) return {};
if (ch == '}') return i += 1, Value<C>(object);
if (ch != ',') return {};
}
}
return {};
}
template <typename C>
Value<C> Parse(const std::basic_string<C>& text)
{
int64_t start = 0;
return Parse((std::basic_string_view<C>)text, start, 0);
}
}

View File

@ -26,7 +26,7 @@ extern int tokenCount, tokenRestoreDelay, maxSentenceSize;
std::pair<bool, std::wstring> Translate(const std::wstring& text);
const char* LANGUAGE = u8"Language";
const std::string TRANSLATION_CACHE_FILE = FormatString("%s Cache.txt", TRANSLATION_PROVIDER);
const std::string TRANSLATION_CACHE_FILE = FormatString("%s Translation Cache.txt", TRANSLATION_PROVIDER);
QFormLayout* display;
Settings settings;
@ -160,9 +160,9 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
if (cache) translationCache->try_emplace(sentence, translation);
if (cache && translationCache->size() > savedSize + 50) SaveCache();
JSON::Unescape(translation);
sentence += L"\n" + translation;
for (int i = 0; i < translation.size(); ++i) if (translation[i] == '\r' && translation[i + 1] == '\n') translation[i] = 0x200b; // for some reason \r appears as newline - no need to double
if (!translation.empty()) (sentence += L"\n----\n") += translation;
return true;
}
TEST(assert(Translate(L"こんにちは").second.find(L"ello") != std::wstring::npos));
TEST(assert(Translate(L"こんにちは").second.find(L"ello") != std::string::npos));

View File

@ -27,23 +27,15 @@ constexpr bool x64 = true;
constexpr bool x64 = false;
#endif
template <typename T, typename... Xs>
struct ArrayImpl { using Type = std::tuple<T, Xs...>[]; };
template <typename T>
struct ArrayImpl<T> { using Type = T[]; };
template <typename... Ts>
using Array = typename ArrayImpl<Ts...>::Type;
template <typename T, typename... Xs> struct ArrayImpl { using Type = std::tuple<T, Xs...>[]; };
template <typename T> struct ArrayImpl<T> { using Type = T[]; };
template <typename... Ts> using Array = typename ArrayImpl<Ts...>::Type;
template <auto F>
using Functor = std::integral_constant<std::remove_reference_t<decltype(F)>, F>;
template <typename V>
struct Identity { V operator()(V v) const { return v; } };
template <auto F> using Functor = std::integral_constant<std::remove_reference_t<decltype(F)>, F>; // shouldn't need remove_reference_t but MSVC is bugged
struct PermissivePointer
{
template <typename T>
operator T*() { return (T*)p; }
template <typename T> operator T*() { return (T*)p; }
void* p;
};
@ -77,29 +69,23 @@ public:
Locker Acquire() { return { std::unique_lock(m), contents }; }
Locker operator->() { return Acquire(); }
T Copy()
{
return Acquire().contents;
}
T Copy() { return Acquire().contents; }
private:
T contents;
M m;
};
static struct
static struct // should be inline but MSVC (linker) is bugged
{
BYTE DUMMY[100];
template <typename T>
operator T*() { static_assert(sizeof(T) < sizeof(DUMMY)); return (T*)DUMMY; }
inline static BYTE DUMMY[100];
template <typename T> operator T*() { static_assert(sizeof(T) < sizeof(DUMMY)); return (T*)DUMMY; }
} DUMMY;
template <typename T>
inline auto FormatArg(T arg) { return arg; }
template <typename T> std::optional<std::remove_cv_t<T>> Copy(T* ptr) { if (ptr) return *ptr; return {}; }
template <typename C>
inline auto FormatArg(const std::basic_string<C>& arg) { return arg.c_str(); }
template <typename T> inline auto FormatArg(T arg) { return arg; }
template <typename C> inline auto FormatArg(const std::basic_string<C>& arg) { return arg.c_str(); }
#pragma warning(push)
#pragma warning(disable: 4996)