mirror of
https://github.com/Artikash/Textractor.git
synced 2024-12-23 08:54:12 +08:00
add check for outdated doc and refactor input text modifications
Added a flag for checking whether the web document structure is outdated. Refactored the text modifications applied before the text is passed to the translator.
This commit is contained in:
parent
2ab780a491
commit
3af4217075
@ -71,7 +71,7 @@ bool DevTools::startChrome(QString path, bool headless, int port)
|
||||
if (!std::filesystem::exists(path.toStdWString()))
|
||||
return false;
|
||||
DWORD exitCode = 0;
|
||||
if ((GetExitCodeProcess(processInfo.hProcess, &exitCode) != FALSE) && (exitCode == STILL_ACTIVE))
|
||||
if (GetExitCodeProcess(processInfo.hProcess, &exitCode) != FALSE && exitCode == STILL_ACTIVE)
|
||||
return false;
|
||||
QString args = "--proxy-server=direct:// --disable-extensions --disable-gpu --user-data-dir="
|
||||
+ QString::fromStdWString(std::filesystem::current_path())
|
||||
@ -237,8 +237,8 @@ void DevTools::onTextMessageReceived(QString message)
|
||||
{
|
||||
for (auto iter = mapmethod.cbegin(); iter != mapmethod.cend();)
|
||||
{
|
||||
if ((iter->second.value("method") == root.value("method"))
|
||||
&& (compareJson(iter->second.value("params"), root.value("params"))))
|
||||
if (iter->second.value("method") == root.value("method")
|
||||
&& compareJson(iter->second.value("params"), root.value("params")))
|
||||
{
|
||||
mutex.lock();
|
||||
mapmethod.erase(iter++);
|
||||
|
@ -33,28 +33,48 @@ QStringList languages
|
||||
};
|
||||
|
||||
int docfound = -1, targetNodeId = -1, session = -1, pageenabled = -1, useragentflag = -1;
|
||||
long update = -1;
|
||||
|
||||
std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devtools)
|
||||
{
|
||||
QString qtext = S(text);
|
||||
qtext.remove(QString(12288)); // japanese space (no need for translator)
|
||||
|
||||
// Check text for repeated symbols (e.g. only ellipsis)
|
||||
if (qtext.length() > 2)
|
||||
for (int i = 1; i < (qtext.length() - 1); i++)
|
||||
// Check quotes
|
||||
bool checkquote = false;
|
||||
if ((qtext.front() == QString(12300) && qtext.back() == QString(12301)) // japanese quotation marks
|
||||
|| (qtext.front() == "\"" && qtext.back() == "\""))
|
||||
{
|
||||
if (qtext[i] != qtext[1])
|
||||
break;
|
||||
if ((i + 2) == qtext.length() && (qtext.front() == qtext.back()))
|
||||
checkquote = true;
|
||||
qtext.remove(0, 1);
|
||||
qtext.chop(1);
|
||||
}
|
||||
|
||||
if (qtext == QString(12387)) // if text consists of only one sokuon, add exclamation mark for correct translation
|
||||
{
|
||||
qtext += "!";
|
||||
}
|
||||
|
||||
// Check ellipsis
|
||||
int count = qtext.count(QString(8230)); // ellipsis
|
||||
if (count == qtext.length()
|
||||
|| (count == (qtext.length() - 1) && qtext.back() == QString(12290))) // japanese end of a sentence
|
||||
{
|
||||
return { true, text };
|
||||
}
|
||||
|
||||
// Put quotes back
|
||||
if (checkquote)
|
||||
{
|
||||
qtext = "\"" + qtext + "\"";
|
||||
}
|
||||
|
||||
// Check status
|
||||
if (devtools->getStatus() == "Stopped")
|
||||
{
|
||||
return { false, FormatString(L"%s", ERROR_CHROME) };
|
||||
}
|
||||
if ((devtools->getStatus().startsWith("Fail")) || (devtools->getStatus().startsWith("Unconnected")))
|
||||
if (devtools->getStatus().startsWith("Fail") || devtools->getStatus().startsWith("Unconnected"))
|
||||
{
|
||||
return { false, FormatString(L"%s", ERROR_START_CHROME) };
|
||||
}
|
||||
@ -65,22 +85,15 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devt
|
||||
targetNodeId = -1;
|
||||
pageenabled = -1;
|
||||
useragentflag = -1;
|
||||
update = -1;
|
||||
}
|
||||
|
||||
// Add spaces near ellipsis for better translation and check for quotes
|
||||
qtext.replace(QRegularExpression("[" + QString(8230) + "]" + "[" + QString(8230) + "]" + "[" + QString(8230) + "]"), QString(8230));
|
||||
qtext.replace(QRegularExpression("[" + QString(8230) + "]" + "[" + QString(8230) + "]"), QString(8230));
|
||||
qtext.replace(QRegularExpression("[" + QString(8230) + "]"), " " + QString(8230) + " ");
|
||||
bool checkquote = false;
|
||||
if ((qtext.front() == QString(12300)) && (qtext.back() == QString(12301)))
|
||||
{
|
||||
checkquote = true;
|
||||
qtext.remove(0, 1);
|
||||
qtext.chop(1);
|
||||
}
|
||||
QJsonObject root;
|
||||
// Erase tags and reduce the number of ellipsis for better translation
|
||||
qtext.remove(QRegExp("<[^>]*>"));
|
||||
qtext.replace(QRegExp("(" + QString(8230) + ")+"), " " + QString(8230));
|
||||
|
||||
// Enable page feedback
|
||||
QJsonObject root;
|
||||
if (pageenabled == -1)
|
||||
{
|
||||
if (!devtools->SendRequest("Page.enable", {}, root))
|
||||
@ -94,9 +107,9 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devt
|
||||
if (useragentflag == -1)
|
||||
{
|
||||
QString useragent = devtools->getUserAgent();
|
||||
useragent.replace(QRegularExpression("HeadlessChrome"), "Chrome");
|
||||
if (!useragent.isEmpty())
|
||||
{
|
||||
useragent.replace("HeadlessChrome", "Chrome");
|
||||
if (!devtools->SendRequest("Network.setUserAgentOverride", { {"userAgent", useragent} }, root))
|
||||
{
|
||||
return { false, FormatString(L"%s", ERROR_COMMAND_FAIL) };
|
||||
@ -107,11 +120,18 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devt
|
||||
|
||||
long navigate = devtools->methodToReceive("Page.navigatedWithinDocument");
|
||||
long target = devtools->methodToReceive("DOM.attributeModified", { { "value" , "lmt__mobile_share_container" } });
|
||||
if (update == -1)
|
||||
{
|
||||
update = devtools->methodToReceive("DOM.documentUpdated");
|
||||
}
|
||||
|
||||
// Navigate to site
|
||||
QString fullurl = URL + "#ja/" + S(translateTo.Copy()) + "/" + qtext;
|
||||
if (devtools->SendRequest("Page.navigate", { {"url", fullurl} }, root))
|
||||
if (!devtools->SendRequest("Page.navigate", { {"url", fullurl} }, root))
|
||||
{
|
||||
return { false, FormatString(L"%s", ERROR_COMMAND_FAIL) };
|
||||
}
|
||||
|
||||
// Wait until page is loaded
|
||||
float timer = 0;
|
||||
int timer_stop = 10;
|
||||
@ -125,22 +145,29 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devt
|
||||
return { false, FormatString(L"%s: %d ", ERROR_GOT_TIMEOUT, timer_stop) };
|
||||
}
|
||||
|
||||
// Check if document is outdated
|
||||
if (devtools->checkMethod(update))
|
||||
{
|
||||
docfound = -1;
|
||||
targetNodeId = -1;
|
||||
update = -1;
|
||||
}
|
||||
|
||||
// Get document
|
||||
if (docfound == -1)
|
||||
{
|
||||
if (!devtools->SendRequest("DOM.getDocument", {}, root))
|
||||
{
|
||||
docfound = -1;
|
||||
return { false, FormatString(L"%s", ERROR_COMMAND_FAIL) };
|
||||
}
|
||||
docfound = root.value("result").toObject().value("root").toObject().value("nodeId").toInt();
|
||||
}
|
||||
|
||||
//Get target selector
|
||||
// Get target selector
|
||||
if (targetNodeId == -1)
|
||||
{
|
||||
if (!(devtools->SendRequest("DOM.querySelector", { {"nodeId", docfound}, {"selector", "textarea.lmt__target_textarea"} }, root))
|
||||
|| (root.value("result").toObject().value("nodeId").toInt() == 0))
|
||||
if (!devtools->SendRequest("DOM.querySelector", { {"nodeId", docfound}, {"selector", "textarea.lmt__target_textarea"} }, root)
|
||||
|| root.value("result").toObject().value("nodeId").toInt() == 0)
|
||||
{
|
||||
docfound = -1;
|
||||
return { false, FormatString(L"%s", ERROR_COMMAND_FAIL) };
|
||||
@ -157,14 +184,19 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devt
|
||||
}
|
||||
|
||||
// Catch the translation
|
||||
devtools->SendRequest("DOM.getOuterHTML", { {"nodeId", targetNodeId + 1} }, root);
|
||||
if (!devtools->SendRequest("DOM.getOuterHTML", { {"nodeId", targetNodeId + 1} }, root))
|
||||
{
|
||||
docfound = -1;
|
||||
targetNodeId = -1;
|
||||
return { false, FormatString(L"%s", ERROR_COMMAND_FAIL) };
|
||||
}
|
||||
QString OuterHTML = root.value("result").toObject().value("outerHTML").toString();
|
||||
if (OuterHTML == "<div></div>")
|
||||
{
|
||||
// Try to catch the notification
|
||||
int noteNodeId = -1;
|
||||
if (!(devtools->SendRequest("DOM.querySelector", { {"nodeId", docfound}, {"selector", "div.lmt__system_notification"} }, root))
|
||||
|| (root.value("result").toObject().value("nodeId").toInt() == 0))
|
||||
if (!devtools->SendRequest("DOM.querySelector", { {"nodeId", docfound}, {"selector", "div.lmt__system_notification"} }, root)
|
||||
|| root.value("result").toObject().value("nodeId").toInt() == 0)
|
||||
{
|
||||
return { false, FormatString(L"%s: %d ", ERROR_GOT_TIMEOUT, timer_stop) };
|
||||
}
|
||||
@ -200,15 +232,5 @@ std::pair<bool, std::wstring> Translate(const std::wstring& text, DevTools* devt
|
||||
}
|
||||
}
|
||||
|
||||
// Get quotes back
|
||||
if (checkquote)
|
||||
{
|
||||
OuterHTML = "\"" + OuterHTML + "\"";
|
||||
}
|
||||
return { true, S(OuterHTML) };
|
||||
}
|
||||
else
|
||||
{
|
||||
return { false, FormatString(L"%s", ERROR_COMMAND_FAIL) };
|
||||
}
|
||||
}
|
@ -50,6 +50,17 @@ void SaveCache()
|
||||
savedSize = translationCache->size();
|
||||
}
|
||||
|
||||
// Removes line feeds, carriage returns, tabs and the control characters with
// code 4 and 5 from `text`, modifying it in place.
//
// The surviving characters are compacted towards the front by index and the
// string is then truncated. Erasing while iterating is deliberately avoided:
// stepping an iterator back from begin() (which happens when the very first
// character has to be removed) is undefined behaviour, and erase() may
// invalidate iterators that are still in use.
void EraseControlCharacters(std::wstring& text)
{
	std::wstring::size_type kept = 0;
	for (std::wstring::size_type i = 0; i < text.size(); ++i)
	{
		const wchar_t ch = text[i];
		const bool isControl = ch == L'\n' || ch == L'\r' || ch == L'\t' || ch == wchar_t(4) || ch == wchar_t(5);
		if (!isControl)
		{
			text[kept++] = ch;
		}
	}
	// Drop the now-unused tail left behind by the compaction.
	text.resize(kept);
}
|
||||
|
||||
class Window : public QDialog
|
||||
{
|
||||
public:
|
||||
@ -184,7 +195,11 @@ bool ProcessSentence(std::wstring& sentence, SentenceInfo sentenceInfo)
|
||||
if (auto it = translationCache->find(sentence); it != translationCache->end()) translation = it->second + L"\x200b";
|
||||
}
|
||||
if (translation.empty() && (sentenceInfo["current select"]))
|
||||
{
|
||||
EraseControlCharacters(sentence);
|
||||
std::tie(cache, translation) = Translate(sentence, devtools);
|
||||
}
|
||||
|
||||
if (cache) translationCache->try_emplace(sentence, translation);
|
||||
if (cache && translationCache->size() > savedSize + 50) SaveCache();
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user