implement deinflection

This commit is contained in:
Akash Mozumdar 2020-02-26 00:59:47 -07:00
parent dc48f2a3c8
commit 565f99cced
3 changed files with 74 additions and 21 deletions

View File

@ -2,9 +2,11 @@
#include "extension.h"
#include "ui_extrawindow.h"
#include "defs.h"
#include "util.h"
#include "blockmarkup.h"
#include <fstream>
#include <process.h>
#include <QRegularExpression>
#include <QColorDialog>
#include <QFontDialog>
#include <QMenu>
@ -260,13 +262,8 @@ private:
if (ui.display->text().mid(i) == dictionaryWindow.term) return dictionaryWindow.ShowDefinition();
dictionaryWindow.ui.display->setFixedWidth(ui.display->width() * 3 / 4);
dictionaryWindow.setTerm(ui.display->text().mid(i));
int home = i == 0 ? 0 : textPositionMap[i - 1].x(), away = textPositionMap[i].x(), x = 0;
if (textPositionMap[i].x() > ui.display->width() / 2)
{
std::swap(home, away);
x -= dictionaryWindow.width();
}
x += (home * 3 + away) / 4;
int left = i == 0 ? 0 : textPositionMap[i - 1].x(), right = textPositionMap[i].x(),
x = textPositionMap[i].x() > ui.display->width() / 2 ? -dictionaryWindow.width() + (right * 3 + left) / 4 : (left * 3 + right) / 4;
dictionaryWindow.move(ui.display->mapToGlobal(QPoint(x, textPositionMap[i].y())));
}
@ -354,7 +351,7 @@ private:
BlockMarkupIterator savedDictionary(stream, Array<std::string_view>{ "|TERM|", "|DEFINITION|" });
while (auto read = savedDictionary.Next())
{
const auto& [terms, definition] = *read;
const auto& [terms, definition] = read.value();
auto storedDefinition = StoreCopy(definition);
std::string_view termsView = terms;
size_t start = 0, end = termsView.find("|TERM|");
@ -366,7 +363,20 @@ private:
}
dictionary.push_back(DictionaryEntry{ StoreCopy(termsView.substr(start)), storedDefinition });
}
std::sort(dictionary.begin(), dictionary.end());
std::stable_sort(dictionary.begin(), dictionary.end());
inflections.clear();
stream.seekg(0);
BlockMarkupIterator savedInflections(stream, Array<std::string_view>{ "|ROOT|", "|INFLECTS TO|", "|NAME|" });
while (auto read = savedInflections.Next())
{
const auto& [root, inflectsTo, name] = read.value();
if (!inflections.emplace_back(Inflection{
S(root),
QRegularExpression(QRegularExpression::anchoredPattern(S(inflectsTo)), QRegularExpression::UseUnicodePropertiesOption),
S(name)
}).inflectsTo.isValid()) TEXTRACTOR_MESSAGE(L"Invalid regex: %s", StringToWideString(inflectsTo));
}
}
void setTerm(QString term)
@ -375,11 +385,17 @@ private:
UpdateDictionary();
definitions.clear();
definitionIndex = 0;
std::unordered_set<std::string_view> definitionSet;
for (QByteArray utf8term = term.left(500).toUtf8(); !utf8term.isEmpty(); utf8term.chop(1))
for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ utf8term }); it != end; ++it)
if (definitionSet.emplace(it->definition).second)
definitions.push_back(QStringLiteral("<h3>%1 (%3/%4)</h3>%2").arg(utf8term, it->definition));
std::unordered_set<const char*> foundDefinitions;
for (term = term.left(500); !term.isEmpty(); term.chop(1))
for (const auto& [rootTerm, definition, inflections] : LookupDefinitions(term, foundDefinitions))
definitions.push_back(
QStringLiteral("<h3>%1 (%5/%6)</h3><small>%2 %3</small><p>%4</p>").arg(
term,
rootTerm.split("<<")[0],
inflections.join(""),
definition
)
);
for (int i = 0; i < definitions.size(); ++i) definitions[i] = definitions[i].arg(i + 1).arg(definitions.size());
ShowDefinition();
}
@ -403,6 +419,29 @@ private:
QString term;
private:
struct LookupResult
{
QString term;
QString definition;
QStringList inflectionsUsed;
};
std::vector<LookupResult> LookupDefinitions(QString term, std::unordered_set<const char*>& foundDefinitions, QStringList inflectionsUsed = {})
{
std::vector<LookupResult> results;
for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ term.toUtf8() }); it != end; ++it)
if (foundDefinitions.emplace(it->definition).second)
results.push_back({ term, it->definition, inflectionsUsed });
for (const auto& inflection : inflections) if (auto match = inflection.inflectsTo.match(term); match.hasMatch())
{
QStringList currentInflectionsUsed = inflectionsUsed;
currentInflectionsUsed.push_front(inflection.name);
QString root = inflection.root;
for (int i = 0; i < root.size(); ++i) if (root[i].isDigit()) root.replace(i, 1, match.captured(root[i].unicode() - '0'));
for (const auto& definition : LookupDefinitions(root, foundDefinitions, currentInflectionsUsed)) results.push_back(definition);
}
return results;
}
void wheelEvent(QWheelEvent* event) override
{
int scroll = event->angleDelta().y();
@ -411,6 +450,14 @@ private:
ShowDefinition();
}
struct Inflection
{
QString root;
QRegularExpression inflectsTo;
QString name;
};
std::vector<Inflection> inflections;
std::filesystem::file_time_type dictionaryFileLastWrite;
std::vector<char> charStorage;
std::vector<QString> definitions;

View File

@ -17,10 +17,10 @@ class Trie
public:
Trie(const std::istream& replacementScript)
{
BlockMarkupIterator replacementScriptParser(replacementScript.rdbuf(), Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
BlockMarkupIterator replacementScriptParser(replacementScript, Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
while (auto read = replacementScriptParser.Next())
{
const auto& [original, replacement] = *read;
const auto& [original, replacement] = read.value();
Node* current = &root;
for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch);
if (current != &root)
@ -103,7 +103,7 @@ BOOL WINAPI DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved
if (trie.Empty())
{
auto file = std::ofstream(REPLACE_SAVE_FILE, std::ios::binary) << "\xff\xfe";
for (auto ch : std::wstring_view(REPLACER_INSTRUCTIONS)) file << (ch == L'\n' ? std::string_view("\r\0\n", 4) : std::string_view((char*)&ch, 2));
for (auto ch : std::wstring_view(REPLACER_INSTRUCTIONS)) file << (ch == L'\n' ? std::string_view("\r\0\n", 4) : std::string_view((char*)&ch, 2));
_spawnlp(_P_DETACH, "notepad", "notepad", REPLACE_SAVE_FILE, NULL); // show file to user
}
}

View File

@ -128,10 +128,16 @@ const char* DICTIONARY_INSTRUCTIONS = u8R"(This file is used only for the "Dicti
It uses a custom format specific to Textractor and is not meant to be written manually.
You should look for a dictionary in this format online (https://github.com/Artikash/Textractor-Dictionaries/releases is a good place to start).
Alternatively, if you're a programmer, you can write a script to convert a dictionary from another format with the info below.
Once you have a dictionary, to look up some text in Extra Window, hover over it. All matching definitions will be shown. Scroll to change definitions.
Definitions are formatted like this:|TERM|Hola|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
The definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
All text in this file outside of a definition is ignored.
Once you have a dictionary, to look up some text in Extra Window, hover over it. You can scroll through all the matching definitions.
Definitions are formatted like this:|TERM|Hola<<ignored|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
The term and definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
Inflections are formatted like this:|ROOT|1<<noun|INFLECTS TO|(\w*)s|NAME| plural|END|
Textractor will check if a term matches the inflection regex and if it does, will recursively search for the root term.
The root term is generated by replacing each number with the corresponding regex capture group (with 0 being replaced by the entire match).
This process can easily result in infinite loops and/or stack overflows. It's your job to avoid that.
Inflection regex uses QRegularExpression (https://doc.qt.io/qt-5/qregularexpression.html) unicode syntax.
Textractor will display the final root term as well as all inflections used to get to that root term.
However, the text in a term after << is ignored when displaying. This is intended to store part-of-speech information.
This file must be encoded in UTF-8.)";
const char* SHOW_ORIGINAL = u8"Original text";
const char* SHOW_ORIGINAL_INFO = u8R"(Original text will not be shown