implement deinflection

This commit is contained in:
Akash Mozumdar 2020-02-26 00:59:47 -07:00
parent dc48f2a3c8
commit 565f99cced
3 changed files with 74 additions and 21 deletions

View File

@ -2,9 +2,11 @@
#include "extension.h" #include "extension.h"
#include "ui_extrawindow.h" #include "ui_extrawindow.h"
#include "defs.h" #include "defs.h"
#include "util.h"
#include "blockmarkup.h" #include "blockmarkup.h"
#include <fstream> #include <fstream>
#include <process.h> #include <process.h>
#include <QRegularExpression>
#include <QColorDialog> #include <QColorDialog>
#include <QFontDialog> #include <QFontDialog>
#include <QMenu> #include <QMenu>
@ -260,13 +262,8 @@ private:
if (ui.display->text().mid(i) == dictionaryWindow.term) return dictionaryWindow.ShowDefinition(); if (ui.display->text().mid(i) == dictionaryWindow.term) return dictionaryWindow.ShowDefinition();
dictionaryWindow.ui.display->setFixedWidth(ui.display->width() * 3 / 4); dictionaryWindow.ui.display->setFixedWidth(ui.display->width() * 3 / 4);
dictionaryWindow.setTerm(ui.display->text().mid(i)); dictionaryWindow.setTerm(ui.display->text().mid(i));
int home = i == 0 ? 0 : textPositionMap[i - 1].x(), away = textPositionMap[i].x(), x = 0; int left = i == 0 ? 0 : textPositionMap[i - 1].x(), right = textPositionMap[i].x(),
if (textPositionMap[i].x() > ui.display->width() / 2) x = textPositionMap[i].x() > ui.display->width() / 2 ? -dictionaryWindow.width() + (right * 3 + left) / 4 : (left * 3 + right) / 4;
{
std::swap(home, away);
x -= dictionaryWindow.width();
}
x += (home * 3 + away) / 4;
dictionaryWindow.move(ui.display->mapToGlobal(QPoint(x, textPositionMap[i].y()))); dictionaryWindow.move(ui.display->mapToGlobal(QPoint(x, textPositionMap[i].y())));
} }
@ -354,7 +351,7 @@ private:
BlockMarkupIterator savedDictionary(stream, Array<std::string_view>{ "|TERM|", "|DEFINITION|" }); BlockMarkupIterator savedDictionary(stream, Array<std::string_view>{ "|TERM|", "|DEFINITION|" });
while (auto read = savedDictionary.Next()) while (auto read = savedDictionary.Next())
{ {
const auto& [terms, definition] = *read; const auto& [terms, definition] = read.value();
auto storedDefinition = StoreCopy(definition); auto storedDefinition = StoreCopy(definition);
std::string_view termsView = terms; std::string_view termsView = terms;
size_t start = 0, end = termsView.find("|TERM|"); size_t start = 0, end = termsView.find("|TERM|");
@ -366,7 +363,20 @@ private:
} }
dictionary.push_back(DictionaryEntry{ StoreCopy(termsView.substr(start)), storedDefinition }); dictionary.push_back(DictionaryEntry{ StoreCopy(termsView.substr(start)), storedDefinition });
} }
std::sort(dictionary.begin(), dictionary.end()); std::stable_sort(dictionary.begin(), dictionary.end());
inflections.clear();
stream.seekg(0);
BlockMarkupIterator savedInflections(stream, Array<std::string_view>{ "|ROOT|", "|INFLECTS TO|", "|NAME|" });
while (auto read = savedInflections.Next())
{
const auto& [root, inflectsTo, name] = read.value();
if (!inflections.emplace_back(Inflection{
S(root),
QRegularExpression(QRegularExpression::anchoredPattern(S(inflectsTo)), QRegularExpression::UseUnicodePropertiesOption),
S(name)
}).inflectsTo.isValid()) TEXTRACTOR_MESSAGE(L"Invalid regex: %s", StringToWideString(inflectsTo));
}
} }
void setTerm(QString term) void setTerm(QString term)
@ -375,11 +385,17 @@ private:
UpdateDictionary(); UpdateDictionary();
definitions.clear(); definitions.clear();
definitionIndex = 0; definitionIndex = 0;
std::unordered_set<std::string_view> definitionSet; std::unordered_set<const char*> foundDefinitions;
for (QByteArray utf8term = term.left(500).toUtf8(); !utf8term.isEmpty(); utf8term.chop(1)) for (term = term.left(500); !term.isEmpty(); term.chop(1))
for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ utf8term }); it != end; ++it) for (const auto& [rootTerm, definition, inflections] : LookupDefinitions(term, foundDefinitions))
if (definitionSet.emplace(it->definition).second) definitions.push_back(
definitions.push_back(QStringLiteral("<h3>%1 (%3/%4)</h3>%2").arg(utf8term, it->definition)); QStringLiteral("<h3>%1 (%5/%6)</h3><small>%2 %3</small><p>%4</p>").arg(
term,
rootTerm.split("<<")[0],
inflections.join(""),
definition
)
);
for (int i = 0; i < definitions.size(); ++i) definitions[i] = definitions[i].arg(i + 1).arg(definitions.size()); for (int i = 0; i < definitions.size(); ++i) definitions[i] = definitions[i].arg(i + 1).arg(definitions.size());
ShowDefinition(); ShowDefinition();
} }
@ -403,6 +419,29 @@ private:
QString term; QString term;
private: private:
// One definition produced by LookupDefinitions, together with how it was reached.
struct LookupResult
{
// The term the definition was found under (the deinflected root when inflections were applied).
QString term;
// The definition text stored in the dictionary for that term.
QString definition;
// Names of the inflections undone to reach `term`; each newly applied inflection is pushed to the front.
QStringList inflectionsUsed;
};
/**
 * Collects every not-yet-seen definition of `term`, both from a direct dictionary
 * lookup and by recursively undoing inflections.
 *
 * @param term             The text to look up.
 * @param foundDefinitions Definitions already reported (keyed by pointer to the stored definition);
 *                         updated in place so that the same definition is never returned twice.
 * @param inflectionsUsed  Names of the inflections undone so far to arrive at `term`.
 * @return Direct matches for `term` first, followed by the matches found for each deinflected root.
 *
 * NOTE: nothing bounds the recursion; inflection rules that keep matching their own
 * output will recurse until the stack is exhausted.
 */
std::vector<LookupResult> LookupDefinitions(QString term, std::unordered_set<const char*>& foundDefinitions, QStringList inflectionsUsed = {})
{
	std::vector<LookupResult> results;
	for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ term.toUtf8() }); it != end; ++it)
		if (foundDefinitions.emplace(it->definition).second)
			results.push_back({ term, it->definition, inflectionsUsed });
	for (const auto& inflection : inflections)
	{
		const auto match = inflection.inflectsTo.match(term);
		if (!match.hasMatch()) continue;
		QStringList currentInflectionsUsed = inflectionsUsed;
		currentInflectionsUsed.push_front(inflection.name);
		// Expand the root template into a separate string. Substituting in place while scanning
		// would re-expand digits that came from the captured text and would skip the character
		// following an empty capture.
		QString root;
		for (const QChar ch : inflection.root)
		{
			const auto code = ch.unicode();
			// Only ASCII digits name capture groups (0 = entire match); QChar::isDigit() would also
			// accept other Unicode digits, for which `code - '0'` is not a valid group index.
			if (code >= '0' && code <= '9') root += match.captured(code - '0');
			else root += ch;
		}
		for (const auto& definition : LookupDefinitions(root, foundDefinitions, currentInflectionsUsed)) results.push_back(definition);
	}
	return results;
}
void wheelEvent(QWheelEvent* event) override void wheelEvent(QWheelEvent* event) override
{ {
int scroll = event->angleDelta().y(); int scroll = event->angleDelta().y();
@ -411,6 +450,14 @@ private:
ShowDefinition(); ShowDefinition();
} }
// A deinflection rule: when a term matches `inflectsTo`, a root term is generated from `root`.
struct Inflection
{
// Template for the root term; digits in it are replaced with the corresponding regex capture group.
QString root;
// Regex a term is matched against to decide whether this rule applies.
QRegularExpression inflectsTo;
// Name of the inflection, recorded in the list of inflections used for a lookup result.
QString name;
};
// All inflection rules currently known; consulted in order by LookupDefinitions.
std::vector<Inflection> inflections;
std::filesystem::file_time_type dictionaryFileLastWrite; std::filesystem::file_time_type dictionaryFileLastWrite;
std::vector<char> charStorage; std::vector<char> charStorage;
std::vector<QString> definitions; std::vector<QString> definitions;

View File

@ -17,10 +17,10 @@ class Trie
public: public:
Trie(const std::istream& replacementScript) Trie(const std::istream& replacementScript)
{ {
BlockMarkupIterator replacementScriptParser(replacementScript.rdbuf(), Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" }); BlockMarkupIterator replacementScriptParser(replacementScript, Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
while (auto read = replacementScriptParser.Next()) while (auto read = replacementScriptParser.Next())
{ {
const auto& [original, replacement] = *read; const auto& [original, replacement] = read.value();
Node* current = &root; Node* current = &root;
for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch); for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch);
if (current != &root) if (current != &root)
@ -103,7 +103,7 @@ BOOL WINAPI DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved
if (trie.Empty()) if (trie.Empty())
{ {
auto file = std::ofstream(REPLACE_SAVE_FILE, std::ios::binary) << "\xff\xfe"; auto file = std::ofstream(REPLACE_SAVE_FILE, std::ios::binary) << "\xff\xfe";
for (auto ch : std::wstring_view(REPLACER_INSTRUCTIONS)) file << (ch == L'\n' ? std::string_view("\r\0\n", 4) : std::string_view((char*)&ch, 2)); for (auto ch : std::wstring_view(REPLACER_INSTRUCTIONS)) file << (ch == L'\n' ? std::string_view("\r\0\n", 4) : std::string_view((char*)&ch, 2));
_spawnlp(_P_DETACH, "notepad", "notepad", REPLACE_SAVE_FILE, NULL); // show file to user _spawnlp(_P_DETACH, "notepad", "notepad", REPLACE_SAVE_FILE, NULL); // show file to user
} }
} }

View File

@ -128,10 +128,16 @@ const char* DICTIONARY_INSTRUCTIONS = u8R"(This file is used only for the "Dicti
It uses a custom format specific to Textractor and is not meant to be written manually. It uses a custom format specific to Textractor and is not meant to be written manually.
You should look for a dictionary in this format online (https://github.com/Artikash/Textractor-Dictionaries/releases is a good place to start). You should look for a dictionary in this format online (https://github.com/Artikash/Textractor-Dictionaries/releases is a good place to start).
Alternatively, if you're a programmer, you can write a script to convert a dictionary from another format with the info below. Alternatively, if you're a programmer, you can write a script to convert a dictionary from another format with the info below.
Once you have a dictionary, to look up some text in Extra Window, hover over it. All matching definitions will be shown. Scroll to change definitions. Once you have a dictionary, to look up some text in Extra Window, hover over it. You can scroll through all the matching definitions.
Definitions are formatted like this:|TERM|Hola|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END| Definitions are formatted like this:|TERM|Hola<<ignored|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
The definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly. The term and definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
All text in this file outside of a definition is ignored. Inflections are formatted like this:|ROOT|1<<noun|INFLECTS TO|(\w*)s|NAME| plural|END|
Textractor will check if a term matches the inflection regex and if it does, will recursively search for the root term.
The root term is generated by replacing each number with the corresponding regex capture group (with 0 being replaced by the entire match).
This process can easily result in infinite loops and/or stack overflows. It's your job to avoid that.
Inflection regex uses QRegularExpression (https://doc.qt.io/qt-5/qregularexpression.html) unicode syntax.
Textractor will display the final root term as well as all inflections used to get to that root term.
However, the text in a term after << is ignored when displaying. This is intended to store part-of-speech information.
This file must be encoded in UTF-8.)"; This file must be encoded in UTF-8.)";
const char* SHOW_ORIGINAL = u8"Original text"; const char* SHOW_ORIGINAL = u8"Original text";
const char* SHOW_ORIGINAL_INFO = u8R"(Original text will not be shown const char* SHOW_ORIGINAL_INFO = u8R"(Original text will not be shown