mirror of
https://github.com/Artikash/Textractor.git
synced 2024-12-24 01:14:12 +08:00
implement deinflection
This commit is contained in:
parent
dc48f2a3c8
commit
565f99cced
@ -2,9 +2,11 @@
|
|||||||
#include "extension.h"
|
#include "extension.h"
|
||||||
#include "ui_extrawindow.h"
|
#include "ui_extrawindow.h"
|
||||||
#include "defs.h"
|
#include "defs.h"
|
||||||
|
#include "util.h"
|
||||||
#include "blockmarkup.h"
|
#include "blockmarkup.h"
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
|
#include <QRegularExpression>
|
||||||
#include <QColorDialog>
|
#include <QColorDialog>
|
||||||
#include <QFontDialog>
|
#include <QFontDialog>
|
||||||
#include <QMenu>
|
#include <QMenu>
|
||||||
@ -260,13 +262,8 @@ private:
|
|||||||
if (ui.display->text().mid(i) == dictionaryWindow.term) return dictionaryWindow.ShowDefinition();
|
if (ui.display->text().mid(i) == dictionaryWindow.term) return dictionaryWindow.ShowDefinition();
|
||||||
dictionaryWindow.ui.display->setFixedWidth(ui.display->width() * 3 / 4);
|
dictionaryWindow.ui.display->setFixedWidth(ui.display->width() * 3 / 4);
|
||||||
dictionaryWindow.setTerm(ui.display->text().mid(i));
|
dictionaryWindow.setTerm(ui.display->text().mid(i));
|
||||||
int home = i == 0 ? 0 : textPositionMap[i - 1].x(), away = textPositionMap[i].x(), x = 0;
|
int left = i == 0 ? 0 : textPositionMap[i - 1].x(), right = textPositionMap[i].x(),
|
||||||
if (textPositionMap[i].x() > ui.display->width() / 2)
|
x = textPositionMap[i].x() > ui.display->width() / 2 ? -dictionaryWindow.width() + (right * 3 + left) / 4 : (left * 3 + right) / 4;
|
||||||
{
|
|
||||||
std::swap(home, away);
|
|
||||||
x -= dictionaryWindow.width();
|
|
||||||
}
|
|
||||||
x += (home * 3 + away) / 4;
|
|
||||||
dictionaryWindow.move(ui.display->mapToGlobal(QPoint(x, textPositionMap[i].y())));
|
dictionaryWindow.move(ui.display->mapToGlobal(QPoint(x, textPositionMap[i].y())));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,7 +351,7 @@ private:
|
|||||||
BlockMarkupIterator savedDictionary(stream, Array<std::string_view>{ "|TERM|", "|DEFINITION|" });
|
BlockMarkupIterator savedDictionary(stream, Array<std::string_view>{ "|TERM|", "|DEFINITION|" });
|
||||||
while (auto read = savedDictionary.Next())
|
while (auto read = savedDictionary.Next())
|
||||||
{
|
{
|
||||||
const auto& [terms, definition] = *read;
|
const auto& [terms, definition] = read.value();
|
||||||
auto storedDefinition = StoreCopy(definition);
|
auto storedDefinition = StoreCopy(definition);
|
||||||
std::string_view termsView = terms;
|
std::string_view termsView = terms;
|
||||||
size_t start = 0, end = termsView.find("|TERM|");
|
size_t start = 0, end = termsView.find("|TERM|");
|
||||||
@ -366,7 +363,20 @@ private:
|
|||||||
}
|
}
|
||||||
dictionary.push_back(DictionaryEntry{ StoreCopy(termsView.substr(start)), storedDefinition });
|
dictionary.push_back(DictionaryEntry{ StoreCopy(termsView.substr(start)), storedDefinition });
|
||||||
}
|
}
|
||||||
std::sort(dictionary.begin(), dictionary.end());
|
std::stable_sort(dictionary.begin(), dictionary.end());
|
||||||
|
|
||||||
|
inflections.clear();
|
||||||
|
stream.seekg(0);
|
||||||
|
BlockMarkupIterator savedInflections(stream, Array<std::string_view>{ "|ROOT|", "|INFLECTS TO|", "|NAME|" });
|
||||||
|
while (auto read = savedInflections.Next())
|
||||||
|
{
|
||||||
|
const auto& [root, inflectsTo, name] = read.value();
|
||||||
|
if (!inflections.emplace_back(Inflection{
|
||||||
|
S(root),
|
||||||
|
QRegularExpression(QRegularExpression::anchoredPattern(S(inflectsTo)), QRegularExpression::UseUnicodePropertiesOption),
|
||||||
|
S(name)
|
||||||
|
}).inflectsTo.isValid()) TEXTRACTOR_MESSAGE(L"Invalid regex: %s", StringToWideString(inflectsTo));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void setTerm(QString term)
|
void setTerm(QString term)
|
||||||
@ -375,11 +385,17 @@ private:
|
|||||||
UpdateDictionary();
|
UpdateDictionary();
|
||||||
definitions.clear();
|
definitions.clear();
|
||||||
definitionIndex = 0;
|
definitionIndex = 0;
|
||||||
std::unordered_set<std::string_view> definitionSet;
|
std::unordered_set<const char*> foundDefinitions;
|
||||||
for (QByteArray utf8term = term.left(500).toUtf8(); !utf8term.isEmpty(); utf8term.chop(1))
|
for (term = term.left(500); !term.isEmpty(); term.chop(1))
|
||||||
for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ utf8term }); it != end; ++it)
|
for (const auto& [rootTerm, definition, inflections] : LookupDefinitions(term, foundDefinitions))
|
||||||
if (definitionSet.emplace(it->definition).second)
|
definitions.push_back(
|
||||||
definitions.push_back(QStringLiteral("<h3>%1 (%3/%4)</h3>%2").arg(utf8term, it->definition));
|
QStringLiteral("<h3>%1 (%5/%6)</h3><small>%2 %3</small><p>%4</p>").arg(
|
||||||
|
term,
|
||||||
|
rootTerm.split("<<")[0],
|
||||||
|
inflections.join(""),
|
||||||
|
definition
|
||||||
|
)
|
||||||
|
);
|
||||||
for (int i = 0; i < definitions.size(); ++i) definitions[i] = definitions[i].arg(i + 1).arg(definitions.size());
|
for (int i = 0; i < definitions.size(); ++i) definitions[i] = definitions[i].arg(i + 1).arg(definitions.size());
|
||||||
ShowDefinition();
|
ShowDefinition();
|
||||||
}
|
}
|
||||||
@ -403,6 +419,29 @@ private:
|
|||||||
QString term;
|
QString term;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
// One dictionary hit produced by LookupDefinitions.
struct LookupResult
|
||||||
|
{
|
||||||
|
// The term that was actually looked up in the dictionary: either the text passed in,
// or a root term generated from it by one or more inflection rules.
QString term;
|
||||||
|
// The definition stored for that term in the dictionary.
QString definition;
|
||||||
|
// Names of the inflection rules applied to reach `term`; each deeper step is pushed to the front.
QStringList inflectionsUsed;
|
||||||
|
};
|
||||||
|
// Looks up `term` in the dictionary, then recursively looks up every root term that
// an inflection rule derives from it.
//
// term:             text to look up; converted to UTF-8 for the dictionary search.
// foundDefinitions: definitions that have already been reported. A definition is returned
//                   only the first time it is encountered and is then added to this set.
// inflectionsUsed:  names of the inflection rules already applied to reach `term`
//                   (empty for the original text).
//
// Returns the newly found definitions for `term` itself, followed by those found for each
// root produced by an inflection whose regex matches `term`.
std::vector<LookupResult> LookupDefinitions(QString term, std::unordered_set<const char*>& foundDefinitions, QStringList inflectionsUsed = {})
{
	// Inflection rules are user supplied and may chain forever (e.g. a rule whose root keeps
	// growing). Real inflection chains are short, so cap the depth instead of overflowing the stack.
	static constexpr int MAX_INFLECTION_DEPTH = 32;

	std::vector<LookupResult> results;
	for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ term.toUtf8() }); it != end; ++it)
		if (foundDefinitions.emplace(it->definition).second)
			results.push_back({ term, it->definition, inflectionsUsed });

	if (inflectionsUsed.size() >= MAX_INFLECTION_DEPTH) return results;

	for (const auto& inflection : inflections)
	{
		const auto match = inflection.inflectsTo.match(term);
		if (!match.hasMatch()) continue;

		// Build the root from the template in a single pass so that text inserted from a capture
		// group is never rescanned: digits inside the captured text must stay literal, and an empty
		// capture must not cause the next template character to be skipped.
		// Only ASCII 0-9 are group references; other Unicode digits (e.g. full-width) are copied as is.
		QString root;
		for (const QChar ch : inflection.root)
		{
			const int group = ch.unicode() - '0';
			if (group >= 0 && group <= 9) root += match.captured(group);
			else root += ch;
		}

		// A rule that maps the term to itself cannot find anything new (everything reachable
		// from `term` is being collected by this call already) and would only recurse endlessly.
		if (root == term) continue;

		QStringList currentInflectionsUsed = inflectionsUsed;
		currentInflectionsUsed.push_front(inflection.name);
		for (const auto& result : LookupDefinitions(root, foundDefinitions, currentInflectionsUsed)) results.push_back(result);
	}
	return results;
}
|
||||||
|
|
||||||
void wheelEvent(QWheelEvent* event) override
|
void wheelEvent(QWheelEvent* event) override
|
||||||
{
|
{
|
||||||
int scroll = event->angleDelta().y();
|
int scroll = event->angleDelta().y();
|
||||||
@ -411,6 +450,14 @@ private:
|
|||||||
ShowDefinition();
|
ShowDefinition();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// One inflection rule loaded from a |ROOT|...|INFLECTS TO|...|NAME|... block of the dictionary file.
struct Inflection
|
||||||
|
{
|
||||||
|
// Template for the root term: each digit in it is replaced by the corresponding capture group of `inflectsTo`.
QString root;
|
||||||
|
// Regex that an inflected term must match; built as an anchored pattern with UseUnicodePropertiesOption.
QRegularExpression inflectsTo;
|
||||||
|
// Name of the inflection, recorded in the list of inflections used when this rule is applied.
QString name;
|
||||||
|
};
|
||||||
|
std::vector<Inflection> inflections;
|
||||||
|
|
||||||
std::filesystem::file_time_type dictionaryFileLastWrite;
|
std::filesystem::file_time_type dictionaryFileLastWrite;
|
||||||
std::vector<char> charStorage;
|
std::vector<char> charStorage;
|
||||||
std::vector<QString> definitions;
|
std::vector<QString> definitions;
|
||||||
|
@ -17,10 +17,10 @@ class Trie
|
|||||||
public:
|
public:
|
||||||
Trie(const std::istream& replacementScript)
|
Trie(const std::istream& replacementScript)
|
||||||
{
|
{
|
||||||
BlockMarkupIterator replacementScriptParser(replacementScript.rdbuf(), Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
|
BlockMarkupIterator replacementScriptParser(replacementScript, Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
|
||||||
while (auto read = replacementScriptParser.Next())
|
while (auto read = replacementScriptParser.Next())
|
||||||
{
|
{
|
||||||
const auto& [original, replacement] = *read;
|
const auto& [original, replacement] = read.value();
|
||||||
Node* current = &root;
|
Node* current = &root;
|
||||||
for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch);
|
for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch);
|
||||||
if (current != &root)
|
if (current != &root)
|
||||||
|
14
text.cpp
14
text.cpp
@ -128,10 +128,16 @@ const char* DICTIONARY_INSTRUCTIONS = u8R"(This file is used only for the "Dicti
|
|||||||
It uses a custom format specific to Textractor and is not meant to be written manually.
|
It uses a custom format specific to Textractor and is not meant to be written manually.
|
||||||
You should look for a dictionary in this format online (https://github.com/Artikash/Textractor-Dictionaries/releases is a good place to start).
|
You should look for a dictionary in this format online (https://github.com/Artikash/Textractor-Dictionaries/releases is a good place to start).
|
||||||
Alternatively, if you're a programmer, you can write a script to convert a dictionary from another format with the info below.
|
Alternatively, if you're a programmer, you can write a script to convert a dictionary from another format with the info below.
|
||||||
Once you have a dictionary, to look up some text in Extra Window, hover over it. All matching definitions will be shown. Scroll to change definitions.
|
Once you have a dictionary, to look up some text in Extra Window, hover over it. You can scroll through all the matching definitions.
|
||||||
Definitions are formatted like this:|TERM|Hola|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
|
Definitions are formatted like this:|TERM|Hola<<ignored|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
|
||||||
The definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
|
The term and definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
|
||||||
All text in this file outside of a definition is ignored.
|
Inflections are formatted like this:|ROOT|1<<noun|INFLECTS TO|(\w*)s|NAME| plural|END|
|
||||||
|
Textractor will check if a term matches the inflection regex and if it does, will recursively search for the root term.
|
||||||
|
The root term is generated by replacing each number with the corresponding regex capture group (with 0 being replaced by the entire match).
|
||||||
|
This process can easily result in infinite loops and/or stack overflows. It's your job to avoid that.
|
||||||
|
Inflection regex uses QRegularExpression (https://doc.qt.io/qt-5/qregularexpression.html) unicode syntax.
|
||||||
|
Textractor will display the final root term as well as all inflections used to get to that root term.
|
||||||
|
However, the text in a term after << is ignored when displaying. This is intended to store part-of-speech information.
|
||||||
This file must be encoded in UTF-8.)";
|
This file must be encoded in UTF-8.)";
|
||||||
const char* SHOW_ORIGINAL = u8"Original text";
|
const char* SHOW_ORIGINAL = u8"Original text";
|
||||||
const char* SHOW_ORIGINAL_INFO = u8R"(Original text will not be shown
|
const char* SHOW_ORIGINAL_INFO = u8R"(Original text will not be shown
|
||||||
|
Loading…
x
Reference in New Issue
Block a user