mirror of
https://github.com/Artikash/Textractor.git
synced 2024-12-23 08:54:12 +08:00
implement deinflection
This commit is contained in:
parent
dc48f2a3c8
commit
565f99cced
@ -2,9 +2,11 @@
|
||||
#include "extension.h"
|
||||
#include "ui_extrawindow.h"
|
||||
#include "defs.h"
|
||||
#include "util.h"
|
||||
#include "blockmarkup.h"
|
||||
#include <fstream>
|
||||
#include <process.h>
|
||||
#include <QRegularExpression>
|
||||
#include <QColorDialog>
|
||||
#include <QFontDialog>
|
||||
#include <QMenu>
|
||||
@ -260,13 +262,8 @@ private:
|
||||
if (ui.display->text().mid(i) == dictionaryWindow.term) return dictionaryWindow.ShowDefinition();
|
||||
dictionaryWindow.ui.display->setFixedWidth(ui.display->width() * 3 / 4);
|
||||
dictionaryWindow.setTerm(ui.display->text().mid(i));
|
||||
int home = i == 0 ? 0 : textPositionMap[i - 1].x(), away = textPositionMap[i].x(), x = 0;
|
||||
if (textPositionMap[i].x() > ui.display->width() / 2)
|
||||
{
|
||||
std::swap(home, away);
|
||||
x -= dictionaryWindow.width();
|
||||
}
|
||||
x += (home * 3 + away) / 4;
|
||||
int left = i == 0 ? 0 : textPositionMap[i - 1].x(), right = textPositionMap[i].x(),
|
||||
x = textPositionMap[i].x() > ui.display->width() / 2 ? -dictionaryWindow.width() + (right * 3 + left) / 4 : (left * 3 + right) / 4;
|
||||
dictionaryWindow.move(ui.display->mapToGlobal(QPoint(x, textPositionMap[i].y())));
|
||||
}
|
||||
|
||||
@ -354,7 +351,7 @@ private:
|
||||
BlockMarkupIterator savedDictionary(stream, Array<std::string_view>{ "|TERM|", "|DEFINITION|" });
|
||||
while (auto read = savedDictionary.Next())
|
||||
{
|
||||
const auto& [terms, definition] = *read;
|
||||
const auto& [terms, definition] = read.value();
|
||||
auto storedDefinition = StoreCopy(definition);
|
||||
std::string_view termsView = terms;
|
||||
size_t start = 0, end = termsView.find("|TERM|");
|
||||
@ -366,7 +363,20 @@ private:
|
||||
}
|
||||
dictionary.push_back(DictionaryEntry{ StoreCopy(termsView.substr(start)), storedDefinition });
|
||||
}
|
||||
std::sort(dictionary.begin(), dictionary.end());
|
||||
std::stable_sort(dictionary.begin(), dictionary.end());
|
||||
|
||||
inflections.clear();
|
||||
stream.seekg(0);
|
||||
BlockMarkupIterator savedInflections(stream, Array<std::string_view>{ "|ROOT|", "|INFLECTS TO|", "|NAME|" });
|
||||
while (auto read = savedInflections.Next())
|
||||
{
|
||||
const auto& [root, inflectsTo, name] = read.value();
|
||||
if (!inflections.emplace_back(Inflection{
|
||||
S(root),
|
||||
QRegularExpression(QRegularExpression::anchoredPattern(S(inflectsTo)), QRegularExpression::UseUnicodePropertiesOption),
|
||||
S(name)
|
||||
}).inflectsTo.isValid()) TEXTRACTOR_MESSAGE(L"Invalid regex: %s", StringToWideString(inflectsTo));
|
||||
}
|
||||
}
|
||||
|
||||
void setTerm(QString term)
|
||||
@ -375,11 +385,17 @@ private:
|
||||
UpdateDictionary();
|
||||
definitions.clear();
|
||||
definitionIndex = 0;
|
||||
std::unordered_set<std::string_view> definitionSet;
|
||||
for (QByteArray utf8term = term.left(500).toUtf8(); !utf8term.isEmpty(); utf8term.chop(1))
|
||||
for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ utf8term }); it != end; ++it)
|
||||
if (definitionSet.emplace(it->definition).second)
|
||||
definitions.push_back(QStringLiteral("<h3>%1 (%3/%4)</h3>%2").arg(utf8term, it->definition));
|
||||
std::unordered_set<const char*> foundDefinitions;
|
||||
for (term = term.left(500); !term.isEmpty(); term.chop(1))
|
||||
for (const auto& [rootTerm, definition, inflections] : LookupDefinitions(term, foundDefinitions))
|
||||
definitions.push_back(
|
||||
QStringLiteral("<h3>%1 (%5/%6)</h3><small>%2 %3</small><p>%4</p>").arg(
|
||||
term,
|
||||
rootTerm.split("<<")[0],
|
||||
inflections.join(""),
|
||||
definition
|
||||
)
|
||||
);
|
||||
for (int i = 0; i < definitions.size(); ++i) definitions[i] = definitions[i].arg(i + 1).arg(definitions.size());
|
||||
ShowDefinition();
|
||||
}
|
||||
@ -403,6 +419,29 @@ private:
|
||||
QString term;
|
||||
|
||||
private:
|
||||
// One definition found by LookupDefinitions, together with how it was reached.
struct LookupResult
|
||||
{
|
||||
QString term; // the term that was looked up in the dictionary (the original text or a root generated from an inflection)
|
||||
QString definition; // definition of the matching dictionary entry
|
||||
QStringList inflectionsUsed; // names of the inflections applied to reach `term`; the most recently applied one is at the front
|
||||
};
|
||||
// Collects every definition reachable from `term`.
//
// First the dictionary entries that compare equal to `term` are added. Then, for every inflection whose
// regex matches `term`, the root term is generated from the inflection's root template and looked up
// recursively, with the inflection's name prepended to the list of inflections used.
//
// term:             text to look up.
// foundDefinitions: definition pointers that were already reported; a definition is only returned if it
//                   could be inserted here, so the same stored definition is never returned twice. Shared
//                   across the whole recursion (and by the caller across calls).
// inflectionsUsed:  names of the inflections that were applied to arrive at `term` (empty for the top-level call).
//
// Returns the results in discovery order: direct matches first, then matches found through each inflection.
//
// NOTE: recursion depth is not limited; inflection rules that keep matching their own output recurse
// without bound.
std::vector<LookupResult> LookupDefinitions(QString term, std::unordered_set<const char*>& foundDefinitions, QStringList inflectionsUsed = {})
{
	std::vector<LookupResult> results;

	// std::equal_range needs `dictionary` to be sorted.
	for (auto [it, end] = std::equal_range(dictionary.begin(), dictionary.end(), DictionaryEntry{ term.toUtf8() }); it != end; ++it)
		if (foundDefinitions.emplace(it->definition).second)
			results.push_back({ term, it->definition, inflectionsUsed });

	for (const auto& inflection : inflections) if (auto match = inflection.inflectsTo.match(term); match.hasMatch())
	{
		QStringList currentInflectionsUsed = inflectionsUsed;
		currentInflectionsUsed.push_front(inflection.name);

		// Build the root in a single pass over the template instead of replacing in place:
		// captured text is copied verbatim and never re-scanned, so digits inside a capture
		// are not substituted a second time (which could corrupt the root or never terminate).
		QString root;
		for (const QChar ch : inflection.root)
		{
			const auto code = ch.unicode();
			// Only ASCII 0-9 name a capture group. QChar::isDigit() would also accept other Unicode
			// digits (e.g. full-width ones), for which `code - '0'` is not a meaningful group index.
			if (code >= '0' && code <= '9') root += match.captured(code - '0');
			else root += ch;
		}

		const std::vector<LookupResult> rootResults = LookupDefinitions(root, foundDefinitions, currentInflectionsUsed);
		results.insert(results.end(), rootResults.begin(), rootResults.end());
	}
	return results;
}
|
||||
|
||||
void wheelEvent(QWheelEvent* event) override
|
||||
{
|
||||
int scroll = event->angleDelta().y();
|
||||
@ -411,6 +450,14 @@ private:
|
||||
ShowDefinition();
|
||||
}
|
||||
|
||||
// One inflection rule, used by LookupDefinitions to turn an inflected term back into a root term.
struct Inflection
|
||||
{
|
||||
QString root; // template for the root term: each digit character is replaced by the correspondingly numbered capture of the inflectsTo match
|
||||
QRegularExpression inflectsTo; // regex a term is matched against to decide whether this rule applies
|
||||
QString name; // label added to the front of the inflectionsUsed list for results found through this rule
|
||||
};
|
||||
std::vector<Inflection> inflections;
|
||||
|
||||
std::filesystem::file_time_type dictionaryFileLastWrite;
|
||||
std::vector<char> charStorage;
|
||||
std::vector<QString> definitions;
|
||||
|
@ -17,10 +17,10 @@ class Trie
|
||||
public:
|
||||
Trie(const std::istream& replacementScript)
|
||||
{
|
||||
BlockMarkupIterator replacementScriptParser(replacementScript.rdbuf(), Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
|
||||
BlockMarkupIterator replacementScriptParser(replacementScript, Array<std::wstring_view>{ L"|ORIG|", L"|BECOMES|" });
|
||||
while (auto read = replacementScriptParser.Next())
|
||||
{
|
||||
const auto& [original, replacement] = *read;
|
||||
const auto& [original, replacement] = read.value();
|
||||
Node* current = &root;
|
||||
for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch);
|
||||
if (current != &root)
|
||||
|
14
text.cpp
14
text.cpp
@ -128,10 +128,16 @@ const char* DICTIONARY_INSTRUCTIONS = u8R"(This file is used only for the "Dicti
|
||||
It uses a custom format specific to Textractor and is not meant to be written manually.
|
||||
You should look for a dictionary in this format online (https://github.com/Artikash/Textractor-Dictionaries/releases is a good place to start).
|
||||
Alternatively, if you're a programmer, you can write a script to convert a dictionary from another format with the info below.
|
||||
Once you have a dictionary, to look up some text in Extra Window, hover over it. All matching definitions will be shown. Scroll to change definitions.
|
||||
Definitions are formatted like this:|TERM|Hola|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
|
||||
The definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
|
||||
All text in this file outside of a definition is ignored.
|
||||
Once you have a dictionary, to look up some text in Extra Window, hover over it. You can scroll through all the matching definitions.
|
||||
Definitions are formatted like this:|TERM|Hola<<ignored|TERM|hola|TERM|Bonjour|TERM|bonjour|DEFINITION|hello|END|
|
||||
The term and definition can include rich text (https://doc.qt.io/qt-5/richtext-html-subset.html) which will be formatted properly.
|
||||
Inflections are formatted like this:|ROOT|1<<noun|INFLECTS TO|(\w*)s|NAME| plural|END|
|
||||
Textractor will check if a term matches the inflection regex and if it does, will recursively search for the root term.
|
||||
The root term is generated by replacing each number with the corresponding regex capture group (with 0 being replaced by the entire match).
|
||||
This process can easily result in infinite loops and/or stack overflows. It's your job to avoid that.
|
||||
Inflection regex uses QRegularExpression (https://doc.qt.io/qt-5/qregularexpression.html) unicode syntax.
|
||||
Textractor will display the final root term as well as all inflections used to get to that root term.
|
||||
However, the text in a term after << is ignored when displaying. This is intended to store part-of-speech information.
|
||||
This file must be encoded in UTF-8.)";
|
||||
const char* SHOW_ORIGINAL = u8"Original text";
|
||||
const char* SHOW_ORIGINAL_INFO = u8R"(Original text will not be shown
|
||||
|
Loading…
x
Reference in New Issue
Block a user