mirror of
https://github.com/Artikash/Textractor.git
synced 2024-12-23 08:54:12 +08:00
add some classes
This commit is contained in:
parent
b0eeca5b36
commit
f890789a3b
10
extensions/blockmarkuplanguage.h
Normal file
10
extensions/blockmarkuplanguage.h
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
// Placeholder for an iterator over block markup language input.
// Currently it only holds a default-constructed stream-buffer iterator.
// NOTE(review): the template parameter C is not referenced yet and the
// iterator is fixed to char - confirm whether that is intentional.
template <typename C>
class BlockMarkupLanguageIterator
{
private:
	std::istreambuf_iterator<char> it{};
};
|
31
extensions/charstorage.h
Normal file
31
extensions/charstorage.h
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
// Append-only pool of null-terminated strings kept in one contiguous buffer.
// Strings are identified by integer handles (their offset into the buffer)
// rather than by pointer, because the buffer may be reallocated as it grows.
template <typename C>
class CharStorage
{
public:
	// capacity: number of characters to reserve up front (terminators included).
	CharStorage(size_t capacity = 0)
	{
		storage.reserve(capacity);
	}

	// Copies `string` plus its null terminator to the end of the buffer and
	// returns the handle to pass to Retrieve().
	int Store(const std::basic_string<C>& string)
	{
		// Take the offset BEFORE inserting. Computing it as
		// `storage.insert(...) - storage.begin()` leaves the evaluation order of
		// the two operands unspecified, so begin() could be read before the
		// insert reallocates and the subtraction would use an invalidated iterator.
		const size_t handle = storage.size();
		storage.insert(storage.end(), string.c_str(), string.c_str() + string.size() + 1);
		return static_cast<int>(handle);
	}

	// Asks the buffer to release capacity that is not in use.
	void FreeExcess()
	{
		storage.shrink_to_fit();
	}

	// Returns the null-terminated string stored under `handle`.
	// The pointer refers into the internal vector, so a later Store() or
	// FreeExcess() may invalidate it; keep the handle and call Retrieve() again.
	// `handle` is not range-checked.
	const C* Retrieve(int handle) const
	{
		return storage.data() + handle;
	}

private:
	std::vector<C> storage;
};
|
@ -2,6 +2,7 @@
|
|||||||
#include "extension.h"
|
#include "extension.h"
|
||||||
#include "ui_extrawindow.h"
|
#include "ui_extrawindow.h"
|
||||||
#include "defs.h"
|
#include "defs.h"
|
||||||
|
#include "blockmarkuplanguage.h"
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <process.h>
|
#include <process.h>
|
||||||
@ -281,15 +282,15 @@ private:
|
|||||||
catch (std::filesystem::filesystem_error) { return; }
|
catch (std::filesystem::filesystem_error) { return; }
|
||||||
|
|
||||||
dictionary.clear();
|
dictionary.clear();
|
||||||
owningStorage.clear();
|
definitionStorage.clear();
|
||||||
|
|
||||||
auto StoreCopy = [&](const std::string& string)
|
auto StoreCopy = [&](const std::string& string)
|
||||||
{
|
{
|
||||||
return &*owningStorage.insert(owningStorage.end(), string.c_str(), string.c_str() + string.size() + 1);
|
return &*definitionStorage.insert(definitionStorage.end(), string.c_str(), string.c_str() + string.size() + 1);
|
||||||
};
|
};
|
||||||
|
|
||||||
std::string savedDictionary(std::istreambuf_iterator(std::ifstream(DICTIONARY_SAVE_FILE)), {});
|
std::string savedDictionary(std::istreambuf_iterator(std::ifstream(DICTIONARY_SAVE_FILE)), {});
|
||||||
owningStorage.reserve(savedDictionary.size());
|
definitionStorage.reserve(savedDictionary.size());
|
||||||
for (size_t end = 0; ;)
|
for (size_t end = 0; ;)
|
||||||
{
|
{
|
||||||
size_t term = savedDictionary.find("|TERM|", end);
|
size_t term = savedDictionary.find("|TERM|", end);
|
||||||
@ -300,14 +301,6 @@ private:
|
|||||||
dictionary.push_back({ StoreCopy(savedDictionary.substr(term + 6, next - term - 6)), storedDefinition });
|
dictionary.push_back({ StoreCopy(savedDictionary.substr(term + 6, next - term - 6)), storedDefinition });
|
||||||
dictionary.push_back({ StoreCopy(savedDictionary.substr(term + 6, definition - term - 6)), storedDefinition });
|
dictionary.push_back({ StoreCopy(savedDictionary.substr(term + 6, definition - term - 6)), storedDefinition });
|
||||||
}
|
}
|
||||||
auto oldData = owningStorage.data();
|
|
||||||
owningStorage.shrink_to_fit();
|
|
||||||
dictionary.shrink_to_fit();
|
|
||||||
for (auto& [term, definition] : dictionary)
|
|
||||||
{
|
|
||||||
term += owningStorage.data() - oldData;
|
|
||||||
definition += owningStorage.data() - oldData;
|
|
||||||
}
|
|
||||||
std::sort(dictionary.begin(), dictionary.end());
|
std::sort(dictionary.begin(), dictionary.end());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -354,7 +347,7 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::filesystem::file_time_type dictionaryFileLastWrite;
|
std::filesystem::file_time_type dictionaryFileLastWrite;
|
||||||
std::vector<char> owningStorage;
|
std::vector<char> definitionStorage;
|
||||||
std::vector<QString> definitions;
|
std::vector<QString> definitions;
|
||||||
int definitionIndex;
|
int definitionIndex;
|
||||||
} dictionaryWindow;
|
} dictionaryWindow;
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
#include "extension.h"
|
#include "extension.h"
|
||||||
|
#include "trie.h"
|
||||||
|
#include "charstorage.h"
|
||||||
#include <cwctype>
|
#include <cwctype>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
@ -11,18 +13,14 @@ constexpr auto REPLACE_SAVE_FILE = u8"SavedReplacements.txt";
|
|||||||
std::atomic<std::filesystem::file_time_type> replaceFileLastWrite = {};
|
std::atomic<std::filesystem::file_time_type> replaceFileLastWrite = {};
|
||||||
std::shared_mutex m;
|
std::shared_mutex m;
|
||||||
|
|
||||||
class Trie
|
class ReplacementTrie
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Trie(std::unordered_map<std::wstring, std::wstring> replacements)
|
ReplacementTrie(std::vector<std::pair<std::wstring, std::wstring>> replacements)
|
||||||
{
|
{
|
||||||
for (const auto& [original, replacement] : replacements)
|
for (auto& [original, replacement] : replacements)
|
||||||
{
|
if (!original.empty())
|
||||||
Node* current = &root;
|
trie.Insert(std::wstring_view(original.c_str(), std::remove_if(original.begin(), original.end(), Ignore) - original.begin()))->SetValue(storage.Store(replacement));
|
||||||
for (auto ch : original) if (!Ignore(ch)) current = Next(current, ch);
|
|
||||||
if (current != &root)
|
|
||||||
current->value = owningStorage.insert(owningStorage.end(), replacement.c_str(), replacement.c_str() + replacement.size() + 1) - owningStorage.begin();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::wstring Replace(const std::wstring& sentence) const
|
std::wstring Replace(const std::wstring& sentence) const
|
||||||
@ -33,17 +31,23 @@ public:
|
|||||||
std::wstring_view replacement(sentence.c_str() + i, 1);
|
std::wstring_view replacement(sentence.c_str() + i, 1);
|
||||||
int originalLength = 1;
|
int originalLength = 1;
|
||||||
|
|
||||||
const Node* current = &root;
|
auto current = trie.Root();
|
||||||
for (int j = i; current && j <= sentence.size(); ++j)
|
for (int j = i; current && j <= sentence.size(); ++j)
|
||||||
{
|
{
|
||||||
if (current->value >= 0)
|
if (const wchar_t* tail = current->Tail())
|
||||||
|
for (; j <= sentence.size() && *tail; ++j)
|
||||||
|
if (Ignore(sentence[j]));
|
||||||
|
else if (sentence[j] == *tail) ++tail;
|
||||||
|
else goto doneSearchingTrie;
|
||||||
|
if (int* value = current->Value())
|
||||||
{
|
{
|
||||||
replacement = owningStorage.data() + current->value;
|
replacement = storage.Retrieve(*value);
|
||||||
originalLength = j - i;
|
originalLength = j - i;
|
||||||
}
|
}
|
||||||
if (!Ignore(sentence[j])) current = Next(current, sentence[j]);
|
if (!Ignore(sentence[j])) current = trie.Next(current, sentence[j]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
doneSearchingTrie:
|
||||||
result += replacement;
|
result += replacement;
|
||||||
i += originalLength;
|
i += originalLength;
|
||||||
}
|
}
|
||||||
@ -52,7 +56,7 @@ public:
|
|||||||
|
|
||||||
bool Empty()
|
bool Empty()
|
||||||
{
|
{
|
||||||
return root.charMap.empty();
|
return trie.Root()->charMap.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -61,33 +65,19 @@ private:
|
|||||||
return ch <= 0x20 || std::iswspace(ch);
|
return ch <= 0x20 || std::iswspace(ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Node>
|
CharStorage<wchar_t> storage;
|
||||||
static Node* Next(Node* node, wchar_t ch)
|
Trie<wchar_t, int> trie;
|
||||||
{
|
|
||||||
auto it = std::lower_bound(node->charMap.begin(), node->charMap.end(), ch, [](const auto& one, auto two) { return one.first < two; });
|
|
||||||
if (it != node->charMap.end() && it->first == ch) return it->second.get();
|
|
||||||
if constexpr (!std::is_const_v<Node>) return node->charMap.insert(it, { ch, std::make_unique<Node>() })->second.get();
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct Node
|
|
||||||
{
|
|
||||||
std::vector<std::pair<wchar_t, std::unique_ptr<Node>>> charMap;
|
|
||||||
ptrdiff_t value = -1;
|
|
||||||
} root;
|
|
||||||
|
|
||||||
std::vector<wchar_t> owningStorage;
|
|
||||||
} trie = { {} };
|
} trie = { {} };
|
||||||
|
|
||||||
std::unordered_map<std::wstring, std::wstring> Parse(std::wstring_view replacementScript)
|
std::vector<std::pair<std::wstring, std::wstring>> Parse(std::wstring_view replacementScript)
|
||||||
{
|
{
|
||||||
std::unordered_map<std::wstring, std::wstring> replacements;
|
std::vector<std::pair<std::wstring, std::wstring>> replacements;
|
||||||
for (size_t end = 0; ;)
|
for (size_t end = 0; ;)
|
||||||
{
|
{
|
||||||
size_t original = replacementScript.find(L"|ORIG|", end);
|
size_t original = replacementScript.find(L"|ORIG|", end);
|
||||||
size_t becomes = replacementScript.find(L"|BECOMES|", original);
|
size_t becomes = replacementScript.find(L"|BECOMES|", original);
|
||||||
if ((end = replacementScript.find(L"|END|", becomes)) == std::wstring::npos) break;
|
if ((end = replacementScript.find(L"|END|", becomes)) == std::wstring::npos) break;
|
||||||
replacements[std::wstring(replacementScript.substr(original + 6, becomes - original - 6))] = replacementScript.substr(becomes + 9, end - becomes - 9);
|
replacements.emplace_back(replacementScript.substr(original + 6, becomes - original - 6), replacementScript.substr(becomes + 9, end - becomes - 9));
|
||||||
}
|
}
|
||||||
return replacements;
|
return replacements;
|
||||||
}
|
}
|
||||||
@ -99,7 +89,7 @@ void UpdateReplacements()
|
|||||||
if (replaceFileLastWrite.exchange(std::filesystem::last_write_time(REPLACE_SAVE_FILE)) == std::filesystem::last_write_time(REPLACE_SAVE_FILE)) return;
|
if (replaceFileLastWrite.exchange(std::filesystem::last_write_time(REPLACE_SAVE_FILE)) == std::filesystem::last_write_time(REPLACE_SAVE_FILE)) return;
|
||||||
std::vector<BYTE> file(std::istreambuf_iterator(std::ifstream(REPLACE_SAVE_FILE, std::ios::binary)), {});
|
std::vector<BYTE> file(std::istreambuf_iterator(std::ifstream(REPLACE_SAVE_FILE, std::ios::binary)), {});
|
||||||
std::scoped_lock l(m);
|
std::scoped_lock l(m);
|
||||||
trie = Trie(Parse({ (wchar_t*)file.data(), file.size() / sizeof(wchar_t) }));
|
trie = ReplacementTrie(Parse({ (wchar_t*)file.data(), file.size() / sizeof(wchar_t) }));
|
||||||
}
|
}
|
||||||
catch (std::filesystem::filesystem_error) { replaceFileLastWrite.store({}); }
|
catch (std::filesystem::filesystem_error) { replaceFileLastWrite.store({}); }
|
||||||
}
|
}
|
||||||
@ -146,7 +136,7 @@ And this text ツ
|
|||||||
assert(replacements.size() == 4);
|
assert(replacements.size() == 4);
|
||||||
std::wstring original = LR"(Don't replace this
|
std::wstring original = LR"(Don't replace this
|
||||||
さよなら バカ こんにちは delete this)";
|
さよなら バカ こんにちは delete this)";
|
||||||
std::wstring replaced = Trie(std::move(replacements)).Replace(original);
|
std::wstring replaced = ReplacementTrie(std::move(replacements)).Replace(original);
|
||||||
assert(replaced == L"Don't replace thisgoodbye idiot hello");
|
assert(replaced == L"Don't replace thisgoodbye idiot hello");
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
121
extensions/trie.h
Normal file
121
extensions/trie.h
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include <variant>
|
||||||
|
|
||||||
|
// Trie keyed by strings of C that stores an optional V per key.
//
// Two kinds of node exist:
//  - map node:  holds a sorted vector of (character, child) edges;
//  - tail node: a leaf holding the rest of a single key as one string, so a
//               long unshared suffix does not need one node per character.
// A tail node is converted into map nodes the first time a second key has to
// share part of its path.
template <typename C, typename V>
struct Trie
{
	struct Node
	{
		// Top two bits of packedValue say which union member is alive; the
		// remaining bits hold the V* (null when the node has no value).
		static constexpr uint64_t TAIL_FLAG = 1ULL << 63;
		static constexpr uint64_t MAP_FLAG = 1ULL << 62;
		static constexpr uint64_t FLAG_MASK = TAIL_FLAG | MAP_FLAG;

		union
		{
			std::basic_string<C> chars; // alive when TAIL_FLAG is set
			std::vector<std::pair<C, std::unique_ptr<Node>>> charMap; // alive when MAP_FLAG is set
		};
		uint64_t packedValue;

		// True when this node is a tail (leaf with a remaining suffix).
		bool IsTail() const
		{
			return (packedValue & TAIL_FLAG) != 0;
		}

		// Remaining suffix of a tail node, or nullptr for a map node.
		const C* Tail() const
		{
			return IsTail() ? chars.c_str() : nullptr;
		}

		// Value stored at this node, or nullptr when there is none.
		V* Value() const
		{
			return reinterpret_cast<V*>(packedValue & ~FLAG_MASK);
		}

		// Stores `value` at this node, overwriting any value already present.
		void SetValue(V value)
		{
			if (V* oldValue = Value())
			{
				*oldValue = std::move(value);
				return;
			}
			const uint64_t bits = reinterpret_cast<uint64_t>(new V(std::move(value)));
			// The packing scheme relies on heap pointers leaving the top two bits clear.
			assert((bits & FLAG_MASK) == 0);
			packedValue = (packedValue & FLAG_MASK) | bits;
		}

		// map == true creates an empty map node, otherwise an empty tail node.
		Node(bool map) :
			packedValue(map ? MAP_FLAG : TAIL_FLAG)
		{
			if (map) new (&charMap) decltype(charMap)();
			else new (&chars) decltype(chars)();
		}

		Node(const Node&) = delete;
		Node& operator=(const Node&) = delete;

		~Node()
		{
			delete Value();
			if (IsTail()) chars.~basic_string();
			else charMap.~vector();
		}
	};

	// Follows the edge labelled `ch` out of `node`.
	// - Tail nodes have no edges: returns nullptr.
	// - If the edge is missing and `node` is non-const, a child is created
	//   (a map node when makeMap is true, otherwise a tail node) and returned.
	// - If the edge is missing and `node` is const, returns nullptr.
	template <typename NodeT>
	static NodeT* Next(NodeT* node, C ch, bool makeMap = false)
	{
		if (node->IsTail()) return nullptr;
		auto it = std::lower_bound(node->charMap.begin(), node->charMap.end(), ch, [](const auto& edge, C wanted) { return edge.first < wanted; });
		if (it != node->charMap.end() && it->first == ch) return it->second.get();
		if constexpr (!std::is_const_v<NodeT>) return node->charMap.emplace(it, ch, std::make_unique<Node>(makeMap))->second.get();
		else return nullptr;
	}

	// Empties the suffix string of every tail node at or below `node`.
	// Values and the node structure itself are left in place.
	static void Clear(Node* node)
	{
		if (node->IsTail()) node->chars.clear();
		else for (auto& [_, child] : node->charMap) Clear(child.get());
	}

	std::unique_ptr<Node> root = std::make_unique<Node>(true);

	// Returns the node that represents `key`, creating it if necessary.
	// The caller normally follows up with SetValue() on the result.
	// An empty key yields the root.
	Node* Insert(std::basic_string_view<C> key)
	{
		Node* current = root.get();
		size_t consumed = 0;
		// Walk (and extend) map nodes. The node for the final character is
		// always created as a map node so that a key ending there never
		// collides with a longer key's tail.
		while (consumed < key.size() && !current->IsTail())
		{
			current = Next(current, key[consumed], consumed + 1 == key.size());
			++consumed;
		}
		if (!current->IsTail()) return current;

		const std::basic_string_view<C> remaining = key.substr(consumed);
		if (remaining == std::basic_string_view<C>(current->chars)) return current; // key already present
		if (current->chars.empty() && !current->Value())
		{
			// Tail node created a moment ago by Next(): park the rest of the key here.
			current->chars.assign(remaining.data(), remaining.size());
			return current;
		}

		// An existing tail has to share its path with the new key: turn it
		// into a map node and re-home both the old key and the new one.
		// The old value is taken out of the node first, otherwise the shared
		// prefix would appear to have a value of its own.
		std::basic_string<C> oldChars = std::move(current->chars);
		std::unique_ptr<V> oldValue(current->Value());
		current->chars.~basic_string();
		new (&current->charMap) decltype(current->charMap)();
		current->packedValue = Node::MAP_FLAG;

		size_t common = 0;
		while (common < oldChars.size() && common < remaining.size() && oldChars[common] == remaining[common])
		{
			current = Next(current, oldChars[common], true);
			++common;
		}

		// Re-home the old key. The edge consumes oldChars[common], so only the
		// characters after it belong in the relocated tail.
		Node* oldHome = current;
		if (common < oldChars.size())
		{
			oldHome = Next(current, oldChars[common], common + 1 == oldChars.size());
			if (oldHome->IsTail()) oldHome->chars.assign(oldChars, common + 1, std::basic_string<C>::npos);
		}
		if (oldValue) oldHome->SetValue(std::move(*oldValue));

		// Home for the new key, following the same edge/suffix convention.
		if (common == remaining.size()) return current;
		Node* newHome = Next(current, remaining[common], common + 1 == remaining.size());
		if (newHome->IsTail()) newHome->chars.assign(remaining.data() + common + 1, remaining.size() - common - 1);
		return newHome;
	}

	// Read-only access to the root node, for lookups via Next()/Tail()/Value().
	const Node* Root() const
	{
		return root.get();
	}

	// True when nothing has been inserted below the root.
	bool Empty() const
	{
		return root->charMap.empty();
	}
};
|
Loading…
x
Reference in New Issue
Block a user