From 162aaa07d16943cccbd73e6e7a730b7db161cf02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=81=8D=E5=85=AE=E6=83=9A=E5=85=AE?= <101191390+HIllya51@users.noreply.github.com> Date: Sat, 18 May 2024 14:16:58 +0800 Subject: [PATCH] fix --- .../LunaTranslator/cishu/cishubase.py | 3 +- LunaTranslator/LunaTranslator/cishu/mdict.py | 2465 +++++++++++++++++ .../LunaTranslator/gui/settingpage_cishu.py | 5 +- .../files/defaultconfig/config.json | 32 + 4 files changed, 2503 insertions(+), 2 deletions(-) create mode 100644 LunaTranslator/LunaTranslator/cishu/mdict.py diff --git a/LunaTranslator/LunaTranslator/cishu/cishubase.py b/LunaTranslator/LunaTranslator/cishu/cishubase.py index e4f8cb6c..44a27b2b 100644 --- a/LunaTranslator/LunaTranslator/cishu/cishubase.py +++ b/LunaTranslator/LunaTranslator/cishu/cishubase.py @@ -1,6 +1,6 @@ from myutils.config import globalconfig from myutils.wrapper import threader - +from traceback import print_exc class cishubase: def init(self): @@ -28,6 +28,7 @@ class cishubase: try: res = self.search(sentence) except: + print_exc() self.needinit = True if res and len(res): diff --git a/LunaTranslator/LunaTranslator/cishu/mdict.py b/LunaTranslator/LunaTranslator/cishu/mdict.py new file mode 100644 index 00000000..1e0b588b --- /dev/null +++ b/LunaTranslator/LunaTranslator/cishu/mdict.py @@ -0,0 +1,2465 @@ +import math + + +class FlexBuffer: + + def __init__(self): + + self.blockSize = None + self.c = None + self.l = None + self.buf = None + + def require(self, n): + + r = self.c - self.l + n + if r > 0: + self.l = self.l + self.blockSize * math.ceil(r / self.blockSize) + # tmp = bytearray(self.l) + # for i in len(self.buf): + # tmp[i] = self.buf[i] + # self.buf = tmp + self.buf = self.buf + bytearray(self.l - len(self.buf)) + self.c = self.c + n + return self.buf + + def alloc(self, initSize, blockSize): + + if blockSize: + sz = blockSize + else: + sz = 4096 + self.blockSize = self.roundUp(sz) + self.c = 0 + self.l = self.roundUp(initSize) | 0 + self.l += self.blockSize - (self.l % self.blockSize) + self.buf = bytearray(self.l) + return self.buf + + def roundUp(self, n): + + r = n % 4 + if r == 0: + return n + else: + return n + 4 - r + + def reset(self): + + self.c = 0 + self.l = len(self.buf) + + def pack(self, size): + + return self.buf[0:size] + + +def _decompress(inBuf, outBuf): + + c_top_loop = 1 + c_first_literal_run = 2 + c_match = 3 + c_copy_match = 4 + c_match_done = 5 + c_match_next = 6 + + out = outBuf.buf + op = 0 + ip = 0 + t = inBuf[ip] + state = c_top_loop + m_pos = 0 + ip_end = len(inBuf) + + if t > 17: + ip = ip + 1 + t = t - 17 + if t < 4: + state = c_match_next + else: + out = outBuf.require(t) + while True: + out[op] = inBuf[ip] + op = op + 1 + ip = ip + 1 + t = t - 1 + if not t > 0: + break + state = c_first_literal_run + + while True: + if_block = False + + ## + if state == c_top_loop: + t = inBuf[ip] + ip = ip + 1 + if t >= 16: + state = c_match + continue + if t == 0: + while inBuf[ip] == 0: + t = t + 255 + ip = ip + 1 + t = t + 15 + inBuf[ip] + ip = ip + 1 + + t = t + 3 + out = outBuf.require(t) + while True: + out[op] = inBuf[ip] + op = op + 1 + ip = ip + 1 + t = t - 1 + if not t > 0: + break + # emulate c switch + state = c_first_literal_run + + ## + if state == c_first_literal_run: + t = inBuf[ip] + ip = ip + 1 + if t >= 16: + state = c_match + continue + m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2) + ip = ip + 1 + out = outBuf.require(3) + out[op] = out[m_pos] + op = op + 1 + m_pos = m_pos + 1 + out[op] = out[m_pos] + op = op + 1 + m_pos = 
m_pos + 1 + out[op] = out[m_pos] + op = op + 1 + + state = c_match_done + continue + + ## + if state == c_match: + if t >= 64: + m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3) + ip = ip + 1 + t = (t >> 5) - 1 + state = c_copy_match + continue + elif t >= 32: + t = t & 31 + if t == 0: + while inBuf[ip] == 0: + t = t + 255 + ip = ip + 1 + t = t + 31 + inBuf[ip] + ip = ip + 1 + m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) + ip = ip + 2 + elif t >= 16: + m_pos = op - ((t & 8) << 11) + t = t & 7 + if t == 0: + while inBuf[ip] == 0: + t = t + 255 + ip = ip + 1 + t = t + 7 + inBuf[ip] + ip = ip + 1 + m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) + ip = ip + 2 + if m_pos == op: + break + m_pos = m_pos - 0x4000 + else: + m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2) + ip = ip + 1 + out = outBuf.require(2) + out[op] = out[m_pos] + op = op + 1 + m_pos = m_pos + 1 + out[op] = out[m_pos] + op = op + 1 + state = c_match_done + continue + + if t >= 6 and (op - m_pos) >= 4: + if_block = True + t += 2 + out = outBuf.require(t) + while True: + out[op] = out[m_pos] + op += 1 + m_pos += 1 + t -= 1 + if not t > 0: + break + # emulate c switch + state = c_copy_match + + ## + if state == c_copy_match: + if not if_block: + t += 2 + out = outBuf.require(t) + while True: + out[op] = out[m_pos] + op += 1 + m_pos += 1 + t -= 1 + if not t > 0: + break + # emulating c switch + state = c_match_done + + ## + if state == c_match_done: + t = inBuf[ip - 2] & 3 + if t == 0: + state = c_top_loop + continue + # emulate c switch + state = c_match_next + + ## + if state == c_match_next: + out = outBuf.require(1) + out[op] = inBuf[ip] + op += 1 + ip += 1 + if t > 1: + out = outBuf.require(1) + out[op] = inBuf[ip] + op += 1 + ip += 1 + if t > 2: + out = outBuf.require(1) + out[op] = inBuf[ip] + op += 1 + ip += 1 + t = inBuf[ip] + ip += 1 + state = c_match + continue + + return bytes(outBuf.pack(op)) + + +class lzo: + + def decompress(input, initSize=16000, blockSize=8192): + output = FlexBuffer() + output.alloc(initSize, blockSize) + return _decompress(bytearray(input), output) + + +""" +Copyright by https://github.com/zhansliu/writemdict + +ripemd128.py - A simple ripemd128 library in pure Python. + +Supports both Python 2 (versions >= 2.6) and Python 3. + +Usage: + from ripemd128 import ripemd128 + digest = ripemd128(b"The quick brown fox jumps over the lazy dog") + assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96") + +""" + + +import struct + + +# follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt + + +def f(j, x, y, z): + assert 0 <= j and j < 64 + if j < 16: + return x ^ y ^ z + elif j < 32: + return (x & y) | (z & ~x) + elif j < 48: + return (x | (0xFFFFFFFF & ~y)) ^ z + else: + return (x & z) | (y & ~z) + + +def K(j): + assert 0 <= j and j < 64 + if j < 16: + return 0x00000000 + elif j < 32: + return 0x5A827999 + elif j < 48: + return 0x6ED9EBA1 + else: + return 0x8F1BBCDC + + +def Kp(j): + assert 0 <= j and j < 64 + if j < 16: + return 0x50A28BE6 + elif j < 32: + return 0x5C4DD124 + elif j < 48: + return 0x6D703EF3 + else: + return 0x00000000 + + +def padandsplit(message): + """ + returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges + from 0 to 16. + First pads the message to length in bytes is congruent to 56 (mod 64), + by first adding a byte 0x80, and then padding with 0x00 bytes until the + message length is congruent to 56 (mod 64). 
Then adds the little-endian
+    64-bit representation of the original length. Finally, splits the result
+    up into 64-byte blocks, which are further parsed as 32-bit integers.
+    """
+    origlen = len(message)
+    padlength = 64 - ((origlen - 56) % 64)  # minimum padding is 1!
+    message += b"\x80"
+    message += b"\x00" * (padlength - 1)
+    message += struct.pack("<Q", origlen * 8)
+    return [
+        [
+            struct.unpack("<L", message[i + j : i + j + 4])[0]
+            for j in range(0, 64, 4)
+        ]
+        for i in range(0, len(message), 64)
+    ]
+
+
+def add(*args):
+    return sum(args) & 0xFFFFFFFF
+
+
+def rol(s, x):
+    return ((x << s) | (x >> (32 - s))) & 0xFFFFFFFF
+
+
+r = [
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+    7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
+    3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
+    1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
+]
+rp = [
+    5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
+    6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
+    15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
+    8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
+]
+s = [
+    11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
+    7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
+    11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
+    11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
+]
+sp = [
+    8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
+    9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
+    9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
+    15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
+]
+
+
+def ripemd128(message):
+    h0 = 0x67452301
+    h1 = 0xEFCDAB89
+    h2 = 0x98BADCFE
+    h3 = 0x10325476
+    X = padandsplit(message)
+    for i in range(len(X)):
+        (A, B, C, D) = (h0, h1, h2, h3)
+        (Ap, Bp, Cp, Dp) = (h0, h1, h2, h3)
+        for j in range(64):
+            T = rol(s[j], add(A, f(j, B, C, D), X[i][r[j]], K(j)))
+            (A, D, C, B) = (D, C, B, T)
+            T = rol(sp[j], add(Ap, f(63 - j, Bp, Cp, Dp), X[i][rp[j]], Kp(j)))
+            (Ap, Dp, Cp, Bp) = (Dp, Cp, Bp, T)
+        T = add(h1, C, Dp)
+        h1 = add(h2, D, Ap)
+        h2 = add(h3, A, Bp)
+        h3 = add(h0, B, Cp)
+        h0 = T
+
+    return struct.pack("<LLLL", h0, h1, h2, h3)
+
+
+# pureSalsa20: a pure-Python Salsa20 stream cipher (also from the
+# writemdict project).
+
+import sys
+
+assert sys.version_info >= (2, 6)
+
+if sys.version_info >= (3,):
+    integer_types = (int,)
+    python3 = True
+else:
+    integer_types = (int, long)
+    python3 = False
+
+from struct import Struct
+
+little_u64 = Struct("<Q")  # little-endian 64-bit unsigned
+little16_i32 = Struct("<16i")  # 16 little-endian 32-bit signed ints
+little4_i32 = Struct("<4i")  # 4 little-endian 32-bit signed ints
+little2_i32 = Struct("<2i")  # 2 little-endian 32-bit signed ints
+
+
+class Salsa20(object):
+    def __init__(self, key=None, IV=None, rounds=20):
+        self._lastChunk64 = True
+        self._IVbitlen = 64
+        self.ctx = [0] * 16
+        if key:
+            self.setKey(key)
+        if IV:
+            self.setIV(IV)
+        self.setCounter(0)
+        self.setRounds(rounds)
+
+    def setKey(self, key):
+        assert type(key) == bytes, "key must be bytes"
+        ctx = self.ctx
+        if len(key) == 32:  # recommended
+            constants = b"expand 32-byte k"
+            ctx[1], ctx[2], ctx[3], ctx[4] = little4_i32.unpack(key[0:16])
+            ctx[11], ctx[12], ctx[13], ctx[14] = little4_i32.unpack(key[16:32])
+        elif len(key) == 16:
+            constants = b"expand 16-byte k"
+            ctx[1], ctx[2], ctx[3], ctx[4] = little4_i32.unpack(key[0:16])
+            ctx[11], ctx[12], ctx[13], ctx[14] = little4_i32.unpack(key[0:16])
+        else:
+            raise Exception("key length isn't 32 or 16 bytes.")
+        ctx[0], ctx[5], ctx[10], ctx[15] = little4_i32.unpack(constants)
+
+    def setIV(self, IV):
+        assert type(IV) == bytes, "IV must be bytes"
+        assert len(IV) * 8 == 64, "nonce (IV) not 64 bits"
+        self.IV = IV
+        ctx = self.ctx
+        ctx[6], ctx[7] = little2_i32.unpack(IV)
+        ctx[8], ctx[9] = 0, 0  # reset the block counter
+
+    setNonce = setIV  # support an alternate name
+
+    def setCounter(self, counter):
+        assert type(counter) in integer_types
+        assert 0 <= counter < 2**64, "counter < 2**64"
+        ctx = self.ctx
+        ctx[8], ctx[9] = little2_i32.unpack(little_u64.pack(counter))
+
+    def getCounter(self):
+        return little_u64.unpack(little2_i32.pack(*self.ctx[8:10]))[0]
+
+    def setRounds(self, rounds, testing=False):
+        assert testing or rounds in [8, 12, 20], "rounds must be 8, 12, 20"
+        self.rounds = rounds
+
+    def encryptBytes(self, data):
+        assert type(data) == bytes, "data must be byte string"
+        assert self._lastChunk64, "previous chunk not multiple of 64 bytes"
+        lendata = len(data)
+        munged = bytearray(lendata)
+        for i in range(0, lendata, 64):
+            h = salsa20_wordtobyte(self.ctx, self.rounds, checkRounds=False)
+            self.setCounter((self.getCounter() + 1) % 2**64)
+            # Stopping at 2^70 bytes per nonce is user's responsibility.
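+            # Usage sketch (illustrative): the keystream is simply XORed into
+            # the data, so a second pass with the counter reset undoes the
+            # first, e.g.:
+            #   s20 = Salsa20(key=b"0" * 32, IV=b"\x00" * 8, rounds=8)
+            #   ct = s20.encryptBytes(b"x" * 64)
+            #   s20.setCounter(0)
+            #   assert s20.encryptBytes(ct) == b"x" * 64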
+ for j in range(min(64, lendata - i)): + if python3: + munged[i + j] = data[i + j] ^ h[j] + else: + munged[i + j] = ord(data[i + j]) ^ ord(h[j]) + + self._lastChunk64 = not lendata % 64 + return bytes(munged) + + decryptBytes = encryptBytes # encrypt and decrypt use same function + + +# -------------------------------------------------------------------------- + + +def salsa20_wordtobyte(input, nRounds=20, checkRounds=True): + """Do nRounds Salsa20 rounds on a copy of + input: list or tuple of 16 ints treated as little-endian unsigneds. + Returns a 64-byte string. + """ + + assert type(input) in (list, tuple) and len(input) == 16 + assert not (checkRounds) or (nRounds in [8, 12, 20]) + + x = list(input) + + def XOR(a, b): + return a ^ b + + ROTATE = rot32 + PLUS = add32 + + for i in range(nRounds // 2): + # These ...XOR...ROTATE...PLUS... lines are from ecrypt-linux.c + # unchanged except for indents and the blank line between rounds: + x[4] = XOR(x[4], ROTATE(PLUS(x[0], x[12]), 7)) + x[8] = XOR(x[8], ROTATE(PLUS(x[4], x[0]), 9)) + x[12] = XOR(x[12], ROTATE(PLUS(x[8], x[4]), 13)) + x[0] = XOR(x[0], ROTATE(PLUS(x[12], x[8]), 18)) + x[9] = XOR(x[9], ROTATE(PLUS(x[5], x[1]), 7)) + x[13] = XOR(x[13], ROTATE(PLUS(x[9], x[5]), 9)) + x[1] = XOR(x[1], ROTATE(PLUS(x[13], x[9]), 13)) + x[5] = XOR(x[5], ROTATE(PLUS(x[1], x[13]), 18)) + x[14] = XOR(x[14], ROTATE(PLUS(x[10], x[6]), 7)) + x[2] = XOR(x[2], ROTATE(PLUS(x[14], x[10]), 9)) + x[6] = XOR(x[6], ROTATE(PLUS(x[2], x[14]), 13)) + x[10] = XOR(x[10], ROTATE(PLUS(x[6], x[2]), 18)) + x[3] = XOR(x[3], ROTATE(PLUS(x[15], x[11]), 7)) + x[7] = XOR(x[7], ROTATE(PLUS(x[3], x[15]), 9)) + x[11] = XOR(x[11], ROTATE(PLUS(x[7], x[3]), 13)) + x[15] = XOR(x[15], ROTATE(PLUS(x[11], x[7]), 18)) + + x[1] = XOR(x[1], ROTATE(PLUS(x[0], x[3]), 7)) + x[2] = XOR(x[2], ROTATE(PLUS(x[1], x[0]), 9)) + x[3] = XOR(x[3], ROTATE(PLUS(x[2], x[1]), 13)) + x[0] = XOR(x[0], ROTATE(PLUS(x[3], x[2]), 18)) + x[6] = XOR(x[6], ROTATE(PLUS(x[5], x[4]), 7)) + x[7] = XOR(x[7], ROTATE(PLUS(x[6], x[5]), 9)) + x[4] = XOR(x[4], ROTATE(PLUS(x[7], x[6]), 13)) + x[5] = XOR(x[5], ROTATE(PLUS(x[4], x[7]), 18)) + x[11] = XOR(x[11], ROTATE(PLUS(x[10], x[9]), 7)) + x[8] = XOR(x[8], ROTATE(PLUS(x[11], x[10]), 9)) + x[9] = XOR(x[9], ROTATE(PLUS(x[8], x[11]), 13)) + x[10] = XOR(x[10], ROTATE(PLUS(x[9], x[8]), 18)) + x[12] = XOR(x[12], ROTATE(PLUS(x[15], x[14]), 7)) + x[13] = XOR(x[13], ROTATE(PLUS(x[12], x[15]), 9)) + x[14] = XOR(x[14], ROTATE(PLUS(x[13], x[12]), 13)) + x[15] = XOR(x[15], ROTATE(PLUS(x[14], x[13]), 18)) + + for i in range(len(input)): + x[i] = PLUS(x[i], input[i]) + return little16_i32.pack(*x) + + +# --------------------------- 32-bit ops ------------------------------- + + +def trunc32(w): + """Return the bottom 32 bits of w as a Python int. + This creates longs temporarily, but returns an int.""" + w = int((w & 0x7FFFFFFF) | -(w & 0x80000000)) + assert type(w) == int + return w + + +def add32(a, b): + """Add two 32-bit words discarding carry above 32nd bit, + and without creating a Python long. + Timing shouldn't vary. + """ + lo = (a & 0xFFFF) + (b & 0xFFFF) + hi = (a >> 16) + (b >> 16) + (lo >> 16) + return (-(hi & 0x8000) | (hi & 0x7FFF)) << 16 | (lo & 0xFFFF) + + +def rot32(w, nLeft): + """Rotate 32-bit word left by nLeft or right by -nLeft + without creating a Python long. + Timing depends on nLeft but not on w. + """ + nLeft &= 31 # which makes nLeft >= 0 + if nLeft == 0: + return w + + # Note: now 1 <= nLeft <= 31. 
+    # RRRsLLLLLL  There are nLeft RRR's, (31-nLeft) LLLLLL's,
+    # => sLLLLLLRRR  and one s which becomes the sign bit.
+    RRR = ((w >> 1) & 0x7FFFFFFF) >> (31 - nLeft)
+    sLLLLLL = -((1 << (31 - nLeft)) & w) | (0x7FFFFFFF >> nLeft) & w
+    return RRR | (sLLLLLL << nLeft)
+
+
+# --------------------------------- end -----------------------------------
+
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# readmdict.py
+# Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
+#
+# Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang
+#
+# This program is a free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, version 3 of the License.
+#
+# You can get a copy of GNU General Public License along this program
+# But you can always get it from http://www.gnu.org/licenses/gpl.txt
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+from struct import pack, unpack
+from io import BytesIO
+import re
+import sys
+import json
+
+
+# zlib compression is used for engine version >=2.0
+import zlib
+
+# LZO compression is used for engine version < 2.0
+# try:
+#     import lzo
+# except ImportError:
+#     lzo = None
+#     print("LZO compression support is not available")
+
+# 2x3 compatible
+if sys.hexversion >= 0x03000000:
+    unicode = str
+
+
+def _unescape_entities(text):
+    """
+    unescape offending tags < > " &
+    """
+    text = text.replace(b"&lt;", b"<")
+    text = text.replace(b"&gt;", b">")
+    text = text.replace(b"&quot;", b'"')
+    text = text.replace(b"&amp;", b"&")
+    return text
+
+
+def _fast_decrypt(data, key):
+    b = bytearray(data)
+    key = bytearray(key)
+    previous = 0x36
+    for i in range(len(b)):
+        t = (b[i] >> 4 | b[i] << 4) & 0xFF
+        t = t ^ previous ^ (i & 0xFF) ^ key[i % len(key)]
+        previous = b[i]
+        b[i] = t
+    return bytes(b)
+
+
+def _mdx_decrypt(comp_block):
+    key = ripemd128(comp_block[4:8] + pack(b"<L", 0x3695))
+    return comp_block[0:8] + _fast_decrypt(comp_block[8:], key)
+
+
+def _salsa_decrypt(ciphertext, encrypt_key):
+    s20 = Salsa20(key=encrypt_key, IV=b"\x00" * 8, rounds=8)
+    return s20.encryptBytes(ciphertext)
+
+
+def _decrypt_regcode_by_deviceid(reg_code, deviceid):
+    deviceid_digest = ripemd128(deviceid)
+    s20 = Salsa20(key=deviceid_digest, IV=b"\x00" * 8, rounds=8)
+    encrypt_key = s20.encryptBytes(reg_code)
+    return encrypt_key
+
+
+def _decrypt_regcode_by_email(reg_code, email):
+    email_digest = ripemd128(email.decode().encode("utf-16-le"))
+    s20 = Salsa20(key=email_digest, IV=b"\x00" * 8, rounds=8)
+    encrypt_key = s20.encryptBytes(reg_code)
+    return encrypt_key
+
+
+class MDict(object):
+    """
+    Base class which reads in header and key block.
+    It has no public methods and serves only as code sharing base class.
+    """
+
+    def __init__(self, fname, encoding="", passcode=None):
+        self._fname = fname
+        self._encoding = encoding.upper()
+        self._passcode = passcode
+
+        self.header = self._read_header()
+        try:
+            self._key_list = self._read_keys()
+        except:
+            print("Try Brutal Force on Encrypted Key Blocks")
+            self._key_list = self._read_keys_brutal()
+
+    def __len__(self):
+        return self._num_entries
+
+    def __iter__(self):
+        return self.keys()
+
+    def keys(self):
+        """Return an iterator over dictionary keys."""
+        return (key_value for key_id, key_value in self._key_list)
+
+    def _read_number(self, f):
+        return unpack(self._number_format, f.read(self._number_width))[0]
+
+    def _parse_header(self, header):
+        """
+        extract attributes from <Dict attr="value" ... >
+        """
+        taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL)
+        tagdict = {}
+        for key, value in taglist:
+            tagdict[key] = _unescape_entities(value)
+        return tagdict
+
+    def _decode_key_block_info(self, key_block_info_compressed):
+        if self._version >= 2:
+            # zlib compression
+            assert key_block_info_compressed[:4] == b"\x02\x00\x00\x00"
+            # decrypt if needed
+            if self._encrypt & 0x02:
+                key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
+            # decompress
+            key_block_info = zlib.decompress(key_block_info_compressed[8:])
+            # adler checksum
+            adler32 = unpack(">I", key_block_info_compressed[4:8])[0]
+            assert adler32 == zlib.adler32(key_block_info) & 0xFFFFFFFF
+        else:
+            # no compression
+            key_block_info = key_block_info_compressed
+        # decode
+        key_block_info_list = []
+        num_entries = 0
+        i = 0
+        if self._version >= 2:
+            byte_format = ">H"
+            byte_width = 2
+            text_term = 1
+        else:
+            byte_format = ">B"
+            byte_width = 1
+            text_term = 0
+
+        while i < len(key_block_info):
+            # number of entries in current key block
+            num_entries += unpack(
+                self._number_format, key_block_info[i : i + self._number_width]
+            )[0]
+            i += self._number_width
+            # text head size
+            text_head_size = unpack(byte_format, key_block_info[i : i + byte_width])[0]
+            i += byte_width
+            # text head
+            if self._encoding != "UTF-16":
+                i += text_head_size + text_term
+            else:
+                i += (text_head_size + text_term) * 2
+            # text tail size
+            text_tail_size = unpack(byte_format, key_block_info[i : i + byte_width])[0]
+            i += byte_width
+            # text tail
+            if self._encoding != "UTF-16":
+                i += text_tail_size + text_term
+            else:
+                i += (text_tail_size + text_term) * 2
+            # key block compressed size
+            key_block_compressed_size = unpack(
+                self._number_format, key_block_info[i : i + self._number_width]
+            )[0]
+            i += self._number_width
+            # key block decompressed size
+            key_block_decompressed_size = unpack(
+                self._number_format, key_block_info[i : i + self._number_width]
+            )[0]
+            i += self._number_width
+            key_block_info_list += [
+                (key_block_compressed_size, key_block_decompressed_size)
+            ]
+
+        assert num_entries == self._num_entries
+
+        return key_block_info_list
+
+    def _decode_key_block(self, key_block_compressed, key_block_info_list):
+        key_list = []
+        i = 0
+        for compressed_size, decompressed_size in key_block_info_list:
+            start = i
+            end = i + compressed_size
+            # 4 bytes : compression type
+            key_block_type = key_block_compressed[start : start + 4]
+            # 4 bytes : adler checksum of decompressed key block
+            adler32 = unpack(">I", key_block_compressed[start + 4 : start + 8])[0]
+            if key_block_type == b"\x00\x00\x00\x00":
+                key_block = key_block_compressed[start + 8 : end]
+            elif key_block_type == b"\x01\x00\x00\x00":
+                if lzo is None:
+                    print("LZO compression is not supported")
+                    break
+                # decompress key block
+                header = b"\xf0" + pack(">I", decompressed_size)
+                key_block = lzo.decompress(
+                    key_block_compressed[start + 8 : end],
+                    initSize=decompressed_size,
+                    blockSize=1308672,
+                )
+            elif key_block_type == b"\x02\x00\x00\x00":
+                # decompress key block
+                key_block = zlib.decompress(key_block_compressed[start + 8 : end])
+            # extract one single key block into a key list
+            key_list += self._split_key_block(key_block)
+            # notice that adler32 returns signed value
+            assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF
+
+            i += compressed_size
+        return key_list
+
+    def _split_key_block(self, key_block):
+        key_list = []
+        key_start_index = 0
+        while key_start_index < len(key_block):
+            temp = key_block[key_start_index : key_start_index + self._number_width]
+            # the corresponding record's offset in record block
+            key_id = unpack(
+                self._number_format,
+                key_block[key_start_index : key_start_index + self._number_width],
+            )[0]
+            # key text ends with '\x00'
+            if self._encoding == "UTF-16":
+                delimiter = b"\x00\x00"
+                width = 2
+            else:
+                delimiter = b"\x00"
+                width = 1
+            i = key_start_index + self._number_width
+            while i < len(key_block):
+                if key_block[i : i + width] == delimiter:
+                    key_end_index = i
+                    break
+                i += width
+            key_text = (
+                key_block[key_start_index + self._number_width : key_end_index]
+                .decode(self._encoding, errors="ignore")
+                .encode("utf-8")
+                .strip()
+            )
+            key_start_index = key_end_index + width
+            key_list += [(key_id, key_text)]
+        return key_list
+
+    def _read_header(self):
+        f = open(self._fname, "rb")
+        # number of bytes of header text
+        header_bytes_size = unpack(">I", f.read(4))[0]
+        header_bytes = f.read(header_bytes_size)
+        # 4 bytes: adler32 checksum of header, in little endian
+        adler32 = unpack("<I", f.read(4))[0]
+        assert adler32 == zlib.adler32(header_bytes) & 0xFFFFFFFF
+        # mark down key block offset
+        self._key_block_offset = f.tell()
+        f.close()
+
+        # header text in utf-16 encoding ending with '\x00\x00'
+        header_text = header_bytes[:-2].decode("utf-16").encode("utf-8")
+        header_tag = self._parse_header(header_text)
+        if not self._encoding:
+            encoding = header_tag[b"Encoding"]
+            if sys.hexversion >= 0x03000000:
+                encoding = encoding.decode("utf-8")
+            # GB18030 > GBK > GB2312
+            if encoding in ["GBK", "GB2312"]:
+                encoding = "GB18030"
+            self._encoding = encoding
+        # read title and description
+        if b"Title" in header_tag:
+            self._title = header_tag[b"Title"].decode("utf-8")
+        else:
+            self._title = ""
+
+        if b"Description" in header_tag:
+            self._description = header_tag[b"Description"].decode("utf-8")
+        else:
+            self._description = ""
+        pass
+        # 
encryption flag + # 0x00 - no encryption + # 0x01 - encrypt record block + # 0x02 - encrypt key info block + if b"Encrypted" not in header_tag or header_tag[b"Encrypted"] == b"No": + self._encrypt = 0 + elif header_tag[b"Encrypted"] == b"Yes": + self._encrypt = 1 + else: + self._encrypt = int(header_tag[b"Encrypted"]) + + # stylesheet attribute if present takes form of: + # style_number # 1-255 + # style_begin # or '' + # style_end # or '' + # store stylesheet in dict in the form of + # {'number' : ('style_begin', 'style_end')} + self._stylesheet = {} + if header_tag.get("StyleSheet"): + lines = header_tag["StyleSheet"].splitlines() + for i in range(0, len(lines), 3): + self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2]) + + # before version 2.0, number is 4 bytes integer + # version 2.0 and above uses 8 bytes + self._version = float(header_tag[b"GeneratedByEngineVersion"]) + if self._version < 2.0: + self._number_width = 4 + self._number_format = ">I" + else: + self._number_width = 8 + self._number_format = ">Q" + + return header_tag + + def _read_keys(self): + f = open(self._fname, "rb") + f.seek(self._key_block_offset) + + # the following numbers could be encrypted + if self._version >= 2.0: + num_bytes = 8 * 5 + else: + num_bytes = 4 * 4 + block = f.read(num_bytes) + + if self._encrypt & 1: + if self._passcode is None: + raise RuntimeError( + "user identification is needed to read encrypted file" + ) + regcode, userid = self._passcode + if isinstance(userid, unicode): + userid = userid.encode("utf8") + if self.header[b"RegisterBy"] == b"EMail": + encrypted_key = _decrypt_regcode_by_email(regcode, userid) + else: + encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid) + block = _salsa_decrypt(block, encrypted_key) + + # decode this block + sf = BytesIO(block) + # number of key blocks + num_key_blocks = self._read_number(sf) + # number of entries + self._num_entries = self._read_number(sf) + # number of bytes of key block info after decompression + if self._version >= 2.0: + key_block_info_decomp_size = self._read_number(sf) + # number of bytes of key block info + key_block_info_size = self._read_number(sf) + # number of bytes of key block + key_block_size = self._read_number(sf) + + # 4 bytes: adler checksum of previous 5 numbers + if self._version >= 2.0: + adler32 = unpack(">I", f.read(4))[0] + assert adler32 == (zlib.adler32(block) & 0xFFFFFFFF) + + # read key block info, which indicates key block's compressed and + # decompressed size + key_block_info = f.read(key_block_info_size) + key_block_info_list = self._decode_key_block_info(key_block_info) + assert num_key_blocks == len(key_block_info_list) + + # read key block + key_block_compressed = f.read(key_block_size) + # extract key block + key_list = self._decode_key_block(key_block_compressed, key_block_info_list) + + self._record_block_offset = f.tell() + f.close() + + return key_list + + def _read_keys_brutal(self): + f = open(self._fname, "rb") + f.seek(self._key_block_offset) + + # the following numbers could be encrypted, disregard them! 
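+        # For reference, the section skipped here has the same layout that
+        # _read_keys parses: for v2.0+, five big-endian uint64s (number of key
+        # blocks, number of entries, key block info decompressed size, key
+        # block info size, key block size), e.g. unpack(">QQQQQ", block[:40]),
+        # except that the block may be Salsa20-encrypted -- hence the brutal
+        # scan for the key-block magic below instead of trusting the numbers.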
+ if self._version >= 2.0: + num_bytes = 8 * 5 + 4 + key_block_type = b"\x02\x00\x00\x00" + else: + num_bytes = 4 * 4 + key_block_type = b"\x01\x00\x00\x00" + block = f.read(num_bytes) + + # key block info + # 4 bytes '\x02\x00\x00\x00' + # 4 bytes adler32 checksum + # unknown number of bytes follows until '\x02\x00\x00\x00' which marks + # the beginning of key block + key_block_info = f.read(8) + if self._version >= 2.0: + assert key_block_info[:4] == b"\x02\x00\x00\x00" + while True: + fpos = f.tell() + t = f.read(1024) + index = t.find(key_block_type) + if index != -1: + key_block_info += t[:index] + f.seek(fpos + index) + break + else: + key_block_info += t + + key_block_info_list = self._decode_key_block_info(key_block_info) + key_block_size = sum(list(zip(*key_block_info_list))[0]) + + # read key block + key_block_compressed = f.read(key_block_size) + # extract key block + key_list = self._decode_key_block(key_block_compressed, key_block_info_list) + + self._record_block_offset = f.tell() + f.close() + + self._num_entries = len(key_list) + return key_list + + +class MDD(MDict): + """ + MDict resource file format (*.MDD) reader. + >>> mdd = MDD('example.mdd') + >>> len(mdd) + 208 + >>> for filename,content in mdd.items(): + ... print filename, content[:10] + """ + + def __init__(self, fname, passcode=None): + MDict.__init__(self, fname, encoding="UTF-16", passcode=passcode) + + def items(self): + """Return a generator which in turn produce tuples in the form of (filename, content)""" + return self._decode_record_block() + + def _decode_record_block(self): + f = open(self._fname, "rb") + f.seek(self._record_block_offset) + + num_record_blocks = self._read_number(f) + num_entries = self._read_number(f) + assert num_entries == self._num_entries + record_block_info_size = self._read_number(f) + record_block_size = self._read_number(f) + + # record block info section + record_block_info_list = [] + size_counter = 0 + for i in range(num_record_blocks): + compressed_size = self._read_number(f) + decompressed_size = self._read_number(f) + record_block_info_list += [(compressed_size, decompressed_size)] + size_counter += self._number_width * 2 + assert size_counter == record_block_info_size + + # actual record block + offset = 0 + i = 0 + size_counter = 0 + for compressed_size, decompressed_size in record_block_info_list: + record_block_compressed = f.read(compressed_size) + # 4 bytes: compression type + record_block_type = record_block_compressed[:4] + # 4 bytes: adler32 checksum of decompressed record block + adler32 = unpack(">I", record_block_compressed[4:8])[0] + if record_block_type == b"\x00\x00\x00\x00": + record_block = record_block_compressed[8:] + elif record_block_type == b"\x01\x00\x00\x00": + if lzo is None: + print("LZO compression is not supported") + break + # decompress + header = b"\xf0" + pack(">I", decompressed_size) + record_block = lzo.decompress( + record_block_compressed[start + 8 : end], + initSize=decompressed_size, + blockSize=1308672, + ) + elif record_block_type == b"\x02\x00\x00\x00": + # decompress + record_block = zlib.decompress(record_block_compressed[8:]) + + # notice that adler32 return signed value + assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF + + assert len(record_block) == decompressed_size + # split record block according to the offset info from key block + while i < len(self._key_list): + record_start, key_text = self._key_list[i] + # reach the end of current record block + if record_start - offset >= len(record_block): + break + # record 
end index + if i < len(self._key_list) - 1: + record_end = self._key_list[i + 1][0] + else: + record_end = len(record_block) + offset + i += 1 + data = record_block[record_start - offset : record_end - offset] + yield key_text, data + offset += len(record_block) + size_counter += compressed_size + assert size_counter == record_block_size + + f.close() + + ### 获取 mdx 文件的索引列表,格式为 + ### key_text(关键词,可以由后面的 keylist 得到) + ### file_pos(record_block开始的位置) + ### compressed_size(record_block压缩前的大小) + ### decompressed_size(解压后的大小) + ### record_block_type(record_block 的压缩类型) + ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) + ### record_end + ### offset + + def get_index(self, check_block=True): + f = open(self._fname, "rb") + index_dict_list = [] + f.seek(self._record_block_offset) + + num_record_blocks = self._read_number(f) + num_entries = self._read_number(f) + assert num_entries == self._num_entries + record_block_info_size = self._read_number(f) + record_block_size = self._read_number(f) + + # record block info section + record_block_info_list = [] + size_counter = 0 + for i in range(num_record_blocks): + compressed_size = self._read_number(f) + decompressed_size = self._read_number(f) + record_block_info_list += [(compressed_size, decompressed_size)] + size_counter += self._number_width * 2 + # todo:注意!!! + assert size_counter == record_block_info_size + + # actual record block + offset = 0 + i = 0 + size_counter = 0 + for compressed_size, decompressed_size in record_block_info_list: + current_pos = f.tell() + record_block_compressed = f.read(compressed_size) + # 4 bytes: compression type + record_block_type = record_block_compressed[:4] + # 4 bytes: adler32 checksum of decompressed record block + adler32 = unpack(">I", record_block_compressed[4:8])[0] + if record_block_type == b"\x00\x00\x00\x00": + _type = 0 + if check_block: + record_block = record_block_compressed[8:] + elif record_block_type == b"\x01\x00\x00\x00": + _type = 1 + if lzo is None: + print("LZO compression is not supported") + break + # decompress + header = b"\xf0" + pack(">I", decompressed_size) + if check_block: + record_block = lzo.decompress( + record_block_compressed[start + 8 : end], + initSize=decompressed_size, + blockSize=1308672, + ) + elif record_block_type == b"\x02\x00\x00\x00": + # decompress + _type = 2 + if check_block: + record_block = zlib.decompress(record_block_compressed[8:]) + + # notice that adler32 return signed value + if check_block: + assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF + assert len(record_block) == decompressed_size + # split record block according to the offset info from key block + while i < len(self._key_list): + ### 用来保存索引信息的空字典 + index_dict = {} + index_dict["file_pos"] = current_pos + index_dict["compressed_size"] = compressed_size + index_dict["decompressed_size"] = decompressed_size + index_dict["record_block_type"] = _type + record_start, key_text = self._key_list[i] + index_dict["record_start"] = record_start + index_dict["key_text"] = key_text.decode("utf-8") + index_dict["offset"] = offset + # reach the end of current record block + if record_start - offset >= decompressed_size: + break + # record end index + if i < len(self._key_list) - 1: + record_end = self._key_list[i + 1][0] + else: + record_end = decompressed_size + offset + index_dict["record_end"] = record_end + i += 1 + if check_block: + data = record_block[record_start - offset : record_end - offset] + index_dict_list.append(index_dict) + # yield key_text, data + offset += decompressed_size + 
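+            # Each dict appended above is self-contained: a later lookup can
+            # f.seek(file_pos), read compressed_size bytes, decompress them
+            # according to record_block_type, and slice
+            # [record_start - offset : record_end - offset] to get one record
+            # (this is what IndexBuilder.get_data_by_index does).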
size_counter += compressed_size + assert size_counter == record_block_size + f.close() + return index_dict_list + + +class MDX(MDict): + """ + MDict dictionary file format (*.MDD) reader. + >>> mdx = MDX('example.mdx') + >>> len(mdx) + 42481 + >>> for key,value in mdx.items(): + ... print key, value[:10] + """ + + def __init__(self, fname, encoding="", substyle=False, passcode=None): + MDict.__init__(self, fname, encoding, passcode) + self._substyle = substyle + + def items(self): + """Return a generator which in turn produce tuples in the form of (key, value)""" + return self._decode_record_block() + + def _substitute_stylesheet(self, txt): + # substitute stylesheet definition + txt_list = re.split("`\d+`", txt) + txt_tag = re.findall("`\d+`", txt) + txt_styled = txt_list[0] + for j, p in enumerate(txt_list[1:]): + style = self._stylesheet[txt_tag[j][1:-1]] + if p and p[-1] == "\n": + txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + "\r\n" + else: + txt_styled = txt_styled + style[0] + p + style[1] + return txt_styled + + def _decode_record_block(self): + f = open(self._fname, "rb") + f.seek(self._record_block_offset) + + num_record_blocks = self._read_number(f) + num_entries = self._read_number(f) + assert num_entries == self._num_entries + record_block_info_size = self._read_number(f) + record_block_size = self._read_number(f) + + # record block info section + record_block_info_list = [] + size_counter = 0 + for i in range(num_record_blocks): + compressed_size = self._read_number(f) + decompressed_size = self._read_number(f) + record_block_info_list += [(compressed_size, decompressed_size)] + size_counter += self._number_width * 2 + assert size_counter == record_block_info_size + + # actual record block data + offset = 0 + i = 0 + size_counter = 0 + ###最后的索引表的格式为 + ### key_text(关键词,可以由后面的 keylist 得到) + ### file_pos(record_block开始的位置) + ### compressed_size(record_block压缩前的大小) + ### decompressed_size(解压后的大小) + ### record_block_type(record_block 的压缩类型) + ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) + ### record_end + ### offset + for compressed_size, decompressed_size in record_block_info_list: + record_block_compressed = f.read(compressed_size) + ###### 要得到 record_block_compressed 需要得到 compressed_size (这个可以直接记录) + ###### 另外还需要记录当前 f 对象的位置 + ###### 使用 f.tell() 命令/ 在建立索引是需要 f.seek() + # 4 bytes indicates block compression type + record_block_type = record_block_compressed[:4] + # 4 bytes adler checksum of uncompressed content + adler32 = unpack(">I", record_block_compressed[4:8])[0] + # no compression + if record_block_type == b"\x00\x00\x00\x00": + record_block = record_block_compressed[8:] + # lzo compression + elif record_block_type == b"\x01\x00\x00\x00": + if lzo is None: + print("LZO compression is not supported") + break + # decompress + header = b"\xf0" + pack(">I", decompressed_size) + record_block = lzo.decompress( + record_block_compressed[8:], + initSize=decompressed_size, + blockSize=1308672, + ) + # zlib compression + elif record_block_type == b"\x02\x00\x00\x00": + # decompress + record_block = zlib.decompress(record_block_compressed[8:]) + ###### 这里比较重要的是先要得到 record_block, 而 record_block 是解压得到的,其中一共有三种解压方法 + ###### 需要的信息有 record_block_compressed, decompress_size, + ###### record_block_type + ###### 另外还需要校验信息 adler32 + # notice that adler32 return signed value + assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF + + assert len(record_block) == decompressed_size + # split record block according to the offset info from key block + while i < 
len(self._key_list): + record_start, key_text = self._key_list[i] + # reach the end of current record block + if record_start - offset >= len(record_block): + break + # record end index + if i < len(self._key_list) - 1: + record_end = self._key_list[i + 1][0] + else: + record_end = len(record_block) + offset + i += 1 + #############需要得到 record_block , record_start, record_end, + #############offset + record = record_block[record_start - offset : record_end - offset] + # convert to utf-8 + record = ( + record.decode(self._encoding, errors="ignore") + .strip("\x00") + .encode("utf-8") + ) + # substitute styles + #############是否替换样式表 + if self._substyle and self._stylesheet: + record = self._substitute_stylesheet(record) + + yield key_text, record + offset += len(record_block) + size_counter += compressed_size + assert size_counter == record_block_size + + f.close() + + ### 获取 mdx 文件的索引列表,格式为 + ### key_text(关键词,可以由后面的 keylist 得到) + ### file_pos(record_block开始的位置) + ### compressed_size(record_block压缩前的大小) + ### decompressed_size(解压后的大小) + ### record_block_type(record_block 的压缩类型) + ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) + ### record_end + ### offset + ### 所需 metadata + ### + def get_index(self, check_block=True): + ### 索引列表 + index_dict_list = [] + f = open(self._fname, "rb") + f.seek(self._record_block_offset) + + num_record_blocks = self._read_number(f) + num_entries = self._read_number(f) + assert num_entries == self._num_entries + record_block_info_size = self._read_number(f) + record_block_size = self._read_number(f) + + # record block info section + record_block_info_list = [] + size_counter = 0 + for i in range(num_record_blocks): + compressed_size = self._read_number(f) + decompressed_size = self._read_number(f) + record_block_info_list += [(compressed_size, decompressed_size)] + size_counter += self._number_width * 2 + assert size_counter == record_block_info_size + + # actual record block data + offset = 0 + i = 0 + size_counter = 0 + ###最后的索引表的格式为 + ### key_text(关键词,可以由后面的 keylist 得到) + ### file_pos(record_block开始的位置) + ### compressed_size(record_block压缩前的大小) + ### decompressed_size(解压后的大小) + ### record_block_type(record_block 的压缩类型) + ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) + ### record_end + ### offset + for compressed_size, decompressed_size in record_block_info_list: + current_pos = f.tell() + record_block_compressed = f.read(compressed_size) + ###### 要得到 record_block_compressed 需要得到 compressed_size (这个可以直接记录) + ###### 另外还需要记录当前 f 对象的位置 + ###### 使用 f.tell() 命令/ 在建立索引是需要 f.seek() + # 4 bytes indicates block compression type + record_block_type = record_block_compressed[:4] + # 4 bytes adler checksum of uncompressed content + adler32 = unpack(">I", record_block_compressed[4:8])[0] + # no compression + if record_block_type == b"\x00\x00\x00\x00": + _type = 0 + record_block = record_block_compressed[8:] + # lzo compression + elif record_block_type == b"\x01\x00\x00\x00": + _type = 1 + if lzo is None: + print("LZO compression is not supported") + break + # decompress + header = b"\xf0" + pack(">I", decompressed_size) + if check_block: + record_block = lzo.decompress( + record_block_compressed[8:], + initSize=decompressed_size, + blockSize=1308672, + ) + # zlib compression + elif record_block_type == b"\x02\x00\x00\x00": + # decompress + _type = 2 + if check_block: + record_block = zlib.decompress(record_block_compressed[8:]) + ###### 这里比较重要的是先要得到 record_block, 而 record_block 是解压得到的,其中一共有三种解压方法 + ###### 需要的信息有 record_block_compressed, 
decompress_size, + ###### record_block_type + ###### 另外还需要校验信息 adler32 + # notice that adler32 return signed value + if check_block: + assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF + assert len(record_block) == decompressed_size + # split record block according to the offset info from key block + while i < len(self._key_list): + ### 用来保存索引信息的空字典 + index_dict = {} + index_dict["file_pos"] = current_pos + index_dict["compressed_size"] = compressed_size + index_dict["decompressed_size"] = decompressed_size + index_dict["record_block_type"] = _type + record_start, key_text = self._key_list[i] + index_dict["record_start"] = record_start + index_dict["key_text"] = key_text.decode("utf-8") + index_dict["offset"] = offset + # reach the end of current record block + if record_start - offset >= decompressed_size: + break + # record end index + if i < len(self._key_list) - 1: + record_end = self._key_list[i + 1][0] + else: + record_end = decompressed_size + offset + index_dict["record_end"] = record_end + i += 1 + #############需要得到 record_block , record_start, record_end, + #############offset + if check_block: + record = record_block[record_start - offset : record_end - offset] + # convert to utf-8 + record = ( + record.decode(self._encoding, errors="ignore") + .strip("\x00") + .encode("utf-8") + ) + # substitute styles + #############是否替换样式表 + if self._substyle and self._stylesheet: + record = self._substitute_stylesheet(record) + index_dict_list.append(index_dict) + + offset += decompressed_size + size_counter += compressed_size + # todo: 注意!!! + # assert(size_counter == record_block_size) + f.close + # 这里比 mdd 部分稍有不同,应该还需要传递编码以及样式表信息 + meta = {} + meta["encoding"] = self._encoding + meta["stylesheet"] = json.dumps(self._stylesheet) + meta["title"] = self._title + meta["description"] = self._description + + return {"index_dict_list": index_dict_list, "meta": meta} + + +from struct import pack, unpack +from io import BytesIO +import re +import sys +import os +import sqlite3 +import json + +# zlib compression is used for engine version >=2.0 +import zlib + +# LZO compression is used for engine version < 2.0 +# try: +# import lzo +# except ImportError: +# lzo = None +# print("LZO compression support is not available") + +# 2x3 compatible +if sys.hexversion >= 0x03000000: + unicode = str + +version = "1.1" + + +class IndexBuilder(object): + # todo: enable history + def __init__( + self, + fname, + encoding="", + passcode=None, + force_rebuild=False, + enable_history=False, + sql_index=True, + check=False, + ): + self._mdx_file = fname + self._mdd_file = "" + self._encoding = "" + self._stylesheet = {} + self._title = "" + self._version = "" + self._description = "" + self._sql_index = sql_index + self._check = check + _filename, _file_extension = os.path.splitext(fname) + assert _file_extension == ".mdx" + assert os.path.isfile(fname) + self._mdx_db = _filename + ".mdx.db" + # make index anyway + if force_rebuild: + self._make_mdx_index(self._mdx_db) + if os.path.isfile(_filename + ".mdd"): + self._mdd_file = _filename + ".mdd" + self._mdd_db = _filename + ".mdd.db" + self._make_mdd_index(self._mdd_db) + + if os.path.isfile(self._mdx_db): + # read from META table + conn = sqlite3.connect(self._mdx_db) + # cursor = conn.execute("SELECT * FROM META") + cursor = conn.execute('SELECT * FROM META WHERE key = "version"') + # 判断有无版本号 + for cc in cursor: + self._version = cc[1] + ################# if not version in fo ############# + if not self._version: + print("version info not found") + 
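+                # A missing "version" row means the cached .db file predates
+                # this table layout, so it is closed and rebuilt from the
+                # .mdx/.mdd sources, and the constructor returns early.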
conn.close() + self._make_mdx_index(self._mdx_db) + print("mdx.db rebuilt!") + if os.path.isfile(_filename + ".mdd"): + self._mdd_file = _filename + ".mdd" + self._mdd_db = _filename + ".mdd.db" + self._make_mdd_index(self._mdd_db) + print("mdd.db rebuilt!") + return None + cursor = conn.execute('SELECT * FROM META WHERE key = "encoding"') + for cc in cursor: + self._encoding = cc[1] + cursor = conn.execute('SELECT * FROM META WHERE key = "stylesheet"') + for cc in cursor: + self._stylesheet = json.loads(cc[1]) + + cursor = conn.execute('SELECT * FROM META WHERE key = "title"') + for cc in cursor: + self._title = cc[1] + + cursor = conn.execute('SELECT * FROM META WHERE key = "description"') + for cc in cursor: + self._description = cc[1] + + # for cc in cursor: + # if cc[0] == 'encoding': + # self._encoding = cc[1] + # continue + # if cc[0] == 'stylesheet': + # self._stylesheet = json.loads(cc[1]) + # continue + # if cc[0] == 'title': + # self._title = cc[1] + # continue + # if cc[0] == 'title': + # self._description = cc[1] + else: + self._make_mdx_index(self._mdx_db) + + if os.path.isfile(_filename + ".mdd"): + self._mdd_file = _filename + ".mdd" + self._mdd_db = _filename + ".mdd.db" + if not os.path.isfile(self._mdd_db): + self._make_mdd_index(self._mdd_db) + pass + + def _replace_stylesheet(self, txt): + # substitute stylesheet definition + txt_list = re.split("`\d+`", txt) + txt_tag = re.findall("`\d+`", txt) + txt_styled = txt_list[0] + for j, p in enumerate(txt_list[1:]): + style = self._stylesheet[txt_tag[j][1:-1]] + if p and p[-1] == "\n": + txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + "\r\n" + else: + txt_styled = txt_styled + style[0] + p + style[1] + return txt_styled + + def make_sqlite(self): + sqlite_file = self._mdx_file + ".sqlite.db" + if os.path.exists(sqlite_file): + os.remove(sqlite_file) + mdx = MDX(self._mdx_file) + conn = sqlite3.connect(sqlite_file) + cursor = conn.cursor() + cursor.execute( + """ CREATE TABLE MDX_DICT + (key text not null, + value text + )""" + ) + + # remove '(pīnyīn)', remove `1`: + aeiou = "āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ" + pattern = r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?" 
% (aeiou, aeiou, aeiou) + tuple_list = [ + (key.decode(), re.sub(pattern, "", value.decode())) + for key, value in mdx.items() + ] + + cursor.executemany("INSERT INTO MDX_DICT VALUES (?,?)", tuple_list) + + returned_index = mdx.get_index(check_block=self._check) + meta = returned_index["meta"] + cursor.execute("""CREATE TABLE META (key text, value text)""") + + cursor.executemany( + "INSERT INTO META VALUES (?,?)", + [ + ("encoding", meta["encoding"]), + ("stylesheet", meta["stylesheet"]), + ("title", meta["title"]), + ("description", meta["description"]), + ("version", version), + ], + ) + + if self._sql_index: + cursor.execute( + """ + CREATE INDEX key_index ON MDX_DICT (key) + """ + ) + conn.commit() + conn.close() + + def _make_mdx_index(self, db_name): + if os.path.exists(db_name): + os.remove(db_name) + mdx = MDX(self._mdx_file) + self._mdx_db = db_name + returned_index = mdx.get_index(check_block=self._check) + index_list = returned_index["index_dict_list"] + conn = sqlite3.connect(db_name) + c = conn.cursor() + c.execute( + """ CREATE TABLE MDX_INDEX + (key_text text not null, + file_pos integer, + compressed_size integer, + decompressed_size integer, + record_block_type integer, + record_start integer, + record_end integer, + offset integer + )""" + ) + + tuple_list = [ + ( + item["key_text"], + item["file_pos"], + item["compressed_size"], + item["decompressed_size"], + item["record_block_type"], + item["record_start"], + item["record_end"], + item["offset"], + ) + for item in index_list + ] + c.executemany("INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)", tuple_list) + # build the metadata table + meta = returned_index["meta"] + c.execute( + """CREATE TABLE META + (key text, + value text + )""" + ) + + # for k,v in meta: + # c.execute( + # 'INSERT INTO META VALUES (?,?)', + # (k, v) + # ) + + c.executemany( + "INSERT INTO META VALUES (?,?)", + [ + ("encoding", meta["encoding"]), + ("stylesheet", meta["stylesheet"]), + ("title", meta["title"]), + ("description", meta["description"]), + ("version", version), + ], + ) + + if self._sql_index: + c.execute( + """ + CREATE INDEX key_index ON MDX_INDEX (key_text) + """ + ) + + conn.commit() + conn.close() + # set class member + self._encoding = meta["encoding"] + self._stylesheet = json.loads(meta["stylesheet"]) + self._title = meta["title"] + self._description = meta["description"] + + def _make_mdd_index(self, db_name): + if os.path.exists(db_name): + os.remove(db_name) + mdd = MDD(self._mdd_file) + self._mdd_db = db_name + index_list = mdd.get_index(check_block=self._check) + conn = sqlite3.connect(db_name) + c = conn.cursor() + c.execute( + """ CREATE TABLE MDX_INDEX + (key_text text not null unique, + file_pos integer, + compressed_size integer, + decompressed_size integer, + record_block_type integer, + record_start integer, + record_end integer, + offset integer + )""" + ) + + tuple_list = [ + ( + item["key_text"], + item["file_pos"], + item["compressed_size"], + item["decompressed_size"], + item["record_block_type"], + item["record_start"], + item["record_end"], + item["offset"], + ) + for item in index_list + ] + c.executemany("INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)", tuple_list) + if self._sql_index: + c.execute( + """ + CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text) + """ + ) + + conn.commit() + conn.close() + + @staticmethod + def get_data_by_index(fmdx, index): + fmdx.seek(index["file_pos"]) + record_block_compressed = fmdx.read(index["compressed_size"]) + record_block_type = record_block_compressed[:4] + 
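+        # The 4-byte tag read above is only positional; the next line replaces
+        # it with the integer type cached in the index
+        # (0 = no compression, 1 = LZO, 2 = zlib).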
record_block_type = index["record_block_type"] + decompressed_size = index["decompressed_size"] + # adler32 = unpack('>I', record_block_compressed[4:8])[0] + if record_block_type == 0: + _record_block = record_block_compressed[8:] + # lzo compression + elif record_block_type == 1: + if lzo is None: + print("LZO compression is not supported") + # decompress + header = b"\xf0" + pack(">I", index["decompressed_size"]) + _record_block = lzo.decompress( + record_block_compressed[8:], + initSize=decompressed_size, + blockSize=1308672, + ) + # zlib compression + elif record_block_type == 2: + # decompress + _record_block = zlib.decompress(record_block_compressed[8:]) + data = _record_block[ + index["record_start"] + - index["offset"] : index["record_end"] + - index["offset"] + ] + return data + + def get_mdx_by_index(self, fmdx, index): + data = self.get_data_by_index(fmdx, index) + record = ( + data.decode(self._encoding, errors="ignore").strip("\x00").encode("utf-8") + ) + if self._stylesheet: + record = self._replace_stylesheet(record) + record = record.decode("utf-8") + return record + + def get_mdd_by_index(self, fmdx, index): + return self.get_data_by_index(fmdx, index) + + @staticmethod + def lookup_indexes(db, keyword, ignorecase=None): + indexes = [] + if ignorecase: + sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format( + keyword + ) + else: + sql = 'SELECT * FROM MDX_INDEX WHERE key_text = "{}"'.format(keyword) + with sqlite3.connect(db) as conn: + cursor = conn.execute(sql) + for result in cursor: + index = {} + index["file_pos"] = result[1] + index["compressed_size"] = result[2] + index["decompressed_size"] = result[3] + index["record_block_type"] = result[4] + index["record_start"] = result[5] + index["record_end"] = result[6] + index["offset"] = result[7] + indexes.append(index) + return indexes + + def mdx_lookup(self, keyword, ignorecase=None): + lookup_result_list = [] + indexes = self.lookup_indexes(self._mdx_db, keyword, ignorecase) + with open(self._mdx_file, "rb") as mdx_file: + for index in indexes: + lookup_result_list.append(self.get_mdx_by_index(mdx_file, index)) + return lookup_result_list + + def mdd_lookup(self, keyword, ignorecase=None): + lookup_result_list = [] + indexes = self.lookup_indexes(self._mdd_db, keyword, ignorecase) + with open(self._mdd_file, "rb") as mdd_file: + for index in indexes: + lookup_result_list.append(self.get_mdd_by_index(mdd_file, index)) + return lookup_result_list + + @staticmethod + def get_keys(db, query=""): + if not db: + return [] + if query: + if "*" in query: + query = query.replace("*", "%") + else: + query = query + "%" + sql = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE "' + query + '"' + else: + sql = "SELECT key_text FROM MDX_INDEX" + with sqlite3.connect(db) as conn: + cursor = conn.execute(sql) + keys = [item[0] for item in cursor] + return keys + + def get_mdd_keys(self, query=""): + return self.get_keys(self._mdd_db, query) + + def get_mdx_keys(self, query=""): + return self.get_keys(self._mdx_db, query) + + +from cishu.cishubase import cishubase +import re + + +class mdict(cishubase): + def init(self): + self.sql = None + + paths = self.config["path"] + self.builders = [] + for f in paths.split("|"): + if os.path.exists(f): + try: + self.builders.append((IndexBuilder(f), f)) + except: + pass + + def querycomplex(self, word, index): + #0 严格,1 前缀,2 后缀,3 中缀 + results = [] + results += index(word) + if self.config['ambiguity']==0: + results=results[:1] + if self.config['ambiguity']>=2: + for k 
in index("*" + word): + if k not in results: + results.append(k) + if self.config['ambiguity']>=3: + for k in index("*" + word + "*"): + if k not in results: + results.append(k) + return results + + def parse_strings(self, input_string): + parsed_strings = [] + current_string = "" + current_number = "" + + i = 0 + while i < len(input_string): + if input_string[i] == "`": + if current_number and current_string: + parsed_strings.append((current_number, current_string)) + current_number = "" + current_string = "" + i += 1 + elif input_string[i].isdigit(): + current_number += input_string[i] + i += 1 + else: + current_string += input_string[i] + i += 1 + + if current_number and current_string: + parsed_strings.append((current_number, current_string)) + + return parsed_strings + + def parseashtml(self, item): + item = item.replace("\r\n", "
") + items = self.parse_strings(item) + html = "" + for type_, string in items: + if type_ == "1": + html += f'{string}' + elif type_ == "2": + html += f'{string}' + elif type_ == "3": + html += ( + f'{string}' + ) + elif type_ == "4": + html += f"{string}" + elif type_ == "5": + html += f'{string}' + else: + print("unknown", item) + if string.endswith("
") == False: + html += "
" + return html + + def search(self, word): + results = [] + for index, f in self.builders: + if f.lower().endswith(".mdx"): + try: + keys = self.querycomplex(word, index.get_mdx_keys) + for k in keys: + results.append(self.parseashtml(index.mdx_lookup(k)[0])) + except: + pass + elif f.lower().endswith(".mdd"): + try: + keys = self.querycomplex(word, index.get_mdd_keys) + + for k in keys: + results.append(self.parseashtml(index.mdd_lookup(k)[0])) + except: + pass + if len(results) == 0: + return + style = """ + +""" + + return style + "".join(results) diff --git a/LunaTranslator/LunaTranslator/gui/settingpage_cishu.py b/LunaTranslator/LunaTranslator/gui/settingpage_cishu.py index d945c42d..86c9cfb4 100644 --- a/LunaTranslator/LunaTranslator/gui/settingpage_cishu.py +++ b/LunaTranslator/LunaTranslator/gui/settingpage_cishu.py @@ -27,6 +27,7 @@ def gethiragrid(self): continue if "args" in globalconfig["hirasetting"][name]: items = autoinitdialog_items(globalconfig["hirasetting"][name]) + items[-1]["callback"] = gobject.baseobject.starthira _3 = getcolorbutton( globalconfig, "", @@ -129,7 +130,9 @@ def setTabcishu_l(self): continue items = autoinitdialog_items(globalconfig["cishu"][cishu]) - + items[-1]["callback"] = functools.partial( + gobject.baseobject.startxiaoxueguan, cishu + ) line += [ (globalconfig["cishu"][cishu]["name"], 6), getsimpleswitch( diff --git a/LunaTranslator/files/defaultconfig/config.json b/LunaTranslator/files/defaultconfig/config.json index 7a7952b1..a0474016 100644 --- a/LunaTranslator/files/defaultconfig/config.json +++ b/LunaTranslator/files/defaultconfig/config.json @@ -909,6 +909,38 @@ "step": 1 } } + }, + "mdict": { + "use": false, + "name": "mdict", + "args": { + "path": "", + "ambiguity": 0, + "priority": 100 + }, + "argstype": { + "path": { + "type": "file", + "name": "路径", + "dir": false, + "multi": true, + "filter": "*.mdx|*.mdd" + }, + "ambiguity": { + "type": "intspin", + "name": "ambiguity", + "min": 0, + "max": 3, + "step": 1 + }, + "priority": { + "type": "intspin", + "name": "优先级", + "min": 0, + "max": 10000, + "step": 1 + } + } } }, "darklight": 0,