diff --git a/README.md b/README.md index 89bb227d..f2a96265 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,8 @@ * [uyjulian/AtlasTranslate](https://github.com/uyjulian/AtlasTranslate) +* [ilius/pyglossary](https://github.com/ilius/pyglossary) + diff --git a/cpp/LunaHook/LunaHook/engine64/vita3k.cpp b/cpp/LunaHook/LunaHook/engine64/vita3k.cpp index b04aada4..2538ed7c 100644 --- a/cpp/LunaHook/LunaHook/engine64/vita3k.cpp +++ b/cpp/LunaHook/LunaHook/engine64/vita3k.cpp @@ -592,6 +592,13 @@ namespace strReplace(ws, L"^", L""); buffer->from(WideStringToString(ws, 932)); } + void PCSG01151(TextBuffer *buffer, HookParam *hp) + { + auto ws = StringToWideString(buffer->viewA(), 932).value(); + strReplace(ws, L"^", L""); + strReplace(ws, L" ", L""); + buffer->from(WideStringToString(ws, 932)); + } void FPCSG01066(TextBuffer *buffer, HookParam *hp) { auto s = buffer->strA(); @@ -828,6 +835,8 @@ namespace // DRAMAtical Murder {0x8004630a, {0, 0, 0, 0, FPCSG00852, "PCSG00420"}}, {0x8003eed2, {0, 0, 0, 0, FPCSG00852, "PCSG00420"}}, + // GALTIA V Edition + {0x8001B7AA, {0, 0, 0, 0, PCSG01151, "PCSG01151"}}, }; return 1; diff --git a/cpp/version.cmake b/cpp/version.cmake index 221cb6d1..ff1a1034 100644 --- a/cpp/version.cmake +++ b/cpp/version.cmake @@ -1,7 +1,7 @@ set(VERSION_MAJOR 6) set(VERSION_MINOR 19) -set(VERSION_PATCH 3) +set(VERSION_PATCH 4) set(VERSION_REVISION 0) set(LUNA_VERSION "{${VERSION_MAJOR},${VERSION_MINOR},${VERSION_PATCH},${VERSION_REVISION}}") add_library(VERSION_DEF ${CMAKE_CURRENT_LIST_DIR}/version_def.cpp) diff --git a/py/LunaTranslator/cishu/jisho.py b/py/LunaTranslator/cishu/jisho.py index 10f46647..8b6c11e5 100644 --- a/py/LunaTranslator/cishu/jisho.py +++ b/py/LunaTranslator/cishu/jisho.py @@ -54,11 +54,12 @@ function onclickbtn_xxxxxx_internal(_id) { } .tab-widget_xxxxxx_internal .tab-button_xxxx_internal { - padding: 10px 20px; + padding: 7px 15px; background-color: #cccccccc; border: none; cursor: pointer; display: inline-block; + line-height: 
20px; } .tab-widget_xxxxxx_internal .tab-button_xxxx_internal.active { diff --git a/py/LunaTranslator/cishu/mdict.py b/py/LunaTranslator/cishu/mdict.py index 5979a8f7..5b587f5b 100644 --- a/py/LunaTranslator/cishu/mdict.py +++ b/py/LunaTranslator/cishu/mdict.py @@ -1,9 +1,11 @@ -import math, base64, uuid, gobject +import base64, uuid, gobject from cishu.cishubase import DictTree -from myutils.config import isascii, globalconfig +from myutils.config import isascii from traceback import print_exc from myutils.audioplayer import bass_code_cast -import json, os +import json, os, re +from cishu.mdict_.readmdict import MDX, MDD, MDict +import hashlib, sqlite3 cachejson = None @@ -23,1684 +25,6 @@ def query_mime(ext): return cachejson.get(ext, "application/octet-stream") -class FlexBuffer: - - def __init__(self): - - self.blockSize = None - self.c = None - self.l = None - self.buf = None - - def require(self, n): - - r = self.c - self.l + n - if r > 0: - self.l = self.l + self.blockSize * math.ceil(r / self.blockSize) - # tmp = bytearray(self.l) - # for i in len(self.buf): - # tmp[i] = self.buf[i] - # self.buf = tmp - self.buf = self.buf + bytearray(self.l - len(self.buf)) - self.c = self.c + n - return self.buf - - def alloc(self, initSize, blockSize): - - if blockSize: - sz = blockSize - else: - sz = 4096 - self.blockSize = self.roundUp(sz) - self.c = 0 - self.l = self.roundUp(initSize) | 0 - self.l += self.blockSize - (self.l % self.blockSize) - self.buf = bytearray(self.l) - return self.buf - - def roundUp(self, n): - - r = n % 4 - if r == 0: - return n - else: - return n + 4 - r - - def reset(self): - - self.c = 0 - self.l = len(self.buf) - - def pack(self, size): - - return self.buf[0:size] - - -def _decompress(inBuf, outBuf): - - c_top_loop = 1 - c_first_literal_run = 2 - c_match = 3 - c_copy_match = 4 - c_match_done = 5 - c_match_next = 6 - - out = outBuf.buf - op = 0 - ip = 0 - t = inBuf[ip] - state = c_top_loop - m_pos = 0 - ip_end = len(inBuf) - - if t > 17: - 
ip = ip + 1 - t = t - 17 - if t < 4: - state = c_match_next - else: - out = outBuf.require(t) - while True: - out[op] = inBuf[ip] - op = op + 1 - ip = ip + 1 - t = t - 1 - if not t > 0: - break - state = c_first_literal_run - - while True: - if_block = False - - ## - if state == c_top_loop: - t = inBuf[ip] - ip = ip + 1 - if t >= 16: - state = c_match - continue - if t == 0: - while inBuf[ip] == 0: - t = t + 255 - ip = ip + 1 - t = t + 15 + inBuf[ip] - ip = ip + 1 - - t = t + 3 - out = outBuf.require(t) - while True: - out[op] = inBuf[ip] - op = op + 1 - ip = ip + 1 - t = t - 1 - if not t > 0: - break - # emulate c switch - state = c_first_literal_run - - ## - if state == c_first_literal_run: - t = inBuf[ip] - ip = ip + 1 - if t >= 16: - state = c_match - continue - m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2) - ip = ip + 1 - out = outBuf.require(3) - out[op] = out[m_pos] - op = op + 1 - m_pos = m_pos + 1 - out[op] = out[m_pos] - op = op + 1 - m_pos = m_pos + 1 - out[op] = out[m_pos] - op = op + 1 - - state = c_match_done - continue - - ## - if state == c_match: - if t >= 64: - m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3) - ip = ip + 1 - t = (t >> 5) - 1 - state = c_copy_match - continue - elif t >= 32: - t = t & 31 - if t == 0: - while inBuf[ip] == 0: - t = t + 255 - ip = ip + 1 - t = t + 31 + inBuf[ip] - ip = ip + 1 - m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) - ip = ip + 2 - elif t >= 16: - m_pos = op - ((t & 8) << 11) - t = t & 7 - if t == 0: - while inBuf[ip] == 0: - t = t + 255 - ip = ip + 1 - t = t + 7 + inBuf[ip] - ip = ip + 1 - m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) - ip = ip + 2 - if m_pos == op: - break - m_pos = m_pos - 0x4000 - else: - m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2) - ip = ip + 1 - out = outBuf.require(2) - out[op] = out[m_pos] - op = op + 1 - m_pos = m_pos + 1 - out[op] = out[m_pos] - op = op + 1 - state = c_match_done - continue - - if t >= 6 and (op - m_pos) >= 4: - if_block = True - t += 2 
- out = outBuf.require(t) - while True: - out[op] = out[m_pos] - op += 1 - m_pos += 1 - t -= 1 - if not t > 0: - break - # emulate c switch - state = c_copy_match - - ## - if state == c_copy_match: - if not if_block: - t += 2 - out = outBuf.require(t) - while True: - out[op] = out[m_pos] - op += 1 - m_pos += 1 - t -= 1 - if not t > 0: - break - # emulating c switch - state = c_match_done - - ## - if state == c_match_done: - t = inBuf[ip - 2] & 3 - if t == 0: - state = c_top_loop - continue - # emulate c switch - state = c_match_next - - ## - if state == c_match_next: - out = outBuf.require(1) - out[op] = inBuf[ip] - op += 1 - ip += 1 - if t > 1: - out = outBuf.require(1) - out[op] = inBuf[ip] - op += 1 - ip += 1 - if t > 2: - out = outBuf.require(1) - out[op] = inBuf[ip] - op += 1 - ip += 1 - t = inBuf[ip] - ip += 1 - state = c_match - continue - - return bytes(outBuf.pack(op)) - - -class lzo: - - def decompress(input, initSize=16000, blockSize=8192): - output = FlexBuffer() - output.alloc(initSize, blockSize) - return _decompress(bytearray(input), output) - - -""" -Copyright by https://github.com/zhansliu/writemdict - -ripemd128.py - A simple ripemd128 library in pure Python. - -Supports both Python 2 (versions >= 2.6) and Python 3. 
- -Usage: - from ripemd128 import ripemd128 - digest = ripemd128(b"The quick brown fox jumps over the lazy dog") - assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96") - -""" - - -import struct - - -# follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt - - -def f(j, x, y, z): - assert 0 <= j and j < 64 - if j < 16: - return x ^ y ^ z - elif j < 32: - return (x & y) | (z & ~x) - elif j < 48: - return (x | (0xFFFFFFFF & ~y)) ^ z - else: - return (x & z) | (y & ~z) - - -def K(j): - assert 0 <= j and j < 64 - if j < 16: - return 0x00000000 - elif j < 32: - return 0x5A827999 - elif j < 48: - return 0x6ED9EBA1 - else: - return 0x8F1BBCDC - - -def Kp(j): - assert 0 <= j and j < 64 - if j < 16: - return 0x50A28BE6 - elif j < 32: - return 0x5C4DD124 - elif j < 48: - return 0x6D703EF3 - else: - return 0x00000000 - - -def padandsplit(message): - """ - returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges - from 0 to 16. - First pads the message to length in bytes is congruent to 56 (mod 64), - by first adding a byte 0x80, and then padding with 0x00 bytes until the - message length is congruent to 56 (mod 64). Then adds the little-endian - 64-bit representation of the original length. Finally, splits the result - up into 64-byte blocks, which are further parsed as 32-bit integers. - """ - origlen = len(message) - padlength = 64 - ((origlen - 56) % 64) # minimum padding is 1! 
- message += b"\x80" - message += b"\x00" * (padlength - 1) - message += struct.pack("> (32 - s)) & 0xFFFFFFFF - - -# fmt: off -r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8, 3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12, 1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2] -rp = [ 5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12, 6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2, 15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13, 8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14] -s = [ 11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8, 7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12, 11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5, 11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12] -sp = [ 8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6, 9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11, 9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5, 15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8] -# fmt: on - - -def ripemd128(message): - h0 = 0x67452301 - h1 = 0xEFCDAB89 - h2 = 0x98BADCFE - h3 = 0x10325476 - X = padandsplit(message) - for i in range(len(X)): - (A, B, C, D) = (h0, h1, h2, h3) - (Ap, Bp, Cp, Dp) = (h0, h1, h2, h3) - for j in range(64): - T = rol(s[j], add(A, f(j, B, C, D), X[i][r[j]], K(j))) - (A, D, C, B) = (D, C, B, T) - T = rol(sp[j], add(Ap, f(63 - j, Bp, Cp, Dp), X[i][rp[j]], Kp(j))) - (Ap, Dp, Cp, Bp) = (Dp, Cp, Bp, T) - T = add(h1, C, Dp) - h1 = add(h2, D, Ap) - h2 = add(h3, A, Bp) - h3 = add(h0, B, Cp) - h0 = T - - return struct.pack("= (2, 6) - -if sys.version_info >= (3,): - integer_types = (int,) - python3 = True -else: - integer_types = (int, long) - python3 = False - -from struct import Struct - -little_u64 = Struct("= 2**64" - ctx = self.ctx - ctx[8], ctx[9] = little2_i32.unpack(little_u64.pack(counter)) - - def getCounter(self): - return little_u64.unpack(little2_i32.pack(*self.ctx[8:10]))[0] - - def 
setRounds(self, rounds, testing=False): - assert testing or rounds in [8, 12, 20], "rounds must be 8, 12, 20" - self.rounds = rounds - - def encryptBytes(self, data): - assert type(data) == bytes, "data must be byte string" - assert self._lastChunk64, "previous chunk not multiple of 64 bytes" - lendata = len(data) - munged = bytearray(lendata) - for i in range(0, lendata, 64): - h = salsa20_wordtobyte(self.ctx, self.rounds, checkRounds=False) - self.setCounter((self.getCounter() + 1) % 2**64) - # Stopping at 2^70 bytes per nonce is user's responsibility. - for j in range(min(64, lendata - i)): - if python3: - munged[i + j] = data[i + j] ^ h[j] - else: - munged[i + j] = ord(data[i + j]) ^ ord(h[j]) - - self._lastChunk64 = not lendata % 64 - return bytes(munged) - - decryptBytes = encryptBytes # encrypt and decrypt use same function - - -# -------------------------------------------------------------------------- - - -def salsa20_wordtobyte(input, nRounds=20, checkRounds=True): - """Do nRounds Salsa20 rounds on a copy of - input: list or tuple of 16 ints treated as little-endian unsigneds. - Returns a 64-byte string. - """ - - assert type(input) in (list, tuple) and len(input) == 16 - assert not (checkRounds) or (nRounds in [8, 12, 20]) - - x = list(input) - - def XOR(a, b): - return a ^ b - - ROTATE = rot32 - PLUS = add32 - - for i in range(nRounds // 2): - # These ...XOR...ROTATE...PLUS... 
lines are from ecrypt-linux.c - # unchanged except for indents and the blank line between rounds: - x[4] = XOR(x[4], ROTATE(PLUS(x[0], x[12]), 7)) - x[8] = XOR(x[8], ROTATE(PLUS(x[4], x[0]), 9)) - x[12] = XOR(x[12], ROTATE(PLUS(x[8], x[4]), 13)) - x[0] = XOR(x[0], ROTATE(PLUS(x[12], x[8]), 18)) - x[9] = XOR(x[9], ROTATE(PLUS(x[5], x[1]), 7)) - x[13] = XOR(x[13], ROTATE(PLUS(x[9], x[5]), 9)) - x[1] = XOR(x[1], ROTATE(PLUS(x[13], x[9]), 13)) - x[5] = XOR(x[5], ROTATE(PLUS(x[1], x[13]), 18)) - x[14] = XOR(x[14], ROTATE(PLUS(x[10], x[6]), 7)) - x[2] = XOR(x[2], ROTATE(PLUS(x[14], x[10]), 9)) - x[6] = XOR(x[6], ROTATE(PLUS(x[2], x[14]), 13)) - x[10] = XOR(x[10], ROTATE(PLUS(x[6], x[2]), 18)) - x[3] = XOR(x[3], ROTATE(PLUS(x[15], x[11]), 7)) - x[7] = XOR(x[7], ROTATE(PLUS(x[3], x[15]), 9)) - x[11] = XOR(x[11], ROTATE(PLUS(x[7], x[3]), 13)) - x[15] = XOR(x[15], ROTATE(PLUS(x[11], x[7]), 18)) - - x[1] = XOR(x[1], ROTATE(PLUS(x[0], x[3]), 7)) - x[2] = XOR(x[2], ROTATE(PLUS(x[1], x[0]), 9)) - x[3] = XOR(x[3], ROTATE(PLUS(x[2], x[1]), 13)) - x[0] = XOR(x[0], ROTATE(PLUS(x[3], x[2]), 18)) - x[6] = XOR(x[6], ROTATE(PLUS(x[5], x[4]), 7)) - x[7] = XOR(x[7], ROTATE(PLUS(x[6], x[5]), 9)) - x[4] = XOR(x[4], ROTATE(PLUS(x[7], x[6]), 13)) - x[5] = XOR(x[5], ROTATE(PLUS(x[4], x[7]), 18)) - x[11] = XOR(x[11], ROTATE(PLUS(x[10], x[9]), 7)) - x[8] = XOR(x[8], ROTATE(PLUS(x[11], x[10]), 9)) - x[9] = XOR(x[9], ROTATE(PLUS(x[8], x[11]), 13)) - x[10] = XOR(x[10], ROTATE(PLUS(x[9], x[8]), 18)) - x[12] = XOR(x[12], ROTATE(PLUS(x[15], x[14]), 7)) - x[13] = XOR(x[13], ROTATE(PLUS(x[12], x[15]), 9)) - x[14] = XOR(x[14], ROTATE(PLUS(x[13], x[12]), 13)) - x[15] = XOR(x[15], ROTATE(PLUS(x[14], x[13]), 18)) - - for i in range(len(input)): - x[i] = PLUS(x[i], input[i]) - return little16_i32.pack(*x) - - -# --------------------------- 32-bit ops ------------------------------- - - -def trunc32(w): - """Return the bottom 32 bits of w as a Python int. 
- This creates longs temporarily, but returns an int.""" - w = int((w & 0x7FFFFFFF) | -(w & 0x80000000)) - assert type(w) == int - return w - - -def add32(a, b): - """Add two 32-bit words discarding carry above 32nd bit, - and without creating a Python long. - Timing shouldn't vary. - """ - lo = (a & 0xFFFF) + (b & 0xFFFF) - hi = (a >> 16) + (b >> 16) + (lo >> 16) - return (-(hi & 0x8000) | (hi & 0x7FFF)) << 16 | (lo & 0xFFFF) - - -def rot32(w, nLeft): - """Rotate 32-bit word left by nLeft or right by -nLeft - without creating a Python long. - Timing depends on nLeft but not on w. - """ - nLeft &= 31 # which makes nLeft >= 0 - if nLeft == 0: - return w - - # Note: now 1 <= nLeft <= 31. - # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, - # => sLLLLLLRRR and one s which becomes the sign bit. - RRR = ((w >> 1) & 0x7FFFFFFF) >> (31 - nLeft) - sLLLLLL = -((1 << (31 - nLeft)) & w) | (0x7FFFFFFF >> nLeft) & w - return RRR | (sLLLLLL << nLeft) - - -# --------------------------------- end ----------------------------------- - -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# readmdict.py -# Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser -# -# Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang -# -# This program is a free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, version 3 of the License. -# -# You can get a copy of GNU General Public License along this program -# But you can always get it from http://www.gnu.org/licenses/gpl.txt -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
- -from struct import pack, unpack -from io import BytesIO -import re -import sys -import json - - -# zlib compression is used for engine version >=2.0 -import zlib - -# LZO compression is used for engine version < 2.0 -# try: -# import lzo -# except ImportError: -# lzo = None -# print("LZO compression support is not available") - -# 2x3 compatible -if sys.hexversion >= 0x03000000: - unicode = str - - -def _unescape_entities(text): - """ - unescape offending tags < > " & - """ - text = text.replace(b"<", b"<") - text = text.replace(b">", b">") - text = text.replace(b""", b'"') - text = text.replace(b"&", b"&") - return text - - -def _fast_decrypt(data, key): - b = bytearray(data) - key = bytearray(key) - previous = 0x36 - for i in range(len(b)): - t = (b[i] >> 4 | b[i] << 4) & 0xFF - t = t ^ previous ^ (i & 0xFF) ^ key[i % len(key)] - previous = b[i] - b[i] = t - return bytes(b) - - -def _mdx_decrypt(comp_block): - key = ripemd128(comp_block[4:8] + pack(b" - """ - taglist = re.findall(rb'(\w+)="(.*?)"', header, re.DOTALL) - tagdict = {} - for key, value in taglist: - tagdict[key] = _unescape_entities(value) - return tagdict - - def _decode_key_block_info(self, key_block_info_compressed): - if self._version >= 2: - # zlib compression - assert key_block_info_compressed[:4] == b"\x02\x00\x00\x00" - # decrypt if needed - if self._encrypt & 0x02: - key_block_info_compressed = _mdx_decrypt(key_block_info_compressed) - # decompress - key_block_info = zlib.decompress(key_block_info_compressed[8:]) - # adler checksum - adler32 = unpack(">I", key_block_info_compressed[4:8])[0] - assert adler32 == zlib.adler32(key_block_info) & 0xFFFFFFFF - else: - # no compression - key_block_info = key_block_info_compressed - # decode - key_block_info_list = [] - num_entries = 0 - i = 0 - if self._version >= 2: - byte_format = ">H" - byte_width = 2 - text_term = 1 - else: - byte_format = ">B" - byte_width = 1 - text_term = 0 - - while i < len(key_block_info): - # number of entries in 
current key block - num_entries += unpack( - self._number_format, key_block_info[i : i + self._number_width] - )[0] - i += self._number_width - # text head size - text_head_size = unpack(byte_format, key_block_info[i : i + byte_width])[0] - i += byte_width - # text head - if self._encoding != "UTF-16": - i += text_head_size + text_term - else: - i += (text_head_size + text_term) * 2 - # text tail size - text_tail_size = unpack(byte_format, key_block_info[i : i + byte_width])[0] - i += byte_width - # text tail - if self._encoding != "UTF-16": - i += text_tail_size + text_term - else: - i += (text_tail_size + text_term) * 2 - # key block compressed size - key_block_compressed_size = unpack( - self._number_format, key_block_info[i : i + self._number_width] - )[0] - i += self._number_width - # key block decompressed size - key_block_decompressed_size = unpack( - self._number_format, key_block_info[i : i + self._number_width] - )[0] - i += self._number_width - key_block_info_list += [ - (key_block_compressed_size, key_block_decompressed_size) - ] - - assert num_entries == self._num_entries - - return key_block_info_list - - def _decode_key_block(self, key_block_compressed, key_block_info_list): - key_list = [] - i = 0 - for compressed_size, decompressed_size in key_block_info_list: - start = i - end = i + compressed_size - # 4 bytes : compression type - key_block_type = key_block_compressed[start : start + 4] - # 4 bytes : adler checksum of decompressed key block - adler32 = unpack(">I", key_block_compressed[start + 4 : start + 8])[0] - if key_block_type == b"\x00\x00\x00\x00": - key_block = key_block_compressed[start + 8 : end] - elif key_block_type == b"\x01\x00\x00\x00": - if lzo is None: - print("LZO compression is not supported") - break - # decompress key block - header = b"\xf0" + pack(">I", decompressed_size) - key_block = lzo.decompress( - key_block_compressed[start + 8 : end], - initSize=decompressed_size, - blockSize=1308672, - ) - elif key_block_type == 
b"\x02\x00\x00\x00": - # decompress key block - key_block = zlib.decompress(key_block_compressed[start + 8 : end]) - # extract one single key block into a key list - key_list += self._split_key_block(key_block) - # notice that adler32 returns signed value - assert adler32 == zlib.adler32(key_block) & 0xFFFFFFFF - - i += compressed_size - return key_list - - def _split_key_block(self, key_block): - key_list = [] - key_start_index = 0 - while key_start_index < len(key_block): - temp = key_block[key_start_index : key_start_index + self._number_width] - # the corresponding record's offset in record block - key_id = unpack( - self._number_format, - key_block[key_start_index : key_start_index + self._number_width], - )[0] - # key text ends with '\x00' - if self._encoding == "UTF-16": - delimiter = b"\x00\x00" - width = 2 - else: - delimiter = b"\x00" - width = 1 - i = key_start_index + self._number_width - while i < len(key_block): - if key_block[i : i + width] == delimiter: - key_end_index = i - break - i += width - key_text = ( - key_block[key_start_index + self._number_width : key_end_index] - .decode(self._encoding, errors="ignore") - .encode("utf-8") - .strip() - ) - key_start_index = key_end_index + width - key_list += [(key_id, key_text)] - return key_list - - def _read_header(self): - f = open(self._fname, "rb") - # number of bytes of header text - header_bytes_size = unpack(">I", f.read(4))[0] - header_bytes = f.read(header_bytes_size) - # 4 bytes: adler32 checksum of header, in little endian - adler32 = unpack("= 0x03000000: - encoding = encoding.decode("utf-8") - # GB18030 > GBK > GB2312 - if encoding in ["GBK", "GB2312"]: - encoding = "GB18030" - self._encoding = encoding - # 读取标题和描述 - if b"Title" in header_tag: - self._title = header_tag[b"Title"].decode("utf-8") - else: - self._title = "" - - if b"Description" in header_tag: - self._description = header_tag[b"Description"].decode("utf-8") - else: - self._description = "" - pass - # encryption flag - # 0x00 
- no encryption - # 0x01 - encrypt record block - # 0x02 - encrypt key info block - if b"Encrypted" not in header_tag or header_tag[b"Encrypted"] == b"No": - self._encrypt = 0 - elif header_tag[b"Encrypted"] == b"Yes": - self._encrypt = 1 - else: - self._encrypt = int(header_tag[b"Encrypted"]) - - # stylesheet attribute if present takes form of: - # style_number # 1-255 - # style_begin # or '' - # style_end # or '' - # store stylesheet in dict in the form of - # {'number' : ('style_begin', 'style_end')} - self._stylesheet = {} - if header_tag.get("StyleSheet"): - lines = header_tag["StyleSheet"].splitlines() - for i in range(0, len(lines), 3): - self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2]) - - # before version 2.0, number is 4 bytes integer - # version 2.0 and above uses 8 bytes - self._version = float(header_tag[b"GeneratedByEngineVersion"]) - if self._version < 2.0: - self._number_width = 4 - self._number_format = ">I" - else: - self._number_width = 8 - self._number_format = ">Q" - - return header_tag - - def _read_keys(self): - f = open(self._fname, "rb") - f.seek(self._key_block_offset) - - # the following numbers could be encrypted - if self._version >= 2.0: - num_bytes = 8 * 5 - else: - num_bytes = 4 * 4 - block = f.read(num_bytes) - - if self._encrypt & 1: - if self._passcode is None: - raise RuntimeError( - "user identification is needed to read encrypted file" - ) - regcode, userid = self._passcode - if isinstance(userid, unicode): - userid = userid.encode("utf8") - if self.header[b"RegisterBy"] == b"EMail": - encrypted_key = _decrypt_regcode_by_email(regcode, userid) - else: - encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid) - block = _salsa_decrypt(block, encrypted_key) - - # decode this block - sf = BytesIO(block) - # number of key blocks - num_key_blocks = self._read_number(sf) - # number of entries - self._num_entries = self._read_number(sf) - # number of bytes of key block info after decompression - if self._version >= 2.0: 
- key_block_info_decomp_size = self._read_number(sf) - # number of bytes of key block info - key_block_info_size = self._read_number(sf) - # number of bytes of key block - key_block_size = self._read_number(sf) - - # 4 bytes: adler checksum of previous 5 numbers - if self._version >= 2.0: - adler32 = unpack(">I", f.read(4))[0] - assert adler32 == (zlib.adler32(block) & 0xFFFFFFFF) - - # read key block info, which indicates key block's compressed and - # decompressed size - key_block_info = f.read(key_block_info_size) - key_block_info_list = self._decode_key_block_info(key_block_info) - assert num_key_blocks == len(key_block_info_list) - - # read key block - key_block_compressed = f.read(key_block_size) - # extract key block - key_list = self._decode_key_block(key_block_compressed, key_block_info_list) - - self._record_block_offset = f.tell() - f.close() - - return key_list - - def _read_keys_brutal(self): - f = open(self._fname, "rb") - f.seek(self._key_block_offset) - - # the following numbers could be encrypted, disregard them! 
- if self._version >= 2.0: - num_bytes = 8 * 5 + 4 - key_block_type = b"\x02\x00\x00\x00" - else: - num_bytes = 4 * 4 - key_block_type = b"\x01\x00\x00\x00" - block = f.read(num_bytes) - - # key block info - # 4 bytes '\x02\x00\x00\x00' - # 4 bytes adler32 checksum - # unknown number of bytes follows until '\x02\x00\x00\x00' which marks - # the beginning of key block - key_block_info = f.read(8) - if self._version >= 2.0: - assert key_block_info[:4] == b"\x02\x00\x00\x00" - while True: - fpos = f.tell() - t = f.read(1024) - index = t.find(key_block_type) - if index != -1: - key_block_info += t[:index] - f.seek(fpos + index) - break - else: - key_block_info += t - - key_block_info_list = self._decode_key_block_info(key_block_info) - key_block_size = sum(list(zip(*key_block_info_list))[0]) - - # read key block - key_block_compressed = f.read(key_block_size) - # extract key block - key_list = self._decode_key_block(key_block_compressed, key_block_info_list) - - self._record_block_offset = f.tell() - f.close() - - self._num_entries = len(key_list) - return key_list - - -class MDD(MDict): - """ - MDict resource file format (*.MDD) reader. - >>> mdd = MDD('example.mdd') - >>> len(mdd) - 208 - >>> for filename,content in mdd.items(): - ... 
print filename, content[:10] - """ - - def __init__(self, fname, passcode=None): - MDict.__init__(self, fname, encoding="UTF-16", passcode=passcode) - - def items(self): - """Return a generator which in turn produce tuples in the form of (filename, content)""" - return self._decode_record_block() - - def _decode_record_block(self): - f = open(self._fname, "rb") - f.seek(self._record_block_offset) - - num_record_blocks = self._read_number(f) - num_entries = self._read_number(f) - assert num_entries == self._num_entries - record_block_info_size = self._read_number(f) - record_block_size = self._read_number(f) - - # record block info section - record_block_info_list = [] - size_counter = 0 - for i in range(num_record_blocks): - compressed_size = self._read_number(f) - decompressed_size = self._read_number(f) - record_block_info_list += [(compressed_size, decompressed_size)] - size_counter += self._number_width * 2 - assert size_counter == record_block_info_size - - # actual record block - offset = 0 - i = 0 - size_counter = 0 - for compressed_size, decompressed_size in record_block_info_list: - record_block_compressed = f.read(compressed_size) - # 4 bytes: compression type - record_block_type = record_block_compressed[:4] - # 4 bytes: adler32 checksum of decompressed record block - adler32 = unpack(">I", record_block_compressed[4:8])[0] - if record_block_type == b"\x00\x00\x00\x00": - record_block = record_block_compressed[8:] - elif record_block_type == b"\x01\x00\x00\x00": - if lzo is None: - print("LZO compression is not supported") - break - # decompress - header = b"\xf0" + pack(">I", decompressed_size) - record_block = lzo.decompress( - record_block_compressed[8:], - initSize=decompressed_size, - blockSize=1308672, - ) - elif record_block_type == b"\x02\x00\x00\x00": - # decompress - record_block = zlib.decompress(record_block_compressed[8:]) - - # notice that adler32 return signed value - assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF - - assert 
len(record_block) == decompressed_size - # split record block according to the offset info from key block - while i < len(self._key_list): - record_start, key_text = self._key_list[i] - # reach the end of current record block - if record_start - offset >= len(record_block): - break - # record end index - if i < len(self._key_list) - 1: - record_end = self._key_list[i + 1][0] - else: - record_end = len(record_block) + offset - i += 1 - data = record_block[record_start - offset : record_end - offset] - yield key_text, data - offset += len(record_block) - size_counter += compressed_size - assert size_counter == record_block_size - - f.close() - - ### 获取 mdx 文件的索引列表,格式为 - ### key_text(关键词,可以由后面的 keylist 得到) - ### file_pos(record_block开始的位置) - ### compressed_size(record_block压缩前的大小) - ### decompressed_size(解压后的大小) - ### record_block_type(record_block 的压缩类型) - ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) - ### record_end - ### offset - - def get_index(self, check_block=True): - f = open(self._fname, "rb") - index_dict_list = [] - f.seek(self._record_block_offset) - - num_record_blocks = self._read_number(f) - num_entries = self._read_number(f) - assert num_entries == self._num_entries - record_block_info_size = self._read_number(f) - record_block_size = self._read_number(f) - - # record block info section - record_block_info_list = [] - size_counter = 0 - for i in range(num_record_blocks): - compressed_size = self._read_number(f) - decompressed_size = self._read_number(f) - record_block_info_list += [(compressed_size, decompressed_size)] - size_counter += self._number_width * 2 - # todo:注意!!! 
- assert size_counter == record_block_info_size - - # actual record block - offset = 0 - i = 0 - size_counter = 0 - for compressed_size, decompressed_size in record_block_info_list: - current_pos = f.tell() - record_block_compressed = f.read(compressed_size) - # 4 bytes: compression type - record_block_type = record_block_compressed[:4] - # 4 bytes: adler32 checksum of decompressed record block - adler32 = unpack(">I", record_block_compressed[4:8])[0] - if record_block_type == b"\x00\x00\x00\x00": - _type = 0 - if check_block: - record_block = record_block_compressed[8:] - elif record_block_type == b"\x01\x00\x00\x00": - _type = 1 - if lzo is None: - print("LZO compression is not supported") - break - # decompress - header = b"\xf0" + pack(">I", decompressed_size) - if check_block: - record_block = lzo.decompress( - record_block_compressed[8:], - initSize=decompressed_size, - blockSize=1308672, - ) - elif record_block_type == b"\x02\x00\x00\x00": - # decompress - _type = 2 - if check_block: - record_block = zlib.decompress(record_block_compressed[8:]) - - # notice that adler32 return signed value - if check_block: - assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF - assert len(record_block) == decompressed_size - # split record block according to the offset info from key block - while i < len(self._key_list): - ### 用来保存索引信息的空字典 - index_dict = {} - index_dict["file_pos"] = current_pos - index_dict["compressed_size"] = compressed_size - index_dict["decompressed_size"] = decompressed_size - index_dict["record_block_type"] = _type - record_start, key_text = self._key_list[i] - index_dict["record_start"] = record_start - index_dict["key_text"] = key_text.decode("utf-8") - index_dict["offset"] = offset - # reach the end of current record block - if record_start - offset >= decompressed_size: - break - # record end index - if i < len(self._key_list) - 1: - record_end = self._key_list[i + 1][0] - else: - record_end = decompressed_size + offset - 
index_dict["record_end"] = record_end - i += 1 - if check_block: - data = record_block[record_start - offset : record_end - offset] - index_dict_list.append(index_dict) - # yield key_text, data - offset += decompressed_size - size_counter += compressed_size - assert size_counter == record_block_size - f.close() - return index_dict_list - - -class MDX(MDict): - """ - MDict dictionary file format (*.MDD) reader. - >>> mdx = MDX('example.mdx') - >>> len(mdx) - 42481 - >>> for key,value in mdx.items(): - ... print key, value[:10] - """ - - def __init__(self, fname, encoding="", substyle=False, passcode=None): - MDict.__init__(self, fname, encoding, passcode) - self._substyle = substyle - - def items(self): - """Return a generator which in turn produce tuples in the form of (key, value)""" - return self._decode_record_block() - - def _substitute_stylesheet(self, txt): - # substitute stylesheet definition - txt_list = re.split(r"`\d+`", txt) - txt_tag = re.findall(r"`\d+`", txt) - txt_styled = txt_list[0] - for j, p in enumerate(txt_list[1:]): - style = self._stylesheet[txt_tag[j][1:-1]] - if p and p[-1] == "\n": - txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + "\r\n" - else: - txt_styled = txt_styled + style[0] + p + style[1] - return txt_styled - - def _decode_record_block(self): - f = open(self._fname, "rb") - f.seek(self._record_block_offset) - - num_record_blocks = self._read_number(f) - num_entries = self._read_number(f) - assert num_entries == self._num_entries - record_block_info_size = self._read_number(f) - record_block_size = self._read_number(f) - - # record block info section - record_block_info_list = [] - size_counter = 0 - for i in range(num_record_blocks): - compressed_size = self._read_number(f) - decompressed_size = self._read_number(f) - record_block_info_list += [(compressed_size, decompressed_size)] - size_counter += self._number_width * 2 - assert size_counter == record_block_info_size - - # actual record block data - offset = 0 - i = 
0 - size_counter = 0 - ###最后的索引表的格式为 - ### key_text(关键词,可以由后面的 keylist 得到) - ### file_pos(record_block开始的位置) - ### compressed_size(record_block压缩前的大小) - ### decompressed_size(解压后的大小) - ### record_block_type(record_block 的压缩类型) - ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) - ### record_end - ### offset - for compressed_size, decompressed_size in record_block_info_list: - record_block_compressed = f.read(compressed_size) - ###### 要得到 record_block_compressed 需要得到 compressed_size (这个可以直接记录) - ###### 另外还需要记录当前 f 对象的位置 - ###### 使用 f.tell() 命令/ 在建立索引是需要 f.seek() - # 4 bytes indicates block compression type - record_block_type = record_block_compressed[:4] - # 4 bytes adler checksum of uncompressed content - adler32 = unpack(">I", record_block_compressed[4:8])[0] - # no compression - if record_block_type == b"\x00\x00\x00\x00": - record_block = record_block_compressed[8:] - # lzo compression - elif record_block_type == b"\x01\x00\x00\x00": - if lzo is None: - print("LZO compression is not supported") - break - # decompress - header = b"\xf0" + pack(">I", decompressed_size) - record_block = lzo.decompress( - record_block_compressed[8:], - initSize=decompressed_size, - blockSize=1308672, - ) - # zlib compression - elif record_block_type == b"\x02\x00\x00\x00": - # decompress - record_block = zlib.decompress(record_block_compressed[8:]) - ###### 这里比较重要的是先要得到 record_block, 而 record_block 是解压得到的,其中一共有三种解压方法 - ###### 需要的信息有 record_block_compressed, decompress_size, - ###### record_block_type - ###### 另外还需要校验信息 adler32 - # notice that adler32 return signed value - assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF - - assert len(record_block) == decompressed_size - # split record block according to the offset info from key block - while i < len(self._key_list): - record_start, key_text = self._key_list[i] - # reach the end of current record block - if record_start - offset >= len(record_block): - break - # record end index - if i < len(self._key_list) - 1: 
- record_end = self._key_list[i + 1][0] - else: - record_end = len(record_block) + offset - i += 1 - #############需要得到 record_block , record_start, record_end, - #############offset - record = record_block[record_start - offset : record_end - offset] - # convert to utf-8 - record = ( - record.decode(self._encoding, errors="ignore") - .strip("\x00") - .encode("utf-8") - ) - # substitute styles - #############是否替换样式表 - if self._substyle and self._stylesheet: - record = self._substitute_stylesheet(record) - - yield key_text, record - offset += len(record_block) - size_counter += compressed_size - assert size_counter == record_block_size - - f.close() - - ### 获取 mdx 文件的索引列表,格式为 - ### key_text(关键词,可以由后面的 keylist 得到) - ### file_pos(record_block开始的位置) - ### compressed_size(record_block压缩前的大小) - ### decompressed_size(解压后的大小) - ### record_block_type(record_block 的压缩类型) - ### record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) - ### record_end - ### offset - ### 所需 metadata - ### - def get_index(self, check_block=True): - ### 索引列表 - index_dict_list = [] - f = open(self._fname, "rb") - f.seek(self._record_block_offset) - - num_record_blocks = self._read_number(f) - num_entries = self._read_number(f) - assert num_entries == self._num_entries - record_block_info_size = self._read_number(f) - record_block_size = self._read_number(f) - - # record block info section - record_block_info_list = [] - size_counter = 0 - for i in range(num_record_blocks): - compressed_size = self._read_number(f) - decompressed_size = self._read_number(f) - record_block_info_list += [(compressed_size, decompressed_size)] - size_counter += self._number_width * 2 - assert size_counter == record_block_info_size - - # actual record block data - offset = 0 - i = 0 - size_counter = 0 - ###最后的索引表的格式为 - ### key_text(关键词,可以由后面的 keylist 得到) - ### file_pos(record_block开始的位置) - ### compressed_size(record_block压缩前的大小) - ### decompressed_size(解压后的大小) - ### record_block_type(record_block 的压缩类型) - ### record_start 
(以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) - ### record_end - ### offset - for compressed_size, decompressed_size in record_block_info_list: - current_pos = f.tell() - record_block_compressed = f.read(compressed_size) - ###### 要得到 record_block_compressed 需要得到 compressed_size (这个可以直接记录) - ###### 另外还需要记录当前 f 对象的位置 - ###### 使用 f.tell() 命令/ 在建立索引是需要 f.seek() - # 4 bytes indicates block compression type - record_block_type = record_block_compressed[:4] - # 4 bytes adler checksum of uncompressed content - adler32 = unpack(">I", record_block_compressed[4:8])[0] - # no compression - if record_block_type == b"\x00\x00\x00\x00": - _type = 0 - record_block = record_block_compressed[8:] - # lzo compression - elif record_block_type == b"\x01\x00\x00\x00": - _type = 1 - if lzo is None: - print("LZO compression is not supported") - break - # decompress - header = b"\xf0" + pack(">I", decompressed_size) - if check_block: - record_block = lzo.decompress( - record_block_compressed[8:], - initSize=decompressed_size, - blockSize=1308672, - ) - # zlib compression - elif record_block_type == b"\x02\x00\x00\x00": - # decompress - _type = 2 - if check_block: - record_block = zlib.decompress(record_block_compressed[8:]) - ###### 这里比较重要的是先要得到 record_block, 而 record_block 是解压得到的,其中一共有三种解压方法 - ###### 需要的信息有 record_block_compressed, decompress_size, - ###### record_block_type - ###### 另外还需要校验信息 adler32 - # notice that adler32 return signed value - if check_block: - assert adler32 == zlib.adler32(record_block) & 0xFFFFFFFF - assert len(record_block) == decompressed_size - # split record block according to the offset info from key block - while i < len(self._key_list): - ### 用来保存索引信息的空字典 - index_dict = {} - index_dict["file_pos"] = current_pos - index_dict["compressed_size"] = compressed_size - index_dict["decompressed_size"] = decompressed_size - index_dict["record_block_type"] = _type - record_start, key_text = self._key_list[i] - index_dict["record_start"] = record_start - 
index_dict["key_text"] = key_text.decode("utf-8") - index_dict["offset"] = offset - # reach the end of current record block - if record_start - offset >= decompressed_size: - break - # record end index - if i < len(self._key_list) - 1: - record_end = self._key_list[i + 1][0] - else: - record_end = decompressed_size + offset - index_dict["record_end"] = record_end - i += 1 - #############需要得到 record_block , record_start, record_end, - #############offset - if check_block: - record = record_block[record_start - offset : record_end - offset] - # convert to utf-8 - record = ( - record.decode(self._encoding, errors="ignore") - .strip("\x00") - .encode("utf-8") - ) - # substitute styles - #############是否替换样式表 - if self._substyle and self._stylesheet: - record = self._substitute_stylesheet(record) - index_dict_list.append(index_dict) - - offset += decompressed_size - size_counter += compressed_size - # todo: 注意!!! - # assert(size_counter == record_block_size) - f.close - # 这里比 mdd 部分稍有不同,应该还需要传递编码以及样式表信息 - meta = {} - meta["encoding"] = self._encoding - meta["stylesheet"] = json.dumps(self._stylesheet) - meta["title"] = self._title - meta["description"] = self._description - - return {"index_dict_list": index_dict_list, "meta": meta} - - -from struct import pack, unpack -from io import BytesIO -import re -import sys -import os -import sqlite3 -import json - -# zlib compression is used for engine version >=2.0 -import zlib - -# LZO compression is used for engine version < 2.0 -# try: -# import lzo -# except ImportError: -# lzo = None -# print("LZO compression support is not available") - -# 2x3 compatible -if sys.hexversion >= 0x03000000: - unicode = str - -version = "1.1" - -import hashlib - - class IndexBuilder(object): # todo: enable history def checkinfo(self, fn): @@ -1724,59 +48,29 @@ class IndexBuilder(object): def __init__( self, fname, - encoding="", passcode=None, enable_history=False, sql_index=True, - check=False, ): + self._mdx_file = fname - self._mdd_files = 
[] + self._mdict_mdds = [] self._mdd_dbs = [] - self._encoding = "" - self._stylesheet = {} - self._title = "" - self._version = "" - self._description = "" self._sql_index = sql_index - self._check = check _filename, _file_extension = os.path.splitext(fname) assert _file_extension == ".mdx" assert os.path.isfile(fname) + self._mdict = MDX(fname, substyle=True) _mdxmd5 = ( os.path.basename(_filename) + "_" + hashlib.md5(_filename.encode("utf8")).hexdigest() ) _targetfilenamebase = gobject.getcachedir("mdict/index/" + _mdxmd5) - self._mdx_db = _targetfilenamebase + ".mdx.db" + self._mdx_db = _targetfilenamebase + ".mdx.v2.db" # make index anyway self._make_mdx_index_checked(self._mdx_db) - - if os.path.isfile(self._mdx_db): - # read from META table - conn = sqlite3.connect(self._mdx_db) - # cursor = conn.execute("SELECT * FROM META") - cursor = conn.execute('SELECT * FROM META WHERE key = "version"') - # 判断有无版本号 - for cc in cursor: - self._version = cc[1] - cursor = conn.execute('SELECT * FROM META WHERE key = "encoding"') - for cc in cursor: - self._encoding = cc[1] - cursor = conn.execute('SELECT * FROM META WHERE key = "stylesheet"') - for cc in cursor: - self._stylesheet = json.loads(cc[1]) - - cursor = conn.execute('SELECT * FROM META WHERE key = "title"') - for cc in cursor: - self._title = cc[1] - - cursor = conn.execute('SELECT * FROM META WHERE key = "description"') - for cc in cursor: - self._description = cc[1] - self.makemdds(_filename, _targetfilenamebase) def makemdds(self, _filename, _targetfilenamebase): @@ -1786,147 +80,57 @@ class IndexBuilder(object): i += 1 end = extra + ".mdd" if os.path.isfile(_filename + end): - self._mdd_files.append(_filename + end) - self._mdd_dbs.append(_targetfilenamebase + end + ".db") - self._make_mdd_index_checked(self._mdd_files[-1], self._mdd_dbs[-1]) + mdd = MDD(_filename + end) + self._mdict_mdds.append(mdd) + self._mdd_dbs.append(_targetfilenamebase + end + ".v2.db") + self._make_mdd_index_checked(mdd, 
self._mdd_dbs[-1]) else: break - def _replace_stylesheet(self, txt): - # substitute stylesheet definition - txt_list = re.split(r"`\d+`", txt) - txt_tag = re.findall(r"`\d+`", txt) - txt_styled = txt_list[0] - for j, p in enumerate(txt_list[1:]): - style = self._stylesheet[txt_tag[j][1:-1]] - if p and p[-1] == "\n": - txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + "\r\n" - else: - txt_styled = txt_styled + style[0] + p + style[1] - return txt_styled - - def _make_mdd_index_checked(self, mdd, db_name): - if self.checkneedupdate(mdd, db_name): - self._make_mdd_index(mdd, db_name) - self.checkneedupdateafter(mdd, db_name) + def _make_mdd_index_checked(self, mdd: MDD, db_name): + if self.checkneedupdate(mdd._fname, db_name): + self._make_mdict_index(mdd, db_name, False) + self.checkneedupdateafter(mdd._fname, db_name) def _make_mdx_index_checked(self, db_name): if self.checkneedupdate(self._mdx_file, db_name): - self._make_mdx_index(db_name) + self._make_mdict_index(self._mdict, db_name, True) self.checkneedupdateafter(self._mdx_file, db_name) - def _make_mdx_index(self, db_name): + def _make_mdict_index(self, mdd: MDict, db_name, ismdx): if os.path.exists(db_name): os.remove(db_name) - mdx = MDX(self._mdx_file) - self._mdx_db = db_name - returned_index = mdx.get_index(check_block=self._check) - index_list = returned_index["index_dict_list"] + mdd._key_list = mdd._read_keys() + index_list = list(mdd.items()) conn = sqlite3.connect(db_name) c = conn.cursor() c.execute( """ CREATE TABLE MDX_INDEX - (key_text text not null, + (key_text text not null{}, file_pos integer, compressed_size integer, decompressed_size integer, - record_block_type integer, record_start integer, record_end integer, offset integer - )""" + )""".format( + " unique" if (not ismdx) else "" + ) ) - tuple_list = [ ( item["key_text"], item["file_pos"], item["compressed_size"], item["decompressed_size"], - item["record_block_type"], item["record_start"], item["record_end"], item["offset"], 
) for item in index_list ] - c.executemany("INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)", tuple_list) - # build the metadata table - meta = returned_index["meta"] - c.execute( - """CREATE TABLE META - (key text, - value text - )""" - ) - - # for k,v in meta: - # c.execute( - # 'INSERT INTO META VALUES (?,?)', - # (k, v) - # ) - - c.executemany( - "INSERT INTO META VALUES (?,?)", - [ - ("encoding", meta["encoding"]), - ("stylesheet", meta["stylesheet"]), - ("title", meta["title"]), - ("description", meta["description"]), - ("version", version), - ], - ) - - if self._sql_index: - c.execute( - """ - CREATE INDEX key_index ON MDX_INDEX (key_text) - """ - ) - - conn.commit() - conn.close() - # set class member - self._encoding = meta["encoding"] - self._stylesheet = json.loads(meta["stylesheet"]) - self._title = meta["title"] - self._description = meta["description"] - - def _make_mdd_index(self, mdd_file, db_name): - if os.path.exists(db_name): - os.remove(db_name) - mdd = MDD(mdd_file) - index_list = mdd.get_index(check_block=self._check) - conn = sqlite3.connect(db_name) - c = conn.cursor() - c.execute( - """ CREATE TABLE MDX_INDEX - (key_text text not null unique, - file_pos integer, - compressed_size integer, - decompressed_size integer, - record_block_type integer, - record_start integer, - record_end integer, - offset integer - )""" - ) - - tuple_list = [ - ( - item["key_text"], - item["file_pos"], - item["compressed_size"], - item["decompressed_size"], - item["record_block_type"], - item["record_start"], - item["record_end"], - item["offset"], - ) - for item in index_list - ] - c.executemany("INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)", tuple_list) - if self._sql_index: + c.executemany("INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?)", tuple_list) + if self._sql_index and not ismdx: c.execute( """ CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text) @@ -1936,53 +140,12 @@ class IndexBuilder(object): conn.commit() conn.close() - @staticmethod - def 
get_data_by_index(fmdx, index): - fmdx.seek(index["file_pos"]) - record_block_compressed = fmdx.read(index["compressed_size"]) - record_block_type = record_block_compressed[:4] - record_block_type = index["record_block_type"] - decompressed_size = index["decompressed_size"] - # adler32 = unpack('>I', record_block_compressed[4:8])[0] - if record_block_type == 0: - _record_block = record_block_compressed[8:] - # lzo compression - elif record_block_type == 1: - if lzo is None: - print("LZO compression is not supported") - # decompress - header = b"\xf0" + pack(">I", index["decompressed_size"]) - _record_block = lzo.decompress( - record_block_compressed[8:], - initSize=decompressed_size, - blockSize=1308672, - ) - # zlib compression - elif record_block_type == 2: - # decompress - _record_block = zlib.decompress(record_block_compressed[8:]) - data = _record_block[ - index["record_start"] - - index["offset"] : index["record_end"] - - index["offset"] - ] - return data - - def get_mdx_by_index(self, fmdx, index): - data = self.get_data_by_index(fmdx, index) - record = ( - data.decode(self._encoding, errors="ignore").strip("\x00").encode("utf-8") - ) - if self._stylesheet: - record = self._replace_stylesheet(record) - record = record.decode("utf-8") + def get_mdx_by_index(self, index): + data = self._mdict.read_records(index) + record = self._mdict._treat_record_data(data) return record - def get_mdd_by_index(self, fmdx, index): - return self.get_data_by_index(fmdx, index) - - @staticmethod - def lookup_indexes(db, keyword, ignorecase=None): + def lookup_indexes(self, db, keyword, ignorecase=None): indexes = [] if ignorecase: sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format( @@ -1997,7 +160,6 @@ class IndexBuilder(object): index["file_pos"] = result[1] index["compressed_size"] = result[2] index["decompressed_size"] = result[3] - index["record_block_type"] = result[4] index["record_start"] = result[5] index["record_end"] = result[6] index["offset"] 
= result[7] @@ -2007,18 +169,16 @@ class IndexBuilder(object): def mdx_lookup(self, keyword, ignorecase=None): lookup_result_list = [] indexes = self.lookup_indexes(self._mdx_db, keyword, ignorecase) - with open(self._mdx_file, "rb") as mdx_file: - for index in indexes: - lookup_result_list.append(self.get_mdx_by_index(mdx_file, index)) + for index in indexes: + lookup_result_list.append(self.get_mdx_by_index(index)) return lookup_result_list def mdd_lookup(self, keyword, ignorecase=None): lookup_result_list = [] - for i in range(len(self._mdd_files)): + for i in range(len(self._mdict_mdds)): indexes = self.lookup_indexes(self._mdd_dbs[i], keyword, ignorecase) - with open(self._mdd_files[i], "rb") as mdd_file: - for index in indexes: - lookup_result_list.append(self.get_mdd_by_index(mdd_file, index)) + for index in indexes: + lookup_result_list.append(self._mdict_mdds[i].read_records(index)) return lookup_result_list @staticmethod @@ -2066,8 +226,8 @@ class mdict(cishubase): title = _["title"] if title is None: t: str = os.path.basename(f)[:-4] - if index._title.strip() != "": - t1 = index._title.strip() + if index._mdict._title != "": + t1 = index._mdict._title if (isascii(t1)) and (isascii(t)): t = t1 elif not isascii(t1): @@ -2161,114 +321,6 @@ class mdict(cishubase): diss[k] = dis return sorted(results, key=lambda x: diss[x])[: self.config["max_num"]] - def parse_strings(self, input_string): - parsed_strings = [] - current_string = "" - current_number = "" - isin = False - for c in input_string: - if c == "`": - - isin = not isin - if isin and len(current_number): - if len(current_string): - parsed_strings.append((int(current_number), current_string)) - current_number = "" - current_string = "" - else: - isin = not isin - elif c.isdigit() and isin: - current_number += c - else: - current_string += c - if current_string: - if current_number: - parsed_strings.append((int(current_number), current_string)) - else: - parsed_strings.append(((current_number), 
current_string)) - - return parsed_strings - - def parseashtml(self, item): - - items = self.parse_strings(item) - - html = "" - - for type_, string in items: - ishtml = False - if type_ == 1: - htmlitem = '{}'.format(string) - elif type_ == 3: - htmlitem = ( - '{}'.format( - string - ) - ) - elif type_ == 4: - htmlitem = "{}".format(string) - elif type_ == 5: - htmlitem = '{}'.format(string) - elif type_ == 6: - htmlitem = '{}'.format(string) - elif type_ == 7: - htmlitem = '{}'.format(string) - else: - if str(type_).startswith("2"): - num = str(type_)[1:] - if len(num): - num += " " - htmlitem = '{}{}'.format(num, string) - elif str(type_).startswith("8"): - num = str(type_)[1:] - if len(num): - num += " " - htmlitem = '{}{}'.format(num, string) - elif ( - str(type_).startswith("11") - or str(type_).startswith("9") - or str(type_).startswith("10") - or str(type_).startswith("12") - ): - if str(type_).startswith("11"): - offset = 2 - color = "9933FF" - elif str(type_).startswith("9"): - offset = 1 - color = "046AA4" - elif str(type_).startswith("10"): - offset = 2 - color = "006699" - elif str(type_).startswith("12"): - offset = 2 - color = "F80AB8" - num = str(type_)[offset:] - if len(num): - idx = -1 - for i in range(1, len(string)): - if string[i - 1] == " " and (not string[i].isalpha()): - idx = i - break - if idx != -1: - string = string[:idx] + num + string[idx:] - htmlitem = '{}{}'.format( - color, num, string - ) - else: - ishtml = True - htmlitem = string - # print(type_) - # html - # print(item) - - if not ishtml: - htmlitem = htmlitem.replace("\r\n", "
") - if not htmlitem.endswith("
"): - htmlitem += "
" - html += htmlitem - # print(html) - return html - def parse_url_in_mdd(self, index: IndexBuilder, url1: str): url1 = url1.replace("/", "\\") if not url1.startswith("\\"): @@ -2395,7 +447,7 @@ class mdict(cishubase): for content in sorted( set(self.searchthread_internal(index, k, __safe)) ): - results.append(self.parseashtml(content)) + results.append(content) except: print_exc() @@ -2470,12 +522,12 @@ function onclickbtn_mdict_internal(_id) { } .tab-widget_mdict_internal .tab-button_mdict_internal { - padding: 10px 20px; + padding: 7px 15px; background-color: #cccccccc; border: none; cursor: pointer; display: inline-block; - line-height: 25px; + line-height: 20px; } .tab-widget_mdict_internal .tab-button_mdict_internal.active { diff --git a/py/LunaTranslator/cishu/mdict_/__init__.py b/py/LunaTranslator/cishu/mdict_/__init__.py new file mode 100644 index 00000000..5f0a26a2 --- /dev/null +++ b/py/LunaTranslator/cishu/mdict_/__init__.py @@ -0,0 +1,2 @@ +from .readmdict import MDD, MDX +from . 
import lzo diff --git a/py/LunaTranslator/cishu/mdict_/lzo.py b/py/LunaTranslator/cishu/mdict_/lzo.py new file mode 100644 index 00000000..ec59e6b4 --- /dev/null +++ b/py/LunaTranslator/cishu/mdict_/lzo.py @@ -0,0 +1,242 @@ +import math + + +class FlexBuffer(): + def __init__(self): + self.blockSize = None + self.c = None + self.l = None + self.buf = None + + def require(self, n): + r = self.c - self.l + n + if r > 0: + self.l = self.l + self.blockSize * math.ceil(r / self.blockSize) + # tmp = bytearray(self.l) + # for i in len(self.buf): + # tmp[i] = self.buf[i] + # self.buf = tmp + self.buf = self.buf + bytearray(self.l - len(self.buf)) + self.c = self.c + n + return self.buf + + def alloc(self, initSize, blockSize): + if blockSize: + sz = blockSize + else: + sz = 4096 + self.blockSize = self.roundUp(sz) + self.c = 0 + self.l = self.roundUp(initSize) | 0 + self.l += self.blockSize - (self.l % self.blockSize) + self.buf = bytearray(self.l) + return self.buf + + def roundUp(self, n): + r = n % 4 + if r == 0: + return n + else: + return n + 4 - r + + def reset(self): + self.c = 0 + self.l = len(self.buf) + + def pack(self, size): + return self.buf[0:size] + + +def _decompress(inBuf, outBuf): + c_top_loop = 1 + c_first_literal_run = 2 + c_match = 3 + c_copy_match = 4 + c_match_done = 5 + c_match_next = 6 + + out = outBuf.buf + op = 0 + ip = 0 + t = inBuf[ip] + state = c_top_loop + m_pos = 0 + ip_end = len(inBuf) + + if t > 17: + ip = ip + 1 + t = t - 17 + if t < 4: + state = c_match_next + else: + out = outBuf.require(t) + while True: + out[op] = inBuf[ip] + op = op + 1 + ip = ip + 1 + t = t - 1 + if not t > 0: + break + state = c_first_literal_run + + while True: + if_block = False + + ## + if state == c_top_loop: + t = inBuf[ip] + ip = ip + 1 + if t >= 16: + state = c_match + continue + if t == 0: + while inBuf[ip] == 0: + t = t + 255 + ip = ip + 1 + t = t + 15 + inBuf[ip] + ip = ip + 1 + + t = t + 3 + out = outBuf.require(t) + while True: + out[op] = inBuf[ip] + 
op = op + 1 + ip = ip + 1 + t = t - 1 + if not t > 0: + break + # emulate c switch + state = c_first_literal_run + + ## + if state == c_first_literal_run: + t = inBuf[ip] + ip = ip + 1 + if t >= 16: + state = c_match + continue + m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2) + ip = ip + 1 + out = outBuf.require(3) + out[op] = out[m_pos] + op = op + 1 + m_pos = m_pos + 1 + out[op] = out[m_pos] + op = op + 1 + m_pos = m_pos + 1 + out[op] = out[m_pos] + op = op + 1 + + state = c_match_done + continue + + ## + if state == c_match: + if t >= 64: + m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3) + ip = ip + 1 + t = (t >> 5) - 1 + state = c_copy_match + continue + elif t >= 32: + t = t & 31 + if t == 0: + while inBuf[ip] == 0: + t = t + 255 + ip = ip + 1 + t = t + 31 + inBuf[ip] + ip = ip + 1 + m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) + ip = ip + 2 + elif t >= 16: + m_pos = op - ((t & 8) << 11) + t = t & 7 + if t == 0: + while inBuf[ip] == 0: + t = t + 255 + ip = ip + 1 + t = t + 7 + inBuf[ip] + ip = ip + 1 + m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) + ip = ip + 2 + if m_pos == op: + break + m_pos = m_pos - 0x4000 + else: + m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2) + ip = ip + 1 + out = outBuf.require(2) + out[op] = out[m_pos] + op = op + 1 + m_pos = m_pos + 1 + out[op] = out[m_pos] + op = op + 1 + state = c_match_done + continue + + if t >= 6 and (op - m_pos) >= 4: + if_block = True + t += 2 + out = outBuf.require(t) + while True: + out[op] = out[m_pos] + op += 1 + m_pos += 1 + t -= 1 + if not t > 0: + break + # emulate c switch + state = c_copy_match + + ## + if state == c_copy_match: + if not if_block: + t += 2 + out = outBuf.require(t) + while True: + out[op] = out[m_pos] + op += 1 + m_pos += 1 + t -= 1 + if not t > 0: + break + # emulating c switch + state = c_match_done + + ## + if state == c_match_done: + t = inBuf[ip - 2] & 3 + if t == 0: + state = c_top_loop + continue + # emulate c switch + state = c_match_next + + ## + 
if state == c_match_next: + out = outBuf.require(1) + out[op] = inBuf[ip] + op += 1 + ip += 1 + if t > 1: + out = outBuf.require(1) + out[op] = inBuf[ip] + op += 1 + ip += 1 + if t > 2: + out = outBuf.require(1) + out[op] = inBuf[ip] + op += 1 + ip += 1 + t = inBuf[ip] + ip += 1 + state = c_match + continue + + return bytes(outBuf.pack(op)) + + +def decompress(input, initSize=16000, blockSize=8192): + output = FlexBuffer() + output.alloc(initSize, blockSize) + return _decompress(bytearray(input), output) \ No newline at end of file diff --git a/py/LunaTranslator/cishu/mdict_/pureSalsa20.py b/py/LunaTranslator/cishu/mdict_/pureSalsa20.py new file mode 100644 index 00000000..dbec3af7 --- /dev/null +++ b/py/LunaTranslator/cishu/mdict_/pureSalsa20.py @@ -0,0 +1,363 @@ +# coding: utf-8 +# mypy: ignore-errors + +# Copyright (C) 2016-2023 Saeed Rasooli on https://github.com/ilius/pyglossary/ +# Copyright (C) 2015 Z. H. Liu on https://github.com/zhansliu/writemdict + +# pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, +# ported to Python 3 + +# v4.0: Added Python 3 support, dropped support for Python <= 2.5. + +# // zhansliu + +# Original comments below. + +# ==================================================================== +# There are comments here by two authors about three pieces of software: +# comments by Larry Bugbee about +# Salsa20, the stream cipher by Daniel J. Bernstein +# (including comments about the speed of the C version) and +# pySalsa20, Bugbee's own Python wrapper for salsa20.c +# (including some references), and +# comments by Steve Witham about +# pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20, +# which follows pySalsa20's API, and is in this file. 
+ +# Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee) +# ----------------------------------------------------------- + +# Salsa20 is a fast stream cipher written by Daniel Bernstein +# that basically uses a hash function and XOR making for fast +# encryption. (Decryption uses the same function.) Salsa20 +# is simple and quick. + +# Some Salsa20 parameter values... +# design strength 128 bits +# key length 128 or 256 bits, exactly +# IV, aka nonce 64 bits, always +# chunk size must be in multiples of 64 bytes + +# Salsa20 has two reduced versions, 8 and 12 rounds each. + +# One benchmark (10 MB): +# 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds +# AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds +# (no I/O and before Python GC kicks in) + +# Salsa20 is a Phase 3 finalist in the EU eSTREAM competition +# and appears to be one of the fastest ciphers. It is well +# documented so I will not attempt any injustice here. Please +# see "References" below. + +# ...and Salsa20 is "free for any use". + + +# pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee) +# ------------------------------------------------------------------ + +# pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is +# as it's name implies, 20 rounds, but there are two reduced +# versions, 8 and 12 rounds each. Because the APIs are +# identical, pySalsa20 is capable of wrapping all three +# versions (number of rounds hardcoded), including a special +# version that allows you to set the number of rounds with a +# set_rounds() function. Compile the version of your choice +# as a shared library (not as a Python extension), name and +# install it as libsalsa20.so. + +# Sample usage: +# from pySalsa20 import Salsa20 +# s20 = Salsa20(key, IV) +# dataout = s20.encryptBytes(datain) # same for decrypt + +# This is EXPERIMENTAL software and intended for educational +# purposes only. To make experimentation less cumbersome, +# pySalsa20 is also free for any use. 
+ +# THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF +# ANY KIND. USE AT YOUR OWN RISK. + +# Enjoy, + +# Larry Bugbee +# bugbee@seanet.com +# April 2007 + + +# References: +# ----------- +# http://en.wikipedia.org/wiki/Salsa20 +# http://en.wikipedia.org/wiki/Daniel_Bernstein +# http://cr.yp.to/djb.html +# http://www.ecrypt.eu.org/stream/salsa20p3.html +# http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip + + +# Prerequisites for pySalsa20: +# ---------------------------- +# - Python 2.5 (haven't tested in 2.4) + + +# pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham) +# ------------------------------------------------------------------ + +# pureSalsa20 is the stand-alone Python code in this file. +# It implements the underlying Salsa20 core algorithm +# and emulates pySalsa20's Salsa20 class API (minus a bug(*)). + +# pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20-- +# about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8, +# when encrypting 64k-byte blocks on my computer. + +# pureSalsa20 is for cases where portability is much more important than +# speed. I wrote it for use in a "structured" random number generator. + +# There are comments about the reasons for this slowness in +# http://www.tiac.net/~sw/2010/02/PureSalsa20 + +# Sample usage: +# from pureSalsa20 import Salsa20 +# s20 = Salsa20(key, IV) +# dataout = s20.encryptBytes(datain) # same for decrypt + +# I took the test code from pySalsa20, added a bunch of tests including +# rough speed tests, and moved them into the file testSalsa20.py. +# To test both pySalsa20 and pureSalsa20, type +# python testSalsa20.py + +# (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the +# libsalsa20.so library and not switched when switching between instances +# of the Salsa20 class. 
+# s1 = Salsa20( key, IV, 20 ) +# s2 = Salsa20( key, IV, 8 ) +# In this example, +# with pySalsa20, both s1 and s2 will do 8 rounds of encryption. +# with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds. +# Perhaps giving each instance its own nRounds variable, which +# is passed to the salsa20wordtobyte() function, is insecure. I'm not a +# cryptographer. + +# pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and +# intended for educational purposes only. To make experimentation less +# cumbersome, pureSalsa20.py and testSalsa20.py are free for any use. + +# Revisions: +# ---------- +# p3.2 Fixed bug that initialized the output buffer with plaintext! +# Saner ramping of nreps in speed test. +# Minor changes and print statements. +# p3.1 Took timing variability out of add32() and rot32(). +# Made the internals more like pySalsa20/libsalsa . +# Put the semicolons back in the main loop! +# In encryptBytes(), modify a byte array instead of appending. +# Fixed speed calculation bug. +# Used subclasses instead of patches in testSalsa20.py . +# Added 64k-byte messages to speed test to be fair to pySalsa20. +# p3 First version, intended to parallel pySalsa20 version 3. + +# More references: +# ---------------- +# http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20] +# http://cr.yp.to/snuffle.html [The original name of Salsa20] +# http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design] +# http://www.tiac.net/~sw/2010/02/PureSalsa20 + +# THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF +# ANY KIND. USE AT YOUR OWN RISK. 
+ +# Cheers, + +# Steve Witham sw at remove-this tiac dot net +# February, 2010 + + +import operator +from struct import Struct + +__all__ = ["Salsa20"] + +little_u64 = Struct(" None: + self._lastChunk64 = True + self._IVbitlen = 64 # must be 64 bits + self.ctx = [0] * 16 + if key: + self.setKey(key) + if IV: + self.setIV(IV) + + self.setRounds(rounds) + + def setKey(self, key): + assert isinstance(key, bytes) + ctx = self.ctx + if len(key) == 32: # recommended + constants = b"expand 32-byte k" + ctx[1], ctx[2], ctx[3], ctx[4] = little4_i32.unpack(key[0:16]) + ctx[11], ctx[12], ctx[13], ctx[14] = little4_i32.unpack(key[16:32]) + elif len(key) == 16: + constants = b"expand 16-byte k" + ctx[1], ctx[2], ctx[3], ctx[4] = little4_i32.unpack(key[0:16]) + ctx[11], ctx[12], ctx[13], ctx[14] = little4_i32.unpack(key[0:16]) + else: + raise ValueError("key length isn't 32 or 16 bytes.") + ctx[0], ctx[5], ctx[10], ctx[15] = little4_i32.unpack(constants) + + def setIV(self, IV): + assert isinstance(IV, bytes) + assert len(IV) * 8 == 64, "nonce (IV) not 64 bits" + self.IV = IV + ctx = self.ctx + ctx[6], ctx[7] = little2_i32.unpack(IV) + ctx[8], ctx[9] = 0, 0 # Reset the block counter. 
+ + setNonce = setIV # support an alternate name + + def setCounter(self, counter): + assert isinstance(counter, int) + assert 0 <= counter < 1 << 64, "counter < 0 or >= 2**64" + ctx = self.ctx + ctx[8], ctx[9] = little2_i32.unpack(little_u64.pack(counter)) + + def getCounter(self): + return little_u64.unpack(little2_i32.pack(*self.ctx[8:10]))[0] + + def setRounds(self, rounds, testing=False): + assert testing or rounds in {8, 12, 20}, "rounds must be 8, 12, 20" + self.rounds = rounds + + def encryptBytes(self, data: bytes) -> bytes: + assert isinstance(data, bytes), "data must be byte string" + assert self._lastChunk64, "previous chunk not multiple of 64 bytes" + lendata = len(data) + munged = bytearray(lendata) + for i in range(0, lendata, 64): + h = salsa20_wordtobyte(self.ctx, self.rounds, checkRounds=False) + self.setCounter((self.getCounter() + 1) % 2**64) + # Stopping at 2^70 bytes per nonce is user's responsibility. + for j in range(min(64, lendata - i)): + munged[i + j] = data[i + j] ^ h[j] + + self._lastChunk64 = not lendata % 64 + return bytes(munged) + + decryptBytes = encryptBytes # encrypt and decrypt use same function + + +# -------------------------------------------------------------------------- + + +def salsa20_wordtobyte(input_, nRounds=20, checkRounds=True): + """ + Do nRounds Salsa20 rounds on a copy of + input: list or tuple of 16 ints treated as little-endian unsigneds. + Returns a 64-byte string. + """ + assert isinstance(input_, list | tuple) and len(input_) == 16 + assert not checkRounds or nRounds in {8, 12, 20} + + x = list(input_) + + XOR = operator.xor + + ROTATE = rot32 + PLUS = add32 + + for _ in range(nRounds // 2): + # These ...XOR...ROTATE...PLUS... 
lines are from ecrypt-linux.c + # unchanged except for indents and the blank line between rounds: + x[4] = XOR(x[4], ROTATE(PLUS(x[0], x[12]), 7)) + x[8] = XOR(x[8], ROTATE(PLUS(x[4], x[0]), 9)) + x[12] = XOR(x[12], ROTATE(PLUS(x[8], x[4]), 13)) + x[0] = XOR(x[0], ROTATE(PLUS(x[12], x[8]), 18)) + x[9] = XOR(x[9], ROTATE(PLUS(x[5], x[1]), 7)) + x[13] = XOR(x[13], ROTATE(PLUS(x[9], x[5]), 9)) + x[1] = XOR(x[1], ROTATE(PLUS(x[13], x[9]), 13)) + x[5] = XOR(x[5], ROTATE(PLUS(x[1], x[13]), 18)) + x[14] = XOR(x[14], ROTATE(PLUS(x[10], x[6]), 7)) + x[2] = XOR(x[2], ROTATE(PLUS(x[14], x[10]), 9)) + x[6] = XOR(x[6], ROTATE(PLUS(x[2], x[14]), 13)) + x[10] = XOR(x[10], ROTATE(PLUS(x[6], x[2]), 18)) + x[3] = XOR(x[3], ROTATE(PLUS(x[15], x[11]), 7)) + x[7] = XOR(x[7], ROTATE(PLUS(x[3], x[15]), 9)) + x[11] = XOR(x[11], ROTATE(PLUS(x[7], x[3]), 13)) + x[15] = XOR(x[15], ROTATE(PLUS(x[11], x[7]), 18)) + + x[1] = XOR(x[1], ROTATE(PLUS(x[0], x[3]), 7)) + x[2] = XOR(x[2], ROTATE(PLUS(x[1], x[0]), 9)) + x[3] = XOR(x[3], ROTATE(PLUS(x[2], x[1]), 13)) + x[0] = XOR(x[0], ROTATE(PLUS(x[3], x[2]), 18)) + x[6] = XOR(x[6], ROTATE(PLUS(x[5], x[4]), 7)) + x[7] = XOR(x[7], ROTATE(PLUS(x[6], x[5]), 9)) + x[4] = XOR(x[4], ROTATE(PLUS(x[7], x[6]), 13)) + x[5] = XOR(x[5], ROTATE(PLUS(x[4], x[7]), 18)) + x[11] = XOR(x[11], ROTATE(PLUS(x[10], x[9]), 7)) + x[8] = XOR(x[8], ROTATE(PLUS(x[11], x[10]), 9)) + x[9] = XOR(x[9], ROTATE(PLUS(x[8], x[11]), 13)) + x[10] = XOR(x[10], ROTATE(PLUS(x[9], x[8]), 18)) + x[12] = XOR(x[12], ROTATE(PLUS(x[15], x[14]), 7)) + x[13] = XOR(x[13], ROTATE(PLUS(x[12], x[15]), 9)) + x[14] = XOR(x[14], ROTATE(PLUS(x[13], x[12]), 13)) + x[15] = XOR(x[15], ROTATE(PLUS(x[14], x[13]), 18)) + + for idx, item in enumerate(input_): + x[idx] = PLUS(x[idx], item) + return little16_i32.pack(*x) + + +# --------------------------- 32-bit ops ------------------------------- + + +def trunc32(w): + """ + Return the bottom 32 bits of w as a Python int. 
+ This creates longs temporarily, but returns an int. + """ + w = int((w & 0x7FFFFFFF) | -(w & 0x80000000)) + assert isinstance(w, int) + return w + + +def add32(a, b): + """ + Add two 32-bit words discarding carry above 32nd bit, + and without creating a Python long. + Timing shouldn't vary. + """ + lo = (a & 0xFFFF) + (b & 0xFFFF) + hi = (a >> 16) + (b >> 16) + (lo >> 16) + return (-(hi & 0x8000) | (hi & 0x7FFF)) << 16 | (lo & 0xFFFF) + + +def rot32(w, nLeft): + """ + Rotate 32-bit word left by nLeft or right by -nLeft + without creating a Python long. + Timing depends on nLeft but not on w. + """ + nLeft &= 31 # which makes nLeft >= 0 + if nLeft == 0: + return w + + # Note: now 1 <= nLeft <= 31. + # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, + # => sLLLLLLRRR and one s which becomes the sign bit. + RRR = ((w >> 1) & 0x7FFFFFFF) >> (31 - nLeft) + sLLLLLL = -((1 << (31 - nLeft)) & w) | (0x7FFFFFFF >> nLeft) & w + return RRR | (sLLLLLL << nLeft) + + +# --------------------------------- end ----------------------------------- \ No newline at end of file diff --git a/py/LunaTranslator/cishu/mdict_/readmdict.py b/py/LunaTranslator/cishu/mdict_/readmdict.py new file mode 100644 index 00000000..4bc89696 --- /dev/null +++ b/py/LunaTranslator/cishu/mdict_/readmdict.py @@ -0,0 +1,748 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +# +# readmdict.py from https://bitbucket.org/xwang/mdict-analysis +# Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser +# +# Copyright (C) 2016-2023 Saeed Rasooli on https://github.com/ilius/pyglossary/ +# Copyright (C) 2012, 2013, 2015, 2022 Xiaoqiang Wang +# +# This program is a free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, version 3 of the License. 
+#
+# You can get a copy of GNU General Public License along this program
+# But you can always get it from http://www.gnu.org/licenses/gpl.txt
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+import logging
+import re
+import sys
+
+# zlib compression is used for engine version >=2.0
+import zlib
+from io import BytesIO
+from struct import pack, unpack
+
+from .pureSalsa20 import Salsa20
+from .ripemd128 import ripemd128
+
+# LZO compression is used for engine version < 2.0
+# try:
+# import lzo
+# except ImportError:
+# lzo = None
+from . import lzo
+
+# xxhash is used for engine version >= 3.0
+try:
+ import xxhash
+except ImportError:
+ xxhash = None
+
+__all__ = ["MDD", "MDX"]
+
+log = logging.getLogger(__name__)
+
+
+def _unescape_entities(text):
+ """Unescape offending tags < > " &."""
+ text = text.replace(b"&lt;", b"<")
+ text = text.replace(b"&gt;", b">")
+ text = text.replace(b"&quot;", b'"')
+ text = text.replace(b"&amp;", b"&")
+ return text # noqa: RET504
+
+
+def _fast_decrypt(data, key):
+ """XOR decryption."""
+ b = bytearray(data)
+ key = bytearray(key)
+ previous = 0x36
+ for i, bi in enumerate(b):
+ t = (bi >> 4 | bi << 4) & 0xFF
+ t = t ^ previous ^ (i & 0xFF) ^ key[i % len(key)]
+ previous = bi
+ b[i] = t
+ return bytes(b)
+
+
+def _salsa_decrypt(ciphertext, encrypt_key):
+ """salsa20 (8 rounds) decryption."""
+ s20 = Salsa20(key=encrypt_key, IV=b"\x00" * 8, rounds=8)
+ return s20.encryptBytes(ciphertext)
+
+
+def _decrypt_regcode_by_userid(reg_code: bytes, userid: bytes) -> bytes:
+ userid_digest = ripemd128(userid)
+ s20 = Salsa20(key=userid_digest, IV=b"\x00" * 8, rounds=8)
+ return s20.encryptBytes(reg_code)
+
+
+class MDict:
+ """
+ Base class which reads in header and key block.
+ It has no public methods and serves only as code sharing base class.
+ """ + + def __init__( + self, + fname: str, + encoding: str = "", + passcode: "tuple[bytes, bytes] | None" = None, + ) -> None: + self._fname = fname + self._encoding = encoding.upper() + self._encrypted_key = None + self._passcode = passcode + + self.header = self._read_header() + + # decrypt regcode to get the encrypted key + if passcode is not None: + regcode, userid = passcode + if isinstance(userid, str): + userid = userid.encode("utf8") + self._encrypted_key = _decrypt_regcode_by_userid(regcode, userid) + # MDict 3.0 encryption key derives from UUID if present + elif self._version >= 3.0: + uuid = self.header.get(b"UUID") + if uuid: + if xxhash is None: + raise RuntimeError( + "xxhash module is needed to read MDict 3.0 format" + "\n" + "Run `pip3 install xxhash` to install", + ) + mid = (len(uuid) + 1) // 2 + self._encrypted_key = xxhash.xxh64_digest( + uuid[:mid], + ) + xxhash.xxh64_digest(uuid[mid:]) + + #self._key_list = self._read_keys() + + def __repr__(self): + return ( + f"MDict({self._fname!r}, " + f"encoding={self._encoding!r}, " + f"passcode={self._passcode})" + ) + + @property + def filename(self): + return self._fname + + def __len__(self): + return self._num_entries + + def __iter__(self): + return self.keys() + + def keys(self): + """Return an iterator over dictionary keys.""" + return (key_value for key_id, key_value in self._key_list) + + def _read_number(self, f): + return unpack(self._number_format, f.read(self._number_width))[0] + + @staticmethod + def _read_int32(f): + return unpack(">I", f.read(4))[0] + + @staticmethod + def _parse_header(header): + """Extract attributes from .""" + return { + key: _unescape_entities(value) + for key, value in re.findall(rb'(\w+)="(.*?)"', header, re.DOTALL) + } + + def _decode_block(self, block, decompressed_size): + # block info: compression, encryption + info = unpack("> 4) & 0xF + encryption_size = (info >> 8) & 0xFF + + # adler checksum of the block data used as the encryption key if none given + 
adler32 = unpack(">I", block[4:8])[0] + encrypted_key = self._encrypted_key + if encrypted_key is None: + encrypted_key = ripemd128(block[4:8]) + + # block data + data = block[8:] + + # decrypt + if encryption_method == 0: + decrypted_block = data + elif encryption_method == 1: + decrypted_block = ( + _fast_decrypt(data[:encryption_size], encrypted_key) + + data[encryption_size:] + ) + elif encryption_method == 2: + decrypted_block = ( + _salsa_decrypt(data[:encryption_size], encrypted_key) + + data[encryption_size:] + ) + else: + raise ValueError(f"encryption method {encryption_method} not supported") + + # check adler checksum over decrypted data + if self._version >= 3: + assert hex(adler32) == hex(zlib.adler32(decrypted_block) & 0xFFFFFFFF) + + # decompress + if compression_method == 0: + decompressed_block = decrypted_block + elif compression_method == 1: + header = b"\xf0" + pack(">I", decompressed_size) + decompressed_block = lzo.decompress(header + decrypted_block) + elif compression_method == 2: + decompressed_block = zlib.decompress(decrypted_block) + else: + raise ValueError(f"compression method {compression_method} not supported") + + # check adler checksum over decompressed data + if self._version < 3: + assert hex(adler32) == hex(zlib.adler32(decompressed_block) & 0xFFFFFFFF) + + return decompressed_block + + def _decode_key_block_info(self, key_block_info_compressed): + if self._version >= 2: + # zlib compression + assert key_block_info_compressed[:4] == b"\x02\x00\x00\x00" + # decrypt if needed + if self._encrypt & 0x02: + key = ripemd128(key_block_info_compressed[4:8] + pack(b"I", key_block_info_compressed[4:8])[0] + assert adler32 == zlib.adler32(key_block_info) & 0xFFFFFFFF + else: + # no compression + key_block_info = key_block_info_compressed + # decode + key_block_info_list = [] + num_entries = 0 + i = 0 + if self._version >= 2: + byte_format = ">H" + byte_width = 2 + text_term = 1 + else: + byte_format = ">B" + byte_width = 1 + text_term = 0 
+ + while i < len(key_block_info): + # number of entries in current key block + num_entries += unpack( + self._number_format, + key_block_info[i : i + self._number_width], + )[0] + i += self._number_width + # text head size + text_head_size = unpack(byte_format, key_block_info[i : i + byte_width])[0] + i += byte_width + # text head + if self._encoding != "UTF-16": + i += text_head_size + text_term + else: + i += (text_head_size + text_term) * 2 + # text tail size + text_tail_size = unpack(byte_format, key_block_info[i : i + byte_width])[0] + i += byte_width + # text tail + if self._encoding != "UTF-16": + i += text_tail_size + text_term + else: + i += (text_tail_size + text_term) * 2 + # key block compressed size + key_block_compressed_size = unpack( + self._number_format, + key_block_info[i : i + self._number_width], + )[0] + i += self._number_width + # key block decompressed size + key_block_decompressed_size = unpack( + self._number_format, + key_block_info[i : i + self._number_width], + )[0] + i += self._number_width + key_block_info_list.append( + (key_block_compressed_size, key_block_decompressed_size), + ) + + # assert num_entries == self._num_entries + + return key_block_info_list + + def _decode_key_block(self, key_block_compressed, key_block_info_list): + key_list = [] + i = 0 + for compressed_size, decompressed_size in key_block_info_list: + key_block = self._decode_block( + key_block_compressed[i : i + compressed_size], + decompressed_size, + ) + # extract one single key block into a key list + key_list += self._split_key_block(key_block) + i += compressed_size + return key_list + + def _split_key_block(self, key_block): + key_list = [] + key_start_index = 0 + while key_start_index < len(key_block): + # the corresponding record's offset in record block + key_id = unpack( + self._number_format, + key_block[key_start_index : key_start_index + self._number_width], + )[0] + # key text ends with '\x00' + if self._encoding == "UTF-16": + delimiter = 
b"\x00\x00" + width = 2 + else: + delimiter = b"\x00" + width = 1 + i = key_start_index + self._number_width + key_end_index = None + while i < len(key_block): + if key_block[i : i + width] == delimiter: + key_end_index = i + break + i += width + assert key_end_index is not None + key_text = ( + key_block[key_start_index + self._number_width : key_end_index] + .decode(self._encoding, errors="ignore") + .strip() + ) + key_start_index = key_end_index + width + key_list += [(key_id, key_text)] + return key_list + + def _read_header(self): + f = open(self._fname, "rb") + # number of bytes of header text + header_bytes_size = unpack(">I", f.read(4))[0] + header_bytes = f.read(header_bytes_size) + # 4 bytes: adler32 checksum of header, in little endian + adler32 = unpack("= 0x03000000: + encoding = encoding.decode("utf-8") + # GB18030 > GBK > GB2312 + if encoding in {"GBK", "GB2312"}: + encoding = "GB18030" + self._encoding = encoding + + # encryption flag + # 0x00 - no encryption, "Allow export to text" is checked in MdxBuilder 3. + # 0x01 - encrypt record block, "Encryption Key" is given in MdxBuilder 3. + # 0x02 - encrypt key info block, + # "Allow export to text" is unchecked in MdxBuilder 3. 
+ if b"Encrypted" not in header_tag or header_tag[b"Encrypted"] == b"No": + self._encrypt = 0 + elif header_tag[b"Encrypted"] == b"Yes": + self._encrypt = 1 + else: + self._encrypt = int(header_tag[b"Encrypted"]) + + # stylesheet attribute if present takes form of: + # style_number # 1-255 + # style_begin # or '' + # style_end # or '' + # store stylesheet in dict in the form of + # {'number' : ('style_begin', 'style_end')} + self._stylesheet = {} + if header_tag.get(b"StyleSheet"): + lines = header_tag[b"StyleSheet"].decode('utf8',errors='ignore').splitlines() + self._stylesheet = { + lines[i]: (lines[i + 1], lines[i + 2]) for i in range(0, len(lines), 3) + } + if b"Title" in header_tag: + self._title = header_tag[b"Title"].decode("utf-8",errors='ignore') + else: + self._title = "" + # before version 2.0, number is 4 bytes integer + # version 2.0 and above uses 8 bytes + self._version = float(header_tag[b"GeneratedByEngineVersion"]) + if self._version < 2.0: + self._number_width = 4 + self._number_format = ">I" + else: + self._number_width = 8 + self._number_format = ">Q" + # version 3.0 uses UTF-8 only + if self._version >= 3: + self._encoding = "UTF-8" + + return header_tag + + def _read_keys(self): + if self._version >= 3: + return self._read_keys_v3() + + # if no regcode is given, try brute-force (only for engine <= 2) + if (self._encrypt & 0x01) and self._encrypted_key is None: + log.warning("Trying brute-force on encrypted key blocks") + return self._read_keys_brutal() + + return self._read_keys_v1v2() + + def _read_keys_v3(self): + f = open(self._fname, "rb") + f.seek(self._key_block_offset) + + # find all blocks offset + while True: + block_type = self._read_int32(f) + block_size = self._read_number(f) + block_offset = f.tell() + # record data + if block_type == 0x01000000: + self._record_block_offset = block_offset + # record index + elif block_type == 0x02000000: + self._record_index_offset = block_offset + # key data + elif block_type == 0x03000000: + 
self._key_data_offset = block_offset + # key index + elif block_type == 0x04000000: + self._key_index_offset = block_offset + else: + raise RuntimeError(f"Unknown block type {block_type}") + f.seek(block_size, 1) + # test the end of file + if f.read(4): + f.seek(-4, 1) + else: + break + + # read key data + f.seek(self._key_data_offset) + number = self._read_int32(f) + self._read_number(f) # total_size + key_list = [] + for _ in range(number): + decompressed_size = self._read_int32(f) + compressed_size = self._read_int32(f) + block_data = f.read(compressed_size) + decompressed_block_data = self._decode_block(block_data, decompressed_size) + key_list.extend(self._split_key_block(decompressed_block_data)) + + f.close() + self._num_entries = len(key_list) + return key_list + + def _read_keys_v1v2(self): + f = open(self._fname, "rb") + f.seek(self._key_block_offset) + + # the following numbers could be encrypted + num_bytes = 8 * 5 if self._version >= 2.0 else 4 * 4 + block = f.read(num_bytes) + + if self._encrypt & 1: + block = _salsa_decrypt(block, self._encrypted_key) + + # decode this block + sf = BytesIO(block) + # number of key blocks + num_key_blocks = self._read_number(sf) + # number of entries + self._num_entries = self._read_number(sf) + # number of bytes of key block info after decompression + if self._version >= 2.0: + self._read_number(sf) # key_block_info_decomp_size + # number of bytes of key block info + key_block_info_size = self._read_number(sf) + # number of bytes of key block + key_block_size = self._read_number(sf) + + # 4 bytes: adler checksum of previous 5 numbers + if self._version >= 2.0: + adler32 = unpack(">I", f.read(4))[0] + assert adler32 == (zlib.adler32(block) & 0xFFFFFFFF) + + # read key block info, which indicates key block's compressed + # and decompressed size + key_block_info = f.read(key_block_info_size) + key_block_info_list = self._decode_key_block_info(key_block_info) + assert num_key_blocks == len(key_block_info_list) + + # read 
key block + key_block_compressed = f.read(key_block_size) + # extract key block + key_list = self._decode_key_block(key_block_compressed, key_block_info_list) + + self._record_block_offset = f.tell() + f.close() + + return key_list + + def _read_keys_brutal(self): + f = open(self._fname, "rb") + f.seek(self._key_block_offset) + + # the following numbers could be encrypted, disregard them! + if self._version >= 2.0: + num_bytes = 8 * 5 + 4 + key_block_type = b"\x02\x00\x00\x00" + else: + num_bytes = 4 * 4 + key_block_type = b"\x01\x00\x00\x00" + + f.read(num_bytes) # block + + # key block info + # 4 bytes '\x02\x00\x00\x00' + # 4 bytes adler32 checksum + # unknown number of bytes follows until '\x02\x00\x00\x00' + # which marks the beginning of key block + key_block_info = f.read(8) + if self._version >= 2.0: + assert key_block_info[:4] == b"\x02\x00\x00\x00" + while True: + fpos = f.tell() + t = f.read(1024) + index = t.find(key_block_type) + if index != -1: + key_block_info += t[:index] + f.seek(fpos + index) + break + key_block_info += t + + key_block_info_list = self._decode_key_block_info(key_block_info) + key_block_size = sum(list(zip(*key_block_info_list, strict=False))[0]) + + # read key block + key_block_compressed = f.read(key_block_size) + # extract key block + key_list = self._decode_key_block(key_block_compressed, key_block_info_list) + + self._record_block_offset = f.tell() + f.close() + + self._num_entries = len(key_list) + return key_list + + def items(self): + """ + Return a generator which in turn produce tuples in the + form of (filename, content). 
+ """ + return self._read_records() + + def _read_records(self): + if self._version >= 3: + yield from self._read_records_v3() + else: + yield from self._read_records_v1v2() + + def _read_records_v3(self): + f = open(self._fname, "rb") + f.seek(self._record_block_offset) + + offset = 0 + i = 0 + size_counter = 0 + num_record_blocks = self._read_int32(f) + self._read_number(f) # num_bytes + for _ in range(num_record_blocks): + file_pos=f.tell() + decompressed_size = self._read_int32(f) + compressed_size = self._read_int32(f) + record_block = self._decode_block( + f.read(compressed_size), + decompressed_size, + ) + + # split record block according to the offset info from key block + while i < len(self._key_list): + record_start, key_text = self._key_list[i] + # reach the end of current record block + if record_start - offset >= len(record_block): + break + # record end index + if i < len(self._key_list) - 1: + record_end = self._key_list[i + 1][0] + else: + record_end = len(record_block) + offset + i += 1 + yield dict(key_text= key_text, file_pos= file_pos, decompressed_size= len(record_block),record_start=record_start, offset=offset, record_end=record_end,compressed_size=compressed_size) + offset += len(record_block) + size_counter += compressed_size + + def _read_records_v1v2(self): + f = open(self._fname, "rb") + f.seek(self._record_block_offset) + + num_record_blocks = self._read_number(f) + num_entries = self._read_number(f) + assert num_entries == self._num_entries + record_block_info_size = self._read_number(f) + self._read_number(f) # record_block_size + + # record block info section + record_block_info_list = [] + size_counter = 0 + for _ in range(num_record_blocks): + compressed_size = self._read_number(f) + decompressed_size = self._read_number(f) + record_block_info_list += [(compressed_size, decompressed_size)] + size_counter += self._number_width * 2 + assert size_counter == record_block_info_size + + # actual record block + offset = 0 + i = 0 + 
size_counter = 0 + for compressed_size, decompressed_size in record_block_info_list: + file_pos=f.tell() + record_block_compressed = f.read(compressed_size) + try: + record_block = self._decode_block( + record_block_compressed, + decompressed_size, + ) + except zlib.error: + log.error("zlib decompress error") + log.debug(f"record_block_compressed = {record_block_compressed!r}") + continue + # split record block according to the offset info from key block + while i < len(self._key_list): + record_start, key_text = self._key_list[i] + # reach the end of current record block + if record_start - offset >= len(record_block): + break + # record end index + if i < len(self._key_list) - 1: + record_end = self._key_list[i + 1][0] + else: + record_end = len(record_block) + offset + i += 1 + yield dict(key_text= key_text, file_pos= file_pos, decompressed_size= len(record_block),record_start=record_start, offset=offset, record_end=record_end,compressed_size=compressed_size) + offset += len(record_block) + size_counter += compressed_size + # assert size_counter == record_block_size + + f.close() + + def read_records(self, index): + f = open(self._fname, "rb") + f.seek(index["file_pos"]) + record_block_compressed = f.read(index["compressed_size"]) + f.close() + decompressed_size = index["decompressed_size"] + + try: + record_block = self._decode_block( + record_block_compressed, + decompressed_size, + ) + except zlib.error: + log.error("zlib decompress error") + log.debug(f"record_block_compressed = {record_block_compressed!r}") + + # split record block according to the offset info from key block + data = record_block[ + index["record_start"] + - index["offset"] : index["record_end"] + - index["offset"] + ] + return data + # assert size_counter == record_block_size + +class MDD(MDict): + """ + MDict resource file format (*.MDD) reader. + >>> mdd = MDD("example.mdd") + >>> len(mdd) + 208 + >>> for filename,content in mdd.items(): + ... 
print(filename, content[:10]) + """ + + def __init__( + self, + fname: str, + passcode: "tuple[bytes, bytes] | None" = None, + ) -> None: + MDict.__init__(self, fname, encoding="UTF-16", passcode=passcode) + + +class MDX(MDict): + """ + MDict dictionary file format (*.MDD) reader. + >>> mdx = MDX("example.mdx") + >>> len(mdx) + 42481 + >>> for key,value in mdx.items(): + ... print(key, value[:10]) + """ + + def __init__( + self, + fname: str, + encoding: str = "", + substyle: bool = False, + passcode: "tuple[bytes, bytes] | None" = None, + ) -> None: + MDict.__init__(self, fname, encoding, passcode) + self._substyle = substyle + + def _substitute_stylesheet(self, txt): + # substitute stylesheet definition + txt_list = re.split(r"`\d+`", txt) + txt_tag = re.findall(r"`\d+`", txt) + txt_styled = txt_list[0] + for j, p in enumerate(txt_list[1:]): + key = txt_tag[j][1:-1] + try: + style = self._stylesheet[key] + except KeyError: + log.error(f'invalid stylesheet key "{key}"') + continue + if p and p[-1] == "\n": + txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + "\r\n" + else: + txt_styled = txt_styled + style[0] + p + style[1] + return txt_styled + + def _treat_record_data(self, data): + # convert to utf-8 + data = ( + data.decode(self._encoding, errors="ignore").strip("\x00") + ) + # substitute styles + if self._substyle and self._stylesheet: + data = self._substitute_stylesheet(data) + return data # noqa: RET504 \ No newline at end of file diff --git a/py/LunaTranslator/cishu/mdict_/ripemd128.py b/py/LunaTranslator/cishu/mdict_/ripemd128.py new file mode 100644 index 00000000..abe729e7 --- /dev/null +++ b/py/LunaTranslator/cishu/mdict_/ripemd128.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +# mypy: ignore-errors +# +# Copyright (C) 2016-2023 Saeed Rasooli on https://github.com/ilius/pyglossary/ +# Copyright (C) 2015 Z. H. Liu on https://github.com/zhansliu/writemdict +# +# ripemd128.py - A simple ripemd128 library in pure Python. 
+# +# Supports both Python 2 (versions >= 2.6) and Python 3. +# +# Usage: +# from ripemd128 import ripemd128 +# digest = ripemd128(b"The quick brown fox jumps over the lazy dog") +# assert( +# digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96" +# ) + +import struct + +__all__ = ["ripemd128"] + +# follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt + + +def f(j, x, y, z): + assert 0 <= j < 64 + if j < 16: + return x ^ y ^ z + if j < 32: + return (x & y) | (z & ~x) + if j < 48: + return (x | (0xFFFFFFFF & ~y)) ^ z + return (x & z) | (y & ~z) + + +def K(j): + assert 0 <= j < 64 + if j < 16: + return 0x00000000 + if j < 32: + return 0x5A827999 + if j < 48: + return 0x6ED9EBA1 + return 0x8F1BBCDC + + +def Kp(j): + assert 0 <= j < 64 + if j < 16: + return 0x50A28BE6 + if j < 32: + return 0x5C4DD124 + if j < 48: + return 0x6D703EF3 + return 0x00000000 + + +def padandsplit(message: bytes): + """ + returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges + from 0 to 16. + First pads the message to length in bytes is congruent to 56 (mod 64), + by first adding a byte 0x80, and then padding with 0x00 bytes until the + message length is congruent to 56 (mod 64). Then adds the little-endian + 64-bit representation of the original length. Finally, splits the result + up into 64-byte blocks, which are further parsed as 32-bit integers. + """ + origlen = len(message) + padlength = 64 - ((origlen - 56) % 64) # minimum padding is 1! 
+ message += b"\x80"
+ message += b"\x00" * (padlength - 1)
+ message += struct.pack("<Q", 8 * origlen)
+
+ x = []
+ for i in range(0, len(message), 64):
+ x.append(
+ [
+ struct.unpack("<L", message[i + j : i + j + 4])[0]
+ for j in range(0, 64, 4)
+ ],
+ )
+ return x
+
+
+def add(*args):
+ return sum(args) & 0xFFFFFFFF
+
+
+def rol(s, x):
+ return ((x << s) | (x >> (32 - s))) & 0xFFFFFFFF
+
+
+r = [
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 7, 4, 13, 1, 10, 6, 15, 3, 12, 0, 9, 5, 2, 14, 11, 8,
+ 3, 10, 14, 4, 9, 15, 8, 1, 2, 7, 0, 6, 13, 11, 5, 12,
+ 1, 9, 11, 10, 0, 8, 12, 4, 13, 3, 7, 15, 14, 5, 6, 2,
+]
+rp = [
+ 5, 14, 7, 0, 9, 2, 11, 4, 13, 6, 15, 8, 1, 10, 3, 12,
+ 6, 11, 3, 7, 0, 13, 5, 10, 14, 15, 8, 12, 4, 9, 1, 2,
+ 15, 5, 1, 3, 7, 14, 6, 9, 11, 8, 12, 2, 10, 0, 4, 13,
+ 8, 6, 4, 1, 3, 11, 15, 0, 5, 12, 2, 13, 9, 7, 10, 14,
+]
+s = [
+ 11, 14, 15, 12, 5, 8, 7, 9, 11, 13, 14, 15, 6, 7, 9, 8,
+ 7, 6, 8, 13, 11, 9, 7, 15, 7, 12, 15, 9, 11, 7, 13, 12,
+ 11, 13, 6, 7, 14, 9, 13, 15, 14, 8, 13, 6, 5, 12, 7, 5,
+ 11, 12, 14, 15, 14, 15, 9, 8, 9, 14, 5, 6, 8, 6, 5, 12,
+]
+sp = [
+ 8, 9, 9, 11, 13, 15, 15, 5, 7, 7, 8, 11, 14, 14, 12, 6,
+ 9, 13, 15, 7, 12, 8, 9, 11, 7, 7, 12, 7, 6, 15, 13, 11,
+ 9, 7, 15, 11, 8, 6, 6, 14, 12, 13, 5, 14, 13, 13, 7, 5,
+ 15, 5, 8, 11, 14, 14, 6, 14, 6, 9, 12, 9, 12, 5, 15, 8,
+]
+
+
+def ripemd128(message: bytes) -> bytes:
+ h0 = 0x67452301
+ h1 = 0xEFCDAB89
+ h2 = 0x98BADCFE
+ h3 = 0x10325476
+ X = padandsplit(message)
+ for Xi in X:
+ A, B, C, D = h0, h1, h2, h3
+ Ap, Bp, Cp, Dp = h0, h1, h2, h3
+ for j in range(64):
+ T = rol(
+ s[j],
+ add(
+ A,
+ f(j, B, C, D),
+ Xi[r[j]],
+ K(j),
+ ),
+ )
+ A, D, C, B = D, C, B, T
+ T = rol(
+ sp[j],
+ add(
+ Ap,
+ f(63 - j, Bp, Cp, Dp),
+ Xi[rp[j]],
+ Kp(j),
+ ),
+ )
+ Ap, Dp, Cp, Bp = Dp, Cp, Bp, T
+ T = add(h1, C, Dp)
+ h1 = add(h2, D, Ap)
+ h2 = add(h3, A, Bp)
+ h3 = add(h0, B, Cp)
+ h0 = T
+
+ return struct.pack("<4L", h0, h1, h2, h3)