From 6213fddc0ecca128ba01aa8ffe4673f4009496e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=81=8D=E5=85=AE=E6=83=9A=E5=85=AE?= <101191390+HIllya51@users.noreply.github.com> Date: Sat, 25 May 2024 15:27:44 +0800 Subject: [PATCH] repair --- .../LunaTranslator/hiraparse/mecab.py | 64 ++++++++++++------- .../LunaTranslator/winsharedutils.py | 4 +- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/LunaTranslator/LunaTranslator/hiraparse/mecab.py b/LunaTranslator/LunaTranslator/hiraparse/mecab.py index a878dfa4..d7cb657e 100644 --- a/LunaTranslator/LunaTranslator/hiraparse/mecab.py +++ b/LunaTranslator/LunaTranslator/hiraparse/mecab.py @@ -3,6 +3,25 @@ import os from hiraparse.basehira import basehira +# # 2.1.2 src schema +# UnidicFeatures17 = namedtuple('UnidicFeatures17', +# ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron ' +# 'orthBase pronBase goshu iType iForm fType fForm').split(' ')) + +# # 2.1.2 bin schema +# # The unidic-mecab-2.1.2_bin distribution adds kana accent fields. +# UnidicFeatures26 = namedtuple('UnidicFeatures26', +# ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron ' +# 'orthBase pronBase goshu iType iForm fType fForm ' +# 'kana kanaBase form formBase iConType fConType aType ' +# 'aConType aModeType').split(' ')) + +# # schema used in 2.2.0, 2.3.0 +# UnidicFeatures29 = namedtuple('UnidicFeatures29', 'pos1 pos2 pos3 pos4 cType ' +# 'cForm lForm lemma orth pron orthBase pronBase goshu iType iForm fType ' +# 'fForm iConType fConType type kana kanaBase form formBase aType aConType ' +# 'aModType lid lemma_id'.split(' ')) + class mecab(basehira): def init(self) -> None: @@ -20,34 +39,26 @@ class mecab(basehira): text, codec ): # self.kks.parseToNodeList(text): kana = "" - pos1 = "" - origorig = None - if len(fields): - pos1 = fields[0] - if len(fields) > 29: - kana = fields[22] - elif len(fields) == 29: - kana = fields[20] - elif 29 > len(fields) >= 26: - kana = fields[17] - origorig = fields[7] - elif len(fields) > 9: - kana = fields[9] # 无kana,用lform代替 - elif len(fields) == 9: - kana = fields[8] # 7/8均可,issues/514 - else: - kana = "" - if len(fields) >= 8: - origorig = fields[7] # unsafe + origorig = "" + pos1 = fields[0] + if len(fields) == 26: + kana = fields[17] + origorig = fields[7] + elif len(fields) == 29: + kana = fields[20] + origorig = fields[7] + elif len(fields) == 17: + kana = fields[9] + origorig = fields[7] + elif len(fields) == 9: + kana = fields[8] + origorig = fields[7] + l = 0 - if text[start] == "\n": - start += 1 + while str(node) not in text[start : start + l]: l += 1 orig = text[start : start + l] - if origorig is None: - origorig = orig - start += l hira = kana # .translate(self.h2k) @@ -66,4 +77,9 @@ class mecab(basehira): result.append( {"orig": orig, "hira": hira, "cixing": pos1, "origorig": origorig} ) + extras=text[start :] + if len(extras): + result.append( + {"orig": extras, "hira": extras, "cixing": '', "origorig": extras} + ) return result diff --git a/LunaTranslator/LunaTranslator/winsharedutils.py b/LunaTranslator/LunaTranslator/winsharedutils.py index 0fbc5042..90bb0b56 100644 --- a/LunaTranslator/LunaTranslator/winsharedutils.py +++ b/LunaTranslator/LunaTranslator/winsharedutils.py @@ -22,7 +22,7 @@ from ctypes import ( ) from ctypes.wintypes import WORD, HANDLE, HWND, LONG, DWORD from windows import WINDOWPLACEMENT -import gobject +import gobject, csv utilsdll = CDLL(gobject.GetDllpath(("winsharedutils32.dll", "winsharedutils64.dll"))) @@ -157,7 +157,7 @@ class mecabwrap: res = [] for i in range(num.value): f = feature[i] - fields = f.decode(codec).split(",") + fields = list(csv.reader([f.decode(codec)]))[0] res.append((surface[i].decode(codec), fields)) _freestringlist(feature, num.value) _freestringlist(surface, num.value)