This commit is contained in:
恍兮惚兮 2024-05-25 15:27:44 +08:00
parent f104630ccb
commit 6213fddc0e
2 changed files with 42 additions and 26 deletions

View File

@ -3,6 +3,25 @@ import os
from hiraparse.basehira import basehira from hiraparse.basehira import basehira
# # 2.1.2 src schema
# UnidicFeatures17 = namedtuple('UnidicFeatures17',
# ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron '
# 'orthBase pronBase goshu iType iForm fType fForm').split(' '))
# # 2.1.2 bin schema
# # The unidic-mecab-2.1.2_bin distribution adds kana accent fields.
# UnidicFeatures26 = namedtuple('UnidicFeatures26',
# ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron '
# 'orthBase pronBase goshu iType iForm fType fForm '
# 'kana kanaBase form formBase iConType fConType aType '
# 'aConType aModeType').split(' '))
# # schema used in 2.2.0, 2.3.0
# UnidicFeatures29 = namedtuple('UnidicFeatures29', 'pos1 pos2 pos3 pos4 cType '
# 'cForm lForm lemma orth pron orthBase pronBase goshu iType iForm fType '
# 'fForm iConType fConType type kana kanaBase form formBase aType aConType '
# 'aModType lid lemma_id'.split(' '))
class mecab(basehira): class mecab(basehira):
def init(self) -> None: def init(self) -> None:
@ -20,34 +39,26 @@ class mecab(basehira):
text, codec text, codec
): # self.kks.parseToNodeList(text): ): # self.kks.parseToNodeList(text):
kana = "" kana = ""
pos1 = "" origorig = ""
origorig = None
if len(fields):
pos1 = fields[0] pos1 = fields[0]
if len(fields) > 29: if len(fields) == 26:
kana = fields[22]
elif len(fields) == 29:
kana = fields[20]
elif 29 > len(fields) >= 26:
kana = fields[17] kana = fields[17]
origorig = fields[7] origorig = fields[7]
elif len(fields) > 9: elif len(fields) == 29:
kana = fields[9] # 无kana用lform代替 kana = fields[20]
origorig = fields[7]
elif len(fields) == 17:
kana = fields[9]
origorig = fields[7]
elif len(fields) == 9: elif len(fields) == 9:
kana = fields[8] # 7/8均可issues/514 kana = fields[8]
else: origorig = fields[7]
kana = ""
if len(fields) >= 8:
origorig = fields[7] # unsafe
l = 0 l = 0
if text[start] == "\n":
start += 1
while str(node) not in text[start : start + l]: while str(node) not in text[start : start + l]:
l += 1 l += 1
orig = text[start : start + l] orig = text[start : start + l]
if origorig is None:
origorig = orig
start += l start += l
hira = kana # .translate(self.h2k) hira = kana # .translate(self.h2k)
@ -66,4 +77,9 @@ class mecab(basehira):
result.append( result.append(
{"orig": orig, "hira": hira, "cixing": pos1, "origorig": origorig} {"orig": orig, "hira": hira, "cixing": pos1, "origorig": origorig}
) )
extras=text[start :]
if len(extras):
result.append(
{"orig": extras, "hira": extras, "cixing": '', "origorig": extras}
)
return result return result

View File

@ -22,7 +22,7 @@ from ctypes import (
) )
from ctypes.wintypes import WORD, HANDLE, HWND, LONG, DWORD from ctypes.wintypes import WORD, HANDLE, HWND, LONG, DWORD
from windows import WINDOWPLACEMENT from windows import WINDOWPLACEMENT
import gobject import gobject, csv
utilsdll = CDLL(gobject.GetDllpath(("winsharedutils32.dll", "winsharedutils64.dll"))) utilsdll = CDLL(gobject.GetDllpath(("winsharedutils32.dll", "winsharedutils64.dll")))
@ -157,7 +157,7 @@ class mecabwrap:
res = [] res = []
for i in range(num.value): for i in range(num.value):
f = feature[i] f = feature[i]
fields = f.decode(codec).split(",") fields = list(csv.reader([f.decode(codec)]))[0]
res.append((surface[i].decode(codec), fields)) res.append((surface[i].decode(codec), fields))
_freestringlist(feature, num.value) _freestringlist(feature, num.value)
_freestringlist(surface, num.value) _freestringlist(surface, num.value)