This commit is contained in:
恍兮惚兮 2024-05-25 15:27:44 +08:00
parent f104630ccb
commit 6213fddc0e
2 changed files with 42 additions and 26 deletions

View File

@ -3,6 +3,25 @@ import os
from hiraparse.basehira import basehira
# # 2.1.2 src schema
# UnidicFeatures17 = namedtuple('UnidicFeatures17',
# ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron '
# 'orthBase pronBase goshu iType iForm fType fForm').split(' '))
# # 2.1.2 bin schema
# # The unidic-mecab-2.1.2_bin distribution adds kana accent fields.
# UnidicFeatures26 = namedtuple('UnidicFeatures26',
# ('pos1 pos2 pos3 pos4 cType cForm lForm lemma orth pron '
# 'orthBase pronBase goshu iType iForm fType fForm '
# 'kana kanaBase form formBase iConType fConType aType '
# 'aConType aModeType').split(' '))
# # schema used in 2.2.0, 2.3.0
# UnidicFeatures29 = namedtuple('UnidicFeatures29', 'pos1 pos2 pos3 pos4 cType '
# 'cForm lForm lemma orth pron orthBase pronBase goshu iType iForm fType '
# 'fForm iConType fConType type kana kanaBase form formBase aType aConType '
# 'aModType lid lemma_id'.split(' '))
class mecab(basehira):
def init(self) -> None:
@ -20,34 +39,26 @@ class mecab(basehira):
text, codec
): # self.kks.parseToNodeList(text):
kana = ""
pos1 = ""
origorig = None
if len(fields):
pos1 = fields[0]
if len(fields) > 29:
kana = fields[22]
elif len(fields) == 29:
kana = fields[20]
elif 29 > len(fields) >= 26:
kana = fields[17]
origorig = fields[7]
elif len(fields) > 9:
kana = fields[9] # no kana field available; use lForm as a substitute
elif len(fields) == 9:
kana = fields[8] # either index 7 or 8 works, see issues/514
else:
kana = ""
if len(fields) >= 8:
origorig = fields[7] # unsafe
origorig = ""
pos1 = fields[0]
if len(fields) == 26:
kana = fields[17]
origorig = fields[7]
elif len(fields) == 29:
kana = fields[20]
origorig = fields[7]
elif len(fields) == 17:
kana = fields[9]
origorig = fields[7]
elif len(fields) == 9:
kana = fields[8]
origorig = fields[7]
l = 0
if text[start] == "\n":
start += 1
while str(node) not in text[start : start + l]:
l += 1
orig = text[start : start + l]
if origorig is None:
origorig = orig
start += l
hira = kana # .translate(self.h2k)
@ -66,4 +77,9 @@ class mecab(basehira):
result.append(
{"orig": orig, "hira": hira, "cixing": pos1, "origorig": origorig}
)
extras=text[start :]
if len(extras):
result.append(
{"orig": extras, "hira": extras, "cixing": '', "origorig": extras}
)
return result

View File

@ -22,7 +22,7 @@ from ctypes import (
)
from ctypes.wintypes import WORD, HANDLE, HWND, LONG, DWORD
from windows import WINDOWPLACEMENT
import gobject
import gobject, csv
utilsdll = CDLL(gobject.GetDllpath(("winsharedutils32.dll", "winsharedutils64.dll")))
@ -157,7 +157,7 @@ class mecabwrap:
res = []
for i in range(num.value):
f = feature[i]
fields = f.decode(codec).split(",")
fields = list(csv.reader([f.decode(codec)]))[0]
res.append((surface[i].decode(codec), fields))
_freestringlist(feature, num.value)
_freestringlist(surface, num.value)