From 6f0132a1668d315b8c96360cf18688de7b2a69a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=81=8D=E5=85=AE=E6=83=9A=E5=85=AE?= <101191390+HIllya51@users.noreply.github.com> Date: Fri, 17 May 2024 00:08:24 +0800 Subject: [PATCH] fix --- LunaTranslator/LunaTranslator/cishu/youdao.py | 4 +- .../LunaTranslator/gui/setting_lang.py | 8 +- LunaTranslator/LunaTranslator/gui/showword.py | 52 +- .../LunaTranslator/myutils/commonbase.py | 6 +- .../LunaTranslator/myutils/config.py | 18 +- LunaTranslator/LunaTranslator/myutils/post.py | 4 +- .../LunaTranslator/myutils/utils.py | 5 +- .../LunaTranslator/ocrengines/local.py | 6 +- .../translator/basetranslator.py | 4 +- .../LunaTranslator/tts/basettsclass.py | 30 +- LunaTranslator/LunaTranslator/tts/gtts.py | 1093 +++++++++++++++++ .../LunaTranslator/tts/huoshantts.py | 57 +- .../LunaTranslator/tts/voiceroid2.py | 22 +- .../LunaTranslator/tts/voiceroidplus.py | 23 +- LunaTranslator/LunaTranslator/tts/voicevox.py | 59 +- .../files/defaultconfig/config.json | 5 + 16 files changed, 1251 insertions(+), 145 deletions(-) create mode 100644 LunaTranslator/LunaTranslator/tts/gtts.py diff --git a/LunaTranslator/LunaTranslator/cishu/youdao.py b/LunaTranslator/LunaTranslator/cishu/youdao.py index c4dd6e43..ea1e7c69 100644 --- a/LunaTranslator/LunaTranslator/cishu/youdao.py +++ b/LunaTranslator/LunaTranslator/cishu/youdao.py @@ -1,4 +1,4 @@ -from myutils.config import globalconfig, static_data +from myutils.config import getlangsrc import requests from urllib.parse import quote import re @@ -11,7 +11,7 @@ class youdao(cishubase): def srclang(self): try: - l = static_data["language_list_translator_inner"][globalconfig["srclang3"]] + l = getlangsrc() return l except: diff --git a/LunaTranslator/LunaTranslator/gui/setting_lang.py b/LunaTranslator/LunaTranslator/gui/setting_lang.py index 307ffed0..eb52861c 100644 --- a/LunaTranslator/LunaTranslator/gui/setting_lang.py +++ b/LunaTranslator/LunaTranslator/gui/setting_lang.py @@ -1,5 +1,5 @@ import os -from myutils.config import globalconfig, _TRL, static_data +from myutils.config import globalconfig, _TRL, static_data,getlanguse from gui.usefulwidget import getsimplecombobox, getcolorbutton @@ -46,11 +46,7 @@ def setTablanglz(self): "", callback=lambda: os.startfile( os.path.abspath( - "./files/lang/{}.json".format( - static_data["language_list_translator_inner"][ - globalconfig["languageuse"] - ] - ) + "./files/lang/{}.json".format(getlanguse()) ) ), icon="fa.gear", diff --git a/LunaTranslator/LunaTranslator/gui/showword.py b/LunaTranslator/LunaTranslator/gui/showword.py index c0519245..d3ba726d 100644 --- a/LunaTranslator/LunaTranslator/gui/showword.py +++ b/LunaTranslator/LunaTranslator/gui/showword.py @@ -44,14 +44,14 @@ class AnkiWindow(QWidget): def langdu(self): if gobject.baseobject.reader: - self.audiopath.setText( - gobject.baseobject.reader.syncttstofile(self.currentword) + gobject.baseobject.reader.ttscallback( + self.currentword, self.audiopath.setText ) def langdu2(self): if gobject.baseobject.reader: - self.audiopath_sentence.setText( - gobject.baseobject.reader.syncttstofile(self.example.toPlainText()) + gobject.baseobject.reader.ttscallback( + self.example.toPlainText(), self.audiopath_sentence.setText ) @threader @@ -205,27 +205,41 @@ class AnkiWindow(QWidget): def loadfakefields(self): if len(self.editpath.text()): - with open(self.editpath.text(), "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()).decode("utf-8") - encoded_string = ''.format( - encoded_string - ) + try: + with open(self.editpath.text(), "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()).decode("utf-8") + encoded_string = ''.format( + encoded_string + ) + except: + encoded_string = "" else: encoded_string = "" if len(self.audiopath.text()): - with open(self.audiopath.text(), "rb") as image_file: - encoded_string2 = base64.b64encode(image_file.read()).decode("utf-8") - encoded_string2 = """""".format( - encoded_string2 - ) + try: + with open(self.audiopath.text(), "rb") as image_file: + encoded_string2 = base64.b64encode(image_file.read()).decode( + "utf-8" + ) + encoded_string2 = """""".format( + encoded_string2 + ) + except: + encoded_string2 = "" else: encoded_string2 = "" if len(self.audiopath_sentence.text()): - with open(self.audiopath_sentence.text(), "rb") as image_file: - encoded_string3 = base64.b64encode(image_file.read()).decode("utf-8") - encoded_string3 = """""".format( - encoded_string3 - ) + try: + with open(self.audiopath_sentence.text(), "rb") as image_file: + encoded_string3 = base64.b64encode(image_file.read()).decode( + "utf-8" + ) + encoded_string3 = """""".format( + encoded_string3 + ) + except: + + encoded_string3 = "" else: encoded_string3 = "" fields = { diff --git a/LunaTranslator/LunaTranslator/myutils/commonbase.py b/LunaTranslator/LunaTranslator/myutils/commonbase.py index 66d0aa30..ff34cf81 100644 --- a/LunaTranslator/LunaTranslator/myutils/commonbase.py +++ b/LunaTranslator/LunaTranslator/myutils/commonbase.py @@ -1,5 +1,5 @@ from myutils.proxy import getproxy -from myutils.config import globalconfig, _TR, static_data +from myutils.config import getlangtgt, _TR, static_data, getlangsrc from myutils.wrapper import stripwrapper import requests @@ -34,7 +34,7 @@ class commonbase: @property def srclang(self): try: - l = static_data["language_list_translator_inner"][globalconfig["srclang3"]] + l = getlangsrc() return self.langmap_[l] except: return "" @@ -42,7 +42,7 @@ class commonbase: @property def tgtlang(self): try: - l = static_data["language_list_translator_inner"][globalconfig["tgtlang3"]] + l = getlangtgt() return self.langmap_[l] except: return "" diff --git a/LunaTranslator/LunaTranslator/myutils/config.py b/LunaTranslator/LunaTranslator/myutils/config.py index 736c9a78..790f2609 100644 --- a/LunaTranslator/LunaTranslator/myutils/config.py +++ b/LunaTranslator/LunaTranslator/myutils/config.py @@ -187,14 +187,22 @@ if len(globalconfig["toolbutton"]["rank"]) != len( ) +def getlanguse(): + global language, languageshow + return static_data["language_list_translator_inner"][language] + +def getlangsrc(): + return static_data["language_list_translator_inner"][globalconfig["srclang3"]] + +def getlangtgt(): + return static_data["language_list_translator_inner"][globalconfig["tgtlang3"]] + def setlanguage(): global language, languageshow language = globalconfig["languageuse"] try: with open( - "./files/lang/{}.json".format( - static_data["language_list_translator_inner"][language] - ), + "./files/lang/{}.json".format(getlanguse()), "r", encoding="utf8", ) as ff: @@ -255,8 +263,6 @@ def saveallconfig(): "./userconfig/savehook_new_1.39.4.json", [savehook_new_list, savehook_new_data] ) safesave( - "./files/lang/{}.json".format( - static_data["language_list_translator_inner"][language] - ), + "./files/lang/{}.json".format(getlanguse()), languageshow, ) diff --git a/LunaTranslator/LunaTranslator/myutils/post.py b/LunaTranslator/LunaTranslator/myutils/post.py index 6f31971f..7c32d55b 100644 --- a/LunaTranslator/LunaTranslator/myutils/post.py +++ b/LunaTranslator/LunaTranslator/myutils/post.py @@ -7,7 +7,7 @@ from myutils.config import ( postprocessconfig, globalconfig, savehook_new_data, - static_data, + getlangsrc, ) @@ -179,7 +179,7 @@ def _4_f(line): def _6_fEX(line): - srclang = static_data["language_list_translator_inner"][globalconfig["srclang3"]] + srclang = getlangsrc() if srclang in ["zh", "ja"]: white = "" else: diff --git a/LunaTranslator/LunaTranslator/myutils/utils.py b/LunaTranslator/LunaTranslator/myutils/utils.py index db1c070b..89261ffd 100644 --- a/LunaTranslator/LunaTranslator/myutils/utils.py +++ b/LunaTranslator/LunaTranslator/myutils/utils.py @@ -15,6 +15,7 @@ from traceback import print_exc from myutils.config import ( globalconfig, static_data, + getlanguse, savehook_new_list, savehook_new_data, getdefaultsavehook, @@ -537,10 +538,6 @@ def parsemayberegexreplace(dict, res): return res -def getlanguse(): - return static_data["language_list_translator_inner"][globalconfig["languageuse"]] - - def checkpostlangmatch(name): for item in static_data["transoptimi"]: if name == item["name"]: diff --git a/LunaTranslator/LunaTranslator/ocrengines/local.py b/LunaTranslator/LunaTranslator/ocrengines/local.py index 0d65827e..13d41a70 100644 --- a/LunaTranslator/LunaTranslator/ocrengines/local.py +++ b/LunaTranslator/LunaTranslator/ocrengines/local.py @@ -1,5 +1,5 @@ import os -from myutils.config import globalconfig, _TR, static_data +from myutils.config import globalconfig, _TR, getlangsrc from ocrengines.baseocrclass import baseocr from ctypes import ( CDLL, @@ -131,9 +131,7 @@ class OCR(baseocr): return self._ocr.trydestroy() - path = "./files/ocr/{}".format( - static_data["language_list_translator_inner"][globalconfig["srclang3"]] - ) + path = "./files/ocr/{}".format(getlangsrc()) if not ( os.path.exists(path + "/det.onnx") and os.path.exists(path + "/rec.onnx") diff --git a/LunaTranslator/LunaTranslator/translator/basetranslator.py b/LunaTranslator/LunaTranslator/translator/basetranslator.py index 56468dbb..9aa4f42f 100644 --- a/LunaTranslator/LunaTranslator/translator/basetranslator.py +++ b/LunaTranslator/LunaTranslator/translator/basetranslator.py @@ -1,7 +1,7 @@ from traceback import print_exc from queue import Queue -from myutils.config import globalconfig, translatorsetting, static_data +from myutils.config import globalconfig, translatorsetting, getlangtgt from threading import Thread import time, types import zhconv, gobject @@ -172,7 +172,7 @@ class basetrans(commonbase): @property def needzhconv(self): # The API does not support direct translation to Traditional Chinese, only Simplified Chinese can be translated first and then converted to Traditional Chinese - l = static_data["language_list_translator_inner"][globalconfig["tgtlang3"]] + l = getlangtgt() return l == "cht" and "cht" not in self.langmap() @property diff --git a/LunaTranslator/LunaTranslator/tts/basettsclass.py b/LunaTranslator/LunaTranslator/tts/basettsclass.py index 3308bd0e..7ec411d3 100644 --- a/LunaTranslator/LunaTranslator/tts/basettsclass.py +++ b/LunaTranslator/LunaTranslator/tts/basettsclass.py @@ -1,6 +1,7 @@ from myutils.config import globalconfig -import threading, os - +import threading, os, functools +from myutils.wrapper import threader +from traceback import print_exc class TTSbase: def init(self): @@ -19,15 +20,16 @@ class TTSbase: # 一些可能需要的属性 @property def config(self): - return self.privateconfig['args'] + return self.privateconfig["args"] @property def privateconfig(self): return globalconfig["reader"][self.typename] - + @property def publicconfig(self): return globalconfig["ttscommon"] + ######################## def __init__(self, typename, showlistsignal, mp3playsignal) -> None: @@ -58,15 +60,14 @@ class TTSbase: threading.Thread(target=_).start() def read(self, content, force=False): - def _(content, force): - fname = self.syncttstofile(content) + def _(force, fname): volume = self.publicconfig["volume"] - if fname: - self.mp3playsignal.emit(fname, volume, force) + self.mp3playsignal.emit(fname, volume, force) - threading.Thread(target=_, args=(content, force)).start() + self.ttscallback(content, functools.partial(_, force)) - def syncttstofile(self, content): + @threader + def ttscallback(self, content, callback): if self.loadok == False: return if len(content) == 0: @@ -77,5 +78,10 @@ class TTSbase: rate = self.publicconfig["rate"] voice = self.privateconfig["voice"] voice_index = self.voicelist.index(voice) - fname = self.speak(content, rate, voice, voice_index) - return os.path.abspath(fname) + try: + fname = self.speak(content, rate, voice, voice_index) + if fname: + callback(os.path.abspath(fname)) + except: + print_exc() + return diff --git a/LunaTranslator/LunaTranslator/tts/gtts.py b/LunaTranslator/LunaTranslator/tts/gtts.py new file mode 100644 index 00000000..2385d1c1 --- /dev/null +++ b/LunaTranslator/LunaTranslator/tts/gtts.py @@ -0,0 +1,1093 @@ +# -*- coding: utf-8 -*- +import base64 +import json, time +import logging, os +import re +import urllib +from myutils.proxy import getproxy +import requests + +_langs = { + "af": "Afrikaans", + "ar": "Arabic", + "bg": "Bulgarian", + "bn": "Bengali", + "bs": "Bosnian", + "ca": "Catalan", + "cs": "Czech", + "da": "Danish", + "de": "German", + "el": "Greek", + "en": "English", + "es": "Spanish", + "et": "Estonian", + "fi": "Finnish", + "fr": "French", + "gu": "Gujarati", + "hi": "Hindi", + "hr": "Croatian", + "hu": "Hungarian", + "id": "Indonesian", + "is": "Icelandic", + "it": "Italian", + "iw": "Hebrew", + "ja": "Japanese", + "jw": "Javanese", + "km": "Khmer", + "kn": "Kannada", + "ko": "Korean", + "la": "Latin", + "lv": "Latvian", + "ml": "Malayalam", + "mr": "Marathi", + "ms": "Malay", + "my": "Myanmar (Burmese)", + "ne": "Nepali", + "nl": "Dutch", + "no": "Norwegian", + "pl": "Polish", + "pt": "Portuguese", + "ro": "Romanian", + "ru": "Russian", + "si": "Sinhala", + "sk": "Slovak", + "sq": "Albanian", + "sr": "Serbian", + "su": "Sundanese", + "sv": "Swedish", + "sw": "Swahili", + "ta": "Tamil", + "te": "Telugu", + "th": "Thai", + "tl": "Filipino", + "tr": "Turkish", + "uk": "Ukrainian", + "ur": "Urdu", + "vi": "Vietnamese", + "zh-CN": "Chinese (Simplified)", + "zh-TW": "Chinese (Traditional)", +} + + +def _main_langs(): + return _langs + + +from warnings import warn +import logging + +__all__ = ["tts_langs"] + +# Logger +log = logging.getLogger(__name__) +log.addHandler(logging.NullHandler()) + + +def tts_langs(): + """Languages Google Text-to-Speech supports. + + Returns: + dict: A dictionary of the type `{ '': ''}` + + Where `` is an IETF language tag such as `en` or `zh-TW`, + and `` is the full English name of the language, such as + `English` or `Chinese (Mandarin/Taiwan)`. + + The dictionary returned combines languages from two origins: + + - Languages fetched from Google Translate (pre-generated in :mod:`gtts.langs`) + - Languages that are undocumented variations that were observed to work and + present different dialects or accents. + + """ + langs = dict() + langs.update(_main_langs()) + langs.update(_extra_langs()) + log.debug("langs: {}".format(langs)) + return langs + + +def _extra_langs(): + """Define extra languages. + + Returns: + dict: A dictionary of extra languages manually defined. + + Variations of the ones generated in `_main_langs`, + observed to provide different dialects or accents or + just simply accepted by the Google Translate Text-to-Speech API. + + """ + return { + # Chinese + "zh-TW": "Chinese (Mandarin/Taiwan)", + "zh": "Chinese (Mandarin)", + } + + +def _fallback_deprecated_lang(lang): + """Languages Google Text-to-Speech used to support. + + Language tags that don't work anymore, but that can + fallback to a more general language code to maintain + compatibility. + + Args: + lang (string): The language tag. + + Returns: + string: The language tag, as-is if not deprecated, + or a fallback if it exits. + + Example: + ``en-GB`` returns ``en``. + ``en-gb`` returns ``en``. + + """ + + deprecated = { + # '': [] + "en": [ + "en-us", + "en-ca", + "en-uk", + "en-gb", + "en-au", + "en-gh", + "en-in", + "en-ie", + "en-nz", + "en-ng", + "en-ph", + "en-za", + "en-tz", + ], + "fr": ["fr-ca", "fr-fr"], + "pt": ["pt-br", "pt-pt"], + "es": ["es-es", "es-us"], + "zh-CN": ["zh-cn"], + "zh-TW": ["zh-tw"], + } + + for fallback_lang, deprecated_langs in deprecated.items(): + if lang.lower() in deprecated_langs: + msg = ( + "'{}' has been deprecated, falling back to '{}'. " + "This fallback will be removed in a future version." + ).format(lang, fallback_lang) + + warn(msg, DeprecationWarning) + log.warning(msg) + + return fallback_lang + + return lang + + +# -*- coding: utf-8 -*- +import re + + +class symbols: + # -*- coding: utf-8 -*- + + ABBREVIATIONS = ["dr", "jr", "mr", "mrs", "ms", "msgr", "prof", "sr", "st"] + + SUB_PAIRS = [("Esq.", "Esquire")] + + ALL_PUNC = "?!?!.,¡()[]¿…‥،;:—。,、:\n" + + TONE_MARKS = "?!?!" + + PERIOD_COMMA = ".," + + COLON = ":" + + +class RegexBuilder: + r"""Builds regex using arguments passed into a pattern template. + + Builds a regex object for which the pattern is made from an argument + passed into a template. If more than one argument is passed (iterable), + each pattern is joined by "|" (regex alternation 'or') to create a + single pattern. + + Args: + pattern_args (iteratable): String element(s) to be each passed to + ``pattern_func`` to create a regex pattern. Each element is + ``re.escape``'d before being passed. + pattern_func (callable): A 'template' function that should take a + string and return a string. It should take an element of + ``pattern_args`` and return a valid regex pattern group string. + flags: ``re`` flag(s) to compile with the regex. + + Example: + To create a simple regex that matches on the characters "a", "b", + or "c", followed by a period:: + + >>> rb = RegexBuilder('abc', lambda x: "{}\.".format(x)) + + Looking at ``rb.regex`` we get the following compiled regex:: + + >>> print(rb.regex) + 'a\.|b\.|c\.' + + The above is fairly simple, but this class can help in writing more + complex repetitive regex, making them more readable and easier to + create by using existing data structures. + + Example: + To match the character following the words "lorem", "ipsum", "meili" + or "koda":: + + >>> words = ['lorem', 'ipsum', 'meili', 'koda'] + >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x)) + + Looking at ``rb.regex`` we get the following compiled regex:: + + >>> print(rb.regex) + '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).' + + """ + + def __init__(self, pattern_args, pattern_func, flags=0): + self.pattern_args = pattern_args + self.pattern_func = pattern_func + self.flags = flags + + # Compile + self.regex = self._compile() + + def _compile(self): + alts = [] + for arg in self.pattern_args: + arg = re.escape(arg) + alt = self.pattern_func(arg) + alts.append(alt) + + pattern = "|".join(alts) + return re.compile(pattern, self.flags) + + def __repr__(self): # pragma: no cover + return str(self.regex) + + +class PreProcessorRegex: + r"""Regex-based substitution text pre-processor. + + Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a + :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl`` + replacement parameter. + + Args: + search_args (iteratable): String element(s) to be each passed to + ``search_func`` to create a regex pattern. Each element is + ``re.escape``'d before being passed. + search_func (callable): A 'template' function that should take a + string and return a string. It should take an element of + ``search_args`` and return a valid regex search pattern string. + repl (string): The common replacement passed to the ``sub`` method for + each ``regex``. Can be a raw string (the case of a regex + backreference, for example) + flags: ``re`` flag(s) to compile with each `regex`. + + Example: + Add "!" after the words "lorem" or "ipsum", while ignoring case:: + + >>> import re + >>> words = ['lorem', 'ipsum'] + >>> pp = PreProcessorRegex(words, + ... lambda x: "({})".format(x), r'\\1!', + ... re.IGNORECASE) + + In this case, the regex is a group and the replacement uses its + backreference ``\\1`` (as a raw string). Looking at ``pp`` we get the + following list of search/replacement pairs:: + + >>> print(pp) + (re.compile('(lorem)', re.IGNORECASE), repl='\1!'), + (re.compile('(ipsum)', re.IGNORECASE), repl='\1!') + + It can then be run on any string of text:: + + >>> pp.run("LOREM ipSuM") + "LOREM! ipSuM!" + + See :mod:`gtts.tokenizer.pre_processors` for more examples. + + """ + + def __init__(self, search_args, search_func, repl, flags=0): + self.repl = repl + + # Create regex list + self.regexes = [] + for arg in search_args: + rb = RegexBuilder([arg], search_func, flags) + self.regexes.append(rb.regex) + + def run(self, text): + """Run each regex substitution on ``text``. + + Args: + text (string): the input text. + + Returns: + string: text after all substitutions have been sequentially + applied. + + """ + for regex in self.regexes: + text = regex.sub(self.repl, text) + return text + + def __repr__(self): # pragma: no cover + subs_strs = [] + for r in self.regexes: + subs_strs.append("({}, repl='{}')".format(r, self.repl)) + return ", ".join(subs_strs) + + +class PreProcessorSub: + r"""Simple substitution text preprocessor. + + Performs string-for-string substitution from list a find/replace pairs. + It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default + simple substitution regex. + + Args: + sub_pairs (list): A list of tuples of the style + ``(, )`` + ignore_case (bool): Ignore case during search. Defaults to ``True``. + + Example: + Replace all occurences of "Mac" to "PC" and "Firefox" to "Chrome":: + + >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')] + >>> pp = PreProcessorSub(sub_pairs) + + Looking at the ``pp``, we get the following list of + search (regex)/replacement pairs:: + + >>> print(pp) + (re.compile('Mac', re.IGNORECASE), repl='PC'), + (re.compile('Firefox', re.IGNORECASE), repl='Chrome') + + It can then be run on any string of text:: + + >>> pp.run("I use firefox on my mac") + "I use Chrome on my PC" + + See :mod:`gtts.tokenizer.pre_processors` for more examples. + + """ + + def __init__(self, sub_pairs, ignore_case=True): + def search_func(x): + return "{}".format(x) + + flags = re.I if ignore_case else 0 + + # Create pre-processor list + self.pre_processors = [] + for sub_pair in sub_pairs: + pattern, repl = sub_pair + pp = PreProcessorRegex([pattern], search_func, repl, flags) + self.pre_processors.append(pp) + + def run(self, text): + """Run each substitution on ``text``. + + Args: + text (string): the input text. + + Returns: + string: text after all substitutions have been sequentially + applied. + + """ + for pp in self.pre_processors: + text = pp.run(text) + return text + + def __repr__(self): # pragma: no cover + return ", ".join([str(pp) for pp in self.pre_processors]) + + +class Tokenizer: + r"""An extensible but simple generic rule-based tokenizer. + + A generic and simple string tokenizer that takes a list of functions + (called `tokenizer cases`) returning ``regex`` objects and joins them by + "|" (regex alternation 'or') to create a single regex to use with the + standard ``regex.split()`` function. + + ``regex_funcs`` is a list of any function that can return a ``regex`` + (from ``re.compile()``) object, such as a + :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex`` + attribute). + + See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples. + + Args: + regex_funcs (list): List of compiled ``regex`` objects. Each + function's pattern will be joined into a single pattern and + compiled. + flags: ``re`` flag(s) to compile with the final regex. Defaults to + ``re.IGNORECASE`` + + Note: + When the ``regex`` objects obtained from ``regex_funcs`` are joined, + their individual ``re`` flags are ignored in favour of ``flags``. + + Raises: + TypeError: When an element of ``regex_funcs`` is not a function, or + a function that does not return a compiled ``regex`` object. + + Warning: + Joined ``regex`` patterns can easily interfere with one another in + unexpected ways. It is recommanded that each tokenizer case operate + on distinct or non-overlapping chracters/sets of characters + (For example, a tokenizer case for the period (".") should also + handle not matching/cutting on decimals, instead of making that + a seperate tokenizer case). + + Example: + A tokenizer with a two simple case (*Note: these are bad cases to + tokenize on, this is simply a usage example*):: + + >>> import re, RegexBuilder + >>> + >>> def case1(): + ... return re.compile("\,") + >>> + >>> def case2(): + ... return RegexBuilder('abc', lambda x: "{}\.".format(x)).regex + >>> + >>> t = Tokenizer([case1, case2]) + + Looking at ``case1().pattern``, we get:: + + >>> print(case1().pattern) + '\\,' + + Looking at ``case2().pattern``, we get:: + + >>> print(case2().pattern) + 'a\\.|b\\.|c\\.' + + Finally, looking at ``t``, we get them combined:: + + >>> print(t) + 're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE) + from: [, ]' + + It can then be run on any string of text:: + + >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend") + ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"] + + """ + + def __init__(self, regex_funcs, flags=re.IGNORECASE): + self.regex_funcs = regex_funcs + self.flags = flags + + try: + # Combine + self.total_regex = self._combine_regex() + except (TypeError, AttributeError) as e: # pragma: no cover + raise TypeError( + "Tokenizer() expects a list of functions returning " + "regular expression objects (i.e. re.compile). " + str(e) + ) + + def _combine_regex(self): + alts = [] + for func in self.regex_funcs: + alts.append(func()) + + pattern = "|".join(alt.pattern for alt in alts) + return re.compile(pattern, self.flags) + + def run(self, text): + """Tokenize `text`. + + Args: + text (string): the input text to tokenize. + + Returns: + list: A list of strings (token) split according to the tokenizer cases. + + """ + return self.total_regex.split(text) + + def __repr__(self): # pragma: no cover + return str(self.total_regex) + " from: " + str(self.regex_funcs) + + +class tokenizer_cases: + + def tone_marks(): + """Keep tone-modifying punctuation by matching following character. + + Assumes the `tone_marks` pre-processor was run for cases where there might + not be any space after a tone-modifying punctuation mark. + """ + return RegexBuilder( + pattern_args=symbols.TONE_MARKS, pattern_func=lambda x: "(?<={}).".format(x) + ).regex + + def period_comma(): + """Period and comma case. + + Match if not preceded by "." and only if followed by space. + Won't cut in the middle/after dotted abbreviations; won't cut numbers. + + Note: + Won't match if a dotted abbreviation ends a sentence. + + Note: + Won't match the end of a sentence if not followed by a space. + + """ + return RegexBuilder( + pattern_args=symbols.PERIOD_COMMA, + pattern_func=lambda x: r"(?". + + """ + return PreProcessorRegex( + search_args="-", search_func=lambda x: "{}\n".format(x), repl="" + ).run(text) + + def abbreviations(text): + """Remove periods after an abbreviation from a list of known + abbreviations that can be spoken the same without that period. This + prevents having to handle tokenization of that period. + + Note: + Could potentially remove the ending period of a sentence. + + Note: + Abbreviations that Google Translate can't pronounce without + (or even with) a period should be added as a word substitution with a + :class:`PreProcessorSub` pre-processor. Ex.: 'Esq.', 'Esquire'. + + """ + return PreProcessorRegex( + search_args=symbols.ABBREVIATIONS, + search_func=lambda x: r"(?<={})(?=\.).".format(x), + repl="", + flags=re.IGNORECASE, + ).run(text) + + def word_sub(text): + """Word-for-word substitutions.""" + return PreProcessorSub(sub_pairs=symbols.SUB_PAIRS).run(text) + + +punc = symbols.ALL_PUNC +from string import whitespace as ws +import re + +_ALL_PUNC_OR_SPACE = re.compile("^[{}]*$".format(re.escape(punc + ws))) +"""Regex that matches if an entire line is only comprised +of whitespace and punctuation + +""" + + +def _minimize(the_string, delim, max_size): + """Recursively split a string in the largest chunks + possible from the highest position of a delimiter all the way + to a maximum size + + Args: + the_string (string): The string to split. + delim (string): The delimiter to split on. + max_size (int): The maximum size of a chunk. + + Returns: + list: the minimized string in tokens + + Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx`` + is the highest index of ``delim`` found in ``the_string``; and at maximum + ``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``. + In the latter case, the split will occur at ``the_string[max_size]`` + which can be any character. The function runs itself again on the rest of + ``the_string`` (``the_string[idx:]``) until no chunk is larger than + ``max_size``. + + """ + # Remove `delim` from start of `the_string` + # i.e. prevent a recursive infinite loop on `the_string[0:0]` + # if `the_string` starts with `delim` and is larger than `max_size` + if the_string.startswith(delim): + the_string = the_string[len(delim) :] + + if len(the_string) > max_size: + try: + # Find the highest index of `delim` in `the_string[0:max_size]` + # i.e. `the_string` will be cut in half on `delim` index + idx = the_string.rindex(delim, 0, max_size) + except ValueError: + # `delim` not found in `the_string`, index becomes `max_size` + # i.e. `the_string` will be cut in half arbitrarily on `max_size` + idx = max_size + # Call itself again for `the_string[idx:]` + return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size) + else: + return [the_string] + + +def _clean_tokens(tokens): + """Clean a list of strings + + Args: + tokens (list): A list of strings (tokens) to clean. + + Returns: + list: Stripped strings ``tokens`` without the original elements + that only consisted of whitespace and/or punctuation characters. + + """ + return [t.strip() for t in tokens if not _ALL_PUNC_OR_SPACE.match(t)] + + +def _translate_url(tld="com", path=""): + """Generates a Google Translate URL + + Args: + tld (string): Top-level domain for the Google Translate host, + i.e ``https://translate.google.``. Default is ``com``. + path: (string): A path to append to the Google Translate host, + i.e ``https://translate.google.com/``. Default is ``""``. + + Returns: + string: A Google Translate URL `https://translate.google./path` + """ + _GOOGLE_TTS_URL = "https://translate.google.{}/{}" + return _GOOGLE_TTS_URL.format(tld, path) + + +__all__ = ["gTTS", "gTTSError"] + +# Logger +log = logging.getLogger(__name__) +log.addHandler(logging.NullHandler()) + + +class Speed: + """Read Speed + + The Google TTS Translate API supports two speeds: + Slow: True + Normal: None + """ + + SLOW = True + NORMAL = None + + +class gTTS: + """gTTS -- Google Text-to-Speech. + + An interface to Google Translate's Text-to-Speech API. + + Args: + text (string): The text to be read. + tld (string): Top-level domain for the Google Translate host, + i.e `https://translate.google.`. Different Google domains + can produce different localized 'accents' for a given + language. This is also useful when ``google.com`` might be blocked + within a network but a local or different Google host + (e.g. ``google.com.hk``) is not. Default is ``com``. + lang (string, optional): The language (IETF language tag) to + read the text in. Default is ``en``. + slow (bool, optional): Reads text more slowly. Defaults to ``False``. + lang_check (bool, optional): Strictly enforce an existing ``lang``, + to catch a language error early. If set to ``True``, + a ``ValueError`` is raised if ``lang`` doesn't exist. + Setting ``lang_check`` to ``False`` skips Web requests + (to validate language) and therefore speeds up instantiation. + Default is ``True``. + pre_processor_funcs (list): A list of zero or more functions that are + called to transform (pre-process) text before tokenizing. Those + functions must take a string and return a string. Defaults to:: + + [ + pre_processors.tone_marks, + pre_processors.end_of_line, + pre_processors.abbreviations, + pre_processors.word_sub + ] + + tokenizer_func (callable): A function that takes in a string and + returns a list of string (tokens). Defaults to:: + + Tokenizer([ + tokenizer_cases.tone_marks, + tokenizer_cases.period_comma, + tokenizer_cases.colon, + tokenizer_cases.other_punctuation + ]).run + + timeout (float or tuple, optional): Seconds to wait for the server to + send data before giving up, as a float, or a ``(connect timeout, + read timeout)`` tuple. ``None`` will wait forever (default). + + See Also: + :doc:`Pre-processing and tokenizing ` + + Raises: + AssertionError: When ``text`` is ``None`` or empty; when there's nothing + left to speak after pre-precessing, tokenizing and cleaning. + ValueError: When ``lang_check`` is ``True`` and ``lang`` is not supported. + RuntimeError: When ``lang_check`` is ``True`` but there's an error loading + the languages dictionary. + + """ + + GOOGLE_TTS_MAX_CHARS = 100 # Max characters the Google TTS API takes at a time + GOOGLE_TTS_HEADERS = { + "Referer": "http://translate.google.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/47.0.2526.106 Safari/537.36", + "Content-Type": "application/x-www-form-urlencoded;charset=utf-8", + } + GOOGLE_TTS_RPC = "jQ1olc" + + def __init__( + self, + text, + tld="com", + lang="en", + slow=False, + lang_check=True, + pre_processor_funcs=[ + pre_processors.tone_marks, + pre_processors.end_of_line, + pre_processors.abbreviations, + pre_processors.word_sub, + ], + tokenizer_func=Tokenizer( + [ + tokenizer_cases.tone_marks, + tokenizer_cases.period_comma, + tokenizer_cases.colon, + tokenizer_cases.other_punctuation, + ] + ).run, + timeout=None, + ): + + # Debug + for k, v in dict(locals()).items(): + if k == "self": + continue + log.debug("%s: %s", k, v) + + # Text + assert text, "No text to speak" + self.text = text + + # Translate URL top-level domain + self.tld = tld + + # Language + self.lang_check = lang_check + self.lang = lang + + if self.lang_check: + # Fallback lang in case it is deprecated + self.lang = _fallback_deprecated_lang(lang) + + try: + langs = tts_langs() + if self.lang not in langs: + raise ValueError("Language not supported: %s" % lang) + except RuntimeError as e: + log.debug(str(e), exc_info=True) + log.warning(str(e)) + + # Read speed + if slow: + self.speed = Speed.SLOW + else: + self.speed = Speed.NORMAL + + # Pre-processors and tokenizer + self.pre_processor_funcs = pre_processor_funcs + self.tokenizer_func = tokenizer_func + + self.timeout = timeout + + def _tokenize(self, text): + # Pre-clean + text = text.strip() + + # Apply pre-processors + for pp in self.pre_processor_funcs: + log.debug("pre-processing: %s", pp) + text = pp(text) + + if len(text) <= self.GOOGLE_TTS_MAX_CHARS: + return _clean_tokens([text]) + + # Tokenize + log.debug("tokenizing: %s", self.tokenizer_func) + tokens = self.tokenizer_func(text) + + # Clean + tokens = _clean_tokens(tokens) + + # Minimize + min_tokens = [] + for t in tokens: + min_tokens += _minimize(t, " ", self.GOOGLE_TTS_MAX_CHARS) + + # Filter empty tokens, post-minimize + tokens = [t for t in min_tokens if t] + + return tokens + + def _prepare_requests(self): + """Created the TTS API the request(s) without sending them. + + Returns: + list: ``requests.PreparedRequests_``. `_``. + """ + # TTS API URL + translate_url = _translate_url( + tld=self.tld, path="_/TranslateWebserverUi/data/batchexecute" + ) + + text_parts = self._tokenize(self.text) + log.debug("text_parts: %s", str(text_parts)) + log.debug("text_parts: %i", len(text_parts)) + assert text_parts, "No text to send to TTS API" + + prepared_requests = [] + for idx, part in enumerate(text_parts): + data = self._package_rpc(part) + + log.debug("data-%i: %s", idx, data) + + # Request + r = requests.post( + url=translate_url, + data=data, + headers=self.GOOGLE_TTS_HEADERS, + proxies=getproxy(), + ) + + # Prepare request + prepared_requests.append(r) + + return prepared_requests + + def _package_rpc(self, text): + parameter = [text, self.lang, self.speed, "null"] + escaped_parameter = json.dumps(parameter, separators=(",", ":")) + + rpc = [[[self.GOOGLE_TTS_RPC, escaped_parameter, None, "generic"]]] + espaced_rpc = json.dumps(rpc, separators=(",", ":")) + return "f.req={}&".format(urllib.parse.quote(espaced_rpc)) + + def stream(self): + """Do the TTS API request(s) and stream bytes + + Raises: + :class:`gTTSError`: When there's an error with the API request. + + """ + # When disabling ssl verify in requests (for proxies and firewalls), + # urllib3 prints an insecure warning on stdout. We disable that. + try: + requests.packages.urllib3.disable_warnings( + requests.packages.urllib3.exceptions.InsecureRequestWarning + ) + except: + pass + + prepared_requests = self._prepare_requests() + for idx, r in enumerate(prepared_requests): + + # Write + for line in r.content.split(b"\n"): + decoded_line = line.decode("utf-8") + if "jQ1olc" in decoded_line: + audio_search = re.search(r'jQ1olc","\[\\"(.*)\\"]', decoded_line) + if audio_search: + as_bytes = audio_search.group(1).encode("ascii") + yield base64.b64decode(as_bytes) + else: + # Request successful, good response, + # no audio stream in response + raise gTTSError(tts=self, response=r) + log.debug("part-%i created", idx) + + def write_to_fp(self, fp): + """Do the TTS API request(s) and write bytes to a file-like object. + + Args: + fp (file object): Any file-like object to write the ``mp3`` to. + + Raises: + :class:`gTTSError`: When there's an error with the API request. + TypeError: When ``fp`` is not a file-like object that takes bytes. + + """ + + try: + for idx, decoded in enumerate(self.stream()): + fp.write(decoded) + log.debug("part-%i written to %s", idx, fp) + except (AttributeError, TypeError) as e: + raise TypeError( + "'fp' is not a file-like object or it does not take bytes: %s" % str(e) + ) + + def save(self, savefile): + """Do the TTS API request and write result to file. + + Args: + savefile (string): The path and file name to save the ``mp3`` to. + + Raises: + :class:`gTTSError`: When there's an error with the API request. + + """ + with open(str(savefile), "wb") as f: + self.write_to_fp(f) + f.flush() + log.debug("Saved to %s", savefile) + + +class gTTSError(Exception): + """Exception that uses context to present a meaningful error message""" + + def __init__(self, msg=None, **kwargs): + self.tts = kwargs.pop("tts", None) + self.rsp = kwargs.pop("response", None) + if msg: + self.msg = msg + elif self.tts is not None: + self.msg = self.infer_msg(self.tts, self.rsp) + else: + self.msg = None + super(gTTSError, self).__init__(self.msg) + + def infer_msg(self, tts, rsp=None): + """Attempt to guess what went wrong by using known + information (e.g. http response) and observed behaviour + + """ + cause = "Unknown" + + if rsp is None: + premise = "Failed to connect" + + if tts.tld != "com": + host = _translate_url(tld=tts.tld) + cause = "Host '{}' is not reachable".format(host) + + else: + # rsp should be + # http://docs.python-requests.org/en/master/api/ + status = rsp.status_code + reason = rsp.reason + + premise = "{:d} ({}) from TTS API".format(status, reason) + + if status == 403: + cause = "Bad token or upstream API changes" + elif status == 404 and tts.tld != "com": + cause = "Unsupported tld '{}'".format(tts.tld) + elif status == 200 and not tts.lang_check: + cause = ( + "No audio stream in response. Unsupported language '%s'" + % self.tts.lang + ) + elif status >= 500: + cause = "Upstream API error. Try again later." + + return "{}. Probable cause: {}".format(premise, cause) + + +from tts.basettsclass import TTSbase +from myutils.config import globalconfig, getlangsrc + + +class TTS(TTSbase): + def getvoicelist(self): + return [""] + + def speak(self, content, rate, voice, voiceidx): + tts = gTTS(content, lang=getlangsrc()) + fname = str(time.time()) + os.makedirs("./cache/tts/", exist_ok=True) + + tts.save("./cache/tts/" + fname + ".mp3") + return "./cache/tts/" + fname + ".mp3" diff --git a/LunaTranslator/LunaTranslator/tts/huoshantts.py b/LunaTranslator/LunaTranslator/tts/huoshantts.py index fe5de2ad..f2893705 100644 --- a/LunaTranslator/LunaTranslator/tts/huoshantts.py +++ b/LunaTranslator/LunaTranslator/tts/huoshantts.py @@ -24,36 +24,31 @@ class TTS(TTSbase): def speak(self, content, rate, voice, voiceidx): - try: - headers = { - "authority": "translate.volcengine.com", - "accept": "application/json, text/plain, */*", - "accept-language": "zh-CN,zh;q=0.9", - "origin": "chrome-extension://klgfhbdadaspgppeadghjjemk", - "sec-fetch-dest": "empty", - "sec-fetch-mode": "cors", - "sec-fetch-site": "none", - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", - } + headers = { + "authority": "translate.volcengine.com", + "accept": "application/json, text/plain, */*", + "accept-language": "zh-CN,zh;q=0.9", + "origin": "chrome-extension://klgfhbdadaspgppeadghjjemk", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "none", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36", + } - json_data = { - "text": content, - "speaker": voice, - } # - response = requests.post( - "https://translate.volcengine.com/crx/tts/v1/", - headers=headers, - json=json_data, - proxies={"http": None, "https": None}, - ) - fname = str(time.time()) - b64 = base64.b64decode(response.json()["audio"]["data"]) - os.makedirs("./cache/tts/", exist_ok=True) - with open("./cache/tts/" + fname + ".mp3", "wb") as ff: - ff.write(b64) + json_data = { + "text": content, + "speaker": voice, + } # + response = requests.post( + "https://translate.volcengine.com/crx/tts/v1/", + headers=headers, + json=json_data, + proxies={"http": None, "https": None}, + ) + fname = str(time.time()) + b64 = base64.b64decode(response.json()["audio"]["data"]) + os.makedirs("./cache/tts/", exist_ok=True) + with open("./cache/tts/" + fname + ".mp3", "wb") as ff: + ff.write(b64) - return "./cache/tts/" + fname + ".mp3" - - except: - print_exc() - return None + return "./cache/tts/" + fname + ".mp3" diff --git a/LunaTranslator/LunaTranslator/tts/voiceroid2.py b/LunaTranslator/LunaTranslator/tts/voiceroid2.py index dfc21a1f..6e9fbdf3 100644 --- a/LunaTranslator/LunaTranslator/tts/voiceroid2.py +++ b/LunaTranslator/LunaTranslator/tts/voiceroid2.py @@ -126,16 +126,16 @@ class TTS(TTSbase): def speak(self, content, rate, voice, voice_idx): self.checkpath() # def _(): - if True: + - try: - content.encode("shift-jis") - except: - return - code1 = content.encode("shift-jis") - # print(code1) - windows.WriteFile(self.hPipe, code1) + try: + content.encode("shift-jis") + except: + return + code1 = content.encode("shift-jis") + # print(code1) + windows.WriteFile(self.hPipe, code1) - fname = windows.ReadFile(self.hPipe, 1024).decode("utf8") - if os.path.exists(fname): - return fname + fname = windows.ReadFile(self.hPipe, 1024).decode("utf8") + if os.path.exists(fname): + return fname diff --git a/LunaTranslator/LunaTranslator/tts/voiceroidplus.py b/LunaTranslator/LunaTranslator/tts/voiceroidplus.py index 8702c450..fb6f3b37 100644 --- a/LunaTranslator/LunaTranslator/tts/voiceroidplus.py +++ b/LunaTranslator/LunaTranslator/tts/voiceroidplus.py @@ -115,17 +115,16 @@ class TTS(TTSbase): def speak(self, content, rate, voice, voice_idx): self.checkpath() - # def _(): - if True: + - try: - content.encode("shift-jis") - except: - return - code1 = content.encode("shift-jis") - # print(code1) - windows.WriteFile(self.hPipe, code1) + try: + content.encode("shift-jis") + except: + return + code1 = content.encode("shift-jis") + # print(code1) + windows.WriteFile(self.hPipe, code1) - fname = windows.ReadFile(self.hPipe, 1024).decode("utf8") - if os.path.exists(fname): - return fname + fname = windows.ReadFile(self.hPipe, 1024).decode("utf8") + if os.path.exists(fname): + return fname diff --git a/LunaTranslator/LunaTranslator/tts/voicevox.py b/LunaTranslator/LunaTranslator/tts/voicevox.py index fceb99da..202a6424 100644 --- a/LunaTranslator/LunaTranslator/tts/voicevox.py +++ b/LunaTranslator/LunaTranslator/tts/voicevox.py @@ -74,36 +74,33 @@ class TTS(TTSbase): def speak(self, content, rate, voice, voiceidx): - # def _(): - if True: + headers = { + "Content-Type": "application/x-www-form-urlencoded", + } - headers = { - "Content-Type": "application/x-www-form-urlencoded", - } + params = {"speaker": voiceidx, "text": content} - params = {"speaker": voiceidx, "text": content} - - response = requests.post( - f"http://localhost:{self.config['Port']}/audio_query", - params=params, - headers=headers, - proxies={"http": None, "https": None}, - ) - print(response.json()) - fname = str(time.time()) - headers = { - "Content-Type": "application/json", - } - params = { - "speaker": voiceidx, - } - response = requests.post( - f"http://localhost:{self.config['Port']}/synthesis", - params=params, - headers=headers, - data=json.dumps(response.json()), - ) - os.makedirs("./cache/tts/", exist_ok=True) - with open("./cache/tts/" + fname + ".wav", "wb") as ff: - ff.write(response.content) - return "./cache/tts/" + fname + ".wav" + response = requests.post( + f"http://localhost:{self.config['Port']}/audio_query", + params=params, + headers=headers, + proxies={"http": None, "https": None}, + ) + print(response.json()) + fname = str(time.time()) + headers = { + "Content-Type": "application/json", + } + params = { + "speaker": voiceidx, + } + response = requests.post( + f"http://localhost:{self.config['Port']}/synthesis", + params=params, + headers=headers, + data=json.dumps(response.json()), + ) + os.makedirs("./cache/tts/", exist_ok=True) + with open("./cache/tts/" + fname + ".wav", "wb") as ff: + ff.write(response.content) + return "./cache/tts/" + fname + ".wav" diff --git a/LunaTranslator/files/defaultconfig/config.json b/LunaTranslator/files/defaultconfig/config.json index 48b0caa9..c9dc48a3 100644 --- a/LunaTranslator/files/defaultconfig/config.json +++ b/LunaTranslator/files/defaultconfig/config.json @@ -725,6 +725,11 @@ "name": "GPT-SOVITS preset" } } + }, + "gtts": { + "use": false, + "name": "谷歌", + "voice": "" } }, "hirasetting": {