From 92030d892bb52af9ef7ac0c259b511a7d133f844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=81=8D=E5=85=AE=E6=83=9A=E5=85=AE?= <101191390+HIllya51@users.noreply.github.com> Date: Sat, 27 Apr 2024 17:50:45 +0800 Subject: [PATCH] issues/691 --- .../LunaTranslator/myutils/utils.py | 7 +- .../transoptimi/arabic_reshaper.py | 2168 +++++++++++++++++ .../files/defaultconfig/config.json | 3 +- .../files/defaultconfig/static_data.json | 7 +- 4 files changed, 2182 insertions(+), 3 deletions(-) create mode 100644 LunaTranslator/LunaTranslator/transoptimi/arabic_reshaper.py diff --git a/LunaTranslator/LunaTranslator/myutils/utils.py b/LunaTranslator/LunaTranslator/myutils/utils.py index 3dfee8f2..811e66ac 100644 --- a/LunaTranslator/LunaTranslator/myutils/utils.py +++ b/LunaTranslator/LunaTranslator/myutils/utils.py @@ -534,7 +534,12 @@ def checkpostlangmatch(name): for item in static_data["transoptimi"]: if name == item["name"]: try: - return globalconfig["languageuse"] == item["languageuse"] + return ( + static_data["language_list_translator_inner"][ + globalconfig["languageuse"] + ] + == item["languageuse"] + ) except: return True diff --git a/LunaTranslator/LunaTranslator/transoptimi/arabic_reshaper.py b/LunaTranslator/LunaTranslator/transoptimi/arabic_reshaper.py new file mode 100644 index 00000000..42f1f19c --- /dev/null +++ b/LunaTranslator/LunaTranslator/transoptimi/arabic_reshaper.py @@ -0,0 +1,2168 @@ +# Each letter is of the format: +# +# ('', ) +# +# And replacement is of the format: +# +# ('', '', '', '') +# +# Where is the string to replace, and is the replacement in +# case should be in isolated form, is the replacement in +# case should be in initial form, is the replacement in case +# should be in medial form, and is the replacement in case +# should be in final form. If no replacement is specified for a form, +# then no that means the letter doesn't support this form. 
+ +UNSHAPED = 255 +ISOLATED = 0 +INITIAL = 1 +MEDIAL = 2 +FINAL = 3 + +TATWEEL = '\u0640' +ZWJ = '\u200D' +LETTERS_ARABIC = { + # ARABIC LETTER HAMZA + '\u0621': ('\uFE80', '', '', ''), + # ARABIC LETTER ALEF WITH MADDA ABOVE + '\u0622': ('\uFE81', '', '', '\uFE82'), + # ARABIC LETTER ALEF WITH HAMZA ABOVE + '\u0623': ('\uFE83', '', '', '\uFE84'), + # ARABIC LETTER WAW WITH HAMZA ABOVE + '\u0624': ('\uFE85', '', '', '\uFE86'), + # ARABIC LETTER ALEF WITH HAMZA BELOW + '\u0625': ('\uFE87', '', '', '\uFE88'), + # ARABIC LETTER YEH WITH HAMZA ABOVE + '\u0626': ('\uFE89', '\uFE8B', '\uFE8C', '\uFE8A'), + # ARABIC LETTER ALEF + '\u0627': ('\uFE8D', '', '', '\uFE8E'), + # ARABIC LETTER BEH + '\u0628': ('\uFE8F', '\uFE91', '\uFE92', '\uFE90'), + # ARABIC LETTER TEH MARBUTA + '\u0629': ('\uFE93', '', '', '\uFE94'), + # ARABIC LETTER TEH + '\u062A': ('\uFE95', '\uFE97', '\uFE98', '\uFE96'), + # ARABIC LETTER THEH + '\u062B': ('\uFE99', '\uFE9B', '\uFE9C', '\uFE9A'), + # ARABIC LETTER JEEM + '\u062C': ('\uFE9D', '\uFE9F', '\uFEA0', '\uFE9E'), + # ARABIC LETTER HAH + '\u062D': ('\uFEA1', '\uFEA3', '\uFEA4', '\uFEA2'), + # ARABIC LETTER KHAH + '\u062E': ('\uFEA5', '\uFEA7', '\uFEA8', '\uFEA6'), + # ARABIC LETTER DAL + '\u062F': ('\uFEA9', '', '', '\uFEAA'), + # ARABIC LETTER THAL + '\u0630': ('\uFEAB', '', '', '\uFEAC'), + # ARABIC LETTER REH + '\u0631': ('\uFEAD', '', '', '\uFEAE'), + # ARABIC LETTER ZAIN + '\u0632': ('\uFEAF', '', '', '\uFEB0'), + # ARABIC LETTER SEEN + '\u0633': ('\uFEB1', '\uFEB3', '\uFEB4', '\uFEB2'), + # ARABIC LETTER SHEEN + '\u0634': ('\uFEB5', '\uFEB7', '\uFEB8', '\uFEB6'), + # ARABIC LETTER SAD + '\u0635': ('\uFEB9', '\uFEBB', '\uFEBC', '\uFEBA'), + # ARABIC LETTER DAD + '\u0636': ('\uFEBD', '\uFEBF', '\uFEC0', '\uFEBE'), + # ARABIC LETTER TAH + '\u0637': ('\uFEC1', '\uFEC3', '\uFEC4', '\uFEC2'), + # ARABIC LETTER ZAH + '\u0638': ('\uFEC5', '\uFEC7', '\uFEC8', '\uFEC6'), + # ARABIC LETTER AIN + '\u0639': ('\uFEC9', '\uFECB', '\uFECC', '\uFECA'), + # 
ARABIC LETTER GHAIN + '\u063A': ('\uFECD', '\uFECF', '\uFED0', '\uFECE'), + # ARABIC TATWEEL + TATWEEL: (TATWEEL, TATWEEL, TATWEEL, TATWEEL), + # ARABIC LETTER FEH + '\u0641': ('\uFED1', '\uFED3', '\uFED4', '\uFED2'), + # ARABIC LETTER QAF + '\u0642': ('\uFED5', '\uFED7', '\uFED8', '\uFED6'), + # ARABIC LETTER KAF + '\u0643': ('\uFED9', '\uFEDB', '\uFEDC', '\uFEDA'), + # ARABIC LETTER LAM + '\u0644': ('\uFEDD', '\uFEDF', '\uFEE0', '\uFEDE'), + # ARABIC LETTER MEEM + '\u0645': ('\uFEE1', '\uFEE3', '\uFEE4', '\uFEE2'), + # ARABIC LETTER NOON + '\u0646': ('\uFEE5', '\uFEE7', '\uFEE8', '\uFEE6'), + # ARABIC LETTER HEH + '\u0647': ('\uFEE9', '\uFEEB', '\uFEEC', '\uFEEA'), + # ARABIC LETTER WAW + '\u0648': ('\uFEED', '', '', '\uFEEE'), + # ARABIC LETTER (UIGHUR KAZAKH KIRGHIZ)? ALEF MAKSURA + '\u0649': ('\uFEEF', '\uFBE8', '\uFBE9', '\uFEF0'), + # ARABIC LETTER YEH + '\u064A': ('\uFEF1', '\uFEF3', '\uFEF4', '\uFEF2'), + # ARABIC LETTER ALEF WASLA + '\u0671': ('\uFB50', '', '', '\uFB51'), + # ARABIC LETTER U WITH HAMZA ABOVE + '\u0677': ('\uFBDD', '', '', ''), + # ARABIC LETTER TTEH + '\u0679': ('\uFB66', '\uFB68', '\uFB69', '\uFB67'), + # ARABIC LETTER TTEHEH + '\u067A': ('\uFB5E', '\uFB60', '\uFB61', '\uFB5F'), + # ARABIC LETTER BEEH + '\u067B': ('\uFB52', '\uFB54', '\uFB55', '\uFB53'), + # ARABIC LETTER PEH + '\u067E': ('\uFB56', '\uFB58', '\uFB59', '\uFB57'), + # ARABIC LETTER TEHEH + '\u067F': ('\uFB62', '\uFB64', '\uFB65', '\uFB63'), + # ARABIC LETTER BEHEH + '\u0680': ('\uFB5A', '\uFB5C', '\uFB5D', '\uFB5B'), + # ARABIC LETTER NYEH + '\u0683': ('\uFB76', '\uFB78', '\uFB79', '\uFB77'), + # ARABIC LETTER DYEH + '\u0684': ('\uFB72', '\uFB74', '\uFB75', '\uFB73'), + # ARABIC LETTER TCHEH + '\u0686': ('\uFB7A', '\uFB7C', '\uFB7D', '\uFB7B'), + # ARABIC LETTER TCHEHEH + '\u0687': ('\uFB7E', '\uFB80', '\uFB81', '\uFB7F'), + # ARABIC LETTER DDAL + '\u0688': ('\uFB88', '', '', '\uFB89'), + # ARABIC LETTER DAHAL + '\u068C': ('\uFB84', '', '', '\uFB85'), + # ARABIC LETTER 
DDAHAL + '\u068D': ('\uFB82', '', '', '\uFB83'), + # ARABIC LETTER DUL + '\u068E': ('\uFB86', '', '', '\uFB87'), + # ARABIC LETTER RREH + '\u0691': ('\uFB8C', '', '', '\uFB8D'), + # ARABIC LETTER JEH + '\u0698': ('\uFB8A', '', '', '\uFB8B'), + # ARABIC LETTER VEH + '\u06A4': ('\uFB6A', '\uFB6C', '\uFB6D', '\uFB6B'), + # ARABIC LETTER PEHEH + '\u06A6': ('\uFB6E', '\uFB70', '\uFB71', '\uFB6F'), + # ARABIC LETTER KEHEH + '\u06A9': ('\uFB8E', '\uFB90', '\uFB91', '\uFB8F'), + # ARABIC LETTER NG + '\u06AD': ('\uFBD3', '\uFBD5', '\uFBD6', '\uFBD4'), + # ARABIC LETTER GAF + '\u06AF': ('\uFB92', '\uFB94', '\uFB95', '\uFB93'), + # ARABIC LETTER NGOEH + '\u06B1': ('\uFB9A', '\uFB9C', '\uFB9D', '\uFB9B'), + # ARABIC LETTER GUEH + '\u06B3': ('\uFB96', '\uFB98', '\uFB99', '\uFB97'), + # ARABIC LETTER NOON GHUNNA + '\u06BA': ('\uFB9E', '', '', '\uFB9F'), + # ARABIC LETTER RNOON + '\u06BB': ('\uFBA0', '\uFBA2', '\uFBA3', '\uFBA1'), + # ARABIC LETTER HEH DOACHASHMEE + '\u06BE': ('\uFBAA', '\uFBAC', '\uFBAD', '\uFBAB'), + # ARABIC LETTER HEH WITH YEH ABOVE + '\u06C0': ('\uFBA4', '', '', '\uFBA5'), + # ARABIC LETTER HEH GOAL + '\u06C1': ('\uFBA6', '\uFBA8', '\uFBA9', '\uFBA7'), + # ARABIC LETTER KIRGHIZ OE + '\u06C5': ('\uFBE0', '', '', '\uFBE1'), + # ARABIC LETTER OE + '\u06C6': ('\uFBD9', '', '', '\uFBDA'), + # ARABIC LETTER U + '\u06C7': ('\uFBD7', '', '', '\uFBD8'), + # ARABIC LETTER YU + '\u06C8': ('\uFBDB', '', '', '\uFBDC'), + # ARABIC LETTER KIRGHIZ YU + '\u06C9': ('\uFBE2', '', '', '\uFBE3'), + # ARABIC LETTER VE + '\u06CB': ('\uFBDE', '', '', '\uFBDF'), + # ARABIC LETTER FARSI YEH + '\u06CC': ('\uFBFC', '\uFBFE', '\uFBFF', '\uFBFD'), + # ARABIC LETTER E + '\u06D0': ('\uFBE4', '\uFBE6', '\uFBE7', '\uFBE5'), + # ARABIC LETTER YEH BARREE + '\u06D2': ('\uFBAE', '', '', '\uFBAF'), + # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE + '\u06D3': ('\uFBB0', '', '', '\uFBB1'), + + # ZWJ + ZWJ: (ZWJ, ZWJ, ZWJ, ZWJ), +} + +LETTERS_ARABIC_V2 = { + # ARABIC LETTER HAMZA + '\u0621': ('\uFE80', 
'', '', ''), + # ARABIC LETTER ALEF WITH MADDA ABOVE + '\u0622': ('\u0622', '', '', '\uFE82'), + # ARABIC LETTER ALEF WITH HAMZA ABOVE + '\u0623': ('\u0623', '', '', '\uFE84'), + # ARABIC LETTER WAW WITH HAMZA ABOVE + '\u0624': ('\u0624', '', '', '\uFE86'), + # ARABIC LETTER ALEF WITH HAMZA BELOW + '\u0625': ('\u0625', '', '', '\uFE88'), + # ARABIC LETTER YEH WITH HAMZA ABOVE + '\u0626': ('\u0626', '\uFE8B', '\uFE8C', '\uFE8A'), + # ARABIC LETTER ALEF + '\u0627': ('\u0627', '', '', '\uFE8E'), + # ARABIC LETTER BEH + '\u0628': ('\u0628', '\uFE91', '\uFE92', '\uFE90'), + # ARABIC LETTER TEH MARBUTA + '\u0629': ('\u0629', '', '', '\uFE94'), + # ARABIC LETTER TEH + '\u062A': ('\u062A', '\uFE97', '\uFE98', '\uFE96'), + # ARABIC LETTER THEH + '\u062B': ('\u062B', '\uFE9B', '\uFE9C', '\uFE9A'), + # ARABIC LETTER JEEM + '\u062C': ('\u062C', '\uFE9F', '\uFEA0', '\uFE9E'), + # ARABIC LETTER HAH + '\u062D': ('\uFEA1', '\uFEA3', '\uFEA4', '\uFEA2'), + # ARABIC LETTER KHAH + '\u062E': ('\u062E', '\uFEA7', '\uFEA8', '\uFEA6'), + # ARABIC LETTER DAL + '\u062F': ('\u062F', '', '', '\uFEAA'), + # ARABIC LETTER THAL + '\u0630': ('\u0630', '', '', '\uFEAC'), + # ARABIC LETTER REH + '\u0631': ('\u0631', '', '', '\uFEAE'), + # ARABIC LETTER ZAIN + '\u0632': ('\u0632', '', '', '\uFEB0'), + # ARABIC LETTER SEEN + '\u0633': ('\u0633', '\uFEB3', '\uFEB4', '\uFEB2'), + # ARABIC LETTER SHEEN + '\u0634': ('\u0634', '\uFEB7', '\uFEB8', '\uFEB6'), + # ARABIC LETTER SAD + '\u0635': ('\u0635', '\uFEBB', '\uFEBC', '\uFEBA'), + # ARABIC LETTER DAD + '\u0636': ('\u0636', '\uFEBF', '\uFEC0', '\uFEBE'), + # ARABIC LETTER TAH + '\u0637': ('\u0637', '\uFEC3', '\uFEC4', '\uFEC2'), + # ARABIC LETTER ZAH + '\u0638': ('\u0638', '\uFEC7', '\uFEC8', '\uFEC6'), + # ARABIC LETTER AIN + '\u0639': ('\u0639', '\uFECB', '\uFECC', '\uFECA'), + # ARABIC LETTER GHAIN + '\u063A': ('\u063A', '\uFECF', '\uFED0', '\uFECE'), + # ARABIC TATWEEL + TATWEEL: (TATWEEL, TATWEEL, TATWEEL, TATWEEL), + # ARABIC LETTER FEH + 
'\u0641': ('\u0641', '\uFED3', '\uFED4', '\uFED2'), + # ARABIC LETTER QAF + '\u0642': ('\u0642', '\uFED7', '\uFED8', '\uFED6'), + # ARABIC LETTER KAF + '\u0643': ('\u0643', '\uFEDB', '\uFEDC', '\uFEDA'), + # ARABIC LETTER LAM + '\u0644': ('\u0644', '\uFEDF', '\uFEE0', '\uFEDE'), + # ARABIC LETTER MEEM + '\u0645': ('\u0645', '\uFEE3', '\uFEE4', '\uFEE2'), + # ARABIC LETTER NOON + '\u0646': ('\u0646', '\uFEE7', '\uFEE8', '\uFEE6'), + # ARABIC LETTER HEH + '\u0647': ('\u0647', '\uFEEB', '\uFEEC', '\uFEEA'), + # ARABIC LETTER WAW + '\u0648': ('\u0648', '', '', '\uFEEE'), + # ARABIC LETTER (UIGHUR KAZAKH KIRGHIZ)? ALEF MAKSURA + '\u0649': ('\u0649', '\uFBE8', '\uFBE9', '\uFEF0'), + # ARABIC LETTER YEH + '\u064A': ('\u064A', '\uFEF3', '\uFEF4', '\uFEF2'), + # ARABIC LETTER ALEF WASLA + '\u0671': ('\u0671', '', '', '\uFB51'), + # ARABIC LETTER U WITH HAMZA ABOVE + '\u0677': ('\u0677', '', '', ''), + # ARABIC LETTER TTEH + '\u0679': ('\u0679', '\uFB68', '\uFB69', '\uFB67'), + # ARABIC LETTER TTEHEH + '\u067A': ('\u067A', '\uFB60', '\uFB61', '\uFB5F'), + # ARABIC LETTER BEEH + '\u067B': ('\u067B', '\uFB54', '\uFB55', '\uFB53'), + # ARABIC LETTER PEH + '\u067E': ('\u067E', '\uFB58', '\uFB59', '\uFB57'), + # ARABIC LETTER TEHEH + '\u067F': ('\u067F', '\uFB64', '\uFB65', '\uFB63'), + # ARABIC LETTER BEHEH + '\u0680': ('\u0680', '\uFB5C', '\uFB5D', '\uFB5B'), + # ARABIC LETTER NYEH + '\u0683': ('\u0683', '\uFB78', '\uFB79', '\uFB77'), + # ARABIC LETTER DYEH + '\u0684': ('\u0684', '\uFB74', '\uFB75', '\uFB73'), + # ARABIC LETTER TCHEH + '\u0686': ('\u0686', '\uFB7C', '\uFB7D', '\uFB7B'), + # ARABIC LETTER TCHEHEH + '\u0687': ('\u0687', '\uFB80', '\uFB81', '\uFB7F'), + # ARABIC LETTER DDAL + '\u0688': ('\u0688', '', '', '\uFB89'), + # ARABIC LETTER DAHAL + '\u068C': ('\u068C', '', '', '\uFB85'), + # ARABIC LETTER DDAHAL + '\u068D': ('\u068D', '', '', '\uFB83'), + # ARABIC LETTER DUL + '\u068E': ('\u068E', '', '', '\uFB87'), + # ARABIC LETTER RREH + '\u0691': ('\u0691', '', '', 
'\uFB8D'), + # ARABIC LETTER JEH + '\u0698': ('\u0698', '', '', '\uFB8B'), + # ARABIC LETTER VEH + '\u06A4': ('\u06A4', '\uFB6C', '\uFB6D', '\uFB6B'), + # ARABIC LETTER PEHEH + '\u06A6': ('\u06A6', '\uFB70', '\uFB71', '\uFB6F'), + # ARABIC LETTER KEHEH + '\u06A9': ('\u06A9', '\uFB90', '\uFB91', '\uFB8F'), + # ARABIC LETTER NG + '\u06AD': ('\u06AD', '\uFBD5', '\uFBD6', '\uFBD4'), + # ARABIC LETTER GAF + '\u06AF': ('\u06AF', '\uFB94', '\uFB95', '\uFB93'), + # ARABIC LETTER NGOEH + '\u06B1': ('\u06B1', '\uFB9C', '\uFB9D', '\uFB9B'), + # ARABIC LETTER GUEH + '\u06B3': ('\u06B3', '\uFB98', '\uFB99', '\uFB97'), + # ARABIC LETTER NOON GHUNNA + '\u06BA': ('\u06BA', '', '', '\uFB9F'), + # ARABIC LETTER RNOON + '\u06BB': ('\u06BB', '\uFBA2', '\uFBA3', '\uFBA1'), + # ARABIC LETTER HEH DOACHASHMEE + '\u06BE': ('\u06BE', '\uFBAC', '\uFBAD', '\uFBAB'), + # ARABIC LETTER HEH WITH YEH ABOVE + '\u06C0': ('\u06C0', '', '', '\uFBA5'), + # ARABIC LETTER HEH GOAL + '\u06C1': ('\u06C1', '\uFBA8', '\uFBA9', '\uFBA7'), + # ARABIC LETTER KIRGHIZ OE + '\u06C5': ('\u06C5', '', '', '\uFBE1'), + # ARABIC LETTER OE + '\u06C6': ('\u06C6', '', '', '\uFBDA'), + # ARABIC LETTER U + '\u06C7': ('\u06C7', '', '', '\uFBD8'), + # ARABIC LETTER YU + '\u06C8': ('\u06C8', '', '', '\uFBDC'), + # ARABIC LETTER KIRGHIZ YU + '\u06C9': ('\u06C9', '', '', '\uFBE3'), + # ARABIC LETTER VE + '\u06CB': ('\u06CB', '', '', '\uFBDF'), + # ARABIC LETTER FARSI YEH + '\u06CC': ('\u06CC', '\uFBFE', '\uFBFF', '\uFBFD'), + # ARABIC LETTER E + '\u06D0': ('\u06D0', '\uFBE6', '\uFBE7', '\uFBE5'), + # ARABIC LETTER YEH BARREE + '\u06D2': ('\u06D2', '', '', '\uFBAF'), + # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE + '\u06D3': ('\u06D3', '', '', '\uFBB1'), + # Kurdish letter YEAH + '\u06ce': ('\uE004', '\uE005', '\uE006', '\uE004'), + # Kurdish letter Hamza same as arabic Teh without the point + '\u06d5': ('\u06d5', '', '', '\uE000'), + # ZWJ + ZWJ: (ZWJ, ZWJ, ZWJ, ZWJ), +} +LETTERS_KURDISH = { + # ARABIC LETTER HAMZA + '\u0621': 
('\uFE80', '', '', ''), + # ARABIC LETTER ALEF WITH MADDA ABOVE + '\u0622': ('\u0622', '', '', '\uFE82'), + # ARABIC LETTER ALEF WITH HAMZA ABOVE + '\u0623': ('\u0623', '', '', '\uFE84'), + # ARABIC LETTER WAW WITH HAMZA ABOVE + '\u0624': ('\u0624', '', '', '\uFE86'), + # ARABIC LETTER ALEF WITH HAMZA BELOW + '\u0625': ('\u0625', '', '', '\uFE88'), + # ARABIC LETTER YEH WITH HAMZA ABOVE + '\u0626': ('\u0626', '\uFE8B', '\uFE8C', '\uFE8A'), + # ARABIC LETTER ALEF + '\u0627': ('\u0627', '', '', '\uFE8E'), + # ARABIC LETTER BEH + '\u0628': ('\u0628', '\uFE91', '\uFE92', '\uFE90'), + # ARABIC LETTER TEH MARBUTA + '\u0629': ('\u0629', '', '', '\uFE94'), + # ARABIC LETTER TEH + '\u062A': ('\u062A', '\uFE97', '\uFE98', '\uFE96'), + # ARABIC LETTER THEH + '\u062B': ('\u062B', '\uFE9B', '\uFE9C', '\uFE9A'), + # ARABIC LETTER JEEM + '\u062C': ('\u062C', '\uFE9F', '\uFEA0', '\uFE9E'), + # ARABIC LETTER HAH + '\u062D': ('\uFEA1', '\uFEA3', '\uFEA4', '\uFEA2'), + # ARABIC LETTER KHAH + '\u062E': ('\u062E', '\uFEA7', '\uFEA8', '\uFEA6'), + # ARABIC LETTER DAL + '\u062F': ('\u062F', '', '', '\uFEAA'), + # ARABIC LETTER THAL + '\u0630': ('\u0630', '', '', '\uFEAC'), + # ARABIC LETTER REH + '\u0631': ('\u0631', '', '', '\uFEAE'), + # ARABIC LETTER ZAIN + '\u0632': ('\u0632', '', '', '\uFEB0'), + # ARABIC LETTER SEEN + '\u0633': ('\u0633', '\uFEB3', '\uFEB4', '\uFEB2'), + # ARABIC LETTER SHEEN + '\u0634': ('\u0634', '\uFEB7', '\uFEB8', '\uFEB6'), + # ARABIC LETTER SAD + '\u0635': ('\u0635', '\uFEBB', '\uFEBC', '\uFEBA'), + # ARABIC LETTER DAD + '\u0636': ('\u0636', '\uFEBF', '\uFEC0', '\uFEBE'), + # ARABIC LETTER TAH + '\u0637': ('\u0637', '\uFEC3', '\uFEC4', '\uFEC2'), + # ARABIC LETTER ZAH + '\u0638': ('\u0638', '\uFEC7', '\uFEC8', '\uFEC6'), + # ARABIC LETTER AIN + '\u0639': ('\u0639', '\uFECB', '\uFECC', '\uFECA'), + # ARABIC LETTER GHAIN + '\u063A': ('\u063A', '\uFECF', '\uFED0', '\uFECE'), + # ARABIC TATWEEL + TATWEEL: (TATWEEL, TATWEEL, TATWEEL, TATWEEL), + # ARABIC LETTER 
FEH + '\u0641': ('\u0641', '\uFED3', '\uFED4', '\uFED2'), + # ARABIC LETTER QAF + '\u0642': ('\u0642', '\uFED7', '\uFED8', '\uFED6'), + # ARABIC LETTER KAF + '\u0643': ('\u0643', '\uFEDB', '\uFEDC', '\uFEDA'), + # ARABIC LETTER LAM + '\u0644': ('\u0644', '\uFEDF', '\uFEE0', '\uFEDE'), + # ARABIC LETTER MEEM + '\u0645': ('\u0645', '\uFEE3', '\uFEE4', '\uFEE2'), + # ARABIC LETTER NOON + '\u0646': ('\u0646', '\uFEE7', '\uFEE8', '\uFEE6'), + # ARABIC LETTER HEH + '\u0647': ('\uFBAB', '\uFBAB', '\uFBAB', '\uFBAB'), + # ARABIC LETTER WAW + '\u0648': ('\u0648', '', '', '\uFEEE'), + # ARABIC LETTER (UIGHUR KAZAKH KIRGHIZ)? ALEF MAKSURA + '\u0649': ('\u0649', '\uFBE8', '\uFBE9', '\uFEF0'), + # ARABIC LETTER YEH + '\u064A': ('\u064A', '\uFEF3', '\uFEF4', '\uFEF2'), + # ARABIC LETTER ALEF WASLA + '\u0671': ('\u0671', '', '', '\uFB51'), + # ARABIC LETTER U WITH HAMZA ABOVE + '\u0677': ('\u0677', '', '', ''), + # ARABIC LETTER TTEH + '\u0679': ('\u0679', '\uFB68', '\uFB69', '\uFB67'), + # ARABIC LETTER TTEHEH + '\u067A': ('\u067A', '\uFB60', '\uFB61', '\uFB5F'), + # ARABIC LETTER BEEH + '\u067B': ('\u067B', '\uFB54', '\uFB55', '\uFB53'), + # ARABIC LETTER PEH + '\u067E': ('\u067E', '\uFB58', '\uFB59', '\uFB57'), + # ARABIC LETTER TEHEH + '\u067F': ('\u067F', '\uFB64', '\uFB65', '\uFB63'), + # ARABIC LETTER BEHEH + '\u0680': ('\u0680', '\uFB5C', '\uFB5D', '\uFB5B'), + # ARABIC LETTER NYEH + '\u0683': ('\u0683', '\uFB78', '\uFB79', '\uFB77'), + # ARABIC LETTER DYEH + '\u0684': ('\u0684', '\uFB74', '\uFB75', '\uFB73'), + # ARABIC LETTER TCHEH + '\u0686': ('\u0686', '\uFB7C', '\uFB7D', '\uFB7B'), + # ARABIC LETTER TCHEHEH + '\u0687': ('\u0687', '\uFB80', '\uFB81', '\uFB7F'), + # ARABIC LETTER DDAL + '\u0688': ('\u0688', '', '', '\uFB89'), + # ARABIC LETTER DAHAL + '\u068C': ('\u068C', '', '', '\uFB85'), + # ARABIC LETTER DDAHAL + '\u068D': ('\u068D', '', '', '\uFB83'), + # ARABIC LETTER DUL + '\u068E': ('\u068E', '', '', '\uFB87'), + # ARABIC LETTER RREH + '\u0691': ('\u0691', '', 
'', '\uFB8D'), + # ARABIC LETTER JEH + '\u0698': ('\u0698', '', '', '\uFB8B'), + # ARABIC LETTER VEH + '\u06A4': ('\u06A4', '\uFB6C', '\uFB6D', '\uFB6B'), + # ARABIC LETTER PEHEH + '\u06A6': ('\u06A6', '\uFB70', '\uFB71', '\uFB6F'), + # ARABIC LETTER KEHEH + '\u06A9': ('\u06A9', '\uFB90', '\uFB91', '\uFB8F'), + # ARABIC LETTER NG + '\u06AD': ('\u06AD', '\uFBD5', '\uFBD6', '\uFBD4'), + # ARABIC LETTER GAF + '\u06AF': ('\u06AF', '\uFB94', '\uFB95', '\uFB93'), + # ARABIC LETTER NGOEH + '\u06B1': ('\u06B1', '\uFB9C', '\uFB9D', '\uFB9B'), + # ARABIC LETTER GUEH + '\u06B3': ('\u06B3', '\uFB98', '\uFB99', '\uFB97'), + # ARABIC LETTER NOON GHUNNA + '\u06BA': ('\u06BA', '', '', '\uFB9F'), + # ARABIC LETTER RNOON + '\u06BB': ('\u06BB', '\uFBA2', '\uFBA3', '\uFBA1'), + # ARABIC LETTER HEH DOACHASHMEE + '\u06BE': ('\u06BE', '\uFBAC', '\uFBAD', '\uFBAB'), + # ARABIC LETTER HEH WITH YEH ABOVE + '\u06C0': ('\u06C0', '', '', '\uFBA5'), + # ARABIC LETTER HEH GOAL + '\u06C1': ('\u06C1', '\uFBA8', '\uFBA9', '\uFBA7'), + # ARABIC LETTER KIRGHIZ OE + '\u06C5': ('\u06C5', '', '', '\uFBE1'), + # ARABIC LETTER OE + '\u06C6': ('\u06C6', '', '', '\uFBDA'), + # ARABIC LETTER U + '\u06C7': ('\u06C7', '', '', '\uFBD8'), + # ARABIC LETTER YU + '\u06C8': ('\u06C8', '', '', '\uFBDC'), + # ARABIC LETTER KIRGHIZ YU + '\u06C9': ('\u06C9', '', '', '\uFBE3'), + # ARABIC LETTER VE + '\u06CB': ('\u06CB', '', '', '\uFBDF'), + # ARABIC LETTER FARSI YEH + '\u06CC': ('\u06CC', '\uFBFE', '\uFBFF', '\uFBFD'), + # ARABIC LETTER E + '\u06D0': ('\u06D0', '\uFBE6', '\uFBE7', '\uFBE5'), + # ARABIC LETTER YEH BARREE + '\u06D2': ('\u06D2', '', '', '\uFBAF'), + # ARABIC LETTER YEH BARREE WITH HAMZA ABOVE + '\u06D3': ('\u06D3', '', '', '\uFBB1'), + # Kurdish letter YEAH + '\u06ce': ('\uE004', '\uE005', '\uE006', '\uE004'), + # Kurdish letter Hamza same as arabic Teh without the point + '\u06d5': ('\u06d5', '', '', '\uE000'), + # ZWJ + ZWJ: (ZWJ, ZWJ, ZWJ, ZWJ), +} + +def connects_with_letter_before(letter,LETTERS): + 
if letter not in LETTERS: + return False + forms = LETTERS[letter] + return forms[FINAL] or forms[MEDIAL] + + +def connects_with_letter_after(letter,LETTERS): + if letter not in LETTERS: + return False + forms = LETTERS[letter] + return forms[INITIAL] or forms[MEDIAL] + + +def connects_with_letters_before_and_after(letter,LETTERS): + if letter not in LETTERS: + return False + forms = LETTERS[letter] + return forms[MEDIAL] + +# Each ligature is of the format: +# +# ('', ) +# +# Where is used in the configuration and is of the format: +# +# ('', ('', '', '', '')) +# +# Where is the string to replace, and is the replacement in +# case was in isolated form, is the replacement in case +# was in initial form, is the replacement in case was +# in medial form, and is the replacement in case was in final +# form. If no replacement is specified for a form, then no replacement of +# will occur. + +# Order here is important, it should be: +# 1. Sentences +# 2. Words +# 3. Letters +# This way we make sure we replace the longest ligatures first + +from itertools import chain + +SENTENCES_LIGATURES = ( + ('ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM', ( + '\u0628\u0633\u0645\u0020' + '\u0627\u0644\u0644\u0647\u0020' + '\u0627\u0644\u0631\u062D\u0645\u0646\u0020' + '\u0627\u0644\u0631\u062D\u064A\u0645', + + ('\uFDFD', '', '', '') + )), + ('ARABIC LIGATURE JALLAJALALOUHOU', ( + '\u062C\u0644\u0020\u062C\u0644\u0627\u0644\u0647', + + ('\uFDFB', '', '', '') + )), + ('ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM', ( + '\u0635\u0644\u0649\u0020' + '\u0627\u0644\u0644\u0647\u0020' + '\u0639\u0644\u064A\u0647\u0020' + '\u0648\u0633\u0644\u0645', + + ('\uFDFA', '', '', '') + )), +) + +WORDS_LIGATURES = ( + ('ARABIC LIGATURE ALLAH', ( + '\u0627\u0644\u0644\u0647', ('\uFDF2', '', '', ''), + )), + ('ARABIC LIGATURE AKBAR', ( + '\u0623\u0643\u0628\u0631', ('\uFDF3', '', '', ''), + )), + ('ARABIC LIGATURE ALAYHE', ( + '\u0639\u0644\u064A\u0647', ('\uFDF7', '', '', ''), + )), + ('ARABIC 
LIGATURE MOHAMMAD', ( + '\u0645\u062D\u0645\u062F', ('\uFDF4', '', '', ''), + )), + ('ARABIC LIGATURE RASOUL', ( + '\u0631\u0633\u0648\u0644', ('\uFDF6', '', '', ''), + )), + ('ARABIC LIGATURE SALAM', ( + '\u0635\u0644\u0639\u0645', ('\uFDF5', '', '', ''), + )), + ('ARABIC LIGATURE SALLA', ( + '\u0635\u0644\u0649', ('\uFDF9', '', '', ''), + )), + ('ARABIC LIGATURE WASALLAM', ( + '\u0648\u0633\u0644\u0645', ('\uFDF8', '', '', ''), + )), + ('RIAL SIGN', ( + '\u0631[\u06CC\u064A]\u0627\u0644', ('\uFDFC', '', '', ''), + )), +) + +LETTERS_LIGATURES = ( + ('ARABIC LIGATURE AIN WITH ALEF MAKSURA', ( + '\u0639\u0649', ('\uFCF7', '', '', '\uFD13'), + )), + ('ARABIC LIGATURE AIN WITH JEEM', ( + '\u0639\u062C', ('\uFC29', '\uFCBA', '', ''), + )), + ('ARABIC LIGATURE AIN WITH JEEM WITH MEEM', ( + '\u0639\u062C\u0645', ('', '\uFDC4', '', '\uFD75'), + )), + ('ARABIC LIGATURE AIN WITH MEEM', ( + '\u0639\u0645', ('\uFC2A', '\uFCBB', '', ''), + )), + ('ARABIC LIGATURE AIN WITH MEEM WITH ALEF MAKSURA', ( + '\u0639\u0645\u0649', ('', '', '', '\uFD78'), + )), + ('ARABIC LIGATURE AIN WITH MEEM WITH MEEM', ( + '\u0639\u0645\u0645', ('', '\uFD77', '', '\uFD76'), + )), + ('ARABIC LIGATURE AIN WITH MEEM WITH YEH', ( + '\u0639\u0645\u064A', ('', '', '', '\uFDB6'), + )), + ('ARABIC LIGATURE AIN WITH YEH', ( + '\u0639\u064A', ('\uFCF8', '', '', '\uFD14'), + )), + ('ARABIC LIGATURE ALEF MAKSURA WITH SUPERSCRIPT ALEF', ( + '\u0649\u0670', ('\uFC5D', '', '', '\uFC90'), + )), + ('ARABIC LIGATURE ALEF WITH FATHATAN', ( + '\u0627\u064B', ('\uFD3D', '', '', '\uFD3C'), + )), + ('ARABIC LIGATURE BEH WITH ALEF MAKSURA', ( + '\u0628\u0649', ('\uFC09', '', '', '\uFC6E'), + )), + ('ARABIC LIGATURE BEH WITH HAH', ( + '\u0628\u062D', ('\uFC06', '\uFC9D', '', ''), + )), + ('ARABIC LIGATURE BEH WITH HAH WITH YEH', ( + '\u0628\u062D\u064A', ('', '', '', '\uFDC2'), + )), + ('ARABIC LIGATURE BEH WITH HEH', ( + '\u0628\u0647', ('', '\uFCA0', '\uFCE2', ''), + )), + ('ARABIC LIGATURE BEH WITH JEEM', ( + 
'\u0628\u062C', ('\uFC05', '\uFC9C', '', ''), + )), + ('ARABIC LIGATURE BEH WITH KHAH', ( + '\u0628\u062E', ('\uFC07', '\uFC9E', '', ''), + )), + ('ARABIC LIGATURE BEH WITH KHAH WITH YEH', ( + '\u0628\u062E\u064A', ('', '', '', '\uFD9E'), + )), + ('ARABIC LIGATURE BEH WITH MEEM', ( + '\u0628\u0645', ('\uFC08', '\uFC9F', '\uFCE1', '\uFC6C'), + )), + ('ARABIC LIGATURE BEH WITH NOON', ( + '\u0628\u0646', ('', '', '', '\uFC6D'), + )), + ('ARABIC LIGATURE BEH WITH REH', ( + '\u0628\u0631', ('', '', '', '\uFC6A'), + )), + ('ARABIC LIGATURE BEH WITH YEH', ( + '\u0628\u064A', ('\uFC0A', '', '', '\uFC6F'), + )), + ('ARABIC LIGATURE BEH WITH ZAIN', ( + '\u0628\u0632', ('', '', '', '\uFC6B'), + )), + ('ARABIC LIGATURE DAD WITH ALEF MAKSURA', ( + '\u0636\u0649', ('\uFD07', '', '', '\uFD23'), + )), + ('ARABIC LIGATURE DAD WITH HAH', ( + '\u0636\u062D', ('\uFC23', '\uFCB5', '', ''), + )), + ('ARABIC LIGATURE DAD WITH HAH WITH ALEF MAKSURA', ( + '\u0636\u062D\u0649', ('', '', '', '\uFD6E'), + )), + ('ARABIC LIGATURE DAD WITH HAH WITH YEH', ( + '\u0636\u062D\u064A', ('', '', '', '\uFDAB'), + )), + ('ARABIC LIGATURE DAD WITH JEEM', ( + '\u0636\u062C', ('\uFC22', '\uFCB4', '', ''), + )), + ('ARABIC LIGATURE DAD WITH KHAH', ( + '\u0636\u062E', ('\uFC24', '\uFCB6', '', ''), + )), + ('ARABIC LIGATURE DAD WITH KHAH WITH MEEM', ( + '\u0636\u062E\u0645', ('', '\uFD70', '', '\uFD6F'), + )), + ('ARABIC LIGATURE DAD WITH MEEM', ( + '\u0636\u0645', ('\uFC25', '\uFCB7', '', ''), + )), + ('ARABIC LIGATURE DAD WITH REH', ( + '\u0636\u0631', ('\uFD10', '', '', '\uFD2C'), + )), + ('ARABIC LIGATURE DAD WITH YEH', ( + '\u0636\u064A', ('\uFD08', '', '', '\uFD24'), + )), + ('ARABIC LIGATURE FEH WITH ALEF MAKSURA', ( + '\u0641\u0649', ('\uFC31', '', '', '\uFC7C'), + )), + ('ARABIC LIGATURE FEH WITH HAH', ( + '\u0641\u062D', ('\uFC2E', '\uFCBF', '', ''), + )), + ('ARABIC LIGATURE FEH WITH JEEM', ( + '\u0641\u062C', ('\uFC2D', '\uFCBE', '', ''), + )), + ('ARABIC LIGATURE FEH WITH KHAH', ( + 
'\u0641\u062E', ('\uFC2F', '\uFCC0', '', ''), + )), + ('ARABIC LIGATURE FEH WITH KHAH WITH MEEM', ( + '\u0641\u062E\u0645', ('', '\uFD7D', '', '\uFD7C'), + )), + ('ARABIC LIGATURE FEH WITH MEEM', ( + '\u0641\u0645', ('\uFC30', '\uFCC1', '', ''), + )), + ('ARABIC LIGATURE FEH WITH MEEM WITH YEH', ( + '\u0641\u0645\u064A', ('', '', '', '\uFDC1'), + )), + ('ARABIC LIGATURE FEH WITH YEH', ( + '\u0641\u064A', ('\uFC32', '', '', '\uFC7D'), + )), + ('ARABIC LIGATURE GHAIN WITH ALEF MAKSURA', ( + '\u063A\u0649', ('\uFCF9', '', '', '\uFD15'), + )), + ('ARABIC LIGATURE GHAIN WITH JEEM', ( + '\u063A\u062C', ('\uFC2B', '\uFCBC', '', ''), + )), + ('ARABIC LIGATURE GHAIN WITH MEEM', ( + '\u063A\u0645', ('\uFC2C', '\uFCBD', '', ''), + )), + ('ARABIC LIGATURE GHAIN WITH MEEM WITH ALEF MAKSURA', ( + '\u063A\u0645\u0649', ('', '', '', '\uFD7B'), + )), + ('ARABIC LIGATURE GHAIN WITH MEEM WITH MEEM', ( + '\u063A\u0645\u0645', ('', '', '', '\uFD79'), + )), + ('ARABIC LIGATURE GHAIN WITH MEEM WITH YEH', ( + '\u063A\u0645\u064A', ('', '', '', '\uFD7A'), + )), + ('ARABIC LIGATURE GHAIN WITH YEH', ( + '\u063A\u064A', ('\uFCFA', '', '', '\uFD16'), + )), + ('ARABIC LIGATURE HAH WITH ALEF MAKSURA', ( + '\u062D\u0649', ('\uFCFF', '', '', '\uFD1B'), + )), + ('ARABIC LIGATURE HAH WITH JEEM', ( + '\u062D\u062C', ('\uFC17', '\uFCA9', '', ''), + )), + ('ARABIC LIGATURE HAH WITH JEEM WITH YEH', ( + '\u062D\u062C\u064A', ('', '', '', '\uFDBF'), + )), + ('ARABIC LIGATURE HAH WITH MEEM', ( + '\u062D\u0645', ('\uFC18', '\uFCAA', '', ''), + )), + ('ARABIC LIGATURE HAH WITH MEEM WITH ALEF MAKSURA', ( + '\u062D\u0645\u0649', ('', '', '', '\uFD5B'), + )), + ('ARABIC LIGATURE HAH WITH MEEM WITH YEH', ( + '\u062D\u0645\u064A', ('', '', '', '\uFD5A'), + )), + ('ARABIC LIGATURE HAH WITH YEH', ( + '\u062D\u064A', ('\uFD00', '', '', '\uFD1C'), + )), + ('ARABIC LIGATURE HEH WITH ALEF MAKSURA', ( + '\u0647\u0649', ('\uFC53', '', '', ''), + )), + ('ARABIC LIGATURE HEH WITH JEEM', ( + '\u0647\u062C', ('\uFC51', 
'\uFCD7', '', ''), + )), + ('ARABIC LIGATURE HEH WITH MEEM', ( + '\u0647\u0645', ('\uFC52', '\uFCD8', '', ''), + )), + ('ARABIC LIGATURE HEH WITH MEEM WITH JEEM', ( + '\u0647\u0645\u062C', ('', '\uFD93', '', ''), + )), + ('ARABIC LIGATURE HEH WITH MEEM WITH MEEM', ( + '\u0647\u0645\u0645', ('', '\uFD94', '', ''), + )), + ('ARABIC LIGATURE HEH WITH SUPERSCRIPT ALEF', ( + '\u0647\u0670', ('', '\uFCD9', '', ''), + )), + ('ARABIC LIGATURE HEH WITH YEH', ( + '\u0647\u064A', ('\uFC54', '', '', ''), + )), + ('ARABIC LIGATURE JEEM WITH ALEF MAKSURA', ( + '\u062C\u0649', ('\uFD01', '', '', '\uFD1D'), + )), + ('ARABIC LIGATURE JEEM WITH HAH', ( + '\u062C\u062D', ('\uFC15', '\uFCA7', '', ''), + )), + ('ARABIC LIGATURE JEEM WITH HAH WITH ALEF MAKSURA', ( + '\u062C\u062D\u0649', ('', '', '', '\uFDA6'), + )), + ('ARABIC LIGATURE JEEM WITH HAH WITH YEH', ( + '\u062C\u062D\u064A', ('', '', '', '\uFDBE'), + )), + ('ARABIC LIGATURE JEEM WITH MEEM', ( + '\u062C\u0645', ('\uFC16', '\uFCA8', '', ''), + )), + ('ARABIC LIGATURE JEEM WITH MEEM WITH ALEF MAKSURA', ( + '\u062C\u0645\u0649', ('', '', '', '\uFDA7'), + )), + ('ARABIC LIGATURE JEEM WITH MEEM WITH HAH', ( + '\u062C\u0645\u062D', ('', '\uFD59', '', '\uFD58'), + )), + ('ARABIC LIGATURE JEEM WITH MEEM WITH YEH', ( + '\u062C\u0645\u064A', ('', '', '', '\uFDA5'), + )), + ('ARABIC LIGATURE JEEM WITH YEH', ( + '\u062C\u064A', ('\uFD02', '', '', '\uFD1E'), + )), + ('ARABIC LIGATURE KAF WITH ALEF', ( + '\u0643\u0627', ('\uFC37', '', '', '\uFC80'), + )), + ('ARABIC LIGATURE KAF WITH ALEF MAKSURA', ( + '\u0643\u0649', ('\uFC3D', '', '', '\uFC83'), + )), + ('ARABIC LIGATURE KAF WITH HAH', ( + '\u0643\u062D', ('\uFC39', '\uFCC5', '', ''), + )), + ('ARABIC LIGATURE KAF WITH JEEM', ( + '\u0643\u062C', ('\uFC38', '\uFCC4', '', ''), + )), + ('ARABIC LIGATURE KAF WITH KHAH', ( + '\u0643\u062E', ('\uFC3A', '\uFCC6', '', ''), + )), + ('ARABIC LIGATURE KAF WITH LAM', ( + '\u0643\u0644', ('\uFC3B', '\uFCC7', '\uFCEB', '\uFC81'), + )), + ('ARABIC 
LIGATURE KAF WITH MEEM', ( + '\u0643\u0645', ('\uFC3C', '\uFCC8', '\uFCEC', '\uFC82'), + )), + ('ARABIC LIGATURE KAF WITH MEEM WITH MEEM', ( + '\u0643\u0645\u0645', ('', '\uFDC3', '', '\uFDBB'), + )), + ('ARABIC LIGATURE KAF WITH MEEM WITH YEH', ( + '\u0643\u0645\u064A', ('', '', '', '\uFDB7'), + )), + ('ARABIC LIGATURE KAF WITH YEH', ( + '\u0643\u064A', ('\uFC3E', '', '', '\uFC84'), + )), + ('ARABIC LIGATURE KHAH WITH ALEF MAKSURA', ( + '\u062E\u0649', ('\uFD03', '', '', '\uFD1F'), + )), + ('ARABIC LIGATURE KHAH WITH HAH', ( + '\u062E\u062D', ('\uFC1A', '', '', ''), + )), + ('ARABIC LIGATURE KHAH WITH JEEM', ( + '\u062E\u062C', ('\uFC19', '\uFCAB', '', ''), + )), + ('ARABIC LIGATURE KHAH WITH MEEM', ( + '\u062E\u0645', ('\uFC1B', '\uFCAC', '', ''), + )), + ('ARABIC LIGATURE KHAH WITH YEH', ( + '\u062E\u064A', ('\uFD04', '', '', '\uFD20'), + )), + ('ARABIC LIGATURE LAM WITH ALEF', ( + '\u0644\u0627', ('\uFEFB', '', '', '\uFEFC'), + )), + ('ARABIC LIGATURE LAM WITH ALEF MAKSURA', ( + '\u0644\u0649', ('\uFC43', '', '', '\uFC86'), + )), + ('ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE', ( + '\u0644\u0623', ('\uFEF7', '', '', '\uFEF8'), + )), + ('ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW', ( + '\u0644\u0625', ('\uFEF9', '', '', '\uFEFA'), + )), + ('ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE', ( + '\u0644\u0622', ('\uFEF5', '', '', '\uFEF6'), + )), + ('ARABIC LIGATURE LAM WITH HAH', ( + '\u0644\u062D', ('\uFC40', '\uFCCA', '', ''), + )), + ('ARABIC LIGATURE LAM WITH HAH WITH ALEF MAKSURA', ( + '\u0644\u062D\u0649', ('', '', '', '\uFD82'), + )), + ('ARABIC LIGATURE LAM WITH HAH WITH MEEM', ( + '\u0644\u062D\u0645', ('', '\uFDB5', '', '\uFD80'), + )), + ('ARABIC LIGATURE LAM WITH HAH WITH YEH', ( + '\u0644\u062D\u064A', ('', '', '', '\uFD81'), + )), + ('ARABIC LIGATURE LAM WITH HEH', ( + '\u0644\u0647', ('', '\uFCCD', '', ''), + )), + ('ARABIC LIGATURE LAM WITH JEEM', ( + '\u0644\u062C', ('\uFC3F', '\uFCC9', '', ''), + )), + ('ARABIC LIGATURE LAM WITH JEEM WITH 
JEEM', ( + '\u0644\u062C\u062C', ('', '\uFD83', '', '\uFD84'), + )), + ('ARABIC LIGATURE LAM WITH JEEM WITH MEEM', ( + '\u0644\u062C\u0645', ('', '\uFDBA', '', '\uFDBC'), + )), + ('ARABIC LIGATURE LAM WITH JEEM WITH YEH', ( + '\u0644\u062C\u064A', ('', '', '', '\uFDAC'), + )), + ('ARABIC LIGATURE LAM WITH KHAH', ( + '\u0644\u062E', ('\uFC41', '\uFCCB', '', ''), + )), + ('ARABIC LIGATURE LAM WITH KHAH WITH MEEM', ( + '\u0644\u062E\u0645', ('', '\uFD86', '', '\uFD85'), + )), + ('ARABIC LIGATURE LAM WITH MEEM', ( + '\u0644\u0645', ('\uFC42', '\uFCCC', '\uFCED', '\uFC85'), + )), + ('ARABIC LIGATURE LAM WITH MEEM WITH HAH', ( + '\u0644\u0645\u062D', ('', '\uFD88', '', '\uFD87'), + )), + ('ARABIC LIGATURE LAM WITH MEEM WITH YEH', ( + '\u0644\u0645\u064A', ('', '', '', '\uFDAD'), + )), + ('ARABIC LIGATURE LAM WITH YEH', ( + '\u0644\u064A', ('\uFC44', '', '', '\uFC87'), + )), + ('ARABIC LIGATURE MEEM WITH ALEF', ( + '\u0645\u0627', ('', '', '', '\uFC88'), + )), + ('ARABIC LIGATURE MEEM WITH ALEF MAKSURA', ( + '\u0645\u0649', ('\uFC49', '', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH HAH', ( + '\u0645\u062D', ('\uFC46', '\uFCCF', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH HAH WITH JEEM', ( + '\u0645\u062D\u062C', ('', '\uFD89', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH HAH WITH MEEM', ( + '\u0645\u062D\u0645', ('', '\uFD8A', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH HAH WITH YEH', ( + '\u0645\u062D\u064A', ('', '', '', '\uFD8B'), + )), + ('ARABIC LIGATURE MEEM WITH JEEM', ( + '\u0645\u062C', ('\uFC45', '\uFCCE', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH JEEM WITH HAH', ( + '\u0645\u062C\u062D', ('', '\uFD8C', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH JEEM WITH KHAH', ( + '\u0645\u062C\u062E', ('', '\uFD92', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH JEEM WITH MEEM', ( + '\u0645\u062C\u0645', ('', '\uFD8D', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH JEEM WITH YEH', ( + '\u0645\u062C\u064A', ('', '', '', '\uFDC0'), + )), + ('ARABIC LIGATURE MEEM WITH KHAH', 
( + '\u0645\u062E', ('\uFC47', '\uFCD0', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH KHAH WITH JEEM', ( + '\u0645\u062E\u062C', ('', '\uFD8E', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH KHAH WITH MEEM', ( + '\u0645\u062E\u0645', ('', '\uFD8F', '', ''), + )), + ('ARABIC LIGATURE MEEM WITH KHAH WITH YEH', ( + '\u0645\u062E\u064A', ('', '', '', '\uFDB9'), + )), + ('ARABIC LIGATURE MEEM WITH MEEM', ( + '\u0645\u0645', ('\uFC48', '\uFCD1', '', '\uFC89'), + )), + ('ARABIC LIGATURE MEEM WITH MEEM WITH YEH', ( + '\u0645\u0645\u064A', ('', '', '', '\uFDB1'), + )), + ('ARABIC LIGATURE MEEM WITH YEH', ( + '\u0645\u064A', ('\uFC4A', '', '', ''), + )), + ('ARABIC LIGATURE NOON WITH ALEF MAKSURA', ( + '\u0646\u0649', ('\uFC4F', '', '', '\uFC8E'), + )), + ('ARABIC LIGATURE NOON WITH HAH', ( + '\u0646\u062D', ('\uFC4C', '\uFCD3', '', ''), + )), + ('ARABIC LIGATURE NOON WITH HAH WITH ALEF MAKSURA', ( + '\u0646\u062D\u0649', ('', '', '', '\uFD96'), + )), + ('ARABIC LIGATURE NOON WITH HAH WITH MEEM', ( + '\u0646\u062D\u0645', ('', '\uFD95', '', ''), + )), + ('ARABIC LIGATURE NOON WITH HAH WITH YEH', ( + '\u0646\u062D\u064A', ('', '', '', '\uFDB3'), + )), + ('ARABIC LIGATURE NOON WITH HEH', ( + '\u0646\u0647', ('', '\uFCD6', '\uFCEF', ''), + )), + ('ARABIC LIGATURE NOON WITH JEEM', ( + '\u0646\u062C', ('\uFC4B', '\uFCD2', '', ''), + )), + ('ARABIC LIGATURE NOON WITH JEEM WITH ALEF MAKSURA', ( + '\u0646\u062C\u0649', ('', '', '', '\uFD99'), + )), + ('ARABIC LIGATURE NOON WITH JEEM WITH HAH', ( + '\u0646\u062C\u062D', ('', '\uFDB8', '', '\uFDBD'), + )), + ('ARABIC LIGATURE NOON WITH JEEM WITH MEEM', ( + '\u0646\u062C\u0645', ('', '\uFD98', '', '\uFD97'), + )), + ('ARABIC LIGATURE NOON WITH JEEM WITH YEH', ( + '\u0646\u062C\u064A', ('', '', '', '\uFDC7'), + )), + ('ARABIC LIGATURE NOON WITH KHAH', ( + '\u0646\u062E', ('\uFC4D', '\uFCD4', '', ''), + )), + ('ARABIC LIGATURE NOON WITH MEEM', ( + '\u0646\u0645', ('\uFC4E', '\uFCD5', '\uFCEE', '\uFC8C'), + )), + ('ARABIC LIGATURE NOON WITH 
MEEM WITH ALEF MAKSURA', ( + '\u0646\u0645\u0649', ('', '', '', '\uFD9B'), + )), + ('ARABIC LIGATURE NOON WITH MEEM WITH YEH', ( + '\u0646\u0645\u064A', ('', '', '', '\uFD9A'), + )), + ('ARABIC LIGATURE NOON WITH NOON', ( + '\u0646\u0646', ('', '', '', '\uFC8D'), + )), + ('ARABIC LIGATURE NOON WITH REH', ( + '\u0646\u0631', ('', '', '', '\uFC8A'), + )), + ('ARABIC LIGATURE NOON WITH YEH', ( + '\u0646\u064A', ('\uFC50', '', '', '\uFC8F'), + )), + ('ARABIC LIGATURE NOON WITH ZAIN', ( + '\u0646\u0632', ('', '', '', '\uFC8B'), + )), + ('ARABIC LIGATURE QAF WITH ALEF MAKSURA', ( + '\u0642\u0649', ('\uFC35', '', '', '\uFC7E'), + )), + ('ARABIC LIGATURE QAF WITH HAH', ( + '\u0642\u062D', ('\uFC33', '\uFCC2', '', ''), + )), + ('ARABIC LIGATURE QAF WITH MEEM', ( + '\u0642\u0645', ('\uFC34', '\uFCC3', '', ''), + )), + ('ARABIC LIGATURE QAF WITH MEEM WITH HAH', ( + '\u0642\u0645\u062D', ('', '\uFDB4', '', '\uFD7E'), + )), + ('ARABIC LIGATURE QAF WITH MEEM WITH MEEM', ( + '\u0642\u0645\u0645', ('', '', '', '\uFD7F'), + )), + ('ARABIC LIGATURE QAF WITH MEEM WITH YEH', ( + '\u0642\u0645\u064A', ('', '', '', '\uFDB2'), + )), + ('ARABIC LIGATURE QAF WITH YEH', ( + '\u0642\u064A', ('\uFC36', '', '', '\uFC7F'), + )), + ('ARABIC LIGATURE QALA USED AS KORANIC STOP SIGN', ( + '\u0642\u0644\u06D2', ('\uFDF1', '', '', ''), + )), + ('ARABIC LIGATURE REH WITH SUPERSCRIPT ALEF', ( + '\u0631\u0670', ('\uFC5C', '', '', ''), + )), + ('ARABIC LIGATURE SAD WITH ALEF MAKSURA', ( + '\u0635\u0649', ('\uFD05', '', '', '\uFD21'), + )), + ('ARABIC LIGATURE SAD WITH HAH', ( + '\u0635\u062D', ('\uFC20', '\uFCB1', '', ''), + )), + ('ARABIC LIGATURE SAD WITH HAH WITH HAH', ( + '\u0635\u062D\u062D', ('', '\uFD65', '', '\uFD64'), + )), + ('ARABIC LIGATURE SAD WITH HAH WITH YEH', ( + '\u0635\u062D\u064A', ('', '', '', '\uFDA9'), + )), + ('ARABIC LIGATURE SAD WITH KHAH', ( + '\u0635\u062E', ('', '\uFCB2', '', ''), + )), + ('ARABIC LIGATURE SAD WITH MEEM', ( + '\u0635\u0645', ('\uFC21', '\uFCB3', '', ''), + 
)), + ('ARABIC LIGATURE SAD WITH MEEM WITH MEEM', ( + '\u0635\u0645\u0645', ('', '\uFDC5', '', '\uFD66'), + )), + ('ARABIC LIGATURE SAD WITH REH', ( + '\u0635\u0631', ('\uFD0F', '', '', '\uFD2B'), + )), + ('ARABIC LIGATURE SAD WITH YEH', ( + '\u0635\u064A', ('\uFD06', '', '', '\uFD22'), + )), + ('ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN', ( + '\u0635\u0644\u06D2', ('\uFDF0', '', '', ''), + )), + ('ARABIC LIGATURE SEEN WITH ALEF MAKSURA', ( + '\u0633\u0649', ('\uFCFB', '', '', '\uFD17'), + )), + ('ARABIC LIGATURE SEEN WITH HAH', ( + '\u0633\u062D', ('\uFC1D', '\uFCAE', '\uFD35', ''), + )), + ('ARABIC LIGATURE SEEN WITH HAH WITH JEEM', ( + '\u0633\u062D\u062C', ('', '\uFD5C', '', ''), + )), + ('ARABIC LIGATURE SEEN WITH HEH', ( + '\u0633\u0647', ('', '\uFD31', '\uFCE8', ''), + )), + ('ARABIC LIGATURE SEEN WITH JEEM', ( + '\u0633\u062C', ('\uFC1C', '\uFCAD', '\uFD34', ''), + )), + ('ARABIC LIGATURE SEEN WITH JEEM WITH ALEF MAKSURA', ( + '\u0633\u062C\u0649', ('', '', '', '\uFD5E'), + )), + ('ARABIC LIGATURE SEEN WITH JEEM WITH HAH', ( + '\u0633\u062C\u062D', ('', '\uFD5D', '', ''), + )), + ('ARABIC LIGATURE SEEN WITH KHAH', ( + '\u0633\u062E', ('\uFC1E', '\uFCAF', '\uFD36', ''), + )), + ('ARABIC LIGATURE SEEN WITH KHAH WITH ALEF MAKSURA', ( + '\u0633\u062E\u0649', ('', '', '', '\uFDA8'), + )), + ('ARABIC LIGATURE SEEN WITH KHAH WITH YEH', ( + '\u0633\u062E\u064A', ('', '', '', '\uFDC6'), + )), + ('ARABIC LIGATURE SEEN WITH MEEM', ( + '\u0633\u0645', ('\uFC1F', '\uFCB0', '\uFCE7', ''), + )), + ('ARABIC LIGATURE SEEN WITH MEEM WITH HAH', ( + '\u0633\u0645\u062D', ('', '\uFD60', '', '\uFD5F'), + )), + ('ARABIC LIGATURE SEEN WITH MEEM WITH JEEM', ( + '\u0633\u0645\u062C', ('', '\uFD61', '', ''), + )), + ('ARABIC LIGATURE SEEN WITH MEEM WITH MEEM', ( + '\u0633\u0645\u0645', ('', '\uFD63', '', '\uFD62'), + )), + ('ARABIC LIGATURE SEEN WITH REH', ( + '\u0633\u0631', ('\uFD0E', '', '', '\uFD2A'), + )), + ('ARABIC LIGATURE SEEN WITH YEH', ( + '\u0633\u064A', ('\uFCFC', 
'', '', '\uFD18'), + )), + + # Arabic ligatures with Shadda, the order of characters doesn't matter + ('ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM', ( + '(?:\u064C\u0651|\u0651\u064C)', + + ('\uFC5E', '\uFC5E', '\uFC5E', '\uFC5E'), + )), + ('ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM', ( + '(?:\u064D\u0651|\u0651\u064D)', + + ('\uFC5F', '\uFC5F', '\uFC5F', '\uFC5F'), + )), + ('ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM', ( + '(?:\u064E\u0651|\u0651\u064E)', + + ('\uFC60', '\uFC60', '\uFC60', '\uFC60'), + )), + ('ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM', ( + '(?:\u064F\u0651|\u0651\u064F)', + + ('\uFC61', '\uFC61', '\uFC61', '\uFC61'), + )), + ('ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM', ( + '(?:\u0650\u0651|\u0651\u0650)', + + ('\uFC62', '\uFC62', '\uFC62', '\uFC62'), + )), + ('ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF', ( + '(?:\u0651\u0670|\u0670\u0651)', ('\uFC63', '', '', ''), + )), + + # There is a special case when they are with Tatweel + ('ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM', ( + '\u0640(?:\u064E\u0651|\u0651\u064E)', + + ('\uFCF2', '\uFCF2', '\uFCF2', '\uFCF2'), + )), + ('ARABIC LIGATURE SHADDA WITH DAMMA MEDIAL FORM', ( + '\u0640(?:\u064F\u0651|\u0651\u064F)', + + ('\uFCF3', '\uFCF3', '\uFCF3', '\uFCF3'), + )), + ('ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM', ( + '\u0640(?:\u0650\u0651|\u0651\u0650)', + + ('\uFCF4', '\uFCF4', '\uFCF4', '\uFCF4'), + )), + + # Repeated with different keys to be backward compatible + ('ARABIC LIGATURE SHADDA WITH FATHA', ( + '\u0640(?:\u064E\u0651|\u0651\u064E)', + + ('\uFCF2', '\uFCF2', '\uFCF2', '\uFCF2'), + )), + ('ARABIC LIGATURE SHADDA WITH DAMMA', ( + '\u0640(?:\u064F\u0651|\u0651\u064F)', + + ('\uFCF3', '\uFCF3', '\uFCF3', '\uFCF3'), + )), + ('ARABIC LIGATURE SHADDA WITH KASRA', ( + '\u0640(?:\u0650\u0651|\u0651\u0650)', + + ('\uFCF4', '\uFCF4', '\uFCF4', '\uFCF4'), + )), + + ('ARABIC LIGATURE SHEEN WITH ALEF MAKSURA', ( + '\u0634\u0649', ('\uFCFD', '', '', 
'\uFD19'), + )), + ('ARABIC LIGATURE SHEEN WITH HAH', ( + '\u0634\u062D', ('\uFD0A', '\uFD2E', '\uFD38', '\uFD26'), + )), + ('ARABIC LIGATURE SHEEN WITH HAH WITH MEEM', ( + '\u0634\u062D\u0645', ('', '\uFD68', '', '\uFD67'), + )), + ('ARABIC LIGATURE SHEEN WITH HAH WITH YEH', ( + '\u0634\u062D\u064A', ('', '', '', '\uFDAA'), + )), + ('ARABIC LIGATURE SHEEN WITH HEH', ( + '\u0634\u0647', ('', '\uFD32', '\uFCEA', ''), + )), + ('ARABIC LIGATURE SHEEN WITH JEEM', ( + '\u0634\u062C', ('\uFD09', '\uFD2D', '\uFD37', '\uFD25'), + )), + ('ARABIC LIGATURE SHEEN WITH JEEM WITH YEH', ( + '\u0634\u062C\u064A', ('', '', '', '\uFD69'), + )), + ('ARABIC LIGATURE SHEEN WITH KHAH', ( + '\u0634\u062E', ('\uFD0B', '\uFD2F', '\uFD39', '\uFD27'), + )), + ('ARABIC LIGATURE SHEEN WITH MEEM', ( + '\u0634\u0645', ('\uFD0C', '\uFD30', '\uFCE9', '\uFD28'), + )), + ('ARABIC LIGATURE SHEEN WITH MEEM WITH KHAH', ( + '\u0634\u0645\u062E', ('', '\uFD6B', '', '\uFD6A'), + )), + ('ARABIC LIGATURE SHEEN WITH MEEM WITH MEEM', ( + '\u0634\u0645\u0645', ('', '\uFD6D', '', '\uFD6C'), + )), + ('ARABIC LIGATURE SHEEN WITH REH', ( + '\u0634\u0631', ('\uFD0D', '', '', '\uFD29'), + )), + ('ARABIC LIGATURE SHEEN WITH YEH', ( + '\u0634\u064A', ('\uFCFE', '', '', '\uFD1A'), + )), + ('ARABIC LIGATURE TAH WITH ALEF MAKSURA', ( + '\u0637\u0649', ('\uFCF5', '', '', '\uFD11'), + )), + ('ARABIC LIGATURE TAH WITH HAH', ( + '\u0637\u062D', ('\uFC26', '\uFCB8', '', ''), + )), + ('ARABIC LIGATURE TAH WITH MEEM', ( + '\u0637\u0645', ('\uFC27', '\uFD33', '\uFD3A', ''), + )), + ('ARABIC LIGATURE TAH WITH MEEM WITH HAH', ( + '\u0637\u0645\u062D', ('', '\uFD72', '', '\uFD71'), + )), + ('ARABIC LIGATURE TAH WITH MEEM WITH MEEM', ( + '\u0637\u0645\u0645', ('', '\uFD73', '', ''), + )), + ('ARABIC LIGATURE TAH WITH MEEM WITH YEH', ( + '\u0637\u0645\u064A', ('', '', '', '\uFD74'), + )), + ('ARABIC LIGATURE TAH WITH YEH', ( + '\u0637\u064A', ('\uFCF6', '', '', '\uFD12'), + )), + ('ARABIC LIGATURE TEH WITH ALEF MAKSURA', ( + 
'\u062A\u0649', ('\uFC0F', '', '', '\uFC74'), + )), + ('ARABIC LIGATURE TEH WITH HAH', ( + '\u062A\u062D', ('\uFC0C', '\uFCA2', '', ''), + )), + ('ARABIC LIGATURE TEH WITH HAH WITH JEEM', ( + '\u062A\u062D\u062C', ('', '\uFD52', '', '\uFD51'), + )), + ('ARABIC LIGATURE TEH WITH HAH WITH MEEM', ( + '\u062A\u062D\u0645', ('', '\uFD53', '', ''), + )), + ('ARABIC LIGATURE TEH WITH HEH', ( + '\u062A\u0647', ('', '\uFCA5', '\uFCE4', ''), + )), + ('ARABIC LIGATURE TEH WITH JEEM', ( + '\u062A\u062C', ('\uFC0B', '\uFCA1', '', ''), + )), + ('ARABIC LIGATURE TEH WITH JEEM WITH ALEF MAKSURA', ( + '\u062A\u062C\u0649', ('', '', '', '\uFDA0'), + )), + ('ARABIC LIGATURE TEH WITH JEEM WITH MEEM', ( + '\u062A\u062C\u0645', ('', '\uFD50', '', ''), + )), + ('ARABIC LIGATURE TEH WITH JEEM WITH YEH', ( + '\u062A\u062C\u064A', ('', '', '', '\uFD9F'), + )), + ('ARABIC LIGATURE TEH WITH KHAH', ( + '\u062A\u062E', ('\uFC0D', '\uFCA3', '', ''), + )), + ('ARABIC LIGATURE TEH WITH KHAH WITH ALEF MAKSURA', ( + '\u062A\u062E\u0649', ('', '', '', '\uFDA2'), + )), + ('ARABIC LIGATURE TEH WITH KHAH WITH MEEM', ( + '\u062A\u062E\u0645', ('', '\uFD54', '', ''), + )), + ('ARABIC LIGATURE TEH WITH KHAH WITH YEH', ( + '\u062A\u062E\u064A', ('', '', '', '\uFDA1'), + )), + ('ARABIC LIGATURE TEH WITH MEEM', ( + '\u062A\u0645', ('\uFC0E', '\uFCA4', '\uFCE3', '\uFC72'), + )), + ('ARABIC LIGATURE TEH WITH MEEM WITH ALEF MAKSURA', ( + '\u062A\u0645\u0649', ('', '', '', '\uFDA4'), + )), + ('ARABIC LIGATURE TEH WITH MEEM WITH HAH', ( + '\u062A\u0645\u062D', ('', '\uFD56', '', ''), + )), + ('ARABIC LIGATURE TEH WITH MEEM WITH JEEM', ( + '\u062A\u0645\u062C', ('', '\uFD55', '', ''), + )), + ('ARABIC LIGATURE TEH WITH MEEM WITH KHAH', ( + '\u062A\u0645\u062E', ('', '\uFD57', '', ''), + )), + ('ARABIC LIGATURE TEH WITH MEEM WITH YEH', ( + '\u062A\u0645\u064A', ('', '', '', '\uFDA3'), + )), + ('ARABIC LIGATURE TEH WITH NOON', ( + '\u062A\u0646', ('', '', '', '\uFC73'), + )), + ('ARABIC LIGATURE TEH WITH REH', ( + 
'\u062A\u0631', ('', '', '', '\uFC70'), + )), + ('ARABIC LIGATURE TEH WITH YEH', ( + '\u062A\u064A', ('\uFC10', '', '', '\uFC75'), + )), + ('ARABIC LIGATURE TEH WITH ZAIN', ( + '\u062A\u0632', ('', '', '', '\uFC71'), + )), + ('ARABIC LIGATURE THAL WITH SUPERSCRIPT ALEF', ( + '\u0630\u0670', ('\uFC5B', '', '', ''), + )), + ('ARABIC LIGATURE THEH WITH ALEF MAKSURA', ( + '\u062B\u0649', ('\uFC13', '', '', '\uFC7A'), + )), + ('ARABIC LIGATURE THEH WITH HEH', ( + '\u062B\u0647', ('', '', '\uFCE6', ''), + )), + ('ARABIC LIGATURE THEH WITH JEEM', ( + '\u062B\u062C', ('\uFC11', '', '', ''), + )), + ('ARABIC LIGATURE THEH WITH MEEM', ( + '\u062B\u0645', ('\uFC12', '\uFCA6', '\uFCE5', '\uFC78'), + )), + ('ARABIC LIGATURE THEH WITH NOON', ( + '\u062B\u0646', ('', '', '', '\uFC79'), + )), + ('ARABIC LIGATURE THEH WITH REH', ( + '\u062B\u0631', ('', '', '', '\uFC76'), + )), + ('ARABIC LIGATURE THEH WITH YEH', ( + '\u062B\u064A', ('\uFC14', '', '', '\uFC7B'), + )), + ('ARABIC LIGATURE THEH WITH ZAIN', ( + '\u062B\u0632', ('', '', '', '\uFC77'), + )), + ('ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF MAKSURA', ( + '\u0626\u0649', ('\uFBF9', '\uFBFB', '', '\uFBFA'), + )), + ('ARABIC LIGATURE YEH WITH ALEF MAKSURA', ( + '\u064A\u0649', ('\uFC59', '', '', '\uFC95'), + )), + ('ARABIC LIGATURE YEH WITH HAH', ( + '\u064A\u062D', ('\uFC56', '\uFCDB', '', ''), + )), + ('ARABIC LIGATURE YEH WITH HAH WITH YEH', ( + '\u064A\u062D\u064A', ('', '', '', '\uFDAE'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH AE', ( + '\u0626\u06D5', ('\uFBEC', '', '', '\uFBED'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF', ( + '\u0626\u0627', ('\uFBEA', '', '', '\uFBEB'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF MAKSURA', ( + '\u0626\u0649', ('\uFC03', '', '', '\uFC68'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH E', ( + '\u0626\u06D0', ('\uFBF6', '\uFBF8', '', '\uFBF7'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH HAH', ( + '\u0626\u062D', 
('\uFC01', '\uFC98', '', ''), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH HEH', ( + '\u0626\u0647', ('', '\uFC9B', '\uFCE0', ''), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH JEEM', ( + '\u0626\u062C', ('\uFC00', '\uFC97', '', ''), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH KHAH', ( + '\u0626\u062E', ('', '\uFC99', '', ''), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH MEEM', ( + '\u0626\u0645', ('\uFC02', '\uFC9A', '\uFCDF', '\uFC66'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH NOON', ( + '\u0626\u0646', ('', '', '', '\uFC67'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH OE', ( + '\u0626\u06C6', ('\uFBF2', '', '', '\uFBF3'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH REH', ( + '\u0626\u0631', ('', '', '', '\uFC64'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH U', ( + '\u0626\u06C7', ('\uFBF0', '', '', '\uFBF1'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH WAW', ( + '\u0626\u0648', ('\uFBEE', '', '', '\uFBEF'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH YEH', ( + '\u0626\u064A', ('\uFC04', '', '', '\uFC69'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH YU', ( + '\u0626\u06C8', ('\uFBF4', '', '', '\uFBF5'), + )), + ('ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ZAIN', ( + '\u0626\u0632', ('', '', '', '\uFC65'), + )), + ('ARABIC LIGATURE YEH WITH HEH', ( + '\u064A\u0647', ('', '\uFCDE', '\uFCF1', ''), + )), + ('ARABIC LIGATURE YEH WITH JEEM', ( + '\u064A\u062C', ('\uFC55', '\uFCDA', '', ''), + )), + ('ARABIC LIGATURE YEH WITH JEEM WITH YEH', ( + '\u064A\u062C\u064A', ('', '', '', '\uFDAF'), + )), + ('ARABIC LIGATURE YEH WITH KHAH', ( + '\u064A\u062E', ('\uFC57', '\uFCDC', '', ''), + )), + ('ARABIC LIGATURE YEH WITH MEEM', ( + '\u064A\u0645', ('\uFC58', '\uFCDD', '\uFCF0', '\uFC93'), + )), + ('ARABIC LIGATURE YEH WITH MEEM WITH MEEM', ( + '\u064A\u0645\u0645', ('', '\uFD9D', '', '\uFD9C'), + )), + ('ARABIC LIGATURE YEH WITH MEEM WITH YEH', ( + '\u064A\u0645\u064A', ('', '', '', 
'\uFDB0'), + )), + ('ARABIC LIGATURE YEH WITH NOON', ( + '\u064A\u0646', ('', '', '', '\uFC94'), + )), + ('ARABIC LIGATURE YEH WITH REH', ( + '\u064A\u0631', ('', '', '', '\uFC91'), + )), + ('ARABIC LIGATURE YEH WITH YEH', ( + '\u064A\u064A', ('\uFC5A', '', '', '\uFC96'), + )), + ('ARABIC LIGATURE YEH WITH ZAIN', ( + '\u064A\u0632', ('', '', '', '\uFC92'), + )), + ('ARABIC LIGATURE ZAH WITH MEEM', ( + '\u0638\u0645', ('\uFC28', '\uFCB9', '\uFD3B', ''), + )), +) + +LIGATURES = tuple(chain(SENTENCES_LIGATURES, WORDS_LIGATURES, LETTERS_LIGATURES)) + + +# -*- coding: utf-8 -*- + +# This work is licensed under the MIT License. +# To view a copy of this license, visit https://opensource.org/licenses/MIT + +# Written by Abdullah Diab (mpcabd) +# Email: mpcabd@gmail.com +# Website: http://mpcabd.xyz + +import os + +from configparser import ConfigParser + +# from .letters import (UNSHAPED, ISOLATED, LETTERS_ARABIC) +# from .ligatures import (SENTENCES_LIGATURES, +# WORDS_LIGATURES, +# LETTERS_LIGATURES) + +try: + from fontTools.ttLib import TTFont + with_font_config = True +except ImportError: + with_font_config = False + +ENABLE_NO_LIGATURES = 0b000 +ENABLE_SENTENCES_LIGATURES = 0b001 +ENABLE_WORDS_LIGATURES = 0b010 +ENABLE_LETTERS_LIGATURES = 0b100 +ENABLE_ALL_LIGATURES = 0b111 + +default_config = { + # Supported languages are: [Arabic, ArabicV2, Kurdish] + # More languages might be supported soon. + # `Arabic` is the default and is recommended to work in most cases; it + # supports Arabic, Urdu and Farsi. + # `ArabicV2` is only to be used with certain fonts where you run into missing + # chars. `Kurdish` is recommended if you are using the Kurdish Sarchia font; + # it works with both Unicode and classic Arabic-Kurdish keyboards. + 'language': 'Arabic', + + # Whether to delete the Harakat (Tashkeel) before reshaping or not.
+ 'delete_harakat': True, + + # Whether to shift the Harakat (Tashkeel) one position so they appear + # correctly when the string is reversed + 'shift_harakat_position': False, + + # Whether to delete the Tatweel (U+0640) before reshaping or not. + 'delete_tatweel': False, + + # Whether to support ZWJ (U+200D) or not. + 'support_zwj': True, + + # Use unshaped form instead of isolated form. + 'use_unshaped_instead_of_isolated': False, + + # Whether to use ligatures or not. + # Serves as a shortcut to disable all ligatures. + 'support_ligatures': True, + + # When `support_ligatures` is enabled, + # separate ligatures configurations take precedence over it. + # When `support_ligatures` is disabled, + # separate ligatures configurations are ignored. + + # ------------------- Begin: Ligatures Configurations ------------------ # + + # Sentences (Enabled on top) + 'ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM': False, + 'ARABIC LIGATURE JALLAJALALOUHOU': False, + 'ARABIC LIGATURE SALLALLAHOU ALAYHE WASALLAM': False, + + # Words (Enabled on top) + 'ARABIC LIGATURE ALLAH': True, + 'ARABIC LIGATURE AKBAR': False, + 'ARABIC LIGATURE ALAYHE': False, + 'ARABIC LIGATURE MOHAMMAD': False, + 'ARABIC LIGATURE RASOUL': False, + 'ARABIC LIGATURE SALAM': False, + 'ARABIC LIGATURE SALLA': False, + 'ARABIC LIGATURE WASALLAM': False, + 'RIAL SIGN': False, + + # Letters (Enabled on top) + 'ARABIC LIGATURE LAM WITH ALEF': True, + 'ARABIC LIGATURE LAM WITH ALEF WITH HAMZA ABOVE': True, + 'ARABIC LIGATURE LAM WITH ALEF WITH HAMZA BELOW': True, + 'ARABIC LIGATURE LAM WITH ALEF WITH MADDA ABOVE': True, + 'ARABIC LIGATURE AIN WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE AIN WITH JEEM': False, + 'ARABIC LIGATURE AIN WITH JEEM WITH MEEM': False, + 'ARABIC LIGATURE AIN WITH MEEM': False, + 'ARABIC LIGATURE AIN WITH MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE AIN WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE AIN WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE AIN WITH YEH': False, + 'ARABIC
LIGATURE ALEF MAKSURA WITH SUPERSCRIPT ALEF': False, + 'ARABIC LIGATURE ALEF WITH FATHATAN': False, + 'ARABIC LIGATURE BEH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE BEH WITH HAH': False, + 'ARABIC LIGATURE BEH WITH HAH WITH YEH': False, + 'ARABIC LIGATURE BEH WITH HEH': False, + 'ARABIC LIGATURE BEH WITH JEEM': False, + 'ARABIC LIGATURE BEH WITH KHAH': False, + 'ARABIC LIGATURE BEH WITH KHAH WITH YEH': False, + 'ARABIC LIGATURE BEH WITH MEEM': False, + 'ARABIC LIGATURE BEH WITH NOON': False, + 'ARABIC LIGATURE BEH WITH REH': False, + 'ARABIC LIGATURE BEH WITH YEH': False, + 'ARABIC LIGATURE BEH WITH ZAIN': False, + 'ARABIC LIGATURE DAD WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE DAD WITH HAH': False, + 'ARABIC LIGATURE DAD WITH HAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE DAD WITH HAH WITH YEH': False, + 'ARABIC LIGATURE DAD WITH JEEM': False, + 'ARABIC LIGATURE DAD WITH KHAH': False, + 'ARABIC LIGATURE DAD WITH KHAH WITH MEEM': False, + 'ARABIC LIGATURE DAD WITH MEEM': False, + 'ARABIC LIGATURE DAD WITH REH': False, + 'ARABIC LIGATURE DAD WITH YEH': False, + 'ARABIC LIGATURE FEH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE FEH WITH HAH': False, + 'ARABIC LIGATURE FEH WITH JEEM': False, + 'ARABIC LIGATURE FEH WITH KHAH': False, + 'ARABIC LIGATURE FEH WITH KHAH WITH MEEM': False, + 'ARABIC LIGATURE FEH WITH MEEM': False, + 'ARABIC LIGATURE FEH WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE FEH WITH YEH': False, + 'ARABIC LIGATURE GHAIN WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE GHAIN WITH JEEM': False, + 'ARABIC LIGATURE GHAIN WITH MEEM': False, + 'ARABIC LIGATURE GHAIN WITH MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE GHAIN WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE GHAIN WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE GHAIN WITH YEH': False, + 'ARABIC LIGATURE HAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE HAH WITH JEEM': False, + 'ARABIC LIGATURE HAH WITH JEEM WITH YEH': False, + 'ARABIC LIGATURE HAH WITH MEEM': False, + 'ARABIC 
LIGATURE HAH WITH MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE HAH WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE HAH WITH YEH': False, + 'ARABIC LIGATURE HEH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE HEH WITH JEEM': False, + 'ARABIC LIGATURE HEH WITH MEEM': False, + 'ARABIC LIGATURE HEH WITH MEEM WITH JEEM': False, + 'ARABIC LIGATURE HEH WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE HEH WITH SUPERSCRIPT ALEF': False, + 'ARABIC LIGATURE HEH WITH YEH': False, + 'ARABIC LIGATURE JEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE JEEM WITH HAH': False, + 'ARABIC LIGATURE JEEM WITH HAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE JEEM WITH HAH WITH YEH': False, + 'ARABIC LIGATURE JEEM WITH MEEM': False, + 'ARABIC LIGATURE JEEM WITH MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE JEEM WITH MEEM WITH HAH': False, + 'ARABIC LIGATURE JEEM WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE JEEM WITH YEH': False, + 'ARABIC LIGATURE KAF WITH ALEF': False, + 'ARABIC LIGATURE KAF WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE KAF WITH HAH': False, + 'ARABIC LIGATURE KAF WITH JEEM': False, + 'ARABIC LIGATURE KAF WITH KHAH': False, + 'ARABIC LIGATURE KAF WITH LAM': False, + 'ARABIC LIGATURE KAF WITH MEEM': False, + 'ARABIC LIGATURE KAF WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE KAF WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE KAF WITH YEH': False, + 'ARABIC LIGATURE KHAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE KHAH WITH HAH': False, + 'ARABIC LIGATURE KHAH WITH JEEM': False, + 'ARABIC LIGATURE KHAH WITH MEEM': False, + 'ARABIC LIGATURE KHAH WITH YEH': False, + 'ARABIC LIGATURE LAM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE LAM WITH HAH': False, + 'ARABIC LIGATURE LAM WITH HAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE LAM WITH HAH WITH MEEM': False, + 'ARABIC LIGATURE LAM WITH HAH WITH YEH': False, + 'ARABIC LIGATURE LAM WITH HEH': False, + 'ARABIC LIGATURE LAM WITH JEEM': False, + 'ARABIC LIGATURE LAM WITH JEEM WITH JEEM': False, + 'ARABIC LIGATURE 
LAM WITH JEEM WITH MEEM': False, + 'ARABIC LIGATURE LAM WITH JEEM WITH YEH': False, + 'ARABIC LIGATURE LAM WITH KHAH': False, + 'ARABIC LIGATURE LAM WITH KHAH WITH MEEM': False, + 'ARABIC LIGATURE LAM WITH MEEM': False, + 'ARABIC LIGATURE LAM WITH MEEM WITH HAH': False, + 'ARABIC LIGATURE LAM WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE LAM WITH YEH': False, + 'ARABIC LIGATURE MEEM WITH ALEF': False, + 'ARABIC LIGATURE MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE MEEM WITH HAH': False, + 'ARABIC LIGATURE MEEM WITH HAH WITH JEEM': False, + 'ARABIC LIGATURE MEEM WITH HAH WITH MEEM': False, + 'ARABIC LIGATURE MEEM WITH HAH WITH YEH': False, + 'ARABIC LIGATURE MEEM WITH JEEM': False, + 'ARABIC LIGATURE MEEM WITH JEEM WITH HAH': False, + 'ARABIC LIGATURE MEEM WITH JEEM WITH KHAH': False, + 'ARABIC LIGATURE MEEM WITH JEEM WITH MEEM': False, + 'ARABIC LIGATURE MEEM WITH JEEM WITH YEH': False, + 'ARABIC LIGATURE MEEM WITH KHAH': False, + 'ARABIC LIGATURE MEEM WITH KHAH WITH JEEM': False, + 'ARABIC LIGATURE MEEM WITH KHAH WITH MEEM': False, + 'ARABIC LIGATURE MEEM WITH KHAH WITH YEH': False, + 'ARABIC LIGATURE MEEM WITH MEEM': False, + 'ARABIC LIGATURE MEEM WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE MEEM WITH YEH': False, + 'ARABIC LIGATURE NOON WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE NOON WITH HAH': False, + 'ARABIC LIGATURE NOON WITH HAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE NOON WITH HAH WITH MEEM': False, + 'ARABIC LIGATURE NOON WITH HAH WITH YEH': False, + 'ARABIC LIGATURE NOON WITH HEH': False, + 'ARABIC LIGATURE NOON WITH JEEM': False, + 'ARABIC LIGATURE NOON WITH JEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE NOON WITH JEEM WITH HAH': False, + 'ARABIC LIGATURE NOON WITH JEEM WITH MEEM': False, + 'ARABIC LIGATURE NOON WITH JEEM WITH YEH': False, + 'ARABIC LIGATURE NOON WITH KHAH': False, + 'ARABIC LIGATURE NOON WITH MEEM': False, + 'ARABIC LIGATURE NOON WITH MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE NOON WITH MEEM WITH YEH': 
False, + 'ARABIC LIGATURE NOON WITH NOON': False, + 'ARABIC LIGATURE NOON WITH REH': False, + 'ARABIC LIGATURE NOON WITH YEH': False, + 'ARABIC LIGATURE NOON WITH ZAIN': False, + 'ARABIC LIGATURE QAF WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE QAF WITH HAH': False, + 'ARABIC LIGATURE QAF WITH MEEM': False, + 'ARABIC LIGATURE QAF WITH MEEM WITH HAH': False, + 'ARABIC LIGATURE QAF WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE QAF WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE QAF WITH YEH': False, + 'ARABIC LIGATURE QALA USED AS KORANIC STOP SIGN': False, + 'ARABIC LIGATURE REH WITH SUPERSCRIPT ALEF': False, + 'ARABIC LIGATURE SAD WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE SAD WITH HAH': False, + 'ARABIC LIGATURE SAD WITH HAH WITH HAH': False, + 'ARABIC LIGATURE SAD WITH HAH WITH YEH': False, + 'ARABIC LIGATURE SAD WITH KHAH': False, + 'ARABIC LIGATURE SAD WITH MEEM': False, + 'ARABIC LIGATURE SAD WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE SAD WITH REH': False, + 'ARABIC LIGATURE SAD WITH YEH': False, + 'ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN': False, + 'ARABIC LIGATURE SEEN WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE SEEN WITH HAH': False, + 'ARABIC LIGATURE SEEN WITH HAH WITH JEEM': False, + 'ARABIC LIGATURE SEEN WITH HEH': False, + 'ARABIC LIGATURE SEEN WITH JEEM': False, + 'ARABIC LIGATURE SEEN WITH JEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE SEEN WITH JEEM WITH HAH': False, + 'ARABIC LIGATURE SEEN WITH KHAH': False, + 'ARABIC LIGATURE SEEN WITH KHAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE SEEN WITH KHAH WITH YEH': False, + 'ARABIC LIGATURE SEEN WITH MEEM': False, + 'ARABIC LIGATURE SEEN WITH MEEM WITH HAH': False, + 'ARABIC LIGATURE SEEN WITH MEEM WITH JEEM': False, + 'ARABIC LIGATURE SEEN WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE SEEN WITH REH': False, + 'ARABIC LIGATURE SEEN WITH YEH': False, + 'ARABIC LIGATURE SHADDA WITH DAMMA': False, + 'ARABIC LIGATURE SHADDA WITH DAMMA ISOLATED FORM': False, + 'ARABIC LIGATURE 
SHADDA WITH DAMMA MEDIAL FORM': False, + 'ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM': False, + 'ARABIC LIGATURE SHADDA WITH FATHA': False, + 'ARABIC LIGATURE SHADDA WITH FATHA ISOLATED FORM': False, + 'ARABIC LIGATURE SHADDA WITH FATHA MEDIAL FORM': False, + 'ARABIC LIGATURE SHADDA WITH KASRA': False, + 'ARABIC LIGATURE SHADDA WITH KASRA ISOLATED FORM': False, + 'ARABIC LIGATURE SHADDA WITH KASRA MEDIAL FORM': False, + 'ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM': False, + 'ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF': False, + 'ARABIC LIGATURE SHADDA WITH SUPERSCRIPT ALEF ISOLATED FORM': False, + 'ARABIC LIGATURE SHEEN WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE SHEEN WITH HAH': False, + 'ARABIC LIGATURE SHEEN WITH HAH WITH MEEM': False, + 'ARABIC LIGATURE SHEEN WITH HAH WITH YEH': False, + 'ARABIC LIGATURE SHEEN WITH HEH': False, + 'ARABIC LIGATURE SHEEN WITH JEEM': False, + 'ARABIC LIGATURE SHEEN WITH JEEM WITH YEH': False, + 'ARABIC LIGATURE SHEEN WITH KHAH': False, + 'ARABIC LIGATURE SHEEN WITH MEEM': False, + 'ARABIC LIGATURE SHEEN WITH MEEM WITH KHAH': False, + 'ARABIC LIGATURE SHEEN WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE SHEEN WITH REH': False, + 'ARABIC LIGATURE SHEEN WITH YEH': False, + 'ARABIC LIGATURE TAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE TAH WITH HAH': False, + 'ARABIC LIGATURE TAH WITH MEEM': False, + 'ARABIC LIGATURE TAH WITH MEEM WITH HAH': False, + 'ARABIC LIGATURE TAH WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE TAH WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE TAH WITH YEH': False, + 'ARABIC LIGATURE TEH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE TEH WITH HAH': False, + 'ARABIC LIGATURE TEH WITH HAH WITH JEEM': False, + 'ARABIC LIGATURE TEH WITH HAH WITH MEEM': False, + 'ARABIC LIGATURE TEH WITH HEH': False, + 'ARABIC LIGATURE TEH WITH JEEM': False, + 'ARABIC LIGATURE TEH WITH JEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE TEH WITH JEEM WITH MEEM': False, + 'ARABIC LIGATURE TEH WITH JEEM WITH 
YEH': False, + 'ARABIC LIGATURE TEH WITH KHAH': False, + 'ARABIC LIGATURE TEH WITH KHAH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE TEH WITH KHAH WITH MEEM': False, + 'ARABIC LIGATURE TEH WITH KHAH WITH YEH': False, + 'ARABIC LIGATURE TEH WITH MEEM': False, + 'ARABIC LIGATURE TEH WITH MEEM WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE TEH WITH MEEM WITH HAH': False, + 'ARABIC LIGATURE TEH WITH MEEM WITH JEEM': False, + 'ARABIC LIGATURE TEH WITH MEEM WITH KHAH': False, + 'ARABIC LIGATURE TEH WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE TEH WITH NOON': False, + 'ARABIC LIGATURE TEH WITH REH': False, + 'ARABIC LIGATURE TEH WITH YEH': False, + 'ARABIC LIGATURE TEH WITH ZAIN': False, + 'ARABIC LIGATURE THAL WITH SUPERSCRIPT ALEF': False, + 'ARABIC LIGATURE THEH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE THEH WITH HEH': False, + 'ARABIC LIGATURE THEH WITH JEEM': False, + 'ARABIC LIGATURE THEH WITH MEEM': False, + 'ARABIC LIGATURE THEH WITH NOON': False, + 'ARABIC LIGATURE THEH WITH REH': False, + 'ARABIC LIGATURE THEH WITH YEH': False, + 'ARABIC LIGATURE THEH WITH ZAIN': False, + 'ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH HAMZA ABOVE WITH ALEF MAKSURA': False, # noqa + 'ARABIC LIGATURE YEH WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE YEH WITH HAH': False, + 'ARABIC LIGATURE YEH WITH HAH WITH YEH': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH AE': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ALEF MAKSURA': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH E': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH HAH': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH HEH': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH JEEM': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH KHAH': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH MEEM': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH NOON': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH OE': False, + 'ARABIC LIGATURE YEH 
WITH HAMZA ABOVE WITH REH': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH U': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH WAW': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH YEH': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH YU': False, + 'ARABIC LIGATURE YEH WITH HAMZA ABOVE WITH ZAIN': False, + 'ARABIC LIGATURE YEH WITH HEH': False, + 'ARABIC LIGATURE YEH WITH JEEM': False, + 'ARABIC LIGATURE YEH WITH JEEM WITH YEH': False, + 'ARABIC LIGATURE YEH WITH KHAH': False, + 'ARABIC LIGATURE YEH WITH MEEM': False, + 'ARABIC LIGATURE YEH WITH MEEM WITH MEEM': False, + 'ARABIC LIGATURE YEH WITH MEEM WITH YEH': False, + 'ARABIC LIGATURE YEH WITH NOON': False, + 'ARABIC LIGATURE YEH WITH REH': False, + 'ARABIC LIGATURE YEH WITH YEH': False, + 'ARABIC LIGATURE YEH WITH ZAIN': False, + 'ARABIC LIGATURE ZAH WITH MEEM': False, + # -------------------- End: Ligatures Configurations ------------------- # +} + + +def auto_config(configuration=None, configuration_file=None): + loaded_from_envvar = False + + configuration_parser = ConfigParser() + configuration_parser.read_dict({ + 'ArabicReshaper': default_config + }) + + if not configuration_file: + configuration_file = os.getenv( + 'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE' + ) + if configuration_file: + loaded_from_envvar = True + + if configuration_file: + if not os.path.exists(configuration_file): + raise Exception( + 'Configuration file {} not found{}.'.format( + configuration_file, + loaded_from_envvar and ( + ' it is set in your environment variable ' + + 'PYTHON_ARABIC_RESHAPER_CONFIGURATION_FILE' + ) or '' + ) + ) + configuration_parser.read((configuration_file,)) + + if configuration: + configuration_parser.read_dict({ + 'ArabicReshaper': configuration + }) + + if 'ArabicReshaper' not in configuration_parser: + raise ValueError( + 'Invalid configuration: ' + 'A section with the name ArabicReshaper was not found' + ) + + return configuration_parser['ArabicReshaper'] + + +def 
def config_for_true_type_font(font_file_path,
                              ligatures_config=ENABLE_ALL_LIGATURES):
    """Derive a reshaper configuration from the glyphs a font provides.

    Args:
        font_file_path: path to the font file handed to ``TTFont``.
        ligatures_config: bitmask built from ``ENABLE_SENTENCES_LIGATURES``,
            ``ENABLE_WORDS_LIGATURES`` and ``ENABLE_LETTERS_LIGATURES``
            selecting which ligature groups are probed.

    Returns:
        A dict with ``use_unshaped_instead_of_isolated`` (true when at least
        one isolated form of ``LETTERS_ARABIC`` is missing from the font)
        plus one boolean per probed ligature name, true only when every
        non-empty form of that ligature is present in the font's cmap.

    Raises:
        Exception: fonttools is unavailable, or the font path is invalid.
    """
    if not with_font_config:
        raise Exception('fonttools not installed, ' +
                        'install it then rerun this.\n' +
                        '$ pip install arabic-reshaper[with-fonttools]')
    if not font_file_path or not os.path.exists(font_file_path):
        raise Exception('Invalid path to font file')

    ttfont = TTFont(font_file_path)
    try:
        # Collect every code point mapped by any cmap subtable once, instead
        # of rescanning all subtables for every glyph that is probed.
        supported = set()
        for table in ttfont['cmap'].tables:
            supported.update(table.cmap)
    finally:
        # Release the underlying file handle held by the font reader.
        ttfont.close()

    has_isolated = all(
        ord(forms[ISOLATED]) in supported
        for forms in LETTERS_ARABIC.values()
    )

    configuration = {
        'use_unshaped_instead_of_isolated': not has_isolated,
    }

    def process_ligatures(ligatures):
        # Each record is (name, (match, forms)); a ligature is enabled only
        # when all of its non-empty forms exist in the font.
        for ligature in ligatures:
            name, forms = ligature[0], ligature[1][1]
            configuration[name] = all(
                ord(form) in supported for form in forms if form != ''
            )

    if ENABLE_SENTENCES_LIGATURES & ligatures_config:
        process_ligatures(SENTENCES_LIGATURES)

    if ENABLE_WORDS_LIGATURES & ligatures_config:
        process_ligatures(WORDS_LIGATURES)

    if ENABLE_LETTERS_LIGATURES & ligatures_config:
        process_ligatures(LETTERS_LIGATURES)

    return configuration

# -*- coding: utf-8 -*-

# This work is licensed under the MIT License.
# To view a copy of this license, visit https://opensource.org/licenses/MIT

# Written by Abdullah Diab (mpcabd)
# Email: mpcabd@gmail.com
# Website: http://mpcabd.xyz

import re

from itertools import repeat

# from .ligatures import LIGATURES
# from .reshaper_config import auto_config
# from .letters import (UNSHAPED, ISOLATED, TATWEEL, ZWJ, LETTERS_ARABIC,
#                       LETTERS_ARABIC_V2, LETTERS_KURDISH, FINAL,
#                       INITIAL, MEDIAL, connects_with_letters_before_and_after,
#                       connects_with_letter_before, connects_with_letter_after)

# Matches a single harakat (diacritic) character; used both to classify
# letters while shaping and to strip diacritics before ligature matching.
HARAKAT_RE = re.compile(
    '['
    '\u0610-\u061a'
    '\u064b-\u065f'
    '\u0670'
    '\u06d6-\u06dc'
    '\u06df-\u06e8'
    '\u06ea-\u06ed'
    '\u08d4-\u08e1'
    '\u08d4-\u08ed'
    '\u08e3-\u08ff'
    ']',

    re.UNICODE | re.X
)


class ArabicReshaper(object):
    """
    Reshape Arabic text into presentation forms, driven by a configuration.

    The configuration is obtained from :func:`auto_config`, called with the
    ``configuration`` mapping and ``configuration_file`` path passed to the
    constructor.

    The ``language`` option selects the letters table: ``ArabicV2`` and
    ``Kurdish`` pick their dedicated tables, anything else uses
    ``LETTERS_ARABIC``.

    Check this link for information on the configuration file format:

    * Python 3: https://docs.python.org/3/library/configparser.html
    """

    def __init__(self, configuration=None, configuration_file=None):
        super(ArabicReshaper, self).__init__()

        self.configuration = auto_config(configuration, configuration_file)
        self.language = self.configuration.get('language')

        if self.language == 'ArabicV2':
            self.letters = LETTERS_ARABIC_V2
        elif self.language == 'Kurdish':
            self.letters = LETTERS_KURDISH
        else:
            self.letters = LETTERS_ARABIC

    @property
    def _ligatures_re(self):
        """Compiled alternation of every enabled ligature, built once.

        Building the pattern also fills
        ``self._re_group_index_to_ligature_forms``, which maps the index of
        each regex group to the forms tuple of the ligature it matches.
        """
        # A plain single-underscore attribute is used for the cache: the
        # previous ``hasattr(self, '__ligatures_re')`` test could never
        # succeed because ``self.__ligatures_re`` is stored under its
        # name-mangled form, so the pattern was rebuilt on every access.
        compiled = getattr(self, '_ligatures_re_cache', None)
        if compiled is None:
            patterns = []
            re_group_index_to_ligature_forms = {}
            index = 0
            FORMS = 1
            MATCH = 0
            for ligature_record in LIGATURES:
                ligature, replacement = ligature_record
                if not self.configuration.getboolean(ligature):
                    continue
                re_group_index_to_ligature_forms[index] = replacement[FORMS]
                patterns.append('({})'.format(replacement[MATCH]))
                index += 1
            self._re_group_index_to_ligature_forms = (
                re_group_index_to_ligature_forms
            )
            compiled = re.compile('|'.join(patterns), re.UNICODE)
            self._ligatures_re_cache = compiled
        return compiled

    def _get_ligature_forms_from_re_group_index(self, group_index):
        """Return the forms tuple of the ligature behind a regex group."""
        if not hasattr(self, '_re_group_index_to_ligature_forms'):
            # Accessing the property builds the mapping as a side effect.
            # (Previously the compiled regex itself was returned here.)
            self._ligatures_re
        return self._re_group_index_to_ligature_forms[group_index]

    def reshape(self, text):
        """Return ``text`` with its letters replaced by contextual forms.

        Empty input yields ``''``. Characters missing from the active
        letters table are copied through unchanged. Harakat, tatweel and
        ZWJ handling, as well as ligature substitution, follow the boolean
        options read from ``self.configuration``.
        """
        if not text:
            return ''

        output = []

        LETTER = 0
        FORM = 1
        NOT_SUPPORTED = -1

        delete_harakat = self.configuration.getboolean('delete_harakat')
        delete_tatweel = self.configuration.getboolean('delete_tatweel')
        support_zwj = self.configuration.getboolean('support_zwj')
        shift_harakat_position = self.configuration.getboolean(
            'shift_harakat_position'
        )
        use_unshaped_instead_of_isolated = self.configuration.getboolean(
            'use_unshaped_instead_of_isolated'
        )

        # Maps an index in ``output`` to the harakat that follow that letter.
        positions_harakat = {}

        isolated_form = (UNSHAPED
                         if use_unshaped_instead_of_isolated else ISOLATED)

        for letter in text:
            if HARAKAT_RE.match(letter):
                if not delete_harakat:
                    position = len(output) - 1
                    if shift_harakat_position:
                        position -= 1
                    if position not in positions_harakat:
                        positions_harakat[position] = []
                    if shift_harakat_position:
                        positions_harakat[position].insert(0, letter)
                    else:
                        positions_harakat[position].append(letter)
            elif letter == TATWEEL and delete_tatweel:
                pass
            elif letter == ZWJ and not support_zwj:
                pass
            elif letter not in self.letters:
                output.append((letter, NOT_SUPPORTED))
            elif not output:  # first letter
                output.append((letter, isolated_form))
            else:
                previous_letter = output[-1]
                if previous_letter[FORM] == NOT_SUPPORTED:
                    output.append((letter, isolated_form))
                elif not connects_with_letter_before(letter, self.letters):
                    output.append((letter, isolated_form))
                elif not connects_with_letter_after(
                        previous_letter[LETTER], self.letters):
                    output.append((letter, isolated_form))
                elif (previous_letter[FORM] == FINAL and not
                      connects_with_letters_before_and_after(
                          previous_letter[LETTER], self.letters
                      )):
                    output.append((letter, isolated_form))
                elif previous_letter[FORM] == isolated_form:
                    output[-1] = (
                        previous_letter[LETTER],
                        INITIAL
                    )
                    output.append((letter, FINAL))
                # Otherwise, we will change the previous letter to connect
                # to the current letter
                else:
                    output[-1] = (
                        previous_letter[LETTER],
                        MEDIAL
                    )
                    output.append((letter, FINAL))

            # Remove ZWJ if it's the second to last item as it won't be useful
            if support_zwj and len(output) > 1 and output[-2][LETTER] == ZWJ:
                output.pop(len(output) - 2)

        if support_zwj and output and output[-1][LETTER] == ZWJ:
            output.pop()

        if self.configuration.getboolean('support_ligatures'):
            # Clean text from Harakat to be able to find ligatures
            text = HARAKAT_RE.sub('', text)

            # Clean text from Tatweel to find ligatures if delete_tatweel
            if delete_tatweel:
                text = text.replace(TATWEEL, '')

            # NOTE(review): ZWJ characters are not kept in ``output`` but
            # are still present in ``text`` here, so match offsets after a
            # ZWJ look misaligned with ``output`` indexes -- confirm.
            for match in re.finditer(self._ligatures_re, text):
                group_index = next((
                    i for i, group in enumerate(match.groups()) if group
                ), -1)
                if group_index == -1:
                    # No ligature is enabled: the pattern is empty and
                    # matches the empty string everywhere. There is nothing
                    # to substitute (this used to raise KeyError(-1)).
                    continue
                forms = self._get_ligature_forms_from_re_group_index(
                    group_index
                )
                a, b = match.span()
                a_form = output[a][FORM]
                b_form = output[b - 1][FORM]
                ligature_form = None

                # +-----------+----------+---------+---------+----------+
                # | a   \   b | ISOLATED | INITIAL | MEDIAL  | FINAL    |
                # +-----------+----------+---------+---------+----------+
                # | ISOLATED  | ISOLATED | INITIAL | INITIAL | ISOLATED |
                # | INITIAL   | ISOLATED | INITIAL | INITIAL | ISOLATED |
                # | MEDIAL    | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
                # | FINAL     | FINAL    | MEDIAL  | MEDIAL  | FINAL    |
                # +-----------+----------+---------+---------+----------+

                if a_form in (isolated_form, INITIAL):
                    if b_form in (isolated_form, FINAL):
                        ligature_form = ISOLATED
                    else:
                        ligature_form = INITIAL
                else:
                    if b_form in (isolated_form, FINAL):
                        ligature_form = FINAL
                    else:
                        ligature_form = MEDIAL
                if not forms[ligature_form]:
                    continue
                # The ligature glyph takes the first slot; the remaining
                # slots of the match are blanked so indexes stay stable.
                output[a] = (forms[ligature_form], NOT_SUPPORTED)
                output[a+1:b] = repeat(('', NOT_SUPPORTED), b - 1 - a)

        result = []
        if not delete_harakat and -1 in positions_harakat:
            result.extend(positions_harakat[-1])
        for i, o in enumerate(output):
            if o[LETTER]:
                if o[FORM] == NOT_SUPPORTED or o[FORM] == UNSHAPED:
                    result.append(o[LETTER])
                else:
                    result.append(self.letters[o[LETTER]][o[FORM]])

            if not delete_harakat:
                if i in positions_harakat:
                    result.extend(positions_harakat[i])

        return ''.join(result)
default_reshaper = ArabicReshaper()
reshape = default_reshaper.reshape


class Process:
    """Translation post-processing hook that reshapes and mirrors text."""

    def process_before(self, text):
        """Return ``text`` unchanged together with an empty context dict."""
        # Nothing needs to be prepared before translation for this hook.
        return text, {}

    def process_after(self, res, context):
        """Reshape the translated text and mirror every line.

        The text is first reshaped so the letters use their joined
        presentation forms, then each line is reversed character by
        character. Lines are mirrored one at a time (line terminators are
        left where they are) so that multi-line results keep their line
        order; reversing the whole string at once would also swap the
        lines. ``context`` is not used.
        """
        reshaped_text = reshape(res)
        # The capturing group keeps the terminators in the split result at
        # the odd indexes; only the even-indexed line bodies are reversed.
        pieces = re.split(r'(\r\n|\r|\n)', reshaped_text)
        return ''.join(
            piece if index % 2 else piece[::-1]
            for index, piece in enumerate(pieces)
        )
of file diff --git a/LunaTranslator/files/defaultconfig/config.json b/LunaTranslator/files/defaultconfig/config.json index 225bdd0b..dc578ae4 100644 --- a/LunaTranslator/files/defaultconfig/config.json +++ b/LunaTranslator/files/defaultconfig/config.json @@ -54,7 +54,8 @@ "transerrorfix":false, "gongxiangcishu":false, "vndbnamemap":false, - "myprocess":false + "myprocess":false, + "arabic_reshaper":false }, "uselongtermcache": false, "hist_split": false, diff --git a/LunaTranslator/files/defaultconfig/static_data.json b/LunaTranslator/files/defaultconfig/static_data.json index 4b9bfe6a..0567dfe4 100644 --- a/LunaTranslator/files/defaultconfig/static_data.json +++ b/LunaTranslator/files/defaultconfig/static_data.json @@ -353,11 +353,16 @@ { "name":"vndbnamemap", "visname":"使用VNDB数据替换人名", - "languageuse":2 + "languageuse":"en" }, { "name":"myprocess", "visname":"使用自定义优化" + }, + { + "name":"arabic_reshaper", + "visname":"arabic reshaper", + "languageuse":"ar" } ] }