lib.itmens/common/models/lang.py

417 lines
23 KiB
Python
Raw Normal View History

2024-07-13 00:16:47 -04:00
"""
Language support utilities

get site wide preferences:
    SITE_DEFAULT_LANGUAGE
    SITE_PREFERRED_LANGUAGES
    SITE_PREFERRED_LOCALES

get available choices based on site wide preferences:
    LANGUAGE_CHOICES
    LOCALE_CHOICES
    SCRIPT_CHOICES

based on user preferences:
    get_current_locales()

detect language based on text:
    detect_language()

references:
    https://en.wikipedia.org/wiki/IETF_language_tag
"""
import re
from typing import Any
from django.conf import settings
from django.utils.translation import get_language
from django.utils.translation import gettext_lazy as _
from langdetect import detect
from loguru import logger
# Language code used when the site configures nothing else.
FALLBACK_LANGUAGE = "en"
# Site-wide language preference list read from Django settings; an empty or
# falsy setting is replaced by a one-element list holding FALLBACK_LANGUAGE.
SITE_PREFERRED_LANGUAGES: list[str] = settings.PREFERRED_LANGUAGES or [
FALLBACK_LANGUAGE
]
# The first preferred language doubles as the site default.
SITE_DEFAULT_LANGUAGE: str = SITE_PREFERRED_LANGUAGES[0]
2024-07-13 00:16:47 -04:00
# Two-letter language code -> display name. Names are wrapped in gettext_lazy
# (imported as `_`), so they are translated when rendered, not at import time.
# Insertion order here is the tail order of every derived choice list.
ISO_639_1 = {
"aa": _("Afar"),
"af": _("Afrikaans"),
"ak": _("Akan"),
"an": _("Aragonese"),
"as": _("Assamese"),
"av": _("Avaric"),
"ae": _("Avestan"),
"ay": _("Aymara"),
"az": _("Azerbaijani"),
"ba": _("Bashkir"),
"bm": _("Bambara"),
"bi": _("Bislama"),
"bo": _("Tibetan"),
"br": _("Breton"),
"ca": _("Catalan"),
"cs": _("Czech"),
"ce": _("Chechen"),
"cu": _("Slavic"),  # NOTE(review): ISO 639-1 names "cu" Church Slavic — confirm the short label is intended
"cv": _("Chuvash"),
"kw": _("Cornish"),
"co": _("Corsican"),
"cr": _("Cree"),
"cy": _("Welsh"),
"da": _("Danish"),
"de": _("German"),
"dv": _("Divehi"),
"dz": _("Dzongkha"),
"eo": _("Esperanto"),
"et": _("Estonian"),
"eu": _("Basque"),
"fo": _("Faroese"),
"fj": _("Fijian"),
"fi": _("Finnish"),
"fr": _("French"),
"fy": _("Frisian"),
"ff": _("Fulah"),
"gd": _("Gaelic"),
"ga": _("Irish"),
"gl": _("Galician"),
"gv": _("Manx"),
"gn": _("Guarani"),
"gu": _("Gujarati"),
"ht": _("Haitian; Haitian Creole"),
"ha": _("Hausa"),
"sh": _("Serbo-Croatian"),
"hz": _("Herero"),
"ho": _("Hiri Motu"),
"hr": _("Croatian"),
"hu": _("Hungarian"),
"ig": _("Igbo"),
"io": _("Ido"),
"ii": _("Yi"),
"iu": _("Inuktitut"),
"ie": _("Interlingue"),
"ia": _("Interlingua"),
"id": _("Indonesian"),
"ik": _("Inupiaq"),
"is": _("Icelandic"),
"it": _("Italian"),
"jv": _("Javanese"),
"ja": _("Japanese"),
"kl": _("Kalaallisut"),
"kn": _("Kannada"),
"ks": _("Kashmiri"),
"kr": _("Kanuri"),  # not Korean — Korean is "ko" below
"kk": _("Kazakh"),
"km": _("Khmer"),
"ki": _("Kikuyu"),
"rw": _("Kinyarwanda"),
"ky": _("Kirghiz"),
"kv": _("Komi"),
"kg": _("Kongo"),
"ko": _("Korean"),
"kj": _("Kuanyama"),
"ku": _("Kurdish"),
"lo": _("Lao"),
"la": _("Latin"),
"lv": _("Latvian"),
"li": _("Limburgish"),
"ln": _("Lingala"),
"lt": _("Lithuanian"),
"lb": _("Letzeburgesch"),
"lu": _("Luba-Katanga"),
"lg": _("Ganda"),
"mh": _("Marshall"),
"ml": _("Malayalam"),
"mr": _("Marathi"),
"mg": _("Malagasy"),
"mt": _("Maltese"),
"mo": _("Moldavian"),
"mn": _("Mongolian"),
"mi": _("Maori"),
"ms": _("Malay"),
"my": _("Burmese"),
"na": _("Nauru"),
"nv": _("Navajo"),
# NOTE(review): "nr" (South Ndebele in ISO 639-1) and "nd" (North Ndebele)
# share one label, so the two choices are indistinguishable to users.
"nr": _("Ndebele"),
"nd": _("Ndebele"),
"ng": _("Ndonga"),
"ne": _("Nepali"),
"nl": _("Dutch"),
"nn": _("Norwegian Nynorsk"),
"nb": _("Norwegian Bokmål"),
"no": _("Norwegian"),
"ny": _("Chichewa; Nyanja"),
"oc": _("Occitan"),
"oj": _("Ojibwa"),
"or": _("Oriya"),
"om": _("Oromo"),
"os": _("Ossetian; Ossetic"),
"pi": _("Pali"),
"pl": _("Polish"),
"pt": _("Portuguese"),
"qu": _("Quechua"),
"rm": _("Raeto-Romance"),
"ro": _("Romanian"),
"rn": _("Rundi"),
"ru": _("Russian"),
"sg": _("Sango"),
"sa": _("Sanskrit"),
"si": _("Sinhalese"),
"sk": _("Slovak"),
"sl": _("Slovenian"),
"se": _("Northern Sami"),
"sm": _("Samoan"),
"sn": _("Shona"),
"sd": _("Sindhi"),
"so": _("Somali"),
"st": _("Sotho"),
"es": _("Spanish"),
"sq": _("Albanian"),
"sc": _("Sardinian"),
"sr": _("Serbian"),
"ss": _("Swati"),
"su": _("Sundanese"),
"sw": _("Swahili"),
"sv": _("Swedish"),
"ty": _("Tahitian"),
"ta": _("Tamil"),
"tt": _("Tatar"),
"te": _("Telugu"),
"tg": _("Tajik"),
"tl": _("Tagalog"),
"th": _("Thai"),
"ti": _("Tigrinya"),
"to": _("Tonga"),
"tn": _("Tswana"),
"ts": _("Tsonga"),
"tk": _("Turkmen"),
"tr": _("Turkish"),
"tw": _("Twi"),
"ug": _("Uighur"),
"uk": _("Ukrainian"),
"ur": _("Urdu"),
"uz": _("Uzbek"),
"ve": _("Venda"),
"vi": _("Vietnamese"),
"vo": _("Volapük"),
"wa": _("Walloon"),
"wo": _("Wolof"),
"xh": _("Xhosa"),
"yi": _("Yiddish"),
"za": _("Zhuang"),
"zu": _("Zulu"),
"ab": _("Abkhazian"),
"zh": _("Chinese"),
"ps": _("Pushto"),
"am": _("Amharic"),
"ar": _("Arabic"),
"bg": _("Bulgarian"),
"mk": _("Macedonian"),
"el": _("Greek"),
"fa": _("Persian"),
"he": _("Hebrew"),
"hi": _("Hindi"),
"hy": _("Armenian"),
"en": _("English"),
"ee": _("Ewe"),
"ka": _("Georgian"),
"pa": _("Punjabi"),
"bn": _("Bengali"),
"bs": _("Bosnian"),
"ch": _("Chamorro"),
"be": _("Belarusian"),
"yo": _("Yoruba"),
}
# Widely used languages, listed right after the site's preferred languages
# when the base language list is built. Every entry must be a key of
# ISO_639_1.
TOP_USED_LANGUAGES = [
    "en",
    "de",
    "es",
    "zh",
    "fr",
    "ja",
    "it",
    "ru",
    "pt",
    "nl",
    "ko",  # Korean; the former "kr" is the ISO 639-1 code for Kanuri
    "hi",
    "ar",
    "bn",
]
# (code, label) pair appended as the last entry of every choice list built in
# this module; its code "x" is also what detect_language() returns on failure.
_UNKNOWN_LANGUAGE = ("x", _("Unknown or Other"))
2024-07-14 13:36:52 -04:00
# Case-insensitive pattern whose alternatives each match one fragment of a
# localized season label: Chinese numerals, ASCII digits, whitespace, dots,
# and the word for "season" in several languages.
# NOTE(review): the pattern starts with an empty alternative (possibly a
# character lost in transit) and lists "Temporada " twice — confirm against
# the canonical source before changing it.
RE_LOCALIZED_SEASON_NUMBERS = re.compile(
    r"|一|二|三|四|五|六|七|八|九|零|十|\d|\s|\.|Season |Temporada |ª Temporada|Staffel |Saison |Stagione |Sæson |Temporada |Serie |S|#|第|季|シーズン|Сезон |시즌 ",
    re.IGNORECASE,
)
2024-07-13 00:16:47 -04:00
2024-07-13 19:21:27 -04:00
2024-07-14 13:36:52 -04:00
def localize_number(i: int) -> str:
    """Render ``i`` using the numerals of the active UI language.

    When the active language is ``zh`` or any ``zh-*`` tag, integers from
    0 to 99 are written with Chinese numerals (0 -> 零, 7 -> 七, 10 -> 十,
    12 -> 十二, 20 -> 二十, 25 -> 二十五). Every other language, and any
    number outside 0..99, falls back to ``str(i)``.

    Args:
        i: The number to render.

    Returns:
        The localized numeral string.
    """
    # get_language() returns None while translations are deactivated;
    # treat that like any non-Chinese language instead of crashing.
    lang = (get_language() or "").lower()
    if lang != "zh" and not lang.startswith("zh-"):
        return str(i)
    # TODO this works but can be much better
    if i < 0 or i > 99:
        return str(i)
    digits = "零一二三四五六七八九"
    tens, ones = divmod(i, 10)
    if tens == 0:
        return digits[ones]
    # 10-19 are written without a leading 一 (十, 十一, ...); round tens
    # carry no trailing 零 (二十, not 二十零).
    leading = "" if tens == 1 else digits[tens]
    trailing = digits[ones] if ones else ""
    return f"{leading}十{trailing}"
def _get_base_language_list() -> dict[str, str]:
    """Build the ordered code -> name mapping used by all choice lists.

    Site-preferred languages come first, then the top-used ones, then every
    remaining ISO_639_1 entry in its declaration order. Codes missing from
    ISO_639_1 are logged as errors and left out.
    """
    ordered: dict[str, str] = {}
    for code in SITE_PREFERRED_LANGUAGES + TOP_USED_LANGUAGES:
        if code in ordered:
            continue
        if code not in ISO_639_1:
            logger.error(f"{code} is not a supported ISO-639-1 language tag")
            continue
        ordered[code] = ISO_639_1[code]
    # Append whatever was not already placed, keeping ISO_639_1's own order.
    for code, name in ISO_639_1.items():
        ordered.setdefault(code, name)
    return ordered
# Ordered code -> name mapping, computed once at import time.
_BASE_LANGUAGE_LIST: dict[str, Any] = _get_base_language_list()
# Languages whose bare entry in the locale choice list is replaced, in place,
# by these locale tags (for "pt" the replacement is the bare tag itself).
_LOCALE_SUBTAGS_PRIO = {
"zh": {
"zh-cn": _("Simplified Chinese (Mainland)"),
"zh-tw": _("Traditional Chinese (Taiwan)"),
"zh-hk": _("Traditional Chinese (Hongkong)"),
},
"pt": {
"pt": _("Portuguese"),
},
}
# Extra locale tags appended after the base languages in the locale choice
# list, and after the prioritized tags in the preferred-locale list.
_LOCALE_SUBTAGS_ADD = {
"pt": {
"pt-br": _("Portuguese (Brazil)"),
},
"zh": {
"zh-sg": _("Simplified Chinese (Singapore)"),
"zh-my": _("Simplified Chinese (Malaysia)"),
"zh-mo": _("Traditional Chinese (Macau)"),
},
}
# NOTE(review): not referenced anywhere in the visible part of this file.
_LOCALE_SUBTAGS_FALLBACK = ["zh"]
# Languages whose bare entry in the language choice list is replaced, in
# place, by these more specific language codes.
_LANGUAGE_SUBTAGS_PRIO = {
"zh": {
"cmn": _("Mandarin Chinese"),
"yue": _("Yue Chinese"),
}
}
# Extra language codes appended after the base languages in the language
# choice list.
_LANGUAGE_SUBTAGS_ADD = {
"nan": _("Min Nan Chinese"),
"wuu": _("Wu Chinese"),
"hak": _("Hakka Chinese"),
}
def get_preferred_locales() -> list[str]:
    """Expand the site's preferred languages into preferred locale tags.

    A language listed in ``_LOCALE_SUBTAGS_PRIO`` is replaced by its
    prioritized locale tags followed by its additional tags from
    ``_LOCALE_SUBTAGS_ADD`` (if any); any other language is kept as is.

    Returns:
        Locale tags in site-preference order.
    """
    locales: list[str] = []
    for lang in SITE_PREFERRED_LANGUAGES:
        if lang in _LOCALE_SUBTAGS_PRIO:
            locales.extend(_LOCALE_SUBTAGS_PRIO[lang])
            # .get(): a language may have prioritized tags without having
            # additional ones; indexing directly would raise KeyError.
            locales.extend(_LOCALE_SUBTAGS_ADD.get(lang, {}))
        else:
            locales.append(lang)
    return locales


# Locale tags preferred site-wide, computed once at import time.
SITE_PREFERRED_LOCALES = get_preferred_locales()
2024-07-13 00:16:47 -04:00
def _get_locale_choices() -> list[tuple[str, str]]:
    """Assemble the (tag, label) pairs offered as locale choices.

    Order: base languages (those with prioritized locale tags are replaced
    by these tags), then all additional locale tags, then the bare code of
    each prioritized language if not already present, then the unknown
    entry.
    """
    result = []
    for code, name in _BASE_LANGUAGE_LIST.items():
        prioritized = _LOCALE_SUBTAGS_PRIO.get(code)
        if prioritized is None:
            result.append((code, name))
        else:
            result.extend(prioritized.items())
    for extra in _LOCALE_SUBTAGS_ADD.values():
        result.extend(extra.items())
    # Keep the bare language selectable even when it was replaced above.
    for code in _LOCALE_SUBTAGS_PRIO:
        bare = (code, ISO_639_1[code])
        if bare not in result:
            result.append(bare)
    result.append(_UNKNOWN_LANGUAGE)
    return result
def _get_script_choices() -> list[tuple[str, str]]:
    """Return every base language as a (code, label) pair, plus the unknown entry."""
    return [*_BASE_LANGUAGE_LIST.items(), _UNKNOWN_LANGUAGE]
2024-07-13 00:16:47 -04:00
def _get_language_choices() -> list[tuple[str, str]]:
    """Assemble the (code, label) pairs offered as language choices.

    Order: base languages (those with prioritized language subtags are
    replaced by these subtags), then the additional language codes, then
    the unknown entry.
    """
    result = []
    for code, name in _BASE_LANGUAGE_LIST.items():
        prioritized = _LANGUAGE_SUBTAGS_PRIO.get(code)
        if prioritized is None:
            result.append((code, name))
        else:
            result.extend(prioritized.items())
    result.extend(_LANGUAGE_SUBTAGS_ADD.items())
    result.append(_UNKNOWN_LANGUAGE)
    return result
# Public choice lists of (code, label) pairs, each computed once at import
# time from the site-wide preferences; all end with the unknown/other entry.
LOCALE_CHOICES: list[tuple[str, str]] = _get_locale_choices()
SCRIPT_CHOICES: list[tuple[str, str]] = _get_script_choices()
LANGUAGE_CHOICES: list[tuple[str, str]] = _get_language_choices()
2024-07-13 00:16:47 -04:00
def get_current_locales() -> list[str]:
    """Return locale tags ordered by relevance to the active UI language.

    ``zh-hans`` and ``zh-hant`` expand to fixed lists of Chinese locales
    (simplified first or traditional first, respectively) followed by
    ``en``. Any other language contributes its primary subtag followed by
    ``en``. The site-preferred locales and the fallback language are then
    appended if not already present.

    Returns:
        A list of locale tags without duplicates, most relevant first.
    """
    # get_language() returns None while translations are deactivated;
    # behave as if the fallback language were active instead of crashing.
    lang = (get_language() or FALLBACK_LANGUAGE).lower()
    if lang == "zh-hans":
        locales = ["zh-cn", "zh-sg", "zh-my", "zh-hk", "zh-tw", "zh-mo", "en"]
    elif lang == "zh-hant":
        locales = ["zh-tw", "zh-hk", "zh-mo", "zh-cn", "zh-sg", "zh-my", "en"]
    else:
        primary = lang.split("-")[0]
        locales = ["en"] if primary == "en" else [primary, "en"]
    for locale in SITE_PREFERRED_LOCALES:
        if locale not in locales:
            locales.append(locale)
    if FALLBACK_LANGUAGE not in locales:
        locales.append(FALLBACK_LANGUAGE)
    return locales
2024-07-13 00:16:47 -04:00
2024-07-14 01:34:17 -04:00
# Whole string is ASCII letters and digits (the class also admits "-").
_eng = re.compile(r"^[A-Z-a-z0-9]+$")

# Whole string is CJK ideographs, fullwidth forms, digits, whitespace and a
# few punctuation marks.
_chn = re.compile(
    r"^[\d\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF\uff01-\uff5e\s。、·— 0-9\-\(\)]+$"
)

# Whole string is within the ASCII range.
_latin = re.compile(r"^[\u0000-\u007F]+$")

# Whole string is ASCII mixed with the CJK set accepted by _chn.
_chn_latin = re.compile(
    r"^[\u0000-\u007F\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF\uFF01-\uFF5E\s。、·— 0-9\-\(\)]+$"
)

# https://github.com/BYVoid/OpenCC/blob/master/data/dictionary/TSCharacters.txt
# print("".join([l.split("\t", 1)[0] for l in open("TSCharacters.txt", "r")]))
_tc_char = "㑮㑯㑳㑶㒓㓄㓨㔋㖮㗲㗿㘉㘓㘔㘚㛝㜄㜏㜐㜗㜢㜷㞞㟺㠏㠣㢗㢝㥮㦎㦛㦞㨻㩋㩜㩳㩵㪎㯤㰙㵗㵾㶆㷍㷿㸇㹽㺏㺜㻶㿖㿗㿧䀉䀹䁪䁻䂎䃮䅐䅳䆉䉑䉙䉬䉲䉶䊭䊷䊺䋃䋔䋙䋚䋦䋹䋻䋼䋿䌈䌋䌖䌝䌟䌥䌰䍤䍦䍽䎙䎱䓣䕤䕳䖅䗅䗿䙔䙡䙱䚩䛄䛳䜀䜖䝭䝻䝼䞈䞋䞓䟃䟆䟐䠆䠱䡐䡩䡵䢨䤤䥄䥇䥑䥕䥗䥩䥯䥱䦘䦛䦟䦯䦳䧢䪊䪏䪗䪘䪴䪾䫀䫂䫟䫴䫶䫻䫾䬓䬘䬝䬞䬧䭀䭃䭑䭔䭿䮄䮝䮞䮠䮫䮰䮳䮾䯀䯤䰾䱀䱁䱙䱧䱬䱰䱷䱸䱽䲁䲅䲖䲘䲰䳜䳢䳤䳧䳫䴉䴋䴬䴱䴴䴽䵳䵴䶕䶲丟並乾亂亙亞佇佈佔併來侖侶侷俁係俓俔俠俥俬倀倆倈倉個們倖倫倲偉偑側偵偽傌傑傖傘備傢傭傯傳傴債傷傾僂僅僉僑僕僞僤僥僨僱價儀儁儂億儈儉儎儐儔儕儘償儣優儭儲儷儸儺儻儼兇兌兒兗內兩冊冑冪凈凍凙凜凱別刪剄則剋剎剗剛剝剮剴創剷剾劃劇劉劊劌劍劏劑劚勁勑動務勛勝勞勢勣勩勱勳勵勸勻匭匯匱區協卹卻卽厙厠厤厭厲厴參叄叢吒吳吶呂咼員哯唄唓唸問啓啞啟啢喎喚喪喫喬單喲嗆嗇嗊嗎嗚嗩嗰嗶嗹嘆嘍嘓嘔嘖嘗嘜嘩嘪嘮嘯嘰嘳嘵嘸嘺嘽噁噅噓噚噝噞噠噥噦噯噲噴噸噹嚀嚇嚌嚐嚕嚙嚛嚥嚦嚧嚨嚮嚲嚳嚴嚶嚽囀囁囂囃囅囈囉囌囑囒囪圇國圍園圓圖團圞垻埡埨埬埰執堅堊堖堚堝堯報場塊塋塏塒塗塚塢塤塵塸塹塿墊墜墠墮墰墲墳墶墻墾壇壈壋壎壓壗壘壙壚壜壞壟壠壢壣壩壪壯壺壼壽夠夢夥夾奐奧奩奪奬奮奼妝姍姦娙娛婁婡婦婭媈媧媯媰媼媽嫋嫗嫵嫺嫻嫿嬀嬃嬇嬈嬋嬌嬙嬡嬣嬤嬦嬪嬰嬸嬻孃孄孆孇孋孌孎孫學孻孾孿宮寀寠寢實寧審寫寬寵寶將專尋對導尷屆屍屓屜屢層屨屩屬岡峯峴島峽崍崑崗崙崢崬嵐嵗嵼嵽嵾嶁嶄嶇嶈嶔嶗嶘嶠嶢嶧嶨嶮嶸嶹嶺嶼嶽巊巋巒巔巖巗巘巰巹帥師帳帶幀幃幓幗幘幝幟幣幩幫幬幹幾庫廁廂廄廈廎廕廚廝廞廟廠廡廢廣廧廩廬廳弒弔弳張強彃彄彆彈彌彎彔彙彠彥彫彲彷彿後徑從徠復徵徹徿恆恥悅悞悵悶悽惡惱惲惻愛愜愨愴愷愻愾慄態慍慘慚慟慣慤慪慫慮慳慶慺慼慾憂憊憐憑憒憖憚憢憤憫憮憲憶憸憹懀懇應懌懍懎懞懟懣懤懨懲懶懷懸懺懼懾戀戇戔戧戩戰戱戲戶拋挩挱挾捨捫捱捲掃掄掆掗掙掚掛採揀揚換揮揯損搖搗搵搶摋摐摑摜摟摯摳摶摺摻撈撊撏撐撓撝撟撣撥撧撫撲撳撻撾撿擁擄擇擊擋擓擔據擟擠擣擫擬擯擰擱擲擴擷擺擻擼擽擾攄攆攋攏攔攖攙攛攜攝攢攣攤攪攬敎敓敗敘敵數斂斃斅斆斕斬斷斸於旂旣昇時晉晛晝暈暉暐暘暢暫曄曆曇曉曊曏曖曠曥曨曬書會朥朧朮東枴柵柺査桱桿梔梖梘梜條梟梲棄棊棖棗棟棡棧棲棶椏椲楇楊楓楨業極榘榦榪榮榲榿構槍槓槤槧槨槫槮槳槶槼樁樂樅樑樓標樞樠樢樣樤樧樫樳樸樹樺樿橈橋機橢橫橯檁檉檔檜檟檢檣檭檮檯檳檵檸檻櫃櫅櫍櫓櫚櫛櫝櫞櫟櫠櫥櫧櫨櫪櫫櫬櫱櫳櫸櫻欄欅欇權欍欏欐欑欒欓欖欘欞欽歎歐歟歡歲歷歸歿殘殞殢殤殨殫殭殮殯殰殲殺殻殼毀毆毊毿氂氈氌氣氫氬氭氳氾汎汙決沒沖況泝洩洶浹浿涇涗涼淒淚淥淨淩淪淵淶淺渙減渢渦測渾湊湋湞湧湯溈準溝溡溫溮溳溼滄滅滌滎滙滬滯滲滷滸滻滾滿漁漊漍漚漢漣漬漲漵漸漿潁潑潔潕潙潚潛潣潤潯潰潷潿澀澅澆澇澐澗澠澤澦澩澫澬澮澱澾濁濃濄濆濕濘濚濛濜濟濤濧濫濰濱濺濼濾濿瀂瀃瀅瀆瀇瀉瀋瀏瀕瀘瀝瀟瀠瀦瀧瀨瀰瀲瀾灃灄灍灑灒灕灘灙灝灡灣灤灧灩災為烏烴無煇煉煒煙煢煥煩煬煱熂熅熉熌熒熓熗熚熡熰熱熲熾燀燁燈燉燒燖燙燜營燦燬燭燴燶燻燼燾爃爄爇爍爐爖爛爥爧爭爲爺爾牀牆牘牴牽犖犛犞犢犧狀狹狽猌猙猶猻獁獃獄獅獊獎獨獩獪獫獮獰獱獲獵獷獸獺獻獼玀玁珼現琱琺琿瑋瑒瑣瑤瑩瑪瑲瑻瑽璉璊璕璗璝璡璣璦璫璯環璵璸璼璽璾璿瓄瓅瓊瓏瓔瓕瓚瓛甌甕產産甦甯畝畢畫異畵當畼疇疊痙痠痮痾瘂瘋瘍
# Matches when the text contains at least one character from _tc_char.
# NOTE(review): the surrounding ".*" does not cross newlines, so with
# .match() only the first line of a multi-line string is inspected.
_chn_t = re.compile(r".*[" + _tc_char + "].*")
2024-07-13 00:16:47 -04:00
def detect_language(s: str) -> str:
    """Guess the language tag of ``s``.

    Args:
        s: The text to classify.

    Returns:
        ``"en"`` for a single ASCII alphanumeric token; ``"zh-tw"`` or
        ``"zh-cn"`` for text made of Chinese characters (optionally mixed
        with ASCII), depending on whether any character from the
        traditional-only set occurs; otherwise whatever ``langdetect``
        reports, lower-cased; ``"x"`` when detection fails.
    """
    if _eng.match(s):
        # doing this for now since langdetect is bad at single word
        return "en"
    if _chn_latin.match(s) and not _latin.match(s):
        # Check line by line: the ".*" in _chn_t cannot cross a newline, so
        # a single match() on the whole text would only look at the first
        # line and misreport multi-line traditional text as "zh-cn".
        if any(_chn_t.match(line) for line in s.splitlines()):
            return "zh-tw"
        return "zh-cn"
    try:
        return detect(s).lower()
    except Exception:
        # Deliberately best-effort: any detector failure (presumably empty
        # or feature-less input) maps to the unknown-language code.
        return "x"
def migrate_languages(languages: list[str]) -> list[str]:
    """Placeholder: ignores ``languages`` and returns a new empty list."""
    migrated: list[str] = []
    return migrated