# Inclusive (first, last) code point bounds of the Unicode blocks that are
# treated as "emoji-like". The bounds are the real block limits, so the last
# code point of each block (e.g. U+FE0F, the emoji variation selector) is
# part of the map.
_EMOJI_RANGES = (
    (0x1F600, 0x1F64F),  # Emoticons
    (0x1F300, 0x1F5FF),  # Misc Symbols and Pictographs
    (0x1F680, 0x1F6FF),  # Transport and Map
    (0x2600, 0x26FF),    # Misc symbols
    (0x2700, 0x27BF),    # Dingbats
    (0xFE00, 0xFE0F),    # Variation Selectors
    (0x1F900, 0x1F9FF),  # Supplemental Symbols and Pictographs
    (0x1F1E6, 0x1F1FF),  # Flags
    (0x20D0, 0x20FF),    # Combining Diacritical Marks for Symbols
)

# Rough map of emoji characters, keyed by code point. The space character is
# included so that emojis separated by spaces still count as "all emojis".
emojis = {ord(' '): True}
for _first, _last in _EMOJI_RANGES:
    # range() excludes its stop value, hence the +1 to keep the block's
    # final code point.
    emojis.update(dict.fromkeys(range(_first, _last + 1), True))


def detect_translatable(src_texts):
    """Tell whether the input contains anything other than emojis/spaces.

    Args:
        src_texts: A single string, or a list/tuple of strings (nested
            sequences are handled recursively).

    Returns:
        True if at least one character of at least one string is not in the
        ``emojis`` map; False when every character is an emoji or a space,
        and also for an empty string or an empty sequence.
    """
    if isinstance(src_texts, (list, tuple)):
        # A batch is translatable as soon as one of its items is.
        return any(detect_translatable(t) for t in src_texts)

    # any() stops at the first character that is not in the map.
    return any(ord(ch) not in emojis for ch in src_texts)
detect_languages(src_texts) - detected_src_lang = candidate_langs[0] + + translatable = detect_translatable(src_texts) + if translatable: + if source_lang == "auto": + candidate_langs = detect_languages(src_texts) + detected_src_lang = candidate_langs[0] + else: + detected_src_lang = {"confidence": 100.0, "language": source_lang} else: - detected_src_lang = {"confidence": 100.0, "language": source_lang} - + detected_src_lang = {"confidence": 0.0, "language": "en"} + src_lang = next(iter([l for l in languages if l.code == detected_src_lang["language"]]), None) if src_lang is None: @@ -679,14 +709,18 @@ def create_app(args): if translator is None: abort(400, description=_("%(tname)s (%(tcode)s) is not available as a target language from %(sname)s (%(scode)s)", tname=_lazy(tgt_lang.name), tcode=tgt_lang.code, sname=_lazy(src_lang.name), scode=src_lang.code)) - if text_format == "html": - translated_text = unescape(str(translate_html(translator, text))) - alternatives = [] # Not supported for html yet + if translatable: + if text_format == "html": + translated_text = unescape(str(translate_html(translator, text))) + alternatives = [] # Not supported for html yet + else: + hypotheses = translator.hypotheses(text, num_alternatives + 1) + translated_text = unescape(improve_translation_formatting(text, hypotheses[0].value)) + alternatives = filter_unique([unescape(improve_translation_formatting(text, hypotheses[i].value)) for i in range(1, len(hypotheses))], translated_text) else: - hypotheses = translator.hypotheses(text, num_alternatives + 1) - translated_text = unescape(improve_translation_formatting(text, hypotheses[0].value)) - alternatives = filter_unique([unescape(improve_translation_formatting(text, hypotheses[i].value)) for i in range(1, len(hypotheses))], translated_text) - + translated_text = text # Cannot translate, send the original text back + alternatives = [] + batch_results.append(translated_text) batch_alternatives.append(alternatives) @@ -703,14 +737,18 
@@ def create_app(args): if translator is None: abort(400, description=_("%(tname)s (%(tcode)s) is not available as a target language from %(sname)s (%(scode)s)", tname=_lazy(tgt_lang.name), tcode=tgt_lang.code, sname=_lazy(src_lang.name), scode=src_lang.code)) - if text_format == "html": - translated_text = unescape(str(translate_html(translator, q))) - alternatives = [] # Not supported for html yet + if translatable: + if text_format == "html": + translated_text = unescape(str(translate_html(translator, q))) + alternatives = [] # Not supported for html yet + else: + hypotheses = translator.hypotheses(q, num_alternatives + 1) + translated_text = unescape(improve_translation_formatting(q, hypotheses[0].value)) + alternatives = filter_unique([unescape(improve_translation_formatting(q, hypotheses[i].value)) for i in range(1, len(hypotheses))], translated_text) else: - hypotheses = translator.hypotheses(q, num_alternatives + 1) - translated_text = unescape(improve_translation_formatting(q, hypotheses[0].value)) - alternatives = filter_unique([unescape(improve_translation_formatting(q, hypotheses[i].value)) for i in range(1, len(hypotheses))], translated_text) - + translated_text = q # Cannot translate, send the original text back + alternatives = [] + result = {"translatedText": translated_text} if source_lang == "auto": diff --git a/libretranslate/detect.py b/libretranslate/detect.py index b13d52c..25329d5 100644 --- a/libretranslate/detect.py +++ b/libretranslate/detect.py @@ -3,7 +3,8 @@ from langdetect import DetectorFactory DetectorFactory.seed = 0 -from langdetect import detect_langs +from langdetect import detect_langs, LangDetectException +from langdetect.lang_detect_exception import ErrorCode from lexilang.detector import detect as lldetect @@ -35,11 +36,16 @@ class Detector: if conf > 0: return [Language(code, round(conf * 100))] - top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3] - if not len(top_3_choices): - 
return [Language("en", 0)] - if top_3_choices[0].prob == 0: - return [Language("en", 0)] + try: + top_3_choices = [lang for lang in detect_langs(text) if check_lang(self.langcodes, lang)][:3] + if not len(top_3_choices): + return [Language("en", 0)] + if top_3_choices[0].prob == 0: + return [Language("en", 0)] + except LangDetectException as e: + if e.code == ErrorCode.CantDetectError: + return [Language("en", 0)] + else: + raise e return [Language(normalized_lang_code(lang), round(lang.prob * 100)) for lang in top_3_choices] -