mirror of
https://codeberg.org/Freeyourgadget/Gadgetbridge.git
synced 2025-01-25 08:05:55 +01:00
Add Armenian transliteration
This commit is contained in:
parent
9ae02f65ed
commit
f0a9ab7f98
@ -276,6 +276,9 @@ dependencies {
|
||||
|
||||
// Fix Duplicate class build error
|
||||
implementation(platform("org.jetbrains.kotlin:kotlin-bom:1.8.0"))
|
||||
|
||||
// Needed for Armenian transliteration
|
||||
implementation group: 'org.ahocorasick', name: 'ahocorasick', version: '0.6.3'
|
||||
}
|
||||
|
||||
preBuild.dependsOn(":GBDaoGenerator:genSources")
|
||||
|
@ -35,6 +35,7 @@ import nodomain.freeyourgadget.gadgetbridge.devices.DeviceCoordinator;
|
||||
import nodomain.freeyourgadget.gadgetbridge.impl.GBDevice;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.Prefs;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ArabicTransliterator;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ArmenianTransliterator;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.BengaliTransliterator;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.CommonSymbolsTransliterator;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.CroatianTransliterator;
|
||||
@ -86,6 +87,7 @@ public class LanguageUtils {
|
||||
put("scandinavian", new ScandinavianTransliterator());
|
||||
put("turkish", new TurkishTransliterator());
|
||||
put("ukranian", new UkranianTransliterator());
|
||||
put("armenian", new ArmenianTransliterator());
|
||||
}};
|
||||
|
||||
/**
|
||||
|
@ -0,0 +1,245 @@
|
||||
/* Copyright (C) 2021-2024 Alik Aslanyan
|
||||
|
||||
This file is part of Gadgetbridge.
|
||||
|
||||
Gadgetbridge is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published
|
||||
by the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
Gadgetbridge is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
||||
package nodomain.freeyourgadget.gadgetbridge.util.language.impl;
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.Transliterator;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class ArmenianTransliterator implements Transliterator {
|
||||
// Transliteration map ordered by priority
|
||||
// Armenian has some rules regarding reading of 'ո' in the middle of the word it reads as english O
|
||||
// But if word starts with it's read as sound of 'vo'
|
||||
// Or if it has 'ւ' symbol after it, then we should read it as 'u' (as double o in booze)
|
||||
private static final Map<String, String> transliterateMap = new LinkedHashMap<String, String>() {
|
||||
{
|
||||
// Letter + 'ու'
|
||||
put("աու","au");
|
||||
put("բու","bu");
|
||||
put("գու","gu");
|
||||
put("դու","du");
|
||||
put("եու","eu");
|
||||
put("զու","zu");
|
||||
put("էու","eu");
|
||||
put("ըու","yu");
|
||||
put("թու","tu");
|
||||
put("ժու","ju");
|
||||
put("իու","iu");
|
||||
put("լու","lu");
|
||||
put("խու","xu");
|
||||
put("ծու","cu");
|
||||
put("կու","ku");
|
||||
put("հու","hu");
|
||||
put("ձու","dzu");
|
||||
put("ղու","xu");
|
||||
put("ճու","cu");
|
||||
put("մու","mu");
|
||||
put("յու","yu");
|
||||
put("նու","nu");
|
||||
put("շու","shu");
|
||||
put("չու","chu");
|
||||
put("պու","pu");
|
||||
put("ջու","ju");
|
||||
put("ռու","ru");
|
||||
put("սու","su");
|
||||
put("վու","vu");
|
||||
put("տու","tu");
|
||||
put("րու","ru");
|
||||
put("ցու","cu");
|
||||
put("փու","pu");
|
||||
put("քու","qu");
|
||||
put("օու","ou");
|
||||
put("ևու","eu");
|
||||
put("ֆու","fu");
|
||||
put("ոու","vou");
|
||||
|
||||
put("ու","u");
|
||||
|
||||
// Letter + 'ո'
|
||||
put("բո","bo");
|
||||
put("գո","go");
|
||||
put("դո","do");
|
||||
put("զո","zo");
|
||||
put("թո","to");
|
||||
put("ժո","jo");
|
||||
put("լո","lo");
|
||||
put("խո","xo");
|
||||
put("ծո","co");
|
||||
put("կո","ko");
|
||||
put("հո","ho");
|
||||
put("ձո","dzo");
|
||||
put("ղո","xo");
|
||||
put("ճո","co");
|
||||
put("մո","mo");
|
||||
put("յո","yo");
|
||||
put("նո","no");
|
||||
put("շո","so");
|
||||
put("չո","co");
|
||||
put("պո","po");
|
||||
put("ջո","jo");
|
||||
put("ռո","ro");
|
||||
put("սո","so");
|
||||
put("վո","vo");
|
||||
put("տո","to");
|
||||
put("րո","ro");
|
||||
put("ցո","co");
|
||||
put("փո","po");
|
||||
put("քո","qo");
|
||||
put("ևո","eo");
|
||||
put("ֆո","fo");
|
||||
put("ո","vo");
|
||||
|
||||
// Two different ways to write, we support all.
|
||||
put("եւ","ev");
|
||||
put("եվ","ev");
|
||||
|
||||
// Simple substitutions
|
||||
put("ա","a");
|
||||
put("բ","b");
|
||||
put("գ","g");
|
||||
put("դ","d");
|
||||
put("ե","e");
|
||||
put("զ","z");
|
||||
put("է","e");
|
||||
put("ը","y");
|
||||
put("թ","t");
|
||||
put("ժ","j");
|
||||
put("ի","i");
|
||||
put("լ","l");
|
||||
put("խ","x");
|
||||
put("ծ","c");
|
||||
put("կ","k");
|
||||
put("հ","h");
|
||||
put("ձ","dz");
|
||||
put("ղ","x");
|
||||
put("ճ","c");
|
||||
put("մ","m");
|
||||
put("յ","y");
|
||||
put("ն","n");
|
||||
put("շ","sh");
|
||||
put("չ","ch");
|
||||
put("պ","p");
|
||||
put("ջ","j");
|
||||
put("ռ","r");
|
||||
put("ս","s");
|
||||
put("վ","v");
|
||||
put("տ","t");
|
||||
put("ր","r");
|
||||
put("ց","c");
|
||||
put("փ","p");
|
||||
put("ք","q");
|
||||
put("օ","o");
|
||||
put("և","ev");
|
||||
put("ֆ","f");
|
||||
|
||||
// If this symbol wasn't used in the combination with others, then it's meaningless
|
||||
put("ւ","");
|
||||
|
||||
// Add support for capitilazed words
|
||||
for (final Map.Entry<String,String> entry : ((Map<String, String>)this.clone()).entrySet()) {
|
||||
final String capitalKey = WordUtils.capitalize(entry.getKey());
|
||||
if(!capitalKey.equals(entry.getKey())) {
|
||||
put(capitalKey, WordUtils.capitalize(entry.getValue()));
|
||||
}
|
||||
}
|
||||
|
||||
}};
|
||||
|
||||
private static final Map<String, Integer> transliterationPriorityMap = new HashMap<String, Integer>() {{
|
||||
int priority = 0;
|
||||
for( final String key : transliterateMap.keySet() ) {
|
||||
put(key, priority++);
|
||||
}
|
||||
}};
|
||||
|
||||
// Aho-Corasick trie
|
||||
private static final Trie transliterationTrie;
|
||||
static {
|
||||
final Trie.TrieBuilder builder = Trie.builder();
|
||||
for( final String key : ArmenianTransliterator.transliterateMap.keySet()) {
|
||||
builder.addKeyword(key);
|
||||
}
|
||||
transliterationTrie = builder.build();
|
||||
}
|
||||
|
||||
private static String ahoCorasick(final String text) {
|
||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||
final StringBuilder sb = new StringBuilder( text.length() * 10 / 12 );
|
||||
|
||||
// The complexity of the Aho-Corasick algorithm O(N + L + Z)
|
||||
// Where N is the length of the text, L is the length of keywords and the Z is a number of matches.
|
||||
// This algorithm allows us to do fast substring search
|
||||
final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText( text ));
|
||||
|
||||
// Sort collection first by starting position, then by priority.
|
||||
Collections.sort(emits, new Comparator<Emit>() {
|
||||
@Override
|
||||
public int compare(Emit a, Emit b) {
|
||||
int cmp = Integer.compare(a.getStart(), b.getStart());
|
||||
if (cmp != 0) {
|
||||
return cmp;
|
||||
}
|
||||
|
||||
int priorityA = transliterationPriorityMap.get(a.getKeyword());
|
||||
int priorityB = transliterationPriorityMap.get(b.getKeyword());
|
||||
return Integer.compare(priorityA, priorityB);
|
||||
}
|
||||
});
|
||||
|
||||
int prevIndex = 0;
|
||||
|
||||
for( final Emit emit : emits ) {
|
||||
final int matchIndex = emit.getStart();
|
||||
|
||||
// Skip if we already substituted this part
|
||||
if(matchIndex < prevIndex) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add part which shouldn't be substituted
|
||||
sb.append(text.substring(prevIndex, matchIndex));
|
||||
|
||||
// Substitute and append to the builder
|
||||
sb.append( ArmenianTransliterator.transliterateMap.get( emit.getKeyword() ) );
|
||||
|
||||
prevIndex = emit.getEnd() + 1;
|
||||
}
|
||||
|
||||
// Add the remainder of the string (contains no more matches).
|
||||
sb.append( text.substring( prevIndex ) );
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String transliterate(String txt) {
|
||||
if (txt == null || txt.isEmpty()) {
|
||||
return txt;
|
||||
}
|
||||
|
||||
return ahoCorasick(txt);
|
||||
}
|
||||
}
|
@ -3495,6 +3495,7 @@
|
||||
<item>@string/turkish</item>
|
||||
<item>@string/ukranian</item>
|
||||
<item>@string/hungarian</item>
|
||||
<item>@string/armenian</item>
|
||||
</string-array>
|
||||
|
||||
<string-array name="pref_transliteration_languages_values">
|
||||
@ -3521,6 +3522,7 @@
|
||||
<item>turkish</item>
|
||||
<item>ukranian</item>
|
||||
<item>hungarian</item>
|
||||
<item>armenian</item>
|
||||
</string-array>
|
||||
|
||||
<string-array name="pref_transliteration_languages_default">
|
||||
|
@ -1061,6 +1061,7 @@
|
||||
<string name="persian">Persian</string>
|
||||
<string name="scandinavian">Scandinavian</string>
|
||||
<string name="ukranian">Ukranian</string>
|
||||
<string name="armenian">Armenian</string>
|
||||
<string name="italian">Italian</string>
|
||||
<string name="french">French</string>
|
||||
<string name="french_ca">French (Canada)</string>
|
||||
|
@ -0,0 +1,150 @@
|
||||
package nodomain.freeyourgadget.gadgetbridge.test;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ArmenianTransliterator;
|
||||
|
||||
public class ArmenianTransliteratorTest extends TestCase {
|
||||
@Test
|
||||
public void testSimpleCases() {
|
||||
Assert.assertEquals("aybuben", new ArmenianTransliterator().transliterate("այբուբեն"));
|
||||
Assert.assertEquals("vorotan", new ArmenianTransliterator().transliterate("որոտան"));
|
||||
Assert.assertEquals("voroshel", new ArmenianTransliterator().transliterate("որոշել"));
|
||||
Assert.assertEquals("uzox", new ArmenianTransliterator().transliterate("ուզող"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleWords() {
|
||||
Assert.assertEquals("vory karucum en Viqipedia kayqic ogtvoxnery azat xmbagrman dzevachapov",
|
||||
new ArmenianTransliterator().transliterate("որը կառուցում են Վիքիպեդիա կայքից օգտվողները ազատ խմբագրման ձևաչափով"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMixedStrings() {
|
||||
Assert.assertEquals("vor1voshel 12 uzox", new ArmenianTransliterator().transliterate("որ1ոշել 12 ուզող"));
|
||||
Assert.assertEquals("vory jet iridescent karucum en sheen Viqipedia kayqic ogtvoxnery and a distinctive azat xmbagrman dzevachapov",
|
||||
new ArmenianTransliterator().transliterate("որը jet iridescent կառուցում են sheen Վիքիպեդիա կայքից օգտվողները and a distinctive ազատ խմբագրման ձևաչափով"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTop100Words() {
|
||||
final Map<String,String> topWords = new LinkedHashMap<String,String>() {{
|
||||
put("ինչպես", "inchpes");
|
||||
put("ես", "es");
|
||||
put("նրա", "nra");
|
||||
put("որ", "vor");
|
||||
put("նա", "na");
|
||||
put("էր", "er");
|
||||
put("համար", "hamar");
|
||||
put("ին", "in");
|
||||
put("հետ", "het");
|
||||
put("նրանք", "nranq");
|
||||
put("լինել", "linel");
|
||||
put("մեկ", "mek");
|
||||
put("ունենալ", "unenal");
|
||||
put("այս", "ays");
|
||||
put("ից", "ic");
|
||||
put("ի", "i");
|
||||
put("տաք", "taq");
|
||||
put("բառ", "bar");
|
||||
put("բայց", "bayc");
|
||||
put("ինչ", "inch");
|
||||
put("մի", "mi");
|
||||
put("քանի", "qani");
|
||||
put("է", "e");
|
||||
put("այն", "ayn");
|
||||
put("դուք", "duq");
|
||||
put("կամ", "kam");
|
||||
put("եւ", "ev");
|
||||
put("մինչեւ", "minchev");
|
||||
put("իսկ", "isk");
|
||||
put("ա", "a");
|
||||
put("մենք", "menq");
|
||||
put("կարող", "karox");
|
||||
put("այլ", "ayl");
|
||||
put("են", "en");
|
||||
put("որը", "vory");
|
||||
put("անել", "anel");
|
||||
put("իրենց", "irenc");
|
||||
put("ժամանակ", "jamanak");
|
||||
put("եթե", "ete");
|
||||
put("կամք", "kamq");
|
||||
put("յուրաքանչյուր", "yuraqanchyur");
|
||||
put("ասել", "asel");
|
||||
put("շարք", "sharq");
|
||||
put("երեք", "ereq");
|
||||
put("ուզում", "uzum");
|
||||
put("օդի", "odi");
|
||||
put("լավ", "lav");
|
||||
put("նույնպես", "nuynpes");
|
||||
put("խաղալ", "xaxal");
|
||||
put("փոքր", "poqr");
|
||||
put("վերջ", "verj");
|
||||
put("կարդալ", "kardal");
|
||||
put("ձեռք", "dzerq");
|
||||
put("նավահանգիստ", "navahangist");
|
||||
put("տառ", "tar");
|
||||
put("առ", "ar");
|
||||
put("ավելացնել", "avelacnel");
|
||||
put("նույնիսկ", "nuynisk");
|
||||
put("այստեղ", "aystex");
|
||||
put("պետք", "petq");
|
||||
put("մեծ", "mec");
|
||||
put("բարձր", "bardzr");
|
||||
put("այդպիսի", "aydpisi");
|
||||
put("հետեւել", "hetevel");
|
||||
put("գործ", "gorc");
|
||||
put("ինչու", "inchu");
|
||||
put("խնդրել", "xndrel");
|
||||
put("տղամարդիկ", "txamardik");
|
||||
put("փոփոխություն", "popoxutyun");
|
||||
put("գնաց", "gnac");
|
||||
put("լույս", "luys");
|
||||
put("բարի", "bari");
|
||||
put("դուրս", "durs");
|
||||
put("անհրաժեշտ", "anhrajesht");
|
||||
put("տուն", "tun");
|
||||
put("նկար", "nkar");
|
||||
put("փորձել", "pordzel");
|
||||
put("մեզ", "mez");
|
||||
put("կրկին", "krkin");
|
||||
put("կենդանի", "kendani");
|
||||
put("կետ", "ket");
|
||||
put("մայր", "mayr");
|
||||
put("աշխարհ", "ashxarh");
|
||||
put("մոտ", "mot");
|
||||
put("կառուցել", "karucel");
|
||||
put("ինքնուրույն", "inqnuruyn");
|
||||
put("երկիր", "erkir");
|
||||
put("հայր", "hayr");
|
||||
put("ցանկացած", "cankacac");
|
||||
put("նոր", "nor");
|
||||
put("աշխատանք", "ashxatanq");
|
||||
put("մաս", "mas");
|
||||
put("վերցնել", "vercnel");
|
||||
put("ստանալ", "stanal");
|
||||
put("տեղ", "tex");
|
||||
put("ապրել", "aprel");
|
||||
put("որտեղ", "vortex");
|
||||
put("երբ", "erb");
|
||||
put("Վերադառնալ", "Veradarnal");
|
||||
put("միայն", "miayn");
|
||||
}};
|
||||
|
||||
for (final Map.Entry<String,String> entry : topWords.entrySet()) {
|
||||
Assert.assertEquals(entry.getValue(), new ArmenianTransliterator().transliterate(entry.getKey()));
|
||||
}
|
||||
|
||||
for (final Map.Entry<String,String> entry : topWords.entrySet()) {
|
||||
Assert.assertEquals(WordUtils.capitalize(entry.getValue()), WordUtils.capitalize(new ArmenianTransliterator().transliterate(entry.getKey())));
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user