mirror of
https://codeberg.org/Freeyourgadget/Gadgetbridge.git
synced 2025-01-25 16:15:55 +01:00
Merge pull request #1236 from uroybd/master
Better Bengali transliteration.
This commit is contained in:
commit
ece0f094eb
@ -17,59 +17,153 @@
|
||||
package nodomain.freeyourgadget.gadgetbridge.util;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.regex.*;
|
||||
|
||||
// What's the reason for extending LanguageUtils?
|
||||
// Just doing it because it was already done in the previous code.
|
||||
public class BengaliLanguageUtils extends LanguageUtils {
|
||||
|
||||
private final static char BENGALI_JOIN_CHAR = '্';
|
||||
|
||||
private final static HashMap<Character, String> numbers = new HashMap<Character, String>() {
|
||||
// Composite Letters.
|
||||
private final static HashMap<String, String> composites = new HashMap<String, String>() {
|
||||
{
|
||||
put('০',"0"); put('১',"1"); put('২',"2"); put('৩',"3"); put('৪',"4");
|
||||
put('৫',"5"); put('৬',"6"); put('৭',"7"); put('৮',"8");
|
||||
put('৯',"9");
|
||||
put("ক্ষ", "kkh");
|
||||
put("ঞ্চ", "NC");
|
||||
put("ঞ্ছ", "NCh");
|
||||
put("ঞ্জ", "Ng");
|
||||
put("জ্ঞ", "gg");
|
||||
put("ঞ্ঝ", "Ngh");
|
||||
put("্র", "r");
|
||||
put("্ল", "l");
|
||||
put("ষ্ম", "SSh");
|
||||
put("র্", "r");
|
||||
put("্য", "y");
|
||||
put("্ব", "w");
|
||||
}
|
||||
};
|
||||
// Vowels Only
|
||||
private final static HashMap<String, String> vowelsAndHasants = new HashMap<String, String>() {
|
||||
{
|
||||
put("আ", "aa");
|
||||
put("অ", "a");
|
||||
put("ই", "i");
|
||||
put("ঈ", "ii");
|
||||
put("উ", "u");
|
||||
put("ঊ", "uu");
|
||||
put("ঋ", "ri");
|
||||
put("এ", "e");
|
||||
put("ঐ", "oi");
|
||||
put("ও", "o");
|
||||
put("ঔ", "ou");
|
||||
put("া", "aa");
|
||||
put("ি", "i");
|
||||
put("ী", "ii");
|
||||
put("ু", "u");
|
||||
put("ূ", "uu");
|
||||
put("ৃ", "r");
|
||||
put("ে", "e");
|
||||
put("ো", "o");
|
||||
put("ৈ", "oi");
|
||||
put("ৗ", "ou");
|
||||
put("ৌ", "ou");
|
||||
put("ং", "ng");
|
||||
put("ঃ", "h");
|
||||
put("।", ".");
|
||||
}
|
||||
};
|
||||
|
||||
private final static HashMap<Character, String> vowels = new HashMap<Character, String>() {
|
||||
// Single Character Letters.
|
||||
private final static HashMap<String, String> letters = new HashMap<String, String>() {
|
||||
{
|
||||
put('অ', "o"); put('আ', "a"); put('ই', "i"); put('ঈ', "ee");
|
||||
put('উ', "u"); put('ঊ', "oo"); put('ঋ', "ri"); put('এ', "e");
|
||||
put('ঐ', "oi"); put('ও', "o"); put('ঔ', "ou"); put('া', "a");
|
||||
put('ি', "i"); put('ী', "ee"); put('ু', "u"); put('ূ', "oo");
|
||||
put('ৃ', "ri"); put('ে', "e"); put('ৈ', "oi"); put('ো', "o");
|
||||
put('ৌ', "ou");
|
||||
put("আ", "aa");
|
||||
put("অ", "a");
|
||||
put("ই", "i");
|
||||
put("ঈ", "ii");
|
||||
put("উ", "u");
|
||||
put("ঊ", "uu");
|
||||
put("ঋ", "ri");
|
||||
put("এ", "e");
|
||||
put("ঐ", "oi");
|
||||
put("ও", "o");
|
||||
put("ঔ", "ou");
|
||||
put("ক", "k");
|
||||
put("খ", "kh");
|
||||
put("গ", "g");
|
||||
put("ঘ", "gh");
|
||||
put("ঙ", "ng");
|
||||
put("চ", "ch");
|
||||
put("ছ", "chh");
|
||||
put("জ", "j");
|
||||
put("ঝ", "jh");
|
||||
put("ঞ", "Ng");
|
||||
put("ট", "T");
|
||||
put("ঠ", "Th");
|
||||
put("ড", "D");
|
||||
put("ঢ", "Dh");
|
||||
put("ণ", "N");
|
||||
put("ত", "t");
|
||||
put("থ", "th");
|
||||
put("দ", "d");
|
||||
put("ধ", "dh");
|
||||
put("ন", "n");
|
||||
put("প", "p");
|
||||
put("ফ", "ph");
|
||||
put("ব", "b");
|
||||
put("ভ", "bh");
|
||||
put("ম", "m");
|
||||
put("য", "J");
|
||||
put("র", "r");
|
||||
put("ল", "l");
|
||||
put("শ", "sh");
|
||||
put("ষ", "Sh");
|
||||
put("স", "s");
|
||||
put("হ", "h");
|
||||
put("ড়", "rh");
|
||||
put("ঢ়", "rH");
|
||||
put("য়", "y");
|
||||
put("ৎ", "t");
|
||||
put("০", "0");
|
||||
put("১", "1");
|
||||
put("২", "2");
|
||||
put("৩", "3");
|
||||
put("৪", "4");
|
||||
put("৫", "5");
|
||||
put("৬", "6");
|
||||
put("৭", "7");
|
||||
put("৮", "8");
|
||||
put("৯", "9");
|
||||
put("া", "aa");
|
||||
put("ি", "i");
|
||||
put("ী", "ii");
|
||||
put("ু", "u");
|
||||
put("ূ", "uu");
|
||||
put("ৃ", "r");
|
||||
put("ে", "e");
|
||||
put("ো", "o");
|
||||
put("ৈ", "oi");
|
||||
put("ৗ", "ou");
|
||||
put("ৌ", "ou");
|
||||
put("ং", "ng");
|
||||
put("ঃ", "h");
|
||||
put("ঁ", "nN");
|
||||
put("।", ".");
|
||||
}
|
||||
};
|
||||
|
||||
private final static HashMap<Character, String> consonants = new HashMap<Character, String>() {
|
||||
{
|
||||
put('ঁ', ""); put('ং', "ng"); put('ঃ', "");
|
||||
put('ক', "k"); put('খ', "kh"); put('গ', "g"); put('ঘ', "gh"); put('ঙ', "ng");
|
||||
put('চ', "ch"); put('ছ', "ch"); put('জ', "j"); put('ঝ', "jh"); put('ঞ', "ng");
|
||||
put('ট', "t"); put('ঠ', "th"); put('ড', "d"); put('ঢ', "dh"); put('ণ', "n");
|
||||
put('ত', "t"); put('থ', "th"); put('দ', "d"); put('ধ', "dh"); put('ন', "n");
|
||||
put('প', "p"); put('ফ', "f"); put('ব', "b"); put('ভ', "v"); put('ম', "m");
|
||||
put('য', "z"); put('র', "r"); put('ল', "l"); put('শ', "sh");
|
||||
put('ষ', "sh"); put('স', "s"); put('হ', "h");
|
||||
put('ৎ', "t"); put('ড়', "r"); put('ঢ়', "r"); put('য়', "y");
|
||||
}
|
||||
};
|
||||
// The regex to extract Bengali characters in nested groups.
|
||||
private final static String pattern = "(র্){0,1}(([অ-হড়-য়])(্([অ-মশ-হড়-য়]))*)((){0,1}(্([য-ল]))){0,1}([া-ৌ]){0,1}|([্ঁঃংৎ০-৯।])| ";
|
||||
private final static Pattern bengaliRegex = Pattern.compile(pattern);
|
||||
|
||||
private final static HashMap<Character, String> symbols = new HashMap<Character, String>() {
|
||||
{
|
||||
put('ব', "w");
|
||||
put('য়', "y");
|
||||
private static String getVal(String key) {
|
||||
if (key != null) {
|
||||
String comp = composites.get(key);
|
||||
if (comp != null) {
|
||||
return comp;
|
||||
}
|
||||
String sl = letters.get(key);
|
||||
if (sl != null) {
|
||||
return letters.get(key);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
private final static HashMap<Character, String> joins = new HashMap<Character, String>() {
|
||||
{
|
||||
put('৳', "$");
|
||||
}
|
||||
};
|
||||
|
||||
private static boolean hasJoinedInString(String string) {
|
||||
return string.contains(string);
|
||||
return null;
|
||||
}
|
||||
|
||||
public static String transliterate(String txt) {
|
||||
@ -77,50 +171,72 @@ public class BengaliLanguageUtils extends LanguageUtils {
|
||||
return txt;
|
||||
}
|
||||
|
||||
char[] charArray = txt.toCharArray();
|
||||
|
||||
StringBuilder romanizedBuilder = new StringBuilder();
|
||||
char last = '\0';
|
||||
|
||||
for(int i = 0; i < txt.length(); i++) {
|
||||
char currentChar = charArray[i];
|
||||
|
||||
if (symbols.containsKey(currentChar)) {
|
||||
romanizedBuilder.append(symbols.get(currentChar));
|
||||
Matcher m = bengaliRegex.matcher(txt);
|
||||
StringBuffer sb = new StringBuffer();
|
||||
while (m.find()) {
|
||||
String appendableString = "";
|
||||
String reff = m.group(1);
|
||||
if (reff != null) {
|
||||
appendableString = appendableString + "rr";
|
||||
}
|
||||
else if (numbers.containsKey(currentChar)) {
|
||||
romanizedBuilder.append(numbers.get(currentChar));
|
||||
}
|
||||
else if (vowels.containsKey(currentChar)) {
|
||||
romanizedBuilder.append(vowels.get(currentChar));
|
||||
}
|
||||
else if (consonants.containsKey(currentChar)) {
|
||||
if (last != '\0' && consonants.containsKey(last)) {
|
||||
romanizedBuilder.append('o');
|
||||
}
|
||||
romanizedBuilder.append(consonants.get(currentChar));
|
||||
} else if (currentChar == BENGALI_JOIN_CHAR) {
|
||||
if (i + 1 < txt.length() && joins.containsKey(charArray[i + 1])) {
|
||||
romanizedBuilder.append(joins.get(charArray[i + 1]));
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
// This is a filter-down approach. First considering larger groups,
|
||||
// If any match is found, break there; otherwise go to the next step.
|
||||
// Helpful to solve some corner-cases.
|
||||
String mainPart = getVal(m.group(2));
|
||||
if (mainPart != null) {
|
||||
appendableString = appendableString + mainPart;
|
||||
} else {
|
||||
romanizedBuilder.append(currentChar);
|
||||
String firstPart = getVal(m.group(3));
|
||||
if (firstPart != null) {
|
||||
appendableString = appendableString + firstPart;
|
||||
}
|
||||
int g = 4;
|
||||
while (g < 6) {
|
||||
String part = getVal(m.group(g));
|
||||
if (part != null) {
|
||||
appendableString = appendableString + part;
|
||||
break;
|
||||
}
|
||||
g = g + 1;
|
||||
}
|
||||
}
|
||||
int g = 6;
|
||||
while (g < 10) {
|
||||
String key = getVal(m.group(g));
|
||||
if (key != null) {
|
||||
appendableString = appendableString + key;
|
||||
break;
|
||||
}
|
||||
g = g + 1;
|
||||
}
|
||||
String kaar = m.group(10);
|
||||
if (kaar != null) {
|
||||
String kaarStr = letters.get(kaar);
|
||||
if (kaarStr != null) {
|
||||
appendableString = appendableString + kaarStr;
|
||||
}
|
||||
} else if (appendableString.length() > 0 && !vowelsAndHasants.containsKey(m.group(0))) {
|
||||
// Adding 'a' like ITRANS if no vowel is present.
|
||||
// TODO: Have to add it dynamically using Bengali grammar rules.
|
||||
appendableString = appendableString + "a";
|
||||
}
|
||||
String singleton = m.group(11);
|
||||
if (singleton != null) {
|
||||
String singleStr = letters.get(singleton);
|
||||
if (singleStr != null) {
|
||||
appendableString = appendableString + singleStr;
|
||||
}
|
||||
}
|
||||
String others = m.group(0);
|
||||
if (others != null) {
|
||||
|
||||
last = currentChar;
|
||||
if (appendableString.length() <= 0) {
|
||||
appendableString = appendableString + others;
|
||||
}
|
||||
}
|
||||
m.appendReplacement(sb, appendableString);
|
||||
}
|
||||
|
||||
String romanized = romanizedBuilder.toString();
|
||||
|
||||
if (vowels.containsKey(charArray[charArray.length - 1])
|
||||
&& hasJoinedInString(txt)
|
||||
&& romanized.toCharArray()[romanized.length() - 1] == 'y') {
|
||||
romanizedBuilder.append('o');
|
||||
}
|
||||
|
||||
return romanizedBuilder.toString();
|
||||
m.appendTail(sb);
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -17,14 +17,14 @@ import static org.junit.Assert.assertTrue;
|
||||
public class LanguageUtilsTest extends TestBase {
|
||||
@Test
|
||||
public void testStringTransliterateCyrillic() throws Exception {
|
||||
//input with cyrillic and diacritic letters
|
||||
// input with cyrillic and diacritic letters
|
||||
String input = "Прõсто текčт";
|
||||
String output = LanguageUtils.transliterate(input);
|
||||
String result = "Prosto tekct";
|
||||
|
||||
assertEquals("Transliteration failed", result, output);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testStringTransliterateHebrew() throws Exception {
|
||||
String input = "בדיקה עברית";
|
||||
@ -58,12 +58,17 @@ public class LanguageUtilsTest extends TestBase {
|
||||
}
|
||||
|
||||
public void testStringTransliterateBengali() throws Exception {
|
||||
//input with cyrillic and diacritic letters
|
||||
String input = "অনিরুদ্ধ";
|
||||
String output = LanguageUtils.transliterate(input);
|
||||
String result = "oniruddho";
|
||||
// inputs with Bengali letters
|
||||
String[] inputs = { "অনিরুদ্ধ", "বিজ্ঞানযাত্রা চলছে চলবে।", "আমি সব দেখেশুনে ক্ষেপে গিয়ে করি বাঙলায় চিৎকার!" };
|
||||
String[] outputs = { "aniruddha", "biggaanaJaatraa chalachhe chalabe.",
|
||||
"aami saba dekheshune kkhepe giye kari baangalaaya chitkaara!" };
|
||||
|
||||
assertEquals("Transliteration failed", result, output);
|
||||
String result;
|
||||
|
||||
for (int i = 0; i < inputs.length; i++) {
|
||||
result = LanguageUtils.transliterate(inputs[i]);
|
||||
assertEquals("Transliteration failed", outputs[i], result);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -82,7 +87,8 @@ public class LanguageUtilsTest extends TestBase {
|
||||
@Test
|
||||
public void testTransliterateOption() throws Exception {
|
||||
setDefaultTransliteration();
|
||||
assertFalse("Transliteration option fail! Expected 'Off' by default, but result is 'On'", LanguageUtils.transliterate());
|
||||
assertFalse("Transliteration option fail! Expected 'Off' by default, but result is 'On'",
|
||||
LanguageUtils.transliterate());
|
||||
|
||||
enableTransliteration(true);
|
||||
assertTrue("Transliteration option fail! Expected 'On', but result is 'Off'", LanguageUtils.transliterate());
|
||||
|
Loading…
Reference in New Issue
Block a user