Merge pull request #1236 from uroybd/master

Better Bengali transliteration.
This commit is contained in:
Carsten Pfeiffer 2018-09-01 09:35:23 +02:00 committed by GitHub
commit ece0f094eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 212 additions and 90 deletions

View File

@ -17,59 +17,153 @@
package nodomain.freeyourgadget.gadgetbridge.util;
import java.util.HashMap;
import java.util.regex.*;
// What is the reason for extending LanguageUtils?
// It is only done here because the previous code already did so.
public class BengaliLanguageUtils extends LanguageUtils {
private final static char BENGALI_JOIN_CHAR = '্';
private final static HashMap<Character, String> numbers = new HashMap<Character, String>() {
// Composite Letters.
private final static HashMap<String, String> composites = new HashMap<String, String>() {
{
put('',"0"); put('১',"1"); put('২',"2"); put('৩',"3"); put('',"4");
put('৫',"5"); put('৬',"6"); put('',"7"); put('৮',"8");
put('৯',"9");
put("ক্ষ", "kkh");
put("ঞ্চ", "NC");
put("ঞ্ছ", "NCh");
put("ঞ্জ", "Ng");
put("জ্ঞ", "gg");
put("ঞ্ঝ", "Ngh");
put("্র", "r");
put("্ল", "l");
put("ষ্ম", "SSh");
put("র্", "r");
put("্য", "y");
put("্ব", "w");
}
};
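// Vowels and other marks (including the "ng", "h" and "." entries): a match found in this map does not get the implicit 'a' appended.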
private final static HashMap<String, String> vowelsAndHasants = new HashMap<String, String>() {
{
put("", "aa");
put("", "a");
put("", "i");
put("", "ii");
put("", "u");
put("", "uu");
put("", "ri");
put("", "e");
put("", "oi");
put("", "o");
put("", "ou");
put("", "aa");
put("ি", "i");
put("", "ii");
put("", "u");
put("", "uu");
put("", "r");
put("", "e");
put("", "o");
put("", "oi");
put("", "ou");
put("", "ou");
put("", "ng");
put("", "h");
put("", ".");
}
};
private final static HashMap<Character, String> vowels = new HashMap<Character, String>() {
// Single Character Letters.
private final static HashMap<String, String> letters = new HashMap<String, String>() {
{
put('অ', "o"); put('আ', "a"); put('ই', "i"); put('ঈ', "ee");
put('উ', "u"); put('ঊ', "oo"); put('ঋ', "ri"); put('এ', "e");
put('ঐ', "oi"); put('ও', "o"); put('ঔ', "ou"); put('া', "a");
put('ি', "i"); put('ী', "ee"); put('ু', "u"); put('ূ', "oo");
put('ৃ', "ri"); put('ে', "e"); put('ৈ', "oi"); put('ো', "o");
put('ৌ', "ou");
put("", "aa");
put("", "a");
put("", "i");
put("", "ii");
put("", "u");
put("", "uu");
put("", "ri");
put("", "e");
put("", "oi");
put("", "o");
put("", "ou");
put("", "k");
put("", "kh");
put("", "g");
put("", "gh");
put("", "ng");
put("", "ch");
put("", "chh");
put("", "j");
put("", "jh");
put("", "Ng");
put("", "T");
put("", "Th");
put("", "D");
put("", "Dh");
put("", "N");
put("", "t");
put("", "th");
put("", "d");
put("", "dh");
put("", "n");
put("", "p");
put("", "ph");
put("", "b");
put("", "bh");
put("", "m");
put("", "J");
put("", "r");
put("", "l");
put("", "sh");
put("", "Sh");
put("", "s");
put("", "h");
put("", "rh");
put("", "rH");
put("", "y");
put("", "t");
put("", "0");
put("", "1");
put("", "2");
put("", "3");
put("", "4");
put("", "5");
put("", "6");
put("", "7");
put("", "8");
put("", "9");
put("", "aa");
put("ি", "i");
put("", "ii");
put("", "u");
put("", "uu");
put("", "r");
put("", "e");
put("", "o");
put("", "oi");
put("", "ou");
put("", "ou");
put("", "ng");
put("", "h");
put("", "nN");
put("", ".");
}
};
private final static HashMap<Character, String> consonants = new HashMap<Character, String>() {
{
put('ঁ', ""); put('ং', "ng"); put('ঃ', "");
put('ক', "k"); put('খ', "kh"); put('গ', "g"); put('ঘ', "gh"); put('ঙ', "ng");
put('চ', "ch"); put('ছ', "ch"); put('জ', "j"); put('ঝ', "jh"); put('ঞ', "ng");
put('ট', "t"); put('ঠ', "th"); put('ড', "d"); put('ঢ', "dh"); put('ণ', "n");
put('ত', "t"); put('থ', "th"); put('দ', "d"); put('ধ', "dh"); put('ন', "n");
put('প', "p"); put('ফ', "f"); put('ব', "b"); put('ভ', "v"); put('ম', "m");
put('য', "z"); put('র', "r"); put('ল', "l"); put('শ', "sh");
put('ষ', "sh"); put('স', "s"); put('হ', "h");
put('ৎ', "t"); put('ড়', "r"); put('ঢ়', "r"); put('য়', "y");
}
};
// The regex to extract Bengali characters in nested groups.
private final static String pattern = "(র্){0,1}(([অ-হড়-য়])(্([অ-মশ-হড়-য়]))*)((){0,1}(্([য-ল]))){0,1}([া-ৌ]){0,1}|([্ঁঃংৎ০-৯।])| ";
private final static Pattern bengaliRegex = Pattern.compile(pattern);
private final static HashMap<Character, String> symbols = new HashMap<Character, String>() {
{
put('ব', "w");
put('য়', "y");
private static String getVal(String key) {
if (key != null) {
String comp = composites.get(key);
if (comp != null) {
return comp;
}
String sl = letters.get(key);
if (sl != null) {
return letters.get(key);
}
}
};
private final static HashMap<Character, String> joins = new HashMap<Character, String>() {
{
put('৳', "$");
}
};
private static boolean hasJoinedInString(String string) {
return string.contains(string);
return null;
}
public static String transliterate(String txt) {
@ -77,50 +171,72 @@ public class BengaliLanguageUtils extends LanguageUtils {
return txt;
}
char[] charArray = txt.toCharArray();
StringBuilder romanizedBuilder = new StringBuilder();
char last = '\0';
for(int i = 0; i < txt.length(); i++) {
char currentChar = charArray[i];
if (symbols.containsKey(currentChar)) {
romanizedBuilder.append(symbols.get(currentChar));
Matcher m = bengaliRegex.matcher(txt);
StringBuffer sb = new StringBuffer();
while (m.find()) {
String appendableString = "";
String reff = m.group(1);
if (reff != null) {
appendableString = appendableString + "rr";
}
else if (numbers.containsKey(currentChar)) {
romanizedBuilder.append(numbers.get(currentChar));
}
else if (vowels.containsKey(currentChar)) {
romanizedBuilder.append(vowels.get(currentChar));
}
else if (consonants.containsKey(currentChar)) {
if (last != '\0' && consonants.containsKey(last)) {
romanizedBuilder.append('o');
}
romanizedBuilder.append(consonants.get(currentChar));
} else if (currentChar == BENGALI_JOIN_CHAR) {
if (i + 1 < txt.length() && joins.containsKey(charArray[i + 1])) {
romanizedBuilder.append(joins.get(charArray[i + 1]));
i++;
continue;
}
// This is a filter-down approach: the larger groups are considered first.
// If a match is found, stop there; otherwise go on to the next step.
// This helps to solve some corner cases.
String mainPart = getVal(m.group(2));
if (mainPart != null) {
appendableString = appendableString + mainPart;
} else {
romanizedBuilder.append(currentChar);
String firstPart = getVal(m.group(3));
if (firstPart != null) {
appendableString = appendableString + firstPart;
}
int g = 4;
while (g < 6) {
String part = getVal(m.group(g));
if (part != null) {
appendableString = appendableString + part;
break;
}
g = g + 1;
}
}
int g = 6;
while (g < 10) {
String key = getVal(m.group(g));
if (key != null) {
appendableString = appendableString + key;
break;
}
g = g + 1;
}
String kaar = m.group(10);
if (kaar != null) {
String kaarStr = letters.get(kaar);
if (kaarStr != null) {
appendableString = appendableString + kaarStr;
}
} else if (appendableString.length() > 0 && !vowelsAndHasants.containsKey(m.group(0))) {
// Add 'a', as ITRANS does, if no vowel is present.
// TODO: Add it dynamically using Bengali grammar rules.
appendableString = appendableString + "a";
}
String singleton = m.group(11);
if (singleton != null) {
String singleStr = letters.get(singleton);
if (singleStr != null) {
appendableString = appendableString + singleStr;
}
}
String others = m.group(0);
if (others != null) {
last = currentChar;
if (appendableString.length() <= 0) {
appendableString = appendableString + others;
}
}
m.appendReplacement(sb, appendableString);
}
String romanized = romanizedBuilder.toString();
if (vowels.containsKey(charArray[charArray.length - 1])
&& hasJoinedInString(txt)
&& romanized.toCharArray()[romanized.length() - 1] == 'y') {
romanizedBuilder.append('o');
}
return romanizedBuilder.toString();
m.appendTail(sb);
return sb.toString();
}
}
}

View File

@ -17,14 +17,14 @@ import static org.junit.Assert.assertTrue;
public class LanguageUtilsTest extends TestBase {
@Test
public void testStringTransliterateCyrillic() throws Exception {
//input with cyrillic and diacritic letters
// input with cyrillic and diacritic letters
String input = "Прõсто текčт";
String output = LanguageUtils.transliterate(input);
String result = "Prosto tekct";
assertEquals("Transliteration failed", result, output);
}
@Test
public void testStringTransliterateHebrew() throws Exception {
String input = "בדיקה עברית";
@ -58,12 +58,17 @@ public class LanguageUtilsTest extends TestBase {
}
public void testStringTransliterateBengali() throws Exception {
//input with cyrillic and diacritic letters
String input = "অনিরুদ্ধ";
String output = LanguageUtils.transliterate(input);
String result = "oniruddho";
// inputs with Bengali letters and punctuation
String[] inputs = { "অনিরুদ্ধ", "বিজ্ঞানযাত্রা চলছে চলবে।", "আমি সব দেখেশুনে ক্ষেপে গিয়ে করি বাঙলায় চিৎকার!" };
String[] outputs = { "aniruddha", "biggaanaJaatraa chalachhe chalabe.",
"aami saba dekheshune kkhepe giye kari baangalaaya chitkaara!" };
assertEquals("Transliteration failed", result, output);
String result;
for (int i = 0; i < inputs.length; i++) {
result = LanguageUtils.transliterate(inputs[i]);
assertEquals("Transliteration failed", outputs[i], result);
}
}
@Test
@ -82,7 +87,8 @@ public class LanguageUtilsTest extends TestBase {
@Test
public void testTransliterateOption() throws Exception {
setDefaultTransliteration();
assertFalse("Transliteration option fail! Expected 'Off' by default, but result is 'On'", LanguageUtils.transliterate());
assertFalse("Transliteration option fail! Expected 'Off' by default, but result is 'On'",
LanguageUtils.transliterate());
enableTransliteration(true);
assertTrue("Transliteration option fail! Expected 'On', but result is 'Off'", LanguageUtils.transliterate());