Improve ASCII transliterator

We should use NFKD instead of NFD, since we flatten to US-ASCII
afterwards anyway. This allows various Unicode characters that would
otherwise end up as a question mark to be represented by their
compatibility decomposition. For example, ligatures are now expanded
(U+FB01 LATIN SMALL LIGATURE FI becomes plain "fi" instead of a question
mark), and U+00A0 NO-BREAK SPACE [NBSP] is replaced by a normal space
instead of a question mark.

+Add Czech fancy quotes to the Czech transliterator
+Add a unit test for Multitransliterator
This commit is contained in:
mormegil 2023-11-28 13:25:58 +01:00 committed by José Rebelo
parent 99b3dc3226
commit ec73b244ee
3 changed files with 24 additions and 8 deletions

View File

@ -27,6 +27,7 @@ public class CzechTransliterator extends SimpleTransliterator {
super(new HashMap<Character, String>() {{
put('ř',"r"); put('ě',"e"); put('ý',"y"); put('á',"a"); put('í',"i"); put('é',"e");
put('ó',"o"); put('ú',"u"); put('ů',"u"); put('ď',"d"); put('ť',"t"); put('ň',"n");
put('„', "\""); put('“', "\""); put('‚', "'"); put('‘', "'");
}});
}
}

View File

@ -32,8 +32,8 @@ public class FlattenToAsciiTransliterator implements Transliterator {
return txt;
}
// Decompose the string into its canonical decomposition (splits base characters from accents/marks)
txt = Normalizer.normalize(txt, Normalizer.Form.NFD);
// Decompose the string into its compatible decomposition (splits base characters from accents/marks, and changes some characters to compatibility version)
txt = Normalizer.normalize(txt, Normalizer.Form.NFKD);
// Remove all marks (characters intended to be combined with another character), keeping the base glyphs
txt = txt.replaceAll("\\p{M}", "");
// Flatten the resulting string to ASCII

View File

@ -4,9 +4,13 @@ import android.content.SharedPreferences;
import org.junit.Test;
import java.util.Arrays;
import nodomain.freeyourgadget.gadgetbridge.GBApplication;
import nodomain.freeyourgadget.gadgetbridge.impl.GBDevice;
import nodomain.freeyourgadget.gadgetbridge.test.TestBase;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.CzechTransliterator;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ExtendedAsciiTransliterator;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.FlattenToAsciiTransliterator;
import static org.junit.Assert.assertEquals;
@ -96,9 +100,9 @@ public class LanguageUtilsTest extends TestBase {
final Transliterator transliterator = LanguageUtils.getTransliterator("bengali");
// input with Bengali script, partly mixed with Latin text
String[] inputs = { "অনিরুদ্ধ", "বিজ্ঞানযাত্রা চলছে চলবে।", "আমি সব দেখেশুনে ক্ষেপে গিয়ে করি বাঙলায় চিৎকার!",
"আমার জাভা কোড is so bad! কী আর বলবো!" };
String[] outputs = { "oniruddho", "biggaanJaatraa cholchhe cholbe.",
String[] inputs = {"অনিরুদ্ধ", "বিজ্ঞানযাত্রা চলছে চলবে।", "আমি সব দেখেশুনে ক্ষেপে গিয়ে করি বাঙলায় চিৎকার!",
"আমার জাভা কোড is so bad! কী আর বলবো!"};
String[] outputs = {"oniruddho", "biggaanJaatraa cholchhe cholbe.",
"aami sob dekheshune kkhepe giye kori baanglaay chitkaar!",
"aamaar jaabhaa koD is so bad! kii aar bolbo!"};
@ -189,7 +193,7 @@ public class LanguageUtilsTest extends TestBase {
assertEquals("georgian transliteration failed", expected, output);
}
@Test
@Test
public void testStringTransliterateHungarian() {
final Transliterator transliterator = LanguageUtils.getTransliterator("hungarian");
@ -227,12 +231,23 @@ public class LanguageUtilsTest extends TestBase {
@Test
public void testFlattenToAscii() throws Exception {
final FlattenToAsciiTransliterator transliterator = new FlattenToAsciiTransliterator();
String input = "ä ș ț ă";
String input = "ä ș ț ă fine";
String output = transliterator.transliterate(input);
String expected = "a s t a";
String expected = "a s t a fine";
assertEquals("flatten to ascii transliteration failed", expected, output);
}
/**
 * Runs Czech sample text through a {@code MultiTransliterator} built from the Czech,
 * extended-ASCII and flatten-to-ASCII transliterators (in that order) and checks that
 * the result is the expected plain-ASCII text.
 */
@Test
public void testMultitransliterator() throws Exception {
final CzechTransliterator czech = new CzechTransliterator();
final ExtendedAsciiTransliterator extendedAscii = new ExtendedAsciiTransliterator();
final FlattenToAsciiTransliterator flattenToAscii = new FlattenToAsciiTransliterator();
final MultiTransliterator chain = new MultiTransliterator(Arrays.asList(czech, extendedAscii, flattenToAscii));

// Accented letters plus guillemets and Czech-style double quotes.
final String quotedSentence = chain.transliterate("Žluťoučký kůň úpěl »ďábelské« „ódy“");
assertEquals("Zlutoucky kun upel \"dabelske\" \"ody\"", quotedSentence);

// U+00A0 NO-BREAK SPACE between the number and the currency is expected to become a plain space.
final String price = chain.transliterate("300\u00A0Kč");
assertEquals("300 Kc", price);
}
@Test
public void testTransliterateOption() throws Exception {
enableTransliteration(false);