Fix character mark removal before flattening to ASCII

This commit is contained in:
José Rebelo 2023-07-11 18:15:49 +01:00
parent 12dd9651e7
commit bdb904faf9
2 changed files with 15 additions and 2 deletions

View File

@ -32,8 +32,11 @@ public class FlattenToAsciiTransliterator implements Transliterator {
return txt;
}
// Decompose the string into its canonical decomposition (splits base characters from accents/marks)
txt = Normalizer.normalize(txt, Normalizer.Form.NFD);
txt = new String(txt.getBytes(StandardCharsets.US_ASCII), StandardCharsets.US_ASCII);
return txt.replaceAll("\\p{M}", "");
// Remove all marks (characters intended to be combined with another character), keeping the base glyphs
txt = txt.replaceAll("\\p{M}", "");
// Flatten the resulting string to ASCII
return new String(txt.getBytes(StandardCharsets.US_ASCII), StandardCharsets.US_ASCII);
}
}

View File

@ -7,6 +7,7 @@ import org.junit.Test;
import nodomain.freeyourgadget.gadgetbridge.GBApplication;
import nodomain.freeyourgadget.gadgetbridge.impl.GBDevice;
import nodomain.freeyourgadget.gadgetbridge.test.TestBase;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.FlattenToAsciiTransliterator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
@ -183,6 +184,15 @@ public class LanguageUtilsTest extends TestBase {
assertEquals("croatian transliteration failed", expected, output);
}
@Test
public void testFlattenToAscii() throws Exception {
    // Letters carrying diacritics (diaeresis, comma below, breve) are expected to come
    // back as their plain ASCII base letters, with the separating spaces left intact.
    final String accented = "ä ș ț ă";
    final String flattened = new FlattenToAsciiTransliterator().transliterate(accented);
    assertEquals("flatten to ascii transliteration failed", "a s t a", flattened);
}
@Test
public void testTransliterateOption() throws Exception {
enableTransliteration(false);