Fix null elements for some mixed-case Armenian words (combinations with U and Vo)

This commit is contained in:
Alik Aslanyan 2024-11-09 19:56:53 +04:00 committed by Arjan Schrijver
parent 1ca387df7a
commit 3b58965b0d
2 changed files with 120 additions and 103 deletions

View File

@ -29,6 +29,7 @@ import java.util.LinkedHashMap;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Objects;
public class ArmenianTransliterator implements Transliterator { public class ArmenianTransliterator implements Transliterator {
// Transliteration map ordered by priority // Transliteration map ordered by priority
@ -63,6 +64,7 @@ public class ArmenianTransliterator implements Transliterator {
put("յ","y"); put("յ","y");
put("ն","n"); put("ն","n");
put("շ","sh"); put("շ","sh");
put("ո", "vo");
put("չ","ch"); put("չ","ch");
put("պ","p"); put("պ","p");
put("ջ","j"); put("ջ","j");
@ -77,68 +79,78 @@ public class ArmenianTransliterator implements Transliterator {
put("օ","o"); put("օ","o");
put("և","ev"); put("և","ev");
put("ֆ","f"); put("ֆ","f");
put("՝", "`");
put("՞", "?");
put("։", ":");
put("․", ".");
} }
}; };
// Capitalize existing simple substitutions here
for (final Entry<String, String> entry : new ArrayList<Entry<String, String>>(simpleSubstitions.entrySet())) {
String capitalKey = entry.getKey().toUpperCase();
if (!capitalKey.equals(entry.getKey())) {
simpleSubstitions.put(capitalKey, entry.getValue().toUpperCase());
}
}
// Letter + 'ու' // Letter + 'ու'
char[] letterMapU = { final String[] letterMapU = {
'ա', "ա",
'բ', "բ",
'գ', "գ",
'դ', "դ",
'ե', "ե",
'զ', "զ",
'է', "է",
'ը', "ը",
'թ', "թ",
'ժ', "ժ",
'ի', "ի",
'լ', "լ",
'խ', "խ",
'ծ', "ծ",
'կ', "կ",
'հ', "հ",
'ձ', "ձ",
'ղ', "ղ",
'ճ', "ճ",
'մ', "մ",
'յ', "յ",
'ն', "ն",
'շ', "շ",
'չ', "չ",
'պ', "պ",
'ջ', "ջ",
'ռ', "ռ",
'ս', "ս",
'վ', "վ",
'տ', "տ",
'ր', "ր",
'ց', "ց",
'փ', "փ",
'ք', "ք",
'օ', "օ",
'և', "և",
'ֆ', "ֆ",
'ո', "ո"
}; };
for(char letter : letterMapU) { for (final String letter : letterMapU) {
char capitalLetter = Character.toUpperCase(letter); final String capitalLetter = letter.toUpperCase();
final String transliteratedLetter = simpleSubstitions.get(Character.toString(letter)); final String transliteratedLetter = Objects.requireNonNull(simpleSubstitions.get(letter), letter);
final String transliteratedCapitalLetter = simpleSubstitions.get(Character.toString(capitalLetter)); final String transliteratedCapitalLetter = Objects.requireNonNull(simpleSubstitions.get(capitalLetter), capitalLetter);
put(Character.toString(letter) + "ու", transliteratedLetter + "u"); put(letter + "ու", transliteratedLetter + "u");
put(Character.toString(capitalLetter) + "ու", transliteratedCapitalLetter + "u"); put(capitalLetter + "ու", transliteratedCapitalLetter + "u");
put(Character.toString(letter) + "ՈՒ", transliteratedLetter + "U"); put(letter + "ՈՒ", transliteratedLetter + "U");
put(Character.toString(capitalLetter) + "ՈՒ", transliteratedCapitalLetter + "U"); put(capitalLetter + "ՈՒ", transliteratedCapitalLetter + "U");
put(letter + "Ու", transliteratedLetter + "U");
put(capitalLetter + "Ու", transliteratedCapitalLetter + "U");
put(Character.toString(letter) + "Ու", transliteratedLetter + "U"); put(letter + "ոՒ", transliteratedLetter + "U");
put(Character.toString(capitalLetter) + "Ու", transliteratedCapitalLetter + "U"); put(capitalLetter + "ոՒ", transliteratedCapitalLetter + "U");
put(Character.toString(letter) + "ոՒ", transliteratedLetter + "U");
put(Character.toString(capitalLetter) + "ոՒ", transliteratedCapitalLetter + "U");
} }
put("ու","u"); put("ու","u");
@ -147,50 +159,51 @@ public class ArmenianTransliterator implements Transliterator {
put("ՈՒ","U"); put("ՈՒ","U");
// Letter + 'ո' // Letter + 'ո'
char[] letterMapVo = { final String[] letterMapVo = {
'բ', "բ",
'գ', "գ",
'դ', "դ",
'զ', "զ",
'թ', "թ",
'ժ', "ժ",
'լ', "լ",
'խ', "խ",
'ծ', "ծ",
'կ', "կ",
'հ', "հ",
'ձ', "ձ",
'ղ', "ղ",
'ճ', "ճ",
'մ', "մ",
'յ', "յ",
'ն', "ն",
'շ', "շ",
'չ', "ո", // ո + ո should be voo
'պ', "չ",
'ջ', "պ",
'ռ', "ջ",
'ս', "ռ",
'վ', "ս",
'տ', "վ",
'ր', "տ",
'ց', "ր",
'փ', "ց",
'ք', "փ",
'և', "ք",
'ֆ', "և",
"ֆ"
}; };
for(char letter : letterMapVo) { for (String letter : letterMapVo) {
char capitalLetter = Character.toUpperCase(letter); String capitalLetter = letter.toUpperCase();
final String transliteratedLetter = simpleSubstitions.get(Character.toString(letter)); final String transliteratedLetter = Objects.requireNonNull(simpleSubstitions.get(letter));
final String transliteratedCapitalLetter = simpleSubstitions.get(Character.toString(capitalLetter)); final String transliteratedCapitalLetter = Objects.requireNonNull(simpleSubstitions.get(capitalLetter));
put(Character.toString(letter) + "ո", transliteratedLetter + "o"); put(letter + "ո", transliteratedLetter + "o");
put(Character.toString(capitalLetter) + "ո", transliteratedCapitalLetter + "o"); put(capitalLetter + "ո", transliteratedCapitalLetter + "o");
put(Character.toString(letter) + "Ո", transliteratedLetter + "Օ"); put(letter + "Ո", transliteratedLetter + "Օ");
put(Character.toString(capitalLetter) + "Ո", transliteratedCapitalLetter + "Օ"); put(capitalLetter + "Ո", transliteratedCapitalLetter + "Օ");
} }
put("ո","vo"); put("ո","vo");
@ -213,12 +226,11 @@ public class ArmenianTransliterator implements Transliterator {
put(entry.getKey(), entry.getValue()); put(entry.getKey(), entry.getValue());
put(entry.getKey().toUpperCase(), entry.getValue().toUpperCase()); put(entry.getKey().toUpperCase(), entry.getValue().toUpperCase());
} }
}}; }};
private static final Map<String, Integer> transliterationPriorityMap = new HashMap<String, Integer>() {{ private static final Map<String, Integer> transliterationPriorityMap = new HashMap<String, Integer>() {{
int priority = 0; int priority = 0;
for( final String key : transliterateMap.keySet() ) { for (final String key : transliterateMap.keySet()) {
put(key, priority++); put(key, priority++);
} }
}}; }};
@ -227,7 +239,7 @@ public class ArmenianTransliterator implements Transliterator {
private static final Trie transliterationTrie; private static final Trie transliterationTrie;
static { static {
final Trie.TrieBuilder builder = Trie.builder(); final Trie.TrieBuilder builder = Trie.builder();
for( final String key : ArmenianTransliterator.transliterateMap.keySet()) { for (final String key : ArmenianTransliterator.transliterateMap.keySet()) {
builder.addKeyword(key); builder.addKeyword(key);
} }
transliterationTrie = builder.build(); transliterationTrie = builder.build();
@ -235,12 +247,12 @@ public class ArmenianTransliterator implements Transliterator {
private static String ahoCorasick(final String text) { private static String ahoCorasick(final String text) {
// Create a buffer sufficiently large that re-allocations are minimized. // Create a buffer sufficiently large that re-allocations are minimized.
final StringBuilder sb = new StringBuilder( text.length() * 10 / 12 ); final StringBuilder sb = new StringBuilder(text.length() * 10 / 12);
// The complexity of the Aho-Corasick algorithm O(N + L + Z) // The complexity of the Aho-Corasick algorithm O(N + L + Z)
// Where N is the length of the text, L is the length of keywords and the Z is a number of matches. // Where N is the length of the text, L is the length of keywords and the Z is a number of matches.
// This algorithm allows us to do fast substring search // This algorithm allows us to do fast substring search
final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText( text )); final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText(text));
// Sort collection first by starting position, then by priority. // Sort collection first by starting position, then by priority.
Collections.sort(emits, new Comparator<Emit>() { Collections.sort(emits, new Comparator<Emit>() {
@ -259,11 +271,11 @@ public class ArmenianTransliterator implements Transliterator {
int prevIndex = 0; int prevIndex = 0;
for( final Emit emit : emits ) { for (final Emit emit : emits) {
final int matchIndex = emit.getStart(); final int matchIndex = emit.getStart();
// Skip if we already substituted this part // Skip if we already substituted this part
if(matchIndex < prevIndex) { if (matchIndex < prevIndex) {
continue; continue;
} }
@ -271,13 +283,13 @@ public class ArmenianTransliterator implements Transliterator {
sb.append(text.substring(prevIndex, matchIndex)); sb.append(text.substring(prevIndex, matchIndex));
// Substitute and append to the builder // Substitute and append to the builder
sb.append( ArmenianTransliterator.transliterateMap.get( emit.getKeyword() ) ); sb.append(Objects.requireNonNull(ArmenianTransliterator.transliterateMap.get(emit.getKeyword())));
prevIndex = emit.getEnd() + 1; prevIndex = emit.getEnd() + 1;
} }
// Add the remainder of the string (contains no more matches). // Add the remainder of the string (contains no more matches).
sb.append( text.substring( prevIndex ) ); sb.append(text.substring(prevIndex));
return sb.toString(); return sb.toString();
} }

View File

@ -18,6 +18,9 @@ public class ArmenianTransliteratorTest extends TestCase {
Assert.assertEquals("vorotan", new ArmenianTransliterator().transliterate("որոտան")); Assert.assertEquals("vorotan", new ArmenianTransliterator().transliterate("որոտան"));
Assert.assertEquals("voroshel", new ArmenianTransliterator().transliterate("որոշել")); Assert.assertEquals("voroshel", new ArmenianTransliterator().transliterate("որոշել"));
Assert.assertEquals("uzox", new ArmenianTransliterator().transliterate("ուզող")); Assert.assertEquals("uzox", new ArmenianTransliterator().transliterate("ուզող"));
Assert.assertEquals(
"AVO", new ArmenianTransliterator().transliterate("ԱՈ")
);
} }
@Test @Test
@ -44,9 +47,11 @@ public class ArmenianTransliteratorTest extends TestCase {
Assert.assertEquals( Assert.assertEquals(
"Ushadir", new ArmenianTransliterator().transliterate("Ուշադիր") "Ushadir", new ArmenianTransliterator().transliterate("Ուշադիր")
); );
Assert.assertEquals(
"AU", new ArmenianTransliterator().transliterate("ԱՈւ")
);
} }
@Test @Test
public void testTop100Words() { public void testTop100Words() {
final Map<String,String> topWords = new LinkedHashMap<String,String>() {{ final Map<String,String> topWords = new LinkedHashMap<String,String>() {{