Replace Map<Character> by CharObjectHashMap and Set<Character> by CharHashSet. (#13420)

Also optimize the character replacement in JapaneseKatakanaUppercaseFilter.
bruno-roustant committed May 27, 2024
1 parent a8def94 commit 6f77493
Showing 31 changed files with 2,960 additions and 162 deletions.
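The theme of the commit: java.util collections keyed by Character box every key on put and get, while the HPPC-derived classes in org.apache.lucene.util.hppc keep the key as a primitive char. A minimal sketch of the swap, assuming only the CharObjectHashMap constructor and the put/get signatures visible in the hunks below (get returns null for an absent key):

import org.apache.lucene.util.hppc.CharObjectHashMap;

public class PrimitiveMapSketch {
  public static void main(String[] args) {
    // Before: Map<Character, String> boxed = new HashMap<>(); boxed.get(Character.valueOf('a'));
    // After: the key stays a primitive char, no wrapper allocation on the lookup path.
    CharObjectHashMap<String> map = new CharObjectHashMap<>();
    map.put('a', "alpha");
    map.put('b', "beta");
    System.out.println(map.get('a')); // alpha
    System.out.println(map.get('z')); // null when the key is absent, same contract as Map.get
  }
}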
lucene/CHANGES.txt (2 additions, 0 deletions)
@@ -130,6 +130,8 @@ Optimizations

* GITHUB#13406: Replace List<Integer> by IntArrayList and List<Long> by LongArrayList. (Bruno Roustant)

* GITHUB#13420: Replace Map<Character> by CharObjectHashMap and Set<Character> by CharHashSet. (Bruno Roustant)

Bug Fixes
---------------------

MappingCharFilter.java
@@ -18,13 +18,13 @@

import java.io.IOException;
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.CharFilter; // javadocs
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.hppc.CharObjectHashMap;

/**
* Simplistic {@link CharFilter} that applies the mappings contained in a {@link NormalizeCharMap}
@@ -38,7 +38,7 @@ public class MappingCharFilter extends BaseCharFilter {
private final FST.BytesReader fstReader;
private final RollingCharBuffer buffer = new RollingCharBuffer();
private final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
private final Map<Character, FST.Arc<CharsRef>> cachedRootArcs;
private final CharObjectHashMap<FST.Arc<CharsRef>> cachedRootArcs;

private CharsRef replacement;
private int replacementPointer;
@@ -96,7 +96,7 @@ public int read() throws IOException {

final int firstCH = buffer.get(inputOff);
if (firstCH != -1) {
FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
FST.Arc<CharsRef> arc = cachedRootArcs.get((char) firstCH);
if (arc != null) {
if (!FST.targetHasArcs(arc)) {
// Fast pass for single character match:
NormalizeCharMap.java
@@ -17,7 +17,6 @@
package org.apache.lucene.analysis.charfilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.util.CharsRef;
@@ -27,6 +26,7 @@
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.hppc.CharObjectHashMap;

// TODO: save/load?

@@ -37,7 +37,7 @@
public class NormalizeCharMap {

final FST<CharsRef> map;
final Map<Character, FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>();
final CharObjectHashMap<FST.Arc<CharsRef>> cachedRootArcs = new CharObjectHashMap<>();

// Use the builder to create:
private NormalizeCharMap(FST<CharsRef> map) {
@@ -53,8 +53,7 @@ private NormalizeCharMap(FST<CharsRef> map) {
while (true) {
assert scratchArc.label() != FST.END_LABEL;
cachedRootArcs.put(
Character.valueOf((char) scratchArc.label()),
new FST.Arc<CharsRef>().copyFrom(scratchArc));
(char) scratchArc.label(), new FST.Arc<CharsRef>().copyFrom(scratchArc));
if (scratchArc.isLast()) {
break;
}
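Read together, this NormalizeCharMap hunk and the MappingCharFilter hunk above keep the FST root-arc cache keyed by primitive char: the map is filled once per root-arc label at construction time and probed per input character in read(), without Character.valueOf on either side. A reduced sketch of that flow, using a stand-in Arc record in place of the real FST.Arc<CharsRef>:

import org.apache.lucene.util.hppc.CharObjectHashMap;

public class RootArcCacheSketch {
  // Stand-in for FST.Arc<CharsRef>; the real cache stores copies of the FST's first-level arcs.
  record Arc(int label) {}

  public static void main(String[] args) {
    // Build phase (NormalizeCharMap constructor): one entry per root-arc label.
    CharObjectHashMap<Arc> cachedRootArcs = new CharObjectHashMap<>();
    cachedRootArcs.put('a', new Arc('a'));
    cachedRootArcs.put('b', new Arc('b'));

    // Read phase (MappingCharFilter.read): probe with the primitive char read from the input.
    int firstCH = 'a';
    Arc arc = cachedRootArcs.get((char) firstCH); // was cachedRootArcs.get(Character.valueOf((char) firstCH))
    System.out.println(arc != null ? "root arc found for '" + (char) firstCH + "'" : "no mapping starts here");
  }
}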
Dictionary.java
@@ -41,15 +41,13 @@
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntryAccumulator;
import org.apache.lucene.analysis.hunspell.SortingStrategy.EntrySupplier;
import org.apache.lucene.store.Directory;
@@ -60,6 +58,7 @@
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.hppc.CharHashSet;
import org.apache.lucene.util.hppc.IntArrayList;
import org.apache.lucene.util.hppc.IntCursor;

@@ -334,8 +333,8 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
throws IOException, ParseException {
TreeMap<String, IntArrayList> prefixes = new TreeMap<>();
TreeMap<String, IntArrayList> suffixes = new TreeMap<>();
Set<Character> prefixContFlags = new HashSet<>();
Set<Character> suffixContFlags = new HashSet<>();
CharHashSet prefixContFlags = new CharHashSet();
CharHashSet suffixContFlags = new CharHashSet();
Map<String, Integer> seenPatterns = new HashMap<>();

// zero condition -> 0 ord
@@ -673,7 +672,7 @@ private FST<IntsRef> affixFST(TreeMap<String, IntArrayList> affixes) throws IOEx
*/
private void parseAffix(
TreeMap<String, IntArrayList> affixes,
Set<Character> secondStageFlags,
CharHashSet secondStageFlags,
String header,
LineNumberReader reader,
AffixKind kind,
@@ -1173,10 +1172,14 @@ protected boolean tolerateDuplicateConversionMappings() {
}

char[] allNonSuggestibleFlags() {
return Dictionary.toSortedCharArray(
Stream.of(HIDDEN_FLAG, noSuggest, forbiddenword, onlyincompound, subStandard)
.filter(c -> c != FLAG_UNSET)
.collect(Collectors.toSet()));
CharHashSet set = new CharHashSet(5);
set.add(HIDDEN_FLAG);
for (char c : new char[] {noSuggest, forbiddenword, onlyincompound, subStandard}) {
if (c != FLAG_UNSET) {
set.add(c);
}
}
return Dictionary.toSortedCharArray(set);
}

private List<String> readMorphFields(String word, String unparsed) {
@@ -1533,12 +1536,8 @@ CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
return reuse;
}

static char[] toSortedCharArray(Set<Character> set) {
char[] chars = new char[set.size()];
int i = 0;
for (Character c : set) {
chars[i++] = c;
}
static char[] toSortedCharArray(CharHashSet set) {
char[] chars = set.toArray();
Arrays.sort(chars);
return chars;
}
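The Dictionary changes use a small slice of the CharHashSet API: the sized constructor, add, and toArray(), which returns a primitive char[]. A sketch of the same build-then-sort pattern as allNonSuggestibleFlags feeding toSortedCharArray, assuming only those methods:

import java.util.Arrays;
import org.apache.lucene.util.hppc.CharHashSet;

public class FlagSetSketch {
  public static void main(String[] args) {
    CharHashSet flags = new CharHashSet(5); // expected element count, as in allNonSuggestibleFlags
    flags.add('C');
    flags.add('A');
    flags.add('B');
    flags.add('A'); // duplicates collapse, like Set<Character>

    char[] sorted = flags.toArray(); // already a char[], no per-element unboxing loop
    Arrays.sort(sorted);
    System.out.println(Arrays.toString(sorted)); // [A, B, C]
  }
}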
WordFormGenerator.java
@@ -43,6 +43,8 @@
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.hppc.CharHashSet;
import org.apache.lucene.util.hppc.CharObjectHashMap;

/**
* A utility class used for generating possible word forms by adding affixes to stems ({@link
@@ -51,7 +53,7 @@
*/
public class WordFormGenerator {
private final Dictionary dictionary;
private final Map<Character, List<AffixEntry>> affixes = new HashMap<>();
private final CharObjectHashMap<List<AffixEntry>> affixes = new CharObjectHashMap<>();
private final Stemmer stemmer;

public WordFormGenerator(Dictionary dictionary) {
@@ -76,7 +78,15 @@ private void fillAffixMap(FST<IntsRef> fst, AffixKind kind) {
char flag = dictionary.affixData(id, AFFIX_FLAG);
var entry =
new AffixEntry(id, flag, kind, toString(kind, io.input), strip(id), condition(id));
affixes.computeIfAbsent(flag, __ -> new ArrayList<>()).add(entry);
List<AffixEntry> entries;
int index = affixes.indexOf(flag);
if (index < 0) {
entries = new ArrayList<>();
affixes.indexInsert(index, flag, entries);
} else {
entries = affixes.indexGet(index);
}
entries.add(entry);
}
}
} catch (IOException e) {
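CharObjectHashMap has no computeIfAbsent, so the new fillAffixMap uses the HPPC index API instead: indexOf returns a negative slot when the key is absent, indexInsert fills that slot, and indexGet reads an occupied one, so the key is hashed and located only once either way. A self-contained sketch of the idiom; groupByFirstChar is a made-up helper for illustration, not part of the commit:

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.hppc.CharObjectHashMap;

public class IndexInsertSketch {
  // Hypothetical helper: group words by first character with one hash lookup per word.
  static CharObjectHashMap<List<String>> groupByFirstChar(String... words) {
    CharObjectHashMap<List<String>> groups = new CharObjectHashMap<>();
    for (String word : words) {
      char key = word.charAt(0);
      int index = groups.indexOf(key); // >= 0 if present, negative encoded slot if absent
      List<String> bucket;
      if (index < 0) {
        bucket = new ArrayList<>();
        groups.indexInsert(index, key, bucket); // insert at the slot indexOf already located
      } else {
        bucket = groups.indexGet(index); // read the slot indexOf already located
      }
      bucket.add(word);
    }
    return groups;
  }

  public static void main(String[] args) {
    System.out.println(groupByFirstChar("stem", "suffix", "prefix").get('s')); // [stem, suffix]
  }
}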
@@ -163,11 +173,7 @@ private static char[] sortAndDeduplicate(char[] flags) {
}

private static char[] deduplicate(char[] flags) {
Set<Character> set = new HashSet<>();
for (char flag : flags) {
set.add(flag);
}
return toSortedCharArray(set);
return toSortedCharArray(CharHashSet.from(flags));
}

/**
@@ -442,7 +448,7 @@ boolean processStemCandidate(
int innerSuffix) {
String candidate = new String(word, offset, length);
stemCounts.merge(candidate, 1, Integer::sum);
Set<Character> flags = new LinkedHashSet<>();
CharHashSet flags = new CharHashSet();
if (outerPrefix >= 0) flags.add(dictionary.affixData(outerPrefix, AFFIX_FLAG));
if (innerPrefix >= 0) flags.add(dictionary.affixData(innerPrefix, AFFIX_FLAG));
if (outerSuffix >= 0) flags.add(dictionary.affixData(outerSuffix, AFFIX_FLAG));
@@ -515,7 +521,7 @@ EntrySuggestion toSuggestion(State state) {
if (wordSet.contains(extra)) continue;

if (forbidden.contains(extra) && dictionary.forbiddenword != FLAG_UNSET) {
addEntry(toEdit, toAdd, extra, Set.of(dictionary.forbiddenword));
addEntry(toEdit, toAdd, extra, CharHashSet.from(dictionary.forbiddenword));
} else {
extraGenerated.add(extra);
}
@@ -525,7 +531,7 @@
}

private void addEntry(
List<DictEntry> toEdit, List<DictEntry> toAdd, String stem, Set<Character> flags) {
List<DictEntry> toEdit, List<DictEntry> toAdd, String stem, CharHashSet flags) {
String flagString = toFlagString(flags);
(existingStems.contains(stem) ? toEdit : toAdd).add(DictEntry.create(stem, flagString));
}
@@ -589,20 +595,20 @@ private Stream<String> allGenerated(Map<String, Set<FlagSet>> stemToFlags) {
.flatMap(swc -> expansionCache.computeIfAbsent(swc, expandToWords).stream());
}

private List<AffixedWord> expand(String stem, Set<Character> flagSet) {
private List<AffixedWord> expand(String stem, CharHashSet flagSet) {
return getAllWordForms(stem, toFlagString(flagSet), checkCanceled);
}

private String toFlagString(Set<Character> flagSet) {
private String toFlagString(CharHashSet flagSet) {
return dictionary.flagParsingStrategy.printFlags(Dictionary.toSortedCharArray(flagSet));
}
}

private static class FlagSet {
final Set<Character> flags;
final CharHashSet flags;
final Dictionary dictionary;

FlagSet(Set<Character> flags, Dictionary dictionary) {
FlagSet(CharHashSet flags, Dictionary dictionary) {
this.flags = flags;
this.dictionary = dictionary;
}
@@ -620,8 +626,10 @@ public int hashCode() {
return Objects.hash(flags, dictionary);
}

static Set<Character> flatten(Set<FlagSet> flagSets) {
return flagSets.stream().flatMap(f -> f.flags.stream()).collect(Collectors.toSet());
static CharHashSet flatten(Set<FlagSet> flagSets) {
CharHashSet set = new CharHashSet(flagSets.size() << 1);
flagSets.forEach(flagSet -> set.addAll(flagSet.flags));
return set;
}

@Override
JapaneseFilterUtil.java (new file)
@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja;

import java.util.Map;
import org.apache.lucene.util.hppc.CharObjectHashMap;

/** Utility methods for Japanese filters. */
class JapaneseFilterUtil {

/** Creates a primitive char-to-char map from a set of {@link java.util.Map.Entry}. */
@SafeVarargs
static CharObjectHashMap<Character> createCharMap(
Map.Entry<Character, Character>... charMappings) {
CharObjectHashMap<Character> map = new CharObjectHashMap<>(charMappings.length);
for (Map.Entry<Character, Character> charMapping : charMappings) {
map.put(charMapping.getKey(), charMapping.getValue());
}
return map;
}
}
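A usage sketch for the helper above, written as if from the same package (both the class and createCharMap are package-private); the real call sites are the hiragana and katakana uppercase filters, one of which follows:

package org.apache.lucene.analysis.ja;

import java.util.Map;
import org.apache.lucene.util.hppc.CharObjectHashMap;

class CreateCharMapSketch {
  public static void main(String[] args) {
    // Two entries are enough to show the shape; the filters pass a dozen or more.
    CharObjectHashMap<Character> mappings =
        JapaneseFilterUtil.createCharMap(Map.entry('ぁ', 'あ'), Map.entry('ぃ', 'い'));
    System.out.println(mappings.get('ぁ')); // あ
    System.out.println(mappings.get('か')); // null: this character has no small-to-normal mapping
  }
}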
JapaneseHiraganaUppercaseFilter.java
@@ -16,11 +16,14 @@
*/
package org.apache.lucene.analysis.ja;

import static org.apache.lucene.analysis.ja.JapaneseFilterUtil.createCharMap;

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.hppc.CharObjectHashMap;

/**
* A {@link TokenFilter} that normalizes small letters (捨て仮名) in hiragana into normal letters. For
@@ -30,13 +33,13 @@
* legal, contract policies, etc.
*/
public final class JapaneseHiraganaUppercaseFilter extends TokenFilter {
private static final Map<Character, Character> LETTER_MAPPINGS;
private static final CharObjectHashMap<Character> LETTER_MAPPINGS;

static {
// supported characters are:
// ぁ ぃ ぅ ぇ ぉ っ ゃ ゅ ょ ゎ ゕ ゖ
LETTER_MAPPINGS =
Map.ofEntries(
createCharMap(
Map.entry('ぁ', 'あ'),
Map.entry('ぃ', 'い'),
Map.entry('ぅ', 'う'),
@@ -59,17 +62,16 @@ public JapaneseHiraganaUppercaseFilter(TokenStream input) {

@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char[] termBuffer = termAttr.buffer();
for (int i = 0; i < termBuffer.length; i++) {
Character c = LETTER_MAPPINGS.get(termBuffer[i]);
if (c != null) {
termBuffer[i] = c;
}
}
return true;
} else {
if (!input.incrementToken()) {
return false;
}
final char[] termBuffer = termAttr.buffer();
for (int i = 0, length = termAttr.length(); i < length; i++) {
Character c = LETTER_MAPPINGS.get(termBuffer[i]);
if (c != null) {
termBuffer[i] = c;
}
}
return true;
}
}
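Besides dropping the boxed map, the rewritten loop above is bounded by termAttr.length() rather than termBuffer.length: the attribute's backing buffer can be longer than the current token, so the old loop also scanned, and pointlessly rewrote, stale characters past the end of the term. A reduced sketch of the new loop shape using plain arrays, with a local table standing in for LETTER_MAPPINGS:

import org.apache.lucene.util.hppc.CharObjectHashMap;

public class UppercaseLoopSketch {
  public static void main(String[] args) {
    CharObjectHashMap<Character> letterMappings = new CharObjectHashMap<>();
    letterMappings.put('ぁ', 'あ');

    // The reusable buffer is often larger than the token it currently holds.
    char[] termBuffer = {'ぁ', 'x', 'ぁ', 'ぁ'}; // current token is "ぁx"; the tail is stale data
    int length = 2; // what termAttr.length() would report

    for (int i = 0; i < length; i++) { // the old code iterated up to termBuffer.length instead
      Character c = letterMappings.get(termBuffer[i]);
      if (c != null) {
        termBuffer[i] = c;
      }
    }
    System.out.println(new String(termBuffer, 0, length)); // あx, the stale tail is left untouched
  }
}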