Skip to content

Commit

Permalink
fix "startOffset must be non-negative, and endOffset must be >= start…
Browse files Browse the repository at this point in the history
…Offset, and offsets must not go backwards" Error
  • Loading branch information
jiangyunpeng committed Jan 6, 2023
1 parent b18b899 commit fc50e19
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

public class PinyinTokenizer extends Tokenizer {


private static final int DEFAULT_BUFFER_SIZE = 256;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private boolean done = false;
Expand Down Expand Up @@ -156,6 +155,7 @@ public final boolean incrementToken() throws IOException {
StringBuilder buff = new StringBuilder();
int buffStartPosition = 0;
int buffSize = 0;
int[] specialCharPosition = new int[source.length()];

position = 0;

Expand Down Expand Up @@ -183,12 +183,19 @@ public final boolean incrementToken() throws IOException {
if (config.keepNoneChineseInJoinedFullPinyin) {
fullPinyinLetters.append(c);
}
}else{
//handle special charset
specialCharPosition[i]=1;
++buffSize;
}
} else {

//clean previous temp
if (buff.length() > 0) {
buffSize = parseBuff(buff, buffSize, buffStartPosition);
buffSize = parseBuff(buff, buffSize, buffStartPosition,specialCharPosition);
}else{
//clean buffSize
buffSize=0;
}

boolean incrPosition = false;
Expand Down Expand Up @@ -219,7 +226,7 @@ public final boolean incrementToken() throws IOException {

//clean previous temp
if (buff.length() > 0) {
buffSize = parseBuff(buff, buffSize, buffStartPosition);
buffSize = parseBuff(buff, buffSize, buffStartPosition,specialCharPosition);
}
}

Expand Down Expand Up @@ -270,14 +277,18 @@ public final boolean incrementToken() throws IOException {
return false;
}

private int parseBuff(StringBuilder buff, int buffSize, int buffPosition) {
private int parseBuff(StringBuilder buff, int buffSize, int buffPosition,int[] specialCharPosition) {
if (config.keepNoneChinese) {
if (config.noneChinesePinyinTokenize) {
List<String> result = PinyinAlphabetTokenizer.walk(buff.toString());
int start = (lastOffset - buffSize + 1);
for (int i = 0; i < result.size(); i++) {
int end;
String t = result.get(i);
//skip special charset
if(specialCharPosition[start]==1){
++start;
}
if (config.fixedPinyinOffset) {
end = start + 1;
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1485,4 +1485,57 @@ public void TestPinyinPosition4() throws IOException {


}

/**
 * Checks the emitted terms, offsets and positions for two inputs that mix
 * Latin letters, digits, punctuation and Chinese characters, with
 * first-letter output enabled, original text disabled and pinyin offsets
 * not ignored.
 */
@Test
public void TestPinyinPosition5() throws IOException {
    String mixedInput = "WC-20%权益";
    String parenInput = "刘德华(香港)精选M类";

    PinyinConfig config = new PinyinConfig();
    config.keepFirstLetter = true;
    config.keepOriginal = false;
    config.ignorePinyinOffset = false;

    HashMap<String, ArrayList<TermItem>> termsByInput =
            getStringArrayListHashMap(new String[]{mixedInput, parenInput}, config);

    // ---- first input: letters, '-', digits, '%', then two Chinese characters ----
    ArrayList<TermItem> tokens = termsByInput.get(mixedInput);

    TermItem token = tokens.get(0);
    Assert.assertEquals("w", token.term);
    Assert.assertEquals(0, token.startOffset);
    Assert.assertEquals(1, token.endOffset);
    Assert.assertEquals(1, token.position);

    // joined first-letter term
    token = tokens.get(1);
    Assert.assertEquals("wc20qy", token.term);
    Assert.assertEquals(0, token.startOffset);
    Assert.assertEquals(6, token.endOffset);
    Assert.assertEquals(1, token.position);

    token = tokens.get(2);
    Assert.assertEquals("c", token.term);
    Assert.assertEquals(1, token.startOffset);
    Assert.assertEquals(2, token.endOffset);
    Assert.assertEquals(2, token.position);

    // "20" is expected to start at offset 3, i.e. after the '-' at offset 2
    token = tokens.get(3);
    Assert.assertEquals("20", token.term);
    Assert.assertEquals(3, token.startOffset);
    Assert.assertEquals(5, token.endOffset);
    Assert.assertEquals(3, token.position);

    // "quan" is expected to start at offset 6, i.e. after the '%' at offset 5
    token = tokens.get(4);
    Assert.assertEquals("quan", token.term);
    Assert.assertEquals(6, token.startOffset);
    Assert.assertEquals(7, token.endOffset);
    Assert.assertEquals(4, token.position);

    token = tokens.get(5);
    Assert.assertEquals("yi", token.term);
    Assert.assertEquals(7, token.startOffset);
    Assert.assertEquals(8, token.endOffset);
    Assert.assertEquals(5, token.position);

    // ---- second input: only the tokens at index 7 and 8 are checked ----
    tokens = termsByInput.get(parenInput);

    token = tokens.get(7);
    Assert.assertEquals("xuan", token.term);
    Assert.assertEquals(8, token.startOffset);
    Assert.assertEquals(9, token.endOffset);
    Assert.assertEquals(7, token.position);

    token = tokens.get(8);
    Assert.assertEquals("m", token.term);
    Assert.assertEquals(9, token.startOffset);
    Assert.assertEquals(10, token.endOffset);
    Assert.assertEquals(8, token.position);
}
}

0 comments on commit fc50e19

Please sign in to comment.