-
Notifications
You must be signed in to change notification settings - Fork 669
/
Copy pathStringCoding.java
1355 lines (1195 loc) · 57.3 KB
/
StringCoding.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.lang;
import jdk.internal.HotSpotIntrinsicCandidate;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;
import sun.nio.cs.HistoricallyNamedCharset;
import java.io.UnsupportedEncodingException;
import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.MalformedInputException;
import java.nio.charset.UnmappableCharacterException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import static java.lang.Character.highSurrogate;
import static java.lang.Character.isSupplementaryCodePoint;
import static java.lang.Character.isSurrogate;
import static java.lang.Character.lowSurrogate;
import static java.lang.String.COMPACT_STRINGS;
import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.StringUTF16.putChar;
// 用于解码[decoding]/编码[encoding]字符串的工具类
class StringCoding {
// Per-thread cache of a StringDecoder, held through a SoftReference
private static final ThreadLocal<SoftReference<StringDecoder>> decoder = new ThreadLocal<>();
// Per-thread cache of a StringEncoder, held through a SoftReference
private static final ThreadLocal<SoftReference<StringEncoder>> encoder = new ThreadLocal<>();
/* The cached Result for each thread */
// Each thread gets its own Result instance, created by initialValue() below;
// the decode helpers fetch it with resultCached.get() and fill it in via with(...)
private static final ThreadLocal<Result> resultCached = new ThreadLocal<>() {
protected Result initialValue() {
return new Result();
}
};
// Cached instances of the three charsets that have dedicated fast paths in this class
private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
// Replacement character U+FFFD written by the decoders in place of bytes that cannot be decoded.
// NOTE(review): not declared final although no visible code reassigns it — confirm against the rest of the file.
private static char repl = '\ufffd';
// Not instantiable: this class only exposes static helpers
private StringCoding() {
}
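// Returns true if any byte in ba[off, off+len) is negative, i.e. lies outside the ASCII range [0x00, 0x80); a false result means the range is pure ASCII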
@HotSpotIntrinsicCandidate
// Scans ba[off, off + len) and reports whether it contains at least one negative byte.
// A byte is negative exactly when its high bit is set, so a false result means that
// every byte in the range is a 7-bit ASCII value in [0x00, 0x80).
public static boolean hasNegatives(byte[] ba, int off, int len) {
    final int end = off + len;
    int idx = off;
    while(idx < end) {
        // Stop at the first byte with the sign bit set
        if(ba[idx++] < 0) {
            return true;
        }
    }
    return false;
}
/*▼ encode ████████████████████████████████████████████████████████████████████████████████┓ */
// Encodes a String's internal bytes (val, interpreted according to coder) with the
// JVM default charset and returns the encoded byte[].
// UTF-8, ISO-8859-1 and US-ASCII are dispatched to dedicated encoders; any other
// charset goes through a StringEncoder that is cached for the current thread.
static byte[] encode(byte coder, byte[] val) {
    final Charset defaultCs = Charset.defaultCharset();
    if(defaultCs == UTF_8) {
        // Replacement is enabled: unencodable code units become a single '?'
        return encodeUTF8(coder, val, true);
    } else if(defaultCs == ISO_8859_1) {
        return encode8859_1(coder, val);
    } else if(defaultCs == US_ASCII) {
        return encodeASCII(coder, val);
    }
    // Reuse this thread's cached encoder only if it targets the same charset name
    StringEncoder cached = deref(encoder);
    final boolean reusable = (cached != null) && defaultCs.name().equals(cached.cs.name());
    if(!reusable) {
        cached = new StringEncoder(defaultCs, defaultCs.name());
        // Remember the new encoder for subsequent calls on this thread
        set(encoder, cached);
    }
    return cached.encode(coder, val);
}
// Encodes a String's internal bytes with the charset named charsetName
// (a null name is treated as "ISO-8859-1") and returns the encoded byte[].
// Throws UnsupportedEncodingException when no charset can be resolved for the name.
static byte[] encode(String charsetName, byte coder, byte[] val) throws UnsupportedEncodingException {
    final StringEncoder cached = deref(encoder);
    final String csn = (charsetName != null) ? charsetName : "ISO-8859-1";
    // The cached encoder is usable when the name matches either the name it was
    // requested under or the name it reports for its charset
    final boolean cacheHit = (cached != null)
        && (csn.equals(cached.requestedCharsetName()) || csn.equals(cached.charsetName()));
    if(cacheHit) {
        return cached.encode(coder, val);
    }
    StringEncoder fresh = null;
    try {
        final Charset cs = lookupCharset(csn);
        if(cs != null) {
            // Built-in charsets bypass the encoder cache entirely
            if(cs == UTF_8) {
                return encodeUTF8(coder, val, true);
            }
            if(cs == ISO_8859_1) {
                return encode8859_1(coder, val);
            }
            if(cs == US_ASCII) {
                return encodeASCII(coder, val);
            }
            fresh = new StringEncoder(cs, csn);
        }
    } catch(IllegalCharsetNameException ignored) {
        // An illegal name is reported below as an unsupported encoding
    }
    if(fresh == null) {
        throw new UnsupportedEncodingException(csn);
    }
    // Cache the new encoder for the current thread
    set(encoder, fresh);
    return fresh.encode(coder, val);
}
// Encodes a String's internal bytes (val, interpreted according to coder) with charset cs
// and returns the encoded byte[]. Unencodable input is replaced rather than reported.
static byte[] encode(Charset cs, byte coder, byte[] val) {
if(cs == UTF_8) {
// UTF-8 fast path with replacement enabled (an unpaired surrogate becomes a single '?')
return encodeUTF8(coder, val, true);
}
if(cs == ISO_8859_1) {
// ISO-8859-1 fast path (chars above 0xFF become a single '?')
return encode8859_1(coder, val);
}
if(cs == US_ASCII) {
// US-ASCII fast path (non-ASCII chars become a single '?')
return encodeASCII(coder, val);
}
/* Any other charset: go through a fresh CharsetEncoder (never cached here). */
CharsetEncoder ce = cs.newEncoder();
// fastpath for ascii compatible
if(coder == LATIN1
&& ((ce instanceof ArrayEncoder)
&& ((ArrayEncoder) ce).isASCIICompatible()
&& !hasNegatives(val, 0, val.length))) { // LATIN1 value with no negative byte, i.e. pure ASCII: the bytes are already the encoding
return Arrays.copyOf(val, val.length);
}
int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
// Output buffer sized from the char count and the encoder's maxBytesPerChar
// (presumably a worst-case size; scale() is defined elsewhere in this file)
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
if(len == 0) {
return ba;
}
// Replace, rather than report, malformed and unmappable input
ce.onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE).reset();
if(ce instanceof ArrayEncoder) {
// Array-based encoding straight from the String's internal bytes; a result of -1 falls through to the buffer path below
int blen = (coder == LATIN1) ? ((ArrayEncoder) ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder) ce).encodeFromUTF16(val, 0, len, ba);
if(blen != -1) {
return safeTrim(ba, blen, true);
}
}
// Trusted when the charset class has no class loader (getClassLoader0() == null) or no security manager is installed;
// the flag is handed to safeTrim (presumably it decides whether the result must be defensively copied — confirm in safeTrim)
boolean isTrusted = cs.getClass().getClassLoader0() == null || System.getSecurityManager() == null;
// Generic path: materialize the chars and run the encoder over NIO buffers
char[] ca = (coder == LATIN1) ? StringLatin1.toChars(val) : StringUTF16.toChars(val);
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, 0, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if(!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if(!cr.isUnderflow())
cr.throwException();
} catch(CharacterCodingException x) {
// Both error actions are REPLACE, so a coding exception is not expected here; surface it as an Error
throw new Error(x);
}
return safeTrim(ba, bb.position(), isTrusted);
}
/**
* Encodes a String's internal bytes as UTF-8.
*
* @param coder     the String's coder: LATIN1 (one byte per char) or UTF16 (two bytes per char)
* @param val       the String's internal byte array
* @param doReplace forwarded to encodeUTF8_UTF16 for UTF16-coded values; it selects between
*                  substituting '?' for an unpaired surrogate and throwing
*
* @return a new byte[] containing the UTF-8 encoding
*/
// Encodes a String and returns the bytes in UTF-8 format
private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
    if(coder == UTF16) {
        return encodeUTF8_UTF16(val, doReplace);
    }
    // LATIN1 value from here on
    final int srcLen = val.length;
    if(!hasNegatives(val, 0, srcLen)) {
        // Pure ASCII: every char is already its own one-byte UTF-8 form
        return Arrays.copyOf(val, srcLen);
    }
    // At least one char is in [0x80, 0xFF]; each such char needs two UTF-8 bytes,
    // so twice the input length is always sufficient
    final byte[] out = new byte[srcLen << 1];
    int written = 0;
    for(int i = 0; i < srcLen; i++) {
        final byte b = val[i];
        if(b >= 0) {
            // [0x00, 0x80): one byte, 0xxx-xxxx
            out[written++] = b;
        } else {
            // [0x80, 0xFF]: two bytes, 110x-xxxx | 10xx-xxxx
            out[written++] = (byte) (0xc0 | ((b & 0xff) >> 6));
            out[written++] = (byte) (0x80 | (b & 0x3f));
        }
    }
    // Trim the unused tail unless the buffer was filled exactly
    return (written == out.length) ? out : Arrays.copyOf(out, written);
}
/**
* Encodes the bytes of a UTF16-coded String as UTF-8.
*
* @param val       internal bytes of a UTF16-coded String; chars are read with StringUTF16.getChar,
*                  so val.length >> 1 is the number of chars
* @param doReplace if true, an unpaired surrogate is written as a single '?' byte;
*                  if false, throwUnmappable is called for it instead
*
* @return a byte[] containing the UTF-8 encoding, sized to exactly the bytes written
*/
// Encodes a UTF16-coded String and returns the bytes in UTF-8 format
private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
int dp = 0; // write cursor into dst: number of UTF-8 bytes produced so far
int sp = 0; // read cursor into val, counted in chars
// Number of chars stored in val (two bytes per char)
int sl = val.length >> 1;
/*
* Allocate the output buffer at its worst-case size.
*
* A single char encodes to 1, 2 or 3 UTF-8 bytes, and a surrogate pair (two chars)
* encodes to 4 bytes, so three bytes per char is always enough.
*
* NOTE(review): sl * 3 is an int multiplication and would overflow for very large
* inputs — confirm whether such sizes are reachable by callers.
*/
byte[] dst = new byte[sl * 3];
char c;
/*
* Fast loop for a leading run of chars below 0x80 (ASCII):
* each one is narrowed to a single byte (e.g. 0x0035 -> 0x35).
*/
while(sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
// ascii fast loop;
dst[dp++] = (byte) c;
sp++;
}
// General loop, starting at the first non-ASCII char; ASCII chars may still appear later
while(sp < sl) {
c = StringUTF16.getChar(val, sp++);
/*
* (1) [0x0, 0x80):         one byte:    0xxx-xxxx
* (2) [0x80, 0x800):       two bytes:   110x-xxxx|10xx-xxxx
* (3) [0x800, 0xFFFF]:     three bytes: 1110-xxxx|10xx-xxxx|10xx-xxxx (surrogates excluded)
* (4) surrogate pair:      four bytes:  1111-0xxx|10xx-xxxx|10xx-xxxx|10xx-xxxx
*/
if(c < 0x80) { // (1)
dst[dp++] = (byte) c;
} else if(c < 0x800) { // (2)
dst[dp++] = (byte) (0xc0 | (c >> 6)); // lead byte: the bits above the low 6
dst[dp++] = (byte) (0x80 | (c & 0x3f)); // continuation byte: the low 6 bits
} else if(Character.isSurrogate(c)) { // (4) char lies in the surrogate range
int uc = -1;
char c2;
if(Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
// Valid high/low surrogate pair -> combine into one code point
uc = Character.toCodePoint(c, c2);
}
// uc is still -1: a high or low surrogate appeared without a valid partner
if(uc < 0) {
if(doReplace) {
// Replace the unpaired surrogate (one char) with a single '?' byte
dst[dp++] = '?';
} else {
// Replacement not allowed: report the unmappable char at index sp - 1
throwUnmappable(sp - 1, 1); // or 2, does not matter here
}
} else {
dst[dp++] = (byte) (0xf0 | ((uc >> 18)));
dst[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f));
dst[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f));
dst[dp++] = (byte) (0x80 | (uc & 0x3f));
sp++; // 2 chars
}
} else { // (3) 3 bytes, 16 bits
dst[dp++] = (byte) (0xe0 | ((c >> 12)));
dst[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f));
dst[dp++] = (byte) (0x80 | (c & 0x3f));
}
}
// The buffer was filled exactly: return it as is
if(dp == dst.length) {
return dst;
}
// Otherwise drop the unused tail
return Arrays.copyOf(dst, dp);
}
// Encodes a String and returns the bytes in ISO-8859-1 format.
// Convenience overload that always enables replacement: chars that cannot be
// represented are written as a single '?' byte instead of raising an error.
private static byte[] encode8859_1(byte coder, byte[] val) {
    final boolean replaceUnmappable = true;
    return encode8859_1(coder, val, replaceUnmappable);
}
/**
* Encodes a String's internal bytes as ISO-8859-1.
*
* @param coder     the String's coder: LATIN1 or UTF16
* @param val       the String's internal byte array
* @param doReplace if true, a char above 0xFF (or a whole surrogate pair) is written as a single '?' byte;
*                  if false, throwUnmappable is called for it instead
* @return a byte[] containing the ISO-8859-1 encoding
*/
// Encodes a String and returns the bytes in ISO-8859-1 format
private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) {
// LATIN1-coded value: the internal bytes are already ISO-8859-1
if(coder == LATIN1) {
// Identical representation, so a plain copy is the result
return Arrays.copyOf(val, val.length);
}
/* Otherwise the value is UTF16-coded and must be narrowed char by char. */
// len: number of chars still to be encoded; starts as the total char count of val
int len = val.length >> 1;
// Output buffer: at most one byte per input char
byte[] dst = new byte[len];
int dp = 0; // write cursor into dst
int sp = 0; // read cursor into val, counted in chars
// Total number of chars to encode
int sl = len;
while(sp < sl) {
// Bulk-encode up to len chars; ret is the number handled before stopping
int ret = implEncodeISOArray(val, sp, dst, dp, len);
sp = sp + ret;
dp = dp + ret;
// The bulk encoder stopped early: the char at sp is above 0xFF
if(ret != len) {
// Replacement not allowed: report the unmappable char
if(!doReplace) {
throwUnmappable(sp, 1);
}
// Read the offending char and step past it
char c = StringUTF16.getChar(val, sp++);
// A valid surrogate pair is consumed as one unit
if(Character.isHighSurrogate(c) && sp < sl && Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
sp++;
}
// One '?' byte stands in for the unmappable char or surrogate pair
dst[dp++] = '?';
// Recompute how many chars remain for the next bulk call
len = sl - sp;
}
}
// The buffer was filled exactly
if(dp == dst.length) {
return dst;
}
// Drop the unused tail (left over when surrogate pairs collapsed into one byte)
return Arrays.copyOf(dst, dp);
}
/**
* Encodes a String's internal bytes as US-ASCII. Chars outside [0x00, 0x80) are never
* reported; each one (or each valid surrogate pair) is replaced by a single '?' byte.
*
* @param coder the String's coder: LATIN1 or UTF16
* @param val   the String's internal byte array
* @return a new byte[] containing the ASCII encoding
*/
// Encodes a String and returns the bytes in ASCII format, substituting '?' where needed
private static byte[] encodeASCII(byte coder, byte[] val) {
    if(coder == LATIN1) {
        // One output byte per input byte; negative bytes are chars in [0x80, 0xFF]
        final byte[] out = new byte[val.length];
        for(int i = 0; i < val.length; i++) {
            out[i] = (val[i] < 0) ? (byte) '?' : val[i];
        }
        return out;
    }
    // UTF16-coded value: walk it char by char
    final int total = val.length >> 1;
    final byte[] out = new byte[total];
    int written = 0;
    int idx = 0;
    while(idx < total) {
        final char ch = StringUTF16.getChar(val, idx++);
        if(ch < 0x80) {
            out[written++] = (byte) ch;
            continue;
        }
        // Non-ASCII char; a valid surrogate pair is swallowed as one unit
        if(Character.isHighSurrogate(ch) && idx < total && Character.isLowSurrogate(StringUTF16.getChar(val, idx))) {
            idx++;
        }
        out[written++] = '?';
    }
    // Shorter than allocated only when surrogate pairs collapsed into one '?'
    return (total == written) ? out : Arrays.copyOf(out, written);
}
/**
* Narrows chars of a UTF16-coded byte array to ISO-8859-1 bytes, stopping at the first
* char that does not fit.
*
* @param sa  source bytes of a UTF16-coded String
* @param sp  index (in chars) of the first char to read from sa
* @param da  destination byte array
* @param dp  index of the first byte to write in da
* @param len maximum number of chars to convert
*
* @return the number of chars converted before stopping
*/
// Encodes chars of a UTF16-coded String and returns how many were written as ISO-8859-1
@HotSpotIntrinsicCandidate
private static int implEncodeISOArray(byte[] sa, int sp, byte[] da, int dp, int len) {
    int converted = 0;
    while(converted < len) {
        final char unit = StringUTF16.getChar(sa, sp + converted);
        // Chars above 0xFF have no ISO-8859-1 form: stop and let the caller deal with it
        if(unit > '\u00FF') {
            break;
        }
        da[dp + converted] = (byte) unit;
        converted++;
    }
    return converted;
}
/**
* Throws iae, instead of replacing, if unmappable.
*/
// Encodes String s as UTF-8 with replacement disabled: an unmappable char raises
// an exception instead of being substituted.
static byte[] getBytesUTF8NoRepl(String s) {
    final byte coder = s.coder();
    final byte[] value = s.value();
    return encodeUTF8(coder, value, false);
}
/**
* Throws CCE, instead of replacing, if unmappable.
*/
// Encodes String s with charset cs without replacement. The IllegalArgumentException
// raised by getBytesNoRepl1 is unwrapped and its cause is rethrown as a checked
// CharacterCodingException.
static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException {
    try {
        return getBytesNoRepl1(s, cs);
    } catch(IllegalArgumentException iae) {
        // getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause
        final Throwable root = iae.getCause();
        throw (root instanceof UnmappableCharacterException)
            ? (UnmappableCharacterException) root
            : (CharacterCodingException) root;
    }
}
// Encodes String s with charset cs without substituting unmappable characters.
// Failures surface as an IllegalArgumentException (on the generic path its cause is the
// CharacterCodingException); getBytesNoRepl unwraps it into a checked exception.
// Note: several fast paths return the array obtained from s.value() itself, not a copy.
static byte[] getBytesNoRepl1(String s, Charset cs) {
    byte[] val = s.value();
    byte coder = s.coder();
    // UTF-8 target
    if(cs == UTF_8) {
        // The internal bytes are already valid UTF-8 only for a LATIN1-coded, pure-ASCII value.
        // The coder check is required: a UTF16-coded value can consist solely of non-negative
        // bytes as well (e.g. "\u0100" is stored as 0x00/0x01), and returning those raw
        // UTF-16 bytes would produce garbage instead of UTF-8.
        if(coder == LATIN1 && isASCII(val)) {
            return val;
        }
        return encodeUTF8(coder, val, false);
    }
    // ISO-8859-1 target
    if(cs == ISO_8859_1) {
        if(coder == LATIN1) {
            // LATIN1 internal bytes are the ISO-8859-1 encoding
            return val;
        }
        return encode8859_1(coder, val, false);
    }
    // US-ASCII target
    if(cs == US_ASCII) {
        if(coder == LATIN1) {
            if(isASCII(val)) {
                return val;
            } else {
                throwUnmappable(val);
            }
        }
        // UTF16-coded values fall through to the generic encoder below
    }
    CharsetEncoder ce = cs.newEncoder();
    // fastpath for ascii compatible
    if(coder == LATIN1 && (((ce instanceof ArrayEncoder) && ((ArrayEncoder) ce).isASCIICompatible() && isASCII(val)))) {
        return val;
    }
    int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
    // Output buffer sized from the char count and the encoder's maxBytesPerChar
    int en = scale(len, ce.maxBytesPerChar());
    byte[] ba = new byte[en];
    if(len == 0) {
        return ba;
    }
    if(ce instanceof ArrayEncoder) {
        // Array-based encoding straight from the internal bytes; -1 falls through to the buffer path
        int blen = (coder == LATIN1) ? ((ArrayEncoder) ce).encodeFromLatin1(val, 0, len, ba) : ((ArrayEncoder) ce).encodeFromUTF16(val, 0, len, ba);
        if(blen != -1) {
            return safeTrim(ba, blen, true);
        }
    }
    // Trusted when the charset class has no class loader or no security manager is installed
    boolean isTrusted = cs.getClass().getClassLoader0() == null || System.getSecurityManager() == null;
    char[] ca = (coder == LATIN1) ? StringLatin1.toChars(val) : StringUTF16.toChars(val);
    ByteBuffer bb = ByteBuffer.wrap(ba);
    CharBuffer cb = CharBuffer.wrap(ca, 0, len);
    try {
        // The encoder's error actions are left untouched here (no REPLACE is configured),
        // so any non-underflow result is turned into an exception
        CoderResult cr = ce.encode(cb, bb, true);
        if(!cr.isUnderflow()) {
            cr.throwException();
        }
        cr = ce.flush(bb);
        if(!cr.isUnderflow()) {
            cr.throwException();
        }
    } catch(CharacterCodingException x) {
        // Wrapped so the caller can unwrap the cause
        throw new IllegalArgumentException(x);
    }
    return safeTrim(ba, bb.position(), isTrusted);
}
/*▲ encode ████████████████████████████████████████████████████████████████████████████████┛ */
/*▼ decode ████████████████████████████████████████████████████████████████████████████████┓ */
// Decodes ba[off, off + len) with the JVM default charset and returns the Result.
// UTF-8, ISO-8859-1 and US-ASCII are dispatched to dedicated decoders; any other
// charset goes through a StringDecoder that is cached for the current thread.
static Result decode(byte[] ba, int off, int len) {
    final Charset defaultCs = Charset.defaultCharset();
    if(defaultCs == UTF_8) {
        return decodeUTF8(ba, off, len, true);
    } else if(defaultCs == ISO_8859_1) {
        return decodeLatin1(ba, off, len);
    } else if(defaultCs == US_ASCII) {
        return decodeASCII(ba, off, len);
    }
    // Reuse this thread's cached decoder only if it targets the same charset name
    StringDecoder cached = deref(decoder);
    final boolean reusable = (cached != null) && defaultCs.name().equals(cached.cs.name());
    if(!reusable) {
        cached = new StringDecoder(defaultCs, defaultCs.name());
        // Remember the new decoder for subsequent calls on this thread
        set(decoder, cached);
    }
    return cached.decode(ba, off, len);
}
// Decodes ba[off, off + len) with the charset named charsetName
// (a null name is treated as "ISO-8859-1") and returns the Result.
// Throws UnsupportedEncodingException when no charset can be resolved for the name.
static Result decode(String charsetName, byte[] ba, int off, int len) throws UnsupportedEncodingException {
    final StringDecoder cached = deref(decoder);
    final String csn = (charsetName != null) ? charsetName : "ISO-8859-1";
    // The cached decoder is usable when the name matches either the name it was
    // requested under or the name it reports for its charset
    final boolean cacheHit = (cached != null)
        && (csn.equals(cached.requestedCharsetName()) || csn.equals(cached.charsetName()));
    if(cacheHit) {
        return cached.decode(ba, off, len);
    }
    StringDecoder fresh = null;
    try {
        final Charset cs = lookupCharset(csn);
        if(cs != null) {
            // Built-in charsets bypass the decoder cache entirely
            if(cs == UTF_8) {
                return decodeUTF8(ba, off, len, true);
            }
            if(cs == ISO_8859_1) {
                return decodeLatin1(ba, off, len);
            }
            if(cs == US_ASCII) {
                return decodeASCII(ba, off, len);
            }
            fresh = new StringDecoder(cs, csn);
        }
    } catch(IllegalCharsetNameException ignored) {
        // An illegal name is reported below as an unsupported encoding
    }
    if(fresh == null) {
        throw new UnsupportedEncodingException(csn);
    }
    // Cache the new decoder for the current thread
    set(decoder, fresh);
    return fresh.decode(ba, off, len);
}
// Decodes ba[off, off + len) with charset cs and returns the Result.
// Malformed or unmappable input is replaced rather than reported.
static Result decode(Charset cs, byte[] ba, int off, int len) {
// Dedicated fast paths for the three built-in charsets
if(cs == UTF_8) {
return decodeUTF8(ba, off, len, true);
}
if(cs == ISO_8859_1) {
return decodeLatin1(ba, off, len);
}
if(cs == US_ASCII) {
return decodeASCII(ba, off, len);
}
/*
* (1) We never cache the "external" cs, the only benefit of creating an additional StringDe/Encoder object to wrap it is to share the de/encode() method.
* These SD/E objects are short-lived, the young-gen gc should be able to take care of them well.
* But the best approach is still not to generate them if not really necessary.
* (2) The defensive copy of the input byte/char[] has a big performance impact, as well as the outgoing result byte/char[].
* Need to do the optimization check of (sm==null && classLoader0==null) for both.
* (3) There might be a timing gap in isTrusted setting. getClassLoader0() is only checked (and then isTrusted gets set) when (SM==null).
* It is possible that the SM==null for now but then SM is NOT null later when safeTrim() is invoked...
* the "safe" way to do is to redundant check (... && (isTrusted || SM == null || getClassLoader0())) in trim but it then can be argued that the SM is null
* when the operation is started...
*/
CharsetDecoder cd = cs.newDecoder();
// ascii fastpath
if((cd instanceof ArrayDecoder)
&& ((ArrayDecoder) cd).isASCIICompatible()
&& !hasNegatives(ba, off, len)) { // no negative byte in the range, i.e. pure ASCII input: decode it as Latin1
return decodeLatin1(ba, off, len);
}
// Output size derived from the byte count and the decoder's maxCharsPerByte
// (presumably a worst-case size; scale() is defined elsewhere in this file)
int en = scale(len, cd.maxCharsPerByte());
if(len == 0) {
// Nothing to decode: return a fresh Result initialised through the no-arg with()
return new Result().with();
}
// Replace, rather than report, malformed and unmappable input
cd.onMalformedInput(CodingErrorAction.REPLACE).onUnmappableCharacter(CodingErrorAction.REPLACE).reset();
char[] ca = new char[en];
if(cd instanceof ArrayDecoder) {
// Array-based decoding straight into ca; clen is the number of chars produced
int clen = ((ArrayDecoder) cd).decode(ba, off, len, ca);
return new Result().with(ca, 0, clen);
}
// Charset class has a class loader and a security manager is installed:
// hand the decoder a private copy of the input range instead of the caller's array
if(cs.getClass().getClassLoader0() != null && System.getSecurityManager() != null) {
ba = Arrays.copyOfRange(ba, off, off + len);
off = 0;
}
// Generic path: run the decoder over NIO buffers
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if(!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if(!cr.isUnderflow())
cr.throwException();
} catch(CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return new Result().with(ca, 0, cb.position());
}
// Decodes ba[off, off + len) as US-ASCII and returns this thread's cached Result.
// Pure-ASCII input under compact strings is stored as LATIN1; otherwise every byte is
// widened to a char, with negative (non-ASCII) bytes replaced by repl.
private static Result decodeASCII(byte[] ba, int off, int len) {
    final Result cachedResult = resultCached.get();
    if(COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
        final byte[] asciiCopy = Arrays.copyOfRange(ba, off, off + len);
        return cachedResult.with(asciiCopy, LATIN1);
    }
    // Two bytes per output char
    final byte[] utf16 = new byte[len << 1];
    for(int i = 0; i < len; i++) {
        final int b = ba[off + i];
        final char ch = (b >= 0) ? (char) b : repl;
        putChar(utf16, i, ch);
    }
    return cachedResult.with(utf16, UTF16);
}
// Decodes ba[off, off + len) as Latin1 (ISO-8859-1) and returns this thread's cached Result.
// With compact strings the bytes are copied unchanged and tagged LATIN1; without them
// the bytes are inflated by StringLatin1.inflate and tagged UTF16.
private static Result decodeLatin1(byte[] ba, int off, int len) {
    final Result cachedResult = resultCached.get();
    if(!COMPACT_STRINGS) {
        return cachedResult.with(StringLatin1.inflate(ba, off, len), UTF16);
    }
    final byte[] latin1Copy = Arrays.copyOfRange(ba, off, off + len);
    return cachedResult.with(latin1Copy, LATIN1);
}
// Decodes src[sp, sp + len) as UTF-8 and returns the Result.
// Pure-ASCII input under compact strings is copied directly as LATIN1; everything
// else is handed to the full decoder decodeUTF8_0.
private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
    // ascii-bias, which has a relative impact to the non-ascii-only bytes
    final boolean needsFullDecode = !COMPACT_STRINGS || hasNegatives(src, sp, len);
    if(needsFullDecode) {
        return decodeUTF8_0(src, sp, len, doReplace);
    }
    final byte[] asciiCopy = Arrays.copyOfRange(src, sp, sp + len);
    return resultCached.get().with(asciiCopy, LATIN1);
}
// Decodes src[sp, sp + len) as UTF-8 and returns this thread's cached Result.
// With compact strings it first tries to produce a LATIN1 result and only switches to
// UTF16 when a char above 0xFF or malformed input is met.
// When doReplace is true, malformed input is replaced by repl; otherwise throwMalformed is called.
private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
Result ret = resultCached.get();
int sl = sp + len; // exclusive end index in src
int dp = 0; // number of chars produced so far
byte[] dst = new byte[len];
if(COMPACT_STRINGS) {
// Phase 1: optimistic Latin1 decode, one output byte per char
while(sp < sl) {
int b1 = src[sp];
if(b1 >= 0) {
// ASCII byte: copied as is
dst[dp++] = (byte) b1;
sp++;
continue;
}
// Lead bytes 0xC2/0xC3 start the two-byte sequences for U+0080..U+00FF, which still fit in Latin1
if((b1 == (byte) 0xc2 || b1 == (byte) 0xc3) && sp + 1 < sl) {
int b2 = src[sp + 1];
if(!isNotContinuation(b2)) {
// XOR-ing with the combined marker constant strips the lead/continuation marker bits
dst[dp++] = (byte) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0)));
sp += 2;
continue;
}
}
// anything not a latin1, including the repl we have to go with the utf16
break;
}
if(sp == sl) {
// The whole input fitted in Latin1: trim to the chars written and return
if(dp != dst.length) {
dst = Arrays.copyOf(dst, dp);
}
return ret.with(dst, LATIN1);
}
}
// Phase 2: switch to a UTF16 buffer (two bytes per char, at most len chars)
if(dp == 0) {
dst = new byte[len << 1];
} else {
// Carry over the Latin1 chars decoded so far, inflated to two bytes each
byte[] buf = new byte[len << 1];
StringLatin1.inflate(dst, 0, buf, 0, dp);
dst = buf;
}
while(sp < sl) {
int b1 = src[sp++];
if(b1 >= 0) {
// One-byte sequence (ASCII)
putChar(dst, dp++, (char) b1);
} else if((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
// Two-byte lead 110x-xxxx; the (b1 & 0x1e) test rejects 0xC0/0xC1, which fall to the final else branch
if(sp < sl) {
int b2 = src[sp++];
if(isNotContinuation(b2)) {
if(!doReplace) {
throwMalformed(sp - 1, 1);
}
putChar(dst, dp++, repl);
// Un-read b2 so that it is examined again as the start of the next sequence
sp--;
} else {
putChar(dst, dp++, (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80 << 0))));
}
continue;
}
// Input ended right after the lead byte
if(!doReplace) {
throwMalformed(sp, 1); // underflow()
}
putChar(dst, dp++, repl);
break;
} else if((b1 >> 4) == -2) {
// Three-byte lead 1110-xxxx
if(sp + 1 < sl) {
int b2 = src[sp++];
int b3 = src[sp++];
if(isMalformed3(b1, b2, b3)) {
if(!doReplace) {
throwMalformed(sp - 3, 3);
}
putChar(dst, dp++, repl);
// Rewind to the lead byte and skip the length reported by malformedN
// (presumably the length of the malformed prefix — confirm in malformedN)
sp -= 3;
sp += malformedN(src, sp, 3);
} else {
char c = (char) ((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0))));
if(isSurrogate(c)) {
// A surrogate encoded as a three-byte sequence is treated as malformed
if(!doReplace) {
throwMalformed(sp - 3, 3);
}
putChar(dst, dp++, repl);
} else {
putChar(dst, dp++, c);
}
}
continue;
}
// Fewer than two bytes remain after the lead byte
if(sp < sl && isMalformed3_2(b1, src[sp])) {
if(!doReplace) {
throwMalformed(sp - 1, 2);
}
putChar(dst, dp++, repl);
continue;
}
if(!doReplace) {
throwMalformed(sp, 1);
}
putChar(dst, dp++, repl);
break;
} else if((b1 >> 3) == -2) {
// Four-byte lead 1111-0xxx
if(sp + 2 < sl) {
int b2 = src[sp++];
int b3 = src[sp++];
int b4 = src[sp++];
int uc = ((b1 << 18) ^ (b2 << 12) ^ (b3 << 6) ^ (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80 << 0))));
if(isMalformed4(b2, b3, b4) || !isSupplementaryCodePoint(uc)) { // shortest form check
if(!doReplace) {
throwMalformed(sp - 4, 4);
}
putChar(dst, dp++, repl);
// Rewind to the lead byte and skip the length reported by malformedN
sp -= 4;
sp += malformedN(src, sp, 4);
} else {
putChar(dst, dp++, highSurrogate(uc)); // high-surrogate char of the pair
putChar(dst, dp++, lowSurrogate(uc)); // low-surrogate char of the pair
}
continue;
}
// Fewer than three bytes remain after the lead byte
b1 &= 0xff;
if(b1 > 0xf4 || sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
if(!doReplace) {
throwMalformed(sp - 1, 1); // or 2
}
putChar(dst, dp++, repl);
continue;
}
if(!doReplace) {
throwMalformed(sp - 1, 1);
}
sp++;
putChar(dst, dp++, repl);
if(sp < sl && isMalformed4_3(src[sp])) {
continue;
}
break;
} else {
// Any other negative byte (a stray continuation byte, 0xC0/0xC1, or 0xF8 and above) cannot start a sequence
if(!doReplace) {
throwMalformed(sp - 1, 1);
}
putChar(dst, dp++, repl);
}
}
// Fewer than len chars were produced: trim dst to dp chars (dp << 1 bytes)
if(dp != len) {
dst = Arrays.copyOf(dst, dp << 1);
}
return ret.with(dst, UTF16);
}
/**
* Throws iae, instead of replacing, if malformed or unmappable.
*/
// Decodes src[off, off + len) as UTF-8 with replacement disabled and builds a String
// from the result. Pure-ASCII input under compact strings is copied directly as LATIN1.
static String newStringUTF8NoRepl(byte[] src, int off, int len) {
    final boolean needsFullDecode = !COMPACT_STRINGS || hasNegatives(src, off, len);
    if(needsFullDecode) {
        final Result decoded = decodeUTF8_0(src, off, len, false);
        return new String(decoded.value, decoded.coder);
    }
    return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
}
// Decodes src with charset cs without replacement and builds a String from it.
// The IllegalArgumentException raised by newStringNoRepl1 is unwrapped and its cause
// is rethrown as a checked CharacterCodingException.
static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException {
    try {
        return newStringNoRepl1(src, cs);
    } catch(IllegalArgumentException iae) {
        // newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause
        final Throwable root = iae.getCause();
        throw (root instanceof MalformedInputException)
            ? (MalformedInputException) root
            : (CharacterCodingException) root;
    }
}
// 以cs格式解码src,进而构造String
static String newStringNoRepl1(byte[] src, Charset cs) {
if(cs == UTF_8) {
if(COMPACT_STRINGS && isASCII(src))
return new String(src, LATIN1);
Result ret = decodeUTF8_0(src, 0, src.length, false);
return new String(ret.value, ret.coder);
}
if(cs == ISO_8859_1) {
return newStringLatin1(src);
}
if(cs == US_ASCII) {
if(isASCII(src)) {
return newStringLatin1(src);
} else {
throwMalformed(src);
}
}
CharsetDecoder cd = cs.newDecoder();
// ascii fastpath
if((cd instanceof ArrayDecoder) && ((ArrayDecoder) cd).isASCIICompatible() && isASCII(src)) {
return newStringLatin1(src);
}
int len = src.length;
if(len == 0) {
return "";
}
int en = scale(len, cd.maxCharsPerByte());
char[] ca = new char[en];
if(cs.getClass().getClassLoader0() != null && System.getSecurityManager() != null) {
src = Arrays.copyOf(src, len);
}
ByteBuffer bb = ByteBuffer.wrap(src);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if(!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if(!cr.isUnderflow())
cr.throwException();
} catch(CharacterCodingException x) {
throw new IllegalArgumentException(x); // todo
}