-
Notifications
You must be signed in to change notification settings - Fork 0
/
base64_arm64.s
134 lines (107 loc) · 3.7 KB
/
base64_arm64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
// Reference
// https://github.com/aklomp/base64/blob/master/lib/arch/neon64/enc_loop.c
// https://github.com/aklomp/base64/blob/master/lib/arch/neon64/dec_loop.c
//go:build arm64 && !purego
#include "textflag.h"
// func encodeAsm(dst, src []byte, lut *[64]byte) int
//
// encodeAsm base64-encodes src into dst using NEON, consuming 48 source bytes
// and producing 64 output bytes per iteration. It returns the number of bytes
// written to dst, which is always a multiple of 64.
//
// The loop stops as soon as fewer than 48 source bytes remain or fewer than
// 64 bytes of dst remain, so it never writes past len(dst). Given the return
// value n, the first n/4*3 bytes of src have been encoded; the rest is left
// for the caller.
//
// lut is the 64-byte alphabet: lut[v] is the output byte for the 6-bit value v.
//
// Register roles:
//   R0 = dst write pointer      R1 = src read pointer
//   R2 = src bytes remaining    R6 = dst bytes remaining
//   R5 = bytes written so far (return value)
//   V7 = 0x3F in every lane     V8-V11 = the 64-byte alphabet
TEXT ·encodeAsm(SB),NOSPLIT,$0-64
	MOVD dst_base+0(FP), R0
	MOVD dst_len+8(FP), R6
	MOVD src_base+24(FP), R1
	MOVD src_len+32(FP), R2
	MOVD lut+48(FP), R3

	// Keep the whole alphabet resident in V8-V11 for the table lookups.
	VLD1 (R3), [V8.B16, V9.B16, V10.B16, V11.B16]

	// V7 = 6-bit mask replicated into all 16 lanes.
	MOVD $0x3F, R4
	VDUP R4, V7.B16

	MOVD $0, R5

loop:
	// Need a full 48-byte input block ...
	CMP $48, R2
	BLT done
	// ... and room for the 64 bytes it expands to.
	CMP $64, R6
	BLT done

	// De-interleaving load: V0 = first, V1 = second, V2 = third byte of each
	// of the 16 three-byte groups.
	VLD3.P 48(R1), [V0.B16, V1.B16, V2.B16]

	// Move the input bits to where they need to be in the outputs. Except
	// for the first output, the high two bits are not cleared.
	VUSHR $2, V0.B16, V3.B16      // out0 = a >> 2
	VUSHR $4, V1.B16, V4.B16      // low nibble of out1 = b >> 4
	VUSHR $6, V2.B16, V5.B16      // low two bits of out2 = c >> 6
	VSLI $4, V0.B16, V4.B16       // insert a << 4 above the low nibble
	VSLI $2, V1.B16, V5.B16       // insert b << 2 above the low two bits

	// Clear the high two bits in the second, third and fourth output.
	VAND V7.B16, V4.B16, V4.B16
	VAND V7.B16, V5.B16, V5.B16
	VAND V7.B16, V2.B16, V6.B16   // out3 = c & 0x3F

	// The bits have now been shifted to the right locations;
	// translate their values 0..63 to the Base64 alphabet
	// with a 64-byte table lookup.
	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V3.B16
	VTBL V4.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V4.B16
	VTBL V5.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V5.B16
	VTBL V6.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V6.B16

	// Interleave the four output streams and store 64 bytes.
	VST4.P [V3.B16, V4.B16, V5.B16, V6.B16], 64(R0)

	SUB $48, R2
	SUB $64, R6
	ADD $64, R5
	B loop

done:
	MOVD R5, ret+56(FP)
	RET
// func decodeAsm(dst, src []byte, lut *[128]byte) int
//
// decodeAsm base64-decodes src into dst using NEON, consuming 64 source bytes
// and producing 48 output bytes per iteration. It returns the number of source
// bytes that were NOT consumed, so len(src) minus the result (a multiple of 64)
// is the decoded prefix.
//
// The loop stops when
//   - fewer than 64 source bytes remain,
//   - fewer than 48 bytes of dst remain (so it never writes past len(dst)), or
//   - the current 64-byte block contains a byte that does not decode to a
//     value in 0..63; that block is not stored and is counted as unconsumed.
//
// lut holds two 64-byte tables:
//   lut[0..63]    is indexed directly by input bytes 0..63; input bytes >= 64
//                 are out of range for this lookup and yield 0.
//   lut[64..127]  is indexed by the saturating difference (input - 63), so
//                 entry 0 serves every input <= 63 and entries 1..63 serve
//                 inputs 64..126. Inputs >= 127 are out of range; the TBX
//                 lookup then leaves the index itself (>= 64) in place.
// The two lookup results are ORed together, and any result above 63 marks the
// input byte as invalid.
//
// Register roles:
//   R0 = dst write pointer      R1 = src read pointer
//   R2 = src bytes remaining (return value)
//   R6 = dst bytes remaining
//   V7 = 63 in every lane       V8-V11 = first table, V12-V15 = second table
//
// Instructions written as WORD are hand-encoded; the trailing comment gives
// the intended instruction (presumably the mnemonics were unavailable in the
// assembler when this was written - TODO confirm).
TEXT ·decodeAsm(SB),NOSPLIT,$0-64
	MOVD dst_base+0(FP), R0
	MOVD dst_len+8(FP), R6
	MOVD src_base+24(FP), R1
	MOVD src_len+32(FP), R2
	MOVD lut+48(FP), R3

	// Load both 64-byte tables; the post-increment steps R3 to the second one.
	VLD1.P 64(R3), [V8.B16, V9.B16, V10.B16, V11.B16]
	VLD1 (R3), [V12.B16, V13.B16, V14.B16, V15.B16]

	// V7 = 63 in all lanes: second-table offset and validity threshold.
	MOVD $63, R4
	VDUP R4, V7.B16

loop:
	// Need a full 64-byte input block ...
	CMP $64, R2
	BLT done
	// ... and room for the 48 bytes it decodes to.
	CMP $48, R6
	BLT done

	// De-interleaving load: V0..V3 = 1st..4th character of each of the 16
	// four-character groups.
	VLD4.P 64(R1), [V0.B16, V1.B16, V2.B16, V3.B16]

	// Get indices for second LUT (saturating input - 63):
	WORD $0x6e272c10 // VUQSUB V7.B16, V0.B16, V16.B16
	WORD $0x6e272c31 // VUQSUB V7.B16, V1.B16, V17.B16
	WORD $0x6e272c52 // VUQSUB V7.B16, V2.B16, V18.B16
	WORD $0x6e272c73 // VUQSUB V7.B16, V3.B16, V19.B16

	// Get values from first LUT:
	VTBL V0.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V20.B16
	VTBL V1.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V21.B16
	VTBL V2.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V22.B16
	VTBL V3.B16, [V8.B16, V9.B16, V10.B16, V11.B16], V23.B16

	// Get values from second LUT (out-of-range indices stay unchanged):
	WORD $0x4e107190 // VTBX V16.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V16.B16
	WORD $0x4e117191 // VTBX V17.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V17.B16
	WORD $0x4e127192 // VTBX V18.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V18.B16
	WORD $0x4e137193 // VTBX V19.B16, [V12.B16, V13.B16, V14.B16, V15.B16], V19.B16

	// Get final values:
	VORR V16.B16, V20.B16, V0.B16
	VORR V17.B16, V21.B16, V1.B16
	VORR V18.B16, V22.B16, V2.B16
	VORR V19.B16, V23.B16, V3.B16

	// Check for invalid input, any value larger than 63:
	WORD $0x6e273410 // VCMHI V7.B16, V0.B16, V16.B16
	WORD $0x6e273431 // VCMHI V7.B16, V1.B16, V17.B16
	WORD $0x6e273452 // VCMHI V7.B16, V2.B16, V18.B16
	WORD $0x6e273473 // VCMHI V7.B16, V3.B16, V19.B16
	VORR V17.B16, V16.B16, V16.B16
	VORR V18.B16, V16.B16, V16.B16
	VORR V19.B16, V16.B16, V16.B16

	// Check that all bits are zero: reduce the combined mask to its maximum
	// lane and bail out (without storing this block) if any lane was flagged.
	WORD $0x6e30aa11 // VUMAXV V16.B16, V17
	VMOV V17.B[0], R5
	CBNZ R5, done

	// Compress four 6-bit values into three bytes:
	VSHL $2, V0.B16, V4.B16       // byte0 = v0 << 2 | v1 >> 4
	VUSHR $4, V1.B16, V16.B16
	VORR V16.B16, V4.B16, V4.B16
	VSHL $4, V1.B16, V5.B16       // byte1 = v1 << 4 | v2 >> 2
	VUSHR $2, V2.B16, V16.B16
	VORR V16.B16, V5.B16, V5.B16
	VSHL $6, V2.B16, V16.B16      // byte2 = v2 << 6 | v3
	VORR V16.B16, V3.B16, V6.B16

	// Interleave and store decoded result:
	VST3.P [V4.B16, V5.B16, V6.B16], 48(R0)

	SUB $64, R2
	SUB $48, R6
	B loop

done:
	MOVD R2, ret+56(FP)
	RET