forked from JohannesBuchner/paq
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpaq7asm.asm
140 lines (130 loc) · 3.51 KB
/
paq7asm.asm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
;
; MINGW g++: nasm paq7asm.asm -f win32 --prefix _
; DJGPP g++: nasm paq7asm.asm -f coff --prefix _
; Borland, Mars: nasm paq7asm.asm -f obj --prefix _
; Linux: nasm paq7asm.asm -f elf
;
; For other Windows compilers try -f win32 or -f obj. Some old versions
; of Linux should use -f aout instead of -f elf.
;
; This code will only work on a Pentium-MMX or higher. It doesn't
; use extended (Katmai/SSE) instructions. It won't work
; in 64-bit mode.
section .text use32 class=CODE
; Reset after MMX
global do_emms
do_emms:
emms
ret
; Vector product a*b of n signed words, returning signed dword scaled
; down by 8 bits. n is rounded up to a multiple of 8.
global dot_product ; (short* a, short* b, int n)
align 16
dot_product:
mov eax, [esp+4] ; a
mov edx, [esp+8] ; b
mov ecx, [esp+12] ; n
add ecx, 7 ; n rounding up
and ecx, -8
jz .done
sub eax, 8
sub edx, 8
pxor mm0, mm0 ; sum = 0
.loop: ; each loop sums 4 products
movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
pmaddwd mm1, [edx+ecx*2]
movq mm2, [eax+ecx*2-8]
pmaddwd mm2, [edx+ecx*2-8]
psrad mm1, 8
psrad mm2, 8
paddd mm0, mm1
paddd mm0, mm2
sub ecx, 8
ja .loop
movq mm1, mm0 ; add 2 halves of mm0 and return in eax
psrlq mm1, 32
paddd mm0, mm1
movd eax, mm0
emms
.done
ret
; This should work on a Pentium 4 or higher in 32-bit mode,
; but it isn't much faster than the MMX version so I don't use it.
global dot_product_sse2 ; (short* a, short* b, int n)
align 16
dot_product_sse2:
mov eax, [esp+4] ; a
mov edx, [esp+8] ; b
mov ecx, [esp+12] ; n
add ecx, 7 ; n rounding up
and ecx, -8
jz .done
sub eax, 16
sub edx, 16
pxor xmm0, xmm0 ; sum = 0
.loop: ; each loop sums 4 products
movdqa xmm1, [eax+ecx*2] ; put parital sums of vector product in xmm0
pmaddwd xmm1, [edx+ecx*2]
psrad xmm1, 8
paddd xmm0, xmm1
sub ecx, 8
ja .loop
movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
.done
ret
; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K.
; n is rounded up to a multiple of 8.
global train ; (short* t, short* w, int n, int err)
align 16
train:
mov eax, [esp+16] ; err
and eax, 0xffff ; put 4 copies of err in mm0
movd mm0, eax
movd mm1, eax
psllq mm1, 16
por mm0, mm1
movq mm1, mm0
psllq mm1, 32
por mm0, mm1
pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1
psrlw mm1, 15
mov eax, [esp+4] ; t
mov edx, [esp+8] ; w
mov ecx, [esp+12] ; n
add ecx, 7 ; n/8 rounding up
and ecx, -8
sub eax, 8
sub edx, 8
jz .done
.loop: ; each iteration adjusts 8 weights
movq mm2, [edx+ecx*2] ; w[i]
movq mm3, [eax+ecx*2] ; t[i]
movq mm4, [edx+ecx*2-8] ; w[i]
movq mm5, [eax+ecx*2-8] ; t[i]
paddsw mm3, mm3
paddsw mm5, mm5
pmulhw mm3, mm0
pmulhw mm5, mm0
paddsw mm3, mm1
paddsw mm5, mm1
psraw mm3, 1
psraw mm5, 1
paddsw mm2, mm3
paddsw mm4, mm5
movq [edx+ecx*2], mm2
movq [edx+ecx*2-8], mm4
sub ecx, 8
ja .loop
.done:
emms
ret