diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f0b4ec9 --- /dev/null +++ b/Makefile @@ -0,0 +1,137 @@ +# Builds the 32-bit PAQ8 variants; recipes use $^ / $< so a rebuild always sees every prerequisite. +CC := g++ -DUNIX -Os -s -m32 -fomit-frame-pointer +#CC := g++ -DUNIX -O3 -s +TARGETS := paq8a.exe paq8f.exe paq8fthis2.exe paq8fthis3.exe paq8fthis4.exe paq8g.exe paq8hp12any.exe paq8jd.exe paq8k.exe paq8k2.exe paq8k3.exe paq8kx_v1.exe paq8kx_v4.exe paq8kx_v7.exe paq8l.exe paq8m.exe paq8n.exe paq8o.exe paq8o10t.exe paq8o2.exe paq8o3.exe paq8o4v2.exe paq8o5.exe paq8o6.exe paq8o7.exe paq8o8.exe paq8o9.exe paq8p.exe paq8px_v1.exe paq8px_v44.exe paq8px_v67.exe paq8px_v68e.exe paq8px_v68p3.exe paq8px_v9.exe + +all: ${TARGETS} +clean: + rm -f ${TARGETS} */*.o +# Assemble each NASM source into a 32-bit ELF object next to it ($< = the .asm file). +%.o: %.asm + nasm -f elf $< + +paq8a.exe: paq8a/paq8a.cpp paq8o10t/paq7asm.o + ${CC} -o $@ $^ + +#paq8b.exe: paq8b/src/Paq8b.cpp paq8b/src/TextFilter.cpp ./paq8b/src/Paq8asm.o +# ${CC} -o $@ $^ + +#paq8c.exe: paq8c/paq8c.cpp paq8c/paq7asm.o +# ${CC} -o $@ $^ + +#paq8d.exe: paq8d/paq8d.cpp paq8d/paq7asm.o +# ${CC} -o $@ $^ + +#paq8e.exe: paq8e/paq8e.cpp paq8e/paq7asm.o +# ${CC} -o $@ $^ + +paq8f.exe: paq8f/paq8f.cpp paq8f/paq7asm.o + ${CC} -o $@ $^ + +paq8fthis2.exe: paq8fthis2/paq8fthis2.cpp paq8fthis2/paq7asm.o + ${CC} -o $@ $^ + +paq8fthis3.exe: paq8fthis3/paq8fthis3.cpp paq8fthis3/paq7asm.o + ${CC} -o $@ $^ + +paq8fthis4.exe: paq8fthis4/paq8fthis4.cpp paq8fthis4/paq7asm.o + ${CC} -o $@ $^ + +paq8g.exe: paq8g/src/paq8g.cpp ./paq8g/src/paq8asm.o + ${CC} -o $@ $^ + +paq8hp12any.exe: paq8hp12any/paq8hp12.cpp paq8hp12any/paq7asm.o + ${CC} -o $@ $^ + +paq8i.exe: paq8i/paq8i.cpp paq8i/paq7asm.o + ${CC} -o $@ $^ + +paq8jd.exe: paq8jd/paq8jd.cpp paq8jd/paq7asm.o + ${CC} -o $@ $^ + +paq8k.exe: paq8k/paq8k.cpp paq8k/paq7asm.o + ${CC} -o $@ $^ + +paq8k2.exe: paq8k2/paq8k2.cpp paq8k2/paq7asm.o + ${CC} -o $@ $^ + +paq8k3.exe: paq8k3/paq8k3.cpp paq8k3/paq7asm.o + ${CC} -o $@ $^ + +paq8kx_v1.exe: paq8kx_v1/paq8kx_v1.cpp paq8kx_v1/paq7asm.o + ${CC} -o $@ $^ 
+# Link with $^ (every prerequisite), not $? (only the ones newer than the target). +paq8kx_v4.exe: paq8kx_v4/paq8kx_v4.cpp paq8kx_v4/paq7asm.o + ${CC} -o $@ $^ + +paq8kx_v7.exe: paq8kx_v7/paq8kx_v7.cpp paq8kx_v7/paq7asm.o + ${CC} -o $@ $^ + +paq8l.exe: paq8l/paq8l.cpp paq8l/paq7asm.o + ${CC} -o $@ $^ + +paq8m.exe: paq8m/paq8m.cpp paq8m/paq7asm.o + ${CC} -o $@ $^ + +paq8n.exe: paq8n/paq8n.cpp paq8n/paq7asm.o + ${CC} -o $@ $^ + +paq8o.exe: paq8o/paq8o.cpp paq8o/paq7asm.o + ${CC} -o $@ $^ + +paq8o10t.exe: paq8o10t/paq8o10t.cpp paq8o10t/paq7asm.o + ${CC} -o $@ $^ + +paq8o2.exe: paq8o2/paq8o.cpp paq8o2/paq7asm.o + ${CC} -o $@ $^ + +paq8o3.exe: paq8o3/paq8o3.cpp paq8o3/paq7asm.o + ${CC} -o $@ $^ + +paq8o4v2.exe: paq8o4v2/paq8o4.cpp paq8o4v2/paq7asm.o + ${CC} -o $@ $^ + +paq8o5.exe: paq8o5/paq8o5.cpp paq8o5/paq7asm.o + ${CC} -o $@ $^ + +paq8o6.exe: paq8o6/paq8o6.cpp paq8o6/paq7asm.o + ${CC} -o $@ $^ + +paq8o7.exe: paq8o7/paq8o7.cpp paq8o7/paq7asm.o + ${CC} -o $@ $^ + +paq8o8.exe: paq8o8/paq8o8.cpp paq8o8/paq7asm.o + ${CC} -o $@ $^ + +#paq8o8pre.exe: paq8o8pre/paq8o8pre.cpp paq8o8pre/PAQ7ASM.asm +# ${CC} -o $@ $^ + +paq8o9.exe: paq8o9/paq8o9.cpp paq8o9/paq7asm.o + ${CC} -o $@ $^ + +paq8p.exe: paq8p/paq8p.cpp paq8p/paq7asm.o + ${CC} -o $@ $^ + +#paq8pxpre.exe: paq8pxpre/paq8pxpre.cpp paq8pxpre/PAQ7ASM.o +# ${CC} -o $@ $^ + +paq8px_v1.exe: paq8px_v1/paq8px.cpp paq8px_v1/paq7asm.o + ${CC} -o $@ $^ + +paq8px_v44.exe: paq8px_v44/paq8px.cpp paq8px_v44/paq7asm.o + ${CC} -o $@ $^ + +paq8px_v67.exe: paq8px_v67/paq8px.cpp paq8px_v67/paq7asm.o + ${CC} -o $@ $^ + +paq8px_v68p3.exe: paq8px_v68p3/paq8px_v68p3.cpp paq8px_v68p3/paq7asm.o + ${CC} -o $@ $^ + +paq8px_v68e.exe: paq8px_v68e/paq8px_v68e.cpp paq8px_v68e/paq7asm.o + ${CC} -o $@ $^ + +paq8px_v9.exe: paq8px_v9/paq8px.cpp paq8px_v9/paq7asm.o + ${CC} -o $@ $^ 
+ +.PHONY: all clean + diff --git a/paq8a/paq8a.cpp b/paq8a/paq8a.cpp index ef7dbd0..66f6c6d 100644 --- a/paq8a/paq8a.cpp +++ b/paq8a/paq8a.cpp @@ -1008,7 +1008,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8b/src/PAQ8ASM.OBJ b/paq8b/src/PAQ8ASM.OBJ deleted file mode 100644 index 2112509..0000000 Binary files a/paq8b/src/PAQ8ASM.OBJ and /dev/null differ diff --git a/paq8b/src/Paq8asm.asm b/paq8b/src/Paq8asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8b/src/Paq8asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. + +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. 
+ +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor mm0, mm0 ; sum = 0 + ; eax/edx stay at a/b: the -8/-16 displacements address words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movq mm1, [eax+ecx*2-8] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2-8] + movq mm2, [eax+ecx*2-16] + pmaddwd mm2, [edx+ecx*2-16] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done: + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. + +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor xmm0, xmm0 ; sum = 0 + ; eax/edx stay at a/b: the -16 displacement addresses words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movdqa xmm1, [eax+ecx*2-16] ; put partial sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2-16] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done: + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done ; n == 0: test ZF from the AND, before SUB overwrites the flags + sub eax, 8 + sub edx, 8 +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 ; t[i]*2 (saturating) + paddsw mm5, mm5 + pmulhw mm3, mm0 ; high word of t[i]*2*err + pmulhw mm5, mm0 + paddsw mm3, mm1 ; +1 so the final shift rounds + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 ; w[i] += delta, saturating at +-32K + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq8b/src/Paq8b.cpp b/paq8b/src/Paq8b.cpp index 2da974c..511b593 100644 --- a/paq8b/src/Paq8b.cpp +++ b/paq8b/src/Paq8b.cpp @@ -1009,7 +1009,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8c/src/PAQ8ASM.OBJ b/paq8c/src/PAQ8ASM.OBJ deleted file mode 100644 index 2112509..0000000 Binary files a/paq8c/src/PAQ8ASM.OBJ and /dev/null differ diff --git a/paq8c/src/Paq8c.cpp b/paq8c/src/Paq8c.cpp index b9b748b..ff67862 100644 --- a/paq8c/src/Paq8c.cpp +++ b/paq8c/src/Paq8c.cpp @@ -1009,7 +1009,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8d/src/PAQ8ASM.OBJ b/paq8d/src/PAQ8ASM.OBJ deleted file mode 100644 index 2112509..0000000 Binary files a/paq8d/src/PAQ8ASM.OBJ and /dev/null differ diff --git a/paq8d/src/Paq8d.cpp b/paq8d/src/Paq8d.cpp index cd3720f..eb5b31d 100644 --- a/paq8d/src/Paq8d.cpp +++ b/paq8d/src/Paq8d.cpp @@ -1011,7 +1011,7 @@ 
class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8e/src/PAQ8ASM.OBJ b/paq8e/src/PAQ8ASM.OBJ deleted file mode 100644 index 2112509..0000000 Binary files a/paq8e/src/PAQ8ASM.OBJ and /dev/null differ diff --git a/paq8e/src/Paq8e.cpp b/paq8e/src/Paq8e.cpp index 22ddb0f..0f0511e 100644 --- a/paq8e/src/Paq8e.cpp +++ b/paq8e/src/Paq8e.cpp @@ -1012,7 +1012,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8f/paq7asm-x86_64.o b/paq8f/paq7asm-x86_64.o deleted file mode 100644 index d81b5cf..0000000 Binary files a/paq8f/paq7asm-x86_64.o and /dev/null differ diff --git a/paq8f/paq8f.cpp b/paq8f/paq8f.cpp index 08de33b..3a1225b 100644 --- a/paq8f/paq8f.cpp +++ b/paq8f/paq8f.cpp @@ -1157,7 +1157,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8fthis2/paq7asm.obj b/paq8fthis2/paq7asm.obj deleted file mode 100644 index ebc3a8b..0000000 Binary files a/paq8fthis2/paq7asm.obj and /dev/null differ diff --git a/paq8fthis2/paq8fthis2.cpp b/paq8fthis2/paq8fthis2.cpp index f68362f..6f61da8 100644 --- a/paq8fthis2/paq8fthis2.cpp +++ b/paq8fthis2/paq8fthis2.cpp @@ -1151,7 +1151,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8fthis3/paq7asm.obj b/paq8fthis3/paq7asm.obj deleted file mode 100644 index ebc3a8b..0000000 Binary files a/paq8fthis3/paq7asm.obj and /dev/null differ diff --git a/paq8fthis3/paq8fthis3.cpp b/paq8fthis3/paq8fthis3.cpp index cb0ef19..d6b32a6 100644 --- a/paq8fthis3/paq8fthis3.cpp +++ b/paq8fthis3/paq8fthis3.cpp @@ -1151,7 +1151,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // 
pad if (mp) { // combine outputs mp->update(); diff --git a/paq8fthis4/paq7asm.obj b/paq8fthis4/paq7asm.obj deleted file mode 100644 index ebc3a8b..0000000 Binary files a/paq8fthis4/paq7asm.obj and /dev/null differ diff --git a/paq8fthis4/paq8fthis4.cpp b/paq8fthis4/paq8fthis4.cpp index 239f4f0..d6ed88b 100644 --- a/paq8fthis4/paq8fthis4.cpp +++ b/paq8fthis4/paq8fthis4.cpp @@ -1151,7 +1151,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8fthis4/paq8fthis_fast.cpp b/paq8fthis4/paq8fthis_fast.cpp index 98c591f..20f318b 100644 --- a/paq8fthis4/paq8fthis_fast.cpp +++ b/paq8fthis4/paq8fthis_fast.cpp @@ -1151,7 +1151,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8g/src/linux/paq8asm.o b/paq8g/src/linux/paq8asm.o deleted file mode 100644 index 42e1676..0000000 Binary files a/paq8g/src/linux/paq8asm.o and /dev/null differ diff --git a/paq8g/src/paq8g.cpp b/paq8g/src/paq8g.cpp index fbc9833..4179d58 100644 --- a/paq8g/src/paq8g.cpp +++ b/paq8g/src/paq8g.cpp @@ -1101,7 +1101,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8g/src/windows/paq8asm.obj b/paq8g/src/windows/paq8asm.obj deleted file mode 100644 index 2112509..0000000 Binary files a/paq8g/src/windows/paq8asm.obj and /dev/null differ diff --git a/paq8hp12any/paq7asm-x86_64.o b/paq8hp12any/paq7asm-x86_64.o deleted file mode 100644 index d81b5cf..0000000 Binary files a/paq8hp12any/paq7asm-x86_64.o and /dev/null differ diff --git a/paq8hp12any/paq7asmsse.obj b/paq8hp12any/paq7asmsse.obj deleted file mode 100644 index 75fa698..0000000 Binary files a/paq8hp12any/paq7asmsse.obj and /dev/null differ diff --git a/paq8i/paq8i.cpp b/paq8i/paq8i.cpp index b1e790b..2b378ac 100644 --- 
a/paq8i/paq8i.cpp +++ b/paq8i/paq8i.cpp @@ -1127,7 +1127,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8jd/paq7asm-x86_64.o b/paq8jd/paq7asm-x86_64.o deleted file mode 100644 index d81b5cf..0000000 Binary files a/paq8jd/paq7asm-x86_64.o and /dev/null differ diff --git a/paq8jd/paq8jd.cpp b/paq8jd/paq8jd.cpp index 1f009ee..98e879a 100644 --- a/paq8jd/paq8jd.cpp +++ b/paq8jd/paq8jd.cpp @@ -1192,7 +1192,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8k/paq7asm.asm b/paq8k/paq7asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8k/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. + +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. 
+ +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor mm0, mm0 ; sum = 0 + ; eax/edx stay at a/b: the -8/-16 displacements address words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movq mm1, [eax+ecx*2-8] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2-8] + movq mm2, [eax+ecx*2-16] + pmaddwd mm2, [edx+ecx*2-16] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done: + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. + +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor xmm0, xmm0 ; sum = 0 + ; eax/edx stay at a/b: the -16 displacement addresses words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movdqa xmm1, [eax+ecx*2-16] ; put partial sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2-16] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done: + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done ; n == 0: test ZF from the AND, before SUB overwrites the flags + sub eax, 8 + sub edx, 8 +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 ; t[i]*2 (saturating) + paddsw mm5, mm5 + pmulhw mm3, mm0 ; high word of t[i]*2*err + pmulhw mm5, mm0 + paddsw mm3, mm1 ; +1 so the final shift rounds + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 ; w[i] += delta, saturating at +-32K + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq8k/paq8k.cpp b/paq8k/paq8k.cpp index 49c37f1..a43854d 100644 --- a/paq8k/paq8k.cpp +++ b/paq8k/paq8k.cpp @@ -1203,7 +1203,7 @@ class Mixer { } // predict next bit - int Mixer::p() { + int p() { while (nx&7) tx[nx++]=0; // pad if (mp) { // combine outputs mp->update(); diff --git a/paq8k2/paq7asm.asm b/paq8k2/paq7asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8k2/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. 
+ +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. + +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor mm0, mm0 ; sum = 0 + ; eax/edx stay at a/b: the -8/-16 displacements address words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movq mm1, [eax+ecx*2-8] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2-8] + movq mm2, [eax+ecx*2-16] + pmaddwd mm2, [edx+ecx*2-16] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done: + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. + +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor xmm0, xmm0 ; sum = 0 + ; eax/edx stay at a/b: the -16 displacement addresses words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movdqa xmm1, [eax+ecx*2-16] ; put partial sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2-16] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done: + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done ; n == 0: test ZF from the AND, before SUB overwrites the flags + sub eax, 8 + sub edx, 8 +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 ; t[i]*2 (saturating) + paddsw mm5, mm5 + pmulhw mm3, mm0 ; high word of t[i]*2*err + pmulhw mm5, mm0 + paddsw mm3, mm1 ; +1 so the final shift rounds + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 ; w[i] += delta, saturating at +-32K + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq8k2/paq7asm.obj b/paq8k2/paq7asm.obj deleted file mode 100644 index c088d90..0000000 Binary files a/paq8k2/paq7asm.obj and /dev/null differ diff --git a/paq8k3/paq7asm.asm b/paq8k3/paq7asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8k3/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. 
+ +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. + +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor mm0, mm0 ; sum = 0 + ; eax/edx stay at a/b: the -8/-16 displacements address words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movq mm1, [eax+ecx*2-8] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2-8] + movq mm2, [eax+ecx*2-16] + pmaddwd mm2, [edx+ecx*2-16] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done: + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. + +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor xmm0, xmm0 ; sum = 0 + ; eax/edx stay at a/b: the -16 displacement addresses words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movdqa xmm1, [eax+ecx*2-16] ; put partial sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2-16] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done: + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done ; n == 0: test ZF from the AND, before SUB overwrites the flags + sub eax, 8 + sub edx, 8 +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 ; t[i]*2 (saturating) + paddsw mm5, mm5 + pmulhw mm3, mm0 ; high word of t[i]*2*err + pmulhw mm5, mm0 + paddsw mm3, mm1 ; +1 so the final shift rounds + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 ; w[i] += delta, saturating at +-32K + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq8kx_v1/paq7asm.asm b/paq8kx_v1/paq7asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8kx_v1/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. + +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. 
+ +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor mm0, mm0 ; sum = 0 + ; eax/edx stay at a/b: the -8/-16 displacements address words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movq mm1, [eax+ecx*2-8] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2-8] + movq mm2, [eax+ecx*2-16] + pmaddwd mm2, [edx+ecx*2-16] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done: + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. + +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor xmm0, xmm0 ; sum = 0 + ; eax/edx stay at a/b: the -16 displacement addresses words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movdqa xmm1, [eax+ecx*2-16] ; put partial sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2-16] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done: + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done ; n == 0: test ZF from the AND, before SUB overwrites the flags + sub eax, 8 + sub edx, 8 +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 ; t[i]*2 (saturating) + paddsw mm5, mm5 + pmulhw mm3, mm0 ; high word of t[i]*2*err + pmulhw mm5, mm0 + paddsw mm3, mm1 ; +1 so the final shift rounds + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 ; w[i] += delta, saturating at +-32K + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq8kx_v4/paq7asm.asm b/paq8kx_v4/paq7asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8kx_v4/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. + +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. 
+ +global dot_product ; (short* a, short* b, int n) +align 16 +dot_product: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor mm0, mm0 ; sum = 0 + ; eax/edx stay at a/b: the -8/-16 displacements address words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movq mm1, [eax+ecx*2-8] ; put halves of vector product in mm0 + pmaddwd mm1, [edx+ecx*2-8] + movq mm2, [eax+ecx*2-16] + pmaddwd mm2, [edx+ecx*2-16] + psrad mm1, 8 + psrad mm2, 8 + paddd mm0, mm1 + paddd mm0, mm2 + sub ecx, 8 + ja .loop + movq mm1, mm0 ; add 2 halves of mm0 and return in eax + psrlq mm1, 32 + paddd mm0, mm1 + movd eax, mm0 + emms +.done: + ret + +; This should work on a Pentium 4 or higher in 32-bit mode, +; but it isn't much faster than the MMX version so I don't use it. + +global dot_product_sse2 ; (short* a, short* b, int n) +align 16 +dot_product_sse2: + xor eax, eax ; result is 0 when n == 0 + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done + mov eax, [esp+4] ; a + mov edx, [esp+8] ; b + pxor xmm0, xmm0 ; sum = 0 + ; eax/edx stay at a/b: the -16 displacement addresses words ecx-8..ecx-1 +.loop: ; each loop sums 8 products + movdqa xmm1, [eax+ecx*2-16] ; put partial sums of vector product in xmm0 + pmaddwd xmm1, [edx+ecx*2-16] + psrad xmm1, 8 + paddd xmm0, xmm1 + sub ecx, 8 + ja .loop + movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax + psrldq xmm1, 8 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + psrldq xmm1, 4 + paddd xmm0, xmm1 + movd eax, xmm0 +.done: + ret + + +; Train n neural network weights w[n] on inputs t[n] and err. +; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. +; n is rounded up to a multiple of 8. 
+ +global train ; (short* t, short* w, int n, int err) +align 16 +train: + mov eax, [esp+16] ; err + and eax, 0xffff ; put 4 copies of err in mm0 + movd mm0, eax + movd mm1, eax + psllq mm1, 16 + por mm0, mm1 + movq mm1, mm0 + psllq mm1, 32 + por mm0, mm1 + pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 + psrlw mm1, 15 + mov eax, [esp+4] ; t + mov edx, [esp+8] ; w + mov ecx, [esp+12] ; n + add ecx, 7 ; n rounding up + and ecx, -8 + jz .done ; n == 0: test ZF from the AND, before SUB overwrites the flags + sub eax, 8 + sub edx, 8 +.loop: ; each iteration adjusts 8 weights + movq mm2, [edx+ecx*2] ; w[i] + movq mm3, [eax+ecx*2] ; t[i] + movq mm4, [edx+ecx*2-8] ; w[i] + movq mm5, [eax+ecx*2-8] ; t[i] + paddsw mm3, mm3 ; t[i]*2 (saturating) + paddsw mm5, mm5 + pmulhw mm3, mm0 ; high word of t[i]*2*err + pmulhw mm5, mm0 + paddsw mm3, mm1 ; +1 so the final shift rounds + paddsw mm5, mm1 + psraw mm3, 1 + psraw mm5, 1 + paddsw mm2, mm3 ; w[i] += delta, saturating at +-32K + paddsw mm4, mm5 + movq [edx+ecx*2], mm2 + movq [edx+ecx*2-8], mm4 + sub ecx, 8 + ja .loop +.done: + emms + ret + diff --git a/paq8kx_v7/paq7asm.asm b/paq8kx_v7/paq7asm.asm new file mode 100644 index 0000000..82d55a7 --- /dev/null +++ b/paq8kx_v7/paq7asm.asm @@ -0,0 +1,140 @@ +; NASM assembly language code for PAQ7. +; (C) 2005, Matt Mahoney. +; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt +; +; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ +; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ +; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ +; Linux: nasm paq7asm.asm -f elf +; +; For other Windows compilers try -f win32 or -f obj. Some old versions +; of Linux should use -f aout instead of -f elf. +; +; This code will only work on a Pentium-MMX or higher. It doesn't +; use extended (Katmai/SSE) instructions. It won't work +; in 64-bit mode. + +section .text use32 class=CODE + +; Reset after MMX (emms marks the x87 registers empty so FPU code works again) +global do_emms +do_emms: + emms + ret + +; Vector product a*b of n signed words, returning signed dword scaled +; down by 8 bits. n is rounded up to a multiple of 8. 
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor mm0, mm0         ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of a
+  sub edx, 8            ; same bias for b
+.loop:                  ; each iteration sums 8 products, last group first
+  movq mm1, [eax+ecx*2] ; upper 4 words of this group of 8
+  pmaddwd mm1, [edx+ecx*2] ; 2 dwords, each the sum of 2 products
+  movq mm2, [eax+ecx*2-8] ; lower 4 words of this group
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each pair sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1        ; accumulate 2 partial sums in mm0
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movq mm1, mm0         ; add 2 halves of mm0
+  psrlq mm1, 32
+  paddd mm0, mm1
+.done:                  ; low dword of mm0 = total (0 when n == 0)
+  movd eax, mm0         ; return it in eax
+  emms                  ; reset after MMX
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor xmm0, xmm0       ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words of a
+  sub edx, 16           ; same bias for b
+.loop:                  ; each iteration sums 8 products
+  movdqa xmm1, [eax+ecx*2] ; aligned load: a and b must be 16-byte aligned
+  pmaddwd xmm1, [edx+ecx*2] ; 4 dwords, each the sum of 2 products
+  psrad xmm1, 8         ; scale each pair sum down by 8 bits
+  paddd xmm0, xmm1      ; accumulate 4 partial sums in xmm0
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movdqa xmm1, xmm0     ; add 4 parts of xmm0
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+.done:                  ; low dword of xmm0 = total (0 when n == 0)
+  movd eax, xmm0        ; return it in eax
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, saturated to +- 32K.
+; n is rounded up to a multiple of 8.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; must test here: the subs below overwrite ZF
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of t
+  sub edx, 8            ; same bias for w
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3       ; t[i]*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; high word of t[i]*2*err
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the shift below rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w[i] += delta, saturated
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+.done:
+  emms                  ; reset after MMX
+  ret
+
diff --git a/paq8n/paq7asm.obj b/paq8n/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8n/paq7asm.obj and /dev/null differ
diff --git a/paq8o/paq7asm.obj b/paq8o/paq7asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o/paq7asm.obj and /dev/null differ
diff --git a/paq8o/paq7asmsse.obj b/paq8o/paq7asmsse.obj
deleted file mode 100644
index 0bfae92..0000000
Binary files a/paq8o/paq7asmsse.obj and /dev/null differ
diff --git a/paq8o2/paq7asm.obj b/paq8o2/paq7asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o2/paq7asm.obj and /dev/null differ
diff --git a/paq8o3/paq7asm.obj b/paq8o3/paq7asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o3/paq7asm.obj and /dev/null differ
diff --git a/paq8o3/paq8o3.cpp b/paq8o3/paq8o3.cpp
index eb6e9f7..05bcf5d 100644
--- a/paq8o3/paq8o3.cpp
+++ b/paq8o3/paq8o3.cpp
@@ -2476,7 +2476,7 @@ int jpegModel(Mixer& m) {
 for (int i=1; i> 17 bounded to +- 32K.
+; n is rounded up to a multiple of 8.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; must test here: the subs below overwrite ZF
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of t
+  sub edx, 8            ; same bias for w
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3       ; t[i]*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; high word of t[i]*2*err
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the shift below rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w[i] += delta, saturated
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+.done:
+  emms                  ; reset after MMX
+  ret
+
diff --git a/paq8px_v1/paq7asm.obj b/paq8px_v1/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8px_v1/paq7asm.obj and /dev/null differ
diff --git a/paq8px_v44/paq7asm.asm b/paq8px_v44/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v44/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  Only the unused
+; dot_product_sse2 routine needs SSE2 (Pentium 4 or higher).  It won't
+; work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset after MMX
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled
+; down by 8 bits. n is rounded up to a multiple of 8.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor mm0, mm0         ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of a
+  sub edx, 8            ; same bias for b
+.loop:                  ; each iteration sums 8 products, last group first
+  movq mm1, [eax+ecx*2] ; upper 4 words of this group of 8
+  pmaddwd mm1, [edx+ecx*2] ; 2 dwords, each the sum of 2 products
+  movq mm2, [eax+ecx*2-8] ; lower 4 words of this group
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each pair sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1        ; accumulate 2 partial sums in mm0
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movq mm1, mm0         ; add 2 halves of mm0
+  psrlq mm1, 32
+  paddd mm0, mm1
+.done:                  ; low dword of mm0 = total (0 when n == 0)
+  movd eax, mm0         ; return it in eax
+  emms                  ; reset after MMX
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor xmm0, xmm0       ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words of a
+  sub edx, 16           ; same bias for b
+.loop:                  ; each iteration sums 8 products
+  movdqa xmm1, [eax+ecx*2] ; aligned load: a and b must be 16-byte aligned
+  pmaddwd xmm1, [edx+ecx*2] ; 4 dwords, each the sum of 2 products
+  psrad xmm1, 8         ; scale each pair sum down by 8 bits
+  paddd xmm0, xmm1      ; accumulate 4 partial sums in xmm0
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movdqa xmm1, xmm0     ; add 4 parts of xmm0
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+.done:                  ; low dword of xmm0 = total (0 when n == 0)
+  movd eax, xmm0        ; return it in eax
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, saturated to +- 32K.
+; n is rounded up to a multiple of 8.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; must test here: the subs below overwrite ZF
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of t
+  sub edx, 8            ; same bias for w
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3       ; t[i]*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; high word of t[i]*2*err
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the shift below rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w[i] += delta, saturated
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+.done:
+  emms                  ; reset after MMX
+  ret
+
diff --git a/paq8px_v44/paq7asm.obj b/paq8px_v44/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8px_v44/paq7asm.obj and /dev/null differ
diff --git a/paq8px_v68e/paq7asm.asm b/paq8px_v68e/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v68e/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  Only the unused
+; dot_product_sse2 routine needs SSE2 (Pentium 4 or higher).  It won't
+; work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset after MMX
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled
+; down by 8 bits. n is rounded up to a multiple of 8.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor mm0, mm0         ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of a
+  sub edx, 8            ; same bias for b
+.loop:                  ; each iteration sums 8 products, last group first
+  movq mm1, [eax+ecx*2] ; upper 4 words of this group of 8
+  pmaddwd mm1, [edx+ecx*2] ; 2 dwords, each the sum of 2 products
+  movq mm2, [eax+ecx*2-8] ; lower 4 words of this group
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each pair sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1        ; accumulate 2 partial sums in mm0
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movq mm1, mm0         ; add 2 halves of mm0
+  psrlq mm1, 32
+  paddd mm0, mm1
+.done:                  ; low dword of mm0 = total (0 when n == 0)
+  movd eax, mm0         ; return it in eax
+  emms                  ; reset after MMX
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor xmm0, xmm0       ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words of a
+  sub edx, 16           ; same bias for b
+.loop:                  ; each iteration sums 8 products
+  movdqa xmm1, [eax+ecx*2] ; aligned load: a and b must be 16-byte aligned
+  pmaddwd xmm1, [edx+ecx*2] ; 4 dwords, each the sum of 2 products
+  psrad xmm1, 8         ; scale each pair sum down by 8 bits
+  paddd xmm0, xmm1      ; accumulate 4 partial sums in xmm0
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movdqa xmm1, xmm0     ; add 4 parts of xmm0
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+.done:                  ; low dword of xmm0 = total (0 when n == 0)
+  movd eax, xmm0        ; return it in eax
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, saturated to +- 32K.
+; n is rounded up to a multiple of 8.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; must test here: the subs below overwrite ZF
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of t
+  sub edx, 8            ; same bias for w
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3       ; t[i]*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; high word of t[i]*2*err
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the shift below rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w[i] += delta, saturated
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+.done:
+  emms                  ; reset after MMX
+  ret
+
diff --git a/paq8px_v68p3/paq7asm.asm b/paq8px_v68p3/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v68p3/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  Only the unused
+; dot_product_sse2 routine needs SSE2 (Pentium 4 or higher).  It won't
+; work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset after MMX
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled
+; down by 8 bits. n is rounded up to a multiple of 8.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor mm0, mm0         ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of a
+  sub edx, 8            ; same bias for b
+.loop:                  ; each iteration sums 8 products, last group first
+  movq mm1, [eax+ecx*2] ; upper 4 words of this group of 8
+  pmaddwd mm1, [edx+ecx*2] ; 2 dwords, each the sum of 2 products
+  movq mm2, [eax+ecx*2-8] ; lower 4 words of this group
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each pair sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1        ; accumulate 2 partial sums in mm0
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movq mm1, mm0         ; add 2 halves of mm0
+  psrlq mm1, 32
+  paddd mm0, mm1
+.done:                  ; low dword of mm0 = total (0 when n == 0)
+  movd eax, mm0         ; return it in eax
+  emms                  ; reset after MMX
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor xmm0, xmm0       ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words of a
+  sub edx, 16           ; same bias for b
+.loop:                  ; each iteration sums 8 products
+  movdqa xmm1, [eax+ecx*2] ; aligned load: a and b must be 16-byte aligned
+  pmaddwd xmm1, [edx+ecx*2] ; 4 dwords, each the sum of 2 products
+  psrad xmm1, 8         ; scale each pair sum down by 8 bits
+  paddd xmm0, xmm1      ; accumulate 4 partial sums in xmm0
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movdqa xmm1, xmm0     ; add 4 parts of xmm0
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+.done:                  ; low dword of xmm0 = total (0 when n == 0)
+  movd eax, xmm0        ; return it in eax
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, saturated to +- 32K.
+; n is rounded up to a multiple of 8.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; must test here: the subs below overwrite ZF
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of t
+  sub edx, 8            ; same bias for w
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3       ; t[i]*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; high word of t[i]*2*err
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the shift below rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w[i] += delta, saturated
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+.done:
+  emms                  ; reset after MMX
+  ret
+
diff --git a/paq8px_v9/paq7asm.asm b/paq8px_v9/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v9/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  Only the unused
+; dot_product_sse2 routine needs SSE2 (Pentium 4 or higher).  It won't
+; work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset after MMX
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled
+; down by 8 bits. n is rounded up to a multiple of 8.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor mm0, mm0         ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of a
+  sub edx, 8            ; same bias for b
+.loop:                  ; each iteration sums 8 products, last group first
+  movq mm1, [eax+ecx*2] ; upper 4 words of this group of 8
+  pmaddwd mm1, [edx+ecx*2] ; 2 dwords, each the sum of 2 products
+  movq mm2, [eax+ecx*2-8] ; lower 4 words of this group
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each pair sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1        ; accumulate 2 partial sums in mm0
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movq mm1, mm0         ; add 2 halves of mm0
+  psrlq mm1, 32
+  paddd mm0, mm1
+.done:                  ; low dword of mm0 = total (0 when n == 0)
+  movd eax, mm0         ; return it in eax
+  emms                  ; reset after MMX
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  pxor xmm0, xmm0       ; sum = 0 (set first so n == 0 returns 0)
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; nothing to sum
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words of a
+  sub edx, 16           ; same bias for b
+.loop:                  ; each iteration sums 8 products
+  movdqa xmm1, [eax+ecx*2] ; aligned load: a and b must be 16-byte aligned
+  pmaddwd xmm1, [edx+ecx*2] ; 4 dwords, each the sum of 2 products
+  psrad xmm1, 8         ; scale each pair sum down by 8 bits
+  paddd xmm0, xmm1      ; accumulate 4 partial sums in xmm0
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+  movdqa xmm1, xmm0     ; add 4 parts of xmm0
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+.done:                  ; low dword of xmm0 = total (0 when n == 0)
+  movd eax, xmm0        ; return it in eax
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, saturated to +- 32K.
+; n is rounded up to a multiple of 8.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF is set when the rounded count is 0
+  jz .done              ; must test here: the subs below overwrite ZF
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words of t
+  sub edx, 8            ; same bias for w
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3       ; t[i]*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; high word of t[i]*2*err
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the shift below rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w[i] += delta, saturated
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; until the count reaches 0
+.done:
+  emms                  ; reset after MMX
+  ret
+
diff --git a/paq8px_v9/paq7asm.obj b/paq8px_v9/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8px_v9/paq7asm.obj and /dev/null differ
diff --git a/paq8pxpre/PAQ7ASM.obj b/paq8pxpre/PAQ7ASM.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8pxpre/PAQ7ASM.obj and /dev/null differ