diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..f0b4ec9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,137 @@
+# 32-bit build: the paq7asm MMX code cannot run in 64-bit mode. GCC honours only the last -O flag, i.e. -Os.
+CC := g++ -DUNIX -O2 -Os -s -m32 -fomit-frame-pointer
+#CC := g++ -DUNIX -O3 -s 
+TARGETS := paq8a.exe paq8f.exe paq8fthis2.exe paq8fthis3.exe paq8fthis4.exe paq8g.exe paq8hp12any.exe paq8jd.exe paq8k.exe paq8k2.exe paq8k3.exe paq8kx_v1.exe paq8kx_v4.exe paq8kx_v7.exe paq8l.exe paq8m.exe paq8n.exe paq8o.exe paq8o10t.exe paq8o2.exe paq8o3.exe paq8o4v2.exe paq8o5.exe paq8o6.exe paq8o7.exe paq8o8.exe paq8o9.exe paq8p.exe paq8px_v1.exe paq8px_v44.exe paq8px_v67.exe paq8px_v68e.exe paq8px_v68p3.exe paq8px_v9.exe
+# "all" is the first rule, so a bare "make" builds every executable in TARGETS.
+all: ${TARGETS}
+clean:
+	rm -f ${TARGETS} */*.o */src/*.o
+# Assemble a NASM source into a 32-bit ELF object: $< is the .asm file, $@ the .o written beside it.
+%.o: %.asm
+	nasm -f elf -o $@ $<
+# Link rules: $^ passes every prerequisite to g++ ($? would omit those not newer than the target).
+paq8a.exe: paq8a/paq8a.cpp paq8o10t/paq7asm.o
+	${CC} -o $@ $^
+
+#paq8b.exe: paq8b/src/Paq8b.cpp paq8b/src/TextFilter.cpp ./paq8b/src/Paq8asm.o
+#	${CC} -o $@ $^
+
+#paq8c.exe: paq8c/paq8c.cpp paq8c/paq7asm.o
+#	${CC} -o $@ $^
+
+#paq8d.exe: paq8d/paq8d.cpp paq8d/paq7asm.o
+#	${CC} -o $@ $^
+
+#paq8e.exe: paq8e/paq8e.cpp paq8e/paq7asm.o
+#	${CC} -o $@ $^
+
+paq8f.exe: paq8f/paq8f.cpp paq8f/paq7asm.o
+	${CC} -o $@ $^
+
+paq8fthis2.exe: paq8fthis2/paq8fthis2.cpp paq8fthis2/paq7asm.o
+	${CC} -o $@ $^
+
+paq8fthis3.exe: paq8fthis3/paq8fthis3.cpp paq8fthis3/paq7asm.o
+	${CC} -o $@ $^
+
+paq8fthis4.exe: paq8fthis4/paq8fthis4.cpp paq8fthis4/paq7asm.o
+	${CC} -o $@ $^
+
+paq8g.exe: paq8g/src/paq8g.cpp ./paq8g/src/paq8asm.o
+	${CC} -o $@ $^
+
+paq8hp12any.exe: paq8hp12any/paq8hp12.cpp paq8hp12any/paq7asm.o
+	${CC} -o $@ $^
+# paq8i.exe is not listed in TARGETS, so it is only built on request (make paq8i.exe).
+paq8i.exe: paq8i/paq8i.cpp paq8i/paq7asm.o
+	${CC} -o $@ $^
+
+paq8jd.exe: paq8jd/paq8jd.cpp paq8jd/paq7asm.o
+	${CC} -o $@ $^
+
+paq8k.exe: paq8k/paq8k.cpp paq8k/paq7asm.o
+	${CC} -o $@ $^
+
+paq8k2.exe: paq8k2/paq8k2.cpp paq8k2/paq7asm.o
+	${CC} -o $@ $^
+
+paq8k3.exe: paq8k3/paq8k3.cpp paq8k3/paq7asm.o
+	${CC} -o $@ $^
+
+paq8kx_v1.exe: paq8kx_v1/paq8kx_v1.cpp paq8kx_v1/paq7asm.o
+	${CC} -o $@ $^
+
+paq8kx_v4.exe: paq8kx_v4/paq8kx_v4.cpp paq8kx_v4/paq7asm.o
+	${CC} -o $@ $^
+
+paq8kx_v7.exe: paq8kx_v7/paq8kx_v7.cpp paq8kx_v7/paq7asm.o
+	${CC} -o $@ $^
+
+paq8l.exe: paq8l/paq8l.cpp paq8l/paq7asm.o
+	${CC} -o $@ $^
+
+paq8m.exe: paq8m/paq8m.cpp paq8m/paq7asm.o
+	${CC} -o $@ $^
+
+paq8n.exe: paq8n/paq8n.cpp paq8n/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o.exe: paq8o/paq8o.cpp paq8o/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o10t.exe: paq8o10t/paq8o10t.cpp paq8o10t/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o2.exe: paq8o2/paq8o.cpp paq8o2/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o3.exe: paq8o3/paq8o3.cpp paq8o3/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o4v2.exe: paq8o4v2/paq8o4.cpp paq8o4v2/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o5.exe: paq8o5/paq8o5.cpp paq8o5/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o6.exe: paq8o6/paq8o6.cpp paq8o6/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o7.exe: paq8o7/paq8o7.cpp paq8o7/paq7asm.o
+	${CC} -o $@ $^
+
+paq8o8.exe: paq8o8/paq8o8.cpp paq8o8/paq7asm.o
+	${CC} -o $@ $^
+
+#paq8o8pre.exe: paq8o8pre/paq8o8pre.cpp paq8o8pre/PAQ7ASM.asm
+#	${CC} -o $@ $^
+
+paq8o9.exe: paq8o9/paq8o9.cpp paq8o9/paq7asm.o
+	${CC} -o $@ $^
+
+paq8p.exe: paq8p/paq8p.cpp paq8p/paq7asm.o
+	${CC} -o $@ $^
+
+#paq8pxpre.exe: paq8pxpre/paq8pxpre.cpp paq8pxpre/PAQ7ASM.o
+#	${CC} -o $@ $^
+
+paq8px_v1.exe: paq8px_v1/paq8px.cpp paq8px_v1/paq7asm.o
+	${CC} -o $@ $^
+
+paq8px_v44.exe: paq8px_v44/paq8px.cpp paq8px_v44/paq7asm.o
+	${CC} -o $@ $^
+
+paq8px_v67.exe: paq8px_v67/paq8px.cpp paq8px_v67/paq7asm.o
+	${CC} -o $@ $^
+
+paq8px_v68p3.exe: paq8px_v68p3/paq8px_v68p3.cpp paq8px_v68p3/paq7asm.o
+	${CC} -o $@ $^
+
+paq8px_v68e.exe: paq8px_v68e/paq8px_v68e.cpp paq8px_v68e/paq7asm.o
+	${CC} -o $@ $^
+
+paq8px_v9.exe: paq8px_v9/paq8px.cpp paq8px_v9/paq7asm.o
+	${CC} -o $@ $^
+
+.PHONY: all clean
+
diff --git a/paq8a/paq8a.cpp b/paq8a/paq8a.cpp
index ef7dbd0..66f6c6d 100644
--- a/paq8a/paq8a.cpp
+++ b/paq8a/paq8a.cpp
@@ -1008,7 +1008,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8b/src/PAQ8ASM.OBJ b/paq8b/src/PAQ8ASM.OBJ
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8b/src/PAQ8ASM.OBJ and /dev/null differ
diff --git a/paq8b/src/Paq8asm.asm b/paq8b/src/Paq8asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8b/src/Paq8asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  It doesn't
+; use extended (Katmai/SSE) instructions.  It won't work
+; in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Empty the MMX state (emms) so x87 floating-point code can run after MMX use.
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled down
+; by 8 bits (0 if n is 0). n is rounded up to a multiple of 8.
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words
+  sub edx, 8
+  pxor mm0, mm0         ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
+  pmaddwd mm1, [edx+ecx*2]
+  movq mm2, [eax+ecx*2-8]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+.done:
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a (movdqa below needs 16-byte alignment)
+  mov edx, [esp+8]      ; b
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words
+  sub edx, 16
+  pxor xmm0, xmm0       ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+.done:
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with saturating 16-bit adds.
+; n is rounded up to a multiple of 8; nothing is written when that is 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; n rounding up
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias so [reg+ecx*2] is the last 4 words
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3
+  paddsw mm5, mm5
+  pmulhw mm3, mm0
+  pmulhw mm5, mm0
+  paddsw mm3, mm1
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop
+.done:
+  emms
+  ret
+
diff --git a/paq8b/src/Paq8b.cpp b/paq8b/src/Paq8b.cpp
index 2da974c..511b593 100644
--- a/paq8b/src/Paq8b.cpp
+++ b/paq8b/src/Paq8b.cpp
@@ -1009,7 +1009,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8c/src/PAQ8ASM.OBJ b/paq8c/src/PAQ8ASM.OBJ
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8c/src/PAQ8ASM.OBJ and /dev/null differ
diff --git a/paq8c/src/Paq8c.cpp b/paq8c/src/Paq8c.cpp
index b9b748b..ff67862 100644
--- a/paq8c/src/Paq8c.cpp
+++ b/paq8c/src/Paq8c.cpp
@@ -1009,7 +1009,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8d/src/PAQ8ASM.OBJ b/paq8d/src/PAQ8ASM.OBJ
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8d/src/PAQ8ASM.OBJ and /dev/null differ
diff --git a/paq8d/src/Paq8d.cpp b/paq8d/src/Paq8d.cpp
index cd3720f..eb5b31d 100644
--- a/paq8d/src/Paq8d.cpp
+++ b/paq8d/src/Paq8d.cpp
@@ -1011,7 +1011,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8e/src/PAQ8ASM.OBJ b/paq8e/src/PAQ8ASM.OBJ
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8e/src/PAQ8ASM.OBJ and /dev/null differ
diff --git a/paq8e/src/Paq8e.cpp b/paq8e/src/Paq8e.cpp
index 22ddb0f..0f0511e 100644
--- a/paq8e/src/Paq8e.cpp
+++ b/paq8e/src/Paq8e.cpp
@@ -1012,7 +1012,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8f/paq7asm-x86_64.o b/paq8f/paq7asm-x86_64.o
deleted file mode 100644
index d81b5cf..0000000
Binary files a/paq8f/paq7asm-x86_64.o and /dev/null differ
diff --git a/paq8f/paq8f.cpp b/paq8f/paq8f.cpp
index 08de33b..3a1225b 100644
--- a/paq8f/paq8f.cpp
+++ b/paq8f/paq8f.cpp
@@ -1157,7 +1157,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8fthis2/paq7asm.obj b/paq8fthis2/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8fthis2/paq7asm.obj and /dev/null differ
diff --git a/paq8fthis2/paq8fthis2.cpp b/paq8fthis2/paq8fthis2.cpp
index f68362f..6f61da8 100644
--- a/paq8fthis2/paq8fthis2.cpp
+++ b/paq8fthis2/paq8fthis2.cpp
@@ -1151,7 +1151,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8fthis3/paq7asm.obj b/paq8fthis3/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8fthis3/paq7asm.obj and /dev/null differ
diff --git a/paq8fthis3/paq8fthis3.cpp b/paq8fthis3/paq8fthis3.cpp
index cb0ef19..d6b32a6 100644
--- a/paq8fthis3/paq8fthis3.cpp
+++ b/paq8fthis3/paq8fthis3.cpp
@@ -1151,7 +1151,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8fthis4/paq7asm.obj b/paq8fthis4/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8fthis4/paq7asm.obj and /dev/null differ
diff --git a/paq8fthis4/paq8fthis4.cpp b/paq8fthis4/paq8fthis4.cpp
index 239f4f0..d6ed88b 100644
--- a/paq8fthis4/paq8fthis4.cpp
+++ b/paq8fthis4/paq8fthis4.cpp
@@ -1151,7 +1151,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8fthis4/paq8fthis_fast.cpp b/paq8fthis4/paq8fthis_fast.cpp
index 98c591f..20f318b 100644
--- a/paq8fthis4/paq8fthis_fast.cpp
+++ b/paq8fthis4/paq8fthis_fast.cpp
@@ -1151,7 +1151,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8g/src/linux/paq8asm.o b/paq8g/src/linux/paq8asm.o
deleted file mode 100644
index 42e1676..0000000
Binary files a/paq8g/src/linux/paq8asm.o and /dev/null differ
diff --git a/paq8g/src/paq8g.cpp b/paq8g/src/paq8g.cpp
index fbc9833..4179d58 100644
--- a/paq8g/src/paq8g.cpp
+++ b/paq8g/src/paq8g.cpp
@@ -1101,7 +1101,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8g/src/windows/paq8asm.obj b/paq8g/src/windows/paq8asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8g/src/windows/paq8asm.obj and /dev/null differ
diff --git a/paq8hp12any/paq7asm-x86_64.o b/paq8hp12any/paq7asm-x86_64.o
deleted file mode 100644
index d81b5cf..0000000
Binary files a/paq8hp12any/paq7asm-x86_64.o and /dev/null differ
diff --git a/paq8hp12any/paq7asmsse.obj b/paq8hp12any/paq7asmsse.obj
deleted file mode 100644
index 75fa698..0000000
Binary files a/paq8hp12any/paq7asmsse.obj and /dev/null differ
diff --git a/paq8i/paq8i.cpp b/paq8i/paq8i.cpp
index b1e790b..2b378ac 100644
--- a/paq8i/paq8i.cpp
+++ b/paq8i/paq8i.cpp
@@ -1127,7 +1127,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8jd/paq7asm-x86_64.o b/paq8jd/paq7asm-x86_64.o
deleted file mode 100644
index d81b5cf..0000000
Binary files a/paq8jd/paq7asm-x86_64.o and /dev/null differ
diff --git a/paq8jd/paq8jd.cpp b/paq8jd/paq8jd.cpp
index 1f009ee..98e879a 100644
--- a/paq8jd/paq8jd.cpp
+++ b/paq8jd/paq8jd.cpp
@@ -1192,7 +1192,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8k/paq7asm.asm b/paq8k/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8k/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  It doesn't
+; use extended (Katmai/SSE) instructions.  It won't work
+; in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Empty the MMX state (emms) so x87 floating-point code can run after MMX use.
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled down
+; by 8 bits (0 if n is 0). n is rounded up to a multiple of 8.
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words
+  sub edx, 8
+  pxor mm0, mm0         ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
+  pmaddwd mm1, [edx+ecx*2]
+  movq mm2, [eax+ecx*2-8]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+.done:
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a (movdqa below needs 16-byte alignment)
+  mov edx, [esp+8]      ; b
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words
+  sub edx, 16
+  pxor xmm0, xmm0       ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+.done:
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with saturating 16-bit adds.
+; n is rounded up to a multiple of 8; nothing is written when that is 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; n rounding up
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias so [reg+ecx*2] is the last 4 words
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3
+  paddsw mm5, mm5
+  pmulhw mm3, mm0
+  pmulhw mm5, mm0
+  paddsw mm3, mm1
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop
+.done:
+  emms
+  ret
+
diff --git a/paq8k/paq8k.cpp b/paq8k/paq8k.cpp
index 49c37f1..a43854d 100644
--- a/paq8k/paq8k.cpp
+++ b/paq8k/paq8k.cpp
@@ -1203,7 +1203,7 @@ class Mixer {
   }
 
   // predict next bit
-  int Mixer::p() {
+  int p() {
     while (nx&7) tx[nx++]=0;  // pad
     if (mp) {  // combine outputs
       mp->update();
diff --git a/paq8k2/paq7asm.asm b/paq8k2/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8k2/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  It doesn't
+; use extended (Katmai/SSE) instructions.  It won't work
+; in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Empty the MMX state (emms) so x87 floating-point code can run after MMX use.
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled down
+; by 8 bits (0 if n is 0). n is rounded up to a multiple of 8.
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words
+  sub edx, 8
+  pxor mm0, mm0         ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
+  pmaddwd mm1, [edx+ecx*2]
+  movq mm2, [eax+ecx*2-8]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+.done:
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a (movdqa below needs 16-byte alignment)
+  mov edx, [esp+8]      ; b
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words
+  sub edx, 16
+  pxor xmm0, xmm0       ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+.done:
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with saturating 16-bit adds.
+; n is rounded up to a multiple of 8; nothing is written when that is 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; n rounding up
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias so [reg+ecx*2] is the last 4 words
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3
+  paddsw mm5, mm5
+  pmulhw mm3, mm0
+  pmulhw mm5, mm0
+  paddsw mm3, mm1
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop
+.done:
+  emms
+  ret
+
diff --git a/paq8k2/paq7asm.obj b/paq8k2/paq7asm.obj
deleted file mode 100644
index c088d90..0000000
Binary files a/paq8k2/paq7asm.obj and /dev/null differ
diff --git a/paq8k3/paq7asm.asm b/paq8k3/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8k3/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  It doesn't
+; use extended (Katmai/SSE) instructions.  It won't work
+; in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Empty the MMX state (emms) so x87 floating-point code can run after MMX use.
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled down
+; by 8 bits (0 if n is 0). n is rounded up to a multiple of 8.
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words
+  sub edx, 8
+  pxor mm0, mm0         ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
+  pmaddwd mm1, [edx+ecx*2]
+  movq mm2, [eax+ecx*2-8]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+.done:
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a (movdqa below needs 16-byte alignment)
+  mov edx, [esp+8]      ; b
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words
+  sub edx, 16
+  pxor xmm0, xmm0       ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+.done:
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with saturating 16-bit adds.
+; n is rounded up to a multiple of 8; nothing is written when that is 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; n rounding up
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias so [reg+ecx*2] is the last 4 words
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3
+  paddsw mm5, mm5
+  pmulhw mm3, mm0
+  pmulhw mm5, mm0
+  paddsw mm3, mm1
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop
+.done:
+  emms
+  ret
+
diff --git a/paq8kx_v1/paq7asm.asm b/paq8kx_v1/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8kx_v1/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; This code will only work on a Pentium-MMX or higher.  It doesn't
+; use extended (Katmai/SSE) instructions.  It won't work
+; in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Empty the MMX state (emms) so x87 floating-point code can run after MMX use.
+global do_emms
+do_emms:
+  emms
+  ret
+
+; Vector product a*b of n signed words, returning signed dword scaled down
+; by 8 bits (0 if n is 0). n is rounded up to a multiple of 8.
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  sub eax, 8            ; bias so [eax+ecx*2] is the last 4 words
+  sub edx, 8
+  pxor mm0, mm0         ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movq mm1, [eax+ecx*2] ; put halves of vector product in mm0
+  pmaddwd mm1, [edx+ecx*2]
+  movq mm2, [eax+ecx*2-8]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+.done:
+  ret
+
+; This should work on a Pentium 4 or higher in 32-bit mode,
+; but it isn't much faster than the MMX version so I don't use it.
+global dot_product_sse2 ; (short* a, short* b, int n)
+align 16
+dot_product_sse2:
+  mov ecx, [esp+12]     ; n
+  xor eax, eax          ; result is 0 when there is nothing to sum
+  add ecx, 7            ; n rounding up
+  and ecx, -8           ; ZF is set when the rounded n is 0
+  jz .done
+  mov eax, [esp+4]      ; a (movdqa below needs 16-byte alignment)
+  mov edx, [esp+8]      ; b
+  sub eax, 16           ; bias so [eax+ecx*2] is the last 8 words
+  sub edx, 16
+  pxor xmm0, xmm0       ; sum = 0
+.loop:                  ; each iteration sums 8 products, last words first
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+.done:
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err.
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with saturating 16-bit adds.
+; n is rounded up to a multiple of 8; nothing is written when that is 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; n rounding up
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias so [reg+ecx*2] is the last 4 words
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[i]
+  movq mm3, [eax+ecx*2] ; t[i]
+  movq mm4, [edx+ecx*2-8] ; w[i]
+  movq mm5, [eax+ecx*2-8] ; t[i]
+  paddsw mm3, mm3
+  paddsw mm5, mm5
+  pmulhw mm3, mm0
+  pmulhw mm5, mm0
+  paddsw mm3, mm1
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop
+.done:
+  emms
+  ret
+
diff --git a/paq8kx_v4/paq7asm.asm b/paq8kx_v4/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8kx_v4/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8kx_v7/paq7asm.asm b/paq8kx_v7/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8kx_v7/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8n/paq7asm.obj b/paq8n/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8n/paq7asm.obj and /dev/null differ
diff --git a/paq8o/paq7asm.obj b/paq8o/paq7asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o/paq7asm.obj and /dev/null differ
diff --git a/paq8o/paq7asmsse.obj b/paq8o/paq7asmsse.obj
deleted file mode 100644
index 0bfae92..0000000
Binary files a/paq8o/paq7asmsse.obj and /dev/null differ
diff --git a/paq8o2/paq7asm.obj b/paq8o2/paq7asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o2/paq7asm.obj and /dev/null differ
diff --git a/paq8o3/paq7asm.obj b/paq8o3/paq7asm.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o3/paq7asm.obj and /dev/null differ
diff --git a/paq8o3/paq8o3.cpp b/paq8o3/paq8o3.cpp
index eb6e9f7..05bcf5d 100644
--- a/paq8o3/paq8o3.cpp
+++ b/paq8o3/paq8o3.cpp
@@ -2476,7 +2476,7 @@ int jpegModel(Mixer& m) {
         for (int i=1; i<mcusize; ++i) if (color[(j+i)%mcusize]==color[j]) ls[j]=i;
         ls[j]=mcusize-ls[j]<<6;
       }
-      for ( j=0; j<64; ++j) zpos[zzu[j]+8*zzv[j]]=j;
+      for (int j=0; j<64; ++j) zpos[zzu[j]+8*zzv[j]]=j;
       width=buf[sof+7]*256+buf[sof+8];  // in pixels
       int height=buf[sof+5]*256+buf[sof+6];
       printf("JPEG %dx%d ", width, height);
@@ -2582,7 +2582,7 @@ int jpegModel(Mixer& m) {
             const int zz=mcupos&63, cpos_dc=cpos-zz;
             if (zz==0) {
               for (int i=0; i<8; ++i) sumu[i]=sumv[i]=0;
-              for ( i=0; i<64; ++i) {
+              for (int i=0; i<64; ++i) {
                 sumu[zzu[i]]+=(zzv[i]?256:181)*(zzv[i]&1?-1:+1)*(qtab[q+i]+1)*cbuf2[cpos_dc+i-mcusize*width];
                 sumv[zzv[i]]+=(zzu[i]?256:181)*(zzu[i]&1?-1:+1)*(qtab[q+i]+1)*cbuf2[cpos_dc+i-ls[acomp]];
               }
@@ -2604,7 +2604,7 @@ int jpegModel(Mixer& m) {
                   break;
                 }
               }
-            for ( i=0; i<4; ++i) {
+            for (int i=0; i<4; ++i) {
               const int a=(i&1?zzv[zz]:zzu[zz]), b=(i&2?2:1);
               if (a<b) x=255;
               else {
diff --git a/paq8o4v2/paq7asm.obj b/paq8o4v2/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8o4v2/paq7asm.obj and /dev/null differ
diff --git a/paq8o4v2/paq7asmsse.obj b/paq8o4v2/paq7asmsse.obj
deleted file mode 100644
index 604948a..0000000
Binary files a/paq8o4v2/paq7asmsse.obj and /dev/null differ
diff --git a/paq8o8pre/PAQ7ASM.obj b/paq8o8pre/PAQ7ASM.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8o8pre/PAQ7ASM.obj and /dev/null differ
diff --git a/paq8px_v1/paq7asm.asm b/paq8px_v1/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v1/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8px_v1/paq7asm.obj b/paq8px_v1/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8px_v1/paq7asm.obj and /dev/null differ
diff --git a/paq8px_v44/paq7asm.asm b/paq8px_v44/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v44/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8px_v44/paq7asm.obj b/paq8px_v44/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8px_v44/paq7asm.obj and /dev/null differ
diff --git a/paq8px_v68e/paq7asm.asm b/paq8px_v68e/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v68e/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8px_v68p3/paq7asm.asm b/paq8px_v68p3/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v68p3/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8px_v9/paq7asm.asm b/paq8px_v9/paq7asm.asm
new file mode 100644
index 0000000..82d55a7
--- /dev/null
+++ b/paq8px_v9/paq7asm.asm
@@ -0,0 +1,140 @@
+; NASM assembly language code for PAQ7.
+; (C) 2005, Matt Mahoney.
+; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
+;
+;   MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
+;   DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
+;   Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
+;   Linux:         nasm paq7asm.asm -f elf
+;
+; For other Windows compilers try -f win32 or -f obj.  Some old versions
+; of Linux should use -f aout instead of -f elf.
+;
+; Except for dot_product_sse2, which needs SSE2 (Pentium 4 or higher),
+; this code only needs a Pentium-MMX or higher and uses no extended
+; (Katmai/SSE) instructions.  It won't work in 64-bit mode.
+
+section .text use32 class=CODE
+
+; Reset the x87 tag word after MMX use so floating-point code can run again
+global do_emms          ; exported; reads no arguments, sets no return value
+do_emms:
+  emms                  ; mark all x87/MMX registers as empty
+  ret
+
+; Vector product a*b of n signed words: returns the signed dword sum of
+; (a[2k]*b[2k] + a[2k+1]*b[2k+1]) >> 8.  n is rounded up to a multiple of 8; 0 if n == 0.
+
+global dot_product ; (short* a, short* b, int n)
+align 16
+dot_product:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor mm0, mm0         ; sum = 0 (MMX ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each pass sums 8 products, walking backwards
+  movq mm1, [eax+ecx*2] ; a[ecx-4..ecx-1]
+  pmaddwd mm1, [edx+ecx*2] ; two dword sums of adjacent products
+  movq mm2, [eax+ecx*2-8] ; a[ecx-8..ecx-5]
+  pmaddwd mm2, [edx+ecx*2-8]
+  psrad mm1, 8          ; scale each partial sum down by 8 bits
+  psrad mm2, 8
+  paddd mm0, mm1
+  paddd mm0, mm2
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movq mm1, mm0         ; add 2 halves of mm0 and return in eax
+  psrlq mm1, 32
+  paddd mm0, mm1
+  movd eax, mm0
+  emms
+  ret
+
+; SSE2 variant of dot_product (returns 0 if n rounds to 0).  Needs a Pentium 4
+; or higher in 32-bit mode; not much faster than the MMX version, so it is unused.
+
+global dot_product_sse2 ; (short* a, short* b, int n); a and b 16-byte aligned
+align 16
+dot_product_sse2:
+  mov eax, [esp+4]      ; a
+  mov edx, [esp+8]      ; b
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8           ; ZF set when there is nothing to sum
+  pxor xmm0, xmm0       ; sum = 0 (SSE2 ops leave EFLAGS untouched)
+  jz .done              ; n == 0: skip the loop and return the zero sum
+  sub eax, 16           ; bias pointers so [ptr+ecx*2] is the last 16 bytes
+  sub edx, 16
+.loop:                  ; each pass sums 8 products, walking backwards
+  movdqa xmm1, [eax+ecx*2] ; put partial sums of vector product in xmm0
+  pmaddwd xmm1, [edx+ecx*2]
+  psrad xmm1, 8         ; scale each partial sum down by 8 bits
+  paddd xmm0, xmm1
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  movdqa xmm1, xmm0      ; add 4 parts of xmm0 and return in eax
+  psrldq xmm1, 8
+  paddd xmm0, xmm1
+  movdqa xmm1, xmm0
+  psrldq xmm1, 4
+  paddd xmm0, xmm1
+  movd eax, xmm0
+  ret
+
+
+; Train n neural network weights w[n] on inputs t[n] and err:
+; w[i] += ((t[i]*2*err >> 16) + 1) >> 1, with 16-bit signed saturation.
+; n is rounded up to a multiple of 8; nothing is touched if n == 0.
+
+global train ; (short* t, short* w, int n, int err)
+align 16
+train:
+  mov eax, [esp+16]     ; err
+  and eax, 0xffff       ; put 4 copies of err in mm0
+  movd mm0, eax
+  movd mm1, eax
+  psllq mm1, 16
+  por mm0, mm1
+  movq mm1, mm0
+  psllq mm1, 32
+  por mm0, mm1
+  pcmpeqb mm1, mm1      ; 4 copies of 1 in mm1
+  psrlw mm1, 15
+  mov eax, [esp+4]      ; t
+  mov edx, [esp+8]      ; w
+  mov ecx, [esp+12]     ; n
+  add ecx, 7            ; round n up to a multiple of 8
+  and ecx, -8
+  jz .done              ; test the AND's flags before the SUBs overwrite them
+  sub eax, 8            ; bias pointers so [ptr+ecx*2] is the last qword
+  sub edx, 8
+.loop:                  ; each iteration adjusts 8 weights
+  movq mm2, [edx+ecx*2] ; w[ecx-4..ecx-1]
+  movq mm3, [eax+ecx*2] ; t[ecx-4..ecx-1]
+  movq mm4, [edx+ecx*2-8] ; w[ecx-8..ecx-5]
+  movq mm5, [eax+ecx*2-8] ; t[ecx-8..ecx-5]
+  paddsw mm3, mm3       ; t*2, saturated
+  paddsw mm5, mm5
+  pmulhw mm3, mm0       ; (t*2*err) >> 16
+  pmulhw mm5, mm0
+  paddsw mm3, mm1       ; +1 so the final shift rounds
+  paddsw mm5, mm1
+  psraw mm3, 1
+  psraw mm5, 1
+  paddsw mm2, mm3       ; w += delta, saturated to 16 bits
+  paddsw mm4, mm5
+  movq [edx+ecx*2], mm2
+  movq [edx+ecx*2-8], mm4
+  sub ecx, 8
+  ja .loop              ; loop while ecx > 0
+.done:
+  emms
+  ret
+
diff --git a/paq8px_v9/paq7asm.obj b/paq8px_v9/paq7asm.obj
deleted file mode 100644
index ebc3a8b..0000000
Binary files a/paq8px_v9/paq7asm.obj and /dev/null differ
diff --git a/paq8pxpre/PAQ7ASM.obj b/paq8pxpre/PAQ7ASM.obj
deleted file mode 100644
index 2112509..0000000
Binary files a/paq8pxpre/PAQ7ASM.obj and /dev/null differ