diff --git a/gen/avx_type.hpp b/gen/avx_type.hpp index 4436f284..6453b819 100644 --- a/gen/avx_type.hpp +++ b/gen/avx_type.hpp @@ -66,7 +66,7 @@ std::string type2String(uint64_t type) if (type & T_VSIB) str += "|T_VSIB"; if (type & T_MEM_EVEX) str += "|T_MEM_EVEX"; if (type & T_NF) str += "|T_NF"; - if (type & T_NO_OR1) str += "|T_NO_OR1"; + if (type & T_CODE1_IF1) str += "|T_CODE1_IF1"; if (str[0] == '|') str = str.substr(1); return str; diff --git a/gen/avx_type_def.h b/gen/avx_type_def.h index 5e74a934..80c6623e 100644 --- a/gen/avx_type_def.h +++ b/gen/avx_type_def.h @@ -43,7 +43,7 @@ static const uint64_t T_MAP5 = T_FP16 | T_0F; static const uint64_t T_MAP6 = T_FP16 | T_0F38; static const uint64_t T_NF = 1ull << 32; // T_nf - static const uint64_t T_NO_OR1 = 1ull << 33; // does not "code | 1" + static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8) // T_66 = 1, T_F3 = 2, T_F2 = 3 static inline uint32_t getPP(uint64_t type) { return (type >> 5) & 3; } // @@@end of avx_type_def.h diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 7c4d9ba0..84635436 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -914,21 +914,21 @@ void putFP16_FMA2() }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { for (int j = 0; j < 2; j++) { - int t = T_MAP6 | T_EW0 | T_MUST_EVEX; + uint64_t type = T_MAP6 | T_EW0 | T_MUST_EVEX; if (j == 0) { - t |= T_F2; + type |= T_F2; } else { - t |= T_F3; + type |= T_F3; } const char *suf = 0; if (tbl[i].isPH) { - t |= T_ER_Z | T_YMM | T_B32; + type |= T_ER_Z | T_YMM | T_B32; suf = "ph"; } else { - t |= T_ER_X | T_N2; + type |= T_ER_X | T_N2; suf = "sh"; } - std::string s = type2String(t); + std::string s = type2String(type); printf("void vf%s%s%s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X); }\n" , j == 0 ? "c" : "", tbl[i].name, suf, s.c_str(), tbl[i].code); } diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index e0e9f493..88413ce1 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -1800,7 +1800,7 @@ class CodeGenerator : public CodeArray { static const uint64_t T_MAP5 = T_FP16 | T_0F; static const uint64_t T_MAP6 = T_FP16 | T_0F38; static const uint64_t T_NF = 1ull << 32; // T_nf - static const uint64_t T_NO_OR1 = 1ull << 33; // does not "code | 1" + static const uint64_t T_CODE1_IF1 = 1ull << 33; // code|=1 if !r.isBit(8) // T_66 = 1, T_F3 = 2, T_F2 = 3 static inline uint32_t getPP(uint64_t type) { return (type >> 5) & 3; } // @@@end of avx_type_def.h @@ -1993,8 +1993,7 @@ class CodeGenerator : public CodeArray { } else if (type & T_0F3A) { db(0x0F); db(0x3A); } -// db(code | (!(type & T_NO_OR1) && !r.isBit(8))); - db(code | (type == 0 && !r.isBit(8))); + db(code | ((type == 0 || (type & T_CODE1_IF1)) && !r.isBit(8))); } void opRR(const Reg& reg1, const Reg& reg2, uint64_t type, int code) { diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index 62e49dba..bbce56ea 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -2058,8 +2058,8 @@ void vextracti32x4(const Operand& op, const Ymm& r, uint8_t imm) { if (!op.is(Op void vextracti32x8(const Operand& op, const Zmm& r, uint8_t imm) { if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3B, imm); } void vextracti64x2(const Operand& op, const Ymm& r, uint8_t imm) { if (!op.is(Operand::MEM | Operand::XMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x39, imm); } void vextracti64x4(const Operand& op, const Zmm& r, uint8_t imm) { if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r, 0, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3B, imm); } -void vfcmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32|T_NF|T_NO_OR1, 0x56); } -void vfcmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32|T_NF|T_NO_OR1, 0xD6); } +void vfcmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); } +void vfcmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x54, imm); } void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); } void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } @@ -2070,7 +2070,7 @@ void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); } void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); } -void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32|T_NF|T_NO_OR1, 0x56); } +void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); } void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); } void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); } void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); } @@ -2083,7 +2083,7 @@ void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); } void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); } void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); } -void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32|T_NF|T_NO_OR1, 0xD6); } +void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); } void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); }