diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index f11db86..2de66d0 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -932,11 +932,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg_fp = 28; emitMemLoadFP(src, instr, code, k); - // and tmp_reg_fp, tmp_reg_fp, and_mask_reg - emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k); - - // orr tmp_reg_fp, tmp_reg_fp, or_mask_reg - emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k); + // bif tmp_reg_fp, or_mask_reg, and_mask_reg + emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k); emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k); diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index bc14613..bad9db8 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -102,7 +102,7 @@ # v26 -> "a2" # v27 -> "a3" # v28 -> temporary -# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff +# v29 -> E 'and' mask = 0x00ffffffffc0000000ffffffffc00000 # v30 -> E 'or' mask = 0x3*00000000******3*00000000****** # v31 -> scale mask = 0x81f000000000000081f0000000000000 @@ -144,7 +144,7 @@ DECL(randomx_program_aarch64): ldp q26, q27, [x0, 224] # Load E 'and' mask - mov x16, 0x00FFFFFFFFFFFFFF + mov x16, 0x00FFFFFFFFC00000 ins v29.d[0], x16 ins v29.d[1], x16 @@ -257,14 +257,10 @@ DECL(randomx_program_aarch64_main_loop): scvtf v21.2d, v21.2d scvtf v22.2d, v22.2d scvtf v23.2d, v23.2d - and v20.16b, v20.16b, v29.16b - and v21.16b, v21.16b, v29.16b - and v22.16b, v22.16b, v29.16b - and v23.16b, v23.16b, v29.16b - orr v20.16b, v20.16b, v30.16b - orr v21.16b, v21.16b, v30.16b - orr v22.16b, v22.16b, v30.16b - orr v23.16b, v23.16b, v30.16b + bif v20.16b, v30.16b, v29.16b + bif v21.16b, v30.16b, v29.16b + bif v22.16b, v30.16b, v29.16b + bif v23.16b, v30.16b, v29.16b # Execute VM instructions DECL(randomx_program_aarch64_vm_instructions):