From 9bd8f93ce4c5e2b386795e8fa081ae96132b0acd Mon Sep 17 00:00:00 2001 From: cyrozap Date: Fri, 24 Apr 2026 12:30:36 -0500 Subject: [PATCH] ARM64 JIT: Optimize Group E register conversion The AND and ORR sequence can be simplified down to a single BIF instruction if we correct the Group E AND mask to additionally clear the lower 22 bits of each double. This is possible because Group E registers are always loaded and converted from signed 32-bit integers. The int32-to-double conversion process never sets the lower 22 bits of the resulting double, so it doesn't matter whether or not we clear them with the mask. And since we're able to clear those bits with the mask, we can treat the AND/OR process like a bit-select operation, where the AND mask is used to select between the bits in the OR mask and the bits to keep unchanged. This change boosts performance by ~0.9% on an Apple M1 Pro, and likely more than that on systems with weaker OoO execution capabilities. --- src/jit_compiler_a64.cpp | 7 ++----- src/jit_compiler_a64_static.S | 16 ++++++---------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 750443d5..bfca39b4 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -1001,11 +1001,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg_fp = 28; emitMemLoadFP(src, instr, code, k); - // and tmp_reg_fp, tmp_reg_fp, and_mask_reg - emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k); - - // orr tmp_reg_fp, tmp_reg_fp, or_mask_reg - emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k); + // bif tmp_reg_fp, or_mask_reg, and_mask_reg + emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k); emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k); diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index 0e29888e..2bf74cc6 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -114,7 +114,7 @@ # v26 -> "a2" # v27 -> "a3" # v28 -> temporary -# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff +# v29 -> E 'and' mask = 0x00ffffffffc0000000ffffffffc00000 # v30 -> E 'or' mask = 0x3*00000000******3*00000000****** # v31 -> scale mask = 0x81f000000000000081f0000000000000 @@ -156,7 +156,7 @@ DECL(randomx_program_aarch64): ldp q26, q27, [x0, 224] # Load E 'and' mask - mov x16, 0x00FFFFFFFFFFFFFF + mov x16, 0x00FFFFFFFFC00000 ins v29.d[0], x16 ins v29.d[1], x16 @@ -269,14 +269,10 @@ DECL(randomx_program_aarch64_main_loop): scvtf v21.2d, v21.2d scvtf v22.2d, v22.2d scvtf v23.2d, v23.2d - and v20.16b, v20.16b, v29.16b - and v21.16b, v21.16b, v29.16b - and v22.16b, v22.16b, v29.16b - and v23.16b, v23.16b, v29.16b - orr v20.16b, v20.16b, v30.16b - orr v21.16b, v21.16b, v30.16b - orr v22.16b, v22.16b, v30.16b - orr v23.16b, v23.16b, v30.16b + bif v20.16b, v30.16b, v29.16b + bif v21.16b, v30.16b, v29.16b + bif v22.16b, v30.16b, v29.16b + bif v23.16b, v30.16b, v29.16b # Execute VM instructions DECL(randomx_program_aarch64_vm_instructions):