From 9bd8f93ce4c5e2b386795e8fa081ae96132b0acd Mon Sep 17 00:00:00 2001
From: cyrozap <cyrozap@gmail.com>
Date: Fri, 24 Apr 2026 12:30:36 -0500
Subject: [PATCH] ARM64 JIT: Optimize Group E register conversion

The AND and ORR sequence can be simplified down to a single BIF
instruction if we change the Group E AND mask to additionally clear the
lower 22 bits of each double.

This is possible because Group E registers are always loaded and
converted from signed 32-bit integers. An int32 needs at most 30
fraction bits below the implicit leading one, so the int32-to-double
conversion never sets the lower 22 of the 52 fraction bits of the
resulting double, and it doesn't matter whether or not we clear them
with the mask. And since we're able to clear those bits with the mask,
we can treat the AND/ORR process like a bit-select operation: where an
AND mask bit is clear BIF takes the bit from the OR mask, and where it
is set the bit is kept unchanged.

This change boosts performance by ~0.9% on an Apple M1 Pro, and likely
more than that on systems with weaker OoO execution capabilities.
---
 src/jit_compiler_a64.cpp      |  7 ++-----
 src/jit_compiler_a64_static.S | 16 ++++++----------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp
index 750443d5..bfca39b4 100644
--- a/src/jit_compiler_a64.cpp
+++ b/src/jit_compiler_a64.cpp
@@ -1001,11 +1001,8 @@ void JitCompilerA64::h_FDIV_M(Instruction& instr, uint32_t& codePos)
 	constexpr uint32_t tmp_reg_fp = 28;
 	emitMemLoadFP<tmp_reg_fp>(src, instr, code, k);
 
-	// and tmp_reg_fp, tmp_reg_fp, and_mask_reg
-	emit32(0x4E201C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (29 << 16), code, k);
-
-	// orr tmp_reg_fp, tmp_reg_fp, or_mask_reg
-	emit32(0x4EA01C00 | tmp_reg_fp | (tmp_reg_fp << 5) | (30 << 16), code, k);
+	// bif tmp_reg_fp, or_mask_reg, and_mask_reg
+	emit32(0x6EE01C00 | tmp_reg_fp | (30 << 5) | (29 << 16), code, k);
 
 	emit32(ARMV8A::FDIV | dst | (dst << 5) | (tmp_reg_fp << 16), code, k);
 
diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S
index 0e29888e..2bf74cc6 100644
--- a/src/jit_compiler_a64_static.S
+++ b/src/jit_compiler_a64_static.S
@@ -114,7 +114,7 @@
 # v26 -> "a2"
 # v27 -> "a3"
 # v28 -> temporary
-# v29 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
+# v29 -> E 'and' mask = 0x00ffffffffc0000000ffffffffc00000
 # v30 -> E 'or' mask  = 0x3*00000000******3*00000000******
 # v31 -> scale mask   = 0x81f000000000000081f0000000000000
 
@@ -156,7 +156,7 @@ DECL(randomx_program_aarch64):
 	ldp	q26, q27, [x0, 224]
 
 	# Load E 'and' mask
-	mov	x16, 0x00FFFFFFFFFFFFFF
+	mov	x16, 0x00FFFFFFFFC00000
 	ins	v29.d[0], x16
 	ins	v29.d[1], x16
 
@@ -269,14 +269,10 @@ DECL(randomx_program_aarch64_main_loop):
 	scvtf	v21.2d, v21.2d
 	scvtf	v22.2d, v22.2d
 	scvtf	v23.2d, v23.2d
-	and	v20.16b, v20.16b, v29.16b
-	and	v21.16b, v21.16b, v29.16b
-	and	v22.16b, v22.16b, v29.16b
-	and	v23.16b, v23.16b, v29.16b
-	orr	v20.16b, v20.16b, v30.16b
-	orr	v21.16b, v21.16b, v30.16b
-	orr	v22.16b, v22.16b, v30.16b
-	orr	v23.16b, v23.16b, v30.16b
+	bif	v20.16b, v30.16b, v29.16b
+	bif	v21.16b, v30.16b, v29.16b
+	bif	v22.16b, v30.16b, v29.16b
+	bif	v23.16b, v30.16b, v29.16b
 
 	# Execute VM instructions
 DECL(randomx_program_aarch64_vm_instructions):