From be4bc3b477270c3382655db7e2a186d25189c07b Mon Sep 17 00:00:00 2001
From: MrPurple666
Date: Sat, 12 Apr 2025 17:52:41 -0300
Subject: [PATCH] WIP: DO-NOT-MERGE: NCE experiments: better pre-fetch
 implementation

Should be more performant when used together with the new LRU cache
implementation.
---
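Notes for reviewers (this section is dropped by git am and is not part of the
commit message):

The diff below replaces the old ad-hoc alignment heuristics with three named
tuning constants and calls __builtin_prefetch with explicit read/write and
temporal-locality hints. Note that SIMD_PREFETCH_THRESHOLD is compared against
datasize, which is in bits (8 << scale), so its comment has been corrected
accordingly. For reference, a minimal sketch of how the two hint arguments of
the GCC/Clang intrinsic behave; the function and parameter names here are
illustrative only, not code from this patch:

    #include <cstdint>

    void prefetch_hints(const std::uint8_t* data) {
        // rw = 0 (read), locality = 1: fetch the line, but mark it as having
        // low temporal locality -- the conservative hint used for ordered/
        // atomic accesses below.
        __builtin_prefetch(data, 0, 1);

        // rw = 1 (write), locality = 1: prepare the cache line for a store.
        __builtin_prefetch(data, 1, 1);

        // rw = 0 (read), locality = 3: keep the line in all cache levels --
        // the aggressive hint used for aligned scalar and SIMD loads.
        __builtin_prefetch(data, 0, 3);
    }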
 src/core/arm/nce/interpreter_visitor.cpp | 114 ++++++++--------------
 1 file changed, 39 insertions(+), 75 deletions(-)

diff --git a/src/core/arm/nce/interpreter_visitor.cpp b/src/core/arm/nce/interpreter_visitor.cpp
index b1cf0f1a2c..bbe0289f8e 100644
--- a/src/core/arm/nce/interpreter_visitor.cpp
+++ b/src/core/arm/nce/interpreter_visitor.cpp
@@ -7,6 +7,13 @@
 
 namespace Core {
 
+namespace {
+// Prefetch tuning parameters
+constexpr size_t CACHE_LINE_SIZE = 64;
+constexpr size_t PREFETCH_STRIDE = 128;        // 2 cache lines ahead
+constexpr size_t SIMD_PREFETCH_THRESHOLD = 32; // Bits (datasize is 8 << scale)
+} // namespace
+
 template <u32 BitSize>
 u64 SignExtendToLong(u64 value) {
     u64 mask = 1ULL << (BitSize - 1);
@@ -168,15 +175,15 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
     const auto memop = L ? MemOp::Load : MemOp::Store;
     const size_t elsize = 8 << size;
     const size_t datasize = elsize;
-
-    // Operation
     const size_t dbytes = datasize / 8;
 
-    u64 address;
-    if (Rn == Reg::SP) {
-        address = this->GetSp();
+    u64 address = (Rn == Reg::SP) ? this->GetSp() : this->GetReg(Rn);
+
+    // Conservative prefetch for atomic ops
+    if (memop == MemOp::Load) {
+        __builtin_prefetch(reinterpret_cast<void*>(address), 0, 1);
     } else {
-        address = this->GetReg(Rn);
+        __builtin_prefetch(reinterpret_cast<void*>(address), 1, 1);
     }
 
     switch (memop) {
@@ -197,7 +204,6 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
     default:
         UNREACHABLE();
     }
-
     return true;
 }
 
@@ -407,13 +413,11 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
     MemOp memop;
     bool signed_ = false;
     size_t regsize = 0;
-
     const size_t datasize = 8 << scale;
 
     if (opc.Bit<1>() == 0) {
         memop = opc.Bit<0>() ? MemOp::Load : MemOp::Store;
         regsize = size == 0b11 ? 64 : 32;
-        signed_ = false;
     } else if (size == 0b11) {
         memop = MemOp::Prefetch;
         ASSERT(!opc.Bit<0>());
@@ -424,34 +428,22 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         signed_ = true;
     }
 
-    if ((memop == MemOp::Load || memop == MemOp::Store) && wback && Rn == Rt && Rn != Reg::R31) {
-        // Unpredictable instruction
-        return false;
-    }
-
-    alignas(8) u64 address;
-    if (Rn == Reg::SP) {
-        address = this->GetSp();
-    } else {
-        address = this->GetReg(Rn);
-    }
-
-    if (!postindex) {
+    u64 address = (Rn == Reg::SP) ? this->GetSp() : this->GetReg(Rn);
+    if (!postindex)
         address += offset;
-    }
-
-    //const bool is_aligned = (address % 8) == 0;
 
+    // Optimized prefetch for loads
     if (memop == MemOp::Load) {
-        const size_t CACHE_LINE_SIZE = 64;
-        if ((address % 16) == 0) {
-            __builtin_prefetch((void*)address, 0, 3);
-            __builtin_prefetch((void*)(address + CACHE_LINE_SIZE), 0, 3);
-            if (datasize >= 32) { // Now datasize is in scope
-                __builtin_prefetch((void*)(address + CACHE_LINE_SIZE * 2), 0, 2);
+        const size_t access_size = datasize / 8;
+        const bool is_aligned = (address % access_size) == 0;
+
+        if (is_aligned) {
+            __builtin_prefetch(reinterpret_cast<void*>(address), 0, 3);
+            if (access_size >= 8 && access_size <= 32) {
+                __builtin_prefetch(reinterpret_cast<void*>(address + PREFETCH_STRIDE), 0, 3);
             }
-        } else if ((address % 8) == 0) {
-            __builtin_prefetch((void*)address, 0, 2);
+        } else {
+            __builtin_prefetch(reinterpret_cast<void*>(address), 0, 1);
         }
     }
 
@@ -472,21 +464,17 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         break;
     }
     case MemOp::Prefetch:
-        // this->Prefetch(address, Rt)
         break;
     }
 
     if (wback) {
-        if (postindex) {
+        if (postindex)
             address += offset;
-        }
-
-        if (Rn == Reg::SP) {
+        if (Rn == Reg::SP)
             this->SetSp(address);
-        } else {
+        else
             this->SetReg(Rn, address);
-        }
     }
-
     return true;
 }
 
@@ -521,28 +509,16 @@ bool InterpreterVisitor::STURx_LDURx(Imm<2> size, Imm<2> opc, Imm<9> imm9, Reg R
 bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale, u64 offset,
                                        MemOp memop, Reg Rn, Vec Vt) {
     const size_t datasize = 8 << scale;
-
-    u64 address;
-    if (Rn == Reg::SP) {
-        address = this->GetSp();
-    } else {
-        address = this->GetReg(Rn);
-    }
-
-    if (!postindex) {
+    u64 address = (Rn == Reg::SP) ? this->GetSp() : this->GetReg(Rn);
+    if (!postindex)
         address += offset;
-    }
 
-    // Enhanced prefetching for SIMD loads
+    // Aggressive prefetch for SIMD
     if (memop == MemOp::Load) {
-        if ((address % 32) == 0) {
-            // Aggressive prefetch for well-aligned SIMD operations
-            __builtin_prefetch((void*)address, 0, 3);
-            __builtin_prefetch((void*)(address + 32), 0, 3);
-            __builtin_prefetch((void*)(address + 64), 0, 2);
-        } else if ((address % 16) == 0) {
-            __builtin_prefetch((void*)address, 0, 3);
-            __builtin_prefetch((void*)(address + datasize), 0, 2);
+        __builtin_prefetch(reinterpret_cast<void*>(address), 0, 3);
+        __builtin_prefetch(reinterpret_cast<void*>(address + CACHE_LINE_SIZE), 0, 3);
+        if (datasize >= SIMD_PREFETCH_THRESHOLD) {
+            __builtin_prefetch(reinterpret_cast<void*>(address + PREFETCH_STRIDE), 0, 3);
         }
     }
 
@@ -563,17 +539,13 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
     }
 
     if (wback) {
-        if (postindex) {
+        if (postindex)
             address += offset;
-        }
-
-        if (Rn == Reg::SP) {
+        if (Rn == Reg::SP)
             this->SetSp(address);
-        } else {
+        else
             this->SetReg(Rn, address);
-        }
     }
-
     return true;
 }
 
@@ -820,30 +792,22 @@ bool InterpreterVisitor::LDR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3
 
 std::optional<u64> MatchAndExecuteOneInstruction(Core::Memory::Memory& memory, mcontext_t* context,
                                                  fpsimd_context* fpsimd_context) {
-    // Construct the interpreter.
     std::span<u64, 31> regs(reinterpret_cast<u64*>(context->regs), 31);
     std::span<u128, 32> vregs(reinterpret_cast<u128*>(fpsimd_context->vregs), 32);
     u64& sp = *reinterpret_cast<u64*>(&context->sp);
    const u64& pc = *reinterpret_cast<u64*>(&context->pc);
 
     InterpreterVisitor visitor(memory, regs, vregs, sp, pc);
-
-    // Read the instruction at the program counter.
     u32 instruction = memory.Read32(pc);
     bool was_executed = false;
 
-    // Interpret the instruction.
     if (auto decoder = Dynarmic::A64::Decode<VisitorBase>(instruction)) {
         was_executed = decoder->get().call(visitor, instruction);
     } else {
         LOG_ERROR(Core_ARM, "Unallocated encoding: {:#x}", instruction);
     }
 
-    if (was_executed) {
-        return pc + 4;
-    }
-
-    return std::nullopt;
+    return was_executed ? std::optional(pc + 4) : std::nullopt;
 }
 
 } // namespace Core
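-- 
Reviewer note (below the signature cutoff, so git am drops it): the stride
constant PREFETCH_STRIDE = 128 assumes the 64-byte cache line hard-coded above
and requests data two lines ahead of the current access. A minimal sketch of
the streaming pattern that stride is tuned for; the function and names are
illustrative only, not code from this repository:

    #include <cstddef>
    #include <cstdint>

    std::uint64_t sum_with_prefetch(const std::uint8_t* buf, std::size_t len) {
        constexpr std::size_t kStride = 128; // 2 x 64-byte cache lines ahead
        std::uint64_t sum = 0;
        for (std::size_t i = 0; i < len; ++i) {
            // Request the line two cache lines ahead while the current
            // element is consumed, hiding part of the memory latency.
            if (i + kStride < len) {
                __builtin_prefetch(buf + i + kStride, 0, 3);
            }
            sum += buf[i];
        }
        return sum;
    }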