From be4bc3b477270c3382655db7e2a186d25189c07b Mon Sep 17 00:00:00 2001
From: MrPurple666
Date: Sat, 12 Apr 2025 17:52:41 -0300
Subject: [PATCH] WIP: DO-NOT-MERGE: NCE experiments: better pre-fetch
 implementation

Should be more performant when used together with the new LRU cache
implementation.
---
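Notes for reviewers (this section is dropped by git am and is not part of the
commit message):

The diff below replaces the old ad-hoc alignment heuristics with three named
tuning constants and calls __builtin_prefetch with explicit read/write and
temporal-locality hints. Note that SIMD_PREFETCH_THRESHOLD is compared against
datasize, which is in bits (8 << scale), so its comment has been corrected
accordingly. For reference, a minimal sketch of how the two hint arguments of
the GCC/Clang intrinsic behave; the function and parameter names here are
illustrative only, not code from this patch:

    #include <cstdint>

    void prefetch_hints(const std::uint8_t* data) {
        // rw = 0 (read), locality = 1: fetch the line, but mark it as having
        // low temporal locality -- the conservative hint used for ordered/
        // atomic accesses below.
        __builtin_prefetch(data, 0, 1);

        // rw = 1 (write), locality = 1: prepare the cache line for a store.
        __builtin_prefetch(data, 1, 1);

        // rw = 0 (read), locality = 3: keep the line in all cache levels --
        // the aggressive hint used for aligned scalar and SIMD loads.
        __builtin_prefetch(data, 0, 3);
    }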
 src/core/arm/nce/interpreter_visitor.cpp | 114 ++++++++--------------
 1 file changed, 39 insertions(+), 75 deletions(-)

diff --git a/src/core/arm/nce/interpreter_visitor.cpp b/src/core/arm/nce/interpreter_visitor.cpp
index b1cf0f1a2c..bbe0289f8e 100644
--- a/src/core/arm/nce/interpreter_visitor.cpp
+++ b/src/core/arm/nce/interpreter_visitor.cpp
@@ -7,6 +7,13 @@
 
 namespace Core {
 
+namespace {
+// Prefetch tuning parameters
+constexpr size_t CACHE_LINE_SIZE = 64;
+constexpr size_t PREFETCH_STRIDE = 128;        // 2 cache lines ahead
+constexpr size_t SIMD_PREFETCH_THRESHOLD = 32; // Bits (datasize is 8 << scale)
+} // namespace
+
 template <u32 BitSize>
 u64 SignExtendToLong(u64 value) {
     u64 mask = 1ULL << (BitSize - 1);
@@ -168,15 +175,15 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
     const auto memop = L ? MemOp::Load : MemOp::Store;
     const size_t elsize = 8 << size;
     const size_t datasize = elsize;
-
-    // Operation
     const size_t dbytes = datasize / 8;
 
-    u64 address;
-    if (Rn == Reg::SP) {
-        address = this->GetSp();
+    u64 address = (Rn == Reg::SP) ? this->GetSp() : this->GetReg(Rn);
+
+    // Conservative prefetch for atomic ops
+    if (memop == MemOp::Load) {
+        __builtin_prefetch(reinterpret_cast<void*>(address), 0, 1);
     } else {
-        address = this->GetReg(Rn);
+        __builtin_prefetch(reinterpret_cast<void*>(address), 1, 1);
     }
 
     switch (memop) {
@@ -197,7 +204,6 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
     default:
         UNREACHABLE();
     }
-
     return true;
 }
 
@@ -407,13 +413,11 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
     MemOp memop;
     bool signed_ = false;
     size_t regsize = 0;
-
     const size_t datasize = 8 << scale;
 
     if (opc.Bit<1>() == 0) {
         memop = opc.Bit<0>() ? MemOp::Load : MemOp::Store;
         regsize = size == 0b11 ? 64 : 32;
-        signed_ = false;
     } else if (size == 0b11) {
         memop = MemOp::Prefetch;
         ASSERT(!opc.Bit<0>());
@@ -424,34 +428,22 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         signed_ = true;
     }
 
-    if ((memop == MemOp::Load || memop == MemOp::Store) && wback && Rn == Rt && Rn != Reg::R31) {
-        // Unpredictable instruction
-        return false;
-    }
-
-    alignas(8) u64 address;
-    if (Rn == Reg::SP) {
-        address = this->GetSp();
-    } else {
-        address = this->GetReg(Rn);
-    }
-
-    if (!postindex) {
+    u64 address = (Rn == Reg::SP) ? this->GetSp() : this->GetReg(Rn);
+    if (!postindex)
         address += offset;
-    }
-
-    //const bool is_aligned = (address % 8) == 0;
 
+    // Optimized prefetch for loads
     if (memop == MemOp::Load) {
-        const size_t CACHE_LINE_SIZE = 64;
-        if ((address % 16) == 0) {
-            __builtin_prefetch((void*)address, 0, 3);
-            __builtin_prefetch((void*)(address + CACHE_LINE_SIZE), 0, 3);
-            if (datasize >= 32) { // Now datasize is in scope
-                __builtin_prefetch((void*)(address + CACHE_LINE_SIZE * 2), 0, 2);
+        const size_t access_size = datasize / 8;
+        const bool is_aligned = (address % access_size) == 0;
+
+        if (is_aligned) {
+            __builtin_prefetch(reinterpret_cast<void*>(address), 0, 3);
+            if (access_size >= 8 && access_size <= 32) {
+                __builtin_prefetch(reinterpret_cast<void*>(address + PREFETCH_STRIDE), 0, 3);
             }
-        } else if ((address % 8) == 0) {
-            __builtin_prefetch((void*)address, 0, 2);
+        } else {
+            __builtin_prefetch(reinterpret_cast<void*>(address), 0, 1);
         }
     }
 
@@ -472,21 +464,17 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         break;
     }
     case MemOp::Prefetch:
-        // this->Prefetch(address, Rt)
         break;
     }
 
     if (wback) {
-        if (postindex) {
+        if (postindex)
             address += offset;
-        }
-
-        if (Rn == Reg::SP) {
+        if (Rn == Reg::SP)
             this->SetSp(address);
-        } else {
+        else
             this->SetReg(Rn, address);
-        }
     }
-
     return true;
 }
 
@@ -521,28 +509,16 @@ bool InterpreterVisitor::STURx_LDURx(Imm<2> size, Imm<2> opc, Imm<9> imm9, Reg R
 bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale, u64 offset,
                                        MemOp memop, Reg Rn, Vec Vt) {
     const size_t datasize = 8 << scale;
-
-    u64 address;
-    if (Rn == Reg::SP) {
-        address = this->GetSp();
-    } else {
-        address = this->GetReg(Rn);
-    }
-
-    if (!postindex) {
+    u64 address = (Rn == Reg::SP) ? this->GetSp() : this->GetReg(Rn);
+    if (!postindex)
         address += offset;
-    }
 
-    // Enhanced prefetching for SIMD loads
+    // Aggressive prefetch for SIMD
     if (memop == MemOp::Load) {
-        if ((address % 32) == 0) {
-            // Aggressive prefetch for well-aligned SIMD operations
-            __builtin_prefetch((void*)address, 0, 3);
-            __builtin_prefetch((void*)(address + 32), 0, 3);
-            __builtin_prefetch((void*)(address + 64), 0, 2);
-        } else if ((address % 16) == 0) {
-            __builtin_prefetch((void*)address, 0, 3);
-            __builtin_prefetch((void*)(address + datasize), 0, 2);
+        __builtin_prefetch(reinterpret_cast<void*>(address), 0, 3);
+        __builtin_prefetch(reinterpret_cast<void*>(address + CACHE_LINE_SIZE), 0, 3);
+        if (datasize >= SIMD_PREFETCH_THRESHOLD) {
+            __builtin_prefetch(reinterpret_cast<void*>(address + PREFETCH_STRIDE), 0, 3);
         }
     }
 
@@ -563,17 +539,13 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
     }
 
     if (wback) {
-        if (postindex) {
+        if (postindex)
             address += offset;
-        }
-
-        if (Rn == Reg::SP) {
+        if (Rn == Reg::SP)
             this->SetSp(address);
-        } else {
+        else
             this->SetReg(Rn, address);
-        }
     }
-
     return true;
 }
 
@@ -820,30 +792,22 @@ bool InterpreterVisitor::LDR_reg_fpsimd(Imm<2> size, Imm<1> opc_1, Reg Rm, Imm<3
 
 std::optional<u64> MatchAndExecuteOneInstruction(Core::Memory::Memory& memory, mcontext_t* context,
                                                  fpsimd_context* fpsimd_context) {
-    // Construct the interpreter.
     std::span<u64, 31> regs(reinterpret_cast<u64*>(context->regs), 31);
     std::span<u128, 32> vregs(reinterpret_cast<u128*>(fpsimd_context->vregs), 32);
     u64& sp = *reinterpret_cast<u64*>(&context->sp);
    const u64& pc = *reinterpret_cast<u64*>(&context->pc);
 
     InterpreterVisitor visitor(memory, regs, vregs, sp, pc);
-
-    // Read the instruction at the program counter.
     u32 instruction = memory.Read32(pc);
     bool was_executed = false;
 
-    // Interpret the instruction.
     if (auto decoder = Dynarmic::A64::Decode<VisitorBase>(instruction)) {
         was_executed = decoder->get().call(visitor, instruction);
     } else {
         LOG_ERROR(Core_ARM, "Unallocated encoding: {:#x}", instruction);
     }
 
-    if (was_executed) {
-        return pc + 4;
-    }
-
-    return std::nullopt;
+    return was_executed ? std::optional(pc + 4) : std::nullopt;
 }
 
 } // namespace Core
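-- 
Reviewer note (below the signature cutoff, so git am drops it): the stride
constant PREFETCH_STRIDE = 128 assumes the 64-byte cache line hard-coded above
and requests data two lines ahead of the current access. A minimal sketch of
the streaming pattern that stride is tuned for; the function and names are
illustrative only, not code from this repository:

    #include <cstddef>
    #include <cstdint>

    std::uint64_t sum_with_prefetch(const std::uint8_t* buf, std::size_t len) {
        constexpr std::size_t kStride = 128; // 2 x 64-byte cache lines ahead
        std::uint64_t sum = 0;
        for (std::size_t i = 0; i < len; ++i) {
            // Request the line two cache lines ahead while the current
            // element is consumed, hiding part of the memory latency.
            if (i + kStride < len) {
                __builtin_prefetch(buf + i + kStride, 0, 3);
            }
            sum += buf[i];
        }
        return sum;
    }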