diff --git a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp index e5fb25573b..14d1f0e17b 100644 --- a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp +++ b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp @@ -364,10 +364,10 @@ void BlockOfCode::GenRunCode(std::function rcp) { return_from_run_code[0] = getCurr(); cmp(dword[r15 + jsi.offsetof_halt_reason], 0); - jne(return_to_caller); + jne(return_to_caller, T_NEAR); if (cb.enable_cycle_counting) { cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); - jng(return_to_caller); + jng(return_to_caller, T_NEAR); } cb.LookupBlock->EmitCall(*this); jmp(ABI_RETURN); @@ -376,10 +376,10 @@ void BlockOfCode::GenRunCode(std::function rcp) { return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr(); cmp(dword[r15 + jsi.offsetof_halt_reason], 0); - jne(return_to_caller_mxcsr_already_exited); + jne(return_to_caller_mxcsr_already_exited, T_NEAR); if (cb.enable_cycle_counting) { cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); - jng(return_to_caller_mxcsr_already_exited); + jng(return_to_caller_mxcsr_already_exited, T_NEAR); } SwitchMxcsrOnEntry(); cb.LookupBlock->EmitCall(*this); @@ -403,8 +403,7 @@ void BlockOfCode::GenRunCode(std::function rcp) { } xor_(eax, eax); - lock(); - xchg(dword[r15 + jsi.offsetof_halt_reason], eax); + /* implicit LOCK */ xchg(dword[r15 + jsi.offsetof_halt_reason], eax); ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); ret(); diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp index b4bc842942..9615abb153 100644 --- a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/load_store_single_structure.cpp @@ -58,46 +58,51 @@ static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp address = v.X(64, Rn); } - IR::U64 offs = v.ir.Imm64(0); if (replicate) { - for (size_t s = 0; s < selem; s++) { + // CPU likes when we read first and then we do operations; Sure, OOO, but might as well + IR::UAnyU128 p_elements[4] = {}; //max upper bound=4 elements + for (size_t s = 0; s < selem; ++s) { + p_elements[s] = v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC); + } + // schedule ops after + for (size_t s = 0; s < selem; ++s) { const Vec tt = static_cast((VecNumber(Vt) + s) % 32); - const IR::UAnyU128 element = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC); - const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, element); - + const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, p_elements[s]); v.V(datasize, tt, broadcasted_element); - - offs = v.ir.Add(offs, v.ir.Imm64(ebytes)); } } else { - for (size_t s = 0; s < selem; s++) { - const Vec tt = static_cast((VecNumber(Vt) + s) % 32); - const IR::U128 rval = v.V(128, tt); - - if (memop == IR::MemOp::LOAD) { - const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC); - const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, elem); - v.V(128, tt, vec); - } else { - const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index); - v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem); + if (memop == IR::MemOp::LOAD) { + IR::UAny p_elements[4] = {}; //max upper bound=4 elements + for (size_t s = 0; s < selem; ++s) { + p_elements[s] = v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC); + } + for (size_t s = 0; s < selem; ++s) { + const Vec tt = static_cast((VecNumber(Vt) + s) % 32); + const IR::U128 rval = v.V(128, tt); + const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, p_elements[s]); + v.V(128, tt, vec); + } + } else { + for (size_t s = 0; s < selem; ++s) { + const Vec tt = static_cast((VecNumber(Vt) + s) % 32); + const IR::U128 rval = v.V(128, tt); + const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index); + v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC, elem); } - offs = v.ir.Add(offs, v.ir.Imm64(ebytes)); } } + IR::U64 offs = v.ir.Imm64(ebytes * selem); if (wback) { if (*Rm != Reg::SP) { offs = v.X(64, *Rm); } - if (Rn == Reg::SP) { v.SP(64, v.ir.Add(address, offs)); } else { v.X(64, Rn, v.ir.Add(address, offs)); } } - return true; } diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp index 8f460665da..32f4550979 100644 --- a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_different.cpp @@ -25,16 +25,18 @@ bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, V const size_t esize = 8 << size.ZeroExtend(); const size_t datasize = 64; - const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vn, Q)); - const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vm, Q)); - IR::U128 result = sign == SignednessSTD::Signed ? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2) - : v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2); - + // Loads first, then operations + auto const s_operand1 = v.Vpart(datasize, Vn, Q); + auto const s_operand2 = v.Vpart(datasize, Vm, Q); + const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, s_operand1); + const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, s_operand2); + IR::U128 result = sign == SignednessSTD::Signed + ? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2) + : v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2); if (behavior == AbsoluteDifferenceBehavior::Accumulate) { const IR::U128 data = v.V(2 * datasize, Vd); result = v.ir.VectorAdd(2 * esize, result, data); } - v.V(2 * datasize, Vd, result); return true; } diff --git a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp index 1cfc2ced78..a4ab1774c1 100644 --- a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp +++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_three_same.cpp @@ -134,10 +134,8 @@ bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Ve if (sz && !Q) { return v.ReservedValue(); } - const size_t esize = sz ? 64 : 32; const size_t datasize = Q ? 128 : 64; - const IR::U128 operand1 = v.V(datasize, Vn); const IR::U128 operand2 = v.V(datasize, Vm); const IR::U128 result = [&] { @@ -146,21 +144,22 @@ bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Ve return v.ir.FPVectorEqual(esize, operand1, operand2); case ComparisonTypeSTS::GE: return v.ir.FPVectorGreaterEqual(esize, operand1, operand2); - case ComparisonTypeSTS::AbsoluteGE: - return v.ir.FPVectorGreaterEqual(esize, - v.ir.FPVectorAbs(esize, operand1), - v.ir.FPVectorAbs(esize, operand2)); + case ComparisonTypeSTS::AbsoluteGE: { + auto const tmp1 = v.ir.FPVectorAbs(esize, operand1); + auto const tmp2 = v.ir.FPVectorAbs(esize, operand2); + return v.ir.FPVectorGreaterEqual(esize, tmp1, tmp2); + } case ComparisonTypeSTS::GT: return v.ir.FPVectorGreater(esize, operand1, operand2); - case ComparisonTypeSTS::AbsoluteGT: - return v.ir.FPVectorGreater(esize, - v.ir.FPVectorAbs(esize, operand1), - v.ir.FPVectorAbs(esize, operand2)); + case ComparisonTypeSTS::AbsoluteGT: { + auto const tmp1 = v.ir.FPVectorAbs(esize, operand1); + auto const tmp2 = v.ir.FPVectorAbs(esize, operand2); + return v.ir.FPVectorGreater(esize, tmp1, tmp2); + } } UNREACHABLE(); }(); - v.V(datasize, Vd, result); return true; }