[dynarmic] Delay operations, issue loads first (attempt #1)

This commit is contained in:
lizzie 2025-07-19 00:53:23 +01:00 committed by crueter
parent f414ebdf34
commit f2e352822e
4 changed files with 49 additions and 44 deletions

View file

@ -364,10 +364,10 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
return_from_run_code[0] = getCurr<const void*>(); return_from_run_code[0] = getCurr<const void*>();
cmp(dword[r15 + jsi.offsetof_halt_reason], 0); cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
jne(return_to_caller); jne(return_to_caller, T_NEAR);
if (cb.enable_cycle_counting) { if (cb.enable_cycle_counting) {
cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
jng(return_to_caller); jng(return_to_caller, T_NEAR);
} }
cb.LookupBlock->EmitCall(*this); cb.LookupBlock->EmitCall(*this);
jmp(ABI_RETURN); jmp(ABI_RETURN);
@ -376,10 +376,10 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr<const void*>(); return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr<const void*>();
cmp(dword[r15 + jsi.offsetof_halt_reason], 0); cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
jne(return_to_caller_mxcsr_already_exited); jne(return_to_caller_mxcsr_already_exited, T_NEAR);
if (cb.enable_cycle_counting) { if (cb.enable_cycle_counting) {
cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0); cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
jng(return_to_caller_mxcsr_already_exited); jng(return_to_caller_mxcsr_already_exited, T_NEAR);
} }
SwitchMxcsrOnEntry(); SwitchMxcsrOnEntry();
cb.LookupBlock->EmitCall(*this); cb.LookupBlock->EmitCall(*this);
@ -403,8 +403,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
} }
xor_(eax, eax); xor_(eax, eax);
lock(); /* implicit LOCK */ xchg(dword[r15 + jsi.offsetof_halt_reason], eax);
xchg(dword[r15 + jsi.offsetof_halt_reason], eax);
ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout)); ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
ret(); ret();

View file

@ -58,46 +58,51 @@ static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp
address = v.X(64, Rn); address = v.X(64, Rn);
} }
IR::U64 offs = v.ir.Imm64(0);
if (replicate) { if (replicate) {
for (size_t s = 0; s < selem; s++) { // The CPU prefers that we read first and do the operations afterwards; out-of-order execution may cover this anyway, but it does no harm
IR::UAnyU128 p_elements[4] = {}; //max upper bound=4 elements
for (size_t s = 0; s < selem; ++s) {
p_elements[s] = v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC);
}
// schedule ops after
for (size_t s = 0; s < selem; ++s) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32); const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::UAnyU128 element = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC); const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, p_elements[s]);
const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, element);
v.V(datasize, tt, broadcasted_element); v.V(datasize, tt, broadcasted_element);
offs = v.ir.Add(offs, v.ir.Imm64(ebytes));
} }
} else { } else {
for (size_t s = 0; s < selem; s++) { if (memop == IR::MemOp::LOAD) {
IR::UAny p_elements[4] = {}; //max upper bound=4 elements
for (size_t s = 0; s < selem; ++s) {
p_elements[s] = v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC);
}
for (size_t s = 0; s < selem; ++s) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32); const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::U128 rval = v.V(128, tt); const IR::U128 rval = v.V(128, tt);
const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, p_elements[s]);
if (memop == IR::MemOp::LOAD) {
const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC);
const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, elem);
v.V(128, tt, vec); v.V(128, tt, vec);
} else {
const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index);
v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem);
} }
offs = v.ir.Add(offs, v.ir.Imm64(ebytes)); } else {
for (size_t s = 0; s < selem; ++s) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::U128 rval = v.V(128, tt);
const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index);
v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC, elem);
}
} }
} }
IR::U64 offs = v.ir.Imm64(ebytes * selem);
if (wback) { if (wback) {
if (*Rm != Reg::SP) { if (*Rm != Reg::SP) {
offs = v.X(64, *Rm); offs = v.X(64, *Rm);
} }
if (Rn == Reg::SP) { if (Rn == Reg::SP) {
v.SP(64, v.ir.Add(address, offs)); v.SP(64, v.ir.Add(address, offs));
} else { } else {
v.X(64, Rn, v.ir.Add(address, offs)); v.X(64, Rn, v.ir.Add(address, offs));
} }
} }
return true; return true;
} }

View file

@ -25,16 +25,18 @@ bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, V
const size_t esize = 8 << size.ZeroExtend(); const size_t esize = 8 << size.ZeroExtend();
const size_t datasize = 64; const size_t datasize = 64;
const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vn, Q)); // Loads first, then operations
const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vm, Q)); auto const s_operand1 = v.Vpart(datasize, Vn, Q);
IR::U128 result = sign == SignednessSTD::Signed ? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2) auto const s_operand2 = v.Vpart(datasize, Vm, Q);
const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, s_operand1);
const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, s_operand2);
IR::U128 result = sign == SignednessSTD::Signed
? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2)
: v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2); : v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2);
if (behavior == AbsoluteDifferenceBehavior::Accumulate) { if (behavior == AbsoluteDifferenceBehavior::Accumulate) {
const IR::U128 data = v.V(2 * datasize, Vd); const IR::U128 data = v.V(2 * datasize, Vd);
result = v.ir.VectorAdd(2 * esize, result, data); result = v.ir.VectorAdd(2 * esize, result, data);
} }
v.V(2 * datasize, Vd, result); v.V(2 * datasize, Vd, result);
return true; return true;
} }

View file

@ -134,10 +134,8 @@ bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Ve
if (sz && !Q) { if (sz && !Q) {
return v.ReservedValue(); return v.ReservedValue();
} }
const size_t esize = sz ? 64 : 32; const size_t esize = sz ? 64 : 32;
const size_t datasize = Q ? 128 : 64; const size_t datasize = Q ? 128 : 64;
const IR::U128 operand1 = v.V(datasize, Vn); const IR::U128 operand1 = v.V(datasize, Vn);
const IR::U128 operand2 = v.V(datasize, Vm); const IR::U128 operand2 = v.V(datasize, Vm);
const IR::U128 result = [&] { const IR::U128 result = [&] {
@ -146,21 +144,22 @@ bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Ve
return v.ir.FPVectorEqual(esize, operand1, operand2); return v.ir.FPVectorEqual(esize, operand1, operand2);
case ComparisonTypeSTS::GE: case ComparisonTypeSTS::GE:
return v.ir.FPVectorGreaterEqual(esize, operand1, operand2); return v.ir.FPVectorGreaterEqual(esize, operand1, operand2);
case ComparisonTypeSTS::AbsoluteGE: case ComparisonTypeSTS::AbsoluteGE: {
return v.ir.FPVectorGreaterEqual(esize, auto const tmp1 = v.ir.FPVectorAbs(esize, operand1);
v.ir.FPVectorAbs(esize, operand1), auto const tmp2 = v.ir.FPVectorAbs(esize, operand2);
v.ir.FPVectorAbs(esize, operand2)); return v.ir.FPVectorGreaterEqual(esize, tmp1, tmp2);
}
case ComparisonTypeSTS::GT: case ComparisonTypeSTS::GT:
return v.ir.FPVectorGreater(esize, operand1, operand2); return v.ir.FPVectorGreater(esize, operand1, operand2);
case ComparisonTypeSTS::AbsoluteGT: case ComparisonTypeSTS::AbsoluteGT: {
return v.ir.FPVectorGreater(esize, auto const tmp1 = v.ir.FPVectorAbs(esize, operand1);
v.ir.FPVectorAbs(esize, operand1), auto const tmp2 = v.ir.FPVectorAbs(esize, operand2);
v.ir.FPVectorAbs(esize, operand2)); return v.ir.FPVectorGreater(esize, tmp1, tmp2);
}
} }
UNREACHABLE(); UNREACHABLE();
}(); }();
v.V(datasize, Vd, result); v.V(datasize, Vd, result);
return true; return true;
} }