[dynarmic] delay ops, load first (attempt#1)

This commit is contained in:
lizzie 2025-07-19 00:53:23 +01:00 committed by crueter
parent f414ebdf34
commit f2e352822e
4 changed files with 49 additions and 44 deletions

View file

@ -364,10 +364,10 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
return_from_run_code[0] = getCurr<const void*>();
cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
jne(return_to_caller);
jne(return_to_caller, T_NEAR);
if (cb.enable_cycle_counting) {
cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
jng(return_to_caller);
jng(return_to_caller, T_NEAR);
}
cb.LookupBlock->EmitCall(*this);
jmp(ABI_RETURN);
@ -376,10 +376,10 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
return_from_run_code[MXCSR_ALREADY_EXITED] = getCurr<const void*>();
cmp(dword[r15 + jsi.offsetof_halt_reason], 0);
jne(return_to_caller_mxcsr_already_exited);
jne(return_to_caller_mxcsr_already_exited, T_NEAR);
if (cb.enable_cycle_counting) {
cmp(qword[rsp + ABI_SHADOW_SPACE + offsetof(StackLayout, cycles_remaining)], 0);
jng(return_to_caller_mxcsr_already_exited);
jng(return_to_caller_mxcsr_already_exited, T_NEAR);
}
SwitchMxcsrOnEntry();
cb.LookupBlock->EmitCall(*this);
@ -403,8 +403,7 @@ void BlockOfCode::GenRunCode(std::function<void(BlockOfCode&)> rcp) {
}
xor_(eax, eax);
lock();
xchg(dword[r15 + jsi.offsetof_halt_reason], eax);
/* implicit LOCK */ xchg(dword[r15 + jsi.offsetof_halt_reason], eax);
ABI_PopCalleeSaveRegistersAndAdjustStack(*this, sizeof(StackLayout));
ret();

View file

@ -58,46 +58,51 @@ static bool SharedDecodeAndOperation(TranslatorVisitor& v, bool wback, IR::MemOp
address = v.X(64, Rn);
}
IR::U64 offs = v.ir.Imm64(0);
if (replicate) {
for (size_t s = 0; s < selem; s++) {
// CPUs tend to prefer that all reads are issued first and the operations on them afterwards; out-of-order execution may already take care of this, but we might as well do it explicitly
IR::UAnyU128 p_elements[4] = {}; //max upper bound=4 elements
for (size_t s = 0; s < selem; ++s) {
p_elements[s] = v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC);
}
// schedule ops after
for (size_t s = 0; s < selem; ++s) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::UAnyU128 element = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC);
const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, element);
const IR::U128 broadcasted_element = v.ir.VectorBroadcast(esize, p_elements[s]);
v.V(datasize, tt, broadcasted_element);
offs = v.ir.Add(offs, v.ir.Imm64(ebytes));
}
} else {
for (size_t s = 0; s < selem; s++) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::U128 rval = v.V(128, tt);
if (memop == IR::MemOp::LOAD) {
const IR::UAny elem = v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC);
const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, elem);
v.V(128, tt, vec);
} else {
const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index);
v.Mem(v.ir.Add(address, offs), ebytes, IR::AccType::VEC, elem);
if (memop == IR::MemOp::LOAD) {
IR::UAny p_elements[4] = {}; //max upper bound=4 elements
for (size_t s = 0; s < selem; ++s) {
p_elements[s] = v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC);
}
for (size_t s = 0; s < selem; ++s) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::U128 rval = v.V(128, tt);
const IR::U128 vec = v.ir.VectorSetElement(esize, rval, index, p_elements[s]);
v.V(128, tt, vec);
}
} else {
for (size_t s = 0; s < selem; ++s) {
const Vec tt = static_cast<Vec>((VecNumber(Vt) + s) % 32);
const IR::U128 rval = v.V(128, tt);
const IR::UAny elem = v.ir.VectorGetElement(esize, rval, index);
v.Mem(v.ir.Add(address, v.ir.Imm64(ebytes * s)), ebytes, IR::AccType::VEC, elem);
}
offs = v.ir.Add(offs, v.ir.Imm64(ebytes));
}
}
IR::U64 offs = v.ir.Imm64(ebytes * selem);
if (wback) {
if (*Rm != Reg::SP) {
offs = v.X(64, *Rm);
}
if (Rn == Reg::SP) {
v.SP(64, v.ir.Add(address, offs));
} else {
v.X(64, Rn, v.ir.Add(address, offs));
}
}
return true;
}

View file

@ -25,16 +25,18 @@ bool AbsoluteDifferenceLong(TranslatorVisitor& v, bool Q, Imm<2> size, Vec Vm, V
const size_t esize = 8 << size.ZeroExtend();
const size_t datasize = 64;
const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vn, Q));
const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, v.Vpart(datasize, Vm, Q));
IR::U128 result = sign == SignednessSTD::Signed ? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2)
: v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2);
// Loads first, then operations
auto const s_operand1 = v.Vpart(datasize, Vn, Q);
auto const s_operand2 = v.Vpart(datasize, Vm, Q);
const IR::U128 operand1 = v.ir.VectorZeroExtend(esize, s_operand1);
const IR::U128 operand2 = v.ir.VectorZeroExtend(esize, s_operand2);
IR::U128 result = sign == SignednessSTD::Signed
? v.ir.VectorSignedAbsoluteDifference(esize, operand1, operand2)
: v.ir.VectorUnsignedAbsoluteDifference(esize, operand1, operand2);
if (behavior == AbsoluteDifferenceBehavior::Accumulate) {
const IR::U128 data = v.V(2 * datasize, Vd);
result = v.ir.VectorAdd(2 * esize, result, data);
}
v.V(2 * datasize, Vd, result);
return true;
}

View file

@ -134,10 +134,8 @@ bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Ve
if (sz && !Q) {
return v.ReservedValue();
}
const size_t esize = sz ? 64 : 32;
const size_t datasize = Q ? 128 : 64;
const IR::U128 operand1 = v.V(datasize, Vn);
const IR::U128 operand2 = v.V(datasize, Vm);
const IR::U128 result = [&] {
@ -146,21 +144,22 @@ bool FPCompareRegister(TranslatorVisitor& v, bool Q, bool sz, Vec Vm, Vec Vn, Ve
return v.ir.FPVectorEqual(esize, operand1, operand2);
case ComparisonTypeSTS::GE:
return v.ir.FPVectorGreaterEqual(esize, operand1, operand2);
case ComparisonTypeSTS::AbsoluteGE:
return v.ir.FPVectorGreaterEqual(esize,
v.ir.FPVectorAbs(esize, operand1),
v.ir.FPVectorAbs(esize, operand2));
case ComparisonTypeSTS::AbsoluteGE: {
auto const tmp1 = v.ir.FPVectorAbs(esize, operand1);
auto const tmp2 = v.ir.FPVectorAbs(esize, operand2);
return v.ir.FPVectorGreaterEqual(esize, tmp1, tmp2);
}
case ComparisonTypeSTS::GT:
return v.ir.FPVectorGreater(esize, operand1, operand2);
case ComparisonTypeSTS::AbsoluteGT:
return v.ir.FPVectorGreater(esize,
v.ir.FPVectorAbs(esize, operand1),
v.ir.FPVectorAbs(esize, operand2));
case ComparisonTypeSTS::AbsoluteGT: {
auto const tmp1 = v.ir.FPVectorAbs(esize, operand1);
auto const tmp2 = v.ir.FPVectorAbs(esize, operand2);
return v.ir.FPVectorGreater(esize, tmp1, tmp2);
}
}
UNREACHABLE();
}();
v.V(datasize, Vd, result);
return true;
}