[dynarmic] remove memory pool (standard malloc does a better job)
parent fc6f9de3fa
commit ab631e6b28

16 changed files with 85 additions and 264 deletions
@@ -53,8 +53,6 @@ add_library(dynarmic
     common/lut_from_list.h
     common/math_util.cpp
     common/math_util.h
-    common/memory_pool.cpp
-    common/memory_pool.h
     common/safe_ops.h
     common/spin_lock.h
     common/string_util.h

@@ -153,6 +151,7 @@ if ("A64" IN_LIST DYNARMIC_FRONTENDS)
         ir/opt/a64_callback_config_pass.cpp
         ir/opt/a64_get_set_elimination_pass.cpp
         ir/opt/a64_merge_interpret_blocks.cpp
+        ir/opt/x64_peepholes.cpp
     )
 endif()
@@ -35,11 +35,6 @@ EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block)
 
 EmitContext::~EmitContext() = default;
 
-void EmitContext::EraseInstruction(IR::Inst* inst) {
-    block.Instructions().erase(inst);
-    inst->ClearArgs();
-}
-
 EmitX64::EmitX64(BlockOfCode& code)
         : code(code) {
     exception_handler.Register(code);
@@ -54,10 +54,7 @@ struct EmitContext {
     EmitContext(RegAlloc& reg_alloc, IR::Block& block);
     virtual ~EmitContext();
 
-    void EraseInstruction(IR::Inst* inst);
-
     virtual FP::FPCR FPCR(bool fpcr_controlled = true) const = 0;
-
     virtual bool HasOptimization(OptimizationFlag flag) const = 0;
 
     RegAlloc& reg_alloc;
@@ -40,7 +40,6 @@ static void EmitCRC32Castagnoli(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
 
 static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, const int data_size) {
     auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
     if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size < 32) {
         const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg64 value = ctx.reg_alloc.UseScratchGpr(args[1]);

@@ -69,10 +68,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
         code.pextrd(crc, xmm_value, 2);
-
         ctx.reg_alloc.DefineValue(inst, crc);
         return;
-    }
-
-    if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
+    } else if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 32) {
         const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg32 value = ctx.reg_alloc.UseGpr(args[1]).cvt32();
         const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();

@@ -90,10 +86,7 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
         code.pextrd(crc, xmm_value, 2);
-
         ctx.reg_alloc.DefineValue(inst, crc);
         return;
-    }
-
-    if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
+    } else if (code.HasHostFeature(HostFeature::PCLMULQDQ) && data_size == 64) {
         const Xbyak::Reg32 crc = ctx.reg_alloc.UseScratchGpr(args[0]).cvt32();
         const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]);
         const Xbyak::Xmm xmm_value = ctx.reg_alloc.ScratchXmm();

@@ -111,12 +104,11 @@ static void EmitCRC32ISO(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, co
         code.pextrd(crc, xmm_value, 2);
-
         ctx.reg_alloc.DefineValue(inst, crc);
         return;
-    }
-
-    ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
-    code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
-    code.CallFunction(&CRC32::ComputeCRC32ISO);
+    } else {
+        ctx.reg_alloc.HostCall(inst, args[0], args[1], {});
+        code.mov(code.ABI_PARAM3, data_size / CHAR_BIT);
+        code.CallFunction(&CRC32::ComputeCRC32ISO);
+    }
 }
 
 void EmitX64::EmitCRC32Castagnoli8(EmitContext& ctx, IR::Inst* inst) {
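For reference, the fallback path's CRC32::ComputeCRC32ISO computes the standard reflected CRC-32 (polynomial 0xEDB88320) that the PCLMULQDQ branches accelerate. A minimal bitwise sketch under an assumed (crc, value, length-in-bytes) contract mirroring the HostCall/ABI_PARAM3 setup above; the name and signature here are illustrative rather than the project's exact ones, and initial/final XOR conventions are left to the caller:

    #include <cstdint>

    // Reflected CRC-32 update over the `length` low-order bytes of `value`.
    static std::uint32_t ComputeCRC32ISO_Sketch(std::uint32_t crc, std::uint64_t value, int length) {
        for (int i = 0; i < length; ++i) {
            crc ^= static_cast<std::uint8_t>(value >> (8 * i));
            for (int bit = 0; bit < 8; ++bit)
                crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0u);
        }
        return crc;
    }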
@@ -236,23 +236,19 @@ void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xb
             FpFixup::Norm_Src,
             FpFixup::Norm_Src,
             FpFixup::Norm_Src,
-            FpFixup::Norm_Src);
-
+            FpFixup::Norm_Src
+        );
         FCODE(vmovap)(tmp, code.BConst<fsize>(xword, denormal_to_zero));
-
-        for (const Xbyak::Xmm& xmm : to_daz) {
+        for (const Xbyak::Xmm& xmm : to_daz)
             FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
-        }
-        return;
-    }
-
-    if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
-        code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
-    } else {
-        code.xorps(tmp, tmp);
-    }
-    for (const Xbyak::Xmm& xmm : to_daz) {
-        FCODE(addp)(xmm, tmp);
+    } else {
+        if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
+            code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
+        } else {
+            code.xorps(tmp, tmp);
+        }
+        for (const Xbyak::Xmm& xmm : to_daz)
+            FCODE(addp)(xmm, tmp);
     }
 }
 
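A scalar model of what both branches implement: denormal inputs collapse to zero while everything else passes through (the AVX-512 path apparently via a VFIXUPIMM table mapping denormals to zero, the fallback presumably via an add of +/-0.0 with the host's flush-to-zero state configured elsewhere; the zero's sign is picked so signed zeros survive the active rounding mode). Purely illustrative:

    #include <cmath>

    // Denormals-are-zero, scalar sketch: subnormals flush to a zero of the
    // same sign; normals, zeros, infinities and NaNs pass through untouched.
    template<typename F>
    F FlushDenormalToZero(F x) {
        return std::fpclassify(x) == FP_SUBNORMAL ? std::copysign(F(0), x) : x;
    }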
@@ -1,13 +0,0 @@
-/* This file is part of the dynarmic project.
- * Copyright (c) 2016 MerryMage
- * SPDX-License-Identifier: 0BSD
- */
-
-#include "dynarmic/common/memory_pool.h"
-
-#include <cstdlib>
-
-namespace Dynarmic::Common {
-
-
-} // namespace Dynarmic::Common
@@ -1,61 +0,0 @@
-/* This file is part of the dynarmic project.
- * Copyright (c) 2016 MerryMage
- * SPDX-License-Identifier: 0BSD
- */
-
-#pragma once
-
-#include <cstddef>
-#include <vector>
-
-namespace Dynarmic::Common {
-
-/// @tparam object_size Byte-size of objects to construct
-/// @tparam slab_size Number of objects to have per slab
-template<size_t object_size, size_t slab_size>
-class Pool {
-public:
-    inline Pool() noexcept {
-        AllocateNewSlab();
-    }
-    inline ~Pool() noexcept {
-        std::free(current_slab);
-        for (char* slab : slabs) {
-            std::free(slab);
-        }
-    }
-
-    Pool(const Pool&) = delete;
-    Pool(Pool&&) = delete;
-
-    Pool& operator=(const Pool&) = delete;
-    Pool& operator=(Pool&&) = delete;
-
-    /// @brief Returns a pointer to an `object_size`-bytes block of memory.
-    [[nodiscard]] void* Alloc() noexcept {
-        if (remaining == 0) {
-            slabs.push_back(current_slab);
-            AllocateNewSlab();
-        }
-        void* ret = static_cast<void*>(current_ptr);
-        current_ptr += object_size;
-        remaining--;
-        return ret;
-    }
-private:
-    /// @brief Allocates a completely new memory slab.
-    /// Used when an entirely new slab is needed
-    /// due the current one running out of usable space.
-    void AllocateNewSlab() noexcept {
-        current_slab = static_cast<char*>(std::malloc(object_size * slab_size));
-        current_ptr = current_slab;
-        remaining = slab_size;
-    }
-
-    std::vector<char*> slabs;
-    char* current_slab = nullptr;
-    char* current_ptr = nullptr;
-    size_t remaining = 0;
-};
-
-} // namespace Dynarmic::Common
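The deleted allocator was a bump-pointer slab pool: Alloc() hands out fixed-size slots from malloc'd slabs and never frees individual objects, only whole slabs at destruction. A usage sketch of the removed class (the Node type and slab count are illustrative):

    #include <new>  // placement new

    struct Node { int payload; };

    // 4096 fixed-size slots per slab; Alloc() is an O(1) pointer bump.
    Dynarmic::Common::Pool<sizeof(Node), 4096> pool;

    Node* MakeNode(int v) {
        void* slot = pool.Alloc();   // no per-allocation header, no free list
        return new (slot) Node{v};   // construct in place
    }
    // Destructors never run; all slabs are released wholesale when `pool` dies.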
@@ -13,7 +13,6 @@
 #include <fmt/format.h>
 #include <mcl/assert.hpp>
 
-#include "dynarmic/common/memory_pool.h"
 #include "dynarmic/frontend/A32/a32_types.h"
 #include "dynarmic/frontend/A64/a64_types.h"
 #include "dynarmic/ir/cond.h"

@@ -24,8 +23,7 @@ namespace Dynarmic::IR {
 Block::Block(const LocationDescriptor& location)
         : location{location},
           end_location{location},
-          cond{Cond::AL},
-          instruction_alloc_pool{std::make_unique<std::remove_reference_t<decltype(*instruction_alloc_pool)>>()}
+          cond{Cond::AL}
 {
 
 }

@@ -37,7 +35,7 @@ Block::Block(const LocationDescriptor& location)
 /// @param args A sequence of Value instances used as arguments for the instruction.
 /// @returns Iterator to the newly created instruction.
 Block::iterator Block::PrependNewInst(iterator insertion_point, Opcode opcode, std::initializer_list<Value> args) noexcept {
-    IR::Inst* inst = new (instruction_alloc_pool->Alloc()) IR::Inst(opcode);
+    IR::Inst* inst = new IR::Inst(opcode);
     DEBUG_ASSERT(args.size() == inst->NumArgs());
     std::for_each(args.begin(), args.end(), [&inst, index = size_t(0)](const auto& arg) mutable {
         inst->SetArg(index, arg);

@@ -83,9 +81,7 @@ static std::string TerminalToString(const Terminal& terminal_variant) noexcept {
 }
 
 std::string DumpBlock(const IR::Block& block) noexcept {
-    std::string ret;
-
-    ret += fmt::format("Block: location={}\n", block.Location());
+    std::string ret = fmt::format("Block: location={}\n", block.Location());
     ret += fmt::format("cycles={}", block.CycleCount());
     ret += fmt::format(", entry_cond={}", A64::CondToString(block.GetCondition()));
     if (block.GetCondition() != Cond::AL) {

@@ -113,6 +109,8 @@ std::string DumpBlock(const IR::Block& block) noexcept {
             return fmt::format("#{:#x}", arg.GetU32());
         case Type::U64:
             return fmt::format("#{:#x}", arg.GetU64());
+        case Type::U128:
+            return fmt::format("#<u128>");
         case Type::A32Reg:
             return A32::RegToString(arg.GetA32RegRef());
         case Type::A32ExtReg:

@@ -155,14 +153,9 @@ std::string DumpBlock(const IR::Block& block) noexcept {
                 ret += fmt::format("<type error: {} != {}>", GetNameOf(actual_type), GetNameOf(expected_type));
             }
         }
-
-        ret += fmt::format(" (uses: {})", inst.UseCount());
-
-        ret += '\n';
+        ret += fmt::format(" (uses: {})\n", inst.UseCount());
     }
-
     ret += "terminal = " + TerminalToString(block.GetTerminal()) + '\n';
-
     return ret;
 }
 
@@ -17,8 +17,6 @@
 #include "dynarmic/ir/microinstruction.h"
 #include "dynarmic/ir/terminal.h"
 #include "dynarmic/ir/value.h"
-#include "dynarmic/ir/dense_list.h"
-#include "dynarmic/common/memory_pool.h"
 
 namespace Dynarmic::IR {
 

@@ -76,7 +74,7 @@ public:
     /// @param op Opcode representing the instruction to add.
     /// @param args A sequence of Value instances used as arguments for the instruction.
     inline void AppendNewInst(const Opcode opcode, const std::initializer_list<IR::Value> args) noexcept {
-        PrependNewInst(end(), opcode, args);
+        PrependNewInst(instructions.end(), opcode, args);
     }
     iterator PrependNewInst(iterator insertion_point, Opcode op, std::initializer_list<Value> args) noexcept;
 

@@ -171,8 +169,6 @@ private:
     LocationDescriptor end_location;
     /// Conditional to pass in order to execute this block
     Cond cond;
-    /// Memory pool for instruction list
-    std::unique_ptr<Common::Pool<sizeof(Inst), 2097152UL / sizeof(Inst)>> instruction_alloc_pool;
    /// Terminal instruction of this block.
    Terminal terminal = Term::Invalid{};
    /// Number of cycles this block takes to execute if the conditional fails.
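With the pool member gone from Block, the plain `new IR::Inst(opcode)` in PrependNewInst makes every instruction an individually owned heap object, so teardown has to delete instructions one by one instead of dropping slabs. A hypothetical sketch of the implied teardown; Block's destructor is not part of this diff, and the container calls here are assumptions:

    // Hypothetical only: releasing individually new-ed instructions.
    Block::~Block() {
        for (auto it = instructions.begin(); it != instructions.end();) {
            IR::Inst* inst = &*it;
            it = instructions.erase(it);  // unlink first,
            delete inst;                  // then free, matching the plain `new`
        }
    }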
externals/dynarmic/src/dynarmic/ir/dense_list.h (vendored, 58 lines)

@@ -1,58 +0,0 @@
-#pragma once
-
-#include <cstdint>
-#include <cstddef>
-#include <deque>
-
-namespace Dynarmic {
-template<typename T> struct dense_list {
-    using difference_type = std::ptrdiff_t;
-    using size_type = std::size_t;
-    using value_type = T;
-    using pointer = value_type*;
-    using const_pointer = const value_type*;
-    using reference = value_type&;
-    using const_reference = const value_type&;
-    using iterator = std::deque<value_type>::iterator;
-    using const_iterator = std::deque<value_type>::const_iterator;
-    using reverse_iterator = std::reverse_iterator<iterator>;
-    using const_reverse_iterator = std::reverse_iterator<const_iterator>;
-
-    inline bool empty() const noexcept { return list.empty(); }
-    inline size_type size() const noexcept { return list.size(); }
-
-    inline value_type& front() noexcept { return list.front(); }
-    inline const value_type& front() const noexcept { return list.front(); }
-
-    inline value_type& back() noexcept { return list.back(); }
-    inline const value_type& back() const noexcept { return list.back(); }
-
-    inline iterator begin() noexcept { return list.begin(); }
-    inline const_iterator begin() const noexcept { return list.begin(); }
-    inline iterator end() noexcept { return list.end(); }
-    inline const_iterator end() const noexcept { return list.end(); }
-
-    inline reverse_iterator rbegin() noexcept { return list.rbegin(); }
-    inline const_reverse_iterator rbegin() const noexcept { return list.rbegin(); }
-    inline reverse_iterator rend() noexcept { return list.rend(); }
-    inline const_reverse_iterator rend() const noexcept { return list.rend(); }
-
-    inline const_iterator cbegin() const noexcept { return list.cbegin(); }
-    inline const_iterator cend() const noexcept { return list.cend(); }
-
-    inline const_reverse_iterator crbegin() const noexcept { return list.crbegin(); }
-    inline const_reverse_iterator crend() const noexcept { return list.crend(); }
-
-    inline iterator insert_before(iterator it, value_type& value) noexcept {
-        if (it == list.begin()) {
-            list.push_front(value);
-            return list.begin();
-        }
-        auto const index = std::distance(list.begin(), it - 1);
-        list.insert(it - 1, value);
-        return list.begin() + index;
-    }
-
-    std::deque<value_type> list;
-
-};
-}
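While it is on its way out, note that the removed insert_before looks off for mid-list inserts: `list.insert(it - 1, value)` places the element before the previous entry, and the index taken from `it - 1` does not point at the inserted slot. A corrected sketch of the presumable intent (std::deque::insert already inserts before the given iterator, including begin(), and returns an iterator to the new element):

    // Insert immediately before `it`; return an iterator to the new element.
    iterator insert_before(iterator it, value_type& value) noexcept {
        return list.insert(it, value);
    }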
@@ -2947,19 +2947,10 @@ public:
         block.SetTerminal(terminal);
     }
 
-    void SetInsertionPointBefore(IR::Inst* new_insertion_point) {
-        insertion_point = IR::Block::iterator{*new_insertion_point};
-    }
-
     void SetInsertionPointBefore(IR::Block::iterator new_insertion_point) {
         insertion_point = new_insertion_point;
     }
 
-    void SetInsertionPointAfter(IR::Inst* new_insertion_point) {
-        insertion_point = IR::Block::iterator{*new_insertion_point};
-        ++insertion_point;
-    }
-
     void SetInsertionPointAfter(IR::Block::iterator new_insertion_point) {
         insertion_point = new_insertion_point;
         ++insertion_point;
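Dropping the IR::Inst* overloads explains the call-site churn in the passes below: an insertion point is now always expressed as a block iterator. The migration is mechanical, sketched here with the names used by those passes:

    // Before (removed overload): position the emitter from a raw pointer.
    ir.SetInsertionPointBefore(&inst);

    // After: pass the iterator the surrounding loop already holds.
    ir.SetInsertionPointBefore(iter);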
@@ -22,8 +22,7 @@ namespace Dynarmic::Optimization {
 namespace {
 
 void FlagsPass(IR::Block& block) {
-    using Iterator = std::reverse_iterator<IR::Block::iterator>;
-
+    using Iterator = IR::Block::reverse_iterator;
     struct FlagInfo {
         bool set_not_required = false;
         bool has_value_request = false;

@@ -185,10 +184,10 @@ void RegisterPass(IR::Block& block) {
     using Iterator = IR::Block::iterator;
 
     struct RegInfo {
-        IR::Value register_value;
         std::optional<Iterator> last_set_instruction;
+        IR::Value register_value;
     };
-    std::array<RegInfo, 15> reg_info;
+    alignas(64) std::array<RegInfo, 15> reg_info;
 
     const auto do_get = [](RegInfo& info, Iterator get_inst) {
         if (info.register_value.IsEmpty()) {

@@ -203,12 +202,12 @@ void RegisterPass(IR::Block& block) {
             (*info.last_set_instruction)->Invalidate();
         }
         info = {
-            .register_value = value,
             .last_set_instruction = set_inst,
+            .register_value = value,
         };
     };
 
-    enum class ExtValueType {
+    enum class ExtValueType : std::uint8_t {
         Empty,
         Single,
         Double,

@@ -216,19 +215,20 @@ void RegisterPass(IR::Block& block) {
         VectorQuad,
     };
     struct ExtRegInfo {
-        ExtValueType value_type = {};
-        IR::Value register_value;
         std::optional<Iterator> last_set_instruction;
+        IR::Value register_value;
+        ExtValueType value_type = {};
     };
-    std::array<ExtRegInfo, 64> ext_reg_info;
+    // Max returned by RegNumber = 31 (but multiplied by 4 in some cases)
+    alignas(64) std::array<ExtRegInfo, 128> ext_reg_info;
 
     const auto do_ext_get = [](ExtValueType type, std::initializer_list<std::reference_wrapper<ExtRegInfo>> infos, Iterator get_inst) {
         if (!std::all_of(infos.begin(), infos.end(), [type](const auto& info) { return info.get().value_type == type; })) {
             for (auto& info : infos) {
                 info.get() = {
-                    .value_type = type,
                     .register_value = IR::Value(&*get_inst),
                     .last_set_instruction = std::nullopt,
+                    .value_type = type,
                 };
             }
             return;

@@ -244,9 +244,9 @@ void RegisterPass(IR::Block& block) {
         }
         for (auto& info : infos) {
             info.get() = {
-                .value_type = type,
                 .register_value = value,
                 .last_set_instruction = set_inst,
+                .value_type = type,
             };
         }
     };
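The member reordering, the uint8_t-backed enum, and alignas(64) are cache-layout tweaks: the hot lookup arrays now start on a cache-line boundary and the one-byte discriminant sits after the larger members. A minimal sketch of the idea, assuming 64-byte lines and stand-in member types (the actual win depends on the real IR::Value and iterator sizes):

    #include <array>
    #include <cstdint>

    enum class ExtValueType : std::uint8_t { Empty, Single, Double };

    struct ExtRegInfo {
        void* last_set_instruction;    // stand-ins for the IR types above
        void* register_value;
        ExtValueType value_type = {};  // 1-byte discriminant placed last
    };

    // Start the hot array on an assumed 64-byte cache-line boundary.
    alignas(64) std::array<ExtRegInfo, 128> ext_reg_info{};
    static_assert(alignof(decltype(ext_reg_info)) == 64);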
@@ -17,7 +17,8 @@ void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf) {
         return;
     }
 
-    for (auto& inst : block) {
+    for (auto iter = block.begin(); iter != block.end(); iter++) {
+        auto& inst = *iter;
         if (inst.GetOpcode() != IR::Opcode::A64DataCacheOperationRaised) {
             continue;
         }

@@ -26,7 +27,7 @@ void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf) {
         if (op == A64::DataCacheOperation::ZeroByVA) {
             A64::IREmitter ir{block};
             ir.current_location = A64::LocationDescriptor{IR::LocationDescriptor{inst.GetArg(0).GetU64()}};
-            ir.SetInsertionPointBefore(&inst);
+            ir.SetInsertionPointBefore(iter);
 
             size_t bytes = 4 << static_cast<size_t>(conf.dczid_el0 & 0b1111);
             IR::U64 addr{inst.GetArg(2)};
@@ -22,9 +22,9 @@ using Op = Dynarmic::IR::Opcode;
 
 namespace {
 
-// Tiny helper to avoid the need to store based off the opcode
-// bit size all over the place within folding functions.
-void ReplaceUsesWith(IR::Inst& inst, bool is_32_bit, u64 value) {
+/// Tiny helper to avoid the need to store based off the opcode
+/// bit size all over the place within folding functions.
+static void ReplaceUsesWith(IR::Inst& inst, bool is_32_bit, u64 value) {
     if (is_32_bit) {
         inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)});
     } else {

@@ -32,12 +32,12 @@ void ReplaceUsesWith(IR::Inst& inst, bool is_32_bit, u64 value) {
     }
 }
 
-IR::Value Value(bool is_32_bit, u64 value) {
+static IR::Value Value(bool is_32_bit, u64 value) {
     return is_32_bit ? IR::Value{static_cast<u32>(value)} : IR::Value{value};
 }
 
 template<typename ImmFn>
-bool FoldCommutative(IR::Inst& inst, bool is_32_bit, ImmFn imm_fn) {
+static bool FoldCommutative(IR::Inst& inst, bool is_32_bit, ImmFn imm_fn) {
     const auto lhs = inst.GetArg(0);
     const auto rhs = inst.GetArg(1);
 

@@ -75,7 +75,7 @@ bool FoldCommutative(IR::Inst& inst, bool is_32_bit, ImmFn imm_fn) {
     return true;
 }
 
-void FoldAdd(IR::Inst& inst, bool is_32_bit) {
+static void FoldAdd(IR::Inst& inst, bool is_32_bit) {
     const auto lhs = inst.GetArg(0);
     const auto rhs = inst.GetArg(1);
     const auto carry = inst.GetArg(2);

@@ -125,7 +125,7 @@ void FoldAdd(IR::Inst& inst, bool is_32_bit) {
 /// 4. x & y -> y (where x has all bits set to 1)
 /// 5. x & y -> x (where y has all bits set to 1)
 ///
-void FoldAND(IR::Inst& inst, bool is_32_bit) {
+static void FoldAND(IR::Inst& inst, bool is_32_bit) {
     if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a & b; })) {
         const auto rhs = inst.GetArg(1);
         if (rhs.IsZero()) {

@@ -140,7 +140,7 @@ void FoldAND(IR::Inst& inst, bool is_32_bit) {
 ///
 /// 1. imm -> swap(imm)
 ///
-void FoldByteReverse(IR::Inst& inst, Op op) {
+static void FoldByteReverse(IR::Inst& inst, Op op) {
     const auto operand = inst.GetArg(0);
 
     if (!operand.IsImmediate()) {

@@ -165,7 +165,7 @@ void FoldByteReverse(IR::Inst& inst, Op op) {
 /// 2. imm_x / imm_y -> result
 /// 3. x / 1 -> x
 ///
-void FoldDivide(IR::Inst& inst, bool is_32_bit, bool is_signed) {
+static void FoldDivide(IR::Inst& inst, bool is_32_bit, bool is_signed) {
     const auto rhs = inst.GetArg(1);
 
     if (rhs.IsZero()) {

@@ -193,7 +193,7 @@ void FoldDivide(IR::Inst& inst, bool is_32_bit, bool is_signed) {
 // 2. x ^ 0 -> x
 // 3. 0 ^ y -> y
 //
-void FoldEOR(IR::Inst& inst, bool is_32_bit) {
+static void FoldEOR(IR::Inst& inst, bool is_32_bit) {
     if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a ^ b; })) {
         const auto rhs = inst.GetArg(1);
         if (rhs.IsZero()) {

@@ -202,7 +202,7 @@
         }
     }
 }
 
-void FoldLeastSignificantByte(IR::Inst& inst) {
+static void FoldLeastSignificantByte(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -211,7 +211,7 @@
     inst.ReplaceUsesWith(IR::Value{static_cast<u8>(operand.GetImmediateAsU64())});
 }
 
-void FoldLeastSignificantHalf(IR::Inst& inst) {
+static void FoldLeastSignificantHalf(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -220,7 +220,7 @@
     inst.ReplaceUsesWith(IR::Value{static_cast<u16>(operand.GetImmediateAsU64())});
 }
 
-void FoldLeastSignificantWord(IR::Inst& inst) {
+static void FoldLeastSignificantWord(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -229,7 +229,7 @@
     inst.ReplaceUsesWith(IR::Value{static_cast<u32>(operand.GetImmediateAsU64())});
 }
 
-void FoldMostSignificantBit(IR::Inst& inst) {
+static void FoldMostSignificantBit(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -238,7 +238,7 @@
     inst.ReplaceUsesWith(IR::Value{(operand.GetImmediateAsU64() >> 31) != 0});
 }
 
-void FoldMostSignificantWord(IR::Inst& inst) {
+static void FoldMostSignificantWord(IR::Inst& inst) {
     IR::Inst* carry_inst = inst.GetAssociatedPseudoOperation(Op::GetCarryFromOp);
 
     if (!inst.AreAllArgsImmediates()) {

@@ -260,7 +260,7 @@
 // 4. x * 1 -> x
 // 5. 1 * y -> y
 //
-void FoldMultiply(IR::Inst& inst, bool is_32_bit) {
+static void FoldMultiply(IR::Inst& inst, bool is_32_bit) {
     if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a * b; })) {
         const auto rhs = inst.GetArg(1);
         if (rhs.IsZero()) {

@@ -272,7 +272,7 @@
 }
 
 // Folds NOT operations if the contained value is an immediate.
-void FoldNOT(IR::Inst& inst, bool is_32_bit) {
+static void FoldNOT(IR::Inst& inst, bool is_32_bit) {
     const auto operand = inst.GetArg(0);
 
     if (!operand.IsImmediate()) {

@@ -289,7 +289,7 @@
 // 2. x | 0 -> x
 // 3. 0 | y -> y
 //
-void FoldOR(IR::Inst& inst, bool is_32_bit) {
+static void FoldOR(IR::Inst& inst, bool is_32_bit) {
     if (FoldCommutative(inst, is_32_bit, [](u64 a, u64 b) { return a | b; })) {
         const auto rhs = inst.GetArg(1);
         if (rhs.IsZero()) {

@@ -298,7 +298,7 @@
     }
 }
 
-bool FoldShifts(IR::Inst& inst) {
+static bool FoldShifts(IR::Inst& inst) {
     IR::Inst* carry_inst = inst.GetAssociatedPseudoOperation(Op::GetCarryFromOp);
 
     // The 32-bit variants can contain 3 arguments, while the

@@ -328,7 +328,7 @@
     return true;
 }
 
-void FoldSignExtendXToWord(IR::Inst& inst) {
+static void FoldSignExtendXToWord(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -337,7 +337,7 @@
     inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)});
 }
 
-void FoldSignExtendXToLong(IR::Inst& inst) {
+static void FoldSignExtendXToLong(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -346,7 +346,7 @@
     inst.ReplaceUsesWith(IR::Value{static_cast<u64>(value)});
 }
 
-void FoldSub(IR::Inst& inst, bool is_32_bit) {
+static void FoldSub(IR::Inst& inst, bool is_32_bit) {
     if (!inst.AreAllArgsImmediates() || inst.HasAssociatedPseudoOperation()) {
         return;
     }

@@ -359,7 +359,7 @@
     ReplaceUsesWith(inst, is_32_bit, result);
 }
 
-void FoldZeroExtendXToWord(IR::Inst& inst) {
+static void FoldZeroExtendXToWord(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }

@@ -368,7 +368,7 @@
     inst.ReplaceUsesWith(IR::Value{static_cast<u32>(value)});
 }
 
-void FoldZeroExtendXToLong(IR::Inst& inst) {
+static void FoldZeroExtendXToLong(IR::Inst& inst) {
     if (!inst.AreAllArgsImmediates()) {
         return;
     }
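One note on this batch: every helper already sits in an anonymous namespace, which gives internal linkage by itself, so the added `static` changes nothing for the linker and only spells the intent out at each definition:

    namespace {

    static int HelperA() { return 1; }  // static + unnamed namespace (as in this diff)
    int HelperB() { return 2; }         // same internal linkage without `static`

    } // namespace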
@@ -13,11 +13,8 @@ namespace Dynarmic::Optimization {
 
 void IdentityRemovalPass(IR::Block& block) {
     std::vector<IR::Inst*> to_invalidate;
-
-    auto iter = block.begin();
-    while (iter != block.end()) {
+    for (auto iter = block.begin(); iter != block.end(); ) {
         IR::Inst& inst = *iter;
-
         const size_t num_args = inst.NumArgs();
         for (size_t i = 0; i < num_args; i++) {
             while (true) {

@@ -27,18 +24,15 @@ void IdentityRemovalPass(IR::Block& block) {
                 inst.SetArg(i, arg.GetInst()->GetArg(0));
             }
         }
-
         if (inst.GetOpcode() == IR::Opcode::Identity || inst.GetOpcode() == IR::Opcode::Void) {
-            iter = block.Instructions().erase(inst);
+            iter = block.Instructions().erase(iter);
             to_invalidate.push_back(&inst);
         } else {
             ++iter;
         }
     }
-
-    for (IR::Inst* inst : to_invalidate) {
+    for (auto* inst : to_invalidate)
         inst->Invalidate();
-    }
 }
 
 } // namespace Dynarmic::Optimization
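The rewritten loop is the standard erase-while-iterating idiom: erase() hands back the iterator to the next element, so the pass advances without ever touching an invalidated iterator, and Invalidate() stays deferred until the walk is done. Generic shape of the idiom (container and predicate are illustrative):

    // Erase-while-iterating: advance via erase()'s return value.
    for (auto it = container.begin(); it != container.end();) {
        if (ShouldRemove(*it))
            it = container.erase(it);  // iterator to the next element
        else
            ++it;
    }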
@@ -13,7 +13,7 @@ namespace Dynarmic::Optimization {
 
 namespace {
 
-void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {
+static void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {
     const IR::U128 x = (IR::U128)inst.GetArg(0);
     const IR::U128 y = (IR::U128)inst.GetArg(1);
 

@@ -37,13 +37,14 @@ void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {
     inst.ReplaceUsesWith(result);
 }
 
-void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
+static void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
     const IR::U128 x = (IR::U128)inst.GetArg(0);
     const IR::U128 y = (IR::U128)inst.GetArg(1);
     const IR::U128 z = (IR::U128)inst.GetArg(2);
 
     const IR::U128 T0 = ir.VectorExtract(y, z, 32);
 
+    // TODO: this can use better pipelining m8
     const IR::U128 lower_half = [&] {
         const IR::U128 T = ir.VectorRotateWholeVectorRight(z, 64);
         const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);

@@ -73,15 +74,15 @@ void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
     inst.ReplaceUsesWith(result);
 }
 
-IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+static IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
     return ir.Eor(ir.And(ir.Eor(y, z), x), z);
 }
 
-IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
+static IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
     return ir.Or(ir.And(x, y), ir.And(ir.Or(x, y), z));
 }
 
-IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {
+static IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {
     const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(2));
     const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(13));
     const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(22));

@@ -89,7 +90,7 @@ IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {
     return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
 }
 
-IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {
+static IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {
     const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(6));
     const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(11));
     const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(25));

@@ -97,7 +98,7 @@ IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {
     return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
 }
 
-void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
+static void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
     IR::U128 x = (IR::U128)inst.GetArg(0);
     IR::U128 y = (IR::U128)inst.GetArg(1);
     const IR::U128 w = (IR::U128)inst.GetArg(2);

@@ -139,7 +140,7 @@ void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
 }
 
 template<size_t esize, bool is_signed>
-void PolyfillVectorMultiplyWiden(IR::IREmitter& ir, IR::Inst& inst) {
+static void PolyfillVectorMultiplyWiden(IR::IREmitter& ir, IR::Inst& inst) {
     IR::U128 n = (IR::U128)inst.GetArg(0);
     IR::U128 m = (IR::U128)inst.GetArg(1);
 

@@ -159,54 +160,52 @@ void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {
     }
 
     IR::IREmitter ir{block};
 
-    for (auto& inst : block) {
-        ir.SetInsertionPointBefore(&inst);
-
-        switch (inst.GetOpcode()) {
+    for (auto iter = block.begin(); iter != block.end(); iter++) {
+        ir.SetInsertionPointBefore(iter);
+        switch (iter->GetOpcode()) {
         case IR::Opcode::SHA256MessageSchedule0:
             if (polyfill.sha256) {
-                PolyfillSHA256MessageSchedule0(ir, inst);
+                PolyfillSHA256MessageSchedule0(ir, *iter);
             }
             break;
         case IR::Opcode::SHA256MessageSchedule1:
             if (polyfill.sha256) {
-                PolyfillSHA256MessageSchedule1(ir, inst);
+                PolyfillSHA256MessageSchedule1(ir, *iter);
             }
             break;
         case IR::Opcode::SHA256Hash:
             if (polyfill.sha256) {
-                PolyfillSHA256Hash(ir, inst);
+                PolyfillSHA256Hash(ir, *iter);
             }
             break;
         case IR::Opcode::VectorMultiplySignedWiden8:
             if (polyfill.vector_multiply_widen) {
-                PolyfillVectorMultiplyWiden<8, true>(ir, inst);
+                PolyfillVectorMultiplyWiden<8, true>(ir, *iter);
             }
             break;
         case IR::Opcode::VectorMultiplySignedWiden16:
             if (polyfill.vector_multiply_widen) {
-                PolyfillVectorMultiplyWiden<16, true>(ir, inst);
+                PolyfillVectorMultiplyWiden<16, true>(ir, *iter);
             }
             break;
        case IR::Opcode::VectorMultiplySignedWiden32:
            if (polyfill.vector_multiply_widen) {
-                PolyfillVectorMultiplyWiden<32, true>(ir, inst);
+                PolyfillVectorMultiplyWiden<32, true>(ir, *iter);
            }
            break;
        case IR::Opcode::VectorMultiplyUnsignedWiden8:
            if (polyfill.vector_multiply_widen) {
-                PolyfillVectorMultiplyWiden<8, false>(ir, inst);
+                PolyfillVectorMultiplyWiden<8, false>(ir, *iter);
            }
            break;
        case IR::Opcode::VectorMultiplyUnsignedWiden16:
            if (polyfill.vector_multiply_widen) {
-                PolyfillVectorMultiplyWiden<16, false>(ir, inst);
+                PolyfillVectorMultiplyWiden<16, false>(ir, *iter);
            }
            break;
        case IR::Opcode::VectorMultiplyUnsignedWiden32:
            if (polyfill.vector_multiply_widen) {
-                PolyfillVectorMultiplyWiden<32, false>(ir, inst);
+                PolyfillVectorMultiplyWiden<32, false>(ir, *iter);
            }
            break;
        default:
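An aside on the SHA helpers made static above: SHAchoose computes the FIPS 180-4 "choose" function Ch(x, y, z) = (x & y) ^ (~x & z) in its reduced form z ^ (x & (y ^ z)), and SHAmajority computes Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) as (x & y) | ((x | y) & z); both rewrites are standard bitwise identities. A plain-integer spot check:

    #include <cassert>
    #include <cstdint>

    using u32 = std::uint32_t;

    static u32 ChRef(u32 x, u32 y, u32 z)  { return (x & y) ^ (~x & z); }
    static u32 ChOpt(u32 x, u32 y, u32 z)  { return z ^ (x & (y ^ z)); }       // SHAchoose's form
    static u32 MajRef(u32 x, u32 y, u32 z) { return (x & y) ^ (x & z) ^ (y & z); }
    static u32 MajOpt(u32 x, u32 y, u32 z) { return (x & y) | ((x | y) & z); }  // SHAmajority's form

    int main() {
        for (u32 x : {0u, 0xFFFFFFFFu, 0xDEADBEEFu})
            for (u32 y : {0u, 0xFFFFFFFFu, 0x12345678u})
                for (u32 z : {0u, 0xFFFFFFFFu, 0x0F0F0F0Fu}) {
                    assert(ChRef(x, y, z) == ChOpt(x, y, z));
                    assert(MajRef(x, y, z) == MajOpt(x, y, z));
                }
    }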