PORT: NCE: Initial ryujinx host mapped memory

This commit is contained in:
MrPurple666 2025-05-19 00:36:07 -03:00
parent d3510b1397
commit 752945ee31
4 changed files with 371 additions and 18 deletions

View file

@@ -3,7 +3,9 @@
#include <cinttypes>
#include <memory>
#include <sys/mman.h>
#include "common/logging/log.h"
#include "common/signal_chain.h"
#include "core/arm/nce/arm_nce.h"
#include "core/arm/nce/interpreter_visitor.h"
@@ -43,6 +45,54 @@ constexpr u32 StackSize = 128_KiB;
} // namespace
// Implementation of the enhanced features inspired by Ryujinx NCE
void ArmNce::SetupAlternateSignalStack() {
    // Create an alternate stack for signal handling
    // This ensures we have a clean stack for handling signals even if the guest stack is corrupted
    m_alt_signal_stack = std::make_unique<u8[]>(AlternateStackSize);

    stack_t ss{};
    ss.ss_sp = m_alt_signal_stack.get();
    ss.ss_size = AlternateStackSize;
    ss.ss_flags = 0;

    if (sigaltstack(&ss, nullptr) != 0) {
        LOG_ERROR(Core_ARM, "Failed to setup alternate signal stack: {}", strerror(errno));
    } else {
        LOG_DEBUG(Core_ARM, "Alternate signal stack set up successfully");
    }
}

void ArmNce::CleanupAlternateSignalStack() {
    if (m_alt_signal_stack) {
        stack_t ss{};
        ss.ss_flags = SS_DISABLE;
        if (sigaltstack(&ss, nullptr) != 0) {
            LOG_ERROR(Core_ARM, "Failed to disable alternate signal stack: {}", strerror(errno));
        }
        m_alt_signal_stack.reset();
    }
}
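Note that sigaltstack() only registers the alternate stack; a handler actually runs on it only when it is installed with SA_ONSTACK. A minimal sketch of the kind of registration this relies on (the helper name and the particular signal are illustrative, not taken from this commit):

// Illustrative only: install a handler so it runs on the stack registered above.
// Requires <csignal> / <signal.h>.
static void InstallHandlerOnAltStack(int signum, void (*handler)(int, siginfo_t*, void*)) {
    struct sigaction action{};
    action.sa_sigaction = handler;
    // SA_ONSTACK tells the kernel to switch to the sigaltstack() stack before invoking
    // the handler, so a corrupted guest stack cannot take the handler down with it.
    action.sa_flags = SA_SIGINFO | SA_ONSTACK;
    sigemptyset(&action.sa_mask);
    sigaction(signum, &action, nullptr);
}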
bool ArmNce::HandleThreadInterrupt(GuestContext* ctx) {
    // Check if an interrupt was requested
    if (ctx->interrupt_requested.load(std::memory_order_acquire) != 0) {
        // Clear the interrupt request
        ctx->interrupt_requested.store(0, std::memory_order_release);

        // Add break loop reason to indicate we should exit
        ctx->esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));

        // Indicate we handled an interrupt
        return true;
    }
    return false;
}
void* ArmNce::RestoreGuestContext(void* raw_context) {
    // Retrieve the host context.
    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
@@ -268,9 +318,18 @@ void ArmNce::SetSvcArguments(std::span<const uint64_t, 8> args) {
ArmNce::ArmNce(System& system, bool uses_wall_clock, std::size_t core_index)
    : ArmInterface{uses_wall_clock}, m_system{system}, m_core_index{core_index} {
    m_guest_ctx.system = &m_system;
    m_guest_ctx.parent = this;

    // Initialize as being in managed code
    m_guest_ctx.in_managed.store(1, std::memory_order_release);
}

ArmNce::~ArmNce() {
    // Clean up alternate signal stack
    CleanupAlternateSignalStack();

    // Host mapped memory will be cleaned up by its destructor
}
void ArmNce::Initialize() {
    if (m_thread_id == -1) {
@@ -287,6 +346,16 @@ void ArmNce::Initialize() {
        sigaltstack(&ss, nullptr);
    }
    // Set up alternate signal stack (Ryujinx-inspired enhancement)
    SetupAlternateSignalStack();

    // Initialize host-mapped memory for efficient access
    if (!m_host_mapped_memory) {
        auto& memory = m_system.ApplicationMemory();
        m_host_mapped_memory = std::make_unique<HostMappedMemory>(memory);
        LOG_DEBUG(Core_ARM, "Initialized host-mapped memory for NCE");
    }
    // Set up signals.
    static std::once_flag flag;
    std::call_once(flag, [] {
@@ -365,19 +434,23 @@ void ArmNce::SetContext(const Kernel::Svc::ThreadContext& ctx) {
}
void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
    // Mark that we're requesting an interrupt
    m_guest_ctx.interrupt_requested.store(1, std::memory_order_release);

    // Add break loop condition
    m_guest_ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));

    // Lock the thread context
    auto* params = &thread->GetNativeExecutionParameters();
    LockThreadParameters(params);

    // Only send a signal if the thread is running and not in managed code
    if (params->is_running && m_guest_ctx.in_managed.load(std::memory_order_acquire) == 0) {
        // Send signal to the running thread
        // The running thread will unlock the thread context
        syscall(SYS_tkill, m_thread_id, BreakFromRunCodeSignal);
    } else {
        // If the thread is no longer running or is in managed code, we unlock
        UnlockThreadParameters(params);
    }
}
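The in_managed / interrupt_requested pair is a small handshake: SignalInterrupt only raises BreakFromRunCodeSignal while the thread is executing native guest code, and a thread returning to managed code is expected to pick up pending requests by polling. A rough sketch of the run-loop side of that handshake, under the assumption that it brackets native execution (the function below is hypothetical; only GuestContext, HaltReason, and the check performed by HandleThreadInterrupt come from this commit):

// Hypothetical run-loop excerpt, not part of this commit.
HaltReason RunGuestCodeSketch(GuestContext& ctx) {
    // Leaving managed code: from here on, SignalInterrupt may deliver a signal.
    ctx.in_managed.store(0, std::memory_order_release);

    // ... guest code executes natively until a halt reason is recorded ...

    // Back in managed code: interrupts are now consumed by polling rather than signals.
    ctx.in_managed.store(1, std::memory_order_release);

    // Same check HandleThreadInterrupt performs: turn a pending request into a BreakLoop.
    if (ctx.interrupt_requested.exchange(0, std::memory_order_acq_rel) != 0) {
        ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
    }

    return static_cast<HaltReason>(ctx.esr_el1.exchange(0));
}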
@@ -402,21 +475,54 @@ void ArmNce::ClearInstructionCache() {
}
void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
#if defined(__GNUC__) || defined(__clang__)
    while (size > 0) {
        const std::size_t size_step = std::min(size, CACHE_PAGE_SIZE);

        // The __builtin___clear_cache intrinsic generates icache(i) invalidation and dcache(d)
        // write-back instructions targeting the range.
        char* addr_ptr = reinterpret_cast<char*>(addr);
        __builtin___clear_cache(addr_ptr, addr_ptr + size_step);

        addr += size_step;
        size -= size_step;
    }

    // Clear instruction cache after range invalidation
    this->ClearInstructionCache();
#endif
}
// Fast memory access template implementation (inspired by Ryujinx)
template <typename T>
T& ArmNce::GetHostRef(u64 guest_addr) {
    if (m_host_mapped_memory) {
        // Use the host-mapped memory for fast access
        try {
            return m_host_mapped_memory->GetRef<T>(guest_addr);
        } catch (const std::exception& e) {
            LOG_ERROR(Core_ARM, "Failed to get host reference: {}", e.what());
        }
    }

    // Fallback to slower memory access. Note that this returns a reference to a
    // thread-local copy, so writes through it do not reach guest memory.
    static thread_local T fallback{};
    m_system.ApplicationMemory().ReadBlock(guest_addr, &fallback, sizeof(T));
    return fallback;
}

// Explicit instantiations for common types
template u8& ArmNce::GetHostRef<u8>(u64);
template u16& ArmNce::GetHostRef<u16>(u64);
template u32& ArmNce::GetHostRef<u32>(u64);
template u64& ArmNce::GetHostRef<u64>(u64);
template s8& ArmNce::GetHostRef<s8>(u64);
template s16& ArmNce::GetHostRef<s16>(u64);
template s32& ArmNce::GetHostRef<s32>(u64);
template s64& ArmNce::GetHostRef<s64>(u64);
template f32& ArmNce::GetHostRef<f32>(u64);
template f64& ArmNce::GetHostRef<f64>(u64);
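A short usage sketch for the helper above (the calling function is hypothetical); the main caveat is that only the host-mapped path yields a reference into real guest memory:

// Hypothetical caller; only GetHostRef itself comes from this commit.
void ArmNce::TouchGuestWordSketch(u64 guest_addr) {
    // Fast path: a direct reference into the host-mapped page backing guest_addr.
    u32& word = GetHostRef<u32>(guest_addr);

    // Visible to the guest only if the address was host-mapped; the fallback path
    // returns a thread-local copy, so writes through it are not propagated.
    word += 1;
}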
} // namespace Core

View file

@@ -7,6 +7,7 @@
#include "core/arm/arm_interface.h"
#include "core/arm/nce/guest_context.h"
#include "core/arm/nce/host_mapped_memory.h"

namespace Core::Memory {
class Memory;
@@ -52,6 +53,10 @@ protected:
    void RewindBreakpointInstruction() override {}

    // Fast memory access using host-mapped memory (inspired by Ryujinx)
    template <typename T>
    T& GetHostRef(u64 guest_addr);
private:
    // Assembly definitions.
    static HaltReason ReturnToRunCodeByTrampoline(void* tpidr, GuestContext* ctx,
@@ -67,6 +72,13 @@ private:
    static void LockThreadParameters(void* tpidr);
    static void UnlockThreadParameters(void* tpidr);

    // Alternate stack management (inspired by Ryujinx)
    void SetupAlternateSignalStack();
    void CleanupAlternateSignalStack();

    // Enhanced signal handling
    static bool HandleThreadInterrupt(GuestContext* ctx);
private:
    // C++ implementation functions for assembly definitions.
    static void* RestoreGuestContext(void* raw_context);
@@ -90,6 +102,13 @@ public:
    // Stack for signal processing.
    std::unique_ptr<u8[]> m_stack{};

    // Alternate signal stack (inspired by Ryujinx)
    static constexpr size_t AlternateStackSize = 16384;
    std::unique_ptr<u8[]> m_alt_signal_stack{};

    // Host mapped memory for efficient access (inspired by Ryujinx)
    std::unique_ptr<HostMappedMemory> m_host_mapped_memory{};
};

} // namespace Core

View file

@@ -38,6 +38,12 @@ struct GuestContext {
    u32 svc{};
    System* system{};
    ArmNce* parent{};

    // Enhanced thread control (inspired by Ryujinx)
    std::atomic<u32> in_managed{1};          // 1 when in managed code, 0 when in native
    std::atomic<u32> interrupt_requested{0}; // Set when interrupt requested
    pid_t host_thread_id{-1};                // Host thread ID for signaling
    u64 ctr_el0{0x8444c004};                 // Cache type register
};

// Verify assembly offsets.

View file

@@ -0,0 +1,222 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once

#include <array>
#include <cerrno>
#include <cstring>
#include <span>
#include <stdexcept>
#include <string>
#include <sys/mman.h>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include "common/common_types.h"
#include "common/logging/log.h"
#include "common/page_table.h"
namespace Core::Memory {
class Memory;
}
namespace Core {
/**
* HostMappedMemory provides direct host-mapped memory access for NCE
* This is inspired by Ryujinx's MemoryManagerNative for faster memory operations
*/
class HostMappedMemory {
public:
    explicit inline HostMappedMemory(Memory::Memory& memory) : memory{memory} {}

    inline ~HostMappedMemory() {
        // Unmap all allocations
        for (void* allocation : allocations) {
            if (munmap(allocation, page_size) != 0) {
                LOG_ERROR(Core_ARM, "Failed to unmap allocation at {:p}: {}", allocation,
                          std::strerror(errno));
            }
        }
    }
    /**
     * Maps a guest memory region to host memory
     * @param guest_addr Guest virtual address to map
     * @param size Size of the region to map
     * @return True if the mapping succeeded
     */
    inline bool MapRegion(u64 guest_addr, u64 size) {
        const u64 start_page = guest_addr >> page_bits;
        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;

        for (u64 page = start_page; page < end_page; page++) {
            const u64 current_addr = page << page_bits;

            // Skip if already mapped
            if (page_table.contains(page)) {
                continue;
            }

            // Allocate memory for this page
            void* allocation = mmap(nullptr, page_size, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (allocation == MAP_FAILED) {
                LOG_ERROR(Core_ARM, "Failed to allocate host page for guest address {:016X}: {}",
                          current_addr, std::strerror(errno));
                return false;
            }

            // Copy data from guest memory to our allocation
            std::array<u8, page_size> data;

            // Try to read the memory from guest
            const bool result = memory.ReadBlock(current_addr, data.data(), page_size);
            if (!result) {
                LOG_ERROR(Core_ARM, "Failed to read memory block at {:016X}", current_addr);
                munmap(allocation, page_size);
                return false;
            }

            // Copy to our allocation
            std::memcpy(allocation, data.data(), page_size);

            // Store the allocation
            page_table[page] = static_cast<u8*>(allocation);
            allocations.push_back(allocation);
        }

        return true;
    }
    /**
     * Unmaps a previously mapped guest memory region
     * @param guest_addr Guest virtual address to unmap
     * @param size Size of the region to unmap
     */
    inline void UnmapRegion(u64 guest_addr, u64 size) {
        const u64 start_page = guest_addr >> page_bits;
        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;

        for (u64 page = start_page; page < end_page; page++) {
            // Skip if not mapped
            auto it = page_table.find(page);
            if (it == page_table.end()) {
                continue;
            }

            u8* host_ptr = it->second;

            // Try to write the memory back to guest
            const u64 current_addr = page << page_bits;
            memory.WriteBlock(current_addr, host_ptr, page_size);

            // Remove from page table
            page_table.erase(it);

            // Don't unmap immediately - we'll do that in the destructor
            // to avoid potential reuse problems
        }
    }
    /**
     * Gets a typed reference to memory at the specified guest address
     * @tparam T Type of the reference to return
     * @param guest_addr Guest virtual address
     * @return Reference to the memory at the specified address
     * @note The value must lie within a single mapped page; each guest page is backed by
     *       its own host allocation, so references cannot span a page boundary
     */
    template <typename T>
    T& GetRef(u64 guest_addr) {
        static_assert(std::is_trivially_copyable_v<T>, "T must be trivially copyable");

        // Reject values that would straddle a page boundary: adjacent guest pages are not
        // adjacent in host memory, so a spanning reference would read out of bounds.
        const u64 page_offset = guest_addr & page_mask;
        if (page_offset + sizeof(T) > page_size) {
            throw std::runtime_error("GetRef requested on a value spanning a page boundary");
        }

        // TranslateAddress throws if the page is not mapped.
        return *reinterpret_cast<T*>(TranslateAddress(guest_addr));
    }
    /**
     * Gets a span over memory at the specified guest address
     * @param guest_addr Guest virtual address
     * @param size Size of the span
     * @return Span over the memory at the specified address
     * @note The range must be mapped and must not cross a page boundary, since host pages
     *       are separate allocations and are not contiguous
     */
    inline std::span<u8> GetSpan(u64 guest_addr, u64 size) {
        const u64 page_offset = guest_addr & page_mask;
        if (page_offset + size > page_size) {
            throw std::runtime_error("GetSpan requested on a range spanning a page boundary");
        }
        if (!IsMapped(guest_addr)) {
            throw std::runtime_error("GetSpan requested on unmapped memory region");
        }
        return std::span<u8>(TranslateAddress(guest_addr), size);
    }
    /**
     * Checks if an address is mapped
     * @param guest_addr Guest virtual address to check
     * @return True if the address is mapped
     */
    inline bool IsMapped(u64 guest_addr) const {
        const u64 page = guest_addr >> page_bits;
        return page_table.contains(page);
    }
    /**
     * Checks whether every page in a guest memory range is mapped
     * @param guest_addr Starting guest virtual address
     * @param size Size of the region to check
     * @return True if every page in the range is mapped
     * @note Mapped pages are backed by separate host allocations, so a mapped range is
     *       not necessarily contiguous in host memory
     */
    inline bool IsRangeMapped(u64 guest_addr, u64 size) const {
        const u64 start_page = guest_addr >> page_bits;
        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;

        for (u64 page = start_page; page < end_page; page++) {
            if (!page_table.contains(page)) {
                return false;
            }
        }
        return true;
    }
    /**
     * Gets the host address for a guest virtual address
     * @param guest_addr Guest virtual address to translate
     * @return Host address corresponding to the guest address
     */
    inline u8* TranslateAddress(u64 guest_addr) {
        const u64 page = guest_addr >> page_bits;
        const u64 offset = guest_addr & page_mask;

        auto it = page_table.find(page);
        if (it == page_table.end()) {
            throw std::runtime_error(
                fmt::format("Tried to translate unmapped address {:016X}", guest_addr));
        }
        return it->second + offset;
    }
private:
    static constexpr u64 page_bits = 12;
    static constexpr u64 page_size = 1ULL << page_bits;
    static constexpr u64 page_mask = page_size - 1;

    Memory::Memory& memory;

    // Page table mapping guest pages to host addresses
    std::unordered_map<u64, u8*> page_table;

    // Allocation pool for mapped regions
    std::vector<void*> allocations;
};
} // namespace Core
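Taken together, the intended call pattern for HostMappedMemory appears to be map, access, then unmap (which writes the pages back to guest memory). A small usage sketch under that assumption; the function, address, and size below are illustrative:

// Illustrative only: exercise the HostMappedMemory API defined above.
void HostMappedMemoryUsageSketch(Core::Memory::Memory& guest_memory) {
    Core::HostMappedMemory mapped{guest_memory};

    constexpr u64 guest_addr = 0x8000000; // hypothetical guest virtual address
    constexpr u64 region_size = 0x2000;   // two 4 KiB pages

    // Copy the guest pages into private host allocations.
    if (!mapped.MapRegion(guest_addr, region_size)) {
        return;
    }

    // Read and write through a direct host reference (stays within one page).
    u32& value = mapped.GetRef<u32>(guest_addr + 0x10);
    value = 0xDEADBEEF;

    // UnmapRegion copies the modified host pages back into guest memory.
    mapped.UnmapRegion(guest_addr, region_size);
}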