Mirror of https://git.eden-emu.dev/eden-emu/eden.git
PORT: NCE: Initial ryujinx host mapped memory

commit 752945ee31
parent d3510b1397

4 changed files with 371 additions and 18 deletions
src/core/arm/nce/arm_nce.cpp

@ -3,7 +3,9 @@
 #include <cinttypes>
 #include <memory>
+#include <sys/mman.h>

+#include "common/logging/log.h"
 #include "common/signal_chain.h"
 #include "core/arm/nce/arm_nce.h"
 #include "core/arm/nce/interpreter_visitor.h"
@ -43,6 +45,54 @@ constexpr u32 StackSize = 128_KiB;

 } // namespace

+// Implementation of the enhanced features inspired by Ryujinx NCE
+
+void ArmNce::SetupAlternateSignalStack() {
+    // Create an alternate stack for signal handling. This ensures we have a clean
+    // stack for handling signals even if the guest stack is corrupted.
+    m_alt_signal_stack = std::make_unique<u8[]>(AlternateStackSize);
+
+    stack_t ss{};
+    ss.ss_sp = m_alt_signal_stack.get();
+    ss.ss_size = AlternateStackSize;
+    ss.ss_flags = 0;
+
+    if (sigaltstack(&ss, nullptr) != 0) {
+        LOG_ERROR(Core_ARM, "Failed to set up alternate signal stack: {}", strerror(errno));
+    } else {
+        LOG_DEBUG(Core_ARM, "Alternate signal stack set up successfully");
+    }
+}
+
+void ArmNce::CleanupAlternateSignalStack() {
+    if (m_alt_signal_stack) {
+        stack_t ss{};
+        ss.ss_flags = SS_DISABLE;
+
+        if (sigaltstack(&ss, nullptr) != 0) {
+            LOG_ERROR(Core_ARM, "Failed to disable alternate signal stack: {}", strerror(errno));
+        }
+
+        m_alt_signal_stack.reset();
+    }
+}
+
+bool ArmNce::HandleThreadInterrupt(GuestContext* ctx) {
+    // Check if an interrupt was requested
+    if (ctx->interrupt_requested.load(std::memory_order_acquire) != 0) {
+        // Clear the interrupt request
+        ctx->interrupt_requested.store(0, std::memory_order_release);
+
+        // Add the break-loop reason to indicate we should exit
+        ctx->esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
+
+        // Indicate we handled an interrupt
+        return true;
+    }
+
+    return false;
+}

 void* ArmNce::RestoreGuestContext(void* raw_context) {
     // Retrieve the host context.
     auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
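Note: the two functions above only create and tear down the alternate stack; delivery onto it is opted into per signal via SA_ONSTACK. A minimal sketch of such a registration follows, with a hypothetical handler name, since this diff does not show where its handlers are installed:

#include <signal.h>

// Illustrative handler; it runs on m_alt_signal_stack because SA_ONSTACK is set,
// so it stays usable even if the guest stack pointer is corrupted at delivery.
static void BreakHandler(int sig, siginfo_t* info, void* raw_context) {}

static void InstallBreakHandler(int signum) {
    struct sigaction sa{};
    sa.sa_flags = SA_SIGINFO | SA_ONSTACK; // SA_ONSTACK routes delivery to the sigaltstack
    sa.sa_sigaction = BreakHandler;
    sigemptyset(&sa.sa_mask);
    sigaction(signum, &sa, nullptr);
}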
@ -268,9 +318,18 @@ void ArmNce::SetSvcArguments(std::span<const uint64_t, 8> args) {

 ArmNce::ArmNce(System& system, bool uses_wall_clock, std::size_t core_index)
     : ArmInterface{uses_wall_clock}, m_system{system}, m_core_index{core_index} {
     m_guest_ctx.system = &m_system;
     m_guest_ctx.parent = this;
+
+    // Initialize as being in managed code
+    m_guest_ctx.in_managed.store(1, std::memory_order_release);
 }

-ArmNce::~ArmNce() = default;
+ArmNce::~ArmNce() {
+    // Clean up alternate signal stack
+    CleanupAlternateSignalStack();
+
+    // Host mapped memory will be cleaned up by its destructor
+}

 void ArmNce::Initialize() {
     if (m_thread_id == -1) {
@ -287,6 +346,16 @@ void ArmNce::Initialize() {

         sigaltstack(&ss, nullptr);
     }

+    // Set up alternate signal stack (Ryujinx-inspired enhancement)
+    SetupAlternateSignalStack();
+
+    // Initialize host-mapped memory for efficient access
+    if (!m_host_mapped_memory) {
+        auto& memory = m_system.ApplicationMemory();
+        m_host_mapped_memory = std::make_unique<HostMappedMemory>(memory);
+        LOG_DEBUG(Core_ARM, "Initialized host-mapped memory for NCE");
+    }
+
     // Set up signals.
     static std::once_flag flag;
     std::call_once(flag, [] {
@ -365,19 +434,23 @@ void ArmNce::SetContext(const Kernel::Svc::ThreadContext& ctx) {
 }

 void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
-    // Add break loop condition.
+    // Mark that we're requesting an interrupt
+    m_guest_ctx.interrupt_requested.store(1, std::memory_order_release);
+
+    // Add break loop condition
     m_guest_ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));

-    // Lock the thread context.
+    // Lock the thread context
     auto* params = &thread->GetNativeExecutionParameters();
     LockThreadParameters(params);

-    if (params->is_running) {
-        // We should signal to the running thread.
-        // The running thread will unlock the thread context.
+    // Only send a signal if the thread is running and not in managed code
+    if (params->is_running && m_guest_ctx.in_managed.load(std::memory_order_acquire) == 0) {
+        // Send signal to the running thread
+        // The running thread will unlock the thread context
         syscall(SYS_tkill, m_thread_id, BreakFromRunCodeSignal);
     } else {
-        // If the thread is no longer running, we have nothing to do.
+        // If the thread is no longer running or is in managed code, we unlock
         UnlockThreadParameters(params);
     }
 }
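Note: together with HandleThreadInterrupt above, the new fields form a small handshake around each guest execution slice. A sketch of the intended flow, assuming entry/exit points that toggle in_managed (the surrounding function is illustrative, not code from this diff):

// Illustrative guest slice, using the GuestContext fields added in this commit.
void RunGuestSlice(GuestContext* ctx) {
    ctx->in_managed.store(0, std::memory_order_release); // entering native guest code
    // ... guest code runs; SignalInterrupt() may tkill this thread here ...
    ctx->in_managed.store(1, std::memory_order_release); // back in managed host code

    // An interrupt requested while we were in managed code was deliberately not
    // signalled; consume it here instead, mirroring HandleThreadInterrupt.
    if (ctx->interrupt_requested.exchange(0, std::memory_order_acq_rel) != 0) {
        ctx->esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
    }
}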
@ -402,21 +475,54 @@ void ArmNce::ClearInstructionCache() {
 }

 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
 #if defined(__GNUC__) || defined(__clang__)
-    while (size > 0) {
-        const std::size_t size_step = std::min(size, CACHE_PAGE_SIZE);
-
-        // The __builtin___clear_cache intrinsic generates icache (i) invalidation and
-        // dcache (d) write-back instructions targeting the range.
-        char* addr_ptr = reinterpret_cast<char*>(addr);
-        __builtin___clear_cache(addr_ptr, addr_ptr + size_step);
-
-        addr += size_step;
-        size -= size_step;
-    }
+    // Align the start address to cache line boundary for better performance
+    const size_t CACHE_LINE_SIZE = 64;
+    addr &= ~(CACHE_LINE_SIZE - 1);
+
+    // Round up size to nearest cache line
+    size = (size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
+
+    // Prefetch the range to be invalidated
+    for (size_t offset = 0; offset < size; offset += CACHE_LINE_SIZE) {
+        __builtin_prefetch((void*)(addr + offset), 1, 3);
+    }
 #endif
+
+    // Clear instruction cache after range invalidation
     this->ClearInstructionCache();
 }
+
+// Fast memory access template implementation (inspired by Ryujinx)
+template <typename T>
+T& ArmNce::GetHostRef(u64 guest_addr) {
+    if (m_host_mapped_memory) {
+        // Use the host-mapped memory for fast access
+        try {
+            return m_host_mapped_memory->GetRef<T>(guest_addr);
+        } catch (const std::exception& e) {
+            LOG_ERROR(Core_ARM, "Failed to get host reference: {}", e.what());
+        }
+    }
+
+    // Fall back to slower memory access
+    T value{};
+    m_system.ApplicationMemory().ReadBlock(guest_addr, &value, sizeof(T));
+    static thread_local T fallback;
+    fallback = value;
+    return fallback;
+}
+
+// Explicit instantiations for common types
+template u8& ArmNce::GetHostRef<u8>(u64);
+template u16& ArmNce::GetHostRef<u16>(u64);
+template u32& ArmNce::GetHostRef<u32>(u64);
+template u64& ArmNce::GetHostRef<u64>(u64);
+template s8& ArmNce::GetHostRef<s8>(u64);
+template s16& ArmNce::GetHostRef<s16>(u64);
+template s32& ArmNce::GetHostRef<s32>(u64);
+template s64& ArmNce::GetHostRef<s64>(u64);
+template f32& ArmNce::GetHostRef<f32>(u64);
+template f64& ArmNce::GetHostRef<f64>(u64);

 } // namespace Core
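Note: the explicit instantiations above are required because GetHostRef<T> is declared in arm_nce.h but defined here; a T outside this list would fail at link time. Also visible in the code: the slow-path fallback returns a reference to a thread_local copy, so writes through that reference never reach guest memory. A hedged sketch of an explicit write-back helper a caller could pair with it (WriteGuestValue is hypothetical, not part of this commit):

#include <type_traits>

// Hypothetical helper: write a value back to guest memory explicitly instead of
// writing through the slow-path fallback reference, whose writes are lost.
template <typename T>
void WriteGuestValue(Core::Memory::Memory& memory, u64 guest_addr, const T& value) {
    static_assert(std::is_trivially_copyable_v<T>, "T must be trivially copyable");
    memory.WriteBlock(guest_addr, &value, sizeof(T));
}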
src/core/arm/nce/arm_nce.h

@ -7,6 +7,7 @@
 #include "core/arm/arm_interface.h"
 #include "core/arm/nce/guest_context.h"
+#include "core/arm/nce/host_mapped_memory.h"

 namespace Core::Memory {
 class Memory;

@ -52,6 +53,10 @@ protected:

     void RewindBreakpointInstruction() override {}

+    // Fast memory access using host-mapped memory (inspired by Ryujinx)
+    template <typename T>
+    T& GetHostRef(u64 guest_addr);
+
 private:
     // Assembly definitions.
     static HaltReason ReturnToRunCodeByTrampoline(void* tpidr, GuestContext* ctx,

@ -67,6 +72,13 @@ private:

     static void LockThreadParameters(void* tpidr);
     static void UnlockThreadParameters(void* tpidr);

+    // Alternate stack management (inspired by Ryujinx)
+    void SetupAlternateSignalStack();
+    void CleanupAlternateSignalStack();
+
+    // Enhanced signal handling
+    static bool HandleThreadInterrupt(GuestContext* ctx);
+
 private:
     // C++ implementation functions for assembly definitions.
     static void* RestoreGuestContext(void* raw_context);

@ -90,6 +102,13 @@ public:

     // Stack for signal processing.
     std::unique_ptr<u8[]> m_stack{};

+    // Alternate signal stack (inspired by Ryujinx)
+    static constexpr size_t AlternateStackSize = 16384;
+    std::unique_ptr<u8[]> m_alt_signal_stack{};
+
+    // Host mapped memory for efficient access (inspired by Ryujinx)
+    std::unique_ptr<HostMappedMemory> m_host_mapped_memory{};
 };

 } // namespace Core
src/core/arm/nce/guest_context.h

@ -38,6 +38,12 @@ struct GuestContext {
     u32 svc{};
     System* system{};
     ArmNce* parent{};
+
+    // Enhanced thread control (inspired by Ryujinx)
+    std::atomic<u32> in_managed{1};          // 1 when in managed code, 0 when in native
+    std::atomic<u32> interrupt_requested{0}; // Set when an interrupt is requested
+    pid_t host_thread_id{-1};                // Host thread ID for signaling
+    u64 ctr_el0{0x8444c004};                 // Cache type register
 };

 // Verify assembly offsets.
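Note: for reference, 0x8444c004 decodes as follows under the standard AArch64 CTR_EL0 field layout (my annotation, not part of the diff):

// ctr_el0 = 0x8444c004:
//   bit 31          = 1    RES1
//   IminLine[3:0]   = 4    minimum I-cache line of 2^4 words = 64 bytes
//   L1Ip[15:14]     = 0b11 PIPT instruction cache
//   DminLine[19:16] = 4    minimum D-cache line of 64 bytes
//   ERG[23:20]      = 4    exclusives reservation granule of 64 bytes
//   CWG[27:24]      = 4    cache writeback granule of 64 bytes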
src/core/arm/nce/host_mapped_memory.h (new file, 222 lines)

@ -0,0 +1,222 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include <array>
#include <cerrno>
#include <cstring>
#include <span>
#include <stdexcept>
#include <string>
#include <sys/mman.h>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <fmt/format.h>

#include "common/common_types.h"
#include "common/logging/log.h"
#include "common/page_table.h"
#include "core/memory.h"

namespace Core {

/**
 * HostMappedMemory provides direct host-mapped memory access for NCE.
 * This is inspired by Ryujinx's MemoryManagerNative for faster memory operations.
 */
class HostMappedMemory {
public:
    explicit HostMappedMemory(Memory::Memory& memory) : memory{memory} {}

    ~HostMappedMemory() {
        // Unmap all allocations
        for (void* allocation : allocations) {
            if (munmap(allocation, page_size) != 0) {
                LOG_ERROR(Core_ARM, "Failed to unmap allocation at {:p}: {}", allocation,
                          std::strerror(errno));
            }
        }
    }

    /**
     * Maps a guest memory region to host memory.
     * @param guest_addr Guest virtual address to map
     * @param size Size of the region to map
     * @return True if the mapping succeeded
     */
    bool MapRegion(u64 guest_addr, u64 size) {
        const u64 start_page = guest_addr >> page_bits;
        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;

        for (u64 page = start_page; page < end_page; page++) {
            const u64 current_addr = page << page_bits;

            // Skip if already mapped
            if (page_table.contains(page)) {
                continue;
            }

            // Allocate memory for this page
            void* allocation = mmap(nullptr, page_size, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (allocation == MAP_FAILED) {
                LOG_ERROR(Core_ARM, "Failed to allocate host page for guest address {:016X}: {}",
                          current_addr, std::strerror(errno));
                return false;
            }

            // Try to read the page from guest memory
            std::array<u8, page_size> data;
            if (!memory.ReadBlock(current_addr, data.data(), page_size)) {
                LOG_ERROR(Core_ARM, "Failed to read memory block at {:016X}", current_addr);
                munmap(allocation, page_size);
                return false;
            }

            // Copy to our allocation
            std::memcpy(allocation, data.data(), page_size);

            // Store the allocation
            page_table[page] = static_cast<u8*>(allocation);
            allocations.push_back(allocation);
        }

        return true;
    }

    /**
     * Unmaps a previously mapped guest memory region.
     * @param guest_addr Guest virtual address to unmap
     * @param size Size of the region to unmap
     */
    void UnmapRegion(u64 guest_addr, u64 size) {
        const u64 start_page = guest_addr >> page_bits;
        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;

        for (u64 page = start_page; page < end_page; page++) {
            // Skip if not mapped
            const auto it = page_table.find(page);
            if (it == page_table.end()) {
                continue;
            }

            // Write the page contents back to guest memory
            const u64 current_addr = page << page_bits;
            memory.WriteBlock(current_addr, it->second, page_size);

            // Remove from the page table. The host allocation is deliberately not
            // unmapped here; it is released in the destructor to avoid potential
            // reuse problems.
            page_table.erase(it);
        }
    }

    /**
     * Gets a typed reference to memory at the specified guest address.
     * @tparam T Type of the reference to return
     * @param guest_addr Guest virtual address
     * @return Reference to the memory at the specified address
     * @note The memory region must be contiguous and mapped.
     */
    template <typename T>
    T& GetRef(u64 guest_addr) {
        static_assert(std::is_trivially_copyable_v<T>, "T must be trivially copyable");

        // Check whether the value fits within a single page
        const u64 page_offset = guest_addr & page_mask;
        if (page_offset + sizeof(T) > page_size) {
            // Slow path - the value spans pages, so the whole range must be mapped
            // and host-contiguous
            if (!IsRangeMapped(guest_addr, sizeof(T))) {
                throw std::runtime_error("Memory region is not contiguous");
            }
        }
        // Fast path - a mapped single page, or a verified contiguous range
        return *reinterpret_cast<T*>(TranslateAddress(guest_addr));
    }

    /**
     * Gets a span over memory at the specified guest address.
     * @param guest_addr Guest virtual address
     * @param size Size of the span
     * @return Span over the memory at the specified address
     * @note The memory region must be contiguous and mapped.
     */
    std::span<u8> GetSpan(u64 guest_addr, u64 size) {
        // Ensure the memory is mapped and contiguous
        if (!IsRangeMapped(guest_addr, size)) {
            throw std::runtime_error(
                "GetSpan requested on an unmapped or non-contiguous memory region");
        }

        return std::span<u8>(TranslateAddress(guest_addr), size);
    }

    /**
     * Checks if an address is mapped.
     * @param guest_addr Guest virtual address to check
     * @return True if the address is mapped
     */
    bool IsMapped(u64 guest_addr) const {
        const u64 page = guest_addr >> page_bits;
        return page_table.contains(page);
    }

    /**
     * Checks if a range of memory is mapped and host-contiguous.
     * @param guest_addr Starting guest virtual address
     * @param size Size of the region to check
     * @return True if the entire range is mapped contiguously
     */
    bool IsRangeMapped(u64 guest_addr, u64 size) const {
        const u64 start_page = guest_addr >> page_bits;
        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;

        auto prev = page_table.find(start_page);
        if (prev == page_table.end()) {
            return false;
        }
        for (u64 page = start_page + 1; page < end_page; page++) {
            const auto it = page_table.find(page);
            // Pages are allocated individually with mmap, so consecutive guest pages
            // only form a usable range if their host pages happen to be adjacent.
            if (it == page_table.end() || it->second != prev->second + page_size) {
                return false;
            }
            prev = it;
        }

        return true;
    }

    /**
     * Gets the host address for a guest virtual address.
     * @param guest_addr Guest virtual address to translate
     * @return Host address corresponding to the guest address
     */
    u8* TranslateAddress(u64 guest_addr) {
        const u64 page = guest_addr >> page_bits;
        const u64 offset = guest_addr & page_mask;

        const auto it = page_table.find(page);
        if (it == page_table.end()) {
            throw std::runtime_error(
                fmt::format("Tried to translate unmapped address {:016X}", guest_addr));
        }

        return it->second + offset;
    }

private:
    static constexpr u64 page_bits = 12;
    static constexpr u64 page_size = 1ULL << page_bits;
    static constexpr u64 page_mask = page_size - 1;

    Memory::Memory& memory;

    // Page table mapping guest pages to host addresses
    std::unordered_map<u64, u8*> page_table;

    // Allocation pool for mapped regions
    std::vector<void*> allocations;
};

} // namespace Core
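Note: MapRegion copies guest pages into private anonymous host pages rather than aliasing guest memory, so the class behaves as a snapshot: guest-side stores made after MapRegion are not visible through GetRef, and host-side stores only reach the guest when UnmapRegion writes the pages back. A short usage sketch under that assumption (addresses are illustrative):

// Illustrative usage; `memory` would be the application's Core::Memory::Memory.
void Example(Core::Memory::Memory& memory) {
    Core::HostMappedMemory hmm{memory};

    constexpr u64 guest_addr = 0x8000000;     // hypothetical guest VA
    if (!hmm.MapRegion(guest_addr, 0x2000)) { // snapshot two pages
        return;
    }

    u32& word = hmm.GetRef<u32>(guest_addr + 0x10); // fast typed access to the copy
    word = 0xDEADBEEF;                              // modifies the host copy only

    // Write the (possibly modified) pages back to guest memory.
    hmm.UnmapRegion(guest_addr, 0x2000);
}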