From 752945ee3193064fc5890f16ad3a0c722e47a374 Mon Sep 17 00:00:00 2001
From: MrPurple666
Date: Mon, 19 May 2025 00:36:07 -0300
Subject: [PATCH] PORT: NCE: Initial Ryujinx host-mapped memory

---
 src/core/arm/nce/arm_nce.cpp          | 142 +++++++++++++---
 src/core/arm/nce/arm_nce.h            |  19 +++
 src/core/arm/nce/guest_context.h      |   6 +
 src/core/arm/nce/host_mapped_memory.h | 222 ++++++++++++++++++++++++++
 4 files changed, 371 insertions(+), 18 deletions(-)
 create mode 100644 src/core/arm/nce/host_mapped_memory.h

diff --git a/src/core/arm/nce/arm_nce.cpp b/src/core/arm/nce/arm_nce.cpp
index 90891e241d..475d2951ce 100644
--- a/src/core/arm/nce/arm_nce.cpp
+++ b/src/core/arm/nce/arm_nce.cpp
@@ -3,7 +3,9 @@
 
 #include <cinttypes>
 #include <memory>
+#include <cstring>
 
+#include "common/logging/log.h"
 #include "common/signal_chain.h"
 #include "core/arm/nce/arm_nce.h"
 #include "core/arm/nce/interpreter_visitor.h"
@@ -43,6 +45,54 @@ constexpr u32 StackSize = 128_KiB;
 
 } // namespace
 
+// Implementation of the enhanced features inspired by Ryujinx NCE
+
+void ArmNce::SetupAlternateSignalStack() {
+    // Create an alternate stack for signal handling.
+    // This ensures we have a clean stack for handling signals even if the guest stack is corrupted.
+    m_alt_signal_stack = std::make_unique<u8[]>(AlternateStackSize);
+
+    stack_t ss{};
+    ss.ss_sp = m_alt_signal_stack.get();
+    ss.ss_size = AlternateStackSize;
+    ss.ss_flags = 0;
+
+    if (sigaltstack(&ss, nullptr) != 0) {
+        LOG_ERROR(Core_ARM, "Failed to set up alternate signal stack: {}", strerror(errno));
+    } else {
+        LOG_DEBUG(Core_ARM, "Alternate signal stack set up successfully");
+    }
+}
+
+void ArmNce::CleanupAlternateSignalStack() {
+    if (m_alt_signal_stack) {
+        stack_t ss{};
+        ss.ss_flags = SS_DISABLE;
+
+        if (sigaltstack(&ss, nullptr) != 0) {
+            LOG_ERROR(Core_ARM, "Failed to disable alternate signal stack: {}", strerror(errno));
+        }
+
+        m_alt_signal_stack.reset();
+    }
+}
+
+bool ArmNce::HandleThreadInterrupt(GuestContext* ctx) {
+    // Check if an interrupt was requested.
+    if (ctx->interrupt_requested.load(std::memory_order_acquire) != 0) {
+        // Clear the interrupt request.
+        ctx->interrupt_requested.store(0, std::memory_order_release);
+
+        // Add a break loop reason to indicate we should exit.
+        ctx->esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
+
+        // Indicate that we handled an interrupt.
+        return true;
+    }
+
+    return false;
+}
+
 void* ArmNce::RestoreGuestContext(void* raw_context) {
     // Retrieve the host context.
     auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
@@ -268,9 +318,18 @@ void ArmNce::SetSvcArguments(std::span<const uint64_t, 8> args) {
 ArmNce::ArmNce(System& system, bool uses_wall_clock, std::size_t core_index)
     : ArmInterface{uses_wall_clock}, m_system{system}, m_core_index{core_index} {
     m_guest_ctx.system = &m_system;
+    m_guest_ctx.parent = this;
+
+    // Initialize as being in managed code.
+    m_guest_ctx.in_managed.store(1, std::memory_order_release);
 }
 
-ArmNce::~ArmNce() = default;
+ArmNce::~ArmNce() {
+    // Clean up the alternate signal stack.
+    CleanupAlternateSignalStack();
+
+    // Host-mapped memory is cleaned up by its own destructor.
+}
 
 void ArmNce::Initialize() {
     if (m_thread_id == -1) {
@@ -287,6 +346,16 @@ void ArmNce::Initialize() {
         sigaltstack(&ss, nullptr);
     }
 
+    // Set up the alternate signal stack (Ryujinx-inspired enhancement).
+    SetupAlternateSignalStack();
+
+    // Initialize host-mapped memory for efficient access.
+    if (!m_host_mapped_memory) {
+        auto& memory = m_system.ApplicationMemory();
+        m_host_mapped_memory = std::make_unique<HostMappedMemory>(memory);
+        LOG_DEBUG(Core_ARM, "Initialized host-mapped memory for NCE");
+    }
+
     // Set up signals.
     static std::once_flag flag;
     std::call_once(flag, [] {
@@ -365,19 +434,23 @@ void ArmNce::SetContext(const Kernel::Svc::ThreadContext& ctx) {
 }
 
 void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
-    // Add break loop condition.
+    // Mark that we are requesting an interrupt.
+    m_guest_ctx.interrupt_requested.store(1, std::memory_order_release);
+
+    // Add break loop condition
     m_guest_ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
 
-    // Lock the thread context.
+    // Lock the thread context
     auto* params = &thread->GetNativeExecutionParameters();
     LockThreadParameters(params);
 
-    if (params->is_running) {
-        // We should signal to the running thread.
-        // The running thread will unlock the thread context.
+    // Only send a signal if the thread is running and not in managed code.
+    if (params->is_running && m_guest_ctx.in_managed.load(std::memory_order_acquire) == 0) {
+        // Send a signal to the running thread.
+        // The running thread will unlock the thread context.
         syscall(SYS_tkill, m_thread_id, BreakFromRunCodeSignal);
     } else {
-        // If the thread is no longer running, we have nothing to do.
+        // If the thread is no longer running or is in managed code, unlock it here.
         UnlockThreadParameters(params);
     }
 }
@@ -402,21 +475,54 @@ void ArmNce::ClearInstructionCache() {
 }
 
 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
-    #if defined(__GNUC__) || defined(__clang__)
-    // Align the start address to cache line boundary for better performance
-    const size_t CACHE_LINE_SIZE = 64;
-    addr &= ~(CACHE_LINE_SIZE - 1);
+#if defined(__GNUC__) || defined(__clang__)
+    while (size > 0) {
+        const std::size_t size_step = std::min(size, CACHE_PAGE_SIZE);
 
-    // Round up size to nearest cache line
-    size = (size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
+        // The __builtin___clear_cache intrinsic generates the icache (i) invalidation and
+        // dcache (d) write-back instructions targeting the range.
+        char* addr_ptr = reinterpret_cast<char*>(addr);
+        __builtin___clear_cache(addr_ptr, addr_ptr + size_step);
 
-    // Prefetch the range to be invalidated
-    for (size_t offset = 0; offset < size; offset += CACHE_LINE_SIZE) {
-        __builtin_prefetch((void*)(addr + offset), 1, 3);
-    }
-    #endif
+        addr += size_step;
+        size -= size_step;
+    }
 
+    // Clear the instruction cache after range invalidation.
     this->ClearInstructionCache();
+#endif
 }
 
+// Fast memory access template implementation (inspired by Ryujinx)
+template <typename T>
+T& ArmNce::GetHostRef(u64 guest_addr) {
+    if (m_host_mapped_memory) {
+        // Use the host-mapped memory for fast access.
+        try {
+            return m_host_mapped_memory->GetRef<T>(guest_addr);
+        } catch (const std::exception& e) {
+            LOG_ERROR(Core_ARM, "Failed to get host reference: {}", e.what());
+        }
+    }
+
+    // Fall back to the slower memory access path.
+    T value{};
+    m_system.ApplicationMemory().ReadBlock(guest_addr, &value, sizeof(T));
+    static thread_local T fallback;
+    fallback = value;
+    return fallback;
+}
+
+// Explicit instantiations for common types
+template u8& ArmNce::GetHostRef<u8>(u64);
+template u16& ArmNce::GetHostRef<u16>(u64);
+template u32& ArmNce::GetHostRef<u32>(u64);
+template u64& ArmNce::GetHostRef<u64>(u64);
+template s8& ArmNce::GetHostRef<s8>(u64);
+template s16& ArmNce::GetHostRef<s16>(u64);
+template s32& ArmNce::GetHostRef<s32>(u64);
+template s64& ArmNce::GetHostRef<s64>(u64);
+template f32& ArmNce::GetHostRef<f32>(u64);
+template f64& ArmNce::GetHostRef<f64>(u64);
+
 } // namespace Core
diff --git a/src/core/arm/nce/arm_nce.h b/src/core/arm/nce/arm_nce.h
index be9b304c4c..973ca10098 100644
--- a/src/core/arm/nce/arm_nce.h
+++ b/src/core/arm/nce/arm_nce.h
@@ -7,6 +7,7 @@
 
 #include "core/arm/arm_interface.h"
 #include "core/arm/nce/guest_context.h"
+#include "core/arm/nce/host_mapped_memory.h"
 
 namespace Core::Memory {
 class Memory;
@@ -52,6 +53,10 @@ protected:
 
     void RewindBreakpointInstruction() override {}
 
+    // Fast memory access using host-mapped memory (inspired by Ryujinx)
+    template <typename T>
+    T& GetHostRef(u64 guest_addr);
+
 private:
     // Assembly definitions.
     static HaltReason ReturnToRunCodeByTrampoline(void* tpidr, GuestContext* ctx,
@@ -67,6 +72,13 @@ private:
     static void LockThreadParameters(void* tpidr);
     static void UnlockThreadParameters(void* tpidr);
 
+    // Alternate stack management (inspired by Ryujinx)
+    void SetupAlternateSignalStack();
+    void CleanupAlternateSignalStack();
+
+    // Enhanced signal handling
+    static bool HandleThreadInterrupt(GuestContext* ctx);
+
 private:
     // C++ implementation functions for assembly definitions.
     static void* RestoreGuestContext(void* raw_context);
@@ -90,6 +102,13 @@ public:
 
     // Stack for signal processing.
     std::unique_ptr<u8[]> m_stack{};
+
+    // Alternate signal stack (inspired by Ryujinx)
+    static constexpr size_t AlternateStackSize = 16384;
+    std::unique_ptr<u8[]> m_alt_signal_stack{};
+
+    // Host-mapped memory for efficient access (inspired by Ryujinx)
+    std::unique_ptr<HostMappedMemory> m_host_mapped_memory{};
 };
 
 } // namespace Core
diff --git a/src/core/arm/nce/guest_context.h b/src/core/arm/nce/guest_context.h
index a7eadccce5..2d4407e6cc 100644
--- a/src/core/arm/nce/guest_context.h
+++ b/src/core/arm/nce/guest_context.h
@@ -38,6 +38,12 @@ struct GuestContext {
     u32 svc{};
     System* system{};
    ArmNce* parent{};
+
+    // Enhanced thread control (inspired by Ryujinx)
+    std::atomic<u32> in_managed{1};          // 1 when in managed code, 0 when in native code
+    std::atomic<u32> interrupt_requested{0}; // Set when an interrupt has been requested
+    pid_t host_thread_id{-1};                // Host thread ID for signaling
+    u64 ctr_el0{0x8444c004};                 // Cache type register
 };
 
 // Verify assembly offsets.
diff --git a/src/core/arm/nce/host_mapped_memory.h b/src/core/arm/nce/host_mapped_memory.h
new file mode 100644
index 0000000000..a69780d295
--- /dev/null
+++ b/src/core/arm/nce/host_mapped_memory.h
@@ -0,0 +1,222 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <array>
+#include <cstring>
+#include <span>
+#include <stdexcept>
+#include <unordered_map>
+#include <vector>
+#include <sys/mman.h>
+
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "common/page_table.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace Core {
+
+/**
+ * HostMappedMemory provides direct host-mapped memory access for NCE.
+ * This is inspired by Ryujinx's MemoryManagerNative for faster memory operations.
+ */
+class HostMappedMemory {
+public:
+    explicit inline HostMappedMemory(Memory::Memory& memory) : memory{memory} {}
+
+    inline ~HostMappedMemory() {
+        // Unmap all allocations.
+        for (void* allocation : allocations) {
+            if (munmap(allocation, page_size) != 0) {
+                LOG_ERROR(Core_ARM, "Failed to unmap allocation at {:p}: {}", allocation,
+                          std::strerror(errno));
+            }
+        }
+    }
+
+    /**
+     * Maps a guest memory region to host memory
+     * @param guest_addr Guest virtual address to map
+     * @param size Size of the region to map
+     * @return True if the mapping succeeded
+     */
+    inline bool MapRegion(u64 guest_addr, u64 size) {
+        const u64 start_page = guest_addr >> page_bits;
+        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;
+
+        for (u64 page = start_page; page < end_page; page++) {
+            const u64 current_addr = page << page_bits;
+
+            // Skip if already mapped.
+            if (page_table.contains(page)) {
+                continue;
+            }
+
+            // Allocate host memory for this page.
+            void* allocation = mmap(nullptr, page_size, PROT_READ | PROT_WRITE,
+                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+            if (allocation == MAP_FAILED) {
+                LOG_ERROR(Core_ARM, "Failed to allocate host page for guest address {:016X}: {}",
+                          current_addr, std::strerror(errno));
+                return false;
+            }
+
+            // Copy data from guest memory to our allocation.
+            bool result = false;
+            std::array<u8, page_size> data;
+
+            // Try to read the memory from the guest.
+            result = memory.ReadBlock(current_addr, data.data(), page_size);
+            if (!result) {
+                LOG_ERROR(Core_ARM, "Failed to read memory block at {:016X}", current_addr);
+                munmap(allocation, page_size);
+                return false;
+            }
+
+            // Copy to our allocation.
+            std::memcpy(allocation, data.data(), page_size);
+
+            // Store the allocation.
+            page_table[page] = static_cast<u8*>(allocation);
+            allocations.push_back(allocation);
+        }
+
+        return true;
+    }
+
+    /**
+     * Unmaps a previously mapped guest memory region
+     * @param guest_addr Guest virtual address to unmap
+     * @param size Size of the region to unmap
+     */
+    inline void UnmapRegion(u64 guest_addr, u64 size) {
+        const u64 start_page = guest_addr >> page_bits;
+        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;
+
+        for (u64 page = start_page; page < end_page; page++) {
+            // Skip if not mapped.
+            auto it = page_table.find(page);
+            if (it == page_table.end()) {
+                continue;
+            }
+
+            u8* host_ptr = it->second;
+
+            // Write the memory back to the guest.
+            const u64 current_addr = page << page_bits;
+            memory.WriteBlock(current_addr, host_ptr, page_size);
+
+            // Remove it from the page table.
+            page_table.erase(it);
+
+            // Don't unmap immediately - that is done in the destructor
+            // to avoid potential reuse problems.
+        }
+    }
+
+    /**
+     * Gets a typed reference to memory at the specified guest address
+     * @tparam T Type of the reference to return
+     * @param guest_addr Guest virtual address
+     * @return Reference to the memory at the specified address
+     * @note The memory region must be continuous and mapped
+     */
+    template <typename T>
+    T& GetRef(u64 guest_addr) {
+        static_assert(std::is_trivially_copyable_v<T>, "T must be trivially copyable");
+
+        // Check whether a single page covers the entire value.
+        const u64 page_offset = guest_addr & page_mask;
+
+        if (page_offset + sizeof(T) <= page_size) {
+            // Fast path - contained within a single page.
+            return *reinterpret_cast<T*>(TranslateAddress(guest_addr));
+        } else {
+            // Slow path - spans pages, so check that all of them are mapped.
+            if (!IsRangeMapped(guest_addr, sizeof(T))) {
+                throw std::runtime_error("Memory region is not continuous");
+            }
+            return *reinterpret_cast<T*>(TranslateAddress(guest_addr));
+        }
+    }
+
+    /**
+     * Gets a span over memory at the specified guest address
+     * @param guest_addr Guest virtual address
+     * @param size Size of the span
+     * @return Span over the memory at the specified address
+     * @note The memory region must be continuous and mapped
+     */
+    inline std::span<u8> GetSpan(u64 guest_addr, u64 size) {
+        // Ensure the memory is mapped and continuous.
+        if (!IsRangeMapped(guest_addr, size)) {
+            throw std::runtime_error(
+                "GetSpan requested on unmapped or non-continuous memory region");
+        }
+
+        return std::span<u8>(TranslateAddress(guest_addr), size);
+    }
+
+    /**
+     * Checks if an address is mapped
+     * @param guest_addr Guest virtual address to check
+     * @return True if the address is mapped
+     */
+    inline bool IsMapped(u64 guest_addr) const {
+        const u64 page = guest_addr >> page_bits;
+        return page_table.contains(page);
+    }
+
+    /**
+     * Checks if a range of memory is mapped continuously
+     * @param guest_addr Starting guest virtual address
+     * @param size Size of the region to check
+     * @return True if the entire range is mapped continuously
+     */
+    inline bool IsRangeMapped(u64 guest_addr, u64 size) const {
+        const u64 start_page = guest_addr >> page_bits;
+        const u64 end_page = (guest_addr + size + page_mask) >> page_bits;
+
+        for (u64 page = start_page; page < end_page; page++) {
+            if (!page_table.contains(page)) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Gets the host address for a guest virtual address
+     * @param guest_addr Guest virtual address to translate
+     * @return Host address corresponding to the guest address
+     */
+    inline u8* TranslateAddress(u64 guest_addr) {
+        const u64 page = guest_addr >> page_bits;
+        const u64 offset = guest_addr & page_mask;
+
+        auto it = page_table.find(page);
+        if (it == page_table.end()) {
{:016X}", guest_addr)); + } + + return it->second + offset; + } + +private: + static constexpr u64 page_bits = 12; + static constexpr u64 page_size = 1ULL << page_bits; + static constexpr u64 page_mask = page_size - 1; + + Memory::Memory& memory; + + // Page table mapping guest pages to host addresses + std::unordered_map page_table; + + // Allocation pool for mapped regions + std::vector allocations; +}; + +} // namespace Core