diff options
| -rw-r--r-- | src/audio_core/audio_core.cpp | 19 | ||||
| -rw-r--r-- | src/audio_core/audio_core.h | 13 | ||||
| -rw-r--r-- | src/audio_core/hle/dsp.cpp | 14 | ||||
| -rw-r--r-- | src/audio_core/hle/dsp.h | 21 | ||||
| -rw-r--r-- | src/core/core.cpp | 5 | ||||
| -rw-r--r-- | src/core/hle/kernel/memory.cpp | 112 | ||||
| -rw-r--r-- | src/core/hle/kernel/memory.h | 10 | ||||
| -rw-r--r-- | src/core/hle/kernel/process.cpp | 23 | ||||
| -rw-r--r-- | src/core/hle/kernel/process.h | 2 | ||||
| -rw-r--r-- | src/core/memory.cpp | 8 | ||||
| -rw-r--r-- | src/core/memory.h | 10 | ||||
| -rw-r--r-- | src/video_core/command_processor.cpp | 212 | ||||
| -rw-r--r-- | src/video_core/shader/shader.h | 7 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.cpp | 4 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.h | 14 | 
17 files changed, 328 insertions, 150 deletions
diff --git a/src/audio_core/audio_core.cpp b/src/audio_core/audio_core.cpp index 84f9c03a7..9c2e6ed88 100644 --- a/src/audio_core/audio_core.cpp +++ b/src/audio_core/audio_core.cpp @@ -2,6 +2,7 @@  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. +#include <array>  #include <memory>  #include <string>  #include "audio_core/audio_core.h" @@ -10,8 +11,8 @@  #include "audio_core/null_sink.h"  #include "audio_core/sink.h"  #include "audio_core/sink_details.h" +#include "common/common_types.h"  #include "core/core_timing.h" -#include "core/hle/kernel/vm_manager.h"  #include "core/hle/service/dsp_dsp.h"  namespace AudioCore { @@ -39,20 +40,8 @@ void Init() {      CoreTiming::ScheduleEvent(audio_frame_ticks, tick_event);  } -void AddAddressSpace(Kernel::VMManager& address_space) { -    auto r0_vma = address_space -                      .MapBackingMemory(DSP::HLE::region0_base, -                                        reinterpret_cast<u8*>(&DSP::HLE::g_regions[0]), -                                        sizeof(DSP::HLE::SharedMemory), Kernel::MemoryState::IO) -                      .MoveFrom(); -    address_space.Reprotect(r0_vma, Kernel::VMAPermission::ReadWrite); - -    auto r1_vma = address_space -                      .MapBackingMemory(DSP::HLE::region1_base, -                                        reinterpret_cast<u8*>(&DSP::HLE::g_regions[1]), -                                        sizeof(DSP::HLE::SharedMemory), Kernel::MemoryState::IO) -                      .MoveFrom(); -    address_space.Reprotect(r1_vma, Kernel::VMAPermission::ReadWrite); +std::array<u8, Memory::DSP_RAM_SIZE>& GetDspMemory() { +    return DSP::HLE::g_dsp_memory.raw_memory;  }  void SelectSink(std::string sink_id) { diff --git a/src/audio_core/audio_core.h b/src/audio_core/audio_core.h index 0edf6dd15..ab323ce1f 100644 --- a/src/audio_core/audio_core.h +++ b/src/audio_core/audio_core.h @@ -4,11 +4,10 @@  #pragma once +#include <array>  #include <string> - -namespace Kernel { -class VMManager; -} +#include "common/common_types.h" +#include "core/memory.h"  namespace AudioCore { @@ -17,8 +16,8 @@ constexpr int native_sample_rate = 32728; ///< 32kHz  /// Initialise Audio Core  void Init(); -/// Add DSP address spaces to a Process. -void AddAddressSpace(Kernel::VMManager& vm_manager); +/// Returns a reference to the array backing DSP memory +std::array<u8, Memory::DSP_RAM_SIZE>& GetDspMemory();  /// Select the sink to use based on sink id.  void SelectSink(std::string sink_id); @@ -29,4 +28,4 @@ void EnableStretching(bool enable);  /// Shutdown Audio Core  void Shutdown(); -} // namespace +} // namespace AudioCore diff --git a/src/audio_core/hle/dsp.cpp b/src/audio_core/hle/dsp.cpp index 31421fdc6..260b182ed 100644 --- a/src/audio_core/hle/dsp.cpp +++ b/src/audio_core/hle/dsp.cpp @@ -16,31 +16,33 @@ namespace HLE {  // Region management -std::array<SharedMemory, 2> g_regions; +DspMemory g_dsp_memory;  static size_t CurrentRegionIndex() {      // The region with the higher frame counter is chosen unless there is wraparound.      // This function only returns a 0 or 1. +    u16 frame_counter_0 = g_dsp_memory.region_0.frame_counter; +    u16 frame_counter_1 = g_dsp_memory.region_1.frame_counter; -    if (g_regions[0].frame_counter == 0xFFFFu && g_regions[1].frame_counter != 0xFFFEu) { +    if (frame_counter_0 == 0xFFFFu && frame_counter_1 != 0xFFFEu) {          // Wraparound has occurred.          return 1;      } -    if (g_regions[1].frame_counter == 0xFFFFu && g_regions[0].frame_counter != 0xFFFEu) { +    if (frame_counter_1 == 0xFFFFu && frame_counter_0 != 0xFFFEu) {          // Wraparound has occurred.          return 0;      } -    return (g_regions[0].frame_counter > g_regions[1].frame_counter) ? 0 : 1; +    return (frame_counter_0 > frame_counter_1) ? 0 : 1;  }  static SharedMemory& ReadRegion() { -    return g_regions[CurrentRegionIndex()]; +    return CurrentRegionIndex() == 0 ? g_dsp_memory.region_0 : g_dsp_memory.region_1;  }  static SharedMemory& WriteRegion() { -    return g_regions[1 - CurrentRegionIndex()]; +    return CurrentRegionIndex() != 0 ? g_dsp_memory.region_0 : g_dsp_memory.region_1;  }  // Audio processing and mixing diff --git a/src/audio_core/hle/dsp.h b/src/audio_core/hle/dsp.h index 0a0f60ac1..94ce48863 100644 --- a/src/audio_core/hle/dsp.h +++ b/src/audio_core/hle/dsp.h @@ -31,8 +31,8 @@ namespace HLE {  // double-buffer. The frame counter is located as the very last u16 of each region and is  // incremented each audio tick. -constexpr VAddr region0_base = 0x1FF50000; -constexpr VAddr region1_base = 0x1FF70000; +constexpr u32 region0_offset = 0x50000; +constexpr u32 region1_offset = 0x70000;  /**   * The DSP is native 16-bit. The DSP also appears to be big-endian. When reading 32-bit numbers from @@ -512,7 +512,22 @@ struct SharedMemory {  };  ASSERT_DSP_STRUCT(SharedMemory, 0x8000); -extern std::array<SharedMemory, 2> g_regions; +union DspMemory { +    std::array<u8, 0x80000> raw_memory; +    struct { +        u8 unused_0[0x50000]; +        SharedMemory region_0; +        u8 unused_1[0x18000]; +        SharedMemory region_1; +        u8 unused_2[0x8000]; +    }; +}; +static_assert(offsetof(DspMemory, region_0) == region0_offset, +              "DSP region 0 is at the wrong offset"); +static_assert(offsetof(DspMemory, region_1) == region1_offset, +              "DSP region 1 is at the wrong offset"); + +extern DspMemory g_dsp_memory;  // Structures must have an offset that is a multiple of two.  static_assert(offsetof(SharedMemory, frame_counter) % 2 == 0, diff --git a/src/core/core.cpp b/src/core/core.cpp index 140ff6451..881f1e93c 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -13,11 +13,11 @@  #include "core/core_timing.h"  #include "core/gdbstub/gdbstub.h"  #include "core/hle/kernel/kernel.h" -#include "core/hle/kernel/memory.h"  #include "core/hle/kernel/thread.h"  #include "core/hle/service/service.h"  #include "core/hw/hw.h"  #include "core/loader/loader.h" +#include "core/memory_setup.h"  #include "core/settings.h"  #include "video_core/video_core.h" @@ -123,7 +123,8 @@ void System::Reschedule() {  }  System::ResultStatus System::Init(EmuWindow* emu_window, u32 system_mode) { -    Memory::Init(); +    Memory::InitMemoryMap(); +    LOG_DEBUG(HW_Memory, "initialized OK");      if (Settings::values.use_cpu_jit) {          cpu_core = std::make_unique<ARM_Dynarmic>(USER32MODE); diff --git a/src/core/hle/kernel/memory.cpp b/src/core/hle/kernel/memory.cpp index 33c165197..8250a90b5 100644 --- a/src/core/hle/kernel/memory.cpp +++ b/src/core/hle/kernel/memory.cpp @@ -2,11 +2,13 @@  // Licensed under GPLv2 or any later version  // Refer to the license.txt file included. +#include <cinttypes>  #include <map>  #include <memory>  #include <utility>  #include <vector>  #include "audio_core/audio_core.h" +#include "common/assert.h"  #include "common/common_types.h"  #include "common/logging/log.h"  #include "core/hle/config_mem.h" @@ -92,52 +94,96 @@ MemoryRegionInfo* GetMemoryRegion(MemoryRegion region) {          UNREACHABLE();      }  } -} - -namespace Memory { -namespace { +std::array<u8, Memory::VRAM_SIZE> vram; +std::array<u8, Memory::N3DS_EXTRA_RAM_SIZE> n3ds_extra_ram; + +void HandleSpecialMapping(VMManager& address_space, const AddressMapping& mapping) { +    using namespace Memory; + +    struct MemoryArea { +        VAddr vaddr_base; +        PAddr paddr_base; +        u32 size; +    }; + +    // The order of entries in this array is important. The VRAM and IO VAddr ranges overlap, and +    // VRAM must be tried first. +    static constexpr MemoryArea memory_areas[] = { +        {VRAM_VADDR, VRAM_PADDR, VRAM_SIZE}, +        {IO_AREA_VADDR, IO_AREA_PADDR, IO_AREA_SIZE}, +        {DSP_RAM_VADDR, DSP_RAM_PADDR, DSP_RAM_SIZE}, +        {N3DS_EXTRA_RAM_VADDR, N3DS_EXTRA_RAM_PADDR, N3DS_EXTRA_RAM_SIZE - 0x20000}, +    }; + +    VAddr mapping_limit = mapping.address + mapping.size; +    if (mapping_limit < mapping.address) { +        LOG_CRITICAL(Loader, "Mapping size overflowed: address=0x%08" PRIX32 " size=0x%" PRIX32, +                     mapping.address, mapping.size); +        return; +    } -struct MemoryArea { -    u32 base; -    u32 size; -    const char* name; -}; +    auto area = +        std::find_if(std::begin(memory_areas), std::end(memory_areas), [&](const auto& area) { +            return mapping.address >= area.vaddr_base && +                   mapping_limit <= area.vaddr_base + area.size; +        }); +    if (area == std::end(memory_areas)) { +        LOG_ERROR(Loader, "Unhandled special mapping: address=0x%08" PRIX32 " size=0x%" PRIX32 +                          " read_only=%d unk_flag=%d", +                  mapping.address, mapping.size, mapping.read_only, mapping.unk_flag); +        return; +    } -// We don't declare the IO regions in here since its handled by other means. -static MemoryArea memory_areas[] = { -    {VRAM_VADDR, VRAM_SIZE, "VRAM"}, // Video memory (VRAM) -}; -} +    u32 offset_into_region = mapping.address - area->vaddr_base; +    if (area->paddr_base == IO_AREA_PADDR) { +        LOG_ERROR(Loader, "MMIO mappings are not supported yet. phys_addr=0x%08" PRIX32, +                  area->paddr_base + offset_into_region); +        return; +    } -void Init() { -    InitMemoryMap(); -    LOG_DEBUG(HW_Memory, "initialized OK"); -} +    // TODO(yuriks): Use GetPhysicalPointer when that becomes independent of the virtual +    // mappings. +    u8* target_pointer = nullptr; +    switch (area->paddr_base) { +    case VRAM_PADDR: +        target_pointer = vram.data(); +        break; +    case DSP_RAM_PADDR: +        target_pointer = AudioCore::GetDspMemory().data(); +        break; +    case N3DS_EXTRA_RAM_PADDR: +        target_pointer = n3ds_extra_ram.data(); +        break; +    default: +        UNREACHABLE(); +    } -void InitLegacyAddressSpace(Kernel::VMManager& address_space) { -    using namespace Kernel; +    // TODO(yuriks): This flag seems to have some other effect, but it's unknown what +    MemoryState memory_state = mapping.unk_flag ? MemoryState::Static : MemoryState::IO; -    for (MemoryArea& area : memory_areas) { -        auto block = std::make_shared<std::vector<u8>>(area.size); -        address_space -            .MapMemoryBlock(area.base, std::move(block), 0, area.size, MemoryState::Private) -            .Unwrap(); -    } +    auto vma = address_space +                   .MapBackingMemory(mapping.address, target_pointer + offset_into_region, +                                     mapping.size, memory_state) +                   .MoveFrom(); +    address_space.Reprotect(vma, +                            mapping.read_only ? VMAPermission::Read : VMAPermission::ReadWrite); +} +void MapSharedPages(VMManager& address_space) {      auto cfg_mem_vma = address_space -                           .MapBackingMemory(CONFIG_MEMORY_VADDR, (u8*)&ConfigMem::config_mem, -                                             CONFIG_MEMORY_SIZE, MemoryState::Shared) +                           .MapBackingMemory(Memory::CONFIG_MEMORY_VADDR, +                                             reinterpret_cast<u8*>(&ConfigMem::config_mem), +                                             Memory::CONFIG_MEMORY_SIZE, MemoryState::Shared)                             .MoveFrom();      address_space.Reprotect(cfg_mem_vma, VMAPermission::Read);      auto shared_page_vma = address_space -                               .MapBackingMemory(SHARED_PAGE_VADDR, (u8*)&SharedPage::shared_page, -                                                 SHARED_PAGE_SIZE, MemoryState::Shared) +                               .MapBackingMemory(Memory::SHARED_PAGE_VADDR, +                                                 reinterpret_cast<u8*>(&SharedPage::shared_page), +                                                 Memory::SHARED_PAGE_SIZE, MemoryState::Shared)                                 .MoveFrom();      address_space.Reprotect(shared_page_vma, VMAPermission::Read); - -    AudioCore::AddAddressSpace(address_space);  } -} // namespace +} // namespace Kernel diff --git a/src/core/hle/kernel/memory.h b/src/core/hle/kernel/memory.h index 4e1856a41..08c1a9989 100644 --- a/src/core/hle/kernel/memory.h +++ b/src/core/hle/kernel/memory.h @@ -23,11 +23,7 @@ struct MemoryRegionInfo {  void MemoryInit(u32 mem_type);  void MemoryShutdown();  MemoryRegionInfo* GetMemoryRegion(MemoryRegion region); -} -namespace Memory { - -void Init(); -void InitLegacyAddressSpace(Kernel::VMManager& address_space); - -} // namespace +void HandleSpecialMapping(VMManager& address_space, const AddressMapping& mapping); +void MapSharedPages(VMManager& address_space); +} // namespace Kernel diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index ba80fe7f8..32cb25fb7 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -35,7 +35,6 @@ SharedPtr<Process> Process::Create(SharedPtr<CodeSet> code_set) {      process->codeset = std::move(code_set);      process->flags.raw = 0;      process->flags.memory_region.Assign(MemoryRegion::APPLICATION); -    Memory::InitLegacyAddressSpace(process->vm_manager);      return process;  } @@ -78,8 +77,15 @@ void Process::ParseKernelCaps(const u32* kernel_caps, size_t len) {              AddressMapping mapping;              mapping.address = descriptor << 12; -            mapping.size = (end_desc << 12) - mapping.address; -            mapping.writable = (descriptor & (1 << 20)) != 0; +            VAddr end_address = end_desc << 12; + +            if (mapping.address < end_address) { +                mapping.size = end_address - mapping.address; +            } else { +                mapping.size = 0; +            } + +            mapping.read_only = (descriptor & (1 << 20)) != 0;              mapping.unk_flag = (end_desc & (1 << 20)) != 0;              address_mappings.push_back(mapping); @@ -88,8 +94,10 @@ void Process::ParseKernelCaps(const u32* kernel_caps, size_t len) {              AddressMapping mapping;              mapping.address = descriptor << 12;              mapping.size = Memory::PAGE_SIZE; -            mapping.writable = true; // TODO: Not sure if correct +            mapping.read_only = false;              mapping.unk_flag = false; + +            address_mappings.push_back(mapping);          } else if ((type & 0xFE0) == 0xFC0) { // 0x01FF              // Kernel version              kernel_version = descriptor & 0xFFFF; @@ -131,6 +139,12 @@ void Process::Run(s32 main_thread_priority, u32 stack_size) {      misc_memory_used += stack_size;      memory_region->used += stack_size; +    // Map special address mappings +    MapSharedPages(vm_manager); +    for (const auto& mapping : address_mappings) { +        HandleSpecialMapping(vm_manager, mapping); +    } +      vm_manager.LogLayout(Log::Level::Debug);      Kernel::SetupMainThread(codeset->entrypoint, main_thread_priority);  } @@ -138,6 +152,7 @@ void Process::Run(s32 main_thread_priority, u32 stack_size) {  VAddr Process::GetLinearHeapAreaAddress() const {      return kernel_version < 0x22C ? Memory::LINEAR_HEAP_VADDR : Memory::NEW_LINEAR_HEAP_VADDR;  } +  VAddr Process::GetLinearHeapBase() const {      return GetLinearHeapAreaAddress() + memory_region->base;  } diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h index b566950b0..b52211d2a 100644 --- a/src/core/hle/kernel/process.h +++ b/src/core/hle/kernel/process.h @@ -20,7 +20,7 @@ struct AddressMapping {      // Address and size must be page-aligned      VAddr address;      u32 size; -    bool writable; +    bool read_only;      bool unk_flag;  }; diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 65e4bba85..b8438e490 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -672,12 +672,14 @@ PAddr VirtualToPhysicalAddress(const VAddr addr) {          return addr - VRAM_VADDR + VRAM_PADDR;      } else if (addr >= LINEAR_HEAP_VADDR && addr < LINEAR_HEAP_VADDR_END) {          return addr - LINEAR_HEAP_VADDR + FCRAM_PADDR; +    } else if (addr >= NEW_LINEAR_HEAP_VADDR && addr < NEW_LINEAR_HEAP_VADDR_END) { +        return addr - NEW_LINEAR_HEAP_VADDR + FCRAM_PADDR;      } else if (addr >= DSP_RAM_VADDR && addr < DSP_RAM_VADDR_END) {          return addr - DSP_RAM_VADDR + DSP_RAM_PADDR;      } else if (addr >= IO_AREA_VADDR && addr < IO_AREA_VADDR_END) {          return addr - IO_AREA_VADDR + IO_AREA_PADDR; -    } else if (addr >= NEW_LINEAR_HEAP_VADDR && addr < NEW_LINEAR_HEAP_VADDR_END) { -        return addr - NEW_LINEAR_HEAP_VADDR + FCRAM_PADDR; +    } else if (addr >= N3DS_EXTRA_RAM_VADDR && addr < N3DS_EXTRA_RAM_VADDR_END) { +        return addr - N3DS_EXTRA_RAM_VADDR + N3DS_EXTRA_RAM_PADDR;      }      LOG_ERROR(HW_Memory, "Unknown virtual address @ 0x%08X", addr); @@ -696,6 +698,8 @@ VAddr PhysicalToVirtualAddress(const PAddr addr) {          return addr - DSP_RAM_PADDR + DSP_RAM_VADDR;      } else if (addr >= IO_AREA_PADDR && addr < IO_AREA_PADDR_END) {          return addr - IO_AREA_PADDR + IO_AREA_VADDR; +    } else if (addr >= N3DS_EXTRA_RAM_PADDR && addr < N3DS_EXTRA_RAM_PADDR_END) { +        return addr - N3DS_EXTRA_RAM_PADDR + N3DS_EXTRA_RAM_VADDR;      }      LOG_ERROR(HW_Memory, "Unknown physical address @ 0x%08X", addr); diff --git a/src/core/memory.h b/src/core/memory.h index 903b58a22..802aa465e 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -37,6 +37,12 @@ enum : PAddr {      VRAM_SIZE = 0x00600000, ///< VRAM size (6MB)      VRAM_PADDR_END = VRAM_PADDR + VRAM_SIZE, +    /// New 3DS additional memory. Supposedly faster than regular FCRAM. Part of it can be used by +    /// applications and system modules if mapped via the ExHeader. +    N3DS_EXTRA_RAM_PADDR = 0x1F000000, +    N3DS_EXTRA_RAM_SIZE = 0x00400000, ///< New 3DS additional memory size (4MB) +    N3DS_EXTRA_RAM_PADDR_END = N3DS_EXTRA_RAM_PADDR + N3DS_EXTRA_RAM_SIZE, +      /// DSP memory      DSP_RAM_PADDR = 0x1FF00000,      DSP_RAM_SIZE = 0x00080000, ///< DSP memory size (512KB) @@ -81,6 +87,10 @@ enum : VAddr {      LINEAR_HEAP_SIZE = 0x08000000,      LINEAR_HEAP_VADDR_END = LINEAR_HEAP_VADDR + LINEAR_HEAP_SIZE, +    /// Maps 1:1 to New 3DS additional memory +    N3DS_EXTRA_RAM_VADDR = 0x1E800000, +    N3DS_EXTRA_RAM_VADDR_END = N3DS_EXTRA_RAM_VADDR + N3DS_EXTRA_RAM_SIZE, +      /// Maps 1:1 to the IO register area.      IO_AREA_VADDR = 0x1EC00000,      IO_AREA_VADDR_END = IO_AREA_VADDR + IO_AREA_SIZE, diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 2e32ff905..9a09f81dc 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -32,12 +32,13 @@ namespace Pica {  namespace CommandProcessor { -static int float_regs_counter = 0; +static int vs_float_regs_counter = 0; +static u32 vs_uniform_write_buffer[4]; -static u32 uniform_write_buffer[4]; +static int gs_float_regs_counter = 0; +static u32 gs_uniform_write_buffer[4];  static int default_attr_counter = 0; -  static u32 default_attr_write_buffer[3];  // Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF @@ -48,6 +49,97 @@ static const u32 expand_bits_to_bytes[] = {  MICROPROFILE_DEFINE(GPU_Drawing, "GPU", "Drawing", MP_RGB(50, 50, 240)); +static const char* GetShaderSetupTypeName(Shader::ShaderSetup& setup) { +    if (&setup == &g_state.vs) { +        return "vertex shader"; +    } +    if (&setup == &g_state.gs) { +        return "geometry shader"; +    } +    return "unknown shader"; +} + +static void WriteUniformBoolReg(Shader::ShaderSetup& setup, u32 value) { +    for (unsigned i = 0; i < setup.uniforms.b.size(); ++i) +        setup.uniforms.b[i] = (value & (1 << i)) != 0; +} + +static void WriteUniformIntReg(Shader::ShaderSetup& setup, unsigned index, +                               const Math::Vec4<u8>& values) { +    ASSERT(index < setup.uniforms.i.size()); +    setup.uniforms.i[index] = values; +    LOG_TRACE(HW_GPU, "Set %s integer uniform %d to %02x %02x %02x %02x", +              GetShaderSetupTypeName(setup), index, values.x, values.y, values.z, values.w); +} + +static void WriteUniformFloatReg(ShaderRegs& config, Shader::ShaderSetup& setup, +                                 int& float_regs_counter, u32 uniform_write_buffer[4], u32 value) { +    auto& uniform_setup = config.uniform_setup; + +    // TODO: Does actual hardware indeed keep an intermediate buffer or does +    //       it directly write the values? +    uniform_write_buffer[float_regs_counter++] = value; + +    // Uniforms are written in a packed format such that four float24 values are encoded in +    // three 32-bit numbers. We write to internal memory once a full such vector is +    // written. +    if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || +        (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { +        float_regs_counter = 0; + +        auto& uniform = setup.uniforms.f[uniform_setup.index]; + +        if (uniform_setup.index >= 96) { +            LOG_ERROR(HW_GPU, "Invalid %s float uniform index %d", GetShaderSetupTypeName(setup), +                      (int)uniform_setup.index); +        } else { + +            // NOTE: The destination component order indeed is "backwards" +            if (uniform_setup.IsFloat32()) { +                for (auto i : {0, 1, 2, 3}) +                    uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); +            } else { +                // TODO: Untested +                uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); +                uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | +                                             ((uniform_write_buffer[1] >> 16) & 0xFFFF)); +                uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | +                                             ((uniform_write_buffer[2] >> 24) & 0xFF)); +                uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); +            } + +            LOG_TRACE(HW_GPU, "Set %s float uniform %x to (%f %f %f %f)", +                      GetShaderSetupTypeName(setup), (int)uniform_setup.index, +                      uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), +                      uniform.w.ToFloat32()); + +            // TODO: Verify that this actually modifies the register! +            uniform_setup.index.Assign(uniform_setup.index + 1); +        } +    } +} + +static void WriteProgramCode(ShaderRegs& config, Shader::ShaderSetup& setup, +                             unsigned max_program_code_length, u32 value) { +    if (config.program.offset >= max_program_code_length) { +        LOG_ERROR(HW_GPU, "Invalid %s program offset %d", GetShaderSetupTypeName(setup), +                  (int)config.program.offset); +    } else { +        setup.program_code[config.program.offset] = value; +        config.program.offset++; +    } +} + +static void WriteSwizzlePatterns(ShaderRegs& config, Shader::ShaderSetup& setup, u32 value) { +    if (config.swizzle_patterns.offset >= setup.swizzle_data.size()) { +        LOG_ERROR(HW_GPU, "Invalid %s swizzle pattern offset %d", GetShaderSetupTypeName(setup), +                  (int)config.swizzle_patterns.offset); +    } else { +        setup.swizzle_data[config.swizzle_patterns.offset] = value; +        config.swizzle_patterns.offset++; +    } +} +  static void WritePicaReg(u32 id, u32 value, u32 mask) {      auto& regs = g_state.regs; @@ -330,21 +422,70 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {          break;      } -    case PICA_REG_INDEX(vs.bool_uniforms): -        for (unsigned i = 0; i < 16; ++i) -            g_state.vs.uniforms.b[i] = (regs.vs.bool_uniforms.Value() & (1 << i)) != 0; +    case PICA_REG_INDEX(gs.bool_uniforms): +        WriteUniformBoolReg(g_state.gs, value); +        break; +    case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281): +    case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[1], 0x282): +    case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[2], 0x283): +    case PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[3], 0x284): { +        unsigned index = (id - PICA_REG_INDEX_WORKAROUND(gs.int_uniforms[0], 0x281)); +        auto values = regs.gs.int_uniforms[index]; +        WriteUniformIntReg(g_state.gs, index, +                           Math::Vec4<u8>(values.x, values.y, values.z, values.w)); +        break; +    } + +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[0], 0x291): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[1], 0x292): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[2], 0x293): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[3], 0x294): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[4], 0x295): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[5], 0x296): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[6], 0x297): +    case PICA_REG_INDEX_WORKAROUND(gs.uniform_setup.set_value[7], 0x298): { +        WriteUniformFloatReg(g_state.regs.gs, g_state.gs, gs_float_regs_counter, +                             gs_uniform_write_buffer, value); +        break; +    } + +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[0], 0x29c): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[1], 0x29d): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[2], 0x29e): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[3], 0x29f): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[4], 0x2a0): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[5], 0x2a1): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[6], 0x2a2): +    case PICA_REG_INDEX_WORKAROUND(gs.program.set_word[7], 0x2a3): { +        WriteProgramCode(g_state.regs.gs, g_state.gs, 4096, value); +        break; +    } + +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[0], 0x2a6): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[1], 0x2a7): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[2], 0x2a8): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[3], 0x2a9): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[4], 0x2aa): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[5], 0x2ab): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[6], 0x2ac): +    case PICA_REG_INDEX_WORKAROUND(gs.swizzle_patterns.set_word[7], 0x2ad): { +        WriteSwizzlePatterns(g_state.regs.gs, g_state.gs, value); +        break; +    } + +    case PICA_REG_INDEX(vs.bool_uniforms): +        WriteUniformBoolReg(g_state.vs, value);          break;      case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1):      case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[1], 0x2b2):      case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[2], 0x2b3):      case PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[3], 0x2b4): { -        int index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1)); +        unsigned index = (id - PICA_REG_INDEX_WORKAROUND(vs.int_uniforms[0], 0x2b1));          auto values = regs.vs.int_uniforms[index]; -        g_state.vs.uniforms.i[index] = Math::Vec4<u8>(values.x, values.y, values.z, values.w); -        LOG_TRACE(HW_GPU, "Set integer uniform %d to %02x %02x %02x %02x", index, values.x.Value(), -                  values.y.Value(), values.z.Value(), values.w.Value()); +        WriteUniformIntReg(g_state.vs, index, +                           Math::Vec4<u8>(values.x, values.y, values.z, values.w));          break;      } @@ -356,51 +497,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {      case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[5], 0x2c6):      case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[6], 0x2c7):      case PICA_REG_INDEX_WORKAROUND(vs.uniform_setup.set_value[7], 0x2c8): { -        auto& uniform_setup = regs.vs.uniform_setup; - -        // TODO: Does actual hardware indeed keep an intermediate buffer or does -        //       it directly write the values? -        uniform_write_buffer[float_regs_counter++] = value; - -        // Uniforms are written in a packed format such that four float24 values are encoded in -        // three 32-bit numbers. We write to internal memory once a full such vector is -        // written. -        if ((float_regs_counter >= 4 && uniform_setup.IsFloat32()) || -            (float_regs_counter >= 3 && !uniform_setup.IsFloat32())) { -            float_regs_counter = 0; - -            auto& uniform = g_state.vs.uniforms.f[uniform_setup.index]; - -            if (uniform_setup.index > 95) { -                LOG_ERROR(HW_GPU, "Invalid VS uniform index %d", (int)uniform_setup.index); -                break; -            } - -            // NOTE: The destination component order indeed is "backwards" -            if (uniform_setup.IsFloat32()) { -                for (auto i : {0, 1, 2, 3}) -                    uniform[3 - i] = float24::FromFloat32(*(float*)(&uniform_write_buffer[i])); -            } else { -                // TODO: Untested -                uniform.w = float24::FromRaw(uniform_write_buffer[0] >> 8); -                uniform.z = float24::FromRaw(((uniform_write_buffer[0] & 0xFF) << 16) | -                                             ((uniform_write_buffer[1] >> 16) & 0xFFFF)); -                uniform.y = float24::FromRaw(((uniform_write_buffer[1] & 0xFFFF) << 8) | -                                             ((uniform_write_buffer[2] >> 24) & 0xFF)); -                uniform.x = float24::FromRaw(uniform_write_buffer[2] & 0xFFFFFF); -            } - -            LOG_TRACE(HW_GPU, "Set uniform %x to (%f %f %f %f)", (int)uniform_setup.index, -                      uniform.x.ToFloat32(), uniform.y.ToFloat32(), uniform.z.ToFloat32(), -                      uniform.w.ToFloat32()); - -            // TODO: Verify that this actually modifies the register! -            uniform_setup.index.Assign(uniform_setup.index + 1); -        } +        WriteUniformFloatReg(g_state.regs.vs, g_state.vs, vs_float_regs_counter, +                             vs_uniform_write_buffer, value);          break;      } -    // Load shader program code      case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[0], 0x2cc):      case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[1], 0x2cd):      case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[2], 0x2ce): @@ -409,12 +510,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {      case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[5], 0x2d1):      case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[6], 0x2d2):      case PICA_REG_INDEX_WORKAROUND(vs.program.set_word[7], 0x2d3): { -        g_state.vs.program_code[regs.vs.program.offset] = value; -        regs.vs.program.offset++; +        WriteProgramCode(g_state.regs.vs, g_state.vs, 512, value);          break;      } -    // Load swizzle pattern data      case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[0], 0x2d6):      case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[1], 0x2d7):      case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[2], 0x2d8): @@ -423,8 +522,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {      case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[5], 0x2db):      case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[6], 0x2dc):      case PICA_REG_INDEX_WORKAROUND(vs.swizzle_patterns.set_word[7], 0x2dd): { -        g_state.vs.swizzle_data[regs.vs.swizzle_patterns.offset] = value; -        regs.vs.swizzle_patterns.offset++; +        WriteSwizzlePatterns(g_state.regs.vs, g_state.vs, value);          break;      } diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 38ea717ab..e156f6aef 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -24,6 +24,9 @@ namespace Pica {  namespace Shader { +constexpr unsigned MAX_PROGRAM_CODE_LENGTH = 4096; +constexpr unsigned MAX_SWIZZLE_DATA_LENGTH = 4096; +  struct AttributeBuffer {      alignas(16) Math::Vec4<float24> attr[16];  }; @@ -144,8 +147,8 @@ struct ShaderSetup {          return offsetof(ShaderSetup, uniforms.i) + index * sizeof(Math::Vec4<u8>);      } -    std::array<u32, 1024> program_code; -    std::array<u32, 1024> swizzle_data; +    std::array<u32, MAX_PROGRAM_CODE_LENGTH> program_code; +    std::array<u32, MAX_SWIZZLE_DATA_LENGTH> swizzle_data;      /// Data private to ShaderEngines      struct EngineData { diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index f4d1c46c5..aa1cec81f 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -653,7 +653,7 @@ static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData  }  void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { -    ASSERT(entry_point < 1024); +    ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH);      setup.engine_data.entry_point = entry_point;  } diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index 0ee0dd9ef..73c21871c 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -15,7 +15,7 @@ JitX64Engine::JitX64Engine() = default;  JitX64Engine::~JitX64Engine() = default;  void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { -    ASSERT(entry_point < 1024); +    ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH);      setup.engine_data.entry_point = entry_point;      u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index 2dbc8b147..5d9b6448c 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -834,8 +834,8 @@ void JitShader::FindReturnOffsets() {      std::sort(return_offsets.begin(), return_offsets.end());  } -void JitShader::Compile(const std::array<u32, 1024>* program_code_, -                        const std::array<u32, 1024>* swizzle_data_) { +void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code_, +                        const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data_) {      program_code = program_code_;      swizzle_data = swizzle_data_; diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h index f27675560..31af0ca48 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ b/src/video_core/shader/shader_jit_x64_compiler.h @@ -22,8 +22,8 @@ namespace Pica {  namespace Shader { -/// Memory allocated for each compiled shader (64Kb) -constexpr size_t MAX_SHADER_SIZE = 1024 * 64; +/// Memory allocated for each compiled shader +constexpr size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 64;  /**   * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 @@ -37,8 +37,8 @@ public:          program(&setup, &state, instruction_labels[offset].getAddress());      } -    void Compile(const std::array<u32, 1024>* program_code, -                 const std::array<u32, 1024>* swizzle_data); +    void Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code, +                 const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data);      void Compile_ADD(Instruction instr);      void Compile_DP3(Instruction instr); @@ -104,11 +104,11 @@ private:       */      void FindReturnOffsets(); -    const std::array<u32, 1024>* program_code = nullptr; -    const std::array<u32, 1024>* swizzle_data = nullptr; +    const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; +    const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr;      /// Mapping of Pica VS instructions to pointers in the emitted code -    std::array<Xbyak::Label, 1024> instruction_labels; +    std::array<Xbyak::Label, MAX_PROGRAM_CODE_LENGTH> instruction_labels;      /// Offsets in code where a return needs to be inserted      std::vector<unsigned> return_offsets;  | 
