diff options
| author | bunnei <bunneidev@gmail.com> | 2020-07-14 14:04:16 -0400 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-07-14 14:04:16 -0400 | 
| commit | 666b37ad56e816cd2bde4a521cba0e63812dc681 (patch) | |
| tree | c498f364e42006df041ec4bab818edb2c1b23d36 /src/video_core | |
| parent | e2730372b8b26bf3141bf91107f9982e270f751b (diff) | |
| parent | c574ab5aa1d3ff81b28ddfbba3818b3ce724aa32 (diff) | |
Merge pull request #4242 from ReinUsesLisp/maxwell-dma
maxwell_dma: Match official doc and support pitch->voxel copies
Diffstat (limited to 'src/video_core')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/video_core/engines/maxwell_dma.cpp | 300 |
| -rw-r--r-- | src/video_core/engines/maxwell_dma.h | 348 |
| -rw-r--r-- | src/video_core/texture_cache/surface_params.cpp | 5 |
| -rw-r--r-- | src/video_core/texture_cache/surface_params.h | 2 |
| -rw-r--r-- | src/video_core/textures/decoders.cpp | 134 |
| -rw-r--r-- | src/video_core/textures/decoders.h | 38 |
6 files changed, 468 insertions, 359 deletions
| diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 01d7df405..a2d3d7823 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -14,50 +14,45 @@  namespace Tegra::Engines { +using namespace Texture; +  MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)      : system{system}, memory_manager{memory_manager} {}  void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) { -    ASSERT_MSG(method < Regs::NUM_REGS, -               "Invalid MaxwellDMA register, increase the size of the Regs structure"); +    ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");      regs.reg_array[method] = method_argument; -#define MAXWELLDMA_REG_INDEX(field_name)                                                           \ -    (offsetof(Tegra::Engines::MaxwellDMA::Regs, field_name) / sizeof(u32)) - -    switch (method) { -    case MAXWELLDMA_REG_INDEX(exec): { -        HandleCopy(); -        break; -    } +    if (method == offsetof(Regs, launch_dma) / sizeof(u32)) { +        Launch();      } - -#undef MAXWELLDMA_REG_INDEX  }  void MaxwellDMA::CallMultiMethod(u32 method, const u32* base_start, u32 amount,                                   u32 methods_pending) { -    for (std::size_t i = 0; i < amount; i++) { +    for (size_t i = 0; i < amount; ++i) {          CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);      }  } -void MaxwellDMA::HandleCopy() { -    LOG_TRACE(HW_GPU, "Requested a DMA copy"); - -    const GPUVAddr source = regs.src_address.Address(); -    const GPUVAddr dest = regs.dst_address.Address(); +void MaxwellDMA::Launch() { +    LOG_TRACE(Render_OpenGL, "DMA copy 0x{:x} -> 0x{:x}", static_cast<GPUVAddr>(regs.offset_in), +              static_cast<GPUVAddr>(regs.offset_out));      // TODO(Subv): Perform more research and implement all features of this engine. 
-    ASSERT(regs.exec.enable_swizzle == 0); -    ASSERT(regs.exec.query_mode == Regs::QueryMode::None); -    ASSERT(regs.exec.query_intr == Regs::QueryIntr::None); -    ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2); -    ASSERT(regs.dst_params.pos_x == 0); -    ASSERT(regs.dst_params.pos_y == 0); - -    if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) { +    const LaunchDMA& launch = regs.launch_dma; +    ASSERT(launch.remap_enable == 0); +    ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE); +    ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); +    ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); +    ASSERT(regs.dst_params.origin.x == 0); +    ASSERT(regs.dst_params.origin.y == 0); + +    const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; +    const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; + +    if (!is_src_pitch && !is_dst_pitch) {          // If both the source and the destination are in block layout, assert.          UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");          return; @@ -66,144 +61,161 @@ void MaxwellDMA::HandleCopy() {      // All copies here update the main memory, so mark all rasterizer states as invalid.      system.GPU().Maxwell3D().OnMemoryWrite(); -    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) { -        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D -        // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count, -        // y_count). 
-        if (!regs.exec.enable_2d) { -            memory_manager.CopyBlock(dest, source, regs.x_count); -            return; -        } +    if (is_src_pitch && is_dst_pitch) { +        CopyPitchToPitch(); +    } else { +        ASSERT(launch.multi_line_enable == 1); -        // If both the source and the destination are in linear layout, perform a line-by-line -        // copy. We're going to take a subrect of size (x_count, y_count) from the source -        // rectangle. There is no need to manually flush/invalidate the regions because -        // CopyBlock does that for us. -        for (u32 line = 0; line < regs.y_count; ++line) { -            const GPUVAddr source_line = source + line * regs.src_pitch; -            const GPUVAddr dest_line = dest + line * regs.dst_pitch; -            memory_manager.CopyBlock(dest_line, source_line, regs.x_count); +        if (!is_src_pitch && is_dst_pitch) { +            CopyBlockLinearToPitch(); +        } else { +            CopyPitchToBlockLinear();          } -        return;      } +} -    ASSERT(regs.exec.enable_2d == 1); - -    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { - -        ASSERT(regs.src_params.BlockDepth() == 0); -        // Optimized path for micro copies. 
-        if (regs.dst_pitch * regs.y_count < Texture::GetGOBSize() && regs.dst_pitch <= 64) { -            const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count; -            const std::size_t src_size = Texture::GetGOBSize(); -            const std::size_t dst_size = regs.dst_pitch * regs.y_count; -            u32 pos_x = regs.src_params.pos_x; -            u32 pos_y = regs.src_params.pos_y; -            const u64 offset = -                Texture::GetGOBOffset(regs.src_params.size_x, regs.src_params.size_y, pos_x, pos_y, -                                      regs.src_params.BlockDepth(), bytes_per_pixel); -            const u32 x_in_gob = 64 / bytes_per_pixel; -            pos_x = pos_x % x_in_gob; -            pos_y = pos_y % 8; - -            if (read_buffer.size() < src_size) { -                read_buffer.resize(src_size); -            } - -            if (write_buffer.size() < dst_size) { -                write_buffer.resize(dst_size); -            } - -            if (Settings::IsGPULevelExtreme()) { -                memory_manager.ReadBlock(source + offset, read_buffer.data(), src_size); -                memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); -            } else { -                memory_manager.ReadBlockUnsafe(source + offset, read_buffer.data(), src_size); -                memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size); -            } - -            Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, -                                      regs.src_params.size_x, bytes_per_pixel, read_buffer.data(), -                                      write_buffer.data(), regs.src_params.BlockHeight(), pos_x, -                                      pos_y); - -            memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); - -            return; -        } -        // If the input is tiled and the output is linear, deswizzle the input and copy it over. 
-        const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count; -        const std::size_t src_size = Texture::CalculateSize( -            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, -            regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); - -        const std::size_t src_layer_size = Texture::CalculateSize( -            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1, -            regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); - -        const std::size_t dst_size = regs.dst_pitch * regs.y_count; +void MaxwellDMA::CopyPitchToPitch() { +    // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D +    // buffer of length `line_length_in`. +    // Otherwise we copy a 2D image of dimensions (line_length_in, line_count). +    if (!regs.launch_dma.multi_line_enable) { +        memory_manager.CopyBlock(regs.offset_out, regs.offset_in, regs.line_length_in); +        return; +    } -        if (read_buffer.size() < src_size) { -            read_buffer.resize(src_size); -        } +    // Perform a line-by-line copy. +    // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle. +    // There is no need to manually flush/invalidate the regions because CopyBlock does that for us. 
+    for (u32 line = 0; line < regs.line_count; ++line) { +        const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; +        const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; +        memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); +    } +} -        if (write_buffer.size() < dst_size) { -            write_buffer.resize(dst_size); -        } +void MaxwellDMA::CopyBlockLinearToPitch() { +    ASSERT(regs.src_params.block_size.depth == 0); -        if (Settings::IsGPULevelExtreme()) { -            memory_manager.ReadBlock(source, read_buffer.data(), src_size); -            memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); -        } else { -            memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size); -            memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size); -        } +    // Optimized path for micro copies. +    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; +    if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X) { +        FastCopyBlockLinearToPitch(); +        return; +    } -        Texture::UnswizzleSubrect( -            regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel, -            read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(), -            regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y); +    // Deswizzle the input and copy it over. 
+    const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; +    const Parameters& src_params = regs.src_params; +    const u32 width = src_params.width; +    const u32 height = src_params.height; +    const u32 depth = src_params.depth; +    const u32 block_height = src_params.block_size.height; +    const u32 block_depth = src_params.block_size.depth; +    const size_t src_size = +        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); +    const size_t src_layer_size = +        CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); + +    if (read_buffer.size() < src_size) { +        read_buffer.resize(src_size); +    } +    if (write_buffer.size() < dst_size) { +        write_buffer.resize(dst_size); +    } -        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); +    if (Settings::IsGPULevelExtreme()) { +        memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); +        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);      } else { -        ASSERT(regs.dst_params.BlockDepth() == 0); +        memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); +        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); +    } -        const u32 bytes_per_pixel = regs.src_pitch / regs.x_count; +    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel, +                     read_buffer.data() + src_layer_size * src_params.layer, write_buffer.data(), +                     block_height, src_params.origin.x, src_params.origin.y); -        const std::size_t dst_size = Texture::CalculateSize( -            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, -            regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); +    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); 
+} -        const std::size_t dst_layer_size = Texture::CalculateSize( -            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, -            regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); +void MaxwellDMA::CopyPitchToBlockLinear() { +    const auto& dst_params = regs.dst_params; +    const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in; +    const u32 width = dst_params.width; +    const u32 height = dst_params.height; +    const u32 depth = dst_params.depth; +    const u32 block_height = dst_params.block_size.height; +    const u32 block_depth = dst_params.block_size.depth; +    const size_t dst_size = +        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); +    const size_t dst_layer_size = +        CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); + +    const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; + +    if (read_buffer.size() < src_size) { +        read_buffer.resize(src_size); +    } +    if (write_buffer.size() < dst_size) { +        write_buffer.resize(dst_size); +    } -        const std::size_t src_size = regs.src_pitch * regs.y_count; +    if (Settings::IsGPULevelExtreme()) { +        memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); +        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); +    } else { +        memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); +        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); +    } -        if (read_buffer.size() < src_size) { -            read_buffer.resize(src_size); -        } +    // If the input is linear and the output is tiled, swizzle the input and copy it over. 
+    if (regs.dst_params.block_size.depth > 0) { +        ASSERT(dst_params.layer == 0); +        SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height, +                            bytes_per_pixel, block_height, block_depth, dst_params.origin.x, +                            dst_params.origin.y, write_buffer.data(), read_buffer.data()); +    } else { +        SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel, +                       write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(), +                       block_height, dst_params.origin.x, dst_params.origin.y); +    } -        if (write_buffer.size() < dst_size) { -            write_buffer.resize(dst_size); -        } +    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); +} -        if (Settings::IsGPULevelExtreme()) { -            memory_manager.ReadBlock(source, read_buffer.data(), src_size); -            memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); -        } else { -            memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size); -            memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size); -        } +void MaxwellDMA::FastCopyBlockLinearToPitch() { +    const u32 bytes_per_pixel = regs.pitch_out / regs.line_length_in; +    const size_t src_size = GOB_SIZE; +    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; +    u32 pos_x = regs.src_params.origin.x; +    u32 pos_y = regs.src_params.origin.y; +    const u64 offset = GetGOBOffset(regs.src_params.width, regs.src_params.height, pos_x, pos_y, +                                    regs.src_params.block_size.height, bytes_per_pixel); +    const u32 x_in_gob = 64 / bytes_per_pixel; +    pos_x = pos_x % x_in_gob; +    pos_y = pos_y % 8; + +    if (read_buffer.size() < src_size) { +        read_buffer.resize(src_size); +    } -        // If the input is linear 
and the output is tiled, swizzle the input and copy it over. -        Texture::SwizzleSubrect( -            regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel, -            write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(), -            regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y); +    if (write_buffer.size() < dst_size) { +        write_buffer.resize(dst_size); +    } -        memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); +    if (Settings::IsGPULevelExtreme()) { +        memory_manager.ReadBlock(regs.offset_in + offset, read_buffer.data(), src_size); +        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); +    } else { +        memory_manager.ReadBlockUnsafe(regs.offset_in + offset, read_buffer.data(), src_size); +        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);      } + +    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width, +                     bytes_per_pixel, read_buffer.data(), write_buffer.data(), +                     regs.src_params.block_size.height, pos_x, pos_y); + +    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);  }  } // namespace Tegra::Engines diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 502dd8509..50f445efc 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -24,160 +24,190 @@ class MemoryManager;  namespace Tegra::Engines {  /** - * This Engine is known as GK104_Copy. Documentation can be found in: + * This engine is known as gk104_copy. 
Documentation can be found in: + * https://github.com/NVIDIA/open-gpu-doc/blob/master/classes/dma-copy/clb0b5.h   * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml   */  class MaxwellDMA final : public EngineInterface {  public: -    explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); -    ~MaxwellDMA() = default; - -    /// Write the value to the register identified by method. -    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; - -    /// Write multiple values to the register identified by method. -    void CallMultiMethod(u32 method, const u32* base_start, u32 amount, -                         u32 methods_pending) override; +    struct PackedGPUVAddr { +        u32 upper; +        u32 lower; + +        constexpr operator GPUVAddr() const noexcept { +            return (static_cast<GPUVAddr>(upper & 0xff) << 32) | lower; +        } +    }; + +    union BlockSize { +        BitField<0, 4, u32> width; +        BitField<4, 4, u32> height; +        BitField<8, 4, u32> depth; +        BitField<12, 4, u32> gob_height; +    }; +    static_assert(sizeof(BlockSize) == 4); + +    union Origin { +        BitField<0, 16, u32> x; +        BitField<16, 16, u32> y; +    }; +    static_assert(sizeof(Origin) == 4); + +    struct Parameters { +        BlockSize block_size; +        u32 width; +        u32 height; +        u32 depth; +        u32 layer; +        Origin origin; +    }; +    static_assert(sizeof(Parameters) == 24); + +    struct Semaphore { +        PackedGPUVAddr address; +        u32 payload; +    }; +    static_assert(sizeof(Semaphore) == 12); + +    struct RenderEnable { +        enum class Mode : u32 { +            FALSE = 0, +            TRUE = 1, +            CONDITIONAL = 2, +            RENDER_IF_EQUAL = 3, +            RENDER_IF_NOT_EQUAL = 4, +        }; -    struct Regs { -        static constexpr std::size_t NUM_REGS = 0x1D6; +        PackedGPUVAddr address; +        
BitField<0, 3, Mode> mode; +    }; +    static_assert(sizeof(RenderEnable) == 12); + +    enum class PhysModeTarget : u32 { +        LOCAL_FB = 0, +        COHERENT_SYSMEM = 1, +        NONCOHERENT_SYSMEM = 2, +    }; +    using PhysMode = BitField<0, 2, PhysModeTarget>; + +    union LaunchDMA { +        enum class DataTransferType : u32 { +            NONE = 0, +            PIPELINED = 1, +            NON_PIPELINED = 2, +        }; -        struct Parameters { -            union { -                BitField<0, 4, u32> block_depth; -                BitField<4, 4, u32> block_height; -                BitField<8, 4, u32> block_width; -            }; -            u32 size_x; -            u32 size_y; -            u32 size_z; -            u32 pos_z; -            union { -                BitField<0, 16, u32> pos_x; -                BitField<16, 16, u32> pos_y; -            }; +        enum class SemaphoreType : u32 { +            NONE = 0, +            RELEASE_ONE_WORD_SEMAPHORE = 1, +            RELEASE_FOUR_WORD_SEMAPHORE = 2, +        }; -            u32 BlockHeight() const { -                return block_height.Value(); -            } +        enum class InterruptType : u32 { +            NONE = 0, +            BLOCKING = 1, +            NON_BLOCKING = 2, +        }; -            u32 BlockDepth() const { -                return block_depth.Value(); -            } +        enum class MemoryLayout : u32 { +            BLOCKLINEAR = 0, +            PITCH = 1,          }; -        static_assert(sizeof(Parameters) == 24, "Parameters has wrong size"); +        enum class Type : u32 { +            VIRTUAL = 0, +            PHYSICAL = 1, +        }; -        enum class ComponentMode : u32 { -            Src0 = 0, -            Src1 = 1, -            Src2 = 2, -            Src3 = 3, -            Const0 = 4, -            Const1 = 5, -            Zero = 6, +        enum class SemaphoreReduction : u32 { +            IMIN = 0, +            IMAX = 1, +            IXOR = 2, +          
  IAND = 3, +            IOR = 4, +            IADD = 5, +            INC = 6, +            DEC = 7, +            FADD = 0xA,          }; -        enum class CopyMode : u32 { -            None = 0, -            Unk1 = 1, -            Unk2 = 2, +        enum class SemaphoreReductionSign : u32 { +            SIGNED = 0, +            UNSIGNED = 1,          }; -        enum class QueryMode : u32 { -            None = 0, -            Short = 1, -            Long = 2, +        enum class BypassL2 : u32 { +            USE_PTE_SETTING = 0, +            FORCE_VOLATILE = 1,          }; -        enum class QueryIntr : u32 { -            None = 0, -            Block = 1, -            NonBlock = 2, +        BitField<0, 2, DataTransferType> data_transfer_type; +        BitField<2, 1, u32> flush_enable; +        BitField<3, 2, SemaphoreType> semaphore_type; +        BitField<5, 2, InterruptType> interrupt_type; +        BitField<7, 1, MemoryLayout> src_memory_layout; +        BitField<8, 1, MemoryLayout> dst_memory_layout; +        BitField<9, 1, u32> multi_line_enable; +        BitField<10, 1, u32> remap_enable; +        BitField<11, 1, u32> rmwdisable; +        BitField<12, 1, Type> src_type; +        BitField<13, 1, Type> dst_type; +        BitField<14, 4, SemaphoreReduction> semaphore_reduction; +        BitField<18, 1, SemaphoreReductionSign> semaphore_reduction_sign; +        BitField<19, 1, u32> reduction_enable; +        BitField<20, 1, BypassL2> bypass_l2; +    }; +    static_assert(sizeof(LaunchDMA) == 4); + +    struct RemapConst { +        enum Swizzle : u32 { +            SRC_X = 0, +            SRC_Y = 1, +            SRC_Z = 2, +            SRC_W = 3, +            CONST_A = 4, +            CONST_B = 5, +            NO_WRITE = 6,          }; -        union { -            struct { -                INSERT_UNION_PADDING_WORDS(0xC0); - -                struct { -                    union { -                        BitField<0, 2, CopyMode> copy_mode; -                    
    BitField<2, 1, u32> flush; - -                        BitField<3, 2, QueryMode> query_mode; -                        BitField<5, 2, QueryIntr> query_intr; - -                        BitField<7, 1, u32> is_src_linear; -                        BitField<8, 1, u32> is_dst_linear; - -                        BitField<9, 1, u32> enable_2d; -                        BitField<10, 1, u32> enable_swizzle; -                    }; -                } exec; - -                INSERT_UNION_PADDING_WORDS(0x3F); - -                struct { -                    u32 address_high; -                    u32 address_low; - -                    GPUVAddr Address() const { -                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | -                                                     address_low); -                    } -                } src_address; - -                struct { -                    u32 address_high; -                    u32 address_low; - -                    GPUVAddr Address() const { -                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | -                                                     address_low); -                    } -                } dst_address; - -                u32 src_pitch; -                u32 dst_pitch; -                u32 x_count; -                u32 y_count; - -                INSERT_UNION_PADDING_WORDS(0xB8); - -                u32 const0; -                u32 const1; -                union { -                    BitField<0, 4, ComponentMode> component0; -                    BitField<4, 4, ComponentMode> component1; -                    BitField<8, 4, ComponentMode> component2; -                    BitField<12, 4, ComponentMode> component3; -                    BitField<16, 2, u32> component_size; -                    BitField<20, 3, u32> src_num_components; -                    BitField<24, 3, u32> dst_num_components; - -                    u32 
SrcBytePerPixel() const { -                        return src_num_components.Value() * component_size.Value(); -                    } -                    u32 DstBytePerPixel() const { -                        return dst_num_components.Value() * component_size.Value(); -                    } -                } swizzle_config; +        PackedGPUVAddr address; -                Parameters dst_params; +        union { +            BitField<0, 3, Swizzle> dst_x; +            BitField<4, 3, Swizzle> dst_y; +            BitField<8, 3, Swizzle> dst_z; +            BitField<12, 3, Swizzle> dst_w; +            BitField<16, 2, u32> component_size_minus_one; +            BitField<20, 2, u32> num_src_components_minus_one; +            BitField<24, 2, u32> num_dst_components_minus_one; +        }; +    }; +    static_assert(sizeof(RemapConst) == 12); -                INSERT_UNION_PADDING_WORDS(1); +    explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); +    ~MaxwellDMA() = default; -                Parameters src_params; +    /// Write the value to the register identified by method. +    void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; -                INSERT_UNION_PADDING_WORDS(0x13); -            }; -            std::array<u32, NUM_REGS> reg_array; -        }; -    } regs{}; +    /// Write multiple values to the register identified by method. +    void CallMultiMethod(u32 method, const u32* base_start, u32 amount, +                         u32 methods_pending) override;  private: +    /// Performs the copy from the source buffer to the destination buffer as configured in the +    /// registers. 
+    void Launch(); + +    void CopyPitchToPitch(); + +    void CopyBlockLinearToPitch(); + +    void CopyPitchToBlockLinear(); + +    void FastCopyBlockLinearToPitch(); +      Core::System& system;      MemoryManager& memory_manager; @@ -185,28 +215,58 @@ private:      std::vector<u8> read_buffer;      std::vector<u8> write_buffer; -    /// Performs the copy from the source buffer to the destination buffer as configured in the -    /// registers. -    void HandleCopy(); -}; +    static constexpr std::size_t NUM_REGS = 0x800; +    struct Regs { +        union { +            struct { +                u32 reserved[0x40]; +                u32 nop; +                u32 reserved01[0xf]; +                u32 pm_trigger; +                u32 reserved02[0x3f]; +                Semaphore semaphore; +                u32 reserved03[0x2]; +                RenderEnable render_enable; +                PhysMode src_phys_mode; +                PhysMode dst_phys_mode; +                u32 reserved04[0x26]; +                LaunchDMA launch_dma; +                u32 reserved05[0x3f]; +                PackedGPUVAddr offset_in; +                PackedGPUVAddr offset_out; +                u32 pitch_in; +                u32 pitch_out; +                u32 line_length_in; +                u32 line_count; +                u32 reserved06[0xb8]; +                RemapConst remap_const; +                Parameters dst_params; +                u32 reserved07[0x1]; +                Parameters src_params; +                u32 reserved08[0x275]; +                u32 pm_trigger_end; +                u32 reserved09[0x3ba]; +            }; +            std::array<u32, NUM_REGS> reg_array; +        }; +    } regs{};  #define ASSERT_REG_POSITION(field_name, position)                                                  \      static_assert(offsetof(MaxwellDMA::Regs, field_name) == position * 4,                          \                    "Field " #field_name " has invalid position") 
-ASSERT_REG_POSITION(exec, 0xC0); -ASSERT_REG_POSITION(src_address, 0x100); -ASSERT_REG_POSITION(dst_address, 0x102); -ASSERT_REG_POSITION(src_pitch, 0x104); -ASSERT_REG_POSITION(dst_pitch, 0x105); -ASSERT_REG_POSITION(x_count, 0x106); -ASSERT_REG_POSITION(y_count, 0x107); -ASSERT_REG_POSITION(const0, 0x1C0); -ASSERT_REG_POSITION(const1, 0x1C1); -ASSERT_REG_POSITION(swizzle_config, 0x1C2); -ASSERT_REG_POSITION(dst_params, 0x1C3); -ASSERT_REG_POSITION(src_params, 0x1CA); +    ASSERT_REG_POSITION(launch_dma, 0xC0); +    ASSERT_REG_POSITION(offset_in, 0x100); +    ASSERT_REG_POSITION(offset_out, 0x102); +    ASSERT_REG_POSITION(pitch_in, 0x104); +    ASSERT_REG_POSITION(pitch_out, 0x105); +    ASSERT_REG_POSITION(line_length_in, 0x106); +    ASSERT_REG_POSITION(line_count, 0x107); +    ASSERT_REG_POSITION(remap_const, 0x1C0); +    ASSERT_REG_POSITION(dst_params, 0x1C3); +    ASSERT_REG_POSITION(src_params, 0x1CA);  #undef ASSERT_REG_POSITION +};  } // namespace Tegra::Engines diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 0b2b2b8c4..921562c1f 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -343,8 +343,7 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co          size += GetInnerMipmapMemorySize(level, as_host_size, uncompressed);      }      if (is_tiled && is_layered) { -        return Common::AlignBits(size, -                                 Tegra::Texture::GetGOBSizeShift() + block_height + block_depth); +        return Common::AlignBits(size, Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth);      }      return size;  } @@ -418,7 +417,7 @@ std::tuple<u32, u32, u32> SurfaceParams::GetBlockOffsetXYZ(u32 offset) const {      const u32 block_size = GetBlockSize();      const u32 block_index = offset / block_size;      const u32 gob_offset = offset % block_size; -    const u32 gob_index = 
gob_offset / static_cast<u32>(Tegra::Texture::GetGOBSize()); +    const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GOB_SIZE);      const u32 x_gob_pixels = 64U / GetBytesPerPixel();      const u32 x_block_pixels = x_gob_pixels << block_width;      const u32 y_block_pixels = 8U << block_height; diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 24957df8d..118aa689e 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -204,7 +204,7 @@ public:      static std::size_t AlignLayered(const std::size_t out_size, const u32 block_height,                                      const u32 block_depth) {          return Common::AlignBits(out_size, -                                 Tegra::Texture::GetGOBSizeShift() + block_height + block_depth); +                                 Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth);      }      /// Converts a width from a type of surface into another. 
This helps represent the diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 548e4c3fe..98beabef1 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -6,6 +6,7 @@  #include <cstring>  #include "common/alignment.h"  #include "common/assert.h" +#include "common/bit_util.h"  #include "video_core/gpu.h"  #include "video_core/textures/decoders.h"  #include "video_core/textures/texture.h" @@ -37,20 +38,10 @@ struct alignas(64) SwizzleTable {      std::array<std::array<u16, M>, N> values{};  }; -constexpr u32 gob_size_x_shift = 6; -constexpr u32 gob_size_y_shift = 3; -constexpr u32 gob_size_z_shift = 0; -constexpr u32 gob_size_shift = gob_size_x_shift + gob_size_y_shift + gob_size_z_shift; +constexpr u32 FAST_SWIZZLE_ALIGN = 16; -constexpr u32 gob_size_x = 1U << gob_size_x_shift; -constexpr u32 gob_size_y = 1U << gob_size_y_shift; -constexpr u32 gob_size_z = 1U << gob_size_z_shift; -constexpr u32 gob_size = 1U << gob_size_shift; - -constexpr u32 fast_swizzle_align = 16; - -constexpr auto legacy_swizzle_table = SwizzleTable<gob_size_y, gob_size_x, gob_size_z>(); -constexpr auto fast_swizzle_table = SwizzleTable<gob_size_y, 4, fast_swizzle_align>(); +constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>(); +constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>();  /**   * This function manages ALL the GOBs(Group of Bytes) Inside a single block. 
@@ -69,17 +60,17 @@ void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, con          u32 y_address = z_address;          u32 pixel_base = layer_z * z + y_start * stride_x;          for (u32 y = y_start; y < y_end; y++) { -            const auto& table = legacy_swizzle_table[y % gob_size_y]; +            const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y];              for (u32 x = x_start; x < x_end; x++) { -                const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % gob_size_x]}; +                const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]};                  const u32 pixel_index{x * out_bytes_per_pixel + pixel_base};                  data_ptrs[unswizzle] = swizzled_data + swizzle_offset;                  data_ptrs[!unswizzle] = unswizzled_data + pixel_index;                  std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);              }              pixel_base += stride_x; -            if ((y + 1) % gob_size_y == 0) -                y_address += gob_size; +            if ((y + 1) % GOB_SIZE_Y == 0) +                y_address += GOB_SIZE;          }          z_address += xy_block_size;      } @@ -104,18 +95,18 @@ void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const          u32 y_address = z_address;          u32 pixel_base = layer_z * z + y_start * stride_x;          for (u32 y = y_start; y < y_end; y++) { -            const auto& table = fast_swizzle_table[y % gob_size_y]; -            for (u32 xb = x_startb; xb < x_endb; xb += fast_swizzle_align) { -                const u32 swizzle_offset{y_address + table[(xb / fast_swizzle_align) % 4]}; +            const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y]; +            for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) { +                const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]};                  const u32 out_x = xb * out_bytes_per_pixel / 
bytes_per_pixel;                  const u32 pixel_index{out_x + pixel_base};                  data_ptrs[unswizzle ? 1 : 0] = swizzled_data + swizzle_offset;                  data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index; -                std::memcpy(data_ptrs[0], data_ptrs[1], fast_swizzle_align); +                std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN);              }              pixel_base += stride_x; -            if ((y + 1) % gob_size_y == 0) -                y_address += gob_size; +            if ((y + 1) % GOB_SIZE_Y == 0) +                y_address += GOB_SIZE;          }          z_address += xy_block_size;      } @@ -138,9 +129,9 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool      auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };      const u32 stride_x = width * out_bytes_per_pixel;      const u32 layer_z = height * stride_x; -    const u32 gob_elements_x = gob_size_x / bytes_per_pixel; -    constexpr u32 gob_elements_y = gob_size_y; -    constexpr u32 gob_elements_z = gob_size_z; +    const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel; +    constexpr u32 gob_elements_y = GOB_SIZE_Y; +    constexpr u32 gob_elements_z = GOB_SIZE_Z;      const u32 block_x_elements = gob_elements_x;      const u32 block_y_elements = gob_elements_y * block_height;      const u32 block_z_elements = gob_elements_z * block_depth; @@ -148,7 +139,7 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool      const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements);      const u32 blocks_on_y = div_ceil(height, block_y_elements);      const u32 blocks_on_z = div_ceil(depth, block_z_elements); -    const u32 xy_block_size = gob_size * block_height; +    const u32 xy_block_size = GOB_SIZE * block_height;      const u32 block_size = xy_block_size * block_depth;      u32 tile_offset = 0;      for (u32 zb = 0; zb < blocks_on_z; zb++) { @@ -182,7 
+173,7 @@ void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel,                        bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) {      const u32 block_height_size{1U << block_height};      const u32 block_depth_size{1U << block_depth}; -    if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % fast_swizzle_align == 0) { +    if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) {          SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth,                             bytes_per_pixel, out_bytes_per_pixel, block_height_size,                             block_depth_size, width_spacing); @@ -259,25 +250,26 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,  }  void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, -                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, +                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,                      u32 block_height_bit, u32 offset_x, u32 offset_y) {      const u32 block_height = 1U << block_height_bit; -    const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) / -                                  gob_size_x}; +    const u32 image_width_in_gobs = +        (swizzled_width * bytes_per_pixel + (GOB_SIZE_X - 1)) / GOB_SIZE_X;      for (u32 line = 0; line < subrect_height; ++line) {          const u32 dst_y = line + offset_y;          const u32 gob_address_y = -            (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + -            ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size; -        const auto& table = legacy_swizzle_table[dst_y % gob_size_y]; +            (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + +            ((dst_y % (GOB_SIZE_Y * 
block_height)) / GOB_SIZE_Y) * GOB_SIZE; +        const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];          for (u32 x = 0; x < subrect_width; ++x) {              const u32 dst_x = x + offset_x;              const u32 gob_address = -                gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height; -            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x]; -            u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel; -            u8* dest_addr = swizzled_data + swizzled_offset; +                gob_address_y + (dst_x * bytes_per_pixel / GOB_SIZE_X) * GOB_SIZE * block_height; +            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % GOB_SIZE_X]; +            const u32 unswizzled_offset = line * source_pitch + x * bytes_per_pixel; +            const u8* const source_line = unswizzled_data + unswizzled_offset; +            u8* const dest_addr = swizzled_data + swizzled_offset;              std::memcpy(dest_addr, source_line, bytes_per_pixel);          }      } @@ -289,14 +281,15 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32      const u32 block_height = 1U << block_height_bit;      for (u32 line = 0; line < subrect_height; ++line) {          const u32 y2 = line + offset_y; -        const u32 gob_address_y = (y2 / (gob_size_y * block_height)) * gob_size * block_height + -                                  ((y2 % (gob_size_y * block_height)) / gob_size_y) * gob_size; -        const auto& table = legacy_swizzle_table[y2 % gob_size_y]; +        const u32 gob_address_y = (y2 / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height + +                                  ((y2 % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; +        const auto& table = LEGACY_SWIZZLE_TABLE[y2 % GOB_SIZE_Y];          for (u32 x = 0; x < subrect_width; ++x) {              const u32 x2 = (x + offset_x) * 
bytes_per_pixel; -            const u32 gob_address = gob_address_y + (x2 / gob_size_x) * gob_size * block_height; -            const u32 swizzled_offset = gob_address + table[x2 % gob_size_x]; -            u8* dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel; +            const u32 gob_address = gob_address_y + (x2 / GOB_SIZE_X) * GOB_SIZE * block_height; +            const u32 swizzled_offset = gob_address + table[x2 % GOB_SIZE_X]; +            const u32 unswizzled_offset = line * dest_pitch + x * bytes_per_pixel; +            u8* dest_line = unswizzled_data + unswizzled_offset;              u8* source_addr = swizzled_data + swizzled_offset;              std::memcpy(dest_line, source_addr, bytes_per_pixel); @@ -304,21 +297,48 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32      }  } +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, +                         u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, +                         u32 origin_y, u8* output, const u8* input) { +    UNIMPLEMENTED_IF(origin_x > 0); +    UNIMPLEMENTED_IF(origin_y > 0); + +    const u32 stride = width * bytes_per_pixel; +    const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; +    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); + +    const u32 block_height_mask = (1U << block_height) - 1; +    const u32 x_shift = Common::CountTrailingZeroes32(GOB_SIZE << (block_height + block_depth)); + +    for (u32 line = 0; line < line_count; ++line) { +        const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y]; +        const u32 block_y = line / GOB_SIZE_Y; +        const u32 dst_offset_y = +            (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE; +        for (u32 x = 0; x < line_length_in; ++x) { +            const u32 dst_offset = +                ((x / GOB_SIZE_X) << x_shift) + 
dst_offset_y + table[x % GOB_SIZE_X]; +            const u32 src_offset = x * bytes_per_pixel + line * pitch; +            std::memcpy(output + dst_offset, input + src_offset, bytes_per_pixel); +        } +    } +} +  void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,                     const u32 block_height_bit, const std::size_t copy_size, const u8* source_data,                     u8* swizzle_data) {      const u32 block_height = 1U << block_height_bit; -    const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; +    const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X};      std::size_t count = 0;      for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {          const std::size_t gob_address_y = -            (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + -            ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; -        const auto& table = legacy_swizzle_table[y % gob_size_y]; +            (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + +            ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; +        const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y];          for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {              const std::size_t gob_address = -                gob_address_y + (x / gob_size_x) * gob_size * block_height; -            const std::size_t swizzled_offset = gob_address + table[x % gob_size_x]; +                gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height; +            const std::size_t swizzled_offset = gob_address + table[x % GOB_SIZE_X];              const u8* source_line = source_data + count;              u8* dest_addr = swizzle_data + swizzled_offset;              count++; @@ -373,9 +393,9 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat  std::size_t CalculateSize(bool tiled, 
u32 bytes_per_pixel, u32 width, u32 height, u32 depth,                            u32 block_height, u32 block_depth) {      if (tiled) { -        const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, gob_size_x_shift); -        const u32 aligned_height = Common::AlignBits(height, gob_size_y_shift + block_height); -        const u32 aligned_depth = Common::AlignBits(depth, gob_size_z_shift + block_depth); +        const u32 aligned_width = Common::AlignBits(width * bytes_per_pixel, GOB_SIZE_X_SHIFT); +        const u32 aligned_height = Common::AlignBits(height, GOB_SIZE_Y_SHIFT + block_height); +        const u32 aligned_depth = Common::AlignBits(depth, GOB_SIZE_Z_SHIFT + block_depth);          return aligned_width * aligned_height * aligned_depth;      } else {          return width * height * depth * bytes_per_pixel; @@ -386,14 +406,14 @@ u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,                   u32 bytes_per_pixel) {      auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };      const u32 gobs_in_block = 1 << block_height; -    const u32 y_blocks = gob_size_y << block_height; -    const u32 x_per_gob = gob_size_x / bytes_per_pixel; +    const u32 y_blocks = GOB_SIZE_Y << block_height; +    const u32 x_per_gob = GOB_SIZE_X / bytes_per_pixel;      const u32 x_blocks = div_ceil(width, x_per_gob); -    const u32 block_size = gob_size * gobs_in_block; +    const u32 block_size = GOB_SIZE * gobs_in_block;      const u32 stride = block_size * x_blocks;      const u32 base = (dst_y / y_blocks) * stride + (dst_x / x_per_gob) * block_size;      const u32 relative_y = dst_y % y_blocks; -    return base + (relative_y / gob_size_y) * gob_size; +    return base + (relative_y / GOB_SIZE_Y) * GOB_SIZE;  }  } // namespace Tegra::Texture diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index 06f3ebf87..232b696b3 100644 --- a/src/video_core/textures/decoders.h +++ 
b/src/video_core/textures/decoders.h @@ -10,15 +10,15 @@  namespace Tegra::Texture { -// GOBSize constant. Calculated by 64 bytes in x multiplied by 8 y coords, represents -// an small rect of (64/bytes_per_pixel)X8. -inline std::size_t GetGOBSize() { -    return 512; -} +constexpr u32 GOB_SIZE_X = 64; +constexpr u32 GOB_SIZE_Y = 8; +constexpr u32 GOB_SIZE_Z = 1; +constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; -inline std::size_t GetGOBSizeShift() { -    return 9; -} +constexpr std::size_t GOB_SIZE_X_SHIFT = 6; +constexpr std::size_t GOB_SIZE_Y_SHIFT = 3; +constexpr std::size_t GOB_SIZE_Z_SHIFT = 0; +constexpr std::size_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;  /// Unswizzles a swizzled texture without changing its format.  void UnswizzleTexture(u8* unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, @@ -48,14 +48,32 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height  /// Copies an untiled subrectangle into a tiled surface.  void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, -                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, -                    u32 offset_x, u32 offset_y); +                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, +                    u32 block_height_bit, u32 offset_x, u32 offset_y);  /// Copies a tiled subrectangle into a linear surface.  
void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,                        u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,                        u32 offset_x, u32 offset_y); +/// @brief Swizzles a 2D array of pixels into a 3D texture +/// @param line_length_in  Number of pixels per line +/// @param line_count      Number of lines +/// @param pitch           Number of bytes per line +/// @param width           Width of the swizzled texture +/// @param height          Height of the swizzled texture +/// @param bytes_per_pixel Number of bytes used per pixel +/// @param block_height    Block height shift +/// @param block_depth     Block depth shift +/// @param origin_x        Column offset in pixels of the swizzled texture +/// @param origin_y        Row offset in pixels of the swizzled texture +/// @param output          Pointer to the pixels of the swizzled texture +/// @param input           Pointer to the 2D array of pixels used as input +/// @pre input and output points to an array large enough to hold the number of bytes used +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, +                         u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, +                         u32 origin_y, u8* output, const u8* input); +  void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,                     std::size_t copy_size, const u8* source_data, u8* swizzle_data); | 
