From 33a4d7d1badbebd2dc05114ef17c85678baed843 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Thu, 17 Nov 2022 17:47:41 +0000 Subject: [PATCH] GPU: Eliminate CB0 accesses when storage buffer accesses are resolved (#3847) * Eliminate CB0 accesses Still some work to do, decouple from hle? * Forgot the important part somehow * Fix and improve alignment test * Address Feedback * Remove some complexity when checking storage buffer alignment * Update Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs Co-authored-by: gdkchan Co-authored-by: gdkchan --- Ryujinx.Graphics.Gpu/Constants.cs | 5 + .../Engine/Compute/ComputeClass.cs | 45 ++-- .../Engine/Threed/StateUpdater.cs | 8 +- Ryujinx.Graphics.Gpu/Memory/BufferManager.cs | 46 +++- .../Shader/ComputeShaderCacheHashTable.cs | 4 +- .../Shader/DiskCache/DiskCacheGpuAccessor.cs | 6 + .../Shader/DiskCache/DiskCacheHostStorage.cs | 2 +- Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs | 6 + .../Shader/GpuChannelComputeState.cs | 10 +- .../Shader/GpuChannelGraphicsState.cs | 10 +- Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs | 8 +- .../Shader/ShaderSpecializationList.cs | 5 +- .../Shader/ShaderSpecializationState.cs | 13 +- Ryujinx.Graphics.Shader/Constants.cs | 2 + Ryujinx.Graphics.Shader/IGpuAccessor.cs | 9 + .../Optimizations/GlobalToStorage.cs | 206 ++++++++++++++---- 16 files changed, 317 insertions(+), 68 deletions(-) diff --git a/Ryujinx.Graphics.Gpu/Constants.cs b/Ryujinx.Graphics.Gpu/Constants.cs index d580049f2..1897f5d0f 100644 --- a/Ryujinx.Graphics.Gpu/Constants.cs +++ b/Ryujinx.Graphics.Gpu/Constants.cs @@ -95,5 +95,10 @@ namespace Ryujinx.Graphics.Gpu /// Byte alignment for block linear textures /// public const int GobAlignment = 64; + + /// + /// Expected byte alignment for storage buffers + /// + public const int StorageAlignment = 16; } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Engine/Compute/ComputeClass.cs b/Ryujinx.Graphics.Gpu/Engine/Compute/ComputeClass.cs index 
bc2911748..cd509471e 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Compute/ComputeClass.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Compute/ComputeClass.cs @@ -138,7 +138,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Compute qmd.CtaThreadDimension1, qmd.CtaThreadDimension2, localMemorySize, - sharedMemorySize); + sharedMemorySize, + _channel.BufferManager.HasUnalignedStorageBuffers); CachedShaderProgram cs = memoryManager.Physical.ShaderCache.GetComputeShader(_channel, poolState, computeState, shaderGpuVa); @@ -150,6 +151,33 @@ namespace Ryujinx.Graphics.Gpu.Engine.Compute ShaderProgramInfo info = cs.Shaders[0].Info; + bool hasUnaligned = _channel.BufferManager.HasUnalignedStorageBuffers; + + for (int index = 0; index < info.SBuffers.Count; index++) + { + BufferDescriptor sb = info.SBuffers[index]; + + ulong sbDescAddress = _channel.BufferManager.GetComputeUniformBufferAddress(0); + + int sbDescOffset = 0x310 + sb.Slot * 0x10; + + sbDescAddress += (ulong)sbDescOffset; + + SbDescriptor sbDescriptor = _channel.MemoryManager.Physical.Read(sbDescAddress); + + _channel.BufferManager.SetComputeStorageBuffer(sb.Slot, sbDescriptor.PackAddress(), (uint)sbDescriptor.Size, sb.Flags); + } + + if ((_channel.BufferManager.HasUnalignedStorageBuffers) != hasUnaligned) + { + // Refetch the shader, as assumptions about storage buffer alignment have changed. 
+ cs = memoryManager.Physical.ShaderCache.GetComputeShader(_channel, poolState, computeState, shaderGpuVa); + + _context.Renderer.Pipeline.SetProgram(cs.HostProgram); + + info = cs.Shaders[0].Info; + } + for (int index = 0; index < info.CBuffers.Count; index++) { BufferDescriptor cb = info.CBuffers[index]; @@ -174,21 +202,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Compute _channel.BufferManager.SetComputeUniformBuffer(cb.Slot, cbDescriptor.PackAddress(), (uint)cbDescriptor.Size); } - for (int index = 0; index < info.SBuffers.Count; index++) - { - BufferDescriptor sb = info.SBuffers[index]; - - ulong sbDescAddress = _channel.BufferManager.GetComputeUniformBufferAddress(0); - - int sbDescOffset = 0x310 + sb.Slot * 0x10; - - sbDescAddress += (ulong)sbDescOffset; - - SbDescriptor sbDescriptor = _channel.MemoryManager.Physical.Read(sbDescAddress); - - _channel.BufferManager.SetComputeStorageBuffer(sb.Slot, sbDescriptor.PackAddress(), (uint)sbDescriptor.Size, sb.Flags); - } - _channel.BufferManager.SetComputeStorageBufferBindings(info.SBuffers); _channel.BufferManager.SetComputeUniformBufferBindings(info.CBuffers); diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs index 3f71172c0..d51077dc7 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs @@ -293,9 +293,12 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed /// private void CommitBindings() { + var buffers = _channel.BufferManager; + var hasUnaligned = buffers.HasUnalignedStorageBuffers; + UpdateStorageBuffers(); - if (!_channel.TextureManager.CommitGraphicsBindings(_shaderSpecState)) + if (!_channel.TextureManager.CommitGraphicsBindings(_shaderSpecState) || (buffers.HasUnalignedStorageBuffers != hasUnaligned)) { // Shader must be reloaded. 
UpdateShaderState(); @@ -1361,7 +1364,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed _state.State.AlphaTestFunc, _state.State.AlphaTestRef, ref attributeTypes, - _drawState.HasConstantBufferDrawParameters); + _drawState.HasConstantBufferDrawParameters, + _channel.BufferManager.HasUnalignedStorageBuffers); } /// diff --git a/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs b/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs index 9f1f88b1e..1b67f6507 100644 --- a/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs +++ b/Ryujinx.Graphics.Gpu/Memory/BufferManager.cs @@ -17,6 +17,9 @@ namespace Ryujinx.Graphics.Gpu.Memory private readonly GpuContext _context; private readonly GpuChannel _channel; + private int _unalignedStorageBuffers; + public bool HasUnalignedStorageBuffers => _unalignedStorageBuffers > 0; + private IndexBuffer _indexBuffer; private readonly VertexBuffer[] _vertexBuffers; private readonly BufferBounds[] _transformFeedbackBuffers; @@ -38,6 +41,11 @@ namespace Ryujinx.Graphics.Gpu.Memory /// public BufferBounds[] Buffers { get; } + /// + /// Flag indicating if this binding is unaligned. + /// + public bool[] Unaligned { get; } + /// /// Total amount of buffers used on the shader. /// @@ -51,6 +59,7 @@ namespace Ryujinx.Graphics.Gpu.Memory { Bindings = new BufferDescriptor[count]; Buffers = new BufferBounds[count]; + Unaligned = new bool[count]; } /// @@ -202,6 +211,31 @@ namespace Ryujinx.Graphics.Gpu.Memory _transformFeedbackBuffersDirty = true; } + /// + /// Records the alignment of a storage buffer. + /// Unaligned storage buffers disable some optimizations on the shader. + /// + /// The binding list to modify + /// Index of the storage buffer + /// Start GPU virtual address of the buffer + private void RecordStorageAlignment(BuffersPerStage buffers, int index, ulong gpuVa) + { + bool unaligned = (gpuVa & (Constants.StorageAlignment - 1)) != 0; + + if (unaligned || HasUnalignedStorageBuffers) + { + // Check if the alignment changed for this binding. 
+ + ref bool currentUnaligned = ref buffers.Unaligned[index]; + + if (currentUnaligned != unaligned) + { + currentUnaligned = unaligned; + _unalignedStorageBuffers += unaligned ? 1 : -1; + } + } + } + /// /// Sets a storage buffer on the compute pipeline. /// Storage buffers can be read and written to on shaders. @@ -214,6 +248,8 @@ namespace Ryujinx.Graphics.Gpu.Memory { size += gpuVa & ((ulong)_context.Capabilities.StorageBufferOffsetAlignment - 1); + RecordStorageAlignment(_cpStorageBuffers, index, gpuVa); + gpuVa = BitUtils.AlignDown(gpuVa, _context.Capabilities.StorageBufferOffsetAlignment); ulong address = _channel.MemoryManager.Physical.BufferCache.TranslateAndCreateBuffer(_channel.MemoryManager, gpuVa, size); @@ -234,17 +270,21 @@ namespace Ryujinx.Graphics.Gpu.Memory { size += gpuVa & ((ulong)_context.Capabilities.StorageBufferOffsetAlignment - 1); + BuffersPerStage buffers = _gpStorageBuffers[stage]; + + RecordStorageAlignment(buffers, index, gpuVa); + gpuVa = BitUtils.AlignDown(gpuVa, _context.Capabilities.StorageBufferOffsetAlignment); ulong address = _channel.MemoryManager.Physical.BufferCache.TranslateAndCreateBuffer(_channel.MemoryManager, gpuVa, size); - if (_gpStorageBuffers[stage].Buffers[index].Address != address || - _gpStorageBuffers[stage].Buffers[index].Size != size) + if (buffers.Buffers[index].Address != address || + buffers.Buffers[index].Size != size) { _gpStorageBuffersDirty = true; } - _gpStorageBuffers[stage].SetBounds(index, address, size, flags); + buffers.SetBounds(index, address, size, flags); } /// diff --git a/Ryujinx.Graphics.Gpu/Shader/ComputeShaderCacheHashTable.cs b/Ryujinx.Graphics.Gpu/Shader/ComputeShaderCacheHashTable.cs index 08154df32..a67182112 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ComputeShaderCacheHashTable.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ComputeShaderCacheHashTable.cs @@ -36,6 +36,7 @@ namespace Ryujinx.Graphics.Gpu.Shader /// /// GPU channel /// Texture pool state + /// Compute state /// GPU virtual address 
of the compute shader /// Cached host program for the given state, if found /// Cached guest code, if any found @@ -43,6 +44,7 @@ namespace Ryujinx.Graphics.Gpu.Shader public bool TryFind( GpuChannel channel, GpuChannelPoolState poolState, + GpuChannelComputeState computeState, ulong gpuVa, out CachedShaderProgram program, out byte[] cachedGuestCode) @@ -50,7 +52,7 @@ namespace Ryujinx.Graphics.Gpu.Shader program = null; ShaderCodeAccessor codeAccessor = new ShaderCodeAccessor(channel.MemoryManager, gpuVa); bool hasSpecList = _cache.TryFindItem(codeAccessor, out var specList, out cachedGuestCode); - return hasSpecList && specList.TryFindForCompute(channel, poolState, out program); + return hasSpecList && specList.TryFindForCompute(channel, poolState, computeState, out program); } /// diff --git a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheGpuAccessor.cs b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheGpuAccessor.cs index 98748bf62..c567c2c06 100644 --- a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheGpuAccessor.cs +++ b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheGpuAccessor.cs @@ -225,6 +225,12 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache return _oldSpecState.GraphicsState.EarlyZForce; } + /// + public bool QueryHasUnalignedStorageBuffer() + { + return _oldSpecState.GraphicsState.HasUnalignedStorageBuffer || _oldSpecState.ComputeState.HasUnalignedStorageBuffer; + } + /// public bool QueryViewportTransformDisable() { diff --git a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs index 0bdf49499..e23b4d50e 100644 --- a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs +++ b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs @@ -22,7 +22,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache private const ushort FileFormatVersionMajor = 1; private const ushort FileFormatVersionMinor = 2; private const uint FileFormatVersionPacked = 
((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor; - private const uint CodeGenVersion = 3747; + private const uint CodeGenVersion = 3848; private const string SharedTocFileName = "shared.toc"; private const string SharedDataFileName = "shared.data"; diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs b/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs index b8cb1107f..28ea430cd 100644 --- a/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs +++ b/Ryujinx.Graphics.Gpu/Shader/GpuAccessor.cs @@ -145,6 +145,12 @@ namespace Ryujinx.Graphics.Gpu.Shader return _state.GraphicsState.HasConstantBufferDrawParameters; } + /// + public bool QueryHasUnalignedStorageBuffer() + { + return _state.GraphicsState.HasUnalignedStorageBuffer || _state.ComputeState.HasUnalignedStorageBuffer; + } + /// public InputTopology QueryPrimitiveTopology() { diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuChannelComputeState.cs b/Ryujinx.Graphics.Gpu/Shader/GpuChannelComputeState.cs index 89a3db712..356d3f3e4 100644 --- a/Ryujinx.Graphics.Gpu/Shader/GpuChannelComputeState.cs +++ b/Ryujinx.Graphics.Gpu/Shader/GpuChannelComputeState.cs @@ -32,6 +32,11 @@ namespace Ryujinx.Graphics.Gpu.Shader /// public readonly int SharedMemorySize; + /// + /// Indicates that any storage buffer use is unaligned. + /// + public readonly bool HasUnalignedStorageBuffer; + /// /// Creates a new GPU compute state. 
/// @@ -40,18 +45,21 @@ namespace Ryujinx.Graphics.Gpu.Shader /// Local group size Z of the compute shader /// Local memory size of the compute shader /// Shared memory size of the compute shader + /// Indicates that any storage buffer use is unaligned public GpuChannelComputeState( int localSizeX, int localSizeY, int localSizeZ, int localMemorySize, - int sharedMemorySize) + int sharedMemorySize, + bool hasUnalignedStorageBuffer) { LocalSizeX = localSizeX; LocalSizeY = localSizeY; LocalSizeZ = localSizeZ; LocalMemorySize = localMemorySize; SharedMemorySize = sharedMemorySize; + HasUnalignedStorageBuffer = hasUnalignedStorageBuffer; } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuChannelGraphicsState.cs b/Ryujinx.Graphics.Gpu/Shader/GpuChannelGraphicsState.cs index b07276774..511f4c235 100644 --- a/Ryujinx.Graphics.Gpu/Shader/GpuChannelGraphicsState.cs +++ b/Ryujinx.Graphics.Gpu/Shader/GpuChannelGraphicsState.cs @@ -82,6 +82,11 @@ namespace Ryujinx.Graphics.Gpu.Shader /// public readonly bool HasConstantBufferDrawParameters; + /// + /// Indicates that any storage buffer use is unaligned. + /// + public readonly bool HasUnalignedStorageBuffer; + /// /// Creates a new GPU graphics state. 
/// @@ -99,6 +104,7 @@ namespace Ryujinx.Graphics.Gpu.Shader /// When alpha test is enabled, indicates the value to compare with the fragment output alpha /// Type of the vertex attributes consumed by the shader /// Indicates that the draw is writing the base vertex, base instance and draw index to Constant Buffer 0 + /// Indicates that any storage buffer use is unaligned public GpuChannelGraphicsState( bool earlyZForce, PrimitiveTopology topology, @@ -113,7 +119,8 @@ namespace Ryujinx.Graphics.Gpu.Shader CompareOp alphaTestCompare, float alphaTestReference, ref Array32 attributeTypes, - bool hasConstantBufferDrawParameters) + bool hasConstantBufferDrawParameters, + bool hasUnalignedStorageBuffer) { EarlyZForce = earlyZForce; Topology = topology; @@ -129,6 +136,7 @@ namespace Ryujinx.Graphics.Gpu.Shader AlphaTestReference = alphaTestReference; AttributeTypes = attributeTypes; HasConstantBufferDrawParameters = hasConstantBufferDrawParameters; + HasUnalignedStorageBuffer = hasUnalignedStorageBuffer; } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs index 1803dae61..2a9dd6a5c 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs @@ -203,12 +203,12 @@ namespace Ryujinx.Graphics.Gpu.Shader GpuChannelComputeState computeState, ulong gpuVa) { - if (_cpPrograms.TryGetValue(gpuVa, out var cpShader) && IsShaderEqual(channel, poolState, cpShader, gpuVa)) + if (_cpPrograms.TryGetValue(gpuVa, out var cpShader) && IsShaderEqual(channel, poolState, computeState, cpShader, gpuVa)) { return cpShader; } - if (_computeShaderCache.TryFind(channel, poolState, gpuVa, out cpShader, out byte[] cachedGuestCode)) + if (_computeShaderCache.TryFind(channel, poolState, computeState, gpuVa, out cpShader, out byte[] cachedGuestCode)) { _cpPrograms[gpuVa] = cpShader; return cpShader; @@ -473,18 +473,20 @@ namespace Ryujinx.Graphics.Gpu.Shader /// /// GPU channel 
using the shader /// GPU channel state to verify shader compatibility + /// GPU channel compute state to verify shader compatibility /// Cached compute shader /// GPU virtual address of the shader code in memory /// True if the code is different, false otherwise private static bool IsShaderEqual( GpuChannel channel, GpuChannelPoolState poolState, + GpuChannelComputeState computeState, CachedShaderProgram cpShader, ulong gpuVa) { if (IsShaderEqual(channel.MemoryManager, cpShader.Shaders[0], gpuVa)) { - return cpShader.SpecializationState.MatchesCompute(channel, poolState, true); + return cpShader.SpecializationState.MatchesCompute(channel, poolState, computeState, true); } return false; diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationList.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationList.cs index abc9d913b..cb6ab49a8 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationList.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationList.cs @@ -53,13 +53,14 @@ namespace Ryujinx.Graphics.Gpu.Shader /// /// GPU channel /// Texture pool state + /// Compute state /// Cached program, if found /// True if a compatible program is found, false otherwise - public bool TryFindForCompute(GpuChannel channel, GpuChannelPoolState poolState, out CachedShaderProgram program) + public bool TryFindForCompute(GpuChannel channel, GpuChannelPoolState poolState, GpuChannelComputeState computeState, out CachedShaderProgram program) { foreach (var entry in _entries) { - if (entry.SpecializationState.MatchesCompute(channel, poolState, true)) + if (entry.SpecializationState.MatchesCompute(channel, poolState, computeState, true)) { program = entry; return true; diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs index 0aecc5b7b..8f931507a 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs @@ -531,6 +531,11 @@ 
namespace Ryujinx.Graphics.Gpu.Shader return false; } + if (graphicsState.HasUnalignedStorageBuffer != GraphicsState.HasUnalignedStorageBuffer) + { + return false; + } + return Matches(channel, poolState, checkTextures, isCompute: false); } @@ -539,10 +544,16 @@ namespace Ryujinx.Graphics.Gpu.Shader /// /// GPU channel /// Texture pool state + /// Compute state /// Indicates whether texture descriptors should be checked /// True if the state matches, false otherwise - public bool MatchesCompute(GpuChannel channel, GpuChannelPoolState poolState, bool checkTextures) + public bool MatchesCompute(GpuChannel channel, GpuChannelPoolState poolState, GpuChannelComputeState computeState, bool checkTextures) { + if (computeState.HasUnalignedStorageBuffer != ComputeState.HasUnalignedStorageBuffer) + { + return false; + } + return Matches(channel, poolState, checkTextures, isCompute: true); } diff --git a/Ryujinx.Graphics.Shader/Constants.cs b/Ryujinx.Graphics.Shader/Constants.cs index 7f1445ed0..c6f9ef494 100644 --- a/Ryujinx.Graphics.Shader/Constants.cs +++ b/Ryujinx.Graphics.Shader/Constants.cs @@ -10,5 +10,7 @@ namespace Ryujinx.Graphics.Shader public const int NvnBaseVertexByteOffset = 0x640; public const int NvnBaseInstanceByteOffset = 0x644; public const int NvnDrawIndexByteOffset = 0x648; + + public const int StorageAlignment = 16; } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Shader/IGpuAccessor.cs b/Ryujinx.Graphics.Shader/IGpuAccessor.cs index 4f800a145..f05a8527b 100644 --- a/Ryujinx.Graphics.Shader/IGpuAccessor.cs +++ b/Ryujinx.Graphics.Shader/IGpuAccessor.cs @@ -177,6 +177,15 @@ namespace Ryujinx.Graphics.Shader return false; } + /// + /// Queries whenever the current draw uses unaligned storage buffer addresses. + /// + /// True if any storage buffer address is not aligned to 16 bytes, false otherwise + bool QueryHasUnalignedStorageBuffer() + { + return false; + } + /// /// Queries host about the presence of the FrontFacing built-in variable bug. 
/// diff --git a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs index d2200d0b8..25c0eb25b 100644 --- a/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs +++ b/Ryujinx.Graphics.Shader/Translation/Optimizations/GlobalToStorage.cs @@ -34,7 +34,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations // we can guess which storage buffer it is accessing. // We can then replace the global memory access with a storage // buffer access. - node = ReplaceGlobalWithStorage(node, config, storageIndex); + node = ReplaceGlobalWithStorage(block, node, config, storageIndex); } else if (config.Stage == ShaderStage.Compute && operation.Inst == Instruction.LoadGlobal) { @@ -54,7 +54,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations } } - private static LinkedListNode ReplaceGlobalWithStorage(LinkedListNode node, ShaderConfig config, int storageIndex) + private static LinkedListNode ReplaceGlobalWithStorage(BasicBlock block, LinkedListNode node, ShaderConfig config, int storageIndex) { Operation operation = (Operation)node.Value; @@ -64,42 +64,10 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations config.SetUsedStorageBuffer(storageIndex, isWrite); - Operand GetStorageOffset() - { - Operand addrLow = operation.GetSource(0); - - Operand baseAddrLow = Cbuf(0, GetStorageCbOffset(config.Stage, storageIndex)); - - Operand baseAddrTrunc = Local(); - - Operand alignMask = Const(-config.GpuAccessor.QueryHostStorageBufferOffsetAlignment()); - - Operation andOp = new Operation(Instruction.BitwiseAnd, baseAddrTrunc, baseAddrLow, alignMask); - - node.List.AddBefore(node, andOp); - - Operand byteOffset = Local(); - Operation subOp = new Operation(Instruction.Subtract, byteOffset, addrLow, baseAddrTrunc); - - node.List.AddBefore(node, subOp); - - if (isStg16Or8) - { - return byteOffset; - } - - Operand wordOffset = Local(); - Operation shrOp = new 
Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2)); - - node.List.AddBefore(node, shrOp); - - return wordOffset; - } - Operand[] sources = new Operand[operation.SourcesCount]; sources[0] = Const(storageIndex); - sources[1] = GetStorageOffset(); + sources[1] = GetStorageOffset(block, node, config, storageIndex, operation.GetSource(0), isStg16Or8); for (int index = 2; index < operation.SourcesCount; index++) { @@ -144,6 +112,186 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations return node; }
+        /// <summary>
+        /// Builds the offset operand used when a global memory access is replaced with a storage buffer access.
+        /// </summary>
+        /// <param name="block">Basic block passed to the operand lookups</param>
+        /// <param name="node">Node of the operation being replaced; new operations are inserted before it</param>
+        /// <param name="config">Shader configuration, used to query the stage and the GPU accessor</param>
+        /// <param name="storageIndex">Index of the storage buffer being accessed</param>
+        /// <param name="addrLow">Address operand of the access (source 0 of the replaced operation)</param>
+        /// <param name="isStg16Or8">True to return the byte offset, false to return the word offset</param>
+        /// <returns>The byte offset, or the byte offset shifted right by 2 when <paramref name="isStg16Or8"/> is false</returns>
+        private static Operand GetStorageOffset(
+            BasicBlock block,
+            LinkedListNode node,
+            ShaderConfig config,
+            int storageIndex,
+            Operand addrLow,
+            bool isStg16Or8)
+        {
+            int baseAddressCbOffset = GetStorageCbOffset(config.Stage, storageIndex);
+
+            // Only try to strip the CB0 base address when the GPU accessor reports no unaligned storage buffer
+            // and the host does not require a larger offset alignment than Constants.StorageAlignment.
+            bool storageAligned = !(config.GpuAccessor.QueryHasUnalignedStorageBuffer() || config.GpuAccessor.QueryHostStorageBufferOffsetAlignment() > Constants.StorageAlignment);
+
+            (Operand byteOffset, int constantOffset) = storageAligned ?
+                GetStorageOffset(block, Utils.FindLastOperation(addrLow, block), baseAddressCbOffset) :
+                (null, 0);
+
+            if (byteOffset != null)
+            {
+                // Must run before byteOffset is reassigned below: ReplaceAddressAlignment adds
+                // constantOffset itself, so it needs the offset without the constant, and it must
+                // be skipped entirely when the offset could not be resolved.
+                ReplaceAddressAlignment(node.List, addrLow, byteOffset, constantOffset);
+            }
+
+            if (byteOffset == null)
+            {
+                // Fallback: subtract the CB0 base address, masked with the negated host offset alignment.
+                Operand baseAddrLow = Cbuf(0, baseAddressCbOffset);
+                Operand baseAddrTrunc = Local();
+
+                Operand alignMask = Const(-config.GpuAccessor.QueryHostStorageBufferOffsetAlignment());
+
+                Operation andOp = new Operation(Instruction.BitwiseAnd, baseAddrTrunc, baseAddrLow, alignMask);
+
+                node.List.AddBefore(node, andOp);
+
+                Operand offset = Local();
+                Operation subOp = new Operation(Instruction.Subtract, offset, addrLow, baseAddrTrunc);
+
+                node.List.AddBefore(node, subOp);
+
+                byteOffset = offset;
+            }
+            else if (constantOffset != 0)
+            {
+                Operand offset = Local();
+                Operation addOp = new Operation(Instruction.Add, offset, byteOffset, Const(constantOffset));
+
+                node.List.AddBefore(node, addOp);
+
+                byteOffset = offset;
+            }
+
+            if (isStg16Or8)
+            {
+                return byteOffset;
+            }
+
+            Operand wordOffset = Local();
+            Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2));
+
+            node.List.AddBefore(node, shrOp);
+
+            return wordOffset;
+        }
+
+ private static bool IsCb0Offset(Operand operand, int offset) + { + return operand.Type == OperandType.ConstantBuffer && operand.GetCbufSlot() == 0 && operand.GetCbufOffset() == offset; + } + + private static void ReplaceAddressAlignment(LinkedList list, Operand address, Operand byteOffset, int constantOffset) + { + // When we emit 16/8-bit LDG, we add extra code to determine the address alignment. + // Eliminate the storage buffer base address from this too, leaving only the byte offset. 
+ + foreach (INode useNode in address.UseOps) + { + if (useNode is Operation op && op.Inst == Instruction.BitwiseAnd) + { + Operand src1 = op.GetSource(0); + Operand src2 = op.GetSource(1); + + int addressIndex = -1; + + if (src1 == address && src2.Type == OperandType.Constant && src2.Value == 3) + { + addressIndex = 0; + } + else if (src2 == address && src1.Type == OperandType.Constant && src1.Value == 3) + { + addressIndex = 1; + } + + if (addressIndex != -1) + { + LinkedListNode node = list.Find(op); + + // Add offset calculation before the use. Needs to be on the same block. + if (node != null) + { + Operand offset = Local(); + Operation addOp = new Operation(Instruction.Add, offset, byteOffset, Const(constantOffset)); + list.AddBefore(node, addOp); + + op.SetSource(addressIndex, offset); + } + } + } + } + } + + private static (Operand, int) GetStorageOffset(BasicBlock block, Operand address, int baseAddressCbOffset) + { + if (IsCb0Offset(address, baseAddressCbOffset)) + { + // Direct offset: zero. 
+ return (Const(0), 0); + } + + (address, int constantOffset) = GetStorageConstantOffset(block, address); + + address = Utils.FindLastOperation(address, block); + + if (IsCb0Offset(address, baseAddressCbOffset)) + { + // Only constant offset + return (Const(0), constantOffset); + } + + if (!(address.AsgOp is Operation offsetAdd) || offsetAdd.Inst != Instruction.Add) + { + return (null, 0); + } + + Operand src1 = offsetAdd.GetSource(0); + Operand src2 = Utils.FindLastOperation(offsetAdd.GetSource(1), block); + + if (IsCb0Offset(src2, baseAddressCbOffset)) + { + return (src1, constantOffset); + } + else if (IsCb0Offset(src1, baseAddressCbOffset)) + { + return (src2, constantOffset); + } + + return (null, 0); + } + + private static (Operand, int) GetStorageConstantOffset(BasicBlock block, Operand address) + { + if (!(address.AsgOp is Operation offsetAdd) || offsetAdd.Inst != Instruction.Add) + { + return (address, 0); + } + + Operand src1 = offsetAdd.GetSource(0); + Operand src2 = offsetAdd.GetSource(1); + + if (src2.Type != OperandType.Constant) + { + return (address, 0); + } + + return (src1, src2.Value); + } + private static LinkedListNode ReplaceLdgWithLdc(LinkedListNode node, ShaderConfig config, int storageIndex) { Operation operation = (Operation)node.Value; @@ -165,7 +297,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations Operand byteOffset = Local(); Operand wordOffset = Local(); - Operation subOp = new Operation(Instruction.Subtract, byteOffset, addrLow, baseAddrTrunc); + Operation subOp = new Operation(Instruction.Subtract, byteOffset, addrLow, baseAddrTrunc); Operation shrOp = new Operation(Instruction.ShiftRightU32, wordOffset, byteOffset, Const(2)); node.List.AddBefore(node, subOp); @@ -260,7 +392,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Optimizations { if (operand.Type == OperandType.ConstantBuffer) { - int slot = operand.GetCbufSlot(); + int slot = operand.GetCbufSlot(); int offset = operand.GetCbufOffset(); if (slot == 0 && 
offset >= sbStart && offset < sbEnd)