From f1943fd0b65b74f164eec1f47a586a463fd4352a Mon Sep 17 00:00:00 2001 From: riperiperi Date: Thu, 9 Feb 2023 02:50:18 +0000 Subject: [PATCH 01/41] Log shader compile errors with Warning level (#2617) * Log shader compile errors with Warning level These are infrequent enough that I think it's worth dumping any errors into the log. They also keep causing graphical glitches, and the only indication that anything went wrong is a debug log that is never enabled. * Add maximum length for shader log --- Ryujinx.Graphics.OpenGL/Program.cs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Ryujinx.Graphics.OpenGL/Program.cs b/Ryujinx.Graphics.OpenGL/Program.cs index 0cc722e6c..a6009108a 100644 --- a/Ryujinx.Graphics.OpenGL/Program.cs +++ b/Ryujinx.Graphics.OpenGL/Program.cs @@ -10,6 +10,8 @@ namespace Ryujinx.Graphics.OpenGL { class Program : IProgram { + private const int MaxShaderLogLength = 2048; + public int Handle { get; private set; } public bool IsLinked @@ -115,9 +117,16 @@ namespace Ryujinx.Graphics.OpenGL if (status == 0) { - // Use GL.GetProgramInfoLog(Handle), it may be too long to print on the log. 
_status = ProgramLinkStatus.Failure; - Logger.Debug?.Print(LogClass.Gpu, "Shader linking failed."); + + string log = GL.GetProgramInfoLog(Handle); + + if (log.Length > MaxShaderLogLength) + { + log = log.Substring(0, MaxShaderLogLength) + "..."; + } + + Logger.Warning?.Print(LogClass.Gpu, $"Shader linking failed: \n{log}"); } else { From 7bae440d3a5f2ed9ca7f93d8e39d6e2935926d41 Mon Sep 17 00:00:00 2001 From: Isaac Marovitz <42140194+IsaacMarovitz@users.noreply.github.com> Date: Wed, 8 Feb 2023 22:08:15 -0500 Subject: [PATCH 02/41] `ObjectiveC` Helper Class (#4286) * `NativeMacOS` Helper Class * Corrections * Make CFString IDisposable * Fix `openURL:` * `dealloc` metal layer * Remove releases * Use NSString * Update Ryujinx.Ui.Common/Helper/NativeMacOS.cs Co-authored-by: merry * Programatically select updates in Finder * Address feedback * Feedback * Ptr * Fix whoopsie * Ack suggestions * Update Ryujinx.Ava/UI/Renderer/EmbeddedWindow.cs Co-authored-by: gdkchan * GDK Suggestions --------- Co-authored-by: merry Co-authored-by: gdkchan --- Ryujinx.Ava/UI/Helpers/MetalHelper.cs | 127 ---------------------- Ryujinx.Ava/UI/Renderer/EmbeddedWindow.cs | 28 ++++- Ryujinx.Ui.Common/Helper/ObjectiveC.cs | 97 +++++++++++++++++ Ryujinx.Ui.Common/Helper/OpenHelper.cs | 21 +++- 4 files changed, 141 insertions(+), 132 deletions(-) delete mode 100644 Ryujinx.Ava/UI/Helpers/MetalHelper.cs create mode 100644 Ryujinx.Ui.Common/Helper/ObjectiveC.cs diff --git a/Ryujinx.Ava/UI/Helpers/MetalHelper.cs b/Ryujinx.Ava/UI/Helpers/MetalHelper.cs deleted file mode 100644 index 5eb8660a1..000000000 --- a/Ryujinx.Ava/UI/Helpers/MetalHelper.cs +++ /dev/null @@ -1,127 +0,0 @@ -using System; -using System.Runtime.Versioning; -using System.Runtime.InteropServices; -using Avalonia; - -namespace Ryujinx.Ava.UI.Helpers -{ - public delegate void UpdateBoundsCallbackDelegate(Rect rect); - - [SupportedOSPlatform("macos")] - static partial class MetalHelper - { - private const string LibObjCImport = 
"/usr/lib/libobjc.A.dylib"; - - private struct Selector - { - public readonly IntPtr NativePtr; - - public unsafe Selector(string value) - { - int size = System.Text.Encoding.UTF8.GetMaxByteCount(value.Length); - byte* data = stackalloc byte[size]; - - fixed (char* pValue = value) - { - System.Text.Encoding.UTF8.GetBytes(pValue, value.Length, data, size); - } - - NativePtr = sel_registerName(data); - } - - public static implicit operator Selector(string value) => new Selector(value); - } - - private static unsafe IntPtr GetClass(string value) - { - int size = System.Text.Encoding.UTF8.GetMaxByteCount(value.Length); - byte* data = stackalloc byte[size]; - - fixed (char* pValue = value) - { - System.Text.Encoding.UTF8.GetBytes(pValue, value.Length, data, size); - } - - return objc_getClass(data); - } - - private struct NSPoint - { - public double X; - public double Y; - - public NSPoint(double x, double y) - { - X = x; - Y = y; - } - } - - private struct NSRect - { - public NSPoint Pos; - public NSPoint Size; - - public NSRect(double x, double y, double width, double height) - { - Pos = new NSPoint(x, y); - Size = new NSPoint(width, height); - } - } - - public static IntPtr GetMetalLayer(out IntPtr nsView, out UpdateBoundsCallbackDelegate updateBounds) - { - // Create a new CAMetalLayer. - IntPtr layerClass = GetClass("CAMetalLayer"); - IntPtr metalLayer = IntPtr_objc_msgSend(layerClass, "alloc"); - objc_msgSend(metalLayer, "init"); - - // Create a child NSView to render into. - IntPtr nsViewClass = GetClass("NSView"); - IntPtr child = IntPtr_objc_msgSend(nsViewClass, "alloc"); - objc_msgSend(child, "init", new NSRect(0, 0, 0, 0)); - - // Make its renderer our metal layer. - objc_msgSend(child, "setWantsLayer:", (byte)1); - objc_msgSend(child, "setLayer:", metalLayer); - objc_msgSend(metalLayer, "setContentsScale:", Program.DesktopScaleFactor); - - // Ensure the scale factor is up to date. 
- updateBounds = (Rect rect) => { - objc_msgSend(metalLayer, "setContentsScale:", Program.DesktopScaleFactor); - }; - - nsView = child; - return metalLayer; - } - - public static void DestroyMetalLayer(IntPtr nsView, IntPtr metalLayer) - { - // TODO - } - - [LibraryImport(LibObjCImport)] - private static unsafe partial IntPtr sel_registerName(byte* data); - - [LibraryImport(LibObjCImport)] - private static unsafe partial IntPtr objc_getClass(byte* data); - - [LibraryImport(LibObjCImport)] - private static partial void objc_msgSend(IntPtr receiver, Selector selector); - - [LibraryImport(LibObjCImport)] - private static partial void objc_msgSend(IntPtr receiver, Selector selector, byte value); - - [LibraryImport(LibObjCImport)] - private static partial void objc_msgSend(IntPtr receiver, Selector selector, IntPtr value); - - [LibraryImport(LibObjCImport)] - private static partial void objc_msgSend(IntPtr receiver, Selector selector, NSRect point); - - [LibraryImport(LibObjCImport)] - private static partial void objc_msgSend(IntPtr receiver, Selector selector, double value); - - [LibraryImport(LibObjCImport, EntryPoint = "objc_msgSend")] - private static partial IntPtr IntPtr_objc_msgSend(IntPtr receiver, Selector selector); - } -} \ No newline at end of file diff --git a/Ryujinx.Ava/UI/Renderer/EmbeddedWindow.cs b/Ryujinx.Ava/UI/Renderer/EmbeddedWindow.cs index 532f4dc27..a5c8b0031 100644 --- a/Ryujinx.Ava/UI/Renderer/EmbeddedWindow.cs +++ b/Ryujinx.Ava/UI/Renderer/EmbeddedWindow.cs @@ -2,9 +2,9 @@ using Avalonia; using Avalonia.Controls; using Avalonia.Input; using Avalonia.Platform; -using Ryujinx.Ava.UI.Helpers; using Ryujinx.Common.Configuration; using Ryujinx.Ui.Common.Configuration; +using Ryujinx.Ui.Common.Helper; using SPB.Graphics; using SPB.Platform; using SPB.Platform.GLX; @@ -30,6 +30,7 @@ namespace Ryujinx.Ava.UI.Renderer protected IntPtr NsView { get; set; } protected IntPtr MetalLayer { get; set; } + public delegate void 
UpdateBoundsCallbackDelegate(Rect rect); private UpdateBoundsCallbackDelegate _updateBoundsCallback; public event EventHandler WindowCreated; @@ -237,8 +238,29 @@ namespace Ryujinx.Ava.UI.Renderer [SupportedOSPlatform("macos")] IPlatformHandle CreateMacOS() { - MetalLayer = MetalHelper.GetMetalLayer(out IntPtr nsView, out _updateBoundsCallback); + // Create a new CAMetalLayer. + IntPtr layerClass = ObjectiveC.objc_getClass("CAMetalLayer"); + IntPtr metalLayer = ObjectiveC.IntPtr_objc_msgSend(layerClass, "alloc"); + ObjectiveC.objc_msgSend(metalLayer, "init"); + // Create a child NSView to render into. + IntPtr nsViewClass = ObjectiveC.objc_getClass("NSView"); + IntPtr child = ObjectiveC.IntPtr_objc_msgSend(nsViewClass, "alloc"); + ObjectiveC.objc_msgSend(child, "init", new ObjectiveC.NSRect(0, 0, 0, 0)); + + // Make its renderer our metal layer. + ObjectiveC.objc_msgSend(child, "setWantsLayer:", 1); + ObjectiveC.objc_msgSend(child, "setLayer:", metalLayer); + ObjectiveC.objc_msgSend(metalLayer, "setContentsScale:", Program.DesktopScaleFactor); + + // Ensure the scale factor is up to date. 
+ _updateBoundsCallback = rect => + { + ObjectiveC.objc_msgSend(metalLayer, "setContentsScale:", Program.DesktopScaleFactor); + }; + + IntPtr nsView = child; + MetalLayer = metalLayer; NsView = nsView; return new PlatformHandle(nsView, "NSView"); @@ -260,7 +282,7 @@ namespace Ryujinx.Ava.UI.Renderer [SupportedOSPlatform("macos")] void DestroyMacOS() { - MetalHelper.DestroyMetalLayer(NsView, MetalLayer); + // TODO } } } \ No newline at end of file diff --git a/Ryujinx.Ui.Common/Helper/ObjectiveC.cs b/Ryujinx.Ui.Common/Helper/ObjectiveC.cs new file mode 100644 index 000000000..234f7597a --- /dev/null +++ b/Ryujinx.Ui.Common/Helper/ObjectiveC.cs @@ -0,0 +1,97 @@ +using System; +using System.IO; +using System.Runtime.InteropServices; +using System.Runtime.Versioning; +using System.Text; + +namespace Ryujinx.Ui.Common.Helper +{ + [SupportedOSPlatform("macos")] + public static partial class ObjectiveC + { + private const string ObjCRuntime = "/usr/lib/libobjc.A.dylib"; + + [LibraryImport(ObjCRuntime, StringMarshalling = StringMarshalling.Utf8)] + private static unsafe partial IntPtr sel_getUid(string name); + + [LibraryImport(ObjCRuntime, StringMarshalling = StringMarshalling.Utf8)] + public static partial IntPtr objc_getClass(string name); + + [LibraryImport(ObjCRuntime)] + public static partial void objc_msgSend(IntPtr receiver, Selector selector); + + [LibraryImport(ObjCRuntime)] + public static partial void objc_msgSend(IntPtr receiver, Selector selector, byte value); + + [LibraryImport(ObjCRuntime)] + public static partial void objc_msgSend(IntPtr receiver, Selector selector, IntPtr value); + + [LibraryImport(ObjCRuntime)] + public static partial void objc_msgSend(IntPtr receiver, Selector selector, NSRect point); + + [LibraryImport(ObjCRuntime)] + public static partial void objc_msgSend(IntPtr receiver, Selector selector, double value); + + [LibraryImport(ObjCRuntime, EntryPoint = "objc_msgSend")] + public static partial IntPtr IntPtr_objc_msgSend(IntPtr receiver, 
Selector selector); + + [LibraryImport(ObjCRuntime, EntryPoint = "objc_msgSend")] + public static partial IntPtr IntPtr_objc_msgSend(IntPtr receiver, Selector selector, IntPtr param); + + [LibraryImport(ObjCRuntime, EntryPoint = "objc_msgSend", StringMarshalling = StringMarshalling.Utf8)] + public static partial IntPtr IntPtr_objc_msgSend(IntPtr receiver, Selector selector, string param); + + [LibraryImport(ObjCRuntime, EntryPoint = "objc_msgSend")] + [return: MarshalAs(UnmanagedType.Bool)] + public static partial bool bool_objc_msgSend(IntPtr receiver, Selector selector, IntPtr param); + + public struct Selector + { + public readonly IntPtr SelPtr; + + public unsafe Selector(string name) + { + SelPtr = sel_getUid(name); + } + + public static implicit operator Selector(string value) => new(value); + } + + public struct NSString + { + public readonly IntPtr StrPtr; + + public NSString(string aString) + { + IntPtr nsString = objc_getClass("NSString"); + StrPtr = IntPtr_objc_msgSend(nsString, "stringWithUTF8String:", aString); + } + + public static implicit operator IntPtr(NSString nsString) => nsString.StrPtr; + } + + public readonly struct NSPoint + { + public readonly double X; + public readonly double Y; + + public NSPoint(double x, double y) + { + X = x; + Y = y; + } + } + + public readonly struct NSRect + { + public readonly NSPoint Pos; + public readonly NSPoint Size; + + public NSRect(double x, double y, double width, double height) + { + Pos = new NSPoint(x, y); + Size = new NSPoint(width, height); + } + } + } +} \ No newline at end of file diff --git a/Ryujinx.Ui.Common/Helper/OpenHelper.cs b/Ryujinx.Ui.Common/Helper/OpenHelper.cs index 355348921..5b2e86635 100644 --- a/Ryujinx.Ui.Common/Helper/OpenHelper.cs +++ b/Ryujinx.Ui.Common/Helper/OpenHelper.cs @@ -55,7 +55,17 @@ namespace Ryujinx.Ui.Common.Helper } else if (OperatingSystem.IsMacOS()) { - Process.Start("open", $"-R \"{path}\""); + ObjectiveC.NSString nsStringPath = new(path); + IntPtr nsUrl = 
ObjectiveC.objc_getClass("NSURL"); + var urlPtr = ObjectiveC.IntPtr_objc_msgSend(nsUrl, "fileURLWithPath:", nsStringPath); + + IntPtr nsArray = ObjectiveC.objc_getClass("NSArray"); + IntPtr urlArray = ObjectiveC.IntPtr_objc_msgSend(nsArray, "arrayWithObject:", urlPtr); + + IntPtr nsWorkspace = ObjectiveC.objc_getClass("NSWorkspace"); + IntPtr sharedWorkspace = ObjectiveC.IntPtr_objc_msgSend(nsWorkspace, "sharedWorkspace"); + + ObjectiveC.objc_msgSend(sharedWorkspace, "activateFileViewerSelectingURLs:", urlArray); } else if (OperatingSystem.IsLinux()) { @@ -84,7 +94,14 @@ namespace Ryujinx.Ui.Common.Helper } else if (OperatingSystem.IsMacOS()) { - Process.Start("open", url); + ObjectiveC.NSString nsStringPath = new(url); + IntPtr nsUrl = ObjectiveC.objc_getClass("NSURL"); + var urlPtr = ObjectiveC.IntPtr_objc_msgSend(nsUrl, "URLWithString:", nsStringPath); + + IntPtr nsWorkspace = ObjectiveC.objc_getClass("NSWorkspace"); + IntPtr sharedWorkspace = ObjectiveC.IntPtr_objc_msgSend(nsWorkspace, "sharedWorkspace"); + + ObjectiveC.bool_objc_msgSend(sharedWorkspace, "openURL:", urlPtr); } else { From 5f38086f9494a4ffbcb4b0ce4b7727ad8ac18b3e Mon Sep 17 00:00:00 2001 From: gdkchan Date: Thu, 9 Feb 2023 00:48:25 -0300 Subject: [PATCH 03/41] Fix SPIR-V when all inputs/outputs are indexed (#4389) --- .../CodeGen/Spirv/Declarations.cs | 128 +++++++++--------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/Ryujinx.Graphics.Shader/CodeGen/Spirv/Declarations.cs b/Ryujinx.Graphics.Shader/CodeGen/Spirv/Declarations.cs index fab1667ce..5108d8619 100644 --- a/Ryujinx.Graphics.Shader/CodeGen/Spirv/Declarations.cs +++ b/Ryujinx.Graphics.Shader/CodeGen/Spirv/Declarations.cs @@ -397,6 +397,31 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv private static void DeclareInputAttributes(CodeGenContext context, StructuredProgramInfo info, bool perPatch) { bool iaIndexing = context.Config.UsedFeatures.HasFlag(FeatureFlags.IaIndexing); + + if (iaIndexing && !perPatch) + { + var 
attrType = context.TypeVector(context.TypeFP32(), (LiteralInteger)4); + attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), (LiteralInteger)MaxAttributes)); + + if (context.Config.Stage == ShaderStage.Geometry) + { + attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), (LiteralInteger)context.InputVertices)); + } + + var spvType = context.TypePointer(StorageClass.Input, attrType); + var spvVar = context.Variable(spvType, StorageClass.Input); + + if (context.Config.PassthroughAttributes != 0 && context.Config.GpuAccessor.QueryHostSupportsGeometryShaderPassthrough()) + { + context.Decorate(spvVar, Decoration.PassthroughNV); + } + + context.Decorate(spvVar, Decoration.Location, (LiteralInteger)0); + + context.AddGlobalVariable(spvVar); + context.InputsArray = spvVar; + } + var inputs = perPatch ? info.InputsPerPatch : info.Inputs; foreach (int attr in inputs) @@ -410,60 +435,56 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv if (iaIndexing && isUserAttr && !perPatch) { - if (context.InputsArray == null) - { - var attrType = context.TypeVector(context.TypeFP32(), (LiteralInteger)4); - attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), (LiteralInteger)MaxAttributes)); - - if (context.Config.Stage == ShaderStage.Geometry) - { - attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), (LiteralInteger)context.InputVertices)); - } - - var spvType = context.TypePointer(StorageClass.Input, attrType); - var spvVar = context.Variable(spvType, StorageClass.Input); - - if (context.Config.PassthroughAttributes != 0 && context.Config.GpuAccessor.QueryHostSupportsGeometryShaderPassthrough()) - { - context.Decorate(spvVar, Decoration.PassthroughNV); - } - - context.Decorate(spvVar, Decoration.Location, (LiteralInteger)0); - - context.AddGlobalVariable(spvVar); - context.InputsArray = spvVar; - } + continue; } - else + + PixelImap iq = PixelImap.Unused; + + if (context.Config.Stage == 
ShaderStage.Fragment) { - PixelImap iq = PixelImap.Unused; - - if (context.Config.Stage == ShaderStage.Fragment) + if (attr >= AttributeConsts.UserAttributeBase && attr < AttributeConsts.UserAttributeEnd) { - if (attr >= AttributeConsts.UserAttributeBase && attr < AttributeConsts.UserAttributeEnd) - { - iq = context.Config.ImapTypes[(attr - AttributeConsts.UserAttributeBase) / 16].GetFirstUsedType(); - } - else - { - AttributeInfo attrInfo = AttributeInfo.From(context.Config, attr, isOutAttr: false); - AggregateType elemType = attrInfo.Type & AggregateType.ElementTypeMask; + iq = context.Config.ImapTypes[(attr - AttributeConsts.UserAttributeBase) / 16].GetFirstUsedType(); + } + else + { + AttributeInfo attrInfo = AttributeInfo.From(context.Config, attr, isOutAttr: false); + AggregateType elemType = attrInfo.Type & AggregateType.ElementTypeMask; - if (attrInfo.IsBuiltin && (elemType == AggregateType.S32 || elemType == AggregateType.U32)) - { - iq = PixelImap.Constant; - } + if (attrInfo.IsBuiltin && (elemType == AggregateType.S32 || elemType == AggregateType.U32)) + { + iq = PixelImap.Constant; } } - - DeclareInputOrOutput(context, attr, perPatch, isOutAttr: false, iq); } + + DeclareInputOrOutput(context, attr, perPatch, isOutAttr: false, iq); } } private static void DeclareOutputAttributes(CodeGenContext context, StructuredProgramInfo info, bool perPatch) { bool oaIndexing = context.Config.UsedFeatures.HasFlag(FeatureFlags.OaIndexing); + + if (oaIndexing && !perPatch) + { + var attrType = context.TypeVector(context.TypeFP32(), (LiteralInteger)4); + attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), (LiteralInteger)MaxAttributes)); + + if (context.Config.Stage == ShaderStage.TessellationControl) + { + attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), context.Config.ThreadsPerInputPrimitive)); + } + + var spvType = context.TypePointer(StorageClass.Output, attrType); + var spvVar = context.Variable(spvType, 
StorageClass.Output); + + context.Decorate(spvVar, Decoration.Location, (LiteralInteger)0); + + context.AddGlobalVariable(spvVar); + context.OutputsArray = spvVar; + } + var outputs = perPatch ? info.OutputsPerPatch : info.Outputs; foreach (int attr in outputs) @@ -477,29 +498,10 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv if (oaIndexing && isUserAttr && !perPatch) { - if (context.OutputsArray == null) - { - var attrType = context.TypeVector(context.TypeFP32(), (LiteralInteger)4); - attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), (LiteralInteger)MaxAttributes)); - - if (context.Config.Stage == ShaderStage.TessellationControl) - { - attrType = context.TypeArray(attrType, context.Constant(context.TypeU32(), context.Config.ThreadsPerInputPrimitive)); - } - - var spvType = context.TypePointer(StorageClass.Output, attrType); - var spvVar = context.Variable(spvType, StorageClass.Output); - - context.Decorate(spvVar, Decoration.Location, (LiteralInteger)0); - - context.AddGlobalVariable(spvVar); - context.OutputsArray = spvVar; - } - } - else - { - DeclareOutputAttribute(context, attr, perPatch); + continue; } + + DeclareOutputAttribute(context, attr, perPatch); } if (context.Config.Stage == ShaderStage.Vertex) From 61b1ce252f11e8f8e31080faee60d0a9d99cb67f Mon Sep 17 00:00:00 2001 From: gdkchan Date: Fri, 10 Feb 2023 11:47:59 -0300 Subject: [PATCH 04/41] Allow partially mapped textures with unmapped start (#4394) --- Ryujinx.Graphics.Gpu/Image/TextureCache.cs | 7 +++++ Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs | 32 ++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCache.cs b/Ryujinx.Graphics.Gpu/Image/TextureCache.cs index 27bec786f..f18de6075 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureCache.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureCache.cs @@ -474,6 +474,13 @@ namespace Ryujinx.Graphics.Gpu.Image { address = memoryManager.Translate(info.GpuAddress); + // If the start address is 
unmapped, let's try to find a page of memory that is mapped. + if (address == MemoryManager.PteUnmapped) + { + address = memoryManager.TranslateFirstMapped(info.GpuAddress, (ulong)info.CalculateSizeInfo(layerSize).TotalSize); + } + + // If address is still invalid, the texture is fully unmapped, so it has no data, just return null. if (address == MemoryManager.PteUnmapped) { return null; diff --git a/Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs b/Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs index 0ac6160d9..b0f7e7992 100644 --- a/Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs +++ b/Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs @@ -583,6 +583,38 @@ namespace Ryujinx.Graphics.Gpu.Memory return UnpackPaFromPte(pte) + (va & PageMask); } + /// + /// Translates a GPU virtual address to a CPU virtual address on the first mapped page of memory + /// on the specified region. + /// If no page is mapped on the specified region, is returned. + /// + /// GPU virtual address to be translated + /// Size of the range to be translated + /// CPU virtual address, or if unmapped + public ulong TranslateFirstMapped(ulong va, ulong size) + { + if (!ValidateAddress(va)) + { + return PteUnmapped; + } + + ulong endVa = va + size; + + ulong pte = GetPte(va); + + for (; va < endVa && pte == PteUnmapped; va += PageSize - (va & PageMask)) + { + pte = GetPte(va); + } + + if (pte == PteUnmapped) + { + return PteUnmapped; + } + + return UnpackPaFromPte(pte) + (va & PageMask); + } + /// /// Gets the kind of a given memory page. /// This might indicate the type of resource that can be allocated on the page, and also texture tiling. From 1dcd44b94fcba10e9f78f8352557d46ea84b80ab Mon Sep 17 00:00:00 2001 From: Logan Stromberg Date: Fri, 10 Feb 2023 07:37:20 -0800 Subject: [PATCH 05/41] Treat NpadIdType < 0 as invalid. Filter invalid SupportedPlayers inside IHidServer.SetSupportedNpadIdType(). 
(#4377) Co-authored-by: Logan Stromberg --- Ryujinx.HLE/HOS/Services/Hid/HidServer/HidUtils.cs | 4 +++- Ryujinx.HLE/HOS/Services/Hid/IHidServer.cs | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Ryujinx.HLE/HOS/Services/Hid/HidServer/HidUtils.cs b/Ryujinx.HLE/HOS/Services/Hid/HidServer/HidUtils.cs index 65a69bb77..b98f60658 100644 --- a/Ryujinx.HLE/HOS/Services/Hid/HidServer/HidUtils.cs +++ b/Ryujinx.HLE/HOS/Services/Hid/HidServer/HidUtils.cs @@ -38,7 +38,9 @@ namespace Ryujinx.HLE.HOS.Services.Hid.HidServer public static bool IsValidNpadIdType(NpadIdType npadIdType) { - return npadIdType <= NpadIdType.Player8 || npadIdType == NpadIdType.Handheld || npadIdType == NpadIdType.Unknown; + return (npadIdType >= NpadIdType.Player1 && npadIdType <= NpadIdType.Player8) || + npadIdType == NpadIdType.Handheld || + npadIdType == NpadIdType.Unknown; } } } \ No newline at end of file diff --git a/Ryujinx.HLE/HOS/Services/Hid/IHidServer.cs b/Ryujinx.HLE/HOS/Services/Hid/IHidServer.cs index d347a3bde..266fc04fb 100644 --- a/Ryujinx.HLE/HOS/Services/Hid/IHidServer.cs +++ b/Ryujinx.HLE/HOS/Services/Hid/IHidServer.cs @@ -722,7 +722,7 @@ namespace Ryujinx.HLE.HOS.Services.Hid for (int i = 0; i < supportedPlayerIds.Length; ++i) { - if (supportedPlayerIds[i] >= 0) + if (HidUtils.IsValidNpadIdType(supportedPlayerIds[i])) { context.Device.Hid.Npads.SetSupportedPlayer(HidUtils.GetIndexFromNpadIdType(supportedPlayerIds[i])); } @@ -1101,7 +1101,7 @@ namespace Ryujinx.HLE.HOS.Services.Hid if (deviceType < NpadStyleIndex.System || deviceType >= NpadStyleIndex.FullKey) { - if (npadIdType >= (NpadIdType.Player8 + 1) && npadIdType != NpadIdType.Handheld && npadIdType != NpadIdType.Unknown) + if (!HidUtils.IsValidNpadIdType(npadIdType)) { return ResultCode.InvalidNpadIdType; } From e4f68592c3a6e51414e5f78eef096f21bf735eb1 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Sun, 12 Feb 2023 09:30:26 +0000 Subject: [PATCH 06/41] Fix partial updates for textures. 
(#4401) I was forcing some types of texture to partially update when investigating performance with games that stream in data, and noticed that partially loading texture data was really broken on both backends. Fixes Vulkan texture set by getting the correct expected size for the texture. Fixes partial upload on both backends for both Texture 2D Array and Cubemap using the wrong offset and uploading to the first layer/level for a handle. 3D might also be affected. This might fix textures randomly having incorrect data in games that render to it - jumbled in the case of OpenGL, and outdated/black in the case of Vulkan. This case typically happens in UE4 games. --- Ryujinx.Graphics.Gpu/Image/TextureGroup.cs | 9 ++++----- Ryujinx.Graphics.Vulkan/TextureView.cs | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs b/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs index 942fa2f87..1040b394a 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs @@ -336,24 +336,23 @@ namespace Ryujinx.Graphics.Gpu.Image if (_loadNeeded[baseHandle + i]) { var info = GetHandleInformation(baseHandle + i); - int offsetIndex = info.Index; // Only one of these will be greater than 1, as partial sync is only called when there are sub-image views. 
for (int layer = 0; layer < info.Layers; layer++) { for (int level = 0; level < info.Levels; level++) { + int offsetIndex = GetOffsetIndex(info.BaseLayer + layer, info.BaseLevel + level); + int offset = _allOffsets[offsetIndex]; int endOffset = Math.Min(offset + _sliceSizes[info.BaseLevel + level], (int)Storage.Size); int size = endOffset - offset; ReadOnlySpan data = _physicalMemory.GetSpan(Storage.Range.GetSlice((ulong)offset, (ulong)size)); - SpanOrArray result = Storage.ConvertToHostCompatibleFormat(data, info.BaseLevel, true); + SpanOrArray result = Storage.ConvertToHostCompatibleFormat(data, info.BaseLevel + level, true); - Storage.SetData(result, info.BaseLayer, info.BaseLevel); - - offsetIndex++; + Storage.SetData(result, info.BaseLayer + layer, info.BaseLevel + level); } } } diff --git a/Ryujinx.Graphics.Vulkan/TextureView.cs b/Ryujinx.Graphics.Vulkan/TextureView.cs index d60ce39b0..aa050c015 100644 --- a/Ryujinx.Graphics.Vulkan/TextureView.cs +++ b/Ryujinx.Graphics.Vulkan/TextureView.cs @@ -712,7 +712,7 @@ namespace Ryujinx.Graphics.Vulkan for (int level = 0; level < levels; level++) { - int mipSize = GetBufferDataLength(Info.GetMipSize(dstLevel + level)); + int mipSize = GetBufferDataLength(Info.GetMipSize2D(dstLevel + level) * dstLayers); int endOffset = offset + mipSize; From 052b23c83c3a58afdd63c3d5a7655be482f1c739 Mon Sep 17 00:00:00 2001 From: Mary Date: Mon, 13 Feb 2023 21:32:20 +0100 Subject: [PATCH 07/41] vulkan: Do not call vkCmdSetViewport when viewportCount is 0 (#4406) This fix validation error "VUID-vkCmdSetViewport-viewportCount-arraylength". 
--- Ryujinx.Graphics.Vulkan/PipelineBase.cs | 8 +++----- Ryujinx.Graphics.Vulkan/PipelineDynamicState.cs | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Ryujinx.Graphics.Vulkan/PipelineBase.cs b/Ryujinx.Graphics.Vulkan/PipelineBase.cs index 43dccf86e..02b1c3896 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineBase.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineBase.cs @@ -650,9 +650,7 @@ namespace Ryujinx.Graphics.Vulkan _newState.DepthWriteEnable = oldDepthWriteEnable; _newState.Topology = oldTopology; - DynamicState.Viewports = oldViewports; - DynamicState.ViewportsCount = (int)oldViewportsCount; - DynamicState.SetViewportsDirty(); + DynamicState.SetViewports(ref oldViewports, oldViewportsCount); _newState.ViewportsCount = oldViewportsCount; SignalStateChange(); @@ -1183,6 +1181,8 @@ namespace Ryujinx.Graphics.Vulkan return Math.Clamp(value, 0f, 1f); } + DynamicState.ViewportsCount = (uint)count; + for (int i = 0; i < count; i++) { var viewport = viewports[i]; @@ -1196,8 +1196,6 @@ namespace Ryujinx.Graphics.Vulkan Clamp(viewport.DepthFar))); } - DynamicState.ViewportsCount = count; - float disableTransformF = disableTransform ? 
1.0f : 0.0f; if (SupportBufferUpdater.Data.ViewportInverse.W != disableTransformF || disableTransform) { diff --git a/Ryujinx.Graphics.Vulkan/PipelineDynamicState.cs b/Ryujinx.Graphics.Vulkan/PipelineDynamicState.cs index b4d6e95c7..42ea022a4 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineDynamicState.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineDynamicState.cs @@ -21,7 +21,7 @@ namespace Ryujinx.Graphics.Vulkan private Array4 _blendConstants; - public int ViewportsCount; + public uint ViewportsCount; public Array16 Viewports; private enum DirtyFlags @@ -88,9 +88,15 @@ namespace Ryujinx.Graphics.Vulkan _dirty |= DirtyFlags.Viewport; } - public void SetViewportsDirty() + public void SetViewports(ref Array16 viewports, uint viewportsCount) { - _dirty |= DirtyFlags.Viewport; + Viewports = viewports; + ViewportsCount = viewportsCount; + + if (ViewportsCount != 0) + { + _dirty |= DirtyFlags.Viewport; + } } public void ForceAllDirty() @@ -155,7 +161,10 @@ namespace Ryujinx.Graphics.Vulkan private void RecordViewport(Vk api, CommandBuffer commandBuffer) { - api.CmdSetViewport(commandBuffer, 0, (uint)ViewportsCount, Viewports.AsSpan()); + if (ViewportsCount != 0) + { + api.CmdSetViewport(commandBuffer, 0, ViewportsCount, Viewports.AsSpan()); + } } } } From fe9c49949a1329bc964ab10ff2a97abd5507ef6a Mon Sep 17 00:00:00 2001 From: Mary Date: Mon, 13 Feb 2023 23:04:55 +0100 Subject: [PATCH 08/41] vulkan: Enforce Vulkan 1.2+ at instance API level and 1.1+ at device level (#4408) * vulkan: Enforce Vulkan 1.2+ at instance API level and 1.1+ at device level This ensure we don't end up trying to initialize with anything currently incompatible. 
* Address riperiperi's comment --- .../VulkanInitialization.cs | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs index ab5a0acfb..b8b48f6ca 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs @@ -14,6 +14,9 @@ namespace Ryujinx.Graphics.Vulkan public unsafe static class VulkanInitialization { private const uint InvalidIndex = uint.MaxValue; + private static uint MinimalVulkanVersion = Vk.Version11.Value; + private static uint MinimalInstanceVulkanVersion = Vk.Version12.Value; + private static uint MaximumVulkanVersion = Vk.Version12.Value; private const string AppName = "Ryujinx.Graphics.Vulkan"; private const int QueuesCount = 2; @@ -99,7 +102,7 @@ namespace Ryujinx.Graphics.Vulkan ApplicationVersion = 1, PEngineName = (byte*)appName, EngineVersion = 1, - ApiVersion = Vk.Version12.Value + ApiVersion = MaximumVulkanVersion }; IntPtr* ppEnabledExtensions = stackalloc IntPtr[enabledExtensions.Length]; @@ -224,7 +227,7 @@ namespace Ryujinx.Graphics.Vulkan ApplicationVersion = 1, PEngineName = (byte*)appName, EngineVersion = 1, - ApiVersion = Vk.Version12.Value + ApiVersion = MaximumVulkanVersion }; var instanceCreateInfo = new InstanceCreateInfo @@ -239,6 +242,27 @@ namespace Ryujinx.Graphics.Vulkan api.CreateInstance(in instanceCreateInfo, null, out var instance).ThrowOnError(); + // We ensure that vkEnumerateInstanceVersion is present (added in 1.1). + // If the instance doesn't support it, no device is going to be 1.1 compatible. + if (api.GetInstanceProcAddr(instance, "vkEnumerateInstanceVersion") == IntPtr.Zero) + { + api.DestroyInstance(instance, null); + + return Array.Empty(); + } + + // We currently assume that the instance is compatible with Vulkan 1.2 + // TODO: Remove this once we relax our initialization codepaths. 
+ uint instanceApiVerison = 0; + api.EnumerateInstanceVersion(ref instanceApiVerison).ThrowOnError(); + + if (instanceApiVerison < MinimalInstanceVulkanVersion) + { + api.DestroyInstance(instance, null); + + return Array.Empty(); + } + Marshal.FreeHGlobal(appName); uint physicalDeviceCount; @@ -259,6 +283,11 @@ namespace Ryujinx.Graphics.Vulkan var physicalDevice = physicalDevices[i]; api.GetPhysicalDeviceProperties(physicalDevice, out var properties); + if (properties.ApiVersion < MinimalVulkanVersion) + { + continue; + } + devices[i] = new DeviceInfo( StringFromIdPair(properties.VendorID, properties.DeviceID), VendorUtils.GetNameFromId(properties.VendorID), From ed7a0474c6b126d885e6689abc46264100ec8de0 Mon Sep 17 00:00:00 2001 From: Ac_K Date: Tue, 14 Feb 2023 15:58:57 +0100 Subject: [PATCH 09/41] Infra: Issues template cleanup (#4421) * Infra: Issues template cleanup * applied --- .github/ISSUE_TEMPLATE/bug_report.yml | 34 +++++++------------ .github/ISSUE_TEMPLATE/feature_request.yml | 6 ++-- .../missing_cpu_instruction.yml | 2 +- .../ISSUE_TEMPLATE/missing_service_call.yml | 2 +- 4 files changed, 17 insertions(+), 27 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index f65f32540..68be1f5e0 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -1,26 +1,27 @@ name: Bug Report description: File a bug report -title: "[Bug] " +title: "[Bug]" labels: bug body: - type: textarea id: issue attributes: - label: Description of Issue + label: Description of the issue description: What's the issue you encountered? validations: required: true - type: textarea id: repro attributes: - label: Reproduction Steps + label: Reproduction steps description: How can the issue be reproduced? 
+ placeholder: Describe each step as precisely as possible validations: required: true - type: textarea id: log attributes: - label: Log File + label: Log file description: A log file will help our developers to better diagnose and fix the issue. placeholder: Logs files can be found under "Logs" folder in Ryujinx program folder. You can drag and drop the log on to the text area validations: @@ -29,55 +30,44 @@ body: id: os attributes: label: OS - placeholder: "Example: Windows 10" + placeholder: "e.g. Windows 10" validations: required: true - type: input id: ryujinx-version attributes: label: Ryujinx version - placeholder: | - - *(e.g. 1.0.470)* + placeholder: "e.g. 1.0.470" validations: required: true - type: input id: game-version attributes: label: Game version - placeholder: | - - *(e.g. 1.1.1)* + placeholder: "e.g. 1.1.1" validations: required: false - type: input id: cpu attributes: label: CPU - placeholder: | - - *(e.g. i7-6700)* + placeholder: "e.g. i7-6700" validations: required: false - type: input id: gpu attributes: label: GPU - placeholder: | - - *(e.g. NVIDIA RTX 2070)* + placeholder: "e.g. NVIDIA RTX 2070" validations: required: false - type: input id: ram attributes: label: RAM - placeholder: | - - *(e.g. 16GB)* + placeholder: "e.g. 16GB" validations: required: false - - type: checkboxes - attributes: - label: Applied Mods? - options: - - label: "Yes" - required: false - type: textarea id: mods attributes: @@ -93,4 +83,4 @@ body: - Additional info about your environment: - Any other information relevant to your issue. validations: - required: false + required: false \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index a9a5b504a..383bbb151 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,6 +1,6 @@ name: Feature Request description: Suggest a new feature for Ryujinx. 
-title: "[Feature Request] <title>" +title: "[Feature Request]" body: - type: textarea id: overview @@ -12,14 +12,14 @@ body: - type: textarea id: details attributes: - label: Smaller Details + label: Smaller details description: These may include specific methods of implementation etc. validations: required: true - type: textarea id: request attributes: - label: Nature of Request + label: Nature of request validations: required: true - type: textarea diff --git a/.github/ISSUE_TEMPLATE/missing_cpu_instruction.yml b/.github/ISSUE_TEMPLATE/missing_cpu_instruction.yml index 10e3bad37..d815ddfd9 100644 --- a/.github/ISSUE_TEMPLATE/missing_cpu_instruction.yml +++ b/.github/ISSUE_TEMPLATE/missing_cpu_instruction.yml @@ -1,6 +1,6 @@ name: Missing CPU Instruction description: CPU Instruction is missing in Ryujinx. -title: "[CPU] <title>" +title: "[CPU]" labels: [cpu, not-implemented] body: - type: textarea diff --git a/.github/ISSUE_TEMPLATE/missing_service_call.yml b/.github/ISSUE_TEMPLATE/missing_service_call.yml index 48d46d57f..80aae533b 100644 --- a/.github/ISSUE_TEMPLATE/missing_service_call.yml +++ b/.github/ISSUE_TEMPLATE/missing_service_call.yml @@ -5,7 +5,7 @@ body: - type: textarea id: instruction attributes: - label: Service Call + label: Service call description: What service call is missing? validations: required: true From 32450d45de7889318e0f289fc52b3fffc62edf60 Mon Sep 17 00:00:00 2001 From: Mary <mary@mary.zone> Date: Wed, 15 Feb 2023 07:50:26 +0100 Subject: [PATCH 10/41] vulkan: Clean up MemoryAllocator (#4418) This started as an attempt to remove vkGetPhysicalDeviceMemoryProperties in FindSuitableMemoryTypeIndex (As this could have some overhead and shouldn't change at runtime) and turned in a little bigger cleanup. 
--- Ryujinx.Graphics.Vulkan/BufferManager.cs | 6 ++--- Ryujinx.Graphics.Vulkan/MemoryAllocator.cs | 26 ++++++++++------------ Ryujinx.Graphics.Vulkan/TextureStorage.cs | 5 ++--- Ryujinx.Graphics.Vulkan/VulkanRenderer.cs | 6 ++--- 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/Ryujinx.Graphics.Vulkan/BufferManager.cs b/Ryujinx.Graphics.Vulkan/BufferManager.cs index 9c50e6ff3..49fdd75d6 100644 --- a/Ryujinx.Graphics.Vulkan/BufferManager.cs +++ b/Ryujinx.Graphics.Vulkan/BufferManager.cs @@ -39,7 +39,6 @@ namespace Ryujinx.Graphics.Vulkan BufferUsageFlags.VertexBufferBit | BufferUsageFlags.TransformFeedbackBufferBitExt; - private readonly PhysicalDevice _physicalDevice; private readonly Device _device; private readonly IdList<BufferHolder> _buffers; @@ -48,9 +47,8 @@ namespace Ryujinx.Graphics.Vulkan public StagingBuffer StagingBuffer { get; } - public BufferManager(VulkanRenderer gd, PhysicalDevice physicalDevice, Device device) + public BufferManager(VulkanRenderer gd, Device device) { - _physicalDevice = physicalDevice; _device = device; _buffers = new IdList<BufferHolder>(); StagingBuffer = new StagingBuffer(gd, this); @@ -114,7 +112,7 @@ namespace Ryujinx.Graphics.Vulkan allocateFlagsAlt = DefaultBufferMemoryAltFlags; } - var allocation = gd.MemoryAllocator.AllocateDeviceMemory(_physicalDevice, requirements, allocateFlags, allocateFlagsAlt); + var allocation = gd.MemoryAllocator.AllocateDeviceMemory(requirements, allocateFlags, allocateFlagsAlt); if (allocation.Memory.Handle == 0UL) { diff --git a/Ryujinx.Graphics.Vulkan/MemoryAllocator.cs b/Ryujinx.Graphics.Vulkan/MemoryAllocator.cs index 83c0a3243..e4dcd916e 100644 --- a/Ryujinx.Graphics.Vulkan/MemoryAllocator.cs +++ b/Ryujinx.Graphics.Vulkan/MemoryAllocator.cs @@ -9,34 +9,36 @@ namespace Ryujinx.Graphics.Vulkan private ulong MaxDeviceMemoryUsageEstimate = 16UL * 1024 * 1024 * 1024; private readonly Vk _api; + private readonly PhysicalDevice _physicalDevice; private readonly Device _device; 
private readonly List<MemoryAllocatorBlockList> _blockLists; + private readonly int _blockAlignment; + private readonly PhysicalDeviceMemoryProperties _physicalDeviceMemoryProperties; - private int _blockAlignment; - - public MemoryAllocator(Vk api, Device device, uint maxMemoryAllocationCount) + public MemoryAllocator(Vk api, PhysicalDevice physicalDevice, Device device, uint maxMemoryAllocationCount) { _api = api; + _physicalDevice = physicalDevice; _device = device; _blockLists = new List<MemoryAllocatorBlockList>(); _blockAlignment = (int)Math.Min(int.MaxValue, MaxDeviceMemoryUsageEstimate / (ulong)maxMemoryAllocationCount); + + _api.GetPhysicalDeviceMemoryProperties(_physicalDevice, out _physicalDeviceMemoryProperties); } public MemoryAllocation AllocateDeviceMemory( - PhysicalDevice physicalDevice, MemoryRequirements requirements, MemoryPropertyFlags flags = 0) { - return AllocateDeviceMemory(physicalDevice, requirements, flags, flags); + return AllocateDeviceMemory(requirements, flags, flags); } public MemoryAllocation AllocateDeviceMemory( - PhysicalDevice physicalDevice, MemoryRequirements requirements, MemoryPropertyFlags flags, MemoryPropertyFlags alternativeFlags) { - int memoryTypeIndex = FindSuitableMemoryTypeIndex(_api, physicalDevice, requirements.MemoryTypeBits, flags, alternativeFlags); + int memoryTypeIndex = FindSuitableMemoryTypeIndex(requirements.MemoryTypeBits, flags, alternativeFlags); if (memoryTypeIndex < 0) { return default; @@ -65,20 +67,16 @@ namespace Ryujinx.Graphics.Vulkan return newBl.Allocate(size, alignment, map); } - private static int FindSuitableMemoryTypeIndex( - Vk api, - PhysicalDevice physicalDevice, + private int FindSuitableMemoryTypeIndex( uint memoryTypeBits, MemoryPropertyFlags flags, MemoryPropertyFlags alternativeFlags) { int bestCandidateIndex = -1; - api.GetPhysicalDeviceMemoryProperties(physicalDevice, out var properties); - - for (int i = 0; i < properties.MemoryTypeCount; i++) + for (int i = 0; i < 
_physicalDeviceMemoryProperties.MemoryTypeCount; i++) { - var type = properties.MemoryTypes[i]; + var type = _physicalDeviceMemoryProperties.MemoryTypes[i]; if ((memoryTypeBits & (1 << i)) != 0) { diff --git a/Ryujinx.Graphics.Vulkan/TextureStorage.cs b/Ryujinx.Graphics.Vulkan/TextureStorage.cs index 92209997d..03a47a091 100644 --- a/Ryujinx.Graphics.Vulkan/TextureStorage.cs +++ b/Ryujinx.Graphics.Vulkan/TextureStorage.cs @@ -55,7 +55,6 @@ namespace Ryujinx.Graphics.Vulkan public unsafe TextureStorage( VulkanRenderer gd, - PhysicalDevice physicalDevice, Device device, TextureCreateInfo info, float scaleFactor, @@ -118,7 +117,7 @@ namespace Ryujinx.Graphics.Vulkan if (foreignAllocation == null) { gd.Api.GetImageMemoryRequirements(device, _image, out var requirements); - var allocation = gd.MemoryAllocator.AllocateDeviceMemory(physicalDevice, requirements, DefaultImageMemoryFlags); + var allocation = gd.MemoryAllocator.AllocateDeviceMemory(requirements, DefaultImageMemoryFlags); if (allocation.Memory.Handle == 0UL) { @@ -173,7 +172,7 @@ namespace Ryujinx.Graphics.Vulkan var info = NewCreateInfoWith(ref _info, format, _info.BytesPerPixel); - storage = new TextureStorage(_gd, default, _device, info, ScaleFactor, _allocationAuto); + storage = new TextureStorage(_gd, _device, info, ScaleFactor, _allocationAuto); _aliasedStorages.Add(format, storage); } diff --git a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs index 92dec7a1a..595e033cb 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs @@ -280,7 +280,7 @@ namespace Ryujinx.Graphics.Vulkan supportedSampleCounts, portabilityFlags); - MemoryAllocator = new MemoryAllocator(Api, _device, properties.Limits.MaxMemoryAllocationCount); + MemoryAllocator = new MemoryAllocator(Api, _physicalDevice, _device, properties.Limits.MaxMemoryAllocationCount); CommandBufferPool = VulkanInitialization.CreateCommandBufferPool(Api, _device, Queue, 
QueueLock, queueFamilyIndex); @@ -290,7 +290,7 @@ namespace Ryujinx.Graphics.Vulkan BackgroundResources = new BackgroundResources(this, _device); - BufferManager = new BufferManager(this, _physicalDevice, _device); + BufferManager = new BufferManager(this, _device); _syncManager = new SyncManager(this, _device); _pipeline = new PipelineFull(this, _device); @@ -388,7 +388,7 @@ namespace Ryujinx.Graphics.Vulkan internal TextureStorage CreateTextureStorage(TextureCreateInfo info, float scale) { - return new TextureStorage(this, _physicalDevice, _device, info, scale); + return new TextureStorage(this, _device, info, scale); } public void DeleteBuffer(BufferHandle buffer) From 17078ad929f9942d2b03ede00b30867aeab924de Mon Sep 17 00:00:00 2001 From: Mary <mary@mary.zone> Date: Wed, 15 Feb 2023 09:41:48 +0100 Subject: [PATCH 11/41] vulkan: Respect VK_KHR_portability_subset vertex stride alignment (#4419) * vulkan: Respect VK_KHR_portability_subset vertex stride alignment We were hardcoding alignment to 4, but by specs it can be any values that is a power of 2. This also enable VK_KHR_portability_subset if present as per specs requirements. 
* address gdkchan's comment * Make NeedsVertexBufferAlignment internal --- Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs | 14 ++++++++------ Ryujinx.Graphics.Vulkan/PipelineBase.cs | 5 +++-- Ryujinx.Graphics.Vulkan/PipelineConverter.cs | 5 +++-- Ryujinx.Graphics.Vulkan/VulkanInitialization.cs | 3 ++- Ryujinx.Graphics.Vulkan/VulkanRenderer.cs | 13 ++++++++----- 5 files changed, 24 insertions(+), 16 deletions(-) diff --git a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs index 82fcaea10..1ed2b0ccc 100644 --- a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs +++ b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs @@ -8,11 +8,10 @@ namespace Ryujinx.Graphics.Vulkan { None = 0, - VertexBufferAlignment4B = 1, - NoTriangleFans = 1 << 1, - NoPointMode = 1 << 2, - No3DImageView = 1 << 3, - NoLodBias = 1 << 4 + NoTriangleFans = 1, + NoPointMode = 1 << 1, + No3DImageView = 1 << 2, + NoLodBias = 1 << 3 } readonly struct HardwareCapabilities @@ -40,6 +39,7 @@ namespace Ryujinx.Graphics.Vulkan public readonly ShaderStageFlags RequiredSubgroupSizeStages; public readonly SampleCountFlags SupportedSampleCounts; public readonly PortabilitySubsetFlags PortabilitySubset; + public readonly uint VertexBufferAlignment; public HardwareCapabilities( bool supportsIndexTypeUint8, @@ -64,7 +64,8 @@ namespace Ryujinx.Graphics.Vulkan uint maxSubgroupSize, ShaderStageFlags requiredSubgroupSizeStages, SampleCountFlags supportedSampleCounts, - PortabilitySubsetFlags portabilitySubset) + PortabilitySubsetFlags portabilitySubset, + uint vertexBufferAlignment) { SupportsIndexTypeUint8 = supportsIndexTypeUint8; SupportsCustomBorderColor = supportsCustomBorderColor; @@ -89,6 +90,7 @@ namespace Ryujinx.Graphics.Vulkan RequiredSubgroupSizeStages = requiredSubgroupSizeStages; SupportedSampleCounts = supportedSampleCounts; PortabilitySubset = portabilitySubset; + VertexBufferAlignment = vertexBufferAlignment; } } } diff --git 
a/Ryujinx.Graphics.Vulkan/PipelineBase.cs b/Ryujinx.Graphics.Vulkan/PipelineBase.cs index 02b1c3896..8ed39ee26 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineBase.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineBase.cs @@ -1,4 +1,5 @@ -using Ryujinx.Graphics.GAL; +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; using Ryujinx.Graphics.Shader; using Silk.NET.Vulkan; using System; @@ -1136,7 +1137,7 @@ namespace Ryujinx.Graphics.Vulkan buffer.Dispose(); - if (!Gd.Capabilities.PortabilitySubset.HasFlag(PortabilitySubsetFlags.VertexBufferAlignment4B) && + if (Gd.Capabilities.VertexBufferAlignment < 2 && (vertexBuffer.Stride % FormatExtensions.MaxBufferFormatScalarSize) == 0) { buffer = new VertexBufferState( diff --git a/Ryujinx.Graphics.Vulkan/PipelineConverter.cs b/Ryujinx.Graphics.Vulkan/PipelineConverter.cs index 5c9193fa6..da480d9f5 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineConverter.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineConverter.cs @@ -1,4 +1,5 @@ -using Ryujinx.Graphics.GAL; +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; using Silk.NET.Vulkan; using System; @@ -253,7 +254,7 @@ namespace Ryujinx.Graphics.Vulkan if (gd.NeedsVertexBufferAlignment(vbScalarSizes[i], out int alignment)) { - alignedStride = (vertexBuffer.Stride + (alignment - 1)) & -alignment; + alignedStride = BitUtils.AlignUp(vertexBuffer.Stride, alignment); } // TODO: Support divisor > 1 diff --git a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs index b8b48f6ca..4401f032d 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs @@ -36,7 +36,8 @@ namespace Ryujinx.Graphics.Vulkan "VK_KHR_shader_float16_int8", "VK_EXT_shader_subgroup_ballot", "VK_EXT_subgroup_size_control", - "VK_NV_geometry_shader_passthrough" + "VK_NV_geometry_shader_passthrough", + "VK_KHR_portability_subset", // By spec, we should enable this if present. 
}; public static string[] RequiredExtensions { get; } = new string[] diff --git a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs index 595e033cb..a7b4b41a7 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs @@ -234,10 +234,12 @@ namespace Ryujinx.Graphics.Vulkan Api.GetPhysicalDeviceFeatures2(_physicalDevice, &features2); var portabilityFlags = PortabilitySubsetFlags.None; + uint vertexBufferAlignment = 1; if (usePortability) { - portabilityFlags |= propertiesPortabilitySubset.MinVertexInputBindingStrideAlignment > 1 ? PortabilitySubsetFlags.VertexBufferAlignment4B : 0; + vertexBufferAlignment = propertiesPortabilitySubset.MinVertexInputBindingStrideAlignment; + portabilityFlags |= featuresPortabilitySubset.TriangleFans ? 0 : PortabilitySubsetFlags.NoTriangleFans; portabilityFlags |= featuresPortabilitySubset.PointPolygons ? 0 : PortabilitySubsetFlags.NoPointMode; portabilityFlags |= featuresPortabilitySubset.ImageView2DOn3DImage ? 
0 : PortabilitySubsetFlags.No3DImageView; @@ -278,7 +280,8 @@ namespace Ryujinx.Graphics.Vulkan propertiesSubgroupSizeControl.MaxSubgroupSize, propertiesSubgroupSizeControl.RequiredSubgroupSizeStages, supportedSampleCounts, - portabilityFlags); + portabilityFlags, + vertexBufferAlignment); MemoryAllocator = new MemoryAllocator(Api, _physicalDevice, _device, properties.Limits.MaxMemoryAllocationCount); @@ -636,11 +639,11 @@ namespace Ryujinx.Graphics.Vulkan PrintGpuInformation(); } - public bool NeedsVertexBufferAlignment(int attrScalarAlignment, out int alignment) + internal bool NeedsVertexBufferAlignment(int attrScalarAlignment, out int alignment) { - if (Capabilities.PortabilitySubset.HasFlag(PortabilitySubsetFlags.VertexBufferAlignment4B)) + if (Capabilities.VertexBufferAlignment > 1) { - alignment = 4; + alignment = (int)Capabilities.VertexBufferAlignment; return true; } From a5a9b9bc8b64184cd4c342dea39fed5c2c058a72 Mon Sep 17 00:00:00 2001 From: TSRBerry <20988865+TSRBerry@users.noreply.github.com> Date: Wed, 15 Feb 2023 23:36:35 +0100 Subject: [PATCH 12/41] GUI: Small Updater refactor & Set correct permissions on Linux when extracting files (#4315) * ava: Refactor Updater.cs Fix typos Remove unused usings Rename variables to follow naming scheme * ava: Set file permissions when extracting update files * gtk: Apply the same refactor to Updater.cs * updater: Replace assert with if statement * updater: Remove await usings again --- Ryujinx.Ava/Modules/Updater/Updater.cs | 71 ++--- Ryujinx/Modules/Updater/Updater.cs | 372 ++++++++++++------------- 2 files changed, 203 insertions(+), 240 deletions(-) diff --git a/Ryujinx.Ava/Modules/Updater/Updater.cs b/Ryujinx.Ava/Modules/Updater/Updater.cs index 62dc17729..b476bb85b 100644 --- a/Ryujinx.Ava/Modules/Updater/Updater.cs +++ b/Ryujinx.Ava/Modules/Updater/Updater.cs @@ -132,8 +132,8 @@ namespace Ryujinx.Modules } } - // If build not done, assume no new update are availaible. 
- if (_buildUrl == null) + // If build not done, assume no new update are available. + if (_buildUrl is null) { if (showVersionUpToDate) { @@ -240,13 +240,13 @@ namespace Ryujinx.Modules { HttpClient result = new(); - // Required by GitHub to interract with APIs. + // Required by GitHub to interact with APIs. result.DefaultRequestHeaders.Add("User-Agent", "Ryujinx-Updater/1.0.0"); return result; } - public static async void UpdateRyujinx(Window parent, string downloadUrl) + private static async void UpdateRyujinx(Window parent, string downloadUrl) { _updateSuccessful = false; @@ -300,8 +300,6 @@ namespace Ryujinx.Modules ryuExe = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, OperatingSystem.IsWindows() ? "Ryujinx.exe" : "Ryujinx"); } - SetFileExecutable(ryuExe); - Process.Start(ryuExe, CommandLineState.Arguments); Environment.Exit(0); @@ -408,9 +406,9 @@ namespace Ryujinx.Modules Logger.Warning?.Print(LogClass.Application, ex.Message); Logger.Warning?.Print(LogClass.Application, "Multi-Threaded update failed, falling back to single-threaded updater."); - for (int j = 0; j < webClients.Count; j++) + foreach (WebClient webClient in webClients) { - webClients[j].CancelAsync(); + webClient.CancelAsync(); } DoUpdateWithSingleThread(taskDialog, downloadUrl, updateFile); @@ -472,22 +470,6 @@ namespace Ryujinx.Modules worker.Start(); } - private static void SetFileExecutable(string path) - { - const UnixFileMode ExecutableFileMode = UnixFileMode.UserExecute | - UnixFileMode.UserWrite | - UnixFileMode.UserRead | - UnixFileMode.GroupRead | - UnixFileMode.GroupWrite | - UnixFileMode.OtherRead | - UnixFileMode.OtherWrite; - - if (!OperatingSystem.IsWindows() && File.Exists(path)) - { - File.SetUnixFileMode(path, ExecutableFileMode); - } - } - private static async void InstallUpdate(TaskDialog taskDialog, string updateFile) { // Extract Update @@ -503,27 +485,30 @@ namespace Ryujinx.Modules await Task.Run(() => { TarEntry tarEntry; - while ((tarEntry = 
tarStream.GetNextEntry()) != null) + + if (!OperatingSystem.IsWindows()) { - if (tarEntry.IsDirectory) continue; - - string outPath = Path.Combine(UpdateDir, tarEntry.Name); - - Directory.CreateDirectory(Path.GetDirectoryName(outPath)); - - using (FileStream outStream = File.OpenWrite(outPath)) + while ((tarEntry = tarStream.GetNextEntry()) is not null) { - tarStream.CopyEntryContents(outStream); + if (tarEntry.IsDirectory) continue; + + string outPath = Path.Combine(UpdateDir, tarEntry.Name); + + Directory.CreateDirectory(Path.GetDirectoryName(outPath)); + + using (FileStream outStream = File.OpenWrite(outPath)) + { + tarStream.CopyEntryContents(outStream); + } + + File.SetUnixFileMode(outPath, (UnixFileMode)tarEntry.TarHeader.Mode); + File.SetLastWriteTime(outPath, DateTime.SpecifyKind(tarEntry.ModTime, DateTimeKind.Utc)); + + Dispatcher.UIThread.Post(() => + { + taskDialog.SetProgressBarState(GetPercentage(tarEntry.Size, inStream.Length), TaskDialogProgressState.Normal); + }); } - - File.SetLastWriteTime(outPath, DateTime.SpecifyKind(tarEntry.ModTime, DateTimeKind.Utc)); - - TarEntry entry = tarEntry; - - Dispatcher.UIThread.Post(() => - { - taskDialog.SetProgressBarState(GetPercentage(entry.Size, inStream.Length), TaskDialogProgressState.Normal); - }); } }); @@ -603,8 +588,6 @@ namespace Ryujinx.Modules Directory.Delete(UpdateDir, true); - SetFileExecutable(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Ryujinx")); - _updateSuccessful = true; taskDialog.Hide(); diff --git a/Ryujinx/Modules/Updater/Updater.cs b/Ryujinx/Modules/Updater/Updater.cs index 2a25e78f5..5ad5924e8 100644 --- a/Ryujinx/Modules/Updater/Updater.cs +++ b/Ryujinx/Modules/Updater/Updater.cs @@ -9,6 +9,7 @@ using Ryujinx.Ui; using Ryujinx.Ui.Widgets; using System; using System.Collections.Generic; +using System.Diagnostics; using System.IO; using System.Linq; using System.Net; @@ -23,20 +24,20 @@ namespace Ryujinx.Modules { public static class Updater { + private const string GitHubApiURL 
= "https://api.github.com"; + private const int ConnectionCount = 4; + internal static bool Running; private static readonly string HomeDir = AppDomain.CurrentDomain.BaseDirectory; private static readonly string UpdateDir = Path.Combine(Path.GetTempPath(), "Ryujinx", "update"); private static readonly string UpdatePublishDir = Path.Combine(UpdateDir, "publish"); - private static readonly int ConnectionCount = 4; private static string _buildVer; private static string _platformExt; private static string _buildUrl; private static long _buildSize; - private const string GitHubApiURL = "https://api.github.com"; - // On Windows, GtkSharp.Dependencies adds these extra dirs that must be cleaned during updates. private static readonly string[] WindowsDependencyDirs = new string[] { "bin", "etc", "lib", "share" }; @@ -44,7 +45,7 @@ namespace Ryujinx.Modules { HttpClient result = new HttpClient(); - // Required by GitHub to interract with APIs. + // Required by GitHub to interact with APIs. result.DefaultRequestHeaders.Add("User-Agent", "Ryujinx-Updater/1.0.0"); return result; @@ -101,50 +102,48 @@ namespace Ryujinx.Modules // Get latest version number from GitHub API try { - using (HttpClient jsonClient = ConstructHttpClient()) + using HttpClient jsonClient = ConstructHttpClient(); + string buildInfoURL = $"{GitHubApiURL}/repos/{ReleaseInformation.ReleaseChannelOwner}/{ReleaseInformation.ReleaseChannelRepo}/releases/latest"; + + // Fetch latest build information + string fetchedJson = await jsonClient.GetStringAsync(buildInfoURL); + JObject jsonRoot = JObject.Parse(fetchedJson); + JToken assets = jsonRoot["assets"]; + + _buildVer = (string)jsonRoot["name"]; + + foreach (JToken asset in assets) { - string buildInfoURL = $"{GitHubApiURL}/repos/{ReleaseInformation.ReleaseChannelOwner}/{ReleaseInformation.ReleaseChannelRepo}/releases/latest"; + string assetName = (string)asset["name"]; + string assetState = (string)asset["state"]; + string downloadURL = 
(string)asset["browser_download_url"]; - // Fetch latest build information - string fetchedJson = await jsonClient.GetStringAsync(buildInfoURL); - JObject jsonRoot = JObject.Parse(fetchedJson); - JToken assets = jsonRoot["assets"]; - - _buildVer = (string)jsonRoot["name"]; - - foreach (JToken asset in assets) + if (assetName.StartsWith("ryujinx") && assetName.EndsWith(_platformExt)) { - string assetName = (string)asset["name"]; - string assetState = (string)asset["state"]; - string downloadURL = (string)asset["browser_download_url"]; + _buildUrl = downloadURL; - if (assetName.StartsWith("ryujinx") && assetName.EndsWith(_platformExt)) + if (assetState != "uploaded") { - _buildUrl = downloadURL; - - if (assetState != "uploaded") + if (showVersionUpToDate) { - if (showVersionUpToDate) - { - GtkDialog.CreateUpdaterInfoDialog("You are already using the latest version of Ryujinx!", ""); - } - - return; + GtkDialog.CreateUpdaterInfoDialog("You are already using the latest version of Ryujinx!", ""); } - break; + return; } - } - if (_buildUrl == null) + break; + } + } + + if (_buildUrl == null) + { + if (showVersionUpToDate) { - if (showVersionUpToDate) - { - GtkDialog.CreateUpdaterInfoDialog("You are already using the latest version of Ryujinx!", ""); - } - - return; + GtkDialog.CreateUpdaterInfoDialog("You are already using the latest version of Ryujinx!", ""); } + + return; } } catch (Exception exception) @@ -247,160 +246,142 @@ namespace Ryujinx.Modules for (int i = 0; i < ConnectionCount; i++) { - list.Add(new byte[0]); + list.Add(Array.Empty<byte>()); } for (int i = 0; i < ConnectionCount; i++) { #pragma warning disable SYSLIB0014 // TODO: WebClient is obsolete and need to be replaced with a more complex logic using HttpClient. 
- using (WebClient client = new WebClient()) + using WebClient client = new WebClient(); #pragma warning restore SYSLIB0014 + webClients.Add(client); + + if (i == ConnectionCount - 1) { - webClients.Add(client); + client.Headers.Add("Range", $"bytes={chunkSize * i}-{(chunkSize * (i + 1) - 1) + remainderChunk}"); + } + else + { + client.Headers.Add("Range", $"bytes={chunkSize * i}-{chunkSize * (i + 1) - 1}"); + } - if (i == ConnectionCount - 1) + client.DownloadProgressChanged += (_, args) => + { + int index = (int)args.UserState; + + Interlocked.Add(ref totalProgressPercentage, -1 * progressPercentage[index]); + Interlocked.Exchange(ref progressPercentage[index], args.ProgressPercentage); + Interlocked.Add(ref totalProgressPercentage, args.ProgressPercentage); + + updateDialog.ProgressBar.Value = totalProgressPercentage / ConnectionCount; + }; + + client.DownloadDataCompleted += (_, args) => + { + int index = (int)args.UserState; + + if (args.Cancelled) { - client.Headers.Add("Range", $"bytes={chunkSize * i}-{(chunkSize * (i + 1) - 1) + remainderChunk}"); - } - else - { - client.Headers.Add("Range", $"bytes={chunkSize * i}-{chunkSize * (i + 1) - 1}"); - } - - client.DownloadProgressChanged += (_, args) => - { - int index = (int)args.UserState; - - Interlocked.Add(ref totalProgressPercentage, -1 * progressPercentage[index]); - Interlocked.Exchange(ref progressPercentage[index], args.ProgressPercentage); - Interlocked.Add(ref totalProgressPercentage, args.ProgressPercentage); - - updateDialog.ProgressBar.Value = totalProgressPercentage / ConnectionCount; - }; - - client.DownloadDataCompleted += (_, args) => - { - int index = (int)args.UserState; - - if (args.Cancelled) - { - webClients[index].Dispose(); - - return; - } - - list[index] = args.Result; - Interlocked.Increment(ref completedRequests); - - if (Equals(completedRequests, ConnectionCount)) - { - byte[] mergedFileBytes = new byte[_buildSize]; - for (int connectionIndex = 0, destinationOffset = 0; 
connectionIndex < ConnectionCount; connectionIndex++) - { - Array.Copy(list[connectionIndex], 0, mergedFileBytes, destinationOffset, list[connectionIndex].Length); - destinationOffset += list[connectionIndex].Length; - } - - File.WriteAllBytes(updateFile, mergedFileBytes); - - try - { - InstallUpdate(updateDialog, updateFile); - } - catch (Exception e) - { - Logger.Warning?.Print(LogClass.Application, e.Message); - Logger.Warning?.Print(LogClass.Application, "Multi-Threaded update failed, falling back to single-threaded updater."); - - DoUpdateWithSingleThread(updateDialog, downloadUrl, updateFile); - - return; - } - } - }; - - try - { - client.DownloadDataAsync(new Uri(downloadUrl), i); - } - catch (WebException ex) - { - Logger.Warning?.Print(LogClass.Application, ex.Message); - Logger.Warning?.Print(LogClass.Application, "Multi-Threaded update failed, falling back to single-threaded updater."); - - for (int j = 0; j < webClients.Count; j++) - { - webClients[j].CancelAsync(); - } - - DoUpdateWithSingleThread(updateDialog, downloadUrl, updateFile); + webClients[index].Dispose(); return; } + + list[index] = args.Result; + Interlocked.Increment(ref completedRequests); + + if (Equals(completedRequests, ConnectionCount)) + { + byte[] mergedFileBytes = new byte[_buildSize]; + for (int connectionIndex = 0, destinationOffset = 0; connectionIndex < ConnectionCount; connectionIndex++) + { + Array.Copy(list[connectionIndex], 0, mergedFileBytes, destinationOffset, list[connectionIndex].Length); + destinationOffset += list[connectionIndex].Length; + } + + File.WriteAllBytes(updateFile, mergedFileBytes); + + try + { + InstallUpdate(updateDialog, updateFile); + } + catch (Exception e) + { + Logger.Warning?.Print(LogClass.Application, e.Message); + Logger.Warning?.Print(LogClass.Application, "Multi-Threaded update failed, falling back to single-threaded updater."); + + DoUpdateWithSingleThread(updateDialog, downloadUrl, updateFile); + + return; + } + } + }; + + try + { + 
client.DownloadDataAsync(new Uri(downloadUrl), i); + } + catch (WebException ex) + { + Logger.Warning?.Print(LogClass.Application, ex.Message); + Logger.Warning?.Print(LogClass.Application, "Multi-Threaded update failed, falling back to single-threaded updater."); + + foreach (WebClient webClient in webClients) + { + webClient.CancelAsync(); + } + + DoUpdateWithSingleThread(updateDialog, downloadUrl, updateFile); + + return; } } } private static void DoUpdateWithSingleThreadWorker(UpdateDialog updateDialog, string downloadUrl, string updateFile) { - using (HttpClient client = new HttpClient()) + using HttpClient client = new HttpClient(); + // We do not want to timeout while downloading + client.Timeout = TimeSpan.FromDays(1); + + using (HttpResponseMessage response = client.GetAsync(downloadUrl, HttpCompletionOption.ResponseHeadersRead).Result) + using (Stream remoteFileStream = response.Content.ReadAsStreamAsync().Result) { - // We do not want to timeout while downloading - client.Timeout = TimeSpan.FromDays(1); - - using (HttpResponseMessage response = client.GetAsync(downloadUrl, HttpCompletionOption.ResponseHeadersRead).Result) - using (Stream remoteFileStream = response.Content.ReadAsStreamAsync().Result) + using (Stream updateFileStream = File.Open(updateFile, FileMode.Create)) { - using (Stream updateFileStream = File.Open(updateFile, FileMode.Create)) + long totalBytes = response.Content.Headers.ContentLength.Value; + long byteWritten = 0; + + byte[] buffer = new byte[32 * 1024]; + + while (true) { - long totalBytes = response.Content.Headers.ContentLength.Value; - long byteWritten = 0; + int readSize = remoteFileStream.Read(buffer); - byte[] buffer = new byte[32 * 1024]; - - while (true) + if (readSize == 0) { - int readSize = remoteFileStream.Read(buffer); - - if (readSize == 0) - { - break; - } - - byteWritten += readSize; - - updateDialog.ProgressBar.Value = ((double)byteWritten / totalBytes) * 100; - updateFileStream.Write(buffer, 0, readSize); + 
break; } + + byteWritten += readSize; + + updateDialog.ProgressBar.Value = ((double)byteWritten / totalBytes) * 100; + updateFileStream.Write(buffer, 0, readSize); } } - - InstallUpdate(updateDialog, updateFile); } + + InstallUpdate(updateDialog, updateFile); } private static void DoUpdateWithSingleThread(UpdateDialog updateDialog, string downloadUrl, string updateFile) { - Thread worker = new Thread(() => DoUpdateWithSingleThreadWorker(updateDialog, downloadUrl, updateFile)); - worker.Name = "Updater.SingleThreadWorker"; - worker.Start(); - } - - private static void SetFileExecutable(string path) - { - const UnixFileMode ExecutableFileMode = UnixFileMode.UserExecute | - UnixFileMode.UserWrite | - UnixFileMode.UserRead | - UnixFileMode.GroupRead | - UnixFileMode.GroupWrite | - UnixFileMode.OtherRead | - UnixFileMode.OtherWrite; - - if (!OperatingSystem.IsWindows() && File.Exists(path)) + Thread worker = new Thread(() => DoUpdateWithSingleThreadWorker(updateDialog, downloadUrl, updateFile)) { - File.SetUnixFileMode(path, ExecutableFileMode); - } + Name = "Updater.SingleThreadWorker" + }; + worker.Start(); } private static async void InstallUpdate(UpdateDialog updateDialog, string updateFile) @@ -411,15 +392,17 @@ namespace Ryujinx.Modules if (OperatingSystem.IsLinux()) { - using (Stream inStream = File.OpenRead(updateFile)) - using (Stream gzipStream = new GZipInputStream(inStream)) - using (TarInputStream tarStream = new TarInputStream(gzipStream, Encoding.ASCII)) - { - updateDialog.ProgressBar.MaxValue = inStream.Length; + using Stream inStream = File.OpenRead(updateFile); + using Stream gzipStream = new GZipInputStream(inStream); + using TarInputStream tarStream = new TarInputStream(gzipStream, Encoding.ASCII); + updateDialog.ProgressBar.MaxValue = inStream.Length; - await Task.Run(() => + await Task.Run(() => + { + TarEntry tarEntry; + + if (!OperatingSystem.IsWindows()) { - TarEntry tarEntry; while ((tarEntry = tarStream.GetNextEntry()) != null) { if 
(tarEntry.IsDirectory) continue; @@ -433,6 +416,7 @@ namespace Ryujinx.Modules tarStream.CopyEntryContents(outStream); } + File.SetUnixFileMode(outPath, (UnixFileMode)tarEntry.TarHeader.Mode); File.SetLastWriteTime(outPath, DateTime.SpecifyKind(tarEntry.ModTime, DateTimeKind.Utc)); TarEntry entry = tarEntry; @@ -442,43 +426,41 @@ namespace Ryujinx.Modules updateDialog.ProgressBar.Value += entry.Size; }); } - }); + } + }); - updateDialog.ProgressBar.Value = inStream.Length; - } + updateDialog.ProgressBar.Value = inStream.Length; } else { - using (Stream inStream = File.OpenRead(updateFile)) - using (ZipFile zipFile = new ZipFile(inStream)) + using Stream inStream = File.OpenRead(updateFile); + using ZipFile zipFile = new ZipFile(inStream); + updateDialog.ProgressBar.MaxValue = zipFile.Count; + + await Task.Run(() => { - updateDialog.ProgressBar.MaxValue = zipFile.Count; - - await Task.Run(() => + foreach (ZipEntry zipEntry in zipFile) { - foreach (ZipEntry zipEntry in zipFile) + if (zipEntry.IsDirectory) continue; + + string outPath = Path.Combine(UpdateDir, zipEntry.Name); + + Directory.CreateDirectory(Path.GetDirectoryName(outPath)); + + using (Stream zipStream = zipFile.GetInputStream(zipEntry)) + using (FileStream outStream = File.OpenWrite(outPath)) { - if (zipEntry.IsDirectory) continue; - - string outPath = Path.Combine(UpdateDir, zipEntry.Name); - - Directory.CreateDirectory(Path.GetDirectoryName(outPath)); - - using (Stream zipStream = zipFile.GetInputStream(zipEntry)) - using (FileStream outStream = File.OpenWrite(outPath)) - { - zipStream.CopyTo(outStream); - } - - File.SetLastWriteTime(outPath, DateTime.SpecifyKind(zipEntry.DateTime, DateTimeKind.Utc)); - - Application.Invoke(delegate - { - updateDialog.ProgressBar.Value++; - }); + zipStream.CopyTo(outStream); } - }); - } + + File.SetLastWriteTime(outPath, DateTime.SpecifyKind(zipEntry.DateTime, DateTimeKind.Utc)); + + Application.Invoke(delegate + { + updateDialog.ProgressBar.Value++; + }); + } + }); } 
// Delete downloaded zip @@ -522,8 +504,6 @@ namespace Ryujinx.Modules Directory.Delete(UpdateDir, true); - SetFileExecutable(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Ryujinx")); - updateDialog.MainText.Text = "Update Complete!"; updateDialog.SecondaryText.Text = "Do you want to restart Ryujinx now?"; updateDialog.Modal = true; @@ -640,4 +620,4 @@ namespace Ryujinx.Modules } } } -} +} \ No newline at end of file From a707842e14dde468781270198ae63757ca3c2716 Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Thu, 16 Feb 2023 11:16:31 -0300 Subject: [PATCH 13/41] Validate dimensions before creating texture (#4430) --- Ryujinx.Graphics.Gpu/Image/TextureCache.cs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCache.cs b/Ryujinx.Graphics.Gpu/Image/TextureCache.cs index f18de6075..261d06038 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureCache.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureCache.cs @@ -477,7 +477,23 @@ namespace Ryujinx.Graphics.Gpu.Image // If the start address is unmapped, let's try to find a page of memory that is mapped. if (address == MemoryManager.PteUnmapped) { - address = memoryManager.TranslateFirstMapped(info.GpuAddress, (ulong)info.CalculateSizeInfo(layerSize).TotalSize); + // Make sure that the dimensions are valid before calculating the texture size. + if (info.Width < 1 || info.Height < 1 || info.Levels < 1) + { + return null; + } + + if ((info.Target == Target.Texture3D || + info.Target == Target.Texture2DArray || + info.Target == Target.Texture2DMultisampleArray || + info.Target == Target.CubemapArray) && info.DepthOrLayers < 1) + { + return null; + } + + ulong dataSize = (ulong)info.CalculateSizeInfo(layerSize).TotalSize; + + address = memoryManager.TranslateFirstMapped(info.GpuAddress, dataSize); } // If address is still invalid, the texture is fully unmapped, so it has no data, just return null. 
From efb135b74c9c0ff1de2dfd7d2a431bd23185ca66 Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Thu, 16 Feb 2023 18:28:49 -0300 Subject: [PATCH 14/41] Clear CPU side data on GPU buffer clears (#4125) * Clear CPU side data on GPU buffer clears * Implement tracked fill operation that can signal other resource types except buffer * Fix tests, add missing XML doc * PR feedback --- ARMeilleure/Memory/IMemoryManager.cs | 3 +- ARMeilleure/Signal/NativeSignalHandler.cs | 2 +- Ryujinx.Cpu/AppleHv/HvMemoryManager.cs | 20 +++++----- Ryujinx.Cpu/IVirtualMemoryManagerTracked.cs | 9 +++-- Ryujinx.Cpu/Jit/MemoryManager.cs | 18 ++++----- Ryujinx.Cpu/Jit/MemoryManagerHostMapped.cs | 20 +++++----- Ryujinx.Cpu/MemoryEhMeilleure.cs | 2 +- Ryujinx.Graphics.Gpu/Image/Pool.cs | 2 +- Ryujinx.Graphics.Gpu/Image/TextureGroup.cs | 2 +- Ryujinx.Graphics.Gpu/Memory/Buffer.cs | 4 +- Ryujinx.Graphics.Gpu/Memory/BufferCache.cs | 2 +- Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs | 37 ++++++++++++++---- Ryujinx.Graphics.Gpu/Memory/ResourceKind.cs | 13 +++++++ .../MockVirtualMemoryManager.cs | 2 +- .../MultiRegionTrackingTests.cs | 16 ++++---- Ryujinx.Memory.Tests/TrackingTests.cs | 24 ++++++------ Ryujinx.Memory/AddressSpaceManager.cs | 2 +- Ryujinx.Memory/IVirtualMemoryManager.cs | 3 +- Ryujinx.Memory/Tracking/AbstractRegion.cs | 8 ++-- Ryujinx.Memory/Tracking/MemoryTracking.cs | 38 +++++++++++-------- Ryujinx.Memory/Tracking/MultiRegionHandle.cs | 14 +++++-- Ryujinx.Memory/Tracking/RegionHandle.cs | 24 ++++++++++-- .../Tracking/SmartMultiRegionHandle.cs | 12 +++--- Ryujinx.Memory/Tracking/VirtualRegion.cs | 16 ++++++-- Ryujinx.Tests/Memory/MockMemoryManager.cs | 2 +- 25 files changed, 188 insertions(+), 107 deletions(-) create mode 100644 Ryujinx.Graphics.Gpu/Memory/ResourceKind.cs diff --git a/ARMeilleure/Memory/IMemoryManager.cs b/ARMeilleure/Memory/IMemoryManager.cs index c4ea70d17..5eb1fadd6 100644 --- a/ARMeilleure/Memory/IMemoryManager.cs +++ 
b/ARMeilleure/Memory/IMemoryManager.cs @@ -71,6 +71,7 @@ namespace ARMeilleure.Memory /// <param name="size">Size of the region</param> /// <param name="write">True if the region was written, false if read</param> /// <param name="precise">True if the access is precise, false otherwise</param> - void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false); + /// <param name="exemptId">Optional ID of the handles that should not be signalled</param> + void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null); } } \ No newline at end of file diff --git a/ARMeilleure/Signal/NativeSignalHandler.cs b/ARMeilleure/Signal/NativeSignalHandler.cs index 77eabe267..cddeb8174 100644 --- a/ARMeilleure/Signal/NativeSignalHandler.cs +++ b/ARMeilleure/Signal/NativeSignalHandler.cs @@ -222,7 +222,7 @@ namespace ARMeilleure.Signal // Tracking action should be non-null to call it, otherwise assume false return. context.BranchIfFalse(skipActionLabel, trackingActionPtr); - Operand result = context.Call(trackingActionPtr, OperandType.I32, offset, Const(_pageSize), isWrite, Const(0)); + Operand result = context.Call(trackingActionPtr, OperandType.I32, offset, Const(_pageSize), isWrite); context.Copy(inRegionLocal, result); context.MarkLabel(skipActionLabel); diff --git a/Ryujinx.Cpu/AppleHv/HvMemoryManager.cs b/Ryujinx.Cpu/AppleHv/HvMemoryManager.cs index 222dcae1b..437e02aea 100644 --- a/Ryujinx.Cpu/AppleHv/HvMemoryManager.cs +++ b/Ryujinx.Cpu/AppleHv/HvMemoryManager.cs @@ -634,13 +634,13 @@ namespace Ryujinx.Cpu.AppleHv /// <remarks> /// This function also validates that the given range is both valid and mapped, and will throw if it is not. /// </remarks> - public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false) + public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? 
exemptId = null) { AssertValidAddressAndSize(va, size); if (precise) { - Tracking.VirtualMemoryEvent(va, size, write, precise: true); + Tracking.VirtualMemoryEvent(va, size, write, precise: true, exemptId); return; } @@ -663,7 +663,7 @@ namespace Ryujinx.Cpu.AppleHv if (state >= tag) { - Tracking.VirtualMemoryEvent(va, size, write); + Tracking.VirtualMemoryEvent(va, size, write, precise: false, exemptId); return; } else if (state == 0) @@ -706,7 +706,7 @@ namespace Ryujinx.Cpu.AppleHv // Only trigger tracking from reads if both bits are set on any page. if (write || (pte & (pte >> 1) & BlockMappedMask) != 0) { - Tracking.VirtualMemoryEvent(va, size, write); + Tracking.VirtualMemoryEvent(va, size, write, precise: false, exemptId); break; } } @@ -822,21 +822,21 @@ namespace Ryujinx.Cpu.AppleHv } /// <inheritdoc/> - public CpuRegionHandle BeginTracking(ulong address, ulong size) + public CpuRegionHandle BeginTracking(ulong address, ulong size, int id) { - return new CpuRegionHandle(Tracking.BeginTracking(address, size)); + return new CpuRegionHandle(Tracking.BeginTracking(address, size, id)); } /// <inheritdoc/> - public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity) + public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity, int id) { - return new CpuMultiRegionHandle(Tracking.BeginGranularTracking(address, size, handles, granularity)); + return new CpuMultiRegionHandle(Tracking.BeginGranularTracking(address, size, handles, granularity, id)); } /// <inheritdoc/> - public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity) + public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity, int id) { - return new CpuSmartMultiRegionHandle(Tracking.BeginSmartGranularTracking(address, size, granularity)); + return new 
CpuSmartMultiRegionHandle(Tracking.BeginSmartGranularTracking(address, size, granularity, id)); } /// <summary> diff --git a/Ryujinx.Cpu/IVirtualMemoryManagerTracked.cs b/Ryujinx.Cpu/IVirtualMemoryManagerTracked.cs index 8004d39bc..92d3c76ca 100644 --- a/Ryujinx.Cpu/IVirtualMemoryManagerTracked.cs +++ b/Ryujinx.Cpu/IVirtualMemoryManagerTracked.cs @@ -28,8 +28,9 @@ namespace Ryujinx.Cpu /// </summary> /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - CpuRegionHandle BeginTracking(ulong address, ulong size); + CpuRegionHandle BeginTracking(ulong address, ulong size, int id); /// <summary> /// Obtains a memory tracking handle for the given virtual region, with a specified granularity. This should be disposed when finished with. @@ -38,8 +39,9 @@ namespace Ryujinx.Cpu /// <param name="size">Size of the region</param> /// <param name="handles">Handles to inherit state from or reuse. When none are present, provide null</param> /// <param name="granularity">Desired granularity of write tracking</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity); + CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity, int id); /// <summary> /// Obtains a smart memory tracking handle for the given virtual region, with a specified granularity. This should be disposed when finished with. 
@@ -47,7 +49,8 @@ namespace Ryujinx.Cpu /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> /// <param name="granularity">Desired granularity of write tracking</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity); + CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity, int id); } } diff --git a/Ryujinx.Cpu/Jit/MemoryManager.cs b/Ryujinx.Cpu/Jit/MemoryManager.cs index 014d843b5..8542d53e2 100644 --- a/Ryujinx.Cpu/Jit/MemoryManager.cs +++ b/Ryujinx.Cpu/Jit/MemoryManager.cs @@ -629,31 +629,31 @@ namespace Ryujinx.Cpu.Jit } /// <inheritdoc/> - public CpuRegionHandle BeginTracking(ulong address, ulong size) + public CpuRegionHandle BeginTracking(ulong address, ulong size, int id) { - return new CpuRegionHandle(Tracking.BeginTracking(address, size)); + return new CpuRegionHandle(Tracking.BeginTracking(address, size, id)); } /// <inheritdoc/> - public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity) + public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity, int id) { - return new CpuMultiRegionHandle(Tracking.BeginGranularTracking(address, size, handles, granularity)); + return new CpuMultiRegionHandle(Tracking.BeginGranularTracking(address, size, handles, granularity, id)); } /// <inheritdoc/> - public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity) + public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity, int id) { - return new CpuSmartMultiRegionHandle(Tracking.BeginSmartGranularTracking(address, size, granularity)); + return new 
CpuSmartMultiRegionHandle(Tracking.BeginSmartGranularTracking(address, size, granularity, id)); } /// <inheritdoc/> - public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false) + public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null) { AssertValidAddressAndSize(va, size); if (precise) { - Tracking.VirtualMemoryEvent(va, size, write, precise: true); + Tracking.VirtualMemoryEvent(va, size, write, precise: true, exemptId); return; } @@ -676,7 +676,7 @@ namespace Ryujinx.Cpu.Jit if ((pte & tag) != 0) { - Tracking.VirtualMemoryEvent(va, size, write); + Tracking.VirtualMemoryEvent(va, size, write, precise: false, exemptId); break; } diff --git a/Ryujinx.Cpu/Jit/MemoryManagerHostMapped.cs b/Ryujinx.Cpu/Jit/MemoryManagerHostMapped.cs index 856b6b9b0..090740abe 100644 --- a/Ryujinx.Cpu/Jit/MemoryManagerHostMapped.cs +++ b/Ryujinx.Cpu/Jit/MemoryManagerHostMapped.cs @@ -518,13 +518,13 @@ namespace Ryujinx.Cpu.Jit /// <remarks> /// This function also validates that the given range is both valid and mapped, and will throw if it is not. /// </remarks> - public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false) + public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null) { AssertValidAddressAndSize(va, size); if (precise) { - Tracking.VirtualMemoryEvent(va, size, write, precise: true); + Tracking.VirtualMemoryEvent(va, size, write, precise: true, exemptId); return; } @@ -547,7 +547,7 @@ namespace Ryujinx.Cpu.Jit if (state >= tag) { - Tracking.VirtualMemoryEvent(va, size, write); + Tracking.VirtualMemoryEvent(va, size, write, precise: false, exemptId); return; } else if (state == 0) @@ -590,7 +590,7 @@ namespace Ryujinx.Cpu.Jit // Only trigger tracking from reads if both bits are set on any page. 
if (write || (pte & (pte >> 1) & BlockMappedMask) != 0) { - Tracking.VirtualMemoryEvent(va, size, write); + Tracking.VirtualMemoryEvent(va, size, write, precise: false, exemptId); break; } } @@ -706,21 +706,21 @@ namespace Ryujinx.Cpu.Jit } /// <inheritdoc/> - public CpuRegionHandle BeginTracking(ulong address, ulong size) + public CpuRegionHandle BeginTracking(ulong address, ulong size, int id) { - return new CpuRegionHandle(Tracking.BeginTracking(address, size)); + return new CpuRegionHandle(Tracking.BeginTracking(address, size, id)); } /// <inheritdoc/> - public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity) + public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity, int id) { - return new CpuMultiRegionHandle(Tracking.BeginGranularTracking(address, size, handles, granularity)); + return new CpuMultiRegionHandle(Tracking.BeginGranularTracking(address, size, handles, granularity, id)); } /// <inheritdoc/> - public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity) + public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity, int id) { - return new CpuSmartMultiRegionHandle(Tracking.BeginSmartGranularTracking(address, size, granularity)); + return new CpuSmartMultiRegionHandle(Tracking.BeginSmartGranularTracking(address, size, granularity, id)); } /// <summary> diff --git a/Ryujinx.Cpu/MemoryEhMeilleure.cs b/Ryujinx.Cpu/MemoryEhMeilleure.cs index 806ef8113..0b434ea74 100644 --- a/Ryujinx.Cpu/MemoryEhMeilleure.cs +++ b/Ryujinx.Cpu/MemoryEhMeilleure.cs @@ -8,7 +8,7 @@ namespace Ryujinx.Cpu { public class MemoryEhMeilleure : IDisposable { - private delegate bool TrackingEventDelegate(ulong address, ulong size, bool write, bool precise = false); + private delegate bool TrackingEventDelegate(ulong address, ulong size, bool 
write); private readonly MemoryTracking _tracking; private readonly TrackingEventDelegate _trackingEvent; diff --git a/Ryujinx.Graphics.Gpu/Image/Pool.cs b/Ryujinx.Graphics.Gpu/Image/Pool.cs index ee4c051f4..3e557c0bd 100644 --- a/Ryujinx.Graphics.Gpu/Image/Pool.cs +++ b/Ryujinx.Graphics.Gpu/Image/Pool.cs @@ -69,7 +69,7 @@ namespace Ryujinx.Graphics.Gpu.Image Address = address; Size = size; - _memoryTracking = physicalMemory.BeginGranularTracking(address, size); + _memoryTracking = physicalMemory.BeginGranularTracking(address, size, ResourceKind.Pool); _memoryTracking.RegisterPreciseAction(address, size, PreciseAction); _modifiedDelegate = RegionModified; } diff --git a/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs b/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs index 1040b394a..12a640e15 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureGroup.cs @@ -854,7 +854,7 @@ namespace Ryujinx.Graphics.Gpu.Image /// <returns>A CpuRegionHandle covering the given range</returns> private CpuRegionHandle GenerateHandle(ulong address, ulong size) { - return _physicalMemory.BeginTracking(address, size); + return _physicalMemory.BeginTracking(address, size, ResourceKind.Texture); } /// <summary> diff --git a/Ryujinx.Graphics.Gpu/Memory/Buffer.cs b/Ryujinx.Graphics.Gpu/Memory/Buffer.cs index a624386ed..3778cd824 100644 --- a/Ryujinx.Graphics.Gpu/Memory/Buffer.cs +++ b/Ryujinx.Graphics.Gpu/Memory/Buffer.cs @@ -105,13 +105,13 @@ namespace Ryujinx.Graphics.Gpu.Memory if (_useGranular) { - _memoryTrackingGranular = physicalMemory.BeginGranularTracking(address, size, baseHandles); + _memoryTrackingGranular = physicalMemory.BeginGranularTracking(address, size, ResourceKind.Buffer, baseHandles); _memoryTrackingGranular.RegisterPreciseAction(address, size, PreciseAction); } else { - _memoryTracking = physicalMemory.BeginTracking(address, size); + _memoryTracking = physicalMemory.BeginTracking(address, size, ResourceKind.Buffer); if (baseHandles != 
null) { diff --git a/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs b/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs index 00f590831..a5a9b75e9 100644 --- a/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs +++ b/Ryujinx.Graphics.Gpu/Memory/BufferCache.cs @@ -368,7 +368,7 @@ namespace Ryujinx.Graphics.Gpu.Memory _context.Renderer.Pipeline.ClearBuffer(buffer.Handle, offset, (int)size, value); - buffer.SignalModified(address, size); + memoryManager.Physical.FillTrackedResource(address, size, value, ResourceKind.Buffer); } /// <summary> diff --git a/Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs b/Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs index c1fc0c5cd..bd33383e5 100644 --- a/Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs +++ b/Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs @@ -7,6 +7,7 @@ using Ryujinx.Memory.Range; using Ryujinx.Memory.Tracking; using System; using System.Collections.Generic; +using System.Runtime.InteropServices; using System.Threading; namespace Ryujinx.Graphics.Gpu.Memory @@ -295,23 +296,41 @@ namespace Ryujinx.Graphics.Gpu.Memory } } + /// <summary> + /// Fills the specified memory region with a 32-bit integer value. + /// </summary> + /// <param name="address">CPU virtual address of the region</param> + /// <param name="size">Size of the region</param> + /// <param name="value">Value to fill the region with</param> + /// <param name="kind">Kind of the resource being filled, which will not be signalled as CPU modified</param> + public void FillTrackedResource(ulong address, ulong size, uint value, ResourceKind kind) + { + _cpuMemory.SignalMemoryTracking(address, size, write: true, precise: true, (int)kind); + + using WritableRegion region = _cpuMemory.GetWritableRegion(address, (int)size); + + MemoryMarshal.Cast<byte, uint>(region.Memory.Span).Fill(value); + } + /// <summary> /// Obtains a memory tracking handle for the given virtual region. This should be disposed when finished with. 
/// </summary> /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> + /// <param name="kind">Kind of the resource being tracked</param> /// <returns>The memory tracking handle</returns> - public CpuRegionHandle BeginTracking(ulong address, ulong size) + public CpuRegionHandle BeginTracking(ulong address, ulong size, ResourceKind kind) { - return _cpuMemory.BeginTracking(address, size); + return _cpuMemory.BeginTracking(address, size, (int)kind); } /// <summary> /// Obtains a memory tracking handle for the given virtual region. This should be disposed when finished with. /// </summary> /// <param name="range">Ranges of physical memory where the data is located</param> + /// <param name="kind">Kind of the resource being tracked</param> /// <returns>The memory tracking handle</returns> - public GpuRegionHandle BeginTracking(MultiRange range) + public GpuRegionHandle BeginTracking(MultiRange range, ResourceKind kind) { var cpuRegionHandles = new CpuRegionHandle[range.Count]; int count = 0; @@ -321,7 +340,7 @@ namespace Ryujinx.Graphics.Gpu.Memory var currentRange = range.GetSubRange(i); if (currentRange.Address != MemoryManager.PteUnmapped) { - cpuRegionHandles[count++] = _cpuMemory.BeginTracking(currentRange.Address, currentRange.Size); + cpuRegionHandles[count++] = _cpuMemory.BeginTracking(currentRange.Address, currentRange.Size, (int)kind); } } @@ -338,12 +357,13 @@ namespace Ryujinx.Graphics.Gpu.Memory /// </summary> /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> + /// <param name="kind">Kind of the resource being tracked</param> /// <param name="handles">Handles to inherit state from or reuse</param> /// <param name="granularity">Desired granularity of write tracking</param> /// <returns>The memory tracking handle</returns> - public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> 
handles = null, ulong granularity = 4096) + public CpuMultiRegionHandle BeginGranularTracking(ulong address, ulong size, ResourceKind kind, IEnumerable<IRegionHandle> handles = null, ulong granularity = 4096) { - return _cpuMemory.BeginGranularTracking(address, size, handles, granularity); + return _cpuMemory.BeginGranularTracking(address, size, handles, granularity, (int)kind); } /// <summary> @@ -351,11 +371,12 @@ namespace Ryujinx.Graphics.Gpu.Memory /// </summary> /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> + /// <param name="kind">Kind of the resource being tracked</param> /// <param name="granularity">Desired granularity of write tracking</param> /// <returns>The memory tracking handle</returns> - public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity = 4096) + public CpuSmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ResourceKind kind, ulong granularity = 4096) { - return _cpuMemory.BeginSmartGranularTracking(address, size, granularity); + return _cpuMemory.BeginSmartGranularTracking(address, size, granularity, (int)kind); } /// <summary> diff --git a/Ryujinx.Graphics.Gpu/Memory/ResourceKind.cs b/Ryujinx.Graphics.Gpu/Memory/ResourceKind.cs new file mode 100644 index 000000000..55d697b81 --- /dev/null +++ b/Ryujinx.Graphics.Gpu/Memory/ResourceKind.cs @@ -0,0 +1,13 @@ +namespace Ryujinx.Graphics.Gpu.Memory +{ + /// <summary> + /// Kind of a GPU resource. 
+ /// </summary> + enum ResourceKind + { + None, + Buffer, + Texture, + Pool + } +} diff --git a/Ryujinx.Memory.Tests/MockVirtualMemoryManager.cs b/Ryujinx.Memory.Tests/MockVirtualMemoryManager.cs index 06eb4729e..6729f4a36 100644 --- a/Ryujinx.Memory.Tests/MockVirtualMemoryManager.cs +++ b/Ryujinx.Memory.Tests/MockVirtualMemoryManager.cs @@ -96,7 +96,7 @@ namespace Ryujinx.Memory.Tests throw new NotImplementedException(); } - public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false) + public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null) { throw new NotImplementedException(); } diff --git a/Ryujinx.Memory.Tests/MultiRegionTrackingTests.cs b/Ryujinx.Memory.Tests/MultiRegionTrackingTests.cs index c607464d2..38cb49216 100644 --- a/Ryujinx.Memory.Tests/MultiRegionTrackingTests.cs +++ b/Ryujinx.Memory.Tests/MultiRegionTrackingTests.cs @@ -34,8 +34,8 @@ namespace Ryujinx.Memory.Tests private IMultiRegionHandle GetGranular(bool smart, ulong address, ulong size, ulong granularity) { return smart ? - _tracking.BeginSmartGranularTracking(address, size, granularity) : - (IMultiRegionHandle)_tracking.BeginGranularTracking(address, size, null, granularity); + _tracking.BeginSmartGranularTracking(address, size, granularity, 0) : + (IMultiRegionHandle)_tracking.BeginGranularTracking(address, size, null, granularity, 0); } private void RandomOrder(Random random, List<int> indices, Action<int> action) @@ -216,7 +216,7 @@ namespace Ryujinx.Memory.Tests { int region = regionSizes[i]; handle.QueryModified(address, (ulong)(PageSize * region), (address, size) => { }); - + // There should be a gap between regions, // So that they don't combine and we can see the full effects. 
address += (ulong)(PageSize * (region + 1)); @@ -294,7 +294,7 @@ namespace Ryujinx.Memory.Tests bool[] actionsTriggered = new bool[3]; - MultiRegionHandle granular = _tracking.BeginGranularTracking(PageSize * 3, PageSize * 3, null, PageSize); + MultiRegionHandle granular = _tracking.BeginGranularTracking(PageSize * 3, PageSize * 3, null, PageSize, 0); PreparePages(granular, 3, PageSize * 3); // Write to the second handle in the multiregion. @@ -307,7 +307,7 @@ namespace Ryujinx.Memory.Tests for (int i = 0; i < 3; i++) { - singlePages[i] = _tracking.BeginTracking(PageSize * (8 + (ulong)i), PageSize); + singlePages[i] = _tracking.BeginTracking(PageSize * (8 + (ulong)i), PageSize, 0); singlePages[i].Reprotect(); } @@ -321,7 +321,7 @@ namespace Ryujinx.Memory.Tests for (int i = 0; i < 3; i++) { - doublePages[i] = _tracking.BeginTracking(PageSize * (11 + (ulong)i * 2), PageSize * 2); + doublePages[i] = _tracking.BeginTracking(PageSize * (11 + (ulong)i * 2), PageSize * 2, 0); doublePages[i].Reprotect(); } @@ -340,7 +340,7 @@ namespace Ryujinx.Memory.Tests doublePages }; - MultiRegionHandle combined = _tracking.BeginGranularTracking(0, PageSize * 18, handleGroups.SelectMany((handles) => handles), PageSize); + MultiRegionHandle combined = _tracking.BeginGranularTracking(0, PageSize * 18, handleGroups.SelectMany((handles) => handles), PageSize, 0); bool[] expectedDirty = new bool[] { @@ -405,7 +405,7 @@ namespace Ryujinx.Memory.Tests { bool actionTriggered = false; - MultiRegionHandle granular = _tracking.BeginGranularTracking(PageSize * 3, PageSize * 3, null, PageSize); + MultiRegionHandle granular = _tracking.BeginGranularTracking(PageSize * 3, PageSize * 3, null, PageSize, 0); PreparePages(granular, 3, PageSize * 3); // Add a precise action to the second and third handle in the multiregion. 
diff --git a/Ryujinx.Memory.Tests/TrackingTests.cs b/Ryujinx.Memory.Tests/TrackingTests.cs index b0c39ab04..eb679804c 100644 --- a/Ryujinx.Memory.Tests/TrackingTests.cs +++ b/Ryujinx.Memory.Tests/TrackingTests.cs @@ -44,7 +44,7 @@ namespace Ryujinx.Memory.Tests [Test] public void SingleRegion() { - RegionHandle handle = _tracking.BeginTracking(0, PageSize); + RegionHandle handle = _tracking.BeginTracking(0, PageSize, 0); (ulong address, ulong size)? readTrackingTriggered = null; handle.RegisterAction((address, size) => { @@ -97,7 +97,7 @@ namespace Ryujinx.Memory.Tests [Test] public void OverlappingRegions() { - RegionHandle allHandle = _tracking.BeginTracking(0, PageSize * 16); + RegionHandle allHandle = _tracking.BeginTracking(0, PageSize * 16, 0); allHandle.Reprotect(); (ulong address, ulong size)? readTrackingTriggeredAll = null; @@ -116,7 +116,7 @@ namespace Ryujinx.Memory.Tests for (int i = 0; i < 16; i++) { - containedHandles[i] = _tracking.BeginTracking((ulong)i * PageSize, PageSize); + containedHandles[i] = _tracking.BeginTracking((ulong)i * PageSize, PageSize, 0); containedHandles[i].Reprotect(); } @@ -163,7 +163,7 @@ namespace Ryujinx.Memory.Tests ulong alignedEnd = ((address + size + PageSize - 1) / PageSize) * PageSize; ulong alignedSize = alignedEnd - alignedStart; - RegionHandle handle = _tracking.BeginTracking(address, size); + RegionHandle handle = _tracking.BeginTracking(address, size, 0); // Anywhere inside the pages the region is contained on should trigger. 
@@ -207,7 +207,7 @@ namespace Ryujinx.Memory.Tests for (int i = 0; i < handles.Length; i++) { - handles[i] = _tracking.BeginTracking((ulong)i * PageSize, PageSize); + handles[i] = _tracking.BeginTracking((ulong)i * PageSize, PageSize, 0); handles[i].Reprotect(); } @@ -263,7 +263,7 @@ namespace Ryujinx.Memory.Tests Random random = new Random(randSeed + 512); while (Stopwatch.GetTimestamp() < finishedTime) { - RegionHandle handle = _tracking.BeginTracking((ulong)random.Next(maxAddress), (ulong)random.Next(65536)); + RegionHandle handle = _tracking.BeginTracking((ulong)random.Next(maxAddress), (ulong)random.Next(65536), 0); handle.Dispose(); @@ -295,7 +295,7 @@ namespace Ryujinx.Memory.Tests // Read actions should only be triggered once for each registration. // The implementation should use an interlocked exchange to make sure other threads can't get the action. - RegionHandle handle = _tracking.BeginTracking(0, PageSize); + RegionHandle handle = _tracking.BeginTracking(0, PageSize, 0); int triggeredCount = 0; int registeredCount = 0; @@ -359,7 +359,7 @@ namespace Ryujinx.Memory.Tests { // Ensure that disposed handles correctly remove their virtual and physical regions. - RegionHandle handle = _tracking.BeginTracking(0, PageSize); + RegionHandle handle = _tracking.BeginTracking(0, PageSize, 0); handle.Reprotect(); Assert.AreEqual(1, _tracking.GetRegionCount()); @@ -372,8 +372,8 @@ namespace Ryujinx.Memory.Tests // We expect there to be three regions after creating both, one for the small region and two covering the big one around it. // Regions are always split to avoid overlapping, which is why there are three instead of two. 
- RegionHandle handleSmall = _tracking.BeginTracking(PageSize, PageSize); - RegionHandle handleBig = _tracking.BeginTracking(0, PageSize * 4); + RegionHandle handleSmall = _tracking.BeginTracking(PageSize, PageSize, 0); + RegionHandle handleBig = _tracking.BeginTracking(0, PageSize * 4, 0); Assert.AreEqual(3, _tracking.GetRegionCount()); @@ -398,7 +398,7 @@ namespace Ryujinx.Memory.Tests protection = newProtection; }; - RegionHandle handle = _tracking.BeginTracking(0, PageSize); + RegionHandle handle = _tracking.BeginTracking(0, PageSize, 0); // After creating the handle, there is no protection yet. Assert.AreEqual(MemoryPermission.ReadAndWrite, protection); @@ -453,7 +453,7 @@ namespace Ryujinx.Memory.Tests [Test] public void PreciseAction() { - RegionHandle handle = _tracking.BeginTracking(0, PageSize); + RegionHandle handle = _tracking.BeginTracking(0, PageSize, 0); (ulong address, ulong size, bool write)? preciseTriggered = null; handle.RegisterPreciseAction((address, size, write) => diff --git a/Ryujinx.Memory/AddressSpaceManager.cs b/Ryujinx.Memory/AddressSpaceManager.cs index b532ce5e0..ac89fca6d 100644 --- a/Ryujinx.Memory/AddressSpaceManager.cs +++ b/Ryujinx.Memory/AddressSpaceManager.cs @@ -462,7 +462,7 @@ namespace Ryujinx.Memory } /// <inheritdoc/> - public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false) + public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null) { // Only the ARM Memory Manager has tracking for now. 
} diff --git a/Ryujinx.Memory/IVirtualMemoryManager.cs b/Ryujinx.Memory/IVirtualMemoryManager.cs index 390371ad2..e1851d48b 100644 --- a/Ryujinx.Memory/IVirtualMemoryManager.cs +++ b/Ryujinx.Memory/IVirtualMemoryManager.cs @@ -175,7 +175,8 @@ namespace Ryujinx.Memory /// <param name="size">Size of the region</param> /// <param name="write">True if the region was written, false if read</param> /// <param name="precise">True if the access is precise, false otherwise</param> - void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false); + /// <param name="exemptId">Optional ID of the handles that should not be signalled</param> + void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null); /// <summary> /// Reprotect a region of virtual memory for tracking. diff --git a/Ryujinx.Memory/Tracking/AbstractRegion.cs b/Ryujinx.Memory/Tracking/AbstractRegion.cs index a3c3990ea..bd4c8ab5c 100644 --- a/Ryujinx.Memory/Tracking/AbstractRegion.cs +++ b/Ryujinx.Memory/Tracking/AbstractRegion.cs @@ -50,7 +50,8 @@ namespace Ryujinx.Memory.Tracking /// <param name="address">Address accessed</param> /// <param name="size">Size of the region affected in bytes</param> /// <param name="write">Whether the region was written to or read</param> - public abstract void Signal(ulong address, ulong size, bool write); + /// <param name="exemptId">Optional ID of the handles that should not be signalled</param> + public abstract void Signal(ulong address, ulong size, bool write, int? exemptId); /// <summary> /// Signals to the handles that a precise memory event has occurred. Assumes that the tracking lock has been obtained. 
@@ -58,10 +59,11 @@ namespace Ryujinx.Memory.Tracking /// <param name="address">Address accessed</param> /// <param name="size">Size of the region affected in bytes</param> /// <param name="write">Whether the region was written to or read</param> - public abstract void SignalPrecise(ulong address, ulong size, bool write); + /// <param name="exemptId">Optional ID of the handles that should not be signalled</param> + public abstract void SignalPrecise(ulong address, ulong size, bool write, int? exemptId); /// <summary> - /// Split this region into two, around the specified address. + /// Split this region into two, around the specified address. /// This region is updated to end at the split address, and a new region is created to represent past that point. /// </summary> /// <param name="splitAddress">Address to split the region around</param> diff --git a/Ryujinx.Memory/Tracking/MemoryTracking.cs b/Ryujinx.Memory/Tracking/MemoryTracking.cs index 9a35cfb6c..bf1e0ad34 100644 --- a/Ryujinx.Memory/Tracking/MemoryTracking.cs +++ b/Ryujinx.Memory/Tracking/MemoryTracking.cs @@ -136,10 +136,11 @@ namespace Ryujinx.Memory.Tracking /// <param name="size">Size of the region</param> /// <param name="handles">Handles to inherit state from or reuse. 
When none are present, provide null</param> /// <param name="granularity">Desired granularity of write tracking</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - public MultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity) + public MultiRegionHandle BeginGranularTracking(ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity, int id) { - return new MultiRegionHandle(this, address, size, handles, granularity); + return new MultiRegionHandle(this, address, size, handles, granularity, id); } /// <summary> @@ -148,12 +149,13 @@ namespace Ryujinx.Memory.Tracking /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> /// <param name="granularity">Desired granularity of write tracking</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - public SmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity) + public SmartMultiRegionHandle BeginSmartGranularTracking(ulong address, ulong size, ulong granularity, int id) { (address, size) = PageAlign(address, size); - return new SmartMultiRegionHandle(this, address, size, granularity); + return new SmartMultiRegionHandle(this, address, size, granularity, id); } /// <summary> @@ -161,14 +163,16 @@ namespace Ryujinx.Memory.Tracking /// </summary> /// <param name="address">CPU virtual address of the region</param> /// <param name="size">Size of the region</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - public RegionHandle BeginTracking(ulong address, ulong size) + public RegionHandle BeginTracking(ulong address, ulong size, int id) { var (paAddress, paSize) = PageAlign(address, size); lock (TrackingLock) { - RegionHandle handle = new RegionHandle(this, paAddress, paSize, address, size, 
_memoryManager.IsRangeMapped(address, size)); + bool mapped = _memoryManager.IsRangeMapped(address, size); + RegionHandle handle = new RegionHandle(this, paAddress, paSize, address, size, id, mapped); return handle; } @@ -181,28 +185,31 @@ namespace Ryujinx.Memory.Tracking /// <param name="size">Size of the region</param> /// <param name="bitmap">The bitmap owning the dirty flag for this handle</param> /// <param name="bit">The bit of this handle within the dirty flag</param> + /// <param name="id">Handle ID</param> /// <returns>The memory tracking handle</returns> - internal RegionHandle BeginTrackingBitmap(ulong address, ulong size, ConcurrentBitmap bitmap, int bit) + internal RegionHandle BeginTrackingBitmap(ulong address, ulong size, ConcurrentBitmap bitmap, int bit, int id) { var (paAddress, paSize) = PageAlign(address, size); lock (TrackingLock) { - RegionHandle handle = new RegionHandle(this, paAddress, paSize, address, size, bitmap, bit, _memoryManager.IsRangeMapped(address, size)); + bool mapped = _memoryManager.IsRangeMapped(address, size); + RegionHandle handle = new RegionHandle(this, paAddress, paSize, address, size, bitmap, bit, id, mapped); return handle; } } /// <summary> - /// Signal that a virtual memory event happened at the given location (one byte). + /// Signal that a virtual memory event happened at the given location. 
/// </summary> /// <param name="address">Virtual address accessed</param> - /// <param name="write">Whether the address was written to or read</param> + /// <param name="size">Size of the region affected in bytes</param> + /// <param name="write">Whether the region was written to or read</param> /// <returns>True if the event triggered any tracking regions, false otherwise</returns> - public bool VirtualMemoryEventTracking(ulong address, bool write) + public bool VirtualMemoryEvent(ulong address, ulong size, bool write) { - return VirtualMemoryEvent(address, 1, write); + return VirtualMemoryEvent(address, size, write, precise: false, null); } /// <summary> @@ -214,8 +221,9 @@ namespace Ryujinx.Memory.Tracking /// <param name="size">Size of the region affected in bytes</param> /// <param name="write">Whether the region was written to or read</param> /// <param name="precise">True if the access is precise, false otherwise</param> + /// <param name="exemptId">Optional ID of the handles that should not be signalled</param> /// <returns>True if the event triggered any tracking regions, false otherwise</returns> - public bool VirtualMemoryEvent(ulong address, ulong size, bool write, bool precise = false) + public bool VirtualMemoryEvent(ulong address, ulong size, bool write, bool precise, int? exemptId = null) { // Look up the virtual region using the region list. // Signal up the chain to relevant handles. 
@@ -250,11 +258,11 @@ namespace Ryujinx.Memory.Tracking if (precise) { - region.SignalPrecise(address, size, write); + region.SignalPrecise(address, size, write, exemptId); } else { - region.Signal(address, size, write); + region.Signal(address, size, write, exemptId); } } } diff --git a/Ryujinx.Memory/Tracking/MultiRegionHandle.cs b/Ryujinx.Memory/Tracking/MultiRegionHandle.cs index 6ea2b7845..68fc5e759 100644 --- a/Ryujinx.Memory/Tracking/MultiRegionHandle.cs +++ b/Ryujinx.Memory/Tracking/MultiRegionHandle.cs @@ -30,7 +30,13 @@ namespace Ryujinx.Memory.Tracking public bool Dirty { get; private set; } = true; - internal MultiRegionHandle(MemoryTracking tracking, ulong address, ulong size, IEnumerable<IRegionHandle> handles, ulong granularity) + internal MultiRegionHandle( + MemoryTracking tracking, + ulong address, + ulong size, + IEnumerable<IRegionHandle> handles, + ulong granularity, + int id) { _handles = new RegionHandle[(size + granularity - 1) / granularity]; Granularity = granularity; @@ -55,7 +61,7 @@ namespace Ryujinx.Memory.Tracking // Fill any gap left before this handle. while (i < startIndex) { - RegionHandle fillHandle = tracking.BeginTrackingBitmap(address + (ulong)i * granularity, granularity, _dirtyBitmap, i); + RegionHandle fillHandle = tracking.BeginTrackingBitmap(address + (ulong)i * granularity, granularity, _dirtyBitmap, i, id); fillHandle.Parent = this; _handles[i++] = fillHandle; } @@ -76,7 +82,7 @@ namespace Ryujinx.Memory.Tracking while (i < endIndex) { - RegionHandle splitHandle = tracking.BeginTrackingBitmap(address + (ulong)i * granularity, granularity, _dirtyBitmap, i); + RegionHandle splitHandle = tracking.BeginTrackingBitmap(address + (ulong)i * granularity, granularity, _dirtyBitmap, i, id); splitHandle.Parent = this; splitHandle.Reprotect(handle.Dirty); @@ -99,7 +105,7 @@ namespace Ryujinx.Memory.Tracking // Fill any remaining space with new handles. 
while (i < _handles.Length) { - RegionHandle handle = tracking.BeginTrackingBitmap(address + (ulong)i * granularity, granularity, _dirtyBitmap, i); + RegionHandle handle = tracking.BeginTrackingBitmap(address + (ulong)i * granularity, granularity, _dirtyBitmap, i, id); handle.Parent = this; _handles[i++] = handle; } diff --git a/Ryujinx.Memory/Tracking/RegionHandle.cs b/Ryujinx.Memory/Tracking/RegionHandle.cs index 580f94a51..7a59f9f25 100644 --- a/Ryujinx.Memory/Tracking/RegionHandle.cs +++ b/Ryujinx.Memory/Tracking/RegionHandle.cs @@ -15,12 +15,12 @@ namespace Ryujinx.Memory.Tracking /// If more than this number of checks have been performed on a dirty flag since its last reprotect, /// then it is dirtied infrequently. /// </summary> - private static int CheckCountForInfrequent = 3; + private const int CheckCountForInfrequent = 3; /// <summary> /// Number of frequent dirty/consume in a row to make this handle volatile. /// </summary> - private static int VolatileThreshold = 5; + private const int VolatileThreshold = 5; public bool Dirty { @@ -35,6 +35,7 @@ namespace Ryujinx.Memory.Tracking } internal int SequenceNumber { get; set; } + internal int Id { get; } public bool Unmapped { get; private set; } @@ -97,14 +98,26 @@ namespace Ryujinx.Memory.Tracking /// <param name="realSize">The real, unaligned size of the handle</param> /// <param name="bitmap">The bitmap the dirty flag for this handle is stored in</param> /// <param name="bit">The bit index representing the dirty flag for this handle</param> + /// <param name="id">Handle ID</param> /// <param name="mapped">True if the region handle starts mapped</param> - internal RegionHandle(MemoryTracking tracking, ulong address, ulong size, ulong realAddress, ulong realSize, ConcurrentBitmap bitmap, int bit, bool mapped = true) + internal RegionHandle( + MemoryTracking tracking, + ulong address, + ulong size, + ulong realAddress, + ulong realSize, + ConcurrentBitmap bitmap, + int bit, + int id, + bool mapped = true) { 
Bitmap = bitmap; DirtyBit = bit; Dirty = mapped; + Id = id; + Unmapped = !mapped; Address = address; Size = size; @@ -131,11 +144,14 @@ namespace Ryujinx.Memory.Tracking /// <param name="size">Size of the region to track</param> /// <param name="realAddress">The real, unaligned address of the handle</param> /// <param name="realSize">The real, unaligned size of the handle</param> + /// <param name="id">Handle ID</param> /// <param name="mapped">True if the region handle starts mapped</param> - internal RegionHandle(MemoryTracking tracking, ulong address, ulong size, ulong realAddress, ulong realSize, bool mapped = true) + internal RegionHandle(MemoryTracking tracking, ulong address, ulong size, ulong realAddress, ulong realSize, int id, bool mapped = true) { Bitmap = new ConcurrentBitmap(1, mapped); + Id = id; + Unmapped = !mapped; Address = address; diff --git a/Ryujinx.Memory/Tracking/SmartMultiRegionHandle.cs b/Ryujinx.Memory/Tracking/SmartMultiRegionHandle.cs index 47fe72e5b..4acddefaf 100644 --- a/Ryujinx.Memory/Tracking/SmartMultiRegionHandle.cs +++ b/Ryujinx.Memory/Tracking/SmartMultiRegionHandle.cs @@ -18,10 +18,11 @@ namespace Ryujinx.Memory.Tracking private readonly ulong _granularity; private readonly ulong _size; private MemoryTracking _tracking; + private readonly int _id; public bool Dirty { get; private set; } = true; - internal SmartMultiRegionHandle(MemoryTracking tracking, ulong address, ulong size, ulong granularity) + internal SmartMultiRegionHandle(MemoryTracking tracking, ulong address, ulong size, ulong granularity, int id) { // For this multi-region handle, the handle list starts empty. // As regions are queried, they are added to the _handles array at their start index. 
@@ -34,6 +35,7 @@ namespace Ryujinx.Memory.Tracking _address = address; _size = size; + _id = id; } public void SignalWrite() @@ -102,7 +104,7 @@ namespace Ryujinx.Memory.Tracking RegionSignal signal = handle.PreAction; handle.Dispose(); - RegionHandle splitLow = _tracking.BeginTracking(address, size); + RegionHandle splitLow = _tracking.BeginTracking(address, size, _id); splitLow.Parent = this; if (signal != null) { @@ -110,7 +112,7 @@ namespace Ryujinx.Memory.Tracking } _handles[handleIndex] = splitLow; - RegionHandle splitHigh = _tracking.BeginTracking(address + size, handle.Size - size); + RegionHandle splitHigh = _tracking.BeginTracking(address + size, handle.Size - size, _id); splitHigh.Parent = this; if (signal != null) { @@ -145,7 +147,7 @@ namespace Ryujinx.Memory.Tracking if (handle != null) { // Fill up to the found handle. - handle = _tracking.BeginTracking(startAddress, HandlesToBytes(i - startHandle)); + handle = _tracking.BeginTracking(startAddress, HandlesToBytes(i - startHandle), _id); handle.Parent = this; _handles[startHandle] = handle; return; @@ -153,7 +155,7 @@ namespace Ryujinx.Memory.Tracking } // Can fill the whole range. - _handles[startHandle] = _tracking.BeginTracking(startAddress, HandlesToBytes(1 + lastHandle - startHandle)); + _handles[startHandle] = _tracking.BeginTracking(startAddress, HandlesToBytes(1 + lastHandle - startHandle), _id); _handles[startHandle].Parent = this; } diff --git a/Ryujinx.Memory/Tracking/VirtualRegion.cs b/Ryujinx.Memory/Tracking/VirtualRegion.cs index 57a0344ac..9651426b3 100644 --- a/Ryujinx.Memory/Tracking/VirtualRegion.cs +++ b/Ryujinx.Memory/Tracking/VirtualRegion.cs @@ -19,19 +19,24 @@ namespace Ryujinx.Memory.Tracking _tracking = tracking; } - public override void Signal(ulong address, ulong size, bool write) + /// <inheritdoc/> + public override void Signal(ulong address, ulong size, bool write, int? 
exemptId) { IList<RegionHandle> handles = Handles; for (int i = 0; i < handles.Count; i++) { - handles[i].Signal(address, size, write, ref handles); + if (exemptId == null || handles[i].Id != exemptId.Value) + { + handles[i].Signal(address, size, write, ref handles); + } } UpdateProtection(); } - public override void SignalPrecise(ulong address, ulong size, bool write) + /// <inheritdoc/> + public override void SignalPrecise(ulong address, ulong size, bool write, int? exemptId) { IList<RegionHandle> handles = Handles; @@ -39,7 +44,10 @@ namespace Ryujinx.Memory.Tracking for (int i = 0; i < handles.Count; i++) { - allPrecise &= handles[i].SignalPrecise(address, size, write, ref handles); + if (exemptId == null || handles[i].Id != exemptId.Value) + { + allPrecise &= handles[i].SignalPrecise(address, size, write, ref handles); + } } // Only update protection if a regular signal handler was called. diff --git a/Ryujinx.Tests/Memory/MockMemoryManager.cs b/Ryujinx.Tests/Memory/MockMemoryManager.cs index 3f7692636..eeecf419f 100644 --- a/Ryujinx.Tests/Memory/MockMemoryManager.cs +++ b/Ryujinx.Tests/Memory/MockMemoryManager.cs @@ -40,7 +40,7 @@ namespace Ryujinx.Tests.Memory throw new NotImplementedException(); } - public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false) + public void SignalMemoryTracking(ulong va, ulong size, bool write, bool precise = false, int? exemptId = null) { throw new NotImplementedException(); } From 6bf460e1041b969a453dc40ee6fb83164739bf9c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 18 Feb 2023 02:35:02 +0100 Subject: [PATCH 15/41] nuget: bump System.IdentityModel.Tokens.Jwt from 6.26.1 to 6.27.0 (#4441) Bumps [System.IdentityModel.Tokens.Jwt](https://github.com/AzureAD/azure-activedirectory-identitymodel-extensions-for-dotnet) from 6.26.1 to 6.27.0. 
- [Release notes](https://github.com/AzureAD/azure-activedirectory-identitymodel-extensions-for-dotnet/releases) - [Changelog](https://github.com/AzureAD/azure-activedirectory-identitymodel-extensions-for-dotnet/blob/dev/CHANGELOG.md) - [Commits](https://github.com/AzureAD/azure-activedirectory-identitymodel-extensions-for-dotnet/commits) --- updated-dependencies: - dependency-name: System.IdentityModel.Tokens.Jwt dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Directory.Packages.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 1b0b906f8..6f5ed3d71 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -44,7 +44,7 @@ <PackageVersion Include="SixLabors.ImageSharp.Drawing" Version="1.0.0-beta11" /> <PackageVersion Include="SPB" Version="0.0.4-build28" /> <PackageVersion Include="System.Drawing.Common" Version="7.0.0" /> - <PackageVersion Include="System.IdentityModel.Tokens.Jwt" Version="6.26.1" /> + <PackageVersion Include="System.IdentityModel.Tokens.Jwt" Version="6.27.0" /> <PackageVersion Include="System.IO.FileSystem.Primitives" Version="4.3.0" /> <PackageVersion Include="System.Management" Version="7.0.0" /> <PackageVersion Include="System.Net.NameResolution" Version="4.3.0" /> From 7aa430f1a51fd793971992b4454540975222b848 Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Sun, 19 Feb 2023 22:37:37 -0300 Subject: [PATCH 16/41] Add support for advanced blend (part 1/2) (#2801) * Add blend microcode registers * Add advanced blend support using host extension * Remove debug message * Use pre-generated table for blend functions * XML docs * Rename AdvancedBlendMode to AdvancedBlendOp for consistency * Remove redundant code * Fix some advanced blend related issues on Vulkan * 
Formatting --- .../AdvancedBlendDescriptor.cs | 16 + Ryujinx.Graphics.GAL/AdvancedBlendOp.cs | 52 + Ryujinx.Graphics.GAL/AdvancedBlendOverlap.cs | 9 + Ryujinx.Graphics.GAL/Capabilities.cs | 3 + Ryujinx.Graphics.GAL/IPipeline.cs | 1 + .../Multithreading/CommandHelper.cs | 1 + .../Multithreading/CommandType.cs | 1 + .../Commands/SetBlendStateAdvancedCommand.cs | 18 + .../Multithreading/ThreadedPipeline.cs | 6 + .../Threed/Blender/AdvancedBlendFunctions.cs | 4226 +++++++++++++++++ .../Threed/Blender/AdvancedBlendManager.cs | 115 + .../Blender/AdvancedBlendPreGenTable.cs | 273 ++ .../Threed/Blender/AdvancedBlendUcode.cs | 126 + .../Engine/Threed/Blender/UcodeAssembler.cs | 305 ++ .../Engine/Threed/StateUpdater.cs | 28 +- .../Engine/Threed/ThreedClass.cs | 25 +- .../Engine/Threed/ThreedClassState.cs | 72 +- Ryujinx.Graphics.OpenGL/EnumConversion.cs | 120 + Ryujinx.Graphics.OpenGL/HwCapabilities.cs | 2 + Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs | 1 + Ryujinx.Graphics.OpenGL/Pipeline.cs | 19 + Ryujinx.Graphics.Vulkan/EnumConversion.cs | 65 + .../HardwareCapabilities.cs | 12 + Ryujinx.Graphics.Vulkan/PipelineBase.cs | 54 +- Ryujinx.Graphics.Vulkan/PipelineState.cs | 42 + .../VulkanInitialization.cs | 1 + Ryujinx.Graphics.Vulkan/VulkanRenderer.cs | 26 +- 27 files changed, 5605 insertions(+), 14 deletions(-) create mode 100644 Ryujinx.Graphics.GAL/AdvancedBlendDescriptor.cs create mode 100644 Ryujinx.Graphics.GAL/AdvancedBlendOp.cs create mode 100644 Ryujinx.Graphics.GAL/AdvancedBlendOverlap.cs create mode 100644 Ryujinx.Graphics.GAL/Multithreading/Commands/SetBlendStateAdvancedCommand.cs create mode 100644 Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendFunctions.cs create mode 100644 Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendManager.cs create mode 100644 Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendPreGenTable.cs create mode 100644 Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendUcode.cs create mode 100644 
Ryujinx.Graphics.Gpu/Engine/Threed/Blender/UcodeAssembler.cs diff --git a/Ryujinx.Graphics.GAL/AdvancedBlendDescriptor.cs b/Ryujinx.Graphics.GAL/AdvancedBlendDescriptor.cs new file mode 100644 index 000000000..1f1f7c3f1 --- /dev/null +++ b/Ryujinx.Graphics.GAL/AdvancedBlendDescriptor.cs @@ -0,0 +1,16 @@ +namespace Ryujinx.Graphics.GAL +{ + public struct AdvancedBlendDescriptor + { + public AdvancedBlendOp Op { get; } + public AdvancedBlendOverlap Overlap { get; } + public bool SrcPreMultiplied { get; } + + public AdvancedBlendDescriptor(AdvancedBlendOp op, AdvancedBlendOverlap overlap, bool srcPreMultiplied) + { + Op = op; + Overlap = overlap; + SrcPreMultiplied = srcPreMultiplied; + } + } +} diff --git a/Ryujinx.Graphics.GAL/AdvancedBlendOp.cs b/Ryujinx.Graphics.GAL/AdvancedBlendOp.cs new file mode 100644 index 000000000..4140bf497 --- /dev/null +++ b/Ryujinx.Graphics.GAL/AdvancedBlendOp.cs @@ -0,0 +1,52 @@ +namespace Ryujinx.Graphics.GAL +{ + public enum AdvancedBlendOp + { + Zero, + Src, + Dst, + SrcOver, + DstOver, + SrcIn, + DstIn, + SrcOut, + DstOut, + SrcAtop, + DstAtop, + Xor, + Plus, + PlusClamped, + PlusClampedAlpha, + PlusDarker, + Multiply, + Screen, + Overlay, + Darken, + Lighten, + ColorDodge, + ColorBurn, + HardLight, + SoftLight, + Difference, + Minus, + MinusClamped, + Exclusion, + Contrast, + Invert, + InvertRGB, + InvertOvg, + LinearDodge, + LinearBurn, + VividLight, + LinearLight, + PinLight, + HardMix, + Red, + Green, + Blue, + HslHue, + HslSaturation, + HslColor, + HslLuminosity + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.GAL/AdvancedBlendOverlap.cs b/Ryujinx.Graphics.GAL/AdvancedBlendOverlap.cs new file mode 100644 index 000000000..d4feb2b30 --- /dev/null +++ b/Ryujinx.Graphics.GAL/AdvancedBlendOverlap.cs @@ -0,0 +1,9 @@ +namespace Ryujinx.Graphics.GAL +{ + public enum AdvancedBlendOverlap + { + Uncorrelated, + Disjoint, + Conjoint + } +} diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs 
b/Ryujinx.Graphics.GAL/Capabilities.cs index 7a1f44b6b..a24139eba 100644 --- a/Ryujinx.Graphics.GAL/Capabilities.cs +++ b/Ryujinx.Graphics.GAL/Capabilities.cs @@ -23,6 +23,7 @@ namespace Ryujinx.Graphics.GAL public readonly bool SupportsR4G4B4A4Format; public readonly bool SupportsSnormBufferTextureFormat; public readonly bool Supports5BitComponentFormat; + public readonly bool SupportsBlendEquationAdvanced; public readonly bool SupportsFragmentShaderInterlock; public readonly bool SupportsFragmentShaderOrderingIntel; public readonly bool SupportsGeometryShaderPassthrough; @@ -64,6 +65,7 @@ namespace Ryujinx.Graphics.GAL bool supportsR4G4B4A4Format, bool supportsSnormBufferTextureFormat, bool supports5BitComponentFormat, + bool supportsBlendEquationAdvanced, bool supportsFragmentShaderInterlock, bool supportsFragmentShaderOrderingIntel, bool supportsGeometryShaderPassthrough, @@ -102,6 +104,7 @@ namespace Ryujinx.Graphics.GAL SupportsR4G4B4A4Format = supportsR4G4B4A4Format; SupportsSnormBufferTextureFormat = supportsSnormBufferTextureFormat; Supports5BitComponentFormat = supports5BitComponentFormat; + SupportsBlendEquationAdvanced = supportsBlendEquationAdvanced; SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock; SupportsFragmentShaderOrderingIntel = supportsFragmentShaderOrderingIntel; SupportsGeometryShaderPassthrough = supportsGeometryShaderPassthrough; diff --git a/Ryujinx.Graphics.GAL/IPipeline.cs b/Ryujinx.Graphics.GAL/IPipeline.cs index 26d019eb4..0a362081c 100644 --- a/Ryujinx.Graphics.GAL/IPipeline.cs +++ b/Ryujinx.Graphics.GAL/IPipeline.cs @@ -44,6 +44,7 @@ namespace Ryujinx.Graphics.GAL void SetAlphaTest(bool enable, float reference, CompareOp op); + void SetBlendState(AdvancedBlendDescriptor blend); void SetBlendState(int index, BlendDescriptor blend); void SetDepthBias(PolygonModeMask enables, float factor, float units, float clamp); diff --git a/Ryujinx.Graphics.GAL/Multithreading/CommandHelper.cs 
b/Ryujinx.Graphics.GAL/Multithreading/CommandHelper.cs index 48873491f..063b7edf9 100644 --- a/Ryujinx.Graphics.GAL/Multithreading/CommandHelper.cs +++ b/Ryujinx.Graphics.GAL/Multithreading/CommandHelper.cs @@ -98,6 +98,7 @@ namespace Ryujinx.Graphics.GAL.Multithreading Register<EndHostConditionalRenderingCommand>(CommandType.EndHostConditionalRendering); Register<EndTransformFeedbackCommand>(CommandType.EndTransformFeedback); Register<SetAlphaTestCommand>(CommandType.SetAlphaTest); + Register<SetBlendStateAdvancedCommand>(CommandType.SetBlendStateAdvanced); Register<SetBlendStateCommand>(CommandType.SetBlendState); Register<SetDepthBiasCommand>(CommandType.SetDepthBias); Register<SetDepthClampCommand>(CommandType.SetDepthClamp); diff --git a/Ryujinx.Graphics.GAL/Multithreading/CommandType.cs b/Ryujinx.Graphics.GAL/Multithreading/CommandType.cs index c199ff34c..61e729b44 100644 --- a/Ryujinx.Graphics.GAL/Multithreading/CommandType.cs +++ b/Ryujinx.Graphics.GAL/Multithreading/CommandType.cs @@ -60,6 +60,7 @@ EndHostConditionalRendering, EndTransformFeedback, SetAlphaTest, + SetBlendStateAdvanced, SetBlendState, SetDepthBias, SetDepthClamp, diff --git a/Ryujinx.Graphics.GAL/Multithreading/Commands/SetBlendStateAdvancedCommand.cs b/Ryujinx.Graphics.GAL/Multithreading/Commands/SetBlendStateAdvancedCommand.cs new file mode 100644 index 000000000..2ec10a503 --- /dev/null +++ b/Ryujinx.Graphics.GAL/Multithreading/Commands/SetBlendStateAdvancedCommand.cs @@ -0,0 +1,18 @@ +namespace Ryujinx.Graphics.GAL.Multithreading.Commands +{ + struct SetBlendStateAdvancedCommand : IGALCommand, IGALCommand<SetBlendStateAdvancedCommand> + { + public CommandType CommandType => CommandType.SetBlendStateAdvanced; + private AdvancedBlendDescriptor _blend; + + public void Set(AdvancedBlendDescriptor blend) + { + _blend = blend; + } + + public static void Run(ref SetBlendStateAdvancedCommand command, ThreadedRenderer threaded, IRenderer renderer) + { + 
renderer.Pipeline.SetBlendState(command._blend); + } + } +} diff --git a/Ryujinx.Graphics.GAL/Multithreading/ThreadedPipeline.cs b/Ryujinx.Graphics.GAL/Multithreading/ThreadedPipeline.cs index ba120867c..1bdc9cf48 100644 --- a/Ryujinx.Graphics.GAL/Multithreading/ThreadedPipeline.cs +++ b/Ryujinx.Graphics.GAL/Multithreading/ThreadedPipeline.cs @@ -131,6 +131,12 @@ namespace Ryujinx.Graphics.GAL.Multithreading _renderer.QueueCommand(); } + public void SetBlendState(AdvancedBlendDescriptor blend) + { + _renderer.New<SetBlendStateAdvancedCommand>().Set(blend); + _renderer.QueueCommand(); + } + public void SetBlendState(int index, BlendDescriptor blend) { _renderer.New<SetBlendStateCommand>().Set(index, blend); diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendFunctions.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendFunctions.cs new file mode 100644 index 000000000..a40b9cc47 --- /dev/null +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendFunctions.cs @@ -0,0 +1,4226 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using System.Globalization; +using System.Runtime.InteropServices; +using System.Text; + +namespace Ryujinx.Graphics.Gpu.Engine.Threed.Blender +{ + static class AdvancedBlendFunctions + { + public static readonly AdvancedBlendUcode[] Table = new AdvancedBlendUcode[] + { + new AdvancedBlendUcode(AdvancedBlendOp.PlusClamped, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedPlusClampedPremul), + new AdvancedBlendUcode(AdvancedBlendOp.PlusClampedAlpha, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedPlusClampedAlphaPremul), + new AdvancedBlendUcode(AdvancedBlendOp.PlusDarker, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedPlusDarkerPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedMultiplyPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedScreenPremul), + new 
AdvancedBlendUcode(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedOverlayPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedDarkenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedLightenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedColorDodgePremul), + new AdvancedBlendUcode(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedColorBurnPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedHardLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedSoftLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedDifferencePremul), + new AdvancedBlendUcode(AdvancedBlendOp.Minus, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedMinusPremul), + new AdvancedBlendUcode(AdvancedBlendOp.MinusClamped, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedMinusClampedPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedExclusionPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Contrast, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedContrastPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Invert, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedInvertPremul), + new AdvancedBlendUcode(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedInvertRGBPremul), + new AdvancedBlendUcode(AdvancedBlendOp.InvertOvg, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedInvertOvgPremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedLinearDodgePremul), + new 
AdvancedBlendUcode(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedLinearBurnPremul), + new AdvancedBlendUcode(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedVividLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedLinearLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedPinLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedHardMixPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Red, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedRedPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Green, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedGreenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Blue, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedBluePremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedHslHuePremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedHslSaturationPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedHslColorPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Uncorrelated, true, GenUncorrelatedHslLuminosityPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Src, AdvancedBlendOverlap.Disjoint, true, GenDisjointSrcPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Dst, AdvancedBlendOverlap.Disjoint, true, GenDisjointDstPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Disjoint, true, GenDisjointSrcOverPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Disjoint, true, GenDisjointDstOverPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcIn, 
AdvancedBlendOverlap.Disjoint, true, GenDisjointSrcInPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstIn, AdvancedBlendOverlap.Disjoint, true, GenDisjointDstInPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Disjoint, true, GenDisjointSrcOutPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstOut, AdvancedBlendOverlap.Disjoint, true, GenDisjointDstOutPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Disjoint, true, GenDisjointSrcAtopPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Disjoint, true, GenDisjointDstAtopPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Disjoint, true, GenDisjointXorPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Plus, AdvancedBlendOverlap.Disjoint, true, GenDisjointPlusPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Disjoint, true, GenDisjointMultiplyPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Disjoint, true, GenDisjointScreenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Disjoint, true, GenDisjointOverlayPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Disjoint, true, GenDisjointDarkenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Disjoint, true, GenDisjointLightenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Disjoint, true, GenDisjointColorDodgePremul), + new AdvancedBlendUcode(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Disjoint, true, GenDisjointColorBurnPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Disjoint, true, GenDisjointHardLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Disjoint, true, GenDisjointSoftLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Disjoint, true, GenDisjointDifferencePremul), 
+ new AdvancedBlendUcode(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Disjoint, true, GenDisjointExclusionPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Invert, AdvancedBlendOverlap.Disjoint, true, GenDisjointInvertPremul), + new AdvancedBlendUcode(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Disjoint, true, GenDisjointInvertRGBPremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Disjoint, true, GenDisjointLinearDodgePremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Disjoint, true, GenDisjointLinearBurnPremul), + new AdvancedBlendUcode(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Disjoint, true, GenDisjointVividLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Disjoint, true, GenDisjointLinearLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Disjoint, true, GenDisjointPinLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Disjoint, true, GenDisjointHardMixPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Disjoint, true, GenDisjointHslHuePremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Disjoint, true, GenDisjointHslSaturationPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Disjoint, true, GenDisjointHslColorPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Disjoint, true, GenDisjointHslLuminosityPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Src, AdvancedBlendOverlap.Conjoint, true, GenConjointSrcPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Dst, AdvancedBlendOverlap.Conjoint, true, GenConjointDstPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Conjoint, true, GenConjointSrcOverPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Conjoint, true, GenConjointDstOverPremul), + new 
AdvancedBlendUcode(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Conjoint, true, GenConjointSrcInPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstIn, AdvancedBlendOverlap.Conjoint, true, GenConjointDstInPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Conjoint, true, GenConjointSrcOutPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstOut, AdvancedBlendOverlap.Conjoint, true, GenConjointDstOutPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Conjoint, true, GenConjointSrcAtopPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Conjoint, true, GenConjointDstAtopPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Conjoint, true, GenConjointXorPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Conjoint, true, GenConjointMultiplyPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Conjoint, true, GenConjointScreenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Conjoint, true, GenConjointOverlayPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Conjoint, true, GenConjointDarkenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Conjoint, true, GenConjointLightenPremul), + new AdvancedBlendUcode(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Conjoint, true, GenConjointColorDodgePremul), + new AdvancedBlendUcode(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Conjoint, true, GenConjointColorBurnPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Conjoint, true, GenConjointHardLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Conjoint, true, GenConjointSoftLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Conjoint, true, GenConjointDifferencePremul), + new AdvancedBlendUcode(AdvancedBlendOp.Exclusion, 
AdvancedBlendOverlap.Conjoint, true, GenConjointExclusionPremul), + new AdvancedBlendUcode(AdvancedBlendOp.Invert, AdvancedBlendOverlap.Conjoint, true, GenConjointInvertPremul), + new AdvancedBlendUcode(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Conjoint, true, GenConjointInvertRGBPremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Conjoint, true, GenConjointLinearDodgePremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Conjoint, true, GenConjointLinearBurnPremul), + new AdvancedBlendUcode(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Conjoint, true, GenConjointVividLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Conjoint, true, GenConjointLinearLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Conjoint, true, GenConjointPinLightPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Conjoint, true, GenConjointHardMixPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Conjoint, true, GenConjointHslHuePremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Conjoint, true, GenConjointHslSaturationPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Conjoint, true, GenConjointHslColorPremul), + new AdvancedBlendUcode(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Conjoint, true, GenConjointHslLuminosityPremul), + new AdvancedBlendUcode(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedDstOver), + new AdvancedBlendUcode(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedSrcIn), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedSrcOut), + new AdvancedBlendUcode(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedSrcAtop), + new 
AdvancedBlendUcode(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedDstAtop), + new AdvancedBlendUcode(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedXor), + new AdvancedBlendUcode(AdvancedBlendOp.PlusClamped, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedPlusClamped), + new AdvancedBlendUcode(AdvancedBlendOp.PlusClampedAlpha, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedPlusClampedAlpha), + new AdvancedBlendUcode(AdvancedBlendOp.PlusDarker, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedPlusDarker), + new AdvancedBlendUcode(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedMultiply), + new AdvancedBlendUcode(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedScreen), + new AdvancedBlendUcode(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedOverlay), + new AdvancedBlendUcode(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedDarken), + new AdvancedBlendUcode(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedLighten), + new AdvancedBlendUcode(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedColorDodge), + new AdvancedBlendUcode(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedColorBurn), + new AdvancedBlendUcode(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedHardLight), + new AdvancedBlendUcode(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedSoftLight), + new AdvancedBlendUcode(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedDifference), + new AdvancedBlendUcode(AdvancedBlendOp.Minus, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedMinus), + new AdvancedBlendUcode(AdvancedBlendOp.MinusClamped, AdvancedBlendOverlap.Uncorrelated, 
false, GenUncorrelatedMinusClamped), + new AdvancedBlendUcode(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedExclusion), + new AdvancedBlendUcode(AdvancedBlendOp.Contrast, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedContrast), + new AdvancedBlendUcode(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedInvertRGB), + new AdvancedBlendUcode(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedLinearDodge), + new AdvancedBlendUcode(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedLinearBurn), + new AdvancedBlendUcode(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedVividLight), + new AdvancedBlendUcode(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedLinearLight), + new AdvancedBlendUcode(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedPinLight), + new AdvancedBlendUcode(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedHardMix), + new AdvancedBlendUcode(AdvancedBlendOp.Red, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedRed), + new AdvancedBlendUcode(AdvancedBlendOp.Green, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedGreen), + new AdvancedBlendUcode(AdvancedBlendOp.Blue, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedBlue), + new AdvancedBlendUcode(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedHslHue), + new AdvancedBlendUcode(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedHslSaturation), + new AdvancedBlendUcode(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedHslColor), + new AdvancedBlendUcode(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Uncorrelated, false, GenUncorrelatedHslLuminosity), + new 
AdvancedBlendUcode(AdvancedBlendOp.Src, AdvancedBlendOverlap.Disjoint, false, GenDisjointSrc), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Disjoint, false, GenDisjointSrcOver), + new AdvancedBlendUcode(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Disjoint, false, GenDisjointDstOver), + new AdvancedBlendUcode(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Disjoint, false, GenDisjointSrcIn), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Disjoint, false, GenDisjointSrcOut), + new AdvancedBlendUcode(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Disjoint, false, GenDisjointSrcAtop), + new AdvancedBlendUcode(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Disjoint, false, GenDisjointDstAtop), + new AdvancedBlendUcode(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Disjoint, false, GenDisjointXor), + new AdvancedBlendUcode(AdvancedBlendOp.Plus, AdvancedBlendOverlap.Disjoint, false, GenDisjointPlus), + new AdvancedBlendUcode(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Disjoint, false, GenDisjointMultiply), + new AdvancedBlendUcode(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Disjoint, false, GenDisjointScreen), + new AdvancedBlendUcode(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Disjoint, false, GenDisjointOverlay), + new AdvancedBlendUcode(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Disjoint, false, GenDisjointDarken), + new AdvancedBlendUcode(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Disjoint, false, GenDisjointLighten), + new AdvancedBlendUcode(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Disjoint, false, GenDisjointColorDodge), + new AdvancedBlendUcode(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Disjoint, false, GenDisjointColorBurn), + new AdvancedBlendUcode(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Disjoint, false, GenDisjointHardLight), + new AdvancedBlendUcode(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Disjoint, false, GenDisjointSoftLight), + new AdvancedBlendUcode(AdvancedBlendOp.Difference, 
AdvancedBlendOverlap.Disjoint, false, GenDisjointDifference), + new AdvancedBlendUcode(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Disjoint, false, GenDisjointExclusion), + new AdvancedBlendUcode(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Disjoint, false, GenDisjointInvertRGB), + new AdvancedBlendUcode(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Disjoint, false, GenDisjointLinearDodge), + new AdvancedBlendUcode(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Disjoint, false, GenDisjointLinearBurn), + new AdvancedBlendUcode(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Disjoint, false, GenDisjointVividLight), + new AdvancedBlendUcode(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Disjoint, false, GenDisjointLinearLight), + new AdvancedBlendUcode(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Disjoint, false, GenDisjointPinLight), + new AdvancedBlendUcode(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Disjoint, false, GenDisjointHardMix), + new AdvancedBlendUcode(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Disjoint, false, GenDisjointHslHue), + new AdvancedBlendUcode(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Disjoint, false, GenDisjointHslSaturation), + new AdvancedBlendUcode(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Disjoint, false, GenDisjointHslColor), + new AdvancedBlendUcode(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Disjoint, false, GenDisjointHslLuminosity), + new AdvancedBlendUcode(AdvancedBlendOp.Src, AdvancedBlendOverlap.Conjoint, false, GenConjointSrc), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Conjoint, false, GenConjointSrcOver), + new AdvancedBlendUcode(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Conjoint, false, GenConjointDstOver), + new AdvancedBlendUcode(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Conjoint, false, GenConjointSrcIn), + new AdvancedBlendUcode(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Conjoint, false, GenConjointSrcOut), + new 
AdvancedBlendUcode(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Conjoint, false, GenConjointSrcAtop), + new AdvancedBlendUcode(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Conjoint, false, GenConjointDstAtop), + new AdvancedBlendUcode(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Conjoint, false, GenConjointXor), + new AdvancedBlendUcode(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Conjoint, false, GenConjointMultiply), + new AdvancedBlendUcode(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Conjoint, false, GenConjointScreen), + new AdvancedBlendUcode(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Conjoint, false, GenConjointOverlay), + new AdvancedBlendUcode(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Conjoint, false, GenConjointDarken), + new AdvancedBlendUcode(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Conjoint, false, GenConjointLighten), + new AdvancedBlendUcode(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Conjoint, false, GenConjointColorDodge), + new AdvancedBlendUcode(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Conjoint, false, GenConjointColorBurn), + new AdvancedBlendUcode(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Conjoint, false, GenConjointHardLight), + new AdvancedBlendUcode(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Conjoint, false, GenConjointSoftLight), + new AdvancedBlendUcode(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Conjoint, false, GenConjointDifference), + new AdvancedBlendUcode(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Conjoint, false, GenConjointExclusion), + new AdvancedBlendUcode(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Conjoint, false, GenConjointInvertRGB), + new AdvancedBlendUcode(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Conjoint, false, GenConjointLinearDodge), + new AdvancedBlendUcode(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Conjoint, false, GenConjointLinearBurn), + new AdvancedBlendUcode(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Conjoint, false, GenConjointVividLight), + 
new AdvancedBlendUcode(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Conjoint, false, GenConjointLinearLight), + new AdvancedBlendUcode(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Conjoint, false, GenConjointPinLight), + new AdvancedBlendUcode(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Conjoint, false, GenConjointHardMix), + new AdvancedBlendUcode(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Conjoint, false, GenConjointHslHue), + new AdvancedBlendUcode(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Conjoint, false, GenConjointHslSaturation), + new AdvancedBlendUcode(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Conjoint, false, GenConjointHslColor), + new AdvancedBlendUcode(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Conjoint, false, GenConjointHslLuminosity) + }; + + public static string GenTable() + { + // This can be used to generate the table on AdvancedBlendPreGenTable. + + StringBuilder sb = new StringBuilder(); + + sb.AppendLine($"private static Dictionary<Hash128, AdvancedBlendEntry> _entries = new()"); + sb.AppendLine("{"); + + foreach (var entry in Table) + { + Hash128 hash = XXHash128.ComputeHash(MemoryMarshal.Cast<uint, byte>(entry.Code)); + + string[] constants = new string[entry.Constants != null ? entry.Constants.Length : 0]; + + for (int i = 0; i < constants.Length; i++) + { + RgbFloat rgb = entry.Constants[i]; + + constants[i] = string.Format(CultureInfo.InvariantCulture, "new " + nameof(RgbFloat) + "({0}f, {1}f, {2}f)", rgb.R, rgb.G, rgb.B); + } + + string constantList = constants.Length > 0 ? 
$"new[] {{ {string.Join(", ", constants)} }}" : $"Array.Empty<{nameof(RgbFloat)}>()"; + + static string EnumValue(string name, object value) + { + if (value.ToString() == "0") + { + return "0"; + } + + return $"{name}.{value}"; + } + + string alpha = $"new {nameof(FixedFunctionAlpha)}({EnumValue(nameof(BlendUcodeEnable), entry.Alpha.Enable)}, {EnumValue(nameof(BlendOp), entry.Alpha.AlphaOp)}, {EnumValue(nameof(BlendFactor), entry.Alpha.AlphaSrcFactor)}, {EnumValue(nameof(BlendFactor), entry.Alpha.AlphaDstFactor)})"; + + sb.AppendLine($" {{ new Hash128(0x{hash.Low:X16}, 0x{hash.High:X16}), new AdvancedBlendEntry({nameof(AdvancedBlendOp)}.{entry.Op}, {nameof(AdvancedBlendOverlap)}.{entry.Overlap}, {(entry.SrcPreMultiplied ? "true" : "false")}, {constantList}, {alpha}) }},"); + } + + sb.AppendLine("};"); + + return sb.ToString(); + } + + private static FixedFunctionAlpha GenUncorrelatedPlusClampedPremul(ref UcodeAssembler asm) + { + asm.Add(CC.T, Dest.PBR, OpBD.DstRGB, OpBD.SrcRGB); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedPlusClampedAlphaPremul(ref UcodeAssembler asm) + { + asm.Add(CC.T, Dest.Temp0, OpBD.DstRGB, OpBD.SrcRGB); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedPlusDarkerPremul(ref UcodeAssembler asm) + { + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.T, Dest.PBR, 
OpBD.PBR, OpBD.SrcRGB); + asm.Add(CC.T, Dest.PBR, OpBD.PBR, OpBD.DstRGB); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.SrcAAA); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.DstAAA); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedMultiplyPremul(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedScreenPremul(ref UcodeAssembler asm) + { + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedOverlayPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp2, OpBD.Temp1, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + 
asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedDarkenPremul(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.SrcAAA); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedLightenPremul(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.SrcAAA); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedColorDodgePremul(ref UcodeAssembler asm) + { + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.SrcRGB); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.SrcAAA); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.DstRGB); + asm.Min(CC.GT, Dest.PBR, OpAC.DstAAA, OpBD.PBR); + asm.Mul(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.SrcAAA); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.DstAAA); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.DstRGB, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mmadd(CC.T, Dest.PBR, 
OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedColorBurnPremul(ref UcodeAssembler asm) + { + asm.Mmsub(CC.T, Dest.Temp0, OpAC.DstAAA, OpBD.SrcAAA, OpAC.SrcAAA, OpBD.DstRGB); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcRGB); + asm.Mul(CC.T, Dest.PBR, OpAC.Temp0, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcAAA, OpBD.DstAAA, OpAC.SrcAAA, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.DstAAA, OpBD.DstRGB); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHardLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.Temp2, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp2, OpBD.Temp1, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, 
Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedSoftLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(4, 0.25f, 0.25f, 0.25f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(0, 0.2605f, 0.2605f, 0.2605f); + asm.Mul(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(1, -0.7817f, -0.7817f, -0.7817f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(2, 0.3022f, 0.3022f, 0.3022f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(3, 0.2192f, 0.2192f, 0.2192f); + asm.Add(CC.GT, Dest.Temp0, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(5, 16f, 16f, 16f); + asm.Mul(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(6, 12f, 12f, 12f); + asm.Mmsub(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(7, 3f, 3f, 3f); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mmsub(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.ConstantOne, OpAC.Temp1, OpBD.Temp1); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, 
BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedDifferencePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.Temp0, OpBD.Temp2, OpBD.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedMinusPremul(ref UcodeAssembler asm) + { + asm.Sub(CC.T, Dest.Temp0, OpBD.DstRGB, OpBD.SrcRGB); + return new FixedFunctionAlpha(BlendOp.ReverseSubtractGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedMinusClampedPremul(ref UcodeAssembler asm) + { + asm.Sub(CC.T, Dest.PBR, OpBD.DstRGB, OpBD.SrcRGB); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantZero); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedExclusionPremul(ref UcodeAssembler asm) + { + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.DstRGB); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); 
+ } + + private static FixedFunctionAlpha GenUncorrelatedContrastPremul(ref UcodeAssembler asm) + { + asm.SetConstant(0, 2f, 2f, 2f); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.ConstantRGB, OpAC.DstAAA, OpBD.ConstantOne); + asm.Mmsub(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.SrcAAA, OpBD.ConstantOne); + asm.Mul(CC.T, Dest.PBR, OpAC.Temp0, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.PBR, OpBD.DstAAA); + asm.SetConstant(1, 0.5f, 0.5f, 0.5f); + asm.Mul(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantRGB); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedInvertPremul(ref UcodeAssembler asm) + { + asm.Mmsub(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA, OpAC.SrcAAA, OpBD.DstRGB); + asm.Madd(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.OneMinusSrcAAA, OpAC.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedInvertRGBPremul(ref UcodeAssembler asm) + { + asm.Mmsub(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.DstAAA, OpAC.SrcRGB, OpBD.DstRGB); + asm.Madd(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.OneMinusSrcAAA, OpAC.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedInvertOvgPremul(ref UcodeAssembler asm) + { + asm.Sub(CC.T, Dest.PBR, OpBD.ConstantOne, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.Temp0, OpAC.SrcAAA, OpBD.PBR, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedLinearDodgePremul(ref UcodeAssembler asm) + { + asm.Mmadd(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, 
OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedLinearBurnPremul(ref UcodeAssembler asm) + { + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Mmadd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedVividLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantRGB); + asm.Sub(CC.GE, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Add(CC.GE, Dest.PBR, OpBD.PBR, OpBD.PBR); + asm.Rcp(CC.GE, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GE, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GE, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Rcp(CC.LT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.LT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantOne); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + 
asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedLinearLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Madd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedPinLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.Temp0, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Add(CC.LE, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Min(CC.LE, Dest.Temp0, OpAC.PBR, OpBD.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, 
BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHardMixPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mul(CC.LT, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedRedPremul(ref UcodeAssembler asm) + { + asm.Mov(CC.T, Dest.Temp0, OpBD.DstRGB); + asm.Mov(CC.T, Dest.Temp0.R, OpBD.SrcRGB); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedGreenPremul(ref UcodeAssembler asm) + { + asm.Mov(CC.T, Dest.Temp0, OpBD.DstRGB); + asm.Mov(CC.T, Dest.Temp0.G, OpBD.SrcRGB); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedBluePremul(ref UcodeAssembler asm) + { + asm.Mov(CC.T, Dest.Temp0, OpBD.DstRGB); + asm.Mov(CC.T, Dest.Temp0.B, OpBD.SrcRGB); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHslHuePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, 
Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp2, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp2.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp2); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, 
OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHslSaturationPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.PBR); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp1, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.Temp1.GBR, 
OpAC.PBR, OpBD.Temp2); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp1); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, 
OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHslColorPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp2, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp2, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + 
asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHslLuminosityPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp2, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp2.BBB, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Add(CC.T, Dest.Temp1, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp1, OpBD.Temp2); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.LT, Dest.PBR, 
OpBD.Temp2, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp1, OpBD.Temp2, OpAC.Temp2, OpBD.Temp2); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp2); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.SrcRGB, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenDisjointSrcPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenDisjointDstPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.Temp1, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenDisjointSrcOverPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, 
OpBD.Temp2); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointDstOverPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp1); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointSrcInPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Sub(CC.T, Dest.Temp1.RToA, OpBD.DstAAA, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointDstInPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, 
OpAC.Temp1, OpBD.DstAAA, OpAC.Temp1, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Sub(CC.T, Dest.Temp1.RToA, OpBD.DstAAA, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointSrcOutPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointDstOutPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointSrcAtopPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenDisjointDstAtopPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + 
asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.Temp1, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenDisjointXorPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + asm.Min(CC.T, Dest.Temp1, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Add(CC.T, Dest.Temp1.RToA, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointPlusPremul(ref UcodeAssembler asm) + { + asm.Add(CC.T, Dest.Temp0, OpBD.DstRGB, OpBD.SrcRGB); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenDisjointMultiplyPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + 
asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointScreenPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointOverlayPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp2, OpBD.Temp1, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, 
OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointDarkenPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointLightenPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, 
Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointColorDodgePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.ConstantOne, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.Temp0); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp1, OpBD.ConstantZero); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantZero); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointColorBurnPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.Temp2, OpBD.ConstantZero); + asm.Rcp(CC.GT, Dest.PBR, OpAC.Temp2); + asm.Mmsub(CC.GT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.ConstantOne, OpBD.Temp1); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne); + asm.Min(CC.T, 
Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHardLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.Temp2, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp2, OpBD.Temp1, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointSoftLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, 
OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(4, 0.25f, 0.25f, 0.25f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(0, 0.2605f, 0.2605f, 0.2605f); + asm.Mul(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(1, -0.7817f, -0.7817f, -0.7817f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(2, 0.3022f, 0.3022f, 0.3022f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(3, 0.2192f, 0.2192f, 0.2192f); + asm.Add(CC.GT, Dest.Temp0, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(5, 16f, 16f, 16f); + asm.Mul(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(6, 12f, 12f, 12f); + asm.Mmsub(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(7, 3f, 3f, 3f); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mmsub(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.ConstantOne, OpAC.Temp1, OpBD.Temp1); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointDifferencePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, 
OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.Temp0, OpBD.Temp2, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointExclusionPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.Temp1); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointInvertPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Min(CC.T, 
Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp0, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenDisjointInvertRGBPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.ConstantOne, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp0, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenDisjointLinearDodgePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha 
GenDisjointLinearBurnPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointVividLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantRGB); + asm.Sub(CC.GE, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Add(CC.GE, Dest.PBR, OpBD.PBR, OpBD.PBR); + asm.Rcp(CC.GE, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GE, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GE, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Rcp(CC.LT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.LT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, 
OpBD.Temp2, OpBD.ConstantOne); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointLinearLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Madd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointPinLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); 
+ asm.Sub(CC.T, Dest.Temp0, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Add(CC.LE, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Min(CC.LE, Dest.Temp0, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHardMixPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mul(CC.LT, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslHuePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + 
asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp2, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp2.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp2); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + 
asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslSaturationPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.PBR); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, 
OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp1, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.Temp1.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp1); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, 
OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslColorPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp2, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp2, 
OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslLuminosityPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 
0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp2, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp2.BBB, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Add(CC.T, Dest.Temp1, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp1, OpBD.Temp2); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp1, OpBD.Temp2, OpAC.Temp2, OpBD.Temp2); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp2); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.Temp2, 
OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenConjointSrcPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenConjointDstPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointSrcOverPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp2, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private 
static FixedFunctionAlpha GenConjointDstOverPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointSrcInPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MinimumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointDstInPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MinimumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointSrcOutPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantZero); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static 
FixedFunctionAlpha GenConjointDstOutPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantZero); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenConjointSrcAtopPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointDstAtopPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenConjointXorPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + 
asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + asm.Sub(CC.T, Dest.Temp1.CC, OpBD.DstAAA, OpBD.SrcAAA); + asm.Sub(CC.LT, Dest.Temp1, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mov(CC.T, Dest.Temp1.RToA, OpBD.Temp1); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenConjointMultiplyPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointScreenPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, 
OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointOverlayPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp2, OpBD.Temp1, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointDarkenPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLightenPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, 
Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointColorDodgePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.ConstantOne, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.Temp0); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp1, OpBD.ConstantZero); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointColorBurnPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.Temp2, OpBD.ConstantZero); + asm.Rcp(CC.GT, Dest.PBR, OpAC.Temp2); + asm.Mmsub(CC.GT, Dest.PBR, 
OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.ConstantOne, OpBD.Temp1); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHardLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.Temp2, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp2, OpBD.Temp1, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointSoftLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + 
asm.SetConstant(4, 0.25f, 0.25f, 0.25f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(0, 0.2605f, 0.2605f, 0.2605f); + asm.Mul(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(1, -0.7817f, -0.7817f, -0.7817f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(2, 0.3022f, 0.3022f, 0.3022f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(3, 0.2192f, 0.2192f, 0.2192f); + asm.Add(CC.GT, Dest.Temp0, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(5, 16f, 16f, 16f); + asm.Mul(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(6, 12f, 12f, 12f); + asm.Mmsub(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(7, 3f, 3f, 3f); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mmsub(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.ConstantOne, OpAC.Temp1, OpBD.Temp1); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointDifferencePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.Temp0, 
OpBD.Temp2, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointExclusionPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.Temp1); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointInvertPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointInvertRGBPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, 
Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.ConstantOne, OpAC.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLinearDodgePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLinearBurnPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, 
OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointVividLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantRGB); + asm.Sub(CC.GE, Dest.PBR, OpBD.ConstantOne, OpBD.Temp2); + asm.Add(CC.GE, Dest.PBR, OpBD.PBR, OpBD.PBR); + asm.Rcp(CC.GE, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GE, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GE, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Rcp(CC.LT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.LT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantOne); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLinearLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Madd(CC.T, Dest.PBR, OpAC.Temp2, 
OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointPinLightPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Sub(CC.T, Dest.Temp0, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Add(CC.LE, Dest.PBR, OpBD.Temp2, OpBD.Temp2); + asm.Min(CC.LE, Dest.Temp0, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHardMixPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mul(CC.LT, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mov(CC.GE, Dest.Temp0, 
OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslHuePremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp2, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp2.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp2); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + 
asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslSaturationPremul(ref UcodeAssembler asm) + { + 
asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.PBR); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp1, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.GT, Dest.Temp1.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp1); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, 
Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslColorPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp2, 
OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp2, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslLuminosityPremul(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, 
OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp2, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp2.BBB, OpAC.Temp2, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Add(CC.T, Dest.Temp1, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp1, OpBD.Temp2); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp1, OpBD.Temp2, OpAC.Temp2, OpBD.Temp2); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp2); + asm.Rcp(CC.T, Dest.PBR, OpAC.SrcAAA); + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.PBR); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp2, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, 
OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedDstOver(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedSrcIn(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.PBR, OpBD.DstAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.DstAlphaGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenUncorrelatedSrcOut(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.PBR, OpBD.OneMinusDstAAA); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneMinusDstAlphaGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenUncorrelatedSrcAtop(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.PBR, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.OneMinusSrcAAA, OpAC.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenUncorrelatedDstAtop(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.SrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + private static FixedFunctionAlpha GenUncorrelatedXor(ref 
UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.PBR, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.OneMinusSrcAAA, OpAC.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneMinusDstAlphaGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedPlusClamped(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Add(CC.T, Dest.PBR, OpBD.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedPlusClampedAlpha(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedPlusDarker(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.T, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Add(CC.T, Dest.PBR, OpBD.PBR, OpBD.DstRGB); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.SrcAAA); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.DstAAA); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, 
OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenUncorrelatedMultiply(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.PBR, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedScreen(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.PBR, OpAC.PBR, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedOverlay(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.SrcRGB, OpBD.Temp1, OpAC.SrcRGB, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, 
BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedDarken(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.PBR, OpBD.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.SrcAAA); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedLighten(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.PBR, OpBD.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.SrcAAA); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedColorDodge(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.PBR); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.SrcAAA); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.DstRGB); + asm.Min(CC.GT, Dest.PBR, OpAC.DstAAA, OpBD.PBR); + asm.Mul(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.SrcAAA); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.DstAAA); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.DstRGB, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, 
BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedColorBurn(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.DstAAA, OpBD.SrcAAA, OpAC.SrcAAA, OpBD.DstRGB); + asm.Rcp(CC.T, Dest.PBR, OpAC.Temp2); + asm.Mul(CC.T, Dest.PBR, OpAC.Temp0, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcAAA, OpBD.DstAAA, OpAC.SrcAAA, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp2, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.DstAAA, OpBD.DstRGB); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedHardLight(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.SrcRGB, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.SrcRGB, OpBD.Temp1, OpAC.SrcRGB, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha 
GenUncorrelatedSoftLight(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(4, 0.25f, 0.25f, 0.25f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(0, 0.2605f, 0.2605f, 0.2605f); + asm.Mul(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(1, -0.7817f, -0.7817f, -0.7817f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(2, 0.3022f, 0.3022f, 0.3022f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(3, 0.2192f, 0.2192f, 0.2192f); + asm.Add(CC.GT, Dest.Temp0, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(5, 16f, 16f, 16f); + asm.Mul(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(6, 12f, 12f, 12f); + asm.Mmsub(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(7, 3f, 3f, 3f); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mmsub(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.ConstantOne, OpAC.Temp1, OpBD.Temp1); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + private static FixedFunctionAlpha GenUncorrelatedDifference(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + 
asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.SrcRGB); + asm.Sub(CC.LT, Dest.Temp0, OpBD.SrcRGB, OpBD.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedMinus: issues a two-instruction sequence on asm (last write: Dest.Temp0) and returns alpha state ReverseSubtractGl, OneGl, OneGl. */ private static FixedFunctionAlpha GenUncorrelatedMinus(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Sub(CC.T, Dest.Temp0, OpBD.DstRGB, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.ReverseSubtractGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + /* GenUncorrelatedMinusClamped: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenUncorrelatedMinusClamped(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Sub(CC.T, Dest.PBR, OpBD.DstRGB, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + /* NOTE(review): Dest.Temp1.RToA presumably carries the alpha result since fixed-function alpha is disabled below - confirm */ asm.Max(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantZero); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenUncorrelatedExclusion: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedExclusion(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.PBR, OpAC.PBR, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.DstRGB); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.Temp2, OpBD.DstRGB); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedContrast: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, ZeroGl, OneGl. */ private static FixedFunctionAlpha GenUncorrelatedContrast(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, 
OpBD.SrcAAA); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.ConstantRGB, OpAC.DstAAA, OpBD.ConstantOne); + asm.Mmsub(CC.T, Dest.PBR, OpAC.Temp2, OpBD.ConstantRGB, OpAC.SrcAAA, OpBD.ConstantOne); + asm.Mul(CC.T, Dest.PBR, OpAC.Temp0, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.PBR, OpBD.DstAAA); + /* second constant (slot 1 = 0.5) is set only now, right before the final multiply that reads OpBD.ConstantRGB */ asm.SetConstant(1, 0.5f, 0.5f, 0.5f); + asm.Mul(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantRGB); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + /* GenUncorrelatedInvertRGB: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, ZeroGl, OneGl. */ private static FixedFunctionAlpha GenUncorrelatedInvertRGB(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.DstAAA, OpAC.PBR, OpBD.DstRGB); + asm.Madd(CC.T, Dest.Temp0, OpAC.DstRGB, OpBD.OneMinusSrcAAA, OpAC.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + /* GenUncorrelatedLinearDodge: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedLinearDodge(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedLinearBurn: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedLinearBurn(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.PBR, OpAC.PBR, OpBD.DstAAA, OpAC.DstRGB, OpBD.SrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcAAA, OpBD.DstAAA); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Mmadd(CC.T, Dest.PBR, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.Temp0, OpBD.PBR); + 
return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedVividLight: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedVividLight(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + /* SrcRGB minus the 0.5 constant with a .CC destination; the CC.GE and CC.LT groups below are presumably the two branches selected by this comparison - confirm */ asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantRGB); + asm.Sub(CC.GE, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Add(CC.GE, Dest.PBR, OpBD.PBR, OpBD.PBR); + asm.Rcp(CC.GE, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GE, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GE, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.LT, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Rcp(CC.LT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.LT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantOne); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedLinearLight: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedLinearLight(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Madd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, 
Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedPinLight: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedPinLight(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.Temp0, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + /* SrcRGB + SrcRGB is recomputed here because PBR was overwritten by the .CC subtraction above */ asm.Add(CC.LE, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Min(CC.LE, Dest.Temp0, OpAC.PBR, OpBD.Temp1); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedHardMix: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedHardMix(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mul(CC.LT, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.Temp2, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, 
BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedRed: copies DstRGB into Dest.Temp0, then overwrites only Dest.Temp0.R from Temp2 (SrcRGB * SrcAAA); returns alpha state AddGl, ZeroGl, OneGl. */ private static FixedFunctionAlpha GenUncorrelatedRed(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.DstRGB); + asm.Mov(CC.T, Dest.Temp0.R, OpBD.Temp2); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + /* GenUncorrelatedGreen: copies DstRGB into Dest.Temp0, then overwrites only Dest.Temp0.G from Temp2 (SrcRGB * SrcAAA); returns alpha state AddGl, ZeroGl, OneGl. */ private static FixedFunctionAlpha GenUncorrelatedGreen(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.DstRGB); + asm.Mov(CC.T, Dest.Temp0.G, OpBD.Temp2); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + /* GenUncorrelatedBlue: copies DstRGB into Dest.Temp0, then overwrites only Dest.Temp0.B from Temp2 (SrcRGB * SrcAAA); returns alpha state AddGl, ZeroGl, OneGl. */ private static FixedFunctionAlpha GenUncorrelatedBlue(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.Temp2, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.DstRGB); + asm.Mov(CC.T, Dest.Temp0.B, OpBD.Temp2); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + /* GenUncorrelatedHslHue: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedHslHue(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* NOTE(review): the Mov + two Min (or two Max) steps with a .GBR destination presumably rotate channels to reduce RGB to its per-pixel minimum (maximum) - confirm swizzle semantics */ asm.Mov(CC.T, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.SrcRGB, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp2.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.GT, Dest.PBR.GBR, 
OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp2); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + /* Temp1 is rebuilt as DstRGB * (1 / DstAAA) because it may have been reused by the CC.GT instructions above */ asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* per-channel weights 0.3 / 0.59 / 0.11 - presumably luminance coefficients, accumulated through the RRR, GGG, BBB destinations below; confirm */ asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + /* the CC.GT group runs when the value just reduced with Max exceeds ConstantOne; the later CC.LT group follows a Min reduction written with .CC - presumably the two clamping cases, confirm */ asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.PBR, OpBD.OneMinusDstAAA, 
OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedHslSaturation: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedHslSaturation(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* NOTE(review): this Mov reads OpBD.PBR, whereas the later Mov/Min/Max groups in this method read Temp1 or SrcRGB - verify against the reference ucode that PBR is the intended source here */ asm.Mov(CC.T, Dest.PBR.GBR, OpBD.PBR); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp1, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.Temp1.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp1); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + 
asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.PBR, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedHslColor: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedHslColor(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* per-channel weights 0.3 / 0.59 / 0.11 - presumably luminance coefficients; confirm */ asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.SrcRGB, 
OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.SrcRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.PBR, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenUncorrelatedHslLuminosity: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneMinusSrcAlphaGl. */ private static FixedFunctionAlpha GenUncorrelatedHslLuminosity(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* per-channel weights 0.3 / 0.59 / 0.11 - presumably luminance coefficients; confirm */ asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.SrcRGB, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, 
Dest.Temp2.BBB, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Add(CC.T, Dest.Temp1, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp1, OpBD.Temp2); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp1, OpBD.Temp2, OpAC.Temp2, OpBD.Temp2); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp2); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Mmadd(CC.T, Dest.Temp1, OpAC.PBR, OpBD.OneMinusDstAAA, OpAC.DstRGB, OpBD.OneMinusSrcAAA); + asm.Mul(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.DstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl); + } + + /* GenDisjointSrc: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, ZeroGl. */ private static FixedFunctionAlpha GenDisjointSrc(ref UcodeAssembler asm) + { + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + 
asm.Madd(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + /* GenDisjointSrcOver: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointSrcOver(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.SrcRGB); + asm.Madd(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + /* min(SrcAAA + DstAAA, 1) goes to Dest.Temp1.RToA - presumably the output alpha, since fixed-function alpha is disabled below; confirm */ asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointDstOver: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointDstOver(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp1); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointSrcIn: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointSrcIn(ref UcodeAssembler asm) + { + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Sub(CC.T, Dest.Temp1.RToA, OpBD.DstAAA, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointSrcOut: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static 
FixedFunctionAlpha GenDisjointSrcOut(ref UcodeAssembler asm) + { + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointSrcAtop: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, ZeroGl, OneGl. */ private static FixedFunctionAlpha GenDisjointSrcAtop(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + /* GenDisjointDstAtop: issues this blend's instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, ZeroGl. */ private static FixedFunctionAlpha GenDisjointDstAtop(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.Temp1, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); + } + + /* GenDisjointXor: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointXor(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + /* Temp1 is reused from here on: the two Min results are summed into Dest.Temp1.RToA */ asm.Min(CC.T, Dest.Temp1, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Add(CC.T, Dest.Temp1.RToA, OpBD.Temp1, OpBD.PBR); + 
asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointPlus: issues a two-instruction sequence on asm (last write: Dest.Temp0) and returns alpha state AddGl, OneGl, OneGl. */ private static FixedFunctionAlpha GenDisjointPlus(ref UcodeAssembler asm) + { + asm.Mul(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.SrcAAA); + asm.Add(CC.T, Dest.Temp0, OpBD.DstRGB, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + /* GenDisjointMultiply: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointMultiply(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); + /* NOTE(review): the seven instructions from this Min through the Mov into Temp0 recur verbatim at the end of the other GenDisjoint* methods that return Disabled - presumably the shared disjoint compositing tail; confirm */ asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointScreen: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointScreen(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointOverlay: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha 
GenDisjointOverlay(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + /* here the 0.5 comparison uses PBR (just multiplied into Temp1 above), whereas GenDisjointHardLight compares SrcRGB; the CC.LE / CC.GT instructions that follow are otherwise the same */ asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.SrcRGB, OpBD.Temp1, OpAC.SrcRGB, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointDarken: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointDarken(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointLighten: same sequence as GenDisjointDarken except the third instruction is Max instead of Min; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointLighten(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, 
OpAC.DstRGB, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointColorDodge: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointColorDodge(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* 1 - SrcRGB with a .CC destination: the reciprocal is only taken under CC.GT, and CC.LE substitutes ConstantOne - presumably to avoid dividing by zero; confirm */ asm.Sub(CC.T, Dest.Temp0.CC, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Rcp(CC.GT, Dest.PBR, OpAC.Temp0); + asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp1, OpBD.ConstantZero); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantZero); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointColorBurn: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointColorBurn(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.SrcRGB, OpBD.ConstantZero); + asm.Rcp(CC.GT, Dest.PBR, OpAC.SrcRGB); + asm.Mmsub(CC.GT, Dest.PBR, OpAC.PBR, 
OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Max(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.ConstantOne, OpBD.Temp1); + asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointHardLight: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointHardLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* constant slot 0 = 0.5 on all channels; presumably read by OpBD.ConstantRGB in the SrcRGB subtraction that follows - confirm */ asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.SrcRGB, OpBD.ConstantRGB); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.SrcRGB, OpBD.Temp1, OpAC.SrcRGB, OpBD.Temp1); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointSoftLight: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointSoftLight(ref 
UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + /* NOTE(review): constant slots are set in the order 4, 0, 1, 2, 3, 5, 6, 7, each immediately before an instruction that reads OpBD.ConstantRGB; presumably that operand refers to the most recently set slot - confirm against UcodeAssembler */ asm.SetConstant(4, 0.25f, 0.25f, 0.25f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(0, 0.2605f, 0.2605f, 0.2605f); + asm.Mul(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(1, -0.7817f, -0.7817f, -0.7817f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(2, 0.3022f, 0.3022f, 0.3022f); + asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(3, 0.2192f, 0.2192f, 0.2192f); + asm.Add(CC.GT, Dest.Temp0, OpBD.PBR, OpBD.ConstantRGB); + asm.SetConstant(5, 16f, 16f, 16f); + asm.Mul(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(6, 12f, 12f, 12f); + asm.Mmsub(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.SetConstant(7, 3f, 3f, 3f); + asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mmsub(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.ConstantOne, OpAC.Temp1, OpBD.Temp1); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + /* GenDisjointDifference: issues this blend's instruction sequence on asm, writing Dest.Temp1.RToA and finally Dest.Temp0; returns FixedFunctionAlpha.Disabled. */ private static FixedFunctionAlpha GenDisjointDifference(ref UcodeAssembler 
asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.SrcRGB); + asm.Sub(CC.LT, Dest.Temp0, OpBD.SrcRGB, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointExclusion(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.Temp1); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointInvertRGB(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, 
Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.Temp0, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenDisjointLinearDodge(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointLinearBurn(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha 
GenDisjointVividLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantRGB); + asm.Sub(CC.GE, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Add(CC.GE, Dest.PBR, OpBD.PBR, OpBD.PBR); + asm.Rcp(CC.GE, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GE, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GE, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.LT, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Rcp(CC.LT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.LT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantOne); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointLinearLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Madd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + 
asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointPinLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.Temp0, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Add(CC.LE, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Min(CC.LE, Dest.Temp0, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHardMix(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mul(CC.LT, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, 
OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslHue(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.SrcRGB, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp2.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp2); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, 
OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static 
FixedFunctionAlpha GenDisjointHslSaturation(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.PBR); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp1, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.Temp1.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp1); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, 
OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslColor(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.SrcRGB, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, 
OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.SrcRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); + asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne); + asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0); + return FixedFunctionAlpha.Disabled; + } + + private static FixedFunctionAlpha GenDisjointHslLuminosity(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + 
asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.SrcRGB, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp2.BBB, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Add(CC.T, Dest.Temp1, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp1, OpBD.Temp2); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp1, OpBD.Temp2, OpAC.Temp2, OpBD.Temp2); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp2); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.OneMinusSrcAAA); + asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.Temp1, OpAC.PBR, OpBD.Temp0); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.SrcAAA, OpBD.OneMinusDstAAA); + asm.Madd(CC.T, Dest.Temp0, OpAC.PBR, OpBD.SrcRGB, OpAC.Temp0); + asm.Add(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA); 
+            asm.Min(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantOne);
+            asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0);
+            return FixedFunctionAlpha.Disabled;
+        }
+
+        private static FixedFunctionAlpha GenConjointSrc(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Src, per the method name) to asm.
+        {
+            asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA);
+            asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); // presumably clamps (SrcAAA - DstAAA) at zero - confirm Max/ConstantZero semantics
+            asm.Madd(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR, OpAC.Temp0);
+            return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointSrcOver(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint SrcOver, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.Mov(CC.T, Dest.Temp0, OpBD.SrcRGB);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); // .CC destination: presumably updates the condition codes tested by the GE/LT-predicated instructions below - confirm
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.SrcRGB, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointDstOver(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint DstOver, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); // differs from SrcOver: the PBR operand is copied to Temp0 instead of SrcRGB
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp1, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointSrcIn(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint SrcIn, per the method name) to asm.
+        {
+            asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA);
+            asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MinimumGl,
BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointSrcOut(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint SrcOut, per the method name) to asm.
+        {
+            asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero);
+            asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Max(CC.T, Dest.Temp1.RToA, OpAC.PBR, OpBD.ConstantZero); // RToA destination: presumably routes the result into the alpha channel - confirm
+            asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0);
+            return FixedFunctionAlpha.Disabled; // no op/factors returned; presumably because the sequence above wrote alpha itself - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointSrcAtop(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint SrcAtop, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA);
+            asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero);
+            asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0);
+            return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointDstAtop(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint DstAtop, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA);
+            asm.Mul(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR); // mirror of SrcAtop: Temp1 and SrcRGB swap roles in this Mul and in the Madd below
+            asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero);
+            asm.Madd(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR, OpAC.Temp0);
+            return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointXor(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Xor, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero);
+            asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Max(CC.T, Dest.PBR,
OpAC.PBR, OpBD.ConstantZero);
+            asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0);
+            asm.Sub(CC.T, Dest.Temp1.CC, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Sub(CC.LT, Dest.Temp1, OpBD.SrcAAA, OpBD.DstAAA); // LT-predicated swap of the operands; presumably yields |DstAAA - SrcAAA| in Temp1 - confirm
+            asm.Mov(CC.T, Dest.Temp1.RToA, OpBD.Temp1); // RToA destination: presumably routes the result into the alpha channel - confirm
+            asm.Mov(CC.T, Dest.Temp0, OpBD.Temp0);
+            return FixedFunctionAlpha.Disabled; // no op/factors returned; presumably because the sequence above wrote alpha itself - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointMultiply(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Multiply, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.Mul(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); // .CC destination: presumably updates the condition codes tested by the GE/LT-predicated instructions below - confirm
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointScreen(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Screen, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR);
+            asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.Temp1);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); // from here on: the same four-instruction tail and return as GenConjointMultiply
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointOverlay(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Overlay, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.SetConstant(0, 0.5f, 0.5f, 0.5f); // presumably loads the value later read through OpBD.ConstantRGB; meaning of the index argument is not visible here - confirm
+            asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantRGB);
+            asm.Mmadd(CC.LE, Dest.Temp0, OpAC.SrcRGB, OpBD.Temp1, OpAC.SrcRGB, OpBD.Temp1);
+            asm.Sub(CC.GT, Dest.Temp0,
OpBD.ConstantOne, OpBD.Temp1);
+            asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB);
+            asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR);
+            asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointDarken(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Darken, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.Min(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); // the only mode-specific instruction: Min here, Max in GenConjointLighten
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); // .CC destination: presumably updates the condition codes tested by the GE/LT-predicated instructions below - confirm
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointLighten(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Lighten, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Max(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.PBR); // the only mode-specific instruction: Max here, Min in GenConjointDarken
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointColorDodge(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint ColorDodge, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Sub(CC.T, Dest.Temp0.CC, OpBD.ConstantOne,
OpBD.SrcRGB);
+            asm.Rcp(CC.GT, Dest.PBR, OpAC.Temp0);
+            asm.Mul(CC.GT, Dest.PBR, OpAC.PBR, OpBD.Temp1);
+            asm.Min(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne);
+            asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.Temp1, OpBD.ConstantZero);
+            asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantZero);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointColorBurn(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint ColorBurn, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.Sub(CC.T, Dest.Temp0.CC, OpBD.SrcRGB, OpBD.ConstantZero); // .CC destination: presumably updates the condition codes tested by the GT-predicated instructions below - confirm
+            asm.Rcp(CC.GT, Dest.PBR, OpAC.SrcRGB); // reciprocal of SrcRGB is only issued under GT; presumably this guards against SrcRGB <= 0 - confirm
+            asm.Mmsub(CC.GT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1);
+            asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR);
+            asm.Max(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.ConstantOne, OpBD.Temp1);
+            asm.Mov(CC.LE, Dest.Temp0, OpBD.ConstantOne);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointHardLight(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint HardLight, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.SetConstant(0, 0.5f, 0.5f, 0.5f); // presumably loads the value later read through OpBD.ConstantRGB; meaning of the index argument is not visible here - confirm
+            asm.Sub(CC.T, Dest.Temp0.CC, OpBD.SrcRGB, OpBD.ConstantRGB);
+            asm.Mmadd(CC.LE, Dest.Temp0, OpAC.SrcRGB, OpBD.Temp1, OpAC.SrcRGB, OpBD.Temp1);
+            asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.Temp1);
+            asm.Sub(CC.GT, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB);
+            asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.PBR);
+            asm.Sub(CC.GT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointSoftLight(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint SoftLight, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.SetConstant(4, 0.25f, 0.25f, 0.25f); // each SetConstant is followed by one instruction reading OpBD.ConstantRGB; presumably that instruction sees the value just set - confirm
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantRGB); // .CC destination: presumably updates the condition codes tested by the GT/LE-predicated instructions below - confirm
+            asm.SetConstant(0, 0.2605f, 0.2605f, 0.2605f); // GT path: constants 0.2605, -0.7817, 0.3022, 0.2192 feed a chain of Mul/Mmadd/Add on Temp1; presumably a polynomial approximation - confirm what it approximates
+            asm.Mul(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB);
+            asm.SetConstant(1, -0.7817f, -0.7817f, -0.7817f);
+            asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB);
+            asm.SetConstant(2, 0.3022f, 0.3022f, 0.3022f);
+            asm.Mmadd(CC.GT, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB);
+            asm.SetConstant(3, 0.2192f, 0.2192f, 0.2192f);
+            asm.Add(CC.GT, Dest.Temp0, OpBD.PBR, OpBD.ConstantRGB);
+            asm.SetConstant(5, 16f, 16f, 16f); // LE path: constants 16, 12, 3 feed the Mul/Mmsub/Mmadd chain on Temp1
+            asm.Mul(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.ConstantRGB);
+            asm.SetConstant(6, 12f, 12f, 12f);
+            asm.Mmsub(CC.LE, Dest.PBR, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB);
+            asm.SetConstant(7, 3f, 3f, 3f);
+            asm.Mmadd(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp1, OpBD.ConstantRGB);
+            asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne);
+            asm.Mmsub(CC.LE, Dest.Temp0, OpAC.Temp1, OpBD.ConstantOne, OpAC.Temp1, OpBD.Temp1);
+            asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); // same Add as two instructions above; presumably recomputed because PBR was overwritten in between - confirm
+            asm.Sub(CC.T, Dest.PBR, OpBD.PBR,
OpBD.ConstantOne);
+            asm.Madd(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1);
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointDifference(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Difference, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); // Temp1 <- DstRGB combined with PBR (Rcp of DstAAA); presumably the un-premultiplied dst color - confirm
+            asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.SrcRGB); // .CC destination: presumably updates the condition codes tested by the LT-predicated Sub below - confirm
+            asm.Sub(CC.LT, Dest.Temp0, OpBD.SrcRGB, OpBD.Temp1); // LT-predicated swap of the operands; presumably leaves an absolute difference in Temp0 - confirm
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); // op/factors handed back to the caller; presumably alpha is left to fixed-function blending - confirm
+        }
+
+        private static FixedFunctionAlpha GenConjointExclusion(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint Exclusion, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR);
+            asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR);
+            asm.Mmsub(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.Temp1);
+            asm.Mmsub(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.Temp1); // same operands as the previous Mmsub but writing Temp0, and reading the PBR that instruction just wrote
+            asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA);
+            asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR);
+            asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA);
+            asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR);
+            return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl);
+        }
+
+        private static FixedFunctionAlpha GenConjointInvertRGB(ref UcodeAssembler asm) // Appends the ucode sequence for this mode (conjoint InvertRGB, per the method name) to asm.
+        {
+            asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA);
+            asm.Mul(CC.T,
Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mmsub(CC.T, Dest.Temp0, OpAC.SrcRGB, OpBD.ConstantOne, OpAC.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.PBR, OpAC.DstAAA, OpBD.SrcAAA); + asm.Mul(CC.T, Dest.Temp0, OpAC.Temp0, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Madd(CC.T, Dest.Temp0, OpAC.Temp1, OpBD.PBR, OpAC.Temp0); + return new FixedFunctionAlpha(BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLinearDodge(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLinearBurn(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointVividLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, 
Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.5f, 0.5f, 0.5f); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantRGB); + asm.Sub(CC.GE, Dest.PBR, OpBD.ConstantOne, OpBD.SrcRGB); + asm.Add(CC.GE, Dest.PBR, OpBD.PBR, OpBD.PBR); + asm.Rcp(CC.GE, Dest.PBR, OpAC.PBR); + asm.Mul(CC.GE, Dest.PBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GE, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Add(CC.LT, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Rcp(CC.LT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.LT, Dest.PBR, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.LT, Dest.Temp0, OpBD.ConstantOne, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantZero); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcRGB, OpBD.ConstantOne); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointLinearLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 2f, 2f, 2f); + asm.Madd(CC.T, Dest.PBR, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Max(CC.T, Dest.PBR, OpAC.PBR, OpBD.ConstantZero); + asm.Min(CC.T, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + 
return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointPinLight(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.Temp0, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.ConstantZero); + asm.Add(CC.LE, Dest.PBR, OpBD.SrcRGB, OpBD.SrcRGB); + asm.Min(CC.LE, Dest.Temp0, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHardMix(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Add(CC.T, Dest.PBR, OpBD.SrcRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Mul(CC.LT, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Mov(CC.GE, Dest.Temp0, OpBD.ConstantOne); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslHue(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + 
asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.SrcRGB, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.Temp2.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp2); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, 
Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslSaturation(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.PBR); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.T, Dest.Temp0.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.Temp0.CC, OpBD.PBR, OpBD.Temp0); + asm.Rcp(CC.GT, Dest.Temp0, OpAC.Temp0); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.Temp1, OpAC.Temp0, OpBD.PBR); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Min(CC.GT, Dest.Temp1.GBR, 
OpAC.PBR, OpBD.SrcRGB); + asm.Mov(CC.GT, Dest.PBR.GBR, OpBD.SrcRGB); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Max(CC.GT, Dest.PBR.GBR, OpAC.PBR, OpBD.SrcRGB); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp0, OpBD.Temp1); + asm.Mul(CC.LE, Dest.Temp0, OpAC.SrcAAA, OpBD.ConstantZero); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp0, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp0, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, 
Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslColor(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.PBR, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp1.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.SrcRGB, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Add(CC.T, Dest.Temp2, OpBD.SrcRGB, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp1); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp2, OpBD.Temp1); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp1); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp2); + asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp1, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, 
OpAC.Temp2, OpBD.Temp1, OpAC.Temp1, OpBD.Temp1); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp1); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + + private static FixedFunctionAlpha GenConjointHslLuminosity(ref UcodeAssembler asm) + { + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.SetConstant(0, 0.3f, 0.59f, 0.11f); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.SrcRGB, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.Temp2.BBB, OpAC.SrcRGB, OpBD.ConstantRGB, OpAC.PBR); + asm.Mul(CC.T, Dest.PBR.RRR, OpAC.Temp1, OpBD.ConstantRGB); + asm.Madd(CC.T, Dest.PBR.GGG, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Madd(CC.T, Dest.PBR.BBB, OpAC.Temp1, OpBD.ConstantRGB, OpAC.PBR); + asm.Sub(CC.T, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Add(CC.T, Dest.Temp1, OpBD.Temp1, OpBD.PBR); + asm.Mov(CC.T, Dest.Temp0, OpBD.PBR); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Max(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.PBR, OpBD.ConstantOne); + asm.Add(CC.GT, Dest.PBR, OpBD.PBR, OpBD.ConstantOne); + asm.Sub(CC.GT, Dest.PBR, OpBD.PBR, OpBD.Temp2); + asm.Rcp(CC.GT, Dest.PBR, OpAC.PBR); + asm.Mmsub(CC.GT, Dest.Temp0, OpAC.PBR, OpBD.ConstantOne, OpAC.PBR, OpBD.Temp2); + asm.Sub(CC.GT, Dest.PBR, OpBD.Temp1, OpBD.Temp2); + asm.Madd(CC.GT, Dest.Temp0, OpAC.Temp0, OpBD.PBR, OpAC.Temp2); + asm.Mov(CC.T, Dest.PBR.GBR, OpBD.Temp1); + asm.Min(CC.T, Dest.PBR.GBR, OpAC.PBR, OpBD.Temp1); + 
asm.Min(CC.T, Dest.PBR.GBR.CC, OpAC.PBR, OpBD.Temp1); + asm.Sub(CC.LT, Dest.PBR, OpBD.Temp2, OpBD.PBR); + asm.Rcp(CC.LT, Dest.Temp0, OpAC.PBR); + asm.Mmsub(CC.LT, Dest.PBR, OpAC.Temp1, OpBD.Temp2, OpAC.Temp2, OpBD.Temp2); + asm.Madd(CC.LT, Dest.Temp0, OpAC.PBR, OpBD.Temp0, OpAC.Temp2); + asm.Rcp(CC.T, Dest.PBR, OpAC.DstAAA); + asm.Mul(CC.T, Dest.Temp1, OpAC.DstRGB, OpBD.PBR); + asm.Sub(CC.T, Dest.PBR.CC, OpBD.SrcAAA, OpBD.DstAAA); + asm.Mmadd(CC.GE, Dest.Temp0, OpAC.Temp0, OpBD.DstAAA, OpAC.SrcRGB, OpBD.PBR); + asm.Sub(CC.LT, Dest.PBR, OpBD.DstAAA, OpBD.SrcAAA); + asm.Mmadd(CC.LT, Dest.Temp0, OpAC.Temp0, OpBD.SrcAAA, OpAC.Temp1, OpBD.PBR); + return new FixedFunctionAlpha(BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl); + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendManager.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendManager.cs new file mode 100644 index 000000000..8072c6af2 --- /dev/null +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendManager.cs @@ -0,0 +1,115 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using System; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Gpu.Engine.Threed.Blender +{ + /// <summary> + /// Advanced blend manager. Stores the uploaded blend microcode and matches it against the pre-generated table of known advanced blend functions. + /// </summary> + class AdvancedBlendManager + { + private const int InstructionRamSize = 128; + private const int InstructionRamSizeMask = InstructionRamSize - 1; + + private readonly DeviceStateWithShadow<ThreedClassState> _state; + + private readonly uint[] _code; + private int _ip; + + /// <summary> + /// Creates a new instance of the advanced blend manager. + /// </summary> + /// <param name="state">GPU state of the channel owning this manager</param> + public AdvancedBlendManager(DeviceStateWithShadow<ThreedClassState> state) + { + _state = state; + _code = new uint[InstructionRamSize]; + } + + /// <summary> + /// Sets the offset in the instruction RAM where the next blend microcode word will be written.
+ /// </summary> + /// <param name="argument">Method call argument</param> + public void LoadBlendUcodeStart(int argument) + { + _ip = argument; + } + + /// <summary> + /// Pushes one word of blend microcode at the current offset (wrapped to the instruction RAM size) and advances the offset. + /// </summary> + /// <param name="argument">Method call argument</param> + public void LoadBlendUcodeInstruction(int argument) + { + _code[_ip++ & InstructionRamSizeMask] = (uint)argument; + } + + /// <summary> + /// Tries to identify the current advanced blend function being used, + /// given the current state and microcode that was uploaded. + /// </summary> + /// <param name="descriptor">Advanced blend descriptor</param> + /// <returns>True if the hash of the uploaded microcode is in the pre-generated table and the constants and alpha state checks pass, false otherwise</returns> + public bool TryGetAdvancedBlend(out AdvancedBlendDescriptor descriptor) + { + Span<uint> currentCode = new Span<uint>(_code); + byte codeLength = (byte)_state.State.BlendUcodeSize; + + if (currentCode.Length > codeLength) + { + currentCode = currentCode.Slice(0, codeLength); + } + + Hash128 hash = XXHash128.ComputeHash(MemoryMarshal.Cast<uint, byte>(currentCode)); + + descriptor = default; + + if (!AdvancedBlendPreGenTable.Entries.TryGetValue(hash, out var entry)) + { + return false; + } + + if (entry.Constants != null) + { + bool constantsMatch = true; + + for (int i = 0; i < entry.Constants.Length; i++) + { + RgbFloat constant = entry.Constants[i]; + RgbHalf constant2 = _state.State.BlendUcodeConstants[i]; + + if ((Half)constant.R != constant2.UnpackR() || + (Half)constant.G != constant2.UnpackG() || + (Half)constant.B != constant2.UnpackB()) + { + constantsMatch = false; + break; + } + } + + if (!constantsMatch) + { + return false; + } + } + + if (entry.Alpha.Enable != _state.State.BlendUcodeEnable) + { + return false; + } + + if (entry.Alpha.Enable == BlendUcodeEnable.EnableRGBA && + (entry.Alpha.AlphaOp != _state.State.BlendStateCommon.AlphaOp || + entry.Alpha.AlphaSrcFactor != _state.State.BlendStateCommon.AlphaSrcFactor || + entry.Alpha.AlphaDstFactor != 
_state.State.BlendStateCommon.AlphaDstFactor)) + { + return false; + } + + descriptor = new AdvancedBlendDescriptor(entry.Op, entry.Overlap, entry.SrcPreMultiplied); + return true; + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendPreGenTable.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendPreGenTable.cs new file mode 100644 index 000000000..d35d8abf4 --- /dev/null +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendPreGenTable.cs @@ -0,0 +1,273 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Gpu.Engine.Threed.Blender +{ + /// <summary> + /// Advanced blend function entry. + /// </summary> + struct AdvancedBlendEntry + { + /// <summary> + /// Advanced blend operation. + /// </summary> + public AdvancedBlendOp Op { get; } + + /// <summary> + /// Advanced blend overlap mode. + /// </summary> + public AdvancedBlendOverlap Overlap { get; } + + /// <summary> + /// Whether the source input is pre-multiplied. + /// </summary> + public bool SrcPreMultiplied { get; } + + /// <summary> + /// Constants used by the microcode. + /// </summary> + public RgbFloat[] Constants { get; } + + /// <summary> + /// Fixed function alpha state. + /// </summary> + public FixedFunctionAlpha Alpha { get; } + + /// <summary> + /// Creates a new advanced blend function entry.
+ /// </summary> + /// <param name="op">Advanced blend operation</param> + /// <param name="overlap">Advanced blend overlap mode</param> + /// <param name="srcPreMultiplied">Whether the source input is pre-multiplied</param> + /// <param name="constants">Constants used by the microcode</param> + /// <param name="alpha">Fixed function alpha state</param> + public AdvancedBlendEntry( + AdvancedBlendOp op, + AdvancedBlendOverlap overlap, + bool srcPreMultiplied, + RgbFloat[] constants, + FixedFunctionAlpha alpha) + { + Op = op; + Overlap = overlap; + SrcPreMultiplied = srcPreMultiplied; + Constants = constants; + Alpha = alpha; + } + } + + /// <summary> + /// Pre-generated hash table with advanced blend functions used by the driver. + /// </summary> + static class AdvancedBlendPreGenTable + { + /// <summary> + /// Advanced blend functions dictionary, keyed by the 128-bit hash of the blend microcode. + /// </summary> + public static readonly IReadOnlyDictionary<Hash128, AdvancedBlendEntry> Entries = new Dictionary<Hash128, AdvancedBlendEntry>() + { + { new Hash128(0x19ECF57B83DE31F7, 0x5BAE759246F264C0), new AdvancedBlendEntry(AdvancedBlendOp.PlusClamped, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xDE1B14A356A1A9ED, 0x59D803593C607C1D), new AdvancedBlendEntry(AdvancedBlendOp.PlusClampedAlpha, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x1A3C3A6D32DEC368, 0xBCAE519EC6AAA045), new AdvancedBlendEntry(AdvancedBlendOp.PlusDarker, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x6FD380261A63B240, 0x17C3B335DBB9E3DB), new AdvancedBlendEntry(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, 
BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x1D39164823D3A2D1, 0xC45350959CE1C8FB), new AdvancedBlendEntry(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x18DF09FF53B129FE, 0xC02EDA33C36019F6), new AdvancedBlendEntry(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x5973E583271EBF06, 0x711497D75D1272E0), new AdvancedBlendEntry(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x4759E0E5DA54D5E8, 0x1FDD57C0C38AFA1F), new AdvancedBlendEntry(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x337684D43CCE97FA, 0x0139E30CC529E1C9), new AdvancedBlendEntry(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xDA59E85D8428992D, 0x1D3D7C64C9EF0132), new AdvancedBlendEntry(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x9455B949298CE805, 0xE73D3301518BE98A), new AdvancedBlendEntry(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, 
new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xBDD3B4DEDBE336AA, 0xBFA4DCD50D535DEE), new AdvancedBlendEntry(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.2605f, 0.2605f, 0.2605f), new RgbFloat(-0.7817f, -0.7817f, -0.7817f), new RgbFloat(0.3022f, 0.3022f, 0.3022f), new RgbFloat(0.2192f, 0.2192f, 0.2192f), new RgbFloat(0.25f, 0.25f, 0.25f), new RgbFloat(16f, 16f, 16f), new RgbFloat(12f, 12f, 12f), new RgbFloat(3f, 3f, 3f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x22D4E970A028649A, 0x4F3FCB055FCED965), new AdvancedBlendEntry(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xA346A91311D72114, 0x151A27A3FB0A1904), new AdvancedBlendEntry(AdvancedBlendOp.Minus, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.ReverseSubtractGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x8A307241061FACD6, 0xA39D1826440B8EE7), new AdvancedBlendEntry(AdvancedBlendOp.MinusClamped, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xB3BE569485EFFFE0, 0x0BA4E269B3CFB165), new AdvancedBlendEntry(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x36FCA3277DC11822, 0x2BC0F6CAC2029672), new AdvancedBlendEntry(AdvancedBlendOp.Contrast, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(2f, 2f, 2f), new 
RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x4A6226AF2DE9BD7F, 0xEB890D7DA716F73A), new AdvancedBlendEntry(AdvancedBlendOp.Invert, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0xF364CAA94E160FEB, 0xBF364512C72A3797), new AdvancedBlendEntry(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x6BF791AB4AC19C87, 0x6FA17A994EA0FCDE), new AdvancedBlendEntry(AdvancedBlendOp.InvertOvg, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x053C75A0AE0BB222, 0x03C791FEEB59754C), new AdvancedBlendEntry(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x25762AB40B6CBDE9, 0x595E9A968AC4F01C), new AdvancedBlendEntry(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xC2D05E2DBE16955D, 0xB8659C7A3FCFA7CE), new AdvancedBlendEntry(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x223F220B8F74CBFB, 0xD3DD19D7C39209A5), new 
AdvancedBlendEntry(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(2f, 2f, 2f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xD0DAE57A9F1FE78A, 0x353796BCFB8CE30B), new AdvancedBlendEntry(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x601C8CBEC07FF8FF, 0xB8E22882360E8695), new AdvancedBlendEntry(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x3A55B7B78C76A7A8, 0x206F503B2D9FFEAA), new AdvancedBlendEntry(AdvancedBlendOp.Red, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x80BC65C7831388E5, 0xC652457B2C766AEC), new AdvancedBlendEntry(AdvancedBlendOp.Green, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x3D3A912E5833EE13, 0x307895951349EE33), new AdvancedBlendEntry(AdvancedBlendOp.Blue, AdvancedBlendOverlap.Uncorrelated, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x289105BE92E81803, 0xFD8F1F03D15C53B4), new AdvancedBlendEntry(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new 
Hash128(0x007AE3BD140764EB, 0x0EE05A0D2E80BBAE), new AdvancedBlendEntry(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x77F7EE0DB3FDDB96, 0xDEA47C881306DB3E), new AdvancedBlendEntry(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x66F4E9A7D73CA157, 0x1486058A177DB11C), new AdvancedBlendEntry(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Uncorrelated, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x593E9F331612D618, 0x9D217BEFA4EB919A), new AdvancedBlendEntry(AdvancedBlendOp.Src, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x0A5194C5E6891106, 0xDD8EC6586106557C), new AdvancedBlendEntry(AdvancedBlendOp.Dst, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x8D77173D5E06E916, 0x06AB190E7D10F4D4), new AdvancedBlendEntry(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x655B4EBC148981DA, 0x455999EF2B9BD28A), new AdvancedBlendEntry(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x98F5437D5F518929, 
0xBFF4A6E83183DB63), new AdvancedBlendEntry(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x6ADDEFE3B9CEF2FD, 0xB6F6272AFECB1AAB), new AdvancedBlendEntry(AdvancedBlendOp.DstIn, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x80953F0953BF05B1, 0xD59ABFAA34F8196F), new AdvancedBlendEntry(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xA401D9AA2A39C121, 0xFC0C8005C22AD7E3), new AdvancedBlendEntry(AdvancedBlendOp.DstOut, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x06274FB7CA9CDD22, 0x6CE8188B1A9AB6EF), new AdvancedBlendEntry(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x0B079BE7F7F70817, 0xB72E7736CA51E321), new AdvancedBlendEntry(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x66215C99403CEDDE, 0x900B733D62204C48), new AdvancedBlendEntry(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x12DEF2AD900CAD6C, 0x58CF5CC3004910DF), new AdvancedBlendEntry(AdvancedBlendOp.Plus, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x272BA3A49F64DAE4, 0xAC70B96C00A99EAF), new 
AdvancedBlendEntry(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x206C34AAA7D3F545, 0xDA4B30CACAA483A0), new AdvancedBlendEntry(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x3D93494920D257BE, 0xDCC573BE1F5F4449), new AdvancedBlendEntry(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x0D7417D80191107B, 0xEAF40547827E005F), new AdvancedBlendEntry(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xEC1B03E8C883F9C9, 0x2D3CA044C58C01B4), new AdvancedBlendEntry(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x58A19A0135D68B31, 0x82F35B97AED068E5), new AdvancedBlendEntry(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x20489F9AB36CC0E3, 0x20499874219E35EE), new AdvancedBlendEntry(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xBB176935E5EE05BF, 0x95B26D4D30EA7A14), new AdvancedBlendEntry(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x5FF9393C908ACFED, 0x068B0BD875773ABF), new AdvancedBlendEntry(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Disjoint, true, new[] { new 
RgbFloat(0.2605f, 0.2605f, 0.2605f), new RgbFloat(-0.7817f, -0.7817f, -0.7817f), new RgbFloat(0.3022f, 0.3022f, 0.3022f), new RgbFloat(0.2192f, 0.2192f, 0.2192f), new RgbFloat(0.25f, 0.25f, 0.25f), new RgbFloat(16f, 16f, 16f), new RgbFloat(12f, 12f, 12f), new RgbFloat(3f, 3f, 3f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x03181F8711C9802C, 0x6B02C7C6B224FE7B), new AdvancedBlendEntry(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x2EE2209021F6B977, 0xF3AFA1491B8B89FC), new AdvancedBlendEntry(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xD8BA4DD2EDE4DC9E, 0x01006114977CF715), new AdvancedBlendEntry(AdvancedBlendOp.Invert, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0xD156B99835A2D8ED, 0x2D0BEE9E135EA7A7), new AdvancedBlendEntry(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x20CE8C898ED4BE27, 0x1514900B6F5E8F66), new AdvancedBlendEntry(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xCDE5F743820BA2D9, 0x917845FE2ECB083D), new AdvancedBlendEntry(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xEB03DF4A0C1D14CD, 0xBAE2E831C6E8FFE4), new AdvancedBlendEntry(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Disjoint, true, new[] { new 
RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x1DC9E49AABC779AC, 0x4053A1441EB713D3), new AdvancedBlendEntry(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(2f, 2f, 2f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xFBDEF776248F7B3E, 0xE05EEFD65AC47CB7), new AdvancedBlendEntry(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x415A1A48E03AA6E7, 0x046D7EE33CA46B9A), new AdvancedBlendEntry(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Disjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x59A6901EC9BB2041, 0x2F3E19CE5EEC3EBE), new AdvancedBlendEntry(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x044B2B6E105221DA, 0x3089BBC033F994AF), new AdvancedBlendEntry(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x374A5A24AA8E6CC5, 0x29930FAA6215FA2B), new AdvancedBlendEntry(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x30CD0F7AF0CF26F9, 0x06CCA6744DE7DCF5), new AdvancedBlendEntry(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Disjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x1A6C9A1F6FE494A5, 0xA0CFAF77617E54DD), new AdvancedBlendEntry(AdvancedBlendOp.Src, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new 
FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x081AF6DAAB1C8717, 0xBFEDCE59AE3DC9AC), new AdvancedBlendEntry(AdvancedBlendOp.Dst, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x3518E44573AB68BA, 0xC96EE71AF9F8F546), new AdvancedBlendEntry(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xF89E81FE8D73C96F, 0x4583A04577A0F21C), new AdvancedBlendEntry(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xDF4026421CB61119, 0x14115A1F5139AFC7), new AdvancedBlendEntry(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MinimumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x91A20262C3E3A695, 0x0B3A102BFCDC6B1C), new AdvancedBlendEntry(AdvancedBlendOp.DstIn, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MinimumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x44F4C7CCFEB9EBFA, 0xF68394E6D56E5C2F), new AdvancedBlendEntry(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xB89F17C7021E9760, 0x430357EE0F7188EF), new AdvancedBlendEntry(AdvancedBlendOp.DstOut, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xDA2D20EA4242B8A0, 
0x0D1EC05B72E3838F), new AdvancedBlendEntry(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x855DFEE1208D11B9, 0x77C6E3DDCFE30B85), new AdvancedBlendEntry(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x9B3808439683FD58, 0x123DCBE4705AB25E), new AdvancedBlendEntry(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xA42CF045C248A00A, 0x0C6C63C24EA0B0C1), new AdvancedBlendEntry(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x320A83B6D00C8059, 0x796EDAB3EB7314BC), new AdvancedBlendEntry(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x45253AC9ABFFC613, 0x8F92EA70195FB573), new AdvancedBlendEntry(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x1A5D263B588274B6, 0x167D305F6C794179), new AdvancedBlendEntry(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x709C1A837FE966AC, 0x75D8CE49E8A78EDB), new AdvancedBlendEntry(AdvancedBlendOp.Lighten, 
AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x8265C26F85E4145F, 0x932E6CCBF37CB600), new AdvancedBlendEntry(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x3F252B3FEF983F27, 0x9370D7EEFEFA1A9E), new AdvancedBlendEntry(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x66A334A4AEA41078, 0xCB52254E1E395231), new AdvancedBlendEntry(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xFDD05C53B25F0035, 0xB7E3ECEE166C222F), new AdvancedBlendEntry(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.2605f, 0.2605f, 0.2605f), new RgbFloat(-0.7817f, -0.7817f, -0.7817f), new RgbFloat(0.3022f, 0.3022f, 0.3022f), new RgbFloat(0.2192f, 0.2192f, 0.2192f), new RgbFloat(0.25f, 0.25f, 0.25f), new RgbFloat(16f, 16f, 16f), new RgbFloat(12f, 12f, 12f), new RgbFloat(3f, 3f, 3f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x25D932A77FFED81A, 0xA50D797B0FCA94E8), new AdvancedBlendEntry(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x4A953B6F5F7D341C, 0xDC05CFB50DDB5DC1), new AdvancedBlendEntry(AdvancedBlendOp.Exclusion, 
AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x838CB660C4F41F6D, 0x9E7D958697543495), new AdvancedBlendEntry(AdvancedBlendOp.Invert, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x4DF6EC1348A8F797, 0xA128E0CD69DB5A64), new AdvancedBlendEntry(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x178CDFAB9A015295, 0x2BF40EA72E596D57), new AdvancedBlendEntry(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x338FC99050E56AFD, 0x2AF41CF82BE602BF), new AdvancedBlendEntry(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x62E02ED60D1E978E, 0xBF726B3E68C11E4D), new AdvancedBlendEntry(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xFBAF92DD4C101502, 0x7AF2EDA6596B819D), new AdvancedBlendEntry(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(2f, 2f, 2f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x0EF1241F65D4B50A, 0xE8D85DFA6AEDDB84), new AdvancedBlendEntry(AdvancedBlendOp.PinLight, 
AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x77FE024B5C9D4A18, 0xF19D48A932F6860F), new AdvancedBlendEntry(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Conjoint, true, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x9C88CBFA2E09D857, 0x0A0361704CBEEE1D), new AdvancedBlendEntry(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x5B94127FA190E640, 0x8D1FEFF837A91268), new AdvancedBlendEntry(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xB9C9105B7E063DDB, 0xF6A70E1D511B96FD), new AdvancedBlendEntry(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xF0751AAE332B3ED1, 0xC40146F5C83C2533), new AdvancedBlendEntry(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Conjoint, true, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x579EB12F595F75AD, 0x151BF0504703B81B), new AdvancedBlendEntry(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xF9CA152C03AC8C62, 
0x1581336205E5CF47), new AdvancedBlendEntry(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.DstAlphaGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x98ACD8BB5E195D0F, 0x91F937672BE899F0), new AdvancedBlendEntry(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneMinusDstAlphaGl, BlendFactor.ZeroGl)) }, + { new Hash128(0xBF97F10FC301F44C, 0x75721789F0D48548), new AdvancedBlendEntry(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x1B982263B8B08A10, 0x3350C76E2E1B27DF), new AdvancedBlendEntry(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0xFF20AC79F64EDED8, 0xAF9025B2D97B9273), new AdvancedBlendEntry(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneMinusDstAlphaGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x9FFD986600FB112F, 0x384FDDF4E060139A), new AdvancedBlendEntry(AdvancedBlendOp.PlusClamped, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x0425E40B5B8B3B52, 0x5880CBED7CAB631C), new AdvancedBlendEntry(AdvancedBlendOp.PlusClampedAlpha, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x16DAC8593F28623A, 0x233DBC82325B8AED), new AdvancedBlendEntry(AdvancedBlendOp.PlusDarker, 
AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xB37E5F234B9F0948, 0xD5F957A2ECD98FD6), new AdvancedBlendEntry(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xCA0FDADD1D20DBE3, 0x1A5C15CCBF1AC538), new AdvancedBlendEntry(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x1C48304D73A9DF3A, 0x891DB93FA36E3450), new AdvancedBlendEntry(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x53200F2279B7FA39, 0x051C2462EBF6789C), new AdvancedBlendEntry(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xB88BFB80714DCD5C, 0xEBD6938D744E6A41), new AdvancedBlendEntry(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xE33DC2A25FC1A976, 0x08B3DBB1F3027D45), new AdvancedBlendEntry(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xCE97E71615370316, 0xE131AE49D3A4D62B), new 
AdvancedBlendEntry(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xE059FD265149B256, 0x94AF817AC348F61F), new AdvancedBlendEntry(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x16D31333D477E231, 0x9A98AAC84F72CC62), new AdvancedBlendEntry(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.2605f, 0.2605f, 0.2605f), new RgbFloat(-0.7817f, -0.7817f, -0.7817f), new RgbFloat(0.3022f, 0.3022f, 0.3022f), new RgbFloat(0.2192f, 0.2192f, 0.2192f), new RgbFloat(0.25f, 0.25f, 0.25f), new RgbFloat(16f, 16f, 16f), new RgbFloat(12f, 12f, 12f), new RgbFloat(3f, 3f, 3f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x47FC3B0776366D3C, 0xE96D9BD83B277874), new AdvancedBlendEntry(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x7230401E3FEA1F3B, 0xF0D15F05D3D1E309), new AdvancedBlendEntry(AdvancedBlendOp.Minus, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.ReverseSubtractGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x188212F9303742F5, 0x100C51CB96E03591), new AdvancedBlendEntry(AdvancedBlendOp.MinusClamped, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x52B755D296B44DC5, 0x4003B87275625973), 
new AdvancedBlendEntry(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xD873ED973ADF7EAD, 0x73E68B57D92034E7), new AdvancedBlendEntry(AdvancedBlendOp.Contrast, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(2f, 2f, 2f), new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x471F9FA34B945ACB, 0x10524D1410B3C402), new AdvancedBlendEntry(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x99F569454EA0EF32, 0x6FC70A8B3A07DC8B), new AdvancedBlendEntry(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x5AD55F950067AC7E, 0x4BA60A4FBABDD0AC), new AdvancedBlendEntry(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x03FF2C858C9C4C5B, 0xE95AE7F561FB60E9), new AdvancedBlendEntry(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x6DC0E510C7BCF9D2, 0xAE805D7CECDCB5C1), new AdvancedBlendEntry(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(2f, 2f, 2f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, 
BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x44832332CED5C054, 0x2F8D5536C085B30A), new AdvancedBlendEntry(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x4AB4D387618AC51F, 0x495B46E0555F4B32), new AdvancedBlendEntry(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x99282B49405A01A8, 0xD6FA93F864F24A8E), new AdvancedBlendEntry(AdvancedBlendOp.Red, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x37B30C1064FBD23E, 0x5D068366F42317C2), new AdvancedBlendEntry(AdvancedBlendOp.Green, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x760FAE9D59E04BC2, 0xA40AD483EA01435E), new AdvancedBlendEntry(AdvancedBlendOp.Blue, AdvancedBlendOverlap.Uncorrelated, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0xE786950FD9D1C6EF, 0xF9FDD5AF6451D239), new AdvancedBlendEntry(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x052458BB4788B0CA, 0x8AC58FDCA1F45EF5), new AdvancedBlendEntry(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, 
new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x6AFC3837D1D31920, 0xB9D49C2FE49642C6), new AdvancedBlendEntry(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0xAFC2911949317E01, 0xD5B63636F5CB3422), new AdvancedBlendEntry(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Uncorrelated, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneMinusSrcAlphaGl)) }, + { new Hash128(0x13B46DF507CC2C53, 0x86DE26517E6BF0A7), new AdvancedBlendEntry(AdvancedBlendOp.Src, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x5C372442474BE410, 0x79ECD3C0C496EF2E), new AdvancedBlendEntry(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x74AAB45DBF5336E9, 0x01BFC4E181DAD442), new AdvancedBlendEntry(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x43239E282A36C85C, 0x36FB65560E46AD0F), new AdvancedBlendEntry(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x1A3BA8A7583B8F7A, 0xE64E41D548033180), new AdvancedBlendEntry(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x32BBB9859E9B565D, 
0x3D5CE94FE55F18B5), new AdvancedBlendEntry(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0xD947A0766AE3C0FC, 0x391E5D53E86F4ED6), new AdvancedBlendEntry(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0xBD9A7C08BDFD8CE6, 0x905407634901355E), new AdvancedBlendEntry(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x8395475BCB0D7A8C, 0x48AF5DD501D44A70), new AdvancedBlendEntry(AdvancedBlendOp.Plus, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x80AAC23FEBD4A3E5, 0xEA8C70F0B4DE52DE), new AdvancedBlendEntry(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x2F3AD1B0F1B3FD09, 0xC0EBC784BFAB8EA3), new AdvancedBlendEntry(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x52B54032F2F70BFF, 0xC941D6FDED674765), new AdvancedBlendEntry(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xCA7B86F72EC6A99B, 0x55868A131AFE359E), new AdvancedBlendEntry(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new 
Hash128(0x377919B60BD133CA, 0x0FD611627664EF40), new AdvancedBlendEntry(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x9D4A0C5EE1153887, 0x7B869EBA218C589B), new AdvancedBlendEntry(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x311F2A858545D123, 0xB4D09C802480AD62), new AdvancedBlendEntry(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xCF78AA6A83AFA689, 0x9DC48B0C2182A3E1), new AdvancedBlendEntry(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xC3018CD6F1CF62D1, 0x016E32DD9087B1BB), new AdvancedBlendEntry(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.2605f, 0.2605f, 0.2605f), new RgbFloat(-0.7817f, -0.7817f, -0.7817f), new RgbFloat(0.3022f, 0.3022f, 0.3022f), new RgbFloat(0.2192f, 0.2192f, 0.2192f), new RgbFloat(0.25f, 0.25f, 0.25f), new RgbFloat(16f, 16f, 16f), new RgbFloat(12f, 12f, 12f), new RgbFloat(3f, 3f, 3f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x9CB62CE0E956EE29, 0x0FB67F503E60B3AD), new AdvancedBlendEntry(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x3589A13C16EF3BFA, 0x15B29BFC91F3BDFB), new AdvancedBlendEntry(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x3502CA5FB7529917, 0xFA51BFD0D1688071), new 
AdvancedBlendEntry(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x62ADC25AD6D0A923, 0x76CB6D238276D3A3), new AdvancedBlendEntry(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x09FDEB1116A9D52C, 0x85BB8627CD5C2733), new AdvancedBlendEntry(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x0709FED1B65E18EB, 0x5BC3AA4D99EC19CF), new AdvancedBlendEntry(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xB18D28AE5DE4C723, 0xE820AA2B75C9C02E), new AdvancedBlendEntry(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(2f, 2f, 2f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x6743C51621497480, 0x4B164E40858834AE), new AdvancedBlendEntry(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x63D1E181E34A2944, 0x1AE292C9D9F12819), new AdvancedBlendEntry(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Disjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x079523298250BFF6, 0xC0C793510603CDB5), new AdvancedBlendEntry(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x4C9D0A973C805EA6, 0xD1FF59AD5156B93C), new 
AdvancedBlendEntry(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x1E914678F3057BCD, 0xD503AE389C12D229), new AdvancedBlendEntry(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0x9FDBADE5556C5311, 0x03F0CBC798FC5C94), new AdvancedBlendEntry(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Disjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xE39451534635403C, 0x606CC1CA1F452388), new AdvancedBlendEntry(AdvancedBlendOp.Src, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x1D39F0F0A1008AA6, 0xBFDF2B97E6C3F125), new AdvancedBlendEntry(AdvancedBlendOp.SrcOver, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xDB81BED30D5BDBEA, 0xAF0B2856EB93AD2C), new AdvancedBlendEntry(AdvancedBlendOp.DstOver, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x83F69CCF1D0A79B6, 0x70D31332797430AC), new AdvancedBlendEntry(AdvancedBlendOp.SrcIn, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MinimumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x7B87F807AB7A8F5C, 0x1241A2A01FB31771), new AdvancedBlendEntry(AdvancedBlendOp.SrcOut, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new 
FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xF557172E20D5272D, 0xC1961F8C7A5D2820), new AdvancedBlendEntry(AdvancedBlendOp.SrcAtop, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0xA8476B3944DBBC9B, 0x84A2F6AF97B15FDF), new AdvancedBlendEntry(AdvancedBlendOp.DstAtop, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.OneGl, BlendFactor.ZeroGl)) }, + { new Hash128(0x3259602B55414DA3, 0x72AACCC00B5A9D10), new AdvancedBlendEntry(AdvancedBlendOp.Xor, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, 0, 0, 0)) }, + { new Hash128(0xC0CB8C10F36EDCD6, 0x8C2D088AD8191E1C), new AdvancedBlendEntry(AdvancedBlendOp.Multiply, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x81806C451C6255EF, 0x5AA8AC9A08941A15), new AdvancedBlendEntry(AdvancedBlendOp.Screen, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xE55A6537F4568198, 0xCA8735390B799B19), new AdvancedBlendEntry(AdvancedBlendOp.Overlay, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x5C044BA14536DDA3, 0xBCE0123ED7D510EC), new AdvancedBlendEntry(AdvancedBlendOp.Darken, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new 
Hash128(0x6788346C405BE130, 0x372A4BB199C01F9F), new AdvancedBlendEntry(AdvancedBlendOp.Lighten, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x510EDC2A34E2856B, 0xE1727A407E294254), new AdvancedBlendEntry(AdvancedBlendOp.ColorDodge, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x4B7BE01BD398C7A8, 0x5BFF79BC00672C18), new AdvancedBlendEntry(AdvancedBlendOp.ColorBurn, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x213B43845540CFEC, 0xDA857411CF1CCFCE), new AdvancedBlendEntry(AdvancedBlendOp.HardLight, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x765AFA6732E783F1, 0x8F1CABF1BC78A014), new AdvancedBlendEntry(AdvancedBlendOp.SoftLight, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.2605f, 0.2605f, 0.2605f), new RgbFloat(-0.7817f, -0.7817f, -0.7817f), new RgbFloat(0.3022f, 0.3022f, 0.3022f), new RgbFloat(0.2192f, 0.2192f, 0.2192f), new RgbFloat(0.25f, 0.25f, 0.25f), new RgbFloat(16f, 16f, 16f), new RgbFloat(12f, 12f, 12f), new RgbFloat(3f, 3f, 3f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xA4A5DE1CC06F6CB1, 0xA0634A0011001709), new AdvancedBlendEntry(AdvancedBlendOp.Difference, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new 
Hash128(0x81F32BD8816EA796, 0x697EE86683165170), new AdvancedBlendEntry(AdvancedBlendOp.Exclusion, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xB870C209EAA5F092, 0xAF5FD923909CAA1F), new AdvancedBlendEntry(AdvancedBlendOp.InvertRGB, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.AddGl, BlendFactor.ZeroGl, BlendFactor.OneGl)) }, + { new Hash128(0x3649A9F5C936FB83, 0xDD7C834897AA182A), new AdvancedBlendEntry(AdvancedBlendOp.LinearDodge, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xD72A2B1097A5995C, 0x3D41B2763A913654), new AdvancedBlendEntry(AdvancedBlendOp.LinearBurn, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x551E212B9F6C454A, 0xB0DFA05BEB3C37FA), new AdvancedBlendEntry(AdvancedBlendOp.VividLight, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.5f, 0.5f, 0.5f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x681B5A313B7416BF, 0xCB1CBAEEB4D81500), new AdvancedBlendEntry(AdvancedBlendOp.LinearLight, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(2f, 2f, 2f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x9343A18BD4B16777, 0xEDB4AC1C8972C3A4), new AdvancedBlendEntry(AdvancedBlendOp.PinLight, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { 
new Hash128(0xC960BF6D8519DE28, 0x78D8557FD405D119), new AdvancedBlendEntry(AdvancedBlendOp.HardMix, AdvancedBlendOverlap.Conjoint, false, Array.Empty<RgbFloat>(), new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x65A7B01FDC73A46C, 0x297E096ED5CC4D8A), new AdvancedBlendEntry(AdvancedBlendOp.HslHue, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0xD9C99BA4A6CDC13B, 0x3CFF0ACEDC2EE150), new AdvancedBlendEntry(AdvancedBlendOp.HslSaturation, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x6BC00DA6EB922BD1, 0x5FD4C11F2A685234), new AdvancedBlendEntry(AdvancedBlendOp.HslColor, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + { new Hash128(0x8652300E32D93050, 0x9460E7B449132371), new AdvancedBlendEntry(AdvancedBlendOp.HslLuminosity, AdvancedBlendOverlap.Conjoint, false, new[] { new RgbFloat(0.3f, 0.59f, 0.11f) }, new FixedFunctionAlpha(BlendUcodeEnable.EnableRGB, BlendOp.MaximumGl, BlendFactor.OneGl, BlendFactor.OneGl)) }, + }; + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendUcode.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendUcode.cs new file mode 100644 index 000000000..f06b4bf74 --- /dev/null +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/AdvancedBlendUcode.cs @@ -0,0 +1,126 @@ +using Ryujinx.Graphics.GAL; + +namespace Ryujinx.Graphics.Gpu.Engine.Threed.Blender +{ + /// <summary> + /// Fixed function alpha state used for a advanced blend function. 
+ /// </summary> + struct FixedFunctionAlpha + { + /// <summary> + /// Fixed function alpha state with alpha blending disabled. + /// </summary> + public static FixedFunctionAlpha Disabled => new FixedFunctionAlpha(BlendUcodeEnable.EnableRGBA, default, default, default); + + /// <summary> + /// Individual enable bits for the RGB and alpha components. + /// </summary> + public BlendUcodeEnable Enable { get; } + + /// <summary> + /// Alpha blend operation. + /// </summary> + public BlendOp AlphaOp { get; } + + /// <summary> + /// Value multiplied with the blend source operand. + /// </summary> + public BlendFactor AlphaSrcFactor { get; } + + /// <summary> + /// Value multiplied with the blend destination operand. + /// </summary> + public BlendFactor AlphaDstFactor { get; } + + /// <summary> + /// Creates a new blend fixed function alpha state. + /// </summary> + /// <param name="enable">Individual enable bits for the RGB and alpha components</param> + /// <param name="alphaOp">Alpha blend operation</param> + /// <param name="alphaSrc">Value multiplied with the blend source operand</param> + /// <param name="alphaDst">Value multiplied with the blend destination operand</param> + public FixedFunctionAlpha(BlendUcodeEnable enable, BlendOp alphaOp, BlendFactor alphaSrc, BlendFactor alphaDst) + { + Enable = enable; + AlphaOp = alphaOp; + AlphaSrcFactor = alphaSrc; + AlphaDstFactor = alphaDst; + } + + /// <summary> + /// Creates a new blend fixed function alpha state. + /// </summary> + /// <param name="alphaOp">Alpha blend operation</param> + /// <param name="alphaSrc">Value multiplied with the blend source operand</param> + /// <param name="alphaDst">Value multiplied with the blend destination operand</param> + public FixedFunctionAlpha(BlendOp alphaOp, BlendFactor alphaSrc, BlendFactor alphaDst) : this(BlendUcodeEnable.EnableRGB, alphaOp, alphaSrc, alphaDst) + { + } + } + + /// <summary> + /// Blend microcode assembly function delegate. 
+ /// </summary> + /// <param name="asm">Assembler</param> + /// <returns>Fixed function alpha state for the microcode</returns> + delegate FixedFunctionAlpha GenUcodeFunc(ref UcodeAssembler asm); + + /// <summary> + /// Advanced blend microcode state. + /// </summary> + struct AdvancedBlendUcode + { + /// <summary> + /// Advanced blend operation. + /// </summary> + public AdvancedBlendOp Op { get; } + + /// <summary> + /// Advanced blend overlap mode. + /// </summary> + public AdvancedBlendOverlap Overlap { get; } + + /// <summary> + /// Whether the source input is pre-multiplied. + /// </summary> + public bool SrcPreMultiplied { get; } + + /// <summary> + /// Fixed function alpha state. + /// </summary> + public FixedFunctionAlpha Alpha { get; } + + /// <summary> + /// Microcode words emitted by the generator function (null if it emitted no instructions). + /// </summary> + public uint[] Code { get; } + + /// <summary> + /// Constants used by the microcode (null if the generator function set none). + /// </summary> + public RgbFloat[] Constants { get; } + + /// <summary> + /// Creates a new advanced blend state, running the generator function to assemble the microcode. 
+ /// </summary> + /// <param name="op">Advanced blend operation</param> + /// <param name="overlap">Advanced blend overlap mode</param> + /// <param name="srcPreMultiplied">Whether the source input is pre-multiplied</param> + /// <param name="genFunc">Function that will generate the advanced blend microcode</param> + public AdvancedBlendUcode( + AdvancedBlendOp op, + AdvancedBlendOverlap overlap, + bool srcPreMultiplied, + GenUcodeFunc genFunc) + { + Op = op; + Overlap = overlap; + SrcPreMultiplied = srcPreMultiplied; + + UcodeAssembler asm = new UcodeAssembler(); + Alpha = genFunc(ref asm); + Code = asm.GetCode(); + Constants = asm.GetConstants(); + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/UcodeAssembler.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/UcodeAssembler.cs new file mode 100644 index 000000000..f854787e0 --- /dev/null +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/Blender/UcodeAssembler.cs @@ -0,0 +1,305 @@ +using System; +using System.Collections.Generic; + +namespace Ryujinx.Graphics.Gpu.Engine.Threed.Blender +{ + /// <summary> + /// Blend microcode instruction. + /// </summary> + enum Instruction + { + Mmadd = 0, + Mmsub = 1, + Min = 2, + Max = 3, + Rcp = 4, + Add = 5, + Sub = 6 + } + + /// <summary> + /// Blend microcode condition code. + /// </summary> + enum CC + { + F = 0, + T = 1, + EQ = 2, + NE = 3, + LT = 4, + LE = 5, + GT = 6, + GE = 7 + } + + /// <summary> + /// Blend microcode operand B or D value. + /// </summary> + enum OpBD + { + ConstantZero = 0x0, + ConstantOne = 0x1, + SrcRGB = 0x2, + SrcAAA = 0x3, + OneMinusSrcAAA = 0x4, + DstRGB = 0x5, + DstAAA = 0x6, + OneMinusDstAAA = 0x7, + Temp0 = 0x9, + Temp1 = 0xa, + Temp2 = 0xb, + PBR = 0xc, + ConstantRGB = 0xd + } + + /// <summary> + /// Blend microcode operand A or C value. 
+ /// </summary> + enum OpAC + { + SrcRGB = 0, + DstRGB = 1, + SrcAAA = 2, + DstAAA = 3, + Temp0 = 4, + Temp1 = 5, + Temp2 = 6, + PBR = 7 + } + + /// <summary> + /// Blend microcode destination operand. + /// </summary> + enum OpDst + { + Temp0 = 0, + Temp1 = 1, + Temp2 = 2, + PBR = 3 + } + + /// <summary> + /// Blend microcode input swizzle. + /// </summary> + enum Swizzle + { + RGB = 0, + GBR = 1, + RRR = 2, + GGG = 3, + BBB = 4, + RToA = 5 + } + + /// <summary> + /// Blend microcode output components. + /// </summary> + enum WriteMask + { + RGB = 0, + R = 1, + G = 2, + B = 3 + } + + /// <summary> + /// Floating-point RGB color values. + /// </summary> + struct RgbFloat + { + /// <summary> + /// Red component value. + /// </summary> + public float R { get; } + + /// <summary> + /// Green component value. + /// </summary> + public float G { get; } + + /// <summary> + /// Blue component value. + /// </summary> + public float B { get; } + + /// <summary> + /// Creates a new floating-point RGB value. + /// </summary> + /// <param name="r">Red component value</param> + /// <param name="g">Green component value</param> + /// <param name="b">Blue component value</param> + public RgbFloat(float r, float g, float b) + { + R = r; + G = g; + B = b; + } + } + + /// <summary> + /// Blend microcode destination operand, including swizzle, write mask and condition code update flag. 
+ /// </summary> + struct Dest + { + public static Dest Temp0 => new Dest(OpDst.Temp0, Swizzle.RGB, WriteMask.RGB, false); + public static Dest Temp1 => new Dest(OpDst.Temp1, Swizzle.RGB, WriteMask.RGB, false); + public static Dest Temp2 => new Dest(OpDst.Temp2, Swizzle.RGB, WriteMask.RGB, false); + public static Dest PBR => new Dest(OpDst.PBR, Swizzle.RGB, WriteMask.RGB, false); + + public Dest GBR => new Dest(Dst, Swizzle.GBR, WriteMask, WriteCC); + public Dest RRR => new Dest(Dst, Swizzle.RRR, WriteMask, WriteCC); + public Dest GGG => new Dest(Dst, Swizzle.GGG, WriteMask, WriteCC); + public Dest BBB => new Dest(Dst, Swizzle.BBB, WriteMask, WriteCC); + public Dest RToA => new Dest(Dst, Swizzle.RToA, WriteMask, WriteCC); + + public Dest R => new Dest(Dst, Swizzle, WriteMask.R, WriteCC); + public Dest G => new Dest(Dst, Swizzle, WriteMask.G, WriteCC); + public Dest B => new Dest(Dst, Swizzle, WriteMask.B, WriteCC); + + public Dest CC => new Dest(Dst, Swizzle, WriteMask, true); + + public OpDst Dst { get; } + public Swizzle Swizzle { get; } + public WriteMask WriteMask { get; } + public bool WriteCC { get; } + + /// <summary> + /// Creates a new blend microcode destination operand. + /// </summary> + /// <param name="dst">Operand</param> + /// <param name="swizzle">Swizzle</param> + /// <param name="writeMask">Write mask</param> + /// <param name="writeCC">Indicates if condition codes should be updated</param> + public Dest(OpDst dst, Swizzle swizzle, WriteMask writeMask, bool writeCC) + { + Dst = dst; + Swizzle = swizzle; + WriteMask = writeMask; + WriteCC = writeCC; + } + } + + /// <summary> + /// Blend microcode operation, encoded as a single 32-bit word. + /// </summary> + struct UcodeOp + { + public readonly uint Word; + + /// <summary> + /// Creates a new blend microcode operation, packing the condition code, instruction, constant index, operands and destination into a single word. 
+ /// </summary> + /// <param name="cc">Condition code that controls whether the operation is executed or not</param> + /// <param name="inst">Instruction</param> + /// <param name="constIndex">Index in the constant table of the constant used by any constant operand</param> + /// <param name="dest">Destination operand</param> + /// <param name="srcA">First input operand</param> + /// <param name="srcB">Second input operand</param> + /// <param name="srcC">Third input operand</param> + /// <param name="srcD">Fourth input operand</param> + public UcodeOp(CC cc, Instruction inst, int constIndex, Dest dest, OpAC srcA, OpBD srcB, OpAC srcC, OpBD srcD) + { + Word = (uint)cc | + ((uint)inst << 3) | + ((uint)constIndex << 6) | + ((uint)srcA << 9) | + ((uint)srcB << 12) | + ((uint)srcC << 16) | + ((uint)srcD << 19) | + ((uint)dest.Swizzle << 23) | + ((uint)dest.WriteMask << 26) | + ((uint)dest.Dst << 28) | + (dest.WriteCC ? (1u << 31) : 0); + } + } + + /// <summary> + /// Blend microcode assembler. 
+ /// </summary> + struct UcodeAssembler + { + private List<uint> _code; + private RgbFloat[] _constants; + private int _constantIndex; + + public void Mul(CC cc, Dest dest, OpAC srcA, OpBD srcB) + { + Assemble(cc, Instruction.Mmadd, dest, srcA, srcB, OpAC.SrcRGB, OpBD.ConstantZero); + } + + public void Madd(CC cc, Dest dest, OpAC srcA, OpBD srcB, OpAC srcC) + { + Assemble(cc, Instruction.Mmadd, dest, srcA, srcB, srcC, OpBD.ConstantOne); + } + + public void Mmadd(CC cc, Dest dest, OpAC srcA, OpBD srcB, OpAC srcC, OpBD srcD) + { + Assemble(cc, Instruction.Mmadd, dest, srcA, srcB, srcC, srcD); + } + + public void Mmsub(CC cc, Dest dest, OpAC srcA, OpBD srcB, OpAC srcC, OpBD srcD) + { + Assemble(cc, Instruction.Mmsub, dest, srcA, srcB, srcC, srcD); + } + + public void Min(CC cc, Dest dest, OpAC srcA, OpBD srcB) + { + Assemble(cc, Instruction.Min, dest, srcA, srcB, OpAC.SrcRGB, OpBD.ConstantZero); + } + + public void Max(CC cc, Dest dest, OpAC srcA, OpBD srcB) + { + Assemble(cc, Instruction.Max, dest, srcA, srcB, OpAC.SrcRGB, OpBD.ConstantZero); + } + + public void Rcp(CC cc, Dest dest, OpAC srcA) + { + Assemble(cc, Instruction.Rcp, dest, srcA, OpBD.ConstantZero, OpAC.SrcRGB, OpBD.ConstantZero); + } + + public void Mov(CC cc, Dest dest, OpBD srcB) + { + Assemble(cc, Instruction.Add, dest, OpAC.SrcRGB, srcB, OpAC.SrcRGB, OpBD.ConstantZero); + } + + public void Add(CC cc, Dest dest, OpBD srcB, OpBD srcD) + { + Assemble(cc, Instruction.Add, dest, OpAC.SrcRGB, srcB, OpAC.SrcRGB, srcD); + } + + public void Sub(CC cc, Dest dest, OpBD srcB, OpBD srcD) + { + Assemble(cc, Instruction.Sub, dest, OpAC.SrcRGB, srcB, OpAC.SrcRGB, srcD); + } + + private void Assemble(CC cc, Instruction inst, Dest dest, OpAC srcA, OpBD srcB, OpAC srcC, OpBD srcD) + { + (_code ??= new List<uint>()).Add(new UcodeOp(cc, inst, _constantIndex, dest, srcA, srcB, srcC, srcD).Word); + } + + public void SetConstant(int index, float r, float g, float b) + { + if (_constants == null) + { + _constants = new 
RgbFloat[index + 1]; + } + else if (_constants.Length <= index) + { + Array.Resize(ref _constants, index + 1); + } + + _constants[index] = new RgbFloat(r, g, b); + _constantIndex = index; + } + + public uint[] GetCode() + { + return _code?.ToArray(); + } + + public RgbFloat[] GetConstants() + { + return _constants; + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs index 9b58e0148..ecfd763f6 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs @@ -1,5 +1,6 @@ using Ryujinx.Common.Logging; using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Gpu.Engine.Threed.Blender; using Ryujinx.Graphics.Gpu.Engine.Types; using Ryujinx.Graphics.Gpu.Image; using Ryujinx.Graphics.Gpu.Shader; @@ -26,6 +27,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed private readonly GpuChannel _channel; private readonly DeviceStateWithShadow<ThreedClassState> _state; private readonly DrawState _drawState; + private readonly AdvancedBlendManager _blendManager; private readonly StateUpdateTracker<ThreedClassState> _updateTracker; @@ -55,13 +57,21 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed /// <param name="channel">GPU channel</param> /// <param name="state">3D engine state</param> /// <param name="drawState">Draw state</param> + /// <param name="blendManager">Advanced blend manager</param> /// <param name="spec">Specialization state updater</param> - public StateUpdater(GpuContext context, GpuChannel channel, DeviceStateWithShadow<ThreedClassState> state, DrawState drawState, SpecializationStateUpdater spec) + public StateUpdater( + GpuContext context, + GpuChannel channel, + DeviceStateWithShadow<ThreedClassState> state, + DrawState drawState, + AdvancedBlendManager blendManager, + SpecializationStateUpdater spec) { _context = context; _channel = channel; _state = state; _drawState = drawState; + _blendManager = 
blendManager; _currentProgramInfo = new ShaderProgramInfo[Constants.ShaderStages]; _currentSpecState = spec; @@ -84,6 +94,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed new StateUpdateCallbackEntry(UpdateVertexAttribState, nameof(ThreedClassState.VertexAttribState)), new StateUpdateCallbackEntry(UpdateBlendState, + nameof(ThreedClassState.BlendUcodeEnable), + nameof(ThreedClassState.BlendUcodeSize), nameof(ThreedClassState.BlendIndependent), nameof(ThreedClassState.BlendConstant), nameof(ThreedClassState.BlendStateCommon), @@ -1154,6 +1166,20 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed /// </summary> private void UpdateBlendState() { + if (_state.State.BlendUcodeEnable != BlendUcodeEnable.Disabled) + { + if (_context.Capabilities.SupportsBlendEquationAdvanced && _blendManager.TryGetAdvancedBlend(out var blendDescriptor)) + { + // Try to HLE it using advanced blend on the host if we can. + _context.Renderer.Pipeline.SetBlendState(blendDescriptor); + return; + } + else + { + // TODO: Blend emulation fallback. 
+ } + } + bool blendIndependent = _state.State.BlendIndependent; ColorF blendConstant = _state.State.BlendConstant; diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs index 9a447a0bd..caeee18e5 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClass.cs @@ -2,6 +2,7 @@ using Ryujinx.Graphics.GAL; using Ryujinx.Graphics.Gpu.Engine.GPFifo; using Ryujinx.Graphics.Gpu.Engine.InlineToMemory; +using Ryujinx.Graphics.Gpu.Engine.Threed.Blender; using System; using System.Collections.Generic; using System.Runtime.CompilerServices; @@ -18,6 +19,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed private readonly DeviceStateWithShadow<ThreedClassState> _state; private readonly InlineToMemoryClass _i2mClass; + private readonly AdvancedBlendManager _blendManager; private readonly DrawManager _drawManager; private readonly SemaphoreUpdater _semaphoreUpdater; private readonly ConstantBufferUpdater _cbUpdater; @@ -40,6 +42,8 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed { nameof(ThreedClassState.InvalidateSamplerCacheNoWfi), new RwCallback(InvalidateSamplerCacheNoWfi, null) }, { nameof(ThreedClassState.InvalidateTextureHeaderCacheNoWfi), new RwCallback(InvalidateTextureHeaderCacheNoWfi, null) }, { nameof(ThreedClassState.TextureBarrier), new RwCallback(TextureBarrier, null) }, + { nameof(ThreedClassState.LoadBlendUcodeStart), new RwCallback(LoadBlendUcodeStart, null) }, + { nameof(ThreedClassState.LoadBlendUcodeInstruction), new RwCallback(LoadBlendUcodeInstruction, null) }, { nameof(ThreedClassState.TextureBarrierTiled), new RwCallback(TextureBarrierTiled, null) }, { nameof(ThreedClassState.DrawTextureSrcY), new RwCallback(DrawTexture, null) }, { nameof(ThreedClassState.DrawVertexArrayBeginEndInstanceFirst), new RwCallback(DrawVertexArrayBeginEndInstanceFirst, null) }, @@ -75,9 +79,10 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed var drawState = new 
DrawState(); _drawManager = new DrawManager(context, channel, _state, drawState, spec); + _blendManager = new AdvancedBlendManager(_state); _semaphoreUpdater = new SemaphoreUpdater(context, channel, _state); _cbUpdater = new ConstantBufferUpdater(channel, _state); - _stateUpdater = new StateUpdater(context, channel, _state, drawState, spec); + _stateUpdater = new StateUpdater(context, channel, _state, drawState, _blendManager, spec); // This defaults to "always", even without any register write. // Reads just return 0, regardless of what was set there. @@ -283,6 +288,24 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed _context.Renderer.Pipeline.TextureBarrier(); } + /// <summary> + /// Sets the start offset of the blend microcode in memory. + /// </summary> + /// <param name="argument">Method call argument</param> + private void LoadBlendUcodeStart(int argument) + { + _blendManager.LoadBlendUcodeStart(argument); + } + + /// <summary> + /// Pushes one word of blend microcode. + /// </summary> + /// <param name="argument">Method call argument</param> + private void LoadBlendUcodeInstruction(int argument) + { + _blendManager.LoadBlendUcodeInstruction(argument); + } + /// <summary> /// Issues a texture barrier. 
+ /// This waits until previous texture writes from the GPU to finish, before diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClassState.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClassState.cs index 1498e27ba..8f26f38ff 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClassState.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/ThreedClassState.cs @@ -5,6 +5,7 @@ using Ryujinx.Graphics.Gpu.Engine.Types; using Ryujinx.Graphics.Gpu.Image; using Ryujinx.Graphics.Shader; using System; +using System.Runtime.CompilerServices; namespace Ryujinx.Graphics.Gpu.Engine.Threed { @@ -214,6 +215,17 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed #pragma warning restore CS0649 } + /// <summary> + /// Indicates whether the blend microcode processes RGB and alpha components. + /// </summary> + enum BlendUcodeEnable + { + Disabled = 0, + EnableRGB = 1, + EnableAlpha = 2, + EnableRGBA = 3 + } + /// <summary> /// Scissor state. /// </summary> @@ -434,6 +446,49 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed TriangleRastFlip = 1 << 4 } + /// <summary> + /// RGB color components packed as 16-bit float values; only the low 16 bits of each 32-bit field are read. + /// </summary> + struct RgbHalf + { +#pragma warning disable CS0649 + public uint R; + public uint G; + public uint B; + public uint Padding; +#pragma warning restore CS0649 + + /// <summary> + /// Unpacks the red color component as a 16-bit float value. + /// </summary> + /// <returns>The component value</returns> + public Half UnpackR() + { + ushort value = (ushort)R; + return Unsafe.As<ushort, Half>(ref value); + } + + /// <summary> + /// Unpacks the green color component as a 16-bit float value. + /// </summary> + /// <returns>The component value</returns> + public Half UnpackG() + { + ushort value = (ushort)G; + return Unsafe.As<ushort, Half>(ref value); + } + + /// <summary> + /// Unpacks the blue color component as a 16-bit float value. 
+ /// </summary> + /// <returns>The component value</returns> + public Half UnpackB() + { + ushort value = (ushort)B; + return Unsafe.As<ushort, Half>(ref value); + } + } + /// <summary> /// Condition for conditional rendering. /// </summary> @@ -752,7 +807,9 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed public Boolean32 EarlyZForce; public fixed uint Reserved214[45]; public uint SyncpointAction; - public fixed uint Reserved2CC[21]; + public fixed uint Reserved2CC[10]; + public uint BlendUcodeNormalizedDst; + public fixed uint Reserved2F8[10]; public TessMode TessMode; public Array4<float> TessOuterLevel; public Array2<float> TessInnerLevel; @@ -781,11 +838,16 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed public fixed uint ReservedDB8[2]; public DepthBiasState DepthBiasState; public int PatchVertices; - public fixed uint ReservedDD0[4]; + public BlendUcodeEnable BlendUcodeEnable; + public uint BlendUcodeSize; + public fixed uint ReservedDD8[2]; public uint TextureBarrier; public uint WatchdogTimer; public Boolean32 PrimitiveRestartDrawArrays; - public fixed uint ReservedDEC[5]; + public uint ReservedDEC; + public uint LoadBlendUcodeStart; + public uint LoadBlendUcodeInstruction; + public fixed uint ReservedDF8[2]; public Array16<ScissorState> ScissorState; public fixed uint ReservedF00[21]; public StencilBackMasks StencilBackMasks; @@ -850,7 +912,9 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed public fixed uint Reserved142C[2]; public uint FirstVertex; public uint FirstInstance; - public fixed uint Reserved143C[53]; + public fixed uint Reserved143C[17]; + public Array8<RgbHalf> BlendUcodeConstants; + public fixed uint Reserved1500[4]; public uint ClipDistanceEnable; public uint Reserved1514; public float PointSize; diff --git a/Ryujinx.Graphics.OpenGL/EnumConversion.cs b/Ryujinx.Graphics.OpenGL/EnumConversion.cs index f262c584c..c9a1eaa93 100644 --- a/Ryujinx.Graphics.OpenGL/EnumConversion.cs +++ b/Ryujinx.Graphics.OpenGL/EnumConversion.cs @@ -34,6 +34,126 @@ 
namespace Ryujinx.Graphics.OpenGL return TextureWrapMode.Clamp; } + public static NvBlendEquationAdvanced Convert(this AdvancedBlendOp op) + { + switch (op) + { + case AdvancedBlendOp.Zero: + return NvBlendEquationAdvanced.Zero; + case AdvancedBlendOp.Src: + return NvBlendEquationAdvanced.SrcNv; + case AdvancedBlendOp.Dst: + return NvBlendEquationAdvanced.DstNv; + case AdvancedBlendOp.SrcOver: + return NvBlendEquationAdvanced.SrcOverNv; + case AdvancedBlendOp.DstOver: + return NvBlendEquationAdvanced.DstOverNv; + case AdvancedBlendOp.SrcIn: + return NvBlendEquationAdvanced.SrcInNv; + case AdvancedBlendOp.DstIn: + return NvBlendEquationAdvanced.DstInNv; + case AdvancedBlendOp.SrcOut: + return NvBlendEquationAdvanced.SrcOutNv; + case AdvancedBlendOp.DstOut: + return NvBlendEquationAdvanced.DstOutNv; + case AdvancedBlendOp.SrcAtop: + return NvBlendEquationAdvanced.SrcAtopNv; + case AdvancedBlendOp.DstAtop: + return NvBlendEquationAdvanced.DstAtopNv; + case AdvancedBlendOp.Xor: + return NvBlendEquationAdvanced.XorNv; + case AdvancedBlendOp.Plus: + return NvBlendEquationAdvanced.PlusNv; + case AdvancedBlendOp.PlusClamped: + return NvBlendEquationAdvanced.PlusClampedNv; + case AdvancedBlendOp.PlusClampedAlpha: + return NvBlendEquationAdvanced.PlusClampedAlphaNv; + case AdvancedBlendOp.PlusDarker: + return NvBlendEquationAdvanced.PlusDarkerNv; + case AdvancedBlendOp.Multiply: + return NvBlendEquationAdvanced.MultiplyNv; + case AdvancedBlendOp.Screen: + return NvBlendEquationAdvanced.ScreenNv; + case AdvancedBlendOp.Overlay: + return NvBlendEquationAdvanced.OverlayNv; + case AdvancedBlendOp.Darken: + return NvBlendEquationAdvanced.DarkenNv; + case AdvancedBlendOp.Lighten: + return NvBlendEquationAdvanced.LightenNv; + case AdvancedBlendOp.ColorDodge: + return NvBlendEquationAdvanced.ColordodgeNv; + case AdvancedBlendOp.ColorBurn: + return NvBlendEquationAdvanced.ColorburnNv; + case AdvancedBlendOp.HardLight: + return NvBlendEquationAdvanced.HardlightNv; + case 
AdvancedBlendOp.SoftLight: + return NvBlendEquationAdvanced.SoftlightNv; + case AdvancedBlendOp.Difference: + return NvBlendEquationAdvanced.DifferenceNv; + case AdvancedBlendOp.Minus: + return NvBlendEquationAdvanced.MinusNv; + case AdvancedBlendOp.MinusClamped: + return NvBlendEquationAdvanced.MinusClampedNv; + case AdvancedBlendOp.Exclusion: + return NvBlendEquationAdvanced.ExclusionNv; + case AdvancedBlendOp.Contrast: + return NvBlendEquationAdvanced.ContrastNv; + case AdvancedBlendOp.Invert: + return NvBlendEquationAdvanced.Invert; + case AdvancedBlendOp.InvertRGB: + return NvBlendEquationAdvanced.InvertRgbNv; + case AdvancedBlendOp.InvertOvg: + return NvBlendEquationAdvanced.InvertOvgNv; + case AdvancedBlendOp.LinearDodge: + return NvBlendEquationAdvanced.LineardodgeNv; + case AdvancedBlendOp.LinearBurn: + return NvBlendEquationAdvanced.LinearburnNv; + case AdvancedBlendOp.VividLight: + return NvBlendEquationAdvanced.VividlightNv; + case AdvancedBlendOp.LinearLight: + return NvBlendEquationAdvanced.LinearlightNv; + case AdvancedBlendOp.PinLight: + return NvBlendEquationAdvanced.PinlightNv; + case AdvancedBlendOp.HardMix: + return NvBlendEquationAdvanced.HardmixNv; + case AdvancedBlendOp.Red: + return NvBlendEquationAdvanced.RedNv; + case AdvancedBlendOp.Green: + return NvBlendEquationAdvanced.GreenNv; + case AdvancedBlendOp.Blue: + return NvBlendEquationAdvanced.BlueNv; + case AdvancedBlendOp.HslHue: + return NvBlendEquationAdvanced.HslHueNv; + case AdvancedBlendOp.HslSaturation: + return NvBlendEquationAdvanced.HslSaturationNv; + case AdvancedBlendOp.HslColor: + return NvBlendEquationAdvanced.HslColorNv; + case AdvancedBlendOp.HslLuminosity: + return NvBlendEquationAdvanced.HslLuminosityNv; + } + + Logger.Debug?.Print(LogClass.Gpu, $"Invalid {nameof(AdvancedBlendOp)} enum value: {op}."); + + return NvBlendEquationAdvanced.Zero; + } + + public static All Convert(this AdvancedBlendOverlap overlap) + { + switch (overlap) + { + case 
AdvancedBlendOverlap.Uncorrelated: + return All.UncorrelatedNv; + case AdvancedBlendOverlap.Disjoint: + return All.DisjointNv; + case AdvancedBlendOverlap.Conjoint: + return All.ConjointNv; + } + + Logger.Debug?.Print(LogClass.Gpu, $"Invalid {nameof(AdvancedBlendOverlap)} enum value: {overlap}."); + + return All.UncorrelatedNv; + } + public static All Convert(this BlendFactor factor) { switch (factor) diff --git a/Ryujinx.Graphics.OpenGL/HwCapabilities.cs b/Ryujinx.Graphics.OpenGL/HwCapabilities.cs index 8caf11dd5..846465260 100644 --- a/Ryujinx.Graphics.OpenGL/HwCapabilities.cs +++ b/Ryujinx.Graphics.OpenGL/HwCapabilities.cs @@ -7,6 +7,7 @@ namespace Ryujinx.Graphics.OpenGL { private static readonly Lazy<bool> _supportsAlphaToCoverageDitherControl = new Lazy<bool>(() => HasExtension("GL_NV_alpha_to_coverage_dither_control")); private static readonly Lazy<bool> _supportsAstcCompression = new Lazy<bool>(() => HasExtension("GL_KHR_texture_compression_astc_ldr")); + private static readonly Lazy<bool> _supportsBlendEquationAdvanced = new Lazy<bool>(() => HasExtension("GL_NV_blend_equation_advanced")); private static readonly Lazy<bool> _supportsDrawTexture = new Lazy<bool>(() => HasExtension("GL_NV_draw_texture")); private static readonly Lazy<bool> _supportsFragmentShaderInterlock = new Lazy<bool>(() => HasExtension("GL_ARB_fragment_shader_interlock")); private static readonly Lazy<bool> _supportsFragmentShaderOrdering = new Lazy<bool>(() => HasExtension("GL_INTEL_fragment_shader_ordering")); @@ -51,6 +52,7 @@ namespace Ryujinx.Graphics.OpenGL public static bool SupportsAlphaToCoverageDitherControl => _supportsAlphaToCoverageDitherControl.Value; public static bool SupportsAstcCompression => _supportsAstcCompression.Value; + public static bool SupportsBlendEquationAdvanced => _supportsBlendEquationAdvanced.Value; public static bool SupportsDrawTexture => _supportsDrawTexture.Value; public static bool SupportsFragmentShaderInterlock => 
_supportsFragmentShaderInterlock.Value; public static bool SupportsFragmentShaderOrdering => _supportsFragmentShaderOrdering.Value; diff --git a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs index 30ed942d3..722c4b4da 100644 --- a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs +++ b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs @@ -119,6 +119,7 @@ namespace Ryujinx.Graphics.OpenGL supportsR4G4B4A4Format: true, supportsSnormBufferTextureFormat: false, supports5BitComponentFormat: true, + supportsBlendEquationAdvanced: HwCapabilities.SupportsBlendEquationAdvanced, supportsFragmentShaderInterlock: HwCapabilities.SupportsFragmentShaderInterlock, supportsFragmentShaderOrderingIntel: HwCapabilities.SupportsFragmentShaderOrdering, supportsGeometryShaderPassthrough: HwCapabilities.SupportsGeometryShaderPassthrough, diff --git a/Ryujinx.Graphics.OpenGL/Pipeline.cs b/Ryujinx.Graphics.OpenGL/Pipeline.cs index 8bcaf4c77..970feea0c 100644 --- a/Ryujinx.Graphics.OpenGL/Pipeline.cs +++ b/Ryujinx.Graphics.OpenGL/Pipeline.cs @@ -59,6 +59,7 @@ namespace Ryujinx.Graphics.OpenGL private uint _fragmentOutputMap; private uint _componentMasks; private uint _currentComponentMasks; + private bool _advancedBlendEnable; private uint _scissorEnables; @@ -784,8 +785,26 @@ namespace Ryujinx.Graphics.OpenGL GL.Enable(EnableCap.AlphaTest); } + public void SetBlendState(AdvancedBlendDescriptor blend) + { + if (HwCapabilities.SupportsBlendEquationAdvanced) + { + GL.BlendEquation((BlendEquationMode)blend.Op.Convert()); + GL.NV.BlendParameter(NvBlendEquationAdvanced.BlendOverlapNv, (int)blend.Overlap.Convert()); + GL.NV.BlendParameter(NvBlendEquationAdvanced.BlendPremultipliedSrcNv, blend.SrcPreMultiplied ? 
1 : 0); + GL.Enable(EnableCap.Blend); + _advancedBlendEnable = true; + } + } + public void SetBlendState(int index, BlendDescriptor blend) { + if (_advancedBlendEnable) + { + GL.Disable(EnableCap.Blend); + _advancedBlendEnable = false; + } + if (!blend.Enable) { GL.Disable(IndexedEnableCap.Blend, index); diff --git a/Ryujinx.Graphics.Vulkan/EnumConversion.cs b/Ryujinx.Graphics.Vulkan/EnumConversion.cs index 0164ef85c..6c273b050 100644 --- a/Ryujinx.Graphics.Vulkan/EnumConversion.cs +++ b/Ryujinx.Graphics.Vulkan/EnumConversion.cs @@ -79,6 +79,60 @@ namespace Ryujinx.Graphics.Vulkan }; } + public static Silk.NET.Vulkan.BlendOp Convert(this GAL.AdvancedBlendOp op) + { + return op switch + { + GAL.AdvancedBlendOp.Zero => Silk.NET.Vulkan.BlendOp.ZeroExt, + GAL.AdvancedBlendOp.Src => Silk.NET.Vulkan.BlendOp.SrcExt, + GAL.AdvancedBlendOp.Dst => Silk.NET.Vulkan.BlendOp.DstExt, + GAL.AdvancedBlendOp.SrcOver => Silk.NET.Vulkan.BlendOp.SrcOverExt, + GAL.AdvancedBlendOp.DstOver => Silk.NET.Vulkan.BlendOp.DstOverExt, + GAL.AdvancedBlendOp.SrcIn => Silk.NET.Vulkan.BlendOp.SrcInExt, + GAL.AdvancedBlendOp.DstIn => Silk.NET.Vulkan.BlendOp.DstInExt, + GAL.AdvancedBlendOp.SrcOut => Silk.NET.Vulkan.BlendOp.SrcOutExt, + GAL.AdvancedBlendOp.DstOut => Silk.NET.Vulkan.BlendOp.DstOutExt, + GAL.AdvancedBlendOp.SrcAtop => Silk.NET.Vulkan.BlendOp.SrcAtopExt, + GAL.AdvancedBlendOp.DstAtop => Silk.NET.Vulkan.BlendOp.DstAtopExt, + GAL.AdvancedBlendOp.Xor => Silk.NET.Vulkan.BlendOp.XorExt, + GAL.AdvancedBlendOp.Plus => Silk.NET.Vulkan.BlendOp.PlusExt, + GAL.AdvancedBlendOp.PlusClamped => Silk.NET.Vulkan.BlendOp.PlusClampedExt, + GAL.AdvancedBlendOp.PlusClampedAlpha => Silk.NET.Vulkan.BlendOp.PlusClampedAlphaExt, + GAL.AdvancedBlendOp.PlusDarker => Silk.NET.Vulkan.BlendOp.PlusDarkerExt, + GAL.AdvancedBlendOp.Multiply => Silk.NET.Vulkan.BlendOp.MultiplyExt, + GAL.AdvancedBlendOp.Screen => Silk.NET.Vulkan.BlendOp.ScreenExt, + GAL.AdvancedBlendOp.Overlay => Silk.NET.Vulkan.BlendOp.OverlayExt, + 
GAL.AdvancedBlendOp.Darken => Silk.NET.Vulkan.BlendOp.DarkenExt, + GAL.AdvancedBlendOp.Lighten => Silk.NET.Vulkan.BlendOp.LightenExt, + GAL.AdvancedBlendOp.ColorDodge => Silk.NET.Vulkan.BlendOp.ColordodgeExt, + GAL.AdvancedBlendOp.ColorBurn => Silk.NET.Vulkan.BlendOp.ColorburnExt, + GAL.AdvancedBlendOp.HardLight => Silk.NET.Vulkan.BlendOp.HardlightExt, + GAL.AdvancedBlendOp.SoftLight => Silk.NET.Vulkan.BlendOp.SoftlightExt, + GAL.AdvancedBlendOp.Difference => Silk.NET.Vulkan.BlendOp.DifferenceExt, + GAL.AdvancedBlendOp.Minus => Silk.NET.Vulkan.BlendOp.MinusExt, + GAL.AdvancedBlendOp.MinusClamped => Silk.NET.Vulkan.BlendOp.MinusClampedExt, + GAL.AdvancedBlendOp.Exclusion => Silk.NET.Vulkan.BlendOp.ExclusionExt, + GAL.AdvancedBlendOp.Contrast => Silk.NET.Vulkan.BlendOp.ContrastExt, + GAL.AdvancedBlendOp.Invert => Silk.NET.Vulkan.BlendOp.InvertExt, + GAL.AdvancedBlendOp.InvertRGB => Silk.NET.Vulkan.BlendOp.InvertRgbExt, + GAL.AdvancedBlendOp.InvertOvg => Silk.NET.Vulkan.BlendOp.InvertOvgExt, + GAL.AdvancedBlendOp.LinearDodge => Silk.NET.Vulkan.BlendOp.LineardodgeExt, + GAL.AdvancedBlendOp.LinearBurn => Silk.NET.Vulkan.BlendOp.LinearburnExt, + GAL.AdvancedBlendOp.VividLight => Silk.NET.Vulkan.BlendOp.VividlightExt, + GAL.AdvancedBlendOp.LinearLight => Silk.NET.Vulkan.BlendOp.LinearlightExt, + GAL.AdvancedBlendOp.PinLight => Silk.NET.Vulkan.BlendOp.PinlightExt, + GAL.AdvancedBlendOp.HardMix => Silk.NET.Vulkan.BlendOp.HardmixExt, + GAL.AdvancedBlendOp.Red => Silk.NET.Vulkan.BlendOp.RedExt, + GAL.AdvancedBlendOp.Green => Silk.NET.Vulkan.BlendOp.GreenExt, + GAL.AdvancedBlendOp.Blue => Silk.NET.Vulkan.BlendOp.BlueExt, + GAL.AdvancedBlendOp.HslHue => Silk.NET.Vulkan.BlendOp.HslHueExt, + GAL.AdvancedBlendOp.HslSaturation => Silk.NET.Vulkan.BlendOp.HslSaturationExt, + GAL.AdvancedBlendOp.HslColor => Silk.NET.Vulkan.BlendOp.HslColorExt, + GAL.AdvancedBlendOp.HslLuminosity => Silk.NET.Vulkan.BlendOp.HslLuminosityExt, + _ => LogInvalidAndReturn(op, nameof(GAL.AdvancedBlendOp), 
Silk.NET.Vulkan.BlendOp.Add) + }; + } + public static Silk.NET.Vulkan.BlendOp Convert(this GAL.BlendOp op) { return op switch @@ -92,6 +146,17 @@ namespace Ryujinx.Graphics.Vulkan }; } + public static Silk.NET.Vulkan.BlendOverlapEXT Convert(this GAL.AdvancedBlendOverlap overlap) + { + return overlap switch + { + GAL.AdvancedBlendOverlap.Uncorrelated => Silk.NET.Vulkan.BlendOverlapEXT.UncorrelatedExt, + GAL.AdvancedBlendOverlap.Disjoint => Silk.NET.Vulkan.BlendOverlapEXT.DisjointExt, + GAL.AdvancedBlendOverlap.Conjoint => Silk.NET.Vulkan.BlendOverlapEXT.ConjointExt, + _ => LogInvalidAndReturn(overlap, nameof(GAL.AdvancedBlendOverlap), Silk.NET.Vulkan.BlendOverlapEXT.UncorrelatedExt) + }; + } + public static Silk.NET.Vulkan.CompareOp Convert(this GAL.CompareOp op) { return op switch diff --git a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs index 1ed2b0ccc..4512d375f 100644 --- a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs +++ b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs @@ -18,6 +18,10 @@ namespace Ryujinx.Graphics.Vulkan { public readonly bool SupportsIndexTypeUint8; public readonly bool SupportsCustomBorderColor; + public readonly bool SupportsBlendEquationAdvanced; + public readonly bool SupportsBlendEquationAdvancedCorrelatedOverlap; + public readonly bool SupportsBlendEquationAdvancedNonPreMultipliedSrcColor; + public readonly bool SupportsBlendEquationAdvancedNonPreMultipliedDstColor; public readonly bool SupportsIndirectParameters; public readonly bool SupportsFragmentShaderInterlock; public readonly bool SupportsGeometryShaderPassthrough; @@ -44,6 +48,10 @@ namespace Ryujinx.Graphics.Vulkan public HardwareCapabilities( bool supportsIndexTypeUint8, bool supportsCustomBorderColor, + bool supportsBlendEquationAdvanced, + bool supportsBlendEquationAdvancedCorrelatedOverlap, + bool supportsBlendEquationAdvancedNonPreMultipliedSrcColor, + bool supportsBlendEquationAdvancedNonPreMultipliedDstColor, bool 
supportsIndirectParameters, bool supportsFragmentShaderInterlock, bool supportsGeometryShaderPassthrough, @@ -69,6 +77,10 @@ namespace Ryujinx.Graphics.Vulkan { SupportsIndexTypeUint8 = supportsIndexTypeUint8; SupportsCustomBorderColor = supportsCustomBorderColor; + SupportsBlendEquationAdvanced = supportsBlendEquationAdvanced; + SupportsBlendEquationAdvancedCorrelatedOverlap = supportsBlendEquationAdvancedCorrelatedOverlap; + SupportsBlendEquationAdvancedNonPreMultipliedSrcColor = supportsBlendEquationAdvancedNonPreMultipliedSrcColor; + SupportsBlendEquationAdvancedNonPreMultipliedDstColor = supportsBlendEquationAdvancedNonPreMultipliedDstColor; SupportsIndirectParameters = supportsIndirectParameters; SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock; SupportsGeometryShaderPassthrough = supportsGeometryShaderPassthrough; diff --git a/Ryujinx.Graphics.Vulkan/PipelineBase.cs b/Ryujinx.Graphics.Vulkan/PipelineBase.cs index 8ed39ee26..f779305db 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineBase.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineBase.cs @@ -112,11 +112,9 @@ namespace Ryujinx.Graphics.Vulkan var defaultScale = new Vector4<float> { X = 1f, Y = 0f, Z = 0f, W = 0f }; new Span<Vector4<float>>(_renderScale).Fill(defaultScale); - _newState.Initialize(); - _newState.LineWidth = 1f; - _newState.SamplesCount = 1; + _storedBlend = new PipelineColorBlendAttachmentState[Constants.MaxRenderTargets]; - _storedBlend = new PipelineColorBlendAttachmentState[8]; + _newState.Initialize(); } public void Initialize() @@ -676,6 +674,49 @@ namespace Ryujinx.Graphics.Vulkan // to avoid creating one version of the shader per reference value used. 
} + public void SetBlendState(AdvancedBlendDescriptor blend) + { + for (int index = 0; index < Constants.MaxRenderTargets; index++) + { + ref var vkBlend = ref _newState.Internal.ColorBlendAttachmentState[index]; + + if (index == 0) + { + var blendOp = blend.Op.Convert(); + + vkBlend = new PipelineColorBlendAttachmentState( + blendEnable: true, + colorBlendOp: blendOp, + alphaBlendOp: blendOp, + colorWriteMask: vkBlend.ColorWriteMask); + + if (Gd.Capabilities.SupportsBlendEquationAdvancedNonPreMultipliedSrcColor) + { + _newState.AdvancedBlendSrcPreMultiplied = blend.SrcPreMultiplied; + } + + if (Gd.Capabilities.SupportsBlendEquationAdvancedCorrelatedOverlap) + { + _newState.AdvancedBlendOverlap = blend.Overlap.Convert(); + } + } + else + { + vkBlend = new PipelineColorBlendAttachmentState( + colorWriteMask: vkBlend.ColorWriteMask); + } + + if (vkBlend.ColorWriteMask == 0) + { + _storedBlend[index] = vkBlend; + + vkBlend = new PipelineColorBlendAttachmentState(); + } + } + + SignalStateChange(); + } + public void SetBlendState(int index, BlendDescriptor blend) { ref var vkBlend = ref _newState.Internal.ColorBlendAttachmentState[index]; @@ -709,6 +750,11 @@ namespace Ryujinx.Graphics.Vulkan blend.BlendConstant.Blue, blend.BlendConstant.Alpha); + // Reset advanced blend state back defaults to the cache to help the pipeline cache. + _newState.AdvancedBlendSrcPreMultiplied = true; + _newState.AdvancedBlendDstPreMultiplied = true; + _newState.AdvancedBlendOverlap = BlendOverlapEXT.UncorrelatedExt; + SignalStateChange(); } diff --git a/Ryujinx.Graphics.Vulkan/PipelineState.cs b/Ryujinx.Graphics.Vulkan/PipelineState.cs index 00b154a06..0d5494766 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineState.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineState.cs @@ -285,6 +285,24 @@ namespace Ryujinx.Graphics.Vulkan set => Internal.Id9 = (Internal.Id9 & 0xFFFFFFFFFFFFFFFD) | ((value ? 
1UL : 0UL) << 1); } + public bool AdvancedBlendSrcPreMultiplied + { + get => ((Internal.Id9 >> 2) & 0x1) != 0UL; + set => Internal.Id9 = (Internal.Id9 & 0xFFFFFFFFFFFFFFFB) | ((value ? 1UL : 0UL) << 2); + } + + public bool AdvancedBlendDstPreMultiplied + { + get => ((Internal.Id9 >> 3) & 0x1) != 0UL; + set => Internal.Id9 = (Internal.Id9 & 0xFFFFFFFFFFFFFFF7) | ((value ? 1UL : 0UL) << 3); + } + + public BlendOverlapEXT AdvancedBlendOverlap + { + get => (BlendOverlapEXT)((Internal.Id9 >> 4) & 0x3); + set => Internal.Id9 = (Internal.Id9 & 0xFFFFFFFFFFFFFFCF) | ((ulong)value << 4); + } + public NativeArray<PipelineShaderStageCreateInfo> Stages; public NativeArray<PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT> StageRequiredSubgroupSizes; public PipelineLayout PipelineLayout; @@ -303,6 +321,13 @@ namespace Ryujinx.Graphics.Vulkan RequiredSubgroupSize = RequiredSubgroupSize }; } + + AdvancedBlendSrcPreMultiplied = true; + AdvancedBlendDstPreMultiplied = true; + AdvancedBlendOverlap = BlendOverlapEXT.UncorrelatedExt; + + LineWidth = 1f; + SamplesCount = 1; } public unsafe Auto<DisposablePipeline> CreateComputePipeline( @@ -486,6 +511,23 @@ namespace Ryujinx.Graphics.Vulkan PAttachments = pColorBlendAttachmentState }; + PipelineColorBlendAdvancedStateCreateInfoEXT colorBlendAdvancedState; + + if (!AdvancedBlendSrcPreMultiplied || + !AdvancedBlendDstPreMultiplied || + AdvancedBlendOverlap != BlendOverlapEXT.UncorrelatedExt) + { + colorBlendAdvancedState = new PipelineColorBlendAdvancedStateCreateInfoEXT() + { + SType = StructureType.PipelineColorBlendAdvancedStateCreateInfoExt, + SrcPremultiplied = AdvancedBlendSrcPreMultiplied, + DstPremultiplied = AdvancedBlendDstPreMultiplied, + BlendOverlap = AdvancedBlendOverlap + }; + + colorBlendState.PNext = &colorBlendAdvancedState; + } + bool supportsExtDynamicState = gd.Capabilities.SupportsExtendedDynamicState; int dynamicStatesCount = supportsExtDynamicState ? 
9 : 8; diff --git a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs index 4401f032d..353b219ac 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs @@ -27,6 +27,7 @@ namespace Ryujinx.Graphics.Vulkan ExtTransformFeedback.ExtensionName, KhrDrawIndirectCount.ExtensionName, KhrPushDescriptor.ExtensionName, + "VK_EXT_blend_operation_advanced", "VK_EXT_custom_border_color", "VK_EXT_descriptor_indexing", // Enabling this works around an issue with disposed buffer bindings on RADV. "VK_EXT_fragment_shader_interlock", diff --git a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs index a7b4b41a7..4c7c731be 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs @@ -149,6 +149,19 @@ namespace Ryujinx.Graphics.Vulkan SType = StructureType.PhysicalDeviceProperties2 }; + PhysicalDeviceBlendOperationAdvancedPropertiesEXT propertiesBlendOperationAdvanced = new PhysicalDeviceBlendOperationAdvancedPropertiesEXT() + { + SType = StructureType.PhysicalDeviceBlendOperationAdvancedPropertiesExt + }; + + bool supportsBlendOperationAdvanced = supportedExtensions.Contains("VK_EXT_blend_operation_advanced"); + + if (supportsBlendOperationAdvanced) + { + propertiesBlendOperationAdvanced.PNext = properties2.PNext; + properties2.PNext = &propertiesBlendOperationAdvanced; + } + PhysicalDeviceSubgroupSizeControlPropertiesEXT propertiesSubgroupSizeControl = new PhysicalDeviceSubgroupSizeControlPropertiesEXT() { SType = StructureType.PhysicalDeviceSubgroupSizeControlPropertiesExt @@ -246,9 +259,9 @@ namespace Ryujinx.Graphics.Vulkan portabilityFlags |= featuresPortabilitySubset.SamplerMipLodBias ? 
0 : PortabilitySubsetFlags.NoLodBias; } - bool customBorderColorSupported = supportedExtensions.Contains("VK_EXT_custom_border_color") && - featuresCustomBorderColor.CustomBorderColors && - featuresCustomBorderColor.CustomBorderColorWithoutFormat; + bool supportsCustomBorderColor = supportedExtensions.Contains("VK_EXT_custom_border_color") && + featuresCustomBorderColor.CustomBorderColors && + featuresCustomBorderColor.CustomBorderColorWithoutFormat; ref var properties = ref properties2.Properties; @@ -259,7 +272,11 @@ namespace Ryujinx.Graphics.Vulkan Capabilities = new HardwareCapabilities( supportedExtensions.Contains("VK_EXT_index_type_uint8"), - customBorderColorSupported, + supportsCustomBorderColor, + supportsBlendOperationAdvanced, + propertiesBlendOperationAdvanced.AdvancedBlendCorrelatedOverlap, + propertiesBlendOperationAdvanced.AdvancedBlendNonPremultipliedSrcColor, + propertiesBlendOperationAdvanced.AdvancedBlendNonPremultipliedDstColor, supportedExtensions.Contains(KhrDrawIndirectCount.ExtensionName), supportedExtensions.Contains("VK_EXT_fragment_shader_interlock"), supportedExtensions.Contains("VK_NV_geometry_shader_passthrough"), @@ -526,6 +543,7 @@ namespace Ryujinx.Graphics.Vulkan supportsR4G4B4A4Format: supportsR4G4B4A4Format, supportsSnormBufferTextureFormat: true, supports5BitComponentFormat: supports5BitComponentFormat, + supportsBlendEquationAdvanced: Capabilities.SupportsBlendEquationAdvanced, supportsFragmentShaderInterlock: Capabilities.SupportsFragmentShaderInterlock, supportsFragmentShaderOrderingIntel: false, supportsGeometryShaderPassthrough: Capabilities.SupportsGeometryShaderPassthrough, From 58d7a1fe9747f673b0c0399581730616681f015c Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Tue, 21 Feb 2023 06:40:23 -0300 Subject: [PATCH 17/41] Mark texture as modified and sync on I2M fast path (#4449) --- .../Engine/InlineToMemory/InlineToMemoryClass.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/Ryujinx.Graphics.Gpu/Engine/InlineToMemory/InlineToMemoryClass.cs b/Ryujinx.Graphics.Gpu/Engine/InlineToMemory/InlineToMemoryClass.cs index f6effe2ed..e1d7e9407 100644 --- a/Ryujinx.Graphics.Gpu/Engine/InlineToMemory/InlineToMemoryClass.cs +++ b/Ryujinx.Graphics.Gpu/Engine/InlineToMemory/InlineToMemoryClass.cs @@ -197,7 +197,9 @@ namespace Ryujinx.Graphics.Gpu.Engine.InlineToMemory if (target != null) { + target.SynchronizeMemory(); target.SetData(data, 0, 0, new GAL.Rectangle<int>(_dstX, _dstY, _lineLengthIn / target.Info.FormatInfo.BytesPerPixel, _lineCount)); + target.SignalModified(); return; } From fc43aecbbd37a83ebd03f8cfe8fbc033ce2bda7d Mon Sep 17 00:00:00 2001 From: riperiperi <rhy3756547@hotmail.com> Date: Tue, 21 Feb 2023 09:53:38 +0000 Subject: [PATCH 18/41] Memory: Faster Split for NonOverlappingRangeList (#4451) I noticed that in Xenoblade 2, the game can end up spending a lot of time adding and removing tracking handles. One of the main causes of this is actually splitting existing handles, which does the following: - Remove existing handle from list - Update existing handle to end at split address, create new handle starting at split address - Add updated handle (left) to list - Add new handle (right) to list This costs 1 deletion and 2 insertions. When there are more handles, this gets a lot more expensive, as insertions are done by copying all values to the right, and deletions by copying values to the left. This PR simply allows it to look up the handle being split, and replace its entry with the new end address without insertion or deletion. This makes a split only cost one insertion and a binary search lookup (very cheap). This isn't all of the cost on Xenoblade 2, but it does significantly reduce it. There might be something else to this - we could find a way to reduce the handle count for the game (merging on deletion? 
buffer deletion?), we could use a different structure for virtual regions, as the current one is optimal for buffer lookups which nearly always read, memory tracking has more of a balance between read/write. That's for a later date though, this was an easy improvement. --- .../Range/NonOverlappingRangeList.cs | 4 +- Ryujinx.Memory/Range/RangeList.cs | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/Ryujinx.Memory/Range/NonOverlappingRangeList.cs b/Ryujinx.Memory/Range/NonOverlappingRangeList.cs index 9a8f84dd6..60b2b3784 100644 --- a/Ryujinx.Memory/Range/NonOverlappingRangeList.cs +++ b/Ryujinx.Memory/Range/NonOverlappingRangeList.cs @@ -97,10 +97,8 @@ namespace Ryujinx.Memory.Range /// <returns>The new region (high part)</returns> private T Split(T region, ulong splitAddress) { - Remove(region); - T newRegion = (T)region.Split(splitAddress); - Add(region); + Update(region); Add(newRegion); return newRegion; } diff --git a/Ryujinx.Memory/Range/RangeList.cs b/Ryujinx.Memory/Range/RangeList.cs index 7278e7eb4..469195973 100644 --- a/Ryujinx.Memory/Range/RangeList.cs +++ b/Ryujinx.Memory/Range/RangeList.cs @@ -67,6 +67,43 @@ namespace Ryujinx.Memory.Range Insert(index, new RangeItem<T>(item)); } + /// <summary> + /// Updates an item's end address on the list. Address must be the same.
+ /// </summary> + /// <param name="item">The item to be updated</param> + /// <returns>True if the item was located and updated, false otherwise</returns> + public bool Update(T item) + { + int index = BinarySearch(item.Address); + + if (index >= 0) + { + while (index > 0 && _items[index - 1].Address == item.Address) + { + index--; + } + + while (index < Count) + { + if (_items[index].Value.Equals(item)) + { + _items[index] = new RangeItem<T>(item); + + return true; + } + + if (_items[index].Address > item.Address) + { + break; + } + + index++; + } + } + + return false; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private void Insert(int index, RangeItem<T> item) { From edfd4d70c0f38d41c6ebb31508127b14727017bd Mon Sep 17 00:00:00 2001 From: Logan Stromberg <loganstromberg@gmail.com> Date: Tue, 21 Feb 2023 02:44:57 -0800 Subject: [PATCH 19/41] Use SIMD acceleration for audio upsampler (#4410) * Use SIMD acceleration for audio upsampler filter kernel for a moderate speedup * Address formatting. Implement AVX2 fast path for high quality resampling in ResamplerHelper * now really, are we really getting the benefit of inlining 50+ line methods? * adding unit tests for resampler + upsampler. The upsampler ones fail for some reason * Fixing upsampler test. 
Apparently this algo only works at specific ratios --------- Co-authored-by: Logan Stromberg <lostromb@microsoft.com> --- Ryujinx.Audio/Renderer/Dsp/ResamplerHelper.cs | 183 ++++++++++-------- Ryujinx.Audio/Renderer/Dsp/UpsamplerHelper.cs | 23 ++- .../Audio/Renderer/Dsp/ResamplerTests.cs | 93 +++++++++ .../Audio/Renderer/Dsp/UpsamplerTests.cs | 64 ++++++ 4 files changed, 279 insertions(+), 84 deletions(-) create mode 100644 Ryujinx.Tests/Audio/Renderer/Dsp/ResamplerTests.cs create mode 100644 Ryujinx.Tests/Audio/Renderer/Dsp/UpsamplerTests.cs diff --git a/Ryujinx.Audio/Renderer/Dsp/ResamplerHelper.cs b/Ryujinx.Audio/Renderer/Dsp/ResamplerHelper.cs index b46a33fe0..7873c4d27 100644 --- a/Ryujinx.Audio/Renderer/Dsp/ResamplerHelper.cs +++ b/Ryujinx.Audio/Renderer/Dsp/ResamplerHelper.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -380,7 +381,6 @@ namespace Ryujinx.Audio.Renderer.Dsp return _normalCurveLut2F; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] private unsafe static void ResampleDefaultQuality(Span<float> outputBuffer, ReadOnlySpan<short> inputBuffer, float ratio, ref float fraction, int sampleCount, bool needPitch) { ReadOnlySpan<float> parameters = GetDefaultParameter(ratio); @@ -394,35 +394,33 @@ namespace Ryujinx.Audio.Renderer.Dsp if (ratio == 1f) { fixed (short* pInput = inputBuffer) + fixed (float* pOutput = outputBuffer, pParameters = parameters) { - fixed (float* pOutput = outputBuffer, pParameters = parameters) + Vector128<float> parameter = Sse.LoadVector128(pParameters); + + for (; i < (sampleCount & ~3); i += 4) { - Vector128<float> parameter = Sse.LoadVector128(pParameters); + Vector128<int> intInput0 = Sse41.ConvertToVector128Int32(pInput + (uint)i); + Vector128<int> intInput1 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 1); + Vector128<int> intInput2 = Sse41.ConvertToVector128Int32(pInput + 
(uint)i + 2); + Vector128<int> intInput3 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 3); - for (; i < (sampleCount & ~3); i += 4) - { - Vector128<int> intInput0 = Sse41.ConvertToVector128Int32(pInput + (uint)i); - Vector128<int> intInput1 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 1); - Vector128<int> intInput2 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 2); - Vector128<int> intInput3 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 3); + Vector128<float> input0 = Sse2.ConvertToVector128Single(intInput0); + Vector128<float> input1 = Sse2.ConvertToVector128Single(intInput1); + Vector128<float> input2 = Sse2.ConvertToVector128Single(intInput2); + Vector128<float> input3 = Sse2.ConvertToVector128Single(intInput3); - Vector128<float> input0 = Sse2.ConvertToVector128Single(intInput0); - Vector128<float> input1 = Sse2.ConvertToVector128Single(intInput1); - Vector128<float> input2 = Sse2.ConvertToVector128Single(intInput2); - Vector128<float> input3 = Sse2.ConvertToVector128Single(intInput3); + Vector128<float> mix0 = Sse.Multiply(input0, parameter); + Vector128<float> mix1 = Sse.Multiply(input1, parameter); + Vector128<float> mix2 = Sse.Multiply(input2, parameter); + Vector128<float> mix3 = Sse.Multiply(input3, parameter); - Vector128<float> mix0 = Sse.Multiply(input0, parameter); - Vector128<float> mix1 = Sse.Multiply(input1, parameter); - Vector128<float> mix2 = Sse.Multiply(input2, parameter); - Vector128<float> mix3 = Sse.Multiply(input3, parameter); + Vector128<float> mix01 = Sse3.HorizontalAdd(mix0, mix1); + Vector128<float> mix23 = Sse3.HorizontalAdd(mix2, mix3); - Vector128<float> mix01 = Sse3.HorizontalAdd(mix0, mix1); - Vector128<float> mix23 = Sse3.HorizontalAdd(mix2, mix3); + Vector128<float> mix0123 = Sse3.HorizontalAdd(mix01, mix23); - Vector128<float> mix0123 = Sse3.HorizontalAdd(mix01, mix23); - - Sse.Store(pOutput + (uint)i, Sse41.RoundToNearestInteger(mix0123)); - } + Sse.Store(pOutput + (uint)i, 
Sse41.RoundToNearestInteger(mix0123)); } } @@ -431,62 +429,60 @@ namespace Ryujinx.Audio.Renderer.Dsp else { fixed (short* pInput = inputBuffer) + fixed (float* pOutput = outputBuffer, pParameters = parameters) { - fixed (float* pOutput = outputBuffer, pParameters = parameters) + for (; i < (sampleCount & ~3); i += 4) { - for (; i < (sampleCount & ~3); i += 4) - { - uint baseIndex0 = (uint)(fraction * 128) * 4; - uint inputIndex0 = (uint)inputBufferIndex; + uint baseIndex0 = (uint)(fraction * 128) * 4; + uint inputIndex0 = (uint)inputBufferIndex; - fraction += ratio; + fraction += ratio; - uint baseIndex1 = ((uint)(fraction * 128) & 127) * 4; - uint inputIndex1 = (uint)inputBufferIndex + (uint)fraction; + uint baseIndex1 = ((uint)(fraction * 128) & 127) * 4; + uint inputIndex1 = (uint)inputBufferIndex + (uint)fraction; - fraction += ratio; + fraction += ratio; - uint baseIndex2 = ((uint)(fraction * 128) & 127) * 4; - uint inputIndex2 = (uint)inputBufferIndex + (uint)fraction; + uint baseIndex2 = ((uint)(fraction * 128) & 127) * 4; + uint inputIndex2 = (uint)inputBufferIndex + (uint)fraction; - fraction += ratio; + fraction += ratio; - uint baseIndex3 = ((uint)(fraction * 128) & 127) * 4; - uint inputIndex3 = (uint)inputBufferIndex + (uint)fraction; + uint baseIndex3 = ((uint)(fraction * 128) & 127) * 4; + uint inputIndex3 = (uint)inputBufferIndex + (uint)fraction; - fraction += ratio; - inputBufferIndex += (int)fraction; + fraction += ratio; + inputBufferIndex += (int)fraction; - // Only keep lower part (safe as fraction isn't supposed to be negative) - fraction -= (int)fraction; + // Only keep lower part (safe as fraction isn't supposed to be negative) + fraction -= (int)fraction; - Vector128<float> parameter0 = Sse.LoadVector128(pParameters + baseIndex0); - Vector128<float> parameter1 = Sse.LoadVector128(pParameters + baseIndex1); - Vector128<float> parameter2 = Sse.LoadVector128(pParameters + baseIndex2); - Vector128<float> parameter3 = 
Sse.LoadVector128(pParameters + baseIndex3); + Vector128<float> parameter0 = Sse.LoadVector128(pParameters + baseIndex0); + Vector128<float> parameter1 = Sse.LoadVector128(pParameters + baseIndex1); + Vector128<float> parameter2 = Sse.LoadVector128(pParameters + baseIndex2); + Vector128<float> parameter3 = Sse.LoadVector128(pParameters + baseIndex3); - Vector128<int> intInput0 = Sse41.ConvertToVector128Int32(pInput + inputIndex0); - Vector128<int> intInput1 = Sse41.ConvertToVector128Int32(pInput + inputIndex1); - Vector128<int> intInput2 = Sse41.ConvertToVector128Int32(pInput + inputIndex2); - Vector128<int> intInput3 = Sse41.ConvertToVector128Int32(pInput + inputIndex3); + Vector128<int> intInput0 = Sse41.ConvertToVector128Int32(pInput + inputIndex0); + Vector128<int> intInput1 = Sse41.ConvertToVector128Int32(pInput + inputIndex1); + Vector128<int> intInput2 = Sse41.ConvertToVector128Int32(pInput + inputIndex2); + Vector128<int> intInput3 = Sse41.ConvertToVector128Int32(pInput + inputIndex3); - Vector128<float> input0 = Sse2.ConvertToVector128Single(intInput0); - Vector128<float> input1 = Sse2.ConvertToVector128Single(intInput1); - Vector128<float> input2 = Sse2.ConvertToVector128Single(intInput2); - Vector128<float> input3 = Sse2.ConvertToVector128Single(intInput3); + Vector128<float> input0 = Sse2.ConvertToVector128Single(intInput0); + Vector128<float> input1 = Sse2.ConvertToVector128Single(intInput1); + Vector128<float> input2 = Sse2.ConvertToVector128Single(intInput2); + Vector128<float> input3 = Sse2.ConvertToVector128Single(intInput3); - Vector128<float> mix0 = Sse.Multiply(input0, parameter0); - Vector128<float> mix1 = Sse.Multiply(input1, parameter1); - Vector128<float> mix2 = Sse.Multiply(input2, parameter2); - Vector128<float> mix3 = Sse.Multiply(input3, parameter3); + Vector128<float> mix0 = Sse.Multiply(input0, parameter0); + Vector128<float> mix1 = Sse.Multiply(input1, parameter1); + Vector128<float> mix2 = Sse.Multiply(input2, parameter2); + 
Vector128<float> mix3 = Sse.Multiply(input3, parameter3); - Vector128<float> mix01 = Sse3.HorizontalAdd(mix0, mix1); - Vector128<float> mix23 = Sse3.HorizontalAdd(mix2, mix3); + Vector128<float> mix01 = Sse3.HorizontalAdd(mix0, mix1); + Vector128<float> mix23 = Sse3.HorizontalAdd(mix2, mix3); - Vector128<float> mix0123 = Sse3.HorizontalAdd(mix01, mix23); + Vector128<float> mix0123 = Sse3.HorizontalAdd(mix01, mix23); - Sse.Store(pOutput + (uint)i, Sse41.RoundToNearestInteger(mix0123)); - } + Sse.Store(pOutput + (uint)i, Sse41.RoundToNearestInteger(mix0123)); } } } @@ -526,34 +522,59 @@ namespace Ryujinx.Audio.Renderer.Dsp return _highCurveLut2F; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void ResampleHighQuality(Span<float> outputBuffer, ReadOnlySpan<short> inputBuffer, float ratio, ref float fraction, int sampleCount) + private static unsafe void ResampleHighQuality(Span<float> outputBuffer, ReadOnlySpan<short> inputBuffer, float ratio, ref float fraction, int sampleCount) { ReadOnlySpan<float> parameters = GetHighParameter(ratio); int inputBufferIndex = 0; - // TODO: fast path - - for (int i = 0; i < sampleCount; i++) + if (Avx2.IsSupported) { - int baseIndex = (int)(fraction * 128) * 8; - ReadOnlySpan<float> parameter = parameters.Slice(baseIndex, 8); - ReadOnlySpan<short> currentInput = inputBuffer.Slice(inputBufferIndex, 8); + // Fast path; assumes 256-bit vectors for simplicity because the filter is 8 taps + fixed (short* pInput = inputBuffer) + fixed (float* pParameters = parameters) + { + for (int i = 0; i < sampleCount; i++) + { + int baseIndex = (int)(fraction * 128) * 8; - outputBuffer[i] = (float)Math.Round(currentInput[0] * parameter[0] + - currentInput[1] * parameter[1] + - currentInput[2] * parameter[2] + - currentInput[3] * parameter[3] + - currentInput[4] * parameter[4] + - currentInput[5] * parameter[5] + - currentInput[6] * parameter[6] + - currentInput[7] * parameter[7]); + Vector256<int> intInput = 
Avx2.ConvertToVector256Int32(pInput + inputBufferIndex); + Vector256<float> floatInput = Avx.ConvertToVector256Single(intInput); + Vector256<float> parameter = Avx.LoadVector256(pParameters + baseIndex); + Vector256<float> dp = Avx.DotProduct(floatInput, parameter, control: 0xFF); - fraction += ratio; - inputBufferIndex += (int)MathF.Truncate(fraction); + // avx2 does an 8-element dot product piecewise so we have to sum up 2 intermediate results + outputBuffer[i] = (float)Math.Round(dp[0] + dp[4]); - fraction -= (int)fraction; + fraction += ratio; + inputBufferIndex += (int)MathF.Truncate(fraction); + + fraction -= (int)fraction; + } + } + } + else + { + for (int i = 0; i < sampleCount; i++) + { + int baseIndex = (int)(fraction * 128) * 8; + ReadOnlySpan<float> parameter = parameters.Slice(baseIndex, 8); + ReadOnlySpan<short> currentInput = inputBuffer.Slice(inputBufferIndex, 8); + + outputBuffer[i] = (float)Math.Round(currentInput[0] * parameter[0] + + currentInput[1] * parameter[1] + + currentInput[2] * parameter[2] + + currentInput[3] * parameter[3] + + currentInput[4] * parameter[4] + + currentInput[5] * parameter[5] + + currentInput[6] * parameter[6] + + currentInput[7] * parameter[7]); + + fraction += ratio; + inputBufferIndex += (int)MathF.Truncate(fraction); + + fraction -= (int)fraction; + } } } diff --git a/Ryujinx.Audio/Renderer/Dsp/UpsamplerHelper.cs b/Ryujinx.Audio/Renderer/Dsp/UpsamplerHelper.cs index 847acec2e..6cdab5a7b 100644 --- a/Ryujinx.Audio/Renderer/Dsp/UpsamplerHelper.cs +++ b/Ryujinx.Audio/Renderer/Dsp/UpsamplerHelper.cs @@ -2,6 +2,7 @@ using Ryujinx.Audio.Renderer.Server.Upsampler; using Ryujinx.Common.Memory; using System; using System.Diagnostics; +using System.Numerics; using System.Runtime.CompilerServices; namespace Ryujinx.Audio.Renderer.Dsp @@ -70,16 +71,32 @@ namespace Ryujinx.Audio.Renderer.Dsp return; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] float DoFilterBank(ref UpsamplerBufferState state, in Array20<float> bank) { 
float result = 0.0f; Debug.Assert(state.History.Length == HistoryLength); Debug.Assert(bank.Length == FilterBankLength); - for (int j = 0; j < FilterBankLength; j++) + + int curIdx = 0; + if (Vector.IsHardwareAccelerated) { - result += bank[j] * state.History[j]; + // Do SIMD-accelerated block operations where possible. + // Only about a 2x speedup since filter bank length is short + int stopIdx = FilterBankLength - (FilterBankLength % Vector<float>.Count); + while (curIdx < stopIdx) + { + result += Vector.Dot( + new Vector<float>(bank.AsSpan().Slice(curIdx, Vector<float>.Count)), + new Vector<float>(state.History.AsSpan().Slice(curIdx, Vector<float>.Count))); + curIdx += Vector<float>.Count; + } + } + + while (curIdx < FilterBankLength) + { + result += bank[curIdx] * state.History[curIdx]; + curIdx++; } return result; diff --git a/Ryujinx.Tests/Audio/Renderer/Dsp/ResamplerTests.cs b/Ryujinx.Tests/Audio/Renderer/Dsp/ResamplerTests.cs new file mode 100644 index 000000000..364837ee0 --- /dev/null +++ b/Ryujinx.Tests/Audio/Renderer/Dsp/ResamplerTests.cs @@ -0,0 +1,93 @@ +using NUnit.Framework; +using Ryujinx.Audio.Renderer.Dsp; +using Ryujinx.Audio.Renderer.Parameter; +using Ryujinx.Audio.Renderer.Server.Upsampler; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading.Tasks; + +namespace Ryujinx.Tests.Audio.Renderer.Dsp +{ + class ResamplerTests + { + [Test] + [TestCase(VoiceInParameter.SampleRateConversionQuality.Low)] + [TestCase(VoiceInParameter.SampleRateConversionQuality.Default)] + [TestCase(VoiceInParameter.SampleRateConversionQuality.High)] + public void TestResamplerConsistencyUpsampling(VoiceInParameter.SampleRateConversionQuality quality) + { + DoResamplingTest(44100, 48000, quality); + } + + [Test] + [TestCase(VoiceInParameter.SampleRateConversionQuality.Low)] + [TestCase(VoiceInParameter.SampleRateConversionQuality.Default)] + 
[TestCase(VoiceInParameter.SampleRateConversionQuality.High)] + public void TestResamplerConsistencyDownsampling(VoiceInParameter.SampleRateConversionQuality quality) + { + DoResamplingTest(48000, 44100, quality); + } + + /// <summary> + /// Generates a 1-second sine wave sample at input rate, resamples it to output rate, and + /// ensures that it resampled at the expected rate with no discontinuities + /// </summary> + /// <param name="inputRate">The input sample rate to test</param> + /// <param name="outputRate">The output sample rate to test</param> + /// <param name="quality">The resampler quality to use</param> + private static void DoResamplingTest(int inputRate, int outputRate, VoiceInParameter.SampleRateConversionQuality quality) + { + float inputSampleRate = (float)inputRate; + float outputSampleRate = (float)outputRate; + int inputSampleCount = inputRate; + int outputSampleCount = outputRate; + short[] inputBuffer = new short[inputSampleCount + 100]; // add some safety buffer at the end + float[] outputBuffer = new float[outputSampleCount + 100]; + for (int sample = 0; sample < inputBuffer.Length; sample++) + { + // 440 hz sine wave with amplitude = 0.5f at input sample rate + inputBuffer[sample] = (short)(32767 * MathF.Sin((440 / inputSampleRate) * (float)sample * MathF.PI * 2f) * 0.5f); + } + + float fraction = 0; + + ResamplerHelper.Resample( + outputBuffer.AsSpan(), + inputBuffer.AsSpan(), + inputSampleRate / outputSampleRate, + ref fraction, + outputSampleCount, + quality, + false); + + float[] expectedOutput = new float[outputSampleCount]; + float sumDifference = 0; + int delay = quality switch + { + VoiceInParameter.SampleRateConversionQuality.High => 3, + VoiceInParameter.SampleRateConversionQuality.Default => 1, + _ => 0 + }; + + for (int sample = 0; sample < outputSampleCount; sample++) + { + outputBuffer[sample] /= 32767; + // 440 hz sine wave with amplitude = 0.5f at output sample rate + expectedOutput[sample] = MathF.Sin((440 / 
outputSampleRate) * (float)(sample + delay) * MathF.PI * 2f) * 0.5f; + float thisDelta = Math.Abs(expectedOutput[sample] - outputBuffer[sample]); + + // Ensure no discontinuities + Assert.IsTrue(thisDelta < 0.1f); + sumDifference += thisDelta; + } + + sumDifference = sumDifference / (float)outputSampleCount; + // Expect the output to be 99% similar to the expected resampled sine wave + Assert.IsTrue(sumDifference < 0.01f); + } + } +} diff --git a/Ryujinx.Tests/Audio/Renderer/Dsp/UpsamplerTests.cs b/Ryujinx.Tests/Audio/Renderer/Dsp/UpsamplerTests.cs new file mode 100644 index 000000000..2018752b3 --- /dev/null +++ b/Ryujinx.Tests/Audio/Renderer/Dsp/UpsamplerTests.cs @@ -0,0 +1,64 @@ +using NUnit.Framework; +using Ryujinx.Audio.Renderer.Dsp; +using Ryujinx.Audio.Renderer.Parameter; +using Ryujinx.Audio.Renderer.Server.Upsampler; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Text; +using System.Threading.Tasks; + +namespace Ryujinx.Tests.Audio.Renderer.Dsp +{ + class UpsamplerTests + { + [Test] + public void TestUpsamplerConsistency() + { + UpsamplerBufferState bufferState = new UpsamplerBufferState(); + int inputBlockSize = 160; + int numInputSamples = 32000; + int numOutputSamples = 48000; + float inputSampleRate = numInputSamples; + float outputSampleRate = numOutputSamples; + float[] inputBuffer = new float[numInputSamples + 100]; + float[] outputBuffer = new float[numOutputSamples + 100]; + for (int sample = 0; sample < inputBuffer.Length; sample++) + { + // 440 hz sine wave with amplitude = 0.5f at input sample rate + inputBuffer[sample] = MathF.Sin((440 / inputSampleRate) * (float)sample * MathF.PI * 2f) * 0.5f; + } + + int inputIdx = 0; + int outputIdx = 0; + while (inputIdx + inputBlockSize < numInputSamples) + { + int outputBufLength = (int)Math.Round((float)(inputIdx + inputBlockSize) * outputSampleRate / inputSampleRate) - outputIdx; + UpsamplerHelper.Upsample( 
+ outputBuffer.AsSpan(outputIdx), + inputBuffer.AsSpan(inputIdx), + outputBufLength, + inputBlockSize, + ref bufferState); + + inputIdx += inputBlockSize; + outputIdx += outputBufLength; + } + + float[] expectedOutput = new float[numOutputSamples]; + float sumDifference = 0; + for (int sample = 0; sample < numOutputSamples; sample++) + { + // 440 hz sine wave with amplitude = 0.5f at output sample rate with an offset of 15 + expectedOutput[sample] = MathF.Sin((440 / outputSampleRate) * (float)(sample - 15) * MathF.PI * 2f) * 0.5f; + sumDifference += Math.Abs(expectedOutput[sample] - outputBuffer[sample]); + } + + sumDifference = sumDifference / (float)expectedOutput.Length; + // Expect the output to be 98% similar to the expected resampled sine wave + Assert.IsTrue(sumDifference < 0.02f); + } + } +} From e54f9dc4b42e0c4091875989df24710956bf2e10 Mon Sep 17 00:00:00 2001 From: Andrew Glaze <andrew.glaze76@gmail.com> Date: Tue, 21 Feb 2023 06:14:31 -0500 Subject: [PATCH 20/41] Move Ryujinx Folder from ~/.config to ~/Library/Application Support on macOS (#4296) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Move Ryujinx folder to Application Support on macOS * Create a symlink to preserve back compat Co-authored-by: Ac_K <Acoustik666@gmail.com> * Remove extra whitespace * Don’t create a symlink * Update Ryujinx.Common/Configuration/AppDataManager.cs Co-authored-by: Ac_K <Acoustik666@gmail.com> * Revert "Don’t create a symlink" This reverts commit 31752fe8aba1deb32e75f949001ffb74a1e0f674. 
--------- Co-authored-by: Ac_K <Acoustik666@gmail.com> --- .../Configuration/AppDataManager.cs | 53 ++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/Ryujinx.Common/Configuration/AppDataManager.cs b/Ryujinx.Common/Configuration/AppDataManager.cs index 42b76453b..d6e778430 100644 --- a/Ryujinx.Common/Configuration/AppDataManager.cs +++ b/Ryujinx.Common/Configuration/AppDataManager.cs @@ -45,7 +45,15 @@ namespace Ryujinx.Common.Configuration public static void Initialize(string baseDirPath) { - string appDataPath = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData); + string appDataPath; + if (OperatingSystem.IsMacOS()) + { + appDataPath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Personal), "Library", "Application Support"); + } + else + { + appDataPath = Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData); + } if (appDataPath.Length == 0) { @@ -81,6 +89,21 @@ namespace Ryujinx.Common.Configuration BaseDirPath = Path.GetFullPath(BaseDirPath); // convert relative paths + // NOTE: Moves the Ryujinx folder in `~/.config` to `~/Library/Application Support` if one is found + // and a Ryujinx folder does not already exist in Application Support. + // Also creates a symlink from `~/.config/Ryujinx` to `~/Library/Application Support/Ryujinx` to preserve backwards compatibility. + // This should be removed in the future. 
+ if (OperatingSystem.IsMacOS() && Mode == LaunchMode.UserProfile) + { + string oldConfigPath = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.ApplicationData), DefaultBaseDir); + if (Path.Exists(oldConfigPath) && !Path.Exists(BaseDirPath)) + { + CopyDirectory(oldConfigPath, BaseDirPath); + Directory.Delete(oldConfigPath, true); + Directory.CreateSymbolicLink(oldConfigPath, BaseDirPath); + } + } + SetupBasePaths(); } @@ -92,6 +115,34 @@ namespace Ryujinx.Common.Configuration Directory.CreateDirectory(KeysDirPath = Path.Combine(BaseDirPath, KeysDir)); } + private static void CopyDirectory(string sourceDir, string destinationDir) + { + var dir = new DirectoryInfo(sourceDir); + + if (!dir.Exists) + { + throw new DirectoryNotFoundException($"Source directory not found: {dir.FullName}"); + } + + DirectoryInfo[] subDirs = dir.GetDirectories(); + Directory.CreateDirectory(destinationDir); + + foreach (FileInfo file in dir.GetFiles()) + { + if (file.Name == ".DS_Store") + { + continue; + } + + file.CopyTo(Path.Combine(destinationDir, file.Name)); + } + + foreach (DirectoryInfo subDir in subDirs) + { + CopyDirectory(subDir.FullName, Path.Combine(destinationDir, subDir.Name)); + } + } + public static string GetModsPath() => CustomModsPath ?? Directory.CreateDirectory(Path.Combine(BaseDirPath, DefaultModsDir)).FullName; public static string GetSdModsPath() => CustomSdModsPath ?? Directory.CreateDirectory(Path.Combine(BaseDirPath, DefaultSdcardDir, "atmosphere")).FullName; } From 1f1e2a7f03aad988cb04045eee18a360a807d13f Mon Sep 17 00:00:00 2001 From: Mary <mary@mary.zone> Date: Tue, 21 Feb 2023 22:38:34 +0100 Subject: [PATCH 21/41] misc: changes base application directory behaviour (#4460) This allows changing base application directory behavior at build time via FORCE_EXTERNAL_BASE_DIR. This is intended to be used by nixpkgs and flathub builds. I also added the missing patch for macOS that we have on macos1 to avoid invalidating code signature. 
--- Ryujinx.Common/ReleaseInformation.cs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Ryujinx.Common/ReleaseInformation.cs b/Ryujinx.Common/ReleaseInformation.cs index d0e013282..601c05b17 100644 --- a/Ryujinx.Common/ReleaseInformation.cs +++ b/Ryujinx.Common/ReleaseInformation.cs @@ -40,14 +40,21 @@ namespace Ryujinx.Common } } +#if FORCE_EXTERNAL_BASE_DIR public static string GetBaseApplicationDirectory() { - if (IsFlatHubBuild()) + return AppDataManager.BaseDirPath; + } +#else + public static string GetBaseApplicationDirectory() + { + if (IsFlatHubBuild() || OperatingSystem.IsMacOS()) { return AppDataManager.BaseDirPath; } return AppDomain.CurrentDomain.BaseDirectory; } +#endif } } \ No newline at end of file From c3a5716a95ea93cba9488189fb36d594db5083bc Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Tue, 21 Feb 2023 19:21:57 -0300 Subject: [PATCH 22/41] Add copy dependency for some incompatible texture formats (#4380) * Add copy dependency for some incompatible texture formats * Simplify compatibility check --- .../Image/TextureCompatibility.cs | 103 ++++--- .../Image/TextureCopyIncompatible.cs | 252 +++++++++++++++++ Ryujinx.Graphics.OpenGL/Image/TextureView.cs | 10 + Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs | 2 + Ryujinx.Graphics.Vulkan/HelperShader.cs | 158 ++++++++++- ...olorCopyShorteningComputeShaderSource.comp | 36 +++ .../ColorCopyWideningComputeShaderSource.comp | 31 +++ .../Shaders/ShaderBinaries.cs | 259 ++++++++++++++++++ Ryujinx.Graphics.Vulkan/TextureView.cs | 10 + 9 files changed, 814 insertions(+), 47 deletions(-) create mode 100644 Ryujinx.Graphics.OpenGL/Image/TextureCopyIncompatible.cs create mode 100644 Ryujinx.Graphics.Vulkan/Shaders/ColorCopyShorteningComputeShaderSource.comp create mode 100644 Ryujinx.Graphics.Vulkan/Shaders/ColorCopyWideningComputeShaderSource.comp diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs index 
e8061951b..4b84333df 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs @@ -214,41 +214,6 @@ namespace Ryujinx.Graphics.Gpu.Image return true; } - /// <summary> - /// Checks if two formats are compatible, according to the host API copy format compatibility rules. - /// </summary> - /// <param name="lhsFormat">First comparand</param> - /// <param name="rhsFormat">Second comparand</param> - /// <param name="caps">Host GPU capabilities</param> - /// <returns>True if the formats are compatible, false otherwise</returns> - public static bool FormatCompatible(TextureInfo lhs, TextureInfo rhs, Capabilities caps) - { - FormatInfo lhsFormat = lhs.FormatInfo; - FormatInfo rhsFormat = rhs.FormatInfo; - - if (lhsFormat.Format.IsDepthOrStencil() || rhsFormat.Format.IsDepthOrStencil()) - { - return lhsFormat.Format == rhsFormat.Format; - } - - if (IsFormatHostIncompatible(lhs, caps) || IsFormatHostIncompatible(rhs, caps)) - { - return lhsFormat.Format == rhsFormat.Format; - } - - if (lhsFormat.IsCompressed && rhsFormat.IsCompressed) - { - FormatClass lhsClass = GetFormatClass(lhsFormat.Format); - FormatClass rhsClass = GetFormatClass(rhsFormat.Format); - - return lhsClass == rhsClass; - } - else - { - return lhsFormat.BytesPerPixel == rhsFormat.BytesPerPixel; - } - } - /// <summary> /// Checks if the texture format matches with the specified texture information. 
/// </summary> @@ -391,6 +356,13 @@ namespace Ryujinx.Graphics.Gpu.Image Size lhsSize = GetSizeInBlocks(lhs, level); Size rhsSize = GetSizeInBlocks(rhs); + bool alignedWidthMatches = lhsAlignedSize.Width == rhsAlignedSize.Width; + + if (lhs.FormatInfo.BytesPerPixel != rhs.FormatInfo.BytesPerPixel && IsIncompatibleFormatAliasingAllowed(lhs.FormatInfo, rhs.FormatInfo)) + { + alignedWidthMatches = lhsSize.Width * lhs.FormatInfo.BytesPerPixel == rhsSize.Width * rhs.FormatInfo.BytesPerPixel; + } + TextureViewCompatibility result = TextureViewCompatibility.Full; // For copies, we can copy a subset of the 3D texture slices, @@ -404,7 +376,7 @@ namespace Ryujinx.Graphics.Gpu.Image // so the width may not match in this case for different uses of the same texture. // To account for this, we compare the aligned width here. // We expect height to always match exactly, if the texture is the same. - if (lhsAlignedSize.Width == rhsAlignedSize.Width && lhsSize.Height == rhsSize.Height) + if (alignedWidthMatches && lhsSize.Height == rhsSize.Height) { return (exact && lhsSize.Width != rhsSize.Width) || lhsSize.Width < rhsSize.Width ? TextureViewCompatibility.CopyOnly @@ -659,21 +631,62 @@ namespace Ryujinx.Graphics.Gpu.Image /// <returns>The view compatibility level of the texture formats</returns> public static TextureViewCompatibility ViewFormatCompatible(TextureInfo lhs, TextureInfo rhs, Capabilities caps) { - if (FormatCompatible(lhs, rhs, caps)) + FormatInfo lhsFormat = lhs.FormatInfo; + FormatInfo rhsFormat = rhs.FormatInfo; + + if (lhsFormat.Format.IsDepthOrStencil() || rhsFormat.Format.IsDepthOrStencil()) { - if (lhs.FormatInfo.IsCompressed != rhs.FormatInfo.IsCompressed) - { - return TextureViewCompatibility.CopyOnly; - } - else - { - return TextureViewCompatibility.Full; - } + return lhsFormat.Format == rhsFormat.Format ? 
TextureViewCompatibility.Full : TextureViewCompatibility.Incompatible; + } + + if (IsFormatHostIncompatible(lhs, caps) || IsFormatHostIncompatible(rhs, caps)) + { + return lhsFormat.Format == rhsFormat.Format ? TextureViewCompatibility.Full : TextureViewCompatibility.Incompatible; + } + + if (lhsFormat.IsCompressed && rhsFormat.IsCompressed) + { + FormatClass lhsClass = GetFormatClass(lhsFormat.Format); + FormatClass rhsClass = GetFormatClass(rhsFormat.Format); + + return lhsClass == rhsClass ? TextureViewCompatibility.Full : TextureViewCompatibility.Incompatible; + } + else if (lhsFormat.BytesPerPixel == rhsFormat.BytesPerPixel) + { + return lhs.FormatInfo.IsCompressed == rhs.FormatInfo.IsCompressed + ? TextureViewCompatibility.Full + : TextureViewCompatibility.CopyOnly; + } + else if (IsIncompatibleFormatAliasingAllowed(lhsFormat, rhsFormat)) + { + return TextureViewCompatibility.CopyOnly; } return TextureViewCompatibility.Incompatible; } + /// <summary> + /// Checks if aliasing of two formats that would normally be considered incompatible should be allowed, + /// using copy dependencies. + /// </summary> + /// <param name="lhsFormat">Format information of the first texture</param> + /// <param name="rhsFormat">Format information of the second texture</param> + /// <returns>True if aliasing should be allowed, false otherwise</returns> + private static bool IsIncompatibleFormatAliasingAllowed(FormatInfo lhsFormat, FormatInfo rhsFormat) + { + // Some games will try to alias textures with incompatible formats, with different BPP (bytes per pixel). + // We allow that in some cases as long as Width * BPP is equal on both textures. + // This is very conservative right now as we want to avoid copies as much as possible, + // so we only consider the formats we have seen being aliased.
+ + if (rhsFormat.BytesPerPixel < lhsFormat.BytesPerPixel) + { + (lhsFormat, rhsFormat) = (rhsFormat, lhsFormat); + } + + return lhsFormat.Format == Format.R8Unorm && rhsFormat.Format == Format.R8G8B8A8Unorm; + } + /// <summary> /// Check if the target of the first texture view information is compatible with the target of the second texture view information. /// This follows the host API target compatibility rules. diff --git a/Ryujinx.Graphics.OpenGL/Image/TextureCopyIncompatible.cs b/Ryujinx.Graphics.OpenGL/Image/TextureCopyIncompatible.cs new file mode 100644 index 000000000..c8fbfbc6a --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Image/TextureCopyIncompatible.cs @@ -0,0 +1,252 @@ +using OpenTK.Graphics.OpenGL; +using Ryujinx.Graphics.GAL; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Numerics; + +namespace Ryujinx.Graphics.OpenGL.Image +{ + class TextureCopyIncompatible + { + private const string ComputeShaderShortening = @"#version 450 core + +layout (binding = 0, $SRC_FORMAT$) uniform uimage2D src; +layout (binding = 1, $DST_FORMAT$) uniform uimage2D dst; + +layout (local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +void main() +{ + uvec2 coords = gl_GlobalInvocationID.xy; + ivec2 imageSz = imageSize(src); + + if (int(coords.x) >= imageSz.x || int(coords.y) >= imageSz.y) + { + return; + } + + uint coordsShifted = coords.x << $RATIO_LOG2$; + + uvec2 dstCoords0 = uvec2(coordsShifted, coords.y); + uvec2 dstCoords1 = uvec2(coordsShifted + 1, coords.y); + uvec2 dstCoords2 = uvec2(coordsShifted + 2, coords.y); + uvec2 dstCoords3 = uvec2(coordsShifted + 3, coords.y); + + uvec4 rgba = imageLoad(src, ivec2(coords)); + + imageStore(dst, ivec2(dstCoords0), rgba.rrrr); + imageStore(dst, ivec2(dstCoords1), rgba.gggg); + imageStore(dst, ivec2(dstCoords2), rgba.bbbb); + imageStore(dst, ivec2(dstCoords3), rgba.aaaa); +}"; + + private const string ComputeShaderWidening = @"#version 450 core + +layout (binding = 0, 
$SRC_FORMAT$) uniform uimage2D src; +layout (binding = 1, $DST_FORMAT$) uniform uimage2D dst; + +layout (local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +void main() +{ + uvec2 coords = gl_GlobalInvocationID.xy; + ivec2 imageSz = imageSize(dst); + + if (int(coords.x) >= imageSz.x || int(coords.y) >= imageSz.y) + { + return; + } + + uvec2 srcCoords = uvec2(coords.x << $RATIO_LOG2$, coords.y); + + uint r = imageLoad(src, ivec2(srcCoords) + ivec2(0, 0)).r; + uint g = imageLoad(src, ivec2(srcCoords) + ivec2(1, 0)).r; + uint b = imageLoad(src, ivec2(srcCoords) + ivec2(2, 0)).r; + uint a = imageLoad(src, ivec2(srcCoords) + ivec2(3, 0)).r; + + imageStore(dst, ivec2(coords), uvec4(r, g, b, a)); +}"; + + private readonly OpenGLRenderer _renderer; + private readonly Dictionary<int, int> _shorteningProgramHandles; + private readonly Dictionary<int, int> _wideningProgramHandles; + + public TextureCopyIncompatible(OpenGLRenderer renderer) + { + _renderer = renderer; + _shorteningProgramHandles = new Dictionary<int, int>(); + _wideningProgramHandles = new Dictionary<int, int>(); + } + + public void CopyIncompatibleFormats(ITextureInfo src, ITextureInfo dst, int srcLayer, int dstLayer, int srcLevel, int dstLevel, int depth, int levels) + { + TextureCreateInfo srcInfo = src.Info; + TextureCreateInfo dstInfo = dst.Info; + + int srcBpp = src.Info.BytesPerPixel; + int dstBpp = dst.Info.BytesPerPixel; + + // Calculate ideal component size, given our constraints: + // - Component size must not exceed bytes per pixel of source and destination image formats. + // - Maximum component size is 4 (R32). + int componentSize = Math.Min(Math.Min(srcBpp, dstBpp), 4); + + int srcComponentsCount = srcBpp / componentSize; + int dstComponentsCount = dstBpp / componentSize; + + var srcFormat = GetFormat(componentSize, srcComponentsCount); + var dstFormat = GetFormat(componentSize, dstComponentsCount); + + GL.UseProgram(srcBpp < dstBpp + ? 
GetWideningShader(componentSize, srcComponentsCount, dstComponentsCount) + : GetShorteningShader(componentSize, srcComponentsCount, dstComponentsCount)); + + for (int l = 0; l < levels; l++) + { + int srcWidth = Math.Max(1, src.Info.Width >> l); + int srcHeight = Math.Max(1, src.Info.Height >> l); + + int dstWidth = Math.Max(1, dst.Info.Width >> l); + int dstHeight = Math.Max(1, dst.Info.Height >> l); + + int width = Math.Min(srcWidth, dstWidth); + int height = Math.Min(srcHeight, dstHeight); + + for (int z = 0; z < depth; z++) + { + GL.BindImageTexture(0, src.Handle, srcLevel + l, false, srcLayer + z, TextureAccess.ReadOnly, srcFormat); + GL.BindImageTexture(1, dst.Handle, dstLevel + l, false, dstLayer + z, TextureAccess.WriteOnly, dstFormat); + + GL.DispatchCompute((width + 31) / 32, (height + 31) / 32, 1); + } + } + + Pipeline pipeline = (Pipeline)_renderer.Pipeline; + + pipeline.RestoreProgram(); + pipeline.RestoreImages1And2(); + } + + private static SizedInternalFormat GetFormat(int componentSize, int componentsCount) + { + if (componentSize == 1) + { + return componentsCount switch + { + 1 => SizedInternalFormat.R8ui, + 2 => SizedInternalFormat.Rg8ui, + 4 => SizedInternalFormat.Rgba8ui, + _ => throw new ArgumentException($"Invalid components count {componentsCount}.") + }; + } + else if (componentSize == 2) + { + return componentsCount switch + { + 1 => SizedInternalFormat.R16ui, + 2 => SizedInternalFormat.Rg16ui, + 4 => SizedInternalFormat.Rgba16ui, + _ => throw new ArgumentException($"Invalid components count {componentsCount}.") + }; + } + else if (componentSize == 4) + { + return componentsCount switch + { + 1 => SizedInternalFormat.R32ui, + 2 => SizedInternalFormat.Rg32ui, + 4 => SizedInternalFormat.Rgba32ui, + _ => throw new ArgumentException($"Invalid components count {componentsCount}.") + }; + } + else + { + throw new ArgumentException($"Invalid component size {componentSize}."); + } + } + + private int GetShorteningShader(int componentSize, int 
srcComponentsCount, int dstComponentsCount) + { + return GetShader(ComputeShaderShortening, _shorteningProgramHandles, componentSize, srcComponentsCount, dstComponentsCount); + } + + private int GetWideningShader(int componentSize, int srcComponentsCount, int dstComponentsCount) + { + return GetShader(ComputeShaderWidening, _wideningProgramHandles, componentSize, srcComponentsCount, dstComponentsCount); + } + + private int GetShader( + string code, + Dictionary<int, int> programHandles, + int componentSize, + int srcComponentsCount, + int dstComponentsCount) + { + int componentSizeLog2 = BitOperations.Log2((uint)componentSize); + + int srcIndex = componentSizeLog2 + BitOperations.Log2((uint)srcComponentsCount) * 3; + int dstIndex = componentSizeLog2 + BitOperations.Log2((uint)dstComponentsCount) * 3; + + int key = srcIndex | (dstIndex << 8); + + if (!programHandles.TryGetValue(key, out int programHandle)) + { + int csHandle = GL.CreateShader(ShaderType.ComputeShader); + + string[] formatTable = new[] { "r8ui", "r16ui", "r32ui", "rg8ui", "rg16ui", "rg32ui", "rgba8ui", "rgba16ui", "rgba32ui" }; + + string srcFormat = formatTable[srcIndex]; + string dstFormat = formatTable[dstIndex]; + + int srcBpp = srcComponentsCount * componentSize; + int dstBpp = dstComponentsCount * componentSize; + + int ratio = srcBpp < dstBpp ? 
dstBpp / srcBpp : srcBpp / dstBpp; + int ratioLog2 = BitOperations.Log2((uint)ratio); + + GL.ShaderSource(csHandle, code + .Replace("$SRC_FORMAT$", srcFormat) + .Replace("$DST_FORMAT$", dstFormat) + .Replace("$RATIO_LOG2$", ratioLog2.ToString(CultureInfo.InvariantCulture))); + + GL.CompileShader(csHandle); + + programHandle = GL.CreateProgram(); + + GL.AttachShader(programHandle, csHandle); + GL.LinkProgram(programHandle); + GL.DetachShader(programHandle, csHandle); + GL.DeleteShader(csHandle); + + GL.GetProgram(programHandle, GetProgramParameterName.LinkStatus, out int status); + + if (status == 0) + { + throw new Exception(GL.GetProgramInfoLog(programHandle)); + } + + programHandles.Add(key, programHandle); + } + + return programHandle; + } + + public void Dispose() + { + foreach (int handle in _shorteningProgramHandles.Values) + { + GL.DeleteProgram(handle); + } + + _shorteningProgramHandles.Clear(); + + foreach (int handle in _wideningProgramHandles.Values) + { + GL.DeleteProgram(handle); + } + + _wideningProgramHandles.Clear(); + } + } +} diff --git a/Ryujinx.Graphics.OpenGL/Image/TextureView.cs b/Ryujinx.Graphics.OpenGL/Image/TextureView.cs index 68cd2d30f..44df441f7 100644 --- a/Ryujinx.Graphics.OpenGL/Image/TextureView.cs +++ b/Ryujinx.Graphics.OpenGL/Image/TextureView.cs @@ -127,6 +127,12 @@ namespace Ryujinx.Graphics.OpenGL.Image int layers = Math.Min(Info.GetLayers(), destinationView.Info.GetLayers() - firstLayer); _renderer.TextureCopyMS.CopyNonMSToMS(this, destinationView, 0, firstLayer, layers); } + else if (destinationView.Info.BytesPerPixel != Info.BytesPerPixel) + { + int layers = Math.Min(Info.GetLayers(), destinationView.Info.GetLayers() - firstLayer); + int levels = Math.Min(Info.Levels, destinationView.Info.Levels - firstLevel); + _renderer.TextureCopyIncompatible.CopyIncompatibleFormats(this, destinationView, 0, firstLayer, 0, firstLevel, layers, levels); + } else { _renderer.TextureCopy.CopyUnscaled(this, destinationView, 0, firstLayer, 0, 
firstLevel); @@ -145,6 +151,10 @@ namespace Ryujinx.Graphics.OpenGL.Image { _renderer.TextureCopyMS.CopyNonMSToMS(this, destinationView, srcLayer, dstLayer, 1); } + else if (destinationView.Info.BytesPerPixel != Info.BytesPerPixel) + { + _renderer.TextureCopyIncompatible.CopyIncompatibleFormats(this, destinationView, srcLayer, dstLayer, srcLevel, dstLevel, 1, 1); + } else { _renderer.TextureCopy.CopyUnscaled(this, destinationView, srcLayer, dstLayer, srcLevel, dstLevel, 1, 1); diff --git a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs index 722c4b4da..efbd17c1b 100644 --- a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs +++ b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs @@ -24,6 +24,7 @@ namespace Ryujinx.Graphics.OpenGL private TextureCopy _textureCopy; private TextureCopy _backgroundTextureCopy; internal TextureCopy TextureCopy => BackgroundContextWorker.InBackground ? _backgroundTextureCopy : _textureCopy; + internal TextureCopyIncompatible TextureCopyIncompatible { get; } internal TextureCopyMS TextureCopyMS { get; } private Sync _sync; @@ -49,6 +50,7 @@ namespace Ryujinx.Graphics.OpenGL _window = new Window(this); _textureCopy = new TextureCopy(this); _backgroundTextureCopy = new TextureCopy(this); + TextureCopyIncompatible = new TextureCopyIncompatible(this); TextureCopyMS = new TextureCopyMS(this); _sync = new Sync(); PersistentBuffers = new PersistentBuffers(); diff --git a/Ryujinx.Graphics.Vulkan/HelperShader.cs b/Ryujinx.Graphics.Vulkan/HelperShader.cs index b8c21fe8e..c67389aa4 100644 --- a/Ryujinx.Graphics.Vulkan/HelperShader.cs +++ b/Ryujinx.Graphics.Vulkan/HelperShader.cs @@ -5,6 +5,7 @@ using Ryujinx.Graphics.Vulkan.Shaders; using Silk.NET.Vulkan; using System; using System.Collections.Generic; +using System.Numerics; using VkFormat = Silk.NET.Vulkan.Format; namespace Ryujinx.Graphics.Vulkan @@ -32,7 +33,9 @@ namespace Ryujinx.Graphics.Vulkan private readonly IProgram _programStrideChange; private readonly IProgram 
_programConvertIndexBuffer;
         private readonly IProgram _programConvertIndirectData;
+        private readonly IProgram _programColorCopyShortening;
         private readonly IProgram _programColorCopyToNonMs;
+        private readonly IProgram _programColorCopyWidening;
         private readonly IProgram _programColorDrawToMs;
         private readonly IProgram _programDepthBlit;
         private readonly IProgram _programDepthBlitMs;
@@ -112,15 +115,25 @@ namespace Ryujinx.Graphics.Vulkan
                 new ShaderSource(ShaderBinaries.ChangeBufferStrideShaderSource, strideChangeBindings, ShaderStage.Compute, TargetLanguage.Spirv),
             });
 
-            var colorCopyToNonMsBindings = new ShaderBindings(
+            var colorCopyBindings = new ShaderBindings(
                 new[] { 0 },
                 Array.Empty<int>(),
                 new[] { 0 },
                 new[] { 0 });
 
+            _programColorCopyShortening = gd.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(ShaderBinaries.ColorCopyShorteningComputeShaderSource, colorCopyBindings, ShaderStage.Compute, TargetLanguage.Spirv),
+            });
+
             _programColorCopyToNonMs = gd.CreateProgramWithMinimalLayout(new[]
             {
-                new ShaderSource(ShaderBinaries.ColorCopyToNonMsComputeShaderSource, colorCopyToNonMsBindings, ShaderStage.Compute, TargetLanguage.Spirv),
+                new ShaderSource(ShaderBinaries.ColorCopyToNonMsComputeShaderSource, colorCopyBindings, ShaderStage.Compute, TargetLanguage.Spirv),
+            });
+
+            _programColorCopyWidening = gd.CreateProgramWithMinimalLayout(new[]
+            {
+                new ShaderSource(ShaderBinaries.ColorCopyWideningComputeShaderSource, colorCopyBindings, ShaderStage.Compute, TargetLanguage.Spirv),
             });
 
             var colorDrawToMsVertexBindings = new ShaderBindings(
@@ -922,6 +935,107 @@ namespace Ryujinx.Graphics.Vulkan
                 convertedCount * outputIndexSize);
         }
 
+        /// <summary>
+        /// Copies texture data between two textures whose formats have a different number of bytes per pixel,
+        /// using the widening or shortening compute program depending on which side is larger.
+        /// </summary>
+        /// <param name="gd">Renderer that owns the buffer manager and API objects used for the copy</param>
+        /// <param name="cbs">Command buffer the barriers and dispatches are recorded into</param>
+        /// <param name="src">Source texture</param>
+        /// <param name="dst">Destination texture</param>
+        /// <param name="srcLayer">First source layer to copy</param>
+        /// <param name="dstLayer">First destination layer to copy to</param>
+        /// <param name="srcLevel">First source mipmap level to copy</param>
+        /// <param name="dstLevel">First destination mipmap level to copy to</param>
+        /// <param name="depth">Number of layers to copy</param>
+        /// <param name="levels">Number of mipmap levels to copy</param>
+        public void CopyIncompatibleFormats(
+            VulkanRenderer gd,
+            CommandBufferScoped cbs,
+            TextureView src,
+            TextureView dst,
+            int srcLayer,
+            int dstLayer,
+            int srcLevel,
+            int dstLevel,
+            int depth,
+            int levels)
+        {
+            const int ParamsBufferSize = 4;
+
+            // Size the span from the buffer size (a single int) so that the data handed to
+            // SetData below is not larger than the uniform buffer it is written to.
+            Span<int> shaderParams = stackalloc int[ParamsBufferSize / sizeof(int)];
+
+            int srcBpp = src.Info.BytesPerPixel;
+            int dstBpp = dst.Info.BytesPerPixel;
+
+            int ratio = srcBpp < dstBpp ? dstBpp / srcBpp : srcBpp / dstBpp;
+
+            // The shaders use this value as a shift amount on the X coordinate.
+            shaderParams[0] = BitOperations.Log2((uint)ratio);
+
+            var bufferHandle = gd.BufferManager.CreateWithHandle(gd, ParamsBufferSize, false);
+
+            gd.BufferManager.SetData<int>(bufferHandle, 0, shaderParams);
+
+            TextureView.InsertImageBarrier(
+                gd.Api,
+                cbs.CommandBuffer,
+                src.GetImage().Get(cbs).Value,
+                TextureStorage.DefaultAccessMask,
+                AccessFlags.ShaderReadBit,
+                PipelineStageFlags.AllCommandsBit,
+                PipelineStageFlags.ComputeShaderBit,
+                ImageAspectFlags.ColorBit,
+                src.FirstLayer + srcLayer,
+                src.FirstLevel + srcLevel,
+                depth,
+                levels);
+
+            _pipeline.SetCommandBuffer(cbs);
+
+            _pipeline.SetProgram(srcBpp < dstBpp ? _programColorCopyWidening : _programColorCopyShortening);
+
+            // Calculate ideal component size, given our constraints:
+            // - Component size must not exceed bytes per pixel of source and destination image formats.
+            // - Maximum component size is 4 (R32).
+            int componentSize = Math.Min(Math.Min(srcBpp, dstBpp), 4);
+
+            var srcFormat = GetFormat(componentSize, srcBpp / componentSize);
+            var dstFormat = GetFormat(componentSize, dstBpp / componentSize);
+
+            _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(0, new BufferRange(bufferHandle, 0, ParamsBufferSize)) });
+
+            for (int l = 0; l < levels; l++)
+            {
+                for (int z = 0; z < depth; z++)
+                {
+                    var srcView = Create2DLayerView(src, srcLayer + z, srcLevel + l, srcFormat);
+                    var dstView = Create2DLayerView(dst, dstLayer + z, dstLevel + l);
+
+                    _pipeline.SetTextureAndSampler(ShaderStage.Compute, 0, srcView, null);
+                    _pipeline.SetImage(0, dstView, dstFormat);
+
+                    // Round up to whole 32x32 groups, matching the local size declared by the copy shaders.
+                    int dispatchX = (Math.Min(srcView.Info.Width, dstView.Info.Width) + 31) / 32;
+                    int dispatchY = (Math.Min(srcView.Info.Height, dstView.Info.Height) + 31) / 32;
+
+                    _pipeline.DispatchCompute(dispatchX, dispatchY, 1);
+
+                    // Only release views that were created for this iteration, never the caller's textures.
+                    if (srcView != src)
+                    {
+                        srcView.Release();
+                    }
+
+                    if (dstView != dst)
+                    {
+                        dstView.Release();
+                    }
+                }
+            }
+
+            gd.BufferManager.Delete(bufferHandle);
+
+            _pipeline.Finish(gd, cbs);
+
+            TextureView.InsertImageBarrier(
+                gd.Api,
+                cbs.CommandBuffer,
+                dst.GetImage().Get(cbs).Value,
+                AccessFlags.ShaderWriteBit,
+                TextureStorage.DefaultAccessMask,
+                PipelineStageFlags.ComputeShaderBit,
+                PipelineStageFlags.AllCommandsBit,
+                ImageAspectFlags.ColorBit,
+                dst.FirstLayer + dstLayer,
+                dst.FirstLevel + dstLevel,
+                depth,
+                levels);
+        }
+
         public void CopyMSToNonMS(VulkanRenderer gd, CommandBufferScoped cbs, TextureView src, TextureView dst, int srcLayer, int dstLayer, int depth)
         {
             const int ParamsBufferSize = 16;
@@ -1196,6 +1310,44 @@ namespace Ryujinx.Graphics.Vulkan
             };
         }
 
+        private static GAL.Format GetFormat(int componentSize, int componentsCount)
+        {
+            if (componentSize == 1)
+            {
+                return componentsCount switch
+                {
+                    1 => GAL.Format.R8Uint,
+                    2 => GAL.Format.R8G8Uint,
+                    4 => GAL.Format.R8G8B8A8Uint,
+                    _ => throw new ArgumentException($"Invalid components count {componentsCount}.")
+                };
+
}
+            else if (componentSize == 2)
+            {
+                return componentsCount switch
+                {
+                    1 => GAL.Format.R16Uint,
+                    2 => GAL.Format.R16G16Uint,
+                    4 => GAL.Format.R16G16B16A16Uint,
+                    _ => throw new ArgumentException($"Invalid components count {componentsCount}.")
+                };
+            }
+            else if (componentSize == 4)
+            {
+                return componentsCount switch
+                {
+                    1 => GAL.Format.R32Uint,
+                    2 => GAL.Format.R32G32Uint,
+                    4 => GAL.Format.R32G32B32A32Uint,
+                    _ => throw new ArgumentException($"Invalid components count {componentsCount}.")
+                };
+            }
+            else
+            {
+                throw new ArgumentException($"Invalid component size {componentSize}.");
+            }
+        }
+
         public void ConvertIndexBufferIndirect(
             VulkanRenderer gd,
             CommandBufferScoped cbs,
@@ -1336,7 +1488,9 @@ namespace Ryujinx.Graphics.Vulkan
                 _programStrideChange.Dispose();
                 _programConvertIndexBuffer.Dispose();
                 _programConvertIndirectData.Dispose();
+                _programColorCopyShortening.Dispose();
                 _programColorCopyToNonMs.Dispose();
+                _programColorCopyWidening.Dispose();
                 _programColorDrawToMs.Dispose();
                 _programDepthBlit.Dispose();
                 _programDepthBlitMs.Dispose();
diff --git a/Ryujinx.Graphics.Vulkan/Shaders/ColorCopyShorteningComputeShaderSource.comp b/Ryujinx.Graphics.Vulkan/Shaders/ColorCopyShorteningComputeShaderSource.comp
new file mode 100644
index 000000000..78cc1cc6f
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Shaders/ColorCopyShorteningComputeShaderSource.comp
@@ -0,0 +1,36 @@
+#version 450 core
+
+layout (std140, binding = 0) uniform ratio_in
+{
+    int ratio; // Used below as a left shift amount on the X coordinate.
+};
+
+layout (set = 2, binding = 0) uniform usampler2D src;
+layout (set = 3, binding = 0) writeonly uniform uimage2D dst;
+
+layout (local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+void main() // Each invocation reads one source texel and stores its components to four consecutive destination texels.
+{
+    uvec2 coords = gl_GlobalInvocationID.xy;
+    ivec2 textureSz = textureSize(src, 0);
+
+    if (int(coords.x) >= textureSz.x || int(coords.y) >= textureSz.y) // Invocations outside of the source texture do nothing.
+    {
+        return;
+    }
+
+    uint coordsShifted = coords.x << ratio; // X of the first destination texel for this source texel.
+
+    uvec2 dstCoords0 = uvec2(coordsShifted, coords.y);
+    uvec2 dstCoords1 = uvec2(coordsShifted + 1, coords.y);
+    uvec2 dstCoords2 = uvec2(coordsShifted + 2, coords.y);
+    uvec2 dstCoords3 = uvec2(coordsShifted + 3, coords.y);
+
+    uvec4 rgba = texelFetch(src, ivec2(coords), 0);
+
+    imageStore(dst, ivec2(dstCoords0), rgba.rrrr); // All four stores are issued regardless of the value of ratio.
+    imageStore(dst, ivec2(dstCoords1), rgba.gggg);
+    imageStore(dst, ivec2(dstCoords2), rgba.bbbb);
+    imageStore(dst, ivec2(dstCoords3), rgba.aaaa);
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Shaders/ColorCopyWideningComputeShaderSource.comp b/Ryujinx.Graphics.Vulkan/Shaders/ColorCopyWideningComputeShaderSource.comp
new file mode 100644
index 000000000..a9be454fa
--- /dev/null
+++ b/Ryujinx.Graphics.Vulkan/Shaders/ColorCopyWideningComputeShaderSource.comp
@@ -0,0 +1,31 @@
+#version 450 core
+
+layout (std140, binding = 0) uniform ratio_in
+{
+    int ratio; // Used below as a left shift amount on the X coordinate.
+};
+
+layout (set = 2, binding = 0) uniform usampler2D src;
+layout (set = 3, binding = 0) writeonly uniform uimage2D dst;
+
+layout (local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+void main() // Each invocation packs the red component of four consecutive source texels into one destination texel.
+{
+    uvec2 coords = gl_GlobalInvocationID.xy;
+    ivec2 imageSz = imageSize(dst);
+
+    if (int(coords.x) >= imageSz.x || int(coords.y) >= imageSz.y) // Invocations outside of the destination image do nothing.
+    {
+        return;
+    }
+
+    uvec2 srcCoords = uvec2(coords.x << ratio, coords.y); // First source texel for this destination texel.
+
+    uint r = texelFetchOffset(src, ivec2(srcCoords), 0, ivec2(0, 0)).r;
+    uint g = texelFetchOffset(src, ivec2(srcCoords), 0, ivec2(1, 0)).r;
+    uint b = texelFetchOffset(src, ivec2(srcCoords), 0, ivec2(2, 0)).r;
+    uint a = texelFetchOffset(src, ivec2(srcCoords), 0, ivec2(3, 0)).r;
+
+    imageStore(dst, ivec2(coords), uvec4(r, g, b, a));
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Vulkan/Shaders/ShaderBinaries.cs b/Ryujinx.Graphics.Vulkan/Shaders/ShaderBinaries.cs
index c9df894bc..7fd047a23 100644
--- a/Ryujinx.Graphics.Vulkan/Shaders/ShaderBinaries.cs
+++ b/Ryujinx.Graphics.Vulkan/Shaders/ShaderBinaries.cs
@@ -669,6 +669,138 @@ namespace Ryujinx.Graphics.Vulkan.Shaders
             0x35, 0x00, 0x00, 0x00, 0x33, 0x00,
0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, }; + public static readonly byte[] ColorCopyShorteningComputeShaderSource = new byte[] + { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x05, 0x01, 0x00, 0x0B, 0x00, 0x08, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x06, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x4C, 0x53, 0x4C, 0x2E, 0x73, 0x74, 0x64, 0x2E, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x09, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6D, 0x61, 0x69, 0x6E, + 0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x60, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xC2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6D, 0x61, 0x69, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x67, 0x6C, 0x5F, 0x47, 0x6C, 0x6F, 0x62, 0x61, 0x6C, 0x49, 0x6E, 0x76, 0x6F, 0x63, 0x61, 0x74, + 0x69, 0x6F, 0x6E, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x00, 0x05, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x72, 0x61, 0x74, 0x69, + 0x6F, 0x5F, 0x69, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x72, 0x61, 0x74, 0x69, 0x6F, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, + 0x1C, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x22, 
0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x36, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x38, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x38, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x60, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x60, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x60, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1B, 0x00, 0x03, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x00, 
0x00, 0x20, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x1B, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x03, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4A, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x5E, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x5F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x5F, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x06, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xF8, 
0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xF7, 0x00, 0x03, 0x00, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0x00, 0x03, 0x00, + 0x1C, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x07, 0x00, 0x07, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x67, 0x00, 0x05, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x1F, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x05, 0x00, 0x1B, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0xA8, 0x00, 0x04, 0x00, + 0x1B, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0xF7, 0x00, 0x03, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x2A, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, + 0x1A, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x05, 0x00, 0x1B, 0x00, 0x00, 
0x00, + 0x2E, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0x27, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x27, 0x00, 0x00, 0x00, 0xF5, 0x00, 0x07, 0x00, + 0x1B, 0x00, 0x00, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x2E, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xF7, 0x00, 0x03, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, 0x2F, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x31, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x30, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, + 0x77, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x31, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x39, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, + 0xC4, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x1F, 0x00, 0x00, 0x00, + 0x3B, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4B, 0x00, 0x00, 0x00, + 0x3C, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x4E, 0x00, 0x00, 0x00, 0x4B, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x7C, 
0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, + 0x0E, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x07, 0x00, 0x56, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x5C, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, 0x02, 0x20, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x7C, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x09, 0x00, 0x56, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x5D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x63, 0x00, 0x05, 0x00, 0x61, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x65, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x5E, 0x00, 0x00, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x09, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x6A, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x63, 0x00, 0x05, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x6A, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x5E, 0x00, 0x00, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x7C, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, 0x4E, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x09, 0x00, 0x56, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, + 0x5D, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x05, 0x00, 0x6B, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00, + 0x6F, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x3D, 0x00, 0x04, 
0x00, 0x5E, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x72, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x09, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, 0x5D, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x63, 0x00, 0x05, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, + 0xF9, 0x00, 0x02, 0x00, 0x77, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x77, 0x00, 0x00, 0x00, + 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, + }; + public static readonly byte[] ColorCopyToNonMsComputeShaderSource = new byte[] { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0A, 0x00, 0x08, 0x00, 0x86, 0x00, 0x00, 0x00, @@ -801,6 +933,133 @@ namespace Ryujinx.Graphics.Vulkan.Shaders 0x84, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, }; + public static readonly byte[] ColorCopyWideningComputeShaderSource = new byte[] + { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x05, 0x01, 0x00, 0x0B, 0x00, 0x08, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x06, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x47, 0x4C, 0x53, 0x4C, 0x2E, 0x73, 0x74, 0x64, 0x2E, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0F, 0x00, 0x09, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6D, 0x61, 0x69, 0x6E, + 0x00, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xC2, 0x01, 
0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6D, 0x61, 0x69, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x67, 0x6C, 0x5F, 0x47, 0x6C, 0x6F, 0x62, 0x61, 0x6C, 0x49, 0x6E, 0x76, 0x6F, 0x63, 0x61, 0x74, + 0x69, 0x6F, 0x6E, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x00, 0x05, 0x00, 0x05, 0x00, 0x33, 0x00, 0x00, 0x00, 0x72, 0x61, 0x74, 0x69, + 0x6F, 0x5F, 0x69, 0x6E, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x72, 0x61, 0x74, 0x69, 0x6F, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, + 0x35, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x03, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0C, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, + 0x1C, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x33, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x33, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x35, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x42, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 
0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x2B, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2B, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1E, 0x00, 0x03, 0x00, 0x33, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x37, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, + 0x3F, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1B, 0x00, 
0x03, 0x00, 0x40, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x3B, 0x00, 0x04, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x05, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x2B, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2C, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x4F, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, + 0x61, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x61, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2C, 0x00, 0x06, 0x00, + 0x0A, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, 0x6E, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xF7, 0x00, 0x03, 0x00, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFB, 0x00, 0x03, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3D, 0x00, 0x04, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x4F, 0x00, 0x07, 0x00, 0x07, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x01, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x13, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x68, 0x00, 0x04, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x7C, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x1D, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xA8, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0xF7, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xFA, 0x00, 0x04, 0x00, 0x22, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0xF8, 0x00, 0x02, 0x00, 0x23, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x7C, 0x00, 0x04, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0xAF, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x2A, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x24, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x24, 0x00, 0x00, 0x00, 0xF5, 0x00, 0x07, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x2C, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x2B, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0xF7, 0x00, 0x03, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xFA, 0x00, 0x04, 0x00, 0x2C, 0x00, 0x00, 0x00, 0x2D, 0x00, 0x00, 0x00, 0x2E, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x2D, 0x00, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x70, 0x00, 
0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x2E, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x37, 0x00, 0x00, 0x00, + 0x38, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, + 0x0F, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0xC4, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x3A, 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x0D, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, 0x07, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, + 0x3A, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, 0x3F, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x08, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4A, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0x3F, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x08, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x0A, 0x20, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x04, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x5F, 0x00, 0x08, 0x00, 0x48, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, 0x5A, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 
0x06, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, 0x5B, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x5F, 0x00, 0x08, 0x00, 0x48, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x0A, 0x20, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, + 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x3D, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x66, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x7C, 0x00, 0x04, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x6D, 0x00, 0x00, 0x00, 0x4A, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x5C, 0x00, 0x00, 0x00, + 0x65, 0x00, 0x00, 0x00, 0x63, 0x00, 0x05, 0x00, 0x66, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x6D, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0xF9, 0x00, 0x02, 0x00, 0x70, 0x00, 0x00, 0x00, + 0xF8, 0x00, 0x02, 0x00, 0x70, 0x00, 0x00, 0x00, 0xFD, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00, + }; + public static readonly byte[] ColorDrawToMsVertexShaderSource = new byte[] { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0A, 0x00, 0x08, 0x00, 0x2E, 0x00, 0x00, 0x00, diff --git a/Ryujinx.Graphics.Vulkan/TextureView.cs b/Ryujinx.Graphics.Vulkan/TextureView.cs index aa050c015..264ecf5db 100644 --- a/Ryujinx.Graphics.Vulkan/TextureView.cs +++ b/Ryujinx.Graphics.Vulkan/TextureView.cs @@ -199,6 +199,12 @@ namespace Ryujinx.Graphics.Vulkan int layers = Math.Min(Info.GetLayers(), dst.Info.GetLayers() - firstLayer); _gd.HelperShader.CopyNonMSToMS(_gd, cbs, src, dst, 0, firstLayer, layers); } + else if (dst.Info.BytesPerPixel != Info.BytesPerPixel) + { + int layers = Math.Min(Info.GetLayers(), dst.Info.GetLayers() - firstLayer); + int levels = Math.Min(Info.Levels, dst.Info.Levels - firstLevel); + 
_gd.HelperShader.CopyIncompatibleFormats(_gd, cbs, src, dst, 0, firstLayer, 0, firstLevel, layers, levels); + } else { TextureCopy.Copy( @@ -244,6 +250,10 @@ namespace Ryujinx.Graphics.Vulkan { _gd.HelperShader.CopyNonMSToMS(_gd, cbs, src, dst, srcLayer, dstLayer, 1); } + else if (dst.Info.BytesPerPixel != Info.BytesPerPixel) + { + _gd.HelperShader.CopyIncompatibleFormats(_gd, cbs, src, dst, srcLayer, dstLayer, srcLevel, dstLevel, 1, 1); + } else { TextureCopy.Copy( From 1f8d66db7c91a3242629edca84f4df9c17a832ef Mon Sep 17 00:00:00 2001 From: TSRBerry <20988865+TSRBerry@users.noreply.github.com> Date: Wed, 22 Feb 2023 09:13:50 +0100 Subject: [PATCH 23/41] Ava: Fix Updater crashing on Linux (#4457) --- Ryujinx.Ava/Modules/Updater/Updater.cs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Ryujinx.Ava/Modules/Updater/Updater.cs b/Ryujinx.Ava/Modules/Updater/Updater.cs index b476bb85b..511e273e5 100644 --- a/Ryujinx.Ava/Modules/Updater/Updater.cs +++ b/Ryujinx.Ava/Modules/Updater/Updater.cs @@ -506,6 +506,11 @@ namespace Ryujinx.Modules Dispatcher.UIThread.Post(() => { + if (tarEntry is null) + { + return; + } + taskDialog.SetProgressBarState(GetPercentage(tarEntry.Size, inStream.Length), TaskDialogProgressState.Normal); }); } From f1eef29409e393e2470557cec65268f925099880 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Feb 2023 13:30:53 +0100 Subject: [PATCH 24/41] nuget: bump UnicornEngine.Unicorn (#4459) Bumps [UnicornEngine.Unicorn](https://github.com/unicorn-engine/unicorn) from 2.0.2-rc1-9c9356d to 2.0.2-rc1-a913199. - [Release notes](https://github.com/unicorn-engine/unicorn/releases) - [Changelog](https://github.com/unicorn-engine/unicorn/blob/master/ChangeLog) - [Commits](https://github.com/unicorn-engine/unicorn/commits) --- updated-dependencies: - dependency-name: UnicornEngine.Unicorn dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Directory.Packages.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 6f5ed3d71..9ddf8a63d 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -49,7 +49,7 @@ <PackageVersion Include="System.Management" Version="7.0.0" /> <PackageVersion Include="System.Net.NameResolution" Version="4.3.0" /> <PackageVersion Include="System.Threading.ThreadPool" Version="4.3.0" /> - <PackageVersion Include="UnicornEngine.Unicorn" Version="2.0.2-rc1-9c9356d" /> + <PackageVersion Include="UnicornEngine.Unicorn" Version="2.0.2-rc1-a913199" /> <PackageVersion Include="XamlNameReferenceGenerator" Version="1.5.1" /> </ItemGroup> </Project> \ No newline at end of file From c308f09722fdcd46b71c7f0892ebeb31a6345b3c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 22 Feb 2023 16:08:25 +0100 Subject: [PATCH 25/41] nuget: bump Microsoft.NET.Test.Sdk from 17.4.1 to 17.5.0 (#4458) Bumps [Microsoft.NET.Test.Sdk](https://github.com/microsoft/vstest) from 17.4.1 to 17.5.0. - [Release notes](https://github.com/microsoft/vstest/releases) - [Changelog](https://github.com/microsoft/vstest/blob/main/docs/releases.md) - [Commits](https://github.com/microsoft/vstest/compare/v17.4.1...v17.5.0) --- updated-dependencies: - dependency-name: Microsoft.NET.Test.Sdk dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Directory.Packages.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 9ddf8a63d..528dc4b4a 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -22,7 +22,7 @@ <PackageVersion Include="LibHac" Version="0.17.0" /> <PackageVersion Include="Microsoft.CodeAnalysis.Analyzers" Version="3.3.4" /> <PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.4.0" /> - <PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.4.1" /> + <PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.5.0" /> <PackageVersion Include="MsgPack.Cli" Version="1.0.1" /> <PackageVersion Include="NUnit" Version="3.13.3" /> <PackageVersion Include="NUnit3TestAdapter" Version="4.1.0" /> From f07ae7d53f511af38d9735c8a0aecc1cce6dc12b Mon Sep 17 00:00:00 2001 From: Isaac Marovitz <42140194+IsaacMarovitz@users.noreply.github.com> Date: Wed, 22 Feb 2023 15:58:32 -0500 Subject: [PATCH 26/41] Fix Title Update Manager not selecting right update (#4452) --- Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs b/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs index 3d0b20f7e..f330006e3 100644 --- a/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs @@ -105,13 +105,13 @@ public class TitleUpdateViewModel : BaseModel AddUpdate(path); } - // NOTE: Save the list again to remove leftovers. - Save(); - TitleUpdateModel selected = TitleUpdates.FirstOrDefault(x => x.Path == _titleUpdateWindowData.Selected, null); SelectedUpdate = selected; + // NOTE: Save the list again to remove leftovers. 
+ Save(); + SortUpdates(); } From 095ad923ad24c44e51ee6cee60edd50ee470fd71 Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Thu, 23 Feb 2023 06:08:54 -0300 Subject: [PATCH 27/41] Account for multisample when calculating render target size hint (#4467) --- Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs b/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs index ecfd763f6..d7d197adb 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Threed/StateUpdater.cs @@ -439,7 +439,7 @@ namespace Ryujinx.Graphics.Gpu.Engine.Threed int samplesInY = msaaMode.SamplesInY(); var scissor = _state.State.ScreenScissorState; - Size sizeHint = new Size(scissor.X + scissor.Width, scissor.Y + scissor.Height, 1); + Size sizeHint = new Size((scissor.X + scissor.Width) * samplesInX, (scissor.Y + scissor.Height) * samplesInY, 1); int clipRegionWidth = int.MaxValue; int clipRegionHeight = int.MaxValue; From 58207685c0dcda07d18f5f538629c775e2a714b8 Mon Sep 17 00:00:00 2001 From: jhorv <38920027+jhorv@users.noreply.github.com> Date: Sat, 25 Feb 2023 05:26:39 -0500 Subject: [PATCH 28/41] Perform bounds checking before list indexer to avoid frequent exceptions (#4438) * Perform bounds checking before list indexer to avoid frequent ArgumentOutOfRangeExceptions * do a single compare after casting id and .Count to uint --- Ryujinx.Graphics.Vulkan/IdList.cs | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/Ryujinx.Graphics.Vulkan/IdList.cs b/Ryujinx.Graphics.Vulkan/IdList.cs index d5a87a058..5c0623c3f 100644 --- a/Ryujinx.Graphics.Vulkan/IdList.cs +++ b/Ryujinx.Graphics.Vulkan/IdList.cs @@ -80,8 +80,16 @@ namespace Ryujinx.Graphics.Vulkan try { - value = _list[id]; - return value != null; + if ((uint)id < (uint)_list.Count) + { + value = _list[id]; + return value != null; + } + else + { + 
value = null; + return false; + } } catch (ArgumentOutOfRangeException) { From cedd2007451c046a1276556bacb4e19333b11557 Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Sat, 25 Feb 2023 07:39:51 -0300 Subject: [PATCH 29/41] Move gl_Layer to vertex shader if geometry is not supported (#4368) * Set gl_Layer on vertex shader if it's set on the geometry shader and it does nothing else * Shader cache version bump * PR feedback * Fix typo --- Ryujinx.Graphics.GAL/Capabilities.cs | 3 + .../Shader/DiskCache/DiskCacheHostStorage.cs | 4 +- .../DiskCache/ParallelDiskCacheLoader.cs | 5 + .../Shader/GpuAccessorBase.cs | 2 + Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs | 38 +++++ Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs | 1 + Ryujinx.Graphics.Shader/IGpuAccessor.cs | 9 ++ .../ShaderIdentification.cs | 8 + Ryujinx.Graphics.Shader/ShaderProgramInfo.cs | 6 + .../Translation/EmitterContext.cs | 7 + .../Translation/ShaderConfig.cs | 22 ++- .../Translation/ShaderIdentifier.cs | 145 ++++++++++++++++++ .../Translation/Translator.cs | 4 +- .../Translation/TranslatorContext.cs | 10 ++ Ryujinx.Graphics.Vulkan/VulkanRenderer.cs | 1 + 15 files changed, 262 insertions(+), 3 deletions(-) create mode 100644 Ryujinx.Graphics.Shader/ShaderIdentification.cs create mode 100644 Ryujinx.Graphics.Shader/Translation/ShaderIdentifier.cs diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs index a24139eba..7822da211 100644 --- a/Ryujinx.Graphics.GAL/Capabilities.cs +++ b/Ryujinx.Graphics.GAL/Capabilities.cs @@ -26,6 +26,7 @@ namespace Ryujinx.Graphics.GAL public readonly bool SupportsBlendEquationAdvanced; public readonly bool SupportsFragmentShaderInterlock; public readonly bool SupportsFragmentShaderOrderingIntel; + public readonly bool SupportsGeometryShader; public readonly bool SupportsGeometryShaderPassthrough; public readonly bool SupportsImageLoadFormatted; public readonly bool SupportsLayerVertexTessellation; @@ -68,6 +69,7 @@ namespace 
Ryujinx.Graphics.GAL bool supportsBlendEquationAdvanced, bool supportsFragmentShaderInterlock, bool supportsFragmentShaderOrderingIntel, + bool supportsGeometryShader, bool supportsGeometryShaderPassthrough, bool supportsImageLoadFormatted, bool supportsLayerVertexTessellation, @@ -107,6 +109,7 @@ namespace Ryujinx.Graphics.GAL SupportsBlendEquationAdvanced = supportsBlendEquationAdvanced; SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock; SupportsFragmentShaderOrderingIntel = supportsFragmentShaderOrderingIntel; + SupportsGeometryShader = supportsGeometryShader; SupportsGeometryShaderPassthrough = supportsGeometryShaderPassthrough; SupportsImageLoadFormatted = supportsImageLoadFormatted; SupportsLayerVertexTessellation = supportsLayerVertexTessellation; diff --git a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs index 1f6dab893..edc5a8a08 100644 --- a/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs +++ b/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs @@ -22,7 +22,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache private const ushort FileFormatVersionMajor = 1; private const ushort FileFormatVersionMinor = 2; private const uint FileFormatVersionPacked = ((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor; - private const uint CodeGenVersion = 4369; + private const uint CodeGenVersion = 4368; private const string SharedTocFileName = "shared.toc"; private const string SharedDataFileName = "shared.data"; @@ -774,6 +774,8 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache sBuffers, textures, images, + ShaderIdentification.None, + 0, dataInfo.Stage, dataInfo.UsesInstanceId, dataInfo.UsesDrawParameters, diff --git a/Ryujinx.Graphics.Gpu/Shader/DiskCache/ParallelDiskCacheLoader.cs b/Ryujinx.Graphics.Gpu/Shader/DiskCache/ParallelDiskCacheLoader.cs index 722e66b36..77fb3ca4b 100644 --- 
a/Ryujinx.Graphics.Gpu/Shader/DiskCache/ParallelDiskCacheLoader.cs +++ b/Ryujinx.Graphics.Gpu/Shader/DiskCache/ParallelDiskCacheLoader.cs @@ -633,6 +633,11 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache } } + if (!_context.Capabilities.SupportsGeometryShader) + { + ShaderCache.TryRemoveGeometryStage(translatorContexts); + } + CachedShaderStage[] shaders = new CachedShaderStage[guestShaders.Length]; List<ShaderProgram> translatedStages = new List<ShaderProgram>(); diff --git a/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs b/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs index d36ffd70f..1402f146b 100644 --- a/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs +++ b/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs @@ -126,6 +126,8 @@ namespace Ryujinx.Graphics.Gpu.Shader public bool QueryHostSupportsFragmentShaderOrderingIntel() => _context.Capabilities.SupportsFragmentShaderOrderingIntel; + public bool QueryHostSupportsGeometryShader() => _context.Capabilities.SupportsGeometryShader; + public bool QueryHostSupportsGeometryShaderPassthrough() => _context.Capabilities.SupportsGeometryShaderPassthrough; public bool QueryHostSupportsImageLoadFormatted() => _context.Capabilities.SupportsImageLoadFormatted; diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs index 5c045d9ba..11f7085d3 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderCache.cs @@ -353,6 +353,11 @@ namespace Ryujinx.Graphics.Gpu.Shader } } + if (!_context.Capabilities.SupportsGeometryShader) + { + TryRemoveGeometryStage(translatorContexts); + } + CachedShaderStage[] shaders = new CachedShaderStage[Constants.ShaderStages + 1]; List<ShaderSource> shaderSources = new List<ShaderSource>(); @@ -421,6 +426,39 @@ namespace Ryujinx.Graphics.Gpu.Shader return gpShaders; } + /// <summary> + /// Tries to eliminate the geometry stage from the array of translator contexts. 
+ /// </summary> + /// <param name="translatorContexts">Array of translator contexts</param> + public static void TryRemoveGeometryStage(TranslatorContext[] translatorContexts) + { + if (translatorContexts[4] != null) + { + // We have a geometry shader, but geometry shaders are not supported. + // Try to eliminate the geometry shader. + + ShaderProgramInfo info = translatorContexts[4].Translate().Info; + + if (info.Identification == ShaderIdentification.GeometryLayerPassthrough) + { + // We managed to identify that this geometry shader is only used to set the output Layer value, + // we can set the Layer on the previous stage instead (usually the vertex stage) and eliminate it. + + for (int i = 3; i >= 1; i--) + { + if (translatorContexts[i] != null) + { + translatorContexts[i].SetGeometryShaderLayerInputAttribute(info.GpLayerInputAttribute); + translatorContexts[i].SetLastInVertexPipeline(translatorContexts[5] != null); + break; + } + } + + translatorContexts[4] = null; + } + } + } + /// <summary> /// Creates a shader source for use with the backend from a translated shader program. 
/// </summary> diff --git a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs index efbd17c1b..9490684cd 100644 --- a/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs +++ b/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs @@ -124,6 +124,7 @@ namespace Ryujinx.Graphics.OpenGL supportsBlendEquationAdvanced: HwCapabilities.SupportsBlendEquationAdvanced, supportsFragmentShaderInterlock: HwCapabilities.SupportsFragmentShaderInterlock, supportsFragmentShaderOrderingIntel: HwCapabilities.SupportsFragmentShaderOrdering, + supportsGeometryShader: true, supportsGeometryShaderPassthrough: HwCapabilities.SupportsGeometryShaderPassthrough, supportsImageLoadFormatted: HwCapabilities.SupportsImageLoadFormatted, supportsLayerVertexTessellation: HwCapabilities.SupportsShaderViewportLayerArray, diff --git a/Ryujinx.Graphics.Shader/IGpuAccessor.cs b/Ryujinx.Graphics.Shader/IGpuAccessor.cs index 55df8dc31..f364437c7 100644 --- a/Ryujinx.Graphics.Shader/IGpuAccessor.cs +++ b/Ryujinx.Graphics.Shader/IGpuAccessor.cs @@ -259,6 +259,15 @@ namespace Ryujinx.Graphics.Shader return false; } + /// <summary> + /// Queries host GPU geometry shader support. + /// </summary> + /// <returns>True if the GPU and driver support geometry shaders, false otherwise</returns> + bool QueryHostSupportsGeometryShader() + { + return true; + } + /// <summary> /// Queries host GPU geometry shader passthrough support. 
/// </summary> diff --git a/Ryujinx.Graphics.Shader/ShaderIdentification.cs b/Ryujinx.Graphics.Shader/ShaderIdentification.cs new file mode 100644 index 000000000..3f0157626 --- /dev/null +++ b/Ryujinx.Graphics.Shader/ShaderIdentification.cs @@ -0,0 +1,8 @@ +namespace Ryujinx.Graphics.Shader +{ + public enum ShaderIdentification + { + None, + GeometryLayerPassthrough + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Shader/ShaderProgramInfo.cs b/Ryujinx.Graphics.Shader/ShaderProgramInfo.cs index bb75b10ae..30f0ffaa2 100644 --- a/Ryujinx.Graphics.Shader/ShaderProgramInfo.cs +++ b/Ryujinx.Graphics.Shader/ShaderProgramInfo.cs @@ -10,6 +10,8 @@ namespace Ryujinx.Graphics.Shader public ReadOnlyCollection<TextureDescriptor> Textures { get; } public ReadOnlyCollection<TextureDescriptor> Images { get; } + public ShaderIdentification Identification { get; } + public int GpLayerInputAttribute { get; } public ShaderStage Stage { get; } public bool UsesInstanceId { get; } public bool UsesDrawParameters { get; } @@ -22,6 +24,8 @@ namespace Ryujinx.Graphics.Shader BufferDescriptor[] sBuffers, TextureDescriptor[] textures, TextureDescriptor[] images, + ShaderIdentification identification, + int gpLayerInputAttribute, ShaderStage stage, bool usesInstanceId, bool usesDrawParameters, @@ -34,6 +38,8 @@ namespace Ryujinx.Graphics.Shader Textures = Array.AsReadOnly(textures); Images = Array.AsReadOnly(images); + Identification = identification; + GpLayerInputAttribute = gpLayerInputAttribute; Stage = stage; UsesInstanceId = usesInstanceId; UsesDrawParameters = usesDrawParameters; diff --git a/Ryujinx.Graphics.Shader/Translation/EmitterContext.cs b/Ryujinx.Graphics.Shader/Translation/EmitterContext.cs index ad55c0109..8f33cceda 100644 --- a/Ryujinx.Graphics.Shader/Translation/EmitterContext.cs +++ b/Ryujinx.Graphics.Shader/Translation/EmitterContext.cs @@ -241,6 +241,13 @@ namespace Ryujinx.Graphics.Shader.Translation this.Copy(Attribute(AttributeConsts.PositionZ), 
this.FPFusedMultiplyAdd(z, ConstF(0.5f), halfW)); } + + if (Config.Stage != ShaderStage.Geometry && Config.HasLayerInputAttribute) + { + Config.SetUsedFeature(FeatureFlags.RtLayer); + + this.Copy(Attribute(AttributeConsts.Layer), Attribute(Config.GpLayerInputAttribute | AttributeConsts.LoadOutputMask)); + } } public void PrepareForVertexReturn(out Operand oldXLocal, out Operand oldYLocal, out Operand oldZLocal) diff --git a/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs b/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs index a79ef6f57..2caa8f638 100644 --- a/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs +++ b/Ryujinx.Graphics.Shader/Translation/ShaderConfig.cs @@ -20,6 +20,8 @@ namespace Ryujinx.Graphics.Shader.Translation public bool LastInPipeline { get; private set; } public bool LastInVertexPipeline { get; private set; } + public bool HasLayerInputAttribute { get; private set; } + public int GpLayerInputAttribute { get; private set; } public int ThreadsPerInputPrimitive { get; } public OutputTopology OutputTopology { get; } @@ -245,6 +247,22 @@ namespace Ryujinx.Graphics.Shader.Translation LayerOutputAttribute = attr; } + public void SetGeometryShaderLayerInputAttribute(int attr) + { + HasLayerInputAttribute = true; + GpLayerInputAttribute = attr; + } + + public void SetLastInVertexPipeline(bool hasFragment) + { + if (!hasFragment) + { + LastInPipeline = true; + } + + LastInVertexPipeline = true; + } + public void SetInputUserAttributeFixedFunc(int index) { UsedInputAttributes |= 1 << index; @@ -706,13 +724,15 @@ namespace Ryujinx.Graphics.Shader.Translation return FindDescriptorIndex(GetImageDescriptors(), texOp); } - public ShaderProgramInfo CreateProgramInfo() + public ShaderProgramInfo CreateProgramInfo(ShaderIdentification identification = ShaderIdentification.None) { return new ShaderProgramInfo( GetConstantBufferDescriptors(), GetStorageBufferDescriptors(), GetTextureDescriptors(), GetImageDescriptors(), + identification, + 
GpLayerInputAttribute, Stage, UsedFeatures.HasFlag(FeatureFlags.InstanceId), UsedFeatures.HasFlag(FeatureFlags.DrawParameters), diff --git a/Ryujinx.Graphics.Shader/Translation/ShaderIdentifier.cs b/Ryujinx.Graphics.Shader/Translation/ShaderIdentifier.cs new file mode 100644 index 000000000..206718f2a --- /dev/null +++ b/Ryujinx.Graphics.Shader/Translation/ShaderIdentifier.cs @@ -0,0 +1,145 @@ +using Ryujinx.Graphics.Shader.IntermediateRepresentation; +using static Ryujinx.Graphics.Shader.IntermediateRepresentation.OperandHelper; + +namespace Ryujinx.Graphics.Shader.Translation +{ + static class ShaderIdentifier + { + public static ShaderIdentification Identify(Function[] functions, ShaderConfig config) + { + if (config.Stage == ShaderStage.Geometry && + config.GpuAccessor.QueryPrimitiveTopology() == InputTopology.Triangles && + !config.GpuAccessor.QueryHostSupportsGeometryShader() && + IsLayerPassthroughGeometryShader(functions, out int layerInputAttr)) + { + config.SetGeometryShaderLayerInputAttribute(layerInputAttr); + + return ShaderIdentification.GeometryLayerPassthrough; + } + + return ShaderIdentification.None; + } + + private static bool IsLayerPassthroughGeometryShader(Function[] functions, out int layerInputAttr) + { + bool writesLayer = false; + layerInputAttr = 0; + + if (functions.Length != 1) + { + return false; + } + + int verticesCount = 0; + int totalVerticesCount = 0; + + foreach (BasicBlock block in functions[0].Blocks) + { + // We are not expecting loops or any complex control flow here, so fail in those cases. 
+ if (block.Branch != null && block.Branch.Index <= block.Index) + { + return false; + } + + foreach (INode node in block.Operations) + { + if (!(node is Operation operation)) + { + continue; + } + + if (IsResourceWrite(operation.Inst)) + { + return false; + } + + if (operation.Inst == Instruction.StoreAttribute) + { + return false; + } + + if (operation.Inst == Instruction.Copy && operation.Dest.Type == OperandType.Attribute) + { + Operand src = operation.GetSource(0); + + if (src.Type == OperandType.LocalVariable && src.AsgOp is Operation asgOp && asgOp.Inst == Instruction.LoadAttribute) + { + src = Attribute(asgOp.GetSource(0).Value); + } + + if (src.Type == OperandType.Attribute) + { + if (operation.Dest.Value == AttributeConsts.Layer) + { + if ((src.Value & AttributeConsts.LoadOutputMask) != 0) + { + return false; + } + + writesLayer = true; + layerInputAttr = src.Value; + } + else if (src.Value != operation.Dest.Value) + { + return false; + } + } + else if (src.Type == OperandType.Constant) + { + int dstComponent = (operation.Dest.Value >> 2) & 3; + float expectedValue = dstComponent == 3 ? 
1f : 0f; + + if (src.AsFloat() != expectedValue) + { + return false; + } + } + else + { + return false; + } + } + else if (operation.Inst == Instruction.EmitVertex) + { + verticesCount++; + } + else if (operation.Inst == Instruction.EndPrimitive) + { + totalVerticesCount += verticesCount; + verticesCount = 0; + } + } + } + + return totalVerticesCount + verticesCount == 3 && writesLayer; + } + + private static bool IsResourceWrite(Instruction inst) + { + switch (inst) + { + case Instruction.AtomicAdd: + case Instruction.AtomicAnd: + case Instruction.AtomicCompareAndSwap: + case Instruction.AtomicMaxS32: + case Instruction.AtomicMaxU32: + case Instruction.AtomicMinS32: + case Instruction.AtomicMinU32: + case Instruction.AtomicOr: + case Instruction.AtomicSwap: + case Instruction.AtomicXor: + case Instruction.ImageAtomic: + case Instruction.ImageStore: + case Instruction.StoreGlobal: + case Instruction.StoreGlobal16: + case Instruction.StoreGlobal8: + case Instruction.StoreStorage: + case Instruction.StoreStorage16: + case Instruction.StoreStorage8: + return true; + } + + return false; + } + } +} diff --git a/Ryujinx.Graphics.Shader/Translation/Translator.cs b/Ryujinx.Graphics.Shader/Translation/Translator.cs index 3fb586cbb..6a1230458 100644 --- a/Ryujinx.Graphics.Shader/Translation/Translator.cs +++ b/Ryujinx.Graphics.Shader/Translation/Translator.cs @@ -77,9 +77,11 @@ namespace Ryujinx.Graphics.Shader.Translation funcs[i] = new Function(cfg.Blocks, $"fun{i}", false, inArgumentsCount, outArgumentsCount); } + var identification = ShaderIdentifier.Identify(funcs, config); + var sInfo = StructuredProgram.MakeStructuredProgram(funcs, config); - var info = config.CreateProgramInfo(); + var info = config.CreateProgramInfo(identification); return config.Options.TargetLanguage switch { diff --git a/Ryujinx.Graphics.Shader/Translation/TranslatorContext.cs b/Ryujinx.Graphics.Shader/Translation/TranslatorContext.cs index 127f84a67..3b88fdbab 100644 --- 
a/Ryujinx.Graphics.Shader/Translation/TranslatorContext.cs +++ b/Ryujinx.Graphics.Shader/Translation/TranslatorContext.cs @@ -138,6 +138,16 @@ namespace Ryujinx.Graphics.Shader.Translation _config.MergeFromtNextStage(nextStage._config); } + public void SetGeometryShaderLayerInputAttribute(int attr) + { + _config.SetGeometryShaderLayerInputAttribute(attr); + } + + public void SetLastInVertexPipeline(bool hasFragment) + { + _config.SetLastInVertexPipeline(hasFragment); + } + public ShaderProgram Translate(TranslatorContext other = null) { FunctionCode[] code = EmitShader(_program, _config, initializeOutputs: other == null, out _); diff --git a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs index 4c7c731be..6b6352571 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs @@ -546,6 +546,7 @@ namespace Ryujinx.Graphics.Vulkan supportsBlendEquationAdvanced: Capabilities.SupportsBlendEquationAdvanced, supportsFragmentShaderInterlock: Capabilities.SupportsFragmentShaderInterlock, supportsFragmentShaderOrderingIntel: false, + supportsGeometryShader: Capabilities.SupportsGeometryShader, supportsGeometryShaderPassthrough: Capabilities.SupportsGeometryShaderPassthrough, supportsImageLoadFormatted: features2.Features.ShaderStorageImageReadWithoutFormat, supportsLayerVertexTessellation: featuresVk12.ShaderOutputLayer, From f7c2e867f4e0c9067c0c88f58b5df4cef6ee4399 Mon Sep 17 00:00:00 2001 From: Mary <mary@mary.zone> Date: Sat, 25 Feb 2023 11:55:57 +0100 Subject: [PATCH 30/41] chore: Update OpenTK to 4.7.7 (#4478) --- Directory.Packages.props | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 528dc4b4a..ae05ff54c 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -26,10 +26,10 @@ <PackageVersion Include="MsgPack.Cli" Version="1.0.1" /> <PackageVersion Include="NUnit" Version="3.13.3" /> 
<PackageVersion Include="NUnit3TestAdapter" Version="4.1.0" /> - <PackageVersion Include="OpenTK.Core" Version="4.7.5" /> - <PackageVersion Include="OpenTK.Graphics" Version="4.7.5" /> - <PackageVersion Include="OpenTK.OpenAL" Version="4.7.5" /> - <PackageVersion Include="OpenTK.Windowing.GraphicsLibraryFramework" Version="4.7.5" /> + <PackageVersion Include="OpenTK.Core" Version="4.7.7" /> + <PackageVersion Include="OpenTK.Graphics" Version="4.7.7" /> + <PackageVersion Include="OpenTK.OpenAL" Version="4.7.7" /> + <PackageVersion Include="OpenTK.Windowing.GraphicsLibraryFramework" Version="4.7.7" /> <PackageVersion Include="Ryujinx.Audio.OpenAL.Dependencies" Version="1.21.0.1" /> <PackageVersion Include="Ryujinx.Graphics.Nvdec.Dependencies" Version="5.0.1-build13" /> <PackageVersion Include="Ryujinx.Graphics.Vulkan.Dependencies.MoltenVK" Version="1.2.0" /> From f663a5cd38e0ac0191f5859ed5bc25f5a7a9a907 Mon Sep 17 00:00:00 2001 From: Mary <mary@mary.zone> Date: Sat, 25 Feb 2023 12:30:48 +0100 Subject: [PATCH 31/41] macos: Add updater support (#4464) This is a very basic updater but should be enough for now. 
--------- Co-authored-by: TSRBerry <20988865+TSRBerry@users.noreply.github.com> --- Ryujinx.Ava/Modules/Updater/Updater.cs | 257 ++++++++++++--------- distribution/macos/create_app_bundle.sh | 1 + distribution/macos/create_macos_release.sh | 2 +- distribution/macos/updater.sh | 39 ++++ 4 files changed, 191 insertions(+), 108 deletions(-) create mode 100755 distribution/macos/updater.sh diff --git a/Ryujinx.Ava/Modules/Updater/Updater.cs b/Ryujinx.Ava/Modules/Updater/Updater.cs index 511e273e5..e89abd1da 100644 --- a/Ryujinx.Ava/Modules/Updater/Updater.cs +++ b/Ryujinx.Ava/Modules/Updater/Updater.cs @@ -21,6 +21,7 @@ using System.Net.Http; using System.Net.NetworkInformation; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Versioning; using System.Text; using System.Threading; using System.Threading.Tasks; @@ -57,7 +58,7 @@ namespace Ryujinx.Modules // Detect current platform if (OperatingSystem.IsMacOS()) { - _platformExt = "osx_x64.zip"; + _platformExt = "macos_universal.app.tar.gz"; } else if (OperatingSystem.IsWindows()) { @@ -286,22 +287,40 @@ namespace Ryujinx.Modules if (_updateSuccessful) { - var shouldRestart = await ContentDialogHelper.CreateChoiceDialog(LocaleManager.Instance[LocaleKeys.RyujinxUpdater], - LocaleManager.Instance[LocaleKeys.DialogUpdaterCompleteMessage], - LocaleManager.Instance[LocaleKeys.DialogUpdaterRestartMessage]); + bool shouldRestart = true; + + if (!OperatingSystem.IsMacOS()) + { + shouldRestart = await ContentDialogHelper.CreateChoiceDialog(LocaleManager.Instance[LocaleKeys.RyujinxUpdater], + LocaleManager.Instance[LocaleKeys.DialogUpdaterCompleteMessage], + LocaleManager.Instance[LocaleKeys.DialogUpdaterRestartMessage]); + } if (shouldRestart) { + List<string> arguments = CommandLineState.Arguments.ToList(); string ryuName = Path.GetFileName(Environment.ProcessPath); - string ryuExe = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, ryuName); + string executableDirectory = 
AppDomain.CurrentDomain.BaseDirectory; + string executablePath = Path.Combine(executableDirectory, ryuName); - if (!Path.Exists(ryuExe)) + if (!Path.Exists(executablePath)) { - ryuExe = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, OperatingSystem.IsWindows() ? "Ryujinx.exe" : "Ryujinx"); + executablePath = Path.Combine(executableDirectory, OperatingSystem.IsWindows() ? "Ryujinx.exe" : "Ryujinx"); } - Process.Start(ryuExe, CommandLineState.Arguments); + // On macOS we perform the update at relaunch. + if (OperatingSystem.IsMacOS()) + { + string baseBundlePath = Path.GetFullPath(Path.Combine(executableDirectory, "..", "..")); + string newBundlePath = Path.Combine(UpdateDir, "Ryujinx.app"); + string updaterScriptPath = Path.Combine(newBundlePath, "Contents", "Resources", "updater.sh"); + string currentPid = Process.GetCurrentProcess().Id.ToString(); + executablePath = "/bin/bash"; + arguments.InsertRange(0, new List<string> { updaterScriptPath, baseBundlePath, newBundlePath, currentPid }); + } + + Process.Start(executablePath, arguments); Environment.Exit(0); } } @@ -381,6 +400,15 @@ namespace Ryujinx.Modules File.WriteAllBytes(updateFile, mergedFileBytes); + // On macOS, ensure that we remove the quarantine bit to prevent Gatekeeper from blocking execution. 
+ if (OperatingSystem.IsMacOS()) + { + using (Process xattrProcess = Process.Start("xattr", new List<string> { "-d", "com.apple.quarantine", updateFile })) + { + xattrProcess.WaitForExit(); + } + } + try { InstallUpdate(taskDialog, updateFile); @@ -470,87 +498,98 @@ namespace Ryujinx.Modules worker.Start(); } + [SupportedOSPlatform("linux")] + [SupportedOSPlatform("macos")] + private static void ExtractTarGzipFile(TaskDialog taskDialog, string archivePath, string outputDirectoryPath) + { + using Stream inStream = File.OpenRead(archivePath); + using GZipInputStream gzipStream = new(inStream); + using TarInputStream tarStream = new(gzipStream, Encoding.ASCII); + + TarEntry tarEntry; + + while ((tarEntry = tarStream.GetNextEntry()) is not null) + { + if (tarEntry.IsDirectory) + { + continue; + } + + string outPath = Path.Combine(outputDirectoryPath, tarEntry.Name); + + Directory.CreateDirectory(Path.GetDirectoryName(outPath)); + + using (FileStream outStream = File.OpenWrite(outPath)) + { + tarStream.CopyEntryContents(outStream); + } + + File.SetUnixFileMode(outPath, (UnixFileMode)tarEntry.TarHeader.Mode); + File.SetLastWriteTime(outPath, DateTime.SpecifyKind(tarEntry.ModTime, DateTimeKind.Utc)); + + Dispatcher.UIThread.Post(() => + { + if (tarEntry is null) + { + return; + } + + taskDialog.SetProgressBarState(GetPercentage(tarEntry.Size, inStream.Length), TaskDialogProgressState.Normal); + }); + } + } + + private static void ExtractZipFile(TaskDialog taskDialog, string archivePath, string outputDirectoryPath) + { + using Stream inStream = File.OpenRead(archivePath); + using ZipFile zipFile = new(inStream); + + double count = 0; + foreach (ZipEntry zipEntry in zipFile) + { + count++; + if (zipEntry.IsDirectory) continue; + + string outPath = Path.Combine(outputDirectoryPath, zipEntry.Name); + + Directory.CreateDirectory(Path.GetDirectoryName(outPath)); + + using (Stream zipStream = zipFile.GetInputStream(zipEntry)) + using (FileStream outStream = 
File.OpenWrite(outPath)) + { + zipStream.CopyTo(outStream); + } + + File.SetLastWriteTime(outPath, DateTime.SpecifyKind(zipEntry.DateTime, DateTimeKind.Utc)); + + Dispatcher.UIThread.Post(() => + { + taskDialog.SetProgressBarState(GetPercentage(count, zipFile.Count), TaskDialogProgressState.Normal); + }); + } + } + private static async void InstallUpdate(TaskDialog taskDialog, string updateFile) { // Extract Update taskDialog.SubHeader = LocaleManager.Instance[LocaleKeys.UpdaterExtracting]; taskDialog.SetProgressBarState(0, TaskDialogProgressState.Normal); - if (OperatingSystem.IsLinux()) + await Task.Run(() => { - using Stream inStream = File.OpenRead(updateFile); - using GZipInputStream gzipStream = new(inStream); - using TarInputStream tarStream = new(gzipStream, Encoding.ASCII); - - await Task.Run(() => + if (OperatingSystem.IsLinux() || OperatingSystem.IsMacOS()) { - TarEntry tarEntry; - - if (!OperatingSystem.IsWindows()) - { - while ((tarEntry = tarStream.GetNextEntry()) is not null) - { - if (tarEntry.IsDirectory) continue; - - string outPath = Path.Combine(UpdateDir, tarEntry.Name); - - Directory.CreateDirectory(Path.GetDirectoryName(outPath)); - - using (FileStream outStream = File.OpenWrite(outPath)) - { - tarStream.CopyEntryContents(outStream); - } - - File.SetUnixFileMode(outPath, (UnixFileMode)tarEntry.TarHeader.Mode); - File.SetLastWriteTime(outPath, DateTime.SpecifyKind(tarEntry.ModTime, DateTimeKind.Utc)); - - Dispatcher.UIThread.Post(() => - { - if (tarEntry is null) - { - return; - } - - taskDialog.SetProgressBarState(GetPercentage(tarEntry.Size, inStream.Length), TaskDialogProgressState.Normal); - }); - } - } - }); - - taskDialog.SetProgressBarState(100, TaskDialogProgressState.Normal); - } - else - { - using Stream inStream = File.OpenRead(updateFile); - using ZipFile zipFile = new(inStream); - - await Task.Run(() => + ExtractTarGzipFile(taskDialog, updateFile, UpdateDir); + } + else if (OperatingSystem.IsWindows()) { - double count = 0; - 
foreach (ZipEntry zipEntry in zipFile) - { - count++; - if (zipEntry.IsDirectory) continue; - - string outPath = Path.Combine(UpdateDir, zipEntry.Name); - - Directory.CreateDirectory(Path.GetDirectoryName(outPath)); - - using (Stream zipStream = zipFile.GetInputStream(zipEntry)) - using (FileStream outStream = File.OpenWrite(outPath)) - { - zipStream.CopyTo(outStream); - } - - File.SetLastWriteTime(outPath, DateTime.SpecifyKind(zipEntry.DateTime, DateTimeKind.Utc)); - - Dispatcher.UIThread.Post(() => - { - taskDialog.SetProgressBarState(GetPercentage(count, zipFile.Count), TaskDialogProgressState.Normal); - }); - } - }); - } + ExtractZipFile(taskDialog, updateFile, UpdateDir); + } + else + { + throw new NotSupportedException(); + } + }); // Delete downloaded zip File.Delete(updateFile); @@ -560,38 +599,42 @@ namespace Ryujinx.Modules taskDialog.SubHeader = LocaleManager.Instance[LocaleKeys.UpdaterRenaming]; taskDialog.SetProgressBarState(0, TaskDialogProgressState.Normal); - // Replace old files - await Task.Run(() => + // NOTE: On macOS, replacement is delayed to the restart phase. 
+ if (!OperatingSystem.IsMacOS()) { - double count = 0; - foreach (string file in allFiles) + // Replace old files + await Task.Run(() => { - count++; - try + double count = 0; + foreach (string file in allFiles) { - File.Move(file, file + ".ryuold"); - - Dispatcher.UIThread.Post(() => + count++; + try { - taskDialog.SetProgressBarState(GetPercentage(count, allFiles.Count), TaskDialogProgressState.Normal); - }); - } - catch - { - Logger.Warning?.Print(LogClass.Application, LocaleManager.Instance.UpdateAndGetDynamicValue(LocaleKeys.UpdaterRenameFailed, file)); - } - } + File.Move(file, file + ".ryuold"); - Dispatcher.UIThread.Post(() => - { - taskDialog.SubHeader = LocaleManager.Instance[LocaleKeys.UpdaterAddingFiles]; - taskDialog.SetProgressBarState(0, TaskDialogProgressState.Normal); + Dispatcher.UIThread.Post(() => + { + taskDialog.SetProgressBarState(GetPercentage(count, allFiles.Count), TaskDialogProgressState.Normal); + }); + } + catch + { + Logger.Warning?.Print(LogClass.Application, LocaleManager.Instance.UpdateAndGetDynamicValue(LocaleKeys.UpdaterRenameFailed, file)); + } + } + + Dispatcher.UIThread.Post(() => + { + taskDialog.SubHeader = LocaleManager.Instance[LocaleKeys.UpdaterAddingFiles]; + taskDialog.SetProgressBarState(0, TaskDialogProgressState.Normal); + }); + + MoveAllFilesOver(UpdatePublishDir, HomeDir, taskDialog); }); - MoveAllFilesOver(UpdatePublishDir, HomeDir, taskDialog); - }); - - Directory.Delete(UpdateDir, true); + Directory.Delete(UpdateDir, true); + } _updateSuccessful = true; @@ -601,7 +644,7 @@ namespace Ryujinx.Modules public static bool CanUpdate(bool showWarnings) { #if !DISABLE_UPDATER - if (RuntimeInformation.OSArchitecture != Architecture.X64) + if (RuntimeInformation.OSArchitecture != Architecture.X64 && !OperatingSystem.IsMacOS()) { if (showWarnings) { @@ -674,7 +717,7 @@ namespace Ryujinx.Modules #endif } - // NOTE: This method should always reflect the latest build layout.s + // NOTE: This method should always reflect the 
latest build layout. private static IEnumerable<string> EnumerateFilesToDelete() { var files = Directory.EnumerateFiles(HomeDir); // All files directly in base dir. diff --git a/distribution/macos/create_app_bundle.sh b/distribution/macos/create_app_bundle.sh index 8076303cb..b62f3491e 100755 --- a/distribution/macos/create_app_bundle.sh +++ b/distribution/macos/create_app_bundle.sh @@ -24,6 +24,7 @@ cp $PUBLISH_DIRECTORY/*.dylib $APP_BUNDLE_DIRECTORY/Contents/Frameworks # Then resources cp Info.plist $APP_BUNDLE_DIRECTORY/Contents cp Ryujinx.icns $APP_BUNDLE_DIRECTORY/Contents/Resources/Ryujinx.icns +cp updater.sh $APP_BUNDLE_DIRECTORY/Contents/Resources/updater.sh cp -r $PUBLISH_DIRECTORY/THIRDPARTY.md $APP_BUNDLE_DIRECTORY/Contents/Resources echo -n "APPL????" > $APP_BUNDLE_DIRECTORY/Contents/PkgInfo diff --git a/distribution/macos/create_macos_release.sh b/distribution/macos/create_macos_release.sh index 545baf20e..d979ec8f0 100755 --- a/distribution/macos/create_macos_release.sh +++ b/distribution/macos/create_macos_release.sh @@ -27,7 +27,7 @@ EXECUTABLE_SUB_PATH=Contents/MacOS/Ryujinx rm -rf $TEMP_DIRECTORY mkdir -p $TEMP_DIRECTORY -DOTNET_COMMON_ARGS="-p:DebugType=embedded -p:Version=$VERSION -p:SourceRevisionId=$SOURCE_REVISION_ID -p:ExtraDefineConstants=DISABLE_UPDATER --self-contained true" +DOTNET_COMMON_ARGS="-p:DebugType=embedded -p:Version=$VERSION -p:SourceRevisionId=$SOURCE_REVISION_ID --self-contained true" dotnet restore dotnet build -c Release Ryujinx.Ava diff --git a/distribution/macos/updater.sh b/distribution/macos/updater.sh new file mode 100755 index 000000000..b60ac34df --- /dev/null +++ b/distribution/macos/updater.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -e + +INSTALL_DIRECTORY=$1 +NEW_APP_DIRECTORY=$2 +APP_PID=$3 +APP_ARGUMENTS="${@:4}" + +error_handler() { + local lineno="$1" + + script=""" + set alertTitle to \"Ryujinx - Updater error\" + set alertMessage to \"An error occurred during Ryujinx update (updater.sh:$lineno)\n\nPlease 
download the update manually from our website if the problem persists.\" + display dialog alertMessage with icon caution with title alertTitle buttons {\"Open Download Page\", \"Exit\"} + set the button_pressed to the button returned of the result + + if the button_pressed is \"Open Download Page\" then + open location \"https://ryujinx.org/download\" + end if + """ + + osascript -e "$script" + exit 1 +} + +trap 'error_handler ${LINENO}' ERR + +# Wait for Ryujinx to exit +# NOTE: in case no fds are open, lsof could be returning with a process still living. +# We wait 1s and assume the process stopped after that +lsof -p $APP_PID +r 1 &>/dev/null +sleep 1 + +# Now replace and reopen. +rm -rf "$INSTALL_DIRECTORY" +mv "$NEW_APP_DIRECTORY" "$INSTALL_DIRECTORY" +open -a "$INSTALL_DIRECTORY" --args "$APP_ARGUMENTS" From e691622f0a118d550a7891896e40b0d9ab39fb60 Mon Sep 17 00:00:00 2001 From: Mary <mary@mary.zone> Date: Sat, 25 Feb 2023 13:29:30 +0100 Subject: [PATCH 32/41] misc: Add missing DefineConstants definition in Ryujinx.Common Fix flathub and nixpkgs build hopefully now. 
--- Ryujinx.Common/Ryujinx.Common.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/Ryujinx.Common/Ryujinx.Common.csproj b/Ryujinx.Common/Ryujinx.Common.csproj index ca5de76a6..c307f524e 100644 --- a/Ryujinx.Common/Ryujinx.Common.csproj +++ b/Ryujinx.Common/Ryujinx.Common.csproj @@ -3,6 +3,7 @@ <PropertyGroup> <TargetFramework>net7.0</TargetFramework> <AllowUnsafeBlocks>true</AllowUnsafeBlocks> + <DefineConstants Condition=" '$(ExtraDefineConstants)' != '' ">$(DefineConstants);$(ExtraDefineConstants)</DefineConstants> </PropertyGroup> <ItemGroup> From 9b1cc2cec6135602efc5dc5afa45ed3db261eb42 Mon Sep 17 00:00:00 2001 From: merry <git@mary.rs> Date: Sat, 25 Feb 2023 15:07:23 +0000 Subject: [PATCH 33/41] Logging: Redirect StdErr into logging system (#4427) * Logging: Redirect StdErr into logging system * Remove Mono.Unix * Apply suggestions from code review Co-authored-by: riperiperi <rhy3756547@hotmail.com> * Address comments --------- Co-authored-by: Mary <thog@protonmail.com> Co-authored-by: riperiperi <rhy3756547@hotmail.com> Co-authored-by: Mary <mary@mary.zone> --- Directory.Packages.props | 2 +- Ryujinx.Common/Logging/Logger.cs | 15 +- Ryujinx.Common/SystemInterop/StdErrAdapter.cs | 93 +++++++++++ Ryujinx.Common/SystemInterop/UnixStream.cs | 155 ++++++++++++++++++ 4 files changed, 263 insertions(+), 2 deletions(-) create mode 100644 Ryujinx.Common/SystemInterop/StdErrAdapter.cs create mode 100644 Ryujinx.Common/SystemInterop/UnixStream.cs diff --git a/Directory.Packages.props b/Directory.Packages.props index ae05ff54c..35c98e5a3 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -52,4 +52,4 @@ <PackageVersion Include="UnicornEngine.Unicorn" Version="2.0.2-rc1-a913199" /> <PackageVersion Include="XamlNameReferenceGenerator" Version="1.5.1" /> </ItemGroup> -</Project> \ No newline at end of file +</Project> diff --git a/Ryujinx.Common/Logging/Logger.cs b/Ryujinx.Common/Logging/Logger.cs index c1abdba9b..4d48dd48d 100644 --- 
a/Ryujinx.Common/Logging/Logger.cs +++ b/Ryujinx.Common/Logging/Logger.cs @@ -1,3 +1,4 @@ +using Ryujinx.Common.SystemInterop; using System; using System.Collections.Generic; using System.Diagnostics; @@ -14,6 +15,8 @@ namespace Ryujinx.Common.Logging private static readonly List<ILogTarget> m_LogTargets; + private static readonly StdErrAdapter _stdErrAdapter; + public static event EventHandler<LogEventArgs> Updated; public readonly struct Log @@ -77,7 +80,13 @@ namespace Ryujinx.Common.Logging { Updated?.Invoke(null, new LogEventArgs(Level, m_Time.Elapsed, Thread.CurrentThread.Name, FormatMessage(logClass, caller, "Stubbed. " + message), data)); } - } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void PrintRawMsg(string message) + { + Updated?.Invoke(null, new LogEventArgs(Level, m_Time.Elapsed, Thread.CurrentThread.Name, message)); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static string FormatMessage(LogClass Class, string Caller, string Message) => $"{Class} {Caller}: {Message}"; @@ -119,6 +128,8 @@ namespace Ryujinx.Common.Logging Warning = new Log(LogLevel.Warning); Info = new Log(LogLevel.Info); Trace = new Log(LogLevel.Trace); + + _stdErrAdapter = new StdErrAdapter(); } public static void RestartTime() @@ -164,6 +175,8 @@ namespace Ryujinx.Common.Logging { Updated = null; + _stdErrAdapter.Dispose(); + foreach (var target in m_LogTargets) { target.Dispose(); diff --git a/Ryujinx.Common/SystemInterop/StdErrAdapter.cs b/Ryujinx.Common/SystemInterop/StdErrAdapter.cs new file mode 100644 index 000000000..12e240ad3 --- /dev/null +++ b/Ryujinx.Common/SystemInterop/StdErrAdapter.cs @@ -0,0 +1,93 @@ +using System; +using System.IO; +using System.Runtime.Versioning; +using System.Threading; +using Ryujinx.Common.Logging; +using System.Runtime.InteropServices; + +namespace Ryujinx.Common.SystemInterop +{ + public partial class StdErrAdapter : IDisposable + { + private bool _disposable = false; + private UnixStream _pipeReader; 
+ private UnixStream _pipeWriter; + private Thread _worker; + + public StdErrAdapter() + { + if (OperatingSystem.IsLinux() || OperatingSystem.IsMacOS()) + { + RegisterPosix(); + } + } + + [SupportedOSPlatform("linux")] + [SupportedOSPlatform("macos")] + private void RegisterPosix() + { + const int stdErrFileno = 2; + + (int readFd, int writeFd) = MakePipe(); + dup2(writeFd, stdErrFileno); + + _pipeReader = new UnixStream(readFd); + _pipeWriter = new UnixStream(writeFd); + + _worker = new Thread(EventWorker); + _disposable = true; + _worker.Start(); + } + + [SupportedOSPlatform("linux")] + [SupportedOSPlatform("macos")] + private void EventWorker() + { + TextReader reader = new StreamReader(_pipeReader); + string line; + while ((line = reader.ReadLine()) != null) + { + Logger.Error?.PrintRawMsg(line); + } + } + + private void Dispose(bool disposing) + { + if (_disposable) + { + if (OperatingSystem.IsLinux() || OperatingSystem.IsMacOS()) + { + _pipeReader?.Close(); + _pipeWriter?.Close(); + } + + _disposable = false; + } + } + + public void Dispose() + { + Dispose(true); + } + + [LibraryImport("libc", SetLastError = true)] + private static partial int dup2(int fd, int fd2); + + [LibraryImport("libc", SetLastError = true)] + private static unsafe partial int pipe(int* pipefd); + + private static unsafe (int, int) MakePipe() + { + int *pipefd = stackalloc int[2]; + + if (pipe(pipefd) == 0) + { + return (pipefd[0], pipefd[1]); + } + else + { + throw new(); + } + } + } +} diff --git a/Ryujinx.Common/SystemInterop/UnixStream.cs b/Ryujinx.Common/SystemInterop/UnixStream.cs new file mode 100644 index 000000000..1d6449974 --- /dev/null +++ b/Ryujinx.Common/SystemInterop/UnixStream.cs @@ -0,0 +1,155 @@ +using System; +using System.IO; +using System.Runtime.InteropServices; +using System.Runtime.Versioning; + +namespace Ryujinx.Common.SystemInterop +{ + [SupportedOSPlatform("linux")] + [SupportedOSPlatform("macos")] + public partial class UnixStream : Stream, IDisposable + { + 
private const int InvalidFd = -1; + + private int _fd; + + [LibraryImport("libc", SetLastError = true)] + private static partial long read(int fd, IntPtr buf, ulong count); + + [LibraryImport("libc", SetLastError = true)] + private static partial long write(int fd, IntPtr buf, ulong count); + + [LibraryImport("libc", SetLastError = true)] + private static partial int close(int fd); + + public UnixStream(int fd) + { + if (InvalidFd == fd) + { + throw new ArgumentException("Invalid file descriptor"); + } + + _fd = fd; + + CanRead = read(fd, IntPtr.Zero, 0) != -1; + CanWrite = write(fd, IntPtr.Zero, 0) != -1; + } + + ~UnixStream() + { + Close(); + } + + public override bool CanRead { get; } + public override bool CanWrite { get; } + public override bool CanSeek => false; + + public override long Length => throw new NotSupportedException(); + + public override long Position + { + get => throw new NotSupportedException(); + set => throw new NotSupportedException(); + } + + public override void Flush() + { + } + + public override unsafe int Read([In, Out] byte[] buffer, int offset, int count) + { + if (offset < 0 || offset > (buffer.Length - count) || count < 0) + { + throw new ArgumentOutOfRangeException(); + } + + if (buffer.Length == 0) + { + return 0; + } + + long r = 0; + fixed (byte* buf = &buffer[offset]) + { + do + { + r = read(_fd, (IntPtr)buf, (ulong)count); + } while (ShouldRetry(r)); + } + + return (int)r; + } + + public override unsafe void Write(byte[] buffer, int offset, int count) + { + if (offset < 0 || offset > (buffer.Length - count) || count < 0) + { + throw new ArgumentOutOfRangeException(); + } + + if (buffer.Length == 0) + { + return; + } + + fixed (byte* buf = &buffer[offset]) + { + long r = 0; + do { + r = write(_fd, (IntPtr)buf, (ulong)count); + } while (ShouldRetry(r)); + } + } + + public override long Seek(long offset, SeekOrigin origin) + { + throw new NotSupportedException(); + } + + public override void SetLength(long value) + { + throw new 
NotSupportedException(); + } + + public override void Close() + { + if (_fd == InvalidFd) + { + return; + } + + Flush(); + + int r; + do { + r = close(_fd); + } while (ShouldRetry(r)); + + _fd = InvalidFd; + } + + void IDisposable.Dispose() + { + Close(); + } + + private bool ShouldRetry(long r) + { + if (r == -1) + { + const int eintr = 4; + + int errno = Marshal.GetLastPInvokeError(); + + if (errno == eintr) + { + return true; + } + + throw new SystemException($"Operation failed with error 0x{errno:X}"); + } + + return false; + } + } +} From 5d85468302dd21a93ac141abfb7b8749b938dc9a Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Sun, 26 Feb 2023 19:19:00 -0300 Subject: [PATCH 34/41] Vulkan: Support list topology primitive restart (#4483) --- .../HardwareCapabilities.cs | 6 +++++ Ryujinx.Graphics.Vulkan/PipelineState.cs | 21 +++++++++++---- .../VulkanInitialization.cs | 27 +++++++++++++++++++ Ryujinx.Graphics.Vulkan/VulkanRenderer.cs | 13 +++++++++ 4 files changed, 62 insertions(+), 5 deletions(-) diff --git a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs index 4512d375f..a45c2409b 100644 --- a/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs +++ b/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs @@ -33,6 +33,8 @@ namespace Ryujinx.Graphics.Vulkan public readonly bool SupportsMultiView; public readonly bool SupportsNullDescriptors; public readonly bool SupportsPushDescriptors; + public readonly bool SupportsPrimitiveTopologyListRestart; + public readonly bool SupportsPrimitiveTopologyPatchListRestart; public readonly bool SupportsTransformFeedback; public readonly bool SupportsTransformFeedbackQueries; public readonly bool SupportsPreciseOcclusionQueries; @@ -63,6 +65,8 @@ namespace Ryujinx.Graphics.Vulkan bool supportsMultiView, bool supportsNullDescriptors, bool supportsPushDescriptors, + bool supportsPrimitiveTopologyListRestart, + bool supportsPrimitiveTopologyPatchListRestart, bool 
supportsTransformFeedback, bool supportsTransformFeedbackQueries, bool supportsPreciseOcclusionQueries, @@ -92,6 +96,8 @@ namespace Ryujinx.Graphics.Vulkan SupportsMultiView = supportsMultiView; SupportsNullDescriptors = supportsNullDescriptors; SupportsPushDescriptors = supportsPushDescriptors; + SupportsPrimitiveTopologyListRestart = supportsPrimitiveTopologyListRestart; + SupportsPrimitiveTopologyPatchListRestart = supportsPrimitiveTopologyPatchListRestart; SupportsTransformFeedback = supportsTransformFeedback; SupportsTransformFeedbackQueries = supportsTransformFeedbackQueries; SupportsPreciseOcclusionQueries = supportsPreciseOcclusionQueries; diff --git a/Ryujinx.Graphics.Vulkan/PipelineState.cs b/Ryujinx.Graphics.Vulkan/PipelineState.cs index 0d5494766..dccc8ce68 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineState.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineState.cs @@ -417,11 +417,22 @@ namespace Ryujinx.Graphics.Vulkan bool primitiveRestartEnable = PrimitiveRestartEnable; - primitiveRestartEnable &= Topology == PrimitiveTopology.LineStrip || - Topology == PrimitiveTopology.TriangleStrip || - Topology == PrimitiveTopology.TriangleFan || - Topology == PrimitiveTopology.LineStripWithAdjacency || - Topology == PrimitiveTopology.TriangleStripWithAdjacency; + bool topologySupportsRestart; + + if (gd.Capabilities.SupportsPrimitiveTopologyListRestart) + { + topologySupportsRestart = gd.Capabilities.SupportsPrimitiveTopologyPatchListRestart || Topology != PrimitiveTopology.PatchList; + } + else + { + topologySupportsRestart = Topology == PrimitiveTopology.LineStrip || + Topology == PrimitiveTopology.TriangleStrip || + Topology == PrimitiveTopology.TriangleFan || + Topology == PrimitiveTopology.LineStripWithAdjacency || + Topology == PrimitiveTopology.TriangleStripWithAdjacency; + } + + primitiveRestartEnable &= topologySupportsRestart; var inputAssemblyState = new PipelineInputAssemblyStateCreateInfo() { diff --git a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs 
b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs index 353b219ac..ba3b5ef65 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs @@ -32,6 +32,7 @@ namespace Ryujinx.Graphics.Vulkan "VK_EXT_descriptor_indexing", // Enabling this works around an issue with disposed buffer bindings on RADV. "VK_EXT_fragment_shader_interlock", "VK_EXT_index_type_uint8", + "VK_EXT_primitive_topology_list_restart", "VK_EXT_robustness2", "VK_EXT_shader_stencil_export", "VK_KHR_shader_float16_int8", @@ -429,6 +430,17 @@ namespace Ryujinx.Graphics.Vulkan features2.PNext = &supportedFeaturesCustomBorderColor; } + PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT supportedFeaturesPrimitiveTopologyListRestart = new PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT() + { + SType = StructureType.PhysicalDevicePrimitiveTopologyListRestartFeaturesExt, + PNext = features2.PNext + }; + + if (supportedExtensions.Contains("VK_EXT_primitive_topology_list_restart")) + { + features2.PNext = &supportedFeaturesPrimitiveTopologyListRestart; + } + PhysicalDeviceTransformFeedbackFeaturesEXT supportedFeaturesTransformFeedback = new PhysicalDeviceTransformFeedbackFeaturesEXT() { SType = StructureType.PhysicalDeviceTransformFeedbackFeaturesExt, @@ -497,6 +509,21 @@ namespace Ryujinx.Graphics.Vulkan pExtendedFeatures = &featuresTransformFeedback; } + PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT featuresPrimitiveTopologyListRestart; + + if (supportedExtensions.Contains("VK_EXT_primitive_topology_list_restart")) + { + featuresPrimitiveTopologyListRestart = new PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT() + { + SType = StructureType.PhysicalDevicePrimitiveTopologyListRestartFeaturesExt, + PNext = pExtendedFeatures, + PrimitiveTopologyListRestart = supportedFeaturesPrimitiveTopologyListRestart.PrimitiveTopologyListRestart, + PrimitiveTopologyPatchListRestart = 
supportedFeaturesPrimitiveTopologyListRestart.PrimitiveTopologyPatchListRestart + }; + + pExtendedFeatures = &featuresPrimitiveTopologyListRestart; + } + PhysicalDeviceRobustness2FeaturesEXT featuresRobustness2; if (supportedExtensions.Contains("VK_EXT_robustness2")) diff --git a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs index 6b6352571..8d4e54c4b 100644 --- a/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs +++ b/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs @@ -195,6 +195,11 @@ namespace Ryujinx.Graphics.Vulkan SType = StructureType.PhysicalDeviceFeatures2 }; + PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT featuresPrimitiveTopologyListRestart = new PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT() + { + SType = StructureType.PhysicalDevicePrimitiveTopologyListRestartFeaturesExt + }; + PhysicalDeviceRobustness2FeaturesEXT featuresRobustness2 = new PhysicalDeviceRobustness2FeaturesEXT() { SType = StructureType.PhysicalDeviceRobustness2FeaturesExt @@ -215,8 +220,14 @@ namespace Ryujinx.Graphics.Vulkan SType = StructureType.PhysicalDevicePortabilitySubsetFeaturesKhr }; + if (supportedExtensions.Contains("VK_EXT_primitive_topology_list_restart")) + { + features2.PNext = &featuresPrimitiveTopologyListRestart; + } + if (supportedExtensions.Contains("VK_EXT_robustness2")) { + featuresRobustness2.PNext = features2.PNext; features2.PNext = &featuresRobustness2; } @@ -288,6 +299,8 @@ namespace Ryujinx.Graphics.Vulkan features2.Features.MultiViewport, featuresRobustness2.NullDescriptor || IsMoltenVk, supportedExtensions.Contains(KhrPushDescriptor.ExtensionName), + featuresPrimitiveTopologyListRestart.PrimitiveTopologyListRestart, + featuresPrimitiveTopologyListRestart.PrimitiveTopologyPatchListRestart, supportsTransformFeedback, propertiesTransformFeedback.TransformFeedbackQueries, features2.Features.OcclusionQueryPrecise, From 80b497213981512e9ba1a629bcd5e2c519d2e566 Mon Sep 17 00:00:00 2001 From: Emmanuel Hansen 
<emmausssss@gmail.com> Date: Mon, 27 Feb 2023 21:11:55 +0000 Subject: [PATCH 35/41] Add Support for Post Processing Effects (#3616) * Add Post Processing Effects * fix events and shader issues * fix gtk upscale slider value * fix bgra games * don't swap swizzle if already swapped * restore opengl texture state after effects run * addressed review * use single pipeline for smaa and fsr * call finish on all pipelines * addressed review * attempt fix file case * attempt fixing file case * fix filter level tick frequency * adjust filter slider margins * replace fxaa shaders with original shader * addressed review --- Ryujinx.Ava/AppHost.cs | 28 + Ryujinx.Ava/Assets/Locales/en_US.json | 10 + .../UI/ViewModels/SettingsViewModel.cs | 32 + .../Views/Settings/SettingsGraphicsView.axaml | 78 + Ryujinx.Common/Configuration/AntiAliasing.cs | 12 + Ryujinx.Common/Configuration/ScalingFilter.cs | 9 + Ryujinx.Graphics.GAL/AntiAliasing.cs | 12 + Ryujinx.Graphics.GAL/IWindow.cs | 4 + .../Multithreading/ThreadedWindow.cs | 6 + Ryujinx.Graphics.GAL/UpscaleType.cs | 9 + .../Effects/FsrScalingFilter.cs | 177 + .../Effects/FxaaPostProcessingEffect.cs | 81 + .../Effects/IPostProcessingEffect.cs | 11 + .../Effects/IScalingFilter.cs | 18 + .../Effects/ShaderHelper.cs | 40 + .../Effects/Shaders/ffx_a.h | 2656 +++++++++++ .../Effects/Shaders/ffx_fsr1.h | 1199 +++++ .../Effects/Shaders/fsr_scaling.glsl | 88 + .../Effects/Shaders/fsr_sharpening.glsl | 37 + .../Effects/Shaders/fxaa.glsl | 1174 +++++ .../Effects/Shaders/smaa.hlsl | 1361 ++++++ .../Effects/Shaders/smaa_blend.glsl | 26 + .../Effects/Shaders/smaa_edge.glsl | 24 + .../Effects/Shaders/smaa_neighbour.glsl | 26 + .../Effects/SmaaPostProcessingEffect.cs | 261 ++ .../Effects/Textures/SmaaAreaTexture.bin | Bin 0 -> 179200 bytes .../Effects/Textures/SmaaSearchTexture.bin | Bin 0 -> 1024 bytes .../Ryujinx.Graphics.OpenGL.csproj | 14 + Ryujinx.Graphics.OpenGL/Window.cs | 215 +- .../DescriptorSetUpdater.cs | 7 + .../Effects/FsrScalingFilter.cs 
| 208 + .../Effects/FxaaPostProcessingEffect.cs | 127 + .../Effects/IPostProcessingEffect.cs | 10 + .../Effects/IScalingFilter.cs | 20 + .../Effects/Shaders/FsrScaling.glsl | 3945 +++++++++++++++++ .../Effects/Shaders/FsrScaling.spv | Bin 0 -> 44672 bytes .../Effects/Shaders/FsrSharpening.glsl | 3904 ++++++++++++++++ .../Effects/Shaders/FsrSharpening.spv | Bin 0 -> 20472 bytes .../Effects/Shaders/Fxaa.glsl | 1177 +++++ .../Effects/Shaders/Fxaa.spv | Bin 0 -> 25012 bytes .../Effects/Shaders/SmaaBlend.glsl | 1404 ++++++ .../Effects/Shaders/SmaaBlend.spv | Bin 0 -> 33728 bytes .../Effects/Shaders/SmaaEdge.glsl | 1402 ++++++ .../Effects/Shaders/SmaaEdge.spv | Bin 0 -> 8464 bytes .../Effects/Shaders/SmaaNeighbour.glsl | 1403 ++++++ .../Effects/Shaders/SmaaNeighbour.spv | Bin 0 -> 8328 bytes .../Effects/SmaaConstants.cs | 15 + .../Effects/SmaaPostProcessingEffect.cs | 314 ++ .../Effects/Textures/SmaaAreaTexture.bin | Bin 0 -> 179200 bytes .../Effects/Textures/SmaaSearchTexture.bin | Bin 0 -> 1024 bytes Ryujinx.Graphics.Vulkan/NativeArray.cs | 7 +- Ryujinx.Graphics.Vulkan/PipelineBase.cs | 27 + .../Ryujinx.Graphics.Vulkan.csproj | 11 + Ryujinx.Graphics.Vulkan/Window.cs | 167 +- Ryujinx.Graphics.Vulkan/WindowBase.cs | 3 + .../Configuration/ConfigurationFileFormat.cs | 17 +- .../Configuration/ConfigurationState.cs | 41 + Ryujinx/Ui/RendererWidgetBase.cs | 28 + Ryujinx/Ui/Windows/SettingsWindow.cs | 12 + Ryujinx/Ui/Windows/SettingsWindow.glade | 123 +- 60 files changed, 21954 insertions(+), 26 deletions(-) create mode 100644 Ryujinx.Common/Configuration/AntiAliasing.cs create mode 100644 Ryujinx.Common/Configuration/ScalingFilter.cs create mode 100644 Ryujinx.Graphics.GAL/AntiAliasing.cs create mode 100644 Ryujinx.Graphics.GAL/UpscaleType.cs create mode 100644 Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs create mode 100644 Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs create mode 100644 Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs create mode 
100644 Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs create mode 100644 Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl create mode 100644 Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin create mode 100644 Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin create mode 100644 Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs create mode 100644 Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs create mode 100644 Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs create mode 100644 Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl create mode 100644 
Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv create mode 100644 Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs create mode 100644 Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin create mode 100644 Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin diff --git a/Ryujinx.Ava/AppHost.cs b/Ryujinx.Ava/AppHost.cs index 242c84e7f..eb22b39e9 100644 --- a/Ryujinx.Ava/AppHost.cs +++ b/Ryujinx.Ava/AppHost.cs @@ -171,6 +171,11 @@ namespace Ryujinx.Ava ConfigurationState.Instance.Graphics.AspectRatio.Event += UpdateAspectRatioState; ConfigurationState.Instance.System.EnableDockedMode.Event += UpdateDockedModeState; ConfigurationState.Instance.System.AudioVolume.Event += UpdateAudioVolumeState; + ConfigurationState.Instance.System.EnableDockedMode.Event += UpdateDockedModeState; + ConfigurationState.Instance.System.AudioVolume.Event += UpdateAudioVolumeState; + ConfigurationState.Instance.Graphics.AntiAliasing.Event += UpdateAntiAliasing; + ConfigurationState.Instance.Graphics.ScalingFilter.Event += UpdateScalingFilter; + ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event += UpdateScalingFilterLevel; _gpuCancellationTokenSource = new CancellationTokenSource(); } @@ -193,6 +198,17 @@ namespace Ryujinx.Ava } } } + private void UpdateScalingFilterLevel(object sender, ReactiveEventArgs<int> e) + { + _renderer.Window?.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); + _renderer.Window?.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value); + } + + private void UpdateScalingFilter(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.ScalingFilter> e) + { + 
_renderer.Window?.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); + _renderer.Window?.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value); + } private void ShowCursor() { @@ -345,6 +361,11 @@ namespace Ryujinx.Ava } } + private void UpdateAntiAliasing(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.AntiAliasing> e) + { + _renderer?.Window?.SetAntiAliasing((Graphics.GAL.AntiAliasing)e.NewValue); + } + private void UpdateDockedModeState(object sender, ReactiveEventArgs<bool> e) { Device?.System.ChangeDockedModeState(e.NewValue); @@ -411,6 +432,9 @@ namespace Ryujinx.Ava ConfigurationState.Instance.Graphics.AspectRatio.Event -= UpdateAspectRatioState; ConfigurationState.Instance.System.EnableDockedMode.Event -= UpdateDockedModeState; ConfigurationState.Instance.System.AudioVolume.Event -= UpdateAudioVolumeState; + ConfigurationState.Instance.Graphics.ScalingFilter.Event -= UpdateScalingFilter; + ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event -= UpdateScalingFilterLevel; + ConfigurationState.Instance.Graphics.AntiAliasing.Event -= UpdateAntiAliasing; _topLevel.PointerMoved -= TopLevel_PointerMoved; @@ -788,6 +812,10 @@ namespace Ryujinx.Ava Device.Gpu.Renderer.Initialize(_glLogLevel); + _renderer?.Window?.SetAntiAliasing((Graphics.GAL.AntiAliasing)ConfigurationState.Instance.Graphics.AntiAliasing.Value); + _renderer?.Window?.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); + _renderer?.Window?.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value); + Width = (int)_rendererHost.Bounds.Width; Height = (int)_rendererHost.Bounds.Height; diff --git a/Ryujinx.Ava/Assets/Locales/en_US.json b/Ryujinx.Ava/Assets/Locales/en_US.json index b7d1e02bf..db8d24241 100644 --- a/Ryujinx.Ava/Assets/Locales/en_US.json +++ b/Ryujinx.Ava/Assets/Locales/en_US.json @@ -626,6 +626,16 @@ 
"Recover": "Recover", "UserProfilesRecoverHeading" : "Saves were found for the following accounts", "UserProfilesRecoverEmptyList": "No profiles to recover", + "GraphicsAATooltip": "Applies anti-aliasing to the game render", + "GraphicsAALabel": "Anti-Aliasing:", + "GraphicsScalingFilterLabel": "Scaling Filter:", + "GraphicsScalingFilterTooltip": "Enables Framebuffer Scaling", + "GraphicsScalingFilterLevelLabel": "Level", + "GraphicsScalingFilterLevelTooltip": "Set Scaling Filter Level", + "SmaaLow": "SMAA Low", + "SmaaMedium": "SMAA Medium", + "SmaaHigh": "SMAA High", + "SmaaUltra": "SMAA Ultra", "UserEditorTitle" : "Edit User", "UserEditorTitleCreate" : "Create User" } diff --git a/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs b/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs index 36b37b0f5..7045c9ed3 100644 --- a/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/SettingsViewModel.cs @@ -45,6 +45,8 @@ namespace Ryujinx.Ava.UI.ViewModels private KeyboardHotkeys _keyboardHotkeys; private int _graphicsBackendIndex; private string _customThemePath; + private int _scalingFilter; + private int _scalingFilterLevel; public event Action CloseWindow; public event Action SaveSettingsEvent; @@ -153,6 +155,8 @@ namespace Ryujinx.Ava.UI.ViewModels public bool IsSDL2Enabled { get; set; } public bool EnableCustomTheme { get; set; } public bool IsCustomResolutionScaleActive => _resolutionScale == 4; + public bool IsScalingFilterActive => _scalingFilter == (int)Ryujinx.Common.Configuration.ScalingFilter.Fsr; + public bool IsVulkanSelected => GraphicsBackendIndex == 0; public bool UseHypervisor { get; set; } @@ -179,6 +183,18 @@ namespace Ryujinx.Ava.UI.ViewModels public int AudioBackend { get; set; } public int MaxAnisotropy { get; set; } public int AspectRatio { get; set; } + public int AntiAliasingEffect { get; set; } + public string ScalingFilterLevelText => ScalingFilterLevel.ToString("0"); + public int ScalingFilterLevel + { + get => 
_scalingFilterLevel; + set + { + _scalingFilterLevel = value; + OnPropertyChanged(); + OnPropertyChanged(nameof(ScalingFilterLevelText)); + } + } public int OpenglDebugLevel { get; set; } public int MemoryMode { get; set; } public int BaseStyleIndex { get; set; } @@ -192,6 +208,16 @@ namespace Ryujinx.Ava.UI.ViewModels OnPropertyChanged(nameof(IsVulkanSelected)); } } + public int ScalingFilter + { + get => _scalingFilter; + set + { + _scalingFilter = value; + OnPropertyChanged(); + OnPropertyChanged(nameof(IsScalingFilterActive)); + } + } public int PreferredGpuIndex { get; set; } @@ -365,6 +391,9 @@ namespace Ryujinx.Ava.UI.ViewModels AspectRatio = (int)config.Graphics.AspectRatio.Value; GraphicsBackendMultithreadingIndex = (int)config.Graphics.BackendThreading.Value; ShaderDumpPath = config.Graphics.ShadersDumpPath; + AntiAliasingEffect = (int)config.Graphics.AntiAliasing.Value; + ScalingFilter = (int)config.Graphics.ScalingFilter.Value; + ScalingFilterLevel = config.Graphics.ScalingFilterLevel.Value; // Audio AudioBackend = (int)config.System.AudioBackend.Value; @@ -447,6 +476,9 @@ namespace Ryujinx.Ava.UI.ViewModels config.Graphics.ResScaleCustom.Value = CustomResolutionScale; config.Graphics.MaxAnisotropy.Value = MaxAnisotropy == 0 ? 
-1 : MathF.Pow(2, MaxAnisotropy); config.Graphics.AspectRatio.Value = (AspectRatio)AspectRatio; + config.Graphics.AntiAliasing.Value = (AntiAliasing)AntiAliasingEffect; + config.Graphics.ScalingFilter.Value = (ScalingFilter)ScalingFilter; + config.Graphics.ScalingFilterLevel.Value = ScalingFilterLevel; if (ConfigurationState.Instance.Graphics.BackendThreading != (BackendThreading)GraphicsBackendMultithreadingIndex) { diff --git a/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml b/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml index fb30fb7f4..8e4122f38 100644 --- a/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml +++ b/Ryujinx.Ava/UI/Views/Settings/SettingsGraphicsView.axaml @@ -7,6 +7,7 @@ xmlns:ui="clr-namespace:FluentAvalonia.UI.Controls;assembly=FluentAvalonia" xmlns:locale="clr-namespace:Ryujinx.Ava.Common.Locale" xmlns:viewModels="clr-namespace:Ryujinx.Ava.UI.ViewModels" + Design.Width="1000" mc:Ignorable="d" x:CompileBindings="True" x:DataType="viewModels:SettingsViewModel"> @@ -111,6 +112,83 @@ Minimum="0.1" Value="{Binding CustomResolutionScale}" /> </StackPanel> + <StackPanel + HorizontalAlignment="Stretch" + Orientation="Vertical" + Spacing="10"> + <StackPanel Orientation="Horizontal"> + <TextBlock VerticalAlignment="Center" + ToolTip.Tip="{locale:Locale GraphicsAATooltip}" + Text="{locale:Locale GraphicsAALabel}" + Width="250" /> + <ComboBox Width="350" + HorizontalContentAlignment="Left" + ToolTip.Tip="{locale:Locale GraphicsAATooltip}" + SelectedIndex="{Binding AntiAliasingEffect}"> + <ComboBoxItem> + <TextBlock Text="{locale:Locale SettingsTabLoggingGraphicsBackendLogLevelNone}" /> + </ComboBoxItem> + <ComboBoxItem> + <TextBlock Text="FXAA" /> + </ComboBoxItem> + <ComboBoxItem> + <TextBlock Text="{locale:Locale SmaaLow}" /> + </ComboBoxItem> + <ComboBoxItem> + <TextBlock Text="{locale:Locale SmaaMedium}" /> + </ComboBoxItem> + <ComboBoxItem> + <TextBlock Text="{locale:Locale SmaaHigh}" /> + </ComboBoxItem> + <ComboBoxItem> + 
<TextBlock Text="{locale:Locale SmaaUltra}" /> + </ComboBoxItem> + </ComboBox> + </StackPanel> + </StackPanel> + <StackPanel + HorizontalAlignment="Stretch" + Orientation="Vertical" + Spacing="10"> + <StackPanel Orientation="Horizontal"> + <TextBlock VerticalAlignment="Center" + ToolTip.Tip="{locale:Locale GraphicsScalingFilterTooltip}" + Text="{locale:Locale GraphicsScalingFilterLabel}" + Width="250" /> + <ComboBox Width="350" + HorizontalContentAlignment="Left" + ToolTip.Tip="{locale:Locale GraphicsScalingFilterTooltip}" + SelectedIndex="{Binding ScalingFilter}"> + <ComboBoxItem> + <TextBlock Text="Bilinear" /> + </ComboBoxItem> + <ComboBoxItem> + <TextBlock Text="Nearest" /> + </ComboBoxItem> + <ComboBoxItem> + <TextBlock Text="FSR" /> + </ComboBoxItem> + </ComboBox> + <Slider Value="{Binding ScalingFilterLevel}" + ToolTip.Tip="{locale:Locale GraphicsScalingFilterLevelTooltip}" + MinWidth="150" + Margin="10,-3,0,0" + Height="32" + Padding="0,-5" + IsVisible="{Binding IsScalingFilterActive}" + TickFrequency="1" + IsSnapToTickEnabled="True" + LargeChange="10" + SmallChange="1" + VerticalAlignment="Center" + Minimum="0" + Maximum="100" /> + <TextBlock Margin="5,0" + Width="40" + IsVisible="{Binding IsScalingFilterActive}" + Text="{Binding ScalingFilterLevelText}"/> + </StackPanel> + </StackPanel> <StackPanel Orientation="Horizontal"> <TextBlock VerticalAlignment="Center" ToolTip.Tip="{locale:Locale AnisotropyTooltip}" diff --git a/Ryujinx.Common/Configuration/AntiAliasing.cs b/Ryujinx.Common/Configuration/AntiAliasing.cs new file mode 100644 index 000000000..6543598c7 --- /dev/null +++ b/Ryujinx.Common/Configuration/AntiAliasing.cs @@ -0,0 +1,12 @@ +namespace Ryujinx.Common.Configuration +{ + public enum AntiAliasing + { + None, + Fxaa, + SmaaLow, + SmaaMedium, + SmaaHigh, + SmaaUltra + } +} diff --git a/Ryujinx.Common/Configuration/ScalingFilter.cs b/Ryujinx.Common/Configuration/ScalingFilter.cs new file mode 100644 index 000000000..2095b89b1 --- /dev/null +++ 
b/Ryujinx.Common/Configuration/ScalingFilter.cs @@ -0,0 +1,9 @@ +namespace Ryujinx.Common.Configuration +{ + public enum ScalingFilter + { + Bilinear, + Nearest, + Fsr + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.GAL/AntiAliasing.cs b/Ryujinx.Graphics.GAL/AntiAliasing.cs new file mode 100644 index 000000000..d4e5754d8 --- /dev/null +++ b/Ryujinx.Graphics.GAL/AntiAliasing.cs @@ -0,0 +1,12 @@ +namespace Ryujinx.Graphics.GAL +{ + public enum AntiAliasing + { + None, + Fxaa, + SmaaLow, + SmaaMedium, + SmaaHigh, + SmaaUltra + } +} diff --git a/Ryujinx.Graphics.GAL/IWindow.cs b/Ryujinx.Graphics.GAL/IWindow.cs index a9bbbc5e0..1221d685a 100644 --- a/Ryujinx.Graphics.GAL/IWindow.cs +++ b/Ryujinx.Graphics.GAL/IWindow.cs @@ -9,5 +9,9 @@ namespace Ryujinx.Graphics.GAL void SetSize(int width, int height); void ChangeVSyncMode(bool vsyncEnabled); + + void SetAntiAliasing(AntiAliasing antialiasing); + void SetScalingFilter(ScalingFilter type); + void SetScalingFilterLevel(float level); } } diff --git a/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs b/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs index c4b62a25d..a647d37eb 100644 --- a/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs +++ b/Ryujinx.Graphics.GAL/Multithreading/ThreadedWindow.cs @@ -32,5 +32,11 @@ namespace Ryujinx.Graphics.GAL.Multithreading } public void ChangeVSyncMode(bool vsyncEnabled) { } + + public void SetAntiAliasing(AntiAliasing effect) { } + + public void SetScalingFilter(ScalingFilter type) { } + + public void SetScalingFilterLevel(float level) { } } } diff --git a/Ryujinx.Graphics.GAL/UpscaleType.cs b/Ryujinx.Graphics.GAL/UpscaleType.cs new file mode 100644 index 000000000..442b65f24 --- /dev/null +++ b/Ryujinx.Graphics.GAL/UpscaleType.cs @@ -0,0 +1,9 @@ +namespace Ryujinx.Graphics.GAL +{ + public enum ScalingFilter + { + Bilinear, + Nearest, + Fsr + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs 
b/Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs new file mode 100644 index 000000000..16678bb7b --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/FsrScalingFilter.cs @@ -0,0 +1,177 @@ +using OpenTK.Graphics.OpenGL; +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.OpenGL.Image; +using System; +using static Ryujinx.Graphics.OpenGL.Effects.ShaderHelper; + +namespace Ryujinx.Graphics.OpenGL.Effects +{ + internal class FsrScalingFilter : IScalingFilter + { + private readonly OpenGLRenderer _renderer; + private int _inputUniform; + private int _outputUniform; + private int _sharpeningUniform; + private int _srcX0Uniform; + private int _srcX1Uniform; + private int _srcY0Uniform; + private int _scalingShaderProgram; + private int _sharpeningShaderProgram; + private float _scale = 1; + private int _srcY1Uniform; + private int _dstX0Uniform; + private int _dstX1Uniform; + private int _dstY0Uniform; + private int _dstY1Uniform; + private int _scaleXUniform; + private int _scaleYUniform; + private TextureStorage _intermediaryTexture; + + public float Level + { + get => _scale; + set + { + _scale = MathF.Max(0.01f, value); + } + } + + public FsrScalingFilter(OpenGLRenderer renderer, IPostProcessingEffect filter) + { + Initialize(); + + _renderer = renderer; + } + + public void Dispose() + { + if (_scalingShaderProgram != 0) + { + GL.DeleteProgram(_scalingShaderProgram); + GL.DeleteProgram(_sharpeningShaderProgram); + } + + _intermediaryTexture?.Dispose(); + } + + private void Initialize() + { + var scalingShader = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl"); + var sharpeningShader = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl"); + var fsrA = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h"); + var fsr1 = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h"); + + scalingShader = 
scalingShader.Replace("#include \"ffx_a.h\"", fsrA); + scalingShader = scalingShader.Replace("#include \"ffx_fsr1.h\"", fsr1); + sharpeningShader = sharpeningShader.Replace("#include \"ffx_a.h\"", fsrA); + sharpeningShader = sharpeningShader.Replace("#include \"ffx_fsr1.h\"", fsr1); + + _scalingShaderProgram = CompileProgram(scalingShader, ShaderType.ComputeShader); + _sharpeningShaderProgram = CompileProgram(sharpeningShader, ShaderType.ComputeShader); + + _inputUniform = GL.GetUniformLocation(_scalingShaderProgram, "Source"); + _outputUniform = GL.GetUniformLocation(_scalingShaderProgram, "imgOutput"); + _sharpeningUniform = GL.GetUniformLocation(_sharpeningShaderProgram, "sharpening"); + + _srcX0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcX0"); + _srcX1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcX1"); + _srcY0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcY0"); + _srcY1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "srcY1"); + _dstX0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstX0"); + _dstX1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstX1"); + _dstY0Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstY0"); + _dstY1Uniform = GL.GetUniformLocation(_scalingShaderProgram, "dstY1"); + _scaleXUniform = GL.GetUniformLocation(_scalingShaderProgram, "scaleX"); + _scaleYUniform = GL.GetUniformLocation(_scalingShaderProgram, "scaleY"); + } + + public void Run( + TextureView view, + TextureView destinationTexture, + int width, + int height, + Extents2D source, + Extents2D destination) + { + if (_intermediaryTexture == null || _intermediaryTexture.Info.Width != width || _intermediaryTexture.Info.Height != height) + { + _intermediaryTexture?.Dispose(); + var originalInfo = view.Info; + var info = new TextureCreateInfo(width, + height, + originalInfo.Depth, + originalInfo.Levels, + originalInfo.Samples, + originalInfo.BlockWidth, + originalInfo.BlockHeight, + originalInfo.BytesPerPixel, + 
originalInfo.Format, + originalInfo.DepthStencilMode, + originalInfo.Target, + originalInfo.SwizzleR, + originalInfo.SwizzleG, + originalInfo.SwizzleB, + originalInfo.SwizzleA); + + _intermediaryTexture = new TextureStorage(_renderer, info, view.ScaleFactor); + _intermediaryTexture.CreateDefaultView(); + } + + var textureView = _intermediaryTexture.CreateView(_intermediaryTexture.Info, 0, 0) as TextureView; + + int previousProgram = GL.GetInteger(GetPName.CurrentProgram); + int previousUnit = GL.GetInteger(GetPName.ActiveTexture); + GL.ActiveTexture(TextureUnit.Texture0); + int previousTextureBinding = GL.GetInteger(GetPName.TextureBinding2D); + + GL.BindImageTexture(0, textureView.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8); + + int threadGroupWorkRegionDim = 16; + int dispatchX = (width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + int dispatchY = (height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + + // Scaling pass + float srcWidth = Math.Abs(source.X2 - source.X1); + float srcHeight = Math.Abs(source.Y2 - source.Y1); + float scaleX = srcWidth / view.Width; + float scaleY = srcHeight / view.Height; + GL.UseProgram(_scalingShaderProgram); + view.Bind(0); + GL.Uniform1(_inputUniform, 0); + GL.Uniform1(_outputUniform, 0); + GL.Uniform1(_srcX0Uniform, (float)source.X1); + GL.Uniform1(_srcX1Uniform, (float)source.X2); + GL.Uniform1(_srcY0Uniform, (float)source.Y1); + GL.Uniform1(_srcY1Uniform, (float)source.Y2); + GL.Uniform1(_dstX0Uniform, (float)destination.X1); + GL.Uniform1(_dstX1Uniform, (float)destination.X2); + GL.Uniform1(_dstY0Uniform, (float)destination.Y1); + GL.Uniform1(_dstY1Uniform, (float)destination.Y2); + GL.Uniform1(_scaleXUniform, scaleX); + GL.Uniform1(_scaleYUniform, scaleY); + GL.DispatchCompute(dispatchX, dispatchY, 1); + + GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit); + + // Sharpening Pass + GL.UseProgram(_sharpeningShaderProgram); + GL.BindImageTexture(0, 
destinationTexture.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8); + textureView.Bind(0); + GL.Uniform1(_inputUniform, 0); + GL.Uniform1(_outputUniform, 0); + GL.Uniform1(_sharpeningUniform, 1.5f - (Level * 0.01f * 1.5f)); + GL.DispatchCompute(dispatchX, dispatchY, 1); + + GL.UseProgram(previousProgram); + GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit); + + (_renderer.Pipeline as Pipeline).RestoreImages1And2(); + + GL.ActiveTexture(TextureUnit.Texture0); + GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding); + + GL.ActiveTexture((TextureUnit)previousUnit); + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs b/Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs new file mode 100644 index 000000000..3a2d685b7 --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/FxaaPostProcessingEffect.cs @@ -0,0 +1,81 @@ +using OpenTK.Graphics.OpenGL; +using Ryujinx.Common; +using Ryujinx.Graphics.OpenGL.Image; + +namespace Ryujinx.Graphics.OpenGL.Effects +{ + internal class FxaaPostProcessingEffect : IPostProcessingEffect + { + private readonly OpenGLRenderer _renderer; + private int _resolutionUniform; + private int _inputUniform; + private int _outputUniform; + private int _shaderProgram; + private TextureStorage _textureStorage; + + public FxaaPostProcessingEffect(OpenGLRenderer renderer) + { + Initialize(); + + _renderer = renderer; + } + + public void Dispose() + { + if (_shaderProgram != 0) + { + GL.DeleteProgram(_shaderProgram); + _textureStorage?.Dispose(); + } + } + + private void Initialize() + { + _shaderProgram = ShaderHelper.CompileProgram(EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl"), ShaderType.ComputeShader); + + _resolutionUniform = GL.GetUniformLocation(_shaderProgram, "invResolution"); + _inputUniform = GL.GetUniformLocation(_shaderProgram, "inputTexture"); + _outputUniform = 
GL.GetUniformLocation(_shaderProgram, "imgOutput"); + } + + public TextureView Run(TextureView view, int width, int height) + { + if (_textureStorage == null || _textureStorage.Info.Width != view.Width || _textureStorage.Info.Height != view.Height) + { + _textureStorage?.Dispose(); + _textureStorage = new TextureStorage(_renderer, view.Info, view.ScaleFactor); + _textureStorage.CreateDefaultView(); + } + + var textureView = _textureStorage.CreateView(view.Info, 0, 0) as TextureView; + + int previousProgram = GL.GetInteger(GetPName.CurrentProgram); + int previousUnit = GL.GetInteger(GetPName.ActiveTexture); + GL.ActiveTexture(TextureUnit.Texture0); + int previousTextureBinding = GL.GetInteger(GetPName.TextureBinding2D); + + GL.BindImageTexture(0, textureView.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8); + GL.UseProgram(_shaderProgram); + + var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize); + var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize); + + view.Bind(0); + GL.Uniform1(_inputUniform, 0); + GL.Uniform1(_outputUniform, 0); + GL.Uniform2(_resolutionUniform, (float)view.Width, (float)view.Height); + GL.DispatchCompute(dispatchX, dispatchY, 1); + GL.UseProgram(previousProgram); + GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit); + + (_renderer.Pipeline as Pipeline).RestoreImages1And2(); + + GL.ActiveTexture(TextureUnit.Texture0); + GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding); + + GL.ActiveTexture((TextureUnit)previousUnit); + + return textureView; + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs b/Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs new file mode 100644 index 000000000..7a045a021 --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/IPostProcessingEffect.cs @@ -0,0 +1,11 @@ +using Ryujinx.Graphics.OpenGL.Image; +using System; + +namespace 
Ryujinx.Graphics.OpenGL.Effects +{ + internal interface IPostProcessingEffect : IDisposable + { + const int LocalGroupSize = 64; + TextureView Run(TextureView view, int width, int height); + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs b/Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs new file mode 100644 index 000000000..e1e1b2c1d --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/IScalingFilter.cs @@ -0,0 +1,18 @@ +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.OpenGL.Image; +using System; + +namespace Ryujinx.Graphics.OpenGL.Effects +{ + internal interface IScalingFilter : IDisposable + { + float Level { get; set; } + void Run( + TextureView view, + TextureView destinationTexture, + int width, + int height, + Extents2D source, + Extents2D destination); + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs b/Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs new file mode 100644 index 000000000..72c5a98f5 --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/ShaderHelper.cs @@ -0,0 +1,40 @@ +using OpenTK.Graphics.OpenGL; +using System; + +namespace Ryujinx.Graphics.OpenGL.Effects +{ + internal static class ShaderHelper + { + public static int CompileProgram(string shaderCode, ShaderType shaderType) + { + var shader = GL.CreateShader(shaderType); + GL.ShaderSource(shader, shaderCode); + GL.CompileShader(shader); + + var program = GL.CreateProgram(); + GL.AttachShader(program, shader); + GL.LinkProgram(program); + + GL.DetachShader(program, shader); + GL.DeleteShader(shader); + + return program; + } + + public static int CompileProgram(string[] shaders, ShaderType shaderType) + { + var shader = GL.CreateShader(shaderType); + GL.ShaderSource(shader, shaders.Length, shaders, (int[])null); + GL.CompileShader(shader); + + var program = GL.CreateProgram(); + GL.AttachShader(program, shader); + GL.LinkProgram(program); + + GL.DetachShader(program, shader); + 
GL.DeleteShader(shader); + + return program; + } + } +} diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h new file mode 100644 index 000000000..d04bff55c --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_a.h @@ -0,0 +1,2656 @@ +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). +// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. 
+// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not availabe in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). +// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<type><#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<type><#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. 
+// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// 
+//==============================================================================================================================
+#ifdef A_CPU
+ // Supporting user defined overrides.
+ // A_RESTRICT is the pointer qualifier used by the vector argument macros; it becomes '__restrict' unless it was already defined.
+ #ifndef A_RESTRICT
+  #define A_RESTRICT __restrict
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // A_STATIC prefixes the CPU functions below; it becomes 'static' unless it was already defined.
+ #ifndef A_STATIC
+  #define A_STATIC static
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same types across CPU and GPU.
+ // Predicate uses 32-bit integer (C friendly bool).
+ // NOTE(review): the fixed-width names (uint32_t, int64_t, ...) presumably come from <stdint.h> supplied by the includer;
+ // no #include is visible in this section - confirm.
+ typedef uint32_t AP1;
+ typedef float AF1;
+ typedef double AD1;
+ typedef uint8_t AB1;
+ typedef uint16_t AW1;
+ typedef uint32_t AU1;
+ typedef uint64_t AL1;
+ // Signed counterparts ("AS" prefix), used only where an operation needs signed semantics.
+ typedef int8_t ASB1;
+ typedef int16_t ASW1;
+ typedef int32_t ASU1;
+ typedef int64_t ASL1;
+//------------------------------------------------------------------------------------------------------------------------------
+ // Value casts to the scalar A-types (a C cast, so this is a numeric conversion, not a bit reinterpretation).
+ #define AD1_(a) ((AD1)(a))
+ #define AF1_(a) ((AF1)(a))
+ #define AL1_(a) ((AL1)(a))
+ #define AU1_(a) ((AU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Value casts to the signed 64-bit and 32-bit integer types.
+ #define ASL1_(a) ((ASL1)(a))
+ #define ASU1_(a) ((ASU1)(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Returns the raw bit pattern of a 32-bit float as an AU1, using a union to reinterpret the bits (no numeric conversion).
+ A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Integer truth values (the AP1 predicate type above is a 32-bit integer).
+ #define A_TRUE 1
+ #define A_FALSE 0
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Get CPU and GPU to share all setup code, without duplicate code paths. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). 
+//==============================================================================================================================
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ // On the CPU every vector is an array of its scalar type, so the ret/in/inout/out macros below all expand to the same
+ // thing: an A_RESTRICT-qualified pointer to the scalar type. The component count in the name is not encoded in the C type.
+ // ret*: function return type for a vector.
+ #define retAD2 AD1 *A_RESTRICT
+ #define retAD3 AD1 *A_RESTRICT
+ #define retAD4 AD1 *A_RESTRICT
+ #define retAF2 AF1 *A_RESTRICT
+ #define retAF3 AF1 *A_RESTRICT
+ #define retAF4 AF1 *A_RESTRICT
+ #define retAL2 AL1 *A_RESTRICT
+ #define retAL3 AL1 *A_RESTRICT
+ #define retAL4 AL1 *A_RESTRICT
+ #define retAU2 AU1 *A_RESTRICT
+ #define retAU3 AU1 *A_RESTRICT
+ #define retAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ // in*: vector input argument (not const-qualified in C; read-only by convention only).
+ #define inAD2 AD1 *A_RESTRICT
+ #define inAD3 AD1 *A_RESTRICT
+ #define inAD4 AD1 *A_RESTRICT
+ #define inAF2 AF1 *A_RESTRICT
+ #define inAF3 AF1 *A_RESTRICT
+ #define inAF4 AF1 *A_RESTRICT
+ #define inAL2 AL1 *A_RESTRICT
+ #define inAL3 AL1 *A_RESTRICT
+ #define inAL4 AL1 *A_RESTRICT
+ #define inAU2 AU1 *A_RESTRICT
+ #define inAU3 AU1 *A_RESTRICT
+ #define inAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ // inout*: vector argument that is both read and written.
+ #define inoutAD2 AD1 *A_RESTRICT
+ #define inoutAD3 AD1 *A_RESTRICT
+ #define inoutAD4 AD1 *A_RESTRICT
+ #define inoutAF2 AF1 *A_RESTRICT
+ #define inoutAF3 AF1 *A_RESTRICT
+ #define inoutAF4 AF1 *A_RESTRICT
+ #define inoutAL2 AL1 *A_RESTRICT
+ #define inoutAL3 AL1 *A_RESTRICT
+ #define inoutAL4 AL1 *A_RESTRICT
+ #define inoutAU2 AU1 *A_RESTRICT
+ #define inoutAU3 AU1 *A_RESTRICT
+ #define inoutAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ // out*: vector output argument.
+ #define outAD2 AD1 *A_RESTRICT
+ #define outAD3 AD1 *A_RESTRICT
+ #define outAD4 AD1 *A_RESTRICT
+ #define outAF2 AF1 *A_RESTRICT
+ #define outAF3 AF1 *A_RESTRICT
+ #define outAF4 AF1 *A_RESTRICT
+ #define outAL2 AL1 *A_RESTRICT
+ #define outAL3 AL1 *A_RESTRICT
+ #define outAL4 AL1 *A_RESTRICT
+ #define outAU2 AU1 *A_RESTRICT
+ #define outAU3 AU1 *A_RESTRICT
+ #define outAU4 AU1 *A_RESTRICT
+//------------------------------------------------------------------------------------------------------------------------------
+ // var*(x): declares a vector variable named x as a fixed-size array of 2, 3 or 4 scalars.
+ #define varAD2(x) AD1 x[2]
+ #define varAD3(x) AD1 x[3]
+ #define varAD4(x) AD1 x[4]
+ #define varAF2(x) AF1 x[2]
+ #define varAF3(x) AF1 x[3]
+ #define varAF4(x) AF1 x[4]
+ #define varAL2(x) AL1 x[2]
+ #define varAL3(x) AL1 x[3]
+ #define varAL4(x) AL1 x[4]
+ #define varAU2(x) AU1 x[2]
+ #define varAU3(x) AU1 x[3]
+ #define varAU4(x) AU1 x[4]
+//------------------------------------------------------------------------------------------------------------------------------
+ // init*(...): expands to a brace initializer list, so it is only usable where C accepts one (e.g. 'varAF2(v)=initAF2(a,b);').
+ #define initAD2(x,y) {x,y}
+ #define initAD3(x,y,z) {x,y,z}
+ #define initAD4(x,y,z,w) {x,y,z,w}
+ #define initAF2(x,y) {x,y}
+ #define initAF3(x,y,z) {x,y,z}
+ #define initAF4(x,y,z,w) {x,y,z,w}
+ #define initAL2(x,y) {x,y}
+ #define initAL3(x,y,z) {x,y,z}
+ #define initAL4(x,y,z,w) {x,y,z,w}
+ #define initAU2(x,y) {x,y}
+ #define initAU3(x,y,z) {x,y,z}
+ #define initAU4(x,y,z,w) {x,y,z,w}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(labs((long)ASL1_(a)));} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 
ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} 
+//------------------------------------------------------------------------------------------------------------------------------
+ // These follow the convention that A integer types don't have signage, until they are operated on.
+ // Signed maximum: operands are compared as signed values, but the original unsigned operand is returned.
+ A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;}
+ A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Minimum of two values. Returns b whenever 'a<b' is false, which includes the case where either float operand is NaN.
+ A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;}
+ A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;}
+ A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;}
+ A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed minimum: operands are compared as signed values, but the original unsigned operand is returned.
+ A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;}
+ A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal (1/a). No guard against a==0.
+ A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;}
+ A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Right shift performed on the signed conversion of 'a', result converted back to unsigned.
+ // NOTE(review): right-shifting a negative signed value is implementation-defined in C; this presumably relies on the
+ // compiler doing an arithmetic (sign-extending) shift - confirm for each supported toolchain.
+ A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));}
+ A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sine.
+ #ifdef A_GCC
+  A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);}
+ #else
+  A_STATIC AD1 ASinD1(AD1 a){return sin(a);}
+  A_STATIC AF1 ASinF1(AF1 a){return sinf(a);}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ // Square root.
+ #ifdef A_GCC
+  A_STATIC AD1 ASqrtD1(AD1 a){return __builtin_sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);}
+ #else
+  A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);}
+  A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ // Clamp x to [n,m], computed as max(n,min(x,m)); if n>m the result is n.
+ A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));}
+ A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Fractional part, computed as a-floor(a) (so the result is non-negative for negative finite inputs too).
+ A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);}
+ A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Power a^b, computed as exp2(b*log2(a)); because it goes through log2(a), it is only meaningful for a>0.
+ A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));}
+ A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal square root, computed as 1/sqrt(a).
+ A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));}
+ A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Saturate: clamp to [0,1], computed as min(1,max(0,a)).
+ A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));}
+ A_STATIC AF1 ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+// NOTE(review): destination and source pointers are all A_RESTRICT-qualified, so passing the same array as both 'd' and an
+// input (in-place use) looks like it would break the restrict contract - confirm callers never alias.
+//==============================================================================================================================
+ // opAAbs: d = |a| per component.
+ A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;}
+ A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;}
+ A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;}
+ A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;}
+ A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;}
+//==============================================================================================================================
+ // opAAdd: d = a + b per component.
+ A_STATIC retAD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
+ A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
+ A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;}
+ A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;}
+ A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;}
+//==============================================================================================================================
+ // opAAddOne: d = a + b, where the scalar b is added to every component.
+ A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
+ A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
+ A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;}
+ A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;}
+ A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;}
+//==============================================================================================================================
+ // opACpy: d = a (component-wise copy).
+ A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;}
+ A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
+ A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;}
+ A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;}
+ A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;}
+//==============================================================================================================================
+ // opALerp: d = lerp(a,b,c) per component, with a per-component interpolation factor c.
+ A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;}
+ A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;}
+ A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;}
+ A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;}
+ A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;}
+//==============================================================================================================================
+ // opALerpOne: d = lerp(a,b,c) per component, with one scalar interpolation factor c shared by all components.
+ A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;}
+ A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;}
+ A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;}
+ A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;}
+ A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;}
+//==============================================================================================================================
+ // opAMax: d = max(a,b) per component.
+ A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;}
+ A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;}
+ A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;}
+ A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;}
+ A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;}
+//==============================================================================================================================
+ // opAMin: d = min(a,b) per component; returns d.
+ A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;}
+ A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
+ A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;}
+ A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
+ A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
+//==============================================================================================================================
+ // opAMul: d = a * b per component (not a dot or cross product); returns d.
+ A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
+ A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
+ A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
+ A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
+ A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
+//==============================================================================================================================
+ // opAMulOne: d = a * b, where the scalar b multiplies every component; returns d.
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
+//==============================================================================================================================
+ // opANeg: d = -a per component; returns d.
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;}
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
+//==============================================================================================================================
+ // opARcp: d = 1/a per component; returns d.
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;}
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// HALF FLOAT PACKING
+//==============================================================================================================================
+ // Convert float to half (in lower 16-bits of output).
+ // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+ // Supports denormals.
+ // Conversion rules are to make computations possibly "safer" on the GPU,
+ // -INF & -NaN -> -65504
+ // +INF & +NaN -> +65504
+ //
+ // How it works: the top 9 bits of the float's bit pattern (sign + 8-bit exponent) index two 512-entry tables.
+ //  - base[i]  holds the half-float bits contributed by the sign and exponent.
+ //  - shift[i] is how far the 23-bit float mantissa is shifted right before it is added to base[i].
+ // The mantissa is only shifted, so dropped bits are truncated rather than rounded.
+ // Entries with shift 0x18 (24) discard the whole mantissa: exponents too small for a half give 0x0000/0x8000 (signed zero),
+ // and exponents too large (including INF/NaN) give 0x7bff/0xfbff, which is the +/-65504 clamp described above.
+ // The first 256 entries of each table cover positive inputs, the last 256 cover negative inputs.
+ A_STATIC AU1 AU1_AH1_AF1(AF1 f){
+  static AW1 base[512]={
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+   0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
+   0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
+   0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+   0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
+   0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
+   0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+   0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
+  static AB1 shift[512]={
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+   0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+   0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+   0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
+  union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Used to output packed constant.
+ // Packs two floats as halves into one 32-bit value: a[0] goes in the low 16 bits, a[1] in the high 16 bits.
+ A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                            GLSL
+//
+// GPU back end for GLSL, compiled only when both A_GLSL and A_GPU are defined: optional extension enables come first,
+// followed by the A* scalar/vector type aliases that the helpers after this point are written against.
+//==============================================================================================================================
+#if defined(A_GLSL) && defined(A_GPU)
+ #ifndef A_SKIP_EXT // Defining A_SKIP_EXT suppresses every #extension directive below.
+  #ifdef A_HALF // Extensions required when A_HALF is defined. NOTE(review): no enable in this list is keyed on A_BYTE.
+   #extension GL_EXT_shader_16bit_storage:require
+   #extension GL_EXT_shader_explicit_arithmetic_types:require
+  #endif
+//---- Extensions required when A_LONG is defined. NOTE(review): one of them is a vendor (NV) extension - confirm intended. ----
+  #ifdef A_LONG
+   #extension GL_ARB_gpu_shader_int64:require
+   #extension GL_NV_shader_atomic_int64:require
+  #endif
+//---- Subgroup extensions required when A_WAVE is defined ----------------------------------------------------------------------
+  #ifdef A_WAVE
+   #extension GL_KHR_shader_subgroup_arithmetic:require
+   #extension GL_KHR_shader_subgroup_ballot:require
+   #extension GL_KHR_shader_subgroup_quad:require
+   #extension GL_KHR_shader_subgroup_shuffle:require
+  #endif
+ #endif
+//==== Type aliases: the prefix selects the element type, the trailing digit is the component count. AP = bool. ================
+ #define AP1 bool
+ #define AP2 bvec2
+ #define AP3 bvec3
+ #define AP4 bvec4
+//---- AF = float ---------------------------------------------------------------------------------------------------------------
+ #define AF1 float
+ #define AF2 vec2
+ #define AF3 vec3
+ #define AF4 vec4
+//---- AU = uint ----------------------------------------------------------------------------------------------------------------
+ #define AU1 uint
+ #define AU2 uvec2
+ #define AU3 uvec3
+ #define AU4 uvec4
+//---- ASU = int (the signed counterpart of AU) ---------------------------------------------------------------------------------
+ #define ASU1 int
+ #define ASU2 ivec2
+ #define ASU3 ivec3
+ #define ASU4 ivec4
+//==== Bit reinterpretation, uint bits -> float. The argument is converted to the AU type before the cast. =====================
+ #define AF1_AU1(x) uintBitsToFloat(AU1(x))
+ #define AF2_AU2(x) uintBitsToFloat(AU2(x))
+ #define AF3_AU3(x) uintBitsToFloat(AU3(x))
+ #define AF4_AU4(x) uintBitsToFloat(AU4(x))
+//---- Bit reinterpretation, float bits -> uint. The argument is converted to the AF type before the cast. ---------------------
+ #define AU1_AF1(x) floatBitsToUint(AF1(x))
+ #define AU2_AF2(x) floatBitsToUint(AF2(x))
+ #define AU3_AF3(x) floatBitsToUint(AF3(x))
+ #define AU4_AF4(x) floatBitsToUint(AF4(x))
+//---- Float -> half bits held in a uint: packHalf2x16 with 0.0 supplied as the second component ------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));}
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) // Macro front end: converts the argument to AF1, then calls the _x function.
+//---- Pack built-ins exposed under this header's <result>_<source> names ------------------------------------------------------
+ #define AU1_AH2_AF2 packHalf2x16
+ #define AU1_AW2Unorm_AF2 packUnorm2x16
+ #define AU1_AB4Unorm_AF4 packUnorm4x8
+//---- The matching unpack built-ins --------------------------------------------------------------------------------------------
+ #define AF2_AH2_AU1 unpackHalf2x16
+ #define AF2_AW2Unorm_AU1 unpackUnorm2x16
+ #define AF4_AB4Unorm_AU1 unpackUnorm4x8
+//==== Broadcast constructors: AFn_(a) converts 'a' to AF1 and copies it into every component of an AFn ========================
+ AF1 AF1_x(AF1 a){return AF1(a);}
+ AF2 AF2_x(AF1 a){return AF2(a,a);}
+ AF3 AF3_x(AF1 a){return AF3(a,a,a);}
+ AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//---- Same broadcast pattern for the unsigned integer types --------------------------------------------------------------------
+ AU1 AU1_x(AU1 a){return AU1(a);}
+ AU2 AU2_x(AU1 a){return AU2(a,a);}
+ AU3 AU3_x(AU1 a){return AU3(a,a,a);}
+ AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==== Signed absolute value on unsigned storage: convert to ASU, abs(), convert back to AU ====================================
+ AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
+ AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+ AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+ AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//---- Bitfield extract (ABfe) and insert (ABfi takes an explicit mask: bits of 'ins' where mask is 1, bits of 'src' elsewhere) -
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));}
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
+ // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MED3_F32. Implemented as clamp(x,n,m), i.e. 'n' is the lower bound and 'm' the upper bound.
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F32 (note DX frac() is different).
+ AF1 AFractF1(AF1 x){return fract(x);}
+ AF2 AFractF2(AF2 x){return fract(x);}
+ AF3 AFractF3(AF3 x){return fract(x);}
+ AF4 AFractF4(AF4 x){return fract(x);}
+//---- Linear interpolation: forwards to mix(x,y,a) -----------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);}
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MAX3_F32. Maximum of three values, built from two nested two-operand max() calls.
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//---- Signed three-way max on unsigned storage: operands are converted to ASU, compared, and the result converted back to AU --
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//---- Unsigned three-way max ---------------------------------------------------------------------------------------------------
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//---- Signed two-operand max on unsigned storage (same ASU round trip as AMax3SU) ----------------------------------------------
+ AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
+ AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+ AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+ AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp has an easier pattern match for med3 when some ordering is known.
+ // V_MED3_F32. Median of three, with no ordering assumed: max(min(x,y),min(max(x,y),z)).
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MIN3_F32. Minimum of three values, built from two nested two-operand min() calls.
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//---- Signed three-way min on unsigned storage: operands are converted to ASU, compared, and the result converted back to AU --
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//---- Unsigned three-way min ---------------------------------------------------------------------------------------------------
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//---- Signed two-operand min on unsigned storage (same ASU round trip as AMin3SU) ----------------------------------------------
+ AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
+ AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+ AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+ AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_COS_F32. Emulated here by scaling the input by A_2PI (defined outside this section) and calling cos().
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
+ AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_SIN_F32. Emulated here by scaling the input by A_2PI (defined outside this section) and calling sin().
+ AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
+ AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//---- Reciprocal, written as a plain division 1.0/x ----------------------------------------------------------------------------
+ AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;}
+ AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
+ AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
+ AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
+//---- Reciprocal square root, written as 1.0/sqrt(x) ---------------------------------------------------------------------------
+ AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);}
+ AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
+ AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
+ AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
+//---- Saturate: clamp every component to the range 0.0 .. 1.0 ------------------------------------------------------------------
+ AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));}
+ AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
+ AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
+ AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
+//---- Right shift carried out on the signed (ASU) conversion of both operands, result converted back to AU --------------------
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
+ AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+ AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+ AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                 GLSL BYTE - 8-bit aliases and broadcast constructors, present only when A_BYTE is defined
+//==============================================================================================================================
+ #ifdef A_BYTE
+  #define AB1 uint8_t
+  #define AB2 u8vec2
+  #define AB3 u8vec3
+  #define AB4 u8vec4
+//---- ASB = signed 8-bit counterpart of AB -------------------------------------------------------------------------------------
+  #define ASB1 int8_t
+  #define ASB2 i8vec2
+  #define ASB3 i8vec3
+  #define ASB4 i8vec4
+//---- Broadcast constructors: ABn_(a) converts 'a' to AB1 and copies it into every component ----------------------------------
+  AB1 AB1_x(AB1 a){return AB1(a);}
+  AB2 AB2_x(AB1 a){return AB2(a,a);}
+  AB3 AB3_x(AB1 a){return AB3(a,a,a);}
+  AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
+  #define AB1_(a) AB1_x(AB1(a))
+  #define AB2_(a) AB2_x(AB1(a))
+  #define AB3_(a) AB3_x(AB1(a))
+  #define AB4_(a) AB4_x(AB1(a))
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//       GLSL HALF - 16-bit float (AH), unsigned (AW) and signed (ASW) types and helpers, present only when A_HALF is defined
+//==============================================================================================================================
+ #ifdef A_HALF
+  #define AH1 float16_t
+  #define AH2 f16vec2
+  #define AH3 f16vec3
+  #define AH4 f16vec4
+//---- AW = unsigned 16-bit integer ---------------------------------------------------------------------------------------------
+  #define AW1 uint16_t
+  #define AW2 u16vec2
+  #define AW3 u16vec3
+  #define AW4 u16vec4
+//---- ASW = signed 16-bit integer (the signed counterpart of AW) ---------------------------------------------------------------
+  #define ASW1 int16_t
+  #define ASW2 i16vec2
+  #define ASW3 i16vec3
+  #define ASW4 i16vec4
+//==== Unpack 32-bit words into 16-bit vectors: one AU1 yields two lanes, one AU2 yields four ==================================
+  #define AH2_AU1(x) unpackFloat2x16(AU1(x))
+  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) unpackUint2x16(AU1(x))
+  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
+//---- Pack 16-bit vectors back into 32-bit words (the inverse of the group above) ----------------------------------------------
+  #define AU1_AH2(x) packFloat2x16(AH2(x))
+  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) packUint2x16(AW2(x))
+  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
+//==== Bit reinterpretation, half bits -> 16-bit unsigned integer ===============================================================
+  #define AW1_AH1(x) halfBitsToUint16(AH1(x))
+  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
+  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
+  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
+//---- Bit reinterpretation, 16-bit unsigned integer bits -> half ---------------------------------------------------------------
+  #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
+  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
+  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
+  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
+//==== Broadcast constructors: AHn_(a) converts 'a' to AH1 and copies it into every component ==================================
+  AH1 AH1_x(AH1 a){return AH1(a);}
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//---- Same broadcast pattern for the unsigned 16-bit integer types -------------------------------------------------------------
+  AW1 AW1_x(AW1 a){return AW1(a);}
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==== Signed absolute value on unsigned 16-bit storage: convert to ASW, abs(), convert back to AW =============================
+  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
+  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
+  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
+  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
+//---- Clamp: forwards to clamp(x,n,m), so 'n' is the lower bound and 'm' the upper bound --------------------------------------
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Fractional part of each component; forwards to fract().
+  AH1 AFractH1(AH1 x){return fract(x);}
+  AH2 AFractH2(AH2 x){return fract(x);}
+  AH3 AFractH3(AH3 x){return fract(x);}
+  AH4 AFractH4(AH4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear interpolation; forwards to mix(x,y,a).
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of max3.
+  // Maximum of three values, built from two nested two-operand max() calls.
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed maximum on unsigned 16-bit storage: convert both operands to ASW (signed 16-bit), compare, convert back to AW.
+  // The conversion has to stay 16 bits wide. Going through the 32-bit ASU types zero-extends the operands, so values with
+  // the top bit set (negative when read as signed) would compare as large positives, e.g. max(0xFFFF,1) would give 0xFFFF.
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of min3.
+  // Minimum of three values, built from two nested two-operand min() calls.
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed minimum on unsigned 16-bit storage: convert both operands to ASW (signed 16-bit), compare, convert back to AW.
+  // The conversion has to stay 16 bits wide. Going through the 32-bit ASU types zero-extends the operands, so values with
+  // the top bit set (negative when read as signed) would compare as large positives, e.g. min(0xFFFF,1) would give 1.
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal, written as a plain division 1.0/x.
+  AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
+  AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
+  AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
+  AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root, written as 1.0/sqrt(x).
+  AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
+  AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
+  AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
+  AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Saturate: clamp every component to the range 0.0 .. 1.0.
+  AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
+  AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
+  AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
+  AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Right shift carried out on the signed (ASW) conversion of both operands, result converted back to AW.
+  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
+  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
+  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
+  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                        GLSL DOUBLE
+//==============================================================================================================================
+ // Double-precision (AD) aliases and helpers, present only when A_DUBL is defined.
+ #ifdef A_DUBL
+  #define AD1 double
+  #define AD2 dvec2
+  #define AD3 dvec3
+  #define AD4 dvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Broadcast constructors: ADn_(a) converts 'a' to AD1 and copies it into every component.
+  AD1 AD1_x(AD1 a){return AD1(a);}
+  AD2 AD2_x(AD1 a){return AD2(a,a);}
+  AD3 AD3_x(AD1 a){return AD3(a,a,a);}
+  AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  // Fractional part of each component; forwards to fract().
+  AD1 AFractD1(AD1 x){return fract(x);}
+  AD2 AFractD2(AD2 x){return fract(x);}
+  AD3 AFractD3(AD3 x){return fract(x);}
+  AD4 AFractD4(AD4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear interpolation; forwards to mix(x,y,a).
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);}
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal, written as a plain division 1.0/x.
+  AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;}
+  AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
+  AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
+  AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root, written as 1.0/sqrt(x).
+  AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);}
+  AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
+  AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
+  AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Saturate: clamp every component to the range 0.0 .. 1.0.
+  AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));}
+  AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
+  AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
+  AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL LONG
+//==============================================================================================================================
+ // 64-bit unsigned (AL) and signed (ASL) integer aliases and helpers, present only when A_LONG is defined.
+ #ifdef A_LONG
+  #define AL1 uint64_t
+  #define AL2 u64vec2
+  #define AL3 u64vec3
+  #define AL4 u64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASL1 int64_t
+  #define ASL2 i64vec2
+  #define ASL3 i64vec3
+  #define ASL4 i64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Conversions between one 64-bit value and a pair of 32-bit words.
+  #define AL1_AU2(x) packUint2x32(AU2(x))
+  #define AU2_AL1(x) unpackUint2x32(AL1(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Broadcast constructors: ALn_(a) converts 'a' to AL1 and copies it into every component.
+  AL1 AL1_x(AL1 a){return AL1(a);}
+  AL2 AL2_x(AL1 a){return AL2(a,a);}
+  AL3 AL3_x(AL1 a){return AL3(a,a,a);}
+  AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
+  #define AL1_(a) AL1_x(AL1(a))
+  #define AL2_(a) AL2_x(AL1(a))
+  #define AL3_(a) AL3_x(AL1(a))
+  #define AL4_(a) AL4_x(AL1(a))
+//==============================================================================================================================
+  // Signed absolute value on unsigned 64-bit storage: convert to ASL, abs(), convert back to AL.
+  AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));}
+  AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
+  AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
+  AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed maximum on unsigned 64-bit storage: convert both operands to ASL (signed 64-bit), compare, convert back to AL.
+  // The conversion has to stay 64 bits wide; going through the 32-bit ASU types would discard the upper 32 bits of each
+  // operand before the comparison.
+  AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));}
+  AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
+  AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
+  AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed minimum on unsigned 64-bit storage; same full-width ASL round trip as AMaxSL.
+  AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));}
+  AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
+  AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
+  AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                     WAVE OPERATIONS - subgroup shuffle wrappers, present only when A_WAVE is defined
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Each wrapper forwards to subgroupShuffleXor(v,x). Where 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
+//---- 16-bit vectors are packed into 32-bit words (AU1/AU2), shuffled in that form, then unpacked again ----------------------
+  #ifdef A_HALF
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));}
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
+  #endif // A_HALF
+ #endif // A_WAVE
+//==============================================================================================================================
+#endif // defined(A_GLSL) && defined(A_GPU)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +#if defined(A_HLSL) && defined(A_GPU) + #ifdef A_HLSL_6_2 + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float32_t + #define AF2 float32_t2 + #define AF3 float32_t3 + #define AF4 float32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint32_t + #define AU2 uint32_t2 + #define AU3 uint32_t3 + #define AU4 uint32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int32_t + #define ASU2 int32_t2 + #define ASU3 int32_t3 + #define ASU4 int32_t4 + #else + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 
uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 + #endif +//============================================================================================================================== + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) 
+//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));} + AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));} + AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));} + AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFractF1(AF1 x){return x-floor(x);} + AF2 AFractF2(AF2 x){return x-floor(x);} + AF3 AFractF3(AF3 x){return x-floor(x);} + AF4 AFractF4(AF4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);} + AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return 
lerp(x,y,a);} + AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);} + AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));} + AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));} + AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));} + AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));} + AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));} + AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));} + AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));} + AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));} + AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));} + AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));} + AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));} + AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));} + AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));} + AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));} + AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return 
max(min(x,y),min(max(x,y),z));} + AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));} + AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));} + AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));} + AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));} + AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));} + AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));} + AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));} + AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));} + AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));} + AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));} + AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));} + AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));} + AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));} + AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));} + AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));} + AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));} + AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));} + AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));} + AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ARcpF1(AF1 x){return rcp(x);} + AF2 ARcpF2(AF2 x){return rcp(x);} + AF3 ARcpF3(AF3 x){return rcp(x);} + AF4 ARcpF4(AF4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ARsqF1(AF1 x){return rsqrt(x);} + AF2 ARsqF2(AF2 x){return rsqrt(x);} + AF3 ARsqF3(AF3 x){return rsqrt(x);} + AF4 ARsqF4(AF4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ASatF1(AF1 x){return saturate(x);} + AF2 ASatF2(AF2 x){return saturate(x);} + AF3 ASatF3(AF3 x){return saturate(x);} + AF4 ASatF4(AF4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define AH1 float16_t + #define AH2 float16_t2 + #define AH3 float16_t3 + #define AH4 float16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 uint16_t2 + #define AW3 uint16_t3 + #define AW4 uint16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 int16_t2 + #define ASW3 int16_t3 + #define ASW4 int16_t4 + #else + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 
+//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 + #endif +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AW1_AH1(x) asuint16(x) + #define AW2_AH2(x) asuint16(x) + #define AW3_AH3(x) asuint16(x) + #define AW4_AH4(x) asuint16(x) + #else + #define AW1_AH1(a) AW1(f32tof16(AF1(a))) + #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y)) + #define AW3_AH3(a) 
AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z)) + #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w)) + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AH1_AW1(x) asfloat16(x) + #define AH2_AW2(x) asfloat16(x) + #define AH3_AW3(x) asfloat16(x) + #define AH4_AW4(x) asfloat16(x) + #else + #define AH1_AW1(a) AH1(f16tof32(AU1(a))) + #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y)) + #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z)) + #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w)) + #endif +//============================================================================================================================== + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} 
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp 'x' to the range [n,m], written as max(n,min(x,m)).
+ AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));}
+ AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
+ AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
+ AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F16 (note DX frac() is different).
+ // Fractional part computed explicitly as x-floor(x).
+ AH1 AFractH1(AH1 x){return x-floor(x);}
+ AH2 AFractH2(AH2 x){return x-floor(x);}
+ AH3 AFractH3(AH3 x){return x-floor(x);}
+ AH4 AFractH4(AH4 x){return x-floor(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear interpolation, forwarded to the lerp() intrinsic.
+ AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);}
+ AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
+ AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
+ AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Maximum of three values.
+ AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
+ AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+ AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+ AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed max on AW* storage. The operands are converted with the 16-bit signed ASW* types (as AAbsSW*/AShrSW* do);
+ // converting with the 32-bit ASU* types is a value conversion, so an operand with the 16-bit sign bit set would
+ // compare as a large positive number instead of a negative one.
+ AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
+ AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+ AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+ AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Minimum of three values.
+ AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
+ AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+ AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+ AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed min on AW* storage, converted through ASW* for the same reason as the signed max.
+ AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
+ AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+ AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+ AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal, forwarded to the rcp() intrinsic.
+ AH1 ARcpH1(AH1 x){return rcp(x);}
+ AH2 ARcpH2(AH2 x){return rcp(x);}
+ AH3 ARcpH3(AH3 x){return rcp(x);}
+ AH4 ARcpH4(AH4 x){return rcp(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal square root, forwarded to the rsqrt() intrinsic.
+ AH1 ARsqH1(AH1 x){return rsqrt(x);}
+ AH2 ARsqH2(AH2 x){return rsqrt(x);}
+ AH3 ARsqH3(AH3 x){return rsqrt(x);}
+ AH4 ARsqH4(AH4 x){return rsqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp to [0,1], forwarded to the saturate() intrinsic.
+ AH1 ASatH1(AH1 x){return saturate(x);}
+ AH2 ASatH2(AH2 x){return saturate(x);}
+ AH3 ASatH3(AH3 x){return saturate(x);}
+ AH4 ASatH4(AH4 x){return saturate(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Right shift carried out on the signed ASW* types, result converted back to AW*.
+ AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
+ AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
+ AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
+ AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #ifdef A_HLSL_6_2 + #define AD1 float64_t + #define AD2 float64_t2 + #define AD3 float64_t3 + #define AD4 float64_t4 + #else + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 + #endif +//------------------------------------------------------------------------------------------------------------------------------ + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + 
AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. 
+ // Each AWaveXor* returns the copy of 'v' held by the lane whose index is (WaveGetLaneIndex()^x).
+ AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AU2 AWaveXorU2(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AU3 AWaveXorU3(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ AU4 AWaveXorU4(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+ // Compatibility overloads: the vector forms were previously declared under the name AWaveXorU1, so keep that
+ // spelling working for existing callers while AWaveXorU2/U3/U4 provide the per-width names.
+ AU2 AWaveXorU1(AU2 v,AU1 x){return AWaveXorU2(v,x);}
+ AU3 AWaveXorU1(AU3 v,AU1 x){return AWaveXorU3(v,x);}
+ AU4 AWaveXorU1(AU4 v,AU1 x){return AWaveXorU4(v,x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+ // 16-bit vectors are converted to 32-bit words (AU1_AH2, AU2_AH4, AU1_AW2, AU2_AW4), exchanged in that form,
+ // then converted back. A four-component 16-bit vector occupies two 32-bit words, hence the AU2 round trip for *4.
+ AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
+ AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
+ AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
+ AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
+ #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU COMMON +// +// +//============================================================================================================================== +#ifdef A_GPU + // Negative and positive infinity. + #define A_INFP_F AF1_AU1(0x7f800000u) + #define A_INFN_F AF1_AU1(0xff800000u) +//------------------------------------------------------------------------------------------------------------------------------ + // Copy sign from 's' to positive 'd'. + AF1 ACpySgnF1(AF1 d,AF1 s){return AF1_AU1(AU1_AF1(d)|(AU1_AF1(s)&AU1_(0x80000000u)));} + AF2 ACpySgnF2(AF2 d,AF2 s){return AF2_AU2(AU2_AF2(d)|(AU2_AF2(s)&AU2_(0x80000000u)));} + AF3 ACpySgnF3(AF3 d,AF3 s){return AF3_AU3(AU3_AF3(d)|(AU3_AF3(s)&AU3_(0x80000000u)));} + AF4 ACpySgnF4(AF4 d,AF4 s){return AF4_AU4(AU4_AF4(d)|(AU4_AF4(s)&AU4_(0x80000000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Single operation to return (useful to create a mask to use in lerp for branch free logic), + // m=NaN := 0 + // m>=0 := 0 + // m<0 := 1 + // Uses the following useful floating point logic, + // saturate(+a*(-INF)==-INF) := 0 + // saturate( 0*(-INF)== NaN) := 0 + // saturate(-a*(-INF)==+INF) := 1 + AF1 ASignedF1(AF1 m){return ASatF1(m*AF1_(A_INFN_F));} + AF2 ASignedF2(AF2 m){return ASatF2(m*AF2_(A_INFN_F));} + AF3 ASignedF3(AF3 m){return ASatF3(m*AF3_(A_INFN_F));} + AF4 ASignedF4(AF4 m){return ASatF4(m*AF4_(A_INFN_F));} +//------------------------------------------------------------------------------------------------------------------------------ 
+ AF1 AGtZeroF1(AF1 m){return ASatF1(m*AF1_(A_INFP_F));} + AF2 AGtZeroF2(AF2 m){return ASatF2(m*AF2_(A_INFP_F));} + AF3 AGtZeroF3(AF3 m){return ASatF3(m*AF3_(A_INFP_F));} + AF4 AGtZeroF4(AF4 m){return ASatF4(m*AF4_(A_INFP_F));} +//============================================================================================================================== + #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define A_INFP_H AH1_AW1((uint16_t)0x7c00u) + #define A_INFN_H AH1_AW1((uint16_t)0xfc00u) + #else + #define A_INFP_H AH1_AW1(0x7c00u) + #define A_INFN_H AH1_AW1(0xfc00u) + #endif + +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ACpySgnH1(AH1 d,AH1 s){return AH1_AW1(AW1_AH1(d)|(AW1_AH1(s)&AW1_(0x8000u)));} + AH2 ACpySgnH2(AH2 d,AH2 s){return AH2_AW2(AW2_AH2(d)|(AW2_AH2(s)&AW2_(0x8000u)));} + AH3 ACpySgnH3(AH3 d,AH3 s){return AH3_AW3(AW3_AH3(d)|(AW3_AH3(s)&AW3_(0x8000u)));} + AH4 ACpySgnH4(AH4 d,AH4 s){return AH4_AW4(AW4_AH4(d)|(AW4_AH4(s)&AW4_(0x8000u)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASignedH1(AH1 m){return ASatH1(m*AH1_(A_INFN_H));} + AH2 ASignedH2(AH2 m){return ASatH2(m*AH2_(A_INFN_H));} + AH3 ASignedH3(AH3 m){return ASatH3(m*AH3_(A_INFN_H));} + AH4 ASignedH4(AH4 m){return ASatH4(m*AH4_(A_INFN_H));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AGtZeroH1(AH1 m){return ASatH1(m*AH1_(A_INFP_H));} + AH2 AGtZeroH2(AH2 m){return ASatH2(m*AH2_(A_INFP_H));} + AH3 AGtZeroH3(AH3 m){return ASatH3(m*AH3_(A_INFP_H));} + AH4 AGtZeroH4(AH4 m){return ASatH4(m*AH4_(A_INFP_H));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [FIS] FLOAT INTEGER SORTABLE
+//------------------------------------------------------------------------------------------------------------------------------
+// Float to integer sortable.
+// - If sign bit=0, flip the sign bit (positives).
+// - If sign bit=1, flip all bits (negatives).
+// Integer sortable to float.
+// - If sign bit=1, flip the sign bit (positives).
+// - If sign bit=0, flip all bits (negatives).
+// Has nice side effects.
+// - Larger integers are more positive values.
+// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+// Burns 3 ops for conversion {shift,or,xor}.
+//==============================================================================================================================
+ // The XOR mask is (x shifted right by 31 through AShrSU1) OR'd with the sign bit; AFisFromU1 complements the shifted term first.
+ // Assumes AShrSU1 is a sign-propagating shift, as its HLSL definition through ASU1 suggests - TODO confirm for other targets.
+ AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
+ AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
+ // NOTE(review): a shift by 15 leaves bits 30..15 of 'x' in the low 16 bits of the shifted term, so the XOR also changes
+ // the low half of the word - confirm that callers do not rely on the low 16 bits being preserved.
+ AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
+ AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+ // 16-bit forms of the same mapping: shift by 15 through AShrSW1/AShrSW2 and use 0x8000 as the sign bit.
+ AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
+ AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
+ AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [PERM] V_PERM_B32
+//------------------------------------------------------------------------------------------------------------------------------
+// Support for V_PERM_B32 started in the 3rd generation of GCN.
+//------------------------------------------------------------------------------------------------------------------------------
+// yyyyxxxx - The 'i' input.
+// 76543210
+// ========
+// HGFEDCBA - Naming on permutation.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+// - Make sure compiler optimizes this.
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Each APermWXYZ(i) builds one 32-bit word whose bytes, from most to least significant, are the source bytes W,X,Y,Z.
+  // Letters A..D are bytes 0..3 of 'i.x', letters E..H are bytes 0..3 of 'i.y', and '0' is a zero byte.
+  // Zero-filled forms: the selected bytes land in the low byte of each 16-bit half.
+  AU1 APerm0E0A(AU2 i){return((i.x )&0xffu)|((i.y<<16)&0xff0000u);}
+  AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
+  AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y )&0xff0000u);}
+  AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Single byte insert forms: one byte (A or C) taken from 'i.x' replaces one byte of 'i.y', the other three bytes of 'i.y'
+  // are kept in place.
+  AU1 APermHGFA(AU2 i){return((i.x )&0x000000ffu)|(i.y&0xffffff00u);}
+  AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
+  AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
+  AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
+  AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
+  AU1 APermHCFE(AU2 i){return((i.x )&0x00ff0000u)|(i.y&0xff00ffffu);}
+  AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
+  AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Interleave forms: combine bytes 0 and 2 of 'i.x' with bytes 0 and 2 of 'i.y'.
+  AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
+  AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [BUC] BYTE UNSIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
+// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPCODE NOTES
+// ============
+// GCN does not do UNORM or SNORM for bytes in opcodes.
+// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
+// - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
+// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
+// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
+// ==== =====
+// 0 : 0
+// 1 : 1
+// ...
+// 255 : 255
+// : 256 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ==== =====
+// 0 : 0
+// 1 : 1/512
+// 2 : 1/256
+// ...
+// 64 : 1/8
+// 128 : 1/4
+// 255 : 255/512
+// : 1/2 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
+// ============================================
+// r=ABuc0FromU1(i)
+// V_CVT_F32_UBYTE0 r,i
+// --------------------------------------------
+// r=ABuc0ToU1(d,i)
+// V_CVT_PKACCUM_U8_F32 r,i,0,d
+// --------------------------------------------
+// d=ABuc0FromU2(i)
+// Where 'k0' is an SGPR with 0x0E0A
+// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
+// V_PERM_B32 d,i.x,i.y,k0
+// V_PK_FMA_F16 d,d,k1.x,0
+// --------------------------------------------
+// r=ABuc0ToU2(d,i)
+// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
+// Where 'k1' is an SGPR with 0x????
+// Where 'k2' is an SGPR with 0x????
+// V_PK_FMA_F16 i,i,k0.x,0
+// V_PERM_B32 r.x,i,i,k1
+// V_PERM_B32 r.y,i,i,k2
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ #define A_BUC_32 (255.0)
+ #define A_BUC_16 (255.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Designed to be one V_CVT_PKACCUM_U8_F32.
+  // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
+  // ABucNToU1(d,i) converts float 'i' to an unsigned integer, limits it to at most 255 and writes it into byte N of 'd'.
+  // The other three bytes of 'd' are returned unchanged.
+  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u) )&(0x000000ffu));}
+  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
+  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed to be one V_CVT_F32_UBYTE*.
+  // ABucNFromU1(i) returns byte N of 'i' converted to float (0.0 to 255.0).
+  AF1 ABuc0FromU1(AU1 i){return AF1((i )&255u);}
+  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
+  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
+  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Both inputs are scaled by 1.0/32768.0 first (the 16-bit denormal trick named in the section header), then APermGCEA
+  // interleaves the low byte of every 16-bit lane: bytes of the result are {x0,y0,x1,y1} from least to most significant.
+  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 3 ops to do SOA to AOS and conversion.
+  // ABucNToU2(d,i) converts the two halfs in 'i' and writes them into byte N of 'd.x' (from i.x) and byte N of 'd.y'
+  // (from i.y). The other three bytes of each destination word are kept.
+  // 'b' holds the raw bits of 'i' after scaling by 1.0/32768.0 (the 16-bit denormal trick named in the section header),
+  // so the converted bytes sit in the low byte of each 16-bit half of 'b'.
+  // The APerm* helpers take bytes A..D from '.x' and bytes E..H from '.y' of their argument, so the converted bytes 'b'
+  // must be passed as '.x' and the destination word as '.y'. Passing them the other way round keeps one byte of 'd' and
+  // three bytes of 'b' instead of inserting one byte of 'b' into 'd'.
+  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 2 ops to do both AOS to SOA, and conversion.
+  // ABucNFromU2(i) gathers byte N of 'i.x' and byte N of 'i.y' into the low byte of each 16-bit lane, reinterprets the
+  // lanes as halfs and scales by 32768.0, giving values in {0 to A_BUC_16}.
+  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
+  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
+  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
+  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [BSC] BYTE SIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Similar to [BUC].
+// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// ENCODING (without zero-based encoding)
+// ========
+// 0 = unused (can be used to mean something else)
+// 1 = lowest value
+// 128 = exact zero center (zero based encoding)
+// 255 = highest value
+//------------------------------------------------------------------------------------------------------------------------------
+// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
+// This is useful if there is a desire for cleared values to decode as zero.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ==== =====
+// 0 : -128/512 (unused)
+// 1 : -127/512
+// 2 : -126/512
+// ...
+// 128 : 0
+// ...
+// 255 : 127/512
+// : 1/4 (just outside the encoding range)
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ #define A_BSC_32 (127.0)
+ #define A_BSC_16 (127.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // ABscNToU1(d,i) biases float 'i' by +128.0, converts to an unsigned integer, limits it to at most 255 and writes it into
+  // byte N of 'd'; the other three bytes of 'd' are returned unchanged.
+  // Only the upper bound is clamped here; inputs are expected to stay inside {-/+ A_BSC_32}.
+  AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u) )&(0x000000ffu));}
+  AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
+  AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: same insert as above, then the MSB of the freshly written byte is flipped by the final XOR.
+  // NOTE(review): these apply trunc() to 'i' before the +128.0 bias while the non-Zb versions do not, so the two families
+  // appear to round negative fractional inputs differently -- confirm this is intended.
+  AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u) )&(0x000000ffu)))^0x00000080u;}
+  AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
+  AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
+  AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABscNFromU1(i) returns byte N of 'i' as a float with the 128.0 bias removed.
+  AF1 ABsc0FromU1(AU1 i){return AF1((i )&255u)-128.0;}
+  AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
+  AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
+  AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based decode: the MSB of the extracted byte is flipped back before the bias is removed.
+  AF1 ABsc0FromZbU1(AU1 i){return AF1(((i )&255u)^0x80u)-128.0;}
+  AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
+  AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
+  AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Each input is scaled by 1.0/32768.0 and biased by 0.25/32768.0 (the signed form of the 16-bit denormal trick named in
+  // the section header) before APermGCEA interleaves the low byte of every 16-bit lane.
+  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABscNToU2(d,i) converts the two halfs in 'i' and writes them into byte N of 'd.x' (from i.x) and byte N of 'd.y'
+  // (from i.y). The other three bytes of each destination word are kept.
+  // The APerm* helpers take bytes A..D from '.x' and bytes E..H from '.y' of their argument, so the converted bytes 'b'
+  // must be passed as '.x' and the destination word as '.y'. Passing them the other way round keeps one byte of 'd' and
+  // three bytes of 'b' instead of inserting one byte of 'b' into 'd'.
+  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: the XOR with 0x00800080 flips the MSB of both converted bytes before they are inserted.
+  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // ABscNFromU2(i) gathers byte N of 'i.x' and byte N of 'i.y' into the low byte of each 16-bit lane, reinterprets the
+  // lanes as halfs, scales by 32768.0 and removes the 0.25 bias, i.e. byte value n decodes to (n-128)/512.
+  AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based decode: the XOR with 0x00800080 flips the MSB of both gathered bytes back before scaling.
+  AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// HALF APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These support only positive inputs.
+// Did not see value yet in specialization for range.
+// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
+// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
+// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
+//------------------------------------------------------------------------------------------------------------------------------
+// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
+// Same with sqrt(), as this could be x*rsq() (7 ops).
+//==============================================================================================================================
+ #ifdef A_HALF
+  // All functions below operate on the raw 16-bit pattern of the half input (integer shift/add/subtract with a tuned
+  // constant) and reinterpret the result as a half.
+  // Minimize squared error across full positive range, 2 ops.
+  // The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
+  AH1 APrxLoSqrtH1(AH1 a){return AH1_AW1((AW1_AH1(a)>>AW1_(1))+AW1_(0x1de2));}
+  AH2 APrxLoSqrtH2(AH2 a){return AH2_AW2((AW2_AH2(a)>>AW2_(1))+AW2_(0x1de2));}
+  AH3 APrxLoSqrtH3(AH3 a){return AH3_AW3((AW3_AH3(a)>>AW3_(1))+AW3_(0x1de2));}
+  AH4 APrxLoSqrtH4(AH4 a){return AH4_AW4((AW4_AH4(a)>>AW4_(1))+AW4_(0x1de2));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Lower precision estimation, 1 op.
+  // Minimize squared error across {smallest normal to 16384.0}.
+  AH1 APrxLoRcpH1(AH1 a){return AH1_AW1(AW1_(0x7784)-AW1_AH1(a));}
+  AH2 APrxLoRcpH2(AH2 a){return AH2_AW2(AW2_(0x7784)-AW2_AH2(a));}
+  AH3 APrxLoRcpH3(AH3 a){return AH3_AW3(AW3_(0x7784)-AW3_AH3(a));}
+  AH4 APrxLoRcpH4(AH4 a){return AH4_AW4(AW4_(0x7784)-AW4_AH4(a));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Medium precision estimation, one Newton Raphson iteration, 3 ops.
+  // 'b' is the bit-trick estimate (note the seed constant 0x778d differs from the 0x7784 of the Lo version); the returned
+  // value is the refinement b*(2-a*b).
+  AH1 APrxMedRcpH1(AH1 a){AH1 b=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return b*(-b*a+AH1_(2.0));}
+  AH2 APrxMedRcpH2(AH2 a){AH2 b=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return b*(-b*a+AH2_(2.0));}
+  AH3 APrxMedRcpH3(AH3 a){AH3 b=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return b*(-b*a+AH3_(2.0));}
+  AH4 APrxMedRcpH4(AH4 a){AH4 b=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return b*(-b*a+AH4_(2.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root estimate.
+  // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
+  AH1 APrxLoRsqH1(AH1 a){return AH1_AW1(AW1_(0x59a3)-(AW1_AH1(a)>>AW1_(1)));}
+  AH2 APrxLoRsqH2(AH2 a){return AH2_AW2(AW2_(0x59a3)-(AW2_AH2(a)>>AW2_(1)));}
+  AH3 APrxLoRsqH3(AH3 a){return AH3_AW3(AW3_(0x59a3)-(AW3_AH3(a)>>AW3_(1)));}
+  AH4 APrxLoRsqH4(AH4 a){return AH4_AW4(AW4_(0x59a3)-(AW4_AH4(a)>>AW4_(1)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+// - Idea dates back to SGI, then to Quake 3, etc.
+// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+// - sqrt(x)=rsqrt(x)*x
+// - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// These below are from perhaps less complete searching for optimal.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ // 32-bit counterparts of the half approximations: each works on the raw bit pattern of the float input.
+ //  APrxLoSqrt - square root estimate (shift right by one, add constant).
+ //  APrxLoRcp  - reciprocal estimate (constant minus bits).
+ //  APrxMedRcp - reciprocal estimate refined by one Newton Raphson step b*(2-a*b), using its own seed constant.
+ //  APrxLoRsq  - reciprocal square root estimate (constant minus bits shifted right by one).
+ AF1 APrxLoSqrtF1(AF1 a){return AF1_AU1((AU1_AF1(a)>>AU1_(1))+AU1_(0x1fbc4639));}
+ AF1 APrxLoRcpF1(AF1 a){return AF1_AU1(AU1_(0x7ef07ebb)-AU1_AF1(a));}
+ AF1 APrxMedRcpF1(AF1 a){AF1 b=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return b*(-b*a+AF1_(2.0));}
+ AF1 APrxLoRsqF1(AF1 a){return AF1_AU1(AU1_(0x5f347d74)-(AU1_AF1(a)>>AU1_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxLoSqrtF2(AF2 a){return AF2_AU2((AU2_AF2(a)>>AU2_(1))+AU2_(0x1fbc4639));}
+ AF2 APrxLoRcpF2(AF2 a){return AF2_AU2(AU2_(0x7ef07ebb)-AU2_AF2(a));}
+ AF2 APrxMedRcpF2(AF2 a){AF2 b=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return b*(-b*a+AF2_(2.0));}
+ AF2 APrxLoRsqF2(AF2 a){return AF2_AU2(AU2_(0x5f347d74)-(AU2_AF2(a)>>AU2_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxLoSqrtF3(AF3 a){return AF3_AU3((AU3_AF3(a)>>AU3_(1))+AU3_(0x1fbc4639));}
+ AF3 APrxLoRcpF3(AF3 a){return AF3_AU3(AU3_(0x7ef07ebb)-AU3_AF3(a));}
+ AF3 APrxMedRcpF3(AF3 a){AF3 b=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return b*(-b*a+AF3_(2.0));}
+ AF3 APrxLoRsqF3(AF3 a){return AF3_AU3(AU3_(0x5f347d74)-(AU3_AF3(a)>>AU3_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxLoSqrtF4(AF4 a){return AF4_AU4((AU4_AF4(a)>>AU4_(1))+AU4_(0x1fbc4639));}
+ AF4 APrxLoRcpF4(AF4 a){return AF4_AU4(AU4_(0x7ef07ebb)-AU4_AF4(a));}
+ AF4 APrxMedRcpF4(AF4 a){AF4 b=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return b*(-b*a+AF4_(2.0));}
+ AF4 APrxLoRsqF4(AF4 a){return AF4_AU4(AU4_(0x5f347d74)-(AU4_AF4(a)>>AU4_(1)));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// PQ APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
+// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
+//==============================================================================================================================
+// Helpers
+ // Quart(a) returns a^4 and Oct(a) returns a^8, both by repeated squaring.
+ AF1 Quart(AF1 a)
+ {
+  AF1 squared = a * a;
+  return squared * squared;
+ }
+ AF1 Oct(AF1 a)
+ {
+  AF1 pow2 = a * a;
+  AF1 pow4 = pow2 * pow2;
+  return pow4 * pow4;
+ }
+ AF2 Quart(AF2 a)
+ {
+  AF2 squared = a * a;
+  return squared * squared;
+ }
+ AF2 Oct(AF2 a)
+ {
+  AF2 pow2 = a * a;
+  AF2 pow4 = pow2 * pow2;
+  return pow4 * pow4;
+ }
+ AF3 Quart(AF3 a)
+ {
+  AF3 squared = a * a;
+  return squared * squared;
+ }
+ AF3 Oct(AF3 a)
+ {
+  AF3 pow2 = a * a;
+  AF3 pow4 = pow2 * pow2;
+  return pow4 * pow4;
+ }
+ AF4 Quart(AF4 a)
+ {
+  AF4 squared = a * a;
+  return squared * squared;
+ }
+ AF4 Oct(AF4 a)
+ {
+  AF4 pow2 = a * a;
+  AF4 pow4 = pow2 * pow2;
+  return pow4 * pow4;
+ }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // Naming for every width below:
+ //  APrxPQTo{Gamma2,Linear}      - forward direction, exact 4th / 8th power.
+ //  APrxLo{Gamma2,Linear}ToPQ    - bit-trick 4th / 8th root estimate (shift the float bits, add a constant).
+ //  APrxMed{Gamma2,Linear}ToPQ   - the Lo estimate followed by one correction step.
+ //  APrxHigh{Gamma2,Linear}ToPQ  - nested sqrt().
+ AF1 APrxPQToGamma2(AF1 a)
+ {
+  return Quart(a);
+ }
+ AF1 APrxPQToLinear(AF1 a)
+ {
+  return Oct(a);
+ }
+ AF1 APrxLoGamma2ToPQ(AF1 a)
+ {
+  AU1 shifted = AU1_AF1(a) >> AU1_(2);
+  return AF1_AU1(shifted + AU1_(0x2F9A4E46));
+ }
+ AF1 APrxMedGamma2ToPQ(AF1 a)
+ {
+  AF1 seed = APrxLoGamma2ToPQ(a);
+  AF1 seed4 = Quart(seed);
+  return seed - seed * (seed4 - a) / (AF1_(4.0) * seed4);
+ }
+ AF1 APrxHighGamma2ToPQ(AF1 a)
+ {
+  return sqrt(sqrt(a));
+ }
+ AF1 APrxLoLinearToPQ(AF1 a)
+ {
+  AU1 shifted = AU1_AF1(a) >> AU1_(3);
+  return AF1_AU1(shifted + AU1_(0x378D8723));
+ }
+ AF1 APrxMedLinearToPQ(AF1 a)
+ {
+  AF1 seed = APrxLoLinearToPQ(a);
+  AF1 seed8 = Oct(seed);
+  return seed - seed * (seed8 - a) / (AF1_(8.0) * seed8);
+ }
+ AF1 APrxHighLinearToPQ(AF1 a)
+ {
+  return sqrt(sqrt(sqrt(a)));
+ }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxPQToGamma2(AF2 a)
+ {
+  return Quart(a);
+ }
+ AF2 APrxPQToLinear(AF2 a)
+ {
+  return Oct(a);
+ }
+ AF2 APrxLoGamma2ToPQ(AF2 a)
+ {
+  AU2 shifted = AU2_AF2(a) >> AU2_(2);
+  return AF2_AU2(shifted + AU2_(0x2F9A4E46));
+ }
+ AF2 APrxMedGamma2ToPQ(AF2 a)
+ {
+  AF2 seed = APrxLoGamma2ToPQ(a);
+  AF2 seed4 = Quart(seed);
+  return seed - seed * (seed4 - a) / (AF1_(4.0) * seed4);
+ }
+ AF2 APrxHighGamma2ToPQ(AF2 a)
+ {
+  return sqrt(sqrt(a));
+ }
+ AF2 APrxLoLinearToPQ(AF2 a)
+ {
+  AU2 shifted = AU2_AF2(a) >> AU2_(3);
+  return AF2_AU2(shifted + AU2_(0x378D8723));
+ }
+ AF2 APrxMedLinearToPQ(AF2 a)
+ {
+  AF2 seed = APrxLoLinearToPQ(a);
+  AF2 seed8 = Oct(seed);
+  return seed - seed * (seed8 - a) / (AF1_(8.0) * seed8);
+ }
+ AF2 APrxHighLinearToPQ(AF2 a)
+ {
+  return sqrt(sqrt(sqrt(a)));
+ }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxPQToGamma2(AF3 a)
+ {
+  return Quart(a);
+ }
+ AF3 APrxPQToLinear(AF3 a)
+ {
+  return Oct(a);
+ }
+ AF3 APrxLoGamma2ToPQ(AF3 a)
+ {
+  AU3 shifted = AU3_AF3(a) >> AU3_(2);
+  return AF3_AU3(shifted + AU3_(0x2F9A4E46));
+ }
+ AF3 APrxMedGamma2ToPQ(AF3 a)
+ {
+  AF3 seed = APrxLoGamma2ToPQ(a);
+  AF3 seed4 = Quart(seed);
+  return seed - seed * (seed4 - a) / (AF1_(4.0) * seed4);
+ }
+ AF3 APrxHighGamma2ToPQ(AF3 a)
+ {
+  return sqrt(sqrt(a));
+ }
+ AF3 APrxLoLinearToPQ(AF3 a)
+ {
+  AU3 shifted = AU3_AF3(a) >> AU3_(3);
+  return AF3_AU3(shifted + AU3_(0x378D8723));
+ }
+ AF3 APrxMedLinearToPQ(AF3 a)
+ {
+  AF3 seed = APrxLoLinearToPQ(a);
+  AF3 seed8 = Oct(seed);
+  return seed - seed * (seed8 - a) / (AF1_(8.0) * seed8);
+ }
+ AF3 APrxHighLinearToPQ(AF3 a)
+ {
+  return sqrt(sqrt(sqrt(a)));
+ }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxPQToGamma2(AF4 a)
+ {
+  return Quart(a);
+ }
+ AF4 APrxPQToLinear(AF4 a)
+ {
+  return Oct(a);
+ }
+ AF4 APrxLoGamma2ToPQ(AF4 a)
+ {
+  AU4 shifted = AU4_AF4(a) >> AU4_(2);
+  return AF4_AU4(shifted + AU4_(0x2F9A4E46));
+ }
+ AF4 APrxMedGamma2ToPQ(AF4 a)
+ {
+  AF4 seed = APrxLoGamma2ToPQ(a);
+  AF4 seed4 = Quart(seed);
+  return seed - seed * (seed4 - a) / (AF1_(4.0) * seed4);
+ }
+ AF4 APrxHighGamma2ToPQ(AF4 a)
+ {
+  return sqrt(sqrt(a));
+ }
+ AF4 APrxLoLinearToPQ(AF4 a)
+ {
+  AU4 shifted = AU4_AF4(a) >> AU4_(3);
+  return AF4_AU4(shifted + AU4_(0x378D8723));
+ }
+ AF4 APrxMedLinearToPQ(AF4 a)
+ {
+  AF4 seed = APrxLoLinearToPQ(a);
+  AF4 seed8 = Oct(seed);
+  return seed - seed * (seed8 - a) / (AF1_(8.0) * seed8);
+ }
+ AF4 APrxHighLinearToPQ(AF4 a)
+ {
+  return sqrt(sqrt(sqrt(a)));
+ }
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+//==============================================================================================================================
+ #if 1
+  // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+  // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+  // The sine is the parabola x*abs(x)-x. The cosine first re-maps its input with fract(x*0.5+0.75)*2.0-1.0 (which also
+  // wraps it back into {-1 to 1}) and then evaluates the same parabola.
+  AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD.
+  AF2 APSinF2(AF2 x){return x*abs(x)-x;}
+  AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
+  AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);}
+  // Returns {sin,cos} of one input: 'y' is the cosine re-map of 'x', and both are pushed through APSinF2 together.
+  AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // For a packed {sin,cos} pair,
+  // - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
+  // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
+  // Half precision versions of the functions above, same formulas.
+  AH1 APSinH1(AH1 x){return x*abs(x)-x;}
+  AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
+  AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);}
+  AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
+  AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [ZOL] ZERO ONE LOGIC
+//------------------------------------------------------------------------------------------------------------------------------
+// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
+//------------------------------------------------------------------------------------------------------------------------------
+// 0 := false
+// 1 := true
+//------------------------------------------------------------------------------------------------------------------------------
+// AndNot(x,y) -> !(x&y) .... One op.
+// AndOr(x,y,z) -> (x&y)|z ... One op.
+// GtZero(x) -> x>0.0 ..... One op.
+// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss.
+// Signed(x) -> x<0.0 ..... One op.
+// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMIZATION NOTES
+// ==================
+// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
+// For example 'a.xy*k.xx+k.yy'.
+//==============================================================================================================================
+ #if 1
+  // Unsigned integer forms. Inputs are expected to be exactly 0 or 1 (see "0 := false, 1 := true" above).
+  // AND is min(): the result is 1 only when both inputs are 1.
+  AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);}
+  AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);}
+  AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);}
+  AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // NOT toggles bit 0.
+  AU1 AZolNotU1(AU1 x){return x^AU1_(1);}
+  AU2 AZolNotU2(AU2 x){return x^AU2_(1);}
+  AU3 AZolNotU3(AU3 x){return x^AU3_(1);}
+  AU4 AZolNotU4(AU4 x){return x^AU4_(1);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // OR is max(): the result is 1 when either input is 1.
+  AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);}
+  AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);}
+  AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);}
+  AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);}
+//==============================================================================================================================
+  // Float {0.0,1.0} to unsigned integer {0,1} by value conversion.
+  AU1 AZolF1ToU1(AF1 x){return AU1(x);}
+  AU2 AZolF2ToU2(AF2 x){return AU2(x);}
+  AU3 AZolF3ToU3(AF3 x){return AU3(x);}
+  AU4 AZolF4ToU4(AF4 x){return AU4(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
+  // Logical NOT of a float {0.0,1.0} returned as an unsigned integer {0,1}: converts (1.0-x).
+  AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);}
+  AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);}
+  AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);}
+  AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unsigned integer {0,1} to float {0.0,1.0} by value conversion.
+  AF1 AZolU1ToF1(AU1 x){return AF1(x);}
+  AF2 AZolU2ToF2(AU2 x){return AF2(x);}
+  AF3 AZolU3ToF3(AU3 x){return AF3(x);}
+  AF4 AZolU4ToF4(AU4 x){return AF4(x);}
+//==============================================================================================================================
+  // Float forms. Inputs are expected to be exactly 0.0 or 1.0.
+  // AND is min().
+  AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);}
+  AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);}
+  AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);}
+  AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AndNot(x,y) -> !(x&y), computed as 1.0-x*y.
+  // NOTE: these four are spelled 'ASol...' rather than 'AZol...' like the rest of the section; the names are left as they
+  // are because callers reference them.
+  AF1 ASolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);}
+  AF2 ASolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);}
+  AF3 ASolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);}
+  AF4 ASolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AndOr(x,y,z) -> (x&y)|z, computed as x*y+z passed through ASat* (defined outside this section, presumably a clamp to
+  // {0 to 1} so that 1+1 still yields 1 -- TODO confirm).
+  AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);}
+  AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);}
+  AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);}
+  AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // GtZero(x) -> x>0.0, computed by scaling with A_INFP_F (defined outside this section, presumably +infinity) and
+  // passing the product through ASat*.
+  AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));}
+  AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));}
+  AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));}
+  AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Float NOT: 1.0-x.
+  AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;}
+  AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;}
+  AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;}
+  AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Float OR: component-wise max(x,y).
+  AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);}
+  AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);}
+  AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);}
+  AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Select: returns x*y+(1.0-x)*z, i.e. y where x is 1.0 and z where x is 0.0.
+  // The (1.0-x)*z term is computed as (-x)*z+z.
+  AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;}
+  AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;}
+  AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;}
+  AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Scales x by A_INFN_F and passes the product through ASat.
+  // NOTE(review): presumably A_INFN_F is -INF, so the result is 1.0 for x<0.0 and 0.0 otherwise -- confirm
+  // against the constant's definition.
+  AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));}
+  AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));}
+  AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));}
+  AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Compares x (converted with AU*_AF*) against integer zero: a non-zero x yields integer zero converted back with
+  // AF*_AU*, otherwise y is passed through the same round trip.
+  // NOTE(review): AU*_AF*/AF*_AU* are defined outside this view, presumably as bit reinterpretations -- confirm.
+  // NOTE(review): the 2/3/4-wide versions apply '!=' and '?:' to vectors; verify the target shading language
+  // evaluates both per component.
+  AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));}
+  AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));}
+  AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));}
+  AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // 16-bit variants of the logic ops, compiled only when A_HALF is defined.
+  // NOTE(review): as with the 32-bit versions, min/max/xor only act as AND/OR/NOT for operands that are exactly
+  // 0 or 1 -- presumably guaranteed by callers; confirm.
+  // AND: component-wise min(x,y).
+  AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);}
+  AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);}
+  AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);}
+  AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // NOT: toggles bit 0 of each component (x xor 1).
+  AW1 AZolNotW1(AW1 x){return x^AW1_(1);}
+  AW2 AZolNotW2(AW2 x){return x^AW2_(1);}
+  AW3 AZolNotW3(AW3 x){return x^AW3_(1);}
+  AW4 AZolNotW4(AW4 x){return x^AW4_(1);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // OR: component-wise max(x,y).
+  AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);}
+  AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);}
+  AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);}
+  AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);}
+//==============================================================================================================================
+  // Uses denormal trick.
+  // Half to 16-bit unsigned: x is multiplied by the half obtained from integer 1 via AH*_AW*, and the product is
+  // converted back with AW*_AH*.
+  // NOTE(review): AH*_AW*/AW*_AH* are defined outside this view, presumably as bit reinterpretations (integer 1
+  // would then be the smallest half denormal) -- confirm. This also relies on half denormals not being flushed.
+  AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));}
+  AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));}
+  AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));}
+  AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // AMD arch lacks a packed conversion opcode.
+  // 16-bit unsigned 0/1 to half 0.0/1.0: an integer multiply of x by the integer form of half 1.0 (AW*_AH*),
+  // converted back with AH*_AW*.
+  // NOTE(review): AW*_AH*/AH*_AW* are defined outside this view, presumably as bit reinterpretations -- confirm.
+  AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
+  AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
+  AH3 AZolW3ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));}
+  AH4 AZolW4ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));}
+  // The 3- and 4-component versions were originally published under the misspelled names below (they do not
+  // follow the AZolW<n>ToH<n> pattern). They are kept as forwarding wrappers so existing callers still compile.
+  AH3 AZolW1ToH3(AW3 x){return AZolW3ToH3(x);}
+  AH4 AZolW2ToH4(AW4 x){return AZolW4ToH4(x);}
+//==============================================================================================================================
+  // Half AND: component-wise min(x,y).
+  AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);}
+  AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);}
+  AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);}
+  AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Evaluates 1.0-x*y, written as (-x)*y+1.0.
+  // NOTE(review): the 'ASol' prefix differs from the 'AZol' prefix used elsewhere and looks like a typo. Also,
+  // for 0/1 inputs 1-x*y equals NOT(x AND y), not x AND (NOT y) as the name suggests. Confirm intent with callers.
+  AH1 ASolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);}
+  AH2 ASolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);}
+  AH3 ASolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);}
+  AH4 ASolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Evaluates ASat(x*y+z); for 0/1 inputs that is (x AND y) OR z.
+  AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);}
+  AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);}
+  AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);}
+  AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Scales x by A_INFP_H and passes the product through ASat.
+  // NOTE(review): presumably A_INFP_H is +INF, giving 1.0 for x>0.0 and 0.0 otherwise -- confirm.
+  AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));}
+  AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));}
+  AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));}
+  AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half NOT: 1.0-x.
+  AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;}
+  AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;}
+  AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;}
+  AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half OR: component-wise max(x,y).
+  AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);}
+  AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);}
+  AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);}
+  AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Select: returns x*y+(1.0-x)*z, i.e. y where x is 1.0 and z where x is 0.0.
+  // The (1.0-x)*z term is computed as (-x)*z+z.
+  AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;}
+  AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;}
+  AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;}
+  AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Scales x by A_INFN_H and passes the product through ASat.
+  // NOTE(review): presumably A_INFN_H is -INF, giving 1.0 for x<0.0 and 0.0 otherwise -- confirm.
+  AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));}
+  AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));}
+  AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));}
+  AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                      COLOR CONVERSIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
+// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
+// These are branch free implementations.
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
+//------------------------------------------------------------------------------------------------------------------------------
+// TRANSFER FUNCTIONS
+// ==================
+// 709 ..... Rec709 used for some HDTVs
+// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
+// Pq ...... PQ native for HDR10
+// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
+// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
+// Three ... Gamma 3.0, less fast, but good for HDR.
+//------------------------------------------------------------------------------------------------------------------------------
+// KEEPING TO SPEC
+// ===============
+// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+//  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+//  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+// Also there is a slight step in the transition regions.
+// The precision of the coefficients in the spec is the likely cause.
+// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
+// This is to work around lack of hardware (typically only ROP does the conversion for free).
+// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
+// So this header keeps with the spec.
+// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
+// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------
+// FOR PQ
+// ======
+// Both input and output are {0.0-1.0}, and where output 1.0 represents 10000.0 cd/m^2.
+// All constants are only specified to FP32 precision.
+// External PQ source reference,
+//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
+//------------------------------------------------------------------------------------------------------------------------------
+// PACKED VERSIONS
+// ===============
+// These are the A*H2() functions.
+// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
+// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
+// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
+//------------------------------------------------------------------------------------------------------------------------------
+// NOTES
+// =====
+// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
+//==============================================================================================================================
+ #if 1
+  // Linear to Rec.709, branch free.
+  // The result is the median of three values: the encoded knee j.x, the linear segment l=c*j.y and the power
+  // segment p=pow(c,j.z)*k.x+k.y. The median is spelled out with min/max rather than clamp(j.x,l,p): the linear
+  // segment is larger than the power segment over most of the range, and GLSL leaves clamp() undefined when
+  // minVal>maxVal, so the clamp form only produced the median where clamp happens to be implemented as a
+  // median-of-three (presumably the hardware this header was tuned for).
+  AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+   AF1 l=c*j.y;AF1 p=pow(c,j.z)*k.x+k.y;
+   return max(min(j.x,l),min(max(j.x,l),p));}
+  AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+   AF2 l=c*j.yy;AF2 p=pow(c,j.zz)*k.xx+k.yy;
+   return max(min(j.xx,l),min(max(j.xx,l),p));}
+  AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+   AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
+  // Linear to gamma: pow(c,rcpX) with the scalar exponent broadcast to the vector width.
+  AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));}
+  AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));}
+  AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear to PQ. Input and output are {0.0-1.0} (see the FOR PQ notes above the color conversion functions).
+  AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
+   return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
+  AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302));
+   return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
+  AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302));
+   return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
+  // The 2- and 3-component versions were originally published as overloads of AToPqF1 (missing the width suffix
+  // used by every other function). The overloads are kept so existing callers still compile.
+  AF2 AToPqF1(AF2 x){return AToPqF2(x);}
+  AF3 AToPqF1(AF3 x){return AToPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear to sRGB, branch free.
+  // The result is the median of the encoded knee j.x, the linear segment l=c*j.y and the power segment
+  // p=pow(c,j.z)*k.x+k.y. The median is spelled out with min/max rather than clamp(j.x,l,p) because the linear
+  // segment is larger than the power segment over most of the range and GLSL leaves clamp() undefined when
+  // minVal>maxVal.
+  AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+   AF1 l=c*j.y;AF1 p=pow(c,j.z)*k.x+k.y;
+   return max(min(j.x,l),min(max(j.x,l),p));}
+  AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+   AF2 l=c*j.yy;AF2 p=pow(c,j.zz)*k.xx+k.yy;
+   return max(min(j.xx,l),min(max(j.xx,l),p));}
+  AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+   AF3 l=c*j.yyy;AF3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear to gamma 2.0: square root.
+  AF1 AToTwoF1(AF1 c){return sqrt(c);}
+  AF2 AToTwoF2(AF2 c){return sqrt(c);}
+  AF3 AToTwoF3(AF3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear to gamma 3.0: cube root via pow(c,1/3).
+  AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
+  AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
+  AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
+ #endif
+//==============================================================================================================================
+ #if 1
+  // Unfortunately median won't work here.
+  // Rec.709 to linear, branch free.
+  // AZolSignedF*(c-j.x) is the selector handed to AZolSelF*: it picks the linear segment c*j.y when c is below
+  // j.x and the power segment otherwise. The input c is the ENCODED value, so j.x must be the encoded knee 0.081
+  // (0.018*4.5 in ATo709F*). The previous 0.081/4.5 was the linear-domain knee, which sent encoded values between
+  // 0.018 and 0.081 down the power segment.
+  AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Gamma to linear: pow(c,x) with the scalar exponent broadcast to the vector width.
+  AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));}
+  AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
+  AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // PQ to linear. ASat keeps the numerator from going negative before the final pow().
+  AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
+   return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
+  AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
+   return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
+  AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
+   return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
+  // The 2- and 3-component versions were originally published as overloads of AFromPqF1 (missing the width
+  // suffix). The overloads are kept so existing callers still compile.
+  AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
+  AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Unfortunately median won't work here.
+  // sRGB to linear, branch free.
+  // AZolSignedF*(c-j.x) is the selector handed to AZolSelF*: it picks the linear segment c*j.y when c is below
+  // j.x and the power segment otherwise. The input c is the ENCODED value, so j.x must be the encoded knee
+  // 0.04045 (0.0031308*12.92 in AToSrgbF*). The previous 0.04045/12.92 was the linear-domain knee, which sent
+  // encoded values between 0.0031308 and 0.04045 down the power segment.
+  AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF1(AZolSignedF1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+   return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Gamma 2.0 to linear: square.
+  AF1 AFromTwoF1(AF1 c){return c*c;}
+  AF2 AFromTwoF2(AF2 c){return c*c;}
+  AF3 AFromTwoF3(AF3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Gamma 3.0 to linear: cube.
+  AF1 AFromThreeF1(AF1 c){return c*c*c;}
+  AF2 AFromThreeF2(AF2 c){return c*c*c;}
+  AF3 AFromThreeF3(AF3 c){return c*c*c;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Half precision linear to Rec.709, branch free.
+  // The result is the median of the encoded knee j.x, the linear segment l=c*j.y and the power segment
+  // p=pow(c,j.z)*k.x+k.y. The median is spelled out with min/max rather than clamp(j.x,l,p) because the linear
+  // segment is larger than the power segment over most of the range and GLSL leaves clamp() undefined when
+  // minVal>maxVal.
+  AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+   AH1 l=c*j.y;AH1 p=pow(c,j.z)*k.x+k.y;
+   return max(min(j.x,l),min(max(j.x,l),p));}
+  AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+   AH2 l=c*j.yy;AH2 p=pow(c,j.zz)*k.xx+k.yy;
+   return max(min(j.xx,l),min(max(j.xx,l),p));}
+  AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+   AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision linear to gamma: pow(c,rcpX) with the scalar exponent broadcast to the vector width.
+  AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
+  AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
+  AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision linear to sRGB, branch free.
+  // The result is the median of the encoded knee j.x, the linear segment l=c*j.y and the power segment
+  // p=pow(c,j.z)*k.x+k.y. The median is spelled out with min/max rather than clamp(j.x,l,p) because the linear
+  // segment is larger than the power segment over most of the range and GLSL leaves clamp() undefined when
+  // minVal>maxVal.
+  AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+   AH1 l=c*j.y;AH1 p=pow(c,j.z)*k.x+k.y;
+   return max(min(j.x,l),min(max(j.x,l),p));}
+  AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+   AH2 l=c*j.yy;AH2 p=pow(c,j.zz)*k.xx+k.yy;
+   return max(min(j.xx,l),min(max(j.xx,l),p));}
+  AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+   AH3 l=c*j.yyy;AH3 p=pow(c,j.zzz)*k.xxx+k.yyy;
+   return max(min(j.xxx,l),min(max(j.xxx,l),p));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision linear to gamma 2.0: square root.
+  AH1 AToTwoH1(AH1 c){return sqrt(c);}
+  AH2 AToTwoH2(AH2 c){return sqrt(c);}
+  AH3 AToTwoH3(AH3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision linear to gamma 3.0: cube root via pow(c,1/3).
+  AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));}
+  AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));}
+  AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));}
+  // These half versions were originally published with an 'F' suffix (as overloads of the 32-bit names).
+  // The overloads are kept so existing callers still compile.
+  AH1 AToThreeF1(AH1 c){return AToThreeH1(c);}
+  AH2 AToThreeF2(AH2 c){return AToThreeH2(c);}
+  AH3 AToThreeF3(AH3 c){return AToThreeH3(c);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Half precision Rec.709 to linear, branch free.
+  // AZolSignedH*(c-j.x) is the selector handed to AZolSelH*: it picks the linear segment c*j.y when c is below
+  // j.x and the power segment otherwise. The input c is the ENCODED value, so j.x must be the encoded knee 0.081;
+  // the previous 0.081/4.5 was the linear-domain knee.
+  AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision gamma to linear: pow(c,x) with the scalar exponent broadcast to the vector width.
+  AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
+  AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
+  AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision sRGB to linear, branch free.
+  // Same selector scheme as AFrom709H*; j.x is the encoded knee 0.04045 (the previous 0.04045/12.92 was the
+  // linear-domain knee).
+  AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH1(AZolSignedH1(c-j.x  ),c*j.y  ,pow(c*k.x  +k.y  ,j.z  ));}
+  AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+  AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+   return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+  // These functions were originally published under the misspelled names below ('AHrom...F<n>').
+  // The old names are kept as forwarding wrappers so existing callers still compile.
+  AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
+  AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
+  AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision gamma 2.0 to linear: square.
+  AH1 AFromTwoH1(AH1 c){return c*c;}
+  AH2 AFromTwoH2(AH2 c){return c*c;}
+  AH3 AFromTwoH3(AH3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision gamma 3.0 to linear: cube.
+  AH1 AFromThreeH1(AH1 c){return c*c*c;}
+  AH2 AFromThreeH2(AH2 c){return c*c*c;}
+  AH3 AFromThreeH3(AH3 c){return c*c*c;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          CS REMAP
+//==============================================================================================================================
+  // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
+  //  543210
+  //  ======
+  //  ..xxx.
+  //  yy...y
+  // Per the bit diagram: the first component comes from bits 1..3 of the lane index, the second is assembled
+  // from bits 4..5 and bit 0 through ABfe/ABfiM (both defined outside this view).
+  AU2 ARmp8x8(AU1 a){
+   AU1 px=ABfe(a,1u,3u);
+   AU1 py=ABfiM(ABfe(a,3u,3u),a,1u);
+   return AU2(px,py);}
+//==============================================================================================================================
+  // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
+  //  543210
+  //  ======
+  //  .xx..x
+  //  y..yy.
+  // Details,
+  //  LANE TO 8x8 MAPPING
+  //  ===================
+  //  00 01 08 09 10 11 18 19
+  //  02 03 0a 0b 12 13 1a 1b
+  //  04 05 0c 0d 14 15 1c 1d
+  //  06 07 0e 0f 16 17 1e 1f
+  //  20 21 28 29 30 31 38 39
+  //  22 23 2a 2b 32 33 3a 3b
+  //  24 25 2c 2d 34 35 3c 3d
+  //  26 27 2e 2f 36 37 3e 3f
+  AU2 ARmpRed8x8(AU1 a){
+   AU1 px=ABfiM(ABfe(a,2u,3u),a,1u);
+   AU1 py=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);
+   return AU2(px,py);}
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Same two remaps, returning the coordinates as a 16-bit AW2.
+  AW2 ARmp8x8H(AU1 a){
+   AU1 px=ABfe(a,1u,3u);
+   AU1 py=ABfiM(ABfe(a,3u,3u),a,1u);
+   return AW2(px,py);}
+  AW2 ARmpRed8x8H(AU1 a){
+   AU1 px=ABfiM(ABfe(a,2u,3u),a,1u);
+   AU1 py=ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u);
+   return AW2(px,py);}
+ #endif
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//                                                          REFERENCE
+//
+//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... 
largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
+//==============================================================================================================================
+#ifdef A_GPU
+ // Boolean literals and a storage qualifier placeholder (A_STATIC expands to nothing here).
+ #define A_TRUE true
+ #define A_FALSE false
+ #define A_STATIC
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                     VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ // In this branch every portability macro maps straight onto the vector type itself:
+ //  ret*   -> the type, for function return types
+ //  in*    -> 'in'    qualified parameter
+ //  inout* -> 'inout' qualified parameter
+ //  out*   -> 'out'   qualified parameter
+ //  var*   -> a variable declaration of that type
+ //  init*  -> a constructor call with the given components
+ #define retAD2 AD2
+ #define retAD3 AD3
+ #define retAD4 AD4
+ #define retAF2 AF2
+ #define retAF3 AF3
+ #define retAF4 AF4
+ #define retAL2 AL2
+ #define retAL3 AL3
+ #define retAL4 AL4
+ #define retAU2 AU2
+ #define retAU3 AU3
+ #define retAU4 AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inAD2 in AD2
+ #define inAD3 in AD3
+ #define inAD4 in AD4
+ #define inAF2 in AF2
+ #define inAF3 in AF3
+ #define inAF4 in AF4
+ #define inAL2 in AL2
+ #define inAL3 in AL3
+ #define inAL4 in AL4
+ #define inAU2 in AU2
+ #define inAU3 in AU3
+ #define inAU4 in AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define inoutAD2 inout AD2
+ #define inoutAD3 inout AD3
+ #define inoutAD4 inout AD4
+ #define inoutAF2 inout AF2
+ #define inoutAF3 inout AF3
+ #define inoutAF4 inout AF4
+ #define inoutAL2 inout AL2
+ #define inoutAL3 inout AL3
+ #define inoutAL4 inout AL4
+ #define inoutAU2 inout AU2
+ #define inoutAU3 inout AU3
+ #define inoutAU4 inout AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define outAD2 out AD2
+ #define outAD3 out AD3
+ #define outAD4 out AD4
+ #define outAF2 out AF2
+ #define outAF3 out AF3
+ #define outAF4 out AF4
+ #define outAL2 out AL2
+ #define outAL3 out AL3
+ #define outAL4 out AL4
+ #define outAU2 out AU2
+ #define outAU3 out AU3
+ #define outAU4 out AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ #define varAD2(x) AD2 x
+ #define varAD3(x) AD3 x
+ #define varAD4(x) AD4 x
+ #define varAF2(x) AF2 x
+ #define varAF3(x) AF3 x
+ #define varAF4(x) AF4 x
+ #define varAL2(x) AL2 x
+ #define varAL3(x) AL3 x
+ #define varAL4(x) AL4 x
+ #define varAU2(x) AU2 x
+ #define varAU3(x) AU3 x
+ #define varAU4(x) AU4 x
+//------------------------------------------------------------------------------------------------------------------------------
+ #define initAD2(x,y) AD2(x,y)
+ #define initAD3(x,y,z) AD3(x,y,z)
+ #define initAD4(x,y,z,w) AD4(x,y,z,w)
+ #define initAF2(x,y) AF2(x,y)
+ #define initAF3(x,y,z) AF3(x,y,z)
+ #define initAF4(x,y,z,w) AF4(x,y,z,w)
+ #define initAL2(x,y) AL2(x,y)
+ #define initAL3(x,y,z) AL3(x,y,z)
+ #define initAL4(x,y,z,w) AL4(x,y,z,w)
+ #define initAU2(x,y) AU2(x,y)
+ #define initAU3(x,y,z) AU3(x,y,z)
+ #define initAU4(x,y,z,w) AU4(x,y,z,w)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     SCALAR RETURN OPS
+//==============================================================================================================================
+ // Each macro forwards to the shading language built-in of the same meaning; most cast their arguments to the
+ // scalar type named in the suffix (D1/F1) first.
+ #define AAbsD1(a) abs(AD1(a))
+ #define AAbsF1(a) abs(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ACosD1(a) cos(AD1(a))
+ #define ACosF1(a) cos(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ADotD2(a,b) dot(AD2(a),AD2(b))
+ #define ADotD3(a,b) dot(AD3(a),AD3(b))
+ #define ADotD4(a,b) dot(AD4(a),AD4(b))
+ #define ADotF2(a,b) dot(AF2(a),AF2(b))
+ #define ADotF3(a,b) dot(AF3(a),AF3(b))
+ #define ADotF4(a,b) dot(AF4(a),AF4(b))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AExp2D1(a) exp2(AD1(a))
+ #define AExp2F1(a) exp2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AFloorD1(a) floor(AD1(a))
+ #define AFloorF1(a) floor(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ALog2D1(a) log2(AD1(a))
+ #define ALog2F1(a) log2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unlike the macros above, the min/max wrappers pass their arguments through without a cast, so both arguments
+ // must already have matching types.
+ #define AMaxD1(a,b) max(a,b)
+ #define AMaxF1(a,b) max(a,b)
+ #define AMaxL1(a,b) max(a,b)
+ #define AMaxU1(a,b) max(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMinD1(a,b) min(a,b)
+ #define AMinF1(a,b) min(a,b)
+ #define AMinL1(a,b) min(a,b)
+ #define AMinU1(a,b) min(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASinD1(a) sin(AD1(a))
+ #define ASinF1(a) sin(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASqrtD1(a) sqrt(AD1(a))
+ #define ASqrtF1(a) sqrt(AF1(a))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ // NOTE(review): APowD1 casts the base to AD1 but the exponent to AF1, unlike the other D1 macros which use AD1
+ // throughout. This may be deliberate or a copy/paste slip -- confirm before changing.
+ #define APowD1(a,b) pow(AD1(a),AF1(b))
+ #define APowF1(a,b) pow(AF1(a),AF1(b))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;} + AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;} + AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 
opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif 
+//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;} + AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;} + AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h new file mode 100644 index 
000000000..4e0b3d548 --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/ffx_fsr1.h @@ -0,0 +1,1199 @@ +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629 +// +// +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. 
+//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high 
quality spatial-only scaling at relatively low cost. +// Meaning EASU is appropriate for laptops and other low-end GPUs. +// Quality from 1x to 4x area scaling is good. +//------------------------------------------------------------------------------------------------------------------------------ +// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel. +// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. +// This is also kept as simple as possible to have minimum runtime. +//------------------------------------------------------------------------------------------------------------------------------ +// The lanczos filter has negative lobes, so by itself it will introduce ringing. +// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, +// and limits output to the minimum and maximum of that neighborhood. +//------------------------------------------------------------------------------------------------------------------------------ +// Input image requirements: +// +// Color needs to be encoded as 3 channels [red, green, blue] (e.g. XYZ not supported) +// Each channel needs to be in the range [0, 1] +// Any color primaries are supported +// Display / tonemapping curve needs to be as if presenting to sRGB display or similar (e.g. Gamma 2.0) +// There should be no banding in the input +// There should be no high amplitude noise in the input +// There should be no noise in the input that is not at input pixel granularity +// For performance purposes, use 32bpp formats +//------------------------------------------------------------------------------------------------------------------------------ +// Best to apply EASU at the end of the frame after tonemapping +// but before film grain or composite of the UI.
+//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. 
+// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to set up the required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This is the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. 
+ con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +// Variant of FsrEasuCon() for an input image at an offset inside the input image resource (offset is added to con0[2], con0[3]). +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This is the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 
inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. 
+ v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compile-time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis. 
+ AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). 
+ AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR<AF1_(1.0/32768.0); + dirR=APrxLoRsqF1(dirR); + dirR=zro?AF1_(1.0):dirR; + dir.x=zro?AF1_(1.0):dir.x; + dir*=AF2_(dirR); + // Transform from {0 to 2} to {0 to 1} range, and shape with square. + len=len*AF1_(0.5); + len*=len; + // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}. + AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y))); + // Anisotropic length after rotation, + // x := 1.0 lerp to 'stretch' on edges + // y := 1.0 lerp to 2x on edges + AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len); + // Based on the amount of 'edge', + // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}. + AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len; + // Set distance^2 clipping point to the end of the adjustable window. 
+ AF1 clp=APrxLoRcpF1(lob); +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulation mixed with min/max of 4 nearest. + // b c + // e f g h + // i j k l + // n o + AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)), + AF3(klhgR.x,klhgG.x,klhgB.x)); + AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)), + AF3(klhgR.x,klhgG.x,klhgB.x)); + // Accumulation. + AF3 aC=AF3_(0.0); + AF1 aW=AF1_(0.0); + FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b + FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c + FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i + FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j + FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f + FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e + FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k + FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l + FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h + FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g + FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o + FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize and dering. 
+ pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H) +// Input callback prototypes, need to be implemented by calling shader + AH4 FsrEasuRH(AF2 p); + AH4 FsrEasuGH(AF2 p); + AH4 FsrEasuBH(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // This runs 2 taps in parallel. + void FsrEasuTapH( + inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, + inout AH2 aW, + AH2 offX,AH2 offY, + AH2 dir, + AH2 len, + AH1 lob, + AH1 clp, + AH2 cR,AH2 cG,AH2 cB){ + AH2 vX,vY; + vX=offX* dir.xx +offY*dir.yy; + vY=offX*(-dir.yy)+offY*dir.xx; + vX*=len.x;vY*=len.y; + AH2 d2=vX*vX+vY*vY; + d2=min(d2,AH2_(clp)); + AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0); + AH2 wA=AH2_(lob)*d2+AH2_(-1.0); + wB*=wB; + wA*=wA; + wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0)); + AH2 w=wB*wA; + aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // This runs 2 taps in parallel. 
+ void FsrEasuSetH( + inout AH2 dirPX,inout AH2 dirPY, + inout AH2 lenP, + AH2 pp, + AP1 biST,AP1 biUV, + AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ + AH2 w = AH2_(0.0); + if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); + if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_( pp.y); + // ABS is not free in the packed FP16 path. + AH2 dc=lD-lC; + AH2 cb=lC-lB; + AH2 lenX=max(abs(dc),abs(cb)); + lenX=ARcpH2(lenX); + AH2 dirX=lD-lB; + dirPX+=dirX*w; + lenX=ASatH2(abs(dirX)*lenX); + lenX*=lenX; + lenP+=lenX*w; + AH2 ec=lE-lC; + AH2 ca=lC-lA; + AH2 lenY=max(abs(ec),abs(ca)); + lenY=ARcpH2(lenY); + AH2 dirY=lE-lA; + dirPY+=dirY*w; + lenY=ASatH2(abs(dirY)*lenY); + lenY*=lenY; + lenP+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuH( + out AH3 pix, + AU2 ip, + AU4 con0, + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; + AH2 ppp=AH2(pp); +//------------------------------------------------------------------------------------------------------------------------------ + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AH4 bczzR=FsrEasuRH(p0); + AH4 bczzG=FsrEasuGH(p0); + AH4 bczzB=FsrEasuBH(p0); + AH4 ijfeR=FsrEasuRH(p1); + AH4 ijfeG=FsrEasuGH(p1); + AH4 ijfeB=FsrEasuBH(p1); + AH4 klhgR=FsrEasuRH(p2); + AH4 klhgG=FsrEasuGH(p2); + AH4 klhgB=FsrEasuBH(p2); + AH4 zzonR=FsrEasuRH(p3); + AH4 zzonG=FsrEasuGH(p3); + AH4 zzonB=FsrEasuBH(p3); +//------------------------------------------------------------------------------------------------------------------------------ + AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); + AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG); + AH4 
klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG); + AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG); + AH1 bL=bczzL.x; + AH1 cL=bczzL.y; + AH1 iL=ijfeL.x; + AH1 jL=ijfeL.y; + AH1 fL=ijfeL.z; + AH1 eL=ijfeL.w; + AH1 kL=klhgL.x; + AH1 lL=klhgL.y; + AH1 hL=klhgL.z; + AH1 gL=klhgL.w; + AH1 oL=zzonL.z; + AH1 nL=zzonL.w; + // This part is different, accumulating 2 taps in parallel. + AH2 dirPX=AH2_(0.0); + AH2 dirPY=AH2_(0.0); + AH2 lenP=AH2_(0.0); + FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL)); + FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL)); + AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); + AH1 len=lenP.r+lenP.g; +//------------------------------------------------------------------------------------------------------------------------------ + AH2 dir2=dir*dir; + AH1 dirR=dir2.x+dir2.y; + AP1 zro=dirR<AH1_(1.0/32768.0); + dirR=APrxLoRsqH1(dirR); + dirR=zro?AH1_(1.0):dirR; + dir.x=zro?AH1_(1.0):dir.x; + dir*=AH2_(dirR); + len=len*AH1_(0.5); + len*=len; + AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y))); + AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); + AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; + AH1 clp=APrxLoRcpH1(lob); +//------------------------------------------------------------------------------------------------------------------------------ + // FP16 is different, using packed trick to do min and max in same operation. + AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x))); + AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x))); + AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x))); + // This part is different for FP16, working pairs of taps at a time. 
+ AH2 pR=AH2_(0.0); + AH2 pG=AH2_(0.0); + AH2 pB=AH2_(0.0); + AH2 pW=AH2_(0.0); + FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); + FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); + FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); + AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); + AH1 aW=pW.x+pW.y; +//------------------------------------------------------------------------------------------------------------------------------ + // Slightly different for FP16 version due to combined min and max. 
+ pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING +// +//------------------------------------------------------------------------------------------------------------------------------ +// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness. +// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping. +// RCAS also has a built in process to limit sharpening of what it detects as possible noise. +// RCAS sharper does not support scaling, as it should be applied after EASU scaling. +// Pass EASU output straight into RCAS, no color conversions necessary. +//------------------------------------------------------------------------------------------------------------------------------ +// RCAS is based on the following logic. +// RCAS uses a 5 tap filter in a cross pattern (same as CAS), +// w n +// w 1 w for taps w m e +// w s +// Where 'w' is the negative lobe weight. 
+// output = (w*(n+e+w+s)+m)/(4*w+1) +// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range, +// 0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon needs to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
+// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. 
+ sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit).
+ AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. 
+ AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood.
+ // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. 
+ AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be 
used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix<R,G,B>.x = left 8x8 tile + // pix<R,G,B>.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion.
+ AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. 
+ #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). 
+//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain {0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster).
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. 
+//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY 
PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this computation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store.
+ void FsrTepdC8F(inout AF3 c,AF1 dit){
+  AF3 n=sqrt(c);
+  n=floor(n*AF3_(255.0))*AF3_(1.0/255.0);
+  AF3 a=n*n;
+  AF3 b=n+AF3_(1.0/255.0);b=b*b;
+  // Ratio of 'a' to 'b' required to produce 'c'.
+  // APrxLoRcpF1() won't work here (at least for very high dynamic ranges).
+  // APrxMedRcpF1() is an IADD,FMA,MUL.
+  AF3 r=(c-b)*APrxMedRcpF3(a-b);
+  // Use the ratio as a cutoff to choose 'a' or 'b'.
+  // AGtZeroF1() is a MUL.
+  c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // This version is 10-bit gamma 2.0.
+ // The 'c' input is {0 to 1}.
+ // Output is {0 to 1} ready for image store.
+ void FsrTepdC10F(inout AF3 c,AF1 dit){
+  AF3 n=sqrt(c);
+  n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0);
+  AF3 a=n*n;
+  AF3 b=n+AF3_(1.0/1023.0);b=b*b;
+  AF3 r=(c-b)*APrxMedRcpF3(a-b);
+  c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));}
+#endif
+//==============================================================================================================================
+#if defined(A_GPU)&&defined(A_HALF)
+ AH1 FsrTepdDitH(AU2 p,AU1 f){
+  AF1 x=AF1_(p.x+f);
+  AF1 y=AF1_(p.y);
+  AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
+  AF1 b=AF1_(1.0/3.69);
+  x=x*a+(y*b);
+  return AH1(AFractF1(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC8H(inout AH3 c,AH1 dit){
+  AH3 n=sqrt(c);
+  n=floor(n*AH3_(255.0))*AH3_(1.0/255.0);
+  AH3 a=n*n;
+  AH3 b=n+AH3_(1.0/255.0);b=b*b;
+  AH3 r=(c-b)*APrxMedRcpH3(a-b);
+  c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC10H(inout AH3 c,AH1 dit){
+  AH3 n=sqrt(c);
+  n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0);
+  AH3 a=n*n;
+  AH3 b=n+AH3_(1.0/1023.0);b=b*b;
+  AH3 r=(c-b)*APrxMedRcpH3(a-b);
+  c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));}
+//==============================================================================================================================
+ // This computes dither for positions 'p' and 'p+{8,0}'.
+ AH2 FsrTepdDitHx2(AU2 p,AU1 f){
+  AF2 x;
+  x.x=AF1_(p.x+f);
+  x.y=x.x+AF1_(8.0);
+  AF1 y=AF1_(p.y);
+  AF1 a=AF1_((1.0+sqrt(5.0))/2.0);
+  AF1 b=AF1_(1.0/3.69);
+  x=x*AF2_(a)+AF2_(y*b);
+  return AH2(AFractF2(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
+  AH2 nR=sqrt(cR);
+  AH2 nG=sqrt(cG);
+  AH2 nB=sqrt(cB);
+  nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0);
+  nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0);
+  nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0);
+  AH2 aR=nR*nR;
+  AH2 aG=nG*nG;
+  AH2 aB=nB*nB;
+  AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR;
+  AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG;
+  AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB;
+  AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR);
+  AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
+  AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
+  cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0));
+  cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0));
+  cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){
+  AH2 nR=sqrt(cR);
+  AH2 nG=sqrt(cG);
+  AH2 nB=sqrt(cB);
+  nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0);
+  nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0);
+  nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0);
+  AH2 aR=nR*nR;
+  AH2 aG=nG*nG;
+  AH2 aB=nB*nB;
+  AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR;
+  AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG;
+  AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB;
+  AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR);
+  AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG);
+  AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB);
+  cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0));
+  cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0));
+  cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));}
+#endif
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl
new file mode 100644
index 000000000..8e8755db2
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_scaling.glsl
@@ -0,0 +1,96 @@
+#version 430 core
+precision mediump float;
+layout (local_size_x = 64) in;
+layout(rgba8, binding = 0, location=0) uniform image2D imgOutput;
+layout( location=1 ) uniform sampler2D Source;
+layout( location=2 ) uniform float srcX0;
+layout( location=3 ) uniform float srcX1;
+layout( location=4 ) uniform float srcY0;
+layout( location=5 ) uniform float srcY1;
+layout( location=6 ) uniform float dstX0;
+layout( location=7 ) uniform float dstX1;
+layout( location=8 ) uniform float dstY0;
+layout( location=9 ) uniform float dstY1;
+layout( location=10 ) uniform float scaleX;
+layout( location=11 ) uniform float scaleY;
+
+#define A_GPU 1
+#define A_GLSL 1
+#include "ffx_a.h"
+
+#define FSR_EASU_F 1
+AU4 con0, con1, con2, con3;
+float srcW, srcH, dstW, dstH;
+vec2 bLeft, tRight;
+
+// Scales an EASU sample coordinate by the scaleX/scaleY uniforms before it is passed to textureGather.
+AF2 translate(AF2 pos) {
+    return AF2(pos.x * scaleX, pos.y * scaleY);
+}
+
+// Stores the destination rectangle corners that insideBox() and CurrFilter() read.
+void setBounds(vec2 bottomLeft, vec2 topRight) {
+    bLeft = bottomLeft;
+    tRight = topRight;
+}
+
+// Maps an output pixel to the texel it is stored at, mirroring it inside the destination
+// rectangle on each axis whose dst*0/dst*1 pair is reversed. insideBox() accepts the half-open
+// range [min, max), so the mirror of p is (min + max - 1 - p); both axes use that same formula.
+AF2 translateDest(AF2 pos) {
+    AF2 translatedPos = AF2(pos.x, pos.y);
+    translatedPos.x = dstX1 < dstX0 ? dstX0 + dstX1 - translatedPos.x - 1 : translatedPos.x;
+    translatedPos.y = dstY0 > dstY1 ? dstY0 + dstY1 - translatedPos.y - 1: translatedPos.y;
+    return translatedPos;
+}
+
+AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(Source, translate(p), 0); return res; }
+AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(Source, translate(p), 1); return res; }
+AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(Source, translate(p), 2); return res; }
+
+#include "ffx_fsr1.h"
+
+// Returns 1.0 when bLeft <= v < tRight on both axes, 0.0 otherwise.
+float insideBox(vec2 v) {
+    vec2 s = step(bLeft, v) - step(tRight, v);
+    return s.x * s.y;
+}
+
+// Writes one output pixel: opaque black outside the destination rectangle, the EASU result inside it.
+void CurrFilter(AU2 pos)
+{
+    if((insideBox(vec2(pos.x, pos.y))) == 0) {
+        imageStore(imgOutput, ASU2(pos.x, pos.y), AF4(0,0,0,1));
+        return;
+    }
+    AF3 c;
+    FsrEasuF(c, AU2(pos.x - bLeft.x, pos.y - bLeft.y), con0, con1, con2, con3);
+    imageStore(imgOutput, ASU2(translateDest(pos)), AF4(c, 1));
+}
+
+void main() {
+    srcW = abs(srcX1 - srcX0);
+    srcH = abs(srcY1 - srcY0);
+    dstW = abs(dstX1 - dstX0);
+    dstH = abs(dstY1 - dstY0);
+
+    AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
+
+    setBounds(vec2(dstX0 < dstX1 ? dstX0 : dstX1, dstY0 < dstY1 ? dstY0 : dstY1),
+              vec2(dstX1 > dstX0 ? dstX1 : dstX0, dstY1 > dstY0 ? dstY1 : dstY0));
+
+    // Upscaling
+    FsrEasuCon(con0, con1, con2, con3,
+        srcW, srcH, // Viewport size (top left aligned) in the input image which is to be scaled.
+        srcW, srcH, // The size of the input image.
+        dstW, dstH); // The output resolution.
+
+    // Each invocation filters four pixels: gxy plus its +8 offsets in x, in x and y, and in y (workgroup origins are 16 apart).
+    CurrFilter(gxy);
+    gxy.x += 8u;
+    CurrFilter(gxy);
+    gxy.y += 8u;
+    CurrFilter(gxy);
+    gxy.x -= 8u;
+    CurrFilter(gxy);
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl
new file mode 100644
index 000000000..d3b98729a
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fsr_sharpening.glsl
@@ -0,0 +1,39 @@
+#version 430 core
+precision mediump float;
+layout (local_size_x = 64) in;
+layout(rgba8, binding = 0, location=0) uniform image2D imgOutput;
+layout( location=1 ) uniform sampler2D source;
+layout( location=2 ) uniform float sharpening;
+
+#define A_GPU 1
+#define A_GLSL 1
+#include "ffx_a.h"
+
+#define FSR_RCAS_F 1
+AU4 con0;
+
+// RCAS callbacks defined ahead of the ffx_fsr1.h include: fetch one source texel; the input hook is left empty.
+AF4 FsrRcasLoadF(ASU2 p) { return AF4(texelFetch(source, p, 0)); }
+void FsrRcasInputF(inout AF1 r, inout AF1 g, inout AF1 b) {}
+
+#include "ffx_fsr1.h"
+
+// Runs RCAS for one pixel and stores the result at the same position with alpha forced to 1.
+void CurrFilter(AU2 pos)
+{
+    AF3 c;
+    FsrRcasF(c.r, c.g, c.b, pos, con0);
+    imageStore(imgOutput, ASU2(pos), AF4(c, 1));
+}
+
+void main() {
+    FsrRcasCon(con0, sharpening);
+    AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u);
+    CurrFilter(gxy);
+    gxy.x += 8u;
+    CurrFilter(gxy);
+    gxy.y += 8u;
+    CurrFilter(gxy);
+    gxy.x -= 8u;
+    CurrFilter(gxy);
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl
new file mode 100644
index 000000000..8bdcbca69
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/fxaa.glsl
@@ -0,0 +1,1174 @@
+/*============================================================================
+
+
+                    NVIDIA FXAA 3.11 by TIMOTHY LOTTES
+
+
+------------------------------------------------------------------------------
+COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED.
+------------------------------------------------------------------------------ +TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED +*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA +OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR +CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR +LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, +OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE +THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + +------------------------------------------------------------------------------ + INTEGRATION CHECKLIST +------------------------------------------------------------------------------ +(1.) +In the shader source, setup defines for the desired configuration. +When providing multiple shaders (for different presets), +simply setup the defines differently in multiple files. +Example, + + #define FXAA_PC 1 + #define FXAA_HLSL_5 1 + #define FXAA_QUALITY__PRESET 12 + +Or, + + #define FXAA_360 1 + +Or, + + #define FXAA_PS3 1 + +Etc. + +(2.) +Then include this file, + + #include "Fxaa3_11.h" + +(3.) +Then call the FXAA pixel shader from within your desired shader. +Look at the FXAA Quality FxaaPixelShader() for docs on inputs. +As for FXAA 3.11 all inputs for all shaders are the same +to enable easy porting between platforms. + + return FxaaPixelShader(...); + +(4.) +Insure pass prior to FXAA outputs RGBL (see next section). +Or use, + + #define FXAA_GREEN_AS_LUMA 1 + +(5.) 
+Setup engine to provide the following constants +which are used in the FxaaPixelShader() inputs, + + FxaaFloat2 fxaaQualityRcpFrame, + FxaaFloat4 fxaaConsoleRcpFrameOpt, + FxaaFloat4 fxaaConsoleRcpFrameOpt2, + FxaaFloat4 fxaaConsole360RcpFrameOpt2, + FxaaFloat fxaaQualitySubpix, + FxaaFloat fxaaQualityEdgeThreshold, + FxaaFloat fxaaQualityEdgeThresholdMin, + FxaaFloat fxaaConsoleEdgeSharpness, + FxaaFloat fxaaConsoleEdgeThreshold, + FxaaFloat fxaaConsoleEdgeThresholdMin, + FxaaFloat4 fxaaConsole360ConstDir + +Look at the FXAA Quality FxaaPixelShader() for docs on inputs. + +(6.) +Have FXAA vertex shader run as a full screen triangle, +and output "pos" and "fxaaConsolePosPos" +such that inputs in the pixel shader provide, + + // {xy} = center of pixel + FxaaFloat2 pos, + + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + FxaaFloat4 fxaaConsolePosPos, + +(7.) +Insure the texture sampler(s) used by FXAA are set to bilinear filtering. + + +------------------------------------------------------------------------------ + INTEGRATION - RGBL AND COLORSPACE +------------------------------------------------------------------------------ +FXAA3 requires RGBL as input unless the following is set, + + #define FXAA_GREEN_AS_LUMA 1 + +In which case the engine uses green in place of luma, +and requires RGB input is in a non-linear colorspace. + +RGB should be LDR (low dynamic range). +Specifically do FXAA after tonemapping. + +RGB data as returned by a texture fetch can be non-linear, +or linear when FXAA_GREEN_AS_LUMA is not set. +Note an "sRGB format" texture counts as linear, +because the result of a texture fetch is linear data. +Regular "RGBA8" textures in the sRGB colorspace are non-linear. + +If FXAA_GREEN_AS_LUMA is not set, +luma must be stored in the alpha channel prior to running FXAA. +This luma should be in a perceptual space (could be gamma 2.0). 
+Example pass before FXAA where output is gamma 2.0 encoded, + + color.rgb = ToneMap(color.rgb); // linear color output + color.rgb = sqrt(color.rgb); // gamma 2.0 color output + return color; + +To use FXAA, + + color.rgb = ToneMap(color.rgb); // linear color output + color.rgb = sqrt(color.rgb); // gamma 2.0 color output + color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma + return color; + +Another example where output is linear encoded, +say for instance writing to an sRGB formatted render target, +where the render target does the conversion back to sRGB after blending, + + color.rgb = ToneMap(color.rgb); // linear color output + return color; + +To use FXAA, + + color.rgb = ToneMap(color.rgb); // linear color output + color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma + return color; + +Getting luma correct is required for the algorithm to work correctly. + + +------------------------------------------------------------------------------ + BEING LINEARLY CORRECT? +------------------------------------------------------------------------------ +Applying FXAA to a framebuffer with linear RGB color will look worse. +This is very counter intuitive, but happens to be true in this case. +The reason is because dithering artifacts will be more visible +in a linear colorspace. + + +------------------------------------------------------------------------------ + COMPLEX INTEGRATION +------------------------------------------------------------------------------ +Q. What if the engine is blending into RGB before wanting to run FXAA? + +A. In the last opaque pass prior to FXAA, + have the pass write out luma into alpha. + Then blend into RGB only. + FXAA should be able to run ok + assuming the blending pass did not add any aliasing. + This should be the common case for particles and common blending passes. + +A. Or use FXAA_GREEN_AS_LUMA. 
+ +============================================================================*/ + +#version 430 core + +layout(local_size_x = 16, local_size_y = 16) in; +layout(rgba8, binding = 0) uniform image2D imgOutput; + +uniform sampler2D inputTexture; +layout(location=0) uniform vec2 invResolution; + +#define FXAA_QUALITY__PRESET 12 +#define FXAA_GREEN_AS_LUMA 1 +#define FXAA_PC 1 +#define FXAA_GLSL_130 1 + + +/*============================================================================ + + INTEGRATION KNOBS + +/*==========================================================================*/ +#ifndef FXAA_PC + // + // FXAA Quality + // The high quality PC algorithm. + // + #define FXAA_PC 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_120 + #define FXAA_GLSL_120 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_130 + #define FXAA_GLSL_130 0 +#endif +/*==========================================================================*/ +#ifndef FXAA_GREEN_AS_LUMA + // + // For those using non-linear color, + // and either not able to get luma in alpha, or not wanting to, + // this enables FXAA to run using green as a proxy for luma. + // So with this enabled, no need to pack luma in alpha. + // + // This will turn off AA on anything which lacks some amount of green. + // Pure red and blue or combination of only R and B, will get no AA. + // + // Might want to lower the settings for both, + // fxaaConsoleEdgeThresholdMin + // fxaaQualityEdgeThresholdMin + // In order to insure AA does not get turned off on colors + // which contain a minor amount of green. + // + // 1 = On. + // 0 = Off. + // + #define FXAA_GREEN_AS_LUMA 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_EARLY_EXIT + // + // Controls algorithm's early exit path. + // On PS3 turning this ON adds 2 cycles to the shader. 
+ // On 360 turning this OFF adds 10ths of a millisecond to the shader. + // Turning this off on console will result in a more blurry image. + // So this defaults to on. + // + // 1 = On. + // 0 = Off. + // + #define FXAA_EARLY_EXIT 1 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_DISCARD + // + // Only valid for PC OpenGL currently. + // Probably will not work when FXAA_GREEN_AS_LUMA = 1. + // + // 1 = Use discard on pixels which don't need AA. + // For APIs which enable concurrent TEX+ROP from same surface. + // 0 = Return unchanged color on pixels which don't need AA. + // + #define FXAA_DISCARD 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_FAST_PIXEL_OFFSET + // + // Used for GLSL 120 only. + // + // 1 = GL API supports fast pixel offsets + // 0 = do not use fast pixel offsets + // + #ifdef GL_EXT_gpu_shader4 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifdef GL_NV_gpu_shader5 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifdef GL_ARB_gpu_shader5 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifndef FXAA_FAST_PIXEL_OFFSET + #define FXAA_FAST_PIXEL_OFFSET 0 + #endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GATHER4_ALPHA + // + // 1 = API supports gather4 on alpha channel. + // 0 = API does not support gather4 on alpha channel. + // + #if (FXAA_HLSL_5 == 1) + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifdef GL_ARB_gpu_shader5 + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifdef GL_NV_gpu_shader5 + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifndef FXAA_GATHER4_ALPHA + #define FXAA_GATHER4_ALPHA 0 + #endif +#endif + +/*============================================================================ + FXAA QUALITY - TUNING KNOBS +------------------------------------------------------------------------------ +NOTE the other tuning knobs are now in the shader function inputs! 
+============================================================================*/ +#ifndef FXAA_QUALITY__PRESET + // + // Choose the quality preset. + // This needs to be compiled into the shader as it affects code. + // Best option to include multiple presets is to + // in each shader define the preset, then include this file. + // + // OPTIONS + // ----------------------------------------------------------------------- + // 10 to 15 - default medium dither (10=fastest, 15=highest quality) + // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality) + // 39 - no dither, very expensive + // + // NOTES + // ----------------------------------------------------------------------- + // 12 = slightly faster than FXAA 3.9 and higher edge quality (default) + // 13 = about same speed as FXAA 3.9 and better than 12 + // 23 = closest to FXAA 3.9 visually and performance wise + // _ = the lowest digit is directly related to performance + // _ = the highest digit is directly related to style + // + #define FXAA_QUALITY__PRESET 12 +#endif + + +/*============================================================================ + + FXAA QUALITY - PRESETS + +============================================================================*/ + +/*============================================================================ + FXAA QUALITY - MEDIUM DITHER PRESETS +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 10) + #define FXAA_QUALITY__PS 3 + #define FXAA_QUALITY__P0 1.5 + #define FXAA_QUALITY__P1 3.0 + #define FXAA_QUALITY__P2 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 11) + #define FXAA_QUALITY__PS 4 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 3.0 + #define FXAA_QUALITY__P3 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET 
== 12) + #define FXAA_QUALITY__PS 5 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 4.0 + #define FXAA_QUALITY__P4 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 13) + #define FXAA_QUALITY__PS 6 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 4.0 + #define FXAA_QUALITY__P5 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 14) + #define FXAA_QUALITY__PS 7 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 4.0 + #define FXAA_QUALITY__P6 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 15) + #define FXAA_QUALITY__PS 8 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 4.0 + #define FXAA_QUALITY__P7 12.0 +#endif + +/*============================================================================ + FXAA QUALITY - LOW DITHER PRESETS +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 20) + #define FXAA_QUALITY__PS 3 + #define FXAA_QUALITY__P0 1.5 + #define FXAA_QUALITY__P1 2.0 + #define FXAA_QUALITY__P2 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 21) + #define FXAA_QUALITY__PS 4 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 8.0 +#endif 
+/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 22) + #define FXAA_QUALITY__PS 5 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 23) + #define FXAA_QUALITY__PS 6 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 24) + #define FXAA_QUALITY__PS 7 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 3.0 + #define FXAA_QUALITY__P6 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 25) + #define FXAA_QUALITY__PS 8 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 4.0 + #define FXAA_QUALITY__P7 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 26) + #define FXAA_QUALITY__PS 9 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 4.0 + #define FXAA_QUALITY__P8 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 27) + #define FXAA_QUALITY__PS 10 + #define 
FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 4.0 + #define FXAA_QUALITY__P9 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 28) + #define FXAA_QUALITY__PS 11 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 2.0 + #define FXAA_QUALITY__P9 4.0 + #define FXAA_QUALITY__P10 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 29) + #define FXAA_QUALITY__PS 12 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 2.0 + #define FXAA_QUALITY__P9 2.0 + #define FXAA_QUALITY__P10 4.0 + #define FXAA_QUALITY__P11 8.0 +#endif + +/*============================================================================ + FXAA QUALITY - EXTREME QUALITY +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 39) + #define FXAA_QUALITY__PS 12 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.0 + #define FXAA_QUALITY__P2 1.0 + #define FXAA_QUALITY__P3 1.0 + #define FXAA_QUALITY__P4 1.0 + #define FXAA_QUALITY__P5 1.5 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 2.0 + #define FXAA_QUALITY__P9 2.0 + #define FXAA_QUALITY__P10 4.0 + #define FXAA_QUALITY__P11 8.0 +#endif + + + 
+/*============================================================================ + + API PORTING + +============================================================================*/ +#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1) + #define FxaaBool bool + #define FxaaDiscard discard + #define FxaaFloat float + #define FxaaFloat2 vec2 + #define FxaaFloat3 vec3 + #define FxaaFloat4 vec4 + #define FxaaHalf float + #define FxaaHalf2 vec2 + #define FxaaHalf3 vec3 + #define FxaaHalf4 vec4 + #define FxaaInt2 ivec2 + #define FxaaSat(x) clamp(x, 0.0, 1.0) + #define FxaaTex sampler2D +#else + #define FxaaBool bool + #define FxaaDiscard clip(-1) + #define FxaaFloat float + #define FxaaFloat2 float2 + #define FxaaFloat3 float3 + #define FxaaFloat4 float4 + #define FxaaHalf half + #define FxaaHalf2 half2 + #define FxaaHalf3 half3 + #define FxaaHalf4 half4 + #define FxaaSat(x) saturate(x) +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GLSL_120 == 1) + // Requires, + // #version 120 + // And at least, + // #extension GL_EXT_gpu_shader4 : enable + // (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9) + #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0) + #if (FXAA_FAST_PIXEL_OFFSET == 1) + #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) + #else + #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) + #endif + #if (FXAA_GATHER4_ALPHA == 1) + // use #extension GL_ARB_gpu_shader5 : enable + #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) + #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) + #define FxaaTexGreen4(t, p) textureGather(t, p, 1) + #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) + #endif +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GLSL_130 == 1) + // Requires "#version 130" or better + #define FxaaTexTop(t, p) textureLod(t, p, 0.0) + #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) + #if 
(FXAA_GATHER4_ALPHA == 1) + // use #extension GL_ARB_gpu_shader5 : enable + #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) + #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) + #define FxaaTexGreen4(t, p) textureGather(t, p, 1) + #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) + #endif +#endif + + +/*============================================================================ + GREEN AS LUMA OPTION SUPPORT FUNCTION +============================================================================*/ +#if (FXAA_GREEN_AS_LUMA == 0) + FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.w; } +#else + FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.y; } +#endif + + + + +/*============================================================================ + + FXAA3 QUALITY - PC + +============================================================================*/ +#if (FXAA_PC == 1) +/*--------------------------------------------------------------------------*/ +FxaaFloat4 FxaaPixelShader( + // + // Use noperspective interpolation here (turn off perspective interpolation). + // {xy} = center of pixel + FxaaFloat2 pos, + // + // Used only for FXAA Console, and not used on the 360 version. + // Use noperspective interpolation here (turn off perspective interpolation). + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + FxaaFloat4 fxaaConsolePosPos, + // + // Input color texture. + // {rgb_} = color in linear or perceptual color space + // if (FXAA_GREEN_AS_LUMA == 0) + // {___a} = luma in perceptual color space (not linear) + FxaaTex tex, + // + // Only used on the optimized 360 version of FXAA Console. + // For everything but 360, just use the same input here as for "tex". + // For 360, same texture, just alias with a 2nd sampler. + // This sampler needs to have an exponent bias of -1. + FxaaTex fxaaConsole360TexExpBiasNegOne, + // + // Only used on the optimized 360 version of FXAA Console. 
+ // For everything but 360, just use the same input here as for "tex". + // For 360, same texture, just alias with a 3nd sampler. + // This sampler needs to have an exponent bias of -2. + FxaaTex fxaaConsole360TexExpBiasNegTwo, + // + // Only used on FXAA Quality. + // This must be from a constant/uniform. + // {x_} = 1.0/screenWidthInPixels + // {_y} = 1.0/screenHeightInPixels + FxaaFloat2 fxaaQualityRcpFrame, + // + // Only used on FXAA Console. + // This must be from a constant/uniform. + // This effects sub-pixel AA quality and inversely sharpness. + // Where N ranges between, + // N = 0.50 (default) + // N = 0.33 (sharper) + // {x___} = -N/screenWidthInPixels + // {_y__} = -N/screenHeightInPixels + // {__z_} = N/screenWidthInPixels + // {___w} = N/screenHeightInPixels + FxaaFloat4 fxaaConsoleRcpFrameOpt, + // + // Only used on FXAA Console. + // Not used on 360, but used on PS3 and PC. + // This must be from a constant/uniform. + // {x___} = -2.0/screenWidthInPixels + // {_y__} = -2.0/screenHeightInPixels + // {__z_} = 2.0/screenWidthInPixels + // {___w} = 2.0/screenHeightInPixels + FxaaFloat4 fxaaConsoleRcpFrameOpt2, + // + // Only used on FXAA Console. + // Only used on 360 in place of fxaaConsoleRcpFrameOpt2. + // This must be from a constant/uniform. + // {x___} = 8.0/screenWidthInPixels + // {_y__} = 8.0/screenHeightInPixels + // {__z_} = -4.0/screenWidthInPixels + // {___w} = -4.0/screenHeightInPixels + FxaaFloat4 fxaaConsole360RcpFrameOpt2, + // + // Only used on FXAA Quality. + // This used to be the FXAA_QUALITY__SUBPIX define. + // It is here now to allow easier tuning. + // Choose the amount of sub-pixel aliasing removal. + // This can effect sharpness. + // 1.00 - upper limit (softer) + // 0.75 - default amount of filtering + // 0.50 - lower limit (sharper, less sub-pixel aliasing removal) + // 0.25 - almost off + // 0.00 - completely off + FxaaFloat fxaaQualitySubpix, + // + // Only used on FXAA Quality. 
+ // This used to be the FXAA_QUALITY__EDGE_THRESHOLD define. + // It is here now to allow easier tuning. + // The minimum amount of local contrast required to apply algorithm. + // 0.333 - too little (faster) + // 0.250 - low quality + // 0.166 - default + // 0.125 - high quality + // 0.063 - overkill (slower) + FxaaFloat fxaaQualityEdgeThreshold, + // + // Only used on FXAA Quality. + // This used to be the FXAA_QUALITY__EDGE_THRESHOLD_MIN define. + // It is here now to allow easier tuning. + // Trims the algorithm from processing darks. + // 0.0833 - upper limit (default, the start of visible unfiltered edges) + // 0.0625 - high quality (faster) + // 0.0312 - visible limit (slower) + // Special notes when using FXAA_GREEN_AS_LUMA, + // Likely want to set this to zero. + // As colors that are mostly not-green + // will appear very dark in the green channel! + // Tune by looking at mostly non-green content, + // then start at zero and increase until aliasing is a problem. + FxaaFloat fxaaQualityEdgeThresholdMin, + // + // Only used on FXAA Console. + // This used to be the FXAA_CONSOLE__EDGE_SHARPNESS define. + // It is here now to allow easier tuning. + // This does not effect PS3, as this needs to be compiled in. + // Use FXAA_CONSOLE__PS3_EDGE_SHARPNESS for PS3. + // Due to the PS3 being ALU bound, + // there are only three safe values here: 2 and 4 and 8. + // These options use the shaders ability to a free *|/ by 2|4|8. + // For all other platforms can be a non-power of two. + // 8.0 is sharper (default!!!) + // 4.0 is softer + // 2.0 is really soft (good only for vector graphics inputs) + FxaaFloat fxaaConsoleEdgeSharpness, + // + // Only used on FXAA Console. + // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD define. + // It is here now to allow easier tuning. + // This does not effect PS3, as this needs to be compiled in. + // Use FXAA_CONSOLE__PS3_EDGE_THRESHOLD for PS3. 
+ // Due to the PS3 being ALU bound, + // there are only two safe values here: 1/4 and 1/8. + // These options use the shaders ability to a free *|/ by 2|4|8. + // The console setting has a different mapping than the quality setting. + // Other platforms can use other values. + // 0.125 leaves less aliasing, but is softer (default!!!) + // 0.25 leaves more aliasing, and is sharper + FxaaFloat fxaaConsoleEdgeThreshold, + // + // Only used on FXAA Console. + // This used to be the FXAA_CONSOLE__EDGE_THRESHOLD_MIN define. + // It is here now to allow easier tuning. + // Trims the algorithm from processing darks. + // The console setting has a different mapping than the quality setting. + // This only applies when FXAA_EARLY_EXIT is 1. + // This does not apply to PS3, + // PS3 was simplified to avoid more shader instructions. + // 0.06 - faster but more aliasing in darks + // 0.05 - default + // 0.04 - slower and less aliasing in darks + // Special notes when using FXAA_GREEN_AS_LUMA, + // Likely want to set this to zero. + // As colors that are mostly not-green + // will appear very dark in the green channel! + // Tune by looking at mostly non-green content, + // then start at zero and increase until aliasing is a problem. + FxaaFloat fxaaConsoleEdgeThresholdMin, + // + // Extra constants for 360 FXAA Console only. + // Use zeros or anything else for other platforms. + // These must be in physical constant registers and NOT immedates. + // Immedates will result in compiler un-optimizing. 
+ // {xyzw} = float4(1.0, -1.0, 0.25, -0.25) + FxaaFloat4 fxaaConsole360ConstDir +) { +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posM; + posM.x = pos.x; + posM.y = pos.y; + #if (FXAA_GATHER4_ALPHA == 1) + #if (FXAA_DISCARD == 0) + FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); + #if (FXAA_GREEN_AS_LUMA == 0) + #define lumaM rgbyM.w + #else + #define lumaM rgbyM.y + #endif + #endif + #if (FXAA_GREEN_AS_LUMA == 0) + FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM); + FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1)); + #else + FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM); + FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1)); + #endif + #if (FXAA_DISCARD == 1) + #define lumaM luma4A.w + #endif + #define lumaE luma4A.z + #define lumaS luma4A.x + #define lumaSE luma4A.y + #define lumaNW luma4B.w + #define lumaN luma4B.z + #define lumaW luma4B.x + #else + FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); + #if (FXAA_GREEN_AS_LUMA == 0) + #define lumaM rgbyM.w + #else + #define lumaM rgbyM.y + #endif + FxaaFloat lumaS = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy)); + #endif +/*--------------------------------------------------------------------------*/ + FxaaFloat maxSM = max(lumaS, lumaM); + FxaaFloat minSM = min(lumaS, lumaM); + FxaaFloat maxESM = max(lumaE, maxSM); + FxaaFloat minESM = min(lumaE, minSM); + FxaaFloat maxWN = max(lumaN, lumaW); + FxaaFloat minWN = min(lumaN, lumaW); + FxaaFloat rangeMax = max(maxWN, maxESM); + FxaaFloat rangeMin = min(minWN, minESM); + FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold; + FxaaFloat range = rangeMax - rangeMin; + FxaaFloat rangeMaxClamped = 
max(fxaaQualityEdgeThresholdMin, rangeMaxScaled); + FxaaBool earlyExit = range < rangeMaxClamped; +/*--------------------------------------------------------------------------*/ + if(earlyExit) + #if (FXAA_DISCARD == 1) + FxaaDiscard; + #else + return rgbyM; + #endif +/*--------------------------------------------------------------------------*/ + #if (FXAA_GATHER4_ALPHA == 0) + FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); + #else + FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); + #endif +/*--------------------------------------------------------------------------*/ + FxaaFloat lumaNS = lumaN + lumaS; + FxaaFloat lumaWE = lumaW + lumaE; + FxaaFloat subpixRcpRange = 1.0/range; + FxaaFloat subpixNSWE = lumaNS + lumaWE; + FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS; + FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE; +/*--------------------------------------------------------------------------*/ + FxaaFloat lumaNESE = lumaNE + lumaSE; + FxaaFloat lumaNWNE = lumaNW + lumaNE; + FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE; + FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE; +/*--------------------------------------------------------------------------*/ + FxaaFloat lumaNWSW = lumaNW + lumaSW; + FxaaFloat lumaSWSE = lumaSW + lumaSE; + FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2); + FxaaFloat edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2); + FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW; + FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE; + FxaaFloat edgeHorz = abs(edgeHorz3) + 
edgeHorz4; + FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4; +/*--------------------------------------------------------------------------*/ + FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE; + FxaaFloat lengthSign = fxaaQualityRcpFrame.x; + FxaaBool horzSpan = edgeHorz >= edgeVert; + FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; +/*--------------------------------------------------------------------------*/ + if(!horzSpan) lumaN = lumaW; + if(!horzSpan) lumaS = lumaE; + if(horzSpan) lengthSign = fxaaQualityRcpFrame.y; + FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM; +/*--------------------------------------------------------------------------*/ + FxaaFloat gradientN = lumaN - lumaM; + FxaaFloat gradientS = lumaS - lumaM; + FxaaFloat lumaNN = lumaN + lumaM; + FxaaFloat lumaSS = lumaS + lumaM; + FxaaBool pairN = abs(gradientN) >= abs(gradientS); + FxaaFloat gradient = max(abs(gradientN), abs(gradientS)); + if(pairN) lengthSign = -lengthSign; + FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posB; + posB.x = posM.x; + posB.y = posM.y; + FxaaFloat2 offNP; + offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x; + offNP.y = ( horzSpan) ? 
0.0 : fxaaQualityRcpFrame.y; + if(!horzSpan) posB.x += lengthSign * 0.5; + if( horzSpan) posB.y += lengthSign * 0.5; +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posN; + posN.x = posB.x - offNP.x * FXAA_QUALITY__P0; + posN.y = posB.y - offNP.y * FXAA_QUALITY__P0; + FxaaFloat2 posP; + posP.x = posB.x + offNP.x * FXAA_QUALITY__P0; + posP.y = posB.y + offNP.y * FXAA_QUALITY__P0; + FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0; + FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN)); + FxaaFloat subpixE = subpixC * subpixC; + FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP)); +/*--------------------------------------------------------------------------*/ + if(!pairN) lumaNN = lumaSS; + FxaaFloat gradientScaled = gradient * 1.0/4.0; + FxaaFloat lumaMM = lumaM - lumaNN * 0.5; + FxaaFloat subpixF = subpixD * subpixE; + FxaaBool lumaMLTZero = lumaMM < 0.0; +/*--------------------------------------------------------------------------*/ + lumaEndN -= lumaNN * 0.5; + lumaEndP -= lumaNN * 0.5; + FxaaBool doneN = abs(lumaEndN) >= gradientScaled; + FxaaBool doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1; + FxaaBool doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1; +/*--------------------------------------------------------------------------*/ + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2; + if(!doneP) posP.y 
+= offNP.y * FXAA_QUALITY__P2; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 3) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 4) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 5) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5; + if(!doneP) posP.y += 
offNP.y * FXAA_QUALITY__P5; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 6) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 7) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 8) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8; + if(!doneP) posP.y += 
offNP.y * FXAA_QUALITY__P8; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 9) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 10) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 11) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11; + if(!doneP) 
posP.y += offNP.y * FXAA_QUALITY__P11; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 12) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12; +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } +/*--------------------------------------------------------------------------*/ + FxaaFloat dstN = posM.x - posN.x; + FxaaFloat dstP = posP.x - posM.x; + if(!horzSpan) dstN = posM.y - posN.y; + if(!horzSpan) dstP = posP.y - posM.y; 
+/*--------------------------------------------------------------------------*/ + FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero; + FxaaFloat spanLength = (dstP + dstN); + FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; + FxaaFloat spanLengthRcp = 1.0/spanLength; +/*--------------------------------------------------------------------------*/ + FxaaBool directionN = dstN < dstP; + FxaaFloat dst = min(dstN, dstP); + FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP; + FxaaFloat subpixG = subpixF * subpixF; + FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5; + FxaaFloat subpixH = subpixG * fxaaQualitySubpix; +/*--------------------------------------------------------------------------*/ + FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0; + FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH); + if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; + if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; + #if (FXAA_DISCARD == 1) + return FxaaTexTop(tex, posM); + #else + return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM); + #endif +} +/*==========================================================================*/ +#endif + +vec4 mainImage(vec2 fragCoord) +{ + vec2 rcpFrame = 1./invResolution.xy; + vec2 uv2 = fragCoord.xy / invResolution.xy; + + float fxaaQualitySubpix = 0.75; // [0..1], default 0.75 + float fxaaQualityEdgeThreshold = 0.166; // [0.125..0.33], default 0.166 + float fxaaQualityEdgeThresholdMin = 0.02;//0.0625; // ? + vec4 dummy4 = vec4(0.0,0.0,0.0,0.0); + float dummy1 = 0.0; + + vec4 col = FxaaPixelShader(uv2, dummy4, + inputTexture, inputTexture, inputTexture, + rcpFrame, dummy4, dummy4, dummy4, + fxaaQualitySubpix, fxaaQualityEdgeThreshold, + fxaaQualityEdgeThresholdMin, + dummy1, dummy1, dummy1, dummy4); + + vec4 fragColor = vec4( col.xyz, 1. 
); + + return fragColor; +} + +void main() +{ + ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec4 outColor = mainImage(texelCoord + vec2(0.5)); + imageStore(imgOutput, texelCoord, outColor); + } + } +} diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl new file mode 100644 index 000000000..2201f78c1 --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl @@ -0,0 +1,1361 @@ +/** + * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com) + * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com) + * Copyright (C) 2013 Belen Masia (bmasia@unizar.es) + * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com) + * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to + * do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. As clarification, there + * is no requirement that the copyright notice and permission be included in + * binary distributions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/** + * _______ ___ ___ ___ ___ + * / || \/ | / \ / \ + * | (---- | \ / | / ^ \ / ^ \ + * \ \ | |\/| | / /_\ \ / /_\ \ + * ----) | | | | | / _____ \ / _____ \ + * |_______/ |__| |__| /__/ \__\ /__/ \__\ + * + * E N H A N C E D + * S U B P I X E L M O R P H O L O G I C A L A N T I A L I A S I N G + * + * http://www.iryoku.com/smaa/ + * + * Hi, welcome aboard! + * + * Here you'll find instructions to get the shader up and running as fast as + * possible. + * + * IMPORTANT NOTICE: when updating, remember to update both this file and the + * precomputed textures! They may change from version to version. + * + * The shader has three passes, chained together as follows: + * + * |input|------------------. + * v | + * [ SMAA*EdgeDetection ] | + * v | + * |edgesTex| | + * v | + * [ SMAABlendingWeightCalculation ] | + * v | + * |blendTex| | + * v | + * [ SMAANeighborhoodBlending ] <------. + * v + * |output| + * + * Note that each [pass] has its own vertex and pixel shader. Remember to use + * oversized triangles instead of quads to avoid overshading along the + * diagonal. + * + * You've three edge detection methods to choose from: luma, color or depth. + * They represent different quality/performance and anti-aliasing/sharpness + * tradeoffs, so our recommendation is for you to choose the one that best + * suits your particular scenario: + * + * - Depth edge detection is usually the fastest but it may miss some edges. + * + * - Luma edge detection is usually more expensive than depth edge detection, + * but catches visible edges that depth edge detection can miss. + * + * - Color edge detection is usually the most expensive one but catches + * chroma-only edges.
+ * + * For quickstarters: just use luma edge detection. + * + * The general advice is to not rush the integration process and ensure each + * step is done correctly (don't try to integrate SMAA T2x with predicated edge + * detection from the start!). Ok then, let's go! + * + * 1. The first step is to create two RGBA temporal render targets for holding + * |edgesTex| and |blendTex|. + * + * In DX10 or DX11, you can use a RG render target for the edges texture. + * In the case of NVIDIA GPUs, using RG render targets seems to actually be + * slower. + * + * On the Xbox 360, you can use the same render target for resolving both + * |edgesTex| and |blendTex|, as they aren't needed simultaneously. + * + * 2. Both temporal render targets |edgesTex| and |blendTex| must be cleared + * each frame. Do not forget to clear the alpha channel! + * + * 3. The next step is loading the two supporting precalculated textures, + * 'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as + * C++ headers, and also as regular DDS files. They'll be needed for the + * 'SMAABlendingWeightCalculation' pass. + * + * If you use the C++ headers, be sure to load them in the format specified + * inside of them. + * + * You can also compress 'areaTex' and 'searchTex' using BC5 and BC4 + * respectively, if you have that option in your content processor pipeline. + * When compressing them, you get a non-perceptible quality decrease, and a + * marginal performance increase. + * + * 4. All samplers must be set to linear filtering and clamp. + * + * After you get the technique working, remember that 64-bit inputs have + * half-rate linear filtering on GCN. + * + * If SMAA is applied to 64-bit color buffers, switching to point filtering + * when accessing them will increase the performance. Search for + * 'SMAASamplePoint' to see which textures may benefit from point + * filtering, and where (which is basically the color input in the edge + * detection and resolve passes). + * + * 5.
All texture reads and buffer writes must be non-sRGB, with the exception + * of the input read and the output write in + * 'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in + * this last pass are not possible, the technique will work anyway, but + * will perform antialiasing in gamma space. + * + * IMPORTANT: for best results the input read for the color/luma edge + * detection should *NOT* be sRGB. + * + * 6. Before including SMAA.h you'll have to setup the render target metrics, + * the target and any optional configuration defines. Optionally you can + * use a preset. + * + * You have the following targets available: + * SMAA_HLSL_3 + * SMAA_HLSL_4 + * SMAA_HLSL_4_1 + * SMAA_GLSL_3 * + * SMAA_GLSL_4 * + * + * * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below). + * + * And four presets: + * SMAA_PRESET_LOW (%60 of the quality) + * SMAA_PRESET_MEDIUM (%80 of the quality) + * SMAA_PRESET_HIGH (%95 of the quality) + * SMAA_PRESET_ULTRA (%99 of the quality) + * + * For example: + * #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0) + * #define SMAA_HLSL_4 + * #define SMAA_PRESET_HIGH + * #include "SMAA.h" + * + * Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a + * uniform variable. The code is designed to minimize the impact of not + * using a constant value, but it is still better to hardcode it. + * + * Depending on how you encoded 'areaTex' and 'searchTex', you may have to + * add (and customize) the following defines before including SMAA.h: + * #define SMAA_AREATEX_SELECT(sample) sample.rg + * #define SMAA_SEARCHTEX_SELECT(sample) sample.r + * + * If your engine is already using porting macros, you can define + * SMAA_CUSTOM_SL, and define the porting functions by yourself. + * + * 7. Then, you'll have to setup the passes as indicated in the scheme above. + * You can take a look into SMAA.fx, to see how we did it for our demo. + * Checkout the function wrappers, you may want to copy-paste them! 
+ * + * 8. It's recommended to validate the produced |edgesTex| and |blendTex|. + * You can use a screenshot from your engine to compare the |edgesTex| + * and |blendTex| produced inside of the engine with the results obtained + * with the reference demo. + * + * 9. After you get the last pass to work, it's time to optimize. You'll have + * to initialize a stencil buffer in the first pass (discard is already in + * the code), then mask execution by using it in the second pass. The last + * pass should be executed in all pixels. + * + * + * After this point you can choose to enable predicated thresholding, + * temporal supersampling and motion blur integration: + * + * a) If you want to use predicated thresholding, take a look into + * SMAA_PREDICATION; you'll need to pass an extra texture in the edge + * detection pass. + * + * b) If you want to enable temporal supersampling (SMAA T2x): + * + * 1. The first step is to render using subpixel jitters. I won't go into + * detail, but it's as simple as moving each vertex position in the + * vertex shader, you can check how we do it in our DX10 demo. + * + * 2. Then, you must set up the temporal resolve. You may want to take a look + * into SMAAResolve for resolving 2x modes. After you get it working, you'll + * probably see ghosting everywhere. But fear not, you can enable the + * CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro. + * Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded. + * + * 3. The next step is to apply SMAA to each subpixel jittered frame, just as + * done for 1x. + * + * 4. At this point you should already have something usable, but for best + * results the proper area textures must be set depending on current jitter.
+ * For this, the parameter 'subsampleIndices' of + * 'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x + * mode: + * + * @SUBSAMPLE_INDICES + * + * | S# | Camera Jitter | subsampleIndices | + * +----+------------------+---------------------+ + * | 0 | ( 0.25, -0.25) | float4(1, 1, 1, 0) | + * | 1 | (-0.25, 0.25) | float4(2, 2, 2, 0) | + * + * These jitter positions assume a bottom-to-top y axis. S# stands for the + * sample number. + * + * More information about temporal supersampling here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * c) If you want to enable spatial multisampling (SMAA S2x): + * + * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be + * created with: + * - DX10: see below (*) + * - DX10.1: D3D10_STANDARD_MULTISAMPLE_PATTERN or + * - DX11: D3D11_STANDARD_MULTISAMPLE_PATTERN + * + * This allows to ensure that the subsample order matches the table in + * @SUBSAMPLE_INDICES. + * + * (*) In the case of DX10, we refer the reader to: + * - SMAA::detectMSAAOrder and + * - SMAA::msaaReorder + * + * These functions allow to match the standard multisample patterns by + * detecting the subsample order for a specific GPU, and reordering + * them appropriately. + * + * 2. A shader must be run to output each subsample into a separate buffer + * (DX10 is required). You can use SMAASeparate for this purpose, or just do + * it in an existing pass (for example, in the tone mapping pass, which has + * the advantage of feeding tone mapped subsamples to SMAA, which will yield + * better results). + * + * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing + * the results in the final buffer. The second run should alpha blend with + * the existing final buffer using a blending factor of 0.5. + * 'subsampleIndices' must be adjusted as in the SMAA T2x case (see point + * b). 
+ * + * d) If you want to enable temporal supersampling on top of SMAA S2x + * (which actually is SMAA 4x): + * + * 1. SMAA 4x consists of temporally jittering SMAA S2x, so the first step is + * to calculate SMAA S2x for current frame. In this case, 'subsampleIndices' + * must be set as follows: + * + * | F# | S# | Camera Jitter | Net Jitter | subsampleIndices | + * +----+----+--------------------+-------------------+----------------------+ + * | 0 | 0 | ( 0.125, 0.125) | ( 0.375, -0.125) | float4(5, 3, 1, 3) | + * | 0 | 1 | ( 0.125, 0.125) | (-0.125, 0.375) | float4(4, 6, 2, 3) | + * +----+----+--------------------+-------------------+----------------------+ + * | 1 | 2 | (-0.125, -0.125) | ( 0.125, -0.375) | float4(3, 5, 1, 4) | + * | 1 | 3 | (-0.125, -0.125) | (-0.375, 0.125) | float4(6, 4, 2, 4) | + * + * These jitter positions assume a bottom-to-top y axis. F# stands for the + * frame number. S# stands for the sample number. + * + * 2. After calculating SMAA S2x for current frame (with the new subsample + * indices), previous frame must be reprojected as in SMAA T2x mode (see + * point b). + * + * e) If motion blur is used, you may want to do the edge detection pass + * together with motion blur. This has two advantages: + * + * 1. Pixels under heavy motion can be omitted from the edge detection process. + * For these pixels we can just store "no edge", as motion blur will take + * care of them. + * 2. The center pixel tap is reused. + * + * Note that in this case depth testing should be used instead of stenciling, + * as we have to write all the pixels in the motion blur pass. + * + * That's it! + */ + +//----------------------------------------------------------------------------- +// SMAA Presets + +/** + * Note that if you use one of these presets, the following configuration + * macros will be ignored if set in the "Configurable Defines" section.
+ */ + +#if defined(SMAA_PRESET_LOW) +#define SMAA_THRESHOLD 0.15 +#define SMAA_MAX_SEARCH_STEPS 4 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_MEDIUM) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 8 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_HIGH) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 16 +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#define SMAA_CORNER_ROUNDING 25 +#elif defined(SMAA_PRESET_ULTRA) +#define SMAA_THRESHOLD 0.05 +#define SMAA_MAX_SEARCH_STEPS 32 +#define SMAA_MAX_SEARCH_STEPS_DIAG 16 +#define SMAA_CORNER_ROUNDING 25 +#endif + +//----------------------------------------------------------------------------- +// Configurable Defines + +/** + * SMAA_THRESHOLD specifies the threshold or sensitivity to edges. + * Lowering this value you will be able to detect more edges at the expense of + * performance. + * + * Range: [0, 0.5] + * 0.1 is a reasonable value, and allows to catch most visible edges. + * 0.05 is a rather overkill value, that allows to catch 'em all. + * + * If temporal supersampling is used, 0.2 could be a reasonable value, as low + * contrast edges are properly filtered by just 2x. + */ +#ifndef SMAA_THRESHOLD +#define SMAA_THRESHOLD 0.1 +#endif + +/** + * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection. + * + * Range: depends on the depth range of the scene. + */ +#ifndef SMAA_DEPTH_THRESHOLD +#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD) +#endif + +/** + * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the + * horizontal/vertical pattern searches, at each side of the pixel. + * + * In number of pixels, it's actually the double. So the maximum line length + * perfectly handled by, for example 16, is 64 (by perfectly, we meant that + * longer lines won't look as good, but still antialiased). 
+ * + * Range: [0, 112] + */ +#ifndef SMAA_MAX_SEARCH_STEPS +#define SMAA_MAX_SEARCH_STEPS 16 +#endif + +/** + * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the + * diagonal pattern searches, at each side of the pixel. In this case we jump + * one pixel at time, instead of two. + * + * Range: [0, 20] + * + * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 + * steps), but it can have a significant impact on older machines. + * + * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing. + */ +#ifndef SMAA_MAX_SEARCH_STEPS_DIAG +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#endif + +/** + * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded. + * + * Range: [0, 100] + * + * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing. + */ +#ifndef SMAA_CORNER_ROUNDING +#define SMAA_CORNER_ROUNDING 25 +#endif + +/** + * If there is an neighbor edge that has SMAA_LOCAL_CONTRAST_FACTOR times + * bigger contrast than current edge, current edge will be discarded. + * + * This allows to eliminate spurious crossing edges, and is based on the fact + * that, if there is too much contrast in a direction, that will hide + * perceptually contrast in the other neighbors. + */ +#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR +#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0 +#endif + +/** + * Predicated thresholding allows to better preserve texture details and to + * improve performance, by decreasing the number of detected edges using an + * additional buffer like the light accumulation buffer, object ids or even the + * depth buffer (the depth buffer usage may be limited to indoor or short range + * scenes). + * + * It locally decreases the luma or color threshold if an edge is found in an + * additional buffer (so the global threshold can be higher). + * + * This method was developed by Playstation EDGE MLAA team, and used in + * Killzone 3, by using the light accumulation buffer. 
More information here: + * http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx + */ +#ifndef SMAA_PREDICATION +#define SMAA_PREDICATION 0 +#endif + +/** + * Threshold to be used in the additional predication buffer. + * + * Range: depends on the input, so you'll have to find the magic number that + * works for you. + */ +#ifndef SMAA_PREDICATION_THRESHOLD +#define SMAA_PREDICATION_THRESHOLD 0.01 +#endif + +/** + * How much to scale the global threshold used for luma or color edge + * detection when using predication. + * + * Range: [1, 5] + */ +#ifndef SMAA_PREDICATION_SCALE +#define SMAA_PREDICATION_SCALE 2.0 +#endif + +/** + * How much to locally decrease the threshold. + * + * Range: [0, 1] + */ +#ifndef SMAA_PREDICATION_STRENGTH +#define SMAA_PREDICATION_STRENGTH 0.4 +#endif + +/** + * Temporal reprojection allows to remove ghosting artifacts when using + * temporal supersampling. We use the CryEngine 3 method which also introduces + * velocity weighting. This feature is of extreme importance for totally + * removing ghosting. More information here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * Note that you'll need to setup a velocity buffer for enabling reprojection. + * For static geometry, saving the previous depth buffer is a viable + * alternative. + */ +#ifndef SMAA_REPROJECTION +#define SMAA_REPROJECTION 0 +#endif + +/** + * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to + * remove ghosting trails behind the moving object, which are not removed by + * just using reprojection. Using low values will exhibit ghosting, while using + * high values will disable temporal supersampling under motion. + * + * Behind the scenes, velocity weighting removes temporal supersampling when + * the velocity of the subsamples differs (meaning they are different objects). 
+ * + * Range: [0, 80] + */ +#ifndef SMAA_REPROJECTION_WEIGHT_SCALE +#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0 +#endif + +/** + * On some compilers, discard cannot be used in vertex shaders. Thus, they need + * to be compiled separately. + */ +#ifndef SMAA_INCLUDE_VS +#define SMAA_INCLUDE_VS 1 +#endif +#ifndef SMAA_INCLUDE_PS +#define SMAA_INCLUDE_PS 1 +#endif + +//----------------------------------------------------------------------------- +// Texture Access Defines + +#ifndef SMAA_AREATEX_SELECT +#if defined(SMAA_HLSL_3) +#define SMAA_AREATEX_SELECT(sample) sample.ra +#else +#define SMAA_AREATEX_SELECT(sample) sample.rg +#endif +#endif + +#ifndef SMAA_SEARCHTEX_SELECT +#define SMAA_SEARCHTEX_SELECT(sample) sample.r +#endif + +#ifndef SMAA_DECODE_VELOCITY +#define SMAA_DECODE_VELOCITY(sample) sample.rg +#endif + +//----------------------------------------------------------------------------- +// Non-Configurable Defines + +#define SMAA_AREATEX_MAX_DISTANCE 16 +#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20 +#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0)) +#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0) +#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0) +#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0) +#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0) + +//----------------------------------------------------------------------------- +// Porting Functions + +#if defined(SMAA_HLSL_3) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0)) +#define SMAASample(tex, coord) tex2D(tex, coord) +#define SMAASamplePoint(tex, coord) tex2D(tex, coord) +#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * 
SMAA_RT_METRICS.xy) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#endif +#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1) +SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +#define SMAATexture2D(tex) Texture2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0) +#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset) +#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord) +#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord) +#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else 
+#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication. 
+ */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood 
Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = 
dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, predicationTex); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float3 Cleftleft = SMAASamplePoint(colorTex, 
offset[2].xy).rgb; + t = abs(C - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(C - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches. 
+ */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if 
(SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. 
+ */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are bias by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 
offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. 
Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. 
+ #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[1].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x; + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? 
+ SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // 
inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl new file mode 100644 index 000000000..c875ce127 --- /dev/null +++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl @@ -0,0 +1,26 @@ +layout(rgba8, binding = 0) uniform image2D imgOutput; + +uniform sampler2D inputTexture; +layout( location=0 ) uniform vec2 invResolution; +uniform sampler2D samplerArea; +uniform sampler2D samplerSearch; + +void main() { + ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i 
= 0; i < 4; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            ivec2 texelCoord = ivec2(loc.x + i, loc.y + j);
+            vec2 coord = (texelCoord + vec2(0.5)) / invResolution;
+            vec2 pixCoord;
+            vec4 offset[3];
+
+            SMAABlendingWeightCalculationVS(coord, pixCoord, offset);
+
+            vec4 oColor = SMAABlendingWeightCalculationPS(coord, pixCoord, offset, inputTexture, samplerArea, samplerSearch, ivec4(0));
+
+            imageStore(imgOutput, texelCoord, oColor);
+        }
+    }
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl
new file mode 100644
index 000000000..fd5d97154
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl
@@ -0,0 +1,24 @@
+layout(rgba8, binding = 0) uniform image2D imgOutput;
+
+uniform sampler2D inputTexture;
+layout( location=0 ) uniform vec2 invResolution; // NOTE: despite the name, the host uploads the target size in pixels (view.Width, view.Height)
+
+void main()
+{
+    ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); // each invocation handles a 4x4 texel tile
+    for(int i = 0; i < 4; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            ivec2 texelCoord = ivec2(loc.x + i, loc.y + j);
+            vec2 coord = (texelCoord + vec2(0.5)) / invResolution; // texel centre divided by the target size
+            vec4 offset[3];
+            SMAAEdgeDetectionVS(coord, offset);
+            vec2 oColor = SMAAColorEdgeDetectionPS(coord, offset, inputTexture);
+            if (oColor != float2(-2.0, -2.0)) // (-2, -2) presumably stands in for "discard" - confirm against smaa.hlsl
+            {
+                imageStore(imgOutput, texelCoord, vec4(oColor, 0.0, 1.0));
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl
new file mode 100644
index 000000000..2e9432ae6
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl
@@ -0,0 +1,26 @@
+layout(rgba8, binding = 0) uniform image2D imgOutput;
+
+uniform sampler2D inputTexture;
+layout( location=0 ) uniform vec2 invResolution; // NOTE: despite the name, the host uploads the target size in pixels (view.Width, view.Height)
+uniform sampler2D samplerBlend;
+
+void main() {
+    ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); // each invocation handles a 4x4 texel tile
+    for(int i = 0; i < 4; i++)
+    {
+        for(int j = 0; j < 4; j++)
+        {
+            ivec2 texelCoord = ivec2(loc.x + i, loc.y + j);
+            vec2 coord = (texelCoord + vec2(0.5)) / invResolution;
+            vec2 pixCoord;
+            vec4 offset;
+
+            SMAANeighborhoodBlendingVS(coord, offset);
+
+            vec4 oColor = SMAANeighborhoodBlendingPS(coord, offset, inputTexture, samplerBlend);
+
+            imageStore(imgOutput, texelCoord, oColor);
+        }
+    }
+
+}
diff --git a/Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs b/Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs
new file mode 100644
index 000000000..1ad300c88
--- /dev/null
+++ b/Ryujinx.Graphics.OpenGL/Effects/SmaaPostProcessingEffect.cs
@@ -0,0 +1,291 @@
+using OpenTK.Graphics.OpenGL;
+using Ryujinx.Common;
+using Ryujinx.Graphics.GAL;
+using Ryujinx.Graphics.OpenGL.Image;
+using System;
+
+namespace Ryujinx.Graphics.OpenGL.Effects.Smaa
+{
+    /// <summary>
+    /// SMAA post-processing effect implemented as three compute passes: edge detection,
+    /// blending weight calculation and neighbourhood blending.
+    /// </summary>
+    internal partial class SmaaPostProcessingEffect : IPostProcessingEffect
+    {
+        public const int AreaWidth = 160;
+        public const int AreaHeight = 560;
+        public const int SearchWidth = 64;
+        public const int SearchHeight = 16;
+
+        private readonly OpenGLRenderer _renderer;
+        private TextureStorage _outputTexture;
+        private TextureStorage _searchTexture;
+        private TextureStorage _areaTexture;
+        private int[] _edgeShaderPrograms;
+        private int[] _blendShaderPrograms;
+        private int[] _neighbourShaderPrograms;
+        private TextureStorage _edgeOutputTexture;
+        private TextureStorage _blendOutputTexture;
+        private readonly string[] _qualities;
+        private int _inputUniform;
+        private int _outputUniform;
+        private int _samplerAreaUniform;
+        private int _samplerSearchUniform;
+        private int _samplerBlendUniform;
+        private int _resolutionUniform;
+        private int _quality = 1;
+
+        /// <summary>
+        /// Index of the active quality preset, clamped to the range of available presets.
+        /// </summary>
+        public int Quality
+        {
+            get => _quality;
+            set => _quality = Math.Clamp(value, 0, _qualities.Length - 1);
+        }
+
+        public SmaaPostProcessingEffect(OpenGLRenderer renderer, int quality)
+        {
+            _renderer = renderer;
+
+            _edgeShaderPrograms = Array.Empty<int>();
+            _blendShaderPrograms = Array.Empty<int>();
+            _neighbourShaderPrograms = Array.Empty<int>();
+
+            _qualities = new string[] { "SMAA_PRESET_LOW", "SMAA_PRESET_MEDIUM", "SMAA_PRESET_HIGH", "SMAA_PRESET_ULTRA" };
+
+            Quality = quality;
+
+            Initialize();
+        }
+
+        public void Dispose()
+        {
+            _searchTexture?.Dispose();
+            _areaTexture?.Dispose();
+            _outputTexture?.Dispose();
+            _edgeOutputTexture?.Dispose();
+            _blendOutputTexture?.Dispose();
+
+            DeleteShaders();
+        }
+
+        private void DeleteShaders()
+        {
+            // The three program arrays are always allocated with the same length.
+            for (int i = 0; i < _edgeShaderPrograms.Length; i++)
+            {
+                GL.DeleteProgram(_edgeShaderPrograms[i]);
+                GL.DeleteProgram(_blendShaderPrograms[i]);
+                GL.DeleteProgram(_neighbourShaderPrograms[i]);
+            }
+        }
+
+        /// <summary>
+        /// Compiles the edge, blend and neighbour compute programs for every quality preset,
+        /// baking the given render target size into SMAA_RT_METRICS.
+        /// </summary>
+        private void RecreateShaders(int width, int height)
+        {
+            string baseShader = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa.hlsl");
+            var pixelSizeDefine = $"#define SMAA_RT_METRICS float4(1.0 / {width}.0, 1.0 / {height}.0, {width}, {height}) \n";
+
+            // The pass sources do not depend on the preset, so load them once instead of once per preset.
+            var edgeShaderData = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_edge.glsl");
+            var blendShaderData = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_blend.glsl");
+            var neighbourShaderData = EmbeddedResources.ReadAllText("Ryujinx.Graphics.OpenGL/Effects/Shaders/smaa_neighbour.glsl");
+
+            _edgeShaderPrograms = new int[_qualities.Length];
+            _blendShaderPrograms = new int[_qualities.Length];
+            _neighbourShaderPrograms = new int[_qualities.Length];
+
+            for (int i = 0; i < _edgeShaderPrograms.Length; i++)
+            {
+                var presets = $"#version 430 core \n#define {_qualities[i]} 1 \n{pixelSizeDefine}#define SMAA_GLSL_4 1 \nlayout (local_size_x = 16, local_size_y = 16) in;\n{baseShader}";
+
+                var shaders = new string[] { presets, edgeShaderData };
+                var edgeProgram = ShaderHelper.CompileProgram(shaders, ShaderType.ComputeShader);
+
+                shaders[1] = blendShaderData;
+                var blendProgram = ShaderHelper.CompileProgram(shaders, ShaderType.ComputeShader);
+
+                shaders[1] = neighbourShaderData;
+                var neighbourProgram = ShaderHelper.CompileProgram(shaders, ShaderType.ComputeShader);
+
+                _edgeShaderPrograms[i] = edgeProgram;
+                _blendShaderPrograms[i] = blendProgram;
+                _neighbourShaderPrograms[i] = neighbourProgram;
+            }
+
+            // Locations are queried from the first preset's programs and reused for every preset.
+            _inputUniform = GL.GetUniformLocation(_edgeShaderPrograms[0], "inputTexture");
+            _outputUniform = GL.GetUniformLocation(_edgeShaderPrograms[0], "imgOutput");
+            _samplerAreaUniform = GL.GetUniformLocation(_blendShaderPrograms[0], "samplerArea");
+            _samplerSearchUniform = GL.GetUniformLocation(_blendShaderPrograms[0], "samplerSearch");
+            _samplerBlendUniform = GL.GetUniformLocation(_neighbourShaderPrograms[0], "samplerBlend");
+            _resolutionUniform = GL.GetUniformLocation(_edgeShaderPrograms[0], "invResolution");
+        }
+
+        /// <summary>
+        /// Creates the area and search lookup textures and uploads their embedded data.
+        /// </summary>
+        private void Initialize()
+        {
+            var areaInfo = new TextureCreateInfo(AreaWidth,
+                                                 AreaHeight,
+                                                 1,
+                                                 1,
+                                                 1,
+                                                 1,
+                                                 1,
+                                                 1,
+                                                 Format.R8G8Unorm,
+                                                 DepthStencilMode.Depth,
+                                                 Target.Texture2D,
+                                                 SwizzleComponent.Red,
+                                                 SwizzleComponent.Green,
+                                                 SwizzleComponent.Blue,
+                                                 SwizzleComponent.Alpha);
+
+            var searchInfo = new TextureCreateInfo(SearchWidth,
+                                                   SearchHeight,
+                                                   1,
+                                                   1,
+                                                   1,
+                                                   1,
+                                                   1,
+                                                   1,
+                                                   Format.R8Unorm,
+                                                   DepthStencilMode.Depth,
+                                                   Target.Texture2D,
+                                                   SwizzleComponent.Red,
+                                                   SwizzleComponent.Green,
+                                                   SwizzleComponent.Blue,
+                                                   SwizzleComponent.Alpha);
+
+            _areaTexture = new TextureStorage(_renderer, areaInfo, 1);
+            _searchTexture = new TextureStorage(_renderer, searchInfo, 1);
+
+            var areaTexture = EmbeddedResources.Read("Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin");
+            var searchTexture = EmbeddedResources.Read("Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin");
+
+            var areaView = _areaTexture.CreateDefaultView();
+            var searchView = _searchTexture.CreateDefaultView();
+
+            areaView.SetData(areaTexture);
+            searchView.SetData(searchTexture);
+        }
+
+        /// <summary>
+        /// Applies SMAA to <paramref name="view"/> and returns a view of the internal output texture.
+        /// </summary>
+        /// <param name="view">Texture to anti-alias; its size determines the size of the internal targets.</param>
+        /// <param name="width">Not used by this implementation.</param>
+        /// <param name="height">Not used by this implementation.</param>
+        public TextureView Run(TextureView view, int width, int height)
+        {
+            if (_outputTexture == null || _outputTexture.Info.Width != view.Width || _outputTexture.Info.Height != view.Height)
+            {
+                // Release every size-dependent target (not only the output) before replacing it.
+                _outputTexture?.Dispose();
+                _edgeOutputTexture?.Dispose();
+                _blendOutputTexture?.Dispose();
+
+                _outputTexture = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _outputTexture.CreateDefaultView();
+                _edgeOutputTexture = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _edgeOutputTexture.CreateDefaultView();
+                _blendOutputTexture = new TextureStorage(_renderer, view.Info, view.ScaleFactor);
+                _blendOutputTexture.CreateDefaultView();
+
+                DeleteShaders();
+
+                RecreateShaders(view.Width, view.Height);
+            }
+
+            var textureView = _outputTexture.CreateView(view.Info, 0, 0) as TextureView;
+            var edgeOutput = _edgeOutputTexture.DefaultView as TextureView;
+            var blendOutput = _blendOutputTexture.DefaultView as TextureView;
+            var areaTexture = _areaTexture.DefaultView as TextureView;
+            var searchTexture = _searchTexture.DefaultView as TextureView;
+
+            var previousFramebuffer = GL.GetInteger(GetPName.FramebufferBinding);
+            int previousUnit = GL.GetInteger(GetPName.ActiveTexture);
+            GL.ActiveTexture(TextureUnit.Texture0);
+            int previousTextureBinding0 = GL.GetInteger(GetPName.TextureBinding2D);
+            GL.ActiveTexture(TextureUnit.Texture1);
+            int previousTextureBinding1 = GL.GetInteger(GetPName.TextureBinding2D);
+            GL.ActiveTexture(TextureUnit.Texture2);
+            int previousTextureBinding2 = GL.GetInteger(GetPName.TextureBinding2D);
+
+            // The clear colour has to be set before GL.Clear so both intermediate targets are cleared to zero.
+            var framebuffer = new Framebuffer();
+            framebuffer.Bind();
+            GL.ClearColor(0, 0, 0, 0);
+            framebuffer.AttachColor(0, edgeOutput);
+            GL.Clear(ClearBufferMask.ColorBufferBit);
+            framebuffer.AttachColor(0, blendOutput);
+            GL.Clear(ClearBufferMask.ColorBufferBit);
+
+            GL.BindFramebuffer(FramebufferTarget.Framebuffer, previousFramebuffer);
+
+            framebuffer.Dispose();
+
+            var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize);
+            var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize);
+
+            int previousProgram = GL.GetInteger(GetPName.CurrentProgram);
+
+            // Pass 1: edge detection, reading the input view and writing the edge target.
+            GL.BindImageTexture(0, edgeOutput.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_edgeShaderPrograms[Quality]);
+            view.Bind(0);
+            GL.Uniform1(_inputUniform, 0);
+            GL.Uniform1(_outputUniform, 0);
+            GL.Uniform2(_resolutionUniform, (float)view.Width, (float)view.Height);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit);
+
+            // Pass 2: blending weights, reading the edge target plus the area/search lookup textures.
+            GL.BindImageTexture(0, blendOutput.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_blendShaderPrograms[Quality]);
+            edgeOutput.Bind(0);
+            areaTexture.Bind(1);
+            searchTexture.Bind(2);
+            GL.Uniform1(_inputUniform, 0);
+            GL.Uniform1(_outputUniform, 0);
+            GL.Uniform1(_samplerAreaUniform, 1);
+            GL.Uniform1(_samplerSearchUniform, 2);
+            GL.Uniform2(_resolutionUniform, (float)view.Width, (float)view.Height);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit);
+
+            // Pass 3: neighbourhood blending, reading the input view and the blend weights into the output.
+            GL.BindImageTexture(0, textureView.Handle, 0, false, 0, TextureAccess.ReadWrite, SizedInternalFormat.Rgba8);
+            GL.UseProgram(_neighbourShaderPrograms[Quality]);
+            view.Bind(0);
+            blendOutput.Bind(1);
+            GL.Uniform1(_inputUniform, 0);
+            GL.Uniform1(_outputUniform, 0);
+            GL.Uniform1(_samplerBlendUniform, 1);
+            GL.Uniform2(_resolutionUniform, (float)view.Width, (float)view.Height);
+            GL.DispatchCompute(dispatchX, dispatchY, 1);
+            GL.MemoryBarrier(MemoryBarrierFlags.ShaderImageAccessBarrierBit);
+
+            (_renderer.Pipeline as Pipeline).RestoreImages1And2();
+
+            GL.UseProgram(previousProgram);
+
+            GL.ActiveTexture(TextureUnit.Texture0);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding0);
+            GL.ActiveTexture(TextureUnit.Texture1);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding1);
+            GL.ActiveTexture(TextureUnit.Texture2);
+            GL.BindTexture(TextureTarget.Texture2D, previousTextureBinding2);
+
+            GL.ActiveTexture((TextureUnit)previousUnit);
+
+            return textureView;
+        }
+
} +} diff --git a/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin b/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaAreaTexture.bin new file mode 100644 index 0000000000000000000000000000000000000000..f4a7a1b417766c12bbac4e4bdc56796f18538bd6 GIT binary patch literal 179200 zcmdSChkqN_mHs{GL?MVqfW3FHfnW!V2!g%$UL;B)B~rcj*s?5HvMgIJaxaPFIB^oE z$4Q**W;dH`Nwy@L&2IMd`(NJY+?hck0nA9^T7EGfTO<ZE!~4Z^&bf2v-g7uk;*~A2 zUt;%T9?tD)?P&G0|NLS9($A5<2QGCi`6oL@`~&_ze{V;RzuVu1Ex@+Uj!wU@sjua= zx?8JTy{&bv4Xur>P1u^T`PkOdDsFqO$-AyZ^Es}aGe~|W&IcazaIQlpzkNAFWxdq_ zZ=fzv-`U`A>}>Kk`J1u%XzTF#TlkGzGW9t-GduG36t$POm$z58RkeBAYTCTmYTN3t z)weZZlY8AbP3LzV1VNSLr(%BKF%Or`?}%~6Ig;6**IU$6+U*Wh1gZkn0Z*W&)7x3w zS?jOstoPUZ8~hF2hITS>z|oT$$n4DN$nPlHQ@p3N-3^iz?Un6S?bYqojOVWLJIO6_ z6`Ud_Ac-B1c{s=0F4<C~y>9>gq;@57+A)$kl-Zxtm)}#^UEE#TRn}Dwl9hp~&T69B z37Xsy^@4uPJZSHAb*F)0wm;8b&{5b?yr*PO>7KG6Nx9dRXg<fC=MHndT%P3TVtn8+ z50|aKGphaidGn-w%r%@gnAw-zm)A=ai@Hm?z%k$s@FXj_<H}Xtv~e_P(AMYdN$F1S z$_!+8<_Z)^jXcL))89qz1UJVubLlYvNxc1-hs)-7PPw67G0rAU+QyPcQU=opGW)ap za(e}eUBz7jNA9F@zh)s}(lnYhWb1eIx_VN1hJhS^kRsH$Yy2*AXSg-4pR4Aa5~!l? 
ze$2yV^E<EH)a*-GFwGD_$FOTCl`+if%jpHh?t*U8VsTe7cUo~!y{wxtOeBt2hHL|l zK4))AcUl+eFe{M5l$Z~T-bH?7g42j1C7eAfqPTBfW)9$y50}mFg5rq!fNt3^pE%73 zjyQ)?2Ga&ehgp5u#1QtJ-(AR^Q5;h3)y^kO8^;BL{Z6LBw5|-sFq=`_HGWsP3*1p| znH%65xO~nM7gm&=k9j!wg~tssj_}bV)Gze6UQ`@Yt!q~kmW*?W)0PSAm}4}VsgN=3 z1w&qm+*!q@YE84Go7GR6#?8>6b<p0Q+~?{|VU{d1lzZJMO^$r)3U`q^$?fMRIX`ST zn=?o5mip~650}mFlH!E&kY-J{qF*r0nx{Z;+zx`I!qmaE{)~alKE{wc$89MOs8^uC z8N;M$EJP4?OqL992!>s|#_uY3i95|5;ufGlALm9KNt9w1zW*@~m(LG@VN<<dyEkFU zFmIX;5`-PQhEfL7uo1&d@H?tJsNSnt)XnLqjT4DuNh6kFXmG&MpWG{`5Da&b-(~JB zcZA!^O>hCOjw|L;Ia4^H_&;Ce9_w(~{4R5+lv}EUP~ftD(Ku_G5eYg*V8??@g~Sm2 zjwv@_!OKj6Qy@4RMv$TjRG8YmtNd_H?f`l*gIpVII3JPN7@t~bFY$Yf!ykg*Y2{JX zhGswcK*PLoHZiC{c)}sFW5y8S>^LIGe)X!BS0KD#h#)+npu%0`cLmox!EGRhjB{OF z6IagVaLID-im!5yb+~MPR}ff_DG#X+X!ns1G|rpC2-=}RiY8!4ZBZOnkp(Y7fwOv2 z;JA4-X_#3t#S%mlXYa1^yTF~|Ho1M=3>3JBtK~|$Ec9jcA(h4ciTFLn;Sa&@tm3%x zuxeeiMhXPMInzwylwidph$UbM72=OMq1b{4+^1P#3Y;+z!7(CeB|pe}!v0<5cM*N) zqp;uwZj9?j6shKl5JzlWLTqm09^&^Hhd&g*6Yzj!!F$OE!iGW641(5i>zHlKK1!Z2 z8T?Kvj*th01uy9qpulP4q$!M`jS-a1FB?7MgXp20M>8*DyMpa%*mi}R<&MLJT|gX; zwqFDl<KJwF^GofQ+m0*TImJok5!E5}K`nYQ2^2+O!$dG?(mG}xX9P!y-zo4rq&ff# zUJfd7GLa&&pg|%i_qu4d0R$Iuk8uJjJCE~U3?oJK0^Ah04*i{sx<3Xt@okL%#XY~& ze!1;{mj%D0%z_a^-~$&8<OOG_FO!7cAWzWF_#Fj5-hxF1GB1cY!V?^_4a&VPYo`h$ zRK19=YoJJ+PNT<iPGET)^l}h)N4fc^`$bSO{$=BTan3KbU+#U>Rq#6new(o11F+x~ z){kK}j7U5Q4e|uR?=*Ta6hYQBt5Dzq`hgThreMPqi3NgkuPZcLff1@^j1o*E%I-&R z`3QO}ClEExfa5yoCF1Tk`u-T)#J7d<znJHj+Ap^q;D@1sW8?wV2Q>SLpnj3!F!co~ zjtB%9zvGHc<vRK>dl5r;1yU3dy&xkf_qt*`@ozffLObp<CJ`t0B3f)9<{riMjzW7S z7(tD`KkNk2eheG`i+a4&{@f+ghW3d1gz}{FwBn57tl}KLQL+v4G@<RHe~)X*w3={G zb69;ubxe6ec}j6gvHb`->XN)o?sX&D>;$I<+~*G9USkeQ+y{!6V9)bx{7->?^UnPd zC)oK*#Q0y7<E8ey`<xSr3;I>ve$Bf2kZM!41tUGCJgzu_?Iak15-jq}jI*LV-Dm4c z95PNM%;*+0%bLCFHT412x^hE#NV%!pQXEknRUDH?kGmxIy4zZ8$K6FWdME+(^v2QO zTR_YWAOG9Af7=PR{~CEXdX3x8FSXy(nK|GXGfx{A5|(v)wQHIK>UH&o>X7O%NFKq) z(>%V{<V!Cp$*gfUTRY4>rUAo<enK~;oz={17B$Q2RW)d;_A3v{<tO*L+uMr~s%(rB 
z)WPR<f#DeVMUMY%-@oGok#B52eEa#O_Io?>x>E=3qe+vd8N+<Sl5R!2SF@(suRe&4 zI1)*yYhBl6tH~@X%BoDMbu?SslLDq5<A7mEKc*W8#~BSHsa}%HPwsUiTg!ek!_yT* zg-y8E=w`PMk>h`n`=gv7{H-YCe-Xz^?XTNY9LVf-4cbPNCK9KOv-<gjMcuM?6%_Yr zV4sZRx@u!nGnD9aR%R6zX1mio&U$;Zr7g*y*k$T5^cx2CBRb+ZshyU~PwsWY+Zy~i z5o0X*xX-PHB}W^;3*R3+DEbf44o5T(o?mKzLwjjwZg*OrbI3Mo8BZh?&M}5d+La*1 z{ptfp)f0veYi)`<yPzPuG`-U0O>VF?TUwL$BzBs*OudZaux>;yKe^Y9-fE0cInnDY zg$kn&;Dz?bIDz<f^ut5vm)hUhUfz)($m~h&cMRD^EaT=$)3h<j5K1I_h8B;jW)phM z%??jmDG272WK^VjoV5;~VY}IH3YfZ$(4t&^a<414bH~3%jIm_lJ~!3?p16O<fst>- zIvgWl?EF&uo7yWoiaK+;(|cV5jNq7gB5}${mOQ6lNLbJ<F@~!rlnc5+Q@gd^S(#p% zQ;?rilIc#Za@9ENY{al7somUR@|(Kk@{@br$ks;uW<az|XZ)fM;PLw-4%zujobf+) zywv`twyHfP{=6<Iu+IsC))C7XJmI8i3Mv#C!XutiE^9^%fuv@;H>Dz@I43_pr#Q1L zts<q`>2=iE8X3d3qz>8qWL|e?i^Ja0V~i!<03Pj^c*Q#$BVabE{mpHjJ*EEqKsFSZ z(w{tNCxW9%<G9}^6$TleR_xPEB=nfttPRPYRCgvLSd>|oUY=UzgeBXFp)biNm!I70 zN;E%#@rylxCpm&ed}1HYB}naWZmVf`cNBK!c4c;_^}2}QFi#M6Oe!Qxh7Qjt_N%9L z14e&Rv#r)ym0Ffr1P$gEWtOJ9Q!Amun&dhg7|Q1-@46EFIL7ZW2Jnbayu+pT``WyS zAjST?Kz3IKY&eV{c|sz{Ecq-Ri_dC@3|)zBmIg<St0D~qbMx{*uq4BsTAo_%WR@(K zpWN$8@_WJ`pmtQi0~Q2wm;zJC3z7zhVa4PL$&R7IbBcA<yk<<_YuW<^);X(F%F|0l zg2kEcG^o&3?eshhKRMTZ!gwSv?`x}VuiR5g797YT1(FvO2wG{hi98`xcuujQT+)o| z`VF0lt(Hcc7dGrpFA)f4!4rZYqKSNd@~$h{fG7NcmbSX~sy$^LMX+E}AfgC)!2vv{ zpyw4L!HIL+Vdb)VN;hce;uYxdI4e`#8O51JdAWJHg&;_t&@Gpr-0MDJJd&5Ec2u{! 
z_Z0gJcm*Pgz=kP~gb2bDp651|tLho;2rSrdZh-=8VZ#-O#6?+!B0+e<hv6sZx{?ie z!XId9s|UaGJtZB5EQUaVybW(7c%D0=+^e3|jKP8frgrjyjv8lms+%Hlh+wgNe)6vS zgz-pTz6EoUJ%}Ksu;6@HFnk~>kZiagab(aYSTXTiQ_pKAV8Pv{j--~PMv5Y2!)b^k z0>SJe`TXQvSF!<5_yfeR1`(tTeHgOfZ1RCUf(=s~8R7|^=Z-24s1`Mox?z2<vD4I+ z<g?Wyid4H`!xTq&f&~x5PtJ9pFdoUv6F)CzcDn@&<`tOUOK})pkm3j<c!4{nIH+7w zPiseD!Cg?Gj}(X~Qk{aZL2)Fz0Dkac_{q7hWCNb?2ljDW?13OXfaV{qh8{y-!Qig= zsQM}HY3@4r6#k=)eh%Nn_pjrTe*h23_Tr%!J={6Y9+F}B9}gVqk)!x%<vc#MhAYz} z-Dp>)$K?E#pD-TD%MTi7v@5DLJjB>!4=_%E<|%sM!CHKlJxHQQNl(k<$L(>s%1u69 zmu5&kp_)-HDwh>|6>Exvigm>X9z1S>+7XcL$Qd%uYL_wOPgmYj98-i}nW)mksPmG4 z|C9bePx46Ow0=&zqS?nD2C+v$N<0Ws&?BVd3ZCX^L35(cV(X8aGL@&N7u#wQn~fa_ zJ-R{7h-N}Pt)5lQE0>hZ%2g%N+^^^;>v4`GPU;ufmG|Q@RQQ$Y+WeKDG#{z!cV+fF zM$MDPISkulz!?ud@JLOyp*&2F)Ue_45hxy4@HQ)h-)e%-R+5=n<gB#Tnww4Sh5#rI zXooaonsN1%dPY5~o>wg@{ncG~a5Z9{Vpm?qAZ_@SH~9xwxUSUS|D^W^@_JK;f{%}g zA^-S@Q4DfCqLj_=pr$La##xk=RhZ&-c&znF&89Y^-_WJ+(e>*FwIjq)GpX+I(v=78 zW9-4zT-bvvy7Ic}knjyYX+Bcd?=0%c>~{^>&<^(aXo0n4Rf~s5f+C<sw1$rsGW2&? 
zJ!EXQR;CnYXBVWEIxFp7YeQ0VVykJ7p%WB)iK1>;+fheX?sE;ZD?^1C?52l*{FT{* zWwL|ESlK4k^4;0rS=yD;mo|_*BD7=HFdu9QJ%Zbh2f_zb2UW899Z`)ZbeL-$?z93B zOfPX&BztVN7J(s@7%+50iyifyrQJEbX@l&_?9pG?l~;pT-knzd-<_q?-Id>)*_Sfp z071)ydCD|noI^_%NeyJpyb?9C`5jZv=z2`eR*$nZy&xwiKcgho?W}Ut*y=2eNzLXK zQ@g1H6gwK|%Fuq)D=+Kt0F_-?VtO8zogIIApcFZ&>4Q?|@2m(Ec4zmd54eULqd|hR zMyL=B;Sc$iXl3)mgC@;@p*^YIUYSywkq?5I#i?bk3TL&=%NY8=u+7xbNLTL4?qgRT zv5Mave`T5s!CzTw`XASBFvOc4D8)~5`k=)5{Z)bD?mXxqb<jC%AF+;;1{0@EvrL6x zsH2vw2>ir;i}){3HPo-Z&SCZ>9yBEcOwAT=az$!MCL>ssR+dr@6~dBjbxDo5cj;)N zD|hAfvMb}ei@5R}=63N{ruN5~p2uNsYP(7mdwQT0KgsEX66g0<2THs0yGaKrgUN`* z!FIro$*T)3;Yo?y#DAmm+p8Ye^%&cd8f+e?JGEFKm|hAyu5?!0Vac|7;)g2-@_U3U z4@X>ihSxtB^6if~J&!|oAM1Y=idfSFrT9rsACx%1za~)DRoI;i9i$Do29rbWfG316 zoF}gihO+sc#)GCQZNI^v*lekFRJqF1iZb(abMrEb(%}iA!s=wQ<c?;#GJJ<{<)Mfx zlm7WDhuR-?dS2N2-x&KtCy4r!<n+Pd1i<L$_t$im2Z~@vqyw_yWXz@KHHcU;P0@t4 zL^i)O7^a!k4C%WtMAKlaaaN?1r59!9F@hPz7%yZhbPzwfayPp&^B!SW#xxS)%HfTU zGChyO!?WP}->Cb;PY~s2$?1a<=l6Tz$4d|ib4Y>xDIn+w5u8k<c0db8+5FBb)|K;` zQC*K=4-{But9F*BmIeuC!V{*t5lxUWA?(WNK}B475<Zx}a%7WtOwZ#Y=l@07A9;cu zzeUdPi*kI#2X^lF*LGII1CkwO^`%1xE?AOdlr(4^PvT<<wL~_*bKHh<k$=!+YJ~#5 zjw*P;k_^&dUT#()2*MMlfM3{^5&t8ujNUwpVlz8dvhA4Ow{3#nj{Q4`MZU4^@a@z8 z==+J7U%37JXhjwL5bUTY8~s4ifeUMi>^#9xJ061HA$riH9fSq9n@NFq(1amE3=n3C z1mOu8zpBo%F5$}XzF}9!dI*7FWNYcSi0OG8n?4XR|8Lv=ohOKRXXpIBZO8BUK;(X8 z&ZtKAg5s_MQebZe)&P<v5kcE1t?Qt6z!UQP#Q(>MzvuV|O_~v1ufcDkVg6bN*|0mc zD6=p~Fa!K><xb(sJz0I*u1xBuD@Qa|_$6$59`W0;{x@QOv=fBA7v=Eq>3@>v_t%48 zIXqw?d2*(})WH<kFnMzj6x$(^i`1d`u6SRJ{vWOXjd;)m1>!*yY?vZ(kRbTs%7`HB z%IL+A_Jdc(y^wI_u%?Fn7&g5x>IA*8{b5H%`!VX_q3M4#KP1Zf;?Vh#wZ(%7QVKgl z3<>IhdBG8Ts2!2q;^+?#niTsmRbgB=sP8s*Fa=T+sdTwn9N`IOceI3G8F#{>0@;;G z{XD^FZKn5v)B9pf&<pO5bpql27>CO~KXTsGfL|FhXykY2hIGJem?E*z4pC{59L0CT z^hXbx_A3|EliFd#U_U)*N~%K?sdnN)6N@8-+3<tl7jb3up9BTcm2p2JTse00h3S1U zC+NwV{s(!*TK_BW{BrYO?Uk9oN!=<l|CP+VQL*`PG(C@>e-&f>uZ-z`;1_%SublJC zT?e!!H-FO?n*R#Z`{GW}lbrq+|8QCWD{+3g{AA~E`l9n+Nlnm;Ct&Q8JY3=fa`RBh 
z&foL}=f9E~pOe6c@k5FA#S$lwt3TQKo4)A$R}$lM63qv^`0IbAj+e_%cK)U>I{%f# z_?%SxrPlXK9WR%k?EFn%bp9)e@j0pXORevfI$kb6+4-Bk==@g_<8xB&4;~(CeX+y| z<gTOV^Ea{FCGqn&ebM=^B*y0?njb!V$UVyXVcGoT=WqIA^Iu7f&q?u%IGo$L{#WXF zx%?g?f72J6|4L$fPOAOe4(E0b0LDLqe10<XH+|9huO!Cj;<rA^@ZpXD#`tHD%kLrb zH+`}Buc8d#J?{8l<N)Aq@spXq>5I>Q6?c3te(UM}Ez03s_yA!1GsxH9S!7U@mA~nW z&VLm(f#2hf|AhtscZ;9w{7t)?|0@2UPXdZDfcLoLf5;m}>x(5`^7Z!+`I~k(|5f}J z(D=yz(fA)fz!?AXa`{E(Z$dvhcK)VQirvnC6|V`zFXjT?#~uHZ&VVQP{_UaiH|>7@ ztJnl3#^+)m$nSZ~!;$}w6v(SE^zY96#aHnqpGF47EhL0N7F5bYDP#c@a<B;*u;^pR z;Q4=%|0+6)r08$UeYZ;P2V;Cr?(>g>*M>Iw7JH_;#=3?BgPr}IeVx6XJ^mhlH#VvS zBGdwzxmGz_JJitE)YIJM3-|&powNnZ6VTViwl}T~&GrrT^mX-gbp-;Qfll`F^REtn zaEtM0hu+TSG@)M}{-+i0@xB1L`un=`bpF25xr&ME(VF4f!MgtXzJ}h$-o~D$ZrYl= z*cR~RR~6QkHB>fvn!UbSUtLQBwnm|j3SUWuucdOXc*QeaH{LkXG}t`g>+|)t^tSY1 z>u&9C?P6P?l{QHtUgUFnj9L0`a4V1X1<2+13&l(N^VUt*zKq43+5D-(@shE!5%+NU zP~~9NKy|;T&(l}a>+SL873P-}RhD_my_L1q^`3^B25)0+V_j2yQ+;y-w#Md0UlX>A zX&VJAW%HF&o(b<*?MU5F{b0jD<3MA7Qy;e8<{oU_zHXl+zb38{<0g;%`Zjk5YoU~n ze7xM}AHU_kro64YXgX%yNZyySoW77bn=_p^nLklDRy0~NTsl-X<Q^&?EYHi&D=sW6 zE-$NeS69?jdaG+awKa9#dT)JgLv2G{V|^n?V!Px#l6|0HrDVZ9TQOBN=^3vX^^VjI z*A3MT)(_P8H}p4R14+q-wIG*R0cVx`T#OI=kb4j7p}dbQVY&MIJ@+a1y6UF(lHrtj z%ern~bFQQ;rp;%}W=-cz<xb{}7mO8-7L63=<>eI?6c?41xJxVCRpr$co~oK^Z?%_E z<T=*Y)z>v#PCk*ok+W8?QoL9?S3W}wt0z2THKR2nBFDM`c!?Oz4!zyVH6jyMChl1u z>Gf0YTigvK!Yg^CqdD2}_usgma$n`{t8Qp7>(3aEnGahJ+Sig-T+69MaVB#*dn%ha z;#H7eR9I3}3WDYCit?(8>Wb<rPc@@h<E^RnGK!ZSr&71F*7HEHXt89zY?c^SPE<`) zk9)>EBSevLlx$WTawmGYLM{b&T#xYjIrly8RjhOD<#He4X!-p9$o&%E)LV)dRL^KG zC!95$Fl{9rvL3XpIaZy^uBFt4w7K+|jOi@!%g=`b3rmUxg6@h6o?$i5(BrM~dc9Zd zXHt%49LheBx36Fo1Q$x@%4T?mL=g-}JtN>K*{C*-EZB`G;*ggyJMxUb;K+iXLKJD` zG9q`$`c1C={)ziL?)y;SYs%+U*ELsk=k%wH#}W@GZSV|}SDec!i>dQzj9)=PVPSDm zDI@51S3rf8#IUlus=8WG;uZT@*YWhttc{!ld3!-{33dz>y2BWfJ$ob@LHruHDkv}= zUQqgmW7`Qo=YGO{#Jzw?z~lHDQn&4u{#EY%+kbNZfGd5>z0KWM-c~&gg6H+8NrmP^ zV0gf`W?yxHp$jQ>@?pV+1x1C$#k>aHZrCwY7-9&DS8V5;CsL0<fd_N;=L-Z&7RqMJ 
zX25W|a<X!QHtFtd8-&xu)o^7nz!Y@mgc<wNE$2V+Irka&3HJcQ5c^@n4<E|q^7}LQ zJMQP)cc8%6759|4K=7*We8OqNN#jw|CK#>@48e~qm{(wkAXHdZQ69z+{E|<m98KTM zgaUK-=dbb_q-at$6UI=ILp%62Ba>nU6qt#ebBUZ*@|ds>{F3___g(H??m0FMy#px) zAFc?HTz>z>{S!Rk54dk|?{KdwUQpg7g4zoSXJE(2O<ReF%^Qqih@YUqk`nmA5_cI? zSVk%o7+$fRPd=4$Jncy4p{#WfTniDTXcA-?*3{@fwj&d&kE=%}#Zu&mPDAre4@2;m zP{sGTuW>J780;VtAvfb|kWAVZ_Ls=z_ZRM8xWDCo46T2adlNQ%7X+Uog1WP0$D~4H zXc76rg5d+14TlkAcFY)FwOvR)<2sRcB>iwEq6oa;zWmkv)k65e;>8j{h0tNxpBnu~ zo?injxE$FOvth$lL}J<Zbid>%f_#H}9le;7+!EIZFPMjiNOE8D`TZUdgtYz<)c!#6 zqVkUNhMEXo)SXQ@tv_izX4*0xHV65U1ydA(4HuQ*g&rX^$VU?p<oTUUJ({+ad5CP7 z;s_(SOa%F80u@TOgZQ<e4_J#BLJG{l-G~KapE3#l3jBVIp2a)dU5w;!AQ^loq?|4L zu2?R=|K|P`5#$%J;BRpsaBp$1zzg11J;MlI(w!qaW-8?Q@g5KaDT<I63@6B<$yM7$ z$64pelw-UCvkrnF;z&3_M3XtW{F)#e4|*|0Tn=p5f#&NuwIq9d;8z?);P0SKFC$06 zNp6)JfkUl;D~&pJea9)~^ZPgMxA1^J#<fU+Z*ceF1)mEM)PW#W$oTOQL{MN9f>0sj zhX?`-<`qabOmQTfAXK>H58nO;>5n1^S#SkzBl6Mo6nMczc?5~yPjPScby)Con2m54 ziOu^E&`Z%Bl5<NVpWnYD0{;!7{iobF@ol}$y#^b;i%3iwyrzW)$rI{B{3r(V3WN<8 zGaC+BvHPm^qWx_0=@hbH^kVoZ!pD&af>2U6Zcm$FqpcvtZ|#_NCT_EMe-K+Q^14Uc zzn%BC-+e}%3!bwc-vC`<$2EF%Nl8$(7ak(&C*<<`AMQU8fq#u?^#ktP__p3f6nPmo z{5%L==Lu@h!xIMiQ6FZf0)qsvS})npA%ZXqW(s6cM2sW(tA)5T#)jr<K(G?_$+sY~ zm83n&r0wTA@DsIg`~Gdbw*PvJD^@CP2)b3m6?~EBxP~uEqk&f~gMXmEp$XAmyb5T` z=l4hMpSizh-uAobM}Gj1`#Q7XJBY+Q!OI}HonJ7Dh+dEpEQS7v-&yb@3uX%Bqlg$s z1cI!+R2iTEDxnrMpasongV>5e{7wr8tH3zg{_ViF{r(;bVdQaoBhHZzMpaN6U4uB$ zRa(Bz5kd55^em?~eWC`js$Td+v)uc)|KW(=KfnY26nEU8a3A8X@h$WOUxF9Bt$a2} zP~=AuB&a|>ibN2+#`8l2IhhK6=tq+cQxriQ;Uh76geyhC=33#`3K1d1zh7+QAhcJ4 z);VMB51t_U51%4#%;MI?oS-zrZ{{E~JYB=XID*m-;Hi#)U?v50qMVJtq7<IC8XUn| zF2BEm-@hP&{0e;-vS40;Z@`Ah3qGg3ArKV!Q3Mezm{%Z+BE`j}EE1zfNc_$xpG6;< zw_sj@f(?st1O&rcorw0)UtcEt^|eeXy|COWw2OngkG4PT1krx{F_dr-g;{bqn^tE+ z&xg)YfN!K6dIlpJ-@1iD8SQwJjr1E4-O~Y_D7v{yZ(d1WVO{}lTJ!17OKd)tO0tdK zVcR>J&BQTVS8}VXF{Li8CcQenDx(5hd8RwFEUPT56k7>4@j$76SM1%crZg{|CDYB0 zFUc-uTTymVPEk&wutl^v{BJL3Dyf0`TG3K|{4d)6h!gDm<!9(m+(W<54}uPj(V$L* 
zv8E!kc|M{>DOXN+CvSl80tlAkYr+795_dRu_L)4cu%V=)uq3}Izc9Z*peT?enqjZc zxfc|t)T;?o#=*p%B)_%I=Ce0B8j|aswN9_ACdHFdomz!WikrCqn040LZ|_L<IU8KH zDK)9pX_aXe>E-F=8Sad-jIzv9Y$e#XwOEh4ixkkxW36*TiIL-fJNIup!S-J%9=?r7 z$cqRHIh<2%PSBZP$;2m<R7Sn)uW`5GC8@@3IRZ%%bu86jXyaU5Mq_z(iMyz@u(+V8 z01RnkS|p0$B!3H?KgXR@uB(@|QwgJn0aH(6z}%74W@)whY)!UCdxN7cxy~u_k^1+v zb<H|u9kd1PdmO&xCTG2?*5wtx1(3v6p6*U}?;Mg#Km(G|N6uypOTaI3{BQgI?ZCGE z{yF%3h-V<jFn-klf-bd1XE10IaTNzKq~77TxYrOxHc<4Y9a4u*aeJfSaF=c3tj$$5 z?#dE(aS7e5QirX8F$71v!btuBQR7YSCaOqnp|H`SW=1!rA2#$Gdr^F$)4V6CJ*hRR z1tg`oiTlr44_TM3lhz??kFC?*Zf|ikB{w?jopmlj%_&tWRjHL}+nP(i>fq_@P+<m| zlF#@>j{imOk8*<Ww`lx|_|W*(AOdR+m!h(2O?sV4WrjCQ7RP7qU}5PR#uwV*6!UR$ zGe1b7^VQT=RF##Nlogj2fnXt2xSb*+`EQ6DA9AnYF8d;$qHn17GKLenQT-rFR}jU3 z$)DJfB=V8^_q_E86u4lWK$71+TbHeaF>D6K27w|tx;!c2Z4Lg+zIiyKG}u_Q@xSo> z!Gogz!1H+rwsi_N+>M}`!=<Y1T62Qls7X|q`SBTQ`O_HXU*?8Eum;W}17D#LnN?b9 z>#98Em5d;6QyD{^Ae0!U#XlnQeg}QN7rAHPO;2Ki!2uNQS<uXCCs8_Ogr{iiHc4?4 z_g}Ogx2{`PtaFTDzqK1G?69}lTOB?}lcSNR=&W@`Z#BMrBh=x55TcL&h4#ldf%vvK zesv!Tq%z=q5$05tLz9$XH0Vq;KJz2oalHi_zl7|C3)~Pc=|zCf#usSjTI%XOHI-Eu z-gIM#1O$tj3Tf<w*P%d>#_xXs1-_1Pit9KNO%hs1M%fi&sF_0XnNeg$9W;o1r2f5V zJ!w4*1uj`<uu^*1+HdW(1#Eu%9%5+sIiN&FQEcaqe>HGINf1G-@jr3@jsqj#;K#2% zMvT3QoRRyOg;s)1nlf2q(V6u8_`<ij_t8&$2A?>H-r@i*TFVB!Q@NJ<h8k~Gbwy=3 zLEe%h82%Z3XY!=);c3+!5X8K7<XAkUJcz~S%gD<MhS~`zQHq<m|C05Tb<29tx(XYf z2Eh^QfVJ1!1%@3F3?o|`@tX#{=0t&CwDCWFf5ag>f8ockKEeIyJt*)9Y<Lg>se%!7 zs;%14_`(}#_f^>N8dTbk%hn+p=W{I$jo#X7Pi0kkMOk@BgSg=*9kLhG;h(vGgpz5T z;w|nz;`!4ccoq|dwxGiOs#Wz8uS1Q<N9x~8)-%?l)(t3d**Y%}9JKZ!ngnc}%o{Qt z#ve>qgJ8VzKiV(x;`w|Rqr$Ipμw<5_SoZr#d3FkR_X{|a|ppP>)<F?`p3c>nXr zKe~z%eto!bJvg^EHrCd8m;%cy+!*2%tQdA2vSiruf8p-!R~V`N7L@!tyx=VmL`_i? 
zt3AS0xCU=X45he<`!8G1!UN(dvUM+Pcn*CT(jX(qROp8{Y=<pNG+!<7i#7fyIR;4l zK11K)BlIrrqL*@<6$0<V?JYeLO;-}XpP}8K;Ew+R6!;ZPFWbO4^@KpMwXvzLzQzj$ zR#)Khgh;RigT_QKXvzP@{T}1@KSd1rI=+e5p!QpMYDxr8!HzLqO}SsW58hDZBlYiP z>pANQSnxU&xME#IZ+Z&ZK8PSZA)*PXki4M;KaTNxtnokM^8-YPcLjYNh7FG(j?}}$ zWkWL*VSdEg{T7~*JPY4{Tqrf%i;K55HPzR9z121311n&|Zg;60k9_dr2?`AV0)Btb zp0#`o1>(LJ-_}h=5Op!hj(HU-rMQXviQh?B@FD8~roef`5vqebf*#=j*)gxehk2AE ziTeNI13bT4C=idgEAWVso>7Pdal0?P3je~!??1;k!H@nuz<2R{kl-0+$D7DSEb@{1 z_p<doBFHi8CR5-l6ganoU^n`O{}*lE&2zRkHP<)P)<J=uYNkN4;V^=%M+1oe&C&S% zukftkQ#SU3@n?9!+t4#v@dZTolVryVDQ@EaE8uq;9&igWgcL|NJcl?E(jc?rN1j); zn~dU}tF^hgp|K7Y?5W`u2rmd54kyU$_`kV-M+E*`<|{sd4}1?c{0h8ah~QcD;Ez89 zzYB=KCvX=+F=P$>m?iXNX2S@QC+z$G2S4(FWWi8i6|><8g6LJ37W3oxf5&>`H2(Zm zj9NT^$9(~JJ~x;KFQPwliWBuE_3src@gono$rQMcDR7>}kqPDpQEVOr<9+vEhJCHg zzD9~5wG=}{1@c}nq(KlQel&hhPy4=yUd-1KMc#xBzl1pQ9D3$t#TOBcrMQXviQif9 zI|d8JJT543FM5K!7o5U99Yqo%_+?-e?{5G<iooOnJwXNXJ(+MTE}`-Jf8l-u5BM3P z{dbrG-(fcVB6>x)1cH|yg5O2(I|UC&7L3_5R#G6@FnPfc!FYfBmtkM4&xZ(tDWS}Q z5kt_AVK3ebx-oTy`ZD16C-h)`3lB)Wp-%+`z6Be88D5YGUS}E<^(FQ173)RxU{1pW z@)jIYAlWbx9A_H*GBAqwHxR$3`Uc*DDTeSqQ0&PB`_shl4|oFnYxcbPd+5b{13r+R zV!jH3FM!}pMo@~IxIe^?x8QZ_ekgDiHoSoQIX;dM!FYfBmto)Qu*ZLZW&Rud+n>1q z<o?Y47q-7}|IN1l;rQ*Z-2aog1#oxrIdu95obiu1<DY|P<j*Od@kR9Oe~)X@^=Oay z{$DwM`(KXVz6^}w{S6z6JBS3|K~(r0+WHL?_D`Vsd)EGcNBjQBwm-?F_y4#;%_h|N zA!5wW;4A+Q-_1WF?))Cxzk(>8k<R%i?kVMm7&-r2kp35TE&kpA4EqNE!nnr!+rJF^ zru8S4HxWnQ#~tAJ(4wC~7k`6x{{}51mj8^6c*@mZe&U$o6e9AQVE7$GnxCS_^DA)r zdyxDsSkf8kd|%N%Kn(gJ_WmvQ{EjjH0~m)0lQsP3mjO}SzhT6BKzCO4jN%?*=?Cy> z-@#bTk6_ck07q)KNK!7pJX@D~75$i-=v}`9hTlT0`5`F&9Qx+Z_xJ3(xu$<f`7ZYU z0N45@<4bgTw#1RNN-V$ZuL0MRX-j)vbzSj1TKpCmeuP$imr<lv^Bn(1w*K;xTN9>~ z>li1yf&07Hai8*aoR9xjegbv>Lg0AKa990?;;Y#AefW-_gYB=GMt{dNN(v<(@~>ZZ zd|Y3~jCH@^xb}kTY2|J7h9BU*;sdn#n`rZ=j3TxA=d$_bx*Cmx>Se_idehIKZ~ZF1 zllO7Hj}fsyh1z}uTA$&3*NnF{uP8~Op9p?~^!e+c-bG9QvIFeNnM__WAJU)HTm->e zu;W)?$?t;Uhpf%tMeE3-W!rB`U1E=RO0{2c90aeUUvnQSd`DpT36%XkM)8{IIqiMb 
z8;bX#cOv*BX5-{Dc!uOHNsC|hS9kts%7S&jaVz1p<`M|rgdM*G)x0S%{03S_%8|=2 zH?1bgpD?OkRBm84=Oqxl4Lg2K_)bWNWX+#)*Aj2(?x|l>z6~4x2CncucuC&ADSi<{ zKH^kiWy>4b&-+la{?FkrY5Nu1e8yw&5udWiM?N-k@2@2BzlmoacVQzR;aeqdwfzh| zx@)i=ii7;#@N3gw&{a5`K4V`sZy1j0&N70}D(@&>M4$Kp&PqD`8uN&9`Q@coTiXl+ zni<tz#U{oRE@Pbk7Vc2!d~c(rUuCxZE%19b;VvxrO~reH(#g7c+a^mE8FF~YwgMHu z2kt!M&zL{?h0s==t5|sqXXm5+kKr50+lYIADar2$o@Hzz+P#ieMmv8}QaeWV-)4UL z$DqicJw&;yWGHjWxn$jM+SH%WoddzAlsA;O6)(W<US*c|9%Cq%Uv7Gp&1dS-jX{A2 z6h|>G$P>H^Rg((eh9wh2@Y6pJ3w{6v^8S|hkNn@rM;T&BFAY|h9zi^M6%@(ZDZ2d- ziu(!L!<8#-d3dIH5#Q0<pvcEQ+WS++QVQT*&?~{*jpMP-U!>8r<LbBYw|`AIJNa<_ z>_6uMWdk`ADf9NdN$bWV`coiynGw9DxXV=d8Z7x8{M~Z-<z-aZn-T;1Va<$sRk^M> zhEc66xHG<q(drlBTOY9Vz0X}U-_YODzNmgx`KIz+*ziX}#0o_iKJxSJSK^8F4BGe< z`sNQ{6<>qR^L_x2DGK$*Ts%?Sz;*86J9-CuKW3E4(@MSG!e2IyNl8ty&hO<iY<SMG z58u#joc&#VN8e-~jlRPlah>k|ys@;I<Yntx;vvJ)gwxvdn#<~^RL>}HDxXI*xzF1C zmTZ2x85NEObBCc{H=$lot|<<o)#n6)x1o{yOoeX|Kf`U^J<ZFi*P+1o1usFK@dwNk zlBJLe#g_sq>4hbqz%$tApk<0lq+2{{#OyzFLJpqzEMgWA?Y+;uE&rW}$H)5jBb@0t z=qY2JzY&j-^qC;IfU{r2H$=TWzDMy1Jb7n%U;b#uv}?h(H|d~h({L=|Oc=pCiWd|w z!MnUFn_pf=d2)SHo3U3ns+m=<C=a41b3!2aEWW`P&~{ScHS;ru=X7^9_f@Yk1yZ~h zHN{&eS;)^Lnw^WOe&d+ua{@l$2E6{uXgiyIm9I40v^n^Z_MQU6oA{>q7#MMaD8Kvw z%IDYrDllCw%K2$^Y^nycC)@Gdd;=|c8Vp~?H^hH`ot3?X!&#H5bB-13n)#4%3tsSS zkl?fA31P?gW%J9;bUSM;t)?#hpmq`p+zT5%f;OFJ8hjS&xC=XenfMuRCEV4%q<)1d z@IBaYC_0P&?Xzty&cIXuUaT-UD8%;XI5e3QlA<#?G&z`oHV73ShS$A{Co^~PcT2s( z*Z(RoD`?00X?9#bXH3HE$#y*TU4vRLp=V3|GK#SNs@|fZobj|-=OV8_<^|7cFKDj7 zimxl5#oXMxihBy#{x7r4>9zVyov`3BC~#T1UvUUt@U%ei1_<7PI<A?ogWvPo7h%B< zpul$o8y5RFym$G@j^?Lg23`kdsVqSYC*hO8k1RNcvltCdbq?mGbVDcm;PKDF+HUMP ze)}8S*Z)R2KYc%#AMC`Ogmzqc84-uRpBre$OPs&Dr(`g93>G}^ScL*NOk1$w)4Fp( zf;W`6;0fjO%grit)!3TMdknq05zQ28U9Q50H}S2V!BaYlB+nq0-Qlh!T{qs;-_gCm zEcgxOTZ(rPhd*R-g#5~<=uJ>8+4*up3C(!=KZSV|hdIn&^C7qtb1CXXbAk&5X-e-X zRJe|7O4o1Qx&C+C`Dt}*6&|D;6QIsY5FCalJjl-d6=>Y==_wt^8_k?bolRbb0uLk} zh7BK2I1@qercyq?(iD%aF{zy_ST_y@E-BYw!+fpZ5JB)W-PGUKy}&G3P#|oWyx=zx 
zJw<{$`B9S1LKsyiW(mv@KXLx4O=q&`QdBvZ;n)s4p218E@RfK)TmQT5{62gog_tX+ zOoGhm+(YOuQT5`}uw=idyR5%pBzuA^*iH&uH*La(PwIGrSD6*xcqo3=_J*WZQven` z3I#5pA44{L1on0o7DpPqPWm%F3x0Q*1;3_zL-{uPF%*Zt9zigA^KF>CTMg6g<N4A2 z(*iD4X*L+F+Ef)Sw`qZ&oPY|YUgG@H$o0S5&R+}3r*kGXAm#&RJ7%y=L;V|2p}(fv z-B&o0J)S<3f*4}kM+$@uA4OjfaYTDj!xNORzmn7{M_p2j$qx%2)=n@5?jsw97d#at zcrEE^)3b=c&uj0&f?q`pAs_gjV8b5=H5jY?REMYnrp4x;HX=rrV9vZ<lW5Z08NmwJ zco!5Z^%Cb#ZeRZ!cK#MN(JB|RR~s;4lNNKeqrW)`AAErGd%Mbei-vR2he_oXxDN_E zY&>E(u0I_{@S*r6*I9ju9mZ~0@EB9z3T$|TdBIcg>zANGfgd8sJq^X+2ciO*4S&S$ zlfMlsj{Qo)<lR!5KSUnN%O>Nca!ERqIl-YxQ)c6hMtDN0SM2q_!ue@_a3jQCh}q;O z1!kOP;H#>|W&5D-1#T(ri1TFf8OJ&MdHV(XMcYN&CAM9*UABg{E7o5sKh!?Zzht;; zylr|e@mAuqiO-sEV0#AJ^(20KI_c>oFP=gk!*jKBn4fVG+a+w`d9GMP+cm4aEnfSn zk`o$2D;VlvvPA&uP?`0{q=aNf&<){8z2dF^70%D+S15(PZJbsM8D=9O*D-=)+-Si{ z_QCYc)T1fKU8kI<lFwjG(Ang3j&qLlj3}65yJY`8?kGQ2zN>yi^NRMq?q0%O{qy=e zhTDecjJL4eG~F~k%eH)bH=drY;hFdmJlj2qC%k9yocA2#c^=QqX(O)k=v|X|{)QE% z;q!NMp%IHF(P*&foa%IFP<r_=KR-9t`d@Z_>LGbC9WW2;Jy1`L7IEc5-SxP1FV|l- zQ?Qb=KWii7aQYFVm~zr}5+qLxB#9>R{11$=WA>!ttIBs&Z>V1*irN=-_jGp??k0ew z{<a=0K{L<Qgjp9;cn-OSr{ae(yWl7|o?s+TgC)@<p1Z>j>JsOl()xXtK|*2HnM``C zmM18!KZ*6fA$=FZbgi6D1shI99I4<5R!)@67p&y1WgpBs1cqB!4S6*6c*+T^OA9CY zXN+g^tIt1FypLG<rs{$ERn5zq``VYl@da#mb<cxjUYgh5fq55`c#exZ8qCbt5GWoK zIFcq$?;1a;@i{4eQtNxAj_>x2x~Gd5KyY8qf$Vjxg5((z#bZ20M)JS#)a-9@NBVvA z%|23mRq-BU_`2#f^(!Jp?LF;1U0!-svd<R4EXWDW#8|=;abma*d)|Z=k3fw)$KBGO z)cBlK`=!?RN*&+j9jcrtoh@1f!M(XyJ({(_7;e(4T&6{y;$QGIhgPrB?4eK4L;Fzi zf%08};RBJP<|Qr0Uz`p0J(xc_41zOI;WB0htzmx4I#Z%Zakuz!6614H?U!2LD|LLJ zcCc!!e7a;F1XuI-=I)0I57JuP%tIL>L((G6pZpyZ_;bV&n!WKc_jRc7eV(E64HX!w zUWFF(GTp9PhtJlDxqHKy!!nK87)!8Zo*`NDA=vZguGud<d))CksrE~)@0B_}P~Yzv zt(YvEDOnH*u3-i9fh@9QVu+~1bSTbWqxHW(Vg$d5d)2QgK7b{^qkId!-#1jRgCg#K zQ)-f%?CrKL#1Wq09Ok8wC5sG6iMyshiSaq9_Dikrl{&t&zOQDuYP@{9bhdb*XbBoz zB~M5yWR}b`+)AVQYc&6gpFi|N+~0p2_Xr=sj-f)uyI_czf<ACwR%vRrv)<lf>#%l% zAS!|*8qY$7i%{Vz<_M83AKX2D6614H?U!2LD|LKlL$7zJdW;lUI#;|HPB3K2P$ED7 
z>L2k`>{qbipI~g{Q`{YV3!d=nOoe30#4tCjB&`w_ObYD8N`wJ{;0&S(sgNw0819<> zB*y2Y+Ap=fSL*oA#@^aN&q(D&`7~^Jo(SecgS;mUv*iE5T$F#s{EB~o0_mPRMDXLF z3f}|6yzJt%3YRyz(cWV3KpgHxUuFdJ&?XT}<`7L55l@KWuJPl?Ph*bHNwr^UeXrE< zosB(p17yJ!6YiNZ*l_V;;WmQIlK%>R|BgP)@9<=n?s+~#91%6h2=Xda<YpJAm#27~ z^^RtHo6T?SLL3nZP78Lt1ctlFFYfr9RQtv8mssn2rH=PEb=URR3|Egrfm5*IS<1R1 z5M0e;v4nZUzhb7|f4~F&BWC}Q0)LJtcs~jf{3f$w@`cbJ_*FaWV8N~SJy2lJHiEng zca0y7pT-%VlWIRd{t|nAuhjA2SKsd)f(7#mWHt<f%gl;}SOS87#XQIVKp*g*5JUbB zPq#m3HvB{Mr^yd~f>`nmvSThcyC}n*S_KPkaQL7=vf&=Yks-l~r<fhj@1Fi>{4~z^ zoK*Xz*7r&s?{Dg=?;{UbJ;p1LdNNFdB0<(CWc>aJ4@eeF3gm70C%7jL5hOd#1HZD= zN>>dmm{(vYY?$H*PjCX!nCy5L^%r-1POAO<_(AOTy;8^fo4Xo%$$~ww;7am=WWx~z zSuFWq%m)1v`p{&-{~#!kZ1~692<GO1Us?q`V4b7UE-J7aHXKQC7x~2<pOb1ozkZe0 z<wjrMD|NiUk1V)qTvQ<SWS9n5cq>MqkmvXB;71nx_ksd{CJ_9dV8!3!a&rnZOTn+2 zS#UFaV7tvPc)@UjyT*?nKdr$~YSi&LsrE~)@0B_p{1Abe2P6xI0@07*domFOi62Ga zKcEluJ3)cuJ3d2ChT;fM@Y@2v@)USLX2DP(`Y~j~<OK(DM?$@s@!hjuKL0_?@j0pX zORevfI^ORKPy{9qNWDN&fqYMf2o~@f{9nwG|4&2^@__#sRN&``!;u6@e<kT|@T(yU zMlVKGU^l#Ah~O^jPaJ=aIzA`WeyR1nQpYoX_5I8Pj#PyyP_$thnSm7(zdz$1jy&K$ zgWqqM4<rTtjP(UY4GR1aK|Ib{#Na0Kf#}EhaX(C6P$al({G>A+$3?($TE+cD{GD>^ ze-v-jjMeuyq9_8@_28>|w1mp6P#Q9}b(o#S`JS@6%0^F<w;2^IeDzep0$Wq7SigcQ zSZr`L#r2-XIuvAR@-?^ku;JB8Tf{5=Uyr?yS-|5l%ZJF$f1g)VQeWO!)m%eNT8IhN z^&lqURD7*SExWFtwvM<4GWv3Q^1BPWN&;m8cV~HLWoMPY+V8<u<M;a077Itp`YU@q z-QKR+Kz*R0v!S!Gv(eu~TSqh7d>y_HNq&!6z~eE?hsftwQdm(^?XIb;t*Z0XhgJ0u z>w5V3x*iC~ca$e}2TY6BndFI-(e&ZWp`3xd{`|he-lE=;p3<JOZg+Qi7s^+x6wH)N zxJN67ss=s%HGSUR+TOaJy6*b!`mTnqMr=)irhp{B$1LFSnB_w_x%EF#7p0`gT~b+A zRqm;*siL|bbv3B!L3KSsRXysP5SriTZmP~|Hx2vEE7k@3jB_ewB5f>VG;26#D0eV# zprF67uc$ZqWctCJm4f-=>C%bvv5L{E;p!pJV9h{HzqhZpudWvxV=2k+F$;Lm@XceC z50S4ws!CB>>@GoN6sk#4Db)24>rsFs7ID@$AUuBusb!y0o!1`MA2RMYuUeMubICKV z>D0-z@r<#|(d?1zk=$X&nbgg!{dp?|3&pdg(`A$8;}v6-BcM3!A&&iCY_$k$lKf}@ z^f3x}Q0MzG%7@73Csw5>1HlS+h#~5Fu!<ByT@M84pFn~46xUT3G$#|b3>&8XNqa5J zwgtzWa~2e*(k9Z!Ge+&_P`+X#dkq8^i{?sZ7{iH*@ybz99PtoE#!-@A_yj)55AXay 
zWct9J`=#EIo4*sO><TEOJ*rYrCUL3;P)U_hDuWd%c#7yEevZL}H@N4OPpd9!PV0{8 zH;wCwYhbuyTXf7P&pKyQW>Tl^=TW}mFcO~bV+2WsGiB56iE?5%RykTVQjHF&XISd* zM+fS$3V4L3Z#`1^5c&F}x-wK31zBKOE_Sgd1<#NuQk5PrZjyhEo4F4#N%uAgUecV^ zozNdK95Nj=6T?;8l6{esXuse@;#ex6lCu|SP8W+7ilIW>`b-f+bmWO5lqkt>=L9~< zL&85m{36o_hVPaBS#JJfr1BK2Qcym7KAXK*lOn{h1~)0c!Jxu7Ftzn1#VtngobFV@ zQNyM&#BjyFi1GoaP`=`DCKX@U3xZ2Ui^cOyg&~F$m1B}Eh&F*w@}STMqNNWE?UnxN zVfh8CQt$+M6^1JGRO2S)ceo{?2_+BU1#j{MwWkw~>9>rVrj5h{(BVGl@M7{Al&?VX zfGi{|4ibbNGZh9IO13~8K#rWiCwZ9o0si*+k;{j`e(5HH|9|M;(2a%o6B6_C8J$B_ zDL}B4a!gQt9!i<OsvKdM^`GDYzmK87w_(H2V*>S4s>_=5+B3|Kw~U8P>tMLgb}9L6 z%1Kr{U;{S1mPZd=mI`UuXCBdnEP0x2S!`#_fB6CAM=szIKK<ws%7;7@Ki-0o&zV)F z;Qb&Y2o*AdU<iKyg5l{OVMzJ|M3I*iD088FN=*dM!j5?rf+6v9iRCLOZT7w}f_yY# z3}bQ=cf_8+Cw?OCzdXNa=>y{)7ThUUe=L<T^MHI!0Hk!Hv>a4pi!><OaRr9Cs8Gmf zSY$;>@?Yg%RNO{#ZIR&VgyV=NP@(OT<D8Qfpvgc171%HcvcX1PgNs5mnF;=(v3`g* zfiKoEVegC6w;r*4h;03d{HUq|UZJ`uB0<59!S4@<AisbGQ<(-789?IbTgqn`!3)}R zm{xsKe^h^j_#uK&@_S0%zRt>5>}3R(nH3AsB&?~?e~dqYFZwZI@ACX&r4J1IV2mFh zQh(bN$f{C=2ttJz=Hd&5P|Cayp+F>gQlQKQBX~u_6Fj9qZo7>1?PtQuSFi#q6i2X$ z1ktC7@n?tL4h_IRQUMR*_ZaB|A5wqJ1MaL!5hCaYKPnXRD~u!XiP}(P05<$QmeY|2 z89{i$(^ld~<rjnkG?^4dsB{4F3lUtz%=-8XfTb4jhDb;K$>_tfwm1ujbv;;y6K4VN zL+LLVgM&6KRs;wV1V1ViLKC&Vk0~;wz}I2J_Yg;(RbCeeB9?&P1;#H_zJf2lz(?ZM z{1x7c(dhUKfTb4jMx_>d)1^}ugJ8@B#HgMUcLDH2^CJu96&P8S0tBf-6cXY55&XnN zZQp<oq;#Js_M>=CAb3@CQG1@}hX}$7&`=RBA&Q8R7zA-w!Y|;6w*Xjj0dI&^w4cg3 zn-VjdV=f>rqUtGe768k>|78IL_ABrUQy@|-vKJ%B`28~?2tU>GTUZeD9-_!A=*gg5 z68bWvK@bGLi<nrA0zPEHX<JmnhmRus{Rp#S5ER-RdjYWI0^VrULhoVuFlW?bLU@b? 
z#F$>lVcvbr1;BFoiFHwkUkn8j#4^TDEEGZ({1H<ir5SsX#Swv^zz-3GFF+$GP$-}x z#1UvP*wR=FfF&332G0=n2ey2eOE7aW77(-Pg&bB!#8?3A{7~@=JP!IjqGl)o7N0ta zQeXwsQfc17U4?X7tN)R^&{Zif{nh_QG&<S>UC9N!5og%(3qSqMtkl{-Fv<dAHocHi z2*g+boPQdLfWC%U`BPXNrK=JXV2SCfgalZLn@EWuoz_ZrqAabv^jH7At-%=>E{<FP z96tURbph|TGi>{npZ=D@nRFTl2<})wEKDz?C>C`AaM@n#dEIT*>sU4=rUIgLN<@-R zfF&fXvW=<bq_z5kT>a@EDCMNT`t!Eta#(s1xd3?k_}}*ZJ5CV!MsWEsXE3UrAh>M- zad3KJj0M1zbFNL(MeR-1eZ^ZyjPWsgz2C?96D1$uliBd;s(zQ;m9VOmPHXi$x%><- zsozydr@#7RWNX=PkqdNpj{imOk8*<Ww}Q)uxkPn>S?vPBhy}#K<-Z(C_e5C$?3u{e zXFYDXq`jefLHQcSOemcVr3m=}C{h9fApw@F)i|YIS4gF``n6ns#=Dv~lp)Q>N`Lil z;cX569KJv|+W24i{@_8;e+Z2~+hD^wi;8M+g)JZsn_fsZykh~dcQj`)dBc2Ce_8X4 z>UrgT1)tc0l8EuiFodKrDUFGP+GW+2B7Rz{&*kzn-PXRMdPgCZ{_6M9Ta6xtxBxi% z_+My$j1!1&i{sB&KFpbw27}hBO2q=f-~wXWFHA3_sz*B(0M`!Z&!+6NY#GlaT-IDy z-B#XHh{>BMiP*QHL|RDy<FvY@o`fm&er5c$RzH=?FY!6uef1m4_Z9KeU;VS#&K>`X z3v^?R|B3r|92ofqzdTh~KFlR*^d_yH5fl~>6F;i6z?T1Vs5!E20dU=5(Ny}fecgP_ za8`Fk^R()g@&%|+Ox{E(F<1hubg#v47}YGQHWcyFT76G8zgr3SG}7s>ejC}^h~M}H zy0ORqgay12$L#oJ+wx)9aDrLmP*Qy=wtzTd`7ejEQQH;(*Y}r<XD&F`EQd`e^ye5s zB+gO3sGu}#Z{zM&NX(L6ZEG_P=w{S=m79v#X|293o8PngyRhIlmG8lZ#pHH7(_j5& zM}x!Ppar_|#{bv?-tYr<{!HU9Vata(v&yJXNYW%Tf>=PjZF(V9-HKQM+|XAxmNT2O zVn3L)Wjv)nue%BjB83?2_`X6+-UNmjRd!!ukA6(Es6L=Ps)&`=>Kk(TnOXWPHPT-x zKTz<Vf>IA{Pk;4yJ6jy_P9peG1dg+SH{$a<ej$Eh`ERy-7&dG$>MUxfGMyGm@0eb= zZ2@p&uX`kKI&Co-OP&*t8crvi*Iv~;rM>}A_ySZ&DSqC<2*7(t%hY5J7>1$1cxkP^ zDx2R8!yVm=npdH~c<HbHW=Dg!zp)GBe^kCYxO|v1D@;0gK`T!%q)%abA%~({VGDp8 zdn<<XCo|?<E4G74uwf9qzz9C0LgE~z!dGF*Z)H>@Hzai!k=9DHpk7lRLVCn#X|3LU z2!6rzSBN5fk~}`*QzAey{greAy6rr_;NP2iDu)UuvSw449c$JN^Oo_X{)|o}h{QQc zB*20SA7qw0>n&}jUj3++rL|ISDWavddQ&#PXN=DwebasQYfOP3zzar8fAxzm7{^0j zuBp3fuxKm?7VKPw0uPyv!V8|&T@*BU3!aed_&%kzvLdaOVNf>-1+uhOh$Gw6TFFi{ zCZ@mAu=H1mBGJ=d{p_I|^aUT#)LlJLGMYP;G3SB;EgR-d({b4FS=}W@@Ve?I%KO5O zk(SA8^O=KbtyKGA!#mSjy)0LMNZ)iz|2#ZkF#Q#?;qB?KwkHt#g6T+KuDRROUpkyO znK_fXl)P#?kaQ?<%XmV6I^i4$UPC0gj%a*a`8+M_wKXN}fd!A~rZn@2A_viv*`C(w 
z1^N7tz6t5Cm<1#K6`}~)Fr`rpC2#r>;_;XK)!gOjD;p{p&z??OaIV<bEbHdOuwn9o zA%ZuQ&t)O4m7S%vf(5g*R)``<YXvVDnHWrFqMcCsD_Af~e}yQrGyPSx1Y%z*qRm}3 zz3!pHv7D*&xs+uGDR3ik({#*miU?{&4c?$-(T-qREA13~AktdFhL7NWB$U?b=0oW3 zStOto(qE~B^jEOq$n;mz2{@lHO<$L{w|uB*1RiiEbup+w*f0p5)t%Q~=B+pjX|0m$ zEG>zCW3PS~y%;1`QzNYvY&blv)pgnYSo$lLzUjVNNPmSW!rO2t{nhuLFdoUv`vTsc z^1<SfyveNTw0Xo3C=gMEw_%DS0zu;Eth4!&I!xWLV5GHz0)@0z@PcAmtEV1@pYb_F zkbByfm<5XpBpVK;zxq_N0Z;e?Rr!UOTDtQk7D5;Ih;>h-{^bIi!o+-AUUHr*FC{N6 zFCAM3+cNVq^Rj}Q#cR)ZvHcmr_p|fFEhkUho-iKC%hwgU^NWKO<fD<~2}*Gj_v2n~ zQ!%FQN@z7UCe|f+E!DOvdxfJSx!mb?mbuDON>fTxOHxZLE0%6+liizK<*abIQ_50H z(@N5c)3Id~(Uw`rZ;}mo!XIcXt1K#`>gQBJKBD%ysJw^dheGqP=2hjCdQjVw;5W3H ze2GowhNOB+t<`I*v3cy(j;iD;%Q4HWrQh0tGV~3O+T<E%wX4!qky4&gp6X65ODjt& zO)E_=NiTWAcqA|1ROu-#FD@%W{d1~QOAOf-Ui+K~iu#iJ7f+p7p?Pjyv8<X>k7@>V zJ^Fy5!`NnOHTld<NsUPjNp+Sw%W2D+Wy&&W4OsVpVUxWc6unLlI99nTQ!22Pr?^wy zk_~vmA87V?%d4o~B@q;>pNA;2+ULYlikr9}v)6<|^W2tVzj9GEqaM=?>-u%Q`YuDK zVUMXD99t57mUEUvmSxMN1qI}-owjyc3m7&!>K%2USmX4#h$BcoVLXzT_jzk8tKAhu zaC-&$u<Gae`63^wfAJhXSZE&UK{ph8m5b^b^@L_rJ19^L82rW#%X!NYC~(0tVL<_T zED!FmwcA_l&GsfogFumSlx)Bg{y<A@eRWL*>Yp=$sQD|_osOt}ULc>}wnFoWCI^(O z$_4eTdQvl{8PV|+yDb+j$1UrKB6Ex&3dlo+9X7t`G%1m%So?(WNM63BuEFE2tfuPS z0>P*i<O`&9C+^4kk8OqKxuc48#hP-3F`UwjLy5z>LGU|iISd6ZS!OKb&>)pt4OsoQ zJpw~gA}A(d5ZH?Zr!AP}*d}c4n2aKBMxRSZxgeX_Cw6{D3cxJRJBY0pxl*H@U)PhA zL>XfnV_c!7p`ix#&nr;>oUeCT%qrFf>rNx4U@MT%FG`_#%&$A7K%sf%vT7bWoYG8a z#w?dCrz~5RgO*j;@H7Yt1>~(LARo?<rk>E_#5$n`#4$vvjpL3fi*3q6Ota)a;RDC; z&q(oW#lzkvu<O8C={pENmUv?JWV=R_flQztc7-O4_KTmQzk}6}_fYj|R=t}vh-#r? 
z_47~#c~M_d|6=_|)I#%!ChLeNt4b6}Cx+m6#&XoM0R=8w<^_U$`SSp?WGa5%YWGq5 z705VRfX89ff(Ep<8Ep_-f!T4#xIClY$>pPsRZt3jrcwA*ixfU6uL)-_!I72G&o3kh z)U%wObcGr`53U!k(2V-=O|^AYyOt`~Qq4=A;I``Lg@sbw#Qj*m5TnpMheGpE;Tq4- za@leg9`LZ`fMqXicn*CT(x6a49xC)hhhQiIuwn{XA(FU?f4|5GGoN0@?8n>$vsp<~ z2a4HeJd7i?QtCAb?UitPFpP2jumpjWIW`-AMJb*KSA!#cSFC<KYS((G{yAT_maiDf zYcLp3L_Sjgf?v!+^H9nrQ{g`43izF~oPY(dLxC%nMf9epEMtryUwE2S2yYl_e<B`T z(_ddE{Pnd=X&52=5J}DZxg4#{q_eQk6o5!M^h9lxdiBC`Q9uMAi*f#K2?C9nf{>1< z=|s_uYf$}eRR5*Qp-h4Bf@N+jZ^NrNSV0~=Ln&_JezyK2PN8|&F;gMqcM=wSNKoKB z;>aW;7%U)9DhzLb_}^ZRR(psKtrfe6RO)^%N8>OU6V*0YS{AIi2rVoRKRoKse)zpg zXb+i+k(oHeCV2iG2?CW!Vr*yE$OA>8{y8j|DX;=@I7CqNgixW#N9tej`&Z0;i&JPG zcD$+B0KfByAjd45Oo6LV;M@*^-PW$iM(dG4I0dcEV|3h5B4#*|g<V0a`xY)o?MyJ6 zbY>;BFO#Vx^7x&<MXvvia{j0Z0@Yw>LptaTksmA=3an(+yCVtmp0HSoo48+C{}H#) zJlV133izFd2i!soAs<LKJcl?E(jc?r?JZ8gB&cMLrdJR{+#Y|<Ecg!2IgY@Qqju>m z21BAI30fkCQZJg{QU^=S#^?3;aFp|t1;2(kyunSN+f)fB>%cr1g%%90T(EvT#Srp= zV(n?silM<$EY=eFNd3#!f5<2_4}KRAfluHr<S=?MYv{);p(isNMvxVd-`Rd0Y|Rc8 zW}qSY#P2iKzN_4R1k+kBN0p+r>P=9g$^z0-FX9)u{&(B?#ra)P69nprUt=SAK#C!} z0;3Qtk<U+Np?QHHdB9Dkz<o@C^DK@`Fh9r&$VY2G|IIt0Lg<Tc-#aL4a|$-x4VRds zO4ZsE%tn2p#;k~TeE56O*8grhKY75I2?B-s@vQziESOgy-Hp(yz>o$@q;x0l$Bc7X zh2|N*v$zL21`9sK6o@Pdf)|{^Jsrhj5Daf?=w~f_y#;2JLj0(GUqd;ft0<Q;jW|-x zWvfy(j)WwG(WDC<6aA;?>whE8{|l@p{U&Cy+(F@pO_W?1f)#t=I<gVA&5`xbQN5dd z;Ew9uB@e?-R-t+DyNDj(DR@A#;C1*wQXttddBGsT=&h!^5(7M;o%qoV=#MeS<R+d3 z?&C(8kIGi2X_9poJ@Wm>e}&H<3Sa*lJU>4_DQ1E|q5e75kLN9zSD<LaVt@K!_{l0X z&-k4~1UV73;59@MQXttd5gaEC#%{d^{vN#!e*W4gm}z$p3OoWE9)wG);6PCA)JoKE z5X(12TmKt8zgS=(YJ$MxvuPKcmy$0#F59orW~1$@xLL2V4avUjE3mZF=<}Y3ZZ6{P z>k@h({B{K_f*bdu@+s|C^w$j6jMq%p64_?HCT>aB#LaR|-WHo5O{Yb~<mX>~7xT7X zMLDN)C>u3{!GLCT{j!zmm{BkBBK^_)t7z+gh4cRs5$gL;_&e|sH!x#m3#EG|Q9h-e zCZ^V`6mDc6$vBaEGUc@Etn-}nT=Mzk3&|H87ae@zk4q^2fz5@o9*B~MU~R|YwN4>+ zo&`<fc^;9JHsX2__Vgk5K>3pTuI9G(Io+*<XZ6qOZy0VEo-saSyl%X1qU~wZ)AHz9 zuElS@n&<Z!#%w>rDzUrhWgJJ@s0ozrX{1gG=I}|pcz&_g|FZK_g!&<7MNmeEJKR@L 
zHfjT<dnR~-_0wf51qX6AvbHjgrXNqGvK6ON&bZDvg~A`n=dgijT9X}RJz!s$p#Xoi z1*<y_+ar>v!IEe)p16BMp%w03#T&|3RQJ{QG<UVnYwzf8>z+%vm2gXc6Wg-}Y`en` zaQFm>KY)jSSJ1~{*zgGANIh=#vL*0AL=x+NB@zTSj#ti=tQ4%}t!E#~+RQkTel+cP z8mW=CQ?4`6BqN#bP42LvtOxx23K$+lpM$4(OyJ0qJdNnf3a!AN-T}kc7{wPg#1SNQ zM3PuOw_E(A#^<E?Nv-ddI=*SdGv!_=CW265_Myzp%p)ina5U{$kRmDabVilaXJ=(S z5FeM&+g(G%T!%exLW@VBMxNsdj9&!{t#I!v-d4P+e4u<)^)j^hlIBIt3)&ZmqxSjT z;wLpeC)Iwb^}SNZHxGIzDrZX<L2zH*{+xrbWRy_J+{$3ZKSC5UD_jkZJvLU>1NW&5 zu;jf0LsDXpBFn!KEVKd@zNetV0<SAyQ@s+Tcu#Y0*Z4_{&q=jkYJIQN@xFoDv8t)^ zxzfd=)q=hG`%yv#mW;KyjG<7x0$R*+r_?6<?434N)&u>Ed8lwjU`W<{h^NT%M?@4_ z;XXiVMxNn=5XD{NCow)J)qbh<y;8^f`sznKlNB>%^CcmI6i<jDtQj7W6#&U9ORaH+ zmGuC@IrP=Yl6f5-gf)XA8$Z}qXay>KUtstKtQmz?RIltBKZ)@<srE~)@0B{frMF?I zh7>qkx=_3X4X);sCp?gIFgt=_R%u$btKLCH6@;=L=;6*ng^N((D()S~AHpLB@87l+ zT7e4RQ@o>Kg;rqA4|a{8#Q2<4`=!?RN*xb=gWl0<QXp(tAh?ea6fK!qb2cmML4{>t z!$MgP5S$TINR~_t_p|lqQ46gIDijK>>>59b@j1!%WAqB;ePgcgl{~(uaiDg@Gf_EJ z&TN<nBKH9A2}y^73Q<;}BE{=$WC{$H^}uKy5nO-@mzg)*2Y!FT9E2E!R-i(%WH8({ zelZJp@S`YF?U!2LD|tNl)eXafCn{##vt^X{eyK2$;6d~Z4~3WYuv0Q;p{xf8P78Lt z1cs~N7q`#~Q{j7xcXy4S#Q2<K`|<pZmG_OYzE|@2?xy~_p_<WZUV+q;5ebT(kW{#y zU7S&#TJ5S!ZgQ})9_YzL5)@R($tbi!D%?eWamVMR+Ap=fSL*oI?xw!_LGOqM#RH%~ zc)=1PxXd)TnlF0711L+u%6d3iSr1m4Cs@`48st6Uyk!CWWE5KAzPgM2;*QTrwVy^0 z;;rwMI=;25skfdypr}CJ3x){p&7(fy0alixG9|362W&W4)&t|6Q@kC6pR7VFyT(sq zd`_zU#4q0ZUa8|-yPA6&`r!d7hD=mWQxPQeWWqJb$C9E<R@MU++(?CGNP+0d1j~Ae z1Q|aWg;sWzALZJ5!~z~t>w9JJ>uH1sgasppR8Ara^F5hxg7Ac)vL3KtR#X8|gfDI+ zl=a{Vaxx38>>|Io<8xB&ms;N|b$n}}nfQ?hBn##hDB3XYiUmKoO8ih5N+|0gD6kVY z99Gr?BgT^gKgmKXyUH)l_?%SxrPlXK9p4)8Q3NIr$XhV4K)xqK8eE~0g!#k|Wj#<B z3S~WD!9fMWhQrEwSjHJY8HHAMmEX_tgy%c3VC4M881yBKp&r0<o-vF=w@bC3#{c;B z=h4^qN*&Mm^`QU=c|fvYQGt9<1_X;3LE@L;POD7uxN2d+vC4XkfuF2GE4#+eCDp?C z`-|i6mfS11{>SvSp0&VC=XmF^f6(9W@ALQid;H!0F17{yv~|)pc&&J-s@of=M?H!r zKh>e2Dipqsmf*(M33=gK&rqPdGtfy#`TZTSHpvE5agxs{<DFxK&3uXXiTL~E=3l#! 
zzFIt4F<d=>@(3u5zzR$RnmU_0oBg!;#7*DnY|m~hY%6UmZ>wsnZmVhYw$--Pwbf&5 zz}CpNrnWiHNaH|rudk=2yQQnOt2NNtiA~Iw9Qrr@Uyr?mxP4Gcek$e%u3@MjPdOg> zC70i8n$t;ZuKCQ#+_8e;;=$5>cV9(cWp7olr^nM>1HpL%wVkz{b^1<gTS`l2OI}N1 zOL0qCOL=QWYgKD?n+F@Q<Z0HmEtE}oMrsG^2O9gEdYe$z#)l0oyBW(6P05B$<5^K2 zrdU4W>mH^pG(N@`Ah-U<yNai@#}fD3mYlO`lbI7aqj|#xgGB?y{iS_neeS;U-ilt( z?5WZRlG+?CsV$kl9ACbV7?!k@`O2|X_^Mi}TB=)#V@=y)$#liIXT&>PH&EZ-(BIhC z)YH_>ZzvHIVkwDb4g#vjRG~+Fy~aI*P*e8EUw~ZweaPKbo!1>P9k8t07o0PxQ|S{~ z<Jlv5L-~UR1BKvN(pQSjt?x?QV{LV|r1~;_*}mLnF!U9HVJRrO8O2IprLW4jR6J8Y zSp|YM!?lBTDD=T7LWv?pfu$st1#Sw@+T57T@kp;X@X+%Z0(IUa9nB&CZilS=?cYEG z&abG?By1TEnD<(jP(p!9c}!+ZWR0O5%23{5{s3_-0>2JRo1@j$lIF|sWkH2`&H2qd zLq@TMr?^x!S2kTSQ8fmFBi^C90b*FkD>0m+WV4nqd$%7ZSbzyJkMMd64?QpAac&Fc z>3oEjTz=og6u4)V7f}*r69h3;1qD%Nozp2`m_Cs?o`te9tUQmt+tiWNZfi|$VFa7A zm<scn3z`d?i+p6wP$IVFqWRJpDpXNDN|oolf(myiQL;%(99i%lM3F4yWqIh=+nBcS z45k!}VB!{LBR%v>F27Ikc>j*#n(CbPM8aX?K@>z;wW2VOV}=-}f?>u)<`^g%x=ns_ zyA^ARoGmG&K}Il#ESVTK6*U(#pUC*Rrz(O3Yes5=D$$RgRGWw@^it_KP%#e_D= z`9j;z_cjs)-Npj?WrU&fM{ojUy`j1b{BNP&L+JN!WBSKE#SP^p^%?E4ghNEo%*yiE zXOnq`Oo<tW9;4s9$08_@CrF+U1eqlZ41LQ*3sk6rFO|Yd%`g>;cFbEdeB$=@?)ZHf zvkj)W0o-ZSz=mBk6#p=<cQA451x!3TgfF25Gg@sAb1*(Dm*00#)aX9<obsCLyyled zC`v;eH0?L<g9;aIa}I$atQq_|6Jf!mz+|6G&|qeBHrX*_SO6UsG%cfC%52$;dkQx^ z6BI}IO^hZyKm1>^ImB-U<`;kh%Q>1`OVe;=zutjey~JJTPC|kG2;_MW=|<)9<0m-X zQCx=t&+-HfAegw1ibJ7H*(?}N!<HF8Kk>7nALE1<Ol4M_*+c|Mg}Ke-4T;|z75QKa zWN}0w7#2<B@uLVr_quItR%0e4^biE!#ZB|e+%4_`CIwA#oizCp^9tp^<nsFzQ#@aX z(r-e67huCDbX)ojMv&}yg;dBG^88@IZK47Ng1(HVtS}W)i2y_pC~%S#2o170!ZQqu zCXy{#M(~`2@zFvUDu!18FKERyaG9@nF?s)0Osl$zUd#$NjMWU~h{Uv-LEcL~zpq2- zFLJk`z{{$0n$z0jJVE1r69`f?fePpBh8{y8(QocZB7RU{a<fa+V2~iGFvyR3F+4#& z5^q!C|JUA|xVLqk2i_Mu0fGPtZUA=@;J&XS36PSwXr*?E+C^%$Bx<o1%aUcwmb}Q0 zcgI_tCeG3(Q)iMk>3nUzX{XO*rk$oUNxvlRG-;dmZ~1=jxws%eg1C?<#}Xsl=aEQW z+yk87ec$t*d(OL<_F@Kp2N0dD_&$%Y6yHMFLJ$$k@~K$(eS}=Ri1X?x27>i?lQ-Ri zA<CAYIM|5;;Tx7$thcPshz>(=PGm6mBm|F%H9-o6pLa|j0U^jdrbLZ`N{fY{(&Or= 
zlm}22WL^S0b{cjhNeZ*zCl+`bzuk?55)Gk%jm%{5OwNv=@cS6Kc!eOQ^GM)!8tEl2 zCYYrm%AVhM3Dx_6JN2sN4ib3X@s#tj>tfDXgJ2#6kwW-Q!cT1QNL&J=rO5<`841y2 z;ioK!jDdy(b|mbGA%*7gzcUQ{2N9f|Xz&CQ*pC86P{AUU&}m8^r$Mp6SVlDXG7>10 z@ewAIb=(4v`Q=&L-yf1*?i)zDXz<G-fl7xjyDlUV6g?Jxxr(1?FcE-AprONY85Dv; zIzO=>#w<o31$iDxIhc`jLS~vV?jV|a7+S_LgSgBv^_}A1V;rK%i_e=dXe+WbBe}a( z`-c{{&|P2)GcxQ1>;Wu&>L(o0X0nw(aR62Yp|m&R5_kh0zKR_=pCBkBVS=B`VbmDN zJSJ8W9|h+L>im=ifuNd4$L&ZeLAvjP;3PAYJ@i2+9*XxvXEjAA>jrghZ`$)F{Pt=k zb~Ju1Q3<=$T{@8Z2{TNIFv&5*a!2AD$afHu9NF{xV_XNs*1x5_W_d|TAUez_h$YTB z6W8JsUihtJ1~}#$^D7OGOJKA#m0(<d<0D{v9-YRHbgS-5g^mI2>n14fq8B<T>FGhF zZUX_4eLTTUq&;uK?+%RxEK15pF&<l>Bpef;@G>8O6L20zWr*p`ChXKsq*dOrpW{Hb z&u`*j{B6dH_X!htow9hh83S*i!;FGri4z13elh~0!6Jb&lNKG0kAk`kQrd&@FTqcr zN9z)Z9hslY#Jg`{g{qYq@*4WM8H&=&J*pSS_`i?AzIuuCCj8#Tp?+NE%s$+Rx{>8E z?#I;pC&=o;79nWGD7SDIN703i{An8;`3~9h`wYkWud|Bqj?!R}z?-(~N{6u{SYllU z6+dP$ir;FnAxNOiV?>9O2<rR{3zC>eCkPs&U{d#sunV0qT#ckjAGcyGcQWqngK9Mk zc<Ik4HIVMd4+*<D3YL7$>VZ%rPl3ZqmN#&p_=JymI+ozrYFVY0oZPM4ttk2~6nRHe ztYmG$xPQVonYYf%k-J_$;E-}$s2FZ;?5-LJK>-h#)dgz`*5t4CkNL;_<Gyj<L>`Bi zBlW7Ke~b0h4F+RHv?Hw~wJ_;bNA4uYy4-asea>fQ(o7Erk%BQs^^MAQ?1iVk{+I52 zN&}03`3m}U6>;tWjy;EfrW&No_=HJqgFM~cz;r}G$74+nnqG&6my{Or9VWX5Yled( zMWX?|AB<ah;~YYRe&~ImAM3wl+wWf!?5S>vM4D>aDmu%%gT3s;4HT~`8Y~<N40A*a zV)?P9iKU*|RNviN-&NIHHc&c<S79^|D;UWi&0p<b?O)@QW36wkNb!Om`;ZYyg&^O5 z1Syzc3{PJFTYP>&0}FrsDkIYK+6it(GX$M3^jQA{VHnfaNcpfA39OSv?rK@yvGKNY z^&NVcYRCEp>xZhMp;*a?E>%c_$b)dyk2uYbv~$j#dE<e>U{^&;O=CFRP}5Y^Qqfl4 z5$a@5v!|rDq>r89{!~g!|F%66ZR}`iXs_+8>?!LF_H!o&*%jqJ6mTE76GC&;pU8iZ z=(iUMs9`kkK#zps!u7v}=MxvE|KSf9k?v}jFb5;dn)5Z6gbo&is(~0V2+7lltZ#{# zl3MfX*;w9D#oI|~F)-9JR5w&PTox@IDIOu?#b`j+DWSMlILg0yV_(;9SdY54=T78D zi~53{<*ijsHTBe_u5GMts%)ufEpH1!GSsy+vD7oW!(;XRjqS~iZFL>hofX|>y`jER zp(ygKB)UM6Ow(%#v#&$~T6oszSF`8T^}p2fX&OlW)7N<Vc!Ot;6AX5%)apx)gOM<R z6xMN<+Ku(CISA{k3k5O~x<DP`J~j*vwGY(~)eKig%c4PD3ZWRlZZa~h5n0rw@msP= zYd_%Hnmg$q4Gb3d1Ut*xDx0etY9mxuu5GAptWq3PDJ}imba;IvTHn*yMoslddu>Nm 
zS4DSuPpCI&P>gd-?rY*_x9VRlqkS9v7O(#$pHEzr{s&pVdPlp>>cK&DnBZZ+xa~GK zf>w%*MdbW@^!@65XeS@egN+SkCSAhNaOY6dP#A(0(X!zn7P5G>NHVXC1V&``p(w)f zJFHPWWj*fL>)Mh#=^G_=Uw=tYu%oQ4qPeoMx}h#oR~xRauW4ACSn8R5;Z5Pyk%5M; z##SnVN80MzYlLA}c~4nSP$)7Qh%~0}HAA^1LZE+&-{SSZ`SVLOVEp-OYEJtK8hR3A zu5SYfL2QZeDU<cB_j$V0Lw`48M+oi2%%}w(K4o~gd$?t&ez=ZNPzYjQS+z@Is3_*I z`A^)XXDxHq8OI*il!r|=e=I=azLM@>N2s;DrLw82p++)Urczq^xBcNQ;c+Ceub~ry zO)aExL<*}rDurQpSx*RxFkJfiR}vMJ@SH2_SLxROjPnMlCH@yaA27l_$Lhgh%y|@% zu0#esc880Rut3%qK19P`V4wH|Q_Xdl6+K%5DG;c}FdXd}ZX0SG4iD7|!I&Z_QaEBT zEF39-;b+{1JKAN-N&7*^F4txj3D){Y3Wfp$q?_v!hGngl&6N#ROA|{yb0EAeye={t zSykWD(9zgR<v0ko)ru6RFkGVB@@KJ?MT%eg^*{Z5h6d!6&gV@u_$m@O4WCt5;tC;X zbGj@ZeSPa~0#|Pke13!}=Qx&F0uzf7P@5Kw_D0)>nw13BBnSp|DTE<U4`NkDK4YA{ zt6fI|kJ|S+cjRnV8%3l3;lQc@nrtv+|0BEOKzK)ZLwGIQA^r8;5G3PFOG9g<wXVIk zz1m>dRhBscj--0XwEidOO<p>mcNCv<iq9B^y^$%3U(2;Qow7djF>}CI)%wgy9@(bQ za#?1S@cle38tWVGM1vdHAFE|$sZwMx8XAV6NTJA~Jb{h=XXN~Ko@Fjsp0FKZ6pRzh zTkVe`h5e$*ieYy2JnIjJcQOKwM@Axp=x|qKyH2nLf+B@5tW<Jn(tRiVGOhoau3+hW zjP<LN3<aB+7<V%{DN_XX^{p?{_d5h<8==!9%w?J(SPH-X;jU=gP!oIHb?ERsK}JGd z3S~50{eAAkTS)meB=ETXkYn1l9T{Bj9rrR4>QY#QCYNN-Z&!FKJQ-fY7&uVhD+DF$ zQ47gfTf<0UjjqY&`6nO9`k%pPj@z*d9hRz&Qc=pU)(1afgnSbTy~#ZB6n1($?})-! 
zQb@!`28O$%ZNtq-U?f^QRAUf~g~XN^QW*O_tEcZODZglW!a8f;@7V3y7AMF^h!hS6 z`UCyN+4I{S-WpyX9*e|~z<x%-&QyX*3Tw>ynF#3TeOmbS;M>zG5~%AiE9@1b#fl($ ztfWx(FW*<Az*WmR>q*-Y`#$W5BA7ejUF#e5N0k(2%kNNl4;Ex&cp|bI30#E^%P80; zGRS<UUPeMCg`ao-m)m}9bQKn)6Mnh`*2yTS<}!+)J`y5@vVW<E|39sX1Rh0)_d0gE zwu%f2K}NzDQkXrz>F{<&K+)h>WC#gFhm#315~9a#%kAIieZAs0+=&LaFpEI~bsbJ2 zD0*z{U%te%&SjB6>x}(?V_FEhH$jm33?t!)tlwvK912fkfv1!PkLnVL4yO{7k?`~G z|8m=pjSgagyU^eko<n2|tV>H!53qQLIm|8Q>K84it;g&~7zOt@cjyFVM#!8nJ9?h= zhr)ZYz+0HZOvEK{puUfp45Q$@7Nf_@?ce8p{fOe%9qovT28#q@MU)OJORSHBjD&x} z{-wCLzsgMfI<uHLN-v<pjDmYy+Y<!GWc@y?<4|~Cco+OOqQPsK#YB}A>17m55LDLq z^X~t0+lSvsKNh4z*I<!ArNc4`CKJ^6f5pzf$Sm+FlIotZ9=FZ#G%_ugI8Ja~*8C1D ze%lxU*D(uRt;Rr+Kp6#-2`;yPpZE16qeG+cV+3rEi3T@{cTYlx=L!BP`<JqM|0X-l zx0p#Hfh03MMiGz$l(N7Q#|dU9eV+BgZ#NcXDz3pJeC(ji1Z5@z!8TosANu_FiXYFw zU82EaLktPjby&9}XtC`7eu@Qtn-TB@o&~Qc2|U3}@Q{px5ZtN=-VeV?W`Vi{t`a*e zqo7z~Mey^MpXIjyJhQs5v$OVH_Sd9P==UkO`UAr8{)CiBe@faXIex_P7g-*)+67iK zKBCQU#mk_4rds{JYFXOVTRz9C+Bf-BIV+#1f0lgGk4Tne9RFpx-TU0H-*11?@;p}I z9iCC7VCbjxTZ#~UhrWH6J@Y^2_>3c4ex8b9+kT!CZ?NY0I<3CX)3+)B$^6@BS=yfU z-LSk0L%rOve4>2TXX+jrcTu=L?60l<S=V{n(=dFQ{=Q9*-bZ>qf#lce@i$eU|B#)_ ztob!J`qwyjT93j|FEV<UcHhEsOS>O)kA&vewb`PJuA8=(EN^h;L;gyMP4#Ji2ub74 z$$fj+UmHuO^NzXZ>{qNeEVnE#B8k%5cNE7D>90=m)2#WqyQ_jddE>6__5;@AmN|TK zu4Aunb9aoE-^GXFLv1!V=e_ED&i119HOreY{2=MGzKI5Wn|mf&@UXwuSMCTL^d8SS z>v+=kwDqR-mgNo>_Dig|3PtJb`}FOj?D-9Zs#^ksxf8Cfj=k0y%SjksV#a*~tNj8x zeOGaOQ=2V6?SIn!wBtG39qV0-A))#mGxS}g|6zYk)^07C&Oe-cGUtr*qW!Avx?*_S zBJ!eBd|OFlw)*QHE{#NjUHQ@6N!J#~9{WM-QBsc2u|IQ_S<f?y;)~p&*~%vZ=e<{Q zZa8j1@D-%+4Mq=LHa|k2KZ)!4!_IFaGF7&#aKCRx5j^j>V!vj)VSN@kyloL_c@2Gj zgIQnJ{GvroH5DyIeg07hPC0hj_t}o1$&@ODp_JMaeZB?1Q$=&WOB4-%27<S(FI!)= zypAU83_n1_Wvu+<VaKP9H*TujR<gI?pzmnzshl&e^NvgQr_khQ6vG!RFIrwf5?{-n z-$-FwMNMO{Gcf2I^-Q`pJ9jwt+7D4Q;AEVk6nuPIo2@!oayI{>_X<VBZwf)%OV+zc z;p>*S;tb!1;YX~bZ^J*}ZDnJ=idFjqEREP7<7g||%gV5BgXFxNO+IgS1K!Y%)BbDd z*oR2`;`X}>W*EEQQ#RFT-}r3#f4q4^&6d#aqJ8;?@{Z-6a-VUXb3W;~iWFYg8CqYk 
z+(DbOmcP{no#o+hd24Y`!BE~B&t%SKitSC?_alWzVR#x{zQBk$TYaMROu_lQOP;4F zc@06k5PStG{F3F3B!=QTwvE~POX$FR%F5`vEj<-lmgW-sUS#?%jD@&ZyssvaFynU| zSE^a==Zk1ry7q118rGzrWh@gLXN)Wc=dWqwt?TQi%6F7Z2lo4q<Q-Q8=bRTER~$$o znylo|`n)A;erpQ4gAFxRO~KB>et*on*0T<R+a0?p`f<oQW2NLH1q9}_*_so<(}i<> zk-%#dY<^Y|d{OjRN#UC?6#FGFJ9~x)(f((l_$4(L`m*YaXgEGZjRDIzK2%qcqq|Ue z2b(AQug{ju_&pE32)-G!1pTLNf1Orc4dc^Tz!%l0%M8zG{S$4xeO+X8)wa-{qJ8M_ zjQ51+l>4;ntn-5NvMz-%yooMn&o95Xw5h75zO1dJCm;n!#}vVB6rI^?KVTOrJZ?E< znXNe<dZOrT{)N0txmVHQXCU}IGAIP^!ce5}O<3tKEBjp=*q=Se_<4(d7;~ZbG<+G? zYNh$?Deho@_7WEAwwgb@sjS|Y&G4bQPt&p=SCA#z--#DdUNx>(2k;`2Pk)JP3|ses zHr6rOu(^6m+0K&bLK6D<jv54A=be`vSCtfsCTGoWt-r6Nr6L@zYz}r5^%V^HMj^N! z8QezcBlOsQ$R<(<zp_)sXA92zp7dT39fsg@34-XclEOFGqSjv)+)-j^@)07fF00v) zSn=01HWat8+2hN{p=cdGDkqTBYwU*0j8kl%+3Owd#w_&2#WijF;@#DxS*zJG+J+|2 zA?G)liRts%H?^_OiN;NJQx)5Tdy4i34)_n}9n%TsoK+(sdMsn1Eo**b{(<7Q^7`89 z1|+aMFyJ4~TkRcJ1h+zP4|;sie%Lx&d%XNq$(g`8zb=8I!#Cpu?^tCd#F`+5$^EZj z*Km^ku_I{HMXZp_hEVKXY&eFko_ux`NALnV$UgXadMPu`JIR-){aJhqo<#N}ggtHh z2N3Kbt<26|+W_y|iyzD!<FVfQ%i37?MDxb*RMpn9T}a@*f&=~;MnR>;5M(42Jw^(% z<~QzNRn!q`tgekzAc4ib=y24##vq6_-h&iUz8ZcfOBn(4FX$484kr;*wnX$ewbMbo zE5wz150Y>SN|!afho<rA?WTu0`Rv-RLJD`H=_k?k%jl-rD>?qxf=f-B_9d*VMay?X zu$LdVbB9jAP+Gq!{8~0d(BSeN!9B%$1Ef)76g-}LQpQ0=@B(^##gQ#P&$w@}urt_H zSyxxt7-}u)3iKBYi5<zsHPyX|RW57^QV72a_!XTkIOn_Qy(C5J(P6P8x)$Go;LAwi z!u}WGm^VN#wjv3_5LBYUQ6@TtPU`p6;X}0sAAxD4^d!ABdo7IrP1C+;usBcVc=@Lb z_u<XlKWY6O<Ly|_MC<zc&9zgN+d{ibrm-R<hGi5q2%d4xsgdv=_zeZROIs=;XmEK; zumda7?;pmF%o7w_GFyAB;$-lN;<LI2d#_+c&|zgq41!`y7WY2@qb@XcJ(93b_=$#% zDc3Wf%bAY{S|>Qysk1YSFJJJN#qqz3+TYL38pUa-RI^Dnx{i43`x!S+X=A<PZR=zN ztlCn(QzTI7@Dbm!6oTiRS@WC78xHi8w3ati*F`FuL;{QYM2E2>YjY>#GPun#TX(GT zWaw1!nZh}2h>}3hHPK;3@D`Tj1=}60@q+HEU&W2P4NYB(U?2!G0!qgF7F;fJa~$FX zEoaA&lU?a9Px+hSADJBgyQux047+{YHr(_J`0RG(Q{%|^Zax}*?UPL#!)S2D_Rt=Y zz`%Z)$zVs~GRT~I&UH`xdP~|uja7BERSjjWNMJWQJm`;NN7i}_g2L}a=v2w+LNr)x zh%SLjhi}SkFitR~vx#5)jP*^7MkB&cMnG}uk@)FitH*86#{;dI4v!**X0P<|zbWk> 
z1k^@^t_`Q6Tsz*A;)FlI2W~{h$NI)Qupp7mHB*(?5F}7p5oR*8dB?pc^l=b^+4J*8 z3;IerLQR$7T1h&lB(PWMa11+Q5LEoiPU$0{vLTFto~NYby0Rk%!KM12hp<W2LB((0 zf3jK&*xW*}f&sA`p)`9f75|&k{xl=3gcWjs#6h|cf-T&i5$+HCI^icHAR3Gf5eY2Z zj~!N{V1nR1@Qda57k7qQD(Y+M!j-xNVnsxUWfYY8G+GS5suRMG5l}Q(B=E9~frbvp zWpL^Kd$E>v!f)PxvIpaW+mt=W1Hp2g@6rvZGvdFNKK?hUeYroA%wpOY8uB$;4sM4+ zuot1+)-v6+zwtoBA$1&X;HWo_BlR2+{dmuJqv&FAuKaZ66P2f`PFA0&IZ<=G=6J1s z9IG>ql5mVY$?fc|O|!$cKYW1WpgInPjpK0mu%vX$>d4f6sRyX|iT~tg2E0LrraH!X zufyeGtd~FpvzPAwmp=YCsr{WO>nI~cJx*B!F?f`Velrrjp?h1$uC_fbdz<$)?-!Dd zdh$t$u1Y?>X?fLk(|gtbWZ-<!T*=weGr`lLC(2HhpDI6D!BKg#@<io{suNY7p1S(( zNHjc#XT~P>$G737v6G$h-S`VlbL@qx9NE#E4QK8@5vKG0laovhdl(k05RY7^D;I+1 zX^Dj2((%8vZ^XCD{aJ+sstgMjAWALACT+5BbN9B+o$b5Z_K><$9SM@j{JyKbXM4%{ zocn3-72l=&iv{Nk=Zel2pDj64db;#<@QKi=(5W&xJcGg3`o@m>ei*I}kFz_z0ghAn zs%=9mb(TW&!SI9by#M3|hV)fPpo-wPc$`ilX!g?mBMtw_H1WTw_otf?q8uaaw6QK! z3c-nijlEkSxP$b&$YJZW&M=9jE|K`}THdg|=y)#YhUc31O5P>^#rz8e=L6?-j>Tt6 z;0Q@itf;H1rJ=FCz9%w(SH@^~Efgn_#*K>O7A2G09~eKg^*J+sX7Rmd*N+cw=-=G4 zwR>CFc0G2lZFed|MRH%W<bQeB`kL*I<Cg0g_tT!IgrV<A{{<+X6N-grMIMXYs|$L| z+G{Df*VfQg-xpbhU%^Ot4f;HxC~k;zd{FXd3uj)RGwZ)ue6QK{<3sBPHua?tL=tzm zG9soNKVWz9L(7}iyNck=oa^puxmR_DzKeb&vEUpW=L+3x@&`&gE8FTI*xJxp--9L( zB8Rca>I6m6=?BG+DMZHgIkWzo#rK+BKQ=VEYD52~-l=XPMv+0%XzPcJh)D#+|L-g0 z5PRMFitV=Jmh-0ThDf0>^j<;|VJP}s;9lz|IH|0ox;3H*cGP#{6*LfMI0{42=ZOc$ z&t!ehtp8^5y=K>s4NnfPmux>W3d+%$AQ&GJbvgV~c4NQBOzACFm0nf^pLIPWQV7E< zafVO&F8JMJzM;b2U}r^pZA+xN2^s7_k9+ZJSrr*layX(S@qx*o$@-jG|IOli&8{Dd zPN2aX`{O#S2x3c;4>cl6{)11k^zT~Uu)b=0$u2SoL6Jhy<OD<axGzdTNOxIBRa;$4 zJ=I5B8rvb*gA^t+d|>=c*5}OnZx-KccD>>!8Vo_D!$Q!oCG!;9e~5R=H`$vZejgpa zs|Y^td`|RONukd0ve!M47b_qrDb!WbUeg**5bSK|MhbOJ9>S(PFn$s}m|=a+tp8^5 zy=K>s#jqfQ>xAFD4yTTUisFy(MfnzU%@2{lH__o&Y%d~%ae}%O%4nEN_Mwsd!J@v> z?(&YR_F4!wP|8|lQ1n=rLKzJo7(YFJddc-Uv;LdK_nKX=_zkUFg$8pdJFJg`sUu;> zkMN9={0ASiyCo9%Dx;u5Q0cKqA)_JLheq>Na3&JiQQcPC8i^BZQv~BuXrwhawIgQV zU#*{JSf4Yy-dMj{D!$k3dUE|qrcvF7B#(j%M#3NAr}syUfS)J{e9Q8sd4i7T^pVhY 
zov{#ps|f@_gM(e=9hK;CT}v`SU5|So96wqA%dkFY)_=43UbE}rH#{i>#fGHnaLP#d zXYl(jBj8urR}~3-10BAL4Binf7J@PoG8W>755EFJTT6S&kU-I4?1&+Q=y9jg;|Hd{ z!Y{-6oLT?P;(N`m7k*e^G+5b?B?<l+{-fVz4k+^e0b}4>N{3&uz7!{jHHP3bF8HnS z5zM8+tCa+b4r51}8`U`2rU-UENcofX(<RsE%=$0u2btn~&8~-^%wXm<c%cr9En!ah zXW9>#!F)Te!6JcgT1AInu_Y6HR_7OkpJ;HmE`jLqB7zT4{tmO1wB-7nS^xF*tEJ<6 z&8~-^Sm0z0Hl9WdJEBHH;rGW_kZ+*DA2W*)3Dk91C-?$dEHX&;ur<C|{*VNxm-d8o z36xO~J2EeWZK}vuHVn<ae|&vu$@Mw2>-G56rQ>_eu214OuffSPnK(t^_Zb%Cn@WQ} zWDI=A@+La0kAg3v#X3Rwt@VxMs}Ya{2}%O7BIvN#k$HmIu#@+geSfp{IkW4{;(N`m zho5eNl>{maB4ePwqL3tk{{>&}&+w=H7IT<SupvmGCK9N0SmrWug2FFv6n-)S_QoX; zD-zdXnNO>6@PY9&S)ViOzwpZx-)nZg;-_2S#4J!zoPQco6oudSlm+=zY48Vldl?eQ zC@7X#5hVNCSRTQwY6R><0=r}kL;@28Wj@{1^uYKnvw+7k%ZF&$*8k|aI=FXuTWoV| z<H-6EDUx86oRG2;NrflYy34y82iu9s?v3@w21Ztm3~~&O49Ab?h#avIsvdL?55|UL z@=I)F$(Pyv$8wnFILOq-GRuc#o&SEMX|QXsZ)jj>aCm5VC^{UC#yF(!qsB2Z%AwtI z?DKCe-B`K4b}};2INm(oI^I6sG1fWO&CxT~%hAWty0v?K-{io=AXQeR&WcfEMX#<h z|8J(gt8>dN;IYi|AzAZVTQXcTR8NRz`%ot-V|t>!!+pd3!vn(u(N)n^vB4N(Mkw8o z_GQbH&e^=_z|NAbWmA=#YBtoZkF0B$Y@BGGXqjl8XcL;_o#Rc@9oxDo#?rrGKow&d zn&6-u3oKPRme^QK4K1epmRZ1KndL*YZ0moF7mNg>RixReC&;j6sBNg7ki)KMcT`9U zOQG2x?PoZC-ge$~%zH3@x^QR7_RyA!sjAJ@o9i}(H%2zpuWww}G}#Qx#{F%(I=6Li z>Dk=3seeQNdML7tp(aw3$t+FzEwg~fGRuc#Eq{7hl;KLGka85mjYuLChgDe=!v7UX z!Z>@GV!vv6+CG<aJom6~f5G0O-6cCh+scIERQ2Xso#V#(jg1FLdb$&WTYEP5iorq> zDGOvU)Jd)znl$CN%mN<EEFY3Jze(Skz(@&6#)M#<V%RX;q$suu#h6|er3(}Nj&{p> z#c?+0WbRDf!Tf!J>7qR)JA>QHww7<HR1|CA*l>`fr@K3Lkn(wJ_f+pDq)?}rEQwGw z<(KThXZq&FZ%9oaxcI!;U$V{Lxh`*Q0Vx7YAxK#i62+*pD8mNBmLbZb#GuGS$_LuZ zmK(Mwoo66;H1Cjqf5E;&VYn-ZB$jWfP!t;uHSbg9E4Hho*}5JZ43$3X+HCgymHWjq z3V19`A87h{3*L}5KQBcAB$;!NRDnXUMi@rPzXZcZ#zfKQmf=<=iC-bX>@Dll_KOfa zm3u7ji0@$j{=nYC>Eb;lyMsGJN)j6mHIe*T6%Wt}Dk)Sln4l<Q;)1Wc_}9e_e2d?c z;V-)T+kKZ0L4U=}>Hp6EuS{Rk-^WIzEDD8<3en+Uv@8a};c5s<fg^*VP;6nD>kkOU zpiG19s^h%tjQdpXaqo=pkpDozzQEohVYnxVBsLtD@(c0u6-K(<WGReO%+PiHl(7Js z?7(Mw(fk|Ir4O7xYyMNV{3weeg^ff4ONT>>AVoby3Ww_`w3T4UGS{a(n!JhxUbkP? 
z33^U=kLDfrAIKL?jx&Vc0k%i=@)cr7^uw?wIz9e1?Lm46KGVyLH!QP!NY?TfFN+c& z!KWfPEHWqrl@yXNfjuFP29_DV#ltb_wvfPUj!VvSB7+b-mN$bQAHbf-XgFPexba|8 z`HCch-CGjYBpZH9Iq(^PO#V-Hf9u5`=pM_5+!Mbszi6<OMIoJdTn3{;P^6IZJi-uu z-)4E?E$tN~@P_Rv$CJ)E*J&ZB^w^NXeelDAhy=!q2Z$v$4)j>fX|m(Dlmnj$IsLTZ zUUi@4L+*i}D(jIyYDi!(suL7jq6iKN!7$4VQps4#43MH6D{|Fw(RnsO@TgC$37RbY z#DXN1uh8vCT#t=8jVXjo4t%B;C%eDhSNV|a`NhkktX2}JM#11Pso2qCouF<_YT@@C zEC_`{n8iq$1}PMB!F4uAWbh<1D0+-F5q_$GPjc}9!;a_-l{GO%klBIH^zwQ4xBDs| zau57eSr5@*T>_O3&l6N5A$m+A4M`O9J{tV0lECZe@MY)u6oRupTwLKN7KDR&wC?aQ zKME$K&=i2F17GGAmYF{AUiqoQMluE}36xRLkU`y+R4abpU`qxK)?QK)C^{^5WL^f3 zdym#15%*THARP1MD;5&mW*Pu&7QmYspH|ey?n#CK;(~5GG&2MMXFY#mVVN}A6gEPG zbqN$Zyoey_Vx&-rqzrvSyK8xYG4KXDd{x;IgCK=M;{3#dhy>0%+$7r(q>%1s4gfX_ z;5FgbY_XR>aLEATuuaS8V3<8WDeFNha?xPMK%=w~iBgkfFj@saDHQSv8zgU1DC9-G zPzXAFg;2UV*BLbqp2|HDQSzr-5Mv%4*WzS5BK$H30GkHzW(G+3ux1J1L9;{vG46#@ zKV``P;B24Y^s*?5AEn78CGTpI%PI*JOKfOyh2r;ZEQl%;!YoFYz^Cmp3SJ-(hcXvf zl2bZAV+2f*z+D|l1mnG5Isn)-fHyNg6+SGLDV7KzR_=vDaESomd*Nr4^)MumnRJ4n zl*toYO!_6gP{;?2fO@GP3WZp&D;>Us9f=c+^J5i8Ka4VPN(Oh%7lTXm-a=$Tx&U3% z0Nz9kX@7`^4-3Jx0mO0lLLrzo0J!{R$A^}$v$30;0Lw^HHJ|?KM->-r*`&4lVYZ19 zeD7JZNq?1^04rU9u4w>oT5O3w#>0m-j}?Mx0*K@8g))Yu4FGPO^<Qz_wtdMW=@BH2 z(tMgKBaM<ufYm(g-|swUHA`#t-E8?4-Eh5Xd(UE){_0N>9o2tW9H5)L{+Bj@cR>pa zeocfAYdLlZE(#z{xEE^V+)on#+`hZ?c-}?Vb^8m}S4bWr39$53N|KN&DFIe@V{lF0 zPS;WUIqOrFXR-L{(^_Sn=qY&7d(-)n?G5X@mQ3ldzPF&e8nH2{0l=mKyo*{$`9o6p zu!OcP3Ls8$FVqN{Srh==wYBm<;Dq;l&Qp$OY|j&LO9~d|zIp;IN!KAsPrj~wI#5;J z6By6k?mFN&Zk@ASwxmyM^-;F+S3c*z>VD4gqU|;7n@aZQQ*9-uzxr-UXZ2rF19TU! z|D~Q!(?If{Qo@I&{>*{^V&R)`FVsjhu^<4rcXRFTlEe8YbI;{mc0O&t3BwmjNJaV! 
zMbSur_3>~h(h?lZpUB<n+UuCHpM>Eh%i^?FZ)eM|@^s)y@6$QYIqul*T3@%m1;hFD zEy=Q<+*#x2<N)1t>wn4Tjcb<pe?j=L=E#9yQULJ+_rgU1z<nF*ca-fbob{c|oy)o8 ze9Cb>&hQlrscf*Il2Qq2jFvV<D!PiI{>j`eu04)}_M^5ZV0h7zn%3%W*8H9*I`6yU zx#7A6LDFAY->|$5LrKkMB&m@!LZ2?t?fC%R^y`1~=a*=}`16ABVeE+A4Z%bJ@dEck zu_Gw~zys?Ww^r^hIaqKMf@gCsx~@2`Id0fUfMva7eHn(L&rlpGYOASf3HAj>eUqLk z*DmKi#}WH+>l4;<Fq}_o^+LA%s!o;6<zMn%b3X&Y+qRc&uOfwS#u?71zxt}t&qe<m z0lG`A{~6~OU6}fZ)b&%shc&lN5sU{A>wJ^k3#s{$900s(UGr4U&d}bX!}-VZPUW6) zUvOPP3ZJ$=6K6<TDxtW#u(KlESl(GYm_M2~>DipK13f-upS7KgGt|>s-N=?-^~ut+ z1sC(Kc%II=>4c#DCG_|;E9tM|42>i|sohQfO%KqO^_PqRyvdg={yBO5G#);TCANEz zK@~tO=NGsaN)@w20Px^s>!!Nx6?;nd2WI@o;{-1`uOfxlZ92o-l8%M6Oyv#Xs@7m{ z;gElgS0^~_*iRB|(pp(hqstd8mq?6~WuP_bo6Z!T_h0fp<-YDR2uj+VFQLiv3_n=h z<D|bxfNrMsKNY~6^!_wI8tbPi;lrBCVe?=s2q4zaFK{nR3IHCQY~L8!TD2=QU39Qu z)_2@{%5%nj&h?~Im%<xyhA*rs=m|E})HIcKmh=Z={<Yq9$l!L@ZpS{PaK=trD=TTO zEElrp7dl-umyZPIUUT1YJ*#B!MVm^0r8N01i=@9IK~u(;NAb%Tz?<><rNW1q%h+-u z7)s`w2>;bcwi6Ek9-8Rb*f3SSvusbvzQV)#M|~$Ccv=X$F2|+thK&SRYxDa`TdKnK zm2JVEqE-1LzA;5`n~Sto4$@j-PmT-2toa=;BY8=|1>Ys_Rdo0n2tJP%3qh6s3MnLo z7!1u`4`lsM$}gvc4{IKaQxROky-@0JsQ}=i@y-p6o9nh!><&&B?GGIBAIsAT&S6W0 zpp1oRGHIDg+A8Y9)y-ucC4GS*|7hNrcRez=%|%)(^jMd|6WQ`p>6^|5&ikK40!4>O zf0ZDp(qAEkl0xhue+`d!Z)hgazG@c|xVI1;7J|7VgJ*LtxGpI@W-KHvQ*lRmV{L6i zMO(1DcpxzBC#{ttxYf1Oxd%Nat(ATDUig)sDLhB&4Us_6VbWhIf}|c|Mx)YSS<MrG zeZhQ($GX?IY>Z4*?<hwCi}wW%p~I56L??J2f=UX<{ewlFp(aU+(;Vt3A+1$G6gx7W ziwwHAx^^%poJNliWzSEgZ<<4cNx+5#KAoco;)Z_PZV)61+ZT+-^y7!ede*f{`jTyx zyGWn2w`hOC5k>H1oS;4uUKuAWwL92S8L6voEN=~VVMR!5g&moI;3f!)E!pGPe=q!k zr&ao<iz<1}HFQ|)2+5n`1SS2|tEL_Jg5MAw>z!=d$OyQlYNsxNg$MIz{33%o!81AM zDDQiD!be)B($<OwG`O+_3G6BAM~6pz^903~?8%m&O5cP9iEA+Fuapjp9We-sEqUb& z#$)>N@SAAg&_s%i>g^S}1d0yN`i_x)C{EBl=ei(itw_sME{W-?nn=skg%uGU9?e_J zY*3d$2;LJv5&)9G?!4cSK+$0mQ8Jf#-thuj3_;Tle8F#sj`dA+Y)}cD(BQH?G6s@< zNM<rA1Sj&M0hN}iDqL67P}zzEb|ZnLwZe|f6Wn?a{8aiONnodJh%SMm!=%4T5WMpR z<1zhs_%VY~37oc6i4D;cI33JCqI8(Kj2Z{eOn74j{UxMjs;sZ8C9Rc`K+;;F!!hhg 
zf?)Ris*Lna!tY5n29o|tje<!8O*`-fzacu-Ki;{%bz{Tkx~b}I6$uFx9Zn_)Khj!t zg%|;AYQt3x6(WIM#iX@Dhexm@GM`3^?~R{W5T(H~iy{4$q_7hmW)zIe;1`U?^y4So z<L(LfB*!|AtV$PUERmwsRd|{`-R=SRusiA=aj$l-;aKY)<8#J6<DQB55iaTR40>W7 zIj^6caO>wL$#G{K>qwyVaKBcUP~_LYCT}ffV2tUUen?OBKCJV|kv&)Kf^FE}7_6!e zMM|5B+6p@hx(j;p`}_m`RsKQWkZ(9|I4|mrdBd^#I*d?JZ^1zRpnu30^~Lf=@<zR@ zIo8OLyOv`t_u;0eX`|J#V5~?L`Sq`svhDG*=VQ<u<NFKG365+D;<d5gxhAhC&=QPP z)YMdj%Nv4Cr7b0G#T`YRh24RkKyRS0pg%vnDbn3o-%{0H)>+z9+*{Zm7$_LbAIcvl z*C#AFMtq}rqsxjd`dzRet&df*cPr)NDSW;<f3+WJ^sSxesPp7OExUu&<8#)XuJPQ# z{H~&wU}Jf>s-~tYTv1=%7-|Z(l(v?%m2{MJ7L!aYTHn#w&|KSA*-=Ib`jXz_zM}rZ z0S44T4oGrD%@ctAE+A$2jM3&;U9=Ma0V!5X(QX(PNa1^-C>$vuuLx#GZ@sK1-O!HO zx4R~CqyD}?XK`z=sjR*-TwPmRT~`^YXb_H}mJlh}#v=Xo?TwAib*<Iy6&+<=q3+Ti zgJR*TIL8cM{h>HD+7_#i)<Ccff+ew{SmCH*h?kH~G0rh-eyY&C^?+lmYchA#H<;gB z*jdsRY%Xi4h*Z`I#oDTHWqk#y0N3L--P6zp!AMJOTNMn;yFy*To;XD%jfD>dqzqqT zv?JDt|F@J}En_4sjTS40g(8PYVuE7U{8XWN%W>;o#}?P5XEblf-yi5H>JWyZ#`1>B zNL5`;U2P316jcHF`mTmnMX;{5roE~|7?yPfyLE~seZ?8R`a^MQv@6yUtH*n}YB)|1 z?`9a{<5##^*Ji~qYksQGyf$Z<vF~wAL2!*X=3kZH7w9hTC}|D0gqq44D$!?Bd#VER zk-qv)2sSm<w}e}3+p60u;|xPO!%}j@N;d5Mq}e#kaU^lfuy#Cy|6a3YBMCU<{Gl}O zFoS%K)@R7ceoU=IFK)k}J8YAT=KDx9y|DcwTr=7;(iUq#0&9{8mJW+&PzuAW`6UX? zTMycIIXCA_de(YJd_(>L3~v{vqczwXLYqm8N&)%ss1%T|@0cejQaI1BD=1~+J4xQU zjWkXB;*|7$=zZ1p+DjTqj=O~vs6zRG;yC{fGk#OJ1#M7m?@ix62?D7yO{VD`iEm)s zDY>7cy(8_hrf5A;9My4xVoOS+B86g6@bnWoya#@&&^!`&)V9yD!?ihQooCEDnm6oU zRUn!iX9&M|0eP&5Dj<&xHjpBrPDx>cq4ZyBBMjg`wh4;6=!M=>A$3s0lVq<YAW}L? 
zM)EE6QaH}P!;Iey^vcOayhfWy-@Zx^m`7y85Ke-d@I}~(w96;#S9fc)Z=^HY5~F;4 zv=)MuN{e-Zx)j0?O)kuuUs9oY>k<3j1i{tbs9(lHG+7vul2;Xw532(5jDo6wyg?8t z6ir5#_5NGP1lvk_-8C@U3`L=|NA-d|rt5Sp-=~Gih8u<Q1~hX19cKKF(N3S1i(AGL z?Z*=YTG@<i;VzD%3mdsl+u$hQVYGjg^6_yAtcg-EG)@p(f_J1Yg;?dR`6UX?BZ0@Q zhwRgi?a1JI4~6C#33Vw%li`;rARm#kq4NY~B*dDuDNUa5e-VjcJ7KsQNfAa{+4I?{ z;jeHP-_Zdr-(xMqm$ZkR*&<6D;4JSjdmTe-*Few0mTubilM)1qTSzOkql_0qQ9faG zV3hLlv1TMNf)#<_P^GfPA+g5Fzf|<N_#XJB6q>i~x9@gr(+TFvNEr1GB8C0%TU0>a zkill1U@cNuliYuCOlgMDAW|?!;?9jw#KY-rihb<B`!PSqUWl`DOX3}qP&9kZpotSu zDdLT3+fPXlm_yFpI+7UoqRsLhCUt%kpN_?gcca7ALlV<L`FIW)36&Jy1HVL}dF`C# zBsxr?`8dG|2#)4aXx_hyWZLlp@?i?dV@0I+bO*CR3dlD#LQqCRq%gJrK9v(&|NbLL z!Gwl?*!vVdIswjRE#Kud-XWA@NS4_vHU8JQTKFW||9}!TXHi(mk>|B3$AkzBdq|cX z(?(Z~_QWV3j|PhbQoI`-u3AK}G;4l~3eBU#6q;u~y_w}5A-Ebnj>2zI0eN&dl^}Y& zu>W2ppoY=B13i*H;=Lgnd<iic!TT|vyxA%@cms)`rH5v(h4H^>+7}JJ##ChkVpq+c zI-_MVnotkJZsFGpKhfaEAyqzJbU1~e%&N2Jmr`h6Ng##h83lJbx4Je#Q06mhyd#QV zynsBj7+nH;gdl}~4H-m_7xzCGU8`iQYM}~9zlO)`$M`org9L5`7d0jvIUZ{OIjw_o z`{K(N{AF?c@1pj<f+ycCO0n$4PIW=BK(mXRV2JzJpp6bPgQ0vpv*<=5q2m&W4$FMn z&|(N?Eq_M*TT-EUba+3rX$s9J2(E?Sf&%hb5mi8*Q7}%BId%Pl?y6ticuAFMwOETb z;r9vtb+@&POlH<0ay44M-R&mxcquYsp8sC_tM*_CR&!~DYh=M}QTy*vbn7Zp?Crd* z1A+m`Kknr|mJ7c=EJz0$tV^I+5t+$gN0JDJ?tx!Qp?Rz@h30Ll&^(sdAUFoUv<2i< z@o9q~Gs1NIq=yAmUr>C$#t8UjN^xDIpw3PPyiWcM!5q8vuPpuLNq<fCUsK8L6~RRN z84?7_w}4;&s45?ykU*IUiVmj`%$i@CLi5aHC^U}_izU_xuA>Bl@QW9a$A-j6Z!8ih zI^3C%!6oaT=P(eJQ00U59!UZFB}$f^=W#$DC)!Z&e7ncx$fbW~FYyoBhxy)sJxz}P zO=>?wf<UJx<>TR}%Ex0v8YMTbZbgtleJ-Q4_#XHzC^WAm@R*fC^XRZxVntB#TcUtG zPsE*AVx`4P_ur-YWY!`6Yf{$!HX3%4DfK!=!Dhhb+dM9($FlVG`WeH&YH|E;Qu|W& z{bknW=6Lwt&L}v*;9HFm@oKi7{VfNZ4mBQbJlt@^IHc@H{fu!$W;kYhV)dIM=4Cx7 z5$KyOnhnmD&sK6&%~sFWsH1jPKk8=n!!&X1tp1Kn{dY;~E8YL^9o4^cs()jc=0=7l zzb)74%yhk+oA<9;9RC|{Uo6y;2?7VVb?@q!Zrj_kzxhD(fu{7uKZukZtZVIVjMk4u zGMDw(r@d#n;kxLZ^PdhpQFN;0Wa)|EiO}(~<K-L`$19Fis-x=IgWxBwwD6Pu83ia# zqKunS#crk-RHDI$&*YUk{x{zKR~TW*NI=22bK0!RwmO2cHzR|k!<+lIb?@xl-44Te 
z0g$vLW5KTamX5~$`e<ZzWIVD|S&!YsUcPF(>AdQG(tF-Fmw&e4OyTLGCyGy%oGLw8 zdNO!2bTV|JjN`%Z1CH+hXY_9lf!c)*4`W9nOfgK?|4rk6rTq*E0;3xSrh0`S5v1*V z5)36OZJ(0KgSt$vF6ymnZ)k3B>}eQ);b>%SBwbmL?Zkn-XL-r?oa5=7E1pZ<i+Sh$ zLh)=sIG!%nNuDY_^`Q8ft<RZ=pJo={Yj%BX{ov;Qtv%bicXa77xO>{BjY2dq)DK9m zDHtg0tZi#-Zfopn=&N5<Pg#%1n#forwXDY!(SyuqtuI3GhU=PJXL!MP-hVEC4vJ?B zb&jVW6hGvxCe!+yS^v%Ad(EyNSvRz4U~``k+#V;ms{?Ib$Z&1`U};ZfM_p@Ub8BN~ z17$tx2a!X{dI&{T97S|`1Nn0(v|@S9ddGIl@r>)~oTuDZaxZzG%)8*b;5#Q2l{B7x zVEjzh=gj(V7T;@j{mA6dhE<#TMTdppeDMl3A|@no@0fq6xVNmcx;@e&1UnkKk-~vE z!%-NDJ|m0dFQL$i_NL{oB6!nz9Vxu(Q4G;$T@s=A!1$T0&zbe#EWX$5`jN@u^@Eb_ zXNw{j9|_};;7KDQWhn~#gWVM!wQUg|Pa%jNtFj*TLrM-Q>miaz{tZ=VMWfJ)^)@ni z(|JQlq0Uf7M3Kbv4~(D5`kYz+&Ek8_t{<6*t{d7oK<NOb!y<({#Fiu-YDApy#R3B* zy=7fh?RBm75Tv961bdJ|%6il%N~4fJB3@_(9j4HV{g&fd=Q9))yyg~77CBVf{J{8` ztk0SC-z>h@?0UsdG#DLbCZiwoBcZ`@4`nF|28;VbNMKESSSQ%j&Pdpe6zZBRk1jDS zQD{Yb!}2QW-Qxt66zZC+7(Ot53krBFT%R-Rzgc{*+4UpiG0F8P{N{BybtF^_CrIi% zRMcPEQ{GjD4u@MCL<S*9Sr4p<NTG~|L-0!}w4w+~fdSEDkwTr}<p;(uO#zSO^*OWt zo5lB<T~GRU;ipTWK9ez|P|XPqJ*F&0{%|20tjc;&RDq%tieQ@}Xh`8ImH!}KXay@m zp%vybx9tW&84E>|uRbt-M*Qj$>vLxPH;eBzyPhO}!cVs$$)n(ckx-ST2&^jZ3nGD) z=&&TTPY_gk+#_Y8;FnTp1s#6bAgD&dxD-A>exLC4w&ePpS^v%Ad(Ezg-{i0m6dR&O z!MGjPrO=pF@0#?kk)-0H!SS*lZDL0nnnjB#N&!L9<6ihB6<Q(1yDGGjOi=0Z(+`ZF z$@-jG|IOli&8~;vgyN@baOx;14CA&0em+%}qJ**(B7xOX+z5h=Nd&Pb+M+@$=rDy= zm<=i|Mh0ahyzv0}Wn7;#>%Uoiui5p&j~UFo1~1fMWlK6m{@^!Ms5CfU)&m`0L=b){ zg;tP2DVC%Pttf&rpSkIL=7I9dus&zje_8*_6yIxhJ^XYFJg>pV(}-b5<o})VvJ~;M z9&rh*L5C?yk&r=b3H%lnT2VTjO7PhS$S>pioLT?%_0y%}d(Ey-;%C^9#X1Z@;it+{ z6j7EUsK!9CB9!&e39@S|<6xWOmr`g&%3LTN)(O6VEk*_(D8CHrb7uWF)}K?#`zFQr znq3b+-2#gaZ;IQH#X7uAm8Hm6WhqMZ!ZM73l=X<~uso3{f{NdQLMzN;UMDtRMnTGi z*c8DB$}hwEoLT?p*Po^Qa-zI%BEHw`dc{w-z{V^vZif?3BTA1IKYx4#OqM{=VHpKw zE(1Yri9#!J9ZnE@VEkNWJ<NRGe1|>r^^g2nKkjYIcwJ;O_xJjh#TtDkK0jjP-E@;F zQv;9sxr(^cz<u`PrzCsawmcEO&rg0xE~{dnEkH(=`+WUM`+srYisAqFd-|$kue^k< z^!rr8f7bpN8}O&tuKPVDSdF(v5DD~Zj{A|If6;!<g#Qkkx{dd9`J;bLHU2_n_)_>a 
z54pYW`Sq`wXz)i^k@a|V{T{xAUpY1;fHOzVy%YR5?Vq&&!Ir~II2Mc`%D;Cn;a7pD zK`}0kUV=qz_eS8i+OM_0W7Fo#xF7AoF(`Czmp}UN7Jijza0wFVLx-LBM)0@VZ?vCj zKi0m11fIrqrVCO2y?RyS#Z!p}2XSUAK!@G;M)2RYf6@Lw?XTIikwB8e$`}8k54wAh zH`w48LIMl1B3?#8NA?rqf7kw1`=$0j+4lMaT;R@2L=1k@4}I9@gCi*XWCScj@k9cB zxG>)X!GG8OP5Vbiz`q~`_B(Mm^dVfg%sp2hTo=C3mge~du_1+6kvv907nV4ybp_$~ zf3^Ru{T16@UqJ$&#GiZv4wyLdf1%y_oqf2@Pewq|U|j;eY81?h;D2bp)&7||^nXW# zzd@kFO{~ZsB8wzY!PI&1ch<>ee2|N7gwW3_4q>(M7*!!@J$Q{r7M+v#n!Kj$1#U|L zx?j#`RH{!&`JAJcu+8R1u7iva@($xY#=A_~e+W)nAy^v!_Chh0WUC4UX=Tu_UDUTM z_}$fpEPj{O%j~C=`y>=oD1C~9VYOzrFeVf*J_yD6_Xx{J<CRPQN)q2*q`;5^@fDX$ zuN9m%dY63Oq<?=!@K=u(a^$-4mG<!|#fqa)Qk?#l6x1`kqeK=cY*6&`o+9;KLVT)u zfQiZH|ISY5r0X3Nd(hu9I+wS^d6O^ra~$q2u<uc254b#T@)k?hSe;NQ_apQxEX$h^ zl;FA|hE-k;{>#I>$h#h;siKp)o%F3Z>F$>^?lmEMl4mFVc@uvBr2T|vw;Ak3ovqaA zhoQq`%~hXNz^GUJYwhnTNOYTbOt8jP4YFK$$hIhoUhdhW^m3p-#+@%g&yWwSMlJvA z>woFaoAmED+W*02`+00`mlm;>JBpmX9Jkx)L7Q{=EdRf1zu@WaE6faMh&LWkH>(i5 zPFXLs-rt&;$yfS3PurOr%Uu`Q5bY^JB$C(v7N0lyHuCiGV;qhz5x&`_HCU_crOtpW zPcgK6EV(i}{GSkhTZOPru~IF85#@ZmEce{wVui_f`JJ4$GdC6~^v4;>(?|09-@@}I zUrN@menQE=yV?_Ehv?E8tu?kXq%hy*g`vwM>ob3^{W(_TRqZk|xRnsaK0;?|5&gnP zOnu`hY$%UgNnt(=g<tCWU+Q@ie!}OcIE{V8%z2i;n=Y-%QfsR~3JYES9B+=SFZ=`R z3*SRV@9=aoqXOUt31e;Gqe-vwD87VHjvAkQir>QZzvS~K{nOX4{wq(vuOnXvl$B^^ z&Rk^=If|VHE`5FBZ*U9v7SG4Gu+=k!$*x0>dk7*7o3#GWp0+V^=b*<@Q77H{-~4%# zE{J^nf~U9d6ZUhP0Ly8f-8!{qOT=1jFL#tW<@xxZ(BQvB0>4U_>`g-8Bt&cjq0|FN z;UoJJ+vh@$<ryR0`k!&$q=O<~zta9z`+tzHw+K-+g0NdG^^AlScK!MIXAu4oGWr35 z+Sj2x!+z5y@>dL*wEj?^wlWcOCipF0|I^Q#bYWh;B=GEIba<Bi^igE6#nND{wF$pp zv%>l}Sn2N(tSIH?u91#oh8fQkp}>#yO9+YlEw%n9=S^Po@+IppZ!^N3X9a7D=Z#LS z6@qg9H_UN<s{J=sX#Plo`?MF>D>(zfY4Q*~;xB6wzjW(=rfYz@eEkD+yg%nT<6~wi z*IB{Zk9NvTuhsG&nv9VD$SnIOtg(I%#_#i<=O`4T6MV$^NnJ(Vf-GbG?>AVGzr#ZO z06On7OSwWalxZwP6oSI<U$G!RXN>$yBv6uLNU-}&X56Qd!N;%11RfK3OyDtr#{?b| zcue3ifyV?M6L?JEF@eVf9us&>;4y*61RfK3OyG+mu!_i*AF+D=E4&>4g_X>IV?F)1 ztZC}Ue`r(oyLkBgoE6UBEI9igtbP86>DT@ayBS|4Wr(zquKlByPgyzr6?-81XB(fZ 
zw{G6P=@<Ci-w=S{w}=t?U-a--aQZc*^qvaK0{4*RE<RL0VTJvdDQAWAZ%x15HPS{u z#oqY$*kMc8e%LmH@4;WOgZT?QIewkQSig&=AMg;o!O4!;^Z3XA0E+*EmG+;pANEW3 zB>pclX({l#df3(eGCnFlhT=~-`wPYSpA~7dum4r(wZiauq%d9kHg`3v^l!2^@?-W$ z{vKZhxl4N6au*+smzCX|Q|wH9#17b>(v!cYFMo&E_s=!0)ang6y6NNdNYQuD_5X?_ z{VhHH8GZeQY4`p!R~^KYW$jY!7uqd7w0;{Otk2kE`fGMFrEUEV{*!6@56c@Y?B(o6 zlV3v8zCpCh57`a-D@ft{63BH|*}K?}y@VIgN9=NcpWX1k<eV@zdx_uu^U$*`)qYU( zl(3sQg9qE&X!7^)<dC-i21)-va{nHgmjy@N_3Un6h2gts^B>ZeA8PncmfCUyt~y&6 zzEMxGFZMdSkl#X9eu%XErD^ZP^Zu(U{@0qm{W7h<iSOwI1h27k`5|q84=MjK`@ug! z&L6E;Et0T@eZ%YQAije}eM7@{Bc%B}?owyO+QqKzEFJ{6@x=I;bN`7E&FqyN|7*c3 zZBhFbcp2J?kj{+|yv!K#4!rey_ZL~*AG6Q@U7xSSYiBcYYUk0Xm*J$m*8+Br-&O8t zfZ#fOsOIovc^PdJhGwtS_+JaRV?q1ko#_!bn*qEMX7FIUg%{BWcyj!aF8^j9@X(zP z;iET!6rP|jH-(=_NUq0S<gBtcLU4k;?~_Q%Ewt@j({H~x{&zw965B~2O%a~EF-C|P zZ2e8Vh~80m=R2nDKa6iEWyUc|Uk;(kieIVbcX)g`rOp~#lcf_6w4Lbjd9?hF>DNye z|GS`lER^QR<u-NVd%7Kh=V<vw{9xWklON4jG14%Ehs7T8-;fb7V9j;A1FkYht*se? zYnU+#-7BWuOCSH6)V|0!feUu#I1xN|$GJB%j1brGB6<}md{ke0Z^m?fW%SVN^yK9f zIV<d8MR1hSYCk$|_F5|bH_^WEE5YB|;$^DTOjW9Ncu&sY!FCldqL<B@e~8Z)5F^<o z{KWscL~1R#-TAIkM!|@+1sNQ{8c&;k|E1%9<L!%gYmjIehZ6~`#p5)F-0z0q8Kj%> z_Yr<cJ-ax+Qas6XogQyap)2I5w%03y(o?fnrug4@`)Yj9+!ll|gx7cvp1YeEIfURP zv*sW2^B6|mdoqY{<k>xLw?C)YS?;K@HCS7b!Kmr?&m8}&+7}DV_@G&H_>gM6#s_#_ zk(#hGV!s~sm(EY}Z}=^_E+nu39j>s~#tE8rPxn8~5dSOfOMHn>bJ+MNh|hfowrd>U zQz7`Mzm~f`w*umOmvsx5yFRx<;(PC#;d0mKR!n^F{SsW}`rHbN@4Y{Q%UqvZQSrU^ zLvWevb1N*q_udIEbA4{b#rNJT!DX(`t-$!+dn35a^|=)p-+M0vW%Xbg>vJnKzBhY< z%UqwEkH1`D@x9p+T;}@ReEj8#i|@^j;4;_e^!Upa8sD1@!T9?9{jSdmzZD$cYfe!3 zEn|Id#mD!W5nSf_-0yvS?=lN`EVF!w+1-8k&o8rp$1=-@Jp6pjzTq+pcr3Ggh}pM$ z_|Gr1fX6b+hdlgz%)a3=3wSKEe2Ce%d-%^Uvw(;G_;TOnLmqxUX5TQ`fzRx{Y0k^^ zaT(=9(zN)9{VCOf@1DQBF@3zx@*(&9c@NzMiyipx=_?Aq)bxS(borzI=5!8x_s}ch zw=jL+JzW0izcsxBU)KE+e)0f)pXEcc{+x&QjY~Q3W%-4dN*|cz<&XZ`mUiIFs&59r z#pwgHy8O|9Qzi$#EP9tIeP9-sKl*RU?7(N*zj=P?(g&Je{^-BK)PXN^7j=I3T|Ojp z;}64GvjAYT0A7<m*ry1Z%Md_pIZDvmN)7-v3*a^3=Ps}eL2$_cVp|v|)0G?mY!<+4 
z!mq&M^5YV|WB{?-hRgFx4gfX{;LQxMRP$OLAqs#j5kOq1S?us#xdFhY0lb;{iSvg$ z*Iq>^N!kEn+zT}xPF8LJuxS8qW`1h@+2#t^Bp4@605R@`QkiAt1^_3o|D_G!&CE}Q z4{L6Zv(y@a;GzIx<z6V^mn$^@ICcFmO#p9Ze!BaX$L{hw%B>9$To6F4+zU08XMbe| z054wuTM)o&#!rP0iv$+gtE`O>ObH;4yBDt90N`}%e<=aH=KKgB);x|JpR?3fV`)ND zlLCn2?u8oFMpkA3aQgMXqyXN`{0#S_Tr1Uj0**3Ut)*E7O2-3;<L-qTl?Ya50Ps@l ze~AEIbAGz}k;j>nhYnZR!ir!#fH>h^s1ZT95(9vjUjK^+@S5=x4JCY7%e5ha`OZ>C z!HA^=8B_tp3HL&o#jL~t;7sd(Du6c|eky!eLjt|7LPyA6ZLL=X2_QD~o63VLGXOaA z`k!eDT7CaWg%4{UyW5@PcNRO!Z8g>gODi%MO>!^PsJOK90)Usj{%7pJp}`i93kfVh zhbwHgIzizZ5C4_=p(`%{IJ0^_KKq!!V*-x}JSOm%z+(cB2|On7n80HKj|n^`@R-13 z0*?tiCh(ZRV*(E`0T(+HA+`>yIK<a+;bEV07Zg}Z*~6&7k6zxD`aZL+jgkww4+o(( zvg<BvTYk43wT0cbLSmH4)F&99VSI{N`(ONdD+Ej9-(Kfqe0S-ecGoiG@_VfT-V;(3 zh2wlL&AxvtIbj=N*u?+ySIv{e_isk4w%9%R!4xZsa;NmqkoP|tuUx#ON)q2*_!xb~ zcl55-;|RHPb8SA}RjfFcK~nFhS^xT==T~<pn7;j_1c5euvHW~Sk@_wnt|~KY{-HUa z!`8!MI+YqFB>n{N$8LL-E8zBc?74{^mck=s`u%&5y)w2Y^OkCVQG!4R&M<i-G!~A< zrtLpGZ*a1KEqA{->F$?n6odH*B|&DiE?b?m)a8evBiAbLEMT+Dynk(IZUsDKdpB+S zX%hrmG@<B)`J?o5pg+c)FF{Y0eEo(pCZACPWslZnjX27kML9k<&Oo`gT;6NCeTmhm zr5!!~H*NbF5(HYI_-MWCY{AN17ugc+DbXlF^h=7Fd`K$mtq^RmS0ROgoIGKOHe1a4 zmm2?@ru`L`AkeP4loaavS)@_WOf>jASdl02e(bU~+G-qS&LUTSju(ce=}C<JT_gT? zQTr<{L7<J1N*{aj#kW@a_frb$d_*$(S-c;+EKRmrdj(QhnB&j!n#<q9_}>NXufPO> z(!(4a^2BDU^i@Bj=++-n_~<glXC|~xOS3I(uX2Q(#V!D6e(C-jlD<5jk->$Ffw&nY zwZ9@01lkz6bI@bNOXTAh6oL9a1z~Pem}466M+jOY_G+w2X=ZwQ`uxT5zbWm@2=Vun zI$BW)0>$>Z&|~<C2lU@kvg})wL%Kofo}H9W>9n+1>uojm3Uhv9LCUegMT`b{wCGea zm}p=3ulX5wXoV#RwBi!#6n=($e1-(Rj1JFId}dS;Y_Qd4%WrY~Z@his_iM`K{S8+B zdz6iOhtfUQD4#OJjA@E80*~-ZdPs@T-%?Id<l~2w+j^U_QRgY$GllnKr==B-dXtwt z>52zd8S_bzKp*334zg|Kx<vacHbEf0)aBzJC^q=#6ny)bqD0pz%dwxiOiT!x;8SZZ zI=^)Bzp8y1VgHd?`cEi%_&x5?`+UZ86rVYR44N0XG5Z2v^m*a;8!X7*F~a|V-n~oF ztt%8hnx^<n6c0#~mo9%9;(u3og23gj&#i#?UK0s1IlavFxfK!LYl0}_>1D3Zt&sTM zjEFLxT;}@Riiz*d2r0wKWv<Vyp!nVlkTRTF=K9<Ui|@?<YKc?JT%TKU@x4o6TJq1! 
zT%TKk@x4o;N_%3N>vJnIzBetXCH}a~^|=)q-@62?G=H97f4Q&qIeGS_yzk13?@fa$ z?Vsk?U+!ytPWY+vzAG=jH!ZG3f7I7s?t6W1MaTCpf|ceE`ufW+*+cqotN{KID_Cz( xTKj5zb!khQD^mWn!sB~W`kMBaWft&Q@$tQBp(Xusnd@`L{x9WyS7dze{|`Wm#zg=C literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin b/Ryujinx.Graphics.OpenGL/Effects/Textures/SmaaSearchTexture.bin new file mode 100644 index 0000000000000000000000000000000000000000..db5bf73f7d5a0b5e436d336849c90bfbc24d76dc GIT binary patch literal 1024 zcmezOkD<Pvf#Dy7Vt@dk2uKi2{R1+90K~@zpc={6kIhU{#3;3&QvIa3l@@A|qY7?5 evLH0#aK#_8QgZae^^nP+)P73!lj-bXqYVIqI9W{q literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj b/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj index 9fd2c48a5..2313cc68f 100644 --- a/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj +++ b/Ryujinx.Graphics.OpenGL/Ryujinx.Graphics.OpenGL.csproj @@ -9,6 +9,20 @@ <PackageReference Include="OpenTK.Graphics" /> </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Effects\Textures\SmaaAreaTexture.bin" /> + <EmbeddedResource Include="Effects\Textures\SmaaSearchTexture.bin" /> + <EmbeddedResource Include="Effects\Shaders\fsr_sharpening.glsl" /> + <EmbeddedResource Include="Effects\Shaders\fxaa.glsl" /> + <EmbeddedResource Include="Effects\Shaders\smaa.hlsl" /> + <EmbeddedResource Include="Effects\Shaders\smaa_blend.glsl" /> + <EmbeddedResource Include="Effects\Shaders\smaa_edge.glsl" /> + <EmbeddedResource Include="Effects\Shaders\smaa_neighbour.glsl" /> + <EmbeddedResource Include="Effects\Shaders\ffx_fsr1.h" /> + <EmbeddedResource Include="Effects\Shaders\ffx_a.h" /> + <EmbeddedResource Include="Effects\Shaders\fsr_scaling.glsl" /> + </ItemGroup> + <ItemGroup> <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" /> <ProjectReference Include="..\Ryujinx.Graphics.GAL\Ryujinx.Graphics.GAL.csproj" /> diff --git a/Ryujinx.Graphics.OpenGL/Window.cs b/Ryujinx.Graphics.OpenGL/Window.cs index 
8f7917f91..d6606f392 100644 --- a/Ryujinx.Graphics.OpenGL/Window.cs +++ b/Ryujinx.Graphics.OpenGL/Window.cs @@ -1,5 +1,7 @@ using OpenTK.Graphics.OpenGL; using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.OpenGL.Effects; +using Ryujinx.Graphics.OpenGL.Effects.Smaa; using Ryujinx.Graphics.OpenGL.Image; using System; @@ -7,14 +9,24 @@ namespace Ryujinx.Graphics.OpenGL { class Window : IWindow, IDisposable { - private const int TextureCount = 3; private readonly OpenGLRenderer _renderer; private bool _initialized; private int _width; private int _height; + private bool _updateSize; private int _copyFramebufferHandle; + private IPostProcessingEffect _antiAliasing; + private IScalingFilter _scalingFilter; + private bool _isLinear; + private AntiAliasing _currentAntiAliasing; + private bool _updateEffect; + private ScalingFilter _currentScalingFilter; + private float _scalingFilterLevel; + private bool _updateScalingFilter; + private bool _isBgra; + private TextureView _upscaledTexture; internal BackgroundContextWorker BackgroundContext { get; private set; } @@ -48,6 +60,8 @@ namespace Ryujinx.Graphics.OpenGL { _width = width; _height = height; + + _updateSize = true; } private void CopyTextureToFrameBufferRGB(int drawFramebuffer, int readFramebuffer, TextureView view, ImageCrop crop, Action swapBuffersCallback) @@ -57,6 +71,32 @@ namespace Ryujinx.Graphics.OpenGL TextureView viewConverted = view.Format.IsBgr() ? 
_renderer.TextureCopy.BgraSwap(view) : view; + UpdateEffect(); + + if (_antiAliasing != null) + { + var oldView = viewConverted; + + viewConverted = _antiAliasing.Run(viewConverted, _width, _height); + + if (viewConverted.Format.IsBgr()) + { + var swappedView = _renderer.TextureCopy.BgraSwap(viewConverted); + + viewConverted?.Dispose(); + + viewConverted = swappedView; + } + + if (viewConverted != oldView && oldView != view) + { + oldView.Dispose(); + } + } + + GL.BindFramebuffer(FramebufferTarget.DrawFramebuffer, drawFramebuffer); + GL.BindFramebuffer(FramebufferTarget.ReadFramebuffer, readFramebuffer); + GL.FramebufferTexture( FramebufferTarget.ReadFramebuffer, FramebufferAttachment.ColorAttachment0, @@ -71,12 +111,12 @@ namespace Ryujinx.Graphics.OpenGL GL.Clear(ClearBufferMask.ColorBufferBit); int srcX0, srcX1, srcY0, srcY1; - float scale = view.ScaleFactor; + float scale = viewConverted.ScaleFactor; if (crop.Left == 0 && crop.Right == 0) { srcX0 = 0; - srcX1 = (int)(view.Width / scale); + srcX1 = (int)(viewConverted.Width / scale); } else { @@ -87,7 +127,7 @@ namespace Ryujinx.Graphics.OpenGL if (crop.Top == 0 && crop.Bottom == 0) { srcY0 = 0; - srcY1 = (int)(view.Height / scale); + srcY1 = (int)(viewConverted.Height / scale); } else { @@ -125,6 +165,42 @@ namespace Ryujinx.Graphics.OpenGL ScreenCaptureRequested = false; } + if (_scalingFilter != null) + { + if (viewConverted.Format.IsBgr() && !_isBgra) + { + RecreateUpscalingTexture(true); + } + + _scalingFilter.Run( + viewConverted, + _upscaledTexture, + _width, + _height, + new Extents2D( + srcX0, + srcY0, + srcX1, + srcY1), + new Extents2D( + dstX0, + dstY0, + dstX1, + dstY1) + ); + + srcX0 = dstX0; + srcY0 = dstY0; + srcX1 = dstX1; + srcY1 = dstY1; + + GL.FramebufferTexture( + FramebufferTarget.ReadFramebuffer, + FramebufferAttachment.ColorAttachment0, + _upscaledTexture.Handle, + 0); + } + GL.BlitFramebuffer( srcX0, srcY0, @@ -135,7 +211,7 @@ namespace Ryujinx.Graphics.OpenGL dstX1, dstY1, 
ClearBufferMask.ColorBufferBit, - BlitFramebufferFilter.Linear); + _isLinear ? BlitFramebufferFilter.Linear : BlitFramebufferFilter.Nearest); // Remove Alpha channel GL.ColorMask(false, false, false, true); @@ -209,6 +285,135 @@ namespace Ryujinx.Graphics.OpenGL _copyFramebufferHandle = 0; } + + _antiAliasing?.Dispose(); + _scalingFilter?.Dispose(); + _upscaledTexture?.Dispose(); + } + + public void SetAntiAliasing(AntiAliasing effect) + { + if (_currentAntiAliasing == effect && _antiAliasing != null) + { + return; + } + + _currentAntiAliasing = effect; + + _updateEffect = true; + } + + public void SetScalingFilter(ScalingFilter type) + { + if (_currentScalingFilter == type && _antiAliasing != null) + { + return; + } + + _currentScalingFilter = type; + + _updateScalingFilter = true; + } + + private void UpdateEffect() + { + if (_updateEffect) + { + _updateEffect = false; + + switch (_currentAntiAliasing) + { + case AntiAliasing.Fxaa: + _antiAliasing?.Dispose(); + _antiAliasing = new FxaaPostProcessingEffect(_renderer); + break; + case AntiAliasing.None: + _antiAliasing?.Dispose(); + _antiAliasing = null; + break; + case AntiAliasing.SmaaLow: + case AntiAliasing.SmaaMedium: + case AntiAliasing.SmaaHigh: + case AntiAliasing.SmaaUltra: + var quality = _currentAntiAliasing - AntiAliasing.SmaaLow; + if (_antiAliasing is SmaaPostProcessingEffect smaa) + { + smaa.Quality = quality; + } + else + { + _antiAliasing?.Dispose(); + _antiAliasing = new SmaaPostProcessingEffect(_renderer, quality); + } + break; + } + } + + if (_updateSize && !_updateScalingFilter) + { + RecreateUpscalingTexture(); + } + + _updateSize = false; + + if (_updateScalingFilter) + { + _updateScalingFilter = false; + + switch (_currentScalingFilter) + { + case ScalingFilter.Bilinear: + case ScalingFilter.Nearest: + _scalingFilter?.Dispose(); + _scalingFilter = null; + _isLinear = _currentScalingFilter == ScalingFilter.Bilinear; + _upscaledTexture?.Dispose(); + _upscaledTexture = null; + break; + case 
ScalingFilter.Fsr: + if (_scalingFilter is not FsrScalingFilter) + { + _scalingFilter?.Dispose(); + _scalingFilter = new FsrScalingFilter(_renderer, _antiAliasing); + } + _isLinear = false; + _scalingFilter.Level = _scalingFilterLevel; + + RecreateUpscalingTexture(); + break; + } + } + } + + private void RecreateUpscalingTexture(bool forceBgra = false) + { + _upscaledTexture?.Dispose(); + + var info = new TextureCreateInfo( + _width, + _height, + 1, + 1, + 1, + 1, + 1, + 1, + Format.R8G8B8A8Unorm, + DepthStencilMode.Depth, + Target.Texture2D, + forceBgra ? SwizzleComponent.Blue : SwizzleComponent.Red, + SwizzleComponent.Green, + forceBgra ? SwizzleComponent.Red : SwizzleComponent.Blue, + SwizzleComponent.Alpha); + + _isBgra = forceBgra; + _upscaledTexture = _renderer.CreateTexture(info, 1) as TextureView; + } + + public void SetScalingFilterLevel(float level) + { + _scalingFilterLevel = level; + _updateScalingFilter = true; } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs b/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs index 9ac2e61de..19a085023 100644 --- a/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs +++ b/Ryujinx.Graphics.Vulkan/DescriptorSetUpdater.cs @@ -163,6 +163,13 @@ namespace Ryujinx.Graphics.Vulkan SignalDirty(DirtyFlags.Image); } + public void SetImage(int binding, Auto<DisposableImageView> image) + { + _imageRefs[binding] = image; + + SignalDirty(DirtyFlags.Image); + } + public void SetStorageBuffers(CommandBuffer commandBuffer, ReadOnlySpan<BufferAssignment> buffers) { for (int i = 0; i < buffers.Length; i++) diff --git a/Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs b/Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs new file mode 100644 index 000000000..a12070592 --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/FsrScalingFilter.cs @@ -0,0 +1,208 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using 
Silk.NET.Vulkan; +using System; +using Extent2D = Ryujinx.Graphics.GAL.Extents2D; + +namespace Ryujinx.Graphics.Vulkan.Effects +{ + internal partial class FsrScalingFilter : IScalingFilter + { + private readonly VulkanRenderer _renderer; + private PipelineHelperShader _pipeline; + private ISampler _sampler; + private ShaderCollection _scalingProgram; + private ShaderCollection _sharpeningProgram; + private float _sharpeningLevel = 1; + private Device _device; + private TextureView _intermediaryTexture; + + public float Level + { + get => _sharpeningLevel; + set + { + _sharpeningLevel = MathF.Max(0.01f, value); + } + } + + public FsrScalingFilter(VulkanRenderer renderer, Device device) + { + _device = device; + _renderer = renderer; + + Initialize(); + } + + public void Dispose() + { + _pipeline.Dispose(); + _scalingProgram.Dispose(); + _sharpeningProgram.Dispose(); + _sampler.Dispose(); + _intermediaryTexture?.Dispose(); + } + + public void Initialize() + { + _pipeline = new PipelineHelperShader(_renderer, _device); + + _pipeline.Initialize(); + + var scalingShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv"); + var sharpeningShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv"); + + var computeBindings = new ShaderBindings( + new[] { 2 }, + Array.Empty<int>(), + new[] { 1 }, + new[] { 0 }); + + var sharpeningBindings = new ShaderBindings( + new[] { 2, 3, 4 }, + Array.Empty<int>(), + new[] { 1 }, + new[] { 0 }); + + _sampler = _renderer.CreateSampler(GAL.SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _scalingProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(scalingShader, computeBindings, ShaderStage.Compute, TargetLanguage.Spirv) + }); + + _sharpeningProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(sharpeningShader, sharpeningBindings, ShaderStage.Compute, TargetLanguage.Spirv) + }); + } + + public void Run( 
+ TextureView view, + CommandBufferScoped cbs, + Auto<DisposableImageView> destinationTexture, + Silk.NET.Vulkan.Format format, + int width, + int height, + Extent2D source, + Extent2D destination) + { + if (_intermediaryTexture == null + || _intermediaryTexture.Info.Width != width + || _intermediaryTexture.Info.Height != height + || !_intermediaryTexture.Info.Equals(view.Info)) + { + var originalInfo = view.Info; + + var swapRB = originalInfo.Format.IsBgr() && originalInfo.SwizzleR == SwizzleComponent.Red; + + var info = new TextureCreateInfo( + width, + height, + originalInfo.Depth, + originalInfo.Levels, + originalInfo.Samples, + originalInfo.BlockWidth, + originalInfo.BlockHeight, + originalInfo.BytesPerPixel, + originalInfo.Format, + originalInfo.DepthStencilMode, + originalInfo.Target, + swapRB ? originalInfo.SwizzleB : originalInfo.SwizzleR, + originalInfo.SwizzleG, + swapRB ? originalInfo.SwizzleR : originalInfo.SwizzleB, + originalInfo.SwizzleA); + _intermediaryTexture?.Dispose(); + _intermediaryTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView; + } + + Span<GAL.Viewport> viewports = stackalloc GAL.Viewport[1]; + Span<Rectangle<int>> scissors = stackalloc Rectangle<int>[1]; + + viewports[0] = new GAL.Viewport( + new Rectangle<float>(0, 0, view.Width, view.Height), + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + scissors[0] = new Rectangle<int>(0, 0, view.Width, view.Height); + + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_scalingProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _sampler); + + float srcWidth = Math.Abs(source.X2 - source.X1); + float srcHeight = Math.Abs(source.Y2 - source.Y1); + float scaleX = srcWidth / view.Width; + float scaleY = srcHeight / view.Height; + + ReadOnlySpan<float> dimensionsBuffer = stackalloc float[] + { + source.X1, + source.X2, + source.Y1, + source.Y2, + destination.X1, + 
destination.X2, + destination.Y1, + destination.Y2, + scaleX, + scaleY + }; + + int rangeSize = dimensionsBuffer.Length * sizeof(float); + var bufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, rangeSize, false); + _renderer.BufferManager.SetData(bufferHandle, 0, dimensionsBuffer); + + ReadOnlySpan<float> sharpeningBuffer = stackalloc float[] { 1.5f - (Level * 0.01f * 1.5f)}; + var sharpeningBufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, sizeof(float), false); + _renderer.BufferManager.SetData(sharpeningBufferHandle, 0, sharpeningBuffer); + + int threadGroupWorkRegionDim = 16; + int dispatchX = (width + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + int dispatchY = (height + (threadGroupWorkRegionDim - 1)) / threadGroupWorkRegionDim; + + var bufferRanges = new BufferRange(bufferHandle, 0, rangeSize); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) }); + _pipeline.SetScissors(scissors); + _pipeline.SetViewports(viewports, false); + _pipeline.SetImage(0, _intermediaryTexture, GAL.Format.R8G8B8A8Unorm); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + viewports[0] = new GAL.Viewport( + new Rectangle<float>(0, 0, width, height), + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + scissors[0] = new Rectangle<int>(0, 0, width, height); + + // Sharpening pass + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_sharpeningProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, _intermediaryTexture, _sampler); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) }); + var sharpeningRange = new BufferRange(sharpeningBufferHandle, 0, sizeof(float)); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(4, sharpeningRange) }); + _pipeline.SetScissors(scissors); + _pipeline.SetViewports(viewports, false); + 
_pipeline.SetImage(0, destinationTexture); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + + _renderer.BufferManager.Delete(bufferHandle); + _renderer.BufferManager.Delete(sharpeningBufferHandle); + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs b/Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs new file mode 100644 index 000000000..0f6a0a7ba --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/FxaaPostProcessingEffect.cs @@ -0,0 +1,127 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Silk.NET.Vulkan; +using System; + +namespace Ryujinx.Graphics.Vulkan.Effects +{ + internal partial class FxaaPostProcessingEffect : IPostProcessingEffect + { + private readonly VulkanRenderer _renderer; + private ISampler _samplerLinear; + private ShaderCollection _shaderProgram; + + private PipelineHelperShader _pipeline; + private TextureView _texture; + + public FxaaPostProcessingEffect(VulkanRenderer renderer, Device device) + { + _renderer = renderer; + _pipeline = new PipelineHelperShader(renderer, device); + + Initialize(); + } + + public void Dispose() + { + _shaderProgram.Dispose(); + _pipeline.Dispose(); + _samplerLinear.Dispose(); + _texture?.Dispose(); + } + + private void Initialize() + { + _pipeline.Initialize(); + + var shader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv"); + + var computeBindings = new ShaderBindings( + new[] { 2 }, + Array.Empty<int>(), + new[] { 1 }, + new[] { 0 }); + + _samplerLinear = _renderer.CreateSampler(GAL.SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _shaderProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(shader, computeBindings, ShaderStage.Compute, TargetLanguage.Spirv) + }); + } + + public TextureView Run(TextureView view, 
CommandBufferScoped cbs, int width, int height) + { + if (_texture == null || _texture.Width != view.Width || _texture.Height != view.Height) + { + _texture?.Dispose(); + + var info = view.Info; + + if (view.Info.Format.IsBgr()) + { + info = new TextureCreateInfo(info.Width, + info.Height, + info.Depth, + info.Levels, + info.Samples, + info.BlockWidth, + info.BlockHeight, + info.BytesPerPixel, + info.Format, + info.DepthStencilMode, + info.Target, + info.SwizzleB, + info.SwizzleG, + info.SwizzleR, + info.SwizzleA); + } + _texture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView; + } + + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_shaderProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear); + + ReadOnlySpan<float> resolutionBuffer = stackalloc float[] { view.Width, view.Height }; + int rangeSize = resolutionBuffer.Length * sizeof(float); + var bufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, rangeSize, false); + + _renderer.BufferManager.SetData(bufferHandle, 0, resolutionBuffer); + + var bufferRanges = new BufferRange(bufferHandle, 0, rangeSize); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) }); + + Span<GAL.Viewport> viewports = stackalloc GAL.Viewport[1]; + + viewports[0] = new GAL.Viewport( + new Rectangle<float>(0, 0, view.Width, view.Height), + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + Span<Rectangle<int>> scissors = stackalloc Rectangle<int>[1]; + + var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize); + var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize); + + _pipeline.SetScissors(stackalloc[] { new Rectangle<int>(0, 0, view.Width, view.Height) }); + _pipeline.SetViewports(viewports, false); + + _pipeline.SetImage(0, _texture, GAL.Format.R8G8B8A8Unorm); + 
_pipeline.DispatchCompute(dispatchX, dispatchY, 1); + + _renderer.BufferManager.Delete(bufferHandle); + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + + return _texture; + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs b/Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs new file mode 100644 index 000000000..d36cf01d4 --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/IPostProcessingEffect.cs @@ -0,0 +1,10 @@ +using System; + +namespace Ryujinx.Graphics.Vulkan.Effects +{ + internal interface IPostProcessingEffect : IDisposable + { + const int LocalGroupSize = 64; + TextureView Run(TextureView view, CommandBufferScoped cbs, int width, int height); + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs b/Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs new file mode 100644 index 000000000..54f809d71 --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/IScalingFilter.cs @@ -0,0 +1,20 @@ +using Silk.NET.Vulkan; +using System; +using Extent2D = Ryujinx.Graphics.GAL.Extents2D; + +namespace Ryujinx.Graphics.Vulkan.Effects +{ + internal interface IScalingFilter : IDisposable + { + float Level { get; set; } + void Run( + TextureView view, + CommandBufferScoped cbs, + Auto<DisposableImageView> destinationTexture, + Format format, + int width, + int height, + Extent2D source, + Extent2D destination); + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl new file mode 100644 index 000000000..5eb74b3d1 --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.glsl @@ -0,0 +1,3945 @@ +// Scaling + +#version 430 core +layout (local_size_x = 64) in; +layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput; +layout( binding = 1, set = 2) uniform sampler2D Source; +layout( binding = 2 ) uniform dimensions{ + float srcX0; + float srcX1; + 
float srcY0; + float srcY1; + float dstX0; + float dstX1; + float dstY0; + float dstY1; + float scaleX; + float scaleY; +}; + +#define A_GPU 1 +#define A_GLSL 1 +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). 
+// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). 
+// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<type><#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<type><#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. +// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. 
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// 
+//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). + typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Get CPU and GPU to share all setup code, without duplicate code paths. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). 
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT + #define inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 
AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs((long long)ASL1_(a)));} // llabs, not labs: 'long' is 32-bit on MSVC (LLP64) and would truncate the 64-bit operand. + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 
ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} 
+//------------------------------------------------------------------------------------------------------------------------------ + // Convention: A integer types carry no signedness of their own; operands are reinterpreted as signed only inside the operation. + A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;} + A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;} + A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;} + A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;} + A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;} + A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));} + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 
ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} + A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} + A_STATIC AF1 
ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 
opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 
a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 
c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} 
+//==============================================================================================================================
+ A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} // d[i]=AMinD1(a[i],b[i]) per component; returns d.
+ A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;}
+ A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} // Same as above using AMinF1.
+ A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;}
+ A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} // d[i]=a[i]*b[i] per component; returns d.
+ A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
+ A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;}
+ A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;}
+ A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;} // d[i]=a[i]*b with a single scalar b; returns d.
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;return d;}
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]*b;d[1]=a[1]*b;d[2]=a[2]*b;d[3]=a[3]*b;return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){d[0]=-a[0];d[1]=-a[1];return d;} // d[i]=-a[i] per component; returns d.
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){d[0]=-a[0];d[1]=-a[1];return d;}
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){d[0]=-a[0];d[1]=-a[1];d[2]=-a[2];d[3]=-a[3];return d;}
+//==============================================================================================================================
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);return d;} // d[i]=ARcpD1(a[i]) per component; returns d.
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){d[0]=ARcpD1(a[0]);d[1]=ARcpD1(a[1]);d[2]=ARcpD1(a[2]);d[3]=ARcpD1(a[3]);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);return d;}
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){d[0]=ARcpF1(a[0]);d[1]=ARcpF1(a[1]);d[2]=ARcpF1(a[2]);d[3]=ARcpF1(a[3]);return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// HALF FLOAT PACKING
+//==============================================================================================================================
+  // Convert float to half (in lower 16-bits of output). Table driven: both tables are indexed by 'u>>23' of the float's bits.
+  // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+  // Supports denormals. The mantissa is shifted right (truncated), it is not rounded.
+  // Conversion rules are to make computations possibly "safer" on the GPU,
+  //  -INF & -NaN -> -65504 (table entry 0xfbff with shift 0x18, so no mantissa bits are added)
+  //  +INF & +NaN -> +65504 (table entry 0x7bff with shift 0x18, so no mantissa bits are added)
+  A_STATIC AU1 AU1_AH1_AF1(AF1 f){
+   static AW1 base[512]={ // Half bits contributed by the index: entries 0..255 have bit 15 clear, entries 256..511 have bit 15 set.
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,
+    0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100,
+    0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00,
+    0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,
+    0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100,
+    0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00,
+    0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,
+    0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff};
+   static AB1 shift[512]={ // Right shift applied to the 23-bit mantissa; 0x18 (24) discards every mantissa bit.
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+    0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+    0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f,
+    0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,
+    0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,
+    0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18};
+   union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} // Read the float's bits through a union; result = base[i] + (low 23 bits >> shift[i]). Assumes AF1/AU1 are 32-bit so 'i' stays below 512 (typedefs not visible here).
+//------------------------------------------------------------------------------------------------------------------------------
+  // Used to output packed constant. a[0] goes in the low 16 bits, a[1] is shifted into the high 16 bits.
+  A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);}
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+// GLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_GLSL) && defined(A_GPU)
+ #ifndef A_SKIP_EXT
+  #ifdef A_HALF
+   #extension GL_EXT_shader_16bit_storage:require
+   #extension GL_EXT_shader_explicit_arithmetic_types:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_LONG
+   #extension GL_ARB_gpu_shader_int64:require
+   #extension GL_NV_shader_atomic_int64:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_WAVE
+   #extension GL_KHR_shader_subgroup_arithmetic:require
+   #extension GL_KHR_shader_subgroup_ballot:require
+   #extension GL_KHR_shader_subgroup_quad:require
+   #extension GL_KHR_shader_subgroup_shuffle:require
+  #endif
+ #endif
+//==============================================================================================================================
+  #define AP1 bool
+  #define AP2 bvec2
+  #define AP3 bvec3
+  #define AP4 bvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float
+  #define AF2 vec2
+  #define AF3 vec3
+  #define AF4 vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint
+  #define AU2 uvec2
+  #define AU3 uvec3
+  #define AU4 uvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int
+  #define ASU2 ivec2
+  #define ASU3 ivec3
+  #define ASU4 ivec4
+//==============================================================================================================================
+  #define AF1_AU1(x) uintBitsToFloat(AU1(x)) // Bit-pattern reinterpretation (uint -> float), not a numeric conversion.
+  #define AF2_AU2(x) uintBitsToFloat(AU2(x))
+  #define AF3_AU3(x) uintBitsToFloat(AU3(x))
+  #define AF4_AU4(x) uintBitsToFloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1_AF1(x) floatBitsToUint(AF1(x)) // Bit-pattern reinterpretation (float -> uint).
+  #define AU2_AF2(x) floatBitsToUint(AF2(x))
+  #define AU3_AF3(x) floatBitsToUint(AF3(x))
+  #define AU4_AF4(x) floatBitsToUint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));} // Packs 'a' as the first component and 0.0 as the second.
+  #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1_AH2_AF2 packHalf2x16
+  #define AU1_AW2Unorm_AF2 packUnorm2x16
+  #define AU1_AB4Unorm_AF4 packUnorm4x8
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF2_AH2_AU1 unpackHalf2x16
+  #define AF2_AW2Unorm_AU1 unpackUnorm2x16
+  #define AF4_AB4Unorm_AU1 unpackUnorm4x8
+//==============================================================================================================================
+  AF1 AF1_x(AF1 a){return AF1(a);} // The *_x helpers replicate one scalar into every component; the *_() macros cast the argument first.
+  AF2 AF2_x(AF1 a){return AF2(a,a);}
+  AF3 AF3_x(AF1 a){return AF3(a,a,a);}
+  AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
+  #define AF1_(a) AF1_x(AF1(a))
+  #define AF2_(a) AF2_x(AF1(a))
+  #define AF3_(a) AF3_x(AF1(a))
+  #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AU1_x(AU1 a){return AU1(a);}
+  AU2 AU2_x(AU1 a){return AU2(a,a);}
+  AU3 AU3_x(AU1 a){return AU3(a,a,a);}
+  AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
+  #define AU1_(a) AU1_x(AU1(a))
+  #define AU2_(a) AU2_x(AU1(a))
+  #define AU3_(a) AU3_x(AU1(a))
+  #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+  AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} // 'S' variants: operands are cast to the signed type, operated on, then cast back to unsigned.
+  AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+  AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+  AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));} // Extract 'bits' bits of 'src' starting at bit 'off'.
+  AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} // Take 'ins' where 'mask' bits are set, 'src' elsewhere.
+  // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+  AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // V_MED3_F32.
+  AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
+  AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
+  AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
+  AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // V_FRACT_F32 (note DX frac() is different).
+  AF1 AFractF1(AF1 x){return fract(x);}
+  AF2 AFractF2(AF2 x){return fract(x);}
+  AF3 AFractF3(AF3 x){return fract(x);}
+  AF4 AFractF4(AF4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);}
+  AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
+  AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
+  AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // V_MAX3_F32.
+  AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
+  AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+  AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+  AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
+  AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+  AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+  AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
+  AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+  AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+  AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
+  AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+  AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+  AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Clamp has an easier pattern match for med3 when some ordering is known.
+  // V_MED3_F32. Median of three written with min/max only.
+  AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
+  AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+  AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+  AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // V_MIN3_F32.
+  AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
+  AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+  AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+  AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
+  AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+  AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+  AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
+  AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+  AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+  AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
+  AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+  AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+  AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+  // V_COS_F32. The input is scaled by A_2PI (defined outside this section) before calling cos().
+  AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
+  AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+  AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+  AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+  // V_SIN_F32. The input is scaled by A_2PI (defined outside this section) before calling sin().
+  AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
+  AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+  AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+  AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;} // Reciprocal written as a plain division.
+  AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
+  AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
+  AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);} // Reciprocal square root written as 1.0/sqrt(x).
+  AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
+  AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
+  AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));} // Saturate: clamp to the range 0.0 to 1.0.
+  AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
+  AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
+  AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} // Right shift performed on the signed type, result cast back to unsigned.
+  AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+  AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+  AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// GLSL BYTE (8-bit integer types and splat helpers, compiled only when A_BYTE is defined)
+//==============================================================================================================================
+ #ifdef A_BYTE
+  #define AB1 uint8_t
+  #define AB2 u8vec2
+  #define AB3 u8vec3
+  #define AB4 u8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASB1 int8_t
+  #define ASB2 i8vec2
+  #define ASB3 i8vec3
+  #define ASB4 i8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AB1 AB1_x(AB1 a){return AB1(a);}
+  AB2 AB2_x(AB1 a){return AB2(a,a);}
+  AB3 AB3_x(AB1 a){return AB3(a,a,a);}
+  AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
+  #define AB1_(a) AB1_x(AB1(a))
+  #define AB2_(a) AB2_x(AB1(a))
+  #define AB3_(a) AB3_x(AB1(a))
+  #define AB4_(a) AB4_x(AB1(a))
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// GLSL HALF (16-bit float / unsigned / signed types and helpers, compiled only when A_HALF is defined)
+//==============================================================================================================================
+ #ifdef A_HALF
+  #define AH1 float16_t
+  #define AH2 f16vec2
+  #define AH3 f16vec3
+  #define AH4 f16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AW1 uint16_t
+  #define AW2 u16vec2
+  #define AW3 u16vec3
+  #define AW4 u16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASW1 int16_t
+  #define ASW2 i16vec2
+  #define ASW3 i16vec3
+  #define ASW4 i16vec4
+//==============================================================================================================================
+  #define AH2_AU1(x) unpackFloat2x16(AU1(x)) // Unpack one 32-bit word into two halfs.
+  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));} // x.x supplies the first two halfs, x.y the last two.
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) unpackUint2x16(AU1(x))
+  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1_AH2(x) packFloat2x16(AH2(x)) // Pack two halfs into one 32-bit word.
+  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));} // x.xy goes to the first word, x.zw to the second.
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) packUint2x16(AW2(x))
+  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
+//==============================================================================================================================
+  #define AW1_AH1(x) halfBitsToUint16(AH1(x)) // Bit-pattern reinterpretation (half -> 16-bit uint).
+  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
+  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
+  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AH1_AW1(x) uint16BitsToHalf(AW1(x)) // Bit-pattern reinterpretation (16-bit uint -> half).
+  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
+  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
+  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
+//==============================================================================================================================
+  AH1 AH1_x(AH1 a){return AH1(a);} // The *_x helpers replicate one scalar into every component; the *_() macros cast the argument first.
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AW1_x(AW1 a){return AW1(a);}
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} // 'S' variants: operands are cast to the signed 16-bit type, operated on, then cast back.
+  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
+  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
+  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AFractH1(AH1 x){return fract(x);}
+  AH2 AFractH2(AH2 x){return fract(x);}
+  AH3 AFractH3(AH3 x){return fract(x);}
+  AH4 AFractH4(AH4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of max3.
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} // Signed max: compare as int16 so bit 15 is the sign (a cast to 32-bit int would zero-extend and compare unsigned).
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of min3.
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} // Signed min: compare as int16 for the same reason as AMaxSW1.
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;} // Reciprocal written as a plain division.
+  AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
+  AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
+  AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);} // Reciprocal square root written as 1.0/sqrt(x).
+  AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
+  AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
+  AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));} // Saturate: clamp to the range 0.0 to 1.0.
+  AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
+  AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
+  AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} // Right shift performed on the signed 16-bit type, result cast back to unsigned.
+  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
+  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
+  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL DOUBLE (double-precision types and helpers, only when A_DUBL is defined)
+//==============================================================================================================================
+ #ifdef A_DUBL
+  #define AD1 double
+  #define AD2 dvec2
+  #define AD3 dvec3
+  #define AD4 dvec4
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 AD1_x(AD1 a){return AD1(a);} // Splat helpers: replicate the scalar 'a' into every component.
+  AD2 AD2_x(AD1 a){return AD2(a,a);}
+  AD3 AD3_x(AD1 a){return AD3(a,a,a);}
+  AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  AD1 AFractD1(AD1 x){return fract(x);} // Fractional part via GLSL fract().
+  AD2 AFractD2(AD2 x){return fract(x);}
+  AD3 AFractD3(AD3 x){return fract(x);}
+  AD4 AFractD4(AD4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} // Linear blend of 'x' and 'y' by 'a' via GLSL mix().
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} // Reciprocal, computed as 1.0/x.
+  AD2 ARcpD2(AD2 x){return AD2_(1.0)/x;}
+  AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;}
+  AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} // Reciprocal square root, computed as 1.0/sqrt(x).
+  AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);}
+  AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);}
+  AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} // Saturate: clamp to [0,1].
+  AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));}
+  AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));}
+  AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         GLSL LONG (64-bit integer types and helpers, only when A_LONG is defined)
+//==============================================================================================================================
+ #ifdef A_LONG
+  #define AL1 uint64_t
+  #define AL2 u64vec2
+  #define AL3 u64vec3
+  #define AL4 u64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASL1 int64_t
+  #define ASL2 i64vec2
+  #define ASL3 i64vec3
+  #define ASL4 i64vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AL1_AU2(x) packUint2x32(AU2(x))
+  #define AU2_AL1(x) unpackUint2x32(AL1(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AL1_x(AL1 a){return AL1(a);} // Splat helpers: replicate the scalar 'a' into every component.
+  AL2 AL2_x(AL1 a){return AL2(a,a);}
+  AL3 AL3_x(AL1 a){return AL3(a,a,a);}
+  AL4 AL4_x(AL1 a){return AL4(a,a,a,a);}
+  #define AL1_(a) AL1_x(AL1(a))
+  #define AL2_(a) AL2_x(AL1(a))
+  #define AL3_(a) AL3_x(AL1(a))
+  #define AL4_(a) AL4_x(AL1(a))
+//==============================================================================================================================
+  AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} // Absolute value of 'a' reinterpreted as signed 64-bit (ASL).
+  AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));}
+  AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));}
+  AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));} // Signed 64-bit max: compare as ASL; the 32-bit ASU types would truncate the operands.
+  AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));}
+  AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));}
+  AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));} // Signed 64-bit min: compare as ASL; the 32-bit ASU types would truncate the operands.
+  AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));}
+  AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));}
+  AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         WAVE OPERATIONS (GLSL subgroup shuffles, only when A_WAVE is defined)
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Where 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);} // Forwards 'v' and the XOR mask 'x' to subgroupShuffleXor(); one wrapper per type.
+  AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);}
+  AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} // 16-bit variants: pack into 32-bit words, shuffle, then unpack.
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));}
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                             HLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_HLSL) && defined(A_GPU)
+ #ifdef A_HLSL_6_2 // With A_HLSL_6_2 the portable names map to explicitly sized types.
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float32_t
+  #define AF2 float32_t2
+  #define AF3 float32_t3
+  #define AF4 float32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint32_t
+  #define AU2 uint32_t2
+  #define AU3 uint32_t3
+  #define AU4 uint32_t4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int32_t
+  #define ASU2 int32_t2
+  #define ASU3 int32_t3
+  #define ASU4 int32_t4
+ #else // Otherwise the same names map to the classic float/uint/int types.
+  #define AP1 bool
+  #define AP2 bool2
+  #define AP3 bool3
+  #define AP4 bool4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AF1 float
+  #define AF2 float2
+  #define AF3 float3
+  #define AF4 float4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define AU1 uint
+  #define AU2 uint2
+  #define AU3 uint3
+  #define AU4 uint4
+//------------------------------------------------------------------------------------------------------------------------------
+  #define ASU1 int
+  #define ASU2 int2
+  #define ASU3 int3
+  #define ASU4 int4
+ #endif
+//==============================================================================================================================
+ #define AF1_AU1(x) asfloat(AU1(x))
+ #define AF2_AU2(x) asfloat(AU2(x))
+ #define AF3_AU3(x) asfloat(AU3(x))
+ #define AF4_AU4(x) asfloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AU1_AF1(x) asuint(AF1(x))
+ #define AU2_AF2(x) asuint(AF2(x))
+ #define AU3_AF3(x) asuint(AF3(x))
+ #define AU4_AF4(x) asuint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} // Converts 'a' with f32tof16() and returns the result as an AU1.
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} // Packs two f32tof16() results into one AU1: a.x in the low 16 bits, a.y in the high 16 bits.
+ #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a))
+ #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) // NOTE(review): D3DCOLORtoUBYTE4 appears to return a 4-component integer vector, not a packed AU1 - confirm against the HLSL docs.
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} // Unpack: low 16 bits -> .x, high 16 bits -> .y, each through f16tof32().
+ #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x))
+//==============================================================================================================================
+ AF1 AF1_x(AF1 a){return AF1(a);} // Splat helpers: replicate the scalar 'a' into every component.
+ AF2 AF2_x(AF1 a){return AF2(a,a);}
+ AF3 AF3_x(AF1 a){return AF3(a,a,a);}
+ AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AU1_x(AU1 a){return AU1(a);} // Splat helpers: replicate the scalar 'a' into every component.
+ AU2 AU2_x(AU1 a){return AU2(a,a);}
+ AU3 AU3_x(AU1 a){return AU3(a,a,a);}
+ AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} // Absolute value of 'a' reinterpreted as signed (ASU), returned as unsigned.
+ AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+ AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+ AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;} // Extract 'bits' bits of 'src' starting at bit 'off'. NOTE(review): the (1u<<bits)-1 mask presumably requires bits<32 - confirm callers.
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} // Take bits of 'ins' where 'mask' is set and bits of 'src' elsewhere.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));} // As ABfi, with the mask built from the low 'bits' bits (same bits<32 caveat as ABfe).
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));} // Clamp 'x' to [n,m], written as max(n,min(x,m)).
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AFractF1(AF1 x){return x-floor(x);} // Fractional part, computed as x-floor(x) rather than with frac().
+ AF2 AFractF2(AF2 x){return x-floor(x);}
+ AF3 AFractF3(AF3 x){return x-floor(x);}
+ AF4 AFractF4(AF4 x){return x-floor(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);} // Linear blend of 'x' and 'y' by 'a' via HLSL lerp().
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return lerp(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));} // Maximum of three floats.
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));} // Maximum of three values compared as signed (ASU).
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));} // Maximum of three values compared as unsigned.
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));} // Maximum of two values compared as signed (ASU).
+ AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+ AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+ AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));} // Median of three floats, built from min/max only.
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));} // Minimum of three floats.
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));} // Minimum of three values compared as signed (ASU).
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));} // Minimum of three values compared as unsigned.
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));} // Minimum of two values compared as signed (ASU).
+ AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+ AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+ AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));} // cos() of 'x' scaled by A_2PI (A_2PI is defined outside this section).
+ AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));} // sin() of 'x' scaled by A_2PI (A_2PI is defined outside this section).
+ AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARcpF1(AF1 x){return rcp(x);} // Reciprocal via the HLSL rcp() intrinsic.
+ AF2 ARcpF2(AF2 x){return rcp(x);}
+ AF3 ARcpF3(AF3 x){return rcp(x);}
+ AF4 ARcpF4(AF4 x){return rcp(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ARsqF1(AF1 x){return rsqrt(x);} // Reciprocal square root via the HLSL rsqrt() intrinsic.
+ AF2 ARsqF2(AF2 x){return rsqrt(x);}
+ AF3 ARsqF3(AF3 x){return rsqrt(x);}
+ AF4 ARsqF4(AF4 x){return rsqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 ASatF1(AF1 x){return saturate(x);} // Clamp to [0,1] via the HLSL saturate() intrinsic.
+ AF2 ASatF2(AF2 x){return saturate(x);}
+ AF3 ASatF3(AF3 x){return saturate(x);}
+ AF4 ASatF4(AF4 x){return saturate(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} // Right shift performed on the signed (ASU) reinterpretation of 'a'.
+ AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+ AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+ AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL BYTE (the A_BYTE section below is currently empty)
+//==============================================================================================================================
+ #ifdef A_BYTE
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                          HLSL HALF (16-bit float/uint/int types and helpers, only when A_HALF is defined)
+//==============================================================================================================================
+ #ifdef A_HALF
+  #ifdef A_HLSL_6_2 // With A_HLSL_6_2 the 16-bit names map to the explicitly sized float16_t/uint16_t/int16_t types.
+   #define AH1 float16_t
+   #define AH2 float16_t2
+   #define AH3 float16_t3
+   #define AH4 float16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define AW1 uint16_t
+   #define AW2 uint16_t2
+   #define AW3 uint16_t3
+   #define AW4 uint16_t4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define ASW1 int16_t
+   #define ASW2 int16_t2
+   #define ASW3 int16_t3
+   #define ASW4 int16_t4
+  #else // Otherwise they map to the min16float/min16uint/min16int minimum-precision types.
+   #define AH1 min16float
+   #define AH2 min16float2
+   #define AH3 min16float3
+   #define AH4 min16float4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define AW1 min16uint
+   #define AW2 min16uint2
+   #define AW3 min16uint3
+   #define AW4 min16uint4
+//------------------------------------------------------------------------------------------------------------------------------
+   #define ASW1 min16int
+   #define ASW2 min16int2
+   #define ASW3 min16int3
+   #define ASW4 min16int4
+  #endif
+//==============================================================================================================================
+  // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly).
+  // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/
+  AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} // Low 16 bits -> .x, high 16 bits -> .y, converted with f16tof32() then narrowed to AH2.
+  AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} // x.x supplies .xy, x.y supplies .zw.
+  AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} // Splits a 32-bit word into its low (.x) and high (.y) 16-bit halves.
+  AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));}
+  #define AH2_AU1(x) AH2_AU1_x(AU1(x))
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) AW2_AU1_x(AU1(x))
+  #define AW4_AU2(x) AW4_AU2_x(AU2(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} // Packs x.x into the low 16 bits and x.y into the high 16 bits.
+  AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));}
+  AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} // Packs x.x into the low 16 bits and x.y into the high 16 bits.
+  AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));}
+  #define AU1_AH2(x) AU1_AH2_x(AH2(x))
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) AU1_AW2_x(AW2(x))
+  #define AU2_AW4(x) AU2_AW4_x(AW4(x))
+//==============================================================================================================================
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) // Half -> word: use asuint16() when available, else go through f32tof16().
+   #define AW1_AH1(x) asuint16(x)
+   #define AW2_AH2(x) asuint16(x)
+   #define AW3_AH3(x) asuint16(x)
+   #define AW4_AH4(x) asuint16(x)
+  #else
+   #define AW1_AH1(a) AW1(f32tof16(AF1(a)))
+   #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y))
+   #define AW3_AH3(a) AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z))
+   #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w))
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) // Word -> half: use asfloat16() when available, else go through f16tof32().
+   #define AH1_AW1(x) asfloat16(x)
+   #define AH2_AW2(x) asfloat16(x)
+   #define AH3_AW3(x) asfloat16(x)
+   #define AH4_AW4(x) asfloat16(x)
+  #else
+   #define AH1_AW1(a) AH1(f16tof32(AU1(a)))
+   #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y))
+   #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z))
+   #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w))
+  #endif
+//==============================================================================================================================
+  AH1 AH1_x(AH1 a){return AH1(a);} // Splat helpers: replicate the scalar 'a' into every component.
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AW1_x(AW1 a){return AW1(a);} // Splat helpers for the 16-bit word types.
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} // Absolute value of 'a' reinterpreted as signed (ASW), returned as unsigned.
+  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
+  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
+  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));} // Clamp 'x' to [n,m], written as max(n,min(x,m)).
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // V_FRACT_F16 (note DX frac() is different).
+  AH1 AFractH1(AH1 x){return x-floor(x);}
+  AH2 AFractH2(AH2 x){return x-floor(x);}
+  AH3 AFractH3(AH3 x){return x-floor(x);}
+  AH4 AFractH4(AH4 x){return x-floor(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} // Linear blend of 'x' and 'y' by 'a' via HLSL lerp().
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} // Maximum of three values.
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} // Signed max of 16-bit words: compare as ASW (as AAbsSW/AShrSW do), not as the 32-bit ASU types.
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} // Minimum of three values.
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} // Signed min of 16-bit words: compare as ASW (as AAbsSW/AShrSW do), not as the 32-bit ASU types.
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARcpH1(AH1 x){return rcp(x);} // Reciprocal via the HLSL rcp() intrinsic.
+  AH2 ARcpH2(AH2 x){return rcp(x);}
+  AH3 ARcpH3(AH3 x){return rcp(x);}
+  AH4 ARcpH4(AH4 x){return rcp(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ARsqH1(AH1 x){return rsqrt(x);} // Reciprocal square root via the HLSL rsqrt() intrinsic.
+  AH2 ARsqH2(AH2 x){return rsqrt(x);}
+  AH3 ARsqH3(AH3 x){return rsqrt(x);}
+  AH4 ARsqH4(AH4 x){return rsqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AH1 ASatH1(AH1 x){return saturate(x);} // Clamp to [0,1] via the HLSL saturate() intrinsic.
+  AH2 ASatH2(AH2 x){return saturate(x);}
+  AH3 ASatH3(AH3 x){return saturate(x);}
+  AH4 ASatH4(AH4 x){return saturate(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} // Right shift performed on the signed (ASW) reinterpretation of 'a'.
+  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
+  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
+  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                         HLSL DOUBLE (double-precision types and helpers, only when A_DUBL is defined)
+//==============================================================================================================================
+ #ifdef A_DUBL
+  #ifdef A_HLSL_6_2
+   #define AD1 float64_t
+   #define AD2 float64_t2
+   #define AD3 float64_t3
+   #define AD4 float64_t4
+  #else
+   #define AD1 double
+   #define AD2 double2
+   #define AD3 double3
+   #define AD4 double4
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 AD1_x(AD1 a){return AD1(a);} // Splat helpers: replicate the scalar 'a' into every component.
+  AD2 AD2_x(AD1 a){return AD2(a,a);}
+  AD3 AD3_x(AD1 a){return AD3(a,a,a);}
+  AD4 AD4_x(AD1 a){return AD4(a,a,a,a);}
+  #define AD1_(a) AD1_x(AD1(a))
+  #define AD2_(a) AD2_x(AD1(a))
+  #define AD3_(a) AD3_x(AD1(a))
+  #define AD4_(a) AD4_x(AD1(a))
+//==============================================================================================================================
+  AD1 AFractD1(AD1 a){return a-floor(a);} // Fractional part, computed as a-floor(a).
+  AD2 AFractD2(AD2 a){return a-floor(a);}
+  AD3 AFractD3(AD3 a){return a-floor(a);}
+  AD4 AFractD4(AD4 a){return a-floor(a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} // Linear blend of 'x' and 'y' by 'a' via HLSL lerp().
+  AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);}
+  AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);}
+  AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARcpD1(AD1 x){return rcp(x);} // Reciprocal via rcp(). NOTE(review): confirm the target compiler accepts double operands for rcp/rsqrt/saturate.
+  AD2 ARcpD2(AD2 x){return rcp(x);}
+  AD3 ARcpD3(AD3 x){return rcp(x);}
+  AD4 ARcpD4(AD4 x){return rcp(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ARsqD1(AD1 x){return rsqrt(x);} // Reciprocal square root via rsqrt().
+  AD2 ARsqD2(AD2 x){return rsqrt(x);}
+  AD3 ARsqD3(AD3 x){return rsqrt(x);}
+  AD4 ARsqD4(AD4 x){return rsqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  AD1 ASatD1(AD1 x){return saturate(x);} // Clamp to [0,1] via saturate().
+  AD2 ASatD2(AD2 x){return saturate(x);}
+  AD3 ASatD3(AD3 x){return saturate(x);}
+  AD4 ASatD4(AD4 x){return saturate(x);}
+ #endif
+//==============================================================================================================================
+//                                                         HLSL WAVE (wave lane-exchange helpers, only when A_WAVE is defined)
+//==============================================================================================================================
+ #ifdef A_WAVE
+  // Where 'x' must be a compile time literal.
+ AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} + AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU1(WaveReadLaneAt(AU1_AW4(v),WaveGetLaneIndex()^x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                         GPU COMMON
+//
+//
+//==============================================================================================================================
+#ifdef A_GPU
+ // Bit patterns of positive and negative infinity for 32-bit floats.
+ #define A_INFP_F AF1_AU1(0x7f800000u)
+ #define A_INFN_F AF1_AU1(0xff800000u)
+//------------------------------------------------------------------------------------------------------------------------------
+ // Copy sign from 's' to positive 'd'.
+ // The sign bit of 's' is OR-ed into the bits of 'd', which is why 'd' is expected to be positive.
+ AF1 ACpySgnF1(AF1 d,AF1 s){AU1 sgn=AU1_AF1(s)&AU1_(0x80000000u);return AF1_AU1(AU1_AF1(d)|sgn);}
+ AF2 ACpySgnF2(AF2 d,AF2 s){AU2 sgn=AU2_AF2(s)&AU2_(0x80000000u);return AF2_AU2(AU2_AF2(d)|sgn);}
+ AF3 ACpySgnF3(AF3 d,AF3 s){AU3 sgn=AU3_AF3(s)&AU3_(0x80000000u);return AF3_AU3(AU3_AF3(d)|sgn);}
+ AF4 ACpySgnF4(AF4 d,AF4 s){AU4 sgn=AU4_AF4(s)&AU4_(0x80000000u);return AF4_AU4(AU4_AF4(d)|sgn);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Single operation that builds a 0/1 mask (handy as the weight of a lerp in branch free logic),
+ //  m=NaN := 0
+ //  m>=0  := 0
+ //  m<0   := 1
+ // It leans on the following floating point behavior,
+ //  saturate(+a*(-INF)==-INF) := 0
+ //  saturate( 0*(-INF)== NaN) := 0
+ //  saturate(-a*(-INF)==+INF) := 1
+ AF1 ASignedF1(AF1 m) { return ASatF1(m * AF1_(A_INFN_F)); }
+ AF2 ASignedF2(AF2 m) { return ASatF2(m * AF2_(A_INFN_F)); }
+ AF3 ASignedF3(AF3 m) { return ASatF3(m * AF3_(A_INFN_F)); }
+ AF4 ASignedF4(AF4 m) { return ASatF4(m * AF4_(A_INFN_F)); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same trick with +INF, so the mask is 1 for m>0 and 0 for m<=0.
+ AF1 AGtZeroF1(AF1 m) { return ASatF1(m * AF1_(A_INFP_F)); }
+ AF2 AGtZeroF2(AF2 m) { return ASatF2(m * AF2_(A_INFP_F)); }
+ AF3 AGtZeroF3(AF3 m) { return ASatF3(m * AF3_(A_INFP_F)); }
+ AF4 AGtZeroF4(AF4 m) { return ASatF4(m * AF4_(A_INFP_F)); }
+//==============================================================================================================================
+ #ifdef A_HALF
+  // 16-bit float infinities; HLSL 6.2 needs the literal narrowed to uint16_t explicitly.
+  #ifdef A_HLSL_6_2
+   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
+   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
+  #else
+   #define A_INFP_H AH1_AW1(0x7c00u)
+   #define A_INFN_H AH1_AW1(0xfc00u)
+  #endif
+
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision versions of the copy-sign helper (sign bit is 0x8000).
+  AH1 ACpySgnH1(AH1 d,AH1 s){AW1 sgn=AW1_AH1(s)&AW1_(0x8000u);return AH1_AW1(AW1_AH1(d)|sgn);}
+  AH2 ACpySgnH2(AH2 d,AH2 s){AW2 sgn=AW2_AH2(s)&AW2_(0x8000u);return AH2_AW2(AW2_AH2(d)|sgn);}
+  AH3 ACpySgnH3(AH3 d,AH3 s){AW3 sgn=AW3_AH3(s)&AW3_(0x8000u);return AH3_AW3(AW3_AH3(d)|sgn);}
+  AH4 ACpySgnH4(AH4 d,AH4 s){AW4 sgn=AW4_AH4(s)&AW4_(0x8000u);return AH4_AW4(AW4_AH4(d)|sgn);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision "is negative" mask, built exactly like ASignedF*.
+  AH1 ASignedH1(AH1 m) { return ASatH1(m * AH1_(A_INFN_H)); }
+  AH2 ASignedH2(AH2 m) { return ASatH2(m * AH2_(A_INFN_H)); }
+  AH3 ASignedH3(AH3 m) { return ASatH3(m * AH3_(A_INFN_H)); }
+  AH4 ASignedH4(AH4 m) { return ASatH4(m * AH4_(A_INFN_H)); }
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision "is greater than zero" mask, built exactly like AGtZeroF*.
+  AH1 AGtZeroH1(AH1 m) { return ASatH1(m * AH1_(A_INFP_H)); }
+  AH2 AGtZeroH2(AH2 m) { return ASatH2(m * AH2_(A_INFP_H)); }
+  AH3 AGtZeroH3(AH3 m) { return ASatH3(m * AH3_(A_INFP_H)); }
+  AH4 AGtZeroH4(AH4 m) { return ASatH4(m * AH4_(A_INFP_H)); }
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                [FIS] FLOAT INTEGER SORTABLE
+//------------------------------------------------------------------------------------------------------------------------------
+// Float to integer sortable.
+//  - If sign bit=0, flip the sign bit (positives).
+//  - If sign bit=1, flip all bits (negatives).
+// Integer sortable to float.
+//  - If sign bit=1, flip the sign bit (positives).
+//  - If sign bit=0, flip all bits (negatives).
+// Has nice side effects.
+//  - Larger integers are more positive values.
+//  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+// Burns 3 ops for conversion {shift,or,xor}.
+//==============================================================================================================================
+ // 'flip' is the XOR mask: the shifted sign (inverted for the reverse direction) with the sign bit always set.
+ AU1 AFisToU1(AU1 x){AU1 flip=( AShrSU1(x,AU1_(31)))|AU1_(0x80000000);return x^flip;}
+ AU1 AFisFromU1(AU1 x){AU1 flip=(~AShrSU1(x,AU1_(31)))|AU1_(0x80000000);return x^flip;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
+ // NOTE(review): assuming AShrSU1 is a sign-extending shift, the mask also reaches below bit 16, so the low half of the
+ // word is not preserved by a To/From round trip - confirm callers only rely on the upper 16 bits.
+ AU1 AFisToHiU1(AU1 x){AU1 flip=( AShrSU1(x,AU1_(15)))|AU1_(0x80000000);return x^flip;}
+ AU1 AFisFromHiU1(AU1 x){AU1 flip=(~AShrSU1(x,AU1_(15)))|AU1_(0x80000000);return x^flip;}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // 16-bit versions: same construction with the sign bit at 0x8000.
+  AW1 AFisToW1(AW1 x){AW1 flip=( AShrSW1(x,AW1_(15)))|AW1_(0x8000);return x^flip;}
+  AW1 AFisFromW1(AW1 x){AW1 flip=(~AShrSW1(x,AW1_(15)))|AW1_(0x8000);return x^flip;}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW2 AFisToW2(AW2 x){AW2 flip=( AShrSW2(x,AW2_(15)))|AW2_(0x8000);return x^flip;}
+  AW2 AFisFromW2(AW2 x){AW2 flip=(~AShrSW2(x,AW2_(15)))|AW2_(0x8000);return x^flip;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     [PERM] V_PERM_B32
+//------------------------------------------------------------------------------------------------------------------------------
+// Support for V_PERM_B32 started in the 3rd generation of GCN.
+//------------------------------------------------------------------------------------------------------------------------------
+// yyyyxxxx - The 'i' input.
+// 76543210
+// ========
+// HGFEDCBA - Naming on permutation.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure compiler optimizes this.
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Function names spell the four result bytes from most to least significant.
+  // A..D are bytes 0..3 of i.x, E..H are bytes 0..3 of i.y, and '0' is a zero byte.
+  AU1 APerm0E0A(AU2 i){AU1 lo=(i.x    )&0xffu;AU1 hi=(i.y<<16)&0xff0000u;return lo|hi;}
+  AU1 APerm0F0B(AU2 i){AU1 lo=(i.x>> 8)&0xffu;AU1 hi=(i.y<< 8)&0xff0000u;return lo|hi;}
+  AU1 APerm0G0C(AU2 i){AU1 lo=(i.x>>16)&0xffu;AU1 hi=(i.y    )&0xff0000u;return lo|hi;}
+  AU1 APerm0H0D(AU2 i){AU1 lo=(i.x>>24)&0xffu;AU1 hi=(i.y>> 8)&0xff0000u;return lo|hi;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // One byte taken from i.x ('ins') replaces one byte position of i.y ('keep' holds the other three bytes).
+  AU1 APermHGFA(AU2 i){AU1 ins=(i.x    )&0x000000ffu;AU1 keep=i.y&0xffffff00u;return ins|keep;}
+  AU1 APermHGFC(AU2 i){AU1 ins=(i.x>>16)&0x000000ffu;AU1 keep=i.y&0xffffff00u;return ins|keep;}
+  AU1 APermHGAE(AU2 i){AU1 ins=(i.x<< 8)&0x0000ff00u;AU1 keep=i.y&0xffff00ffu;return ins|keep;}
+  AU1 APermHGCE(AU2 i){AU1 ins=(i.x>> 8)&0x0000ff00u;AU1 keep=i.y&0xffff00ffu;return ins|keep;}
+  AU1 APermHAFE(AU2 i){AU1 ins=(i.x<<16)&0x00ff0000u;AU1 keep=i.y&0xff00ffffu;return ins|keep;}
+  AU1 APermHCFE(AU2 i){AU1 ins=(i.x    )&0x00ff0000u;AU1 keep=i.y&0xff00ffffu;return ins|keep;}
+  AU1 APermAGFE(AU2 i){AU1 ins=(i.x<<24)&0xff000000u;AU1 keep=i.y&0x00ffffffu;return ins|keep;}
+  AU1 APermCGFE(AU2 i){AU1 ins=(i.x<< 8)&0xff000000u;AU1 keep=i.y&0x00ffffffu;return ins|keep;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Interleave the even bytes of both inputs.
+  AU1 APermGCEA(AU2 i){AU1 ca=(i.x)&0x00ff00ffu;AU1 ge=(i.y<<8)&0xff00ff00u;return ca|ge;}
+  AU1 APermGECA(AU2 i){AU1 a=(i.x)&0xffu;AU1 c=(i.x>>8)&0xff00u;AU1 e=(i.y<<16)&0xff0000u;AU1 g=(i.y<<8)&0xff000000u;
+   return a|c|e|g;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               [BUC] BYTE UNSIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation.
+// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPCODE NOTES
+// ============
+// GCN does not do UNORM or SNORM for bytes in opcodes.
+//  - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float.
+//  - V_CVT_PKACCUM_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer).
+// V_PERM_B32 does byte packing with ability to zero fill bytes as well.
+//  - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops.
+// ====   =====
+//    0 : 0
+//    1 : 1
+//     ...
+//  255 : 255
+//      : 256 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : 0
+//    1 : 1/512
+//    2 : 1/256
+//     ...
+//   64 : 1/8
+//  128 : 1/4
+//  255 : 255/512
+//      : 1/2 (just outside the encoding range)
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES
+// ============================================
+// r=ABuc0FromU1(i)
+//   V_CVT_F32_UBYTE0 r,i
+// --------------------------------------------
+// r=ABuc0ToU1(d,i)
+//   V_CVT_PKACCUM_U8_F32 r,i,0,d
+// --------------------------------------------
+// d=ABuc0FromU2(i)
+//   Where 'k0' is an SGPR with 0x0E0A
+//   Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits
+//   V_PERM_B32 d,i.x,i.y,k0
+//   V_PK_FMA_F16 d,d,k1.x,0
+// --------------------------------------------
+// r=ABuc0ToU2(d,i)
+//   Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits
+//   Where 'k1' is an SGPR with 0x????
+//   Where 'k2' is an SGPR with 0x????
+//   V_PK_FMA_F16 i,i,k0.x,0
+//   V_PERM_B32 r.x,i,i,k1
+//   V_PERM_B32 r.y,i,i,k2
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ #define A_BUC_32 (255.0)
+ #define A_BUC_16 (255.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Write float 'i' (clamped to 255) into byte {0,1,2,3} of 'd'; the other bytes of 'd' are returned unchanged.
+  // Designed to be one V_CVT_PKACCUM_U8_F32.
+  // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32.
+  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u)    )&(0x000000ffu));}
+  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
+  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Read byte {0,1,2,3} of 'i' back as a float in {0 to 255}.
+  // Designed to be one V_CVT_F32_UBYTE*.
+  AF1 ABuc0FromU1(AU1 i){return AF1((i    )&255u);}
+  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
+  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
+  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Scaling by 1/32768 turns each half into a denormal whose low mantissa byte is the encoded value.
+  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 3 ops to do SOA to AOS and conversion.
+  // Encodes i.x into byte {0,1,2,3} of d.x and i.y into the same byte of d.y, keeping the other bytes of 'd'.
+  // 'b' holds the two encoded bytes at bit 0 (from i.x) and bit 16 (from i.y).
+  // The permute helpers take the inserted byte from their .x operand and the preserved bytes from .y, so 'b' goes
+  // first and the destination word second. The reverse order dropped 'd' and broke the To/From round trip.
+  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 2 ops to do both AOS to SOA, and conversion.
+  // Gathers byte {0,1,2,3} of i.x and i.y into the two 16-bit lanes, then scales the denormals back up by 32768.
+  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
+  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
+  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
+  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                [BSC] BYTE SIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Similar to [BUC].
+// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// ENCODING (without zero-based encoding)
+// ========
+//   0 = unused (can be used to mean something else)
+//   1 = lowest value
+// 128 = exact zero center (zero based encoding)
+// 255 = highest value
+//------------------------------------------------------------------------------------------------------------------------------
+// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
+// This is useful if there is a desire for cleared values to decode as zero.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ====   =====
+//    0 : -128/512 (unused)
+//    1 : -127/512
+//    2 : -126/512
+//     ...
+//  128 : 0
+//     ...
+//  255 : 127/512
+//      : 1/4 (just outside the encoding range)
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ #define A_BSC_32 (127.0)
+ #define A_BSC_16 (127.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Write float 'i' biased by +128 (clamped to 255) into byte {0,1,2,3} of 'd'; other bytes of 'd' are unchanged.
+  AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u)    )&(0x000000ffu));}
+  AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
+  AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: same insert, then the top bit of the written byte is flipped so an input of 0 encodes as 0.
+  // Note 'i' is truncated before the bias here, unlike the non-Zb versions above.
+  AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u)    )&(0x000000ffu)))^0x00000080u;}
+  AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
+  AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
+  AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Read byte {0,1,2,3} of 'i' and remove the +128 bias.
+  AF1 ABsc0FromU1(AU1 i){return AF1((i    )&255u)-128.0;}
+  AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
+  AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
+  AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based read: undo the top bit flip first, then remove the bias.
+  AF1 ABsc0FromZbU1(AU1 i){return AF1(((i    )&255u)^0x80u)-128.0;}
+  AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
+  AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
+  AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // The +0.25/32768 term is the +128 byte bias expressed in the denormal scale.
+  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Encodes i.x into byte {0,1,2,3} of d.x and i.y into the same byte of d.y, keeping the other bytes of 'd'.
+  // The permute helpers take the inserted byte from their .x operand and the preserved bytes from .y, so 'b' goes
+  // first and the destination word second. The reverse order dropped 'd' and broke the To/From round trip.
+  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: the top bit of both encoded bytes in 'b' is flipped before the insert.
+  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Gathers byte {0,1,2,3} of i.x and i.y into the two 16-bit lanes, rescales by 32768, then removes the 0.25 bias.
+  AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based read: undo the top bit flip on both gathered bytes before rescaling.
+  AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These support only positive inputs.
+// Did not see value yet in specialization for range.
+// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
+// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
+// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
+//------------------------------------------------------------------------------------------------------------------------------
+// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
+// Same with sqrt(), as this could be x*rsq() (7 ops).
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Each estimate works on the raw 16-bit pattern of the input ('bits') and a tuned magic constant.
+  // Minimize squared error across full positive range, 2 ops.
+  // The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
+  AH1 APrxLoSqrtH1(AH1 a){AW1 bits=AW1_AH1(a);return AH1_AW1((bits>>AW1_(1))+AW1_(0x1de2));}
+  AH2 APrxLoSqrtH2(AH2 a){AW2 bits=AW2_AH2(a);return AH2_AW2((bits>>AW2_(1))+AW2_(0x1de2));}
+  AH3 APrxLoSqrtH3(AH3 a){AW3 bits=AW3_AH3(a);return AH3_AW3((bits>>AW3_(1))+AW3_(0x1de2));}
+  AH4 APrxLoSqrtH4(AH4 a){AW4 bits=AW4_AH4(a);return AH4_AW4((bits>>AW4_(1))+AW4_(0x1de2));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Lower precision estimation, 1 op.
+  // Minimize squared error across {smallest normal to 16384.0}.
+  AH1 APrxLoRcpH1(AH1 a){AW1 bits=AW1_AH1(a);return AH1_AW1(AW1_(0x7784)-bits);}
+  AH2 APrxLoRcpH2(AH2 a){AW2 bits=AW2_AH2(a);return AH2_AW2(AW2_(0x7784)-bits);}
+  AH3 APrxLoRcpH3(AH3 a){AW3 bits=AW3_AH3(a);return AH3_AW3(AW3_(0x7784)-bits);}
+  AH4 APrxLoRcpH4(AH4 a){AW4 bits=AW4_AH4(a);return AH4_AW4(AW4_(0x7784)-bits);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Medium precision estimation, one Newton Raphson iteration, 3 ops.
+  // 'est' is the bit trick seed; the return refines it as est*(2-est*a).
+  AH1 APrxMedRcpH1(AH1 a){AH1 est=AH1_AW1(AW1_(0x778d)-AW1_AH1(a));return est*(-est*a+AH1_(2.0));}
+  AH2 APrxMedRcpH2(AH2 a){AH2 est=AH2_AW2(AW2_(0x778d)-AW2_AH2(a));return est*(-est*a+AH2_(2.0));}
+  AH3 APrxMedRcpH3(AH3 a){AH3 est=AH3_AW3(AW3_(0x778d)-AW3_AH3(a));return est*(-est*a+AH3_(2.0));}
+  AH4 APrxMedRcpH4(AH4 a){AH4 est=AH4_AW4(AW4_(0x778d)-AW4_AH4(a));return est*(-est*a+AH4_(2.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
+  AH1 APrxLoRsqH1(AH1 a){AW1 bits=AW1_AH1(a);return AH1_AW1(AW1_(0x59a3)-(bits>>AW1_(1)));}
+  AH2 APrxLoRsqH2(AH2 a){AW2 bits=AW2_AH2(a);return AH2_AW2(AW2_(0x59a3)-(bits>>AW2_(1)));}
+  AH3 APrxLoRsqH3(AH3 a){AW3 bits=AW3_AH3(a);return AH3_AW3(AW3_(0x59a3)-(bits>>AW3_(1)));}
+  AH4 APrxLoRsqH4(AH4 a){AW4 bits=AW4_AH4(a);return AH4_AW4(AW4_(0x59a3)-(bits>>AW4_(1)));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+//  - Idea dates back to SGI, then to Quake 3, etc.
+//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+//     - sqrt(x)=rsqrt(x)*x
+//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// These below are from perhaps less complete searching for optimal.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ // Same four estimates as the half versions, on the raw 32-bit pattern ('bits'):
+ //  LoSqrt = (bits>>1)+magic, LoRcp = magic-bits, MedRcp = LoRcp style seed plus one refinement, LoRsq = magic-(bits>>1).
+ AF1 APrxLoSqrtF1(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1((bits>>AU1_(1))+AU1_(0x1fbc4639));}
+ AF1 APrxLoRcpF1(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1(AU1_(0x7ef07ebb)-bits);}
+ AF1 APrxMedRcpF1(AF1 a){AF1 est=AF1_AU1(AU1_(0x7ef19fff)-AU1_AF1(a));return est*(-est*a+AF1_(2.0));}
+ AF1 APrxLoRsqF1(AF1 a){AU1 bits=AU1_AF1(a);return AF1_AU1(AU1_(0x5f347d74)-(bits>>AU1_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxLoSqrtF2(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2((bits>>AU2_(1))+AU2_(0x1fbc4639));}
+ AF2 APrxLoRcpF2(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2(AU2_(0x7ef07ebb)-bits);}
+ AF2 APrxMedRcpF2(AF2 a){AF2 est=AF2_AU2(AU2_(0x7ef19fff)-AU2_AF2(a));return est*(-est*a+AF2_(2.0));}
+ AF2 APrxLoRsqF2(AF2 a){AU2 bits=AU2_AF2(a);return AF2_AU2(AU2_(0x5f347d74)-(bits>>AU2_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxLoSqrtF3(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3((bits>>AU3_(1))+AU3_(0x1fbc4639));}
+ AF3 APrxLoRcpF3(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3(AU3_(0x7ef07ebb)-bits);}
+ AF3 APrxMedRcpF3(AF3 a){AF3 est=AF3_AU3(AU3_(0x7ef19fff)-AU3_AF3(a));return est*(-est*a+AF3_(2.0));}
+ AF3 APrxLoRsqF3(AF3 a){AU3 bits=AU3_AF3(a);return AF3_AU3(AU3_(0x5f347d74)-(bits>>AU3_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxLoSqrtF4(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4((bits>>AU4_(1))+AU4_(0x1fbc4639));}
+ AF4 APrxLoRcpF4(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4(AU4_(0x7ef07ebb)-bits);}
+ AF4 APrxMedRcpF4(AF4 a){AF4 est=AF4_AU4(AU4_(0x7ef19fff)-AU4_AF4(a));return est*(-est*a+AF4_(2.0));}
+ AF4 APrxLoRsqF4(AF4 a){AU4 bits=AU4_AF4(a);return AF4_AU4(AU4_(0x5f347d74)-(bits>>AU4_(1)));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PQ APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
+// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
+//==============================================================================================================================
+// Helpers
+ // Quart(a) is a^4 and Oct(a) is a^8, built from repeated squaring.
+ AF1 Quart(AF1 a) { AF1 p2 = a * a; return p2 * p2; }
+ AF1 Oct(AF1 a) { AF1 p2 = a * a; AF1 p4 = p2 * p2; return p4 * p4; }
+ AF2 Quart(AF2 a) { AF2 p2 = a * a; return p2 * p2; }
+ AF2 Oct(AF2 a) { AF2 p2 = a * a; AF2 p4 = p2 * p2; return p4 * p4; }
+ AF3 Quart(AF3 a) { AF3 p2 = a * a; return p2 * p2; }
+ AF3 Oct(AF3 a) { AF3 p2 = a * a; AF3 p4 = p2 * p2; return p4 * p4; }
+ AF4 Quart(AF4 a) { AF4 p2 = a * a; return p2 * p2; }
+ AF4 Oct(AF4 a) { AF4 p2 = a * a; AF4 p4 = p2 * p2; return p4 * p4; }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // PQ->Gamma2 is an exact 4th power and PQ->Linear an exact 8th power.
+ // The reverse direction comes in three qualities:
+ //  Lo   - bit trick root estimate only.
+ //  Med  - the Lo estimate ('est') followed by one correction step using its 4th/8th power.
+ //  High - nested sqrt calls.
+ AF1 APrxPQToGamma2(AF1 a) { return Quart(a); }
+ AF1 APrxPQToLinear(AF1 a) { return Oct(a); }
+ AF1 APrxLoGamma2ToPQ(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1((bits >> AU1_(2)) + AU1_(0x2F9A4E46)); }
+ AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 est = AF1_AU1((AU1_AF1(a) >> AU1_(2)) + AU1_(0x2F9A4E46)); AF1 est4 = Quart(est);
+  return est - est * (est4 - a) / (AF1_(4.0) * est4); }
+ AF1 APrxHighGamma2ToPQ(AF1 a) { return sqrt(sqrt(a)); }
+ AF1 APrxLoLinearToPQ(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1((bits >> AU1_(3)) + AU1_(0x378D8723)); }
+ AF1 APrxMedLinearToPQ(AF1 a) { AF1 est = AF1_AU1((AU1_AF1(a) >> AU1_(3)) + AU1_(0x378D8723)); AF1 est8 = Oct(est);
+  return est - est * (est8 - a) / (AF1_(8.0) * est8); }
+ AF1 APrxHighLinearToPQ(AF1 a) { return sqrt(sqrt(sqrt(a))); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxPQToGamma2(AF2 a) { return Quart(a); }
+ AF2 APrxPQToLinear(AF2 a) { return Oct(a); }
+ AF2 APrxLoGamma2ToPQ(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2((bits >> AU2_(2)) + AU2_(0x2F9A4E46)); }
+ AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 est = AF2_AU2((AU2_AF2(a) >> AU2_(2)) + AU2_(0x2F9A4E46)); AF2 est4 = Quart(est);
+  return est - est * (est4 - a) / (AF1_(4.0) * est4); }
+ AF2 APrxHighGamma2ToPQ(AF2 a) { return sqrt(sqrt(a)); }
+ AF2 APrxLoLinearToPQ(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2((bits >> AU2_(3)) + AU2_(0x378D8723)); }
+ AF2 APrxMedLinearToPQ(AF2 a) { AF2 est = AF2_AU2((AU2_AF2(a) >> AU2_(3)) + AU2_(0x378D8723)); AF2 est8 = Oct(est);
+  return est - est * (est8 - a) / (AF1_(8.0) * est8); }
+ AF2 APrxHighLinearToPQ(AF2 a) { return sqrt(sqrt(sqrt(a))); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxPQToGamma2(AF3 a) { return Quart(a); }
+ AF3 APrxPQToLinear(AF3 a) { return Oct(a); }
+ AF3 APrxLoGamma2ToPQ(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3((bits >> AU3_(2)) + AU3_(0x2F9A4E46)); }
+ AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 est = AF3_AU3((AU3_AF3(a) >> AU3_(2)) + AU3_(0x2F9A4E46)); AF3 est4 = Quart(est);
+  return est - est * (est4 - a) / (AF1_(4.0) * est4); }
+ AF3 APrxHighGamma2ToPQ(AF3 a) { return sqrt(sqrt(a)); }
+ AF3 APrxLoLinearToPQ(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3((bits >> AU3_(3)) + AU3_(0x378D8723)); }
+ AF3 APrxMedLinearToPQ(AF3 a) { AF3 est = AF3_AU3((AU3_AF3(a) >> AU3_(3)) + AU3_(0x378D8723)); AF3 est8 = Oct(est);
+  return est - est * (est8 - a) / (AF1_(8.0) * est8); }
+ AF3 APrxHighLinearToPQ(AF3 a) { return sqrt(sqrt(sqrt(a))); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxPQToGamma2(AF4 a) { return Quart(a); }
+ AF4 APrxPQToLinear(AF4 a) { return Oct(a); }
+ AF4 APrxLoGamma2ToPQ(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4((bits >> AU4_(2)) + AU4_(0x2F9A4E46)); }
+ AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 est = AF4_AU4((AU4_AF4(a) >> AU4_(2)) + AU4_(0x2F9A4E46)); AF4 est4 = Quart(est);
+  return est - est * (est4 - a) / (AF1_(4.0) * est4); }
+ AF4 APrxHighGamma2ToPQ(AF4 a) { return sqrt(sqrt(a)); }
+ AF4 APrxLoLinearToPQ(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4((bits >> AU4_(3)) + AU4_(0x378D8723)); }
+ AF4 APrxMedLinearToPQ(AF4 a) { AF4 est = AF4_AU4((AU4_AF4(a) >> AU4_(3)) + AU4_(0x378D8723)); AF4 est8 = Oct(est);
+  return est - est * (est8 - a) / (AF1_(8.0) * est8); }
+ AF4 APrxHighLinearToPQ(AF4 a) { return sqrt(sqrt(sqrt(a))); }
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+//==============================================================================================================================
+ #if 1
+ // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+ // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+ // APSin*: evaluates the parabola x*|x|-x directly on the input.
+ AF1 APSinF1(AF1 x){return x*abs(x)-x;} // MAD.
+ AF2 APSinF2(AF2 x){return x*abs(x)-x;}
+ // APCos*: remaps the input with fract(x*0.5+0.75)*2-1, then reuses APSin* on the remapped value.
+ AF1 APCosF1(AF1 x){x=AFractF1(x*AF1_(0.5)+AF1_(0.75));x=x*AF1_(2.0)-AF1_(1.0);return APSinF1(x);} // 3x MAD, FRACT
+ AF2 APCosF2(AF2 x){x=AFractF2(x*AF2_(0.5)+AF2_(0.75));x=x*AF2_(2.0)-AF2_(1.0);return APSinF2(x);}
+ // APSinCosF1: returns {sin,cos} in .xy by feeding (x, remapped x) through the two-component APSinF2.
+ AF2 APSinCosF1(AF1 x){AF1 y=AFractF1(x*AF1_(0.5)+AF1_(0.75));y=y*AF1_(2.0)-AF1_(1.0);return APSinF2(AF2(x,y));}
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+ // For a packed {sin,cos} pair,
+ // - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
+ // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
+ // Half-precision counterparts of the functions above; the bodies are the same expressions on AH1/AH2.
+ AH1 APSinH1(AH1 x){return x*abs(x)-x;}
+ AH2 APSinH2(AH2 x){return x*abs(x)-x;} // AND,FMA
+ AH1 APCosH1(AH1 x){x=AFractH1(x*AH1_(0.5)+AH1_(0.75));x=x*AH1_(2.0)-AH1_(1.0);return APSinH1(x);}
+ AH2 APCosH2(AH2 x){x=AFractH2(x*AH2_(0.5)+AH2_(0.75));x=x*AH2_(2.0)-AH2_(1.0);return APSinH2(x);} // 3x FMA, 2xFRACT, AND
+ AH2 APSinCosH1(AH1 x){AH1 y=AFractH1(x*AH1_(0.5)+AH1_(0.75));y=y*AH1_(2.0)-AH1_(1.0);return APSinH2(AH2(x,y));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [ZOL] ZERO ONE LOGIC
+//------------------------------------------------------------------------------------------------------------------------------
+// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
+//------------------------------------------------------------------------------------------------------------------------------
+// 0 := false
+// 1 := true
+//------------------------------------------------------------------------------------------------------------------------------
+// AndNot(x,y) -> !(x&y) .... One op.
+// AndOr(x,y,z) -> (x&y)|z ... One op.
+// GtZero(x) -> x>0.0 ..... One op.
+// Sel(x,y,z) -> x?y:z ..... Two ops, has no precision loss.
+// Signed(x) -> x<0.0 ..... One op.
+// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMIZATION NOTES
+// ==================
+// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
+// For example 'a.xy*k.xx+k.yy'.
+//==============================================================================================================================
+ #if 1
+ // Unsigned-integer ZOL. With operands restricted to 0 and 1, min() acts as AND.
+ AU1 AZolAndU1(AU1 x,AU1 y){return min(x,y);}
+ AU2 AZolAndU2(AU2 x,AU2 y){return min(x,y);}
+ AU3 AZolAndU3(AU3 x,AU3 y){return min(x,y);}
+ AU4 AZolAndU4(AU4 x,AU4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // NOT: flip the low bit with XOR 1.
+ AU1 AZolNotU1(AU1 x){return x^AU1_(1);}
+ AU2 AZolNotU2(AU2 x){return x^AU2_(1);}
+ AU3 AZolNotU3(AU3 x){return x^AU3_(1);}
+ AU4 AZolNotU4(AU4 x){return x^AU4_(1);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // OR: with operands restricted to 0 and 1, max() acts as OR.
+ AU1 AZolOrU1(AU1 x,AU1 y){return max(x,y);}
+ AU2 AZolOrU2(AU2 x,AU2 y){return max(x,y);}
+ AU3 AZolOrU3(AU3 x,AU3 y){return max(x,y);}
+ AU4 AZolOrU4(AU4 x,AU4 y){return max(x,y);}
+//==============================================================================================================================
+ // Float ZOL to unsigned ZOL through the AU* constructor (a value conversion, not a bit reinterpretation).
+ AU1 AZolF1ToU1(AF1 x){return AU1(x);}
+ AU2 AZolF2ToU2(AF2 x){return AU2(x);}
+ AU3 AZolF3ToU3(AF3 x){return AU3(x);}
+ AU4 AZolF4ToU4(AF4 x){return AU4(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
+ // NOT of a float ZOL value returned as an unsigned integer: converts (1.0-x) with the AU* constructor.
+ AU1 AZolNotF1ToU1(AF1 x){return AU1(AF1_(1.0)-x);}
+ AU2 AZolNotF2ToU2(AF2 x){return AU2(AF2_(1.0)-x);}
+ AU3 AZolNotF3ToU3(AF3 x){return AU3(AF3_(1.0)-x);}
+ AU4 AZolNotF4ToU4(AF4 x){return AU4(AF4_(1.0)-x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unsigned ZOL to float ZOL through the AF* constructor.
+ AF1 AZolU1ToF1(AU1 x){return AF1(x);}
+ AF2 AZolU2ToF2(AU2 x){return AF2(x);}
+ AF3 AZolU3ToF3(AU3 x){return AF3(x);}
+ AF4 AZolU4ToF4(AU4 x){return AF4(x);}
+//==============================================================================================================================
+ // AND: with operands restricted to 0.0 and 1.0, min() is 1.0 only when both inputs are 1.0.
+ AF1 AZolAndF1(AF1 x,AF1 y){return min(x,y);}
+ AF2 AZolAndF2(AF2 x,AF2 y){return min(x,y);}
+ AF3 AZolAndF3(AF3 x,AF3 y){return min(x,y);}
+ AF4 AZolAndF4(AF4 x,AF4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndNot(x,y) -> !(x&y), evaluated as 1-x*y in a single multiply-add.
+ AF1 AZolAndNotF1(AF1 x,AF1 y){return (-x)*y+AF1_(1.0);}
+ AF2 AZolAndNotF2(AF2 x,AF2 y){return (-x)*y+AF2_(1.0);}
+ AF3 AZolAndNotF3(AF3 x,AF3 y){return (-x)*y+AF3_(1.0);}
+ AF4 AZolAndNotF4(AF4 x,AF4 y){return (-x)*y+AF4_(1.0);}
+ // Legacy spelling ('ASol' instead of the 'AZol' prefix used by the rest of this section).
+ // Kept as forwarding aliases so existing callers keep compiling; prefer AZolAndNotF*().
+ AF1 ASolAndNotF1(AF1 x,AF1 y){return AZolAndNotF1(x,y);}
+ AF2 ASolAndNotF2(AF2 x,AF2 y){return AZolAndNotF2(x,y);}
+ AF3 ASolAndNotF3(AF3 x,AF3 y){return AZolAndNotF3(x,y);}
+ AF4 ASolAndNotF4(AF4 x,AF4 y){return AZolAndNotF4(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndOr(x,y,z) -> (x&y)|z, evaluated as a saturated multiply-add.
+ AF1 AZolAndOrF1(AF1 x,AF1 y,AF1 z){return ASatF1(x*y+z);}
+ AF2 AZolAndOrF2(AF2 x,AF2 y,AF2 z){return ASatF2(x*y+z);}
+ AF3 AZolAndOrF3(AF3 x,AF3 y,AF3 z){return ASatF3(x*y+z);}
+ AF4 AZolAndOrF4(AF4 x,AF4 y,AF4 z){return ASatF4(x*y+z);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // GtZero(x) -> x>0.0: multiply by A_INFP_F and saturate.
+ // NOTE(review): A_INFP_F is defined elsewhere; presumably the +INF constant - confirm.
+ AF1 AZolGtZeroF1(AF1 x){return ASatF1(x*AF1_(A_INFP_F));}
+ AF2 AZolGtZeroF2(AF2 x){return ASatF2(x*AF2_(A_INFP_F));}
+ AF3 AZolGtZeroF3(AF3 x){return ASatF3(x*AF3_(A_INFP_F));}
+ AF4 AZolGtZeroF4(AF4 x){return ASatF4(x*AF4_(A_INFP_F));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // NOT: 1.0-x.
+ AF1 AZolNotF1(AF1 x){return AF1_(1.0)-x;}
+ AF2 AZolNotF2(AF2 x){return AF2_(1.0)-x;}
+ AF3 AZolNotF3(AF3 x){return AF3_(1.0)-x;}
+ AF4 AZolNotF4(AF4 x){return AF4_(1.0)-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // OR: with operands restricted to 0.0 and 1.0, max() is 1.0 when either input is 1.0.
+ AF1 AZolOrF1(AF1 x,AF1 y){return max(x,y);}
+ AF2 AZolOrF2(AF2 x,AF2 y){return max(x,y);}
+ AF3 AZolOrF3(AF3 x,AF3 y){return max(x,y);}
+ AF4 AZolOrF4(AF4 x,AF4 y){return max(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sel(x,y,z) -> x?y:z. 'r' is z when x is 0 and z-z when x is 1; the result adds x*y to it.
+ AF1 AZolSelF1(AF1 x,AF1 y,AF1 z){AF1 r=(-x)*z+z;return x*y+r;}
+ AF2 AZolSelF2(AF2 x,AF2 y,AF2 z){AF2 r=(-x)*z+z;return x*y+r;}
+ AF3 AZolSelF3(AF3 x,AF3 y,AF3 z){AF3 r=(-x)*z+z;return x*y+r;}
+ AF4 AZolSelF4(AF4 x,AF4 y,AF4 z){AF4 r=(-x)*z+z;return x*y+r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed(x) -> x<0.0: multiply by A_INFN_F and saturate.
+ // NOTE(review): A_INFN_F is defined elsewhere; presumably the -INF constant - confirm.
+ AF1 AZolSignedF1(AF1 x){return ASatF1(x*AF1_(A_INFN_F));}
+ AF2 AZolSignedF2(AF2 x){return ASatF2(x*AF2_(A_INFN_F));}
+ AF3 AZolSignedF3(AF3 x){return ASatF3(x*AF3_(A_INFN_F));}
+ AF4 AZolSignedF4(AF4 x){return ASatF4(x*AF4_(A_INFN_F));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // ZeroPass(x,y) -> x?0:y. The test and the selection run on the integer images, so 'y' passes through unmodified.
+ // NOTE(review): the F2/F3/F4 overloads apply '!=' and '?:' to vectors. In GLSL a vector '!=' yields one bool for the
+ // whole vector, so the selection would not be per component there - confirm behaviour on each target compiler.
+ AF1 AZolZeroPassF1(AF1 x,AF1 y){return AF1_AU1((AU1_AF1(x)!=AU1_(0))?AU1_(0):AU1_AF1(y));}
+ AF2 AZolZeroPassF2(AF2 x,AF2 y){return AF2_AU2((AU2_AF2(x)!=AU2_(0))?AU2_(0):AU2_AF2(y));}
+ AF3 AZolZeroPassF3(AF3 x,AF3 y){return AF3_AU3((AU3_AF3(x)!=AU3_(0))?AU3_(0):AU3_AF3(y));}
+ AF4 AZolZeroPassF4(AF4 x,AF4 y){return AF4_AU4((AU4_AF4(x)!=AU4_(0))?AU4_(0):AU4_AF4(y));}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+ // 16-bit unsigned (AW*) ZOL: min() as AND, XOR 1 as NOT, max() as OR, for operands restricted to 0 and 1.
+ AW1 AZolAndW1(AW1 x,AW1 y){return min(x,y);}
+ AW2 AZolAndW2(AW2 x,AW2 y){return min(x,y);}
+ AW3 AZolAndW3(AW3 x,AW3 y){return min(x,y);}
+ AW4 AZolAndW4(AW4 x,AW4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AW1 AZolNotW1(AW1 x){return x^AW1_(1);}
+ AW2 AZolNotW2(AW2 x){return x^AW2_(1);}
+ AW3 AZolNotW3(AW3 x){return x^AW3_(1);}
+ AW4 AZolNotW4(AW4 x){return x^AW4_(1);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AW1 AZolOrW1(AW1 x,AW1 y){return max(x,y);}
+ AW2 AZolOrW2(AW2 x,AW2 y){return max(x,y);}
+ AW3 AZolOrW3(AW3 x,AW3 y){return max(x,y);}
+ AW4 AZolOrW4(AW4 x,AW4 y){return max(x,y);}
+//==============================================================================================================================
+ // Uses denormal trick.
+ // The half ZOL value is multiplied by the half whose integer image is 1, and the product is returned as its integer image.
+ AW1 AZolH1ToW1(AH1 x){return AW1_AH1(x*AH1_AW1(AW1_(1)));}
+ AW2 AZolH2ToW2(AH2 x){return AW2_AH2(x*AH2_AW2(AW2_(1)));}
+ AW3 AZolH3ToW3(AH3 x){return AW3_AH3(x*AH3_AW3(AW3_(1)));}
+ AW4 AZolH4ToW4(AH4 x){return AW4_AH4(x*AH4_AW4(AW4_(1)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // AMD arch lacks a packed conversion opcode.
+ // Integer ZOL (0 or 1) to half ZOL: integer-multiply by the integer image of 1.0, then view the product as a half.
+ AH1 AZolW1ToH1(AW1 x){return AH1_AW1(x*AW1_AH1(AH1_(1.0)));}
+ AH2 AZolW2ToH2(AW2 x){return AH2_AW2(x*AW2_AH2(AH2_(1.0)));}
+ AH3 AZolW3ToH3(AW3 x){return AH3_AW3(x*AW3_AH3(AH3_(1.0)));}
+ AH4 AZolW4ToH4(AW4 x){return AH4_AW4(x*AW4_AH4(AH4_(1.0)));}
+ // Legacy names whose width digit did not match the operand width (W1->H3, W2->H4).
+ // Kept as forwarding aliases for existing callers; prefer AZolW3ToH3()/AZolW4ToH4().
+ AH3 AZolW1ToH3(AW3 x){return AZolW3ToH3(x);}
+ AH4 AZolW2ToH4(AW4 x){return AZolW4ToH4(x);}
+//==============================================================================================================================
+ // AND: with operands restricted to 0.0 and 1.0, min() is 1.0 only when both inputs are 1.0.
+ AH1 AZolAndH1(AH1 x,AH1 y){return min(x,y);}
+ AH2 AZolAndH2(AH2 x,AH2 y){return min(x,y);}
+ AH3 AZolAndH3(AH3 x,AH3 y){return min(x,y);}
+ AH4 AZolAndH4(AH4 x,AH4 y){return min(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndNot(x,y) -> !(x&y), evaluated as 1-x*y in a single multiply-add.
+ AH1 AZolAndNotH1(AH1 x,AH1 y){return (-x)*y+AH1_(1.0);}
+ AH2 AZolAndNotH2(AH2 x,AH2 y){return (-x)*y+AH2_(1.0);}
+ AH3 AZolAndNotH3(AH3 x,AH3 y){return (-x)*y+AH3_(1.0);}
+ AH4 AZolAndNotH4(AH4 x,AH4 y){return (-x)*y+AH4_(1.0);}
+ // Legacy spelling ('ASol' instead of the 'AZol' prefix used by the rest of this section).
+ // Kept as forwarding aliases so existing callers keep compiling; prefer AZolAndNotH*().
+ AH1 ASolAndNotH1(AH1 x,AH1 y){return AZolAndNotH1(x,y);}
+ AH2 ASolAndNotH2(AH2 x,AH2 y){return AZolAndNotH2(x,y);}
+ AH3 ASolAndNotH3(AH3 x,AH3 y){return AZolAndNotH3(x,y);}
+ AH4 ASolAndNotH4(AH4 x,AH4 y){return AZolAndNotH4(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndOr(x,y,z) -> (x&y)|z, evaluated as a saturated multiply-add.
+ AH1 AZolAndOrH1(AH1 x,AH1 y,AH1 z){return ASatH1(x*y+z);}
+ AH2 AZolAndOrH2(AH2 x,AH2 y,AH2 z){return ASatH2(x*y+z);}
+ AH3 AZolAndOrH3(AH3 x,AH3 y,AH3 z){return ASatH3(x*y+z);}
+ AH4 AZolAndOrH4(AH4 x,AH4 y,AH4 z){return ASatH4(x*y+z);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // GtZero(x) -> x>0.0: multiply by A_INFP_H and saturate.
+ // NOTE(review): A_INFP_H is defined elsewhere; presumably the half +INF constant - confirm.
+ AH1 AZolGtZeroH1(AH1 x){return ASatH1(x*AH1_(A_INFP_H));}
+ AH2 AZolGtZeroH2(AH2 x){return ASatH2(x*AH2_(A_INFP_H));}
+ AH3 AZolGtZeroH3(AH3 x){return ASatH3(x*AH3_(A_INFP_H));}
+ AH4 AZolGtZeroH4(AH4 x){return ASatH4(x*AH4_(A_INFP_H));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // NOT: 1.0-x.
+ AH1 AZolNotH1(AH1 x){return AH1_(1.0)-x;}
+ AH2 AZolNotH2(AH2 x){return AH2_(1.0)-x;}
+ AH3 AZolNotH3(AH3 x){return AH3_(1.0)-x;}
+ AH4 AZolNotH4(AH4 x){return AH4_(1.0)-x;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // OR: with operands restricted to 0.0 and 1.0, max() is 1.0 when either input is 1.0.
+ AH1 AZolOrH1(AH1 x,AH1 y){return max(x,y);}
+ AH2 AZolOrH2(AH2 x,AH2 y){return max(x,y);}
+ AH3 AZolOrH3(AH3 x,AH3 y){return max(x,y);}
+ AH4 AZolOrH4(AH4 x,AH4 y){return max(x,y);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sel(x,y,z) -> x?y:z. 'r' is z when x is 0 and z-z when x is 1; the result adds x*y to it.
+ AH1 AZolSelH1(AH1 x,AH1 y,AH1 z){AH1 r=(-x)*z+z;return x*y+r;}
+ AH2 AZolSelH2(AH2 x,AH2 y,AH2 z){AH2 r=(-x)*z+z;return x*y+r;}
+ AH3 AZolSelH3(AH3 x,AH3 y,AH3 z){AH3 r=(-x)*z+z;return x*y+r;}
+ AH4 AZolSelH4(AH4 x,AH4 y,AH4 z){AH4 r=(-x)*z+z;return x*y+r;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed(x) -> x<0.0: multiply by A_INFN_H and saturate.
+ // NOTE(review): A_INFN_H is defined elsewhere; presumably the half -INF constant - confirm.
+ AH1 AZolSignedH1(AH1 x){return ASatH1(x*AH1_(A_INFN_H));}
+ AH2 AZolSignedH2(AH2 x){return ASatH2(x*AH2_(A_INFN_H));}
+ AH3 AZolSignedH3(AH3 x){return ASatH3(x*AH3_(A_INFN_H));}
+ AH4 AZolSignedH4(AH4 x){return ASatH4(x*AH4_(A_INFN_H));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// COLOR CONVERSIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
+// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
+// These are branch free implementations.
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion. +//------------------------------------------------------------------------------------------------------------------------------ +// TRANSFER FUNCTIONS +// ================== +// 709 ..... Rec709 used for some HDTVs +// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native +// Pq ...... PQ native for HDR10 +// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type +// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations) +// Three ... Gamma 3.0, less fast, but good for HDR. +//------------------------------------------------------------------------------------------------------------------------------ +// KEEPING TO SPEC +// =============== +// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +// Also there is a slight step in the transition regions. +// Precision of the coefficients in the spec being the likely cause. +// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store. +// This is to work around lack of hardware (typically only ROP does the conversion for free). +// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free). +// So this header keeps with the spec. +// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear. +// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------
+// FOR PQ
+// ======
+// Both input and output are {0.0-1.0}, where output 1.0 represents 10000.0 cd/m^2.
+// All constants are only specified to FP32 precision.
+// External PQ source reference,
+// - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
+//------------------------------------------------------------------------------------------------------------------------------
+// PACKED VERSIONS
+// ===============
+// These are the A*H2() functions.
+// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
+// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
+// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
+//------------------------------------------------------------------------------------------------------------------------------
+// NOTES
+// =====
+// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
+//==============================================================================================================================
+ #if 1
+ // Linear -> Rec.709. 'j' holds {0.018*4.5, 4.5, 0.45} and 'k' holds {1.099, -0.099}.
+ // clamp(j.x, linearSegment, curvedSegment) selects between the segments without a branch.
+ AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
+ AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
+ AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
+ // Linear -> gamma: raises 'c' to the power 'rcpX' (the caller passes 1/gamma).
+ AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));}
+ AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));}
+ AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> PQ.
+ AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
+  return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
+ AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302));
+  return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
+ AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302));
+  return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
+ // Legacy overloads: the AF2/AF3 versions were originally published under the 'F1' suffix.
+ // Kept as forwarding aliases for existing callers; prefer AToPqF2()/AToPqF3().
+ AF2 AToPqF1(AF2 x){return AToPqF2(x);}
+ AF3 AToPqF1(AF3 x){return AToPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> sRGB. clamp(j.x, linearSegment, curvedSegment) selects between the segments without a branch.
+ AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
+ AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
+ AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> gamma 2.0.
+ AF1 AToTwoF1(AF1 c){return sqrt(c);}
+ AF2 AToTwoF2(AF2 c){return sqrt(c);}
+ AF3 AToTwoF3(AF3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> gamma 3.0.
+ AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
+ AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
+ AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
+ #endif
+//==============================================================================================================================
+ #if 1
+ // Unfortunately median won't work here.
+ // Rec.709 -> linear. The input 'c' is an encoded value, so the segment test uses the encoded breakpoint
+ // 0.081 (= 0.018*4.5, the same value ATo709*() clamps against). The previous constant 0.081/4.5 was the
+ // linear-domain breakpoint and sent encoded values in [0.018, 0.081) down the power segment.
+ AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+  return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Gamma -> linear: raises 'c' to the power 'x'.
+ AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));}
+ AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
+ AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // PQ -> linear.
+ AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
+  return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
+ AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
+  return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
+ AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
+  return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
+ // Legacy overloads published under the 'F1' suffix; prefer AFromPqF2()/AFromPqF3().
+ AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
+ AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unfortunately median won't work here.
+ // sRGB -> linear. The input 'c' is an encoded value, so the segment test uses the encoded breakpoint 0.04045.
+ // The previous constant 0.04045/12.92 (= 0.0031308) was the linear-domain breakpoint and sent encoded values
+ // in [0.0031308, 0.04045) down the power segment.
+ AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+  return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Gamma 2.0 -> linear.
+ AF1 AFromTwoF1(AF1 c){return c*c;}
+ AF2 AFromTwoF2(AF2 c){return c*c;}
+ AF3 AFromTwoF3(AF3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Gamma 3.0 -> linear.
+ AF1 AFromThreeF1(AF1 c){return c*c*c;}
+ AF2 AFromThreeF2(AF2 c){return c*c*c;}
+ AF3 AFromThreeF3(AF3 c){return c*c*c;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+ // Half-precision linear -> Rec.709. clamp(j.x, linearSegment, curvedSegment) selects the segment without a branch.
+ AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
+ AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
+ AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision linear -> gamma: raises 'c' to the power 'rcpX' (the caller passes 1/gamma).
+ AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
+ AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
+ AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision linear -> sRGB. clamp(j.x, linearSegment, curvedSegment) selects the segment without a branch.
+ AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+  return clamp(j.x ,c*j.y ,pow(c,j.z )*k.x +k.y );}
+ AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+  return clamp(j.xx ,c*j.yy ,pow(c,j.zz )*k.xx +k.yy );}
+ AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+  return clamp(j.xxx,c*j.yyy,pow(c,j.zzz)*k.xxx+k.yyy);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision linear -> gamma 2.0.
+ AH1 AToTwoH1(AH1 c){return sqrt(c);}
+ AH2 AToTwoH2(AH2 c){return sqrt(c);}
+ AH3 AToTwoH3(AH3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision linear -> gamma 3.0.
+ AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));}
+ AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));}
+ AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));}
+ // Legacy overloads: the half versions were originally published with the 32-bit 'F' suffix.
+ // Kept as forwarding aliases for existing callers; prefer AToThreeH*().
+ AH1 AToThreeF1(AH1 c){return AToThreeH1(c);}
+ AH2 AToThreeF2(AH2 c){return AToThreeH2(c);}
+ AH3 AToThreeF3(AH3 c){return AToThreeH3(c);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+ // Half-precision Rec.709 -> linear. The input 'c' is an encoded value, so the segment test uses the encoded
+ // breakpoint 0.081 (= 0.018*4.5). The previous constant 0.081/4.5 was the linear-domain breakpoint.
+ AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+  return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision gamma -> linear: raises 'c' to the power 'x'.
+ AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
+ AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
+ AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision sRGB -> linear. The segment test uses the encoded breakpoint 0.04045; the previous constant
+ // 0.04045/12.92 was the linear-domain breakpoint.
+ AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+  return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+ // Legacy misspelled names ('AHrom...F' instead of 'AFrom...H').
+ // Kept as forwarding aliases for existing callers; prefer AFromSrgbH*().
+ AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
+ AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
+ AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision gamma 2.0 -> linear.
+ AH1 AFromTwoH1(AH1 c){return c*c;}
+ AH2 AFromTwoH2(AH2 c){return c*c;}
+ AH3 AFromTwoH3(AH3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half-precision gamma 3.0 -> linear.
+ AH1 AFromThreeH1(AH1 c){return c*c*c;}
+ AH2 AFromThreeH2(AH2 c){return c*c*c;}
+ AH3 AFromThreeH3(AH3 c){return c*c*c;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// CS REMAP
+//==============================================================================================================================
+ // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
+ // 543210
+ // ======
+ // ..xxx.
+ // yy...y
+ // The diagram assigns each bit of the 6-bit index 'a' to the output x or y coordinate.
+ // NOTE(review): ABfe()/ABfiM() are defined elsewhere; presumably bit-field extract (src,offset,bits) and
+ // masked bit-field insert (src,insert,bits) - confirm against their definitions.
+ AU2 ARmp8x8(AU1 a){return AU2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
+//==============================================================================================================================
+ // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
+ // 543210
+ // ======
+ // .xx..x
+ // y..yy.
+ // Details,
+ // LANE TO 8x8 MAPPING
+ // ===================
+ // 00 01 08 09 10 11 18 19
+ // 02 03 0a 0b 12 13 1a 1b
+ // 04 05 0c 0d 14 15 1c 1d
+ // 06 07 0e 0f 16 17 1e 1f
+ // 20 21 28 29 30 31 38 39
+ // 22 23 2a 2b 32 33 3a 3b
+ // 24 25 2c 2d 34 35 3c 3d
+ // 26 27 2e 2f 36 37 3e 3f
+ AU2 ARmpRed8x8(AU1 a){return AU2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
+//==============================================================================================================================
+ #ifdef A_HALF
+ // Same two remaps, with the coordinate pair returned as AW2 instead of AU2.
+ AW2 ARmp8x8H(AU1 a){return AW2(ABfe(a,1u,3u),ABfiM(ABfe(a,3u,3u),a,1u));}
+ AW2 ARmpRed8x8H(AU1 a){return AW2(ABfiM(ABfe(a,2u,3u),a,1u),ABfiM(ABfe(a,3u,3u),ABfe(a,1u,2u),2u));}
+ #endif
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+// REFERENCE
+//
+//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... 
largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
+//==============================================================================================================================
+// Everything below, up to the matching #endif, is compiled only when A_GPU is defined.
+#ifdef A_GPU
+ // Boolean literals for the GPU build; A_STATIC expands to nothing here.
+ #define A_TRUE true
+ #define A_FALSE false
+ #define A_STATIC
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY
+//==============================================================================================================================
+ // ret<T>: spelling of a vector return type; on the GPU this is just the type itself.
+ #define retAD2 AD2
+ #define retAD3 AD3
+ #define retAD4 AD4
+ #define retAF2 AF2
+ #define retAF3 AF3
+ #define retAF4 AF4
+ #define retAL2 AL2
+ #define retAL3 AL3
+ #define retAL4 AL4
+ #define retAU2 AU2
+ #define retAU3 AU3
+ #define retAU4 AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ // in<T>: vector parameter declared with the 'in' qualifier.
+ #define inAD2 in AD2
+ #define inAD3 in AD3
+ #define inAD4 in AD4
+ #define inAF2 in AF2
+ #define inAF3 in AF3
+ #define inAF4 in AF4
+ #define inAL2 in AL2
+ #define inAL3 in AL3
+ #define inAL4 in AL4
+ #define inAU2 in AU2
+ #define inAU3 in AU3
+ #define inAU4 in AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ // inout<T>: vector parameter declared with the 'inout' qualifier.
+ #define inoutAD2 inout AD2
+ #define inoutAD3 inout AD3
+ #define inoutAD4 inout AD4
+ #define inoutAF2 inout AF2
+ #define inoutAF3 inout AF3
+ #define inoutAF4 inout AF4
+ #define inoutAL2 inout AL2
+ #define inoutAL3 inout AL3
+ #define inoutAL4 inout AL4
+ #define inoutAU2 inout AU2
+ #define inoutAU3 inout AU3
+ #define inoutAU4 inout AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ // out<T>: vector parameter declared with the 'out' qualifier.
+ #define outAD2 out AD2
+ #define outAD3 out AD3
+ #define outAD4 out AD4
+ #define outAF2 out AF2
+ #define outAF3 out AF3
+ #define outAF4 out AF4
+ #define outAL2 out AL2
+ #define outAL3 out AL3
+ #define outAL4 out AL4
+ #define outAU2 out AU2
+ #define outAU3 out AU3
+ #define outAU4 out AU4
+//------------------------------------------------------------------------------------------------------------------------------
+ // var<T>(x): declares a vector variable named x.
+ #define varAD2(x) AD2 x
+ #define varAD3(x) AD3 x
+ #define varAD4(x) AD4 x
+ #define varAF2(x) AF2 x
+ #define varAF3(x) AF3 x
+ #define varAF4(x) AF4 x
+ #define varAL2(x) AL2 x
+ #define varAL3(x) AL3 x
+ #define varAL4(x) AL4 x
+ #define varAU2(x) AU2 x
+ #define varAU3(x) AU3 x
+ #define varAU4(x) AU4 x
+//------------------------------------------------------------------------------------------------------------------------------
+ // init<T>(...): builds a vector value from its components via the type constructor.
+ #define initAD2(x,y) AD2(x,y)
+ #define initAD3(x,y,z) AD3(x,y,z)
+ #define initAD4(x,y,z,w) AD4(x,y,z,w)
+ #define initAF2(x,y) AF2(x,y)
+ #define initAF3(x,y,z) AF3(x,y,z)
+ #define initAF4(x,y,z,w) AF4(x,y,z,w)
+ #define initAL2(x,y) AL2(x,y)
+ #define initAL3(x,y,z) AL3(x,y,z)
+ #define initAL4(x,y,z,w) AL4(x,y,z,w)
+ #define initAU2(x,y) AU2(x,y)
+ #define initAU3(x,y,z) AU3(x,y,z)
+ #define initAU4(x,y,z,w) AU4(x,y,z,w)
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// SCALAR RETURN OPS
+//==============================================================================================================================
+ // Each macro forwards to the shader intrinsic of the same meaning.
+ // The D1/F1 forms cast their argument(s) with AD1()/AF1() first; the AMax*/AMin* forms pass arguments through as-is.
+ #define AAbsD1(a) abs(AD1(a))
+ #define AAbsF1(a) abs(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ACosD1(a) cos(AD1(a))
+ #define ACosF1(a) cos(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ADotD2(a,b) dot(AD2(a),AD2(b))
+ #define ADotD3(a,b) dot(AD3(a),AD3(b))
+ #define ADotD4(a,b) dot(AD4(a),AD4(b))
+ #define ADotF2(a,b) dot(AF2(a),AF2(b))
+ #define ADotF3(a,b) dot(AF3(a),AF3(b))
+ #define ADotF4(a,b) dot(AF4(a),AF4(b))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AExp2D1(a) exp2(AD1(a))
+ #define AExp2F1(a) exp2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AFloorD1(a) floor(AD1(a))
+ #define AFloorF1(a) floor(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ALog2D1(a) log2(AD1(a))
+ #define ALog2F1(a) log2(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMaxD1(a,b) max(a,b)
+ #define AMaxF1(a,b) max(a,b)
+ #define AMaxL1(a,b) max(a,b)
+ #define AMaxU1(a,b) max(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define AMinD1(a,b) min(a,b)
+ #define AMinF1(a,b) min(a,b)
+ #define AMinL1(a,b) min(a,b)
+ #define AMinU1(a,b) min(a,b)
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASinD1(a) sin(AD1(a))
+ #define ASinF1(a) sin(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ #define ASqrtD1(a) sqrt(AD1(a))
+ #define ASqrtF1(a) sqrt(AF1(a))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// SCALAR RETURN OPS - DEPENDENT
+//==============================================================================================================================
+ // NOTE(review): APowD1 casts the base with AD1() but the exponent with AF1(), unlike the other D1 macros,
+ // which use AD1() throughout - confirm whether the mixed precision is intended.
+ #define APowD1(a,b) pow(AD1(a),AF1(b))
+ #define APowF1(a,b) pow(AF1(a),AF1(b))
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// VECTOR OPS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are added as needed for production or prototyping, so not necessarily a complete set.
+// They follow a convention of taking in a destination and also returning the destination value to increase utility.
+//==============================================================================================================================
+ // Double-precision variants, compiled only when A_DUBL is defined.
+ // Every op writes its result to the 'out' parameter d and returns that same value.
+ #ifdef A_DUBL
+ // d = abs(a)
+ AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;}
+ AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;}
+ AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a + b
+ AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;}
+ AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;}
+ AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a + b, with the scalar b widened through AD2_/AD3_/AD4_ (defined outside this view).
+ AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;}
+ AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;}
+ AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a
+ AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;}
+ AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;}
+ AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = ALerpD*(a,b,c); ALerpD* is defined outside this view.
+ AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;}
+ AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;}
+ AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same as opALerpD*, with the scalar c widened through AD2_/AD3_/AD4_.
+ AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;}
+ AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;}
+ AD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = max(a,b)
+ AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;}
+ AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;}
+ AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = min(a,b)
+ AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;}
+ AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;}
+ AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a * b (component-wise)
+ AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;}
+ AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;}
+ AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a * b, with the scalar b widened through AD2_/AD3_/AD4_.
+ AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;}
+ AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;}
+ AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = -a
+ AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;}
+ AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;}
+ AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = ARcpD*(a); ARcpD* is defined outside this view.
+ AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;}
+ AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;}
+ AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;}
+ #endif
+//==============================================================================================================================
+ // 32-bit float variants. Every op writes its result to the 'out' parameter d and returns that same value.
+ // d = abs(a)
+ AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;}
+ AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;}
+ AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a + b
+ AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;}
+ AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;}
+ AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a + b, with the scalar b widened through AF2_/AF3_/AF4_ (defined outside this view).
+ AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;}
+ AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;}
+ AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a
+ AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;}
+ AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;}
+ AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = ALerpF*(a,b,c); ALerpF* is defined outside this view.
+ AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;}
+ AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;}
+ AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Same as opALerpF*, with the scalar c widened through AF2_/AF3_/AF4_.
+ AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;}
+ AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;}
+ AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = max(a,b); like the other op* helpers, the result is written to d and also returned.
+ AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;}
+ AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;}
+ AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = min(a,b)
+ AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;}
+ AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;}
+ AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a * b (component-wise)
+ AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;}
+ AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;}
+ AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = a * b, with the scalar b widened through AF2_/AF3_/AF4_ (defined outside this view).
+ AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;}
+ AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;}
+ AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = -a
+ AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;}
+ AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;}
+ AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // d = ARcpF*(a); ARcpF* is defined outside this view.
+ AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;}
+ AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;}
+ AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;}
+#endif
+
+// Enables the non-packed 32-bit EASU code, which is guarded by defined(A_GPU)&&defined(FSR_EASU_F).
+#define FSR_EASU_F 1
+// File-scope state for the scaling pass.
+// NOTE(review): presumably filled and consumed by the shader entry point outside this view - confirm.
+AU4 con0, con1, con2, con3;
+float srcW, srcH, dstW, dstH;
+vec2 bLeft, tRight;
+
+// Scales a coordinate component-wise by scaleX/scaleY (both declared outside this view).
+AF2 translate(AF2 pos) {
+ return AF2(pos.x * scaleX, pos.y * scaleY);
+}
+
+// Stores the two corners in the bLeft/tRight globals.
+void setBounds(vec2 bottomLeft, vec2 topRight) {
+ bLeft = bottomLeft;
+ tRight = topRight;
+}
+
+// EASU input callbacks: gather4 of one colour component (0 = red, 1 = green, 2 = blue) from Source
+// at the coordinate returned by translate(p).
+AF4 FsrEasuRF(AF2 p) { AF4 res = textureGather(Source, translate(p), 0); return res; }
+AF4 FsrEasuGF(AF2 p) { AF4 res = textureGather(Source, translate(p), 1); return res; }
+AF4 FsrEasuBF(AF2 p) { AF4 res = textureGather(Source, translate(p), 2); return res; }
+
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629
+//
+//
+//------------------------------------------------------------------------------------------------------------------------------
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//------------------------------------------------------------------------------------------------------------------------------
+// FidelityFX Super Resolution Sample
+//
+// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. 
+//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high 
quality spatial-only scaling at relatively low cost.
+// Meaning EASU is appropriate for laptops and other low-end GPUs.
+// Quality from 1x to 4x area scaling is good.
+//------------------------------------------------------------------------------------------------------------------------------
+// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel.
+// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos.
+// This is also kept as simple as possible to have minimum runtime.
+//------------------------------------------------------------------------------------------------------------------------------
+// The lanczos filter has negative lobes, so by itself it will introduce ringing.
+// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood,
+// and limits output to the minimum and maximum of that neighborhood.
+//------------------------------------------------------------------------------------------------------------------------------
+// Input image requirements:
+//
+// Color needs to be encoded as 3 channel [red, green, blue] (e.g. XYZ not supported)
+// Each channel needs to be in the range [0, 1]
+// Any color primaries are supported
+// Display / tonemapping curve needs to be as if presenting to sRGB display or similar (e.g. Gamma 2.0)
+// There should be no banding in the input
+// There should be no high amplitude noise in the input
+// There should be no noise in the input that is not at input pixel granularity
+// For performance purposes, use 32bpp formats
+//------------------------------------------------------------------------------------------------------------------------------
+// Best to apply EASU at the end of the frame after tonemapping
+// but before film grain or composite of the UI.
+//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. 
+// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. 
+ con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +//If the an offset into the input image resource +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 
inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. 
+ v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compile time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from the max of the abs differences on both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis. 
+ AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). 
+ AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR<AF1_(1.0/32768.0); + dirR=APrxLoRsqF1(dirR); + dirR=zro?AF1_(1.0):dirR; + dir.x=zro?AF1_(1.0):dir.x; + dir*=AF2_(dirR); + // Transform from {0 to 2} to {0 to 1} range, and shape with square. + len=len*AF1_(0.5); + len*=len; + // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}. + AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y))); + // Anisotropic length after rotation, + // x := 1.0 lerp to 'stretch' on edges + // y := 1.0 lerp to 2x on edges + AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len); + // Based on the amount of 'edge', + // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}. + AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len; + // Set distance^2 clipping point to the end of the adjustable window. 
+ AF1 clp=APrxLoRcpF1(lob); +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulation mixed with min/max of 4 nearest. + // b c + // e f g h + // i j k l + // n o + AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)), + AF3(klhgR.x,klhgG.x,klhgB.x)); + AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)), + AF3(klhgR.x,klhgG.x,klhgB.x)); + // Accumulation. + AF3 aC=AF3_(0.0); + AF1 aW=AF1_(0.0); + FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b + FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c + FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i + FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j + FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f + FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e + FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k + FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l + FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h + FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g + FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o + FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize and dering. 
+ pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H) +// Input callback prototypes, need to be implemented by calling shader + AH4 FsrEasuRH(AF2 p); + AH4 FsrEasuGH(AF2 p); + AH4 FsrEasuBH(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // This runs 2 taps in parallel. + void FsrEasuTapH( + inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, + inout AH2 aW, + AH2 offX,AH2 offY, + AH2 dir, + AH2 len, + AH1 lob, + AH1 clp, + AH2 cR,AH2 cG,AH2 cB){ + AH2 vX,vY; + vX=offX* dir.xx +offY*dir.yy; + vY=offX*(-dir.yy)+offY*dir.xx; + vX*=len.x;vY*=len.y; + AH2 d2=vX*vX+vY*vY; + d2=min(d2,AH2_(clp)); + AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0); + AH2 wA=AH2_(lob)*d2+AH2_(-1.0); + wB*=wB; + wA*=wA; + wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0)); + AH2 w=wB*wA; + aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // This runs 2 taps in parallel. 
+ void FsrEasuSetH( + inout AH2 dirPX,inout AH2 dirPY, + inout AH2 lenP, + AH2 pp, + AP1 biST,AP1 biUV, + AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ + AH2 w = AH2_(0.0); + if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); + if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_( pp.y); + // ABS is not free in the packed FP16 path. + AH2 dc=lD-lC; + AH2 cb=lC-lB; + AH2 lenX=max(abs(dc),abs(cb)); + lenX=ARcpH2(lenX); + AH2 dirX=lD-lB; + dirPX+=dirX*w; + lenX=ASatH2(abs(dirX)*lenX); + lenX*=lenX; + lenP+=lenX*w; + AH2 ec=lE-lC; + AH2 ca=lC-lA; + AH2 lenY=max(abs(ec),abs(ca)); + lenY=ARcpH2(lenY); + AH2 dirY=lE-lA; + dirPY+=dirY*w; + lenY=ASatH2(abs(dirY)*lenY); + lenY*=lenY; + lenP+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuH( + out AH3 pix, + AU2 ip, + AU4 con0, + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; + AH2 ppp=AH2(pp); +//------------------------------------------------------------------------------------------------------------------------------ + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AH4 bczzR=FsrEasuRH(p0); + AH4 bczzG=FsrEasuGH(p0); + AH4 bczzB=FsrEasuBH(p0); + AH4 ijfeR=FsrEasuRH(p1); + AH4 ijfeG=FsrEasuGH(p1); + AH4 ijfeB=FsrEasuBH(p1); + AH4 klhgR=FsrEasuRH(p2); + AH4 klhgG=FsrEasuGH(p2); + AH4 klhgB=FsrEasuBH(p2); + AH4 zzonR=FsrEasuRH(p3); + AH4 zzonG=FsrEasuGH(p3); + AH4 zzonB=FsrEasuBH(p3); +//------------------------------------------------------------------------------------------------------------------------------ + AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); + AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG); + AH4 
klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG); + AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG); + AH1 bL=bczzL.x; + AH1 cL=bczzL.y; + AH1 iL=ijfeL.x; + AH1 jL=ijfeL.y; + AH1 fL=ijfeL.z; + AH1 eL=ijfeL.w; + AH1 kL=klhgL.x; + AH1 lL=klhgL.y; + AH1 hL=klhgL.z; + AH1 gL=klhgL.w; + AH1 oL=zzonL.z; + AH1 nL=zzonL.w; + // This part is different, accumulating 2 taps in parallel. + AH2 dirPX=AH2_(0.0); + AH2 dirPY=AH2_(0.0); + AH2 lenP=AH2_(0.0); + FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL)); + FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL)); + AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); + AH1 len=lenP.r+lenP.g; +//------------------------------------------------------------------------------------------------------------------------------ + AH2 dir2=dir*dir; + AH1 dirR=dir2.x+dir2.y; + AP1 zro=dirR<AH1_(1.0/32768.0); + dirR=APrxLoRsqH1(dirR); + dirR=zro?AH1_(1.0):dirR; + dir.x=zro?AH1_(1.0):dir.x; + dir*=AH2_(dirR); + len=len*AH1_(0.5); + len*=len; + AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y))); + AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); + AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; + AH1 clp=APrxLoRcpH1(lob); +//------------------------------------------------------------------------------------------------------------------------------ + // FP16 is different, using packed trick to do min and max in same operation. + AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x))); + AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x))); + AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x))); + // This part is different for FP16, working pairs of taps at a time. 
+ AH2 pR=AH2_(0.0); + AH2 pG=AH2_(0.0); + AH2 pB=AH2_(0.0); + AH2 pW=AH2_(0.0); + FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); + FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); + FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); + AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); + AH1 aW=pW.x+pW.y; +//------------------------------------------------------------------------------------------------------------------------------ + // Slightly different for FP16 version due to combined min and max. 
+ pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING +// +//------------------------------------------------------------------------------------------------------------------------------ +// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness. +// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping. +// RCAS also has a built in process to limit sharpening of what it detects as possible noise. +// RCAS sharper does not support scaling, as it should be applied after EASU scaling. +// Pass EASU output straight into RCAS, no color conversions necessary. +//------------------------------------------------------------------------------------------------------------------------------ +// RCAS is based on the following logic. +// RCAS uses a 5 tap filter in a cross pattern (same as CAS), +// w n +// w 1 w for taps w m e +// w s +// Where 'w' is the negative lobe weight. 
+// output = (w*(n+e+w+s)+m)/(4*w+1) +// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range, +// 0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// The solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon needs to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise. 
+// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. 
+ sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Algorithm uses minimal 3x3 pixel neighborhood. 
+ // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). 
+ AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. 
+ AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood. 
+ // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. 
+ AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be 
used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix<R,G,B>.x = left 8x8 tile + // pix<R,G,B>.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Sharpening algorithm (no scaling) uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion. 
+ AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. 
+ #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). 
+//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain {0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). 
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. 
+//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY 
PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this computation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. 
+ void FsrTepdC8F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/255.0);b=b*b; + // Ratio of 'a' to 'b' required to produce 'c'. + // APrxLoRcpF1() won't work here (at least for very high dynamic ranges). + // APrxMedRcpF1() is an IADD,FMA,MUL. + AF3 r=(c-b)*APrxMedRcpF3(a-b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // AGtZeroF1() is a MUL. + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC10F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/1023.0);b=b*b; + AF3 r=(c-b)*APrxMedRcpF3(a-b); + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + AH1 FsrTepdDitH(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AH1(AFractF1(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/255.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/1023.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + 
c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} +//============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. + AH2 FsrTepdDitHx2(AU2 p,AU1 f){ + AF2 x; + x.x=AF1_(p.x+f); + x.y=x.x+AF1_(8.0); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*AF2_(a)+AF2_(y*b); + return AH2(AFractF2(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); + nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); + nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); + nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); + nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + 
cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} +#endif + + +float insideBox(vec2 v) { + vec2 s = step(bLeft, v) - step(tRight, v); + return s.x * s.y; +} + +AF2 translateDest(AF2 pos) { + AF2 translatedPos = AF2(pos.x, pos.y); + translatedPos.x = dstX1 < dstX0 ? dstX1 - translatedPos.x : translatedPos.x; + translatedPos.y = dstY0 < dstY1 ? dstY1 + dstY0 - translatedPos.y - 1 : translatedPos.y; + return translatedPos; +} + +void CurrFilter(AU2 pos) +{ + if((insideBox(vec2(pos.x, pos.y))) == 0) { + imageStore(imgOutput, ASU2(pos.x, pos.y), AF4(0,0,0,1)); + return; + } + AF3 c; + FsrEasuF(c, AU2(pos.x - bLeft.x, pos.y - bLeft.y), con0, con1, con2, con3); + imageStore(imgOutput, ASU2(translateDest(pos)), AF4(c, 1)); +} + +void main() { + srcW = abs(srcX1 - srcX0); + srcH = abs(srcY1 - srcY0); + dstW = abs(dstX1 - dstX0); + dstH = abs(dstY1 - dstY0); + + AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + + setBounds(vec2(dstX0 < dstX1 ? dstX0 : dstX1, dstY0 < dstY1 ? dstY0 : dstY1), + vec2(dstX1 > dstX0 ? dstX1 : dstX0, dstY1 > dstY0 ? dstY1 : dstY0)); + + // Upscaling + FsrEasuCon(con0, con1, con2, con3, + srcW, srcH, // Viewport size (top left aligned) in the input image which is to be scaled. + srcW, srcH, // The size of the input image. + dstW, dstH); // The output resolution. 
+ + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrScaling.spv new file mode 100644 index 0000000000000000000000000000000000000000..c15b72ec6c278e354720e2518f556e6328973ab3 GIT binary patch literal 44672 zcma)_1-M<s8Ll@(aQC9Y-7R=R2o52*6DJTO0wGuk?(W6CXesVe3KS@XVr}sPEwp%n zLJPg`yU+T<oVmTvb9b{k-}n78|IDnl_CD+EGjy4DiRoIcX<9S3rf+@Lz14or(V7;e z)taf*z1?oU&5qlwJZ}8pmDgNrRUKw+b!q$QGh1uARyW$N93uw|AEjX|c_wnqPQ3aY z9Gh_H@6m<-O-p+#q#GV(Y&Vn{vGwY^+P)K)9lF~3;BNTOKs%s~yV0li)jQc|E$nl* z?Q3+h&sEqLXxrE9WS_sVFRFcK`=2ni|3#+SySJ9o-h1c}%^m+bYFr$B+}J_wdY1r? z9y+u=z9qo}hmRje1Ml8io|gFG+ZG8N<1*-pUlu%k)VOy1a^Rohr*Exh{I&xouF-dm zWe*;@#`-ys+kLN0JF&s5&>q@g7jIG<Pv2Ubw%4}9NBzHYtbsi_)&wWVTHuZx-C8}( z(SOjGzN>XClw(+TYFxM02HJNVFut_Ao*j9+w>H+^Z|uZvM*oz=92=o`tabNR@1K0e z9n>V!r`J@UZmrGr=|6JJx)axJubB3}Q|;Yb+knT99WZL#hymk=Fk`Dzgg#rNj~VS8 zPuJQ3JZ{MN-lHds8a%G;6Tfb)?Xe9UJ%0S?k=qOzI^NiBt$yYlJ*NNgVf&+Wtn1FS zeaDU6Y{0k){reXE?1Vn1@!9PspUq33T??N*fAZ;F`s^{)XV%sMKlyArdepK9PgryP z^5DX${^9Snwjb@F(WBbevD|rvdrGzK13UQY;e$H3rvvusTZ3tbj~X*!{4T?X95QC~ z*zsGA>NkAikP+h;nD!z6FZ=G=r)v$3eaGRGhIERZt~Km`*_ychv5lTEo{l>QnsfO7 zwCzrRv$Q5)Yu03^0b`uaHJlk2H%G@rX~5U>VkGT=O*^nRv;i!R`e<;+HQlAXo`Z*v zZR-c2j~FtljmL^d4{T#E_(3Daw6Q0NbLrjdc-p3>9fyqXSmCCofvc@Q_22)sfY#u( zmwpcer{BZDV>rq5IRZRz_>OISWE=0)#z(dB&TTxojdy9|qroG%pp2a&=AsiHBW~v5 zSaF{Y{W$Sv;+a}!f}62)Ozy!GnAz%V#K(Lm(~cQFv5ijw4{z7owRI}EJSR`<;8V}Z z(>wUobMlN1-kg)&Tj#>LR}LRMr1$8F#dUWM`oSEYBNxz4y$AFeGH!g~cRsd``#|^B zCE!gbj2+u|_=xdC#&TV8v#|Dyr}}hly&@hkc7W%J*U+04f4kxzRQ%J5e^v2+SNw;H z|5$O~RLgp%t$3G;cdK~!icjB(cWccKA3S{I5T<xEZ%Mp^b!e2~`_*pVOrqi1)?B=3 z({^aPmwsIut@P`PHhA3lPWieuTIt6}3;jCf<3mhaEB&TN8#idch#}mYQj4D>+FJX* z(xo-01@m$p7Y7gQxW%{mlJN1(9o}<b8Ti;C9q-92fOi}{VeFtG9kDCHe_Destu+hZ zwJY03g>92gwyv!$E8ef-yH<S9itk(TAr&89@sSlDTk%6Get5-?toU&iKcV8MRs4*K 
zpI`9{D}GtUuc-KS6~Ce4e&+mXow~MeulPL`zpoST(s~TuaZl;edJ8<5yOQU|C*VWc z_sy=Y&pO3)ZGF{=cWHg2Z?BH?=KD@IJ_~cPl=aQfiFawu3_qlOzIJKNKNWXvEzrrY zYip5;FH!N8E51%A-lerUd@yfVj%{o3pn)CrZV%=?$B!}XYh-75S?_Le-k+x0hjp@d zX&nR~(&U{09yFjM&!J%Qa#eQlqdLWPZB2obcdGrYPWCRXOXThMlP;~B!9yE-hdMVN z$MLwB8-(~-akF+Wi+ML`-^;-R2ThvPzp3LbZN<I*-Nx=`PW|7j>>o7t-i`gk%Kj02 z_yI$QB>%^i{gcL?{GV3#FB*IACjXa}{cHGvBlaJb{NGge?;3mZe_z>uXzab4{6ALq zX}EDunlyS;@=psd=cij^PyX(eeTK%K{4-YeS>VpUpV^(?*(&>76<+}Ee8zvl%D!mD zmx4P#@n5F0uTb&T;Ep%`YgG1iI`J;84dDZE_8i*;JgC8&f`>P_FZh54ZwVgS;BCMT z#Mte@2R67rctnGD2JheC-M|h+|2@H@8@vzLg4G9rJ3cRVZ4Ctfv{qeOgW(;YAG@}O zbh34A4X^lt6(3pg(G?$4@q;Qpw&LR|KB3}=RQ%A2A6D_n6`xY^V=I1q#ZRdCi4{Mo z;wM-9)QX>8@iQxacEvBK_(c`Jq~e!W{IZH)Uhyj`er3h4uK2YTzrNx(R{YkA-(K-M zDt>3h@2dFS6~CwA_g4J=ia%KKhb#VQ#h-+8-K@@Si0i0p>zNMjZLfV@y#Svyc68hJ zQWg73#ows-2NnOM;$K$$U+{6`#|{}kX#aM9Gx0(Br|Y<DYgRbd@zh#ptL$@DeEy0r z*ok*(^?-N$49jQYPPVSCr8@C0t>xhzKjU_7t<cHVwY4&Q<nU2zW)AyQ_RT8Zx8nVb zA24xh{Lsoitm6B3;$2!};T=E6cWsTUY!kG#f3EM^I=GXK&l(j!w&KTE{Dg|1RPj?P zep<!PsQ9H7zr5mCR{W}pUsLhxDt<%7e^K#!D}H~)AFTL86@R4Sk5&AMia%NLKUDmc zioaIz*DL;~ioa3uw<`Wl#Xqh1=N135;$Kz#>xzF{@$V}B?~2dB5548}JyXSJsralF zpS|L9R($S?&s*^&E53Bam#z466<?v^D^+}zimz7jjVs=(;+s~yPsRIIe2a>2Rq<^q zzDLFPs`x$?-?!oeDn6*<Ln=P3;u9)<NW~AW_@s&-UhyL<KDpvkDt>0g&#w5n6+f@y z7gYSBieFOkODld;#c!$jZ56-0;&)X1&Whh%@p~)&NW~xT#JjYfg7X-R_u}rYSHV-C zD-Q0*ZxcE`FU-k9s-LdmZmk92!$%I=zWoTh`3$iX+MvdN9dYqoe%%i4<E-|3e$Ng* z^*w&Q4$e=l9rpF%KlRtOwL!%<toX(i?^W?lE5235`@?zm$2(|8|GRhaEy4%Dc}6$Y zzJCXA>^w_{51V)>W1GJH`@&%(_T6SQ&+4`ubui6%9yD&*C%MMq`_tIEw|}>Y|E{CQ z9=Q3~(G$k#>1VVDJNb2OJyh{WD*kB2AFKG|6@Q}QPgeY?ia%5FXDj}E#b2)Y8x{Xc z#Xm0i7R-RpX?-ox(X^r|tMN87RpZC{w&pX^`0+8bt<79$#uz($p>fK$wYdw8Qrg=5 zg*IKIEmUasXTHS>&1bS|OLA<^&#aEa`ZlLxpU*B1_7&a(9J|k9wU=Bwa@*hQPP=0> zUasAj+}~Bt1YWS={@z;tvxa}#@CD#=(Tn}`Ae;Hb^MY_`E!IA|@|%WM`%vpae#a|T zpAYWb$~*FO<u_EVwif#wjotjl&q=Em`@D_ab<sW_t(x|E;ErQ44quDGoqKKS`Y+sI z^&V8>TxgT$Fm6S-arUj>N)1-e;duJMjn8;C1FM<KJX_M5$8p>LRt;9~!8yAft+DbB 
zt!ryXuw#<j$KJH|;h4?253M=XdoY9h(walPJoNcrM-NtbL=)p!^hy7=iP1JUnGd10 z--(S)-h<!c9ZH+plS7~KNUpLq*NIIo=c)(aZaAsQWuMy0afqF}vuQndtWkaut^Moo z+TTOF7{@dmD>VF3urIl>FSps`jl<t)^A5Xn{65D6M}B^!cKIh9<BwQ#-|#Qt$*=u; z4$sxtXVJ)O5mwym*gpS%4(DVJ{2h;R>o?r%U(NiEZ!Wmkl>T!!HuJ~+Gqf2Rdk?U- znK{fS_jf#M;r@<CZoZ|#u9N590%TdXvC(vTg)hj+mxJhM{PK-K&HQrjPrg!LPq<R} zrjU%!I(F&sZ?BK@JE*cB+qO@B#CZMB>+s(s{O%6>RQ@Ua`RAV7DE2OP)_QvC`cL2H zvp=#NWAVHm#$qnt;Qhggd(lO1wbl*)nHOWLHMNc<;qOj5_u&<}rk(GJaL+C4jdl*2 z=U=JaiY8Z&-!n~YslDH5=8WHeo7htO8E;O;m9cA~IZvh5uhGib1Dn`VJHFA3Prp~9 z>7(X+cwWAN#=oijZ3yK}<)6b{ulRoj_gwQln-7CF(sg>dH0JQ!nqLQA`g(53eJ^Wk z@E&06FlI$Edk!ywW}P`4zqyt}H`GJEcEi1&np^JYQSIS=9xZv#iu-vqcJurBu;jZp zJmcR3?kQz{{r$Yizp32Mi*oP(V*p+Y`djZo@M*y6#>;&lSJOVe!N$by`?_}fS3eA% z^^v~@_r4PQC-6<+D^i91_&Gq$`=7_!tajE1+OF+iD4qlM={?!cxUR8$AzDAjhTEP{ z@$)KvO~vmhxbFde-c3HapLfH3&+zkZ$set_pLa{UpLfG;{k&UpKkt^@&$}h}^KQxg zyjyZV?}j@+e%>v)pLa{{=iQR~c~{<xS@zz)Bi#1~@2MLy$I#YJ{GWP!c5Zwg#TK7k z(A1A14^3-V{?9n|nNies<NrKgJV&%|#0q;(t0(uKVEgx6PVT+H>hamT@$o(%pMB8O z)BnC;<J8mte&F=4-Tu9X)RTK4*#5ouB=;aR_4o{Ke7qmUX9${l`X355PCfk(1E+uO z_U|>Sp4`L1_V2wcxeq{7kI#XPkN3a$j6hRQ|0BW1si*%@;PkKE{=Fxto7-!A0*BYV zpE=C!b$oE6>+kg@?}>jOY=?k1r&UjmL&4q~${ds6>c1w=oQH$eyk{it2(WS93)1(I zaP`C;1y(bzUsJ<mu=%u2ZnUGp)~3z6y>F;x4yS;f=W-2?K~s;<v5il8E*yuZelPv_ z-hMpTIQ4Jo=OkKR-!pgGPUJY1!~2*xxlaby{hxxSZoS^e<lhlzPUFp!_|w4kd3rjU zdgA3}ym@lIoB_6v<U12=zVe(p3r#&fXE#3Oxp@wndgkL?uyN|n#|5;R4{hgjT+HEk z#L0ajxbFWVG<C=0eO&H%%xS!N5`PKU__DU2qp2reUdEdz^KmKIK0K%VtaTaKIZ{s_ zmxJp$zXDA?K36tA<#}}#n)?0CIde7GICbafI@-*Uwre?V<Zz7Q<h~wU_kRPLx?{YC zR_++hX}oz7{|m73Wo<X1sV81u#+xT|bTimKlJ6F9J?FQgsmJHG#-}{5ZbwtkeEbq@ zoVxRICvE0K+pjq8;cz_S<h~1B_kTB<y5qTnR_=JrX}oz7e=pehvbOus)Dtf+<IR)# zxF2jEIcFXKJ4foqKS=9&Bz}lid(T#Ps%W(y2Jb+tkGACeHMl>AF;CFSWBUzw_r~@l ztvt4;!HIhcERXHC;KV%xmdExScmRj}JxeQ(?RVh)8{6}=@}B&qg7v=u_WRP0&_2rX z7{}ur)@*IoU|;4m)|`G8{yo?j`&)-r9@~pxb7@<bRvz0+VEfS4lU5$vAHeplZ9Q7K zHs7CK0grBMFVo6>?{W;Uf-j*}_nh$a<m+Jl%lEE7!qt6m@xAL$V71pc68C3t;;cii 
zU+Q?Xv1@w+EKeP8fvrP5`Q8SbuY50j2d<tv-UX|bb-V{poPEjlOC9ewc5QzF%Tvb( zVCzs%9Up=-E^|2kzoMy6Y3Ac^V70Q2kKl>3FS&lH<L`}K+s9ye>i7rPI@EvQ6U(|B z{%ink{~ZfW`{#;vi1o9p&nCRKea2{h{Vr|vAJN6h;k|t}4qrYeOpaN>=J1(pa$Esc z55KbEx!$jWU%{bnKl1eBGrqYwe9g&`e&zz(&pfordo5T!{JM%?-|&8n%)B>n7^9yy z`&LWe9nV@2c+JO=zI~2t-#&*;UpIl(!*8zmEfv4D;r*C-`?-z79LDLZ-FnnhkLS#y z9KIIjupY0qML4WyF<SfA?+y-ibBN0vOTm5h;7E?e3%o>umjs)8Y1-tz3#@<m-C$!5 zr*D7Gzn4QT_WQu*iT!@C+Ow>KwLZXMPUEy$vszj6@^D{e&C9@3|FQ*M4s6XU(58lm z!TN_k0yZZ6F|cu|@o}))Gpvs_KEYuQ;~wSEZk_6>({pMy4qq#ASlf!UD;Ic`0`t|+ zcAcx!T8Ht!;m|+)DX=l&PlJt{nY#3UhGQBI$D*(H*nSI+&Aj?rvwB(c+HhZG&HnN( z^{-jrwZP8pI<%?jd9eQBzXKZ+{sP#z)b)F?b$K4@t39?C!LgZFUu#xR&E3I0Iebme zVV%>_uFGL<>(TmLL+up~TknrzpEG%XG{>tPGjga~tJ-TE<~gzPc^#~ux^>9QIyQj& zD(hGuZXFxarj9qk))D@8!_)7d+qRB6^?idQb-o3*PIbp&Y&nj1;M&z^rV_{TE{F5z zHDGRYyu;xbw3TDn81Aba!$xq&unBFR5q$v8T#Dc4utu+s4>=a)P|ukD3O0||ls<ps zP|N$>M_}XB^|9`cIn=FFT-Mne?yIb`7u-5GrA?imf~_<BtA;yw+CJe(-JgN2TRnAu z4mNM<{sOFKyt%&QNc=y+#;ZTx%*EGWW3*+g-+=8)TgLh=SlzLTGuA%f%{hE+#*uqm zU$A4{f;MCOH@F<@4;BBh;+_<x|FjkN?{P}Ie~(jg{~o90{ymP|xpR&G!(pAym9@#m z*5$m3t?7G?o<th+Uyh|X^w&>&@=Zhj<g<3UIjq|pVrwq*b%mQxfBm%U=UB8Q_jKUo zZjY<+b4(p;+)>{Q@a0KoynfpA%w$He_3Pt0%CnAsM%sqM*H#>^<Ce5rbGVM%()#(( z`J0tE^?Y9L4xSB7T_2w*&cor)S^l0W=jyzTejGl=%m+74J-O!xo7*uZZUHp)<X#YL zZhuD=pM}8c$-OYxIQ4vXTLf&M+8n<;<DZuPcHr=}9Y@B$eS!Odos0go_UoK2K@NRf zv&F&k*p>ntqispBy!>oCFSeaIeC^1QK6e7!&n~oHJ4+L{3`fpo*JY)K+n2v5Tam-x zk)`jI!RE1VedXQ<ta%l%YpuQ?D`6e0g3Y1rQMhsTscuevRs*X!cjC;Qw%s^<mE+hI z?l^X*%{csfu5uh}H$3B53to<69k_X{M_+lyu`bwgsAnAOfz6@K=h?>Dr+UV*K3L6h zh|6*8Nj_iYIQD=$j=g9zj*W;b$FWJnGmeen<v4o5&0{_K$}^7MV8@}Jar6P3L))et z#@VNO#<7_?hvN`u9G=SuaQNDn!}Dq%+WiVVpuhtQJgC8rZE%5y6nJQXhZT7L0uKi} zX9v<cXXe|I8Z&3M{VKj|#rLeZf0I|n52^U@ijS=L*oq%gasMW-%ztFXkE{3z6+f-w zXH@+BieFgq%PM|F#jmUQ4Hdt&;<s1)o{HbsaL*_2$6LT%^YE?UuBqpqYbW=d)pzSQ z?pV{@bGJb=hqink*%qwsy?Y_zwgc}+I}5FO<@%)0?ZKXF;XA-Rm-O{~k*9|KZQOAl zrG_2R%%Lqc>;zU%4LgIaA@A{WeeA<?a2JmBu^V_seAQFa?qKIBeeMC*Pdz?+firiW 
z$MWRdyN&t%7Hx9u12&H~>+oDJ>(~!&9qRdg&j7G-+A`LGVD*f3uy*1G(VADTkJ#&{ z>|-e0KGZXgVPMxIb?y(>Pdz>dfU`zkgYt~^z&6HCn;av+=Fw&yBWcSzM!~H^J!2gW zHcnf{ItHwsu^t58kGOo+lIxQ?$AP_f*{AmcdFmYB#{3?PHu)!j)sz2V@Brd-?#lH^ z{)ymWO@8k!^5j3Xjrsi)ZSqe7t0(_q-~q(vyp-!>z22K%#+H0Xz?~!Y^nVmM<MZAn zkN@O0=J#l{S%;&+=Fyg#rhwH`(=p%y^fQRoI^_C@y&oRSkv@(GTa$WfIuV>X@qQ_f z|4D7k@1$r`)5&1-XiH6}fYnpeso(+hlXaHsV@;>gmNlIYw<h(}bS60SbOu<Sn$But zeh);On$8BBM_Xz-2dti&&IS8EnsY<0k2Rf7dmcynxBzTT>Z$1>aOUYkusr@3w=usL zqfJehfX$;VHT@i{o|-NNTT{*_xjxo(8Ett!T@JS<_0)7FIP-J`Se}}$YU5T*JvCho zHjlQ{bPZTNHC+p~ra`pUA=gKI9c?*J*TbzzJvH43&OF@!mZzp)v@y?0X;afpVDo58 zO*ezpQ`0SAYs&c~*T<S}rF{ci`nU~lP3o!Xm*C9P?O=KQ?`UIw&qbS>eg!s<w$yYd zSUok}1s+5{IiKYESkpbUcXOnVd%@PEo|^6lXP)i@%j5q*8}nR~HZ?s6HjlQ{^blA* zH9ZWrrkqc5ePVy4v1cCT)*t($jXi5G*IxciG%fijaQGU*;b+$sX-77AI@(bU_UE9{ z1s+r2gBq;=*aDAhusO$b_!;$JT0f(j?=k8zCZAc~s`w`r|E%I)Rs5TZe_wI;DJc7M zpMsLRPl4Pyw0)d-$Lzdit)Bp^$LH5z=c4p^5=}kp{Tr}x>WO;_Y;M;#xt~T;kIyq; zbC*89MN?1iXTip)AKiRjd=6}%+LG&eu=8!5p4W2yV*eeu9_tHe>hbwKxUT0#H1(sJ z&$Tas)l%Odz~)g;o|nP(*k3_YkI$>%dR|{cQ%~;K!N#c{-HiQ@VEfdTTz>*P_FPMH z{harincqKyJ%_{Jfa`BR+TR4L>F3;OQ%nB0!I|5v<vVEV@p<=u_`HXvp7HzzY@B-H z-UpkzJO@62tH<ZV|KamjH1*{E8`wDYqno+<2yCC)9GBOhTE_4(SS|eTV6SPP$#<vy z2b_QHXKMOtPabVzuh~yIazFeW?EO<cdA|VLPip-Vte?6$yv~p1NREGk&7q#Ld<`~E zn=zlzs-=%_!DERD{};UM^E<dc>c)HTDEs_3+&<OQ=YPP)X*0(Ak6QZp0c;)NKQ=t$ z{4ZQhztpT2e{PrUoZ-_nJn_@Q)$~jMYGUtoT{tpl-N4qbo_yWG)|Y(K!Sz!&hxbRn zYn>c3g3Y0xIhYA-oVJWXEq%-aF6UrYxbu`gXM^jbZoKzkHRI=?ui3%AcO>5&aD7ha z9M(Q3SReJo%mwy6%o_EX8?L6WeP~ll&UwJ*44)V5`(DO4A6!j8<F%>P@1@71`#O{( z&le{(*!Ppe3jOc~PozDfz(*GNr~*%Juz8Ly@RR}{Q{ZC@d>q*K-V<o^Uh6(Q<_dQ& zo|3y4Psx|4xO?%GcK6~bxqI=H-2He;?w&j)-?`%M$y3_hlc(edReVCl-J7TMcW<7O zPpSA>4R?K=i}G{m&)}Z->e`(<HP4qr&=&-|hTmda80^dMC#o;Rq2~OHjdyO#p9dC& zn_s;b=d5)u2KF48rMMsTKvU1p>5GGnRX49|ROVe0ZeI1|T?%a8^8I9KH1*_N25hW) zuG3|~_NOg#pyqeCGCteP$#P)NW#?oCurKFCeR&Qw=R};^Rs`2$UI|UzK0W8k@vQ=P zeCo-uD!3lsYG~>i-|Apv)ib^|!1kpr<5P2dsns^)TNCW{>G;+L`*M8hYjLPKKC$_{ 
zKFZqHg<HFNYVQfI$G;w$dTL)EY^-|XHUQUS-VjYa<J$;qta`?$=J-;pZN|4Tcrx`l zzFuHoj!%6P4mHQu#?3j~8?CN?Q#AF|+Nbd;&%e#k)KhC;uyN|CRn1z{udTKEIb;j4 z_q*^d;ob`~XIsJbQ8%CWCbjr)1J*x$TlkbBeml56>c($Ps}}!$U~}es;5&f1Y^jdt zrG;OAu(9#mvGCKD&s94$`m)5EOF#Sdo~bSU?gCcR=KWKi_v&528xyadShd9N4ld_x z5BL=PGiQ6k^-(w8`>|T$_X4Yh?+u=xBWt)1SReJQ{k~w|56gSnerW1Pa}K$d1Hfv= zCvG4(an>T&FMSUJTTAA3FgWwCEq+75&R1$23f5QMyu)bKGKc$vH=ri<%%NKH4F{Vq z{D6ih|ABBd{SvPh{}JFaek8n{!%=X3)Dtrr?B|Hg*BH2(zV@L_E&gM{zW;}hgO}IF zc(^|5#veqh7XO36<+X7L+|M65PbR|kQ8#`9ty<zIfz6q<ISib&(H6hM!N$h#h{8`> zu8kub-D|^K`q}THwAzwyGFVO9QMB@`>(OAZ4fVvTCH5F_IcLYh%P}7Z*GJv>DYR;d zKOU?Wege2$*AwCTsOKC%3G6vu-WN|sQ_otT0#-9Vai@Y4XDxF5()Vd#YstKx4$k~* zi{BYw=PT##nP7d@&3hKDTITR<u-Arq=1?v9&H<Y*{M?2o|9NmV{SvPh|MS6R`~~oG z4lji3qn?<Hz~!}ZF<ebw`_QIVe{Me$-PcJR>)_|}s*@XhBJC*+_V>D{7WlLRpI+cI zz`i%0MVn_`m(XuMfBNj}=8E4@@y9FvY{g$L_|h!sn*}%jTLsttZo##`S8(kg6kPj< z1=s#j!L@%}aP6NIT>Ga5*ZxJpwSQS~?OzvM`!@yG{$0Vfe_wFzKNMX1j|JC0O_$E| z?;em@5AEFwZhZHGYoDRu+T8=PjCT*na9j6)EV+9?hMUhlAj56l1G41q0U56UQUzCc z56IYU-2*aQfA@e4w{;K5lDh|F$=w68<eOA{(~9@4_?8uSU&u0_dqbAIf5qJ+vb4KT zWXattvgGa;8SZ)vD7d=&M#gSCxZ*<!Zv60q>+jx?iP!Glk>UQ#?%t6lckjrOyLV*C z#}-`Oy(43{b??ZMyLV*C-8-`6?j0F!eUl5W?%t85{n&zA|M3-f@5uOTckjrOpHy&l z_l}I+*1aQ3?%t6lckjrOpIvb4ckjs3?%t8%+TA-c-1^)*vgGa^S#tM|Ecq1$SHH62 zS6BSnio17Y^4Z>4aO-pL$kOiKk>T3!D7f{zcVz5x_l_*Ndq<Ysy(7bI-8-`6?j2cj z_l_*Ndq<Z1(SlpwlLgoBnSxuNdq<Y>FIC+ABTM@qEAD=hrTx8%e^7Dvmn{9ithoD2 zmiC$WVJqD6xW8n$t@}%s-2Ej>?*5V`cYn!}_o(>d6?c!x(%(HMOTI$E{e0m*ld;SD zRNQ?gOMBmnyU%24A6jwunJn$@Gg)%?nGCmeugQ|T*JR1vZ!+9^rxaZM*n&I%$5-6_ zCKIpS{U*b0PpP>3O~!72?l)O-_nQp2b-&4Q^Iuu<t15m?!Hv7F;_f$@eD?2tlO=b* z$#7fun+&&q_nR!a`%RYI{U*b0AFH_gO~!8j?l)O-_nQp2b-&4Q`***|lDpqz$=z=< z+}8ajOYVM?;r9Pd#occ*c3bzG47Y#xn=HBeO_tpKCc|yrZ?fd>HyLjK?l)O-_nQp2 zb-&4Q`***|lDpqz$!9OPy8BJWZtH%N;r8!-lO<oW;Og!-8N2<v-(<<%Z?fd>HyLj0 zev>74zsYd>cfZM!Z(MNoUIn**_nVBryidj5Z!&gU_nR!a`%Q-1zxz#=-2EoQZTBj; z{kh*{?DBmpKA_<Gx!+{`ZQXA&-2UBfvgGbJ8E)%-li~K~ev>74zsZsxUT}5yn~dFd 
za>4E2{U&3VpILBq_nVB}{@rh~<nA|Fa`&4Iw{^eClDpqzxc#}`WXaucGThevCd2LD z{U%F(d&S*vGIm?{n=HBeO@`aQ`%Q-1K2mY_n^gB_@jkRa=jShfZXN@lnf6k+n)bcn zmw`{E-JU4@FNdpn_~J(x{zjnVZ-}ly)9x$Um5ruf{^sH;G<~)Cy&C(y8m#Vf2l+K% zYf(=v*MiGhu7j(2@UO*3jj8=!kD%RGYPq4&%35wj(^p$+`2|?r=Q-AL6WDsxeLmqc z0yXQ>ehXO5XE*X&!S*qIV>8C*2<nNs4XhS^JJ=jPuQSIl!D{-YPqp~}3Y>V?=}x%M zIkHZ7!PPu`mFsjjns#5An|m6qoSS>m^wpNRxeu(Kb-EvHE$Y^0Ef0XpS{{U}dGN2( zIz5D--B)UPxY5d59zoMrTWWa}te$mx3~W8>&Y90@)G}vJfYnmxufaYK%A7q3SM%^y z&e?C!wEIe(Pc>Ru=hJBVYD=BZfYnpyZ^720Zk;|uQcIoBfy+9dhnIE!4zA|GzfSY{ z0)lp5sq^=ZR@V6<n!egn=SyJq)cFUnwWwRC&$iT3=PTf{&R5|+Q_Fn523Pa&RnF(@ zXxe?H&ObI<S?8b7^wpL+{|r`7oo|4xMcq1m=BAc9-v(Q!wzp{IJ|FdOblw5`_eSd4 z-=tMb>|el%^;w)evG0R-BUatmcWKoU`&V#cKLpDY`#12O#Ht(n0j*kM{|@%N(DpH{ zJhA@(TeG_Mk7(5r`x!W~pMvFy{Tyt~>c)OTtCrZWz={15EKlq|!Pcy9>=(3ZiTxJr zIil?wT6tps1-52&?O)TXCHCLo#C{K!C-y&JYgRY*J6g5G{ui9sAHnj(;@+-V-Pj*! z)e_qU?D?Y2XN~g2b_H9rx^|yUswH+haAMofA>oOg9&F9(#&)AsOYBVG#QKa<p4gee z&a=9)GtjCT>p43s*z-l3&o6i3f=TS`@CEQu_a3eP9AIO#CC8j#&vkS7%(KifH{2ZR z$uSSu7;VWhFW7V5>(GAYgX^arZGN!*YqKw(zp5q2&%mBX+I+TJ*0B)WI@D9g!eC>x zCC4IQuK{!TJhsfS7~CA{$<YIBjJD)h9PG7_x|e|Krygxdu>EVZFP{yoCCAcW&o6C0 z_buyK7H%Etsbe{?G1`)2d9c@pIeb=J=2#JK4)x?%32cnE<X9Q(HIlklf$OIpZB?-S zYqPJVXw{Nqb+G51HlInCb*u@u4)xTr7T6eV$+0%rYsMVw(3UyYg_}b?IeLPP(Uu(R zfxUK8_xfP{)T3<xwtsE*<ui4)UYwVH*4+r~c~|~Edt<n|{ytNe$G;a?|MGWyy|r`b zzX@%dH~IU(Jr~OSo59uf-;`FK{F{UIFY|8!SJ%HUtvvo)f%PB6b?&_Q`}eKk>iTa< zE06!SVEw<xFaF!X)%D+oRv!O;VAsE_e+Rg_{@c^a<G&+V|FZs_;OhGKr<KQl7qI?i z{ky`|_1~FR9{=6J`j_?Z0aw?5H(Gi8_X4}-W&XY4>iX|VE06!aVExPd`@z-q--lK% z9!UEJ^XUE6&*g*Qv(c(+A3&=n9zq*m*Jvo*?+&SJA55#}nwaYVu<_c4)5^16Bf$D= zJCIhMxKZFsntQ}Zuw1{<w8I)ahIao3A5`G64NgDf;hz7-kE3n#<{Ud1?zy6E0<Ao8 z6TzN$+76+W$2JM<8fiO}Rvz2oV9#%DhtbM=vfB{DX&nJRp4Pc=%#O`5ITpuY?bc{5 z_GxZ&nA`b05^SuwomYAKJqm12ZO*GaaYuu*4wJ$1#2o|9I!pn}6L%aq>u@YsZvNwG zGxsNevnD4N`biB=Kc~Pq;V}MWT6t`zfiu^qg5|ND0nS{X4wlDu7T9acoM+O?V><`D zePcVDR-Sb@7wkGrrghAY%`rI^$6)Q&Xf5_>ZgZI1bvO@fto=H#^7MN?*qqv&S9#(t 
z1ZN#C0Lv41F*xgR5m=tMpM$dwmw@HwzmzuXa9M#b2WL&LX!P`R6+Cl&C0HKYHQ-G- z%zZVjJhtn=Ij^q;%VWC%oVmUpERXFMVDC5PypdL(b+`%aIye`O*|9k$$Kn{Q-5RaM zKFw_obGr^VgN-$}^D0ljw}8#5&3Tn4?ly4N;a0FbalZs-9c~B96Zb1{*5MAY-28Xa zW*zP-@ZANz2ke@ppZmZ$ukQuRV|xIcxxODPkL@9F=K4XfJhn%`{W+}ZVOn`?kAe4W zY>(2)vks4gT?gmFF*`QL<X9YowOgaL*r&P8VQ$yq39zwQ2YLGaHQ1b42YKRt19lz4 zpK7?Dm(2S#TpxAgpQKgG``mBAYUTU-vvBqJJlFV?@8!><sUJ;0-Rb9dVB^&7=NVeH zjN$iSHEl1^zR2PGrLlhi`yQyCy-{BV8>7vb7iiUz?^ST}y#kge-)rFB<Wo;AYR38< z@7KZR^ZT%WruF6bVb%Y{@fL^Q`4Joc25tF$-#6jbtFHZzv}%5*)qKX9BelK_PObXN zOJ8H+`wm#2)Gbfl?}AhJ`(R(zt^OB|zjD~O*!T}<%ep^=TerIQ_h{9u#eBw^BkTS* zu;*5JZhQn+PrSU0H&4d$G1xwm@9*H`GhVJw{Qm*YJbeO|=QGu(U_Vo->+iY!8Q6R| z*FOiV>1%#%YRUfvcyMF;k~TkUcn*FAcW%_pr}j^<G1|;$tXlGY4OR>Prs0YI7OtjW z;?*)A-+?n9$?-j!dVK!<KYad!rk;L&02`;CxF5mhF0a}D!qwx$H2z;c)1av*_q1T+ z)Q@iFr3=_TwIx?qa6Oi8XzKaCVt265?Pyxs5<4BZu624e^~BBqPOP?khjhk9_dBHK z*U!23I|SNtPna3(_e9k*wpqZAMVm4IVoYjczhh8dJF~+*Kh(2UbAXM}-|rxl{&O|{ z>iK@{++bt0CC5Bq&!N;bFI+$M%=dg?``2b)e!rpYYXP{mscZN94*vHsGWLZUyL$Rv z7+m(d2wXq)^t&k7+}g4i)M8-k(Pm$MH$zRlIBj`NEdjSKb?r;hF3FL)mTv6oscRW< zS=X{~{nS&}a$s|7Gj9)Cwd7b4tftNHYRJ>)N?^|eb?tt)LoMgxDqx=zl|Lu0il!c) z)xbUnDt%T*Q;*LYV4qo)K5L?>XK&KAz~(kqTVmG+`+TST*>fE<_4up{_W4ce(-TcS zKI?&f)>QhekEWg)HUOL3SZ#^j5bQjZ_m7Rx)Z?=;*m)>@HbGO5PcN|ZQ2O-N&XF26 z1)JMgZLUinTIWZ+8Lf4S`_k&~zH<IO;O5}9Y4z7nd-~o2>^zq5Ut6N7$7d_B^H}<9 zjiw%-ZNScB>9Z}Gdd9FF*xbfy^ZsOg+k?GtE!EVjpW|4WR$Kbm0bHMV{n6Cpvm>}Z z?{-2{kI&BF`n=l(O+B^l3O2W~+7i1PxIXW8M^lf_9^m@C+Y?PaK6`=d^KNf6_0+Hr z*xbfyOYFYj`n=l@O+7vX!1Z}I5KTQkgTVE9HyBMlH4Fip+gNR`%TQY9M?8$yI>r0b z>Ywv&ICwSIRDb=nr|$#6^?7$7ntFUjfa~*aB$|4BMuF?|ZZw*D#xMqKZez9OygLZ& zdFT70wd&_M`q63=kE6}AD9_dL@VRN#wU4D$6CX?)Uq2fi0{7mou6+Wnn)hHon;i=F z=e+!`dJ=phhkC|$82HWVx;h+9Jw8W({ry3C?H!4x9-pJY{;s3+nT)2Mc{&<wZez7s zr(-?_>^L366k2)SM~(wKE^Wus%EiaiI-ZR6M6jP<w4DH!$96J!Rt{|^(aMwmRB(^R zb_%ULw$s7OH@4Gg<>okp*0J<tmKk#EOz;F+Yqv&gvj+Pzr?JME*YTbO*4JE)Tb@48 z2Ae~h<Cbf4J<kP~>v<mh91iuY!}(y>v%J?|fTkXw3&E~u>2nd9dVDShyPl=bC1~nd 
z&!2<MZLGG8?=o=K^HQ)p<GTW!^}HM`cdS>^I-ZR6YH-%`DzH4ZYr$F1Yryj4zaE_R zybdgn?M86c^9HcozJEdMShAisfn85)w?=ET2KzFnvBsF!^}HFZZ`M<uK5qe=BkL*G z=6c=+F4yyR_^ll3S%+VOy%(3))g5T+@%a_ldvWP=Cz^VE?gD!+E`9DsQ_p(d12(s@ z+A_ZTz**0G!Samn0dUsyez4rJK1l0$GS-K|S<i>S^4J~)XFVSQ%ai|caMtrNuspV3 zgR`DbfaUi6B&}n~dj1CNdRn_RTAMZ4mpP3!#=NfQQ(%3wp7QkhG}s(jPq{YN^S9t~ zJ)ea?!=avacn(~@w?2=i9-rTV>-W|d(A4AedvN{U`XZWo*7GH>xsBD9@x2Vrdj0_{ z&-h*iXFXp5%N^@$w2miZ{UbQ*`8rr0+n>Q%&p(0X$^Rxe>-h#)9^2dCtmj)`xqZJw z>sYd$?}A-VYqv&gvj+Pzr?JME*Y$i4tZ&v+o<9EqHb>S|uFduQ09>x;hw%3~)UytM z1=pV$|AwX>pO3)xXU30>D}4SAu0J#W15G{a`3czE#%jy>J_BbxKLyJ(zAwO8&(Fbf z$ND9$<H=b63C?<c1(wJ54LIxhHCUef{{m+{zXi)<`yQP2{0=O)?|;)emaON0z^<pY zTcfpEgMFFPSYyoVdj0^`H|r@+pFe`lk@b{ob3J)<(rG=Xf&Uk~de&-Mu<Kd=yw(Lx zJw9E*u4n1f4NW~h-NCMB=`$Ugde(D#u(^%ZmhsI9&U*U2Y<b2vGdSz%ceLe>)$eCJ zo{V)iaMsiBYRhAr1Dy5rd)xBlp9`Gz^gG=0*yaIeJ^enn+`j!Tw`0kA&Ifiqt=$@} z%^K{>oW>esUe|Meu)bMOdHP%cY>up_T$}5;Ah^6f7lQlW{#4I8EDWxH4p;<DJwA(q z>z@M_LsO4W4{-f+z~X4?S<fZF<~CMa#<vtW>$xOYp7AXM&U*Ttce!Kr`|ge>V_hDc z_4K>%^4L}cXFXQ{%aea)aMsiB#LHt_6`b|-`|)!7_Pg<pCF{95*!8q_YqU0NurG5O zYm9ka&o#jMW<BNUb4{>0vYv8nuIJj|ay{39uf?IBbyyeddX}F{d!nhwXFag%S^BJx zrXHUSz^-TMvmu&#)^j7UxsBD9@ofUmdTtDsXMDZES<hZzxntdw*70Pln}M^QeZcbA zHV0=t`-0`kza=>9xdm7r+t%Q$=T>03eQ#sSk@egb?0Q<eHCmfB*q1qtHO9QI=XPLy zv!3$wxjon%Sx>n(*K-GOxt{&u{W#RK4m*Nf&+-}FPH5`!*%|D5mOi_nsmEtmu<Kd+ z?1rYE_1qn7Zez7&e0zeko_m1h8Q<REtmj@}xntdj*70Pl`+>8b`-0`M4FqRB2Y}_t zKNy_#90ZofHWZxo90Hcx_b^(=lJ(pl?0Q<eHCmfB*q1qtHO9QI=WwvTSx<TTJOFHt ztfyR?>p22kuIEVjfgI{thf!eHvwYthjiw%-F<{rT^f?GkJw9W>u4m~p4oyAlIUa0o zW3^>`2ZOVo6TtF}Zz4GBc?ekUSP!LjJQ?d@;H>8)uspUSz**13!Sduk3Y_&k5-gAH zXmHkZGFWckQ)nGa*7F#!>uK%QXl>SDU*<H{81uTG$Ab0Eddk!1abR;~J>}Y5&lAAq zdY%YBo<lwBa1wYLo~xAC)yZh;@i_(Dt@1e)O+7xRfoG_EPDfMEdY%C`x3StXzO%qt z&ojaDjPD$9*7Izz+_9cZ>v%HO^TAoq^T6`hE(B*iF96Gv|6*{~^CGZ3wx5Huo|k~- z_I)X>W664626jEI-5RaU8tluQ#u{T@*Yk3)zFALs`n&>cj;yC#o9lTMxLnVx;a75~ zXC1BqyPoBBbuF5De69n#o~6(AXzKB~0qlB~J~yJNXFY!bHn*|bGQOL^S<jom@{I3R 
zaMtq{u-viUM(cPo)?b3Np0|VLvHc31^}GWtPyV~WS<gGc^4RVHXFcx*%kBGKTE~+0 zybtVpTDvt`n>E;%IgK^Oysqc{V12Wm^7Q!t*c@3;dEy=f_iKKBG0#Krl?(nb+<8o{ zN8y>fN5Jy@&h4=_?)W=1`+6MSp>^!(_ypYN&g%Xi)c$@AwhwK2ru`&1&$O*apV)rW z*p?#J+V!=sli}LRePVomeJ+QuvpL)+#=o;Whr{ncoJX6zUY;U0e+%b%{xp1Xig4WW zXW+(Jhy1s2Yh0B>U+wXI7QAE;`y9NCeIA}zeYNNL`R~By*2i&Kms+lg7r<)eGq&Hu z)#LMG<5Pa_d<jiGzkmJ%*f@3n`zZ4K_fZ_*1suM%;>f(64|d)zq|Lm&N}ThSJt_ZE z@oyWR+Fpa3r>y^VH1+uWvGFPE{}Y;een0(ZuyN|nlXa+>V{Rt*P4JwwZ_wu4eG9A} zpLZIcw+o+l!Rq;b(tBXz)cqdO2eiKYK908cIsVS!yoi(gLvY>yU(wVv&cA`JyPS)U z;OhA<!N*|Z)E%eutLArcti^uqKRN#au5*5Zrk;3t8E>A<(WhYhNWRa&$!EM=pN#Vh zu;cukHaWfotH<Y`jn7ww&(~n}%=0&3<J6t!?`Sj6+Wy7Sg$vs8ij(_$u>F_0|Ba>| zpZ_#IWsN_esUOYx;(g*ruyN|fc~1Wq?AXimP-pe5&oppj)uT-dR`<Ib&Y9PMnse^B z9D`#?KV89fKi$yO6E830&6D}=j?F%jZ#sDL886pIf3NN78@t!NIM+QNh1>s)S(?^G z9A5YCrFAifHUFG8*ZrKt<+@)HeW{ACUvP75P;mF;+OXmqSG-rnH!ZmL@U0r|m=}W2 z1h>vyOACT$MpO6inQ2<H@PBHFnH6kI*4LQX(9{z%J6J9EojJhfQFm^g2erHh&IR^8 zNBvG}^LvPMgVp`Ls^|8+U|-%tw9Uhz<{XP%r}=1q#<2iL>Rk|Ak8vS1b^jjn9&#=W zR!htx;ChUUqNyilF|e9r>_NLYhk4Wok=HuZmf-OH)jF*=eJl;G>skg)J@dFMSk0K_ zXqV?$oa;901#PX+=(&zogd3+mk~&<UmB8xhdu4DvzE#lF)Ay=iwe-Cj*uJyw(AMhw zpL+UU18kgn`d$;P?mYD&W-YMwsCzDWj;pzT{_XQd;B^|C=es<%O~C&5yp8kRm&eu{ z?0?T&o7aInwm#tX8k^UIyeGe*_uAbI?7nzjd)8u~<~E1E=I}FDU$8#r_qvs9^PJn9 z!*fpV7`FgBzs{xijxFJ8#(E9PJ@>V34fb<``Y8N8m$u<hPv6^u?bE;WrD<))|Ec+k kwtb_qL{qhXjpjJ>-&x)P&3OH_d%yL3bqw~aJ)Z~v4>v$_kpKVy literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl new file mode 100644 index 000000000..785bc0c83 --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.glsl @@ -0,0 +1,3904 @@ +// Sharpening +#version 430 core +layout (local_size_x = 64) in; +layout( rgba8, binding = 0, set = 3) uniform image2D imgOutput; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; +layout( binding = 3 ) uniform outvResolution +{ + vec2 outvResolution_data; +}; +layout( 
binding = 1, set = 2) uniform sampler2D source; +layout( binding = 4 ) uniform sharpening +{ + float sharpening_data; +}; + +#define A_GPU 1 +#define A_GLSL 1 +//============================================================================================================================== +// +// [A] SHADER PORTABILITY 1.20210629 +// +//============================================================================================================================== +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// MIT LICENSE +// =========== +// Copyright (c) 2014 Michal Drobot (for concepts used in "FLOAT APPROXIMATIONS"). 
+// ----------- +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// ----------- +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the +// Software. +// ----------- +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// Common central point for high-level shading language and C portability for various shader headers. +//------------------------------------------------------------------------------------------------------------------------------ +// DEFINES +// ======= +// A_CPU ..... Include the CPU related code. +// A_GPU ..... Include the GPU related code. +// A_GLSL .... Using GLSL. +// A_HLSL .... Using HLSL. +// A_HLSL_6_2 Using HLSL 6.2 with new 'uint16_t' and related types (requires '-enable-16bit-types'). +// A_NO_16_BIT_CAST Don't use instructions that are not available in SPIR-V (needed for running A_HLSL_6_2 on Vulkan) +// A_GCC ..... Using a GCC compatible compiler (else assume MSVC compatible compiler by default). 
+// ======= +// A_BYTE .... Support 8-bit integer. +// A_HALF .... Support 16-bit integer and floating point. +// A_LONG .... Support 64-bit integer. +// A_DUBL .... Support 64-bit floating point. +// ======= +// A_WAVE .... Support wave-wide operations. +//------------------------------------------------------------------------------------------------------------------------------ +// To get #include "ffx_a.h" working in GLSL use '#extension GL_GOOGLE_include_directive:require'. +//------------------------------------------------------------------------------------------------------------------------------ +// SIMPLIFIED TYPE SYSTEM +// ====================== +// - All ints will be unsigned with exception of when signed is required. +// - Type naming simplified and shortened "A<type><#components>", +// - H = 16-bit float (half) +// - F = 32-bit float (float) +// - D = 64-bit float (double) +// - P = 1-bit integer (predicate, not using bool because 'B' is used for byte) +// - B = 8-bit integer (byte) +// - W = 16-bit integer (word) +// - U = 32-bit integer (unsigned) +// - L = 64-bit integer (long) +// - Using "AS<type><#components>" for signed when required. +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Make sure 'ALerp*(a,b,m)' does 'b*m+(-a*m+a)' (2 ops). +//------------------------------------------------------------------------------------------------------------------------------ +// CHANGE LOG +// ========== +// 20200914 - Expanded wave ops and prx code. +// 20200713 - Added [ZOL] section, fixed serious bugs in sRGB and Rec.709 color conversion code, etc. 
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// COMMON +//============================================================================================================================== +#define A_2PI 6.28318530718 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// CPU +// +// 
+//============================================================================================================================== +#ifdef A_CPU + // Supporting user defined overrides. + #ifndef A_RESTRICT + #define A_RESTRICT __restrict // Default only: skipped when A_RESTRICT was defined beforehand. + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifndef A_STATIC + #define A_STATIC static // Default only: skipped when A_STATIC was defined beforehand. + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // Same types across CPU and GPU. + // Predicate uses 32-bit integer (C friendly bool). + typedef uint32_t AP1; + typedef float AF1; + typedef double AD1; + typedef uint8_t AB1; + typedef uint16_t AW1; + typedef uint32_t AU1; + typedef uint64_t AL1; + typedef int8_t ASB1; // AS* = signed counterparts of AB1/AW1/AU1/AL1. + typedef int16_t ASW1; + typedef int32_t ASU1; + typedef int64_t ASL1; +//------------------------------------------------------------------------------------------------------------------------------ + #define AD1_(a) ((AD1)(a)) // TYPE_(a): plain C cast of 'a' to TYPE (a value conversion, not a bit cast). + #define AF1_(a) ((AF1)(a)) + #define AL1_(a) ((AL1)(a)) + #define AU1_(a) ((AU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1_(a) ((ASL1)(a)) + #define ASU1_(a) ((ASU1)(a)) +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AU1 AU1_AF1(AF1 a){union{AF1 f;AU1 u;}bits;bits.f=a;return bits.u;} // Bit cast: returns the float's bit pattern as a uint via a union. +//------------------------------------------------------------------------------------------------------------------------------ + #define A_TRUE 1 + #define A_FALSE 0 +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// CPU/GPU PORTING +// +//------------------------------------------------------------------------------------------------------------------------------ +// Get CPU and GPU to share all setup code, without duplicate code paths. +// This uses a lower-case prefix for special vector constructs. +// - In C restrict pointers are used. +// - In the shading language, in/inout/out arguments are used. +// This depends on the ability to access a vector value in both languages via array syntax (aka color[2]). 
+//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD1 *A_RESTRICT // ret*: a returned vector is a restrict pointer to its scalar type. + #define retAD3 AD1 *A_RESTRICT + #define retAD4 AD1 *A_RESTRICT + #define retAF2 AF1 *A_RESTRICT + #define retAF3 AF1 *A_RESTRICT + #define retAF4 AF1 *A_RESTRICT + #define retAL2 AL1 *A_RESTRICT + #define retAL3 AL1 *A_RESTRICT + #define retAL4 AL1 *A_RESTRICT + #define retAU2 AU1 *A_RESTRICT + #define retAU3 AU1 *A_RESTRICT + #define retAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 AD1 *A_RESTRICT // in*: input vector parameter; same pointer form (not const-qualified). + #define inAD3 AD1 *A_RESTRICT + #define inAD4 AD1 *A_RESTRICT + #define inAF2 AF1 *A_RESTRICT + #define inAF3 AF1 *A_RESTRICT + #define inAF4 AF1 *A_RESTRICT + #define inAL2 AL1 *A_RESTRICT + #define inAL3 AL1 *A_RESTRICT + #define inAL4 AL1 *A_RESTRICT + #define inAU2 AU1 *A_RESTRICT + #define inAU3 AU1 *A_RESTRICT + #define inAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 AD1 *A_RESTRICT // inout*: read-write vector parameter. + #define inoutAD3 AD1 *A_RESTRICT + #define inoutAD4 AD1 *A_RESTRICT + #define inoutAF2 
AF1 *A_RESTRICT + #define inoutAF3 AF1 *A_RESTRICT + #define inoutAF4 AF1 *A_RESTRICT + #define inoutAL2 AL1 *A_RESTRICT + #define inoutAL3 AL1 *A_RESTRICT + #define inoutAL4 AL1 *A_RESTRICT + #define inoutAU2 AU1 *A_RESTRICT + #define inoutAU3 AU1 *A_RESTRICT + #define inoutAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 AD1 *A_RESTRICT // out*: output vector parameter. + #define outAD3 AD1 *A_RESTRICT + #define outAD4 AD1 *A_RESTRICT + #define outAF2 AF1 *A_RESTRICT + #define outAF3 AF1 *A_RESTRICT + #define outAF4 AF1 *A_RESTRICT + #define outAL2 AL1 *A_RESTRICT + #define outAL3 AL1 *A_RESTRICT + #define outAL4 AL1 *A_RESTRICT + #define outAU2 AU1 *A_RESTRICT + #define outAU3 AU1 *A_RESTRICT + #define outAU4 AU1 *A_RESTRICT +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD1 x[2] // var*(x): declares x as an array of N scalars. + #define varAD3(x) AD1 x[3] + #define varAD4(x) AD1 x[4] + #define varAF2(x) AF1 x[2] + #define varAF3(x) AF1 x[3] + #define varAF4(x) AF1 x[4] + #define varAL2(x) AL1 x[2] + #define varAL3(x) AL1 x[3] + #define varAL4(x) AL1 x[4] + #define varAU2(x) AU1 x[2] + #define varAU3(x) AU1 x[3] + #define varAU4(x) AU1 x[4] +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) {x,y} // init*: brace initializer for a var* array. + #define initAD3(x,y,z) {x,y,z} + #define initAD4(x,y,z,w) {x,y,z,w} + #define initAF2(x,y) {x,y} + #define initAF3(x,y,z) {x,y,z} + #define initAF4(x,y,z,w) {x,y,z,w} + #define initAL2(x,y) {x,y} + #define initAL3(x,y,z) {x,y,z} + #define initAL4(x,y,z,w) {x,y,z,w} + #define initAU2(x,y) {x,y} + #define initAU3(x,y,z) {x,y,z} + #define initAU4(x,y,z,w) {x,y,z,w} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS +//------------------------------------------------------------------------------------------------------------------------------ +// TODO +// ==== +// - Replace transcendentals with manual versions. +//============================================================================================================================== + #ifdef A_GCC + A_STATIC AD1 AAbsD1(AD1 a){return __builtin_fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return __builtin_fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(__builtin_abs(ASU1_(a)));} // AAbsS*: operand is cast to signed, abs taken, result cast back to unsigned. + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(__builtin_llabs(ASL1_(a)));} + #else + A_STATIC AD1 AAbsD1(AD1 a){return fabs(a);} + A_STATIC AF1 AAbsF1(AF1 a){return fabsf(a);} + A_STATIC AU1 AAbsSU1(AU1 a){return AU1_(abs(ASU1_(a)));} + A_STATIC AL1 AAbsSL1(AL1 a){return AL1_(llabs((long long)ASL1_(a)));} // llabs, not labs: 'long' is 32-bit on LLP64 targets (MSVC) and would truncate the 64-bit operand. + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ACosD1(AD1 a){return __builtin_cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return __builtin_cosf(a);} + #else + A_STATIC AD1 ACosD1(AD1 a){return cos(a);} + A_STATIC AF1 ACosF1(AF1 a){return cosf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ADotD2(inAD2 a,inAD2 b){return a[0]*b[0]+a[1]*b[1];} // ADot*N: sum of component-wise products of two N-element vectors. + A_STATIC AD1 ADotD3(inAD3 a,inAD3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AD1 ADotD4(inAD4 a,inAD4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} + A_STATIC AF1 
ADotF2(inAF2 a,inAF2 b){return a[0]*b[0]+a[1]*b[1];} + A_STATIC AF1 ADotF3(inAF3 a,inAF3 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2];} + A_STATIC AF1 ADotF4(inAF4 a,inAF4 b){return a[0]*b[0]+a[1]*b[1]+a[2]*b[2]+a[3]*b[3];} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AExp2D1(AD1 a){return __builtin_exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return __builtin_exp2f(a);} + #else + A_STATIC AD1 AExp2D1(AD1 a){return exp2(a);} + A_STATIC AF1 AExp2F1(AF1 a){return exp2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 AFloorD1(AD1 a){return __builtin_floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return __builtin_floorf(a);} + #else + A_STATIC AD1 AFloorD1(AD1 a){return floor(a);} + A_STATIC AF1 AFloorF1(AF1 a){return floorf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ALerpD1(AD1 a,AD1 b,AD1 c){return b*c+(-a*c+a);} // Algebraically a+(b-a)*c: c=0 gives a, c=1 gives b. + A_STATIC AF1 ALerpF1(AF1 a,AF1 b,AF1 c){return b*c+(-a*c+a);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ALog2D1(AD1 a){return __builtin_log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return __builtin_log2f(a);} + #else + A_STATIC AD1 ALog2D1(AD1 a){return log2(a);} + A_STATIC AF1 ALog2F1(AF1 a){return log2f(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMaxD1(AD1 a,AD1 b){return a>b?a:b;} // Returns b whenever a>b is false (so also for unordered/NaN operands). + A_STATIC AF1 AMaxF1(AF1 a,AF1 b){return a>b?a:b;} + A_STATIC AL1 AMaxL1(AL1 a,AL1 b){return a>b?a:b;} + A_STATIC AU1 AMaxU1(AU1 a,AU1 b){return a>b?a:b;} 
+//------------------------------------------------------------------------------------------------------------------------------ + // These follow the convention that A integer types don't have signage, until they are operated on. + A_STATIC AL1 AMaxSL1(AL1 a,AL1 b){return (ASL1_(a)>ASL1_(b))?a:b;} // Compared as signed; the unsigned operand itself is returned. + A_STATIC AU1 AMaxSU1(AU1 a,AU1 b){return (ASU1_(a)>ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AMinD1(AD1 a,AD1 b){return a<b?a:b;} // Returns b whenever a<b is false. + A_STATIC AF1 AMinF1(AF1 a,AF1 b){return a<b?a:b;} + A_STATIC AL1 AMinL1(AL1 a,AL1 b){return a<b?a:b;} + A_STATIC AU1 AMinU1(AU1 a,AU1 b){return a<b?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AL1 AMinSL1(AL1 a,AL1 b){return (ASL1_(a)<ASL1_(b))?a:b;} // Signed comparison, same convention as AMaxS*. + A_STATIC AU1 AMinSU1(AU1 a,AU1 b){return (ASU1_(a)<ASU1_(b))?a:b;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARcpD1(AD1 a){return 1.0/a;} // Reciprocal; a==0 is not guarded. + A_STATIC AF1 ARcpF1(AF1 a){return 1.0f/a;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AL1 AShrSL1(AL1 a,AL1 b){return AL1_(ASL1_(a)>>ASL1_(b));} // Right shift performed on the signed casts, result cast back to unsigned. + A_STATIC AU1 AShrSU1(AU1 a,AU1 b){return AU1_(ASU1_(a)>>ASU1_(b));} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 ASinD1(AD1 a){return __builtin_sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return __builtin_sinf(a);} + #else + A_STATIC AD1 ASinD1(AD1 a){return sin(a);} + A_STATIC AF1 ASinF1(AF1 a){return sinf(a);} + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_GCC + A_STATIC AD1 
ASqrtD1(AD1 a){return __builtin_sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return __builtin_sqrtf(a);} + #else + A_STATIC AD1 ASqrtD1(AD1 a){return sqrt(a);} + A_STATIC AF1 ASqrtF1(AF1 a){return sqrtf(a);} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + A_STATIC AD1 AClampD1(AD1 x,AD1 n,AD1 m){return AMaxD1(n,AMinD1(x,m));} // max(n,min(x,m)): n is the lower bound, m the upper. + A_STATIC AF1 AClampF1(AF1 x,AF1 n,AF1 m){return AMaxF1(n,AMinF1(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 AFractD1(AD1 a){return a-AFloorD1(a);} // a minus floor(a). + A_STATIC AF1 AFractF1(AF1 a){return a-AFloorF1(a);} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 APowD1(AD1 a,AD1 b){return AExp2D1(b*ALog2D1(a));} // Power computed as exp2(b*log2(a)). + A_STATIC AF1 APowF1(AF1 a,AF1 b){return AExp2F1(b*ALog2F1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ARsqD1(AD1 a){return ARcpD1(ASqrtD1(a));} // 1/sqrt(a). + A_STATIC AF1 ARsqF1(AF1 a){return ARcpF1(ASqrtF1(a));} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC AD1 ASatD1(AD1 a){return AMinD1(1.0,AMaxD1(0.0,a));} // min(1,max(0,a)). + A_STATIC AF1 
ASatF1(AF1 a){return AMinF1(1.0f,AMaxF1(0.0f,a));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. +// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + A_STATIC retAD2 opAAbsD2(outAD2 d,inAD2 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);return d;} // d = |a| per component. + A_STATIC retAD3 opAAbsD3(outAD3 d,inAD3 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);return d;} + A_STATIC retAD4 opAAbsD4(outAD4 d,inAD4 a){d[0]=AAbsD1(a[0]);d[1]=AAbsD1(a[1]);d[2]=AAbsD1(a[2]);d[3]=AAbsD1(a[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAbsF2(outAF2 d,inAF2 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);return d;} + A_STATIC retAF3 opAAbsF3(outAF3 d,inAF3 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);return d;} + A_STATIC retAF4 opAAbsF4(outAF4 d,inAF4 a){d[0]=AAbsF1(a[0]);d[1]=AAbsF1(a[1]);d[2]=AAbsF1(a[2]);d[3]=AAbsF1(a[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 
opAAddD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} // d = a + b per component. + A_STATIC retAD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];return d;} + A_STATIC retAF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];return d;} + A_STATIC retAF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]+b[0];d[1]=a[1]+b[1];d[2]=a[2]+b[2];d[3]=a[3]+b[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} // d = a + scalar b, applied to every component. + A_STATIC retAD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;return d;} + A_STATIC retAF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;return d;} + A_STATIC retAF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d[0]=a[0]+b;d[1]=a[1]+b;d[2]=a[2]+b;d[3]=a[3]+b;return d;} +//============================================================================================================================== + A_STATIC retAD2 opACpyD2(outAD2 d,inAD2 a){d[0]=a[0];d[1]=a[1];return d;} // d = a (component copy). + A_STATIC retAD3 opACpyD3(outAD3 d,inAD3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAD4 opACpyD4(outAD4 d,inAD4 
a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opACpyF2(outAF2 d,inAF2 a){d[0]=a[0];d[1]=a[1];return d;} + A_STATIC retAF3 opACpyF3(outAF3 d,inAF3 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];return d;} + A_STATIC retAF4 opACpyF4(outAF4 d,inAF4 a){d[0]=a[0];d[1]=a[1];d[2]=a[2];d[3]=a[3];return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);return d;} // Per-component ALerp with a per-component weight c. + A_STATIC retAD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);return d;} + A_STATIC retAD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d[0]=ALerpD1(a[0],b[0],c[0]);d[1]=ALerpD1(a[1],b[1],c[1]);d[2]=ALerpD1(a[2],b[2],c[2]);d[3]=ALerpD1(a[3],b[3],c[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);return d;} + A_STATIC retAF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);return d;} + A_STATIC retAF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d[0]=ALerpF1(a[0],b[0],c[0]);d[1]=ALerpF1(a[1],b[1],c[1]);d[2]=ALerpF1(a[2],b[2],c[2]);d[3]=ALerpF1(a[3],b[3],c[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);return d;} // Per-component ALerp with one scalar weight c. + A_STATIC retAD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 
c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);return d;} + A_STATIC retAD4 opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d[0]=ALerpD1(a[0],b[0],c);d[1]=ALerpD1(a[1],b[1],c);d[2]=ALerpD1(a[2],b[2],c);d[3]=ALerpD1(a[3],b[3],c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);return d;} + A_STATIC retAF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);return d;} + A_STATIC retAF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d[0]=ALerpF1(a[0],b[0],c);d[1]=ALerpF1(a[1],b[1],c);d[2]=ALerpF1(a[2],b[2],c);d[3]=ALerpF1(a[3],b[3],c);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);return d;} // d = max(a,b) per component. + A_STATIC retAD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMaxD1(a[0],b[0]);d[1]=AMaxD1(a[1],b[1]);d[2]=AMaxD1(a[2],b[2]);d[3]=AMaxD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMaxF1(a[0],b[0]);d[1]=AMaxF1(a[1],b[1]);d[2]=AMaxF1(a[2],b[2]);d[3]=AMaxF1(a[3],b[3]);return d;} 
+//============================================================================================================================== + A_STATIC retAD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);return d;} // d = min(a,b) per component; d is also the return value. + A_STATIC retAD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);return d;} + A_STATIC retAD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d[0]=AMinD1(a[0],b[0]);d[1]=AMinD1(a[1],b[1]);d[2]=AMinD1(a[2],b[2]);d[3]=AMinD1(a[3],b[3]);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);return d;} + A_STATIC retAF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);return d;} + A_STATIC retAF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d[0]=AMinF1(a[0],b[0]);d[1]=AMinF1(a[1],b[1]);d[2]=AMinF1(a[2],b[2]);d[3]=AMinF1(a[3],b[3]);return d;} +//============================================================================================================================== + A_STATIC retAD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} // d = a * b per component (not a dot product). + A_STATIC retAD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} +//------------------------------------------------------------------------------------------------------------------------------ + A_STATIC retAF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];return d;} + A_STATIC retAF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];return d;} + A_STATIC retAF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d[0]=a[0]*b[0];d[1]=a[1]*b[1];d[2]=a[2]*b[2];d[3]=a[3]*b[3];return d;} 
+//==============================================================================================================================
+ // opAMulOne*: scale every component of 'a' by the scalar 'b', writing into 'd' (returned).
+ A_STATIC retAD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){int i;for(i=0;i<2;i++){d[i]=a[i]*b;}return d;}
+ A_STATIC retAD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){int i;for(i=0;i<3;i++){d[i]=a[i]*b;}return d;}
+ A_STATIC retAD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){int i;for(i=0;i<4;i++){d[i]=a[i]*b;}return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){int i;for(i=0;i<2;i++){d[i]=a[i]*b;}return d;}
+ A_STATIC retAF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){int i;for(i=0;i<3;i++){d[i]=a[i]*b;}return d;}
+ A_STATIC retAF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){int i;for(i=0;i<4;i++){d[i]=a[i]*b;}return d;}
+//==============================================================================================================================
+ // opANeg*: negate every component of 'a', writing into 'd' (returned).
+ A_STATIC retAD2 opANegD2(outAD2 d,inAD2 a){int i;for(i=0;i<2;i++){d[i]=-a[i];}return d;}
+ A_STATIC retAD3 opANegD3(outAD3 d,inAD3 a){int i;for(i=0;i<3;i++){d[i]=-a[i];}return d;}
+ A_STATIC retAD4 opANegD4(outAD4 d,inAD4 a){int i;for(i=0;i<4;i++){d[i]=-a[i];}return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opANegF2(outAF2 d,inAF2 a){int i;for(i=0;i<2;i++){d[i]=-a[i];}return d;}
+ A_STATIC retAF3 opANegF3(outAF3 d,inAF3 a){int i;for(i=0;i<3;i++){d[i]=-a[i];}return d;}
+ A_STATIC retAF4 opANegF4(outAF4 d,inAF4 a){int i;for(i=0;i<4;i++){d[i]=-a[i];}return d;}
+//==============================================================================================================================
+ // opARcp*: apply the scalar reciprocal helper to every component of 'a', writing into 'd' (returned).
+ A_STATIC retAD2 opARcpD2(outAD2 d,inAD2 a){int i;for(i=0;i<2;i++){d[i]=ARcpD1(a[i]);}return d;}
+ A_STATIC retAD3 opARcpD3(outAD3 d,inAD3 a){int i;for(i=0;i<3;i++){d[i]=ARcpD1(a[i]);}return d;}
+ A_STATIC retAD4 opARcpD4(outAD4 d,inAD4 a){int i;for(i=0;i<4;i++){d[i]=ARcpD1(a[i]);}return d;}
+//------------------------------------------------------------------------------------------------------------------------------
+ A_STATIC retAF2 opARcpF2(outAF2 d,inAF2 a){int i;for(i=0;i<2;i++){d[i]=ARcpF1(a[i]);}return d;}
+ A_STATIC retAF3 opARcpF3(outAF3 d,inAF3 a){int i;for(i=0;i<3;i++){d[i]=ARcpF1(a[i]);}return d;}
+ A_STATIC retAF4 opARcpF4(outAF4 d,inAF4 a){int i;for(i=0;i<4;i++){d[i]=ARcpF1(a[i]);}return d;}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// HALF FLOAT PACKING
+//==============================================================================================================================
+ // Convert float to half (in lower 16-bits of output).
+ // Same fast technique as documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+ // Supports denormals.
+ // Conversion rules are to make computations possibly "safer" on the GPU, + // -INF & -NaN -> -65504 + // +INF & +NaN -> +65504 + A_STATIC AU1 AU1_AH1_AF1(AF1 f){ + static AW1 base[512]={ + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, + 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0001,0x0002,0x0004,0x0008,0x0010,0x0020,0x0040,0x0080,0x0100, + 0x0200,0x0400,0x0800,0x0c00,0x1000,0x1400,0x1800,0x1c00,0x2000,0x2400,0x2800,0x2c00,0x3000,0x3400,0x3800,0x3c00, + 0x4000,0x4400,0x4800,0x4c00,0x5000,0x5400,0x5800,0x5c00,0x6000,0x6400,0x6800,0x6c00,0x7000,0x7400,0x7800,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 
0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff,0x7bff, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000, + 0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8001,0x8002,0x8004,0x8008,0x8010,0x8020,0x8040,0x8080,0x8100, + 0x8200,0x8400,0x8800,0x8c00,0x9000,0x9400,0x9800,0x9c00,0xa000,0xa400,0xa800,0xac00,0xb000,0xb400,0xb800,0xbc00, + 0xc000,0xc400,0xc800,0xcc00,0xd000,0xd400,0xd800,0xdc00,0xe000,0xe400,0xe800,0xec00,0xf000,0xf400,0xf800,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff, + 0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff,0xfbff}; + static AB1 shift[512]={ + 
0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x17,0x16,0x15,0x14,0x13,0x12,0x11,0x10,0x0f, + 0x0e,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d, + 
0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x0d,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18, + 0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18,0x18}; + union{AF1 f;AU1 u;}bits;bits.f=f;AU1 u=bits.u;AU1 i=u>>23;return (AU1)(base[i])+((u&0x7fffff)>>shift[i]);} +//------------------------------------------------------------------------------------------------------------------------------ + // Used to output packed constant. + A_STATIC AU1 AU1_AH2_AF2(inAF2 a){return AU1_AH1_AF1(a[0])+(AU1_AH1_AF1(a[1])<<16);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//==============================================================================================================================
+//
+//
+// GLSL
+//
+//
+//==============================================================================================================================
+#if defined(A_GLSL) && defined(A_GPU)
+ // Extension requirements, each group gated by its feature macro. Define A_SKIP_EXT to suppress all of them.
+ #ifndef A_SKIP_EXT
+  // 16-bit types.
+  #ifdef A_HALF
+   #extension GL_EXT_shader_16bit_storage:require
+   #extension GL_EXT_shader_explicit_arithmetic_types:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  // 64-bit integers.
+  // NOTE(review): GL_NV_shader_atomic_int64 is a vendor extension and is marked 'require' - confirm every A_LONG target has it.
+  #ifdef A_LONG
+   #extension GL_ARB_gpu_shader_int64:require
+   #extension GL_NV_shader_atomic_int64:require
+  #endif
+//------------------------------------------------------------------------------------------------------------------------------
+  // Subgroup (wave) operations.
+  #ifdef A_WAVE
+   #extension GL_KHR_shader_subgroup_arithmetic:require
+   #extension GL_KHR_shader_subgroup_ballot:require
+   #extension GL_KHR_shader_subgroup_quad:require
+   #extension GL_KHR_shader_subgroup_shuffle:require
+  #endif
+ #endif
+//==============================================================================================================================
+ // Portable type names mapped onto GLSL types; the trailing digit is the component count.
+ // AP = boolean.
+ #define AP1 bool
+ #define AP2 bvec2
+ #define AP3 bvec3
+ #define AP4 bvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ // AF = float.
+ #define AF1 float
+ #define AF2 vec2
+ #define AF3 vec3
+ #define AF4 vec4
+//------------------------------------------------------------------------------------------------------------------------------
+ // AU = unsigned integer.
+ #define AU1 uint
+ #define AU2 uvec2
+ #define AU3 uvec3
+ #define AU4 uvec4
+//------------------------------------------------------------------------------------------------------------------------------
+ // ASU = signed integer.
+ #define ASU1 int
+ #define ASU2 ivec2
+ #define ASU3 ivec3
+ #define ASU4 ivec4
+//==============================================================================================================================
+ // Reinterpret unsigned integer bits as float (bit-cast, no numeric conversion).
+ #define AF1_AU1(x) uintBitsToFloat(AU1(x))
+ #define AF2_AU2(x) uintBitsToFloat(AU2(x))
+ #define AF3_AU3(x) uintBitsToFloat(AU3(x))
+ #define AF4_AU4(x) uintBitsToFloat(AU4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reinterpret float bits as unsigned integer (bit-cast, no numeric conversion).
+ #define AU1_AF1(x) floatBitsToUint(AF1(x))
+ #define AU2_AF2(x) floatBitsToUint(AF2(x))
+ #define AU3_AF3(x) floatBitsToUint(AF3(x))
+ #define AU4_AF4(x) floatBitsToUint(AF4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Convert one float to half by packing it together with 0.0; 'a' is the first component handed to packHalf2x16.
+ AU1 AU1_AH1_AF1_x(AF1 a){return packHalf2x16(AF2(a,0.0));}
+ #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Packing helpers are direct aliases of the GLSL built-ins.
+ #define AU1_AH2_AF2 packHalf2x16
+ #define AU1_AW2Unorm_AF2 packUnorm2x16
+ #define AU1_AB4Unorm_AF4 packUnorm4x8
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unpacking helpers are direct aliases of the GLSL built-ins.
+ #define AF2_AH2_AU1 unpackHalf2x16
+ #define AF2_AW2Unorm_AU1 unpackUnorm2x16
+ #define AF4_AB4Unorm_AU1 unpackUnorm4x8
+//==============================================================================================================================
+ // Float splat helpers: replicate one scalar into every component. The AFn_(a) macros cast the argument to AF1 first.
+ AF1 AF1_x(AF1 a){return AF1(a);}
+ AF2 AF2_x(AF1 a){return AF2(a,a);}
+ AF3 AF3_x(AF1 a){return AF3(a,a,a);}
+ AF4 AF4_x(AF1 a){return AF4(a,a,a,a);}
+ #define AF1_(a) AF1_x(AF1(a))
+ #define AF2_(a) AF2_x(AF1(a))
+ #define AF3_(a) AF3_x(AF1(a))
+ #define AF4_(a) AF4_x(AF1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unsigned integer splat helpers, same pattern as the float ones.
+ AU1 AU1_x(AU1 a){return AU1(a);}
+ AU2 AU2_x(AU1 a){return AU2(a,a);}
+ AU3 AU3_x(AU1 a){return AU3(a,a,a);}
+ AU4 AU4_x(AU1 a){return AU4(a,a,a,a);}
+ #define AU1_(a) AU1_x(AU1(a))
+ #define AU2_(a) AU2_x(AU1(a))
+ #define AU3_(a) AU3_x(AU1(a))
+ #define AU4_(a) AU4_x(AU1(a))
+//==============================================================================================================================
+ // Absolute value of a signed integer carried in an unsigned type (cast to signed, abs, cast back).
+ AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));}
+ AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));}
+ AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));}
+ AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Bitfield extract: 'bits' bits of 'src' starting at bit 'off'.
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){return bitfieldExtract(src,ASU1(off),ASU1(bits));}
+ // Bitfield insert by mask: result takes 'ins' where 'mask' bits are set and 'src' elsewhere.
+ AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));}
+ // Proxy for V_BFI_B32 where the 'mask' is set as 'bits', 'mask=(1<<bits)-1', and 'bits' needs to be an immediate.
+ AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){return bitfieldInsert(src,ins,0,ASU1(bits));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MED3_F32.
+ // Clamp 'x' to the range given by 'n' (lower) and 'm' (upper).
+ AF1 AClampF1(AF1 x,AF1 n,AF1 m){return clamp(x,n,m);}
+ AF2 AClampF2(AF2 x,AF2 n,AF2 m){return clamp(x,n,m);}
+ AF3 AClampF3(AF3 x,AF3 n,AF3 m){return clamp(x,n,m);}
+ AF4 AClampF4(AF4 x,AF4 n,AF4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_FRACT_F32 (note DX frac() is different).
+ AF1 AFractF1(AF1 x){return fract(x);}
+ AF2 AFractF2(AF2 x){return fract(x);}
+ AF3 AFractF3(AF3 x){return fract(x);}
+ AF4 AFractF4(AF4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear interpolation from 'x' to 'y' by 'a' (GLSL mix).
+ AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return mix(x,y,a);}
+ AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return mix(x,y,a);}
+ AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return mix(x,y,a);}
+ AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MAX3_F32.
+ AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));}
+ AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));}
+ AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));}
+ AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed 3-way max on values carried in unsigned types (cast to signed, compare, cast back).
+ AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));}
+ AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));}
+ AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));}
+ AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unsigned 3-way max.
+ AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));}
+ AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));}
+ AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));}
+ AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed 2-way max on values carried in unsigned types.
+ AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));}
+ AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));}
+ AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));}
+ AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Clamp has an easier pattern match for med3 when some ordering is known.
+ // V_MED3_F32.
+ // Median of three, built from min/max only.
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));}
+ AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));}
+ AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return max(min(x,y),min(max(x,y),z));}
+ AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // V_MIN3_F32.
+ AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));}
+ AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));}
+ AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));}
+ AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed 3-way min on values carried in unsigned types (cast to signed, compare, cast back).
+ AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));}
+ AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));}
+ AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));}
+ AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unsigned 3-way min.
+ AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));}
+ AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));}
+ AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));}
+ AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed 2-way min on values carried in unsigned types.
+ AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));}
+ AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));}
+ AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));}
+ AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_COS_F32.
+ // The input is scaled by A_2PI before calling cos().
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));}
+ AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));}
+ AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));}
+ AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Normalized trig. Valid input domain is {-256 to +256}. No GLSL compiler intrinsic exists to map to this currently.
+ // V_SIN_F32.
+ // The input is scaled by A_2PI before calling sin().
+ AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));}
+ AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));}
+ AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));}
+ AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal, written as 1.0/x.
+ AF1 ARcpF1(AF1 x){return AF1_(1.0)/x;}
+ AF2 ARcpF2(AF2 x){return AF2_(1.0)/x;}
+ AF3 ARcpF3(AF3 x){return AF3_(1.0)/x;}
+ AF4 ARcpF4(AF4 x){return AF4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Reciprocal square root, written as 1.0/sqrt(x).
+ AF1 ARsqF1(AF1 x){return AF1_(1.0)/sqrt(x);}
+ AF2 ARsqF2(AF2 x){return AF2_(1.0)/sqrt(x);}
+ AF3 ARsqF3(AF3 x){return AF3_(1.0)/sqrt(x);}
+ AF4 ARsqF4(AF4 x){return AF4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Saturate: clamp to the 0.0 to 1.0 range.
+ AF1 ASatF1(AF1 x){return clamp(x,AF1_(0.0),AF1_(1.0));}
+ AF2 ASatF2(AF2 x){return clamp(x,AF2_(0.0),AF2_(1.0));}
+ AF3 ASatF3(AF3 x){return clamp(x,AF3_(0.0),AF3_(1.0));}
+ AF4 ASatF4(AF4 x){return clamp(x,AF4_(0.0),AF4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Shift right performed on the signed reinterpretation of both operands, result cast back to unsigned.
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));}
+ AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));}
+ AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));}
+ AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// GLSL BYTE
+//==============================================================================================================================
+ // 8-bit types and splat helpers, compiled only when A_BYTE is defined.
+ #ifdef A_BYTE
+  // AB = unsigned 8-bit.
+  #define AB1 uint8_t
+  #define AB2 u8vec2
+  #define AB3 u8vec3
+  #define AB4 u8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // ASB = signed 8-bit.
+  #define ASB1 int8_t
+  #define ASB2 i8vec2
+  #define ASB3 i8vec3
+  #define ASB4 i8vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // Splat helpers: replicate one scalar into every component. The ABn_(a) macros cast the argument to AB1 first.
+  AB1 AB1_x(AB1 a){return AB1(a);}
+  AB2 AB2_x(AB1 a){return AB2(a,a);}
+  AB3 AB3_x(AB1 a){return AB3(a,a,a);}
+  AB4 AB4_x(AB1 a){return AB4(a,a,a,a);}
+  #define AB1_(a) AB1_x(AB1(a))
+  #define AB2_(a) AB2_x(AB1(a))
+  #define AB3_(a) AB3_x(AB1(a))
+  #define AB4_(a) AB4_x(AB1(a))
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// GLSL HALF
+//==============================================================================================================================
+ // 16-bit types and helpers, compiled only when A_HALF is defined.
+ #ifdef A_HALF
+  // AH = 16-bit float.
+  #define AH1 float16_t
+  #define AH2 f16vec2
+  #define AH3 f16vec3
+  #define AH4 f16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // AW = unsigned 16-bit integer.
+  #define AW1 uint16_t
+  #define AW2 u16vec2
+  #define AW3 u16vec3
+  #define AW4 u16vec4
+//------------------------------------------------------------------------------------------------------------------------------
+  // ASW = signed 16-bit integer.
+  #define ASW1 int16_t
+  #define ASW2 i16vec2
+  #define ASW3 i16vec3
+  #define ASW4 i16vec4
+//==============================================================================================================================
+  // Unpack 32-bit words into 16-bit vectors: one AU1 gives two components, an AU2 gives four (x first, then y).
+  #define AH2_AU1(x) unpackFloat2x16(AU1(x))
+  AH4 AH4_AU2_x(AU2 x){return AH4(unpackFloat2x16(x.x),unpackFloat2x16(x.y));}
+  #define AH4_AU2(x) AH4_AU2_x(AU2(x))
+  #define AW2_AU1(x) unpackUint2x16(AU1(x))
+  #define AW4_AU2(x) unpackUint4x16(pack64(AU2(x)))
+//------------------------------------------------------------------------------------------------------------------------------
+  // Pack 16-bit vectors into 32-bit words: .xy goes to the first word, .zw to the second.
+  #define AU1_AH2(x) packFloat2x16(AH2(x))
+  AU2 AU2_AH4_x(AH4 x){return AU2(packFloat2x16(x.xy),packFloat2x16(x.zw));}
+  #define AU2_AH4(x) AU2_AH4_x(AH4(x))
+  #define AU1_AW2(x) packUint2x16(AW2(x))
+  #define AU2_AW4(x) unpack32(packUint4x16(AW4(x)))
+//==============================================================================================================================
+  // Half to 16-bit unsigned via halfBitsToUint16 (presumably a bit-cast, by analogy with floatBitsToUint - confirm).
+  #define AW1_AH1(x) halfBitsToUint16(AH1(x))
+  #define AW2_AH2(x) halfBitsToUint16(AH2(x))
+  #define AW3_AH3(x) halfBitsToUint16(AH3(x))
+  #define AW4_AH4(x) halfBitsToUint16(AH4(x))
+//------------------------------------------------------------------------------------------------------------------------------
+  // 16-bit unsigned to half via uint16BitsToHalf (the reverse direction).
+  #define AH1_AW1(x) uint16BitsToHalf(AW1(x))
+  #define AH2_AW2(x) uint16BitsToHalf(AW2(x))
+  #define AH3_AW3(x) uint16BitsToHalf(AW3(x))
+  #define AH4_AW4(x) uint16BitsToHalf(AW4(x))
+//==============================================================================================================================
+  // Half splat helpers: replicate one scalar into every component. The AHn_(a) macros cast the argument to AH1 first.
+  AH1 AH1_x(AH1 a){return AH1(a);}
+  AH2 AH2_x(AH1 a){return AH2(a,a);}
+  AH3 AH3_x(AH1 a){return AH3(a,a,a);}
+  AH4 AH4_x(AH1 a){return AH4(a,a,a,a);}
+  #define AH1_(a) AH1_x(AH1(a))
+  #define AH2_(a) AH2_x(AH1(a))
+  #define AH3_(a) AH3_x(AH1(a))
+  #define AH4_(a) AH4_x(AH1(a))
+//------------------------------------------------------------------------------------------------------------------------------
+  // 16-bit unsigned splat helpers, same pattern.
+  AW1 AW1_x(AW1 a){return AW1(a);}
+  AW2 AW2_x(AW1 a){return AW2(a,a);}
+  AW3 AW3_x(AW1 a){return AW3(a,a,a);}
+  AW4 AW4_x(AW1 a){return AW4(a,a,a,a);}
+  #define AW1_(a) AW1_x(AW1(a))
+  #define AW2_(a) AW2_x(AW1(a))
+  #define AW3_(a) AW3_x(AW1(a))
+  #define AW4_(a) AW4_x(AW1(a))
+//==============================================================================================================================
+  // Absolute value of a signed 16-bit integer carried in an unsigned type (cast to signed, abs, cast back).
+  AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));}
+  AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));}
+  AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));}
+  AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Clamp 'x' to the range given by 'n' (lower) and 'm' (upper).
+  AH1 AClampH1(AH1 x,AH1 n,AH1 m){return clamp(x,n,m);}
+  AH2 AClampH2(AH2 x,AH2 n,AH2 m){return clamp(x,n,m);}
+  AH3 AClampH3(AH3 x,AH3 n,AH3 m){return clamp(x,n,m);}
+  AH4 AClampH4(AH4 x,AH4 n,AH4 m){return clamp(x,n,m);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Fractional part (GLSL fract).
+  AH1 AFractH1(AH1 x){return fract(x);}
+  AH2 AFractH2(AH2 x){return fract(x);}
+  AH3 AFractH3(AH3 x){return fract(x);}
+  AH4 AFractH4(AH4 x){return fract(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Linear interpolation from 'x' to 'y' by 'a' (GLSL mix).
+  AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return mix(x,y,a);}
+  AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return mix(x,y,a);}
+  AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return mix(x,y,a);}
+  AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return mix(x,y,a);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of max3.
+  AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));}
+  AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));}
+  AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));}
+  AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed max on 16-bit values carried in unsigned types.
+  // The compare must go through the signed 16-bit types (ASW*): converting a 16-bit unsigned value to the 32-bit signed
+  // types (ASU*) zero-extends it, so negative inputs would compare as large positive numbers.
+  AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));}
+  AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));}
+  AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));}
+  AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // No packed version of min3.
+  AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));}
+  AH2 AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));}
+  AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));}
+  AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Signed min on 16-bit values carried in unsigned types; compares through ASW* for the same reason as AMaxSW*.
+  AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));}
+  AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));}
+  AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));}
+  AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal, written as 1.0/x.
+  AH1 ARcpH1(AH1 x){return AH1_(1.0)/x;}
+  AH2 ARcpH2(AH2 x){return AH2_(1.0)/x;}
+  AH3 ARcpH3(AH3 x){return AH3_(1.0)/x;}
+  AH4 ARcpH4(AH4 x){return AH4_(1.0)/x;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Reciprocal square root, written as 1.0/sqrt(x).
+  AH1 ARsqH1(AH1 x){return AH1_(1.0)/sqrt(x);}
+  AH2 ARsqH2(AH2 x){return AH2_(1.0)/sqrt(x);}
+  AH3 ARsqH3(AH3 x){return AH3_(1.0)/sqrt(x);}
+  AH4 ARsqH4(AH4 x){return AH4_(1.0)/sqrt(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Saturate: clamp to the 0.0 to 1.0 range.
+  AH1 ASatH1(AH1 x){return clamp(x,AH1_(0.0),AH1_(1.0));}
+  AH2 ASatH2(AH2 x){return clamp(x,AH2_(0.0),AH2_(1.0));}
+  AH3 ASatH3(AH3 x){return clamp(x,AH3_(0.0),AH3_(1.0));}
+  AH4 ASatH4(AH4 x){return clamp(x,AH4_(0.0),AH4_(1.0));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Shift right performed on the signed 16-bit reinterpretation of both operands, result cast back to unsigned.
+  AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));}
+  AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));}
+  AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));}
+  AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));}
+ #endif
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL DOUBLE +//============================================================================================================================== + #ifdef A_DUBL + #define AD1 double + #define AD2 dvec2 + #define AD3 dvec3 + #define AD4 dvec4 +//------------------------------------------------------------------------------------------------------------------------------ + // AD*_x replicate one scalar into every component; the AD*_() macros cast the argument to AD1 first. + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + AD1 AFractD1(AD1 x){return fract(x);} + AD2 AFractD2(AD2 x){return fract(x);} + AD3 AFractD3(AD3 x){return fract(x);} + AD4 AFractD4(AD4 x){return fract(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return mix(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return mix(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return mix(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return mix(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return AD1_(1.0)/x;} + AD2 ARcpD2(AD2 
x){return AD2_(1.0)/x;} + AD3 ARcpD3(AD3 x){return AD3_(1.0)/x;} + AD4 ARcpD4(AD4 x){return AD4_(1.0)/x;} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARsqD1(AD1 x){return AD1_(1.0)/sqrt(x);} + AD2 ARsqD2(AD2 x){return AD2_(1.0)/sqrt(x);} + AD3 ARsqD3(AD3 x){return AD3_(1.0)/sqrt(x);} + AD4 ARsqD4(AD4 x){return AD4_(1.0)/sqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return clamp(x,AD1_(0.0),AD1_(1.0));} + AD2 ASatD2(AD2 x){return clamp(x,AD2_(0.0),AD2_(1.0));} + AD3 ASatD3(AD3 x){return clamp(x,AD3_(0.0),AD3_(1.0));} + AD4 ASatD4(AD4 x){return clamp(x,AD4_(0.0),AD4_(1.0));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// GLSL LONG +//============================================================================================================================== + #ifdef A_LONG + #define AL1 uint64_t + #define AL2 u64vec2 + #define AL3 u64vec3 + #define AL4 u64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASL1 int64_t + #define ASL2 i64vec2 + #define ASL3 i64vec3 + #define ASL4 i64vec4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AL1_AU2(x) packUint2x32(AU2(x)) + #define AU2_AL1(x) 
unpackUint2x32(AL1(x)) +//------------------------------------------------------------------------------------------------------------------------------ + AL1 AL1_x(AL1 a){return AL1(a);} + AL2 AL2_x(AL1 a){return AL2(a,a);} + AL3 AL3_x(AL1 a){return AL3(a,a,a);} + AL4 AL4_x(AL1 a){return AL4(a,a,a,a);} + #define AL1_(a) AL1_x(AL1(a)) + #define AL2_(a) AL2_x(AL1(a)) + #define AL3_(a) AL3_x(AL1(a)) + #define AL4_(a) AL4_x(AL1(a)) +//============================================================================================================================== + AL1 AAbsSL1(AL1 a){return AL1(abs(ASL1(a)));} + AL2 AAbsSL2(AL2 a){return AL2(abs(ASL2(a)));} + AL3 AAbsSL3(AL3 a){return AL3(abs(ASL3(a)));} + AL4 AAbsSL4(AL4 a){return AL4(abs(ASL4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Signed 64-bit max: compare as ASL (signed 64-bit), matching AAbsSL above. + // Casting through the 32-bit ASU types would drop the upper 32 bits before the comparison. + AL1 AMaxSL1(AL1 a,AL1 b){return AL1(max(ASL1(a),ASL1(b)));} + AL2 AMaxSL2(AL2 a,AL2 b){return AL2(max(ASL2(a),ASL2(b)));} + AL3 AMaxSL3(AL3 a,AL3 b){return AL3(max(ASL3(a),ASL3(b)));} + AL4 AMaxSL4(AL4 a,AL4 b){return AL4(max(ASL4(a),ASL4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Signed 64-bit min: same reasoning as AMaxSL. 
+ AL1 AMinSL1(AL1 a,AL1 b){return AL1(min(ASL1(a),ASL1(b)));} + AL2 AMinSL2(AL2 a,AL2 b){return AL2(min(ASL2(a),ASL2(b)));} + AL3 AMinSL3(AL3 a,AL3 b){return AL3(min(ASL3(a),ASL3(b)));} + AL4 AMinSL4(AL4 a,AL4 b){return AL4(min(ASL4(a),ASL4(b)));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// WAVE OPERATIONS +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. + // Each AWaveXor* below forwards (v,x) unchanged to subgroupShuffleXor. + AF1 AWaveXorF1(AF1 v,AU1 x){return subgroupShuffleXor(v,x);} + AF2 AWaveXorF2(AF2 v,AU1 x){return subgroupShuffleXor(v,x);} + AF3 AWaveXorF3(AF3 v,AU1 x){return subgroupShuffleXor(v,x);} + AF4 AWaveXorF4(AF4 v,AU1 x){return subgroupShuffleXor(v,x);} + AU1 AWaveXorU1(AU1 v,AU1 x){return subgroupShuffleXor(v,x);} + AU2 AWaveXorU2(AU2 v,AU1 x){return subgroupShuffleXor(v,x);} + AU3 AWaveXorU3(AU3 v,AU1 x){return subgroupShuffleXor(v,x);} + AU4 AWaveXorU4(AU4 v,AU1 x){return subgroupShuffleXor(v,x);} +//------------------------------------------------------------------------------------------------------------------------------ + #ifdef A_HALF + // Half/word variants convert to 32-bit words (AU1_AH2, AU2_AH4, AU1_AW2, AU2_AW4), shuffle those, then convert back. 
+ AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(subgroupShuffleXor(AU1_AH2(v),x));} + AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(subgroupShuffleXor(AU2_AH4(v),x));} + AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(subgroupShuffleXor(AU1_AW2(v),x));} + AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(subgroupShuffleXor(AU2_AW4(v),x));} + #endif + #endif +//============================================================================================================================== +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// HLSL +// +// +//============================================================================================================================== +// Base type aliases: with A_HLSL_6_2 the explicit-width names (float32_t, uint32_t, int32_t) are used, otherwise float/uint/int. +#if defined(A_HLSL) && defined(A_GPU) + #ifdef A_HLSL_6_2 + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float32_t + #define AF2 float32_t2 + #define AF3 float32_t3 + #define AF4 float32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint32_t + #define AU2 uint32_t2 + #define AU3 uint32_t3 + #define AU4 uint32_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int32_t + #define ASU2 int32_t2 + #define ASU3 int32_t3 + #define ASU4 int32_t4 + #else + #define AP1 bool + #define AP2 bool2 + #define AP3 bool3 + #define AP4 bool4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AF1 float + #define AF2 float2 + #define AF3 float3 + #define AF4 float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1 uint + #define AU2 uint2 + #define AU3 uint3 + #define AU4 
uint4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASU1 int + #define ASU2 int2 + #define ASU3 int3 + #define ASU4 int4 + #endif +//============================================================================================================================== + // Bit-pattern reinterpretation between float and uint via asfloat()/asuint(). + #define AF1_AU1(x) asfloat(AU1(x)) + #define AF2_AU2(x) asfloat(AU2(x)) + #define AF3_AU3(x) asfloat(AU3(x)) + #define AF4_AU4(x) asfloat(AU4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AU1_AF1(x) asuint(AF1(x)) + #define AU2_AF2(x) asuint(AF2(x)) + #define AU3_AF3(x) asuint(AF3(x)) + #define AU4_AF4(x) asuint(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + // Float to half bits in a uint, via f32tof16(). + AU1 AU1_AH1_AF1_x(AF1 a){return f32tof16(a);} + #define AU1_AH1_AF1(a) AU1_AH1_AF1_x(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + // Packs two floats as halves into one uint: a.x in the low 16 bits, a.y shifted into the high 16 bits. + AU1 AU1_AH2_AF2_x(AF2 a){return f32tof16(a.x)|(f32tof16(a.y)<<16);} + #define AU1_AH2_AF2(a) AU1_AH2_AF2_x(AF2(a)) + #define AU1_AB4Unorm_AF4(x) D3DCOLORtoUBYTE4(AF4(x)) +//------------------------------------------------------------------------------------------------------------------------------ + // Inverse of the half2 pack: low 16 bits become .x, high 16 bits become .y. 
+ AF2 AF2_AH2_AU1_x(AU1 x){return AF2(f16tof32(x&0xFFFF),f16tof32(x>>16));} + #define AF2_AH2_AU1(x) AF2_AH2_AU1_x(AU1(x)) +//============================================================================================================================== + // AF*_x replicate one scalar into every component; the AF*_() macros cast the argument to AF1 first. + AF1 AF1_x(AF1 a){return AF1(a);} + AF2 AF2_x(AF1 a){return AF2(a,a);} + AF3 AF3_x(AF1 a){return AF3(a,a,a);} + AF4 AF4_x(AF1 a){return AF4(a,a,a,a);} + #define AF1_(a) AF1_x(AF1(a)) + #define AF2_(a) AF2_x(AF1(a)) + #define AF3_(a) AF3_x(AF1(a)) + #define AF4_(a) AF4_x(AF1(a)) 
+//------------------------------------------------------------------------------------------------------------------------------ + // AU*_x replicate one scalar into every component; the AU*_() macros cast the argument to AU1 first. + AU1 AU1_x(AU1 a){return AU1(a);} + AU2 AU2_x(AU1 a){return AU2(a,a);} + AU3 AU3_x(AU1 a){return AU3(a,a,a);} + AU4 AU4_x(AU1 a){return AU4(a,a,a,a);} + #define AU1_(a) AU1_x(AU1(a)) + #define AU2_(a) AU2_x(AU1(a)) + #define AU3_(a) AU3_x(AU1(a)) + #define AU4_(a) AU4_x(AU1(a)) +//============================================================================================================================== + // Functions with an 'S' before the type suffix operate on the signed (ASU) reinterpretation and cast the result back to AU. + AU1 AAbsSU1(AU1 a){return AU1(abs(ASU1(a)));} + AU2 AAbsSU2(AU2 a){return AU2(abs(ASU2(a)));} + AU3 AAbsSU3(AU3 a){return AU3(abs(ASU3(a)));} + AU4 AAbsSU4(AU4 a){return AU4(abs(ASU4(a)));} +//------------------------------------------------------------------------------------------------------------------------------ + // ABfe: shift 'src' right by 'off' and keep the low 'bits' bits. + // ABfi: take bits of 'ins' where 'mask' is set and bits of 'src' elsewhere. + // ABfiM: as ABfi, with the mask built from the low 'bits' bits. + // NOTE(review): the mask is (1u<<bits)-1, so bits==32 relies on the shift behaving for a full-width count -- confirm callers stay below 32. 
+ AU1 ABfe(AU1 src,AU1 off,AU1 bits){AU1 mask=(1u<<bits)-1;return (src>>off)&mask;} + AU1 ABfi(AU1 src,AU1 ins,AU1 mask){return (ins&mask)|(src&(~mask));} + AU1 ABfiM(AU1 src,AU1 ins,AU1 bits){AU1 mask=(1u<<bits)-1;return (ins&mask)|(src&(~mask));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AClampF1(AF1 x,AF1 n,AF1 m){return max(n,min(x,m));} + AF2 AClampF2(AF2 x,AF2 n,AF2 m){return max(n,min(x,m));} + AF3 AClampF3(AF3 x,AF3 n,AF3 m){return max(n,min(x,m));} + AF4 AClampF4(AF4 x,AF4 n,AF4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AFractF1(AF1 x){return x-floor(x);} + AF2 AFractF2(AF2 x){return x-floor(x);} + AF3 AFractF3(AF3 x){return x-floor(x);} + AF4 AFractF4(AF4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ALerpF1(AF1 x,AF1 y,AF1 a){return lerp(x,y,a);} + AF2 ALerpF2(AF2 x,AF2 y,AF2 a){return 
lerp(x,y,a);} + AF3 ALerpF3(AF3 x,AF3 y,AF3 a){return lerp(x,y,a);} + AF4 ALerpF4(AF4 x,AF4 y,AF4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + // Three-input max/min are composed from the two-input builtins. + AF1 AMax3F1(AF1 x,AF1 y,AF1 z){return max(x,max(y,z));} + AF2 AMax3F2(AF2 x,AF2 y,AF2 z){return max(x,max(y,z));} + AF3 AMax3F3(AF3 x,AF3 y,AF3 z){return max(x,max(y,z));} + AF4 AMax3F4(AF4 x,AF4 y,AF4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMax3SU1(AU1 x,AU1 y,AU1 z){return AU1(max(ASU1(x),max(ASU1(y),ASU1(z))));} + AU2 AMax3SU2(AU2 x,AU2 y,AU2 z){return AU2(max(ASU2(x),max(ASU2(y),ASU2(z))));} + AU3 AMax3SU3(AU3 x,AU3 y,AU3 z){return AU3(max(ASU3(x),max(ASU3(y),ASU3(z))));} + AU4 AMax3SU4(AU4 x,AU4 y,AU4 z){return AU4(max(ASU4(x),max(ASU4(y),ASU4(z))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMax3U1(AU1 x,AU1 y,AU1 z){return max(x,max(y,z));} + AU2 AMax3U2(AU2 x,AU2 y,AU2 z){return max(x,max(y,z));} + AU3 AMax3U3(AU3 x,AU3 y,AU3 z){return max(x,max(y,z));} + AU4 AMax3U4(AU4 x,AU4 y,AU4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMaxSU1(AU1 a,AU1 b){return AU1(max(ASU1(a),ASU1(b)));} + AU2 AMaxSU2(AU2 a,AU2 b){return AU2(max(ASU2(a),ASU2(b)));} + AU3 AMaxSU3(AU3 a,AU3 b){return AU3(max(ASU3(a),ASU3(b)));} + AU4 AMaxSU4(AU4 a,AU4 b){return AU4(max(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Median of three, built from min/max only. 
+ AF1 AMed3F1(AF1 x,AF1 y,AF1 z){return max(min(x,y),min(max(x,y),z));} + AF2 AMed3F2(AF2 x,AF2 y,AF2 z){return max(min(x,y),min(max(x,y),z));} + AF3 AMed3F3(AF3 x,AF3 y,AF3 z){return 
max(min(x,y),min(max(x,y),z));} + AF4 AMed3F4(AF4 x,AF4 y,AF4 z){return max(min(x,y),min(max(x,y),z));} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 AMin3F1(AF1 x,AF1 y,AF1 z){return min(x,min(y,z));} + AF2 AMin3F2(AF2 x,AF2 y,AF2 z){return min(x,min(y,z));} + AF3 AMin3F3(AF3 x,AF3 y,AF3 z){return min(x,min(y,z));} + AF4 AMin3F4(AF4 x,AF4 y,AF4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMin3SU1(AU1 x,AU1 y,AU1 z){return AU1(min(ASU1(x),min(ASU1(y),ASU1(z))));} + AU2 AMin3SU2(AU2 x,AU2 y,AU2 z){return AU2(min(ASU2(x),min(ASU2(y),ASU2(z))));} + AU3 AMin3SU3(AU3 x,AU3 y,AU3 z){return AU3(min(ASU3(x),min(ASU3(y),ASU3(z))));} + AU4 AMin3SU4(AU4 x,AU4 y,AU4 z){return AU4(min(ASU4(x),min(ASU4(y),ASU4(z))));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMin3U1(AU1 x,AU1 y,AU1 z){return min(x,min(y,z));} + AU2 AMin3U2(AU2 x,AU2 y,AU2 z){return min(x,min(y,z));} + AU3 AMin3U3(AU3 x,AU3 y,AU3 z){return min(x,min(y,z));} + AU4 AMin3U4(AU4 x,AU4 y,AU4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + AU1 AMinSU1(AU1 a,AU1 b){return AU1(min(ASU1(a),ASU1(b)));} + AU2 AMinSU2(AU2 a,AU2 b){return AU2(min(ASU2(a),ASU2(b)));} + AU3 AMinSU3(AU3 a,AU3 b){return AU3(min(ASU3(a),ASU3(b)));} + AU4 AMinSU4(AU4 a,AU4 b){return AU4(min(ASU4(a),ASU4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // ANCos: the argument is multiplied by A_2PI before cos(). 
+ AF1 ANCosF1(AF1 x){return cos(x*AF1_(A_2PI));} + AF2 ANCosF2(AF2 x){return cos(x*AF2_(A_2PI));} + AF3 ANCosF3(AF3 x){return cos(x*AF3_(A_2PI));} + AF4 ANCosF4(AF4 x){return cos(x*AF4_(A_2PI));} 
+//------------------------------------------------------------------------------------------------------------------------------ + // ANSin: the argument is multiplied by A_2PI before sin(). + AF1 ANSinF1(AF1 x){return sin(x*AF1_(A_2PI));} + AF2 ANSinF2(AF2 x){return sin(x*AF2_(A_2PI));} + AF3 ANSinF3(AF3 x){return sin(x*AF3_(A_2PI));} + AF4 ANSinF4(AF4 x){return sin(x*AF4_(A_2PI));} +//------------------------------------------------------------------------------------------------------------------------------ + // Reciprocal, reciprocal square root and saturate forward to the HLSL intrinsics rcp(), rsqrt() and saturate(). + AF1 ARcpF1(AF1 x){return rcp(x);} + AF2 ARcpF2(AF2 x){return rcp(x);} + AF3 ARcpF3(AF3 x){return rcp(x);} + AF4 ARcpF4(AF4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ARsqF1(AF1 x){return rsqrt(x);} + AF2 ARsqF2(AF2 x){return rsqrt(x);} + AF3 ARsqF3(AF3 x){return rsqrt(x);} + AF4 ARsqF4(AF4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AF1 ASatF1(AF1 x){return saturate(x);} + AF2 ASatF2(AF2 x){return saturate(x);} + AF3 ASatF3(AF3 x){return saturate(x);} + AF4 ASatF4(AF4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // Shift right performed on the signed (ASU) reinterpretation of both operands, then cast back to AU. 
+ AU1 AShrSU1(AU1 a,AU1 b){return AU1(ASU1(a)>>ASU1(b));} + AU2 AShrSU2(AU2 a,AU2 b){return AU2(ASU2(a)>>ASU2(b));} + AU3 AShrSU3(AU3 a,AU3 b){return AU3(ASU3(a)>>ASU3(b));} + AU4 AShrSU4(AU4 a,AU4 b){return AU4(ASU4(a)>>ASU4(b));} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// HLSL BYTE +//============================================================================================================================== +// The A_BYTE section below contains no definitions. + #ifdef A_BYTE + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== +// 16-bit type aliases: with A_HLSL_6_2 the explicit float16_t/uint16_t/int16_t types are used, otherwise the min16* types. 
+ #ifdef A_HALF + #ifdef A_HLSL_6_2 + #define AH1 float16_t + #define AH2 float16_t2 + #define AH3 float16_t3 + #define AH4 float16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 uint16_t + #define AW2 uint16_t2 + #define AW3 uint16_t3 + #define AW4 uint16_t4 +//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 int16_t + #define ASW2 int16_t2 + #define ASW3 int16_t3 + #define ASW4 int16_t4 + #else + #define AH1 min16float + #define AH2 min16float2 + #define AH3 min16float3 + #define AH4 min16float4 +//------------------------------------------------------------------------------------------------------------------------------ + #define AW1 min16uint + #define AW2 min16uint2 + #define AW3 min16uint3 + #define AW4 min16uint4 
+//------------------------------------------------------------------------------------------------------------------------------ + #define ASW1 min16int + #define ASW2 min16int2 + #define ASW3 min16int3 + #define ASW4 min16int4 + #endif +//============================================================================================================================== + // Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). + // Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ + // Unpack layout: low 16 bits of each uint become the first component, high 16 bits the second. + AH2 AH2_AU1_x(AU1 x){AF2 t=f16tof32(AU2(x&0xFFFF,x>>16));return AH2(t);} + AH4 AH4_AU2_x(AU2 x){return AH4(AH2_AU1_x(x.x),AH2_AU1_x(x.y));} + AW2 AW2_AU1_x(AU1 x){AU2 t=AU2(x&0xFFFF,x>>16);return AW2(t);} + AW4 AW4_AU2_x(AU2 x){return AW4(AW2_AU1_x(x.x),AW2_AU1_x(x.y));} + #define AH2_AU1(x) AH2_AU1_x(AU1(x)) + #define AH4_AU2(x) AH4_AU2_x(AU2(x)) + #define AW2_AU1(x) AW2_AU1_x(AU1(x)) + #define AW4_AU2(x) AW4_AU2_x(AU2(x)) +//------------------------------------------------------------------------------------------------------------------------------ + // Pack layout mirrors the unpack: first component in the low 16 bits, second component shifted left by 16 and added. + AU1 AU1_AH2_x(AH2 x){return f32tof16(x.x)+(f32tof16(x.y)<<16);} + AU2 AU2_AH4_x(AH4 x){return AU2(AU1_AH2_x(x.xy),AU1_AH2_x(x.zw));} + AU1 AU1_AW2_x(AW2 x){return AU1(x.x)+(AU1(x.y)<<16);} + AU2 AU2_AW4_x(AW4 x){return AU2(AU1_AW2_x(x.xy),AU1_AW2_x(x.zw));} + #define AU1_AH2(x) AU1_AH2_x(AH2(x)) + #define AU2_AH4(x) AU2_AH4_x(AH4(x)) + #define AU1_AW2(x) AU1_AW2_x(AW2(x)) + #define AU2_AW4(x) AU2_AW4_x(AW4(x)) +//============================================================================================================================== + // Half<->word bit casts: asuint16()/asfloat16() when A_HLSL_6_2 is set and A_NO_16_BIT_CAST is not; otherwise routed through f32tof16()/f16tof32() per component. 
+ #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AW1_AH1(x) asuint16(x) + #define AW2_AH2(x) asuint16(x) + #define AW3_AH3(x) asuint16(x) + #define AW4_AH4(x) asuint16(x) + #else + #define AW1_AH1(a) AW1(f32tof16(AF1(a))) + #define AW2_AH2(a) AW2(AW1_AH1((a).x),AW1_AH1((a).y)) + #define AW3_AH3(a) 
AW3(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z)) + #define AW4_AH4(a) AW4(AW1_AH1((a).x),AW1_AH1((a).y),AW1_AH1((a).z),AW1_AH1((a).w)) + #endif +//------------------------------------------------------------------------------------------------------------------------------ + #if defined(A_HLSL_6_2) && !defined(A_NO_16_BIT_CAST) + #define AH1_AW1(x) asfloat16(x) + #define AH2_AW2(x) asfloat16(x) + #define AH3_AW3(x) asfloat16(x) + #define AH4_AW4(x) asfloat16(x) + #else + #define AH1_AW1(a) AH1(f16tof32(AU1(a))) + #define AH2_AW2(a) AH2(AH1_AW1((a).x),AH1_AW1((a).y)) + #define AH3_AW3(a) AH3(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z)) + #define AH4_AW4(a) AH4(AH1_AW1((a).x),AH1_AW1((a).y),AH1_AW1((a).z),AH1_AW1((a).w)) + #endif +//============================================================================================================================== + // AH*_x / AW*_x replicate one scalar into every component; the trailing-underscore macros cast the argument to the scalar type first. + AH1 AH1_x(AH1 a){return AH1(a);} + AH2 AH2_x(AH1 a){return AH2(a,a);} + AH3 AH3_x(AH1 a){return AH3(a,a,a);} + AH4 AH4_x(AH1 a){return AH4(a,a,a,a);} + #define AH1_(a) AH1_x(AH1(a)) + #define AH2_(a) AH2_x(AH1(a)) + #define AH3_(a) AH3_x(AH1(a)) + #define AH4_(a) AH4_x(AH1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + AW1 AW1_x(AW1 a){return AW1(a);} + AW2 AW2_x(AW1 a){return AW2(a,a);} + AW3 AW3_x(AW1 a){return AW3(a,a,a);} + AW4 AW4_x(AW1 a){return AW4(a,a,a,a);} + #define AW1_(a) AW1_x(AW1(a)) + #define AW2_(a) AW2_x(AW1(a)) + #define AW3_(a) AW3_x(AW1(a)) + #define AW4_(a) AW4_x(AW1(a)) +//============================================================================================================================== + // Absolute value taken on the signed (ASW) reinterpretation, then cast back to AW. 
+ AW1 AAbsSW1(AW1 a){return AW1(abs(ASW1(a)));} + AW2 AAbsSW2(AW2 a){return AW2(abs(ASW2(a)));} + AW3 AAbsSW3(AW3 a){return AW3(abs(ASW3(a)));} + AW4 AAbsSW4(AW4 a){return AW4(abs(ASW4(a)));} 
+//------------------------------------------------------------------------------------------------------------------------------ + // Clamp x to [n,m] built from min/max. + AH1 AClampH1(AH1 x,AH1 n,AH1 m){return max(n,min(x,m));} + AH2 AClampH2(AH2 x,AH2 n,AH2 m){return max(n,min(x,m));} + AH3 AClampH3(AH3 x,AH3 n,AH3 m){return max(n,min(x,m));} + AH4 AClampH4(AH4 x,AH4 n,AH4 m){return max(n,min(x,m));} +//------------------------------------------------------------------------------------------------------------------------------ + // V_FRACT_F16 (note DX frac() is different). + AH1 AFractH1(AH1 x){return x-floor(x);} + AH2 AFractH2(AH2 x){return x-floor(x);} + AH3 AFractH3(AH3 x){return x-floor(x);} + AH4 AFractH4(AH4 x){return x-floor(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ALerpH1(AH1 x,AH1 y,AH1 a){return lerp(x,y,a);} + AH2 ALerpH2(AH2 x,AH2 y,AH2 a){return lerp(x,y,a);} + AH3 ALerpH3(AH3 x,AH3 y,AH3 a){return lerp(x,y,a);} + AH4 ALerpH4(AH4 x,AH4 y,AH4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMax3H1(AH1 x,AH1 y,AH1 z){return max(x,max(y,z));} + AH2 AMax3H2(AH2 x,AH2 y,AH2 z){return max(x,max(y,z));} + AH3 AMax3H3(AH3 x,AH3 y,AH3 z){return max(x,max(y,z));} + AH4 AMax3H4(AH4 x,AH4 y,AH4 z){return max(x,max(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + // Signed max of 16-bit words: compare as ASW (signed 16-bit), the same type AShrSW uses. + // Widening through the 32-bit ASU types keeps the unsigned value, so negative 16-bit inputs would compare as large positives. 
+ AW1 AMaxSW1(AW1 a,AW1 b){return AW1(max(ASW1(a),ASW1(b)));} + AW2 AMaxSW2(AW2 a,AW2 b){return AW2(max(ASW2(a),ASW2(b)));} + AW3 AMaxSW3(AW3 a,AW3 b){return AW3(max(ASW3(a),ASW3(b)));} + AW4 AMaxSW4(AW4 a,AW4 b){return AW4(max(ASW4(a),ASW4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 AMin3H1(AH1 x,AH1 y,AH1 z){return min(x,min(y,z));} + AH2 
AMin3H2(AH2 x,AH2 y,AH2 z){return min(x,min(y,z));} + AH3 AMin3H3(AH3 x,AH3 y,AH3 z){return min(x,min(y,z));} + AH4 AMin3H4(AH4 x,AH4 y,AH4 z){return min(x,min(y,z));} +//------------------------------------------------------------------------------------------------------------------------------ + // Signed min of 16-bit words: same reasoning as AMaxSW (use ASW, not ASU). + AW1 AMinSW1(AW1 a,AW1 b){return AW1(min(ASW1(a),ASW1(b)));} + AW2 AMinSW2(AW2 a,AW2 b){return AW2(min(ASW2(a),ASW2(b)));} + AW3 AMinSW3(AW3 a,AW3 b){return AW3(min(ASW3(a),ASW3(b)));} + AW4 AMinSW4(AW4 a,AW4 b){return AW4(min(ASW4(a),ASW4(b)));} +//------------------------------------------------------------------------------------------------------------------------------ + // Reciprocal, reciprocal square root and saturate forward to the HLSL intrinsics. + AH1 ARcpH1(AH1 x){return rcp(x);} + AH2 ARcpH2(AH2 x){return rcp(x);} + AH3 ARcpH3(AH3 x){return rcp(x);} + AH4 ARcpH4(AH4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ARsqH1(AH1 x){return rsqrt(x);} + AH2 ARsqH2(AH2 x){return rsqrt(x);} + AH3 ARsqH3(AH3 x){return rsqrt(x);} + AH4 ARsqH4(AH4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AH1 ASatH1(AH1 x){return saturate(x);} + AH2 ASatH2(AH2 x){return saturate(x);} + AH3 ASatH3(AH3 x){return saturate(x);} + AH4 ASatH4(AH4 x){return saturate(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // Shift right performed on the signed (ASW) reinterpretation, then cast back to AW. 
+ AW1 AShrSW1(AW1 a,AW1 b){return AW1(ASW1(a)>>ASW1(b));} + AW2 AShrSW2(AW2 a,AW2 b){return AW2(ASW2(a)>>ASW2(b));} + AW3 AShrSW3(AW3 a,AW3 b){return AW3(ASW3(a)>>ASW3(b));} + AW4 AShrSW4(AW4 a,AW4 b){return AW4(ASW4(a)>>ASW4(b));} + #endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// HLSL DOUBLE +//============================================================================================================================== +// 64-bit float aliases: float64_t* with A_HLSL_6_2, otherwise double*. + #ifdef A_DUBL + #ifdef A_HLSL_6_2 + #define AD1 float64_t + #define AD2 float64_t2 + #define AD3 float64_t3 + #define AD4 float64_t4 + #else + #define AD1 double + #define AD2 double2 + #define AD3 double3 + #define AD4 double4 + #endif +//------------------------------------------------------------------------------------------------------------------------------ + // AD*_x replicate one scalar into every component; the AD*_() macros cast the argument to AD1 first. + AD1 AD1_x(AD1 a){return AD1(a);} + AD2 AD2_x(AD1 a){return AD2(a,a);} + AD3 AD3_x(AD1 a){return AD3(a,a,a);} + AD4 AD4_x(AD1 a){return AD4(a,a,a,a);} + #define AD1_(a) AD1_x(AD1(a)) + #define AD2_(a) AD2_x(AD1(a)) + #define AD3_(a) AD3_x(AD1(a)) + #define AD4_(a) AD4_x(AD1(a)) +//============================================================================================================================== + // Fractional part computed as a-floor(a), the same form the float and half HLSL variants use. 
+ AD1 AFractD1(AD1 a){return a-floor(a);} + AD2 AFractD2(AD2 a){return a-floor(a);} + AD3 AFractD3(AD3 a){return a-floor(a);} + AD4 AFractD4(AD4 a){return a-floor(a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ALerpD1(AD1 x,AD1 y,AD1 a){return lerp(x,y,a);} + AD2 ALerpD2(AD2 x,AD2 y,AD2 a){return lerp(x,y,a);} + AD3 ALerpD3(AD3 x,AD3 y,AD3 a){return lerp(x,y,a);} + AD4 ALerpD4(AD4 x,AD4 y,AD4 a){return lerp(x,y,a);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ARcpD1(AD1 x){return rcp(x);} + 
AD2 ARcpD2(AD2 x){return rcp(x);} + AD3 ARcpD3(AD3 x){return rcp(x);} + AD4 ARcpD4(AD4 x){return rcp(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // NOTE(review): rsqrt()/saturate() are called with double operands here -- confirm the target compiler accepts double for these intrinsics. + AD1 ARsqD1(AD1 x){return rsqrt(x);} + AD2 ARsqD2(AD2 x){return rsqrt(x);} + AD3 ARsqD3(AD3 x){return rsqrt(x);} + AD4 ARsqD4(AD4 x){return rsqrt(x);} +//------------------------------------------------------------------------------------------------------------------------------ + AD1 ASatD1(AD1 x){return saturate(x);} + AD2 ASatD2(AD2 x){return saturate(x);} + AD3 ASatD3(AD3 x){return saturate(x);} + AD4 ASatD4(AD4 x){return saturate(x);} + #endif +//============================================================================================================================== +// HLSL WAVE +//============================================================================================================================== + #ifdef A_WAVE + // Where 'x' must be a compile time literal. 
+  // XOR lane exchange: every lane reads 'v' from the lane whose index is (WaveGetLaneIndex()^x).
+  // 'x' must be a compile time literal.
+  AF1 AWaveXorF1(AF1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF2 AWaveXorF2(AF2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF3 AWaveXorF3(AF3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AF4 AWaveXorF4(AF4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  // NOTE(review): the 2/3/4 component unsigned variants are overloads named AWaveXorU1 rather than AWaveXorU2/3/4.
+  // The names are kept as they are so existing callers continue to resolve.
+  AU1 AWaveXorU1(AU1 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU2 AWaveXorU1(AU2 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU3 AWaveXorU1(AU3 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+  AU4 AWaveXorU1(AU4 v,AU1 x){return WaveReadLaneAt(v,WaveGetLaneIndex()^x);}
+//------------------------------------------------------------------------------------------------------------------------------
+  #ifdef A_HALF
+   // 16-bit vectors are repacked into 32-bit words for the lane read and unpacked afterwards:
+   // 2 components travel as one AU1, 4 components travel as one AU2.
+   AH2 AWaveXorH2(AH2 v,AU1 x){return AH2_AU1(WaveReadLaneAt(AU1_AH2(v),WaveGetLaneIndex()^x));}
+   AH4 AWaveXorH4(AH4 v,AU1 x){return AH4_AU2(WaveReadLaneAt(AU2_AH4(v),WaveGetLaneIndex()^x));}
+   AW2 AWaveXorW2(AW2 v,AU1 x){return AW2_AU1(WaveReadLaneAt(AU1_AW2(v),WaveGetLaneIndex()^x));}
+   // AW4 uses the AU2 repack exactly like AH4; pairing it with the AU1 conversions does not match a 4 component value.
+   AW4 AWaveXorW4(AW4 v,AU1 x){return AW4_AU2(WaveReadLaneAt(AU2_AW4(v),WaveGetLaneIndex()^x));}
+  #endif
+ #endif
+//==============================================================================================================================
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+//
+//                                                         GPU COMMON
+//
+//
+//==============================================================================================================================
+#ifdef A_GPU
+ // Positive and negative infinity, built from their 32-bit bit patterns.
+ #define A_INFP_F AF1_AU1(0x7f800000u)
+ #define A_INFN_F AF1_AU1(0xff800000u)
+//------------------------------------------------------------------------------------------------------------------------------
+ // Copy sign from 's' to positive 'd': the sign bit of 's' is OR-ed into the bits of 'd'.
+ AF1 ACpySgnF1(AF1 d, AF1 s) { AU1 sgn = AU1_AF1(s) & AU1_(0x80000000u); return AF1_AU1(AU1_AF1(d) | sgn); }
+ AF2 ACpySgnF2(AF2 d, AF2 s) { AU2 sgn = AU2_AF2(s) & AU2_(0x80000000u); return AF2_AU2(AU2_AF2(d) | sgn); }
+ AF3 ACpySgnF3(AF3 d, AF3 s) { AU3 sgn = AU3_AF3(s) & AU3_(0x80000000u); return AF3_AU3(AU3_AF3(d) | sgn); }
+ AF4 ACpySgnF4(AF4 d, AF4 s) { AU4 sgn = AU4_AF4(s) & AU4_(0x80000000u); return AF4_AU4(AU4_AF4(d) | sgn); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Branch free "is negative" mask (useful as the blend factor of a lerp), one multiply fed into a saturate,
+ //  m=NaN := 0
+ //  m>=0  := 0
+ //  m<0   := 1
+ // It relies on this floating point behaviour,
+ //  saturate(+a*(-INF)==-INF) := 0
+ //  saturate( 0*(-INF)== NaN) := 0
+ //  saturate(-a*(-INF)==+INF) := 1
+ AF1 ASignedF1(AF1 m) { AF1 scaled = m * AF1_(A_INFN_F); return ASatF1(scaled); }
+ AF2 ASignedF2(AF2 m) { AF2 scaled = m * AF2_(A_INFN_F); return ASatF2(scaled); }
+ AF3 ASignedF3(AF3 m) { AF3 scaled = m * AF3_(A_INFN_F); return ASatF3(scaled); }
+ AF4 ASignedF4(AF4 m) { AF4 scaled = m * AF4_(A_INFN_F); return ASatF4(scaled); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Companion mask built with +INF instead of -INF: following the same saturate rules, only m>0 yields 1.
+ AF1 AGtZeroF1(AF1 m) { AF1 scaled = m * AF1_(A_INFP_F); return ASatF1(scaled); }
+ AF2 AGtZeroF2(AF2 m) { AF2 scaled = m * AF2_(A_INFP_F); return ASatF2(scaled); }
+ AF3 AGtZeroF3(AF3 m) { AF3 scaled = m * AF3_(A_INFP_F); return ASatF3(scaled); }
+ AF4 AGtZeroF4(AF4 m) { AF4 scaled = m * AF4_(A_INFP_F); return ASatF4(scaled); }
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Half precision infinities; the A_HLSL_6_2 spelling puts an explicit uint16_t cast on the bit pattern.
+  #ifdef A_HLSL_6_2
+   #define A_INFP_H AH1_AW1((uint16_t)0x7c00u)
+   #define A_INFN_H AH1_AW1((uint16_t)0xfc00u)
+  #else
+   #define A_INFP_H AH1_AW1(0x7c00u)
+   #define A_INFN_H AH1_AW1(0xfc00u)
+  #endif
+
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision copy-sign, same bit trick on 16-bit words.
+  AH1 ACpySgnH1(AH1 d, AH1 s) { AW1 sgn = AW1_AH1(s) & AW1_(0x8000u); return AH1_AW1(AW1_AH1(d) | sgn); }
+  AH2 ACpySgnH2(AH2 d, AH2 s) { AW2 sgn = AW2_AH2(s) & AW2_(0x8000u); return AH2_AW2(AW2_AH2(d) | sgn); }
+  AH3 ACpySgnH3(AH3 d, AH3 s) { AW3 sgn = AW3_AH3(s) & AW3_(0x8000u); return AH3_AW3(AW3_AH3(d) | sgn); }
+  AH4 ACpySgnH4(AH4 d, AH4 s) { AW4 sgn = AW4_AH4(s) & AW4_(0x8000u); return AH4_AW4(AW4_AH4(d) | sgn); }
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision "is negative" mask (multiply by -INF, then saturate).
+  AH1 ASignedH1(AH1 m) { AH1 scaled = m * AH1_(A_INFN_H); return ASatH1(scaled); }
+  AH2 ASignedH2(AH2 m) { AH2 scaled = m * AH2_(A_INFN_H); return ASatH2(scaled); }
+  AH3 ASignedH3(AH3 m) { AH3 scaled = m * AH3_(A_INFN_H); return ASatH3(scaled); }
+  AH4 ASignedH4(AH4 m) { AH4 scaled = m * AH4_(A_INFN_H); return ASatH4(scaled); }
+//------------------------------------------------------------------------------------------------------------------------------
+  // Half precision "greater than zero" mask (multiply by +INF, then saturate).
+  AH1 AGtZeroH1(AH1 m) { AH1 scaled = m * AH1_(A_INFP_H); return ASatH1(scaled); }
+  AH2 AGtZeroH2(AH2 m) { AH2 scaled = m * AH2_(A_INFP_H); return ASatH2(scaled); }
+  AH3 AGtZeroH3(AH3 m) { AH3 scaled = m * AH3_(A_INFP_H); return ASatH3(scaled); }
+  AH4 AGtZeroH4(AH4 m) { AH4 scaled = m * AH4_(A_INFP_H); return ASatH4(scaled); }
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                [FIS] FLOAT INTEGER SORTABLE
+//------------------------------------------------------------------------------------------------------------------------------
+// Float to integer sortable.
+//  - If sign bit=0, flip the sign bit (positives).
+//  - If sign bit=1, flip all bits (negatives).
+// Integer sortable to float.
+//  - If sign bit=1, flip the sign bit (positives).
+//  - If sign bit=0, flip all bits (negatives).
+// Has nice side effects.
+//  - Larger integers are more positive values.
+//  - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+// Burns 3 ops for conversion {shift,or,xor}.
+//==============================================================================================================================
+ // The XOR mask is (shifted 'x' | 0x80000000) for 'To' and (inverted shifted 'x' | 0x80000000) for 'From'.
+ // NOTE(review): this presumes AShrSU1 is a sign extending shift; it is defined outside this section - confirm.
+ AU1 AFisToU1(AU1 x){return x^(( AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
+ AU1 AFisFromU1(AU1 x){return x^((~AShrSU1(x,AU1_(31)))|AU1_(0x80000000));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Just adjust high 16-bit value (useful when upper part of 32-bit word is a 16-bit float value).
+ // NOTE(review): with a shift of 15 the low 16 bits of the mask are bits {30..15} of 'x' (or their inverse), so the low
+ // half of the word is XOR-ed as well - confirm callers do not expect the low 16 bits to pass through unchanged.
+ AU1 AFisToHiU1(AU1 x){return x^(( AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
+ AU1 AFisFromHiU1(AU1 x){return x^((~AShrSU1(x,AU1_(15)))|AU1_(0x80000000));}
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+  // 16-bit word versions: shift of 15 and a 0x8000 sign mask.
+  AW1 AFisToW1(AW1 x){return x^(( AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
+  AW1 AFisFromW1(AW1 x){return x^((~AShrSW1(x,AW1_(15)))|AW1_(0x8000));}
+//------------------------------------------------------------------------------------------------------------------------------
+  AW2 AFisToW2(AW2 x){return x^(( AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
+  AW2 AFisFromW2(AW2 x){return x^((~AShrSW2(x,AW2_(15)))|AW2_(0x8000));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     [PERM] V_PERM_B32
+//------------------------------------------------------------------------------------------------------------------------------
+// Support for V_PERM_B32 started in the 3rd generation of GCN.
+//------------------------------------------------------------------------------------------------------------------------------
+// yyyyxxxx - The 'i' input.
+// 76543210
+// ========
+// HGFEDCBA - Naming on permutation.
+//------------------------------------------------------------------------------------------------------------------------------
+// TODO
+// ====
+//  - Make sure compiler optimizes this.
+//==============================================================================================================================
+ #ifdef A_HALF
+  // How to read a name: the four characters are the output bytes from most to least significant.
+  // A,B,C,D are bytes 0..3 of i.x, E,F,G,H are bytes 0..3 of i.y, and '0' is a zero byte.
+  // Two bytes spread into the low byte of each 16-bit half, upper byte of each half zeroed.
+  AU1 APerm0E0A(AU2 i){return((i.x )&0xffu)|((i.y<<16)&0xff0000u);}
+  AU1 APerm0F0B(AU2 i){return((i.x>> 8)&0xffu)|((i.y<< 8)&0xff0000u);}
+  AU1 APerm0G0C(AU2 i){return((i.x>>16)&0xffu)|((i.y )&0xff0000u);}
+  AU1 APerm0H0D(AU2 i){return((i.x>>24)&0xffu)|((i.y>> 8)&0xff0000u);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // One byte (A or C) taken from i.x replaces a single byte position; the other three bytes come from i.y unchanged.
+  AU1 APermHGFA(AU2 i){return((i.x )&0x000000ffu)|(i.y&0xffffff00u);}
+  AU1 APermHGFC(AU2 i){return((i.x>>16)&0x000000ffu)|(i.y&0xffffff00u);}
+  AU1 APermHGAE(AU2 i){return((i.x<< 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
+  AU1 APermHGCE(AU2 i){return((i.x>> 8)&0x0000ff00u)|(i.y&0xffff00ffu);}
+  AU1 APermHAFE(AU2 i){return((i.x<<16)&0x00ff0000u)|(i.y&0xff00ffffu);}
+  AU1 APermHCFE(AU2 i){return((i.x )&0x00ff0000u)|(i.y&0xff00ffffu);}
+  AU1 APermAGFE(AU2 i){return((i.x<<24)&0xff000000u)|(i.y&0x00ffffffu);}
+  AU1 APermCGFE(AU2 i){return((i.x<< 8)&0xff000000u)|(i.y&0x00ffffffu);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Gather bytes 0 and 2 of both inputs into one word, in the byte order given by the name.
+  AU1 APermGCEA(AU2 i){return((i.x)&0x00ff00ffu)|((i.y<<8)&0xff00ff00u);}
+  AU1 APermGECA(AU2 i){return(((i.x)&0xffu)|((i.x>>8)&0xff00u)|((i.y<<16)&0xff0000u)|((i.y<<8)&0xff000000u));}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                               [BUC] BYTE UNSIGNED
CONVERSION +//------------------------------------------------------------------------------------------------------------------------------ +// Designed to use the optimal conversion, enables the scaling to possibly be factored into other computation. +// Works on a range of {0 to A_BUC_<32,16>}, for <32-bit, and 16-bit> respectively. +//------------------------------------------------------------------------------------------------------------------------------ +// OPCODE NOTES +// ============ +// GCN does not do UNORM or SNORM for bytes in opcodes. +// - V_CVT_F32_UBYTE{0,1,2,3} - Unsigned byte to float. +// - V_CVT_PKACC_U8_F32 - Float to unsigned byte (does bit-field insert into 32-bit integer). +// V_PERM_B32 does byte packing with ability to zero fill bytes as well. +// - Can pull out byte values from two sources, and zero fill upper 8-bits of packed hi and lo. +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U1() - Designed for V_CVT_F32_UBYTE* and V_CVT_PKACCUM_U8_F32 ops. +// ==== ===== +// 0 : 0 +// 1 : 1 +// ... +// 255 : 255 +// : 256 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// BYTE : FLOAT - ABuc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32. +// ==== ===== +// 0 : 0 +// 1 : 1/512 +// 2 : 1/256 +// ... 
+// 64 : 1/8 +// 128 : 1/4 +// 255 : 255/512 +// : 1/2 (just outside the encoding range) +//------------------------------------------------------------------------------------------------------------------------------ +// OPTIMAL IMPLEMENTATIONS ON AMD ARCHITECTURES +// ============================================ +// r=ABuc0FromU1(i) +// V_CVT_F32_UBYTE0 r,i +// -------------------------------------------- +// r=ABuc0ToU1(d,i) +// V_CVT_PKACCUM_U8_F32 r,i,0,d +// -------------------------------------------- +// d=ABuc0FromU2(i) +// Where 'k0' is an SGPR with 0x0E0A +// Where 'k1' is an SGPR with {32768.0} packed into the lower 16-bits +// V_PERM_B32 d,i.x,i.y,k0 +// V_PK_FMA_F16 d,d,k1.x,0 +// -------------------------------------------- +// r=ABuc0ToU2(d,i) +// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +// Where 'k1' is an SGPR with 0x???? +// Where 'k2' is an SGPR with 0x???? +// V_PK_FMA_F16 i,i,k0.x,0 +// V_PERM_B32 r.x,i,i,k1 +// V_PERM_B32 r.y,i,i,k2 +//============================================================================================================================== + // Peak range for 32-bit and 16-bit operations. + #define A_BUC_32 (255.0) + #define A_BUC_16 (255.0/512.0) +//============================================================================================================================== + #if 1 + // Designed to be one V_CVT_PKACCUM_U8_F32. + // The extra min is required to pattern match to V_CVT_PKACCUM_U8_F32. 
+  // Write float 'i' into byte N of 'd': 'i' is converted to unsigned, clamped to 255 by the min, shifted to byte N and
+  // merged over the cleared byte N of 'd'. The other three bytes of 'd' pass through.
+  AU1 ABuc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i),255u) )&(0x000000ffu));}
+  AU1 ABuc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABuc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i),255u)<<16)&(0x00ff0000u));}
+  AU1 ABuc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed to be one V_CVT_F32_UBYTE*.
+  // Read byte N of 'i' and convert it to float.
+  AF1 ABuc0FromU1(AU1 i){return AF1((i )&255u);}
+  AF1 ABuc1FromU1(AU1 i){return AF1((i>> 8)&255u);}
+  AF1 ABuc2FromU1(AU1 i){return AF1((i>>16)&255u);}
+  AF1 ABuc3FromU1(AU1 i){return AF1((i>>24)&255u);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Both inputs are scaled by 1/32768 before their bits are reinterpreted; APermGCEA then gathers bytes 0 and 2 of each
+  // packed input (presumably the low byte of each 16-bit lane - the AU1_AW2 layout is defined outside this section).
+  AW2 ABuc01ToW2(AH2 x,AH2 y){x*=AH2_(1.0/32768.0);y*=AH2_(1.0/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 3 ops to do SOA to AOS and conversion.
+  // Writes i.x into byte N of d.x and i.y into byte N of d.y; the remaining bytes of 'd' are preserved.
+  // The APerm* helpers take the inserted byte (A or C) from their .x input and the preserved bytes (E..H) from their .y
+  // input, so the freshly converted word 'b' is passed as .x and the destination word as .y. With the two swapped the
+  // result would keep a single byte of 'd' and be overwritten by 'b' everywhere else.
+  AU2 ABuc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABuc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABuc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABuc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Designed for 2 ops to do both AOS to SOA, and conversion.
+  // Pulls byte N of i.x and byte N of i.y into the two 16-bit lanes, reinterprets them as halves and scales by 32768.
+  AH2 ABuc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0);}
+  AH2 ABuc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0);}
+  AH2 ABuc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0);}
+  AH2 ABuc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                 [BSC] BYTE SIGNED CONVERSION
+//------------------------------------------------------------------------------------------------------------------------------
+// Similar to [BUC].
+// Works on a range of {-/+ A_BSC_<32,16>}, for <32-bit, and 16-bit> respectively.
+//------------------------------------------------------------------------------------------------------------------------------
+// ENCODING (without zero-based encoding)
+// ========
+//   0 = unused (can be used to mean something else)
+//   1 = lowest value
+// 128 = exact zero center (zero based encoding)
+// 255 = highest value
+//------------------------------------------------------------------------------------------------------------------------------
+// Zero-based [Zb] flips the MSB bit of the byte (making 128 "exact zero" actually zero).
+// This is useful if there is a desire for cleared values to decode as zero.
+//------------------------------------------------------------------------------------------------------------------------------
+// BYTE : FLOAT - ABsc{0,1,2,3}{To,From}U2() - Designed for 16-bit denormal tricks and V_PERM_B32.
+// ==== =====
+//    0 : -128/512 (unused)
+//    1 : -127/512
+//    2 : -126/512
+//   ...
+//  128 : 0
+//   ...
+//  255 : 127/512
+//      : 1/4 (just outside the encoding range)
+//==============================================================================================================================
+ // Peak range for 32-bit and 16-bit operations.
+ #define A_BSC_32 (127.0)
+ #define A_BSC_16 (127.0/512.0)
+//==============================================================================================================================
+ #if 1
+  // Write float 'i' into byte N of 'd' with a +128 bias: (i+128.0) is converted to unsigned, clamped to 255 by the min,
+  // shifted to byte N and merged over the cleared byte N of 'd'.
+  AU1 ABsc0ToU1(AU1 d,AF1 i){return (d&0xffffff00u)|((min(AU1(i+128.0),255u) )&(0x000000ffu));}
+  AU1 ABsc1ToU1(AU1 d,AF1 i){return (d&0xffff00ffu)|((min(AU1(i+128.0),255u)<< 8)&(0x0000ff00u));}
+  AU1 ABsc2ToU1(AU1 d,AF1 i){return (d&0xff00ffffu)|((min(AU1(i+128.0),255u)<<16)&(0x00ff0000u));}
+  AU1 ABsc3ToU1(AU1 d,AF1 i){return (d&0x00ffffffu)|((min(AU1(i+128.0),255u)<<24)&(0xff000000u));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variant: same insert (applied to trunc(i)), then the top bit of the written byte is flipped.
+  AU1 ABsc0ToZbU1(AU1 d,AF1 i){return ((d&0xffffff00u)|((min(AU1(trunc(i)+128.0),255u) )&(0x000000ffu)))^0x00000080u;}
+  AU1 ABsc1ToZbU1(AU1 d,AF1 i){return ((d&0xffff00ffu)|((min(AU1(trunc(i)+128.0),255u)<< 8)&(0x0000ff00u)))^0x00008000u;}
+  AU1 ABsc2ToZbU1(AU1 d,AF1 i){return ((d&0xff00ffffu)|((min(AU1(trunc(i)+128.0),255u)<<16)&(0x00ff0000u)))^0x00800000u;}
+  AU1 ABsc3ToZbU1(AU1 d,AF1 i){return ((d&0x00ffffffu)|((min(AU1(trunc(i)+128.0),255u)<<24)&(0xff000000u)))^0x80000000u;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Read byte N of 'i', convert to float and remove the 128 bias.
+  AF1 ABsc0FromU1(AU1 i){return AF1((i )&255u)-128.0;}
+  AF1 ABsc1FromU1(AU1 i){return AF1((i>> 8)&255u)-128.0;}
+  AF1 ABsc2FromU1(AU1 i){return AF1((i>>16)&255u)-128.0;}
+  AF1 ABsc3FromU1(AU1 i){return AF1((i>>24)&255u)-128.0;}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variant: the top bit of the byte is flipped back before the conversion.
+  AF1 ABsc0FromZbU1(AU1 i){return AF1(((i )&255u)^0x80u)-128.0;}
+  AF1 ABsc1FromZbU1(AU1 i){return AF1(((i>> 8)&255u)^0x80u)-128.0;}
+  AF1 ABsc2FromZbU1(AU1 i){return AF1(((i>>16)&255u)^0x80u)-128.0;}
+  AF1 ABsc3FromZbU1(AU1 i){return AF1(((i>>24)&255u)^0x80u)-128.0;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Takes {x0,x1} and {y0,y1} and builds {{x0,y0},{x1,y1}}.
+  // Each input is scaled by 1/32768 and offset by 0.25/32768 before its bits are reinterpreted and gathered by APermGCEA.
+  AW2 ABsc01ToW2(AH2 x,AH2 y){x=x*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);y=y*AH2_(1.0/32768.0)+AH2_(0.25/32768.0);
+   return AW2_AU1(APermGCEA(AU2(AU1_AW2(AW2_AH2(x)),AU1_AW2(AW2_AH2(y)))));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Writes i.x into byte N of d.x and i.y into byte N of d.y; the remaining bytes of 'd' are preserved.
+  // The APerm* helpers take the inserted byte (A or C) from their .x input and the preserved bytes (E..H) from their .y
+  // input, so the freshly converted word 'b' is passed as .x and the destination word as .y. With the two swapped the
+  // result would keep a single byte of 'd' and be overwritten by 'b' everywhere else.
+  AU2 ABsc0ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABsc1ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABsc2ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABsc3ToU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)));
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: identical, except bit 7 of both converted bytes in 'b' is flipped before the insert.
+  AU2 ABsc0ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGFA(AU2(b,d.x)),APermHGFC(AU2(b,d.y)));}
+  AU2 ABsc1ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHGAE(AU2(b,d.x)),APermHGCE(AU2(b,d.y)));}
+  AU2 ABsc2ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermHAFE(AU2(b,d.x)),APermHCFE(AU2(b,d.y)));}
+  AU2 ABsc3ToZbU2(AU2 d,AH2 i){AU1 b=AU1_AW2(AW2_AH2(i*AH2_(1.0/32768.0)+AH2_(0.25/32768.0)))^0x00800080u;
+   return AU2(APermAGFE(AU2(b,d.x)),APermCGFE(AU2(b,d.y)));}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Pulls byte N of i.x and byte N of i.y into the two 16-bit lanes, reinterprets them as halves, scales by 32768 and
+  // subtracts the 0.25 bias.
+  AH2 ABsc0FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)))*AH2_(32768.0)-AH2_(0.25);}
+//------------------------------------------------------------------------------------------------------------------------------
+  // Zero-based variants: bit 7 of both extracted bytes is flipped back before the conversion.
+  AH2 ABsc0FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0E0A(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc1FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0F0B(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc2FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0G0C(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+  AH2 ABsc3FromZbU2(AU2 i){return AH2_AW2(AW2_AU1(APerm0H0D(i)^0x00800080u))*AH2_(32768.0)-AH2_(0.25);}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                     HALF APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These support only positive inputs.
+// Did not see value yet in specialization for range.
+// Using quick testing, ended up mostly getting the same "best" approximation for various ranges.
+// With hardware that can co-execute transcendentals, the value in approximations could be less than expected.
+// However from a latency perspective, if execution of a transcendental is 4 clk, with no packed support, -> 8 clk total.
+// And co-execution would require a compiler interleaving a lot of independent work for packed usage.
+//------------------------------------------------------------------------------------------------------------------------------
+// The one Newton Raphson iteration form of rsq() was skipped (requires 6 ops total).
+// Same with sqrt(), as this could be x*rsq() (7 ops).
+//==============================================================================================================================
+ #ifdef A_HALF
+  // Minimize squared error across full positive range, 2 ops.
+  // The 0x1de2 based approximation maps {0 to 1} input to < 1 output.
+  AH1 APrxLoSqrtH1(AH1 a) { AW1 bits = AW1_AH1(a); return AH1_AW1((bits >> AW1_(1)) + AW1_(0x1de2)); }
+  AH2 APrxLoSqrtH2(AH2 a) { AW2 bits = AW2_AH2(a); return AH2_AW2((bits >> AW2_(1)) + AW2_(0x1de2)); }
+  AH3 APrxLoSqrtH3(AH3 a) { AW3 bits = AW3_AH3(a); return AH3_AW3((bits >> AW3_(1)) + AW3_(0x1de2)); }
+  AH4 APrxLoSqrtH4(AH4 a) { AW4 bits = AW4_AH4(a); return AH4_AW4((bits >> AW4_(1)) + AW4_(0x1de2)); }
+//------------------------------------------------------------------------------------------------------------------------------
+  // Lower precision estimation, 1 op.
+  // Minimize squared error across {smallest normal to 16384.0}.
+  AH1 APrxLoRcpH1(AH1 a) { AW1 bits = AW1_AH1(a); return AH1_AW1(AW1_(0x7784) - bits); }
+  AH2 APrxLoRcpH2(AH2 a) { AW2 bits = AW2_AH2(a); return AH2_AW2(AW2_(0x7784) - bits); }
+  AH3 APrxLoRcpH3(AH3 a) { AW3 bits = AW3_AH3(a); return AH3_AW3(AW3_(0x7784) - bits); }
+  AH4 APrxLoRcpH4(AH4 a) { AW4 bits = AW4_AH4(a); return AH4_AW4(AW4_(0x7784) - bits); }
+//------------------------------------------------------------------------------------------------------------------------------
+  // Medium precision estimation, one Newton Raphson iteration, 3 ops.
+  // 'b' is the integer-subtract seed (constant 0x778d), refined by b*(2-b*a).
+  AH1 APrxMedRcpH1(AH1 a) { AW1 bits = AW1_AH1(a); AH1 b = AH1_AW1(AW1_(0x778d) - bits); return b * (-b * a + AH1_(2.0)); }
+  AH2 APrxMedRcpH2(AH2 a) { AW2 bits = AW2_AH2(a); AH2 b = AH2_AW2(AW2_(0x778d) - bits); return b * (-b * a + AH2_(2.0)); }
+  AH3 APrxMedRcpH3(AH3 a) { AW3 bits = AW3_AH3(a); AH3 b = AH3_AW3(AW3_(0x778d) - bits); return b * (-b * a + AH3_(2.0)); }
+  AH4 APrxMedRcpH4(AH4 a) { AW4 bits = AW4_AH4(a); AH4 b = AH4_AW4(AW4_(0x778d) - bits); return b * (-b * a + AH4_(2.0)); }
+//------------------------------------------------------------------------------------------------------------------------------
+  // Minimize squared error across {smallest normal to 16384.0}, 2 ops.
+  AH1 APrxLoRsqH1(AH1 a) { AW1 bits = AW1_AH1(a); return AH1_AW1(AW1_(0x59a3) - (bits >> AW1_(1))); }
+  AH2 APrxLoRsqH2(AH2 a) { AW2 bits = AW2_AH2(a); return AH2_AW2(AW2_(0x59a3) - (bits >> AW2_(1))); }
+  AH3 APrxLoRsqH3(AH3 a) { AW3 bits = AW3_AH3(a); return AH3_AW3(AW3_(0x59a3) - (bits >> AW3_(1))); }
+  AH4 APrxLoRsqH4(AH4 a) { AW4 bits = AW4_AH4(a); return AH4_AW4(AW4_(0x59a3) - (bits >> AW4_(1))); }
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    FLOAT APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// Michal Drobot has an excellent presentation on these: "Low Level Optimizations For GCN",
+//  - Idea dates back to SGI, then to Quake 3, etc.
+//  - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf
+//     - sqrt(x)=rsqrt(x)*x
+//     - rcp(x)=rsqrt(x)*rsqrt(x) for positive x
+//  - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h
+//------------------------------------------------------------------------------------------------------------------------------
+// These below are from perhaps less complete searching for optimal.
+// Used FP16 normal range for testing with +4096 32-bit step size for sampling error.
+// So these match up well with the half approximations.
+//==============================================================================================================================
+ // Per width: low precision sqrt, low precision rcp, rcp with one Newton Raphson step b*(2-b*a), low precision rsqrt.
+ AF1 APrxLoSqrtF1(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1((bits >> AU1_(1)) + AU1_(0x1fbc4639)); }
+ AF1 APrxLoRcpF1(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1(AU1_(0x7ef07ebb) - bits); }
+ AF1 APrxMedRcpF1(AF1 a) { AU1 bits = AU1_AF1(a); AF1 b = AF1_AU1(AU1_(0x7ef19fff) - bits); return b * (-b * a + AF1_(2.0)); }
+ AF1 APrxLoRsqF1(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1(AU1_(0x5f347d74) - (bits >> AU1_(1))); }
+//------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxLoSqrtF2(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2((bits >> AU2_(1)) + AU2_(0x1fbc4639)); }
+ AF2 APrxLoRcpF2(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2(AU2_(0x7ef07ebb) - bits); }
+ AF2 APrxMedRcpF2(AF2 a) { AU2 bits = AU2_AF2(a); AF2 b = AF2_AU2(AU2_(0x7ef19fff) - bits); return b * (-b * a + AF2_(2.0)); }
+ AF2 APrxLoRsqF2(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2(AU2_(0x5f347d74) - (bits >> AU2_(1))); }
+//------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxLoSqrtF3(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3((bits >> AU3_(1)) + AU3_(0x1fbc4639)); }
+ AF3 APrxLoRcpF3(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3(AU3_(0x7ef07ebb) - bits); }
+ AF3 APrxMedRcpF3(AF3 a) { AU3 bits = AU3_AF3(a); AF3 b = AF3_AU3(AU3_(0x7ef19fff) - bits); return b * (-b * a + AF3_(2.0)); }
+ AF3 APrxLoRsqF3(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3(AU3_(0x5f347d74) - (bits >> AU3_(1))); }
+//------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxLoSqrtF4(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4((bits >> AU4_(1)) + AU4_(0x1fbc4639)); }
+ AF4 APrxLoRcpF4(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4(AU4_(0x7ef07ebb) - bits); }
+ AF4 APrxMedRcpF4(AF4 a) { AU4 bits = AU4_AF4(a); AF4 b = AF4_AU4(AU4_(0x7ef19fff) - bits); return b * (-b * a + AF4_(2.0)); }
+ AF4 APrxLoRsqF4(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4(AU4_(0x5f347d74) - (bits >> AU4_(1))); }
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//                                                    PQ APPROXIMATIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// PQ is very close to x^(1/8). The functions below use the fast float approximation method to do
+// PQ<~>Gamma2 (4th power and fast 4th root) and PQ<~>Linear (8th power and fast 8th root). Maximum error is ~0.2%.
+//==============================================================================================================================
+// Helpers: Quart() is the 4th power and Oct() the 8th power, both by repeated squaring.
+ AF1 Quart(AF1 a) { AF1 sq = a * a; return sq * sq; }
+ AF1 Oct(AF1 a) { AF1 sq = a * a; AF1 qd = sq * sq; return qd * qd; }
+ AF2 Quart(AF2 a) { AF2 sq = a * a; return sq * sq; }
+ AF2 Oct(AF2 a) { AF2 sq = a * a; AF2 qd = sq * sq; return qd * qd; }
+ AF3 Quart(AF3 a) { AF3 sq = a * a; return sq * sq; }
+ AF3 Oct(AF3 a) { AF3 sq = a * a; AF3 qd = sq * sq; return qd * qd; }
+ AF4 Quart(AF4 a) { AF4 sq = a * a; return sq * sq; }
+ AF4 Oct(AF4 a) { AF4 sq = a * a; AF4 qd = sq * sq; return qd * qd; }
+ //------------------------------------------------------------------------------------------------------------------------------
+ // Per width: PQ->Gamma2 (4th power), PQ->Linear (8th power), then the inverse roots in three qualities:
+ //  Lo   - integer shift-and-add estimate on the float bits,
+ //  Med  - the Lo estimate refined by one correction step,
+ //  High - nested sqrt().
+ AF1 APrxPQToGamma2(AF1 a) { return Quart(a); }
+ AF1 APrxPQToLinear(AF1 a) { return Oct(a); }
+ AF1 APrxLoGamma2ToPQ(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1((bits >> AU1_(2)) + AU1_(0x2F9A4E46)); }
+ AF1 APrxMedGamma2ToPQ(AF1 a) { AF1 b = APrxLoGamma2ToPQ(a); AF1 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
+ AF1 APrxHighGamma2ToPQ(AF1 a) { AF1 root = sqrt(a); return sqrt(root); }
+ AF1 APrxLoLinearToPQ(AF1 a) { AU1 bits = AU1_AF1(a); return AF1_AU1((bits >> AU1_(3)) + AU1_(0x378D8723)); }
+ AF1 APrxMedLinearToPQ(AF1 a) { AF1 b = APrxLoLinearToPQ(a); AF1 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
+ AF1 APrxHighLinearToPQ(AF1 a) { AF1 root = sqrt(a); root = sqrt(root); return sqrt(root); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF2 APrxPQToGamma2(AF2 a) { return Quart(a); }
+ AF2 APrxPQToLinear(AF2 a) { return Oct(a); }
+ AF2 APrxLoGamma2ToPQ(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2((bits >> AU2_(2)) + AU2_(0x2F9A4E46)); }
+ AF2 APrxMedGamma2ToPQ(AF2 a) { AF2 b = APrxLoGamma2ToPQ(a); AF2 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
+ AF2 APrxHighGamma2ToPQ(AF2 a) { AF2 root = sqrt(a); return sqrt(root); }
+ AF2 APrxLoLinearToPQ(AF2 a) { AU2 bits = AU2_AF2(a); return AF2_AU2((bits >> AU2_(3)) + AU2_(0x378D8723)); }
+ AF2 APrxMedLinearToPQ(AF2 a) { AF2 b = APrxLoLinearToPQ(a); AF2 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
+ AF2 APrxHighLinearToPQ(AF2 a) { AF2 root = sqrt(a); root = sqrt(root); return sqrt(root); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF3 APrxPQToGamma2(AF3 a) { return Quart(a); }
+ AF3 APrxPQToLinear(AF3 a) { return Oct(a); }
+ AF3 APrxLoGamma2ToPQ(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3((bits >> AU3_(2)) + AU3_(0x2F9A4E46)); }
+ AF3 APrxMedGamma2ToPQ(AF3 a) { AF3 b = APrxLoGamma2ToPQ(a); AF3 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
+ AF3 APrxHighGamma2ToPQ(AF3 a) { AF3 root = sqrt(a); return sqrt(root); }
+ AF3 APrxLoLinearToPQ(AF3 a) { AU3 bits = AU3_AF3(a); return AF3_AU3((bits >> AU3_(3)) + AU3_(0x378D8723)); }
+ AF3 APrxMedLinearToPQ(AF3 a) { AF3 b = APrxLoLinearToPQ(a); AF3 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
+ AF3 APrxHighLinearToPQ(AF3 a) { AF3 root = sqrt(a); root = sqrt(root); return sqrt(root); }
+ //------------------------------------------------------------------------------------------------------------------------------
+ AF4 APrxPQToGamma2(AF4 a) { return Quart(a); }
+ AF4 APrxPQToLinear(AF4 a) { return Oct(a); }
+ AF4 APrxLoGamma2ToPQ(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4((bits >> AU4_(2)) + AU4_(0x2F9A4E46)); }
+ AF4 APrxMedGamma2ToPQ(AF4 a) { AF4 b = APrxLoGamma2ToPQ(a); AF4 b4 = Quart(b); return b - b * (b4 - a) / (AF1_(4.0) * b4); }
+ AF4 APrxHighGamma2ToPQ(AF4 a) { AF4 root = sqrt(a); return sqrt(root); }
+ AF4 APrxLoLinearToPQ(AF4 a) { AU4 bits = AU4_AF4(a); return AF4_AU4((bits >> AU4_(3)) + AU4_(0x378D8723)); }
+ AF4 APrxMedLinearToPQ(AF4 a) { AF4 b = APrxLoLinearToPQ(a); AF4 b8 = Oct(b); return b - b * (b8 - a) / (AF1_(8.0) * b8); }
+ AF4 APrxHighLinearToPQ(AF4 a) { AF4 root = sqrt(a); root = sqrt(root); return sqrt(root); }
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// PARABOLIC SIN & COS
+//------------------------------------------------------------------------------------------------------------------------------
+// Approximate answers to transcendental questions.
+//------------------------------------------------------------------------------------------------------------------------------
+//==============================================================================================================================
+ #if 1
+ // Valid input range is {-1 to 1} representing {0 to 2 pi}.
+ // Output range is {-1/4 to 1/4} representing {-1 to 1}.
+ AF1 APSinF1(AF1 x) { return x * abs(x) - x; } // MAD.
+ AF2 APSinF2(AF2 x) { return x * abs(x) - x; }
+ // Cosine: offset the scaled input by 0.75, pass it through AFractF1, rescale with 2x-1, then reuse the sine parabola.
+ AF1 APCosF1(AF1 x) // 3x MAD, FRACT
+ {
+  AF1 wrapped = AFractF1(x * AF1_(0.5) + AF1_(0.75));
+  return APSinF1(wrapped * AF1_(2.0) - AF1_(1.0));
+ }
+ AF2 APCosF2(AF2 x)
+ {
+  AF2 wrapped = AFractF2(x * AF2_(0.5) + AF2_(0.75));
+  return APSinF2(wrapped * AF2_(2.0) - AF2_(1.0));
+ }
+ // Returns {sin,cos} of one input: the cosine argument is derived as in APCosF1, then both go through one APSinF2 call.
+ AF2 APSinCosF1(AF1 x)
+ {
+  AF1 cosArg = AFractF1(x * AF1_(0.5) + AF1_(0.75));
+  cosArg = cosArg * AF1_(2.0) - AF1_(1.0);
+  return APSinF2(AF2(x, cosArg));
+ }
+ #endif
+//------------------------------------------------------------------------------------------------------------------------------
+ #ifdef A_HALF
+ // For a packed {sin,cos} pair,
+ // - Native takes 16 clocks and 4 issue slots (no packed transcendentals).
+ // - Parabolic takes 8 clocks and 8 issue slots (only fract is non-packed).
+ // Half-precision forms of the parabolic sin/cos; same formulas as the AF* versions.
+ AH1 APSinH1(AH1 x) { return x * abs(x) - x; }
+ AH2 APSinH2(AH2 x) { return x * abs(x) - x; } // AND,FMA
+ AH1 APCosH1(AH1 x)
+ {
+  AH1 wrapped = AFractH1(x * AH1_(0.5) + AH1_(0.75));
+  return APSinH1(wrapped * AH1_(2.0) - AH1_(1.0));
+ }
+ AH2 APCosH2(AH2 x) // 3x FMA, 2xFRACT, AND
+ {
+  AH2 wrapped = AFractH2(x * AH2_(0.5) + AH2_(0.75));
+  return APSinH2(wrapped * AH2_(2.0) - AH2_(1.0));
+ }
+ AH2 APSinCosH1(AH1 x)
+ {
+  AH1 cosArg = AFractH1(x * AH1_(0.5) + AH1_(0.75));
+  cosArg = cosArg * AH1_(2.0) - AH1_(1.0);
+  return APSinH2(AH2(x, cosArg));
+ }
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// [ZOL] ZERO ONE LOGIC
+//------------------------------------------------------------------------------------------------------------------------------
+// Conditional free logic designed for easy 16-bit packing, and backwards porting to 32-bit.
+//------------------------------------------------------------------------------------------------------------------------------
+// 0 := false
+// 1 := true
+//------------------------------------------------------------------------------------------------------------------------------
+// AndNot(x,y)   -> !(x&y) .... One op.
+// AndOr(x,y,z)  -> (x&y)|z ... One op.
+// GtZero(x)     -> x>0.0 ..... One op.
+// Sel(x,y,z)    -> x?y:z ..... Two ops, has no precision loss.
+// Signed(x)     -> x<0.0 ..... One op.
+// ZeroPass(x,y) -> x?0:y ..... Two ops, 'y' is a pass through safe for aliasing as integer.
+//------------------------------------------------------------------------------------------------------------------------------
+// OPTIMIZATION NOTES
+// ==================
+// - On Vega to use 2 constants in a packed op, pass in as one AW2 or one AH2 'k.xy' and use as 'k.xx' and 'k.yy'.
+//   For example 'a.xy*k.xx+k.yy'.
+//==============================================================================================================================
+ #if 1
+ // Integer zero/one AND: with operands restricted to {0,1}, min() is the logical AND.
+ AU1 AZolAndU1(AU1 x, AU1 y) { return min(x, y); }
+ AU2 AZolAndU2(AU2 x, AU2 y) { return min(x, y); }
+ AU3 AZolAndU3(AU3 x, AU3 y) { return min(x, y); }
+ AU4 AZolAndU4(AU4 x, AU4 y) { return min(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Integer zero/one NOT: flip the low bit.
+ AU1 AZolNotU1(AU1 x) { return x ^ AU1_(1); }
+ AU2 AZolNotU2(AU2 x) { return x ^ AU2_(1); }
+ AU3 AZolNotU3(AU3 x) { return x ^ AU3_(1); }
+ AU4 AZolNotU4(AU4 x) { return x ^ AU4_(1); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Integer zero/one OR: with operands restricted to {0,1}, max() is the logical OR.
+ AU1 AZolOrU1(AU1 x, AU1 y) { return max(x, y); }
+ AU2 AZolOrU2(AU2 x, AU2 y) { return max(x, y); }
+ AU3 AZolOrU3(AU3 x, AU3 y) { return max(x, y); }
+ AU4 AZolOrU4(AU4 x, AU4 y) { return max(x, y); }
+//==============================================================================================================================
+ // Float zero/one -> integer zero/one via a value conversion (constructor cast).
+ AU1 AZolF1ToU1(AF1 x) { return AU1(x); }
+ AU2 AZolF2ToU2(AF2 x) { return AU2(x); }
+ AU3 AZolF3ToU3(AF3 x) { return AU3(x); }
+ AU4 AZolF4ToU4(AF4 x) { return AU4(x); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // 2 ops, denormals don't work in 32-bit on PC (and if they are enabled, OMOD is disabled).
+ // Float zero/one -> inverted integer zero/one: compute 1-x in float, then convert.
+ AU1 AZolNotF1ToU1(AF1 x) { return AU1(AF1_(1.0) - x); }
+ AU2 AZolNotF2ToU2(AF2 x) { return AU2(AF2_(1.0) - x); }
+ AU3 AZolNotF3ToU3(AF3 x) { return AU3(AF3_(1.0) - x); }
+ AU4 AZolNotF4ToU4(AF4 x) { return AU4(AF4_(1.0) - x); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Integer zero/one -> float zero/one via a value conversion.
+ AF1 AZolU1ToF1(AU1 x) { return AF1(x); }
+ AF2 AZolU2ToF2(AU2 x) { return AF2(x); }
+ AF3 AZolU3ToF3(AU3 x) { return AF3(x); }
+ AF4 AZolU4ToF4(AU4 x) { return AF4(x); }
+//==============================================================================================================================
+ // Float zero/one AND.
+ AF1 AZolAndF1(AF1 x, AF1 y) { return min(x, y); }
+ AF2 AZolAndF2(AF2 x, AF2 y) { return min(x, y); }
+ AF3 AZolAndF3(AF3 x, AF3 y) { return min(x, y); }
+ AF4 AZolAndF4(AF4 x, AF4 y) { return min(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndNot(x,y) -> !(x&y), computed as 1 - x*y.
+ // These carry the 'AZol' prefix used by every other zero/one-logic function in this section.
+ AF1 AZolAndNotF1(AF1 x, AF1 y) { return (-x) * y + AF1_(1.0); }
+ AF2 AZolAndNotF2(AF2 x, AF2 y) { return (-x) * y + AF2_(1.0); }
+ AF3 AZolAndNotF3(AF3 x, AF3 y) { return (-x) * y + AF3_(1.0); }
+ AF4 AZolAndNotF4(AF4 x, AF4 y) { return (-x) * y + AF4_(1.0); }
+ // Original misspelled names ('ASol'), kept as forwarding aliases so existing callers still compile.
+ AF1 ASolAndNotF1(AF1 x, AF1 y) { return AZolAndNotF1(x, y); }
+ AF2 ASolAndNotF2(AF2 x, AF2 y) { return AZolAndNotF2(x, y); }
+ AF3 ASolAndNotF3(AF3 x, AF3 y) { return AZolAndNotF3(x, y); }
+ AF4 ASolAndNotF4(AF4 x, AF4 y) { return AZolAndNotF4(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndOr(x,y,z) -> (x&y)|z, computed as saturate(x*y+z).
+ AF1 AZolAndOrF1(AF1 x, AF1 y, AF1 z) { return ASatF1(x * y + z); }
+ AF2 AZolAndOrF2(AF2 x, AF2 y, AF2 z) { return ASatF2(x * y + z); }
+ AF3 AZolAndOrF3(AF3 x, AF3 y, AF3 z) { return ASatF3(x * y + z); }
+ AF4 AZolAndOrF4(AF4 x, AF4 y, AF4 z) { return ASatF4(x * y + z); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // GtZero(x) -> x>0.0: multiply by A_INFP_F and saturate.
+ AF1 AZolGtZeroF1(AF1 x) { return ASatF1(x * AF1_(A_INFP_F)); }
+ AF2 AZolGtZeroF2(AF2 x) { return ASatF2(x * AF2_(A_INFP_F)); }
+ AF3 AZolGtZeroF3(AF3 x) { return ASatF3(x * AF3_(A_INFP_F)); }
+ AF4 AZolGtZeroF4(AF4 x) { return ASatF4(x * AF4_(A_INFP_F)); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Float zero/one NOT: 1 - x.
+ AF1 AZolNotF1(AF1 x) { return AF1_(1.0) - x; }
+ AF2 AZolNotF2(AF2 x) { return AF2_(1.0) - x; }
+ AF3 AZolNotF3(AF3 x) { return AF3_(1.0) - x; }
+ AF4 AZolNotF4(AF4 x) { return AF4_(1.0) - x; }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Float zero/one OR.
+ AF1 AZolOrF1(AF1 x, AF1 y) { return max(x, y); }
+ AF2 AZolOrF2(AF2 x, AF2 y) { return max(x, y); }
+ AF3 AZolOrF3(AF3 x, AF3 y) { return max(x, y); }
+ AF4 AZolOrF4(AF4 x, AF4 y) { return max(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sel(x,y,z) -> x?y:z as two multiply-adds: first z*(1-x), then x*y added on top.
+ AF1 AZolSelF1(AF1 x, AF1 y, AF1 z) { AF1 keepZ = (-x) * z + z; return x * y + keepZ; }
+ AF2 AZolSelF2(AF2 x, AF2 y, AF2 z) { AF2 keepZ = (-x) * z + z; return x * y + keepZ; }
+ AF3 AZolSelF3(AF3 x, AF3 y, AF3 z) { AF3 keepZ = (-x) * z + z; return x * y + keepZ; }
+ AF4 AZolSelF4(AF4 x, AF4 y, AF4 z) { AF4 keepZ = (-x) * z + z; return x * y + keepZ; }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed(x) -> x<0.0: multiply by A_INFN_F and saturate.
+ AF1 AZolSignedF1(AF1 x) { return ASatF1(x * AF1_(A_INFN_F)); }
+ AF2 AZolSignedF2(AF2 x) { return ASatF2(x * AF2_(A_INFN_F)); }
+ AF3 AZolSignedF3(AF3 x) { return ASatF3(x * AF3_(A_INFN_F)); }
+ AF4 AZolSignedF4(AF4 x) { return ASatF4(x * AF4_(A_INFN_F)); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // ZeroPass(x,y) -> x?0:y, done on the integer-aliased values so 'y' passes through unmodified.
+ AF1 AZolZeroPassF1(AF1 x, AF1 y) { return AF1_AU1((AU1_AF1(x) != AU1_(0)) ? AU1_(0) : AU1_AF1(y)); }
+ AF2 AZolZeroPassF2(AF2 x, AF2 y) { return AF2_AU2((AU2_AF2(x) != AU2_(0)) ? AU2_(0) : AU2_AF2(y)); }
+ AF3 AZolZeroPassF3(AF3 x, AF3 y) { return AF3_AU3((AU3_AF3(x) != AU3_(0)) ? AU3_(0) : AU3_AF3(y)); }
+ AF4 AZolZeroPassF4(AF4 x, AF4 y) { return AF4_AU4((AU4_AF4(x) != AU4_(0)) ? AU4_(0) : AU4_AF4(y)); }
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+ // 16-bit integer zero/one AND.
+ AW1 AZolAndW1(AW1 x, AW1 y) { return min(x, y); }
+ AW2 AZolAndW2(AW2 x, AW2 y) { return min(x, y); }
+ AW3 AZolAndW3(AW3 x, AW3 y) { return min(x, y); }
+ AW4 AZolAndW4(AW4 x, AW4 y) { return min(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // 16-bit integer zero/one NOT: flip the low bit.
+ AW1 AZolNotW1(AW1 x) { return x ^ AW1_(1); }
+ AW2 AZolNotW2(AW2 x) { return x ^ AW2_(1); }
+ AW3 AZolNotW3(AW3 x) { return x ^ AW3_(1); }
+ AW4 AZolNotW4(AW4 x) { return x ^ AW4_(1); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // 16-bit integer zero/one OR.
+ AW1 AZolOrW1(AW1 x, AW1 y) { return max(x, y); }
+ AW2 AZolOrW2(AW2 x, AW2 y) { return max(x, y); }
+ AW3 AZolOrW3(AW3 x, AW3 y) { return max(x, y); }
+ AW4 AZolOrW4(AW4 x, AW4 y) { return max(x, y); }
+//==============================================================================================================================
+ // Uses denormal trick.
+ // The half input is multiplied by the half value aliased from integer 1, and the product is aliased back to integer.
+ AW1 AZolH1ToW1(AH1 x) { return AW1_AH1(x * AH1_AW1(AW1_(1))); }
+ AW2 AZolH2ToW2(AH2 x) { return AW2_AH2(x * AH2_AW2(AW2_(1))); }
+ AW3 AZolH3ToW3(AH3 x) { return AW3_AH3(x * AH3_AW3(AW3_(1))); }
+ AW4 AZolH4ToW4(AH4 x) { return AW4_AH4(x * AH4_AW4(AW4_(1))); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // AMD arch lacks a packed conversion opcode.
+ // 16-bit integer zero/one -> half zero/one: integer-multiply by the aliased pattern of half 1.0, then alias back to half.
+ AH1 AZolW1ToH1(AW1 x) { return AH1_AW1(x * AW1_AH1(AH1_(1.0))); }
+ AH2 AZolW2ToH2(AW2 x) { return AH2_AW2(x * AW2_AH2(AH2_(1.0))); }
+ AH3 AZolW3ToH3(AW3 x) { return AH3_AW3(x * AW3_AH3(AH3_(1.0))); }
+ AH4 AZolW4ToH4(AW4 x) { return AH4_AW4(x * AW4_AH4(AH4_(1.0))); }
+ // Original names for the 3- and 4-component forms carried the wrong width digit; kept as forwarding aliases.
+ AH3 AZolW1ToH3(AW3 x) { return AZolW3ToH3(x); }
+ AH4 AZolW2ToH4(AW4 x) { return AZolW4ToH4(x); }
+//==============================================================================================================================
+ // Half zero/one AND.
+ AH1 AZolAndH1(AH1 x, AH1 y) { return min(x, y); }
+ AH2 AZolAndH2(AH2 x, AH2 y) { return min(x, y); }
+ AH3 AZolAndH3(AH3 x, AH3 y) { return min(x, y); }
+ AH4 AZolAndH4(AH4 x, AH4 y) { return min(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndNot(x,y) -> !(x&y), computed as 1 - x*y. Named with the section's 'AZol' prefix.
+ AH1 AZolAndNotH1(AH1 x, AH1 y) { return (-x) * y + AH1_(1.0); }
+ AH2 AZolAndNotH2(AH2 x, AH2 y) { return (-x) * y + AH2_(1.0); }
+ AH3 AZolAndNotH3(AH3 x, AH3 y) { return (-x) * y + AH3_(1.0); }
+ AH4 AZolAndNotH4(AH4 x, AH4 y) { return (-x) * y + AH4_(1.0); }
+ // Original misspelled names ('ASol'), kept as forwarding aliases so existing callers still compile.
+ AH1 ASolAndNotH1(AH1 x, AH1 y) { return AZolAndNotH1(x, y); }
+ AH2 ASolAndNotH2(AH2 x, AH2 y) { return AZolAndNotH2(x, y); }
+ AH3 ASolAndNotH3(AH3 x, AH3 y) { return AZolAndNotH3(x, y); }
+ AH4 ASolAndNotH4(AH4 x, AH4 y) { return AZolAndNotH4(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // AndOr(x,y,z) -> (x&y)|z, computed as saturate(x*y+z).
+ AH1 AZolAndOrH1(AH1 x, AH1 y, AH1 z) { return ASatH1(x * y + z); }
+ AH2 AZolAndOrH2(AH2 x, AH2 y, AH2 z) { return ASatH2(x * y + z); }
+ AH3 AZolAndOrH3(AH3 x, AH3 y, AH3 z) { return ASatH3(x * y + z); }
+ AH4 AZolAndOrH4(AH4 x, AH4 y, AH4 z) { return ASatH4(x * y + z); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // GtZero(x) -> x>0.0: multiply by A_INFP_H and saturate.
+ AH1 AZolGtZeroH1(AH1 x) { return ASatH1(x * AH1_(A_INFP_H)); }
+ AH2 AZolGtZeroH2(AH2 x) { return ASatH2(x * AH2_(A_INFP_H)); }
+ AH3 AZolGtZeroH3(AH3 x) { return ASatH3(x * AH3_(A_INFP_H)); }
+ AH4 AZolGtZeroH4(AH4 x) { return ASatH4(x * AH4_(A_INFP_H)); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half zero/one NOT: 1 - x.
+ AH1 AZolNotH1(AH1 x) { return AH1_(1.0) - x; }
+ AH2 AZolNotH2(AH2 x) { return AH2_(1.0) - x; }
+ AH3 AZolNotH3(AH3 x) { return AH3_(1.0) - x; }
+ AH4 AZolNotH4(AH4 x) { return AH4_(1.0) - x; }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half zero/one OR.
+ AH1 AZolOrH1(AH1 x, AH1 y) { return max(x, y); }
+ AH2 AZolOrH2(AH2 x, AH2 y) { return max(x, y); }
+ AH3 AZolOrH3(AH3 x, AH3 y) { return max(x, y); }
+ AH4 AZolOrH4(AH4 x, AH4 y) { return max(x, y); }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Sel(x,y,z) -> x?y:z as two multiply-adds: first z*(1-x), then x*y added on top.
+ AH1 AZolSelH1(AH1 x, AH1 y, AH1 z) { AH1 keepZ = (-x) * z + z; return x * y + keepZ; }
+ AH2 AZolSelH2(AH2 x, AH2 y, AH2 z) { AH2 keepZ = (-x) * z + z; return x * y + keepZ; }
+ AH3 AZolSelH3(AH3 x, AH3 y, AH3 z) { AH3 keepZ = (-x) * z + z; return x * y + keepZ; }
+ AH4 AZolSelH4(AH4 x, AH4 y, AH4 z) { AH4 keepZ = (-x) * z + z; return x * y + keepZ; }
+//------------------------------------------------------------------------------------------------------------------------------
+ // Signed(x) -> x<0.0: multiply by A_INFN_H and saturate.
+ AH1 AZolSignedH1(AH1 x) { return ASatH1(x * AH1_(A_INFN_H)); }
+ AH2 AZolSignedH2(AH2 x) { return ASatH2(x * AH2_(A_INFN_H)); }
+ AH3 AZolSignedH3(AH3 x) { return ASatH3(x * AH3_(A_INFN_H)); }
+ AH4 AZolSignedH4(AH4 x) { return ASatH4(x * AH4_(A_INFN_H)); }
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// COLOR CONVERSIONS
+//------------------------------------------------------------------------------------------------------------------------------
+// These are all linear to/from some other space (where 'linear' has been shortened out of the function name).
+// So 'ToGamma' is 'LinearToGamma', and 'FromGamma' is 'LinearFromGamma'.
+// These are branch free implementations.
+// The AToSrgbF1() function is useful for stores for compute shaders for GPUs without hardware linear->sRGB store conversion.
+//------------------------------------------------------------------------------------------------------------------------------
+// TRANSFER FUNCTIONS
+// ==================
+// 709 ..... Rec709 used for some HDTVs
+// Gamma ... Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native
+// Pq ...... PQ native for HDR10
+// Srgb .... The sRGB output, typical of PC displays, useful for 10-bit output, or storing to 8-bit UNORM without SRGB type
+// Two ..... Gamma 2.0, fastest conversion (useful for intermediate pass approximations)
+// Three ... Gamma 3.0, less fast, but good for HDR.
+//------------------------------------------------------------------------------------------------------------------------------
+// KEEPING TO SPEC
+// ===============
+// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+//  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+//  (b.) For 8-bit  709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+// Also there is a slight step in the transition regions.
+// Precision of the coefficients in the spec being the likely cause.
+// Main usage case of the sRGB code is to do the linear->sRGB conversion in a compute shader before store.
+// This is to work around lack of hardware (typically only ROP does the conversion for free).
+// To "correct" the linear segment, would be to introduce error, because hardware decode of sRGB->linear is fixed (and free).
+// So this header keeps with the spec.
+// For linear->sRGB transforms, the linear segment in some respects reduces error, because rounding in that region is linear.
+// Rounding in the curved region in hardware (and fast software code) introduces error due to rounding in non-linear.
+//------------------------------------------------------------------------------------------------------------------------------
+// FOR PQ
+// ======
+// Both input and output are {0.0-1.0}, where output 1.0 represents 10000.0 cd/m^2.
+// All constants are only specified to FP32 precision.
+// External PQ source reference,
+//  - https://github.com/ampas/aces-dev/blob/master/transforms/ctl/utilities/ACESlib.Utilities_Color.a1.0.1.ctl
+//------------------------------------------------------------------------------------------------------------------------------
+// PACKED VERSIONS
+// ===============
+// These are the A*H2() functions.
+// There are no PQ functions as FP16 seemed to not have enough precision for the conversion.
+// The remaining functions are "good enough" for 8-bit, and maybe 10-bit if not concerned about a few 1-bit errors.
+// Precision is lowest in the 709 conversion, higher in sRGB, higher still in Two and Gamma (when using 2.2 at least).
+//------------------------------------------------------------------------------------------------------------------------------
+// NOTES
+// =====
+// Could be faster for PQ conversions to be in ALU or a texture lookup depending on usage case.
+//==============================================================================================================================
+ #if 1
+ // Linear -> Rec.709. The result is the median of three values: the encoded breakpoint (0.018*4.5), the linear
+ // segment (c*4.5) and the power segment (1.099*c^0.45-0.099).
+ // The median is written out with min/max instead of clamp(breakpoint,linear,curve): near zero the linear segment
+ // is larger than the power segment, and clamp() is undefined when minVal>maxVal (the common min(max()) form then
+ // returns the negative curve offset for c=0).
+ AF1 ATo709F1(AF1 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+  AF1 lin=c*j.y;AF1 crv=pow(c,j.z)*k.x+k.y;
+  return max(min(j.x,lin),min(max(j.x,lin),crv));}
+ AF2 ATo709F2(AF2 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+  AF2 lin=c*j.yy;AF2 crv=pow(c,j.zz)*k.xx+k.yy;
+  return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
+ AF3 ATo709F3(AF3 c){AF3 j=AF3(0.018*4.5,4.5,0.45);AF2 k=AF2(1.099,-0.099);
+  AF3 lin=c*j.yyy;AF3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
+  return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Note 'rcpX' is '1/x', where the 'x' is what would be used in AFromGamma().
+ AF1 AToGammaF1(AF1 c,AF1 rcpX){return pow(c,AF1_(rcpX));}
+ AF2 AToGammaF2(AF2 c,AF1 rcpX){return pow(c,AF2_(rcpX));}
+ AF3 AToGammaF3(AF3 c,AF1 rcpX){return pow(c,AF3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> PQ. Input and output are {0.0-1.0} (see the 'FOR PQ' notes in this section's header).
+ AF1 AToPqF1(AF1 x){AF1 p=pow(x,AF1_(0.159302));
+  return pow((AF1_(0.835938)+AF1_(18.8516)*p)/(AF1_(1.0)+AF1_(18.6875)*p),AF1_(78.8438));}
+ AF2 AToPqF2(AF2 x){AF2 p=pow(x,AF2_(0.159302));
+  return pow((AF2_(0.835938)+AF2_(18.8516)*p)/(AF2_(1.0)+AF2_(18.6875)*p),AF2_(78.8438));}
+ AF3 AToPqF3(AF3 x){AF3 p=pow(x,AF3_(0.159302));
+  return pow((AF3_(0.835938)+AF3_(18.8516)*p)/(AF3_(1.0)+AF3_(18.6875)*p),AF3_(78.8438));}
+ // The 2- and 3-component forms were originally published as overloads of 'AToPqF1'; kept so existing callers compile.
+ AF2 AToPqF1(AF2 x){return AToPqF2(x);}
+ AF3 AToPqF1(AF3 x){return AToPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> sRGB. The result is the median of the encoded breakpoint (0.0031308*12.92), the linear segment
+ // (c*12.92) and the power segment (1.055*c^(1/2.4)-0.055).
+ // The median is written out with min/max instead of clamp(): near zero the linear segment is larger than the
+ // power segment, and clamp() is undefined when minVal>maxVal.
+ AF1 AToSrgbF1(AF1 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+  AF1 lin=c*j.y;AF1 crv=pow(c,j.z)*k.x+k.y;
+  return max(min(j.x,lin),min(max(j.x,lin),crv));}
+ AF2 AToSrgbF2(AF2 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+  AF2 lin=c*j.yy;AF2 crv=pow(c,j.zz)*k.xx+k.yy;
+  return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
+ AF3 AToSrgbF3(AF3 c){AF3 j=AF3(0.0031308*12.92,12.92,1.0/2.4);AF2 k=AF2(1.055,-0.055);
+  AF3 lin=c*j.yyy;AF3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
+  return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AToTwoF1(AF1 c){return sqrt(c);}
+ AF2 AToTwoF2(AF2 c){return sqrt(c);}
+ AF3 AToTwoF3(AF3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AToThreeF1(AF1 c){return pow(c,AF1_(1.0/3.0));}
+ AF2 AToThreeF2(AF2 c){return pow(c,AF2_(1.0/3.0));}
+ AF3 AToThreeF3(AF3 c){return pow(c,AF3_(1.0/3.0));}
+ #endif
+//==============================================================================================================================
+ #if 1
+ // Unfortunately median won't work here.
+ // Rec.709 -> linear. AZolSel picks the linear segment (c/4.5) when the encoded input is below the breakpoint and
+ // the power segment otherwise. 'c' is an encoded value, so j.x holds the encoded breakpoint 0.081 (=0.018*4.5,
+ // the same value ATo709 uses); the previous 0.081/4.5 was the linear-space breakpoint.
+ AF1 AFrom709F1(AF1 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+  return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AF2 AFrom709F2(AF2 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AF3 AFrom709F3(AF3 c){AF3 j=AF3(0.081,1.0/4.5,1.0/0.45);AF2 k=AF2(1.0/1.099,0.099/1.099);
+  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AFromGammaF1(AF1 c,AF1 x){return pow(c,AF1_(x));}
+ AF2 AFromGammaF2(AF2 c,AF1 x){return pow(c,AF2_(x));}
+ AF3 AFromGammaF3(AF3 c,AF1 x){return pow(c,AF3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // PQ -> linear. The numerator is saturated before the divide.
+ AF1 AFromPqF1(AF1 x){AF1 p=pow(x,AF1_(0.0126833));
+  return pow(ASatF1(p-AF1_(0.835938))/(AF1_(18.8516)-AF1_(18.6875)*p),AF1_(6.27739));}
+ AF2 AFromPqF2(AF2 x){AF2 p=pow(x,AF2_(0.0126833));
+  return pow(ASatF2(p-AF2_(0.835938))/(AF2_(18.8516)-AF2_(18.6875)*p),AF2_(6.27739));}
+ AF3 AFromPqF3(AF3 x){AF3 p=pow(x,AF3_(0.0126833));
+  return pow(ASatF3(p-AF3_(0.835938))/(AF3_(18.8516)-AF3_(18.6875)*p),AF3_(6.27739));}
+ // The 2- and 3-component forms were originally published as overloads of 'AFromPqF1'; kept so existing callers compile.
+ AF2 AFromPqF1(AF2 x){return AFromPqF2(x);}
+ AF3 AFromPqF1(AF3 x){return AFromPqF3(x);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Unfortunately median won't work here.
+ // sRGB -> linear. AZolSel picks the linear segment (c/12.92) when the encoded input is below the breakpoint and
+ // the power segment otherwise. 'c' is an encoded value, so j.x holds the encoded breakpoint 0.04045; the
+ // previous 0.04045/12.92 was the linear-space breakpoint.
+ AF1 AFromSrgbF1(AF1 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+  return AZolSelF1(AZolSignedF1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AF2 AFromSrgbF2(AF2 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+  return AZolSelF2(AZolSignedF2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AF3 AFromSrgbF3(AF3 c){AF3 j=AF3(0.04045,1.0/12.92,2.4);AF2 k=AF2(1.0/1.055,0.055/1.055);
+  return AZolSelF3(AZolSignedF3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AFromTwoF1(AF1 c){return c*c;}
+ AF2 AFromTwoF2(AF2 c){return c*c;}
+ AF3 AFromTwoF3(AF3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AF1 AFromThreeF1(AF1 c){return c*c*c;}
+ AF2 AFromThreeF2(AF2 c){return c*c*c;}
+ AF3 AFromThreeF3(AF3 c){return c*c*c;}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+ // Linear -> Rec.709 (half). The result is the median of the encoded breakpoint, the linear segment and the power
+ // segment, written out with min/max because clamp() is undefined when minVal>maxVal (which happens near zero).
+ AH1 ATo709H1(AH1 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+  AH1 lin=c*j.y;AH1 crv=pow(c,j.z)*k.x+k.y;
+  return max(min(j.x,lin),min(max(j.x,lin),crv));}
+ AH2 ATo709H2(AH2 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+  AH2 lin=c*j.yy;AH2 crv=pow(c,j.zz)*k.xx+k.yy;
+  return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
+ AH3 ATo709H3(AH3 c){AH3 j=AH3(0.018*4.5,4.5,0.45);AH2 k=AH2(1.099,-0.099);
+  AH3 lin=c*j.yyy;AH3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
+  return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AH1 AToGammaH1(AH1 c,AH1 rcpX){return pow(c,AH1_(rcpX));}
+ AH2 AToGammaH2(AH2 c,AH1 rcpX){return pow(c,AH2_(rcpX));}
+ AH3 AToGammaH3(AH3 c,AH1 rcpX){return pow(c,AH3_(rcpX));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Linear -> sRGB (half). The result is the median of the encoded breakpoint, the linear segment and the power
+ // segment, written out with min/max because clamp() is undefined when minVal>maxVal (which happens near zero).
+ AH1 AToSrgbH1(AH1 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+  AH1 lin=c*j.y;AH1 crv=pow(c,j.z)*k.x+k.y;
+  return max(min(j.x,lin),min(max(j.x,lin),crv));}
+ AH2 AToSrgbH2(AH2 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+  AH2 lin=c*j.yy;AH2 crv=pow(c,j.zz)*k.xx+k.yy;
+  return max(min(j.xx,lin),min(max(j.xx,lin),crv));}
+ AH3 AToSrgbH3(AH3 c){AH3 j=AH3(0.0031308*12.92,12.92,1.0/2.4);AH2 k=AH2(1.055,-0.055);
+  AH3 lin=c*j.yyy;AH3 crv=pow(c,j.zzz)*k.xxx+k.yyy;
+  return max(min(j.xxx,lin),min(max(j.xxx,lin),crv));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AH1 AToTwoH1(AH1 c){return sqrt(c);}
+ AH2 AToTwoH2(AH2 c){return sqrt(c);}
+ AH3 AToTwoH3(AH3 c){return sqrt(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ // Half forms of ToThree, named with the 'H' suffix used by the other half-precision functions.
+ AH1 AToThreeH1(AH1 c){return pow(c,AH1_(1.0/3.0));}
+ AH2 AToThreeH2(AH2 c){return pow(c,AH2_(1.0/3.0));}
+ AH3 AToThreeH3(AH3 c){return pow(c,AH3_(1.0/3.0));}
+ // Originally published with an 'F' suffix (as half overloads of the float names); kept so existing callers compile.
+ AH1 AToThreeF1(AH1 c){return AToThreeH1(c);}
+ AH2 AToThreeF2(AH2 c){return AToThreeH2(c);}
+ AH3 AToThreeF3(AH3 c){return AToThreeH3(c);}
+ #endif
+//==============================================================================================================================
+ #ifdef A_HALF
+ // Rec.709 -> linear (half). 'c' is an encoded value, so the selector compares against the encoded breakpoint
+ // 0.081; the previous 0.081/4.5 was the linear-space breakpoint.
+ AH1 AFrom709H1(AH1 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+  return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AH2 AFrom709H2(AH2 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AH3 AFrom709H3(AH3 c){AH3 j=AH3(0.081,1.0/4.5,1.0/0.45);AH2 k=AH2(1.0/1.099,0.099/1.099);
+  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+//------------------------------------------------------------------------------------------------------------------------------
+ AH1 AFromGammaH1(AH1 c,AH1 x){return pow(c,AH1_(x));}
+ AH2 AFromGammaH2(AH2 c,AH1 x){return pow(c,AH2_(x));}
+ AH3 AFromGammaH3(AH3 c,AH1 x){return pow(c,AH3_(x));}
+//------------------------------------------------------------------------------------------------------------------------------
+ // sRGB -> linear (half). 'c' is an encoded value, so the selector compares against the encoded breakpoint
+ // 0.04045; the previous 0.04045/12.92 was the linear-space breakpoint.
+ AH1 AFromSrgbH1(AH1 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+  return AZolSelH1(AZolSignedH1(c-j.x ),c*j.y ,pow(c*k.x +k.y ,j.z ));}
+ AH2 AFromSrgbH2(AH2 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+  return AZolSelH2(AZolSignedH2(c-j.xx ),c*j.yy ,pow(c*k.xx +k.yy ,j.zz ));}
+ AH3 AFromSrgbH3(AH3 c){AH3 j=AH3(0.04045,1.0/12.92,2.4);AH2 k=AH2(1.0/1.055,0.055/1.055);
+  return AZolSelH3(AZolSignedH3(c-j.xxx),c*j.yyy,pow(c*k.xxx+k.yyy,j.zzz));}
+ // Originally published under the misspelled names 'AHromSrgbF*'; kept as forwarding aliases.
+ AH1 AHromSrgbF1(AH1 c){return AFromSrgbH1(c);}
+ AH2 AHromSrgbF2(AH2 c){return AFromSrgbH2(c);}
+ AH3 AHromSrgbF3(AH3 c){return AFromSrgbH3(c);}
+//------------------------------------------------------------------------------------------------------------------------------
+ AH1 AFromTwoH1(AH1 c){return c*c;}
+ AH2 AFromTwoH2(AH2 c){return c*c;}
+ AH3 AFromTwoH3(AH3 c){return c*c;}
+//------------------------------------------------------------------------------------------------------------------------------
+ AH1 AFromThreeH1(AH1 c){return c*c*c;}
+ AH2 AFromThreeH2(AH2 c){return c*c*c;}
+ AH3 AFromThreeH3(AH3 c){return c*c*c;}
+ #endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+// CS REMAP
+//==============================================================================================================================
+ // Simple remap 64x1 to 8x8 with rotated 2x2 pixel quads in quad linear.
+ //  543210
+ //  ======
+ //  ..xxx.
+ //  yy...y
+ // Returns (x,y): x is the ABfe(a,1u,3u) field; y is the ABfe(a,3u,3u) field with 'a' merged in through ABfiM(..,1u).
+ AU2 ARmp8x8(AU1 a)
+ {
+  return AU2(
+   ABfe(a, 1u, 3u),
+   ABfiM(ABfe(a, 3u, 3u), a, 1u));
+ }
+//==============================================================================================================================
+ // More complex remap 64x1 to 8x8 which is necessary for 2D wave reductions.
+ //  543210
+ //  ======
+ //  .xx..x
+ //  y..yy.
+ // Details,
+ //  LANE TO 8x8 MAPPING
+ //  ===================
+ //  00 01 08 09 10 11 18 19
+ //  02 03 0a 0b 12 13 1a 1b
+ //  04 05 0c 0d 14 15 1c 1d
+ //  06 07 0e 0f 16 17 1e 1f
+ //  20 21 28 29 30 31 38 39
+ //  22 23 2a 2b 32 33 3a 3b
+ //  24 25 2c 2d 34 35 3c 3d
+ //  26 27 2e 2f 36 37 3e 3f
+ AU2 ARmpRed8x8(AU1 a)
+ {
+  return AU2(
+   ABfiM(ABfe(a, 2u, 3u), a, 1u),
+   ABfiM(ABfe(a, 3u, 3u), ABfe(a, 1u, 2u), 2u));
+ }
+//==============================================================================================================================
+ #ifdef A_HALF
+ // Same remaps as above, with the result constructed as AW2 instead of AU2.
+ AW2 ARmp8x8H(AU1 a)
+ {
+  return AW2(
+   ABfe(a, 1u, 3u),
+   ABfiM(ABfe(a, 3u, 3u), a, 1u));
+ }
+ AW2 ARmpRed8x8H(AU1 a)
+ {
+  return AW2(
+   ABfiM(ABfe(a, 2u, 3u), a, 1u),
+   ABfiM(ABfe(a, 3u, 3u), ABfe(a, 1u, 2u), 2u));
+ }
+ #endif
+#endif
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//_____________________________________________________________/\_______________________________________________________________
+//==============================================================================================================================
+//
+// REFERENCE
+//
+//------------------------------------------------------------------------------------------------------------------------------ +// IEEE FLOAT RULES +// ================ +// - saturate(NaN)=0, saturate(-INF)=0, saturate(+INF)=1 +// - {+/-}0 * {+/-}INF = NaN +// - -INF + (+INF) = NaN +// - {+/-}0 / {+/-}0 = NaN +// - {+/-}INF / {+/-}INF = NaN +// - a<(-0) := sqrt(a) = NaN (a=-0.0 won't NaN) +// - 0 == -0 +// - 4/0 = +INF +// - 4/-0 = -INF +// - 4+INF = +INF +// - 4-INF = -INF +// - 4*(+INF) = +INF +// - 4*(-INF) = -INF +// - -4*(+INF) = -INF +// - sqrt(+INF) = +INF +//------------------------------------------------------------------------------------------------------------------------------ +// FP16 ENCODING +// ============= +// fedcba9876543210 +// ---------------- +// ......mmmmmmmmmm 10-bit mantissa (encodes 11-bit 0.5 to 1.0 except for denormals) +// .eeeee.......... 5-bit exponent +// .00000.......... denormals +// .00001.......... -14 exponent +// .11110.......... 15 exponent +// .111110000000000 infinity +// .11111nnnnnnnnnn NaN with n!=0 +// s............... sign +//------------------------------------------------------------------------------------------------------------------------------ +// FP16/INT16 ALIASING DENORMAL +// ============================ +// 11-bit unsigned integers alias with half float denormal/normal values, +// 1 = 2^(-24) = 1/16777216 ....................... first denormal value +// 2 = 2^(-23) +// ... +// 1023 = 2^(-14)*(1-2^(-10)) = 2^(-14)*(1-1/1024) ... last denormal value +// 1024 = 2^(-14) = 1/16384 .......................... first normal value that still maps to integers +// 2047 .............................................. last normal value that still maps to integers +// Scaling limits, +// 2^15 = 32768 ...................................... 
largest power of 2 scaling +// Largest pow2 conversion mapping is at *32768, +// 1 : 2^(-9) = 1/512 +// 2 : 1/256 +// 4 : 1/128 +// 8 : 1/64 +// 16 : 1/32 +// 32 : 1/16 +// 64 : 1/8 +// 128 : 1/4 +// 256 : 1/2 +// 512 : 1 +// 1024 : 2 +// 2047 : a little less than 4 +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// GPU/CPU PORTABILITY +// +// +//------------------------------------------------------------------------------------------------------------------------------ +// This is the GPU implementation. +// See the CPU implementation for docs. 
+//============================================================================================================================== +#ifdef A_GPU + #define A_TRUE true + #define A_FALSE false + #define A_STATIC +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR ARGUMENT/RETURN/INITIALIZATION PORTABILITY +//============================================================================================================================== + #define retAD2 AD2 + #define retAD3 AD3 + #define retAD4 AD4 + #define retAF2 AF2 + #define retAF3 AF3 + #define retAF4 AF4 + #define retAL2 AL2 + #define retAL3 AL3 + #define retAL4 AL4 + #define retAU2 AU2 + #define retAU3 AU3 + #define retAU4 AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inAD2 in AD2 + #define inAD3 in AD3 + #define inAD4 in AD4 + #define inAF2 in AF2 + #define inAF3 in AF3 + #define inAF4 in AF4 + #define inAL2 in AL2 + #define inAL3 in AL3 + #define inAL4 in AL4 + #define inAU2 in AU2 + #define inAU3 in AU3 + #define inAU4 in AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define inoutAD2 inout AD2 + #define inoutAD3 inout AD3 + #define inoutAD4 inout AD4 + #define inoutAF2 inout AF2 + #define inoutAF3 inout AF3 + #define inoutAF4 inout AF4 + #define inoutAL2 inout AL2 + #define inoutAL3 inout AL3 + #define inoutAL4 inout AL4 + #define inoutAU2 inout AU2 + 
#define inoutAU3 inout AU3 + #define inoutAU4 inout AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define outAD2 out AD2 + #define outAD3 out AD3 + #define outAD4 out AD4 + #define outAF2 out AF2 + #define outAF3 out AF3 + #define outAF4 out AF4 + #define outAL2 out AL2 + #define outAL3 out AL3 + #define outAL4 out AL4 + #define outAU2 out AU2 + #define outAU3 out AU3 + #define outAU4 out AU4 +//------------------------------------------------------------------------------------------------------------------------------ + #define varAD2(x) AD2 x + #define varAD3(x) AD3 x + #define varAD4(x) AD4 x + #define varAF2(x) AF2 x + #define varAF3(x) AF3 x + #define varAF4(x) AF4 x + #define varAL2(x) AL2 x + #define varAL3(x) AL3 x + #define varAL4(x) AL4 x + #define varAU2(x) AU2 x + #define varAU3(x) AU3 x + #define varAU4(x) AU4 x +//------------------------------------------------------------------------------------------------------------------------------ + #define initAD2(x,y) AD2(x,y) + #define initAD3(x,y,z) AD3(x,y,z) + #define initAD4(x,y,z,w) AD4(x,y,z,w) + #define initAF2(x,y) AF2(x,y) + #define initAF3(x,y,z) AF3(x,y,z) + #define initAF4(x,y,z,w) AF4(x,y,z,w) + #define initAL2(x,y) AL2(x,y) + #define initAL3(x,y,z) AL3(x,y,z) + #define initAL4(x,y,z,w) AL4(x,y,z,w) + #define initAU2(x,y) AU2(x,y) + #define initAU3(x,y,z) AU3(x,y,z) + #define initAU4(x,y,z,w) AU4(x,y,z,w) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ 
+//============================================================================================================================== +// SCALAR RETURN OPS +//============================================================================================================================== + #define AAbsD1(a) abs(AD1(a)) + #define AAbsF1(a) abs(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ACosD1(a) cos(AD1(a)) + #define ACosF1(a) cos(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ADotD2(a,b) dot(AD2(a),AD2(b)) + #define ADotD3(a,b) dot(AD3(a),AD3(b)) + #define ADotD4(a,b) dot(AD4(a),AD4(b)) + #define ADotF2(a,b) dot(AF2(a),AF2(b)) + #define ADotF3(a,b) dot(AF3(a),AF3(b)) + #define ADotF4(a,b) dot(AF4(a),AF4(b)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AExp2D1(a) exp2(AD1(a)) + #define AExp2F1(a) exp2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AFloorD1(a) floor(AD1(a)) + #define AFloorF1(a) floor(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ALog2D1(a) log2(AD1(a)) + #define ALog2F1(a) log2(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMaxD1(a,b) max(a,b) + #define AMaxF1(a,b) max(a,b) + #define AMaxL1(a,b) max(a,b) + #define AMaxU1(a,b) max(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define AMinD1(a,b) min(a,b) + #define AMinF1(a,b) min(a,b) + #define AMinL1(a,b) min(a,b) + 
#define AMinU1(a,b) min(a,b) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASinD1(a) sin(AD1(a)) + #define ASinF1(a) sin(AF1(a)) +//------------------------------------------------------------------------------------------------------------------------------ + #define ASqrtD1(a) sqrt(AD1(a)) + #define ASqrtF1(a) sqrt(AF1(a)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// SCALAR RETURN OPS - DEPENDENT +//============================================================================================================================== + #define APowD1(a,b) pow(AD1(a),AD1(b)) // exponent is cast to AD1 so both pow() operands share one type + #define APowF1(a,b) pow(AF1(a),AF1(b)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// VECTOR OPS +//------------------------------------------------------------------------------------------------------------------------------ +// These are added as needed for production or prototyping, so not necessarily a complete set. 
+// They follow a convention of taking in a destination and also returning the destination value to increase utility. +//============================================================================================================================== + #ifdef A_DUBL + AD2 opAAbsD2(outAD2 d,inAD2 a){d=abs(a);return d;} + AD3 opAAbsD3(outAD3 d,inAD3 a){d=abs(a);return d;} + AD4 opAAbsD4(outAD4 d,inAD4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddD2(outAD2 d,inAD2 a,inAD2 b){d=a+b;return d;} + AD3 opAAddD3(outAD3 d,inAD3 a,inAD3 b){d=a+b;return d;} + AD4 opAAddD4(outAD4 d,inAD4 a,inAD4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAAddOneD2(outAD2 d,inAD2 a,AD1 b){d=a+AD2_(b);return d;} + AD3 opAAddOneD3(outAD3 d,inAD3 a,AD1 b){d=a+AD3_(b);return d;} + AD4 opAAddOneD4(outAD4 d,inAD4 a,AD1 b){d=a+AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opACpyD2(outAD2 d,inAD2 a){d=a;return d;} + AD3 opACpyD3(outAD3 d,inAD3 a){d=a;return d;} + AD4 opACpyD4(outAD4 d,inAD4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpD2(outAD2 d,inAD2 a,inAD2 b,inAD2 c){d=ALerpD2(a,b,c);return d;} + AD3 opALerpD3(outAD3 d,inAD3 a,inAD3 b,inAD3 c){d=ALerpD3(a,b,c);return d;} + AD4 opALerpD4(outAD4 d,inAD4 a,inAD4 b,inAD4 c){d=ALerpD4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opALerpOneD2(outAD2 d,inAD2 a,inAD2 b,AD1 c){d=ALerpD2(a,b,AD2_(c));return d;} + AD3 opALerpOneD3(outAD3 d,inAD3 a,inAD3 b,AD1 c){d=ALerpD3(a,b,AD3_(c));return d;} + AD4 
opALerpOneD4(outAD4 d,inAD4 a,inAD4 b,AD1 c){d=ALerpD4(a,b,AD4_(c));return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMaxD2(outAD2 d,inAD2 a,inAD2 b){d=max(a,b);return d;} + AD3 opAMaxD3(outAD3 d,inAD3 a,inAD3 b){d=max(a,b);return d;} + AD4 opAMaxD4(outAD4 d,inAD4 a,inAD4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMinD2(outAD2 d,inAD2 a,inAD2 b){d=min(a,b);return d;} + AD3 opAMinD3(outAD3 d,inAD3 a,inAD3 b){d=min(a,b);return d;} + AD4 opAMinD4(outAD4 d,inAD4 a,inAD4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulD2(outAD2 d,inAD2 a,inAD2 b){d=a*b;return d;} + AD3 opAMulD3(outAD3 d,inAD3 a,inAD3 b){d=a*b;return d;} + AD4 opAMulD4(outAD4 d,inAD4 a,inAD4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opAMulOneD2(outAD2 d,inAD2 a,AD1 b){d=a*AD2_(b);return d;} + AD3 opAMulOneD3(outAD3 d,inAD3 a,AD1 b){d=a*AD3_(b);return d;} + AD4 opAMulOneD4(outAD4 d,inAD4 a,AD1 b){d=a*AD4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opANegD2(outAD2 d,inAD2 a){d=-a;return d;} + AD3 opANegD3(outAD3 d,inAD3 a){d=-a;return d;} + AD4 opANegD4(outAD4 d,inAD4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AD2 opARcpD2(outAD2 d,inAD2 a){d=ARcpD2(a);return d;} + AD3 opARcpD3(outAD3 d,inAD3 a){d=ARcpD3(a);return d;} + AD4 opARcpD4(outAD4 d,inAD4 a){d=ARcpD4(a);return d;} + #endif 
+//============================================================================================================================== + AF2 opAAbsF2(outAF2 d,inAF2 a){d=abs(a);return d;} + AF3 opAAbsF3(outAF3 d,inAF3 a){d=abs(a);return d;} + AF4 opAAbsF4(outAF4 d,inAF4 a){d=abs(a);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddF2(outAF2 d,inAF2 a,inAF2 b){d=a+b;return d;} + AF3 opAAddF3(outAF3 d,inAF3 a,inAF3 b){d=a+b;return d;} + AF4 opAAddF4(outAF4 d,inAF4 a,inAF4 b){d=a+b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAAddOneF2(outAF2 d,inAF2 a,AF1 b){d=a+AF2_(b);return d;} + AF3 opAAddOneF3(outAF3 d,inAF3 a,AF1 b){d=a+AF3_(b);return d;} + AF4 opAAddOneF4(outAF4 d,inAF4 a,AF1 b){d=a+AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opACpyF2(outAF2 d,inAF2 a){d=a;return d;} + AF3 opACpyF3(outAF3 d,inAF3 a){d=a;return d;} + AF4 opACpyF4(outAF4 d,inAF4 a){d=a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpF2(outAF2 d,inAF2 a,inAF2 b,inAF2 c){d=ALerpF2(a,b,c);return d;} + AF3 opALerpF3(outAF3 d,inAF3 a,inAF3 b,inAF3 c){d=ALerpF3(a,b,c);return d;} + AF4 opALerpF4(outAF4 d,inAF4 a,inAF4 b,inAF4 c){d=ALerpF4(a,b,c);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opALerpOneF2(outAF2 d,inAF2 a,inAF2 b,AF1 c){d=ALerpF2(a,b,AF2_(c));return d;} + AF3 opALerpOneF3(outAF3 d,inAF3 a,inAF3 b,AF1 c){d=ALerpF3(a,b,AF3_(c));return d;} + AF4 opALerpOneF4(outAF4 d,inAF4 a,inAF4 b,AF1 c){d=ALerpF4(a,b,AF4_(c));return d;} 
+//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMaxF2(outAF2 d,inAF2 a,inAF2 b){d=max(a,b);return d;} + AF3 opAMaxF3(outAF3 d,inAF3 a,inAF3 b){d=max(a,b);return d;} + AF4 opAMaxF4(outAF4 d,inAF4 a,inAF4 b){d=max(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMinF2(outAF2 d,inAF2 a,inAF2 b){d=min(a,b);return d;} + AF3 opAMinF3(outAF3 d,inAF3 a,inAF3 b){d=min(a,b);return d;} + AF4 opAMinF4(outAF4 d,inAF4 a,inAF4 b){d=min(a,b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulF2(outAF2 d,inAF2 a,inAF2 b){d=a*b;return d;} + AF3 opAMulF3(outAF3 d,inAF3 a,inAF3 b){d=a*b;return d;} + AF4 opAMulF4(outAF4 d,inAF4 a,inAF4 b){d=a*b;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opAMulOneF2(outAF2 d,inAF2 a,AF1 b){d=a*AF2_(b);return d;} + AF3 opAMulOneF3(outAF3 d,inAF3 a,AF1 b){d=a*AF3_(b);return d;} + AF4 opAMulOneF4(outAF4 d,inAF4 a,AF1 b){d=a*AF4_(b);return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opANegF2(outAF2 d,inAF2 a){d=-a;return d;} + AF3 opANegF3(outAF3 d,inAF3 a){d=-a;return d;} + AF4 opANegF4(outAF4 d,inAF4 a){d=-a;return d;} +//------------------------------------------------------------------------------------------------------------------------------ + AF2 opARcpF2(outAF2 d,inAF2 a){d=ARcpF2(a);return d;} + AF3 opARcpF3(outAF3 d,inAF3 a){d=ARcpF3(a);return d;} + AF4 opARcpF4(outAF4 d,inAF4 a){d=ARcpF4(a);return d;} +#endif + + +#define FSR_RCAS_F 1 +AU4 con0; + +AF4 FsrRcasLoadF(ASU2 p) { return AF4(texelFetch(source, p, 0)); } +void FsrRcasInputF(inout AF1 r, inout AF1 
g, inout AF1 b) {} + +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// +// AMD FidelityFX SUPER RESOLUTION [FSR 1] ::: SPATIAL SCALING & EXTRAS - v1.20210629 +// +// +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FidelityFX Super Resolution Sample +// +// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// ABOUT +// ===== +// FSR is a collection of algorithms relating to generating a higher resolution image. +// This specific header focuses on single-image non-temporal image scaling, and related tools. +// +// The core functions are EASU and RCAS: +// [EASU] Edge Adaptive Spatial Upsampling ....... 1x to 4x area range spatial scaling, clamped adaptive elliptical filter. +// [RCAS] Robust Contrast Adaptive Sharpening .... A non-scaling variation on CAS. +// RCAS needs to be applied after EASU as a separate pass. +// +// Optional utility functions are: +// [LFGA] Linear Film Grain Applicator ........... Tool to apply film grain after scaling. +// [SRTM] Simple Reversible Tone-Mapper .......... Linear HDR {0 to FP16_MAX} to {0 to 1} and back. +// [TEPD] Temporal Energy Preserving Dither ...... Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// See each individual sub-section for inline documentation. 
+//------------------------------------------------------------------------------------------------------------------------------ +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//------------------------------------------------------------------------------------------------------------------------------ +// FUNCTION PERMUTATIONS +// ===================== +// *F() ..... Single item computation with 32-bit. +// *H() ..... Single item computation with 16-bit, with packing (aka two 16-bit ops in parallel) when possible. +// *Hx2() ... Processing two items in parallel with 16-bit, easier packing. +// Not all interfaces in this file have a *Hx2() form. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [EASU] EDGE ADAPTIVE SPATIAL UPSAMPLING +// +//------------------------------------------------------------------------------------------------------------------------------ +// EASU provides a high 
quality spatial-only scaling at relatively low cost. +// Meaning EASU is appropriate for laptops and other low-end GPUs. +// Quality from 1x to 4x area scaling is good. +//------------------------------------------------------------------------------------------------------------------------------ +// The scaler uses a modified fast approximation to the standard lanczos(size=2) kernel. +// EASU runs in a single pass, so it applies a directionally and anisotropically adaptive radial lanczos. +// This is also kept as simple as possible to have minimum runtime. +//------------------------------------------------------------------------------------------------------------------------------ +// The lanczos filter has negative lobes, so by itself it will introduce ringing. +// To remove all ringing, the algorithm uses the nearest 2x2 input texels as a neighborhood, +// and limits output to the minimum and maximum of that neighborhood. +//------------------------------------------------------------------------------------------------------------------------------ +// Input image requirements: +// +// Color needs to be encoded as 3 channel [red, green, blue] (e.g. XYZ not supported) +// Each channel needs to be in the range [0, 1] +// Any color primaries are supported +// Display / tonemapping curve needs to be as if presenting to sRGB display or similar (e.g. Gamma 2.0) +// There should be no banding in the input +// There should be no high amplitude noise in the input +// There should be no noise in the input that is not at input pixel granularity +// For performance purposes, use 32bpp formats +//------------------------------------------------------------------------------------------------------------------------------ +// Best to apply EASU at the end of the frame after tonemapping +// but before film grain or composite of the UI. 
+//------------------------------------------------------------------------------------------------------------------------------ +// Example of including this header for D3D HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan GLSL : +// +// #define A_GPU 1 +// #define A_GLSL 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of including this header for Vulkan HLSL : +// +// #define A_GPU 1 +// #define A_HLSL 1 +// #define A_HLSL_6_2 1 +// #define A_NO_16_BIT_CAST 1 +// #define A_HALF 1 +// #include "ffx_a.h" +// #define FSR_EASU_H 1 +// #define FSR_RCAS_H 1 +// //declare input callbacks +// #include "ffx_fsr1.h" +// +// Example of declaring the required input callbacks for GLSL : +// The callbacks need to gather4 for each color channel using the specified texture coordinate 'p'. +// EASU uses gather4 to reduce position computation logic and for free Arrays of Structures to Structures of Arrays conversion. +// +// AH4 FsrEasuRH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,0));} +// AH4 FsrEasuGH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,1));} +// AH4 FsrEasuBH(AF2 p){return AH4(textureGather(sampler2D(tex,sam),p,2));} +// ... +// The FsrEasuCon function needs to be called from the CPU or GPU to set up constants. +// The difference in viewport and input image size is there to support Dynamic Resolution Scaling. +// To use FsrEasuCon() on the CPU, define A_CPU before including ffx_a and ffx_fsr1. +// Including a GPU example here, the 'con0' through 'con3' values would be stored out to a constant buffer. 
+// AU4 con0,con1,con2,con3; +// FsrEasuCon(con0,con1,con2,con3, +// 1920.0,1080.0, // Viewport size (top left aligned) in the input image which is to be scaled. +// 3840.0,2160.0, // The size of the input image. +// 2560.0,1440.0); // The output resolution. +//============================================================================================================================== +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to set up required constant values (works on CPU or GPU). +A_STATIC void FsrEasuCon( +outAU4 con0, +outAU4 con1, +outAU4 con2, +outAU4 con3, +// This is the rendered image resolution being upscaled +AF1 inputViewportInPixelsX, +AF1 inputViewportInPixelsY, +// This is the resolution of the resource containing the input image (useful for dynamic resolution) +AF1 inputSizeInPixelsX, +AF1 inputSizeInPixelsY, +// This is the display resolution which the input image gets upscaled to +AF1 outputSizeInPixelsX, +AF1 outputSizeInPixelsY){ + // Output integer position to a pixel position in viewport. 
+ con0[0]=AU1_AF1(inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)); + con0[1]=AU1_AF1(inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)); + con0[2]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsX*ARcpF1(outputSizeInPixelsX)-AF1_(0.5)); + con0[3]=AU1_AF1(AF1_(0.5)*inputViewportInPixelsY*ARcpF1(outputSizeInPixelsY)-AF1_(0.5)); + // Viewport pixel position to normalized image space. + // This is used to get upper-left of 'F' tap. + con1[0]=AU1_AF1(ARcpF1(inputSizeInPixelsX)); + con1[1]=AU1_AF1(ARcpF1(inputSizeInPixelsY)); + // Centers of gather4, first offset from upper-left of 'F'. + // +---+---+ + // | | | + // +--(0)--+ + // | b | c | + // +---F---+---+---+ + // | e | f | g | h | + // +--(1)--+--(2)--+ + // | i | j | k | l | + // +---+---+---+---+ + // | n | o | + // +--(3)--+ + // | | | + // +---+---+ + con1[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con1[3]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsY)); + // These are from (0) instead of 'F'. + con2[0]=AU1_AF1(AF1_(-1.0)*ARcpF1(inputSizeInPixelsX)); + con2[1]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con2[2]=AU1_AF1(AF1_( 1.0)*ARcpF1(inputSizeInPixelsX)); + con2[3]=AU1_AF1(AF1_( 2.0)*ARcpF1(inputSizeInPixelsY)); + con3[0]=AU1_AF1(AF1_( 0.0)*ARcpF1(inputSizeInPixelsX)); + con3[1]=AU1_AF1(AF1_( 4.0)*ARcpF1(inputSizeInPixelsY)); + con3[2]=con3[3]=0;} + +// Same as FsrEasuCon, but for an input image located at an offset into the input image resource +A_STATIC void FsrEasuConOffset( + outAU4 con0, + outAU4 con1, + outAU4 con2, + outAU4 con3, + // This is the rendered image resolution being upscaled + AF1 inputViewportInPixelsX, + AF1 inputViewportInPixelsY, + // This is the resolution of the resource containing the input image (useful for dynamic resolution) + AF1 inputSizeInPixelsX, + AF1 inputSizeInPixelsY, + // This is the display resolution which the input image gets upscaled to + AF1 outputSizeInPixelsX, + AF1 outputSizeInPixelsY, + // This is the input image offset into the resource containing it (useful for dynamic resolution) + AF1 
inputOffsetInPixelsX, + AF1 inputOffsetInPixelsY) { + FsrEasuCon(con0, con1, con2, con3, inputViewportInPixelsX, inputViewportInPixelsY, inputSizeInPixelsX, inputSizeInPixelsY, outputSizeInPixelsX, outputSizeInPixelsY); + con0[2] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsX * ARcpF1(outputSizeInPixelsX) - AF1_(0.5) + inputOffsetInPixelsX); + con0[3] = AU1_AF1(AF1_(0.5) * inputViewportInPixelsY * ARcpF1(outputSizeInPixelsY) - AF1_(0.5) + inputOffsetInPixelsY); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_EASU_F) + // Input callback prototypes, need to be implemented by calling shader + AF4 FsrEasuRF(AF2 p); + AF4 FsrEasuGF(AF2 p); + AF4 FsrEasuBF(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // Filtering for a given tap for the scalar. + void FsrEasuTapF( + inout AF3 aC, // Accumulated color, with negative lobe. + inout AF1 aW, // Accumulated weight. + AF2 off, // Pixel offset from resolve position to tap. + AF2 dir, // Gradient direction. + AF2 len, // Length. + AF1 lob, // Negative lobe strength. + AF1 clp, // Clipping point. + AF3 c){ // Tap color. + // Rotate offset by direction. + AF2 v; + v.x=(off.x*( dir.x))+(off.y*dir.y); + v.y=(off.x*(-dir.y))+(off.y*dir.x); + // Anisotropy. 
+ v*=len; + // Compute distance^2. + AF1 d2=v.x*v.x+v.y*v.y; + // Limit to the window as at corner, 2 taps can easily be outside. + d2=min(d2,clp); + // Approximation of lanczos2 without sin() or rcp(), or sqrt() to get x. + // (25/16 * (2/5 * x^2 - 1)^2 - (25/16 - 1)) * (1/4 * x^2 - 1)^2 + // |_______________________________________| |_______________| + // base window + // The general form of the 'base' is, + // (a*(b*x^2-1)^2-(a-1)) + // Where 'a=1/(2*b-b^2)' and 'b' moves around the negative lobe. + AF1 wB=AF1_(2.0/5.0)*d2+AF1_(-1.0); + AF1 wA=lob*d2+AF1_(-1.0); + wB*=wB; + wA*=wA; + wB=AF1_(25.0/16.0)*wB+AF1_(-(25.0/16.0-1.0)); + AF1 w=wB*wA; + // Do weighted average. + aC+=c*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulate direction and length. + void FsrEasuSetF( + inout AF2 dir, + inout AF1 len, + AF2 pp, + AP1 biS,AP1 biT,AP1 biU,AP1 biV, + AF1 lA,AF1 lB,AF1 lC,AF1 lD,AF1 lE){ + // Compute bilinear weight, branches factor out as predicates are compile time immediates. + // s t + // u v + AF1 w = AF1_(0.0); + if(biS)w=(AF1_(1.0)-pp.x)*(AF1_(1.0)-pp.y); + if(biT)w= pp.x *(AF1_(1.0)-pp.y); + if(biU)w=(AF1_(1.0)-pp.x)* pp.y ; + if(biV)w= pp.x * pp.y ; + // Direction is the '+' diff. + // a + // b c d + // e + // Then takes magnitude from abs average of both sides of 'c'. + // Length converts gradient reversal to 0, smoothly to non-reversal at 1, shaped, then adding horz and vert terms. + AF1 dc=lD-lC; + AF1 cb=lC-lB; + AF1 lenX=max(abs(dc),abs(cb)); + lenX=APrxLoRcpF1(lenX); + AF1 dirX=lD-lB; + dir.x+=dirX*w; + lenX=ASatF1(abs(dirX)*lenX); + lenX*=lenX; + len+=lenX*w; + // Repeat for the y axis.
+ AF1 ec=lE-lC; + AF1 ca=lC-lA; + AF1 lenY=max(abs(ec),abs(ca)); + lenY=APrxLoRcpF1(lenY); + AF1 dirY=lE-lA; + dir.y+=dirY*w; + lenY=ASatF1(abs(dirY)*lenY); + lenY*=lenY; + len+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuF( + out AF3 pix, + AU2 ip, // Integer pixel position in output. + AU4 con0, // Constants generated by FsrEasuCon(). + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + // Get position of 'f'. + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; +//------------------------------------------------------------------------------------------------------------------------------ + // 12-tap kernel. + // b c + // e f g h + // i j k l + // n o + // Gather 4 ordering. + // a b + // r g + // For packed FP16, need either {rg} or {ab} so using the following setup for gather in all versions, + // a b <- unused (z) + // r g + // a b a b + // r g r g + // a b + // r g <- unused (z) + // Allowing dead-code removal to remove the 'z's. + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + // These are from p0 to avoid pulling two constants on pre-Navi hardware. + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AF4 bczzR=FsrEasuRF(p0); + AF4 bczzG=FsrEasuGF(p0); + AF4 bczzB=FsrEasuBF(p0); + AF4 ijfeR=FsrEasuRF(p1); + AF4 ijfeG=FsrEasuGF(p1); + AF4 ijfeB=FsrEasuBF(p1); + AF4 klhgR=FsrEasuRF(p2); + AF4 klhgG=FsrEasuGF(p2); + AF4 klhgB=FsrEasuBF(p2); + AF4 zzonR=FsrEasuRF(p3); + AF4 zzonG=FsrEasuGF(p3); + AF4 zzonB=FsrEasuBF(p3); +//------------------------------------------------------------------------------------------------------------------------------ + // Simplest multi-channel approximate luma possible (luma times 2, in 2 FMA/MAD). 
+ AF4 bczzL=bczzB*AF4_(0.5)+(bczzR*AF4_(0.5)+bczzG); + AF4 ijfeL=ijfeB*AF4_(0.5)+(ijfeR*AF4_(0.5)+ijfeG); + AF4 klhgL=klhgB*AF4_(0.5)+(klhgR*AF4_(0.5)+klhgG); + AF4 zzonL=zzonB*AF4_(0.5)+(zzonR*AF4_(0.5)+zzonG); + // Rename. + AF1 bL=bczzL.x; + AF1 cL=bczzL.y; + AF1 iL=ijfeL.x; + AF1 jL=ijfeL.y; + AF1 fL=ijfeL.z; + AF1 eL=ijfeL.w; + AF1 kL=klhgL.x; + AF1 lL=klhgL.y; + AF1 hL=klhgL.z; + AF1 gL=klhgL.w; + AF1 oL=zzonL.z; + AF1 nL=zzonL.w; + // Accumulate for bilinear interpolation. + AF2 dir=AF2_(0.0); + AF1 len=AF1_(0.0); + FsrEasuSetF(dir,len,pp,true, false,false,false,bL,eL,fL,gL,jL); + FsrEasuSetF(dir,len,pp,false,true ,false,false,cL,fL,gL,hL,kL); + FsrEasuSetF(dir,len,pp,false,false,true ,false,fL,iL,jL,kL,nL); + FsrEasuSetF(dir,len,pp,false,false,false,true ,gL,jL,kL,lL,oL); +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize with approximation, and cleanup close to zero. + AF2 dir2=dir*dir; + AF1 dirR=dir2.x+dir2.y; + AP1 zro=dirR<AF1_(1.0/32768.0); + dirR=APrxLoRsqF1(dirR); + dirR=zro?AF1_(1.0):dirR; + dir.x=zro?AF1_(1.0):dir.x; + dir*=AF2_(dirR); + // Transform from {0 to 2} to {0 to 1} range, and shape with square. + len=len*AF1_(0.5); + len*=len; + // Stretch kernel {1.0 vert|horz, to sqrt(2.0) on diagonal}. + AF1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpF1(max(abs(dir.x),abs(dir.y))); + // Anisotropic length after rotation, + // x := 1.0 lerp to 'stretch' on edges + // y := 1.0 lerp to 2x on edges + AF2 len2=AF2(AF1_(1.0)+(stretch-AF1_(1.0))*len,AF1_(1.0)+AF1_(-0.5)*len); + // Based on the amount of 'edge', + // the window shifts from +/-{sqrt(2.0) to slightly beyond 2.0}. + AF1 lob=AF1_(0.5)+AF1_((1.0/4.0-0.04)-0.5)*len; + // Set distance^2 clipping point to the end of the adjustable window. 
+ AF1 clp=APrxLoRcpF1(lob); +//------------------------------------------------------------------------------------------------------------------------------ + // Accumulation mixed with min/max of 4 nearest. + // b c + // e f g h + // i j k l + // n o + AF3 min4=min(AMin3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)), + AF3(klhgR.x,klhgG.x,klhgB.x)); + AF3 max4=max(AMax3F3(AF3(ijfeR.z,ijfeG.z,ijfeB.z),AF3(klhgR.w,klhgG.w,klhgB.w),AF3(ijfeR.y,ijfeG.y,ijfeB.y)), + AF3(klhgR.x,klhgG.x,klhgB.x)); + // Accumulation. + AF3 aC=AF3_(0.0); + AF1 aW=AF1_(0.0); + FsrEasuTapF(aC,aW,AF2( 0.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.x,bczzG.x,bczzB.x)); // b + FsrEasuTapF(aC,aW,AF2( 1.0,-1.0)-pp,dir,len2,lob,clp,AF3(bczzR.y,bczzG.y,bczzB.y)); // c + FsrEasuTapF(aC,aW,AF2(-1.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.x,ijfeG.x,ijfeB.x)); // i + FsrEasuTapF(aC,aW,AF2( 0.0, 1.0)-pp,dir,len2,lob,clp,AF3(ijfeR.y,ijfeG.y,ijfeB.y)); // j + FsrEasuTapF(aC,aW,AF2( 0.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.z,ijfeG.z,ijfeB.z)); // f + FsrEasuTapF(aC,aW,AF2(-1.0, 0.0)-pp,dir,len2,lob,clp,AF3(ijfeR.w,ijfeG.w,ijfeB.w)); // e + FsrEasuTapF(aC,aW,AF2( 1.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.x,klhgG.x,klhgB.x)); // k + FsrEasuTapF(aC,aW,AF2( 2.0, 1.0)-pp,dir,len2,lob,clp,AF3(klhgR.y,klhgG.y,klhgB.y)); // l + FsrEasuTapF(aC,aW,AF2( 2.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.z,klhgG.z,klhgB.z)); // h + FsrEasuTapF(aC,aW,AF2( 1.0, 0.0)-pp,dir,len2,lob,clp,AF3(klhgR.w,klhgG.w,klhgB.w)); // g + FsrEasuTapF(aC,aW,AF2( 1.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.z,zzonG.z,zzonB.z)); // o + FsrEasuTapF(aC,aW,AF2( 0.0, 2.0)-pp,dir,len2,lob,clp,AF3(zzonR.w,zzonG.w,zzonB.w)); // n +//------------------------------------------------------------------------------------------------------------------------------ + // Normalize and dering. 
+ pix=min(max4,max(min4,aC*AF3_(ARcpF1(aW))));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_EASU_H) +// Input callback prototypes, need to be implemented by calling shader + AH4 FsrEasuRH(AF2 p); + AH4 FsrEasuGH(AF2 p); + AH4 FsrEasuBH(AF2 p); +//------------------------------------------------------------------------------------------------------------------------------ + // This runs 2 taps in parallel. + void FsrEasuTapH( + inout AH2 aCR,inout AH2 aCG,inout AH2 aCB, + inout AH2 aW, + AH2 offX,AH2 offY, + AH2 dir, + AH2 len, + AH1 lob, + AH1 clp, + AH2 cR,AH2 cG,AH2 cB){ + AH2 vX,vY; + vX=offX* dir.xx +offY*dir.yy; + vY=offX*(-dir.yy)+offY*dir.xx; + vX*=len.x;vY*=len.y; + AH2 d2=vX*vX+vY*vY; + d2=min(d2,AH2_(clp)); + AH2 wB=AH2_(2.0/5.0)*d2+AH2_(-1.0); + AH2 wA=AH2_(lob)*d2+AH2_(-1.0); + wB*=wB; + wA*=wA; + wB=AH2_(25.0/16.0)*wB+AH2_(-(25.0/16.0-1.0)); + AH2 w=wB*wA; + aCR+=cR*w;aCG+=cG*w;aCB+=cB*w;aW+=w;} +//------------------------------------------------------------------------------------------------------------------------------ + // This runs 2 taps in parallel. 
+ void FsrEasuSetH( + inout AH2 dirPX,inout AH2 dirPY, + inout AH2 lenP, + AH2 pp, + AP1 biST,AP1 biUV, + AH2 lA,AH2 lB,AH2 lC,AH2 lD,AH2 lE){ + AH2 w = AH2_(0.0); + if(biST)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_(AH1_(1.0)-pp.y); + if(biUV)w=(AH2(1.0,0.0)+AH2(-pp.x,pp.x))*AH2_( pp.y); + // ABS is not free in the packed FP16 path. + AH2 dc=lD-lC; + AH2 cb=lC-lB; + AH2 lenX=max(abs(dc),abs(cb)); + lenX=ARcpH2(lenX); + AH2 dirX=lD-lB; + dirPX+=dirX*w; + lenX=ASatH2(abs(dirX)*lenX); + lenX*=lenX; + lenP+=lenX*w; + AH2 ec=lE-lC; + AH2 ca=lC-lA; + AH2 lenY=max(abs(ec),abs(ca)); + lenY=ARcpH2(lenY); + AH2 dirY=lE-lA; + dirPY+=dirY*w; + lenY=ASatH2(abs(dirY)*lenY); + lenY*=lenY; + lenP+=lenY*w;} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrEasuH( + out AH3 pix, + AU2 ip, + AU4 con0, + AU4 con1, + AU4 con2, + AU4 con3){ +//------------------------------------------------------------------------------------------------------------------------------ + AF2 pp=AF2(ip)*AF2_AU2(con0.xy)+AF2_AU2(con0.zw); + AF2 fp=floor(pp); + pp-=fp; + AH2 ppp=AH2(pp); +//------------------------------------------------------------------------------------------------------------------------------ + AF2 p0=fp*AF2_AU2(con1.xy)+AF2_AU2(con1.zw); + AF2 p1=p0+AF2_AU2(con2.xy); + AF2 p2=p0+AF2_AU2(con2.zw); + AF2 p3=p0+AF2_AU2(con3.xy); + AH4 bczzR=FsrEasuRH(p0); + AH4 bczzG=FsrEasuGH(p0); + AH4 bczzB=FsrEasuBH(p0); + AH4 ijfeR=FsrEasuRH(p1); + AH4 ijfeG=FsrEasuGH(p1); + AH4 ijfeB=FsrEasuBH(p1); + AH4 klhgR=FsrEasuRH(p2); + AH4 klhgG=FsrEasuGH(p2); + AH4 klhgB=FsrEasuBH(p2); + AH4 zzonR=FsrEasuRH(p3); + AH4 zzonG=FsrEasuGH(p3); + AH4 zzonB=FsrEasuBH(p3); +//------------------------------------------------------------------------------------------------------------------------------ + AH4 bczzL=bczzB*AH4_(0.5)+(bczzR*AH4_(0.5)+bczzG); + AH4 ijfeL=ijfeB*AH4_(0.5)+(ijfeR*AH4_(0.5)+ijfeG); + AH4 
klhgL=klhgB*AH4_(0.5)+(klhgR*AH4_(0.5)+klhgG); + AH4 zzonL=zzonB*AH4_(0.5)+(zzonR*AH4_(0.5)+zzonG); + AH1 bL=bczzL.x; + AH1 cL=bczzL.y; + AH1 iL=ijfeL.x; + AH1 jL=ijfeL.y; + AH1 fL=ijfeL.z; + AH1 eL=ijfeL.w; + AH1 kL=klhgL.x; + AH1 lL=klhgL.y; + AH1 hL=klhgL.z; + AH1 gL=klhgL.w; + AH1 oL=zzonL.z; + AH1 nL=zzonL.w; + // This part is different, accumulating 2 taps in parallel. + AH2 dirPX=AH2_(0.0); + AH2 dirPY=AH2_(0.0); + AH2 lenP=AH2_(0.0); + FsrEasuSetH(dirPX,dirPY,lenP,ppp,true, false,AH2(bL,cL),AH2(eL,fL),AH2(fL,gL),AH2(gL,hL),AH2(jL,kL)); + FsrEasuSetH(dirPX,dirPY,lenP,ppp,false,true ,AH2(fL,gL),AH2(iL,jL),AH2(jL,kL),AH2(kL,lL),AH2(nL,oL)); + AH2 dir=AH2(dirPX.r+dirPX.g,dirPY.r+dirPY.g); + AH1 len=lenP.r+lenP.g; +//------------------------------------------------------------------------------------------------------------------------------ + AH2 dir2=dir*dir; + AH1 dirR=dir2.x+dir2.y; + AP1 zro=dirR<AH1_(1.0/32768.0); + dirR=APrxLoRsqH1(dirR); + dirR=zro?AH1_(1.0):dirR; + dir.x=zro?AH1_(1.0):dir.x; + dir*=AH2_(dirR); + len=len*AH1_(0.5); + len*=len; + AH1 stretch=(dir.x*dir.x+dir.y*dir.y)*APrxLoRcpH1(max(abs(dir.x),abs(dir.y))); + AH2 len2=AH2(AH1_(1.0)+(stretch-AH1_(1.0))*len,AH1_(1.0)+AH1_(-0.5)*len); + AH1 lob=AH1_(0.5)+AH1_((1.0/4.0-0.04)-0.5)*len; + AH1 clp=APrxLoRcpH1(lob); +//------------------------------------------------------------------------------------------------------------------------------ + // FP16 is different, using packed trick to do min and max in same operation. + AH2 bothR=max(max(AH2(-ijfeR.z,ijfeR.z),AH2(-klhgR.w,klhgR.w)),max(AH2(-ijfeR.y,ijfeR.y),AH2(-klhgR.x,klhgR.x))); + AH2 bothG=max(max(AH2(-ijfeG.z,ijfeG.z),AH2(-klhgG.w,klhgG.w)),max(AH2(-ijfeG.y,ijfeG.y),AH2(-klhgG.x,klhgG.x))); + AH2 bothB=max(max(AH2(-ijfeB.z,ijfeB.z),AH2(-klhgB.w,klhgB.w)),max(AH2(-ijfeB.y,ijfeB.y),AH2(-klhgB.x,klhgB.x))); + // This part is different for FP16, working pairs of taps at a time. 
+ AH2 pR=AH2_(0.0); + AH2 pG=AH2_(0.0); + AH2 pB=AH2_(0.0); + AH2 pW=AH2_(0.0); + FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0, 1.0)-ppp.xx,AH2(-1.0,-1.0)-ppp.yy,dir,len2,lob,clp,bczzR.xy,bczzG.xy,bczzB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2(-1.0, 0.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,ijfeR.xy,ijfeG.xy,ijfeB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2( 0.0,-1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,ijfeR.zw,ijfeG.zw,ijfeB.zw); + FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 2.0)-ppp.xx,AH2( 1.0, 1.0)-ppp.yy,dir,len2,lob,clp,klhgR.xy,klhgG.xy,klhgB.xy); + FsrEasuTapH(pR,pG,pB,pW,AH2( 2.0, 1.0)-ppp.xx,AH2( 0.0, 0.0)-ppp.yy,dir,len2,lob,clp,klhgR.zw,klhgG.zw,klhgB.zw); + FsrEasuTapH(pR,pG,pB,pW,AH2( 1.0, 0.0)-ppp.xx,AH2( 2.0, 2.0)-ppp.yy,dir,len2,lob,clp,zzonR.zw,zzonG.zw,zzonB.zw); + AH3 aC=AH3(pR.x+pR.y,pG.x+pG.y,pB.x+pB.y); + AH1 aW=pW.x+pW.y; +//------------------------------------------------------------------------------------------------------------------------------ + // Slightly different for FP16 version due to combined min and max. 
+ pix=min(AH3(bothR.y,bothG.y,bothB.y),max(-AH3(bothR.x,bothG.x,bothB.x),aC*AH3_(ARcpH1(aW))));} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [RCAS] ROBUST CONTRAST ADAPTIVE SHARPENING +// +//------------------------------------------------------------------------------------------------------------------------------ +// CAS uses a simplified mechanism to convert local contrast into a variable amount of sharpness. +// RCAS uses a more exact mechanism, solving for the maximum local sharpness possible before clipping. +// RCAS also has a built in process to limit sharpening of what it detects as possible noise. +// RCAS sharper does not support scaling, as it should be applied after EASU scaling. +// Pass EASU output straight into RCAS, no color conversions necessary. +//------------------------------------------------------------------------------------------------------------------------------ +// RCAS is based on the following logic. +// RCAS uses a 5 tap filter in a cross pattern (same as CAS), +// w n +// w 1 w for taps w m e +// w s +// Where 'w' is the negative lobe weight. 
+// output = (w*(n+e+w+s)+m)/(4*w+1) +// RCAS solves for 'w' by seeing where the signal might clip out of the {0 to 1} input range, +// 0 == (w*(n+e+w+s)+m)/(4*w+1) -> w = -m/(n+e+w+s) +// 1 == (w*(n+e+w+s)+m)/(4*w+1) -> w = (1-m)/(n+e+w+s-4*1) +// Then chooses the 'w' which results in no clipping, limits 'w', and multiplies by the 'sharp' amount. +// This solution above has issues with MSAA input as the steps along the gradient cause edge detection issues. +// So RCAS uses 4x the maximum and 4x the minimum (depending on equation) in place of the individual taps. +// As well as switching from 'm' to either the minimum or maximum (depending on side), to help in energy conservation. +// This stabilizes RCAS. +// RCAS does a simple highpass which is normalized against the local contrast then shaped, +// 0.25 +// 0.25 -1 0.25 +// 0.25 +// This is used as a noise detection filter, to reduce the effect of RCAS on grain, and focus on real edges. +// +// GLSL example for the required callbacks : +// +// AH4 FsrRcasLoadH(ASW2 p){return AH4(imageLoad(imgSrc,ASU2(p)));} +// void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b) +// { +// //do any simple input color conversions here or leave empty if none needed +// } +// +// FsrRcasCon needs to be called from the CPU or GPU to set up constants. +// Including a GPU example here, the 'con' value would be stored out to a constant buffer. +// +// AU4 con; +// FsrRcasCon(con, +// 0.0); // The scale is {0.0 := maximum sharpness, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +// --------------- +// RCAS sharpening supports a CAS-like pass-through alpha via, +// #define FSR_RCAS_PASSTHROUGH_ALPHA 1 +// RCAS also supports a define to enable a more expensive path to avoid some sharpening of noise.
+// Would suggest it is better to apply film grain after RCAS sharpening (and after scaling) instead of using this define, +// #define FSR_RCAS_DENOISE 1 +//============================================================================================================================== +// This is set at the limit of providing unnatural results for sharpening. +#define FSR_RCAS_LIMIT (0.25-(1.0/16.0)) +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// CONSTANT SETUP +//============================================================================================================================== +// Call to setup required constant values (works on CPU or GPU). +A_STATIC void FsrRcasCon( +outAU4 con, +// The scale is {0.0 := maximum, to N>0, where N is the number of stops (halving) of the reduction of sharpness}. +AF1 sharpness){ + // Transform from stops to linear value. 
+ sharpness=AExp2F1(-sharpness); + varAF2(hSharp)=initAF2(sharpness,sharpness); + con[0]=AU1_AF1(sharpness); + con[1]=AU1_AH2_AF2(hSharp); + con[2]=0; + con[3]=0;} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 32-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(FSR_RCAS_F) + // Input callback prototypes that need to be implemented by calling shader + AF4 FsrRcasLoadF(ASU2 p); + void FsrRcasInputF(inout AF1 r,inout AF1 g,inout AF1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasF( + out AF1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy. + out AF1 pixG, + out AF1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AF1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Algorithm uses minimal 3x3 pixel neighborhood. + // b + // d e f + // h + ASU2 sp=ASU2(ip); + AF3 b=FsrRcasLoadF(sp+ASU2( 0,-1)).rgb; + AF3 d=FsrRcasLoadF(sp+ASU2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AF4 ee=FsrRcasLoadF(sp); + AF3 e=ee.rgb;pixA=ee.a; + #else + AF3 e=FsrRcasLoadF(sp).rgb; + #endif + AF3 f=FsrRcasLoadF(sp+ASU2( 1, 0)).rgb; + AF3 h=FsrRcasLoadF(sp+ASU2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit).
+ AF1 bR=b.r; + AF1 bG=b.g; + AF1 bB=b.b; + AF1 dR=d.r; + AF1 dG=d.g; + AF1 dB=d.b; + AF1 eR=e.r; + AF1 eG=e.g; + AF1 eB=e.b; + AF1 fR=f.r; + AF1 fG=f.g; + AF1 fB=f.b; + AF1 hR=h.r; + AF1 hG=h.g; + AF1 hB=h.b; + // Run optional input transform. + FsrRcasInputF(bR,bG,bB); + FsrRcasInputF(dR,dG,dB); + FsrRcasInputF(eR,eG,eB); + FsrRcasInputF(fR,fG,fB); + FsrRcasInputF(hR,hG,hB); + // Luma times 2. + AF1 bL=bB*AF1_(0.5)+(bR*AF1_(0.5)+bG); + AF1 dL=dB*AF1_(0.5)+(dR*AF1_(0.5)+dG); + AF1 eL=eB*AF1_(0.5)+(eR*AF1_(0.5)+eG); + AF1 fL=fB*AF1_(0.5)+(fR*AF1_(0.5)+fG); + AF1 hL=hB*AF1_(0.5)+(hR*AF1_(0.5)+hG); + // Noise detection. + AF1 nz=AF1_(0.25)*bL+AF1_(0.25)*dL+AF1_(0.25)*fL+AF1_(0.25)*hL-eL; + nz=ASatF1(abs(nz)*APrxMedRcpF1(AMax3F1(AMax3F1(bL,dL,eL),fL,hL)-AMin3F1(AMin3F1(bL,dL,eL),fL,hL))); + nz=AF1_(-0.5)*nz+AF1_(1.0); + // Min and max of ring. + AF1 mn4R=min(AMin3F1(bR,dR,fR),hR); + AF1 mn4G=min(AMin3F1(bG,dG,fG),hG); + AF1 mn4B=min(AMin3F1(bB,dB,fB),hB); + AF1 mx4R=max(AMax3F1(bR,dR,fR),hR); + AF1 mx4G=max(AMax3F1(bG,dG,fG),hG); + AF1 mx4B=max(AMax3F1(bB,dB,fB),hB); + // Immediate constants for peak range. + AF2 peakC=AF2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R); + AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G); + AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B); + AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y); + AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y); + AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y); + AF1 lobeR=max(-hitMinR,hitMaxR); + AF1 lobeG=max(-hitMinG,hitMaxG); + AF1 lobeB=max(-hitMinB,hitMaxB); + AF1 lobe=max(AF1_(-FSR_RCAS_LIMIT),min(AMax3F1(lobeR,lobeG,lobeB),AF1_(0.0)))*AF1_AU1(con.x); + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. 
+ AF1 rcpL=APrxMedRcpF1(AF1_(4.0)*lobe+AF1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL; + return;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// NON-PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_H) + // Input callback prototypes that need to be implemented by calling shader + AH4 FsrRcasLoadH(ASW2 p); + void FsrRcasInputH(inout AH1 r,inout AH1 g,inout AH1 b); +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasH( + out AH1 pixR, // Output values, non-vector so port between FsrRcasF() and FsrRcasH() is easy. + out AH1 pixG, + out AH1 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH1 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // Sharpening algorithm uses minimal 3x3 pixel neighborhood.
+ // b + // d e f + // h + ASW2 sp=ASW2(ip); + AH3 b=FsrRcasLoadH(sp+ASW2( 0,-1)).rgb; + AH3 d=FsrRcasLoadH(sp+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee=FsrRcasLoadH(sp); + AH3 e=ee.rgb;pixA=ee.a; + #else + AH3 e=FsrRcasLoadH(sp).rgb; + #endif + AH3 f=FsrRcasLoadH(sp+ASW2( 1, 0)).rgb; + AH3 h=FsrRcasLoadH(sp+ASW2( 0, 1)).rgb; + // Rename (32-bit) or regroup (16-bit). + AH1 bR=b.r; + AH1 bG=b.g; + AH1 bB=b.b; + AH1 dR=d.r; + AH1 dG=d.g; + AH1 dB=d.b; + AH1 eR=e.r; + AH1 eG=e.g; + AH1 eB=e.b; + AH1 fR=f.r; + AH1 fG=f.g; + AH1 fB=f.b; + AH1 hR=h.r; + AH1 hG=h.g; + AH1 hB=h.b; + // Run optional input transform. + FsrRcasInputH(bR,bG,bB); + FsrRcasInputH(dR,dG,dB); + FsrRcasInputH(eR,eG,eB); + FsrRcasInputH(fR,fG,fB); + FsrRcasInputH(hR,hG,hB); + // Luma times 2. + AH1 bL=bB*AH1_(0.5)+(bR*AH1_(0.5)+bG); + AH1 dL=dB*AH1_(0.5)+(dR*AH1_(0.5)+dG); + AH1 eL=eB*AH1_(0.5)+(eR*AH1_(0.5)+eG); + AH1 fL=fB*AH1_(0.5)+(fR*AH1_(0.5)+fG); + AH1 hL=hB*AH1_(0.5)+(hR*AH1_(0.5)+hG); + // Noise detection. + AH1 nz=AH1_(0.25)*bL+AH1_(0.25)*dL+AH1_(0.25)*fL+AH1_(0.25)*hL-eL; + nz=ASatH1(abs(nz)*APrxMedRcpH1(AMax3H1(AMax3H1(bL,dL,eL),fL,hL)-AMin3H1(AMin3H1(bL,dL,eL),fL,hL))); + nz=AH1_(-0.5)*nz+AH1_(1.0); + // Min and max of ring. + AH1 mn4R=min(AMin3H1(bR,dR,fR),hR); + AH1 mn4G=min(AMin3H1(bG,dG,fG),hG); + AH1 mn4B=min(AMin3H1(bB,dB,fB),hB); + AH1 mx4R=max(AMax3H1(bR,dR,fR),hR); + AH1 mx4G=max(AMax3H1(bG,dG,fG),hG); + AH1 mx4B=max(AMax3H1(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. 
+ AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R); + AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G); + AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B); + AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y); + AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y); + AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y); + AH1 lobeR=max(-hitMinR,hitMaxR); + AH1 lobeG=max(-hitMinG,hitMaxG); + AH1 lobeB=max(-hitMinB,hitMaxB); + AH1 lobe=max(AH1_(-FSR_RCAS_LIMIT),min(AMax3H1(lobeR,lobeG,lobeB),AH1_(0.0)))*AH2_AU1(con.y).x; + // Apply noise removal. + #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH1 rcpL=APrxMedRcpH1(AH1_(4.0)*lobe+AH1_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// PACKED 16-BIT VERSION +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF)&&defined(FSR_RCAS_HX2) + // Input callback prototypes that need to be implemented by the calling shader + AH4 FsrRcasLoadHx2(ASW2 p); + void FsrRcasInputHx2(inout AH2 r,inout AH2 g,inout AH2 b); +//------------------------------------------------------------------------------------------------------------------------------ + // Can be 
used to convert from packed Structures of Arrays to Arrays of Structures for store. + void FsrRcasDepackHx2(out AH4 pix0,out AH4 pix1,AH2 pixR,AH2 pixG,AH2 pixB){ + #ifdef A_HLSL + // Invoke a slower path for DX only, since it won't allow uninitialized values. + pix0.a=pix1.a=0.0; + #endif + pix0.rgb=AH3(pixR.x,pixG.x,pixB.x); + pix1.rgb=AH3(pixR.y,pixG.y,pixB.y);} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrRcasHx2( + // Output values are for 2 8x8 tiles in a 16x8 region. + // pix<R,G,B>.x = left 8x8 tile + // pix<R,G,B>.y = right 8x8 tile + // This enables later processing to easily be packed as well. + out AH2 pixR, + out AH2 pixG, + out AH2 pixB, + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + out AH2 pixA, + #endif + AU2 ip, // Integer pixel position in output. + AU4 con){ // Constant generated by FsrRcasCon(). + // No scaling algorithm uses minimal 3x3 pixel neighborhood. + ASW2 sp0=ASW2(ip); + AH3 b0=FsrRcasLoadHx2(sp0+ASW2( 0,-1)).rgb; + AH3 d0=FsrRcasLoadHx2(sp0+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee0=FsrRcasLoadHx2(sp0); + AH3 e0=ee0.rgb;pixA.r=ee0.a; + #else + AH3 e0=FsrRcasLoadHx2(sp0).rgb; + #endif + AH3 f0=FsrRcasLoadHx2(sp0+ASW2( 1, 0)).rgb; + AH3 h0=FsrRcasLoadHx2(sp0+ASW2( 0, 1)).rgb; + ASW2 sp1=sp0+ASW2(8,0); + AH3 b1=FsrRcasLoadHx2(sp1+ASW2( 0,-1)).rgb; + AH3 d1=FsrRcasLoadHx2(sp1+ASW2(-1, 0)).rgb; + #ifdef FSR_RCAS_PASSTHROUGH_ALPHA + AH4 ee1=FsrRcasLoadHx2(sp1); + AH3 e1=ee1.rgb;pixA.g=ee1.a; + #else + AH3 e1=FsrRcasLoadHx2(sp1).rgb; + #endif + AH3 f1=FsrRcasLoadHx2(sp1+ASW2( 1, 0)).rgb; + AH3 h1=FsrRcasLoadHx2(sp1+ASW2( 0, 1)).rgb; + // Arrays of Structures to Structures of Arrays conversion.
+ AH2 bR=AH2(b0.r,b1.r); + AH2 bG=AH2(b0.g,b1.g); + AH2 bB=AH2(b0.b,b1.b); + AH2 dR=AH2(d0.r,d1.r); + AH2 dG=AH2(d0.g,d1.g); + AH2 dB=AH2(d0.b,d1.b); + AH2 eR=AH2(e0.r,e1.r); + AH2 eG=AH2(e0.g,e1.g); + AH2 eB=AH2(e0.b,e1.b); + AH2 fR=AH2(f0.r,f1.r); + AH2 fG=AH2(f0.g,f1.g); + AH2 fB=AH2(f0.b,f1.b); + AH2 hR=AH2(h0.r,h1.r); + AH2 hG=AH2(h0.g,h1.g); + AH2 hB=AH2(h0.b,h1.b); + // Run optional input transform. + FsrRcasInputHx2(bR,bG,bB); + FsrRcasInputHx2(dR,dG,dB); + FsrRcasInputHx2(eR,eG,eB); + FsrRcasInputHx2(fR,fG,fB); + FsrRcasInputHx2(hR,hG,hB); + // Luma times 2. + AH2 bL=bB*AH2_(0.5)+(bR*AH2_(0.5)+bG); + AH2 dL=dB*AH2_(0.5)+(dR*AH2_(0.5)+dG); + AH2 eL=eB*AH2_(0.5)+(eR*AH2_(0.5)+eG); + AH2 fL=fB*AH2_(0.5)+(fR*AH2_(0.5)+fG); + AH2 hL=hB*AH2_(0.5)+(hR*AH2_(0.5)+hG); + // Noise detection. + AH2 nz=AH2_(0.25)*bL+AH2_(0.25)*dL+AH2_(0.25)*fL+AH2_(0.25)*hL-eL; + nz=ASatH2(abs(nz)*APrxMedRcpH2(AMax3H2(AMax3H2(bL,dL,eL),fL,hL)-AMin3H2(AMin3H2(bL,dL,eL),fL,hL))); + nz=AH2_(-0.5)*nz+AH2_(1.0); + // Min and max of ring. + AH2 mn4R=min(AMin3H2(bR,dR,fR),hR); + AH2 mn4G=min(AMin3H2(bG,dG,fG),hG); + AH2 mn4B=min(AMin3H2(bB,dB,fB),hB); + AH2 mx4R=max(AMax3H2(bR,dR,fR),hR); + AH2 mx4G=max(AMax3H2(bG,dG,fG),hG); + AH2 mx4B=max(AMax3H2(bB,dB,fB),hB); + // Immediate constants for peak range. + AH2 peakC=AH2(1.0,-1.0*4.0); + // Limiters, these need to be high precision RCPs. + AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R); + AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G); + AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B); + AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y); + AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y); + AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y); + AH2 lobeR=max(-hitMinR,hitMaxR); + AH2 lobeG=max(-hitMinG,hitMaxG); + AH2 lobeB=max(-hitMinB,hitMaxB); + AH2 lobe=max(AH2_(-FSR_RCAS_LIMIT),min(AMax3H2(lobeR,lobeG,lobeB),AH2_(0.0)))*AH2_(AH2_AU1(con.y).x); + // Apply noise removal. 
+ #ifdef FSR_RCAS_DENOISE + lobe*=nz; + #endif + // Resolve, which needs the medium precision rcp approximation to avoid visible tonality changes. + AH2 rcpL=APrxMedRcpH2(AH2_(4.0)*lobe+AH2_(1.0)); + pixR=(lobe*bR+lobe*dR+lobe*hR+lobe*fR+eR)*rcpL; + pixG=(lobe*bG+lobe*dG+lobe*hG+lobe*fG+eG)*rcpL; + pixB=(lobe*bB+lobe*dB+lobe*hB+lobe*fB+eB)*rcpL;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [LFGA] LINEAR FILM GRAIN APPLICATOR +// +//------------------------------------------------------------------------------------------------------------------------------ +// Adding output-resolution film grain after scaling is a good way to mask both rendering and scaling artifacts. +// Suggest using tiled blue noise as film grain input, with peak noise frequency set for a specific look and feel. +// The 'Lfga*()' functions provide a convenient way to introduce grain. +// These functions limit grain based on distance to signal limits. +// This is done so that the grain is temporally energy preserving, and thus won't modify image tonality. +// Grain application should be done in a linear colorspace. +// The grain should be temporally changing, but have a temporal sum per pixel that adds to zero (non-biased). 
+//------------------------------------------------------------------------------------------------------------------------------ +// Usage, +// FsrLfga*( +// color, // In/out linear colorspace color {0 to 1} ranged. +// grain, // Per pixel grain texture value {-0.5 to 0.5} ranged, input is 3-channel to support colored grain. +// amount); // Amount of grain (0 to 1} ranged. +//------------------------------------------------------------------------------------------------------------------------------ +// Example if grain texture is monochrome: 'FsrLfgaF(color,AF3_(grain),amount)' +//============================================================================================================================== +#if defined(A_GPU) + // Maximum grain is the minimum distance to the signal limit. + void FsrLfgaF(inout AF3 c,AF3 t,AF1 a){c+=(t*AF3_(a))*min(AF3_(1.0)-c,c);} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + // Half precision version (slower). + void FsrLfgaH(inout AH3 c,AH3 t,AH1 a){c+=(t*AH3_(a))*min(AH3_(1.0)-c,c);} +//------------------------------------------------------------------------------------------------------------------------------ + // Packed half precision version (faster). 
+ void FsrLfgaHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 tR,AH2 tG,AH2 tB,AH1 a){ + cR+=(tR*AH2_(a))*min(AH2_(1.0)-cR,cR);cG+=(tG*AH2_(a))*min(AH2_(1.0)-cG,cG);cB+=(tB*AH2_(a))*min(AH2_(1.0)-cB,cB);} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [SRTM] SIMPLE REVERSIBLE TONE-MAPPER +// +//------------------------------------------------------------------------------------------------------------------------------ +// This provides a way to take linear HDR color {0 to FP16_MAX} and convert it into a temporary {0 to 1} ranged post-tonemapped linear. +// The tonemapper preserves RGB ratio, which helps maintain HDR color bleed during filtering. +//------------------------------------------------------------------------------------------------------------------------------ +// Reversible tonemapper usage, +// FsrSrtm*(color); // {0 to FP16_MAX} converted to {0 to 1}. +// FsrSrtmInv*(color); // {0 to 1} converted into {0 to 32768, output peak safe for FP16}. 
+//============================================================================================================================== +#if defined(A_GPU) + void FsrSrtmF(inout AF3 c){c*=AF3_(ARcpF1(AMax3F1(c.r,c.g,c.b)+AF1_(1.0)));} + // The extra max solves the c=1.0 case (which is a /0). + void FsrSrtmInvF(inout AF3 c){c*=AF3_(ARcpF1(max(AF1_(1.0/32768.0),AF1_(1.0)-AMax3F1(c.r,c.g,c.b))));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + void FsrSrtmH(inout AH3 c){c*=AH3_(ARcpH1(AMax3H1(c.r,c.g,c.b)+AH1_(1.0)));} + void FsrSrtmInvH(inout AH3 c){c*=AH3_(ARcpH1(max(AH1_(1.0/32768.0),AH1_(1.0)-AMax3H1(c.r,c.g,c.b))));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrSrtmHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(AMax3H2(cR,cG,cB)+AH2_(1.0));cR*=rcp;cG*=rcp;cB*=rcp;} + void FsrSrtmInvHx2(inout AH2 cR,inout AH2 cG,inout AH2 cB){ + AH2 rcp=ARcpH2(max(AH2_(1.0/32768.0),AH2_(1.0)-AMax3H2(cR,cG,cB)));cR*=rcp;cG*=rcp;cB*=rcp;} +#endif +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +//_____________________________________________________________/\_______________________________________________________________ +//============================================================================================================================== +// +// FSR - [TEPD] TEMPORAL ENERGY 
PRESERVING DITHER +// +//------------------------------------------------------------------------------------------------------------------------------ +// Temporally energy preserving dithered {0 to 1} linear to gamma 2.0 conversion. +// Gamma 2.0 is used so that the conversion back to linear is just to square the color. +// The conversion comes in 8-bit and 10-bit modes, designed for output to 8-bit UNORM or 10:10:10:2 respectively. +// Given good non-biased temporal blue noise as dither input, +// the output dither will temporally conserve energy. +// This is done by choosing the linear nearest step point instead of perceptual nearest. +// See code below for details. +//------------------------------------------------------------------------------------------------------------------------------ +// DX SPEC RULES FOR FLOAT->UNORM 8-BIT CONVERSION +// =============================================== +// - Output is 'uint(floor(saturate(n)*255.0+0.5))'. +// - Thus rounding is to nearest. +// - NaN gets converted to zero. +// - INF is clamped to {0.0 to 1.0}. +//============================================================================================================================== +#if defined(A_GPU) + // Hand tuned integer position to dither value, with more values than simple checkerboard. + // Only 32-bit has enough precision for this computation. + // Output is {0 to <1}. + AF1 FsrTepdDitF(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + // The 1.61803 golden ratio. + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + // Number designed to provide a good visual pattern. + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AFractF1(x);} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 8-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. 
+ void FsrTepdC8F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(255.0))*AF3_(1.0/255.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/255.0);b=b*b; + // Ratio of 'a' to 'b' required to produce 'c'. + // APrxLoRcpF1() won't work here (at least for very high dynamic ranges). + // APrxMedRcpF1() is an IADD,FMA,MUL. + AF3 r=(c-b)*APrxMedRcpF3(a-b); + // Use the ratio as a cutoff to choose 'a' or 'b'. + // AGtZeroF1() is a MUL. + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + // This version is 10-bit gamma 2.0. + // The 'c' input is {0 to 1}. + // Output is {0 to 1} ready for image store. + void FsrTepdC10F(inout AF3 c,AF1 dit){ + AF3 n=sqrt(c); + n=floor(n*AF3_(1023.0))*AF3_(1.0/1023.0); + AF3 a=n*n; + AF3 b=n+AF3_(1.0/1023.0);b=b*b; + AF3 r=(c-b)*APrxMedRcpF3(a-b); + c=ASatF3(n+AGtZeroF3(AF3_(dit)-r)*AF3_(1.0/1023.0));} +#endif +//============================================================================================================================== +#if defined(A_GPU)&&defined(A_HALF) + AH1 FsrTepdDitH(AU2 p,AU1 f){ + AF1 x=AF1_(p.x+f); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*a+(y*b); + return AH1(AFractF1(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(255.0))*AH3_(1.0/255.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/255.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10H(inout AH3 c,AH1 dit){ + AH3 n=sqrt(c); + n=floor(n*AH3_(1023.0))*AH3_(1.0/1023.0); + AH3 a=n*n; + AH3 b=n+AH3_(1.0/1023.0);b=b*b; + AH3 r=(c-b)*APrxMedRcpH3(a-b); + 
c=ASatH3(n+AGtZeroH3(AH3_(dit)-r)*AH3_(1.0/1023.0));} +//============================================================================================================================== + // This computes dither for positions 'p' and 'p+{8,0}'. + AH2 FsrTepdDitHx2(AU2 p,AU1 f){ + AF2 x; + x.x=AF1_(p.x+f); + x.y=x.x+AF1_(8.0); + AF1 y=AF1_(p.y); + AF1 a=AF1_((1.0+sqrt(5.0))/2.0); + AF1 b=AF1_(1.0/3.69); + x=x*AF2_(a)+AF2_(y*b); + return AH2(AFractF2(x));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC8Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(255.0))*AH2_(1.0/255.0); + nG=floor(nG*AH2_(255.0))*AH2_(1.0/255.0); + nB=floor(nB*AH2_(255.0))*AH2_(1.0/255.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/255.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/255.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/255.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/255.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/255.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/255.0));} +//------------------------------------------------------------------------------------------------------------------------------ + void FsrTepdC10Hx2(inout AH2 cR,inout AH2 cG,inout AH2 cB,AH2 dit){ + AH2 nR=sqrt(cR); + AH2 nG=sqrt(cG); + AH2 nB=sqrt(cB); + nR=floor(nR*AH2_(1023.0))*AH2_(1.0/1023.0); + nG=floor(nG*AH2_(1023.0))*AH2_(1.0/1023.0); + nB=floor(nB*AH2_(1023.0))*AH2_(1.0/1023.0); + AH2 aR=nR*nR; + AH2 aG=nG*nG; + AH2 aB=nB*nB; + AH2 bR=nR+AH2_(1.0/1023.0);bR=bR*bR; + AH2 bG=nG+AH2_(1.0/1023.0);bG=bG*bG; + AH2 bB=nB+AH2_(1.0/1023.0);bB=bB*bB; + AH2 rR=(cR-bR)*APrxMedRcpH2(aR-bR); + AH2 rG=(cG-bG)*APrxMedRcpH2(aG-bG); + AH2 rB=(cB-bB)*APrxMedRcpH2(aB-bB); + 
cR=ASatH2(nR+AGtZeroH2(dit-rR)*AH2_(1.0/1023.0)); + cG=ASatH2(nG+AGtZeroH2(dit-rG)*AH2_(1.0/1023.0)); + cB=ASatH2(nB+AGtZeroH2(dit-rB)*AH2_(1.0/1023.0));} +#endif + + +void CurrFilter(AU2 pos) +{ + AF3 c; + FsrRcasF(c.r, c.g, c.b, pos, con0); + imageStore(imgOutput, ASU2(pos), AF4(c, 1)); +} + +void main() { + FsrRcasCon(con0, sharpening_data); + + AU2 gxy = ARmp8x8(gl_LocalInvocationID.x) + AU2(gl_WorkGroupID.x << 4u, gl_WorkGroupID.y << 4u); + CurrFilter(gxy); + gxy.x += 8u; + CurrFilter(gxy); + gxy.y += 8u; + CurrFilter(gxy); + gxy.x -= 8u; + CurrFilter(gxy); +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/FsrSharpening.spv new file mode 100644 index 0000000000000000000000000000000000000000..b2e30e1fe82d1a17c9b837a184d99a88f1c112bb GIT binary patch literal 20472 zcmZvj37nNx`Nv<F83qv%MNt%i0YOwmQB-hcQ4j?|aH%YD*hgW8hJm0^QE@9ZHM3l* z(KORsS~N>j$;`DZGxsIRT++%V%U%E9@4fFcoH?KWjpO<Kp5OC4=Q-!R@4fH6XR7Hs zq`oM+6dM=ai(PAq%4@TtE0zLVw@N>G>a3|Fm$l6sIeO3Cbm&!79rf9?s4wd1YiUaw zo0{p0PrAZ!=}o-$&1mCj`rB&wA1=jBkUBil*g7md(8f<1HGAdo1*7%_*WterePadJ zkr(|a_2nJ<rX6~>qObad1@o1w|Ef9qU|-fcw~}uQaLa-PmGQR(&uMB~MhEX!Y)emk z)3hXkHEfMN;`@P{nwM4Lw*ha6uP+*mpVqi?k4dAPLG>Rta}1zgS>l28t4chG{>v3y zUksxkKdq_x|BD}j7V+DIBYr5j8edoJX#C8%ORK|I#~t1gS67TspViow>N|Jn^~J90 zGg?<pn?EnfQq40GE%srwelwRW9kX&wWhb299vympu^)KSveub%8<$OOX`DA{_==|8 zV?XVe((1VGCT#GV(!6we+oWXG%y}SsYsI&wavd!!@p$?<6<l9Tp)bdp*wQ?F#q!av zd$s1tXmeYd8LO^11iWlfW9!o9`ODn!I<Bml8)B28S1jLsU(@JQTTH`eY12wwEBx>( zH+6+iuX3jXuPJ8GH!(r+5#Z|h-HM~Z6PLHPPHI}*HouiRSrL7X!oIYndOe*aZftE_ zLhsmTWc;j7yrwu8UR{UU;`~lpZE;z~ugdt<8NVju*Jk{>j9;Je8!~=l#&6E}ZSZ9+ z%UkEpug>)W_@Y^EP<8E|P%rVb;F#xn@P;+2DPDxnqq$zMWZG+R(pR+CGwludf>L`Y z)82(IDzy(Y?GyMMt_Jt6OBXO#g1A=9RUobh&nta;h)bVM#HCMf@ch!JkGS;dD=vNd zfftlM{l%rvKym3a7`&+T87eM)hKnmcwZ+ch4Qp0g?2_?aGrniW_s;m(j31cs@fn|( z@hKTUEaTHMJ}cu#W&D^<yr%d9e9qMBz8~L7t1aer;x)wy@Oh>0;!av^v9uGfDOSMe 
zm%b}IX|=`bPQ0c#4ZfiCUE4{kEza)5Yl`#Xi%Q=MI%&1V#hrLfaT&asH+--6D>`Yl zMSI4t$@ujd|5?Uw&G>B@zdhr3Wc<#Iugmy78NWB<_h<Zpj6am|M>76s#{Zo0r!xL5 zd`a_Y?w;y;K9}h)WPE+bU#iA)$5rEB&h%F^{#wT0sK#@bR^#8y^tUtqPR8Fie&vSx zKgjftGX6=%Yr0nAJL<26r|%QpGTuGon^fZ`ME$)oy?4ep&-hmI`Hd$|Ea$%sd{I*y zPe3y(*Z088e@MoMb@)%N`0teIBQrj#!+%1>f6q)GoALch|3*Ikz27Eg`k@)0*5O~R z=g3SyHsi;2_*d(hpXrM;-U46TGRKdNN<D2QcLS|&Rmmrm{N$2DEBYFEy1%Dod~L?h z%=kGOzYyL!cPZ~&)%+J_`Xw2^v=gr>9*55@W1keaH18I<p6cXVTRffdXEOe5#-H!R z>xx(5O-mL|U*5)pW@XH};w@~IXTtf-P0b4nz5&!%Y}T;BK6_qcTO;dNQ~U>iU#|5# zo$}Wf|Lw$UiVxunSFXahd*yr4!o{<vw#;o@Jf(RBU0YL2^OS=k*QcFg`R-Ih?BdzS zw6vZ$xwU2aQav5B7xr}j`euB)jBnqGcPn;;H#M)AIe%Hp;^me=0o|xiyRS}Kb=sAA z?_BDBM;ZZcS>Dzui8-|U>a1;8yOE_nnR)o^H<-q>i{K4l-kvJly;3(sU9Rry5Gr!t zd=05~_|#Nvn)P>|26t#xZhUR2`#p7I@U|s)z2)1Ke0a(IuIG5MKIwScVm#L;9Z&9f z>c+>~r19D!-nB{N<;K$$eelw+0VWss1$CnzRB5$bwMC_AYhae$>BZW&g!iN$MDsNW zz6rgWcKv&mSbGDJn<rdb*tY{m-WYQ$;zyPK)jDg7-Qn7dbNs#N2hqCF^xvCaoBq}_ zhTeSI)yL91j{9YO`_OCC&-xCaH#TapH_jW%IT>f(gXz_@H?R+r=#2}zK8L~evEBw& z&Ns6>w?4*L%aNt7ZD+Xi@VkO9xpl9uur&t;Ut8f--TKa>efqt(P7eLjsy<Gx-cEC^ zf?o-C@B8tc^%i=^P)}dMx5e{T1pS=f&r2t5`<C4M&zEa<AHDfvFYX7s9@=C4N5H;< zKL*y{_^0WOw?4;vhTeGX`nx{Q(>~=jf<17LU!b{W+Qarz$+bt^TVUrEK5v7y=@Yj1 zORn8GYnfeLjE?ocyu#NU5c*?P{qEpfJDFn5I&(VjesK3td-x2f`iu*HN`;TR%Q-&* zc3iJB*I4f7nKt+A<q}ti#aO;D<6T|1S0udjHHX~yJzoP@UElM}=Xh^}m9*(A_dQRW z`u|F-zkBo%-0Ld#=wrCGXg6N&d!jbubrc&Dy6=(NoWJ&ZxU~o04BkEAJHW@mM^Lrn z_@1h*9h=QL`5t;7*fDnk<5sabrrh^YZTk9t8NR!ec5V8~eb3Z)FZR&h+I(Ry-osKy zukXS1es+6}%I*E^4(@&JXLoSl!~A>>uHK8JDfe?Y<$eyQ+|Ob8c&c!3-le%;UdQ`W zgZuPeX+MC9tw%l<g|2v?|M)wXb*mY35c~tMHe<Zr<c{H_yU^TI?_K@fv#zCGf9E70 zi@)!EHQ>?o+9O9DHupWv(G9NsUgFHz9jwjkA>uXy8`q8=WA}h-kGPG&+Kih~*3c7d zKD9NaZ4<Dysadysqs_e5y(w6mn)@PmPMC_`;G^laTbp~iIavR6PJOUxcTDHB1z4Nc zbi{25jyT7X>lbxwRqAS95AvvEYp``_k9_^W=4;0uYqSkE?NP_JU~Op~{oxVkSaSWM zjsc~v=5;8KItGHRLwnRQ2psc@IvTKPKdoGk!C-A^9Yf#|=U8(6qK=`ZuIBYGk2;2d ztwa0AysM0%dEa^;cn09h_maJd^}gAN<~_VIz4`U~OlkiFyEt;Vf4ynGdeO|`zHdr1 
z$7b{~&tprwImBs>zHnbY3q_7T3Em>Xo+FrhEBeTN99aM0v%$u!rvAO?kEdx1y%B7l z(C2`)J<M#K(_ET4kEf|wv$nKmpJ#ogHE#`%`uio=XI*RVPaicb0P7!oA=sGUO<?1q z#uLEW9%PrS@kE+AjPt!s-8!{Lo$g%&&6m%UvBv`vJTSq7z}7jK-a3qLrs*HN1#C=k zem3Sa#m}$$ivJhEU1-B;`l^T43J%S@`dYL0wB})OUun%l;8Fkf2_6cz<{jvxrsZJ$ zgRcM^6Z}hH<D#yWVC&MRuX<>!z@eE}Uu)K$*1RL!S9&kG2G+I{eSH3%1ol4jUKRW7 z<Gt!ypG@<f)E>2-0yfXtrOz6$e%h@=p4Kq}?klZhXSj9jLLYUU0k)3dXO%q0JsobG zb?Up87ImHpwodKp#-{T)yVSM!q!Q<L4$XBmr@76cpL0-4=ddf>S2~B0aOdEU7E$xL z;FyE&JLe}{`(-7!Mz!;3&cXZV>oniHw8xw;06UKNoIV%Qw8eY#MPTE!>*Kt>LDTLS z)+3L67lXA0|7OWgDPu2zYYY8bV8=hB-0N?H_0evA=W{77_Th4{`=I^QGUf`facYt0 zJ78;3bG_xU-lM>yX})%+#rxMDVAp$3de__ay^5Bu_thD{Cgay;{JM-^pYa<qeq+XO zF1hRF{(PTiovw|w$;H;yUSeyyk~WsgjrjqsFHL{_)Fa;y!I96}<>s?)bBL`u&G#d? z`SjOMT|eid7P)T%M{egO*Uve%)1tl~gZ*qVUO)ACX8Q@)`t@-i<*|=r!TZsC?L%`P z$I$Ofb07Dox0YM!e?hyg<a^WqlBVB0^jqj}r|C1BH8bxWrA@!Cv_-Td%Iob{V0B+% zyA!O<`#o%Tl{VwUc6Vv3p`THXwGM1fa~PwKb6rOtb2Y~Vny&+B@wsqdg2yFzJlHju zNS{6fnP&>k*Cd*G4yK=6;(Gc+XvQ8&?|pDT{R1>L*Rvb_Z)p0(eSL3f>xruVTbgUJ z9Zk)B*I#?&co3{D_(LUk@9zUYOw&iZ@z$U%{C@{FKHj^257rjXo__#4uSaOc%k?qd z{C}jG)0%tIKT5M^uO+p}^Cz%*+G*yI>l^x?!Rn9E)a8yD`r}~rD{1O-^?YAuV)HeX z7W+Fb!G|Y!dWjFCpHX7>@d%oGH;dlA^L}`OR$uaG=$}ov`uc>cznF0KR}!xNTEf*| zPq_LU30Hq7;p*=uT>Zm@TjwVk@4}`;{r0sP_dGn+dt}^m@l^M0JmsE`r`$90lzUE| z^8Ok3ygbzhXMAYJJv&eRJwH#m=jSQ+{5<8JpQqgO^OSpjo^sF6Q$9Z9o}Z_>=jSQ+ z{5<8JpQqgO^OSpjo^sF6Q||eB%8$?Zyo{faanI4yc+b&O?m2qOJxfpd>WrV3@wFNE zJUxx~JU!)}si)jC^^|+2p7JX)-k$MmGJbu=Jy%cjd9I#v&(%}zxq8YyS5LX;>M38B z@q03UZ^k`WPvbpTPx(U`_gp>IJy%b;=jti<Ts`HUtEc?AjC-!0>gzN9QpP=3PyIbt zPx)&Z_gp>IJy%b;=jti<Ts`HUtEb#^^^|{<anIFL-E;Mnd#;}HZW-^ManIFLf6vuZ z?zwu(Jy#EYKkvU=C0x7b>Y>{Y%(&<3sXi>@o~x(2=jtgRmGM0@J~rc?tEcgvtEb#^ z^^|+Ap7J9zer(2%%ed$2X}ss^Dfe7G<!u>XmGP4^z9!?ItEc%qS5Ntw8TVX0)i2Dr z=jy3`NyaZNxzA0b*xx5<K6Cio<+JotrOjs>pEczAsy#>3=5LPjzm&EY==EP;+Wda! 
zbN63qKGUzKP3PP2)AWC%X}9^}cHjx88t=1=dW`pXa5~=0aBVNq9Pbr+Z86@fV8_!Q z<NX7y-R3JD&u1g`81J88$1}IjLGsA`FR;0_NA7=vwcC89xqYrukKAv9)4BWyuI+7_ z<M~{rEynvV*zvT-c>e=yxA{uP^EpjD#(NK(j`smv+xs-f^EpjhjQ0`P@wCTyAA_~q ze5K>P3s?8KxSsw~n)~Oour+j}H$c1N`5Y(@|E^&D+iBrngH5~sJ}1h<zYeUw&*kCY z4V!lTeU6lee|NC^o7S@tHtqV?)62tuW3c}1N&cSLwCmr4ULO9v!1|~4Y>G|0{ywM5 z!+$fd*F;*+=Ge6B?{lm?{I>w>-%gA9Zi!91{yyi*!+$HV{%Jj1W7Dp`&%yHW-v;b8 zlh(5>HtqWRoGcIj?ZEoC)1saM*tF~4pI#pRgTVTy^)z79uD{RO^6(!5_L@rT*&dsA z{Rh*_!+#i9|8`o`vjaBm`VXa-hyRXX{nL7O!lqsS;q>yzH3IB4tL8JkJhYKuuTM3f z@8zNG2KHK1^I2c+ZzI0PjRH@gH=l9p5w|;7yK#;y4{Z;y{%VdZ4{bEqJZg?B4{a~7 z*SXrB^zzWgfW4;G_NJGIwh!3rSIzGq@|eTEVCP^y<J2Q=Kd^S=99JIN{$Typ99JIN z0bui}RmM%UabWKaHNX4FBW?oN`#^0xy*#voz+Ure6Y1qKhl9b+!F<N4N8BW^cH<mZ z9@=EE{%VdZ5A6`JdDI+N9@?Q`?<ch>^zzWAg1vXt4x^Wcb~xDkL~R<qJmxSR>>SKz zoO;B~0BbkSapj>M0oGs5apj@S1e-_Aapj>M2_9T(v*_ia9S!z=GwvvQd1#*jdvB>7 zLobgxd=~5+%x9c>#C;B|-8jdUhjuJje>KOIhxU1}dDI+N9@=r>xHpdn&!+kL=Vw(T zxCg!V_}zI9*ci3QF&7-~3-iEfj`?tNXpbBVz{aRWj)mYZv{TEm7lHND9=0a1<EuHB zFVJg?94CS+_X^r#a9YO_xOHfcI-0@8s6~zza6QfZOX<@bUxb@Od*o;Z8>1FEmVv#W zqwY4ae%iyf9PIdNj&%aPw#e}%aNMga!D$_<;MSo%>i9C)7`4dp6>x8w`B&4YIZlF` zLwn>n8ElMN<TwS~hZc3O0qdtdY^Q=9U(K;rsL>+F>EMB-ww6Av;|#cUXpcJ11RJ9k zInDxm-<tny`ZUKmaC2yn9A5<+qZT>72JTOby3Ymcr#)=vfgNAXu}-7c7CF8S9$sn} z(5H1=2)7RHQO8AKW7HzYH^AP%=D(Oe&GAjRIkZQPOTfmcMUHQQhti_%Z-e#I9=1!t zj<4oe=hJHwUqOF4E$;R2fE`zRjPYHt_gK{NJ+OY-&CyPu=C~4W4(*ZSDzMjf<oG^V zKkZTH)nIe0nfDs{H17}K=G7i~uLZj|k@tsS{j^8k>%it#Gw=2EY2F{f&8t1~-T-!; zBJYi0{j^8ko51E)i@2M?UiYV$?_WO#>!;nZFQeBM{yzb0i}Qq^g89d{grZOjzgxiW zfjNFgFAweKVAoddR(g48zW`g0+HLf5KVy7f|0TGA-h9TX>vuc7TIBc@*fmzWgI*rm zU0~Nq?M`}m<i8u-Kz{QXryg;?2FHBXf#sq71|0Lb2P_Zmx8Ru1y<mBacOQ5d<C)Jm zb?dsHUM+Gw2#)zY0G5aLFgWJ(5Lh1he+M?d`HWMKef~Yz`Re1CkI-w2`|1ziU5V8m z*VG@u?zfsTkJ4+4m_LC>5Tjk)>qMLQ&-9Pc!uN5o*Rb}O%M)N@V!S88`e`@EQ}nBe zi5ySE&7nQ6@n^ussYMOiVvOg(@tOY|SRUG6!1XlaUZ9tU_E&JfQd>_ik9GbV*mX9a zaq1EGB6u(@{)Xf4VE$G9zQZ_m$9tAuE#h7QcPX`(>E)sQ1Kgw3UZt0Z_ByzKsl7%o 
zk2?PewodaIryg<t0uQA{o&N^&uTrOR>el%Zy;{V*1+Fc%H|gb}{Rg~Bsl81v5ADC; zfu;5iy*%ptAJ{t0XPkP(y$c>ri#p!}^RH5;aq8Cj2EAIueF%<y{{Sox?PGB4`$u4T zXrF>(-#-D%qfWmFHZZpNj8l)eE@<h#cZDNY{(i<db?bZ|qguq(sbN#|_hWfz_2Ag| z%5TYuwh=h?-QShvQD+Zu17n-dIQ7W6F*x1#p75yCICX3Bw{5kE+Y}u8?(fv{&^80d zzWbZCJhVRG*mr-wmPegifUVPf#;Hf#mf&>X`@*A6<J7Iw-_X^>{vO_s7CE*Bd!GgG z5AO|+YiT>UIfv5pRX2~nn-8GH`_3S+?>pKfR|D9+jQ6g=aQ(EK!{6GqMUL&kJCZ|t z#0&*ngPJk^ey=UY*a7VQ7koIpPg4Jm@Tgy3b@TW;ep>&|aO>9|xkiB9<EUj9uzuRj z;cx%iqW)dM)~`KccGE^vGsg1-Z865~VDHP|d%*i9^^b-}{rak#$8&|W{=MMVuRU_@ z4R-INmN8)cw41}T2yIdSK49zD9x?lZtwGHg&o{J<=h?{LMfV4LO{KqU9RSxJbsPxx zItV@vo?c_);m&b5O<(o+Or8KXpFWO1kv^UGL2&1-J@Ow6_L`14OoHpDJ?fqew(fRf zBkmBm_W0bK0yb8=V;@SdE!OKWu<NBg#+nMYW;J6x-_aK19uCgeYdSXVQO69h*In=< z!09zT6CUfOuX?Q4EU@|Xar`6c)Ac$E?s{pD{6~Ymhhh%L!1dD}b$<q&uh(a>X^-{# z9N1Xxj(se>wpg#vgIzD}G1eEr)~sgCG<t0@?rd<rUdLn89`!eZz0QNr0jKxSTzIUP zzUr}F^T6iQ$MNUWr|Y!<?s{pD{0qU}voVK7aQ(DL-A&+py-vWUJ=W_)u(8@5dojJX zSg$2u*Gqeh)eN>~HDivW*A}%c1#d%(zjgW|SX=0=V6Vw|e_sZUwb57Iye;%<5w{#X zj23xUfVGAGC9r!Jc~^oXufFQ$ZKGF<xG#f8(IW3xz}iAz4R(%^_at!S)mPoTtLW9l zYv@m*#X6n}b`P~j-qXR(QUBBE<)NJcwobLR^zzWo0y~!4ne_7btUepuz&PeJPF=rq z<k+L0ufnZId(7)=VCUuMta<ba?c7rH{n5GT>vgsX{5-IKM-}hCJ;CQ=(;j~V`*pCk zxP~qOn^U{>olLJyd=Y(m{eJ^)t=eM^E(RMD`MwF(PrErTp-;b)d<$+4?Q!jY8*H4K zF_+S7i#56o>~}5g>KD>$iyAHmTSM?Gz&^`HP2Yj*qdk1S3r=Ic2lp8_V%p*QXpcIt z1e;4O@>~T@^L!sZILUK0Tp#U`=NhoN)FRIhz-gXq;XYSKJwJr&qdoFm2R4^l<hdT~ z=V|!-2(F*@@VNo(8b<CL!TM=;eW$T@+QR=Pa9Z2VaG${={>N~Av`5TOz-i1+;Xa#3 z%+KKZXpdTN0h>!L^4tne^ZXp{^LXUB4X%&&$ny)Zxzr-hFTrV^+u?p6h&*?|_0b-A zeg!s{TI9JC?B{Rz+y&Q9d-&W9PS<xGTtDru@8PVkw($QoIIZm-cx_VKZ{Yf9kC=PG zY0Pioey51}+y~c3d(?VA*j#Fn=K*k<=RvsNBO=d3aDB8#o`=EaQj0vl1E+a@5BGaa z<aq?HkM_v(2e7%+BF`VeeqM*qqj3GShtHqD>H0nf*H63aJDv5_mj9jV(b#<*N%QYi z{TtMyX#U;lG4%f3sekABII%v@%I%-Z_|qAGCgJ9MHsJ@upD(%f`*$eM!5!CH^_8m~ zgzW|JlQcEs<YW1NdeIes;XnSZt@T)g;~KADtj&6``uOsh{I6i=<7a|7<@$vG-@xVy z{};jfx0n8MeZv3mV8;#rm%#da)@**cKKj2*{}0-$C3o(xft`E&4bJOuZTkB+Gyepe 
tFaGZQUtn#%!uIdd#%0oBd!w}J8h@YuCd7FC)%_fr%Q)8Lxax6@{2w`Cd9DBe literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl new file mode 100644 index 000000000..f197c64ca --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.glsl @@ -0,0 +1,1177 @@ +/*============================================================================ + + + NVIDIA FXAA 3.11 by TIMOTHY LOTTES + + +------------------------------------------------------------------------------ +COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED. +------------------------------------------------------------------------------ +TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED +*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA +OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR +CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR +LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, +OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE +THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + +------------------------------------------------------------------------------ + INTEGRATION CHECKLIST +------------------------------------------------------------------------------ +(1.) +In the shader source, setup defines for the desired configuration. +When providing multiple shaders (for different presets), +simply setup the defines differently in multiple files. +Example, + + #define FXAA_PC 1 + #define FXAA_HLSL_5 1 + #define FXAA_QUALITY_PRESET 12 + +Or, + + #define FXAA_360 1 + +Or, + + #define FXAA_PS3 1 + +Etc. + +(2.) +Then include this file, + + #include "Fxaa3_11.h" + +(3.) 
+Then call the FXAA pixel shader from within your desired shader. +Look at the FXAA Quality FxaaPixelShader() for docs on inputs. +As for FXAA 3.11 all inputs for all shaders are the same +to enable easy porting between platforms. + + return FxaaPixelShader(...); + +(4.) +Ensure pass prior to FXAA outputs RGBL (see next section). +Or use, + + #define FXAA_GREEN_AS_LUMA 1 + +(5.) +Setup engine to provide the following constants +which are used in the FxaaPixelShader() inputs, + + FxaaFloat2 fxaaQualityRcpFrame, + FxaaFloat4 fxaaConsoleRcpFrameOpt, + FxaaFloat4 fxaaConsoleRcpFrameOpt2, + FxaaFloat4 fxaaConsole360RcpFrameOpt2, + FxaaFloat fxaaQualitySubpix, + FxaaFloat fxaaQualityEdgeThreshold, + FxaaFloat fxaaQualityEdgeThresholdMin, + FxaaFloat fxaaConsoleEdgeSharpness, + FxaaFloat fxaaConsoleEdgeThreshold, + FxaaFloat fxaaConsoleEdgeThresholdMin, + FxaaFloat4 fxaaConsole360ConstDir + +Look at the FXAA Quality FxaaPixelShader() for docs on inputs. + +(6.) +Have FXAA vertex shader run as a full screen triangle, +and output "pos" and "fxaaConsolePosPos" +such that inputs in the pixel shader provide, + + // {xy} = center of pixel + FxaaFloat2 pos, + + // {xy_} = upper left of pixel + // {_zw} = lower right of pixel + FxaaFloat4 fxaaConsolePosPos, + +(7.) +Ensure the texture sampler(s) used by FXAA are set to bilinear filtering. + + +------------------------------------------------------------------------------ + INTEGRATION - RGBL AND COLORSPACE +------------------------------------------------------------------------------ +FXAA3 requires RGBL as input unless the following is set, + + #define FXAA_GREEN_AS_LUMA 1 + +In which case the engine uses green in place of luma, +and requires RGB input is in a non-linear colorspace. + +RGB should be LDR (low dynamic range). +Specifically do FXAA after tonemapping. + +RGB data as returned by a texture fetch can be non-linear, +or linear when FXAA_GREEN_AS_LUMA is not set. 
+Note an "sRGB format" texture counts as linear, +because the result of a texture fetch is linear data. +Regular "RGBA8" textures in the sRGB colorspace are non-linear. + +If FXAA_GREEN_AS_LUMA is not set, +luma must be stored in the alpha channel prior to running FXAA. +This luma should be in a perceptual space (could be gamma 2.0). +Example pass before FXAA where output is gamma 2.0 encoded, + + color.rgb = ToneMap(color.rgb); // linear color output + color.rgb = sqrt(color.rgb); // gamma 2.0 color output + return color; + +To use FXAA, + + color.rgb = ToneMap(color.rgb); // linear color output + color.rgb = sqrt(color.rgb); // gamma 2.0 color output + color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma + return color; + +Another example where output is linear encoded, +say for instance writing to an sRGB formatted render target, +where the render target does the conversion back to sRGB after blending, + + color.rgb = ToneMap(color.rgb); // linear color output + return color; + +To use FXAA, + + color.rgb = ToneMap(color.rgb); // linear color output + color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma + return color; + +Getting luma correct is required for the algorithm to work correctly. + + +------------------------------------------------------------------------------ + BEING LINEARLY CORRECT? +------------------------------------------------------------------------------ +Applying FXAA to a framebuffer with linear RGB color will look worse. +This is very counter intuitive, but happens to be true in this case. +The reason is because dithering artifacts will be more visible +in a linear colorspace. + + +------------------------------------------------------------------------------ + COMPLEX INTEGRATION +------------------------------------------------------------------------------ +Q. What if the engine is blending into RGB before wanting to run FXAA? + +A. 
In the last opaque pass prior to FXAA, + have the pass write out luma into alpha. + Then blend into RGB only. + FXAA should be able to run ok + assuming the blending pass did not any add aliasing. + This should be the common case for particles and common blending passes. + +A. Or use FXAA_GREEN_AS_LUMA. + +============================================================================*/ + +#version 430 core + +layout(local_size_x = 16, local_size_y = 16) in; +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImage; +layout(binding = 2) uniform invResolution +{ + vec2 invResolution_data; +}; + +#define FXAA_QUALITY_PRESET 12 +#define FXAA_GREEN_AS_LUMA 1 +#define FXAA_PC 1 +#define FXAA_GLSL_130 1 + + +/*============================================================================ + + INTEGRATION KNOBS + +/*==========================================================================*/ +#ifndef FXAA_PC + // + // FXAA Quality + // The high quality PC algorithm. + // + #define FXAA_PC 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_120 + #define FXAA_GLSL_120 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_130 + #define FXAA_GLSL_130 0 +#endif +/*==========================================================================*/ +#ifndef FXAA_GREEN_AS_LUMA + // + // For those using non-linear color, + // and either not able to get luma in alpha, or not wanting to, + // this enables FXAA to run using green as a proxy for luma. + // So with this enabled, no need to pack luma in alpha. + // + // This will turn off AA on anything which lacks some amount of green. + // Pure red and blue or combination of only R and B, will get no AA. 
+ // + // Might want to lower the settings for both, + // fxaaConsoleEdgeThresholdMin + // fxaaQualityEdgeThresholdMin + // In order to ensure AA does not get turned off on colors + // which contain a minor amount of green. + // + // 1 = On. + // 0 = Off. + // + #define FXAA_GREEN_AS_LUMA 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_EARLY_EXIT + // + // Controls algorithm's early exit path. + // On PS3 turning this ON adds 2 cycles to the shader. + // On 360 turning this OFF adds 10ths of a millisecond to the shader. + // Turning this off on console will result in a more blurry image. + // So this defaults to on. + // + // 1 = On. + // 0 = Off. + // + #define FXAA_EARLY_EXIT 1 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_DISCARD + // + // Only valid for PC OpenGL currently. + // Probably will not work when FXAA_GREEN_AS_LUMA = 1. + // + // 1 = Use discard on pixels which don't need AA. + // For APIs which enable concurrent TEX+ROP from same surface. + // 0 = Return unchanged color on pixels which don't need AA. + // + #define FXAA_DISCARD 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_FAST_PIXEL_OFFSET + // + // Used for GLSL 120 only. + // + // 1 = GL API supports fast pixel offsets + // 0 = do not use fast pixel offsets + // + #ifdef GL_EXT_gpu_shader4 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifdef GL_NV_gpu_shader5 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifdef GL_ARB_gpu_shader5 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifndef FXAA_FAST_PIXEL_OFFSET + #define FXAA_FAST_PIXEL_OFFSET 0 + #endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GATHER4_ALPHA + // + // 1 = API supports gather4 on alpha channel. + // 0 = API does not support gather4 on alpha channel. 
+ // FXAA_HLSL_5 is not defined anywhere in this GLSL shader; guard with defined() because some GLSL preprocessors reject undefined macros in #if. + #if defined(FXAA_HLSL_5) && (FXAA_HLSL_5 == 1) + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifdef GL_ARB_gpu_shader5 + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifdef GL_NV_gpu_shader5 + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifndef FXAA_GATHER4_ALPHA + #define FXAA_GATHER4_ALPHA 0 + #endif +#endif + +/*============================================================================ + FXAA QUALITY - TUNING KNOBS +------------------------------------------------------------------------------ +NOTE the other tuning knobs are now in the shader function inputs! +============================================================================*/ +#ifndef FXAA_QUALITY_PRESET + // + // Choose the quality preset. + // This needs to be compiled into the shader as it affects code. + // Best option to include multiple presets is to + // in each shader define the preset, then include this file. + // + // OPTIONS + // ----------------------------------------------------------------------- + // 10 to 15 - default medium dither (10=fastest, 15=highest quality) + // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality) + // 39 - no dither, very expensive + // + // NOTES + // ----------------------------------------------------------------------- + // 12 = slightly faster than FXAA 3.9 and higher edge quality (default) + // 13 = about same speed as FXAA 3.9 and better than 12 + // 23 = closest to FXAA 3.9 visually and performance wise + // _ = the lowest digit is directly related to performance + // _ = the highest digit is directly related to style + // + #define FXAA_QUALITY_PRESET 12 +#endif + + +/*============================================================================ + + FXAA QUALITY - PRESETS + +============================================================================*/ + +/*============================================================================ + FXAA QUALITY - MEDIUM DITHER PRESETS +============================================================================*/ +#if 
(FXAA_QUALITY_PRESET == 10) + #define FXAA_QUALITY_PS 3 + #define FXAA_QUALITY_P0 1.5 + #define FXAA_QUALITY_P1 3.0 + #define FXAA_QUALITY_P2 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 11) + #define FXAA_QUALITY_PS 4 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 3.0 + #define FXAA_QUALITY_P3 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 12) + #define FXAA_QUALITY_PS 5 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 4.0 + #define FXAA_QUALITY_P4 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 13) + #define FXAA_QUALITY_PS 6 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 4.0 + #define FXAA_QUALITY_P5 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 14) + #define FXAA_QUALITY_PS 7 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 4.0 + #define FXAA_QUALITY_P6 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 15) + #define FXAA_QUALITY_PS 8 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 2.0 + #define FXAA_QUALITY_P6 4.0 + #define FXAA_QUALITY_P7 12.0 +#endif + +/*============================================================================ + FXAA QUALITY - LOW DITHER PRESETS 
+============================================================================*/ +#if (FXAA_QUALITY_PRESET == 20) + #define FXAA_QUALITY_PS 3 + #define FXAA_QUALITY_P0 1.5 + #define FXAA_QUALITY_P1 2.0 + #define FXAA_QUALITY_P2 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 21) + #define FXAA_QUALITY_PS 4 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 22) + #define FXAA_QUALITY_PS 5 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 23) + #define FXAA_QUALITY_PS 6 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 24) + #define FXAA_QUALITY_PS 7 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 3.0 + #define FXAA_QUALITY_P6 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 25) + #define FXAA_QUALITY_PS 8 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 2.0 + #define FXAA_QUALITY_P6 4.0 + #define FXAA_QUALITY_P7 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 26) + 
#define FXAA_QUALITY_PS 9 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 2.0 + #define FXAA_QUALITY_P6 2.0 + #define FXAA_QUALITY_P7 4.0 + #define FXAA_QUALITY_P8 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 27) + #define FXAA_QUALITY_PS 10 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 2.0 + #define FXAA_QUALITY_P6 2.0 + #define FXAA_QUALITY_P7 2.0 + #define FXAA_QUALITY_P8 4.0 + #define FXAA_QUALITY_P9 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 28) + #define FXAA_QUALITY_PS 11 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 2.0 + #define FXAA_QUALITY_P6 2.0 + #define FXAA_QUALITY_P7 2.0 + #define FXAA_QUALITY_P8 2.0 + #define FXAA_QUALITY_P9 4.0 + #define FXAA_QUALITY_P10 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY_PRESET == 29) + #define FXAA_QUALITY_PS 12 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.5 + #define FXAA_QUALITY_P2 2.0 + #define FXAA_QUALITY_P3 2.0 + #define FXAA_QUALITY_P4 2.0 + #define FXAA_QUALITY_P5 2.0 + #define FXAA_QUALITY_P6 2.0 + #define FXAA_QUALITY_P7 2.0 + #define FXAA_QUALITY_P8 2.0 + #define FXAA_QUALITY_P9 2.0 + #define FXAA_QUALITY_P10 4.0 + #define FXAA_QUALITY_P11 8.0 +#endif + +/*============================================================================ + FXAA QUALITY - EXTREME QUALITY +============================================================================*/ +#if (FXAA_QUALITY_PRESET == 39) + #define 
FXAA_QUALITY_PS 12 + #define FXAA_QUALITY_P0 1.0 + #define FXAA_QUALITY_P1 1.0 + #define FXAA_QUALITY_P2 1.0 + #define FXAA_QUALITY_P3 1.0 + #define FXAA_QUALITY_P4 1.0 + #define FXAA_QUALITY_P5 1.5 + #define FXAA_QUALITY_P6 2.0 + #define FXAA_QUALITY_P7 2.0 + #define FXAA_QUALITY_P8 2.0 + #define FXAA_QUALITY_P9 2.0 + #define FXAA_QUALITY_P10 4.0 + #define FXAA_QUALITY_P11 8.0 +#endif + + + +/*============================================================================ + + API PORTING + +============================================================================*/ +#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1) + #define FxaaBool bool + #define FxaaDiscard discard + #define FxaaFloat float + #define FxaaFloat2 vec2 + #define FxaaFloat3 vec3 + #define FxaaFloat4 vec4 + #define FxaaHalf float + #define FxaaHalf2 vec2 + #define FxaaHalf3 vec3 + #define FxaaHalf4 vec4 + #define FxaaInt2 ivec2 + #define FxaaSat(x) clamp(x, 0.0, 1.0) + #define FxaaTex sampler2D +#else + #define FxaaBool bool + #define FxaaDiscard clip(-1) + #define FxaaFloat float + #define FxaaFloat2 float2 + #define FxaaFloat3 float3 + #define FxaaFloat4 float4 + #define FxaaHalf half + #define FxaaHalf2 half2 + #define FxaaHalf3 half3 + #define FxaaHalf4 half4 + #define FxaaSat(x) saturate(x) +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GLSL_120 == 1) + // Requires, + // #version 120 + // And at least, + // #extension GL_EXT_gpu_shader4 : enable + // (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9) + #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0) + #if (FXAA_FAST_PIXEL_OFFSET == 1) + #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) + #else + #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) + #endif + #if (FXAA_GATHER4_ALPHA == 1) + // use #extension GL_ARB_gpu_shader5 : enable + #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) + #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) + #define 
FxaaTexGreen4(t, p) textureGather(t, p, 1) + #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) + #endif +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GLSL_130 == 1) + // Requires "#version 130" or better + #define FxaaTexTop(t, p) textureLod(t, p, 0.0) + #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) + #if (FXAA_GATHER4_ALPHA == 1) + // use #extension GL_ARB_gpu_shader5 : enable + #define FxaaTexAlpha4(t, p) textureGather(t, p, 3) + #define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) + #define FxaaTexGreen4(t, p) textureGather(t, p, 1) + #define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) + #endif +#endif + + +/*============================================================================ + GREEN AS LUMA OPTION SUPPORT FUNCTION +============================================================================*/ +#if (FXAA_GREEN_AS_LUMA == 0) + FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.w; } +#else + FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.y; } +#endif + + + + +/*============================================================================ + + FXAA3 QUALITY - PC + +============================================================================*/ +#if (FXAA_PC == 1) +/*--------------------------------------------------------------------------*/ +FxaaFloat4 FxaaPixelShader( + // + // Use noperspective interpolation here (turn off perspective interpolation). + // {xy} = center of pixel + FxaaFloat2 pos, + // + // Used only for FXAA Console, and not used on the 360 version. + // Use noperspective interpolation here (turn off perspective interpolation). + // {xy_} = upper left of pixel + // {_zw} = lower right of pixel + FxaaFloat4 fxaaConsolePosPos, + // + // Input color texture. 
+ // {rgb_} = color in linear or perceptual color space + // if (FXAA_GREEN_AS_LUMA == 0) + // {__a} = luma in perceptual color space (not linear) + FxaaTex tex, + // + // Only used on the optimized 360 version of FXAA Console. + // For everything but 360, just use the same input here as for "tex". + // For 360, same texture, just alias with a 2nd sampler. + // This sampler needs to have an exponent bias of -1. + FxaaTex fxaaConsole360TexExpBiasNegOne, + // + // Only used on the optimized 360 version of FXAA Console. + // For everything but 360, just use the same input here as for "tex". + // For 360, same texture, just alias with a 3rd sampler. + // This sampler needs to have an exponent bias of -2. + FxaaTex fxaaConsole360TexExpBiasNegTwo, + // + // Only used on FXAA Quality. + // This must be from a constant/uniform. + // {x_} = 1.0/screenWidthInPixels + // {_y} = 1.0/screenHeightInPixels + FxaaFloat2 fxaaQualityRcpFrame, + // + // Only used on FXAA Console. + // This must be from a constant/uniform. + // This affects sub-pixel AA quality and inversely sharpness. + // Where N ranges between, + // N = 0.50 (default) + // N = 0.33 (sharper) + // {x__} = -N/screenWidthInPixels + // {_y_} = -N/screenHeightInPixels + // {_z_} = N/screenWidthInPixels + // {__w} = N/screenHeightInPixels + FxaaFloat4 fxaaConsoleRcpFrameOpt, + // + // Only used on FXAA Console. + // Not used on 360, but used on PS3 and PC. + // This must be from a constant/uniform. + // {x__} = -2.0/screenWidthInPixels + // {_y_} = -2.0/screenHeightInPixels + // {_z_} = 2.0/screenWidthInPixels + // {__w} = 2.0/screenHeightInPixels + FxaaFloat4 fxaaConsoleRcpFrameOpt2, + // + // Only used on FXAA Console. + // Only used on 360 in place of fxaaConsoleRcpFrameOpt2. + // This must be from a constant/uniform. 
+ // {x__} = 8.0/screenWidthInPixels + // {_y_} = 8.0/screenHeightInPixels + // {_z_} = -4.0/screenWidthInPixels + // {__w} = -4.0/screenHeightInPixels + FxaaFloat4 fxaaConsole360RcpFrameOpt2, + // + // Only used on FXAA Quality. + // This used to be the FXAA_QUALITY_SUBPIX define. + // It is here now to allow easier tuning. + // Choose the amount of sub-pixel aliasing removal. + // This can affect sharpness. + // 1.00 - upper limit (softer) + // 0.75 - default amount of filtering + // 0.50 - lower limit (sharper, less sub-pixel aliasing removal) + // 0.25 - almost off + // 0.00 - completely off + FxaaFloat fxaaQualitySubpix, + // + // Only used on FXAA Quality. + // This used to be the FXAA_QUALITY_EDGE_THRESHOLD define. + // It is here now to allow easier tuning. + // The minimum amount of local contrast required to apply algorithm. + // 0.333 - too little (faster) + // 0.250 - low quality + // 0.166 - default + // 0.125 - high quality + // 0.063 - overkill (slower) + FxaaFloat fxaaQualityEdgeThreshold, + // + // Only used on FXAA Quality. + // This used to be the FXAA_QUALITY_EDGE_THRESHOLD_MIN define. + // It is here now to allow easier tuning. + // Trims the algorithm from processing darks. + // 0.0833 - upper limit (default, the start of visible unfiltered edges) + // 0.0625 - high quality (faster) + // 0.0312 - visible limit (slower) + // Special notes when using FXAA_GREEN_AS_LUMA, + // Likely want to set this to zero. + // As colors that are mostly not-green + // will appear very dark in the green channel! + // Tune by looking at mostly non-green content, + // then start at zero and increase until aliasing is a problem. + FxaaFloat fxaaQualityEdgeThresholdMin, + // + // Only used on FXAA Console. + // This used to be the FXAA_CONSOLE_EDGE_SHARPNESS define. + // It is here now to allow easier tuning. + // This does not affect PS3, as this needs to be compiled in. + // Use FXAA_CONSOLE_PS3_EDGE_SHARPNESS for PS3. 
+ // Due to the PS3 being ALU bound, + // there are only three safe values here: 2 and 4 and 8. + // These options use the shader's ability to do a free *|/ by 2|4|8. + // For all other platforms can be a non-power of two. + // 8.0 is sharper (default!!!) + // 4.0 is softer + // 2.0 is really soft (good only for vector graphics inputs) + FxaaFloat fxaaConsoleEdgeSharpness, + // + // Only used on FXAA Console. + // This used to be the FXAA_CONSOLE_EDGE_THRESHOLD define. + // It is here now to allow easier tuning. + // This does not affect PS3, as this needs to be compiled in. + // Use FXAA_CONSOLE_PS3_EDGE_THRESHOLD for PS3. + // Due to the PS3 being ALU bound, + // there are only two safe values here: 1/4 and 1/8. + // These options use the shader's ability to do a free *|/ by 2|4|8. + // The console setting has a different mapping than the quality setting. + // Other platforms can use other values. + // 0.125 leaves less aliasing, but is softer (default!!!) + // 0.25 leaves more aliasing, and is sharper + FxaaFloat fxaaConsoleEdgeThreshold, + // + // Only used on FXAA Console. + // This used to be the FXAA_CONSOLE_EDGE_THRESHOLD_MIN define. + // It is here now to allow easier tuning. + // Trims the algorithm from processing darks. + // The console setting has a different mapping than the quality setting. + // This only applies when FXAA_EARLY_EXIT is 1. + // This does not apply to PS3, + // PS3 was simplified to avoid more shader instructions. + // 0.06 - faster but more aliasing in darks + // 0.05 - default + // 0.04 - slower and less aliasing in darks + // Special notes when using FXAA_GREEN_AS_LUMA, + // Likely want to set this to zero. + // As colors that are mostly not-green + // will appear very dark in the green channel! + // Tune by looking at mostly non-green content, + // then start at zero and increase until aliasing is a problem. + FxaaFloat fxaaConsoleEdgeThresholdMin, + // + // Extra constants for 360 FXAA Console only. 
+ // Use zeros or anything else for other platforms. + // These must be in physical constant registers and NOT immediates. + // Immediates will result in compiler un-optimizing. + // {xyzw} = float4(1.0, -1.0, 0.25, -0.25) + FxaaFloat4 fxaaConsole360ConstDir +) { +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posM; + posM.x = pos.x; + posM.y = pos.y; + #if (FXAA_GATHER4_ALPHA == 1) + #if (FXAA_DISCARD == 0) + FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); + #if (FXAA_GREEN_AS_LUMA == 0) + #define lumaM rgbyM.w + #else + #define lumaM rgbyM.y + #endif + #endif + #if (FXAA_GREEN_AS_LUMA == 0) + FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM); + FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1)); + #else + FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM); + FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1)); + #endif + #if (FXAA_DISCARD == 1) + #define lumaM luma4A.w + #endif + #define lumaE luma4A.z + #define lumaS luma4A.x + #define lumaSE luma4A.y + #define lumaNW luma4B.w + #define lumaN luma4B.z + #define lumaW luma4B.x + #else + FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); + #if (FXAA_GREEN_AS_LUMA == 0) + #define lumaM rgbyM.w + #else + #define lumaM rgbyM.y + #endif + FxaaFloat lumaS = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy)); + #endif +/*--------------------------------------------------------------------------*/ + FxaaFloat maxSM = max(lumaS, lumaM); + FxaaFloat minSM = min(lumaS, lumaM); + FxaaFloat maxESM = max(lumaE, maxSM); + FxaaFloat minESM = min(lumaE, minSM); + FxaaFloat maxWN = max(lumaN, lumaW); + FxaaFloat minWN = min(lumaN, lumaW); + FxaaFloat rangeMax = max(maxWN, maxESM); + 
FxaaFloat rangeMin = min(minWN, minESM); + FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold; + FxaaFloat range = rangeMax - rangeMin; + FxaaFloat rangeMaxClamped = max(fxaaQualityEdgeThresholdMin, rangeMaxScaled); + FxaaBool earlyExit = range < rangeMaxClamped; +/*--------------------------------------------------------------------------*/ + if(earlyExit) + #if (FXAA_DISCARD == 1) + FxaaDiscard; + #else + return rgbyM; + #endif +/*--------------------------------------------------------------------------*/ + #if (FXAA_GATHER4_ALPHA == 0) + FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); + #else + FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy)); + FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); + #endif +/*--------------------------------------------------------------------------*/ + FxaaFloat lumaNS = lumaN + lumaS; + FxaaFloat lumaWE = lumaW + lumaE; + FxaaFloat subpixRcpRange = 1.0/range; + FxaaFloat subpixNSWE = lumaNS + lumaWE; + FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS; + FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE; +/*--------------------------------------------------------------------------*/ + FxaaFloat lumaNESE = lumaNE + lumaSE; + FxaaFloat lumaNWNE = lumaNW + lumaNE; + FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE; + FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE; +/*--------------------------------------------------------------------------*/ + FxaaFloat lumaNWSW = lumaNW + lumaSW; + FxaaFloat lumaSWSE = lumaSW + lumaSE; + FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2); + FxaaFloat edgeVert4 = 
(abs(edgeVert1) * 2.0) + abs(edgeVert2); + FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW; + FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE; + FxaaFloat edgeHorz = abs(edgeHorz3) + edgeHorz4; + FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4; +/*--------------------------------------------------------------------------*/ + FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE; + FxaaFloat lengthSign = fxaaQualityRcpFrame.x; + FxaaBool horzSpan = edgeHorz >= edgeVert; + FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; +/*--------------------------------------------------------------------------*/ + if(!horzSpan) lumaN = lumaW; + if(!horzSpan) lumaS = lumaE; + if(horzSpan) lengthSign = fxaaQualityRcpFrame.y; + FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM; +/*--------------------------------------------------------------------------*/ + FxaaFloat gradientN = lumaN - lumaM; + FxaaFloat gradientS = lumaS - lumaM; + FxaaFloat lumaNN = lumaN + lumaM; + FxaaFloat lumaSS = lumaS + lumaM; + FxaaBool pairN = abs(gradientN) >= abs(gradientS); + FxaaFloat gradient = max(abs(gradientN), abs(gradientS)); + if(pairN) lengthSign = -lengthSign; + FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posB; + posB.x = posM.x; + posB.y = posM.y; + FxaaFloat2 offNP; + offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x; + offNP.y = ( horzSpan) ? 
0.0 : fxaaQualityRcpFrame.y; + if(!horzSpan) posB.x += lengthSign * 0.5; + if( horzSpan) posB.y += lengthSign * 0.5; +/*--------------------------------------------------------------------------*/ + FxaaFloat2 posN; + posN.x = posB.x - offNP.x * FXAA_QUALITY_P0; + posN.y = posB.y - offNP.y * FXAA_QUALITY_P0; + FxaaFloat2 posP; + posP.x = posB.x + offNP.x * FXAA_QUALITY_P0; + posP.y = posB.y + offNP.y * FXAA_QUALITY_P0; + FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0; + FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN)); + FxaaFloat subpixE = subpixC * subpixC; + FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP)); +/*--------------------------------------------------------------------------*/ + if(!pairN) lumaNN = lumaSS; + FxaaFloat gradientScaled = gradient * 1.0/4.0; + FxaaFloat lumaMM = lumaM - lumaNN * 0.5; + FxaaFloat subpixF = subpixD * subpixE; + FxaaBool lumaMLTZero = lumaMM < 0.0; +/*--------------------------------------------------------------------------*/ + lumaEndN -= lumaNN * 0.5; + lumaEndP -= lumaNN * 0.5; + FxaaBool doneN = abs(lumaEndN) >= gradientScaled; + FxaaBool doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P1; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P1; + FxaaBool doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P1; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P1; +/*--------------------------------------------------------------------------*/ + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P2; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P2; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P2; + if(!doneP) posP.y += offNP.y 
* FXAA_QUALITY_P2; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 3) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P3; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P3; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P3; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P3; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 4) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P4; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P4; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P4; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P4; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 5) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P5; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P5; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P5; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P5; 
+/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 6) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P6; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P6; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P6; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P6; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 7) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P7; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P7; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P7; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P7; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 8) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P8; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P8; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P8; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P8; 
+/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 9) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P9; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P9; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P9; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P9; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 10) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P10; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P10; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P10; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P10; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 11) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P11; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P11; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P11; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P11; 
+/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY_PS > 12) + if(doneNP) { + if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); + if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY_P12; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY_P12; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY_P12; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY_P12; +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } +/*--------------------------------------------------------------------------*/ + FxaaFloat dstN = posM.x - posN.x; + FxaaFloat dstP = posP.x - posM.x; + if(!horzSpan) dstN = posM.y - posN.y; + if(!horzSpan) dstP = posP.y - posM.y; +/*--------------------------------------------------------------------------*/ + 
FxaaBool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero; + FxaaFloat spanLength = (dstP + dstN); + FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; + FxaaFloat spanLengthRcp = 1.0/spanLength; +/*--------------------------------------------------------------------------*/ + FxaaBool directionN = dstN < dstP; + FxaaFloat dst = min(dstN, dstP); + FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP; + FxaaFloat subpixG = subpixF * subpixF; + FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5; + FxaaFloat subpixH = subpixG * fxaaQualitySubpix; +/*--------------------------------------------------------------------------*/ + FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0; + FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH); + if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; + if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; + #if (FXAA_DISCARD == 1) + return FxaaTexTop(tex, posM); + #else + return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM); + #endif +} +/*==========================================================================*/ +#endif + +vec4 mainImage(vec2 fragCoord) +{ + vec2 rcpFrame = 1./invResolution_data.xy; + vec2 uv2 = fragCoord.xy / invResolution_data.xy; + + float fxaaQualitySubpix = 0.75; // [0..1], default 0.75 + float fxaaQualityEdgeThreshold = 0.166; // [0.125..0.33], default 0.166 + float fxaaQualityEdgeThresholdMin = 0.02;//0.0625; // ? + vec4 dummy4 = vec4(0.0,0.0,0.0,0.0); + float dummy1 = 0.0; + + vec4 col = FxaaPixelShader(uv2, dummy4, + inputImage, inputImage, inputImage, + rcpFrame, dummy4, dummy4, dummy4, + fxaaQualitySubpix, fxaaQualityEdgeThreshold, + fxaaQualityEdgeThresholdMin, + dummy1, dummy1, dummy1, dummy4); + + vec4 fragColor = vec4( col.xyz, 1. 
); + + return fragColor; +} + +void main() +{ + ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec4 outColor = mainImage(texelCoord + vec2(0.5)); + imageStore(imgOutput, texelCoord, outColor); + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/Fxaa.spv new file mode 100644 index 0000000000000000000000000000000000000000..b466bcb659d56d910d30aac5d781d27b8ff1313f GIT binary patch literal 25012 zcmaK!34m5r{r+Es0mKdWeL-<cGxsGo5K%Ei6cRH_M;LHyU~mQ$Op|d-%QCafEX%T7 z(A>&>gWNJTx6DjaE6d6@Tilo5=kvbz84tJrzuTP7^E~H!zRS7iz31gJIB=<rhBO8a zXbfu%ZEQTK(Y#h{EY%nQTBb3$+3!1k()g`<=S<sb`yIB{VEM+trkyq`VADqL)S2y_ zvmE#{!^6?lR-_HDn)U<v-%|8fL)x$?=e03s8OHXR*WNyU?#%Yhj+?&yu5j&!(f7<a zs+|tLYGWk5b`v}2b#zTSx_w$l59ZtM>XhDXx9M-u)NRwZssFTsw2{{;_?+F{i~k`+ zXf&qde@yqR-tMlBiQT>YG-D5JtVTblV;*+H8mlj9x806gPwtpEcHZp0I@^0Ebj&z# zR)_fxYpn6V?Is`Jt=*8unw8x_bKAQ*=gdEN>g;`b+Gln!Hng$Ul0Gta;Osfh9oks? 
z|2w`d<4ZTzS#q3Q3m4nA;;*}8{7G|<n%y}M&H3vsIW~6MjE>1i_jL3g-Q6{fp{C9H z|HI}0wugQ;sBEgW)em>DXZEa)UYEFOGvdE({wvmo|HGygYiMJmC2gqLsB`x2>@m*Z z#%7GWZ^zASpV86WLF#RdO`X@%K4VOGcaO!n2i^Saj|1qMd}n(0<a`Ckcc<@S-)=v; z#m9v2)#CfZXSUCq6#I$t&RPBZkk<HE1^pe~8t><Be&$b^K=1gB)_BF+8Z+TN?XzZd z9MJA9!=|lqY`@m}Y+Cc&evM?Nws&<<p#J7ZZ_XXuI4K-|W4hXB&c;xmr=oYX_jJu4 zJFk<i*PMGcy|qsm+v4ZKXScI;tMT)y@kvu!;}_J%FRaEVOlpl^T=6MQe}fyB!h4&W zkv)5`MQYd9SlF*mnAEauYg~=i!KT`;yXS;$z}nqVX@_+5%!zgzS}W&CL}}SAiZ;bc zTH1Z|@yExc>F=RR(_gg5(VG6ISY^xZsY;tP<^34XqQ&3#mEDV#roU(}7k1lKcCQq6 z(OxU;fX)2gF6>$w&+-9T!wF<KVXPUsHf`u#9kXW4IeJp(44&X>L(z_A{U*(BZ$8uA zAItX7AC1MJ#>jrYS98u5@EJYr(>gn5&1vn?w#GJv#;tA6+X=0?P8jKD_li%d?)@0} z?Do!{_!w9CXw@>PF&VA3AI3C&9Oj(vUOs6ppY8DO>C-3p88gONt#Ms8`2{WRaFbsK zZ~5CB-qyGZZjNJTP3xa?t#gV!eSQC&vCTO*zCWkm@2yMB8QfS5&l;7Vz5A-U2UMT^ zhv2Pz_G$WlG~?qZAJ);+jW($94BE8rSskr)8q|29;;H%Vj3a2jAqQ`n{}wgApvKRt z@#kv%r5fLkbAMWI^VT=x?b71aTC_C|g3svgp5~s)y*&i27e2nZ|ID+kaadv3s%3EF zh$ZZJ1~l(mTjMD7X`MYCQ|EMc>(q06x?1m?rmtfPU%A)a`f5H)_HEkq!a3zSx(=K^ zy|=@8JPZ46_Ji{*T+(J=%0M5|=vz{s^!|C-*0_wZo*dxKd<Qo!N9&w*+`%1utC(xf z=z}@C)xDM&J7QY<oc2~t3#)PKyAC|}IM4JUjT^y!PpQ5y6!DjC+`QzN=j?x-p|}1I zo7NdRsBt?s)8@{cIiClUYjbBf4^s2F3qG~G3*6RN1n!(Qd+wa(f!eZPj8^WuyKDU3 z8o$5BAE@yMYy9CFf277AtMMml{K*=Bs>YvQg10rEg}1&NbanGJqPYeyp>=gn1rKe0 z|DDlw#J*kKN40m2n{{0G)OPRmxV<ae*RbuZFw5CEriqye=6hvFSM{aC^;`+9d+wYu z-K|h4gBq)&y}zDqjkV#OGiMw)cMj{0FL!MA(?)QweYsC+d}P6c8t$hJn<MY@5%7$A zA002b<2+KD^ZkzRcXhv0>dzdsNABm^dpSOmaqaznyE0z&I|6I%#jAk^({Dgq9s42l za;BjTVpSVWO??D>X?n-iwO_8{`qF-RdTrI$#CipK?FZ1xD|~I@4`)C-=dV~9smWKO zPfjDKVC9M%#~idzi>7XK?*FSfW-Xi0>)Uy&!aqR20nKZRif;in*Kj_1%}Z`w{yE9Z z`J?DJpv7K3y5g?YK43N1%K74bY434u$=-2%ze-b2Ouy7PXBGaxkKT1Th2GrdnR^b{ zc+QP>e#O<3^JU;hW5J%`*ED%!QQ|FzyFW7id~1AE_$C7yjVl)}+&%mRI9I>Fg-t%H z-~OH^U(nBAZt_L__CpNT7&Y28x(MvLDy_%=``K|XS@RWO*FkO9iu-v|GnY~HSHiut zzp5H@esaC3IX<dC_AT(2-r9VRjNb+y_S7Z2ZOQ7ozIVXA$4j=>K3&xG)r$28u=d_V z>-Zzw{iKPx{0U&$>gF!@v#jR$%N2Hxx!C^=EuX_z!TKLnjoZIljeD=HaaD}H7dAh8 
zYr<ViW0aTc6~Ai<xU@Adxz7}@<gpRl&*PSOGk>2cYBoO~I}$%Worg)|b(A)GeY!S2 zr_En(Ek27&ZbISO`%DgRewGXF=hbJi<9;J{?fyb@J^g&{jx*Qqua$lpHuAs2)m+yT zu^EN-53qXtyaLvbb;Zv=;p*}8FR)ta=ihMkiNtab{0FRW^*vY->sJ$7yZiJF*64M5 z*UJ7?a(a{gEt;3(Z&%~?<}(WY9k4#GBUT^$J+PYl;Z1DhZ-{C778{qL{<s6cj@cV` zApIbkmvP&war?v_>^LoPhk(@*SH2WYU-}liuHNsZ!LC8>&nR3CLsO6cWx&?8fH{sY z3)e<HewG94C-+CY<<ZpRX9ci%#?NrDHtM-QD}wd!m}}#HQ`3j{Z)LDrH{8$GDqyuf za+9wHR`WCGeD|Z;E{wY`*8p!uukN_tC)D)sXKfvtpR>eU7i>Ja^Vb7AXDVD<$JM+y z-Y3`BUi(qluMY+`Qu792wb97A_iE11z4t!J-EaD{kG~O&mG@#pxLW)<rsn+YVfWiR zJZ*NS`AqQHvkcA4?+rV-&h*1*##@%&`|26JWu@o7I`@N>{w}uS_;r0o(!BgWpkJQ} zt5nz+zDHE|yMgZwgI6!`8Wnb~-;a`?@B4z+F0kLZ)XmrTb-})$3-&!+@CF4QQQ!?L zY@Qp@T#JqAeKwlQ)--FBYvcQ_lKZ}^<YNkMZu{5x#2WW~SLSPfc#XH$_>3B#S>wmn zxbL}2|GwWUx$n2aPXV7=aP#q<R>rOW+=4rPUcnvromT7}_nlVBFRt-RYkXmiUtQzA z!z%sVR^y9m{Jt9ZomDyi@f!DCRXP4_jlWpqFW2}hHST+<(%;)PK48F-<GzQAz4ya+ zP~onp@1IIOvc`S?RF3=ZspP(QD!K2S!j0#9r*Px>-l^oia|-wKaafJF7u@;NYJ6tF zo$ouT(x307!kzCssgnCnD%}0%`>1g3eIHeF-$#XOe`~?DUtHt9gNnW5zJm&PeS8O1 za^F9d{Dm6#{Zqz`@B62c`~Ioq3u@f=PvyAppGxlgr*QM#j^}>KcPaQOj2~2R?GGuq z<A)X8ao;(`pX0u33U}Q1OyQ0nQ*g(9&y;a_Z;j8d@sn!YcT3LC*|aU#vt=8a=c8xS zb~N?a@QSAS+}6hPR<2FPcL3|7Jl}SNtH)+1u+M&N^dZ;An127>h2~i%_dCdkY1$u3 zw&uGVO&j&4X#SZ+Kb{|s#m^q#($Ai7{TxYU`q_)7jr!;&9?0C59v^$dOCP?pZ}}M7 z^f3;ujk-SEhx^gYXAHgLsrw^f{g>-I9<Cmn1Hjf%uHOW>`m#jyo*YPH>o>o5Ip1;P zc^*0zUk8Ef@eW2)kIf`-J>FzA^~C!q*m$1Z&Uf5+6X_j`uS3Dco6_uC`{!femfqrr z!Ob(iKMqbU+B&Z9L+Blg?@xg1zCVf9(p$cdfSXf%9|=z0+B&Z9!|9Kt#rIKQ?^T)i zRJeL<rd2lOz3V_zPrT{i)Z%={jn_`^SbQA~uE*;{Q;*Ftl}#D%Q)udm*9A^}&Uf5+ zGw2<254&Gx(aa&d8|?bV&unn=*4A<T`2EMR%<BQ`JG>X{TE@>DaPraCas3=i?^xy? 
z2iAA^@!<7nr&r&l=Yh3Rcm7;@wb-8kF6Vz5Zj4i_`6q(4QFs1)dbP~|40!#5pA0re z>R15IdT8soeomraK#QN#!1@k99jwjG<k3fe22C4v=bu_>+B^PPn%ZlOokj2EpKI06 zq^bFxS?ru4^q-^cO!LnQV#l>}d^r6VXscCxIr=Zswx%7*lgasCqG_W(jJ7N79Gdx^ zOYgXUwsYTnnYIi~-SM+4EwR1=FJpZbUdH+wTpRVoIv=dB3+NqBtgnNOrSAB7^lJG$ zd;?tn%zP6~JvJACcc*RB>|1;PTVVCRs-|y)ou_V`3+dJ3<2zvEls>)-SC7plVB<__ z_N_SI1FI*__rcCnH_pZMYObAYdpXU$b7qy-72wM%UHePHYO%i(Y_8!~fiI>d-Veaq zs5^fly;^d<7VN&+f)*dwfz|W5yB>TEO+7Y01RFPTegxJ|-8fg%t0m5j;Ch^!(9{#> zX0UP86XzE2#k9n^6|9}Qac-bji~a3j_haJT(HvW{zB|#><MS@C^VIcu8@*b5{21)L zUrbA!yTR(=_kfKXpZ9{bQP<}pdbRkxA6(xnKS5K^dOQGjJ=7EDr(p9-oCm?$sT=1$ zdbPxP7;Nt4p8OeHJ#iiZ8%I5H9tAI=CC+1D?bMC)5WQOBJOQq+(a+J;<MT=Id9?Wa z1y~z(eLhaF7W-d<wNH$vn`2Af%U_|X$LBL(=c()SDSEZ|{59CUxh$Wy!Sv732G9o4 z9Lt)#0CwzoTIT)+?D}TzZ^4=CSaSFs_ywBd=I|`NTKxPT?7oekKY-)MvE=X)*s(H) zKf+!6%>5HMa~(?#e+E0>adUW)UM*|+7qIth8(Q-ED_A}8{|0tnsK@5-VDD4n`~$3= zx^Z5nS4*6Kg5Bq3oPWX96X)Mx<ESUjf56_m#Ca90ow{*ep;wFj>tJJNUETm^T^x(u zn_%xp{JsU&R$afZ(W}M&9k8+D_g!%OIu^V4z^-xpYNV~Ye&427i~Uky*D-t`{5E*z z4}xo>?tI@_sEy`aSP8qq;5F&Xbsd7Hu6^^J1~~qg23uSBFt|D0iN8MjW#HPVJKuL8 zYO!Aqyb!zGv*qDx<=$KYZl0-OI9NM%efl0nO`m=ES`loV%PYPTSk3P}tI&J-y+?gz znwsBh#94<`!S#K-8k%}+R<CTP;Lkkd+W77^KJAm&8sIXoHQ{RJwKlz%d8w~OQ!_7d zVy**r-y1V)xGtJ{Y}Tu6yocr`*T%f!(>{5v4>m9J7(wr49_ky=)XYPi7#o7!CtJ{x z$3|%C#&n;^jhXrOvE3N#ean5{6s)!hEipC&?_7(qIhuODdw&4zTy^7)q*sfNEy3Q$ za=ku?rXHKE!1XvELQ_wit-;PyH;(V-)Z$}XaQ%6-9h!P<wg=ba?0}}8I6H!!r*0hI z`KiUn&ft1IyP&DZ=EL&Zy6uXl-pA+C^JzD*^VE&wyF|74*aKXzXHRVln^E9;oY83N ziL)2jdFsaT9i>|K!QNokGS9)WV1C$ryypPKZXd8~5xafC{50(x%kySGus$3&-WYnd z_}L#^#`_4IpQazjVmBUK#yg;}b1Z*$n^5VS;X_~A8Fw7LWBTi3>_D)&_!&Eh-pkLJ z`b3(VpD}TAI2hbUI~hCg*(9)<{?+AvzB1Q7b3Y0mNy~a20_LY#r(?030=^VIpTk4p z+Nv+hpW%%AF)%;P&${Egf-j?)$7HzU**6~tZ_QYFejJXb9-B{q52NMzB-bY6p9Jfp ze6}2crXHIk!TQj~JmlJBy^jJ9rR5Bm3RZLdr_+16{^}jH88p{ltiNgW*6I3c@0d1; ze>Av$PIjWHXTH3gub-^@F<@iF->1Osw5*9-o7B?<wjT5EruQ;`^;xvpG;<Uie<pqM z*WNL067N`W{rTO4rk?q7^Ur+y<lhT6M*PhIn}6<yT$|)S7hK+t<KSx6Gmk#?s2@*L 
zGiPyP&Ih~ivsXV2Ry%=~JWd2>Z)M&|XzG2{v+pxtHRosE$>7Yhez|tmpIq!y&jN7n z$0^|Cuii&fGk<aNI2Byxe;Qn^%>Q(_d6awq47hso|14O|`N{uGaORo6Ts!kmF7~ni zTxDN=PCt*P9-Fhl*^{Z^3ux-O2VVrM`8>!Ob`D%$j>XrPz-Q62zvbFx{9JJIPpmJa zsmJC#aQ0YizJjKne7*`+EAu%Yt}n;p>uX^1$$ph<m+`NI&8J+S3*qXq`3AVYKHo%B zPd*oc)y$`_+JE1I>&vnD`Zo9iTGmIdUB)j4*VpGeXzH=~F1Wrvm!PR9pYMUy%6z^L z*Oz1Qbt%|<vOaR{vhJ6IC)48h3b23HQtzwkyb`RwkmjDej9yJ2!?FDVcr|+Ya`acj zy(js5mutY<s4vC1&olj83wJE<@~#6f!<f3`SJA5_*B^q-HT*{vPwqFswNZEe_4I1V z^Coba=gn}}CVAch)<(U|^H#WH$@4a_d8!-dMtU{l^r7Dl9z)BSbq83@XXIV<UOw~H z@1&{uj1i}{MPi!%ccPd342Z3LY<~<MN6R|g4OVk+<UQfNaP@m=`dmz}mVI(Rxc(jR zCur)4^8naf66dF2wG(K0W;_UXo@2(jk6tZ49tOLA!hZ(#KE>xFU~SZ${}8=e>>mT` zGw;eD2diaIJqlKf{S)9a=byvX%sDnsg57JW;TK>vZOy?kwb(xeE`9zIu2%Yd8t%S` z&tHMnwAH6$YVr9j*f{x~_iHde>=NEb0AlwX*n5}n$<KqeRoCw`^lGvH4cJ)m`&%$S zO}~!C?swqQ?~8D4)%E)Vy_)uY%>O;uSlQ!$0IRvj|48rU9#?;frsf_OXV3i!+)gW> z&3}fgXFvP}toAZ3^Zp7xi&oD28(cl>{dcgM^E2-s;LNioxpsc8lAnEQd<9(A_)oZ+ zHU67EHLCxMre=-e#QYD~8h!p|tzU(!r^eU7YGsYD!_Bpv_Xb=&HNFW}bAIBz1<pKc zlsE0jEBV={#&^JFjqk$MGX5Ued!PG^n}*t@oDF@{Gyv`#b>qDaR?GaQz~%gba5dMk zjo!<BqCSYG<{FAq(_nCYpA11$&l(N|tCefGH2ebcDfh`RH1)pfY+DAb=KRE47Myw3 zB-hS;lKkvb<8t7##^vE^);OF#HL9;bQ?o{KVy+0T?~|3#)KlZiV70QwRp91Y?vqu~ z)KlYXU^V9_-s<4Yvqrgg)|mY4Q{x(7@2xeh1x`)sYtq!LNt`)rgUh|O4qQzi>(R%D z`nohVeTenpcZ>DG<(a(!-2I(1djwn?b$<HK0e`2*v7Ff(fjzU;t=DI*n)N2GeR3ZO zF7N5aa5Zz^l-|pItG)?M&D_PweKT;G`{r=hJh^`Wu8lfB{ki-5M~)@;Ey3olZtgzM z)yzF{?UVZl!R5Md1y?iot?9kYUHwBeHFFp1!*>PSfL){TZQ<paw;f#V1Z?s=+8*va z$Fjz1=9w7w$#VyAxyC!f)y#8edN1=---)JXp5o-S3)uAt|1iAFb62=pndfeB=Q)=3 zR5Q=Su#eB(!LCKl&^_Q#GQ*~@+Y`K)ft<~w;M%Gi$9tj{zoWtC9>06R<JYm+jRBW_ z_l9e$u3y(sP5VA_8VfG(|2}Xv*L6R7FV|IlUz(chD$es^9Jv0>-5*Uo&!dlk)qKaF zdE?<ZW0U^@XzJM~6ToWD&%6V{nP*LM?Yt++&ptIy1eY})1Xr`hN%UUUsD3a_%^Jmt zIT>7k=6)1SJvANzRx4|q0?!$n8V^NNPmLb~t2saM4g+VNHOjTK#^h(88b1y$YdjpT zmhn%3%g@gz;c7XXj{rMI-FQB~)iVD`a5=vnuI3s}rOz6wA4OAh4aKQx8o0hsI?&X! 
zhSR}n<r>a_=Zwu79*w5nSDnqBU^V9_-Z9|JvnIKAu3_@CPmP}fmo;|5)vR$AeQH#n zNmH{%abk9Z>-%IjntEzH7OYm**aOcQn;Lu3)KlXeu$uD|Z!S3VtWmC=H6}m%)OZ{? zXY)L8YEnO*re;mz%$W}^_tpt;HP_`t`uI@)G)+w(Vtsfvp9C(?x6i=c-#On-hHInF zPyhM00ODBAw?44vo4WP-tW~q##I;ZEr+~|QdMaGa+)t<X^0Tjg8cogI#mW5)aGCpO z;jVddKNGHvIzRonp9OI&xql99?&{|5^IXl`6W2bue;!<}``K_cbN?c}m$|Edfu?5e zVtsfvp93z><}bm^Gw)ouT6yMu8SXsCvc_uWnHcuT^E`05#$SP}ndkZRUgoL(RhpW4 zij(Kpz~$L|0lduf>u|L)&kNzsb1dtrW}b;*pBUc&muK@g;oJf;<NaM>cM*6o138<& z1=m*HINlSr`29AxJex0u$FF0t`wqDD`(3!U>iTsJ)wJ&;r%S-){r?_Z&2_z$-ph4W z|2|F4brt9Na2dG%%)J~<J<p>nz-sppKl2vCbH*nBE78<*Ze0adbAIOi0GxT&B-hS+ zlKkvb<JI7@#%the)_5I#YE-|Lre=-e#JnC{f9C!WO+7XK2&`7tcmq6VY-+p_O+7W< z1Xgo?;@u3+JZqF|XN}3vJ~iF~E^E9Mu9orJz~$%XcDP#3<~zX7Q8%8?Z?(+76I{-} z3$ErGE~d{KsxP9cxrXA@^kZ;+pWKb6o;AD&tX8h!z3`l|S;PC#)cdNl`F^mP^AqnU z;LNioxpuB$^0QBk4}i-We+pN##)s%rqxyq1HER?n=ELCnKKU7%dTM+GtX9_eC_HCu zYJ3b$JvBZKR&##hJps-<Ym{qejmghGHU1o&v-ua`)TI6-P0gCbne!C5+*`kdtGO<} zqK^;tr)g^X5bMLU`5AC|zC8<ff9HJrHC!8Ye)`Y1=OB*de0v`3`KE5YK5NyiH*xKg z`wQUmp8iHoGxy)od-;4*|1C|;+{MZLMR1w>@8Pa_a{mKd8+CsAbAJiqSaSa(*xc34 z-RHTQxhJlDa{m*!T=zf2)y(}b^j_w!{xVI?+{OCvZ2l{_Je&UpFVDQc!_~?&?;mjI zIhHk6Gtb1ZPoA%U%QgNdT+KZHP48u%>i?ptnWs2;{s&y1&9B1CJYR#Wm3h7ncb;Qe zPc`#Q4Ew}*16-cXZ^F3+WX7Dr?k(_Q268sP4cAuPINlSr_<aXlp3U#V<JYm+y$3G+ z;;Nyox_(_lHSPO=1Hk3|Uka|~x(=fEa$VI2($rj6ah?xt;QBLnFq(RvM?=ACLui?| zG&pB$@*jq#o^xv%u$uETZ&`5WS(98l?@98rPmRlg%Nm!5t6Af4`qZet0!_^t#fiBh zxc<yt2~9mUt_)TyYg`4MGd4A@il&|#R|Bg#Kk-%vXP!06wX??LXP+9^0GBnc30KSb zTHx~Yvo>5UXY)E>=cpUc=eJtsuL~~cuLoCi4L6|A8mg~PQ*#Z)sc8hbzE3tpQ_mW1 z1Xe58a3nluY}RmNH1)pfY~BQ{=KRFl6r6e1B-hS0On&yMaWimP<K}QRYuti9HL8Dr zre=-e#M}~G-zOhLQ%{Xsfz`?yKLpPin;N%9Q%{ZCfYqFzc-w+A&l=_0S!43Em*?EI zm(Qa&e|z~_6}ON7?U}nZg_nPyumhTUY<2|a^O)RsLQ~Hk-x=&Yb@TM!hpNTLhrzj5 z@v$qKdTe$B*K^q&O+9h;06S0JIR3j+wcO)9!9&^G`MaG_a5eAoUi7)g>Z56Du8}x- zi~*a+7PRELH=25E#)9+tjm<u2>dAFqu=CWjCTiY`#I;ZE`+>{c$HCRi{Uh{VKC9LD zr>U8{IC+f+XTKz`1JKlCGXY$$??5#5<USGXJoT)znz<*gePSF8_FT;0VNZgq`FGf< 
zZ!+A!%l<x}=|1$2!nIR3#zFLI@i_(T{S7}9?7r%&o(UfVYoqS`L+I6F|8cNqYH~Up zu2$ys33!>)C*j(u>+>*rwfH;|Y@GZKLOa;sNl@4R2zs^HPX(KE_%yIJC8rLsHtNnl zie4@DGr;Biqv6ia?|62CDY5zQQMGkkKhx<Q(~oQYDX_l7yTGn@_)M_1Ucom$?Pr0V zqn%^=JceF9xy=SYMDXxqE1u_I4_q5{=XcYq#eNRhJj3ULUBCQJ>2YAS_&Xl#9QEvh zd0=(F(;1@|tQMaqfQ=jeY4Ez#7oR7>wNZEee0nwQU5C$rtvmBi1{=e5^1JZ@ur})X zeT|dA>X~y2SS|ciuyJw^PXm*F^WQLQ>$th}(L0u0P6wCIhcn=6J|E7c_wxCm{#lxu z&j)d0p9R)$_~*cF1kZYW9<Gggo)>3>>(AvcpsD9Rd=aeX{LDKCoO#wG*Uo21a<fl; zUjkd7^?VtedeqOQsacOW`JV?a>-h@2tmmt6ZPb(3`QZBV?`vr4spkT)n)5U7>)_0@ z9=UeblicjhZ8*ta2rmE3_YJu7^Jl(q!nIN7NN(<*1Npt?i{Rg)Id1cEe`u3jzKvf0 z-KLAt)DNnjkKY07$1B=*D@}jVE~zyAPi(aQy8_=sbH4VD>)U^4t6u&$dHmgSuT5w> zu_nvYZ%Xq%ZBC#6zRhLK8&dG=3huw{xv|D?uJKz7J{0@g3-0_o3-0(`1vjU~1$X@J zf;)b1!5zQ9#viEh2Mccehim+i8h@<DpQ!OCYy7E#o6plV{%pnF4_ASIKwC(=lIH$v z{?1~h``^EkUjz57vUz@~U5loEHO)QkeNu~$>%rd7#Q7n-9_L4B>hXC4*m>&u^j@op zz0WsQ*n50Sg}uMGRoHubM}@twIj_z4$8i1W%X=y}SNGOEVCOpK{gtPV`@q)bnD<(q zc|QSXp7&jzv7Z)W?g4qm9xBG%4|2ym7k&n|R^z!h<QaPuZ0(M@PvjYU9PC;+=AMyf z?B`(D!!h@dJY&Bo#@tKtjQz40b6?3FbIpDQPVMe7`7ZodOB{O!Jd)mZc0Csr%{6qb zToY@zMr$!=b2A5H>C=4uJnAoTe+^d4^X@sYn$0WEyF8<wN7u$H+6$FdKFfYnY2~x* cw`k7S-to)uVNTD29ZOEX1FQL*6Z@R|KY~1(qW}N^ literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl new file mode 100644 index 000000000..a518cf25e --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.glsl @@ -0,0 +1,1404 @@ +#version 430 core +#define SMAA_GLSL_4 1 + +layout (constant_id = 0) const int SMAA_PRESET_LOW = 0; +layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0; +layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0; +layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0; +layout (constant_id = 4) const float METRIC_WIDTH = 1920.0; +layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0; + +#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT) + +layout (local_size_x 
= 16, local_size_y = 16) in; +/** + * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com) + * Copyright (C) 2013 Jose I. Echevarria (joseignacioechevarria@gmail.com) + * Copyright (C) 2013 Belen Masia (bmasia@unizar.es) + * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com) + * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to + * do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. As clarification, there + * is no requirement that the copyright notice and permission be included in + * binary distributions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/** + * _______ ___ ___ ___ ___ + * / || \/ | / \ / \ + * | (---- | \ / | / ^ \ / ^ \ + * \ \ | |\/| | / /_\ \ / /_\ \ + * ----) | | | | | / _____ \ / _____ \ + * |_______/ |__| |__| /__/ \__\ /__/ \__\ + * + * E N H A N C E D + * S U B P I X E L M O R P H O L O G I C A L A N T I A L I A S I N G + * + * http://www.iryoku.com/smaa/ + * + * Hi, welcome aboard! 
+ * + * Here you'll find instructions to get the shader up and running as fast as + * possible. + * + * IMPORTANT NOTICE: when updating, remember to update both this file and the + * precomputed textures! They may change from version to version. + * + * The shader has three passes, chained together as follows: + * + * |input|------------------ + * v | + * [ SMAA*EdgeDetection ] | + * v | + * |edgesTex| | + * v | + * [ SMAABlendingWeightCalculation ] | + * v | + * |blendTex| | + * v | + * [ SMAANeighborhoodBlending ] <------ + * v + * |output| + * + * Note that each [pass] has its own vertex and pixel shader. Remember to use + * oversized triangles instead of quads to avoid overshading along the + * diagonal. + * + * You've three edge detection methods to choose from: luma, color or depth. + * They represent different quality/performance and anti-aliasing/sharpness + * tradeoffs, so our recommendation is for you to choose the one that best + * suits your particular scenario: + * + * - Depth edge detection is usually the fastest but it may miss some edges. + * + * - Luma edge detection is usually more expensive than depth edge detection, + * but catches visible edges that depth edge detection can miss. + * + * - Color edge detection is usually the most expensive one but catches + * chroma-only edges. + * + * For quickstarters: just use luma edge detection. + * + * The general advice is to not rush the integration process and ensure each + * step is done correctly (don't try to integrate SMAA T2x with predicated edge + * detection from the start!). Ok then, let's go! + * + * 1. The first step is to create two RGBA temporal render targets for holding + * |edgesTex| and |blendTex|. + * + * In DX10 or DX11, you can use a RG render target for the edges texture. + * In the case of NVIDIA GPUs, using RG render targets seems to actually be + * slower. 
+ * + * On the Xbox 360, you can use the same render target for resolving both + * |edgesTex| and |blendTex|, as they aren't needed simultaneously. + * + * 2. Both temporal render targets |edgesTex| and |blendTex| must be cleared + * each frame. Do not forget to clear the alpha channel! + * + * 3. The next step is loading the two supporting precalculated textures, + * 'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as + * C++ headers, and also as regular DDS files. They'll be needed for the + * 'SMAABlendingWeightCalculation' pass. + * + * If you use the C++ headers, be sure to load them in the format specified + * inside of them. + * + * You can also compress 'areaTex' and 'searchTex' using BC5 and BC4 + * respectively, if you have that option in your content processor pipeline. + * When compressing them, you get a non-perceptible quality decrease, and a + * marginal performance increase. + * + * 4. All samplers must be set to linear filtering and clamp. + * + * After you get the technique working, remember that 64-bit inputs have + * half-rate linear filtering on GCN. + * + * If SMAA is applied to 64-bit color buffers, switching to point filtering + * when accessing them will increase the performance. Search for + * 'SMAASamplePoint' to see which textures may benefit from point + * filtering, and where (which is basically the color input in the edge + * detection and resolve passes). + * + * 5. All texture reads and buffer writes must be non-sRGB, with the exception + * of the input read and the output write in + * 'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in + * this last pass are not possible, the technique will work anyway, but + * will perform antialiasing in gamma space. + * + * IMPORTANT: for best results the input read for the color/luma edge + * detection should *NOT* be sRGB. + * + * 6. 
Before including SMAA.h you'll have to setup the render target metrics, + * the target and any optional configuration defines. Optionally you can + * use a preset. + * + * You have the following targets available: + * SMAA_HLSL_3 + * SMAA_HLSL_4 + * SMAA_HLSL_4_1 + * SMAA_GLSL_3 * + * SMAA_GLSL_4 * + * + * * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below). + * + * And four presets: + * SMAA_PRESET_LOW (%60 of the quality) + * SMAA_PRESET_MEDIUM (%80 of the quality) + * SMAA_PRESET_HIGH (%95 of the quality) + * SMAA_PRESET_ULTRA (%99 of the quality) + * + * For example: + * #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0) + * #define SMAA_HLSL_4 + * #define SMAA_PRESET_HIGH + * #include "SMAA.h" + * + * Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a + * uniform variable. The code is designed to minimize the impact of not + * using a constant value, but it is still better to hardcode it. + * + * Depending on how you encoded 'areaTex' and 'searchTex', you may have to + * add (and customize) the following defines before including SMAA.h: + * #define SMAA_AREATEX_SELECT(sample) sample.rg + * #define SMAA_SEARCHTEX_SELECT(sample) sample.r + * + * If your engine is already using porting macros, you can define + * SMAA_CUSTOM_SL, and define the porting functions by yourself. + * + * 7. Then, you'll have to setup the passes as indicated in the scheme above. + * You can take a look into SMAA.fx, to see how we did it for our demo. + * Checkout the function wrappers, you may want to copy-paste them! + * + * 8. It's recommended to validate the produced |edgesTex| and |blendTex|. + * You can use a screenshot from your engine to compare the |edgesTex| + * and |blendTex| produced inside of the engine with the results obtained + * with the reference demo. + * + * 9. After you get the last pass to work, it's time to optimize. 
You'll have + * to initialize a stencil buffer in the first pass (discard is already in + * the code), then mask execution by using it in the second pass. The last + * pass should be executed in all pixels. + * + * + * After this point you can choose to enable predicated thresholding, + * temporal supersampling and motion blur integration: + * + * a) If you want to use predicated thresholding, take a look into + * SMAA_PREDICATION; you'll need to pass an extra texture in the edge + * detection pass. + * + * b) If you want to enable temporal supersampling (SMAA T2x): + * + * 1. The first step is to render using subpixel jitters. I won't go into + * detail, but it's as simple as moving each vertex position in the + * vertex shader, you can check how we do it in our DX10 demo. + * + * 2. Then, you must set up the temporal resolve. You may want to take a look + * into SMAAResolve for resolving 2x modes. After you get it working, you'll + * probably see ghosting everywhere. But fear not, you can enable the + * CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro. + * Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded. + * + * 3. The next step is to apply SMAA to each subpixel jittered frame, just as + * done for 1x. + * + * 4. At this point you should already have something usable, but for best + * results the proper area textures must be set depending on current jitter. + * For this, the parameter 'subsampleIndices' of + * 'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x + * mode: + * + * @SUBSAMPLE_INDICES + * + * | S# | Camera Jitter | subsampleIndices | + * +----+------------------+---------------------+ + * | 0 | ( 0.25, -0.25) | float4(1, 1, 1, 0) | + * | 1 | (-0.25, 0.25) | float4(2, 2, 2, 0) | + * + * These jitter positions assume a bottom-to-top y axis. S# stands for the + * sample number. 
+ * + * More information about temporal supersampling here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * c) If you want to enable spatial multisampling (SMAA S2x): + * + * 1. The scene must be rendered using MSAA 2x. The MSAA 2x buffer must be + * created with: + * - DX10: see below (*) + * - DX10.1: D3D10_STANDARD_MULTISAMPLE_PATTERN or + * - DX11: D3D11_STANDARD_MULTISAMPLE_PATTERN + * + * This allows to ensure that the subsample order matches the table in + * @SUBSAMPLE_INDICES. + * + * (*) In the case of DX10, we refer the reader to: + * - SMAA::detectMSAAOrder and + * - SMAA::msaaReorder + * + * These functions allow to match the standard multisample patterns by + * detecting the subsample order for a specific GPU, and reordering + * them appropriately. + * + * 2. A shader must be run to output each subsample into a separate buffer + * (DX10 is required). You can use SMAASeparate for this purpose, or just do + * it in an existing pass (for example, in the tone mapping pass, which has + * the advantage of feeding tone mapped subsamples to SMAA, which will yield + * better results). + * + * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing + * the results in the final buffer. The second run should alpha blend with + * the existing final buffer using a blending factor of 0.5. + * 'subsampleIndices' must be adjusted as in the SMAA T2x case (see point + * b). + * + * d) If you want to enable temporal supersampling on top of SMAA S2x + * (which actually is SMAA 4x): + * + * 1. SMAA 4x consists on temporally jittering SMAA S2x, so the first step is + * to calculate SMAA S2x for current frame. 
In this case, 'subsampleIndices' + * must be set as follows: + * + * | F# | S# | Camera Jitter | Net Jitter | subsampleIndices | + * +----+----+--------------------+-------------------+----------------------+ + * | 0 | 0 | ( 0.125, 0.125) | ( 0.375, -0.125) | float4(5, 3, 1, 3) | + * | 0 | 1 | ( 0.125, 0.125) | (-0.125, 0.375) | float4(4, 6, 2, 3) | + * +----+----+--------------------+-------------------+----------------------+ + * | 1 | 2 | (-0.125, -0.125) | ( 0.125, -0.375) | float4(3, 5, 1, 4) | + * | 1 | 3 | (-0.125, -0.125) | (-0.375, 0.125) | float4(6, 4, 2, 4) | + * + * These jitter positions assume a bottom-to-top y axis. F# stands for the + * frame number. S# stands for the sample number. + * + * 2. After calculating SMAA S2x for current frame (with the new subsample + * indices), previous frame must be reprojected as in SMAA T2x mode (see + * point b). + * + * e) If motion blur is used, you may want to do the edge detection pass + * together with motion blur. This has two advantages: + * + * 1. Pixels under heavy motion can be omitted from the edge detection process. + * For these pixels we can just store "no edge", as motion blur will take + * care of them. + * 2. The center pixel tap is reused. + * + * Note that in this case depth testing should be used instead of stenciling, + * as we have to write all the pixels in the motion blur pass. + * + * That's it! + */ + +//----------------------------------------------------------------------------- +// SMAA Presets + +/** + * Note that if you use one of these presets, the following configuration + * macros will be ignored if set in the "Configurable Defines" section. 
+ */ + +/* NOTE(review): SMAA_PRESET_LOW/MEDIUM/HIGH/ULTRA are declared at the top of this shader as specialization constants (layout(constant_id = N) const int), not as preprocessor macros, so these defined() tests look like they can never be true and the #ifndef defaults below would always be used - TODO confirm, and if so select the preset values another way. */ +#if defined(SMAA_PRESET_LOW) +#define SMAA_THRESHOLD 0.15 +#define SMAA_MAX_SEARCH_STEPS 4 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_MEDIUM) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 8 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_HIGH) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 16 +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#define SMAA_CORNER_ROUNDING 25 +#elif defined(SMAA_PRESET_ULTRA) +#define SMAA_THRESHOLD 0.05 +#define SMAA_MAX_SEARCH_STEPS 32 +#define SMAA_MAX_SEARCH_STEPS_DIAG 16 +#define SMAA_CORNER_ROUNDING 25 +#endif + +//----------------------------------------------------------------------------- +// Configurable Defines + +/** + * SMAA_THRESHOLD specifies the threshold or sensitivity to edges. + * Lowering this value you will be able to detect more edges at the expense of + * performance. + * + * Range: [0, 0.5] + * 0.1 is a reasonable value, and allows to catch most visible edges. + * 0.05 is a rather overkill value, that allows to catch 'em all. + * + * If temporal supersampling is used, 0.2 could be a reasonable value, as low + * contrast edges are properly filtered by just 2x. + */ +#ifndef SMAA_THRESHOLD +#define SMAA_THRESHOLD 0.1 +#endif + +/** + * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection. + * + * Range: depends on the depth range of the scene. + */ +#ifndef SMAA_DEPTH_THRESHOLD +#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD) +#endif + +/** + * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the + * horizontal/vertical pattern searches, at each side of the pixel. + * + * In number of pixels, it's actually the double. So the maximum line length + * perfectly handled by, for example 16, is 64 (by perfectly, we meant that + * longer lines won't look as good, but still antialiased). 
+ * + * Range: [0, 112] + */ +#ifndef SMAA_MAX_SEARCH_STEPS +#define SMAA_MAX_SEARCH_STEPS 16 +#endif + +/** + * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the + * diagonal pattern searches, at each side of the pixel. In this case we jump + * one pixel at a time, instead of two. + * + * Range: [0, 20] + * + * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 + * steps), but it can have a significant impact on older machines. + * + * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing. + */ +#ifndef SMAA_MAX_SEARCH_STEPS_DIAG +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#endif + +/** + * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded. + * + * Range: [0, 100] + * + * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing. + */ +#ifndef SMAA_CORNER_ROUNDING +#define SMAA_CORNER_ROUNDING 25 +#endif + +/** + * If there is a neighbor edge that has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR times + * bigger contrast than current edge, current edge will be discarded. + * + * This allows to eliminate spurious crossing edges, and is based on the fact + * that, if there is too much contrast in a direction, that will hide + * perceptually contrast in the other neighbors. + */ +#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR +#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0 +#endif + +/** + * Predicated thresholding allows to better preserve texture details and to + * improve performance, by decreasing the number of detected edges using an + * additional buffer like the light accumulation buffer, object ids or even the + * depth buffer (the depth buffer usage may be limited to indoor or short range + * scenes). + * + * It locally decreases the luma or color threshold if an edge is found in an + * additional buffer (so the global threshold can be higher). + * + * This method was developed by Playstation EDGE MLAA team, and used in + * Killzone 3, by using the light accumulation buffer. 
More information here: + * http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx + */ +#ifndef SMAA_PREDICATION +#define SMAA_PREDICATION 0 +#endif + +/** + * Threshold to be used in the additional predication buffer. + * + * Range: depends on the input, so you'll have to find the magic number that + * works for you. + */ +#ifndef SMAA_PREDICATION_THRESHOLD +#define SMAA_PREDICATION_THRESHOLD 0.01 +#endif + +/** + * How much to scale the global threshold used for luma or color edge + * detection when using predication. + * + * Range: [1, 5] + */ +#ifndef SMAA_PREDICATION_SCALE +#define SMAA_PREDICATION_SCALE 2.0 +#endif + +/** + * How much to locally decrease the threshold. + * + * Range: [0, 1] + */ +#ifndef SMAA_PREDICATION_STRENGTH +#define SMAA_PREDICATION_STRENGTH 0.4 +#endif + +/** + * Temporal reprojection allows to remove ghosting artifacts when using + * temporal supersampling. We use the CryEngine 3 method which also introduces + * velocity weighting. This feature is of extreme importance for totally + * removing ghosting. More information here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * Note that you'll need to setup a velocity buffer for enabling reprojection. + * For static geometry, saving the previous depth buffer is a viable + * alternative. + */ +#ifndef SMAA_REPROJECTION +#define SMAA_REPROJECTION 0 +#endif + +/** + * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to + * remove ghosting trails behind the moving object, which are not removed by + * just using reprojection. Using low values will exhibit ghosting, while using + * high values will disable temporal supersampling under motion. + * + * Behind the scenes, velocity weighting removes temporal supersampling when + * the velocity of the subsamples differs (meaning they are different objects). 
+ * + * Range: [0, 80] + */ +#ifndef SMAA_REPROJECTION_WEIGHT_SCALE +#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0 +#endif + +/** + * On some compilers, discard cannot be used in vertex shaders. Thus, they need + * to be compiled separately. + */ +#ifndef SMAA_INCLUDE_VS +#define SMAA_INCLUDE_VS 1 +#endif +#ifndef SMAA_INCLUDE_PS +#define SMAA_INCLUDE_PS 1 +#endif + +//----------------------------------------------------------------------------- +// Texture Access Defines + +#ifndef SMAA_AREATEX_SELECT +#if defined(SMAA_HLSL_3) +#define SMAA_AREATEX_SELECT(sample) sample.ra +#else +#define SMAA_AREATEX_SELECT(sample) sample.rg +#endif +#endif + +#ifndef SMAA_SEARCHTEX_SELECT +#define SMAA_SEARCHTEX_SELECT(sample) sample.r +#endif + +#ifndef SMAA_DECODE_VELOCITY +#define SMAA_DECODE_VELOCITY(sample) sample.rg +#endif + +//----------------------------------------------------------------------------- +// Non-Configurable Defines + +#define SMAA_AREATEX_MAX_DISTANCE 16 +#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20 +#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0)) +#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0) +#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0) +#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0) +#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0) + +//----------------------------------------------------------------------------- +// Porting Functions + +#if defined(SMAA_HLSL_3) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0)) +#define SMAASample(tex, coord) tex2D(tex, coord) +#define SMAASamplePoint(tex, coord) tex2D(tex, coord) +#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * 
SMAA_RT_METRICS.xy) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#endif +#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1) +SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +#define SMAATexture2D(tex) Texture2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0) +#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset) +#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord) +#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord) +#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else 
+#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication. 
+ */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood 
Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = 
dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, predicationTex); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float3 Cleftleft = SMAASamplePoint(colorTex, 
offset[2].xy).rgb; + t = abs(C - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(C - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches. 
+ */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if 
(SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. 
+ */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are biased by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 
offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. 
Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. 
+ #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[0].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x; + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? 
+ SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // 
inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS + +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImg; +layout(binding = 3, set = 2) uniform sampler2D samplerArea; +layout(binding = 4, set = 2) uniform sampler2D samplerSearch; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; + +void main() { + ivec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec2 coord = (texelCoord + vec2(0.5)) 
/ invResolution_data; + vec2 pixCoord; + vec4 offset[3]; + + SMAABlendingWeightCalculationVS( coord, pixCoord, offset); + + vec4 oColor = SMAABlendingWeightCalculationPS(coord, pixCoord, offset, inputImg, samplerArea, samplerSearch, ivec4(0)); + + imageStore(imgOutput, texelCoord, oColor); + } + } +} diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv new file mode 100644 index 0000000000000000000000000000000000000000..8efa011f77f3c49ed762e7465ae685ebd01b8348 GIT binary patch literal 33728 zcmai-2b@*a`L(Yw17h#JjlCDFSOGymQBdq9IKqG^G6(}2D=4N|5-TP#i7kn-L}N78 zSYnDPc1_b5jnNca>>6YFp6AY4%$^hfzi(%D_FC(G-`)2<=UztW+h?8rjlK&t7HKTp zIH<MJOiMTVG!_Ca+UVCTM~ogfdh?lG6F1*x+bwliqS3eMr_Yl37}i2Qu6@ch4WB}m zLRVXwyi}F-@5}%CP~He>#iNXCr4Ra!+k426y*p=5*l_ky19zG|sXL?Vzc}TD&S?`V z;H~tf(U{#nV@mr`Q#-n~7W$+%b(Vr2t5RC$(5W5MCQg|)`Jj#|laKD&y?yG0SyS7) zrgTm_aNLGzX7;3Q4%_;Ootky8L|j+Li4!_IXDn!MWwhy2O0A`_3R>r+Ni#dTY;V!V znpJzlIwo{Z>=-tsee%#LQ>RSpXrD1;!i0{QGn<X!)5=_|LD}8UMH}n>|KqmVsft^t zh}*o18%GNhj<&t6Hk>(ds|8c%*l=l%X=CFiPVShwf5(Z@HUab=XWz!AXcMQ*X!db4 zW5|we8%pQ0r>%kiXIpNTdRtqe)!Q0~mbSJwhTO`I+?mq0hRo<_cT~L^nzU6lrk2JI z__gEXklZ7K;1j3JWHq|yct^_aI$IhB^{mwrjbU)x$ZlvZSN8^)*=(=(L{{_P+StuH zd(C4@V=!7zKUy0@;PtT%HICfc=-=3#a^|d~X0{(UeQL)jwkDecsX6vNDSPI5bjP&G zT}Lk%|AMi%Hbz)SbKM<%_jpHAX1t@oJ>y-Zu`j%*-h)SXOzP@2Osn=>?QI>s#@E`| z2d#Hot&K5g^)Ze$mLuwZIS5>|x1T5Pe;UNcHpdxPYjc0JdYcEJ)!RJK*k+rHG>!oG zw0Xz@(?6p@+dIUVT6>40)!RD^t=`_@##Zf302l2I>pWrF|Fmd(<Bh4ccO+W9y>_&E zdq)}DY;Qp0Sg`GRE}9K_z*)HNALp{AF$tfZz1Y&23@^{b(bmUd^bVRL<`5cp47g{% zEZLX=x2<6vT^$p;Mt07aa#H8CuJ)<Bcg~pBF=K3dS69c3?$LJNpxZQWo$mGYVbFWL zEsajJ6Pow$OxkO0Oo!LUdOVsJZYwvrd%CCn13PAPO_}ijZ@s&HFMxXcv(U=+XT$65 zpMcig{xXd-z_$N?K0C(#&$GiDBbmoZ_jx+`f7)qnoPt(=mYj;#GykoP)8O?noQ~GB zwyljh@ZNi<e`BuNXWlLS8kb_*d-(qQjoN+uL8FH4?{&dk_iKE;M;|$S)QFLuJ@>?` zlz9lcU*3T`x7s%1IbPa!fXCJjp58vA{kW!YYvWjLy`HBnjkD2u)}wFZ0(ke__ibDR z?%wk)jf=rOb+t6+*Z8G1enXA>zEQ^CRO9!-o8#aHc8pKSt1-UP%eQai4Q&`5|3|%S 
zEsc+B{L>om)2DYoT5G(2jW1E-OV#+&HNIAjuU+Ho)c9`ja@_mLo8#tQ5Le^us>RK! z@e^wNq#8f9#!s*DFT-bYvh&<J4}R2?_Ud`qw{a~zwnxA{XQZX^7`$BDCwlR|jpvKl zh5Gi6?E^1kTYB-njb-3zcbi)5w!M6L59!7GHij3mlWMV(d--zj_TsIL6XCtjN^9eM zw5dF>n-9_IOkILDqjOgCy%wptC$B`C)IOoBlgryX_Sw&fd*L(UUh5Ut(pbNx_u1SK zp1BwZF6V8V8sDzQhr@gJSxaM&ntjh+yl-Pa_{3@-9tiG!R<$$^@8#FhII_lP!e<^o zqig)c?sGgFo@d)BHQ&=}d`^v@QRC;;`1v({QH@^(FUNd!&3;WU-nVfRd}1}`TfiCf zoxS{88u!%rWAJj!Pa0c2&;L~OeWu2rt?}n-{N);dt;XNz#rrlsf=}(7z^+@k`8ja% z)bS&xb{^F}b=0)kofDd$wnh!B#<*B(qxtUPeY-r^cMtdL>R|SM2QL}TPhQ@$8=y7k zvgy;Wv5B$MX7ATAvvcY!Kf|H-qfK>FFWdNu?OpBN`(d-nZaV|PJb$ZJw6`?2Ml0`@ zZS-023~Om@SL55)_)f;}-Z?dAZ)f<FY13zQjXKU(u;%!7LF<0$pW!PUTEE7wJ=(bD z>wMF<rLh}&+5V6k-@V3%_u{RMQSd3pO&&9=i#{~#?AO>A&3BsdWA__AZutJ=M~^v( zKJ;(wSJU?%K5Wzhd=St!u4Ws_`$6}3`ZxBk*$x=J|9(R-j$l50*WeKd>gK*<$O&oM z{jOk9@M`3ds9C>t;HDAm4K=RtroZdAcGJ?s+u*S~-`Y#A-8Suyb6?u!+PV6gcGsb_ z%eANf&ZqYH%e9-V-?_CdliWF#o8``<IQ?_(<f-pKxbv|xd0D*lYs0Etyd3}Ek8(xo z>Q5<FUkko4rJ8vKA_g?|Zr??~^7z_@e(G)1vRGxa4sB-JSccL#$G<%PzbxfiWYZ=U zUk+^lR)a53X^fhET7lAb)NN-)O8byDw7226I;DM4H?ED+IP22B_5$qO3}~zmw_gj9 z%M`vUJ=y?+e%7~P<)qd|%^OkLo_hL{wzosGZR^)}d&;`6K5fkSu7$0v%l<pwJt(aw zw(hw8jnReP+qQ7?8C$kzOtSs)ecCxpoNd|nHhOsyrE``stpL_9S>LSNJg~|6Ig8RU zyPsk|54=#t9m9NbqcLZ<f%wZk+l|H>!@@n=jm8&;xlg_gT8o@``Q>Dj^&9`4CYy6_ z_;;JUv4DTC$*<ULQ+no@t|1$vpEgTlt@{SDZN>Lyu=dhFzPF(}e`)t_FzJEtpLFw~ zsp}rNKI$ps`*73uioxNJH~I7?_pH1`F57y&X`geSXX`Dn^WfQW4c?}74b)Q~U(J%9 zUr_%5_@qgbc8h)WroG3%t(!AveuKtc1y?gCX7i@coL$4GHu)7p5_23}%^aWO!4E(5 z&`#kq;8!m3@{Wm_3D-wGKC{3NKQwwl%xw6TOB}Nx<^;Gt>WMiOyxyo~c1^wKz#XGG z+J#`}(ft_hdNjFu>be_^G}yWRBiMbCwGjUa?NgF<Xn&fl=01(~BHX<d%{hG&&AQEL ze{m|dZF96W(d<+DX&Z;5S(ojkpJ&^tM$6FfbGrEhspnj{KI-xLO4DZ^d$fC<zY15= zFFuzveH!=0XFlA0pdO#EH+|;p9R7`NzDr^*gX^On?K-gQ;kXia2ig&zj9aiTe*q`m zANw!6`JB}8AY32y_&f^kGq)$^DR`qXc5q@oggfWuURajHV?Ro5Gc@b9{=|+))5rL< z_jwOqW6pwoe-d2H9G{cBeFlfmZ}P#-^W}P819v^$Betb>E1KMFoO|u3XrB~&?SU#z zJ#mj#aiz9sOLuJcpY^X)Y29sh-;EofIleM`r~DV)JH<H9!N+j#C7Nnz{{)Vysqa*A 
z-^tXpGo&Uq#<4VG(Xce_i-N7+F=njGR`xY$#dkN)jCE}ac1hK2OYZxen)b~rtiN`j zS-#Ji!hK&Wx$kEt-w8&5y-ycvDn0*a{ps(>%3ojWl=}{6@;T->e1|iI`wmxf-{DH` zJDe%=a{$~w$J&&qsq^hSotoKa>UK<O`edAI@c|0vW}jR$KSw#Q_R%@^vr@_ZycDkg z>IL`N@8_l1<$l%)Uje>#!L|E2Cw94?bHerab53|0+|N1T`ujPh<bF;GH{Q=F;l}$p zMLxt)^Rp1yeK!wn7sl*9>r?6TaFDma)%uc8sn)6$te$%Mfvv}RPCfn6)KkyGV70QI z0dV!PwB=b@1Z>^vyRt&Ahnm>6@I3y9HCr6)+F8CwpO&Otifq!pbY-`+Kklz(z}9gy zZTY^n99YeB^C3R+CCJug-D2By&C~7*mCe$2SE5{*Y_i=|D!XOcT@`E{X?HcSTH2Mb zNVYE96+5r)zct9tLDt7{tXb))e=WG<nnR5CHn2YGsb_7l^<;hYTL(=&^{flFpQ&d( zus-TppY_4kug$q}|EpPt>$f3TtrPBjvJqHqE`5`43|3nkU;F0Wvjy4v$T_wi{kE+1 zFI90{!Hre7UC)_X{I><GY4c3U2eH#V?>m53rBwHPd8QIKsM3t{tje|dxoJnRzS=yu z^27}Wr#;W0Jhq`=>oU&sC{LZkz}BU0cS^Z<IHk{$L41PnoqG@X8I;y-9r}7Fdk=f} z>TjI)fxgy1f^tvtsEWHDdx2dK?bbUQY|I3>zS`B?yY6H6pQZlpk-fp?p4bPj<}=Xt zwW%4Ov*A9LdmgRNGWCrCmwR<ATrKr!Q#0PPZM^64W1cQMkaLH7&zlw_Z%;oKrd+(j z-iu2V*xyUAo~0?>b7xV0fvo>LvVHSD_##>Ted_x%rOA6i{Yzvu>k(Vec9dt6tv~hn z96YDe<8v<D{^;YJ$@OuJsnarj`wG~;<*eGbuaed6+Xa*+`=)+AS<SwQ)9!^}`<8m_ z+eKvc_*@LOZ~E9bxjyzSbz0Wv-}7Yhdj;p;IbW&3evhK=Gsxf5uy20X<Xn5cFCpvi z`L<tQBhM%M**4lYD$UQgV>!$2-^<AQYSUNS7;M)0O|o{g=icZ0S=6uXTV&hR$8p*} zwX}5wxZeJil~%TY6`H=<dfFe*?Az67+RY}8Uo%-|+}{T~?lFwo@mxz*Gn>ko#n{#Q zTvzD>DQ)8#vT?3q8|C$6=h*wxvB{0e*nR-6kL|`vE5~*dn!eiVWBVbRcC)GMXU6s; z^m1&sz}3vAGA3iYwbC7%ZQM*YE@S&K*s<k)kQ<Y+-43pg?T$(-$95;0zS`<zy9-Ub zImy17(~o<>nPbPh4E#Q_diLM_m8NcT4<`Pn;EW^j51^?h{%2q{vu(S7_pQ#y&%yVT zP0@Y<zMrgbv|oa)#}w^Bu$q23N52AFr*#;kPuAquV0~Q^=f>}0^v_s+19mL8k=<`{ zW6C>X5%`*9)9U2h5o>_G3)Z5vzU?S~OI9!M(1$8LK978c&!cGSp78;ckC7iHtDjkE zPgMHjl#6l8>-Qwt@wJe(C;ln0?P^cEzXPXlj#aK-_WbWFyLCI}Kaj24tnCxVs+MP$ z`(|CTX>GFe(nh%s*?C!yGW+A{O3(gy2Cil{^&!Xi&tU6_@3UYvvq|5tFemo$FJ$e` zmG5<G&yk%Yvva5RJXv3D8Q)*Qj?KAyo}Byj1+e|op1J!Q*gm<g*^4iN^))%4`lu!5 z?_gtIA}8h_U}J387=6?-_J4wnv5(I0zsUNUwYdgrSp&zmIoY%k*)`aZa^nj7EZd~O zn-+L8u<Nu1rJoVoC|@P_ulU=P?^HZ{<TbeUWjy}|tC^y`0amlmIiqia_0{$|c|O_k z+n%=Ef$vr}@3m;}Ra$umzK^D_Hhr~azkC4JZZ_GE^zA=j?WSlSg4MeF+pqcE>3`w+ 
zYD-_8i*CJ7<0G(hXX$6BPbfbnn|$A}GVPYx4-0`EulvFI^!)ZgQ@?|K+7I3WR`WaC z_nDu5lqTPgw6&7etXrJA`-81}IdbY=7)|}Is_sR=YJUG~A5)LzyR5qHFABCD`>nri z*tYt|$d1#t#kTVn<zm>Jo9vOr;hVxUze~W?OwpFCH080hY&}as^ws8^Tc2lOX}Dt> z!#jk2%fQvlCJSb|dFN|i7C~*G8o+vu$y_Z5w!ZQ#FOO!N$$hDhT4GiJ8*?5xV^|R_ zZ5gAFT4GkJV#@Qsaus8LjnPNVv2K923fQxov9Aj69((tD;c9R-v&rIM*3Nla17f^B z*6;UtITLGwwVO>Aw!ayrwyfP+V6}_cGw#oI;A(4=Gp=>P_Q}0w9diBLQ=S3$howB{ z#C?-7tyjgTZ|lR=%=Wu4*?UgCyyvz-H<kC?mhe1}w<@q}AAM`EecF~X_ng0TR^D?v z6x=(rz2cddjp5dxwciA;W{S3Hr77p$+YF+ww({PyeQmkpwg9{5z00C)S!w0Hw-uVc z+H&uu{efuOO?eO48t$LfXv_L-1GatlzvFYAw}q?Et=4@zu$u9S+a8=a*Ilk(`a1|* zAH$B7R*qpOG<~&Y4A~1iqiHvrEHb`bAlgmQcC9q!+#|a|^wpNT+x1m<PY$MZKUjKq z52f6lY|=igvRh{C!@-Wdyu0^+tLN?>0anZ1Jqm2{{?s;-tY+Kd)V(Lzy34zJFSvT{ z?$Kbi0c87_dMtBy?+vyc`>nri*tYr*vg5REvFq+#xewU6$vz(g_UvSS$HLW2(e|x0 z<;?kh5Ph|kclS8BW6RyWKUmFdvMBHF0}#~ggMG7JV=`9<f~~K-yAOgJXL4Waqn4P1 z!N!z#_aX4KWsE**i8&N(OnG-71~(>mw?1mlop<-)VDBUOGL%QayT{&rcYhA9W;R*m z?j8>@UZ48ieI%N8v&q6fd#`EB+8qT})8_ppAH;d~9dQD9O-l8|swH+J*x20r9dNa^ z$(fT$V8@etU#_3~(evZpvMkToP<ZBJGPs<Nqv2}i^4?c3@BN+WmudTE?tZ`Bp~BwP zg9^N3g}0;JiEN*CX=2`CD37IX@BNH>8r+!hPPp~?`O$fs4(F%2ACCj*ua9$RUD^}d z1y)P!iC})3u`>bMPar2&d*=4@U}MWW`6Rgd-0Ds~8LZYuPTVQr#96OgKl|jpej3<$ z$n)TI_^D*|)G-Ha9iETG&4sI{?K8w=;}dr#IB~Wu*UvgU5A(pTQ|9Na%0GMl3vhkZ z>wEr-Xxhyt3*QOU%e^tIX?0F}KKtxe;K3Dk?S~Y2XocG-cPG0~hEq2Gtc>z(+R6KY zJ{J_+x-P2mi)(y-jbB>hH`MrzHGWgU+i35;8h@(BUn#i#d!yhB!#^sx_K$1)(;D}8 z%rk!dTMO>l>R)i}{*HO+@9&tG+}|-T`C2vZ@0gc%f5$w$jr#o^^OE~J=Ha&Q@0o`? 
zUVqm-T)V$(UUGleyyX6_dCC1<^Ki%O@0!cq=kASj$T<h+f`?)&&+d6>>Un;D1+2C@ z+0S+7Q<{7}Yx^qsdt~>aIA`V(@DTiL*F7c|yT2|Z5325|uYvs@z`d`ZcK2NB`Fh2T zb??g4|8IcZ-{mv)GPre`jL}CeG2a9mQ$8QR1viGDW{f^+iTO6zw#$3wa=3ct=?bvg z0P?kbF7yt&60Gifh4YvCt^%juj#J;O(bVTwcjkA%YSxjs?}8KO*yQ^8+;Ux91J@${ z*MQ@nccAa1xd%+vp^sW(t_9cU=Q=cF_-V%IqgJ+S`&qLefQMAR-WPJ)@~*g^oVEWU zIBTz;cJGhWb92Rwb^hh)=a0blvwSzY1#X=tWAsr=%&lN!d=JQc{1|QwKg}3@)Dm+W z*qHK++zwZd&mEOdxj*hiQ=eO{=Urgq)U#)PA|~HW_H5ltY4U7oyN9gix{A}@{os0^ zeu}1kdbOV(0IQ|_pMljjC#RmDgX{JD0!=+WzpQ-9u|0^U9-m)*hR?6j)YG5efYtJy z@NdCt_9ve?9|GG?`(PXT<TKjCV9!zcIrEW9D?g(>il(o&p3i9Af6nt5ns&3PDw<{X z+~bHtu;tH&o`9>FO;yn>^IiRui0Z!kI|kb^&OPY!PrLPRfc<x1?>f2f+P{bElh2WV z0IQiz76(^<2K5ZYczt}w^G<o1oY+5u^)Z_)vPREBo+6ib<zL{&nCz=QYKeIcY)pB^ zo`)O5PcudzwT$nt;BtH~z{~Od4X$S9(L}2~<9i9sczrUy7r`0d-@*ErO&00vKOpwC zyx0E;H^$`n^ifO9zre<n_rc3>WB6&t=%bePdKGMp^}RwV_p^#?`5Jg#O7-}^4%R>W z>EB>AeOvKuA^UvxJ51};SD(bc1@4aT{w(%3T+M894&(by&G%innwg*GT<0DCy-IV> zz6rKo{q4gWl-g78```=6S%(k6YUZqA{tV+kXalOe7$b8XKZNts{EVsH_}dx1<NhyL z`#EIW{)ke|wvGJ+Y=5+UOeq(CN@+}fzr>C#g}?FoX-^#sVN=W4<QZFEa5=UXxSH8S zSToM|N^PmLA2|K;9aEltwJ_LsbN8bDKHmqRsoMwNLFIW4E&}$m;$X7vG~Zb%)7GNk zv+2lO=FB-+46bIJ^=VV{d3-O}=Y?hBm&7)NIDLKJm5Y7X^>=sEk7dBy;OF_(PrL8G zsb|@W8+#G;%QNQXz|LcNzbubtohD=SQA^AUU}MTNyCRw~$jumi)a--z=Stw*FV4lv z@FA4BBUgdzqpp2*D)J7=^L|xyeN1@{uLf7w*XQsWmEGsCw$;f4$<C`d^U(&jZO672 zST6Q6fxrKpd0iKrdDTz5pA%BgdKJ&S$}>mngPo)D4&DIGI!(stqn4Nr!N!#L+D2%` zAU9+5QA^CmVAr&K&ToRIo_X37tmgU7JMLz1_4+&R=4k1+<Ma${fu=r}eJtM+tY#hQ z$5!CPIX1a|?rqn_HE=EBzco1iS>tWcyc<l`p^sW(wyk2y`Pr_D$^7V}R<>*VsbhPv z?YiGHz8%oi<1?u8Dd%)YH1)aF9P9)(PCfIyGg$p@>UTfx3O2bPwe3Pyb1uYbZ!oyt zry*$Sc@G<^znS*^e7HNf9y1J0Ju$<<YKhqcoH@_fN1&<4XXI!2j6ze-``(^l`(ypa zYtQ@MUf}xs-snmz-}m-L(^p&leQzH$?PgQ8C7NZ<-WY5{u;uwY7OrMCl`(nW+ZUU< z&rZi+JI1-!{JgK-`n|8lfpcH^j<CO+oM-6)U^TPJdUKZ@h-SP#xyyXt%RWB{T_0mI z?+3&8BfBr{k6gd>=MZq&pF`#3^ye_Jn%PwL=Lj_8^+|sY2d6)u1M6c-f5yY@k7vOC z$n{Hqjs%zeX_u4JpQFHPW`3xlXMQFi7_U$I<6WQFiRk*69fv9X>Oiw!<+EoJ+!&L6 
z)kiHclflN6_r}q1W31m8ebgME>wOH^nDpsbu$tLq5#Onh()T#Hn%Vjtd*0usRa)NP ztXF^gFojZk>g@!3Uekx^a5XbO&3g0xc05A;{cQ%C?fYj0+Ktcq+f1<byuWpU)oj}} znGLo-_G=cUTzmp$`Tlky-1j%*_0yg@K3{1Wn>=GX8C;I-6u6q%lyx}?t}S(*22Q_D z1<SL}bHKj8xz76gteT6aZXZskl;`<=1~~6;wqt$zWWLV?>tnV*ru1VTTKR5w7F^93 z+tQ}yb7)se?|jR|e*vtI-!Y5x9kYGjv)K~&GUfKv<?q*wsIcEHj;yfHfl*}Z-itEd zC4ZUvd`AyIuj2Mg+qvY-&-q~M$UeFNuI3${-%q*_{#CLz>##nxjO8M*_0Fx{b1w#~ zUC8*-rds?j1zSh_zXn#DPfq=6@&6{+`qQ6pfz>V}r+&5ge;aK5@xL6b7JqGO`g_-3 z0d{^9e<j@Z!mk25=iyhwts`gpJ7D*RXWCf(^~s+*d>3pR`tj3zU(Mb7Jp}D0_kz6K z3;TAPn*Z%>_QKw1!TVI$eKDrO?u)Tx_r-pc*$3BBclL!oj}+Ya#|p0fiHbYN#$HEu zKVMI2-Io4&_75m;B%8F~RM{<aHh&1Vf8~4E&1mY*tNZ>(VEdi-saxRssQW(kV@i|n zQ`&ANtJxQE+PV#Ff6Mo`+u`b|`%bXh9c25QdMxwj+joI&$8qRy8}{QTl<9{)cazoZ zhdBMX2W&seG2aVUzm>D%cL(=@)%*-&KT?ln`f)$lcI=z}w&5J8-#~W$ZCmVIx~G2% zcAsR=J^*+AvnPKB*GHY7X8XDKevY8sY;tdB4`|D~-mk#k6Xkb%zpk|MUGFz&`fBTW z*X#a{;kRho%_iGQpC3ZA&pBrg!*k9a1RJZ5ebZ07bLd`lJ}l+gug+8IejM!D80)^5 zi{10Ch2KxOC!d6SAA8R9({6vX{SKV|JPOwLDYEr=wzOxieh;qC)gLOYoU1>g>8mYs zmAm3kXxdGV!#+F>x2^2CXTbGy_GdKp^!r(G`fZ)op^x{Ew(|ZtsM#9t>6E#1#ua$~ z3VVMXP+{+m1Iezz!IZgwo~Q2e{#l3@qj2N<6x=#mD(-w5oBQXlVC%N@{`ni_i)54Q z{ZeJO%)I;^Z2!u$_zyJo%;i79_B+2{^DnqQ>Yn>oC{3O{Z7-A6?29;Uy$ZI!<^A&- zTs`;C>tMCqKdHwu=l<Vd+i@KF+lKvklQR9#=MA!&{Sc=gZ-MPcIp(+F>bZa30juTy zNj;Y7$Gc$Lv2Xg@hI63)0@?YuZLxFd{qr8!eUkI`KHT}w{qq4_A9a3u-9P_9&~7%l zSF;DS<xKq-JRe(m|9n(w<(c{zO<!#}Q@QItLDOzF*;e}eDVlxG{lnBRI5!_+Ggcq_ zrk{5ApL@~yu#{)NI!~#)FTT%LvF>}h^|<HT=tS<HR=D?%=S)BCImi9LImg~V`sV)e zY-!J2^#|AIYT-&N=V}0&zS=TZxqlWx({9Rl42#0m9hdjfVqn|O9$Xw;KX*%@sb?HZ zf-?^5vkrZ{kF=Hd(Gktod*4Tgpyh5lw8GvehgI0S<Z!ZU@;S=fNB%u&xqtNW?@9|d z-oGm?T)Tf)n%s3V*8h&*^<9qAx-GqrR-jywY|_3`Ww*?{tPHk)<sGpKntJAPRj~ce zov|8RA9e46H7HG<MQy8-)$EHnZLJBmzvVr!7F<1bw}I7iAEh44+yiTaZO3uwZyWYw zUCQ)BpLNJ;_CuV0tOvFq<(SuptLHx20IZh#DD_yT9~**g$G+)r8_t3H@?_`Vw#Cln z26S#Cu=^zEZ)3RgpZjPNxIXF}rC#^ZrfAyDCiiOgfVP~m&B66Ewne3tXKYI}eYNF` z<v!X9O}p7-Tj}#aH2a+UXlr=RjemcHvHI9I{j|IP+>6eKWw~FSr_{Y2*tJPNwg;=3 
zZNp@}o(XMY&qMjHGzjip<r&mZd(QZd;PlnIO5fa7o;&TC$DP3SdEB|u%6Z%cO<!%9 z$J|xBqG`|kc{l9_x2^2Q!QlEC9D=5veh&qw-_~gz`gk{KEAOTWv|%dWmBzz!Hyv4F z@0Ruod#@Ztc6}yN=588J-R0dgso=&>F1U3ZQ*qbD*xXHffUVooyJ;llD6+}DuxDkr z%)IOcwtwY09gU{$esM1M2HWqvEA0c<N8LMNETzeFscj5d&Ay1!*1ll-Ti#9k!PRp& zjRULYZc06txfAvW+m7SV-!|;Wft2ZoJ_nH1?1wo0I0$S%$}t}dSI^ya2v{w5Q|hry zKMn=kj(yYLHk<?X5oG7zw#Ck+chg~D_esv;;c(|acheDYebnoB)928%n@#T3>;Y{# zQ%8d9XR5u@$}@Eon!eg{rgA4tK+|qE*;e{I5zRj5Zt8$*_dAjCU}N>MZ~AF>|G5{P z4@-IWtMin)j|RIo>BkhXn%Oo?*6W$jCiXm(?@`CXy_=3C>!&?ud@4A7^={HPca!H% zd*<;taD5)9Ra!ZZooM=M%RJ_8nvSO3HuD~JJY3x|d%k9XZ9DsOCfIo^_h%QHdd4ve zoN-v6b?D<=r7d^WBGhv%+0;SK{WPh<&HJUo-YrKLcuIkf0lRinDRXa~NITwJ8OJGb z<HJvdTSvZcJPododj1>B)4}Q|lk?wB&H+2N=A41EH|ByTqi5fpQTe&=P68XJk9)$t zX;18#V70`~184qb0<@n%nON-^$5~*<pY{0y+`7v3`666B>+>aW>OU1=9r~n>FN0Hu zZOV<YP49`b!LCvIe@^9}d*WQUKI-**;yg6%W|RGJU#Mqa7&Aj7={R!s#k2~$FFGsS zMmfFEj|aOKW>RJ^oKM~Eh44#i{K|@FJQu*NC;Y;S=iFZeS2NpxQ+`+F;!4{R&3g28 z4()@s)OjgbEqm>2V72*V{q3V#*5vD8{mbXmH{j~=xvcUj*Z!Mm>T~&?&-45(uyN|? z%eTR{m$Ps=SU<D(jpy_VuzuRoKQ*!Qw<e30eq05&zOt`Z!_{rmJ@g&0>nUG`^1E=? zF?0MqxSHAIJm=3fzmI0TKK9M$;5FdfYuAGHG3EFFuY=nMZRzLrVEbP7^9Hzje$Vg+ zU^U|tcOy7)j$f`{>^FfOf4L4ngsW%#H-pRZ{|H`={}#BK+4DfLVEjLZ7_U#pe=9iS zzYVO9DdWE#ZXdLzpLc*W4*TW)xD!o1<G%~6W_;rA1}Dz(%k_)>C*b<}--D)}@!tzB z$A2HZ9RK}rHM6N)|DT~5uTRGR0663SIanW4#{UbrebAPE{u1ms%5{1WuAcG#3an;) z;(iTIoa2}47yEC(_4WTPntI0n5V#!w!|-zakHFQ;_4R)Y&3Jt>{zt(X|KnhN%JDw| zw-4IV&nLkdN7nx-H1&-CcVIQ+6Zd;?;vBzRKkKkRf2i!aPyPt^jGCkU3C;HxbDn!o zqq)wTRCO4qU+R4Z?7g2qPx~`m%{cp^O|ASpES$R&$)@sI(gpXq;<IE{h1)1+7y1cc zpCzBC%(LTJ>Mx%qFVy&J6;Gf40=J&<=PI6i?s>SH*|D4QEct7tZHZ<*`Z`Ygpe=R& z4Xl=D$%|mM`DFd=qgvMQC9wYGz4v#xdVKy-`IKklpJ?iHt7pl-z{aVkFE4{_FYEjY zSU+=~C9i_@)0Y0JiJialS@JsE`pUlk8?J7fK1<#JyPk5NC2zuA$IS6ta5b~ZdCs%s zZ8YQc$ys_4hn%H%(DgAUcgeeO`=BlJ^&Z&1m;HPnuAZ~>0a(rW#Qg`HIL9y7FZK_? 
zj=x-o|H9QX{*S=r_&<i1<NpM%X0G2QpQ0JBPsV>MIOAtZ8~T*vUkJ@UXiGo)fHMyJ z<+G$OntI0H0#-9VajoFQIexi*vG)Vl*S|lSdd9ynxE%igcsc$>;A&=5x&Dix8Lv;q z@827m@h=Y6ryTzhaQmPw{ah04ILdWe3QaxZUmC1teBzb?C(iN9^^1L3aDDxkLsQTA zmj{>QUjbf@e?_>OxxW4@qZzMH#_!*en(?m!)~6i*s&M<DE&W^#oN;9RS4UIN_}2id z8K1Z{!HIMHa{a8s{;XBmbDy-q>(7$4(dOf8&a-44H2dtc#5#=AFZHeq_TJC4WIec= zarQ%-TKO#TZaIT&I*FX;$H@ggrNS#vo?77zDNif#=>?us;Wo;-WS=)@Qu@46Tc19Z z&l~?8k&^pwiIm)bOC;R*?F#-je0afq_U%#d?cn}fB8k`Tza<i`{lJ22KfK`Dk1V+M znFZHAyWrYSDY$n3U6HcAIW>Mp!HxId6^Xy&J-^`EFDkg>zpCKcuP(UlUsG`H{<|Wn zU;8Zu*Y3YBQrhn+xc0{iZv2x4*Y3YDQpP`1<ImRka|Jive`6%^j^BS{BwYI&6?ad& zmo^~h-q;B2zRa1}817!q=lD(F`lx$vc<%B{-4so`*@UTiZrq>Ra=teMtCjcS=5Y0V zuet?T&HCS`pId=VeqX6=OR}10L!AC@4Yt3I&2u1+ZCkMIYV%CUwRwiO2hXkIJRkBw zJh8lQb^s5fbY48C)@!_e8Pg!Jb2$%t?#UhD#+qzXAGO5n1UANbNX*V~V={O8s3m3> zaN4o&K2vu^Q;*MXl}|apgVEIUo!$_zTE0IX3br0~=X)5X$@$i{JNYuQ^DVZmHtZw7 z-f5SUGnXUL)Uz)}fz`~WjBO9Nwjunsk>_A9uyx%=o=493Mx$$U4!w)y{+^^`-y7`j zp}975$kwS(=3*bP_uD*d#><UKn`6MXc>}qeud(RbQqR6%>(R%0<od)uuCmX=o__C- zrXHUIz}}tnsygKQ=)VElLExOdx8MiEZ8P(D2w2T*avpOo4@EOxpPb9=sl(8;o9w&y zwVM5M4UYiV*YI;_>S=pCSk0XG1?%#z*OvKg2WJh-`8*0;n=waH%Cm+Ozz5Nftf6)4 zlX;s6b{_n_YUAa`#NJWZGxw9w)Z;T5?A&Ml<oe{h^P|D@so(FzqfM!_^1Jh6(Dc=& zueQw5v1r=O&XL*vc!x|yv;FYnz`hqGb{bsGoa9+Gwi8Wz-eabNUHgG#+q7<d@_qF2 zm0cfwqtB>xiiSSUrF~LQzq-I`iJ1kqo;AtZjG0NPZjAf(1aP@;PlT8I_VaKxvvXp7 z`K)ykn(_KLR`2`rv)0Mz`j~9ncLlZd_f)X`(RK=@T$^{p>0tM|w$mu(vCRcrm$o^S z^6Zr}<;0i!cOII${%26i^L#i9e5~z}?SpMur+%sP3*hz1^YF>J{36^~llAJOmY6Sr zjj<nz`7+!X=g=5^)Dm+x*ymr__jAzH<8v<9^~qU~>*L(`Ec^=CcC$CX3RW|l()M|9 zZN_<DT>!S;{C6)Gg4N6><Ffwt{d{zNjCbuX2D|sPT|_DOjxgpD@C-`#ne7^@Ph#hT z)icIR!S(t68k&A4+t5cXF<%GQ=ldII#$?R;sQKrC!zg_oUPhTZz6sutJg;g)?w&6F zjgxOhX}o3PzXeXbV{;#W8%;g$Mwf%t@@{k`*yOWX+ZALr>lRyg8}{#j^Q^OO*ZR9? 
z>RIdWfz`~WjQeW1w%qgI2j`ymUQS=HMc3y1UPCF*J%1he2z+zTTc<u5`}N?f$k~^2 zW76ggVB7RvV_wnb574!xo*Ti|GY=p8A=fANAA-|P`h7E+dVGEazKJ}qsza`i{@!`F zg3CMa$8hV+Jl+OYGn<^pJgaX<GhQF}w(FSleFvKM%=;~1HT%_u?_J>f8s3ejp0<Ai zRx?|_U&}ZTcfz%0KJNu*4VS|wb8#QKHe>FglxGd^2T#B^YiOPNWZr%Xb{_JqmKziM z1BE^9{R~Y#K0gOL_t^(>eezuX1-SmK{$-_=&*}%!^wm~>R{si3yV?0M+aK?tU!&Q6 z_;0}Fv--DiHM8qv&a?U<H0^m-KV10?#K$(RTc12%AF1rtp>Om@E1jaDk85F{)YGrW z!D@+l0&G1#tF;;P7^S)~?%Sup<-YwLyxh0HhpU-Q?w>rX|A1z^K6zGO!^CCp{t;at z`(WEog4NRBr@{6|+n*@q+Pvfb40f+;dxlaT+h4%erR`ZtdG^Zl;JL)lBgg-*XzKbu zM=8&<`UP;F)%L-*tW&?#`8V+TWcP5!_9EO^llAJOmYA2o#(0hr^LMy0nGb!`67vtR z&+2*P^!=Y`>hbv(*!7uLwJX<0fA8;Cz_y#S|0-C`T%P@x;o6LQi@4Xp)|+Sbzrku| zlYKT>hvR(>T_59J`!~VvJ#BAL%JZy#3+$h-xX*0YSbY-vHdsAld<X0p%K3g5uAj*^ p^ifO9dthVA`F<a6Oy*l3HJ`OtQu;i!%-nqdwhn*SK<w`t{67JuZruO? literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl new file mode 100644 index 000000000..668b97d5d --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.glsl @@ -0,0 +1,1402 @@ +#version 430 core +#define SMAA_GLSL_4 1 + +layout (constant_id = 0) const int SMAA_PRESET_LOW = 0; +layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0; +layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0; +layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0; +layout (constant_id = 4) const float METRIC_WIDTH = 1920.0; +layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0; + +#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT) + +layout (local_size_x = 16, local_size_y = 16) in; +/** + * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com) + * Copyright (C) 2013 Jose I. 
Echevarria (joseignacioechevarria@gmail.com) + * Copyright (C) 2013 Belen Masia (bmasia@unizar.es) + * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com) + * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to + * do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. As clarification, there + * is no requirement that the copyright notice and permission be included in + * binary distributions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/** + * _______ ___ ___ ___ ___ + * / || \/ | / \ / \ + * | (---- | \ / | / ^ \ / ^ \ + * \ \ | |\/| | / /_\ \ / /_\ \ + * ----) | | | | | / _____ \ / _____ \ + * |_______/ |__| |__| /__/ \__\ /__/ \__\ + * + * E N H A N C E D + * S U B P I X E L M O R P H O L O G I C A L A N T I A L I A S I N G + * + * http://www.iryoku.com/smaa/ + * + * Hi, welcome aboard! + * + * Here you'll find instructions to get the shader up and running as fast as + * possible. 
+ * + * IMPORTANT NOTICE: when updating, remember to update both this file and the + * precomputed textures! They may change from version to version. + * + * The shader has three passes, chained together as follows: + * + * |input|------------------ + * v | + * [ SMAA*EdgeDetection ] | + * v | + * |edgesTex| | + * v | + * [ SMAABlendingWeightCalculation ] | + * v | + * |blendTex| | + * v | + * [ SMAANeighborhoodBlending ] <------ + * v + * |output| + * + * Note that each [pass] has its own vertex and pixel shader. Remember to use + * oversized triangles instead of quads to avoid overshading along the + * diagonal. + * + * You've three edge detection methods to choose from: luma, color or depth. + * They represent different quality/performance and anti-aliasing/sharpness + * tradeoffs, so our recommendation is for you to choose the one that best + * suits your particular scenario: + * + * - Depth edge detection is usually the fastest but it may miss some edges. + * + * - Luma edge detection is usually more expensive than depth edge detection, + * but catches visible edges that depth edge detection can miss. + * + * - Color edge detection is usually the most expensive one but catches + * chroma-only edges. + * + * For quickstarters: just use luma edge detection. + * + * The general advice is to not rush the integration process and ensure each + * step is done correctly (don't try to integrate SMAA T2x with predicated edge + * detection from the start!). Ok then, let's go! + * + * 1. The first step is to create two RGBA temporal render targets for holding + * |edgesTex| and |blendTex|. + * + * In DX10 or DX11, you can use a RG render target for the edges texture. + * In the case of NVIDIA GPUs, using RG render targets seems to actually be + * slower. + * + * On the Xbox 360, you can use the same render target for resolving both + * |edgesTex| and |blendTex|, as they aren't needed simultaneously. + * + * 2.
Both temporal render targets |edgesTex| and |blendTex| must be cleared + * each frame. Do not forget to clear the alpha channel! + * + * 3. The next step is loading the two supporting precalculated textures, + * 'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as + * C++ headers, and also as regular DDS files. They'll be needed for the + * 'SMAABlendingWeightCalculation' pass. + * + * If you use the C++ headers, be sure to load them in the format specified + * inside of them. + * + * You can also compress 'areaTex' and 'searchTex' using BC5 and BC4 + * respectively, if you have that option in your content processor pipeline. + * When compressing them, you get a non-perceptible quality decrease, and a + * marginal performance increase. + * + * 4. All samplers must be set to linear filtering and clamp. + * + * After you get the technique working, remember that 64-bit inputs have + * half-rate linear filtering on GCN. + * + * If SMAA is applied to 64-bit color buffers, switching to point filtering + * when accessing them will increase the performance. Search for + * 'SMAASamplePoint' to see which textures may benefit from point + * filtering, and where (which is basically the color input in the edge + * detection and resolve passes). + * + * 5. All texture reads and buffer writes must be non-sRGB, with the exception + * of the input read and the output write in + * 'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in + * this last pass are not possible, the technique will work anyway, but + * will perform antialiasing in gamma space. + * + * IMPORTANT: for best results the input read for the color/luma edge + * detection should *NOT* be sRGB. + * + * 6. Before including SMAA.h you'll have to set up the render target metrics, + * the target and any optional configuration defines. Optionally you can + * use a preset.
+ * + * You have the following targets available: + * SMAA_HLSL_3 + * SMAA_HLSL_4 + * SMAA_HLSL_4_1 + * SMAA_GLSL_3 * + * SMAA_GLSL_4 * + * + * * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below). + * + * And four presets: + * SMAA_PRESET_LOW (%60 of the quality) + * SMAA_PRESET_MEDIUM (%80 of the quality) + * SMAA_PRESET_HIGH (%95 of the quality) + * SMAA_PRESET_ULTRA (%99 of the quality) + * + * For example: + * #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0) + * #define SMAA_HLSL_4 + * #define SMAA_PRESET_HIGH + * #include "SMAA.h" + * + * Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a + * uniform variable. The code is designed to minimize the impact of not + * using a constant value, but it is still better to hardcode it. + * + * Depending on how you encoded 'areaTex' and 'searchTex', you may have to + * add (and customize) the following defines before including SMAA.h: + * #define SMAA_AREATEX_SELECT(sample) sample.rg + * #define SMAA_SEARCHTEX_SELECT(sample) sample.r + * + * If your engine is already using porting macros, you can define + * SMAA_CUSTOM_SL, and define the porting functions by yourself. + * + * 7. Then, you'll have to setup the passes as indicated in the scheme above. + * You can take a look into SMAA.fx, to see how we did it for our demo. + * Checkout the function wrappers, you may want to copy-paste them! + * + * 8. It's recommended to validate the produced |edgesTex| and |blendTex|. + * You can use a screenshot from your engine to compare the |edgesTex| + * and |blendTex| produced inside of the engine with the results obtained + * with the reference demo. + * + * 9. After you get the last pass to work, it's time to optimize. You'll have + * to initialize a stencil buffer in the first pass (discard is already in + * the code), then mask execution by using it the second pass. The last + * pass should be executed in all pixels. 
+ * + * + * After this point you can choose to enable predicated thresholding, + * temporal supersampling and motion blur integration: + * + * a) If you want to use predicated thresholding, take a look into + * SMAA_PREDICATION; you'll need to pass an extra texture in the edge + * detection pass. + * + * b) If you want to enable temporal supersampling (SMAA T2x): + * + * 1. The first step is to render using subpixel jitters. I won't go into + * detail, but it's as simple as moving each vertex position in the + * vertex shader, you can check how we do it in our DX10 demo. + * + * 2. Then, you must setup the temporal resolve. You may want to take a look + * into SMAAResolve for resolving 2x modes. After you get it working, you'll + * probably see ghosting everywhere. But fear not, you can enable the + * CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro. + * Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded. + * + * 3. The next step is to apply SMAA to each subpixel jittered frame, just as + * done for 1x. + * + * 4. At this point you should already have something usable, but for best + * results the proper area textures must be set depending on current jitter. + * For this, the parameter 'subsampleIndices' of + * 'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x + * mode: + * + * @SUBSAMPLE_INDICES + * + * | S# | Camera Jitter | subsampleIndices | + * +----+------------------+---------------------+ + * | 0 | ( 0.25, -0.25) | float4(1, 1, 1, 0) | + * | 1 | (-0.25, 0.25) | float4(2, 2, 2, 0) | + * + * These jitter positions assume a bottom-to-top y axis. S# stands for the + * sample number. + * + * More information about temporal supersampling here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * c) If you want to enable spatial multisampling (SMAA S2x): + * + * 1. The scene must be rendered using MSAA 2x. 
The MSAA 2x buffer must be + * created with: + * - DX10: see below (*) + * - DX10.1: D3D10_STANDARD_MULTISAMPLE_PATTERN or + * - DX11: D3D11_STANDARD_MULTISAMPLE_PATTERN + * + * This ensures that the subsample order matches the table in + * @SUBSAMPLE_INDICES. + * + * (*) In the case of DX10, we refer the reader to: + * - SMAA::detectMSAAOrder and + * - SMAA::msaaReorder + * + * These functions allow matching the standard multisample patterns by + * detecting the subsample order for a specific GPU, and reordering + * them appropriately. + * + * 2. A shader must be run to output each subsample into a separate buffer + * (DX10 is required). You can use SMAASeparate for this purpose, or just do + * it in an existing pass (for example, in the tone mapping pass, which has + * the advantage of feeding tone mapped subsamples to SMAA, which will yield + * better results). + * + * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing + * the results in the final buffer. The second run should alpha blend with + * the existing final buffer using a blending factor of 0.5. + * 'subsampleIndices' must be adjusted as in the SMAA T2x case (see point + * b). + * + * d) If you want to enable temporal supersampling on top of SMAA S2x + * (which actually is SMAA 4x): + * + * 1. SMAA 4x consists of temporally jittering SMAA S2x, so the first step is + * to calculate SMAA S2x for current frame.
In this case, 'subsampleIndices' + * must be set as follows: + * + * | F# | S# | Camera Jitter | Net Jitter | subsampleIndices | + * +----+----+--------------------+-------------------+----------------------+ + * | 0 | 0 | ( 0.125, 0.125) | ( 0.375, -0.125) | float4(5, 3, 1, 3) | + * | 0 | 1 | ( 0.125, 0.125) | (-0.125, 0.375) | float4(4, 6, 2, 3) | + * +----+----+--------------------+-------------------+----------------------+ + * | 1 | 2 | (-0.125, -0.125) | ( 0.125, -0.375) | float4(3, 5, 1, 4) | + * | 1 | 3 | (-0.125, -0.125) | (-0.375, 0.125) | float4(6, 4, 2, 4) | + * + * These jitter positions assume a bottom-to-top y axis. F# stands for the + * frame number. S# stands for the sample number. + * + * 2. After calculating SMAA S2x for current frame (with the new subsample + * indices), previous frame must be reprojected as in SMAA T2x mode (see + * point b). + * + * e) If motion blur is used, you may want to do the edge detection pass + * together with motion blur. This has two advantages: + * + * 1. Pixels under heavy motion can be omitted from the edge detection process. + * For these pixels we can just store "no edge", as motion blur will take + * care of them. + * 2. The center pixel tap is reused. + * + * Note that in this case depth testing should be used instead of stenciling, + * as we have to write all the pixels in the motion blur pass. + * + * That's it! + */ + +//----------------------------------------------------------------------------- +// SMAA Presets + +/** + * Note that if you use one of these presets, the following configuration + * macros will be ignored if set in the "Configurable Defines" section. 
+ */ + +#if defined(SMAA_PRESET_LOW) +#define SMAA_THRESHOLD 0.15 +#define SMAA_MAX_SEARCH_STEPS 4 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_MEDIUM) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 8 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_HIGH) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 16 +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#define SMAA_CORNER_ROUNDING 25 +#elif defined(SMAA_PRESET_ULTRA) +#define SMAA_THRESHOLD 0.05 +#define SMAA_MAX_SEARCH_STEPS 32 +#define SMAA_MAX_SEARCH_STEPS_DIAG 16 +#define SMAA_CORNER_ROUNDING 25 +#endif + +//----------------------------------------------------------------------------- +// Configurable Defines + +/** + * SMAA_THRESHOLD specifies the threshold or sensitivity to edges. + * Lowering this value you will be able to detect more edges at the expense of + * performance. + * + * Range: [0, 0.5] + * 0.1 is a reasonable value, and allows to catch most visible edges. + * 0.05 is a rather overkill value, that allows to catch 'em all. + * + * If temporal supersampling is used, 0.2 could be a reasonable value, as low + * contrast edges are properly filtered by just 2x. + */ +#ifndef SMAA_THRESHOLD +#define SMAA_THRESHOLD 0.1 +#endif + +/** + * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection. + * + * Range: depends on the depth range of the scene. + */ +#ifndef SMAA_DEPTH_THRESHOLD +#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD) +#endif + +/** + * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the + * horizontal/vertical pattern searches, at each side of the pixel. + * + * In number of pixels, it's actually the double. So the maximum line length + * perfectly handled by, for example 16, is 64 (by perfectly, we meant that + * longer lines won't look as good, but still antialiased). 
+ * + * Range: [0, 112] + */ +#ifndef SMAA_MAX_SEARCH_STEPS +#define SMAA_MAX_SEARCH_STEPS 16 +#endif + +/** + * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the + * diagonal pattern searches, at each side of the pixel. In this case we jump + * one pixel at a time, instead of two. + * + * Range: [0, 20] + * + * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 + * steps), but it can have a significant impact on older machines. + * + * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing. + */ +#ifndef SMAA_MAX_SEARCH_STEPS_DIAG +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#endif + +/** + * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded. + * + * Range: [0, 100] + * + * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing. + */ +#ifndef SMAA_CORNER_ROUNDING +#define SMAA_CORNER_ROUNDING 25 +#endif + +/** + * If there is a neighbor edge that has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR times + * bigger contrast than current edge, current edge will be discarded. + * + * This eliminates spurious crossing edges, and is based on the fact + * that, if there is too much contrast in a direction, that will hide + * perceptually contrast in the other neighbors. + */ +#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR +#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0 +#endif + +/** + * Predicated thresholding allows to better preserve texture details and to + * improve performance, by decreasing the number of detected edges using an + * additional buffer like the light accumulation buffer, object ids or even the + * depth buffer (the depth buffer usage may be limited to indoor or short range + * scenes). + * + * It locally decreases the luma or color threshold if an edge is found in an + * additional buffer (so the global threshold can be higher). + * + * This method was developed by Playstation EDGE MLAA team, and used in + * Killzone 3, by using the light accumulation buffer.
More information here: + * http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx + */ +#ifndef SMAA_PREDICATION +#define SMAA_PREDICATION 0 +#endif + +/** + * Threshold to be used in the additional predication buffer. + * + * Range: depends on the input, so you'll have to find the magic number that + * works for you. + */ +#ifndef SMAA_PREDICATION_THRESHOLD +#define SMAA_PREDICATION_THRESHOLD 0.01 +#endif + +/** + * How much to scale the global threshold used for luma or color edge + * detection when using predication. + * + * Range: [1, 5] + */ +#ifndef SMAA_PREDICATION_SCALE +#define SMAA_PREDICATION_SCALE 2.0 +#endif + +/** + * How much to locally decrease the threshold. + * + * Range: [0, 1] + */ +#ifndef SMAA_PREDICATION_STRENGTH +#define SMAA_PREDICATION_STRENGTH 0.4 +#endif + +/** + * Temporal reprojection allows to remove ghosting artifacts when using + * temporal supersampling. We use the CryEngine 3 method which also introduces + * velocity weighting. This feature is of extreme importance for totally + * removing ghosting. More information here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * Note that you'll need to setup a velocity buffer for enabling reprojection. + * For static geometry, saving the previous depth buffer is a viable + * alternative. + */ +#ifndef SMAA_REPROJECTION +#define SMAA_REPROJECTION 0 +#endif + +/** + * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to + * remove ghosting trails behind the moving object, which are not removed by + * just using reprojection. Using low values will exhibit ghosting, while using + * high values will disable temporal supersampling under motion. + * + * Behind the scenes, velocity weighting removes temporal supersampling when + * the velocity of the subsamples differs (meaning they are different objects). 
+ * + * Range: [0, 80] + */ +#ifndef SMAA_REPROJECTION_WEIGHT_SCALE +#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0 +#endif + +/** + * On some compilers, discard cannot be used in vertex shaders. Thus, they need + * to be compiled separately. + */ +#ifndef SMAA_INCLUDE_VS +#define SMAA_INCLUDE_VS 1 +#endif +#ifndef SMAA_INCLUDE_PS +#define SMAA_INCLUDE_PS 1 +#endif + +//----------------------------------------------------------------------------- +// Texture Access Defines + +#ifndef SMAA_AREATEX_SELECT +#if defined(SMAA_HLSL_3) +#define SMAA_AREATEX_SELECT(sample) sample.ra +#else +#define SMAA_AREATEX_SELECT(sample) sample.rg +#endif +#endif + +#ifndef SMAA_SEARCHTEX_SELECT +#define SMAA_SEARCHTEX_SELECT(sample) sample.r +#endif + +#ifndef SMAA_DECODE_VELOCITY +#define SMAA_DECODE_VELOCITY(sample) sample.rg +#endif + +//----------------------------------------------------------------------------- +// Non-Configurable Defines + +#define SMAA_AREATEX_MAX_DISTANCE 16 +#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20 +#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0)) +#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0) +#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0) +#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0) +#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0) + +//----------------------------------------------------------------------------- +// Porting Functions + +#if defined(SMAA_HLSL_3) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0)) +#define SMAASample(tex, coord) tex2D(tex, coord) +#define SMAASamplePoint(tex, coord) tex2D(tex, coord) +#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * 
SMAA_RT_METRICS.xy) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#endif +#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1) +SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +#define SMAATexture2D(tex) Texture2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0) +#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset) +#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord) +#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord) +#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else 
+#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + * + * Returns float3(current, left, top), each taken from the red channel of + * 'tex'. + * + * texcoord: coordinate of the current pixel. + * offset: offset[0].xy and offset[0].zw are sampled as the left and top + * neighbors; only read on the non-gather path. + * tex: texture to read; only its red channel is used on the non-gather path. + * + * When SMAAGather is defined, a single gather at + * texcoord - 0.5 * SMAA_RT_METRICS.xy replaces the three point samples. The + * .grb swizzle presumably reorders the gathered texels into + * (current, left, top) - TODO confirm against the gather component order. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication.
+ */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood 
Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = 
dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * Flags an edge at the left (x) and/or top (y) boundary of the current pixel + * when the largest per-channel color difference to that neighbor reaches the + * threshold, then drops edges that are dominated by a much stronger + * neighboring edge (local contrast adaptation). + * + * texcoord: coordinate of the current pixel. + * offset: offset[0] is sampled as the left/top neighbors, offset[1] as the + * right/bottom neighbors and offset[2] as the left-left/top-top + * neighbors (as produced by SMAAEdgeDetectionVS). + * colorTex: color input. + * predicationTex: only present when SMAA_PREDICATION is enabled. + * + * Returns the edge mask in (x, y), or float2(-2.0, -2.0) as a "no edge" + * sentinel (this port returns the sentinel at the point commented as the + * discard). + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas. These are the contrasts of the + // edges one pixel further left/up, so each neighbor is compared against its + // own neighbor (same as the luma path), not against the center pixel: + float3 Cleftleft = SMAASamplePoint(colorTex, 
offset[2].xy).rgb; + t = abs(Cleft - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(Ctop - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches. 
+ */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if 
(SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. 
+ */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are bias by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 
offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. 
Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. 
+ #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[0].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x (@CROSSING_OFFSET) + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? 
+ SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // 
inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS + +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImg; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; + +void main() +{ + vec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec2 coord = (texelCoord + vec2(0.5)) / invResolution_data; + vec4 offset[3]; + SMAAEdgeDetectionVS(coord, offset); + vec2 oColor = 
SMAAColorEdgeDetectionPS(coord, offset, inputImg); + if (oColor != float2(-2.0, -2.0)) + { + imageStore(imgOutput, texelCoord, vec4(oColor, 0.0, 1.0)); + } + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv new file mode 100644 index 0000000000000000000000000000000000000000..1062a9e3abd0624aec557d24c06e1a161dec6874 GIT binary patch literal 8464 zcmZ9Q33OG}6^1X6m!RNOkwH<3D54-DqKJqPl8{7$K@tSC7N6k-Vv{^d9+X<!;za3W zCp%f&+IhCxTJ5ZzTdSSt(c00DcCFT_)%N>t&aRhtmfQ31e^2L}eeS(4Oc+1ADVs1Z zJ20D=Z5o&5<FIUeCY+Qt=5t4PZ})=H+Li^2&seC#!P$h|PoG2ZDKR%7!{xyd4UZy+ z!pj|&WA!Z0HG$ai;9ChU;Zck$u?qcr*R-^>Z`oRDtJEs}wZZDhMZI%(4J=x=Yhdvu zr(e2^xDs`<tXA3GU#;%g0?cz+qdG7!TB)fYlpTp~uGZ>Mb;o`Q*Y1~abkS+c#&VRh zX@L5?)A!3;%4VSVo3pR7dn~RoI~K5}y>DGt>*ftzZGD}=H$TmHws&=O_MyA}Rm`<* zJ1V2wszYv|#JO}Zx8}GZ>lP1H2G~k&>t!Cp4cSKUmda4AoZA|*0p`|PbvqNj4cV2< z70z!o>HC;lcMNXbmY+*QcC-5B)mp7OjDIP+4Sl%0yDg>6xjXf-{{Pp|klias9ZMPa zAo{@INO>qlx`%(l4ORQGPR#GY)}hTEL)FX6LtP`gs{LiV+0}-RzKz_0!GsS1Uy<Wd zHXTe4DnqSf-E~iM;G)?j?$&%`M|Q2Nj8=zsnw8i_YRWtIvu)l|_OPtEAlc1%I{4}P z(vU4i-(KEP9wuJ<nRWV7{qxJDy_B7eJ~*;{XRT{^t2K>TRu{8;KfRQ78Z*4LXJ?JL zJf<=0h4Y?nUc0Wnx4mz3ch3fLHD!Hu{F?T*uJvoI)0C~RvvqQ}#@5i3T~udV-`%&a z1!D*6^PXY`jp@xv_ZimivpWfyiF8sa&T}@pqk}V)^BLCP^E@uMJgu8E?4FVKqOKh} zkM}&Z`$jRJx^{=UXTyK_Q1?vK6?M-+9Qob5dgR-P;(mN)564TtW~|!9Q$dZ)vyf@z zYhqT{HVb_svz(&+0A}r;-x2)(K;~J9V_wp&Xa1S!lbMZ?Gv7hX=8|vbd=F-}mUXnv z!257!Ym7sRBlM%lJ_Un*<~$-f$u$!>mDw7RQ@d-mr?U|AxK{O}5r?`pkIA(?D;6c) z`;ui3w}sxE>o0B#{}Xb3&)uQV$@MIo&kEf4iHJV(ir6*JV|I=D#9B@PyS9j31a>_U zD-Pc!V9zb;o&#pw6S2!t<RVrazOCT<@4IjL{QODEGT%pXial;;Zba;%YhBIkTIJ1Q z&o?0s?H4m^Hzw?tg568#Ww7gwntQ<RN11}<8s@O=1-stjnjb=EtaN?<0=R})-y>ja z$uGmM?sF{X`W{Qzm{`xh!QNr>jmL2&*!AgOLU%uM`a2&4r@!`T$!?r>_k9wWu_E?x zGP;~{3@i1WVNT;u0q1AI?0VIGk3{`b6V_k5_t*D`!}Yd+Jum%zURI;KhU5Yo8`@*d z9m%M#In{mFIK12L(RYm_bl){a-FHn<_g&+NGu?phZ;<({#$NdDkyE@6XL8?SO{jVQ z9)l@?U6aqN-_O{$eefGu)csC|ZXf(shHkvy$<TeD_>Bu)yWhCbwfl_=UAy17qVD%B 
zbp8FFskg9d@73docg^QvIeYMqJ%PwyhmZPy66c+}DxLL{iH|%_p_|8Ziah@X%SRp? zELY4k4o-fpXUoh_+05MXRcg7`hQy!CiF>~4V$b$y^vCHz6WCrjKSJCA%m*S4?URz- z+4Xr}CWFm!JGH%!2ZQB&R-V8|eIjBmbBnEO52Nm($>wa`XD}a*IIKG**`1^A5nyvf z-Kk)?sH=V$VlL~7J!jujGr*of^vmbwNI3b(e-zk#d4Aewg5|Czhx*ZAW8_EDSsVkF zk2=SKtrPt;ZWf$;)M*C0kEnATST5?!1{)(Eb&dziN54-1TU*|<^?8u9#(3rv5xKYv za}#F`@5VfIIp0^V{Z!_Ki0>nNWFGxaOZ>hRw+P)>dF%Sz$%X$JU^#6*bLvYuJ)i$2 zVE@jM_xUr%-vehQ-58%+^~iS)IP&>Ssz=<pVEv8r`BY!Zs$KUA@R7{MX!AX)pU=6r z$iEUScNhC>VP1vE?Irdm*600hP5h(8w}TztbNMzz&if`d?`q~Qq%-OE<2=NEMBVej z*1eFr-jQywoFlk3iIZN-uKdk#0gArb=Aio>?Wd;otfAfKC)VVw?z80@on3>^aSzyg zxQ8{UuLaBbd&INSCTG0QsqsEr>b@h?og?3c;6^0!tpm$NK5cTw$9u<j#FN>$>?|a{ z6W$SrzY&+P4&TrI-7EH8?6)Rg3$yn!f5yO`g}>W^J1%j}%yE9R!Ltx;`fBUpWIfyC z5$%eDrRBpp`ZgErS|a8ou$<y3#!P1A<~=#_{;speiHLDNcm5unkGLnFJI`I+nApbx zaQ)s+O`PkGy)6XmtF3--r@?7g9L06U-cE-v?rkx;oZ=|P#NN(GynC~TvvIMvGr{gH z?tr>6vA46p^?O^IxZ>W<hSOJD{oedX4DE`exX#$yx$woktw5Jk9L1Q}+seedCu=N6 zjElXsfZdzF$?co^N~Cz7eHZy2a!f_yK2J;7`#e2ifA7seyxT`H$8V!mVDqT2X6~rd zyX*Acg6{p=SkR3hDCj<0R~B^b`|9+Y3%dTd6?FaYtkdr;=of<@OuA>}UfK}*GKIMl zanI4GF0f~_F}?531M4F{9+`vm@Lc&f$@z$O#bFQiv5se?J?g9hTPJ#V0oZeoZ=KI+ z4<g?~ZO?mc;^ZCXk9A!LHm5%I>skk=U2zoGWv{fmF89!fxYp3ugY~(VcZ2qe5Pjq) z^QM#AfS9uZ(e8fjxxD-H%r8OYHX-J2WPT<hA3o0l>vJg*KAXYvlQ{unE<>y}7cpLY z#9t0JzKleCKUhBEw}2!5GO+fDuYkRquF*ZH>u)aa##W@*vu)`0JsX6RkDgrtmQ(D@ z1SER)Y_K`?sqfhkoOZ?Gn)PvA1I*f8m*+o>6ni#;u21x=ntbG=XWPN%v}f9*XYz3# z&jHu>YzLft_>3l>V$W)D@^R)n!ExrsYmfL{;QF3D7fwFncY`C|T-qc4d0=~HefOZQ zzh~3L{Cs3p(tW0`LiCURydZJ%4(nLg*cT>!A+ve(wTAB}ZBgSzV7VL9xBqIe+#ci( zxNDfNMdaro{%*UT+2QXtZPy`kzH7x#G4D;feB8m8fQxtVrRdX<Q<9H+d>JAidwhA~ z<Q?uY?%*rH=G3SD4&DH#U2zx}cW@tAd)&cSf{S<XRp|P}9eg!dANjb0uK}CWJE%SG zpnUZAwP5eZCd68e%r_$P;qyAMKE*rudUW}?LvH|EYc68E_K1HY*!VIMHQof4kNBIw z5pOQ-5&veecglG8pss)P>@DD8&)$k&-?O*D$w$v_0m~_#dGWpRc64*zg4Fly9dOze z$2cT<b~9Le^z5DBV$a@%u21yr-C%v>qi62{o70|YkDkfLdAt`~-?R6@$%oJTlTWc{ zAApmOGyfpiTK3F%?GgVWaDC4{3@0D)w}K<yT-qc4BVc<r32_hV`g=A$L$@Q^qaPnl 
zoV>%l;rlUg(f1B?ImNTLcdn_4`Qu2e$(;IH|F&d{J98I$@y>h#eLAuPiMpQz%f}i& z1(s7B)`{o*(_nMz<C^9m@w??SVC{;-xVSTSg0)+-o%yp!@y>h>U7xr!cPAhDxHI>F z&FP)d9(U#*aP;HzVEefVv6j#I7ZCaI`65`K;+^>tx_tEf%V2AHXN=b#@m~QOUq+(F zSHbcT|21&Ln@fAde;sU}jdu^~`kQMC^EZ$gi2n1K??dW)c7NidXWvAZQ#^C;O7!eo zU~}rDt=O|~qia_ju2~<~buY8_=-GF`#h!f^U7zUL_rUtdN6)?wHm5z)9zBzf^Y{VS z^DOr4hv@R*^P}Wb?Aedu<m1eL0**5`UVFs<6kOl4pTWsT{LjG=Z!YZ-{|m4^i|1Tj zf6u0g`Im^#p1RNduMqvCKMy2MKKf%_V}FfAUGqGEcuwZi7HjwoSgv?ZzeSf1pWh{) z;yL{uPCoYe2XNFgUc2=kWY*>$8kzry$i<%i1TOCB&**ZB!<;v8XWi3Z5bX~k{@b41 uUlDUEvG>1$ja!IlH!jZf@8Bzu>r?#0U^!!4uQs{x{|8vk-#lXf?d5;9EWxh; literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl new file mode 100644 index 000000000..df30d727b --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.glsl @@ -0,0 +1,1403 @@ +#version 430 core +#define SMAA_GLSL_4 1 + +layout (constant_id = 0) const int SMAA_PRESET_LOW = 0; +layout (constant_id = 1) const int SMAA_PRESET_MEDIUM = 0; +layout (constant_id = 2) const int SMAA_PRESET_HIGH = 0; +layout (constant_id = 3) const int SMAA_PRESET_ULTRA = 0; +layout (constant_id = 4) const float METRIC_WIDTH = 1920.0; +layout (constant_id = 5) const float METRIC_HEIGHT = 1080.0; + +#define SMAA_RT_METRICS float4(1.0 / METRIC_WIDTH, 1.0 / METRIC_HEIGHT, METRIC_WIDTH, METRIC_HEIGHT) + +layout (local_size_x = 16, local_size_y = 16) in; +/** + * Copyright (C) 2013 Jorge Jimenez (jorge@iryoku.com) + * Copyright (C) 2013 Jose I. 
Echevarria (joseignacioechevarria@gmail.com) + * Copyright (C) 2013 Belen Masia (bmasia@unizar.es) + * Copyright (C) 2013 Fernando Navarro (fernandn@microsoft.com) + * Copyright (C) 2013 Diego Gutierrez (diegog@unizar.es) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to + * do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. As clarification, there + * is no requirement that the copyright notice and permission be included in + * binary distributions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/** + * _______ ___ ___ ___ ___ + * / || \/ | / \ / \ + * | (---- | \ / | / ^ \ / ^ \ + * \ \ | |\/| | / /_\ \ / /_\ \ + * ----) | | | | | / _____ \ / _____ \ + * |_______/ |__| |__| /__/ \__\ /__/ \__\ + * + * E N H A N C E D + * S U B P I X E L M O R P H O L O G I C A L A N T I A L I A S I N G + * + * http://www.iryoku.com/smaa/ + * + * Hi, welcome aboard! + * + * Here you'll find instructions to get the shader up and running as fast as + * possible. 
+ * + * IMPORTANT NOTICE: when updating, remember to update both this file and the + * precomputed textures! They may change from version to version. + * + * The shader has three passes, chained together as follows: + * + * |input|------------------ + * v | + * [ SMAA*EdgeDetection ] | + * v | + * |edgesTex| | + * v | + * [ SMAABlendingWeightCalculation ] | + * v | + * |blendTex| | + * v | + * [ SMAANeighborhoodBlending ] <------ + * v + * |output| + * + * Note that each [pass] has its own vertex and pixel shader. Remember to use + * oversized triangles instead of quads to avoid overshading along the + * diagonal. + * + * You've three edge detection methods to choose from: luma, color or depth. + * They represent different quality/performance and anti-aliasing/sharpness + * tradeoffs, so our recommendation is for you to choose the one that best + * suits your particular scenario: + * + * - Depth edge detection is usually the fastest but it may miss some edges. + * + * - Luma edge detection is usually more expensive than depth edge detection, + * but catches visible edges that depth edge detection can miss. + * + * - Color edge detection is usually the most expensive one but catches + * chroma-only edges. + * + * For quickstarters: just use luma edge detection. + * + * The general advice is to not rush the integration process and ensure each + * step is done correctly (don't try to integrate SMAA T2x with predicated edge + * detection from the start!). Ok then, let's go! + * + * 1. The first step is to create two RGBA temporary render targets for holding + * |edgesTex| and |blendTex|. + * + * In DX10 or DX11, you can use a RG render target for the edges texture. + * In the case of NVIDIA GPUs, using RG render targets seems to actually be + * slower. + * + * On the Xbox 360, you can use the same render target for resolving both + * |edgesTex| and |blendTex|, as they aren't needed simultaneously. + * + * 2.
Both temporary render targets |edgesTex| and |blendTex| must be cleared + * each frame. Do not forget to clear the alpha channel! + * + * 3. The next step is loading the two supporting precalculated textures, + * 'areaTex' and 'searchTex'. You'll find them in the 'Textures' folder as + * C++ headers, and also as regular DDS files. They'll be needed for the + * 'SMAABlendingWeightCalculation' pass. + * + * If you use the C++ headers, be sure to load them in the format specified + * inside of them. + * + * You can also compress 'areaTex' and 'searchTex' using BC5 and BC4 + * respectively, if you have that option in your content processor pipeline. + * When compressing them, you get a non-perceptible quality decrease, and a + * marginal performance increase. + * + * 4. All samplers must be set to linear filtering and clamp. + * + * After you get the technique working, remember that 64-bit inputs have + * half-rate linear filtering on GCN. + * + * If SMAA is applied to 64-bit color buffers, switching to point filtering + * when accessing them will increase the performance. Search for + * 'SMAASamplePoint' to see which textures may benefit from point + * filtering, and where (which is basically the color input in the edge + * detection and resolve passes). + * + * 5. All texture reads and buffer writes must be non-sRGB, with the exception + * of the input read and the output write in + * 'SMAANeighborhoodBlending' (and only in this pass!). If sRGB reads in + * this last pass are not possible, the technique will work anyway, but + * will perform antialiasing in gamma space. + * + * IMPORTANT: for best results the input read for the color/luma edge + * detection should *NOT* be sRGB. + * + * 6. Before including SMAA.h you'll have to set up the render target metrics, + * the target and any optional configuration defines. Optionally you can + * use a preset.
+ * + * You have the following targets available: + * SMAA_HLSL_3 + * SMAA_HLSL_4 + * SMAA_HLSL_4_1 + * SMAA_GLSL_3 * + * SMAA_GLSL_4 * + * + * * (See SMAA_INCLUDE_VS and SMAA_INCLUDE_PS below). + * + * And four presets: + * SMAA_PRESET_LOW (60% of the quality) + * SMAA_PRESET_MEDIUM (80% of the quality) + * SMAA_PRESET_HIGH (95% of the quality) + * SMAA_PRESET_ULTRA (99% of the quality) + * + * For example: + * #define SMAA_RT_METRICS float4(1.0 / 1280.0, 1.0 / 720.0, 1280.0, 720.0) + * #define SMAA_HLSL_4 + * #define SMAA_PRESET_HIGH + * #include "SMAA.h" + * + * Note that SMAA_RT_METRICS doesn't need to be a macro, it can be a + * uniform variable. The code is designed to minimize the impact of not + * using a constant value, but it is still better to hardcode it. + * + * Depending on how you encoded 'areaTex' and 'searchTex', you may have to + * add (and customize) the following defines before including SMAA.h: + * #define SMAA_AREATEX_SELECT(sample) sample.rg + * #define SMAA_SEARCHTEX_SELECT(sample) sample.r + * + * If your engine is already using porting macros, you can define + * SMAA_CUSTOM_SL, and define the porting functions by yourself. + * + * 7. Then, you'll have to set up the passes as indicated in the scheme above. + * You can take a look into SMAA.fx, to see how we did it for our demo. + * Check out the function wrappers, you may want to copy-paste them! + * + * 8. It's recommended to validate the produced |edgesTex| and |blendTex|. + * You can use a screenshot from your engine to compare the |edgesTex| + * and |blendTex| produced inside of the engine with the results obtained + * with the reference demo. + * + * 9. After you get the last pass to work, it's time to optimize. You'll have + * to initialize a stencil buffer in the first pass (discard is already in + * the code), then mask execution by using it in the second pass. The last + * pass should be executed in all pixels.
+ * + * + * After this point you can choose to enable predicated thresholding, + * temporal supersampling and motion blur integration: + * + * a) If you want to use predicated thresholding, take a look into + * SMAA_PREDICATION; you'll need to pass an extra texture in the edge + * detection pass. + * + * b) If you want to enable temporal supersampling (SMAA T2x): + * + * 1. The first step is to render using subpixel jitters. I won't go into + * detail, but it's as simple as moving each vertex position in the + * vertex shader, you can check how we do it in our DX10 demo. + * + * 2. Then, you must setup the temporal resolve. You may want to take a look + * into SMAAResolve for resolving 2x modes. After you get it working, you'll + * probably see ghosting everywhere. But fear not, you can enable the + * CryENGINE temporal reprojection by setting the SMAA_REPROJECTION macro. + * Check out SMAA_DECODE_VELOCITY if your velocity buffer is encoded. + * + * 3. The next step is to apply SMAA to each subpixel jittered frame, just as + * done for 1x. + * + * 4. At this point you should already have something usable, but for best + * results the proper area textures must be set depending on current jitter. + * For this, the parameter 'subsampleIndices' of + * 'SMAABlendingWeightCalculationPS' must be set as follows, for our T2x + * mode: + * + * @SUBSAMPLE_INDICES + * + * | S# | Camera Jitter | subsampleIndices | + * +----+------------------+---------------------+ + * | 0 | ( 0.25, -0.25) | float4(1, 1, 1, 0) | + * | 1 | (-0.25, 0.25) | float4(2, 2, 2, 0) | + * + * These jitter positions assume a bottom-to-top y axis. S# stands for the + * sample number. + * + * More information about temporal supersampling here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * c) If you want to enable spatial multisampling (SMAA S2x): + * + * 1. The scene must be rendered using MSAA 2x. 
The MSAA 2x buffer must be + * created with: + * - DX10: see below (*) + * - DX10.1: D3D10_STANDARD_MULTISAMPLE_PATTERN or + * - DX11: D3D11_STANDARD_MULTISAMPLE_PATTERN + * + * This ensures that the subsample order matches the table in + * @SUBSAMPLE_INDICES. + * + * (*) In the case of DX10, we refer the reader to: + * - SMAA::detectMSAAOrder and + * - SMAA::msaaReorder + * + * These functions make it possible to match the standard multisample patterns by + * detecting the subsample order for a specific GPU, and reordering + * them appropriately. + * + * 2. A shader must be run to output each subsample into a separate buffer + * (DX10 is required). You can use SMAASeparate for this purpose, or just do + * it in an existing pass (for example, in the tone mapping pass, which has + * the advantage of feeding tone mapped subsamples to SMAA, which will yield + * better results). + * + * 3. The full SMAA 1x pipeline must be run for each separated buffer, storing + * the results in the final buffer. The second run should alpha blend with + * the existing final buffer using a blending factor of 0.5. + * 'subsampleIndices' must be adjusted as in the SMAA T2x case (see point + * b). + * + * d) If you want to enable temporal supersampling on top of SMAA S2x + * (which actually is SMAA 4x): + * + * 1. SMAA 4x consists of temporally jittering SMAA S2x, so the first step is + * to calculate SMAA S2x for current frame.
In this case, 'subsampleIndices' + * must be set as follows: + * + * | F# | S# | Camera Jitter | Net Jitter | subsampleIndices | + * +----+----+--------------------+-------------------+----------------------+ + * | 0 | 0 | ( 0.125, 0.125) | ( 0.375, -0.125) | float4(5, 3, 1, 3) | + * | 0 | 1 | ( 0.125, 0.125) | (-0.125, 0.375) | float4(4, 6, 2, 3) | + * +----+----+--------------------+-------------------+----------------------+ + * | 1 | 2 | (-0.125, -0.125) | ( 0.125, -0.375) | float4(3, 5, 1, 4) | + * | 1 | 3 | (-0.125, -0.125) | (-0.375, 0.125) | float4(6, 4, 2, 4) | + * + * These jitter positions assume a bottom-to-top y axis. F# stands for the + * frame number. S# stands for the sample number. + * + * 2. After calculating SMAA S2x for current frame (with the new subsample + * indices), previous frame must be reprojected as in SMAA T2x mode (see + * point b). + * + * e) If motion blur is used, you may want to do the edge detection pass + * together with motion blur. This has two advantages: + * + * 1. Pixels under heavy motion can be omitted from the edge detection process. + * For these pixels we can just store "no edge", as motion blur will take + * care of them. + * 2. The center pixel tap is reused. + * + * Note that in this case depth testing should be used instead of stenciling, + * as we have to write all the pixels in the motion blur pass. + * + * That's it! + */ + +//----------------------------------------------------------------------------- +// SMAA Presets + +/** + * Note that if you use one of these presets, the following configuration + * macros will be ignored if set in the "Configurable Defines" section. 
+ */ + +#if defined(SMAA_PRESET_LOW) +#define SMAA_THRESHOLD 0.15 +#define SMAA_MAX_SEARCH_STEPS 4 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_MEDIUM) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 8 +#define SMAA_DISABLE_DIAG_DETECTION +#define SMAA_DISABLE_CORNER_DETECTION +#elif defined(SMAA_PRESET_HIGH) +#define SMAA_THRESHOLD 0.1 +#define SMAA_MAX_SEARCH_STEPS 16 +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#define SMAA_CORNER_ROUNDING 25 +#elif defined(SMAA_PRESET_ULTRA) +#define SMAA_THRESHOLD 0.05 +#define SMAA_MAX_SEARCH_STEPS 32 +#define SMAA_MAX_SEARCH_STEPS_DIAG 16 +#define SMAA_CORNER_ROUNDING 25 +#endif + +//----------------------------------------------------------------------------- +// Configurable Defines + +/** + * SMAA_THRESHOLD specifies the threshold or sensitivity to edges. + * Lowering this value you will be able to detect more edges at the expense of + * performance. + * + * Range: [0, 0.5] + * 0.1 is a reasonable value, and allows to catch most visible edges. + * 0.05 is a rather overkill value, that allows to catch 'em all. + * + * If temporal supersampling is used, 0.2 could be a reasonable value, as low + * contrast edges are properly filtered by just 2x. + */ +#ifndef SMAA_THRESHOLD +#define SMAA_THRESHOLD 0.1 +#endif + +/** + * SMAA_DEPTH_THRESHOLD specifies the threshold for depth edge detection. + * + * Range: depends on the depth range of the scene. + */ +#ifndef SMAA_DEPTH_THRESHOLD +#define SMAA_DEPTH_THRESHOLD (0.1 * SMAA_THRESHOLD) +#endif + +/** + * SMAA_MAX_SEARCH_STEPS specifies the maximum steps performed in the + * horizontal/vertical pattern searches, at each side of the pixel. + * + * In number of pixels, it's actually the double. So the maximum line length + * perfectly handled by, for example 16, is 64 (by perfectly, we meant that + * longer lines won't look as good, but still antialiased). 
+ * + * Range: [0, 112] + */ +#ifndef SMAA_MAX_SEARCH_STEPS +#define SMAA_MAX_SEARCH_STEPS 16 +#endif + +/** + * SMAA_MAX_SEARCH_STEPS_DIAG specifies the maximum steps performed in the + * diagonal pattern searches, at each side of the pixel. In this case we jump + * one pixel at a time, instead of two. + * + * Range: [0, 20] + * + * On high-end machines it is cheap (between a 0.8x and 0.9x slower for 16 + * steps), but it can have a significant impact on older machines. + * + * Define SMAA_DISABLE_DIAG_DETECTION to disable diagonal processing. + */ +#ifndef SMAA_MAX_SEARCH_STEPS_DIAG +#define SMAA_MAX_SEARCH_STEPS_DIAG 8 +#endif + +/** + * SMAA_CORNER_ROUNDING specifies how much sharp corners will be rounded. + * + * Range: [0, 100] + * + * Define SMAA_DISABLE_CORNER_DETECTION to disable corner processing. + */ +#ifndef SMAA_CORNER_ROUNDING +#define SMAA_CORNER_ROUNDING 25 +#endif + +/** + * If there is a neighbor edge that has SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR times + * bigger contrast than current edge, current edge will be discarded. + * + * This allows to eliminate spurious crossing edges, and is based on the fact + * that, if there is too much contrast in a direction, that will hide + * perceptually contrast in the other neighbors. + */ +#ifndef SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR +#define SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR 2.0 +#endif + +/** + * Predicated thresholding allows to better preserve texture details and to + * improve performance, by decreasing the number of detected edges using an + * additional buffer like the light accumulation buffer, object ids or even the + * depth buffer (the depth buffer usage may be limited to indoor or short range + * scenes). + * + * It locally decreases the luma or color threshold if an edge is found in an + * additional buffer (so the global threshold can be higher). + * + * This method was developed by Playstation EDGE MLAA team, and used in + * Killzone 3, by using the light accumulation buffer.
More information here: + * http://iryoku.com/aacourse/downloads/06-MLAA-on-PS3.pptx + */ +#ifndef SMAA_PREDICATION +#define SMAA_PREDICATION 0 +#endif + +/** + * Threshold to be used in the additional predication buffer. + * + * Range: depends on the input, so you'll have to find the magic number that + * works for you. + */ +#ifndef SMAA_PREDICATION_THRESHOLD +#define SMAA_PREDICATION_THRESHOLD 0.01 +#endif + +/** + * How much to scale the global threshold used for luma or color edge + * detection when using predication. + * + * Range: [1, 5] + */ +#ifndef SMAA_PREDICATION_SCALE +#define SMAA_PREDICATION_SCALE 2.0 +#endif + +/** + * How much to locally decrease the threshold. + * + * Range: [0, 1] + */ +#ifndef SMAA_PREDICATION_STRENGTH +#define SMAA_PREDICATION_STRENGTH 0.4 +#endif + +/** + * Temporal reprojection allows to remove ghosting artifacts when using + * temporal supersampling. We use the CryEngine 3 method which also introduces + * velocity weighting. This feature is of extreme importance for totally + * removing ghosting. More information here: + * http://iryoku.com/aacourse/downloads/13-Anti-Aliasing-Methods-in-CryENGINE-3.pdf + * + * Note that you'll need to setup a velocity buffer for enabling reprojection. + * For static geometry, saving the previous depth buffer is a viable + * alternative. + */ +#ifndef SMAA_REPROJECTION +#define SMAA_REPROJECTION 0 +#endif + +/** + * SMAA_REPROJECTION_WEIGHT_SCALE controls the velocity weighting. It allows to + * remove ghosting trails behind the moving object, which are not removed by + * just using reprojection. Using low values will exhibit ghosting, while using + * high values will disable temporal supersampling under motion. + * + * Behind the scenes, velocity weighting removes temporal supersampling when + * the velocity of the subsamples differs (meaning they are different objects). 
+ * + * Range: [0, 80] + */ +#ifndef SMAA_REPROJECTION_WEIGHT_SCALE +#define SMAA_REPROJECTION_WEIGHT_SCALE 30.0 +#endif + +/** + * On some compilers, discard cannot be used in vertex shaders. Thus, they need + * to be compiled separately. + */ +#ifndef SMAA_INCLUDE_VS +#define SMAA_INCLUDE_VS 1 +#endif +#ifndef SMAA_INCLUDE_PS +#define SMAA_INCLUDE_PS 1 +#endif + +//----------------------------------------------------------------------------- +// Texture Access Defines + +#ifndef SMAA_AREATEX_SELECT +#if defined(SMAA_HLSL_3) +#define SMAA_AREATEX_SELECT(sample) sample.ra +#else +#define SMAA_AREATEX_SELECT(sample) sample.rg +#endif +#endif + +#ifndef SMAA_SEARCHTEX_SELECT +#define SMAA_SEARCHTEX_SELECT(sample) sample.r +#endif + +#ifndef SMAA_DECODE_VELOCITY +#define SMAA_DECODE_VELOCITY(sample) sample.rg +#endif + +//----------------------------------------------------------------------------- +// Non-Configurable Defines + +#define SMAA_AREATEX_MAX_DISTANCE 16 +#define SMAA_AREATEX_MAX_DISTANCE_DIAG 20 +#define SMAA_AREATEX_PIXEL_SIZE (1.0 / float2(160.0, 560.0)) +#define SMAA_AREATEX_SUBTEX_SIZE (1.0 / 7.0) +#define SMAA_SEARCHTEX_SIZE float2(66.0, 33.0) +#define SMAA_SEARCHTEX_PACKED_SIZE float2(64.0, 16.0) +#define SMAA_CORNER_ROUNDING_NORM (float(SMAA_CORNER_ROUNDING) / 100.0) + +//----------------------------------------------------------------------------- +// Porting Functions + +#if defined(SMAA_HLSL_3) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroPoint(tex, coord) tex2Dlod(tex, float4(coord, 0.0, 0.0)) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex2Dlod(tex, float4(coord + offset * SMAA_RT_METRICS.xy, 0.0, 0.0)) +#define SMAASample(tex, coord) tex2D(tex, coord) +#define SMAASamplePoint(tex, coord) tex2D(tex, coord) +#define SMAASampleOffset(tex, coord, offset) tex2D(tex, coord + offset * 
SMAA_RT_METRICS.xy) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#endif +#if defined(SMAA_HLSL_4) || defined(SMAA_HLSL_4_1) +SamplerState LinearSampler { Filter = MIN_MAG_LINEAR_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +SamplerState PointSampler { Filter = MIN_MAG_MIP_POINT; AddressU = Clamp; AddressV = Clamp; }; +#define SMAATexture2D(tex) Texture2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) tex.SampleLevel(LinearSampler, coord, 0) +#define SMAASampleLevelZeroPoint(tex, coord) tex.SampleLevel(PointSampler, coord, 0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) tex.SampleLevel(LinearSampler, coord, 0, offset) +#define SMAASample(tex, coord) tex.Sample(LinearSampler, coord) +#define SMAASamplePoint(tex, coord) tex.Sample(PointSampler, coord) +#define SMAASampleOffset(tex, coord, offset) tex.Sample(LinearSampler, coord, offset) +#define SMAA_FLATTEN [flatten] +#define SMAA_BRANCH [branch] +#define SMAATexture2DMS2(tex) Texture2DMS<float4, 2> tex +#define SMAALoad(tex, pos, sample) tex.Load(pos, sample) +#if defined(SMAA_HLSL_4_1) +#define SMAAGather(tex, coord) tex.Gather(LinearSampler, coord, 0) +#endif +#endif +#if defined(SMAA_GLSL_3) || defined(SMAA_GLSL_4) +#define SMAATexture2D(tex) sampler2D tex +#define SMAATexturePass2D(tex) tex +#define SMAASampleLevelZero(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroPoint(tex, coord) textureLod(tex, coord, 0.0) +#define SMAASampleLevelZeroOffset(tex, coord, offset) textureLodOffset(tex, coord, 0.0, offset) +#define SMAASample(tex, coord) texture(tex, coord) +#define SMAASamplePoint(tex, coord) texture(tex, coord) +#define SMAASampleOffset(tex, coord, offset) texture(tex, coord, offset) +#define SMAA_FLATTEN +#define SMAA_BRANCH +#define lerp(a, b, t) mix(a, b, t) +#define saturate(a) clamp(a, 0.0, 1.0) +#if defined(SMAA_GLSL_4) +#define mad(a, b, c) fma(a, b, c) +#define SMAAGather(tex, coord) textureGather(tex, coord) +#else 
+#define mad(a, b, c) (a * b + c) +#endif +#define float2 vec2 +#define float3 vec3 +#define float4 vec4 +#define int2 ivec2 +#define int3 ivec3 +#define int4 ivec4 +#define bool2 bvec2 +#define bool3 bvec3 +#define bool4 bvec4 +#endif + +#if !defined(SMAA_HLSL_3) && !defined(SMAA_HLSL_4) && !defined(SMAA_HLSL_4_1) && !defined(SMAA_GLSL_3) && !defined(SMAA_GLSL_4) && !defined(SMAA_CUSTOM_SL) +#error you must define the shading language: SMAA_HLSL_*, SMAA_GLSL_* or SMAA_CUSTOM_SL +#endif + +//----------------------------------------------------------------------------- +// Misc functions + +/** + * Gathers current pixel, and the top-left neighbors. + */ +float3 SMAAGatherNeighbours(float2 texcoord, + float4 offset[3], + SMAATexture2D(tex)) { + #ifdef SMAAGather + return SMAAGather(tex, texcoord + SMAA_RT_METRICS.xy * float2(-0.5, -0.5)).grb; + #else + float P = SMAASamplePoint(tex, texcoord).r; + float Pleft = SMAASamplePoint(tex, offset[0].xy).r; + float Ptop = SMAASamplePoint(tex, offset[0].zw).r; + return float3(P, Pleft, Ptop); + #endif +} + +/** + * Adjusts the threshold by means of predication. 
+ */ +float2 SMAACalculatePredicatedThreshold(float2 texcoord, + float4 offset[3], + SMAATexture2D(predicationTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(predicationTex)); + float2 delta = abs(neighbours.xx - neighbours.yz); + float2 edges = step(SMAA_PREDICATION_THRESHOLD, delta); + return SMAA_PREDICATION_SCALE * SMAA_THRESHOLD * (1.0 - SMAA_PREDICATION_STRENGTH * edges); +} + +/** + * Conditional move: + */ +void SMAAMovc(bool2 cond, inout float2 variable, float2 value) { + SMAA_FLATTEN if (cond.x) variable.x = value.x; + SMAA_FLATTEN if (cond.y) variable.y = value.y; +} + +void SMAAMovc(bool4 cond, inout float4 variable, float4 value) { + SMAAMovc(cond.xy, variable.xy, value.xy); + SMAAMovc(cond.zw, variable.zw, value.zw); +} + + +#if SMAA_INCLUDE_VS +//----------------------------------------------------------------------------- +// Vertex Shaders + +/** + * Edge Detection Vertex Shader + */ +void SMAAEdgeDetectionVS(float2 texcoord, + out float4 offset[3]) { + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-1.0, 0.0, 0.0, -1.0), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); + offset[2] = mad(SMAA_RT_METRICS.xyxy, float4(-2.0, 0.0, 0.0, -2.0), texcoord.xyxy); +} + +/** + * Blend Weight Calculation Vertex Shader + */ +void SMAABlendingWeightCalculationVS(float2 texcoord, + out float2 pixcoord, + out float4 offset[3]) { + pixcoord = texcoord * SMAA_RT_METRICS.zw; + + // We will use these offsets for the searches later on (see @PSEUDO_GATHER4): + offset[0] = mad(SMAA_RT_METRICS.xyxy, float4(-0.25, -0.125, 1.25, -0.125), texcoord.xyxy); + offset[1] = mad(SMAA_RT_METRICS.xyxy, float4(-0.125, -0.25, -0.125, 1.25), texcoord.xyxy); + + // And these for the searches, they indicate the ends of the loops: + offset[2] = mad(SMAA_RT_METRICS.xxyy, + float4(-2.0, 2.0, -2.0, 2.0) * float(SMAA_MAX_SEARCH_STEPS), + float4(offset[0].xz, offset[1].yw)); +} + +/** + * Neighborhood 
Blending Vertex Shader + */ +void SMAANeighborhoodBlendingVS(float2 texcoord, + out float4 offset) { + offset = mad(SMAA_RT_METRICS.xyxy, float4( 1.0, 0.0, 0.0, 1.0), texcoord.xyxy); +} +#endif // SMAA_INCLUDE_VS + +#if SMAA_INCLUDE_PS +//----------------------------------------------------------------------------- +// Edge Detection Pixel Shaders (First Pass) + +/** + * Luma Edge Detection + * + * IMPORTANT NOTICE: luma edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAALumaEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate lumas: + float3 weights = float3(0.2126, 0.7152, 0.0722); + float L = dot(SMAASamplePoint(colorTex, texcoord).rgb, weights); + + float Lleft = dot(SMAASamplePoint(colorTex, offset[0].xy).rgb, weights); + float Ltop = dot(SMAASamplePoint(colorTex, offset[0].zw).rgb, weights); + + // We do the usual threshold: + float4 delta; + delta.xy = abs(L - float2(Lleft, Ltop)); + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float Lright = dot(SMAASamplePoint(colorTex, offset[1].xy).rgb, weights); + float Lbottom = dot(SMAASamplePoint(colorTex, offset[1].zw).rgb, weights); + delta.zw = abs(L - float2(Lright, Lbottom)); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas: + float Lleftleft = dot(SMAASamplePoint(colorTex, offset[2].xy).rgb, weights); + float Ltoptop = 
dot(SMAASamplePoint(colorTex, offset[2].zw).rgb, weights); + delta.zw = abs(float2(Lleft, Ltop) - float2(Lleftleft, Ltoptop)); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Color Edge Detection + * + * IMPORTANT NOTICE: color edge detection requires gamma-corrected colors, and + * thus 'colorTex' should be a non-sRGB texture. + */ +float2 SMAAColorEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(colorTex) + #if SMAA_PREDICATION + , SMAATexture2D(predicationTex) + #endif + ) { + // Calculate the threshold: + #if SMAA_PREDICATION + float2 threshold = SMAACalculatePredicatedThreshold(texcoord, offset, SMAATexturePass2D(predicationTex)); + #else + float2 threshold = float2(SMAA_THRESHOLD, SMAA_THRESHOLD); + #endif + + // Calculate color deltas: + float4 delta; + float3 C = SMAASamplePoint(colorTex, texcoord).rgb; + + float3 Cleft = SMAASamplePoint(colorTex, offset[0].xy).rgb; + float3 t = abs(C - Cleft); + delta.x = max(max(t.r, t.g), t.b); + + float3 Ctop = SMAASamplePoint(colorTex, offset[0].zw).rgb; + t = abs(C - Ctop); + delta.y = max(max(t.r, t.g), t.b); + + // We do the usual threshold: + float2 edges = step(threshold, delta.xy); + + // Then discard if there is no edge: + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + // Calculate right and bottom deltas: + float3 Cright = SMAASamplePoint(colorTex, offset[1].xy).rgb; + t = abs(C - Cright); + delta.z = max(max(t.r, t.g), t.b); + + float3 Cbottom = SMAASamplePoint(colorTex, offset[1].zw).rgb; + t = abs(C - Cbottom); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the maximum delta in the direct neighborhood: + float2 maxDelta = max(delta.xy, delta.zw); + + // Calculate left-left and top-top deltas (each neighbour against its own neighbour, matching the luma path): + float3 Cleftleft = SMAASamplePoint(colorTex,
offset[2].xy).rgb; + t = abs(Cleft - Cleftleft); + delta.z = max(max(t.r, t.g), t.b); + + float3 Ctoptop = SMAASamplePoint(colorTex, offset[2].zw).rgb; + t = abs(Ctop - Ctoptop); + delta.w = max(max(t.r, t.g), t.b); + + // Calculate the final maximum delta: + maxDelta = max(maxDelta.xy, delta.zw); + float finalDelta = max(maxDelta.x, maxDelta.y); + + // Local contrast adaptation: + edges.xy *= step(finalDelta, SMAA_LOCAL_CONTRAST_ADAPTATION_FACTOR * delta.xy); + + return edges; +} + +/** + * Depth Edge Detection + */ +float2 SMAADepthEdgeDetectionPS(float2 texcoord, + float4 offset[3], + SMAATexture2D(depthTex)) { + float3 neighbours = SMAAGatherNeighbours(texcoord, offset, SMAATexturePass2D(depthTex)); + float2 delta = abs(neighbours.xx - float2(neighbours.y, neighbours.z)); + float2 edges = step(SMAA_DEPTH_THRESHOLD, delta); + + if (dot(edges, float2(1.0, 1.0)) == 0.0) + return float2(-2.0, -2.0); + + return edges; +} + +//----------------------------------------------------------------------------- +// Diagonal Search Functions + +#if !defined(SMAA_DISABLE_DIAG_DETECTION) + +/** + * Allows to decode two binary values from a bilinear-filtered access. + */ +float2 SMAADecodeDiagBilinearAccess(float2 e) { + // Bilinear access for fetching 'e' have a 0.25 offset, and we are + // interested in the R and G edges: + // + // +---G---+-------+ + // | x o R x | + // +-------+-------+ + // + // Then, if one of these edge is enabled: + // Red: (0.75 * X + 0.25 * 1) => 0.25 or 1.0 + // Green: (0.75 * 1 + 0.25 * X) => 0.75 or 1.0 + // + // This function will unpack the values (mad + mul + round): + // wolframalpha.com: round(x * abs(5 * x - 5 * 0.75)) plot 0 to 1 + e.r = e.r * abs(5.0 * e.r - 5.0 * 0.75); + return round(e); +} + +float4 SMAADecodeDiagBilinearAccess(float4 e) { + e.rb = e.rb * abs(5.0 * e.rb - 5.0 * 0.75); + return round(e); +} + +/** + * These functions allows to perform diagonal pattern searches.
+ */ +float2 SMAASearchDiag1(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +float2 SMAASearchDiag2(SMAATexture2D(edgesTex), float2 texcoord, float2 dir, out float2 e) { + float4 coord = float4(texcoord, -1.0, 1.0); + coord.x += 0.25 * SMAA_RT_METRICS.x; // See @SearchDiag2Optimization + float3 t = float3(SMAA_RT_METRICS.xy, 1.0); + while (coord.z < float(SMAA_MAX_SEARCH_STEPS_DIAG - 1) && + coord.w > 0.9) { + coord.xyz = mad(t, float3(dir, 1.0), coord.xyz); + + // @SearchDiag2Optimization + // Fetch both edges at once using bilinear filtering: + e = SMAASampleLevelZero(edgesTex, coord.xy).rg; + e = SMAADecodeDiagBilinearAccess(e); + + // Non-optimized version: + // e.g = SMAASampleLevelZero(edgesTex, coord.xy).g; + // e.r = SMAASampleLevelZeroOffset(edgesTex, coord.xy, int2(1, 0)).r; + + coord.w = dot(e, float2(0.5, 0.5)); + } + return coord.zw; +} + +/** + * Similar to SMAAArea, this calculates the area corresponding to a certain + * diagonal distance and crossing edges 'e'. + */ +float2 SMAAAreaDiag(SMAATexture2D(areaTex), float2 dist, float2 e, float offset) { + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE_DIAG, SMAA_AREATEX_MAX_DISTANCE_DIAG), e, dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Diagonal areas are on the second half of the texture: + texcoord.x += 0.5; + + // Move to proper place, according to the subpixel offset: + texcoord.y += SMAA_AREATEX_SUBTEX_SIZE * offset; + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +/** + * This searches for diagonal patterns and returns the corresponding weights. + */ +float2 SMAACalculateDiagWeights(SMAATexture2D(edgesTex), SMAATexture2D(areaTex), float2 texcoord, float2 e, float4 subsampleIndices) { + float2 weights = float2(0.0, 0.0); + + // Search for the line ends: + float4 d; + float2 end; + if (e.r > 0.0) { + d.xz = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, 1.0), end); + d.x += float(end.y > 0.9); + } else + d.xz = float2(0.0, 0.0); + d.yw = SMAASearchDiag1(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, -1.0), end); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x + 0.25, d.x, d.y, -d.y - 0.25), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.xy = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).rg; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).rg; + c.yxwz = SMAADecodeDiagBilinearAccess(c.xyzw); + + // Non-optimized version: + // float4 coords = mad(float4(-d.x, d.x, d.y, -d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + // float4 c; + // c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + // c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, 0)).r; + // c.z = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).g; + // c.w = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, -1)).r; + + // Merge crossing edges at each side into a single value: + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.z); + } + + // Search for the line ends: + d.xz = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(-1.0, -1.0), end); + if 
(SMAASampleLevelZeroOffset(edgesTex, texcoord, int2(1, 0)).r > 0.0) { + d.yw = SMAASearchDiag2(SMAATexturePass2D(edgesTex), texcoord, float2(1.0, 1.0), end); + d.y += float(end.y > 0.9); + } else + d.yw = float2(0.0, 0.0); + + SMAA_BRANCH + if (d.x + d.y > 2.0) { // d.x + d.y + 1 > 3 + // Fetch the crossing edges: + float4 coords = mad(float4(-d.x, -d.x, d.y, d.y), SMAA_RT_METRICS.xyxy, texcoord.xyxy); + float4 c; + c.x = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2(-1, 0)).g; + c.y = SMAASampleLevelZeroOffset(edgesTex, coords.xy, int2( 0, -1)).r; + c.zw = SMAASampleLevelZeroOffset(edgesTex, coords.zw, int2( 1, 0)).gr; + float2 cc = mad(float2(2.0, 2.0), c.xz, c.yw); + + // Remove the crossing edge if we didn't found the end of the line: + SMAAMovc(bool2(step(0.9, d.zw)), cc, float2(0.0, 0.0)); + + // Fetch the areas for this line: + weights += SMAAAreaDiag(SMAATexturePass2D(areaTex), d.xy, cc, subsampleIndices.w).gr; + } + + return weights; +} +#endif + +//----------------------------------------------------------------------------- +// Horizontal/Vertical Search Functions + +/** + * This allows to determine how much length should we add in the last step + * of the searches. It takes the bilinearly interpolated edge (see + * @PSEUDO_GATHER4), and adds 0, 1 or 2, depending on which edges and + * crossing edges are active. 
+ */ +float SMAASearchLength(SMAATexture2D(searchTex), float2 e, float offset) { + // The texture is flipped vertically, with left and right cases taking half + // of the space horizontally: + float2 scale = SMAA_SEARCHTEX_SIZE * float2(0.5, -1.0); + float2 bias = SMAA_SEARCHTEX_SIZE * float2(offset, 1.0); + + // Scale and bias to access texel centers: + scale += float2(-1.0, 1.0); + bias += float2( 0.5, -0.5); + + // Convert from pixel coordinates to texcoords: + // (We use SMAA_SEARCHTEX_PACKED_SIZE because the texture is cropped) + scale *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + bias *= 1.0 / SMAA_SEARCHTEX_PACKED_SIZE; + + // Lookup the search texture: + return SMAA_SEARCHTEX_SELECT(SMAASampleLevelZero(searchTex, mad(scale, e, bias))); +} + +/** + * Horizontal/vertical search functions for the 2nd pass. + */ +float SMAASearchXLeft(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + /** + * @PSEUDO_GATHER4 + * This texcoord has been offset by (-0.25, -0.125) in the vertex shader to + * sample between edge, thus fetching four edges in a row. + * Sampling with different offsets in each direction allows to disambiguate + * which edges are active from the four fetched ones. + */ + float2 e = float2(0.0, 1.0); + while (texcoord.x > end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0), 3.25); + return mad(SMAA_RT_METRICS.x, offset, texcoord.x); + + // Non-optimized version: + // We correct the previous (-0.25, -0.125) offset we applied: + // texcoord.x += 0.25 * SMAA_RT_METRICS.x; + + // The searches are bias by 1, so adjust the coords accordingly: + // texcoord.x += SMAA_RT_METRICS.x; + + // Disambiguate the length added by the last step: + // texcoord.x += 2.0 * SMAA_RT_METRICS.x; // Undo last step + // texcoord.x -= SMAA_RT_METRICS.x * (255.0 / 127.0) * SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.0); + // return mad(SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchXRight(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(0.0, 1.0); + while (texcoord.x < end && + e.g > 0.8281 && // Is there some edge not activated? + e.r == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(2.0, 0.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.x, offset, texcoord.x); +} + +float SMAASearchYUp(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y > end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? 
+ e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(-float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.0), 3.25); + return mad(SMAA_RT_METRICS.y, offset, texcoord.y); +} + +float SMAASearchYDown(SMAATexture2D(edgesTex), SMAATexture2D(searchTex), float2 texcoord, float end) { + float2 e = float2(1.0, 0.0); + while (texcoord.y < end && + e.r > 0.8281 && // Is there some edge not activated? + e.g == 0.0) { // Or is there a crossing edge that breaks the line? + e = SMAASampleLevelZero(edgesTex, texcoord).rg; + texcoord = mad(float2(0.0, 2.0), SMAA_RT_METRICS.xy, texcoord); + } + float offset = mad(-(255.0 / 127.0), SMAASearchLength(SMAATexturePass2D(searchTex), e.gr, 0.5), 3.25); + return mad(-SMAA_RT_METRICS.y, offset, texcoord.y); +} + +/** + * Ok, we have the distance and both crossing edges. So, what are the areas + * at each side of current edge? + */ +float2 SMAAArea(SMAATexture2D(areaTex), float2 dist, float e1, float e2, float offset) { + // Rounding prevents precision errors of bilinear filtering: + float2 texcoord = mad(float2(SMAA_AREATEX_MAX_DISTANCE, SMAA_AREATEX_MAX_DISTANCE), round(4.0 * float2(e1, e2)), dist); + + // We do a scale and bias for mapping to texel space: + texcoord = mad(SMAA_AREATEX_PIXEL_SIZE, texcoord, 0.5 * SMAA_AREATEX_PIXEL_SIZE); + + // Move to proper place, according to the subpixel offset: + texcoord.y = mad(SMAA_AREATEX_SUBTEX_SIZE, offset, texcoord.y); + + // Do it! 
+ return SMAA_AREATEX_SELECT(SMAASampleLevelZero(areaTex, texcoord)); +} + +//----------------------------------------------------------------------------- +// Corner Detection Functions + +void SMAADetectHorizontalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; // Reduce blending for pixels in the center of a line. + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, 1)).r; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, 1)).r; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(0, -2)).r; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(1, -2)).r; + + weights *= saturate(factor); + #endif +} + +void SMAADetectVerticalCornerPattern(SMAATexture2D(edgesTex), inout float2 weights, float4 texcoord, float2 d) { + #if !defined(SMAA_DISABLE_CORNER_DETECTION) + float2 leftRight = step(d.xy, d.yx); + float2 rounding = (1.0 - SMAA_CORNER_ROUNDING_NORM) * leftRight; + + rounding /= leftRight.x + leftRight.y; + + float2 factor = float2(1.0, 1.0); + factor.x -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2( 1, 0)).g; + factor.x -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2( 1, 1)).g; + factor.y -= rounding.x * SMAASampleLevelZeroOffset(edgesTex, texcoord.xy, int2(-2, 0)).g; + factor.y -= rounding.y * SMAASampleLevelZeroOffset(edgesTex, texcoord.zw, int2(-2, 1)).g; + + weights *= saturate(factor); + #endif +} + +//----------------------------------------------------------------------------- +// Blending Weight Calculation Pixel Shader (Second Pass) + +float4 SMAABlendingWeightCalculationPS(float2 texcoord, + float2 pixcoord, + float4 
offset[3], + SMAATexture2D(edgesTex), + SMAATexture2D(areaTex), + SMAATexture2D(searchTex), + float4 subsampleIndices) { // Just pass zero for SMAA 1x, see @SUBSAMPLE_INDICES. + float4 weights = float4(0.0, 0.0, 0.0, 0.0); + + float2 e = SMAASample(edgesTex, texcoord).rg; + + SMAA_BRANCH + if (e.g > 0.0) { // Edge at north + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + // Diagonals have both north and west edges, so searching for them in + // one of the boundaries is enough. + weights.rg = SMAACalculateDiagWeights(SMAATexturePass2D(edgesTex), SMAATexturePass2D(areaTex), texcoord, e, subsampleIndices); + + // We give priority to diagonals, so if we find a diagonal we skip + // horizontal/vertical processing. + SMAA_BRANCH + if (weights.r == -weights.g) { // weights.r + weights.g == 0.0 + #endif + + float2 d; + + // Find the distance to the left: + float3 coords; + coords.x = SMAASearchXLeft(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].xy, offset[2].x); + coords.y = offset[1].y; // offset[1].y = texcoord.y - 0.25 * SMAA_RT_METRICS.y (@CROSSING_OFFSET) + d.x = coords.x; + + // Now fetch the left crossing edges, two at a time using bilinear + // filtering. 
Sampling at -0.25 (see @CROSSING_OFFSET) enables to + // discern what value each edge has: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).r; + + // Find the distance to the right: + coords.z = SMAASearchXRight(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[0].zw, offset[2].y); + d.y = coords.z; + + // We want the distances to be in pixel units (doing this here allow to + // better interleave arithmetic and memory accesses): + d = abs(round(mad(SMAA_RT_METRICS.zz, d, -pixcoord.xx))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the right crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.zy, int2(1, 0)).r; + + // Ok, we know how this pattern looks like, now it is time for getting + // the actual area: + weights.rg = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.y); + + // Fix corners: + coords.y = texcoord.y; + SMAADetectHorizontalCornerPattern(SMAATexturePass2D(edgesTex), weights.rg, coords.xyzy, d); + + #if !defined(SMAA_DISABLE_DIAG_DETECTION) + } else + e.r = 0.0; // Skip vertical processing. 
+ #endif + } + + SMAA_BRANCH + if (e.r > 0.0) { // Edge at west + float2 d; + + // Find the distance to the top: + float3 coords; + coords.y = SMAASearchYUp(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].xy, offset[2].z); + coords.x = offset[0].x; // offset[1].x = texcoord.x - 0.25 * SMAA_RT_METRICS.x; + d.x = coords.y; + + // Fetch the top crossing edges: + float e1 = SMAASampleLevelZero(edgesTex, coords.xy).g; + + // Find the distance to the bottom: + coords.z = SMAASearchYDown(SMAATexturePass2D(edgesTex), SMAATexturePass2D(searchTex), offset[1].zw, offset[2].w); + d.y = coords.z; + + // We want the distances to be in pixel units: + d = abs(round(mad(SMAA_RT_METRICS.ww, d, -pixcoord.yy))); + + // SMAAArea below needs a sqrt, as the areas texture is compressed + // quadratically: + float2 sqrt_d = sqrt(d); + + // Fetch the bottom crossing edges: + float e2 = SMAASampleLevelZeroOffset(edgesTex, coords.xz, int2(0, 1)).g; + + // Get the area for this direction: + weights.ba = SMAAArea(SMAATexturePass2D(areaTex), sqrt_d, e1, e2, subsampleIndices.x); + + // Fix corners: + coords.x = texcoord.x; + SMAADetectVerticalCornerPattern(SMAATexturePass2D(edgesTex), weights.ba, coords.xyxz, d); + } + + return weights; +} + +//----------------------------------------------------------------------------- +// Neighborhood Blending Pixel Shader (Third Pass) + +float4 SMAANeighborhoodBlendingPS(float2 texcoord, + float4 offset, + SMAATexture2D(colorTex), + SMAATexture2D(blendTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + // Fetch the blending weights for current pixel: + float4 a; + a.x = SMAASample(blendTex, offset.xy).a; // Right + a.y = SMAASample(blendTex, offset.zw).g; // Top + a.wz = SMAASample(blendTex, texcoord).xz; // Bottom / Left + + // Is there any blending weight with a value greater than 0.0? 
+ SMAA_BRANCH + if (dot(a, float4(1.0, 1.0, 1.0, 1.0)) < 1e-5) { + float4 color = SMAASampleLevelZero(colorTex, texcoord); + + #if SMAA_REPROJECTION + float2 velocity = SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, texcoord)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } else { + bool h = max(a.x, a.z) > max(a.y, a.w); // max(horizontal) > max(vertical) + + // Calculate the blending offsets: + float4 blendingOffset = float4(0.0, a.y, 0.0, a.w); + float2 blendingWeight = a.yw; + SMAAMovc(bool4(h, h, h, h), blendingOffset, float4(a.x, 0.0, a.z, 0.0)); + SMAAMovc(bool2(h, h), blendingWeight, a.xz); + blendingWeight /= dot(blendingWeight, float2(1.0, 1.0)); + + // Calculate the texture coordinates: + float4 blendingCoord = mad(blendingOffset, float4(SMAA_RT_METRICS.xy, -SMAA_RT_METRICS.xy), texcoord.xyxy); + + // We exploit bilinear filtering to mix current pixel with the chosen + // neighbor: + float4 color = blendingWeight.x * SMAASampleLevelZero(colorTex, blendingCoord.xy); + color += blendingWeight.y * SMAASampleLevelZero(colorTex, blendingCoord.zw); + + #if SMAA_REPROJECTION + // Antialias velocity for proper reprojection in a later stage: + float2 velocity = blendingWeight.x * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.xy)); + velocity += blendingWeight.y * SMAA_DECODE_VELOCITY(SMAASampleLevelZero(velocityTex, blendingCoord.zw)); + + // Pack velocity into the alpha channel: + color.a = sqrt(5.0 * length(velocity)); + #endif + + return color; + } +} + +//----------------------------------------------------------------------------- +// Temporal Resolve Pixel Shader (Optional Pass) + +float4 SMAAResolvePS(float2 texcoord, + SMAATexture2D(currentColorTex), + SMAATexture2D(previousColorTex) + #if SMAA_REPROJECTION + , SMAATexture2D(velocityTex) + #endif + ) { + #if SMAA_REPROJECTION + // Velocity is assumed to be calculated for motion blur, so we need to + // 
inverse it for reprojection: + float2 velocity = -SMAA_DECODE_VELOCITY(SMAASamplePoint(velocityTex, texcoord).rg); + + // Fetch current pixel: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + + // Reproject current coordinates and fetch previous pixel: + float4 previous = SMAASamplePoint(previousColorTex, texcoord + velocity); + + // Attenuate the previous pixel if the velocity is different: + float delta = abs(current.a * current.a - previous.a * previous.a) / 5.0; + float weight = 0.5 * saturate(1.0 - sqrt(delta) * SMAA_REPROJECTION_WEIGHT_SCALE); + + // Blend the pixels according to the calculated weight: + return lerp(current, previous, weight); + #else + // Just blend the pixels: + float4 current = SMAASamplePoint(currentColorTex, texcoord); + float4 previous = SMAASamplePoint(previousColorTex, texcoord); + return lerp(current, previous, 0.5); + #endif +} + +//----------------------------------------------------------------------------- +// Separate Multisamples Pixel Shader (Optional Pass) + +#ifdef SMAALoad +void SMAASeparatePS(float4 position, + float2 texcoord, + out float4 target0, + out float4 target1, + SMAATexture2DMS2(colorTexMS)) { + int2 pos = int2(position.xy); + target0 = SMAALoad(colorTexMS, pos, 0); + target1 = SMAALoad(colorTexMS, pos, 1); +} +#endif + +//----------------------------------------------------------------------------- +#endif // SMAA_INCLUDE_PS + +layout(rgba8, binding = 0, set = 3) uniform image2D imgOutput; + +layout(binding = 1, set = 2) uniform sampler2D inputImg; +layout(binding = 3, set = 2) uniform sampler2D samplerBlend; +layout( binding = 2 ) uniform invResolution +{ + vec2 invResolution_data; +}; + +void main() { + vec2 loc = ivec2(gl_GlobalInvocationID.x * 4, gl_GlobalInvocationID.y * 4); + for(int i = 0; i < 4; i++) + { + for(int j = 0; j < 4; j++) + { + ivec2 texelCoord = ivec2(loc.x + i, loc.y + j); + vec2 coord = (texelCoord + vec2(0.5)) / invResolution_data; + vec2 pixCoord; + vec4 offset; + + 
SMAANeighborhoodBlendingVS(coord, offset); + + vec4 oColor = SMAANeighborhoodBlendingPS(coord, offset, inputImg, samplerBlend); + + imageStore(imgOutput, texelCoord, oColor); + } + } +} diff --git a/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv b/Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv new file mode 100644 index 0000000000000000000000000000000000000000..fa0208f25069dbd07bff6133f52792e1e769f681 GIT binary patch literal 8328 zcmaKw37D2u6~|wgjWv~3P+3Jl22?~4QD9`onK2NAQ7FstV;Ep;n2&~Gva&>Tt0!%< z#Wvel)T}hJGRth=7u(DB=dmo?+)C~Dd*>c`c^{wd)A9b#|19_1bMJk>p_<{d8?u^V z*{EzpcJ{C=A4g=vGvVm0KA&6HZd$u&&%m}tOHMgahi7Fqxt~6V;Zw(4i|nfO^l5ky zIUHW@h#aeDd9E5_hl6h+xDJnETpcy&w`pBd)4J+l*Sx{4i&qS89};-|$1-<S`?fKm z*HKHB4OaSlDqDNIhqzj5lIz_oVa}<{g`7*sIV8EgHk-^`%y}ewIp>s|i=3mg>FDM> zr@Lpz&aKt{oz?2LGkd%Hw)OPwIDgZ;p{kdx$ZI?X-+}H+x~kRwRCjGQ4X(O<`=0Ir zYaE-+|3CRQ9L%?8@kuL`Jnx)m$a~x&dF!&d=v~#`YJW%fCBQsx9y9Ig+x9e9o2`V~ zUFomv%6)3Hrjp)V($|&r4JCckA$on*iN3C-V`KZ8&du%39c`%8QlC{0@@*~ct!*7S zUz1(Ttj#mOnmOM?vR%i#GspGWjm-I8===Jo`&pmeRNyyzMtPrWv;FYJecoKsZ!PJ! 
zm-IVI`hi3A`s^O`sAo+&aQAQ@d~pvCp!Zh0*x!i!z3k}iZ0)UXt@O6{4OY7<13lHg zc6(mOdwGJnC*cO(jlJ>i9}ebTb@!%)TxS|wTr~T8EP7Ah;KuGf)!w~kCAOY5<sFCE zI=58@Dnn}?m+a<T06utEYqLi5Vh<OU^po^o<E0y_b1`~P-|oEw?YnlkrgfbRx2Lje zcW-xp{%nT$+H6TF*V04ux@?8HcI{ZdcYu5;S2LV%v2(-5mQ5`koom-`W{rldrNpmm zX>LD{i>vLdlC6!qH&jDI)>^Wix3*(r6UJ70;XTF-8qzCC_Zz7FLFUoO(MTIBMW1J* zJ6btUIlqDW+vi!i<)A(jJ?!>Qdr{X8oyXh9qFr6PUug5&FYQHLyF<IZ<A3>3w`b~# zx_uJI`u0XW^0lM5U%!vz@zQT5R_)>m{I{NYGPxR<#qu-IM=;ANlZY6Z^FzL)!0O@a z8v4o4B+Hm&Glw?CHO4W!hUeoN<C%?<*FJ&SykVaR)-G?}NzATkoHk`9=jeILyN=ux zvdm;Q7cDqsgnkrznuSF_Ynh$AhPa08SajEooZ6L%_%2|cfjAZ<y>Up*b1ej0w|h1B z;-p(o_%8(yE9hIm?jh_KfITO5<1R$9Y~QNIN%yX2*+b2tS91ND=J3ydaN!!M#C>c> z%ptFc!?rK!av_oD3b_6E&Ry0>pFQ8Z3O=6MpV94ywYt~8FuO1L6-oEq$!Y&<!p6j2 z{|@%d?d?CAXCV5U|1oAcdt>~6neDBd@y<^oj?n7}jyrT1Sby`xo@San%T~pnW}(X| z)7XjM0Qc&8bHJ`G=YG`vE<`<Z6V_k5cf{|4BXr-r!+W_L{7m=Kd{Ogh$zNY{sQayO z#2#0pOL_kGz;A^kbiWlv-ET!v_gmpGpJ&mA?%#Kf+*;4hZ-<=XU0urgM~$fYc|Qu{ zx78X)Ge3aXYjw}b-^QZeK(5d|7k?u|H{RdJ(0#}LwuP?U-?q@T``cF3{cQ_fe}At+ z*WceO^(LzJ9{vsSZu)l5NPT!H@mJ!r0Q?U`&O3T3KKA-wh<xPvH`qM(De^oFmXAD- zfaQvL9z~bmz*^qV#}RYOS6RzD^&dojB^|fF>SBBAy?=}|c#7GxaDJG$VeE4_;?T}N zA#%I3^|^;yusLpHZNF3X=yJZD$MI43f|<+QV%PNyW8D$S=Ipwom`5WH*Bz7W&av)T zusLGgXMyEnUG<TOxm;IlpZ$g&0k#KmUcRGoaPpCVJlK8NKkXCH<t`_O`b4lX@_nfn zlfd$^&SbFb#CaNbB%FM#GX?BEVx6gAxmf2Yurczn&e34`IPYV?t}SnGeGhW3;eD76 zmW#VE11#qn-i_H{xmk#{&t;y6_<iz>%%k7@#9y1@jz>3E-gSL<a^b%aET_#kr@ow9 z;`=`VJe^tI_h*d1FDEA57~idW<U0i%`FxY=5w{Gizj3}#_2vBaVa`**^O=p&=6BU^ zslL8-?dI`2c{<`ZQr+I20k(Is&PuQ`UD);2F6TY<&U)9J_4j?O0(-vu=$U#GSZ+1q z`r71-k9XjmRUgi5KIh1HCfIu!`PQJzMLunE#>ad1?LCnV%T7h&`|CH_F$P&i9sZ3q zHetVc{+nA~?7y|G!++;{KNd1CLiCR}<l8?X@sHS7u)}YF{E3L1dBo=NEuM^+Kl1oS zmLxuWPC>UGee9XKKJGDcI!A5Gz}6P;&e~2z<gM*AW{0)OFGu98O&sg409#w+v9{9@ z`S3XdY;F2jo4P*M7CD{E{htcwn1tBp@ywGGp2&P;0Z##2+fmH^CVEyYk@!vWjG9V% zb3xaCT|w7>LrLG1bnn?X@G8W5*0YN1tVZM%2UjZJuXU_O;(eS6wszzEW~@Qv6i38a zueOM50lUwLI}0qQI3mt{X<JVI;moawciB43sok8e-IlmGt9Ep2^B$ec{qsD}M)Y%d 
ze)`Bo%(KD9c)k&{7Hmw^q>o(m?i_Hjck9vR6l?ZxT-WkUv_%ij1s8j_5nWCx#(8$y zBCZ2m?A>|ja!N7IGu9TpJ0EQCtiznzqj#GV7kzjRy0vXVq7Tmn>*sJk`p8Ah^T5Vj zfJDsm!Nx@I^pPvSkLhrZqmlRy9h0!{V;bU{n8EDtk!N`kav{=*1h+MD73K}xnF-8Y z$VG@YeYLG8gSKsmcEw?x4(I624zRUE%ucYJ;wZ+9NxwloiT9i58ru=$?3;VhUhILj z%tsu@BGH363EP9?5}wUGw}9t??alGbelPUB7-=ZzR~Ph!;Oh#y{x=qMzZW+ZbiY;m z3%dR{m-Jf;`Uvpt1>Jmil=K4y-F)|y^!t+T8O#E|0CAu3271BvBfcNIz;cSiGue{P zzYlD@KE97`B<^Pwti1+_d%PQL{~8g`OI^RH<%Qs4E&b@lTK1sJDXwSksAT|bygubx z_QGkm_NZkLY%NO>Yf;xPYIzYjY8eH-1o6(qdM^g+BOmKU&t3wjU2$lSbzcg0-Qrum z6x|qywdf-kF)srfvjvI%z8q|fdoo5Jxj4tmz{cE=-t^^QxqZk1xO(PSAo5or{+?gO z?C|$o+m*<x5#N9~dUFlfccN}@u0@Kyc_q3&^3fY>^sQ)%8eavLdlE0P>$qO*`+Bf@ z@V*!`lKC}=eE7T;tdG43pVxuq$8b-Lc|BsEJWu1bNBkSW#(NJU{sypo_`DIUk7pJ> zZvxB58gB;28pdnCGxhl`U~ACFKKPB0i~Mf|muq+%oP7AaJ^2)Ccn6$(e2?D=_6-{A ztljne25GbQ3C!<8TrckHyAvnx9(-5E`K{7socHBDh_#3Q-je>llK%drTi-bF2M}`@ zr?2+d-v`0gR6Oesq05KQhm%k7tUm%LA3gXe*f{sE&Dp)|XLkMQ!7X6>qU~m8b@8pt z=7`_+kAbHm#_Oj&a@>|U<IJTVz4<t}*sD(@-G6H~fIo@oBX9iei8Fo__*01e*LFwJ zW8F`q+p9RgJHc{_!#8K#0cLHnm(PF?M_kLC+GD-XCeAq5RS(<ez;f{|`8-(eE+l;J z2J7Sgtp5v$eoFj1_lw}W5pA)KT<q&hVEY(<bKDEQ5a}#%Urt<wInMkmV12det1Wu- zRj_u&F${_PUjvu(e?4)<{NI4nS6ey%H{rA^4)b4@&hT4c?eWe2HrRR`!F>lTXIyaK z1sm%K?t5T4{bJwW2b<Fz#^@90@dNN2B>uMgAy`gvn8)E9e_Q=1>5G{?3-cT2S>3~| zJ>KGv!8aj|h-;2y{s|(lK8yLM$lRp+-T4_}o;aJICr;jBE%E#M3$XF}c%Oa8*K@bb z^Gig#>si}<iHq8Pg>G$2k*Mw0VEL%+esHn2-=G(3`z^Yh;`%j6)b=~@{fIv0+I|nG q-Su3{djEh}<K?OTKPFD!s9!Gp{{+_G-$k*%i&rr}MTh+V9{vYTn-BW{ literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs b/Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs new file mode 100644 index 000000000..a5f060f1b --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/SmaaConstants.cs @@ -0,0 +1,15 @@ +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Vulkan.Effects +{ + [StructLayout(LayoutKind.Sequential, Pack = 4)] + internal struct SmaaConstants + { + public int QualityLow; + public int QualityMedium; + public int QualityHigh; + public int 
QualityUltra; + public float Width; + public float Height; + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs b/Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs new file mode 100644 index 000000000..4dcdaa646 --- /dev/null +++ b/Ryujinx.Graphics.Vulkan/Effects/SmaaPostProcessingEffect.cs @@ -0,0 +1,323 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Shader; +using Ryujinx.Graphics.Shader.Translation; +using Silk.NET.Vulkan; +using System; +using Format = Ryujinx.Graphics.GAL.Format; + +namespace Ryujinx.Graphics.Vulkan.Effects +{ + /// <summary>SMAA post-processing effect driven by three SPIR-V compute programs (edge, blend, neighbour) read from embedded resources.</summary> + internal partial class SmaaPostProcessingEffect : IPostProcessingEffect + { + public const int AreaWidth = 160; + public const int AreaHeight = 560; + public const int SearchWidth = 64; + public const int SearchHeight = 16; + + private readonly VulkanRenderer _renderer; + private ISampler _samplerLinear; + private SmaaConstants _specConstants; + private ShaderCollection _edgeProgram; + private ShaderCollection _blendProgram; + private ShaderCollection _neighbourProgram; + + private PipelineHelperShader _pipeline; + + private TextureView _outputTexture; + private TextureView _edgeOutputTexture; + private TextureView _blendOutputTexture; + private TextureView _areaTexture; + private TextureView _searchTexture; + private Device _device; + private bool _recreatePipelines; + private int _quality; + + /// <summary>Stores the renderer, device and quality level, then creates the area/search textures via Initialize().</summary> + public SmaaPostProcessingEffect(VulkanRenderer renderer, Device device, int quality) + { + _device = device; + _renderer = renderer; + _quality = quality; + + Initialize(); + } + + /// <summary>Quality level; 0 to 3 select the Low, Medium, High and Ultra specialization constants. Setting it marks the pipelines for recreation.</summary> + public int Quality + { + get => _quality; + set + { + _quality = value; + + _recreatePipelines = true; + } + } + + /// <summary>Disposes the pipeline and programs, the linear sampler and every texture owned by this effect.</summary> + public void Dispose() + { + DeletePipelines(); + _samplerLinear?.Dispose(); + _outputTexture?.Dispose(); + _edgeOutputTexture?.Dispose(); + _blendOutputTexture?.Dispose(); + _areaTexture?.Dispose(); + _searchTexture?.Dispose(); + } + + /// <summary>Rebuilds the helper pipeline, linear sampler, specialization constants and the edge/blend/neighbour compute programs for the given width and height.</summary> + private unsafe void
RecreateShaders(int width, int height) + { + _recreatePipelines = false; + + DeletePipelines(); + _pipeline = new PipelineHelperShader(_renderer, _device); + + _pipeline.Initialize(); + + var edgeShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaEdge.spv"); + var blendShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaBlend.spv"); + var neighbourShader = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Shaders/SmaaNeighbour.spv"); + + var edgeBindings = new ShaderBindings( + new[] { 2 }, + Array.Empty<int>(), + new[] { 1 }, + new[] { 0 }); + + var blendBindings = new ShaderBindings( + new[] { 2 }, + Array.Empty<int>(), + new[] { 1, 3, 4 }, + new[] { 0 }); + + var neighbourBindings = new ShaderBindings( + new[] { 2 }, + Array.Empty<int>(), + new[] { 1, 3 }, + new[] { 0 }); + + // Release the sampler left over from a previous call; DeletePipelines() does not dispose it, so reassigning the field would leak it. + _samplerLinear?.Dispose(); + _samplerLinear = _renderer.CreateSampler(GAL.SamplerCreateInfo.Create(MinFilter.Linear, MagFilter.Linear)); + + _specConstants = new SmaaConstants() + { + Width = width, + Height = height, + QualityLow = Quality == 0 ? 1 : 0, + QualityMedium = Quality == 1 ? 1 : 0, + QualityHigh = Quality == 2 ? 1 : 0, + QualityUltra = Quality == 3 ?
1 : 0, + }; + + var specInfo = new SpecDescription( + (0, SpecConstType.Int32), + (1, SpecConstType.Int32), + (2, SpecConstType.Int32), + (3, SpecConstType.Int32), + (4, SpecConstType.Float32), + (5, SpecConstType.Float32)); + + _edgeProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(edgeShader, edgeBindings, ShaderStage.Compute, TargetLanguage.Spirv) + }, new[] { specInfo }); + + _blendProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(blendShader, blendBindings, ShaderStage.Compute, TargetLanguage.Spirv) + }, new[] { specInfo }); + + _neighbourProgram = _renderer.CreateProgramWithMinimalLayout(new[] + { + new ShaderSource(neighbourShader, neighbourBindings, ShaderStage.Compute, TargetLanguage.Spirv) + }, new[] { specInfo }); + } + + /// <summary>Disposes the helper pipeline and the three shader programs, if they exist.</summary> + public void DeletePipelines() + { + _pipeline?.Dispose(); + _edgeProgram?.Dispose(); + _blendProgram?.Dispose(); + _neighbourProgram?.Dispose(); + } + + /// <summary>Creates the area (R8G8Unorm) and search (R8Unorm) textures and uploads their contents from embedded resources.</summary> + private void Initialize() + { + var areaInfo = new TextureCreateInfo(AreaWidth, + AreaHeight, + 1, + 1, + 1, + 1, + 1, + 1, + Format.R8G8Unorm, + DepthStencilMode.Depth, + Target.Texture2D, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha); + + var searchInfo = new TextureCreateInfo(SearchWidth, + SearchHeight, + 1, + 1, + 1, + 1, + 1, + 1, + Format.R8Unorm, + DepthStencilMode.Depth, + Target.Texture2D, + SwizzleComponent.Red, + SwizzleComponent.Green, + SwizzleComponent.Blue, + SwizzleComponent.Alpha); + + var areaTexture = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin"); + var searchTexture = EmbeddedResources.Read("Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin"); + + _areaTexture = _renderer.CreateTexture(areaInfo, 1) as TextureView; + _searchTexture = _renderer.CreateTexture(searchInfo, 1) as TextureView; + + _areaTexture.SetData(areaTexture); + _searchTexture.SetData(searchTexture); + } + + public TextureView
Run(TextureView view, CommandBufferScoped cbs, int width, int height) + { + if (_recreatePipelines || _outputTexture == null || _outputTexture.Info.Width != view.Width || _outputTexture.Info.Height != view.Height) + { + RecreateShaders(view.Width, view.Height); + _outputTexture?.Dispose(); + _edgeOutputTexture?.Dispose(); + _blendOutputTexture?.Dispose(); + + var info = view.Info; + + if (view.Info.Format.IsBgr()) + { + info = new TextureCreateInfo(info.Width, + info.Height, + info.Depth, + info.Levels, + info.Samples, + info.BlockWidth, + info.BlockHeight, + info.BytesPerPixel, + info.Format, + info.DepthStencilMode, + info.Target, + info.SwizzleB, + info.SwizzleG, + info.SwizzleR, + info.SwizzleA); + } + + _outputTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView; + _edgeOutputTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView; + _blendOutputTexture = _renderer.CreateTexture(info, view.ScaleFactor) as TextureView; + } + + Span<GAL.Viewport> viewports = stackalloc GAL.Viewport[1]; + + viewports[0] = new GAL.Viewport( + new Rectangle<float>(0, 0, view.Width, view.Height), + ViewportSwizzle.PositiveX, + ViewportSwizzle.PositiveY, + ViewportSwizzle.PositiveZ, + ViewportSwizzle.PositiveW, + 0f, + 1f); + + Span<Rectangle<int>> scissors = stackalloc Rectangle<int>[1]; + + scissors[0] = new Rectangle<int>(0, 0, view.Width, view.Height); + + _renderer.HelperShader.Clear(_renderer, + _edgeOutputTexture.GetImageView(), + new float[] { 0, 0, 0, 1 }, + (uint)(ColorComponentFlags.RBit | ColorComponentFlags.GBit | ColorComponentFlags.BBit | ColorComponentFlags.ABit), + view.Width, + view.Height, + _edgeOutputTexture.VkFormat, + ComponentType.UnsignedInteger, + scissors[0]); + + _renderer.HelperShader.Clear(_renderer, + _blendOutputTexture.GetImageView(), + new float[] { 0, 0, 0, 1 }, + (uint)(ColorComponentFlags.RBit | ColorComponentFlags.GBit | ColorComponentFlags.BBit | ColorComponentFlags.ABit), + view.Width, + view.Height, + 
_blendOutputTexture.VkFormat, + ComponentType.UnsignedInteger, + scissors[0]); + + _renderer.Pipeline.TextureBarrier(); + + var dispatchX = BitUtils.DivRoundUp(view.Width, IPostProcessingEffect.LocalGroupSize); + var dispatchY = BitUtils.DivRoundUp(view.Height, IPostProcessingEffect.LocalGroupSize); + + // Edge pass + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_edgeProgram); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear); + _pipeline.Specialize(_specConstants); + + ReadOnlySpan<float> resolutionBuffer = stackalloc float[] { view.Width, view.Height }; + int rangeSize = resolutionBuffer.Length * sizeof(float); + var bufferHandle = _renderer.BufferManager.CreateWithHandle(_renderer, rangeSize, false); + + _renderer.BufferManager.SetData(bufferHandle, 0, resolutionBuffer); + var bufferRanges = new BufferRange(bufferHandle, 0, rangeSize); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) }); + _pipeline.SetScissors(scissors); + _pipeline.SetViewports(viewports, false); + _pipeline.SetImage(0, _edgeOutputTexture, GAL.Format.R8G8B8A8Unorm); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + // Blend pass + _pipeline.SetCommandBuffer(cbs); + _pipeline.SetProgram(_blendProgram); + _pipeline.Specialize(_specConstants); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, _edgeOutputTexture, _samplerLinear); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 3, _areaTexture, _samplerLinear); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 4, _searchTexture, _samplerLinear); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) }); + _pipeline.SetScissors(scissors); + _pipeline.SetViewports(viewports, false); + _pipeline.SetImage(0, _blendOutputTexture, GAL.Format.R8G8B8A8Unorm); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + // Neighbour pass + _pipeline.SetCommandBuffer(cbs); + 
_pipeline.SetProgram(_neighbourProgram); + _pipeline.Specialize(_specConstants); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 3, _blendOutputTexture, _samplerLinear); + _pipeline.SetTextureAndSampler(ShaderStage.Compute, 1, view, _samplerLinear); + _pipeline.SetUniformBuffers(stackalloc[] { new BufferAssignment(2, bufferRanges) }); + _pipeline.SetScissors(scissors); + _pipeline.SetViewports(viewports, false); + _pipeline.SetImage(0, _outputTexture, GAL.Format.R8G8B8A8Unorm); + _pipeline.DispatchCompute(dispatchX, dispatchY, 1); + _pipeline.ComputeBarrier(); + + _pipeline.Finish(); + + _renderer.BufferManager.Delete(bufferHandle); + + return _outputTexture; + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin b/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaAreaTexture.bin new file mode 100644 index 0000000000000000000000000000000000000000..f4a7a1b417766c12bbac4e4bdc56796f18538bd6 GIT binary patch literal 179200 zcmdSChkqN_mHs{GL?MVqfW3FHfnW!V2!g%$UL;B)B~rcj*s?5HvMgIJaxaPFIB^oE z$4Q**W;dH`Nwy@L&2IMd`(NJY+?hck0nA9^T7EGfTO<ZE!~4Z^&bf2v-g7uk;*~A2 zUt;%T9?tD)?P&G0|NLS9($A5<2QGCi`6oL@`~&_ze{V;RzuVu1Ex@+Uj!wU@sjua= zx?8JTy{&bv4Xur>P1u^T`PkOdDsFqO$-AyZ^Es}aGe~|W&IcazaIQlpzkNAFWxdq_ zZ=fzv-`U`A>}>Kk`J1u%XzTF#TlkGzGW9t-GduG36t$POm$z58RkeBAYTCTmYTN3t z)weZZlY8AbP3LzV1VNSLr(%BKF%Or`?}%~6Ig;6**IU$6+U*Wh1gZkn0Z*W&)7x3w zS?jOstoPUZ8~hF2hITS>z|oT$$n4DN$nPlHQ@p3N-3^iz?Un6S?bYqojOVWLJIO6_ z6`Ud_Ac-B1c{s=0F4<C~y>9>gq;@57+A)$kl-Zxtm)}#^UEE#TRn}Dwl9hp~&T69B z37Xsy^@4uPJZSHAb*F)0wm;8b&{5b?yr*PO>7KG6Nx9dRXg<fC=MHndT%P3TVtn8+ z50|aKGphaidGn-w%r%@gnAw-zm)A=ai@Hm?z%k$s@FXj_<H}Xtv~e_P(AMYdN$F1S z$_!+8<_Z)^jXcL))89qz1UJVubLlYvNxc1-hs)-7PPw67G0rAU+QyPcQU=opGW)ap za(e}eUBz7jNA9F@zh)s}(lnYhWb1eIx_VN1hJhS^kRsH$Yy2*AXSg-4pR4Aa5~!l? 
ze$2yV^E<EH)a*-GFwGD_$FOTCl`+if%jpHh?t*U8VsTe7cUo~!y{wxtOeBt2hHL|l zK4))AcUl+eFe{M5l$Z~T-bH?7g42j1C7eAfqPTBfW)9$y50}mFg5rq!fNt3^pE%73 zjyQ)?2Ga&ehgp5u#1QtJ-(AR^Q5;h3)y^kO8^;BL{Z6LBw5|-sFq=`_HGWsP3*1p| znH%65xO~nM7gm&=k9j!wg~tssj_}bV)Gze6UQ`@Yt!q~kmW*?W)0PSAm}4}VsgN=3 z1w&qm+*!q@YE84Go7GR6#?8>6b<p0Q+~?{|VU{d1lzZJMO^$r)3U`q^$?fMRIX`ST zn=?o5mip~650}mFlH!E&kY-J{qF*r0nx{Z;+zx`I!qmaE{)~alKE{wc$89MOs8^uC z8N;M$EJP4?OqL992!>s|#_uY3i95|5;ufGlALm9KNt9w1zW*@~m(LG@VN<<dyEkFU zFmIX;5`-PQhEfL7uo1&d@H?tJsNSnt)XnLqjT4DuNh6kFXmG&MpWG{`5Da&b-(~JB zcZA!^O>hCOjw|L;Ia4^H_&;Ce9_w(~{4R5+lv}EUP~ftD(Ku_G5eYg*V8??@g~Sm2 zjwv@_!OKj6Qy@4RMv$TjRG8YmtNd_H?f`l*gIpVII3JPN7@t~bFY$Yf!ykg*Y2{JX zhGswcK*PLoHZiC{c)}sFW5y8S>^LIGe)X!BS0KD#h#)+npu%0`cLmox!EGRhjB{OF z6IagVaLID-im!5yb+~MPR}ff_DG#X+X!ns1G|rpC2-=}RiY8!4ZBZOnkp(Y7fwOv2 z;JA4-X_#3t#S%mlXYa1^yTF~|Ho1M=3>3JBtK~|$Ec9jcA(h4ciTFLn;Sa&@tm3%x zuxeeiMhXPMInzwylwidph$UbM72=OMq1b{4+^1P#3Y;+z!7(CeB|pe}!v0<5cM*N) zqp;uwZj9?j6shKl5JzlWLTqm09^&^Hhd&g*6Yzj!!F$OE!iGW641(5i>zHlKK1!Z2 z8T?Kvj*th01uy9qpulP4q$!M`jS-a1FB?7MgXp20M>8*DyMpa%*mi}R<&MLJT|gX; zwqFDl<KJwF^GofQ+m0*TImJok5!E5}K`nYQ2^2+O!$dG?(mG}xX9P!y-zo4rq&ff# zUJfd7GLa&&pg|%i_qu4d0R$Iuk8uJjJCE~U3?oJK0^Ah04*i{sx<3Xt@okL%#XY~& ze!1;{mj%D0%z_a^-~$&8<OOG_FO!7cAWzWF_#Fj5-hxF1GB1cY!V?^_4a&VPYo`h$ zRK19=YoJJ+PNT<iPGET)^l}h)N4fc^`$bSO{$=BTan3KbU+#U>Rq#6new(o11F+x~ z){kK}j7U5Q4e|uR?=*Ta6hYQBt5Dzq`hgThreMPqi3NgkuPZcLff1@^j1o*E%I-&R z`3QO}ClEExfa5yoCF1Tk`u-T)#J7d<znJHj+Ap^q;D@1sW8?wV2Q>SLpnj3!F!co~ zjtB%9zvGHc<vRK>dl5r;1yU3dy&xkf_qt*`@ozffLObp<CJ`t0B3f)9<{riMjzW7S z7(tD`KkNk2eheG`i+a4&{@f+ghW3d1gz}{FwBn57tl}KLQL+v4G@<RHe~)X*w3={G zb69;ubxe6ec}j6gvHb`->XN)o?sX&D>;$I<+~*G9USkeQ+y{!6V9)bx{7->?^UnPd zC)oK*#Q0y7<E8ey`<xSr3;I>ve$Bf2kZM!41tUGCJgzu_?Iak15-jq}jI*LV-Dm4c z95PNM%;*+0%bLCFHT412x^hE#NV%!pQXEknRUDH?kGmxIy4zZ8$K6FWdME+(^v2QO zTR_YWAOG9Af7=PR{~CEXdX3x8FSXy(nK|GXGfx{A5|(v)wQHIK>UH&o>X7O%NFKq) z(>%V{<V!Cp$*gfUTRY4>rUAo<enK~;oz={17B$Q2RW)d;_A3v{<tO*L+uMr~s%(rB 
z)WPR<f#DeVMUMY%-@oGok#B52eEa#O_Io?>x>E=3qe+vd8N+<Sl5R!2SF@(suRe&4 zI1)*yYhBl6tH~@X%BoDMbu?SslLDq5<A7mEKc*W8#~BSHsa}%HPwsUiTg!ek!_yT* zg-y8E=w`PMk>h`n`=gv7{H-YCe-Xz^?XTNY9LVf-4cbPNCK9KOv-<gjMcuM?6%_Yr zV4sZRx@u!nGnD9aR%R6zX1mio&U$;Zr7g*y*k$T5^cx2CBRb+ZshyU~PwsWY+Zy~i z5o0X*xX-PHB}W^;3*R3+DEbf44o5T(o?mKzLwjjwZg*OrbI3Mo8BZh?&M}5d+La*1 z{ptfp)f0veYi)`<yPzPuG`-U0O>VF?TUwL$BzBs*OudZaux>;yKe^Y9-fE0cInnDY zg$kn&;Dz?bIDz<f^ut5vm)hUhUfz)($m~h&cMRD^EaT=$)3h<j5K1I_h8B;jW)phM z%??jmDG272WK^VjoV5;~VY}IH3YfZ$(4t&^a<414bH~3%jIm_lJ~!3?p16O<fst>- zIvgWl?EF&uo7yWoiaK+;(|cV5jNq7gB5}${mOQ6lNLbJ<F@~!rlnc5+Q@gd^S(#p% zQ;?rilIc#Za@9ENY{al7somUR@|(Kk@{@br$ks;uW<az|XZ)fM;PLw-4%zujobf+) zywv`twyHfP{=6<Iu+IsC))C7XJmI8i3Mv#C!XutiE^9^%fuv@;H>Dz@I43_pr#Q1L zts<q`>2=iE8X3d3qz>8qWL|e?i^Ja0V~i!<03Pj^c*Q#$BVabE{mpHjJ*EEqKsFSZ z(w{tNCxW9%<G9}^6$TleR_xPEB=nfttPRPYRCgvLSd>|oUY=UzgeBXFp)biNm!I70 zN;E%#@rylxCpm&ed}1HYB}naWZmVf`cNBK!c4c;_^}2}QFi#M6Oe!Qxh7Qjt_N%9L z14e&Rv#r)ym0Ffr1P$gEWtOJ9Q!Amun&dhg7|Q1-@46EFIL7ZW2Jnbayu+pT``WyS zAjST?Kz3IKY&eV{c|sz{Ecq-Ri_dC@3|)zBmIg<St0D~qbMx{*uq4BsTAo_%WR@(K zpWN$8@_WJ`pmtQi0~Q2wm;zJC3z7zhVa4PL$&R7IbBcA<yk<<_YuW<^);X(F%F|0l zg2kEcG^o&3?eshhKRMTZ!gwSv?`x}VuiR5g797YT1(FvO2wG{hi98`xcuujQT+)o| z`VF0lt(Hcc7dGrpFA)f4!4rZYqKSNd@~$h{fG7NcmbSX~sy$^LMX+E}AfgC)!2vv{ zpyw4L!HIL+Vdb)VN;hce;uYxdI4e`#8O51JdAWJHg&;_t&@Gpr-0MDJJd&5Ec2u{! 
z_Z0gJcm*Pgz=kP~gb2bDp651|tLho;2rSrdZh-=8VZ#-O#6?+!B0+e<hv6sZx{?ie z!XId9s|UaGJtZB5EQUaVybW(7c%D0=+^e3|jKP8frgrjyjv8lms+%Hlh+wgNe)6vS zgz-pTz6EoUJ%}Ksu;6@HFnk~>kZiagab(aYSTXTiQ_pKAV8Pv{j--~PMv5Y2!)b^k z0>SJe`TXQvSF!<5_yfeR1`(tTeHgOfZ1RCUf(=s~8R7|^=Z-24s1`Mox?z2<vD4I+ z<g?Wyid4H`!xTq&f&~x5PtJ9pFdoUv6F)CzcDn@&<`tOUOK})pkm3j<c!4{nIH+7w zPiseD!Cg?Gj}(X~Qk{aZL2)Fz0Dkac_{q7hWCNb?2ljDW?13OXfaV{qh8{y-!Qig= zsQM}HY3@4r6#k=)eh%Nn_pjrTe*h23_Tr%!J={6Y9+F}B9}gVqk)!x%<vc#MhAYz} z-Dp>)$K?E#pD-TD%MTi7v@5DLJjB>!4=_%E<|%sM!CHKlJxHQQNl(k<$L(>s%1u69 zmu5&kp_)-HDwh>|6>Exvigm>X9z1S>+7XcL$Qd%uYL_wOPgmYj98-i}nW)mksPmG4 z|C9bePx46Ow0=&zqS?nD2C+v$N<0Ws&?BVd3ZCX^L35(cV(X8aGL@&N7u#wQn~fa_ zJ-R{7h-N}Pt)5lQE0>hZ%2g%N+^^^;>v4`GPU;ufmG|Q@RQQ$Y+WeKDG#{z!cV+fF zM$MDPISkulz!?ud@JLOyp*&2F)Ue_45hxy4@HQ)h-)e%-R+5=n<gB#Tnww4Sh5#rI zXooaonsN1%dPY5~o>wg@{ncG~a5Z9{Vpm?qAZ_@SH~9xwxUSUS|D^W^@_JK;f{%}g zA^-S@Q4DfCqLj_=pr$La##xk=RhZ&-c&znF&89Y^-_WJ+(e>*FwIjq)GpX+I(v=78 zW9-4zT-bvvy7Ic}knjyYX+Bcd?=0%c>~{^>&<^(aXo0n4Rf~s5f+C<sw1$rsGW2&? 
zJ!EXQR;CnYXBVWEIxFp7YeQ0VVykJ7p%WB)iK1>;+fheX?sE;ZD?^1C?52l*{FT{* zWwL|ESlK4k^4;0rS=yD;mo|_*BD7=HFdu9QJ%Zbh2f_zb2UW899Z`)ZbeL-$?z93B zOfPX&BztVN7J(s@7%+50iyifyrQJEbX@l&_?9pG?l~;pT-knzd-<_q?-Id>)*_Sfp z071)ydCD|noI^_%NeyJpyb?9C`5jZv=z2`eR*$nZy&xwiKcgho?W}Ut*y=2eNzLXK zQ@g1H6gwK|%Fuq)D=+Kt0F_-?VtO8zogIIApcFZ&>4Q?|@2m(Ec4zmd54eULqd|hR zMyL=B;Sc$iXl3)mgC@;@p*^YIUYSywkq?5I#i?bk3TL&=%NY8=u+7xbNLTL4?qgRT zv5Mave`T5s!CzTw`XASBFvOc4D8)~5`k=)5{Z)bD?mXxqb<jC%AF+;;1{0@EvrL6x zsH2vw2>ir;i}){3HPo-Z&SCZ>9yBEcOwAT=az$!MCL>ssR+dr@6~dBjbxDo5cj;)N zD|hAfvMb}ei@5R}=63N{ruN5~p2uNsYP(7mdwQT0KgsEX66g0<2THs0yGaKrgUN`* z!FIro$*T)3;Yo?y#DAmm+p8Ye^%&cd8f+e?JGEFKm|hAyu5?!0Vac|7;)g2-@_U3U z4@X>ihSxtB^6if~J&!|oAM1Y=idfSFrT9rsACx%1za~)DRoI;i9i$Do29rbWfG316 zoF}gihO+sc#)GCQZNI^v*lekFRJqF1iZb(abMrEb(%}iA!s=wQ<c?;#GJJ<{<)Mfx zlm7WDhuR-?dS2N2-x&KtCy4r!<n+Pd1i<L$_t$im2Z~@vqyw_yWXz@KHHcU;P0@t4 zL^i)O7^a!k4C%WtMAKlaaaN?1r59!9F@hPz7%yZhbPzwfayPp&^B!SW#xxS)%HfTU zGChyO!?WP}->Cb;PY~s2$?1a<=l6Tz$4d|ib4Y>xDIn+w5u8k<c0db8+5FBb)|K;` zQC*K=4-{But9F*BmIeuC!V{*t5lxUWA?(WNK}B475<Zx}a%7WtOwZ#Y=l@07A9;cu zzeUdPi*kI#2X^lF*LGII1CkwO^`%1xE?AOdlr(4^PvT<<wL~_*bKHh<k$=!+YJ~#5 zjw*P;k_^&dUT#()2*MMlfM3{^5&t8ujNUwpVlz8dvhA4Ow{3#nj{Q4`MZU4^@a@z8 z==+J7U%37JXhjwL5bUTY8~s4ifeUMi>^#9xJ061HA$riH9fSq9n@NFq(1amE3=n3C z1mOu8zpBo%F5$}XzF}9!dI*7FWNYcSi0OG8n?4XR|8Lv=ohOKRXXpIBZO8BUK;(X8 z&ZtKAg5s_MQebZe)&P<v5kcE1t?Qt6z!UQP#Q(>MzvuV|O_~v1ufcDkVg6bN*|0mc zD6=p~Fa!K><xb(sJz0I*u1xBuD@Qa|_$6$59`W0;{x@QOv=fBA7v=Eq>3@>v_t%48 zIXqw?d2*(})WH<kFnMzj6x$(^i`1d`u6SRJ{vWOXjd;)m1>!*yY?vZ(kRbTs%7`HB z%IL+A_Jdc(y^wI_u%?Fn7&g5x>IA*8{b5H%`!VX_q3M4#KP1Zf;?Vh#wZ(%7QVKgl z3<>IhdBG8Ts2!2q;^+?#niTsmRbgB=sP8s*Fa=T+sdTwn9N`IOceI3G8F#{>0@;;G z{XD^FZKn5v)B9pf&<pO5bpql27>CO~KXTsGfL|FhXykY2hIGJem?E*z4pC{59L0CT z^hXbx_A3|EliFd#U_U)*N~%K?sdnN)6N@8-+3<tl7jb3up9BTcm2p2JTse00h3S1U zC+NwV{s(!*TK_BW{BrYO?Uk9oN!=<l|CP+VQL*`PG(C@>e-&f>uZ-z`;1_%SublJC zT?e!!H-FO?n*R#Z`{GW}lbrq+|8QCWD{+3g{AA~E`l9n+Nlnm;Ct&Q8JY3=fa`RBh 
z&foL}=f9E~pOe6c@k5FA#S$lwt3TQKo4)A$R}$lM63qv^`0IbAj+e_%cK)U>I{%f# z_?%SxrPlXK9WR%k?EFn%bp9)e@j0pXORevfI$kb6+4-Bk==@g_<8xB&4;~(CeX+y| z<gTOV^Ea{FCGqn&ebM=^B*y0?njb!V$UVyXVcGoT=WqIA^Iu7f&q?u%IGo$L{#WXF zx%?g?f72J6|4L$fPOAOe4(E0b0LDLqe10<XH+|9huO!Cj;<rA^@ZpXD#`tHD%kLrb zH+`}Buc8d#J?{8l<N)Aq@spXq>5I>Q6?c3te(UM}Ez03s_yA!1GsxH9S!7U@mA~nW z&VLm(f#2hf|AhtscZ;9w{7t)?|0@2UPXdZDfcLoLf5;m}>x(5`^7Z!+`I~k(|5f}J z(D=yz(fA)fz!?AXa`{E(Z$dvhcK)VQirvnC6|V`zFXjT?#~uHZ&VVQP{_UaiH|>7@ ztJnl3#^+)m$nSZ~!;$}w6v(SE^zY96#aHnqpGF47EhL0N7F5bYDP#c@a<B;*u;^pR z;Q4=%|0+6)r08$UeYZ;P2V;Cr?(>g>*M>Iw7JH_;#=3?BgPr}IeVx6XJ^mhlH#VvS zBGdwzxmGz_JJitE)YIJM3-|&powNnZ6VTViwl}T~&GrrT^mX-gbp-;Qfll`F^REtn zaEtM0hu+TSG@)M}{-+i0@xB1L`un=`bpF25xr&ME(VF4f!MgtXzJ}h$-o~D$ZrYl= z*cR~RR~6QkHB>fvn!UbSUtLQBwnm|j3SUWuucdOXc*QeaH{LkXG}t`g>+|)t^tSY1 z>u&9C?P6P?l{QHtUgUFnj9L0`a4V1X1<2+13&l(N^VUt*zKq43+5D-(@shE!5%+NU zP~~9NKy|;T&(l}a>+SL873P-}RhD_my_L1q^`3^B25)0+V_j2yQ+;y-w#Md0UlX>A zX&VJAW%HF&o(b<*?MU5F{b0jD<3MA7Qy;e8<{oU_zHXl+zb38{<0g;%`Zjk5YoU~n ze7xM}AHU_kro64YXgX%yNZyySoW77bn=_p^nLklDRy0~NTsl-X<Q^&?EYHi&D=sW6 zE-$NeS69?jdaG+awKa9#dT)JgLv2G{V|^n?V!Px#l6|0HrDVZ9TQOBN=^3vX^^VjI z*A3MT)(_P8H}p4R14+q-wIG*R0cVx`T#OI=kb4j7p}dbQVY&MIJ@+a1y6UF(lHrtj z%ern~bFQQ;rp;%}W=-cz<xb{}7mO8-7L63=<>eI?6c?41xJxVCRpr$co~oK^Z?%_E z<T=*Y)z>v#PCk*ok+W8?QoL9?S3W}wt0z2THKR2nBFDM`c!?Oz4!zyVH6jyMChl1u z>Gf0YTigvK!Yg^CqdD2}_usgma$n`{t8Qp7>(3aEnGahJ+Sig-T+69MaVB#*dn%ha z;#H7eR9I3}3WDYCit?(8>Wb<rPc@@h<E^RnGK!ZSr&71F*7HEHXt89zY?c^SPE<`) zk9)>EBSevLlx$WTawmGYLM{b&T#xYjIrly8RjhOD<#He4X!-p9$o&%E)LV)dRL^KG zC!95$Fl{9rvL3XpIaZy^uBFt4w7K+|jOi@!%g=`b3rmUxg6@h6o?$i5(BrM~dc9Zd zXHt%49LheBx36Fo1Q$x@%4T?mL=g-}JtN>K*{C*-EZB`G;*ggyJMxUb;K+iXLKJD` zG9q`$`c1C={)ziL?)y;SYs%+U*ELsk=k%wH#}W@GZSV|}SDec!i>dQzj9)=PVPSDm zDI@51S3rf8#IUlus=8WG;uZT@*YWhttc{!ld3!-{33dz>y2BWfJ$ob@LHruHDkv}= zUQqgmW7`Qo=YGO{#Jzw?z~lHDQn&4u{#EY%+kbNZfGd5>z0KWM-c~&gg6H+8NrmP^ zV0gf`W?yxHp$jQ>@?pV+1x1C$#k>aHZrCwY7-9&DS8V5;CsL0<fd_N;=L-Z&7RqMJ 
zX25W|a<X!QHtFtd8-&xu)o^7nz!Y@mgc<wNE$2V+Irka&3HJcQ5c^@n4<E|q^7}LQ zJMQP)cc8%6759|4K=7*We8OqNN#jw|CK#>@48e~qm{(wkAXHdZQ69z+{E|<m98KTM zgaUK-=dbb_q-at$6UI=ILp%62Ba>nU6qt#ebBUZ*@|ds>{F3___g(H??m0FMy#px) zAFc?HTz>z>{S!Rk54dk|?{KdwUQpg7g4zoSXJE(2O<ReF%^Qqih@YUqk`nmA5_cI? zSVk%o7+$fRPd=4$Jncy4p{#WfTniDTXcA-?*3{@fwj&d&kE=%}#Zu&mPDAre4@2;m zP{sGTuW>J780;VtAvfb|kWAVZ_Ls=z_ZRM8xWDCo46T2adlNQ%7X+Uog1WP0$D~4H zXc76rg5d+14TlkAcFY)FwOvR)<2sRcB>iwEq6oa;zWmkv)k65e;>8j{h0tNxpBnu~ zo?injxE$FOvth$lL}J<Zbid>%f_#H}9le;7+!EIZFPMjiNOE8D`TZUdgtYz<)c!#6 zqVkUNhMEXo)SXQ@tv_izX4*0xHV65U1ydA(4HuQ*g&rX^$VU?p<oTUUJ({+ad5CP7 z;s_(SOa%F80u@TOgZQ<e4_J#BLJG{l-G~KapE3#l3jBVIp2a)dU5w;!AQ^loq?|4L zu2?R=|K|P`5#$%J;BRpsaBp$1zzg11J;MlI(w!qaW-8?Q@g5KaDT<I63@6B<$yM7$ z$64pelw-UCvkrnF;z&3_M3XtW{F)#e4|*|0Tn=p5f#&NuwIq9d;8z?);P0SKFC$06 zNp6)JfkUl;D~&pJea9)~^ZPgMxA1^J#<fU+Z*ceF1)mEM)PW#W$oTOQL{MN9f>0sj zhX?`-<`qabOmQTfAXK>H58nO;>5n1^S#SkzBl6Mo6nMczc?5~yPjPScby)Con2m54 ziOu^E&`Z%Bl5<NVpWnYD0{;!7{iobF@ol}$y#^b;i%3iwyrzW)$rI{B{3r(V3WN<8 zGaC+BvHPm^qWx_0=@hbH^kVoZ!pD&af>2U6Zcm$FqpcvtZ|#_NCT_EMe-K+Q^14Uc zzn%BC-+e}%3!bwc-vC`<$2EF%Nl8$(7ak(&C*<<`AMQU8fq#u?^#ktP__p3f6nPmo z{5%L==Lu@h!xIMiQ6FZf0)qsvS})npA%ZXqW(s6cM2sW(tA)5T#)jr<K(G?_$+sY~ zm83n&r0wTA@DsIg`~Gdbw*PvJD^@CP2)b3m6?~EBxP~uEqk&f~gMXmEp$XAmyb5T` z=l4hMpSizh-uAobM}Gj1`#Q7XJBY+Q!OI}HonJ7Dh+dEpEQS7v-&yb@3uX%Bqlg$s z1cI!+R2iTEDxnrMpasongV>5e{7wr8tH3zg{_ViF{r(;bVdQaoBhHZzMpaN6U4uB$ zRa(Bz5kd55^em?~eWC`js$Td+v)uc)|KW(=KfnY26nEU8a3A8X@h$WOUxF9Bt$a2} zP~=AuB&a|>ibN2+#`8l2IhhK6=tq+cQxriQ;Uh76geyhC=33#`3K1d1zh7+QAhcJ4 z);VMB51t_U51%4#%;MI?oS-zrZ{{E~JYB=XID*m-;Hi#)U?v50qMVJtq7<IC8XUn| zF2BEm-@hP&{0e;-vS40;Z@`Ah3qGg3ArKV!Q3Mezm{%Z+BE`j}EE1zfNc_$xpG6;< zw_sj@f(?st1O&rcorw0)UtcEt^|eeXy|COWw2OngkG4PT1krx{F_dr-g;{bqn^tE+ z&xg)YfN!K6dIlpJ-@1iD8SQwJjr1E4-O~Y_D7v{yZ(d1WVO{}lTJ!17OKd)tO0tdK zVcR>J&BQTVS8}VXF{Li8CcQenDx(5hd8RwFEUPT56k7>4@j$76SM1%crZg{|CDYB0 zFUc-uTTymVPEk&wutl^v{BJL3Dyf0`TG3K|{4d)6h!gDm<!9(m+(W<54}uPj(V$L* 
zv8E!kc|M{>DOXN+CvSl80tlAkYr+795_dRu_L)4cu%V=)uq3}Izc9Z*peT?enqjZc zxfc|t)T;?o#=*p%B)_%I=Ce0B8j|aswN9_ACdHFdomz!WikrCqn040LZ|_L<IU8KH zDK)9pX_aXe>E-F=8Sad-jIzv9Y$e#XwOEh4ixkkxW36*TiIL-fJNIup!S-J%9=?r7 z$cqRHIh<2%PSBZP$;2m<R7Sn)uW`5GC8@@3IRZ%%bu86jXyaU5Mq_z(iMyz@u(+V8 z01RnkS|p0$B!3H?KgXR@uB(@|QwgJn0aH(6z}%74W@)whY)!UCdxN7cxy~u_k^1+v zb<H|u9kd1PdmO&xCTG2?*5wtx1(3v6p6*U}?;Mg#Km(G|N6uypOTaI3{BQgI?ZCGE z{yF%3h-V<jFn-klf-bd1XE10IaTNzKq~77TxYrOxHc<4Y9a4u*aeJfSaF=c3tj$$5 z?#dE(aS7e5QirX8F$71v!btuBQR7YSCaOqnp|H`SW=1!rA2#$Gdr^F$)4V6CJ*hRR z1tg`oiTlr44_TM3lhz??kFC?*Zf|ikB{w?jopmlj%_&tWRjHL}+nP(i>fq_@P+<m| zlF#@>j{imOk8*<Ww`lx|_|W*(AOdR+m!h(2O?sV4WrjCQ7RP7qU}5PR#uwV*6!UR$ zGe1b7^VQT=RF##Nlogj2fnXt2xSb*+`EQ6DA9AnYF8d;$qHn17GKLenQT-rFR}jU3 z$)DJfB=V8^_q_E86u4lWK$71+TbHeaF>D6K27w|tx;!c2Z4Lg+zIiyKG}u_Q@xSo> z!Gogz!1H+rwsi_N+>M}`!=<Y1T62Qls7X|q`SBTQ`O_HXU*?8Eum;W}17D#LnN?b9 z>#98Em5d;6QyD{^Ae0!U#XlnQeg}QN7rAHPO;2Ki!2uNQS<uXCCs8_Ogr{iiHc4?4 z_g}Ogx2{`PtaFTDzqK1G?69}lTOB?}lcSNR=&W@`Z#BMrBh=x55TcL&h4#ldf%vvK zesv!Tq%z=q5$05tLz9$XH0Vq;KJz2oalHi_zl7|C3)~Pc=|zCf#usSjTI%XOHI-Eu z-gIM#1O$tj3Tf<w*P%d>#_xXs1-_1Pit9KNO%hs1M%fi&sF_0XnNeg$9W;o1r2f5V zJ!w4*1uj`<uu^*1+HdW(1#Eu%9%5+sIiN&FQEcaqe>HGINf1G-@jr3@jsqj#;K#2% zMvT3QoRRyOg;s)1nlf2q(V6u8_`<ij_t8&$2A?>H-r@i*TFVB!Q@NJ<h8k~Gbwy=3 zLEe%h82%Z3XY!=);c3+!5X8K7<XAkUJcz~S%gD<MhS~`zQHq<m|C05Tb<29tx(XYf z2Eh^QfVJ1!1%@3F3?o|`@tX#{=0t&CwDCWFf5ag>f8ockKEeIyJt*)9Y<Lg>se%!7 zs;%14_`(}#_f^>N8dTbk%hn+p=W{I$jo#X7Pi0kkMOk@BgSg=*9kLhG;h(vGgpz5T z;w|nz;`!4ccoq|dwxGiOs#Wz8uS1Q<N9x~8)-%?l)(t3d**Y%}9JKZ!ngnc}%o{Qt z#ve>qgJ8VzKiV(x;`w|Rqr$Ipμw<5_SoZr#d3FkR_X{|a|ppP>)<F?`p3c>nXr zKe~z%eto!bJvg^EHrCd8m;%cy+!*2%tQdA2vSiruf8p-!R~V`N7L@!tyx=VmL`_i? 
zt3AS0xCU=X45he<`!8G1!UN(dvUM+Pcn*CT(jX(qROp8{Y=<pNG+!<7i#7fyIR;4l zK11K)BlIrrqL*@<6$0<V?JYeLO;-}XpP}8K;Ew+R6!;ZPFWbO4^@KpMwXvzLzQzj$ zR#)Khgh;RigT_QKXvzP@{T}1@KSd1rI=+e5p!QpMYDxr8!HzLqO}SsW58hDZBlYiP z>pANQSnxU&xME#IZ+Z&ZK8PSZA)*PXki4M;KaTNxtnokM^8-YPcLjYNh7FG(j?}}$ zWkWL*VSdEg{T7~*JPY4{Tqrf%i;K55HPzR9z121311n&|Zg;60k9_dr2?`AV0)Btb zp0#`o1>(LJ-_}h=5Op!hj(HU-rMQXviQh?B@FD8~roef`5vqebf*#=j*)gxehk2AE ziTeNI13bT4C=idgEAWVso>7Pdal0?P3je~!??1;k!H@nuz<2R{kl-0+$D7DSEb@{1 z_p<doBFHi8CR5-l6ganoU^n`O{}*lE&2zRkHP<)P)<J=uYNkN4;V^=%M+1oe&C&S% zukftkQ#SU3@n?9!+t4#v@dZTolVryVDQ@EaE8uq;9&igWgcL|NJcl?E(jc?rN1j); zn~dU}tF^hgp|K7Y?5W`u2rmd54kyU$_`kV-M+E*`<|{sd4}1?c{0h8ah~QcD;Ez89 zzYB=KCvX=+F=P$>m?iXNX2S@QC+z$G2S4(FWWi8i6|><8g6LJ37W3oxf5&>`H2(Zm zj9NT^$9(~JJ~x;KFQPwliWBuE_3src@gono$rQMcDR7>}kqPDpQEVOr<9+vEhJCHg zzD9~5wG=}{1@c}nq(KlQel&hhPy4=yUd-1KMc#xBzl1pQ9D3$t#TOBcrMQXviQif9 zI|d8JJT543FM5K!7o5U99Yqo%_+?-e?{5G<iooOnJwXNXJ(+MTE}`-Jf8l-u5BM3P z{dbrG-(fcVB6>x)1cH|yg5O2(I|UC&7L3_5R#G6@FnPfc!FYfBmtkM4&xZ(tDWS}Q z5kt_AVK3ebx-oTy`ZD16C-h)`3lB)Wp-%+`z6Be88D5YGUS}E<^(FQ173)RxU{1pW z@)jIYAlWbx9A_H*GBAqwHxR$3`Uc*DDTeSqQ0&PB`_shl4|oFnYxcbPd+5b{13r+R zV!jH3FM!}pMo@~IxIe^?x8QZ_ekgDiHoSoQIX;dM!FYfBmto)Qu*ZLZW&Rud+n>1q z<o?Y47q-7}|IN1l;rQ*Z-2aog1#oxrIdu95obiu1<DY|P<j*Od@kR9Oe~)X@^=Oay z{$DwM`(KXVz6^}w{S6z6JBS3|K~(r0+WHL?_D`Vsd)EGcNBjQBwm-?F_y4#;%_h|N zA!5wW;4A+Q-_1WF?))Cxzk(>8k<R%i?kVMm7&-r2kp35TE&kpA4EqNE!nnr!+rJF^ zru8S4HxWnQ#~tAJ(4wC~7k`6x{{}51mj8^6c*@mZe&U$o6e9AQVE7$GnxCS_^DA)r zdyxDsSkf8kd|%N%Kn(gJ_WmvQ{EjjH0~m)0lQsP3mjO}SzhT6BKzCO4jN%?*=?Cy> z-@#bTk6_ck07q)KNK!7pJX@D~75$i-=v}`9hTlT0`5`F&9Qx+Z_xJ3(xu$<f`7ZYU z0N45@<4bgTw#1RNN-V$ZuL0MRX-j)vbzSj1TKpCmeuP$imr<lv^Bn(1w*K;xTN9>~ z>li1yf&07Hai8*aoR9xjegbv>Lg0AKa990?;;Y#AefW-_gYB=GMt{dNN(v<(@~>ZZ zd|Y3~jCH@^xb}kTY2|J7h9BU*;sdn#n`rZ=j3TxA=d$_bx*Cmx>Se_idehIKZ~ZF1 zllO7Hj}fsyh1z}uTA$&3*NnF{uP8~Op9p?~^!e+c-bG9QvIFeNnM__WAJU)HTm->e zu;W)?$?t;Uhpf%tMeE3-W!rB`U1E=RO0{2c90aeUUvnQSd`DpT36%XkM)8{IIqiMb 
z8;bX#cOv*BX5-{Dc!uOHNsC|hS9kts%7S&jaVz1p<`M|rgdM*G)x0S%{03S_%8|=2 zH?1bgpD?OkRBm84=Oqxl4Lg2K_)bWNWX+#)*Aj2(?x|l>z6~4x2CncucuC&ADSi<{ zKH^kiWy>4b&-+la{?FkrY5Nu1e8yw&5udWiM?N-k@2@2BzlmoacVQzR;aeqdwfzh| zx@)i=ii7;#@N3gw&{a5`K4V`sZy1j0&N70}D(@&>M4$Kp&PqD`8uN&9`Q@coTiXl+ zni<tz#U{oRE@Pbk7Vc2!d~c(rUuCxZE%19b;VvxrO~reH(#g7c+a^mE8FF~YwgMHu z2kt!M&zL{?h0s==t5|sqXXm5+kKr50+lYIADar2$o@Hzz+P#ieMmv8}QaeWV-)4UL z$DqicJw&;yWGHjWxn$jM+SH%WoddzAlsA;O6)(W<US*c|9%Cq%Uv7Gp&1dS-jX{A2 z6h|>G$P>H^Rg((eh9wh2@Y6pJ3w{6v^8S|hkNn@rM;T&BFAY|h9zi^M6%@(ZDZ2d- ziu(!L!<8#-d3dIH5#Q0<pvcEQ+WS++QVQT*&?~{*jpMP-U!>8r<LbBYw|`AIJNa<_ z>_6uMWdk`ADf9NdN$bWV`coiynGw9DxXV=d8Z7x8{M~Z-<z-aZn-T;1Va<$sRk^M> zhEc66xHG<q(drlBTOY9Vz0X}U-_YODzNmgx`KIz+*ziX}#0o_iKJxSJSK^8F4BGe< z`sNQ{6<>qR^L_x2DGK$*Ts%?Sz;*86J9-CuKW3E4(@MSG!e2IyNl8ty&hO<iY<SMG z58u#joc&#VN8e-~jlRPlah>k|ys@;I<Yntx;vvJ)gwxvdn#<~^RL>}HDxXI*xzF1C zmTZ2x85NEObBCc{H=$lot|<<o)#n6)x1o{yOoeX|Kf`U^J<ZFi*P+1o1usFK@dwNk zlBJLe#g_sq>4hbqz%$tApk<0lq+2{{#OyzFLJpqzEMgWA?Y+;uE&rW}$H)5jBb@0t z=qY2JzY&j-^qC;IfU{r2H$=TWzDMy1Jb7n%U;b#uv}?h(H|d~h({L=|Oc=pCiWd|w z!MnUFn_pf=d2)SHo3U3ns+m=<C=a41b3!2aEWW`P&~{ScHS;ru=X7^9_f@Yk1yZ~h zHN{&eS;)^Lnw^WOe&d+ua{@l$2E6{uXgiyIm9I40v^n^Z_MQU6oA{>q7#MMaD8Kvw z%IDYrDllCw%K2$^Y^nycC)@Gdd;=|c8Vp~?H^hH`ot3?X!&#H5bB-13n)#4%3tsSS zkl?fA31P?gW%J9;bUSM;t)?#hpmq`p+zT5%f;OFJ8hjS&xC=XenfMuRCEV4%q<)1d z@IBaYC_0P&?Xzty&cIXuUaT-UD8%;XI5e3QlA<#?G&z`oHV73ShS$A{Co^~PcT2s( z*Z(RoD`?00X?9#bXH3HE$#y*TU4vRLp=V3|GK#SNs@|fZobj|-=OV8_<^|7cFKDj7 zimxl5#oXMxihBy#{x7r4>9zVyov`3BC~#T1UvUUt@U%ei1_<7PI<A?ogWvPo7h%B< zpul$o8y5RFym$G@j^?Lg23`kdsVqSYC*hO8k1RNcvltCdbq?mGbVDcm;PKDF+HUMP ze)}8S*Z)R2KYc%#AMC`Ogmzqc84-uRpBre$OPs&Dr(`g93>G}^ScL*NOk1$w)4Fp( zf;W`6;0fjO%grit)!3TMdknq05zQ28U9Q50H}S2V!BaYlB+nq0-Qlh!T{qs;-_gCm zEcgxOTZ(rPhd*R-g#5~<=uJ>8+4*up3C(!=KZSV|hdIn&^C7qtb1CXXbAk&5X-e-X zRJe|7O4o1Qx&C+C`Dt}*6&|D;6QIsY5FCalJjl-d6=>Y==_wt^8_k?bolRbb0uLk} zh7BK2I1@qercyq?(iD%aF{zy_ST_y@E-BYw!+fpZ5JB)W-PGUKy}&G3P#|oWyx=zx 
zJw<{$`B9S1LKsyiW(mv@KXLx4O=q&`QdBvZ;n)s4p218E@RfK)TmQT5{62gog_tX+ zOoGhm+(YOuQT5`}uw=idyR5%pBzuA^*iH&uH*La(PwIGrSD6*xcqo3=_J*WZQven` z3I#5pA44{L1on0o7DpPqPWm%F3x0Q*1;3_zL-{uPF%*Zt9zigA^KF>CTMg6g<N4A2 z(*iD4X*L+F+Ef)Sw`qZ&oPY|YUgG@H$o0S5&R+}3r*kGXAm#&RJ7%y=L;V|2p}(fv z-B&o0J)S<3f*4}kM+$@uA4OjfaYTDj!xNORzmn7{M_p2j$qx%2)=n@5?jsw97d#at zcrEE^)3b=c&uj0&f?q`pAs_gjV8b5=H5jY?REMYnrp4x;HX=rrV9vZ<lW5Z08NmwJ zco!5Z^%Cb#ZeRZ!cK#MN(JB|RR~s;4lNNKeqrW)`AAErGd%Mbei-vR2he_oXxDN_E zY&>E(u0I_{@S*r6*I9ju9mZ~0@EB9z3T$|TdBIcg>zANGfgd8sJq^X+2ciO*4S&S$ zlfMlsj{Qo)<lR!5KSUnN%O>Nca!ERqIl-YxQ)c6hMtDN0SM2q_!ue@_a3jQCh}q;O z1!kOP;H#>|W&5D-1#T(ri1TFf8OJ&MdHV(XMcYN&CAM9*UABg{E7o5sKh!?Zzht;; zylr|e@mAuqiO-sEV0#AJ^(20KI_c>oFP=gk!*jKBn4fVG+a+w`d9GMP+cm4aEnfSn zk`o$2D;VlvvPA&uP?`0{q=aNf&<){8z2dF^70%D+S15(PZJbsM8D=9O*D-=)+-Si{ z_QCYc)T1fKU8kI<lFwjG(Ang3j&qLlj3}65yJY`8?kGQ2zN>yi^NRMq?q0%O{qy=e zhTDecjJL4eG~F~k%eH)bH=drY;hFdmJlj2qC%k9yocA2#c^=QqX(O)k=v|X|{)QE% z;q!NMp%IHF(P*&foa%IFP<r_=KR-9t`d@Z_>LGbC9WW2;Jy1`L7IEc5-SxP1FV|l- zQ?Qb=KWii7aQYFVm~zr}5+qLxB#9>R{11$=WA>!ttIBs&Z>V1*irN=-_jGp??k0ew z{<a=0K{L<Qgjp9;cn-OSr{ae(yWl7|o?s+TgC)@<p1Z>j>JsOl()xXtK|*2HnM``C zmM18!KZ*6fA$=FZbgi6D1shI99I4<5R!)@67p&y1WgpBs1cqB!4S6*6c*+T^OA9CY zXN+g^tIt1FypLG<rs{$ERn5zq``VYl@da#mb<cxjUYgh5fq55`c#exZ8qCbt5GWoK zIFcq$?;1a;@i{4eQtNxAj_>x2x~Gd5KyY8qf$Vjxg5((z#bZ20M)JS#)a-9@NBVvA z%|23mRq-BU_`2#f^(!Jp?LF;1U0!-svd<R4EXWDW#8|=;abma*d)|Z=k3fw)$KBGO z)cBlK`=!?RN*&+j9jcrtoh@1f!M(XyJ({(_7;e(4T&6{y;$QGIhgPrB?4eK4L;Fzi zf%08};RBJP<|Qr0Uz`p0J(xc_41zOI;WB0htzmx4I#Z%Zakuz!6614H?U!2LD|LLJ zcCc!!e7a;F1XuI-=I)0I57JuP%tIL>L((G6pZpyZ_;bV&n!WKc_jRc7eV(E64HX!w zUWFF(GTp9PhtJlDxqHKy!!nK87)!8Zo*`NDA=vZguGud<d))CksrE~)@0B_}P~Yzv zt(YvEDOnH*u3-i9fh@9QVu+~1bSTbWqxHW(Vg$d5d)2QgK7b{^qkId!-#1jRgCg#K zQ)-f%?CrKL#1Wq09Ok8wC5sG6iMyshiSaq9_Dikrl{&t&zOQDuYP@{9bhdb*XbBoz zB~M5yWR}b`+)AVQYc&6gpFi|N+~0p2_Xr=sj-f)uyI_czf<ACwR%vRrv)<lf>#%l% zAS!|*8qY$7i%{Vz<_M83AKX2D6614H?U!2LD|LKlL$7zJdW;lUI#;|HPB3K2P$ED7 
z>L2k`>{qbipI~g{Q`{YV3!d=nOoe30#4tCjB&`w_ObYD8N`wJ{;0&S(sgNw0819<> zB*y2Y+Ap=fSL*oA#@^aN&q(D&`7~^Jo(SecgS;mUv*iE5T$F#s{EB~o0_mPRMDXLF z3f}|6yzJt%3YRyz(cWV3KpgHxUuFdJ&?XT}<`7L55l@KWuJPl?Ph*bHNwr^UeXrE< zosB(p17yJ!6YiNZ*l_V;;WmQIlK%>R|BgP)@9<=n?s+~#91%6h2=Xda<YpJAm#27~ z^^RtHo6T?SLL3nZP78Lt1ctlFFYfr9RQtv8mssn2rH=PEb=URR3|Egrfm5*IS<1R1 z5M0e;v4nZUzhb7|f4~F&BWC}Q0)LJtcs~jf{3f$w@`cbJ_*FaWV8N~SJy2lJHiEng zca0y7pT-%VlWIRd{t|nAuhjA2SKsd)f(7#mWHt<f%gl;}SOS87#XQIVKp*g*5JUbB zPq#m3HvB{Mr^yd~f>`nmvSThcyC}n*S_KPkaQL7=vf&=Yks-l~r<fhj@1Fi>{4~z^ zoK*Xz*7r&s?{Dg=?;{UbJ;p1LdNNFdB0<(CWc>aJ4@eeF3gm70C%7jL5hOd#1HZD= zN>>dmm{(vYY?$H*PjCX!nCy5L^%r-1POAO<_(AOTy;8^fo4Xo%$$~ww;7am=WWx~z zSuFWq%m)1v`p{&-{~#!kZ1~692<GO1Us?q`V4b7UE-J7aHXKQC7x~2<pOb1ozkZe0 z<wjrMD|NiUk1V)qTvQ<SWS9n5cq>MqkmvXB;71nx_ksd{CJ_9dV8!3!a&rnZOTn+2 zS#UFaV7tvPc)@UjyT*?nKdr$~YSi&LsrE~)@0B_p{1Abe2P6xI0@07*domFOi62Ga zKcEluJ3)cuJ3d2ChT;fM@Y@2v@)USLX2DP(`Y~j~<OK(DM?$@s@!hjuKL0_?@j0pX zORevfI^ORKPy{9qNWDN&fqYMf2o~@f{9nwG|4&2^@__#sRN&``!;u6@e<kT|@T(yU zMlVKGU^l#Ah~O^jPaJ=aIzA`WeyR1nQpYoX_5I8Pj#PyyP_$thnSm7(zdz$1jy&K$ zgWqqM4<rTtjP(UY4GR1aK|Ib{#Na0Kf#}EhaX(C6P$al({G>A+$3?($TE+cD{GD>^ ze-v-jjMeuyq9_8@_28>|w1mp6P#Q9}b(o#S`JS@6%0^F<w;2^IeDzep0$Wq7SigcQ zSZr`L#r2-XIuvAR@-?^ku;JB8Tf{5=Uyr?yS-|5l%ZJF$f1g)VQeWO!)m%eNT8IhN z^&lqURD7*SExWFtwvM<4GWv3Q^1BPWN&;m8cV~HLWoMPY+V8<u<M;a077Itp`YU@q z-QKR+Kz*R0v!S!Gv(eu~TSqh7d>y_HNq&!6z~eE?hsftwQdm(^?XIb;t*Z0XhgJ0u z>w5V3x*iC~ca$e}2TY6BndFI-(e&ZWp`3xd{`|he-lE=;p3<JOZg+Qi7s^+x6wH)N zxJN67ss=s%HGSUR+TOaJy6*b!`mTnqMr=)irhp{B$1LFSnB_w_x%EF#7p0`gT~b+A zRqm;*siL|bbv3B!L3KSsRXysP5SriTZmP~|Hx2vEE7k@3jB_ewB5f>VG;26#D0eV# zprF67uc$ZqWctCJm4f-=>C%bvv5L{E;p!pJV9h{HzqhZpudWvxV=2k+F$;Lm@XceC z50S4ws!CB>>@GoN6sk#4Db)24>rsFs7ID@$AUuBusb!y0o!1`MA2RMYuUeMubICKV z>D0-z@r<#|(d?1zk=$X&nbgg!{dp?|3&pdg(`A$8;}v6-BcM3!A&&iCY_$k$lKf}@ z^f3x}Q0MzG%7@73Csw5>1HlS+h#~5Fu!<ByT@M84pFn~46xUT3G$#|b3>&8XNqa5J zwgtzWa~2e*(k9Z!Ge+&_P`+X#dkq8^i{?sZ7{iH*@ybz99PtoE#!-@A_yj)55AXay 
zWct9J`=#EIo4*sO><TEOJ*rYrCUL3;P)U_hDuWd%c#7yEevZL}H@N4OPpd9!PV0{8 zH;wCwYhbuyTXf7P&pKyQW>Tl^=TW}mFcO~bV+2WsGiB56iE?5%RykTVQjHF&XISd* zM+fS$3V4L3Z#`1^5c&F}x-wK31zBKOE_Sgd1<#NuQk5PrZjyhEo4F4#N%uAgUecV^ zozNdK95Nj=6T?;8l6{esXuse@;#ex6lCu|SP8W+7ilIW>`b-f+bmWO5lqkt>=L9~< zL&85m{36o_hVPaBS#JJfr1BK2Qcym7KAXK*lOn{h1~)0c!Jxu7Ftzn1#VtngobFV@ zQNyM&#BjyFi1GoaP`=`DCKX@U3xZ2Ui^cOyg&~F$m1B}Eh&F*w@}STMqNNWE?UnxN zVfh8CQt$+M6^1JGRO2S)ceo{?2_+BU1#j{MwWkw~>9>rVrj5h{(BVGl@M7{Al&?VX zfGi{|4ibbNGZh9IO13~8K#rWiCwZ9o0si*+k;{j`e(5HH|9|M;(2a%o6B6_C8J$B_ zDL}B4a!gQt9!i<OsvKdM^`GDYzmK87w_(H2V*>S4s>_=5+B3|Kw~U8P>tMLgb}9L6 z%1Kr{U;{S1mPZd=mI`UuXCBdnEP0x2S!`#_fB6CAM=szIKK<ws%7;7@Ki-0o&zV)F z;Qb&Y2o*AdU<iKyg5l{OVMzJ|M3I*iD088FN=*dM!j5?rf+6v9iRCLOZT7w}f_yY# z3}bQ=cf_8+Cw?OCzdXNa=>y{)7ThUUe=L<T^MHI!0Hk!Hv>a4pi!><OaRr9Cs8Gmf zSY$;>@?Yg%RNO{#ZIR&VgyV=NP@(OT<D8Qfpvgc171%HcvcX1PgNs5mnF;=(v3`g* zfiKoEVegC6w;r*4h;03d{HUq|UZJ`uB0<59!S4@<AisbGQ<(-789?IbTgqn`!3)}R zm{xsKe^h^j_#uK&@_S0%zRt>5>}3R(nH3AsB&?~?e~dqYFZwZI@ACX&r4J1IV2mFh zQh(bN$f{C=2ttJz=Hd&5P|Cayp+F>gQlQKQBX~u_6Fj9qZo7>1?PtQuSFi#q6i2X$ z1ktC7@n?tL4h_IRQUMR*_ZaB|A5wqJ1MaL!5hCaYKPnXRD~u!XiP}(P05<$QmeY|2 z89{i$(^ld~<rjnkG?^4dsB{4F3lUtz%=-8XfTb4jhDb;K$>_tfwm1ujbv;;y6K4VN zL+LLVgM&6KRs;wV1V1ViLKC&Vk0~;wz}I2J_Yg;(RbCeeB9?&P1;#H_zJf2lz(?ZM z{1x7c(dhUKfTb4jMx_>d)1^}ugJ8@B#HgMUcLDH2^CJu96&P8S0tBf-6cXY55&XnN zZQp<oq;#Js_M>=CAb3@CQG1@}hX}$7&`=RBA&Q8R7zA-w!Y|;6w*Xjj0dI&^w4cg3 zn-VjdV=f>rqUtGe768k>|78IL_ABrUQy@|-vKJ%B`28~?2tU>GTUZeD9-_!A=*gg5 z68bWvK@bGLi<nrA0zPEHX<JmnhmRus{Rp#S5ER-RdjYWI0^VrULhoVuFlW?bLU@b? 
z#F$>lVcvbr1;BFoiFHwkUkn8j#4^TDEEGZ({1H<ir5SsX#Swv^zz-3GFF+$GP$-}x z#1UvP*wR=FfF&332G0=n2ey2eOE7aW77(-Pg&bB!#8?3A{7~@=JP!IjqGl)o7N0ta zQeXwsQfc17U4?X7tN)R^&{Zif{nh_QG&<S>UC9N!5og%(3qSqMtkl{-Fv<dAHocHi z2*g+boPQdLfWC%U`BPXNrK=JXV2SCfgalZLn@EWuoz_ZrqAabv^jH7At-%=>E{<FP z96tURbph|TGi>{npZ=D@nRFTl2<})wEKDz?C>C`AaM@n#dEIT*>sU4=rUIgLN<@-R zfF&fXvW=<bq_z5kT>a@EDCMNT`t!Eta#(s1xd3?k_}}*ZJ5CV!MsWEsXE3UrAh>M- zad3KJj0M1zbFNL(MeR-1eZ^ZyjPWsgz2C?96D1$uliBd;s(zQ;m9VOmPHXi$x%><- zsozydr@#7RWNX=PkqdNpj{imOk8*<Ww}Q)uxkPn>S?vPBhy}#K<-Z(C_e5C$?3u{e zXFYDXq`jefLHQcSOemcVr3m=}C{h9fApw@F)i|YIS4gF``n6ns#=Dv~lp)Q>N`Lil z;cX569KJv|+W24i{@_8;e+Z2~+hD^wi;8M+g)JZsn_fsZykh~dcQj`)dBc2Ce_8X4 z>UrgT1)tc0l8EuiFodKrDUFGP+GW+2B7Rz{&*kzn-PXRMdPgCZ{_6M9Ta6xtxBxi% z_+My$j1!1&i{sB&KFpbw27}hBO2q=f-~wXWFHA3_sz*B(0M`!Z&!+6NY#GlaT-IDy z-B#XHh{>BMiP*QHL|RDy<FvY@o`fm&er5c$RzH=?FY!6uef1m4_Z9KeU;VS#&K>`X z3v^?R|B3r|92ofqzdTh~KFlR*^d_yH5fl~>6F;i6z?T1Vs5!E20dU=5(Ny}fecgP_ za8`Fk^R()g@&%|+Ox{E(F<1hubg#v47}YGQHWcyFT76G8zgr3SG}7s>ejC}^h~M}H zy0ORqgay12$L#oJ+wx)9aDrLmP*Qy=wtzTd`7ejEQQH;(*Y}r<XD&F`EQd`e^ye5s zB+gO3sGu}#Z{zM&NX(L6ZEG_P=w{S=m79v#X|293o8PngyRhIlmG8lZ#pHH7(_j5& zM}x!Ppar_|#{bv?-tYr<{!HU9Vata(v&yJXNYW%Tf>=PjZF(V9-HKQM+|XAxmNT2O zVn3L)Wjv)nue%BjB83?2_`X6+-UNmjRd!!ukA6(Es6L=Ps)&`=>Kk(TnOXWPHPT-x zKTz<Vf>IA{Pk;4yJ6jy_P9peG1dg+SH{$a<ej$Eh`ERy-7&dG$>MUxfGMyGm@0eb= zZ2@p&uX`kKI&Co-OP&*t8crvi*Iv~;rM>}A_ySZ&DSqC<2*7(t%hY5J7>1$1cxkP^ zDx2R8!yVm=npdH~c<HbHW=Dg!zp)GBe^kCYxO|v1D@;0gK`T!%q)%abA%~({VGDp8 zdn<<XCo|?<E4G74uwf9qzz9C0LgE~z!dGF*Z)H>@Hzai!k=9DHpk7lRLVCn#X|3LU z2!6rzSBN5fk~}`*QzAey{greAy6rr_;NP2iDu)UuvSw449c$JN^Oo_X{)|o}h{QQc zB*20SA7qw0>n&}jUj3++rL|ISDWavddQ&#PXN=DwebasQYfOP3zzar8fAxzm7{^0j zuBp3fuxKm?7VKPw0uPyv!V8|&T@*BU3!aed_&%kzvLdaOVNf>-1+uhOh$Gw6TFFi{ zCZ@mAu=H1mBGJ=d{p_I|^aUT#)LlJLGMYP;G3SB;EgR-d({b4FS=}W@@Ve?I%KO5O zk(SA8^O=KbtyKGA!#mSjy)0LMNZ)iz|2#ZkF#Q#?;qB?KwkHt#g6T+KuDRROUpkyO znK_fXl)P#?kaQ?<%XmV6I^i4$UPC0gj%a*a`8+M_wKXN}fd!A~rZn@2A_viv*`C(w 
z1^N7tz6t5Cm<1#K6`}~)Fr`rpC2#r>;_;XK)!gOjD;p{p&z??OaIV<bEbHdOuwn9o zA%ZuQ&t)O4m7S%vf(5g*R)``<YXvVDnHWrFqMcCsD_Af~e}yQrGyPSx1Y%z*qRm}3 zz3!pHv7D*&xs+uGDR3ik({#*miU?{&4c?$-(T-qREA13~AktdFhL7NWB$U?b=0oW3 zStOto(qE~B^jEOq$n;mz2{@lHO<$L{w|uB*1RiiEbup+w*f0p5)t%Q~=B+pjX|0m$ zEG>zCW3PS~y%;1`QzNYvY&blv)pgnYSo$lLzUjVNNPmSW!rO2t{nhuLFdoUv`vTsc z^1<SfyveNTw0Xo3C=gMEw_%DS0zu;Eth4!&I!xWLV5GHz0)@0z@PcAmtEV1@pYb_F zkbByfm<5XpBpVK;zxq_N0Z;e?Rr!UOTDtQk7D5;Ih;>h-{^bIi!o+-AUUHr*FC{N6 zFCAM3+cNVq^Rj}Q#cR)ZvHcmr_p|fFEhkUho-iKC%hwgU^NWKO<fD<~2}*Gj_v2n~ zQ!%FQN@z7UCe|f+E!DOvdxfJSx!mb?mbuDON>fTxOHxZLE0%6+liizK<*abIQ_50H z(@N5c)3Id~(Uw`rZ;}mo!XIcXt1K#`>gQBJKBD%ysJw^dheGqP=2hjCdQjVw;5W3H ze2GowhNOB+t<`I*v3cy(j;iD;%Q4HWrQh0tGV~3O+T<E%wX4!qky4&gp6X65ODjt& zO)E_=NiTWAcqA|1ROu-#FD@%W{d1~QOAOf-Ui+K~iu#iJ7f+p7p?Pjyv8<X>k7@>V zJ^Fy5!`NnOHTld<NsUPjNp+Sw%W2D+Wy&&W4OsVpVUxWc6unLlI99nTQ!22Pr?^wy zk_~vmA87V?%d4o~B@q;>pNA;2+ULYlikr9}v)6<|^W2tVzj9GEqaM=?>-u%Q`YuDK zVUMXD99t57mUEUvmSxMN1qI}-owjyc3m7&!>K%2USmX4#h$BcoVLXzT_jzk8tKAhu zaC-&$u<Gae`63^wfAJhXSZE&UK{ph8m5b^b^@L_rJ19^L82rW#%X!NYC~(0tVL<_T zED!FmwcA_l&GsfogFumSlx)Bg{y<A@eRWL*>Yp=$sQD|_osOt}ULc>}wnFoWCI^(O z$_4eTdQvl{8PV|+yDb+j$1UrKB6Ex&3dlo+9X7t`G%1m%So?(WNM63BuEFE2tfuPS z0>P*i<O`&9C+^4kk8OqKxuc48#hP-3F`UwjLy5z>LGU|iISd6ZS!OKb&>)pt4OsoQ zJpw~gA}A(d5ZH?Zr!AP}*d}c4n2aKBMxRSZxgeX_Cw6{D3cxJRJBY0pxl*H@U)PhA zL>XfnV_c!7p`ix#&nr;>oUeCT%qrFf>rNx4U@MT%FG`_#%&$A7K%sf%vT7bWoYG8a z#w?dCrz~5RgO*j;@H7Yt1>~(LARo?<rk>E_#5$n`#4$vvjpL3fi*3q6Ota)a;RDC; z&q(oW#lzkvu<O8C={pENmUv?JWV=R_flQztc7-O4_KTmQzk}6}_fYj|R=t}vh-#r? 
z_47~#c~M_d|6=_|)I#%!ChLeNt4b6}Cx+m6#&XoM0R=8w<^_U$`SSp?WGa5%YWGq5 z705VRfX89ff(Ep<8Ep_-f!T4#xIClY$>pPsRZt3jrcwA*ixfU6uL)-_!I72G&o3kh z)U%wObcGr`53U!k(2V-=O|^AYyOt`~Qq4=A;I``Lg@sbw#Qj*m5TnpMheGpE;Tq4- za@leg9`LZ`fMqXicn*CT(x6a49xC)hhhQiIuwn{XA(FU?f4|5GGoN0@?8n>$vsp<~ z2a4HeJd7i?QtCAb?UitPFpP2jumpjWIW`-AMJb*KSA!#cSFC<KYS((G{yAT_maiDf zYcLp3L_Sjgf?v!+^H9nrQ{g`43izF~oPY(dLxC%nMf9epEMtryUwE2S2yYl_e<B`T z(_ddE{Pnd=X&52=5J}DZxg4#{q_eQk6o5!M^h9lxdiBC`Q9uMAi*f#K2?C9nf{>1< z=|s_uYf$}eRR5*Qp-h4Bf@N+jZ^NrNSV0~=Ln&_JezyK2PN8|&F;gMqcM=wSNKoKB z;>aW;7%U)9DhzLb_}^ZRR(psKtrfe6RO)^%N8>OU6V*0YS{AIi2rVoRKRoKse)zpg zXb+i+k(oHeCV2iG2?CW!Vr*yE$OA>8{y8j|DX;=@I7CqNgixW#N9tej`&Z0;i&JPG zcD$+B0KfByAjd45Oo6LV;M@*^-PW$iM(dG4I0dcEV|3h5B4#*|g<V0a`xY)o?MyJ6 zbY>;BFO#Vx^7x&<MXvvia{j0Z0@Yw>LptaTksmA=3an(+yCVtmp0HSoo48+C{}H#) zJlV133izFd2i!soAs<LKJcl?E(jc?r?JZ8gB&cMLrdJR{+#Y|<Ecg!2IgY@Qqju>m z21BAI30fkCQZJg{QU^=S#^?3;aFp|t1;2(kyunSN+f)fB>%cr1g%%90T(EvT#Srp= zV(n?silM<$EY=eFNd3#!f5<2_4}KRAfluHr<S=?MYv{);p(isNMvxVd-`Rd0Y|Rc8 zW}qSY#P2iKzN_4R1k+kBN0p+r>P=9g$^z0-FX9)u{&(B?#ra)P69nprUt=SAK#C!} z0;3Qtk<U+Np?QHHdB9Dkz<o@C^DK@`Fh9r&$VY2G|IIt0Lg<Tc-#aL4a|$-x4VRds zO4ZsE%tn2p#;k~TeE56O*8grhKY75I2?B-s@vQziESOgy-Hp(yz>o$@q;x0l$Bc7X zh2|N*v$zL21`9sK6o@Pdf)|{^Jsrhj5Daf?=w~f_y#;2JLj0(GUqd;ft0<Q;jW|-x zWvfy(j)WwG(WDC<6aA;?>whE8{|l@p{U&Cy+(F@pO_W?1f)#t=I<gVA&5`xbQN5dd z;Ew9uB@e?-R-t+DyNDj(DR@A#;C1*wQXttddBGsT=&h!^5(7M;o%qoV=#MeS<R+d3 z?&C(8kIGi2X_9poJ@Wm>e}&H<3Sa*lJU>4_DQ1E|q5e75kLN9zSD<LaVt@K!_{l0X z&-k4~1UV73;59@MQXttd5gaEC#%{d^{vN#!e*W4gm}z$p3OoWE9)wG);6PCA)JoKE z5X(12TmKt8zgS=(YJ$MxvuPKcmy$0#F59orW~1$@xLL2V4avUjE3mZF=<}Y3ZZ6{P z>k@h({B{K_f*bdu@+s|C^w$j6jMq%p64_?HCT>aB#LaR|-WHo5O{Yb~<mX>~7xT7X zMLDN)C>u3{!GLCT{j!zmm{BkBBK^_)t7z+gh4cRs5$gL;_&e|sH!x#m3#EG|Q9h-e zCZ^V`6mDc6$vBaEGUc@Etn-}nT=Mzk3&|H87ae@zk4q^2fz5@o9*B~MU~R|YwN4>+ zo&`<fc^;9JHsX2__Vgk5K>3pTuI9G(Io+*<XZ6qOZy0VEo-saSyl%X1qU~wZ)AHz9 zuElS@n&<Z!#%w>rDzUrhWgJJ@s0ozrX{1gG=I}|pcz&_g|FZK_g!&<7MNmeEJKR@L 
zHfjT<dnR~-_0wf51qX6AvbHjgrXNqGvK6ON&bZDvg~A`n=dgijT9X}RJz!s$p#Xoi z1*<y_+ar>v!IEe)p16BMp%w03#T&|3RQJ{QG<UVnYwzf8>z+%vm2gXc6Wg-}Y`en` zaQFm>KY)jSSJ1~{*zgGANIh=#vL*0AL=x+NB@zTSj#ti=tQ4%}t!E#~+RQkTel+cP z8mW=CQ?4`6BqN#bP42LvtOxx23K$+lpM$4(OyJ0qJdNnf3a!AN-T}kc7{wPg#1SNQ zM3PuOw_E(A#^<E?Nv-ddI=*SdGv!_=CW265_Myzp%p)ina5U{$kRmDabVilaXJ=(S z5FeM&+g(G%T!%exLW@VBMxNsdj9&!{t#I!v-d4P+e4u<)^)j^hlIBIt3)&ZmqxSjT z;wLpeC)Iwb^}SNZHxGIzDrZX<L2zH*{+xrbWRy_J+{$3ZKSC5UD_jkZJvLU>1NW&5 zu;jf0LsDXpBFn!KEVKd@zNetV0<SAyQ@s+Tcu#Y0*Z4_{&q=jkYJIQN@xFoDv8t)^ zxzfd=)q=hG`%yv#mW;KyjG<7x0$R*+r_?6<?434N)&u>Ed8lwjU`W<{h^NT%M?@4_ z;XXiVMxNn=5XD{NCow)J)qbh<y;8^f`sznKlNB>%^CcmI6i<jDtQj7W6#&U9ORaH+ zmGuC@IrP=Yl6f5-gf)XA8$Z}qXay>KUtstKtQmz?RIltBKZ)@<srE~)@0B{frMF?I zh7>qkx=_3X4X);sCp?gIFgt=_R%u$btKLCH6@;=L=;6*ng^N((D()S~AHpLB@87l+ zT7e4RQ@o>Kg;rqA4|a{8#Q2<4`=!?RN*xb=gWl0<QXp(tAh?ea6fK!qb2cmML4{>t z!$MgP5S$TINR~_t_p|lqQ46gIDijK>>>59b@j1!%WAqB;ePgcgl{~(uaiDg@Gf_EJ z&TN<nBKH9A2}y^73Q<;}BE{=$WC{$H^}uKy5nO-@mzg)*2Y!FT9E2E!R-i(%WH8({ zelZJp@S`YF?U!2LD|tNl)eXafCn{##vt^X{eyK2$;6d~Z4~3WYuv0Q;p{xf8P78Lt z1cs~N7q`#~Q{j7xcXy4S#Q2<K`|<pZmG_OYzE|@2?xy~_p_<WZUV+q;5ebT(kW{#y zU7S&#TJ5S!ZgQ})9_YzL5)@R($tbi!D%?eWamVMR+Ap=fSL*oI?xw!_LGOqM#RH%~ zc)=1PxXd)TnlF0711L+u%6d3iSr1m4Cs@`48st6Uyk!CWWE5KAzPgM2;*QTrwVy^0 z;;rwMI=;25skfdypr}CJ3x){p&7(fy0alixG9|362W&W4)&t|6Q@kC6pR7VFyT(sq zd`_zU#4q0ZUa8|-yPA6&`r!d7hD=mWQxPQeWWqJb$C9E<R@MU++(?CGNP+0d1j~Ae z1Q|aWg;sWzALZJ5!~z~t>w9JJ>uH1sgasppR8Ara^F5hxg7Ac)vL3KtR#X8|gfDI+ zl=a{Vaxx38>>|Io<8xB&ms;N|b$n}}nfQ?hBn##hDB3XYiUmKoO8ih5N+|0gD6kVY z99Gr?BgT^gKgmKXyUH)l_?%SxrPlXK9p4)8Q3NIr$XhV4K)xqK8eE~0g!#k|Wj#<B z3S~WD!9fMWhQrEwSjHJY8HHAMmEX_tgy%c3VC4M881yBKp&r0<o-vF=w@bC3#{c;B z=h4^qN*&Mm^`QU=c|fvYQGt9<1_X;3LE@L;POD7uxN2d+vC4XkfuF2GE4#+eCDp?C z`-|i6mfS11{>SvSp0&VC=XmF^f6(9W@ALQid;H!0F17{yv~|)pc&&J-s@of=M?H!r zKh>e2Dipqsmf*(M33=gK&rqPdGtfy#`TZTSHpvE5agxs{<DFxK&3uXXiTL~E=3l#! 
zzFIt4F<d=>@(3u5zzR$RnmU_0oBg!;#7*DnY|m~hY%6UmZ>wsnZmVhYw$--Pwbf&5 zz}CpNrnWiHNaH|rudk=2yQQnOt2NNtiA~Iw9Qrr@Uyr?mxP4Gcek$e%u3@MjPdOg> zC70i8n$t;ZuKCQ#+_8e;;=$5>cV9(cWp7olr^nM>1HpL%wVkz{b^1<gTS`l2OI}N1 zOL0qCOL=QWYgKD?n+F@Q<Z0HmEtE}oMrsG^2O9gEdYe$z#)l0oyBW(6P05B$<5^K2 zrdU4W>mH^pG(N@`Ah-U<yNai@#}fD3mYlO`lbI7aqj|#xgGB?y{iS_neeS;U-ilt( z?5WZRlG+?CsV$kl9ACbV7?!k@`O2|X_^Mi}TB=)#V@=y)$#liIXT&>PH&EZ-(BIhC z)YH_>ZzvHIVkwDb4g#vjRG~+Fy~aI*P*e8EUw~ZweaPKbo!1>P9k8t07o0PxQ|S{~ z<Jlv5L-~UR1BKvN(pQSjt?x?QV{LV|r1~;_*}mLnF!U9HVJRrO8O2IprLW4jR6J8Y zSp|YM!?lBTDD=T7LWv?pfu$st1#Sw@+T57T@kp;X@X+%Z0(IUa9nB&CZilS=?cYEG z&abG?By1TEnD<(jP(p!9c}!+ZWR0O5%23{5{s3_-0>2JRo1@j$lIF|sWkH2`&H2qd zLq@TMr?^x!S2kTSQ8fmFBi^C90b*FkD>0m+WV4nqd$%7ZSbzyJkMMd64?QpAac&Fc z>3oEjTz=og6u4)V7f}*r69h3;1qD%Nozp2`m_Cs?o`te9tUQmt+tiWNZfi|$VFa7A zm<scn3z`d?i+p6wP$IVFqWRJpDpXNDN|oolf(myiQL;%(99i%lM3F4yWqIh=+nBcS z45k!}VB!{LBR%v>F27Ikc>j*#n(CbPM8aX?K@>z;wW2VOV}=-}f?>u)<`^g%x=ns_ zyA^ARoGmG&K}Il#ESVTK6*U(#pUC*Rrz(O3Yes5=D$$RgRGWw@^it_KP%#e_D= z`9j;z_cjs)-Npj?WrU&fM{ojUy`j1b{BNP&L+JN!WBSKE#SP^p^%?E4ghNEo%*yiE zXOnq`Oo<tW9;4s9$08_@CrF+U1eqlZ41LQ*3sk6rFO|Yd%`g>;cFbEdeB$=@?)ZHf zvkj)W0o-ZSz=mBk6#p=<cQA451x!3TgfF25Gg@sAb1*(Dm*00#)aX9<obsCLyyled zC`v;eH0?L<g9;aIa}I$atQq_|6Jf!mz+|6G&|qeBHrX*_SO6UsG%cfC%52$;dkQx^ z6BI}IO^hZyKm1>^ImB-U<`;kh%Q>1`OVe;=zutjey~JJTPC|kG2;_MW=|<)9<0m-X zQCx=t&+-HfAegw1ibJ7H*(?}N!<HF8Kk>7nALE1<Ol4M_*+c|Mg}Ke-4T;|z75QKa zWN}0w7#2<B@uLVr_quItR%0e4^biE!#ZB|e+%4_`CIwA#oizCp^9tp^<nsFzQ#@aX z(r-e67huCDbX)ojMv&}yg;dBG^88@IZK47Ng1(HVtS}W)i2y_pC~%S#2o170!ZQqu zCXy{#M(~`2@zFvUDu!18FKERyaG9@nF?s)0Osl$zUd#$NjMWU~h{Uv-LEcL~zpq2- zFLJk`z{{$0n$z0jJVE1r69`f?fePpBh8{y8(QocZB7RU{a<fa+V2~iGFvyR3F+4#& z5^q!C|JUA|xVLqk2i_Mu0fGPtZUA=@;J&XS36PSwXr*?E+C^%$Bx<o1%aUcwmb}Q0 zcgI_tCeG3(Q)iMk>3nUzX{XO*rk$oUNxvlRG-;dmZ~1=jxws%eg1C?<#}Xsl=aEQW z+yk87ec$t*d(OL<_F@Kp2N0dD_&$%Y6yHMFLJ$$k@~K$(eS}=Ri1X?x27>i?lQ-Ri zA<CAYIM|5;;Tx7$thcPshz>(=PGm6mBm|F%H9-o6pLa|j0U^jdrbLZ`N{fY{(&Or= 
zlm}22WL^S0b{cjhNeZ*zCl+`bzuk?55)Gk%jm%{5OwNv=@cS6Kc!eOQ^GM)!8tEl2 zCYYrm%AVhM3Dx_6JN2sN4ib3X@s#tj>tfDXgJ2#6kwW-Q!cT1QNL&J=rO5<`841y2 z;ioK!jDdy(b|mbGA%*7gzcUQ{2N9f|Xz&CQ*pC86P{AUU&}m8^r$Mp6SVlDXG7>10 z@ewAIb=(4v`Q=&L-yf1*?i)zDXz<G-fl7xjyDlUV6g?Jxxr(1?FcE-AprONY85Dv; zIzO=>#w<o31$iDxIhc`jLS~vV?jV|a7+S_LgSgBv^_}A1V;rK%i_e=dXe+WbBe}a( z`-c{{&|P2)GcxQ1>;Wu&>L(o0X0nw(aR62Yp|m&R5_kh0zKR_=pCBkBVS=B`VbmDN zJSJ8W9|h+L>im=ifuNd4$L&ZeLAvjP;3PAYJ@i2+9*XxvXEjAA>jrghZ`$)F{Pt=k zb~Ju1Q3<=$T{@8Z2{TNIFv&5*a!2AD$afHu9NF{xV_XNs*1x5_W_d|TAUez_h$YTB z6W8JsUihtJ1~}#$^D7OGOJKA#m0(<d<0D{v9-YRHbgS-5g^mI2>n14fq8B<T>FGhF zZUX_4eLTTUq&;uK?+%RxEK15pF&<l>Bpef;@G>8O6L20zWr*p`ChXKsq*dOrpW{Hb z&u`*j{B6dH_X!htow9hh83S*i!;FGri4z13elh~0!6Jb&lNKG0kAk`kQrd&@FTqcr zN9z)Z9hslY#Jg`{g{qYq@*4WM8H&=&J*pSS_`i?AzIuuCCj8#Tp?+NE%s$+Rx{>8E z?#I;pC&=o;79nWGD7SDIN703i{An8;`3~9h`wYkWud|Bqj?!R}z?-(~N{6u{SYllU z6+dP$ir;FnAxNOiV?>9O2<rR{3zC>eCkPs&U{d#sunV0qT#ckjAGcyGcQWqngK9Mk zc<Ik4HIVMd4+*<D3YL7$>VZ%rPl3ZqmN#&p_=JymI+ozrYFVY0oZPM4ttk2~6nRHe ztYmG$xPQVonYYf%k-J_$;E-}$s2FZ;?5-LJK>-h#)dgz`*5t4CkNL;_<Gyj<L>`Bi zBlW7Ke~b0h4F+RHv?Hw~wJ_;bNA4uYy4-asea>fQ(o7Erk%BQs^^MAQ?1iVk{+I52 zN&}03`3m}U6>;tWjy;EfrW&No_=HJqgFM~cz;r}G$74+nnqG&6my{Or9VWX5Yled( zMWX?|AB<ah;~YYRe&~ImAM3wl+wWf!?5S>vM4D>aDmu%%gT3s;4HT~`8Y~<N40A*a zV)?P9iKU*|RNviN-&NIHHc&c<S79^|D;UWi&0p<b?O)@QW36wkNb!Om`;ZYyg&^O5 z1Syzc3{PJFTYP>&0}FrsDkIYK+6it(GX$M3^jQA{VHnfaNcpfA39OSv?rK@yvGKNY z^&NVcYRCEp>xZhMp;*a?E>%c_$b)dyk2uYbv~$j#dE<e>U{^&;O=CFRP}5Y^Qqfl4 z5$a@5v!|rDq>r89{!~g!|F%66ZR}`iXs_+8>?!LF_H!o&*%jqJ6mTE76GC&;pU8iZ z=(iUMs9`kkK#zps!u7v}=MxvE|KSf9k?v}jFb5;dn)5Z6gbo&is(~0V2+7lltZ#{# zl3MfX*;w9D#oI|~F)-9JR5w&PTox@IDIOu?#b`j+DWSMlILg0yV_(;9SdY54=T78D zi~53{<*ijsHTBe_u5GMts%)ufEpH1!GSsy+vD7oW!(;XRjqS~iZFL>hofX|>y`jER zp(ygKB)UM6Ow(%#v#&$~T6oszSF`8T^}p2fX&OlW)7N<Vc!Ot;6AX5%)apx)gOM<R z6xMN<+Ku(CISA{k3k5O~x<DP`J~j*vwGY(~)eKig%c4PD3ZWRlZZa~h5n0rw@msP= zYd_%Hnmg$q4Gb3d1Ut*xDx0etY9mxuu5GAptWq3PDJ}imba;IvTHn*yMoslddu>Nm 
zS4DSuPpCI&P>gd-?rY*_x9VRlqkS9v7O(#$pHEzr{s&pVdPlp>>cK&DnBZZ+xa~GK zf>w%*MdbW@^!@65XeS@egN+SkCSAhNaOY6dP#A(0(X!zn7P5G>NHVXC1V&``p(w)f zJFHPWWj*fL>)Mh#=^G_=Uw=tYu%oQ4qPeoMx}h#oR~xRauW4ACSn8R5;Z5Pyk%5M; z##SnVN80MzYlLA}c~4nSP$)7Qh%~0}HAA^1LZE+&-{SSZ`SVLOVEp-OYEJtK8hR3A zu5SYfL2QZeDU<cB_j$V0Lw`48M+oi2%%}w(K4o~gd$?t&ez=ZNPzYjQS+z@Is3_*I z`A^)XXDxHq8OI*il!r|=e=I=azLM@>N2s;DrLw82p++)Urczq^xBcNQ;c+Ceub~ry zO)aExL<*}rDurQpSx*RxFkJfiR}vMJ@SH2_SLxROjPnMlCH@yaA27l_$Lhgh%y|@% zu0#esc880Rut3%qK19P`V4wH|Q_Xdl6+K%5DG;c}FdXd}ZX0SG4iD7|!I&Z_QaEBT zEF39-;b+{1JKAN-N&7*^F4txj3D){Y3Wfp$q?_v!hGngl&6N#ROA|{yb0EAeye={t zSykWD(9zgR<v0ko)ru6RFkGVB@@KJ?MT%eg^*{Z5h6d!6&gV@u_$m@O4WCt5;tC;X zbGj@ZeSPa~0#|Pke13!}=Qx&F0uzf7P@5Kw_D0)>nw13BBnSp|DTE<U4`NkDK4YA{ zt6fI|kJ|S+cjRnV8%3l3;lQc@nrtv+|0BEOKzK)ZLwGIQA^r8;5G3PFOG9g<wXVIk zz1m>dRhBscj--0XwEidOO<p>mcNCv<iq9B^y^$%3U(2;Qow7djF>}CI)%wgy9@(bQ za#?1S@cle38tWVGM1vdHAFE|$sZwMx8XAV6NTJA~Jb{h=XXN~Ko@Fjsp0FKZ6pRzh zTkVe`h5e$*ieYy2JnIjJcQOKwM@Axp=x|qKyH2nLf+B@5tW<Jn(tRiVGOhoau3+hW zjP<LN3<aB+7<V%{DN_XX^{p?{_d5h<8==!9%w?J(SPH-X;jU=gP!oIHb?ERsK}JGd z3S~50{eAAkTS)meB=ETXkYn1l9T{Bj9rrR4>QY#QCYNN-Z&!FKJQ-fY7&uVhD+DF$ zQ47gfTf<0UjjqY&`6nO9`k%pPj@z*d9hRz&Qc=pU)(1afgnSbTy~#ZB6n1($?})-! 
zQb@!`28O$%ZNtq-U?f^QRAUf~g~XN^QW*O_tEcZODZglW!a8f;@7V3y7AMF^h!hS6 z`UCyN+4I{S-WpyX9*e|~z<x%-&QyX*3Tw>ynF#3TeOmbS;M>zG5~%AiE9@1b#fl($ ztfWx(FW*<Az*WmR>q*-Y`#$W5BA7ejUF#e5N0k(2%kNNl4;Ex&cp|bI30#E^%P80; zGRS<UUPeMCg`ao-m)m}9bQKn)6Mnh`*2yTS<}!+)J`y5@vVW<E|39sX1Rh0)_d0gE zwu%f2K}NzDQkXrz>F{<&K+)h>WC#gFhm#315~9a#%kAIieZAs0+=&LaFpEI~bsbJ2 zD0*z{U%te%&SjB6>x}(?V_FEhH$jm33?t!)tlwvK912fkfv1!PkLnVL4yO{7k?`~G z|8m=pjSgagyU^eko<n2|tV>H!53qQLIm|8Q>K84it;g&~7zOt@cjyFVM#!8nJ9?h= zhr)ZYz+0HZOvEK{puUfp45Q$@7Nf_@?ce8p{fOe%9qovT28#q@MU)OJORSHBjD&x} z{-wCLzsgMfI<uHLN-v<pjDmYy+Y<!GWc@y?<4|~Cco+OOqQPsK#YB}A>17m55LDLq z^X~t0+lSvsKNh4z*I<!ArNc4`CKJ^6f5pzf$Sm+FlIotZ9=FZ#G%_ugI8Ja~*8C1D ze%lxU*D(uRt;Rr+Kp6#-2`;yPpZE16qeG+cV+3rEi3T@{cTYlx=L!BP`<JqM|0X-l zx0p#Hfh03MMiGz$l(N7Q#|dU9eV+BgZ#NcXDz3pJeC(ji1Z5@z!8TosANu_FiXYFw zU82EaLktPjby&9}XtC`7eu@Qtn-TB@o&~Qc2|U3}@Q{px5ZtN=-VeV?W`Vi{t`a*e zqo7z~Mey^MpXIjyJhQs5v$OVH_Sd9P==UkO`UAr8{)CiBe@faXIex_P7g-*)+67iK zKBCQU#mk_4rds{JYFXOVTRz9C+Bf-BIV+#1f0lgGk4Tne9RFpx-TU0H-*11?@;p}I z9iCC7VCbjxTZ#~UhrWH6J@Y^2_>3c4ex8b9+kT!CZ?NY0I<3CX)3+)B$^6@BS=yfU z-LSk0L%rOve4>2TXX+jrcTu=L?60l<S=V{n(=dFQ{=Q9*-bZ>qf#lce@i$eU|B#)_ ztob!J`qwyjT93j|FEV<UcHhEsOS>O)kA&vewb`PJuA8=(EN^h;L;gyMP4#Ji2ub74 z$$fj+UmHuO^NzXZ>{qNeEVnE#B8k%5cNE7D>90=m)2#WqyQ_jddE>6__5;@AmN|TK zu4Aunb9aoE-^GXFLv1!V=e_ED&i119HOreY{2=MGzKI5Wn|mf&@UXwuSMCTL^d8SS z>v+=kwDqR-mgNo>_Dig|3PtJb`}FOj?D-9Zs#^ksxf8Cfj=k0y%SjksV#a*~tNj8x zeOGaOQ=2V6?SIn!wBtG39qV0-A))#mGxS}g|6zYk)^07C&Oe-cGUtr*qW!Avx?*_S zBJ!eBd|OFlw)*QHE{#NjUHQ@6N!J#~9{WM-QBsc2u|IQ_S<f?y;)~p&*~%vZ=e<{Q zZa8j1@D-%+4Mq=LHa|k2KZ)!4!_IFaGF7&#aKCRx5j^j>V!vj)VSN@kyloL_c@2Gj zgIQnJ{GvroH5DyIeg07hPC0hj_t}o1$&@ODp_JMaeZB?1Q$=&WOB4-%27<S(FI!)= zypAU83_n1_Wvu+<VaKP9H*TujR<gI?pzmnzshl&e^NvgQr_khQ6vG!RFIrwf5?{-n z-$-FwMNMO{Gcf2I^-Q`pJ9jwt+7D4Q;AEVk6nuPIo2@!oayI{>_X<VBZwf)%OV+zc z;p>*S;tb!1;YX~bZ^J*}ZDnJ=idFjqEREP7<7g||%gV5BgXFxNO+IgS1K!Y%)BbDd z*oR2`;`X}>W*EEQQ#RFT-}r3#f4q4^&6d#aqJ8;?@{Z-6a-VUXb3W;~iWFYg8CqYk 
z+(DbOmcP{no#o+hd24Y`!BE~B&t%SKitSC?_alWzVR#x{zQBk$TYaMROu_lQOP;4F zc@06k5PStG{F3F3B!=QTwvE~POX$FR%F5`vEj<-lmgW-sUS#?%jD@&ZyssvaFynU| zSE^a==Zk1ry7q118rGzrWh@gLXN)Wc=dWqwt?TQi%6F7Z2lo4q<Q-Q8=bRTER~$$o znylo|`n)A;erpQ4gAFxRO~KB>et*on*0T<R+a0?p`f<oQW2NLH1q9}_*_so<(}i<> zk-%#dY<^Y|d{OjRN#UC?6#FGFJ9~x)(f((l_$4(L`m*YaXgEGZjRDIzK2%qcqq|Ue z2b(AQug{ju_&pE32)-G!1pTLNf1Orc4dc^Tz!%l0%M8zG{S$4xeO+X8)wa-{qJ8M_ zjQ51+l>4;ntn-5NvMz-%yooMn&o95Xw5h75zO1dJCm;n!#}vVB6rI^?KVTOrJZ?E< znXNe<dZOrT{)N0txmVHQXCU}IGAIP^!ce5}O<3tKEBjp=*q=Se_<4(d7;~ZbG<+G? zYNh$?Deho@_7WEAwwgb@sjS|Y&G4bQPt&p=SCA#z--#DdUNx>(2k;`2Pk)JP3|ses zHr6rOu(^6m+0K&bLK6D<jv54A=be`vSCtfsCTGoWt-r6Nr6L@zYz}r5^%V^HMj^N! z8QezcBlOsQ$R<(<zp_)sXA92zp7dT39fsg@34-XclEOFGqSjv)+)-j^@)07fF00v) zSn=01HWat8+2hN{p=cdGDkqTBYwU*0j8kl%+3Owd#w_&2#WijF;@#DxS*zJG+J+|2 zA?G)liRts%H?^_OiN;NJQx)5Tdy4i34)_n}9n%TsoK+(sdMsn1Eo**b{(<7Q^7`89 z1|+aMFyJ4~TkRcJ1h+zP4|;sie%Lx&d%XNq$(g`8zb=8I!#Cpu?^tCd#F`+5$^EZj z*Km^ku_I{HMXZp_hEVKXY&eFko_ux`NALnV$UgXadMPu`JIR-){aJhqo<#N}ggtHh z2N3Kbt<26|+W_y|iyzD!<FVfQ%i37?MDxb*RMpn9T}a@*f&=~;MnR>;5M(42Jw^(% z<~QzNRn!q`tgekzAc4ib=y24##vq6_-h&iUz8ZcfOBn(4FX$484kr;*wnX$ewbMbo zE5wz150Y>SN|!afho<rA?WTu0`Rv-RLJD`H=_k?k%jl-rD>?qxf=f-B_9d*VMay?X zu$LdVbB9jAP+Gq!{8~0d(BSeN!9B%$1Ef)76g-}LQpQ0=@B(^##gQ#P&$w@}urt_H zSyxxt7-}u)3iKBYi5<zsHPyX|RW57^QV72a_!XTkIOn_Qy(C5J(P6P8x)$Go;LAwi z!u}WGm^VN#wjv3_5LBYUQ6@TtPU`p6;X}0sAAxD4^d!ABdo7IrP1C+;usBcVc=@Lb z_u<XlKWY6O<Ly|_MC<zc&9zgN+d{ibrm-R<hGi5q2%d4xsgdv=_zeZROIs=;XmEK; zumda7?;pmF%o7w_GFyAB;$-lN;<LI2d#_+c&|zgq41!`y7WY2@qb@XcJ(93b_=$#% zDc3Wf%bAY{S|>Qysk1YSFJJJN#qqz3+TYL38pUa-RI^Dnx{i43`x!S+X=A<PZR=zN ztlCn(QzTI7@Dbm!6oTiRS@WC78xHi8w3ati*F`FuL;{QYM2E2>YjY>#GPun#TX(GT zWaw1!nZh}2h>}3hHPK;3@D`Tj1=}60@q+HEU&W2P4NYB(U?2!G0!qgF7F;fJa~$FX zEoaA&lU?a9Px+hSADJBgyQux047+{YHr(_J`0RG(Q{%|^Zax}*?UPL#!)S2D_Rt=Y zz`%Z)$zVs~GRT~I&UH`xdP~|uja7BERSjjWNMJWQJm`;NN7i}_g2L}a=v2w+LNr)x zh%SLjhi}SkFitR~vx#5)jP*^7MkB&cMnG}uk@)FitH*86#{;dI4v!**X0P<|zbWk> 
z1k^@^t_`Q6Tsz*A;)FlI2W~{h$NI)Qupp7mHB*(?5F}7p5oR*8dB?pc^l=b^+4J*8 z3;IerLQR$7T1h&lB(PWMa11+Q5LEoiPU$0{vLTFto~NYby0Rk%!KM12hp<W2LB((0 zf3jK&*xW*}f&sA`p)`9f75|&k{xl=3gcWjs#6h|cf-T&i5$+HCI^icHAR3Gf5eY2Z zj~!N{V1nR1@Qda57k7qQD(Y+M!j-xNVnsxUWfYY8G+GS5suRMG5l}Q(B=E9~frbvp zWpL^Kd$E>v!f)PxvIpaW+mt=W1Hp2g@6rvZGvdFNKK?hUeYroA%wpOY8uB$;4sM4+ zuot1+)-v6+zwtoBA$1&X;HWo_BlR2+{dmuJqv&FAuKaZ66P2f`PFA0&IZ<=G=6J1s z9IG>ql5mVY$?fc|O|!$cKYW1WpgInPjpK0mu%vX$>d4f6sRyX|iT~tg2E0LrraH!X zufyeGtd~FpvzPAwmp=YCsr{WO>nI~cJx*B!F?f`Velrrjp?h1$uC_fbdz<$)?-!Dd zdh$t$u1Y?>X?fLk(|gtbWZ-<!T*=weGr`lLC(2HhpDI6D!BKg#@<io{suNY7p1S(( zNHjc#XT~P>$G737v6G$h-S`VlbL@qx9NE#E4QK8@5vKG0laovhdl(k05RY7^D;I+1 zX^Dj2((%8vZ^XCD{aJ+sstgMjAWALACT+5BbN9B+o$b5Z_K><$9SM@j{JyKbXM4%{ zocn3-72l=&iv{Nk=Zel2pDj64db;#<@QKi=(5W&xJcGg3`o@m>ei*I}kFz_z0ghAn zs%=9mb(TW&!SI9by#M3|hV)fPpo-wPc$`ilX!g?mBMtw_H1WTw_otf?q8uaaw6QK! z3c-nijlEkSxP$b&$YJZW&M=9jE|K`}THdg|=y)#YhUc31O5P>^#rz8e=L6?-j>Tt6 z;0Q@itf;H1rJ=FCz9%w(SH@^~Efgn_#*K>O7A2G09~eKg^*J+sX7Rmd*N+cw=-=G4 zwR>CFc0G2lZFed|MRH%W<bQeB`kL*I<Cg0g_tT!IgrV<A{{<+X6N-grMIMXYs|$L| z+G{Df*VfQg-xpbhU%^Ot4f;HxC~k;zd{FXd3uj)RGwZ)ue6QK{<3sBPHua?tL=tzm zG9soNKVWz9L(7}iyNck=oa^puxmR_DzKeb&vEUpW=L+3x@&`&gE8FTI*xJxp--9L( zB8Rca>I6m6=?BG+DMZHgIkWzo#rK+BKQ=VEYD52~-l=XPMv+0%XzPcJh)D#+|L-g0 z5PRMFitV=Jmh-0ThDf0>^j<;|VJP}s;9lz|IH|0ox;3H*cGP#{6*LfMI0{42=ZOc$ z&t!ehtp8^5y=K>s4NnfPmux>W3d+%$AQ&GJbvgV~c4NQBOzACFm0nf^pLIPWQV7E< zafVO&F8JMJzM;b2U}r^pZA+xN2^s7_k9+ZJSrr*layX(S@qx*o$@-jG|IOli&8{Dd zPN2aX`{O#S2x3c;4>cl6{)11k^zT~Uu)b=0$u2SoL6Jhy<OD<axGzdTNOxIBRa;$4 zJ=I5B8rvb*gA^t+d|>=c*5}OnZx-KccD>>!8Vo_D!$Q!oCG!;9e~5R=H`$vZejgpa zs|Y^td`|RONukd0ve!M47b_qrDb!WbUeg**5bSK|MhbOJ9>S(PFn$s}m|=a+tp8^5 zy=K>s#jqfQ>xAFD4yTTUisFy(MfnzU%@2{lH__o&Y%d~%ae}%O%4nEN_Mwsd!J@v> z?(&YR_F4!wP|8|lQ1n=rLKzJo7(YFJddc-Uv;LdK_nKX=_zkUFg$8pdJFJg`sUu;> zkMN9={0ASiyCo9%Dx;u5Q0cKqA)_JLheq>Na3&JiQQcPC8i^BZQv~BuXrwhawIgQV zU#*{JSf4Yy-dMj{D!$k3dUE|qrcvF7B#(j%M#3NAr}syUfS)J{e9Q8sd4i7T^pVhY 
zov{#ps|f@_gM(e=9hK;CT}v`SU5|So96wqA%dkFY)_=43UbE}rH#{i>#fGHnaLP#d zXYl(jBj8urR}~3-10BAL4Binf7J@PoG8W>755EFJTT6S&kU-I4?1&+Q=y9jg;|Hd{ z!Y{-6oLT?P;(N`m7k*e^G+5b?B?<l+{-fVz4k+^e0b}4>N{3&uz7!{jHHP3bF8HnS z5zM8+tCa+b4r51}8`U`2rU-UENcofX(<RsE%=$0u2btn~&8~-^%wXm<c%cr9En!ah zXW9>#!F)Te!6JcgT1AInu_Y6HR_7OkpJ;HmE`jLqB7zT4{tmO1wB-7nS^xF*tEJ<6 z&8~-^Sm0z0Hl9WdJEBHH;rGW_kZ+*DA2W*)3Dk91C-?$dEHX&;ur<C|{*VNxm-d8o z36xO~J2EeWZK}vuHVn<ae|&vu$@Mw2>-G56rQ>_eu214OuffSPnK(t^_Zb%Cn@WQ} zWDI=A@+La0kAg3v#X3Rwt@VxMs}Ya{2}%O7BIvN#k$HmIu#@+geSfp{IkW4{;(N`m zho5eNl>{maB4ePwqL3tk{{>&}&+w=H7IT<SupvmGCK9N0SmrWug2FFv6n-)S_QoX; zD-zdXnNO>6@PY9&S)ViOzwpZx-)nZg;-_2S#4J!zoPQco6oudSlm+=zY48Vldl?eQ zC@7X#5hVNCSRTQwY6R><0=r}kL;@28Wj@{1^uYKnvw+7k%ZF&$*8k|aI=FXuTWoV| z<H-6EDUx86oRG2;NrflYy34y82iu9s?v3@w21Ztm3~~&O49Ab?h#avIsvdL?55|UL z@=I)F$(Pyv$8wnFILOq-GRuc#o&SEMX|QXsZ)jj>aCm5VC^{UC#yF(!qsB2Z%AwtI z?DKCe-B`K4b}};2INm(oI^I6sG1fWO&CxT~%hAWty0v?K-{io=AXQeR&WcfEMX#<h z|8J(gt8>dN;IYi|AzAZVTQXcTR8NRz`%ot-V|t>!!+pd3!vn(u(N)n^vB4N(Mkw8o z_GQbH&e^=_z|NAbWmA=#YBtoZkF0B$Y@BGGXqjl8XcL;_o#Rc@9oxDo#?rrGKow&d zn&6-u3oKPRme^QK4K1epmRZ1KndL*YZ0moF7mNg>RixReC&;j6sBNg7ki)KMcT`9U zOQG2x?PoZC-ge$~%zH3@x^QR7_RyA!sjAJ@o9i}(H%2zpuWww}G}#Qx#{F%(I=6Li z>Dk=3seeQNdML7tp(aw3$t+FzEwg~fGRuc#Eq{7hl;KLGka85mjYuLChgDe=!v7UX z!Z>@GV!vv6+CG<aJom6~f5G0O-6cCh+scIERQ2Xso#V#(jg1FLdb$&WTYEP5iorq> zDGOvU)Jd)znl$CN%mN<EEFY3Jze(Skz(@&6#)M#<V%RX;q$suu#h6|er3(}Nj&{p> z#c?+0WbRDf!Tf!J>7qR)JA>QHww7<HR1|CA*l>`fr@K3Lkn(wJ_f+pDq)?}rEQwGw z<(KThXZq&FZ%9oaxcI!;U$V{Lxh`*Q0Vx7YAxK#i62+*pD8mNBmLbZb#GuGS$_LuZ zmK(Mwoo66;H1Cjqf5E;&VYn-ZB$jWfP!t;uHSbg9E4Hho*}5JZ43$3X+HCgymHWjq z3V19`A87h{3*L}5KQBcAB$;!NRDnXUMi@rPzXZcZ#zfKQmf=<=iC-bX>@Dll_KOfa zm3u7ji0@$j{=nYC>Eb;lyMsGJN)j6mHIe*T6%Wt}Dk)Sln4l<Q;)1Wc_}9e_e2d?c z;V-)T+kKZ0L4U=}>Hp6EuS{Rk-^WIzEDD8<3en+Uv@8a};c5s<fg^*VP;6nD>kkOU zpiG19s^h%tjQdpXaqo=pkpDozzQEohVYnxVBsLtD@(c0u6-K(<WGReO%+PiHl(7Js z?7(Mw(fk|Ir4O7xYyMNV{3weeg^ff4ONT>>AVoby3Ww_`w3T4UGS{a(n!JhxUbkP? 
z33^U=kLDfrAIKL?jx&Vc0k%i=@)cr7^uw?wIz9e1?Lm46KGVyLH!QP!NY?TfFN+c& z!KWfPEHWqrl@yXNfjuFP29_DV#ltb_wvfPUj!VvSB7+b-mN$bQAHbf-XgFPexba|8 z`HCch-CGjYBpZH9Iq(^PO#V-Hf9u5`=pM_5+!Mbszi6<OMIoJdTn3{;P^6IZJi-uu z-)4E?E$tN~@P_Rv$CJ)E*J&ZB^w^NXeelDAhy=!q2Z$v$4)j>fX|m(Dlmnj$IsLTZ zUUi@4L+*i}D(jIyYDi!(suL7jq6iKN!7$4VQps4#43MH6D{|Fw(RnsO@TgC$37RbY z#DXN1uh8vCT#t=8jVXjo4t%B;C%eDhSNV|a`NhkktX2}JM#11Pso2qCouF<_YT@@C zEC_`{n8iq$1}PMB!F4uAWbh<1D0+-F5q_$GPjc}9!;a_-l{GO%klBIH^zwQ4xBDs| zau57eSr5@*T>_O3&l6N5A$m+A4M`O9J{tV0lECZe@MY)u6oRupTwLKN7KDR&wC?aQ zKME$K&=i2F17GGAmYF{AUiqoQMluE}36xRLkU`y+R4abpU`qxK)?QK)C^{^5WL^f3 zdym#15%*THARP1MD;5&mW*Pu&7QmYspH|ey?n#CK;(~5GG&2MMXFY#mVVN}A6gEPG zbqN$Zyoey_Vx&-rqzrvSyK8xYG4KXDd{x;IgCK=M;{3#dhy>0%+$7r(q>%1s4gfX_ z;5FgbY_XR>aLEATuuaS8V3<8WDeFNha?xPMK%=w~iBgkfFj@saDHQSv8zgU1DC9-G zPzXAFg;2UV*BLbqp2|HDQSzr-5Mv%4*WzS5BK$H30GkHzW(G+3ux1J1L9;{vG46#@ zKV``P;B24Y^s*?5AEn78CGTpI%PI*JOKfOyh2r;ZEQl%;!YoFYz^Cmp3SJ-(hcXvf zl2bZAV+2f*z+D|l1mnG5Isn)-fHyNg6+SGLDV7KzR_=vDaESomd*Nr4^)MumnRJ4n zl*toYO!_6gP{;?2fO@GP3WZp&D;>Us9f=c+^J5i8Ka4VPN(Oh%7lTXm-a=$Tx&U3% z0Nz9kX@7`^4-3Jx0mO0lLLrzo0J!{R$A^}$v$30;0Lw^HHJ|?KM->-r*`&4lVYZ19 zeD7JZNq?1^04rU9u4w>oT5O3w#>0m-j}?Mx0*K@8g))Yu4FGPO^<Qz_wtdMW=@BH2 z(tMgKBaM<ufYm(g-|swUHA`#t-E8?4-Eh5Xd(UE){_0N>9o2tW9H5)L{+Bj@cR>pa zeocfAYdLlZE(#z{xEE^V+)on#+`hZ?c-}?Vb^8m}S4bWr39$53N|KN&DFIe@V{lF0 zPS;WUIqOrFXR-L{(^_Sn=qY&7d(-)n?G5X@mQ3ldzPF&e8nH2{0l=mKyo*{$`9o6p zu!OcP3Ls8$FVqN{Srh==wYBm<;Dq;l&Qp$OY|j&LO9~d|zIp;IN!KAsPrj~wI#5;J z6By6k?mFN&Zk@ASwxmyM^-;F+S3c*z>VD4gqU|;7n@aZQQ*9-uzxr-UXZ2rF19TU! z|D~Q!(?If{Qo@I&{>*{^V&R)`FVsjhu^<4rcXRFTlEe8YbI;{mc0O&t3BwmjNJaV! 
zMbSur_3>~h(h?lZpUB<n+UuCHpM>Eh%i^?FZ)eM|@^s)y@6$QYIqul*T3@%m1;hFD zEy=Q<+*#x2<N)1t>wn4Tjcb<pe?j=L=E#9yQULJ+_rgU1z<nF*ca-fbob{c|oy)o8 ze9Cb>&hQlrscf*Il2Qq2jFvV<D!PiI{>j`eu04)}_M^5ZV0h7zn%3%W*8H9*I`6yU zx#7A6LDFAY->|$5LrKkMB&m@!LZ2?t?fC%R^y`1~=a*=}`16ABVeE+A4Z%bJ@dEck zu_Gw~zys?Ww^r^hIaqKMf@gCsx~@2`Id0fUfMva7eHn(L&rlpGYOASf3HAj>eUqLk z*DmKi#}WH+>l4;<Fq}_o^+LA%s!o;6<zMn%b3X&Y+qRc&uOfwS#u?71zxt}t&qe<m z0lG`A{~6~OU6}fZ)b&%shc&lN5sU{A>wJ^k3#s{$900s(UGr4U&d}bX!}-VZPUW6) zUvOPP3ZJ$=6K6<TDxtW#u(KlESl(GYm_M2~>DipK13f-upS7KgGt|>s-N=?-^~ut+ z1sC(Kc%II=>4c#DCG_|;E9tM|42>i|sohQfO%KqO^_PqRyvdg={yBO5G#);TCANEz zK@~tO=NGsaN)@w20Px^s>!!Nx6?;nd2WI@o;{-1`uOfxlZ92o-l8%M6Oyv#Xs@7m{ z;gElgS0^~_*iRB|(pp(hqstd8mq?6~WuP_bo6Z!T_h0fp<-YDR2uj+VFQLiv3_n=h z<D|bxfNrMsKNY~6^!_wI8tbPi;lrBCVe?=s2q4zaFK{nR3IHCQY~L8!TD2=QU39Qu z)_2@{%5%nj&h?~Im%<xyhA*rs=m|E})HIcKmh=Z={<Yq9$l!L@ZpS{PaK=trD=TTO zEElrp7dl-umyZPIUUT1YJ*#B!MVm^0r8N01i=@9IK~u(;NAb%Tz?<><rNW1q%h+-u z7)s`w2>;bcwi6Ek9-8Rb*f3SSvusbvzQV)#M|~$Ccv=X$F2|+thK&SRYxDa`TdKnK zm2JVEqE-1LzA;5`n~Sto4$@j-PmT-2toa=;BY8=|1>Ys_Rdo0n2tJP%3qh6s3MnLo z7!1u`4`lsM$}gvc4{IKaQxROky-@0JsQ}=i@y-p6o9nh!><&&B?GGIBAIsAT&S6W0 zpp1oRGHIDg+A8Y9)y-ucC4GS*|7hNrcRez=%|%)(^jMd|6WQ`p>6^|5&ikK40!4>O zf0ZDp(qAEkl0xhue+`d!Z)hgazG@c|xVI1;7J|7VgJ*LtxGpI@W-KHvQ*lRmV{L6i zMO(1DcpxzBC#{ttxYf1Oxd%Nat(ATDUig)sDLhB&4Us_6VbWhIf}|c|Mx)YSS<MrG zeZhQ($GX?IY>Z4*?<hwCi}wW%p~I56L??J2f=UX<{ewlFp(aU+(;Vt3A+1$G6gx7W ziwwHAx^^%poJNliWzSEgZ<<4cNx+5#KAoco;)Z_PZV)61+ZT+-^y7!ede*f{`jTyx zyGWn2w`hOC5k>H1oS;4uUKuAWwL92S8L6voEN=~VVMR!5g&moI;3f!)E!pGPe=q!k zr&ao<iz<1}HFQ|)2+5n`1SS2|tEL_Jg5MAw>z!=d$OyQlYNsxNg$MIz{33%o!81AM zDDQiD!be)B($<OwG`O+_3G6BAM~6pz^903~?8%m&O5cP9iEA+Fuapjp9We-sEqUb& z#$)>N@SAAg&_s%i>g^S}1d0yN`i_x)C{EBl=ei(itw_sME{W-?nn=skg%uGU9?e_J zY*3d$2;LJv5&)9G?!4cSK+$0mQ8Jf#-thuj3_;Tle8F#sj`dA+Y)}cD(BQH?G6s@< zNM<rA1Sj&M0hN}iDqL67P}zzEb|ZnLwZe|f6Wn?a{8aiONnodJh%SMm!=%4T5WMpR z<1zhs_%VY~37oc6i4D;cI33JCqI8(Kj2Z{eOn74j{UxMjs;sZ8C9Rc`K+;;F!!hhg 
zf?)Ris*Lna!tY5n29o|tje<!8O*`-fzacu-Ki;{%bz{Tkx~b}I6$uFx9Zn_)Khj!t zg%|;AYQt3x6(WIM#iX@Dhexm@GM`3^?~R{W5T(H~iy{4$q_7hmW)zIe;1`U?^y4So z<L(LfB*!|AtV$PUERmwsRd|{`-R=SRusiA=aj$l-;aKY)<8#J6<DQB55iaTR40>W7 zIj^6caO>wL$#G{K>qwyVaKBcUP~_LYCT}ffV2tUUen?OBKCJV|kv&)Kf^FE}7_6!e zMM|5B+6p@hx(j;p`}_m`RsKQWkZ(9|I4|mrdBd^#I*d?JZ^1zRpnu30^~Lf=@<zR@ zIo8OLyOv`t_u;0eX`|J#V5~?L`Sq`svhDG*=VQ<u<NFKG365+D;<d5gxhAhC&=QPP z)YMdj%Nv4Cr7b0G#T`YRh24RkKyRS0pg%vnDbn3o-%{0H)>+z9+*{Zm7$_LbAIcvl z*C#AFMtq}rqsxjd`dzRet&df*cPr)NDSW;<f3+WJ^sSxesPp7OExUu&<8#)XuJPQ# z{H~&wU}Jf>s-~tYTv1=%7-|Z(l(v?%m2{MJ7L!aYTHn#w&|KSA*-=Ib`jXz_zM}rZ z0S44T4oGrD%@ctAE+A$2jM3&;U9=Ma0V!5X(QX(PNa1^-C>$vuuLx#GZ@sK1-O!HO zx4R~CqyD}?XK`z=sjR*-TwPmRT~`^YXb_H}mJlh}#v=Xo?TwAib*<Iy6&+<=q3+Ti zgJR*TIL8cM{h>HD+7_#i)<Ccff+ew{SmCH*h?kH~G0rh-eyY&C^?+lmYchA#H<;gB z*jdsRY%Xi4h*Z`I#oDTHWqk#y0N3L--P6zp!AMJOTNMn;yFy*To;XD%jfD>dqzqqT zv?JDt|F@J}En_4sjTS40g(8PYVuE7U{8XWN%W>;o#}?P5XEblf-yi5H>JWyZ#`1>B zNL5`;U2P316jcHF`mTmnMX;{5roE~|7?yPfyLE~seZ?8R`a^MQv@6yUtH*n}YB)|1 z?`9a{<5##^*Ji~qYksQGyf$Z<vF~wAL2!*X=3kZH7w9hTC}|D0gqq44D$!?Bd#VER zk-qv)2sSm<w}e}3+p60u;|xPO!%}j@N;d5Mq}e#kaU^lfuy#Cy|6a3YBMCU<{Gl}O zFoS%K)@R7ceoU=IFK)k}J8YAT=KDx9y|DcwTr=7;(iUq#0&9{8mJW+&PzuAW`6UX? zTMycIIXCA_de(YJd_(>L3~v{vqczwXLYqm8N&)%ss1%T|@0cejQaI1BD=1~+J4xQU zjWkXB;*|7$=zZ1p+DjTqj=O~vs6zRG;yC{fGk#OJ1#M7m?@ix62?D7yO{VD`iEm)s zDY>7cy(8_hrf5A;9My4xVoOS+B86g6@bnWoya#@&&^!`&)V9yD!?ihQooCEDnm6oU zRUn!iX9&M|0eP&5Dj<&xHjpBrPDx>cq4ZyBBMjg`wh4;6=!M=>A$3s0lVq<YAW}L? 
zM)EE6QaH}P!;Iey^vcOayhfWy-@Zx^m`7y85Ke-d@I}~(w96;#S9fc)Z=^HY5~F;4 zv=)MuN{e-Zx)j0?O)kuuUs9oY>k<3j1i{tbs9(lHG+7vul2;Xw532(5jDo6wyg?8t z6ir5#_5NGP1lvk_-8C@U3`L=|NA-d|rt5Sp-=~Gih8u<Q1~hX19cKKF(N3S1i(AGL z?Z*=YTG@<i;VzD%3mdsl+u$hQVYGjg^6_yAtcg-EG)@p(f_J1Yg;?dR`6UX?BZ0@Q zhwRgi?a1JI4~6C#33Vw%li`;rARm#kq4NY~B*dDuDNUa5e-VjcJ7KsQNfAa{+4I?{ z;jeHP-_Zdr-(xMqm$ZkR*&<6D;4JSjdmTe-*Few0mTubilM)1qTSzOkql_0qQ9faG zV3hLlv1TMNf)#<_P^GfPA+g5Fzf|<N_#XJB6q>i~x9@gr(+TFvNEr1GB8C0%TU0>a zkill1U@cNuliYuCOlgMDAW|?!;?9jw#KY-rihb<B`!PSqUWl`DOX3}qP&9kZpotSu zDdLT3+fPXlm_yFpI+7UoqRsLhCUt%kpN_?gcca7ALlV<L`FIW)36&Jy1HVL}dF`C# zBsxr?`8dG|2#)4aXx_hyWZLlp@?i?dV@0I+bO*CR3dlD#LQqCRq%gJrK9v(&|NbLL z!Gwl?*!vVdIswjRE#Kud-XWA@NS4_vHU8JQTKFW||9}!TXHi(mk>|B3$AkzBdq|cX z(?(Z~_QWV3j|PhbQoI`-u3AK}G;4l~3eBU#6q;u~y_w}5A-Ebnj>2zI0eN&dl^}Y& zu>W2ppoY=B13i*H;=Lgnd<iic!TT|vyxA%@cms)`rH5v(h4H^>+7}JJ##ChkVpq+c zI-_MVnotkJZsFGpKhfaEAyqzJbU1~e%&N2Jmr`h6Ng##h83lJbx4Je#Q06mhyd#QV zynsBj7+nH;gdl}~4H-m_7xzCGU8`iQYM}~9zlO)`$M`org9L5`7d0jvIUZ{OIjw_o z`{K(N{AF?c@1pj<f+ycCO0n$4PIW=BK(mXRV2JzJpp6bPgQ0vpv*<=5q2m&W4$FMn z&|(N?Eq_M*TT-EUba+3rX$s9J2(E?Sf&%hb5mi8*Q7}%BId%Pl?y6ticuAFMwOETb z;r9vtb+@&POlH<0ay44M-R&mxcquYsp8sC_tM*_CR&!~DYh=M}QTy*vbn7Zp?Crd* z1A+m`Kknr|mJ7c=EJz0$tV^I+5t+$gN0JDJ?tx!Qp?Rz@h30Ll&^(sdAUFoUv<2i< z@o9q~Gs1NIq=yAmUr>C$#t8UjN^xDIpw3PPyiWcM!5q8vuPpuLNq<fCUsK8L6~RRN z84?7_w}4;&s45?ykU*IUiVmj`%$i@CLi5aHC^U}_izU_xuA>Bl@QW9a$A-j6Z!8ih zI^3C%!6oaT=P(eJQ00U59!UZFB}$f^=W#$DC)!Z&e7ncx$fbW~FYyoBhxy)sJxz}P zO=>?wf<UJx<>TR}%Ex0v8YMTbZbgtleJ-Q4_#XHzC^WAm@R*fC^XRZxVntB#TcUtG zPsE*AVx`4P_ur-YWY!`6Yf{$!HX3%4DfK!=!Dhhb+dM9($FlVG`WeH&YH|E;Qu|W& z{bknW=6Lwt&L}v*;9HFm@oKi7{VfNZ4mBQbJlt@^IHc@H{fu!$W;kYhV)dIM=4Cx7 z5$KyOnhnmD&sK6&%~sFWsH1jPKk8=n!!&X1tp1Kn{dY;~E8YL^9o4^cs()jc=0=7l zzb)74%yhk+oA<9;9RC|{Uo6y;2?7VVb?@q!Zrj_kzxhD(fu{7uKZukZtZVIVjMk4u zGMDw(r@d#n;kxLZ^PdhpQFN;0Wa)|EiO}(~<K-L`$19Fis-x=IgWxBwwD6Pu83ia# zqKunS#crk-RHDI$&*YUk{x{zKR~TW*NI=22bK0!RwmO2cHzR|k!<+lIb?@xl-44Te 
z0g$vLW5KTamX5~$`e<ZzWIVD|S&!YsUcPF(>AdQG(tF-Fmw&e4OyTLGCyGy%oGLw8 zdNO!2bTV|JjN`%Z1CH+hXY_9lf!c)*4`W9nOfgK?|4rk6rTq*E0;3xSrh0`S5v1*V z5)36OZJ(0KgSt$vF6ymnZ)k3B>}eQ);b>%SBwbmL?Zkn-XL-r?oa5=7E1pZ<i+Sh$ zLh)=sIG!%nNuDY_^`Q8ft<RZ=pJo={Yj%BX{ov;Qtv%bicXa77xO>{BjY2dq)DK9m zDHtg0tZi#-Zfopn=&N5<Pg#%1n#forwXDY!(SyuqtuI3GhU=PJXL!MP-hVEC4vJ?B zb&jVW6hGvxCe!+yS^v%Ad(EyNSvRz4U~``k+#V;ms{?Ib$Z&1`U};ZfM_p@Ub8BN~ z17$tx2a!X{dI&{T97S|`1Nn0(v|@S9ddGIl@r>)~oTuDZaxZzG%)8*b;5#Q2l{B7x zVEjzh=gj(V7T;@j{mA6dhE<#TMTdppeDMl3A|@no@0fq6xVNmcx;@e&1UnkKk-~vE z!%-NDJ|m0dFQL$i_NL{oB6!nz9Vxu(Q4G;$T@s=A!1$T0&zbe#EWX$5`jN@u^@Eb_ zXNw{j9|_};;7KDQWhn~#gWVM!wQUg|Pa%jNtFj*TLrM-Q>miaz{tZ=VMWfJ)^)@ni z(|JQlq0Uf7M3Kbv4~(D5`kYz+&Ek8_t{<6*t{d7oK<NOb!y<({#Fiu-YDApy#R3B* zy=7fh?RBm75Tv961bdJ|%6il%N~4fJB3@_(9j4HV{g&fd=Q9))yyg~77CBVf{J{8` ztk0SC-z>h@?0UsdG#DLbCZiwoBcZ`@4`nF|28;VbNMKESSSQ%j&Pdpe6zZBRk1jDS zQD{Yb!}2QW-Qxt66zZC+7(Ot53krBFT%R-Rzgc{*+4UpiG0F8P{N{BybtF^_CrIi% zRMcPEQ{GjD4u@MCL<S*9Sr4p<NTG~|L-0!}w4w+~fdSEDkwTr}<p;(uO#zSO^*OWt zo5lB<T~GRU;ipTWK9ez|P|XPqJ*F&0{%|20tjc;&RDq%tieQ@}Xh`8ImH!}KXay@m zp%vybx9tW&84E>|uRbt-M*Qj$>vLxPH;eBzyPhO}!cVs$$)n(ckx-ST2&^jZ3nGD) z=&&TTPY_gk+#_Y8;FnTp1s#6bAgD&dxD-A>exLC4w&ePpS^v%Ad(Ezg-{i0m6dR&O z!MGjPrO=pF@0#?kk)-0H!SS*lZDL0nnnjB#N&!L9<6ihB6<Q(1yDGGjOi=0Z(+`ZF z$@-jG|IOli&8~;vgyN@baOx;14CA&0em+%}qJ**(B7xOX+z5h=Nd&Pb+M+@$=rDy= zm<=i|Mh0ahyzv0}Wn7;#>%Uoiui5p&j~UFo1~1fMWlK6m{@^!Ms5CfU)&m`0L=b){ zg;tP2DVC%Pttf&rpSkIL=7I9dus&zje_8*_6yIxhJ^XYFJg>pV(}-b5<o})VvJ~;M z9&rh*L5C?yk&r=b3H%lnT2VTjO7PhS$S>pioLT?%_0y%}d(Ey-;%C^9#X1Z@;it+{ z6j7EUsK!9CB9!&e39@S|<6xWOmr`g&%3LTN)(O6VEk*_(D8CHrb7uWF)}K?#`zFQr znq3b+-2#gaZ;IQH#X7uAm8Hm6WhqMZ!ZM73l=X<~uso3{f{NdQLMzN;UMDtRMnTGi z*c8DB$}hwEoLT?p*Po^Qa-zI%BEHw`dc{w-z{V^vZif?3BTA1IKYx4#OqM{=VHpKw zE(1Yri9#!J9ZnE@VEkNWJ<NRGe1|>r^^g2nKkjYIcwJ;O_xJjh#TtDkK0jjP-E@;F zQv;9sxr(^cz<u`PrzCsawmcEO&rg0xE~{dnEkH(=`+WUM`+srYisAqFd-|$kue^k< z^!rr8f7bpN8}O&tuKPVDSdF(v5DD~Zj{A|If6;!<g#Qkkx{dd9`J;bLHU2_n_)_>a 
z54pYW`Sq`wXz)i^k@a|V{T{xAUpY1;fHOzVy%YR5?Vq&&!Ir~II2Mc`%D;Cn;a7pD zK`}0kUV=qz_eS8i+OM_0W7Fo#xF7AoF(`Czmp}UN7Jijza0wFVLx-LBM)0@VZ?vCj zKi0m11fIrqrVCO2y?RyS#Z!p}2XSUAK!@G;M)2RYf6@Lw?XTIikwB8e$`}8k54wAh zH`w48LIMl1B3?#8NA?rqf7kw1`=$0j+4lMaT;R@2L=1k@4}I9@gCi*XWCScj@k9cB zxG>)X!GG8OP5Vbiz`q~`_B(Mm^dVfg%sp2hTo=C3mge~du_1+6kvv907nV4ybp_$~ zf3^Ru{T16@UqJ$&#GiZv4wyLdf1%y_oqf2@Pewq|U|j;eY81?h;D2bp)&7||^nXW# zzd@kFO{~ZsB8wzY!PI&1ch<>ee2|N7gwW3_4q>(M7*!!@J$Q{r7M+v#n!Kj$1#U|L zx?j#`RH{!&`JAJcu+8R1u7iva@($xY#=A_~e+W)nAy^v!_Chh0WUC4UX=Tu_UDUTM z_}$fpEPj{O%j~C=`y>=oD1C~9VYOzrFeVf*J_yD6_Xx{J<CRPQN)q2*q`;5^@fDX$ zuN9m%dY63Oq<?=!@K=u(a^$-4mG<!|#fqa)Qk?#l6x1`kqeK=cY*6&`o+9;KLVT)u zfQiZH|ISY5r0X3Nd(hu9I+wS^d6O^ra~$q2u<uc254b#T@)k?hSe;NQ_apQxEX$h^ zl;FA|hE-k;{>#I>$h#h;siKp)o%F3Z>F$>^?lmEMl4mFVc@uvBr2T|vw;Ak3ovqaA zhoQq`%~hXNz^GUJYwhnTNOYTbOt8jP4YFK$$hIhoUhdhW^m3p-#+@%g&yWwSMlJvA z>woFaoAmED+W*02`+00`mlm;>JBpmX9Jkx)L7Q{=EdRf1zu@WaE6faMh&LWkH>(i5 zPFXLs-rt&;$yfS3PurOr%Uu`Q5bY^JB$C(v7N0lyHuCiGV;qhz5x&`_HCU_crOtpW zPcgK6EV(i}{GSkhTZOPru~IF85#@ZmEce{wVui_f`JJ4$GdC6~^v4;>(?|09-@@}I zUrN@menQE=yV?_Ehv?E8tu?kXq%hy*g`vwM>ob3^{W(_TRqZk|xRnsaK0;?|5&gnP zOnu`hY$%UgNnt(=g<tCWU+Q@ie!}OcIE{V8%z2i;n=Y-%QfsR~3JYES9B+=SFZ=`R z3*SRV@9=aoqXOUt31e;Gqe-vwD87VHjvAkQir>QZzvS~K{nOX4{wq(vuOnXvl$B^^ z&Rk^=If|VHE`5FBZ*U9v7SG4Gu+=k!$*x0>dk7*7o3#GWp0+V^=b*<@Q77H{-~4%# zE{J^nf~U9d6ZUhP0Ly8f-8!{qOT=1jFL#tW<@xxZ(BQvB0>4U_>`g-8Bt&cjq0|FN z;UoJJ+vh@$<ryR0`k!&$q=O<~zta9z`+tzHw+K-+g0NdG^^AlScK!MIXAu4oGWr35 z+Sj2x!+z5y@>dL*wEj?^wlWcOCipF0|I^Q#bYWh;B=GEIba<Bi^igE6#nND{wF$pp zv%>l}Sn2N(tSIH?u91#oh8fQkp}>#yO9+YlEw%n9=S^Po@+IppZ!^N3X9a7D=Z#LS z6@qg9H_UN<s{J=sX#Plo`?MF>D>(zfY4Q*~;xB6wzjW(=rfYz@eEkD+yg%nT<6~wi z*IB{Zk9NvTuhsG&nv9VD$SnIOtg(I%#_#i<=O`4T6MV$^NnJ(Vf-GbG?>AVGzr#ZO z06On7OSwWalxZwP6oSI<U$G!RXN>$yBv6uLNU-}&X56Qd!N;%11RfK3OyDtr#{?b| zcue3ifyV?M6L?JEF@eVf9us&>;4y*61RfK3OyG+mu!_i*AF+D=E4&>4g_X>IV?F)1 ztZC}Ue`r(oyLkBgoE6UBEI9igtbP86>DT@ayBS|4Wr(zquKlByPgyzr6?-81XB(fZ 
zw{G6P=@<Ci-w=S{w}=t?U-a--aQZc*^qvaK0{4*RE<RL0VTJvdDQAWAZ%x15HPS{u z#oqY$*kMc8e%LmH@4;WOgZT?QIewkQSig&=AMg;o!O4!;^Z3XA0E+*EmG+;pANEW3 zB>pclX({l#df3(eGCnFlhT=~-`wPYSpA~7dum4r(wZiauq%d9kHg`3v^l!2^@?-W$ z{vKZhxl4N6au*+smzCX|Q|wH9#17b>(v!cYFMo&E_s=!0)ang6y6NNdNYQuD_5X?_ z{VhHH8GZeQY4`p!R~^KYW$jY!7uqd7w0;{Otk2kE`fGMFrEUEV{*!6@56c@Y?B(o6 zlV3v8zCpCh57`a-D@ft{63BH|*}K?}y@VIgN9=NcpWX1k<eV@zdx_uu^U$*`)qYU( zl(3sQg9qE&X!7^)<dC-i21)-va{nHgmjy@N_3Un6h2gts^B>ZeA8PncmfCUyt~y&6 zzEMxGFZMdSkl#X9eu%XErD^ZP^Zu(U{@0qm{W7h<iSOwI1h27k`5|q84=MjK`@ug! z&L6E;Et0T@eZ%YQAije}eM7@{Bc%B}?owyO+QqKzEFJ{6@x=I;bN`7E&FqyN|7*c3 zZBhFbcp2J?kj{+|yv!K#4!rey_ZL~*AG6Q@U7xSSYiBcYYUk0Xm*J$m*8+Br-&O8t zfZ#fOsOIovc^PdJhGwtS_+JaRV?q1ko#_!bn*qEMX7FIUg%{BWcyj!aF8^j9@X(zP z;iET!6rP|jH-(=_NUq0S<gBtcLU4k;?~_Q%Ewt@j({H~x{&zw965B~2O%a~EF-C|P zZ2e8Vh~80m=R2nDKa6iEWyUc|Uk;(kieIVbcX)g`rOp~#lcf_6w4Lbjd9?hF>DNye z|GS`lER^QR<u-NVd%7Kh=V<vw{9xWklON4jG14%Ehs7T8-;fb7V9j;A1FkYht*se? zYnU+#-7BWuOCSH6)V|0!feUu#I1xN|$GJB%j1brGB6<}md{ke0Z^m?fW%SVN^yK9f zIV<d8MR1hSYCk$|_F5|bH_^WEE5YB|;$^DTOjW9Ncu&sY!FCldqL<B@e~8Z)5F^<o z{KWscL~1R#-TAIkM!|@+1sNQ{8c&;k|E1%9<L!%gYmjIehZ6~`#p5)F-0z0q8Kj%> z_Yr<cJ-ax+Qas6XogQyap)2I5w%03y(o?fnrug4@`)Yj9+!ll|gx7cvp1YeEIfURP zv*sW2^B6|mdoqY{<k>xLw?C)YS?;K@HCS7b!Kmr?&m8}&+7}DV_@G&H_>gM6#s_#_ zk(#hGV!s~sm(EY}Z}=^_E+nu39j>s~#tE8rPxn8~5dSOfOMHn>bJ+MNh|hfowrd>U zQz7`Mzm~f`w*umOmvsx5yFRx<;(PC#;d0mKR!n^F{SsW}`rHbN@4Y{Q%UqvZQSrU^ zLvWevb1N*q_udIEbA4{b#rNJT!DX(`t-$!+dn35a^|=)p-+M0vW%Xbg>vJnKzBhY< z%UqwEkH1`D@x9p+T;}@ReEj8#i|@^j;4;_e^!Upa8sD1@!T9?9{jSdmzZD$cYfe!3 zEn|Id#mD!W5nSf_-0yvS?=lN`EVF!w+1-8k&o8rp$1=-@Jp6pjzTq+pcr3Ggh}pM$ z_|Gr1fX6b+hdlgz%)a3=3wSKEe2Ce%d-%^Uvw(;G_;TOnLmqxUX5TQ`fzRx{Y0k^^ zaT(=9(zN)9{VCOf@1DQBF@3zx@*(&9c@NzMiyipx=_?Aq)bxS(borzI=5!8x_s}ch zw=jL+JzW0izcsxBU)KE+e)0f)pXEcc{+x&QjY~Q3W%-4dN*|cz<&XZ`mUiIFs&59r z#pwgHy8O|9Qzi$#EP9tIeP9-sKl*RU?7(N*zj=P?(g&Je{^-BK)PXN^7j=I3T|Ojp z;}64GvjAYT0A7<m*ry1Z%Md_pIZDvmN)7-v3*a^3=Ps}eL2$_cVp|v|)0G?mY!<+4 
z!mq&M^5YV|WB{?-hRgFx4gfX{;LQxMRP$OLAqs#j5kOq1S?us#xdFhY0lb;{iSvg$ z*Iq>^N!kEn+zT}xPF8LJuxS8qW`1h@+2#t^Bp4@605R@`QkiAt1^_3o|D_G!&CE}Q z4{L6Zv(y@a;GzIx<z6V^mn$^@ICcFmO#p9Ze!BaX$L{hw%B>9$To6F4+zU08XMbe| z054wuTM)o&#!rP0iv$+gtE`O>ObH;4yBDt90N`}%e<=aH=KKgB);x|JpR?3fV`)ND zlLCn2?u8oFMpkA3aQgMXqyXN`{0#S_Tr1Uj0**3Ut)*E7O2-3;<L-qTl?Ya50Ps@l ze~AEIbAGz}k;j>nhYnZR!ir!#fH>h^s1ZT95(9vjUjK^+@S5=x4JCY7%e5ha`OZ>C z!HA^=8B_tp3HL&o#jL~t;7sd(Du6c|eky!eLjt|7LPyA6ZLL=X2_QD~o63VLGXOaA z`k!eDT7CaWg%4{UyW5@PcNRO!Z8g>gODi%MO>!^PsJOK90)Usj{%7pJp}`i93kfVh zhbwHgIzizZ5C4_=p(`%{IJ0^_KKq!!V*-x}JSOm%z+(cB2|On7n80HKj|n^`@R-13 z0*?tiCh(ZRV*(E`0T(+HA+`>yIK<a+;bEV07Zg}Z*~6&7k6zxD`aZL+jgkww4+o(( zvg<BvTYk43wT0cbLSmH4)F&99VSI{N`(ONdD+Ej9-(Kfqe0S-ecGoiG@_VfT-V;(3 zh2wlL&AxvtIbj=N*u?+ySIv{e_isk4w%9%R!4xZsa;NmqkoP|tuUx#ON)q2*_!xb~ zcl55-;|RHPb8SA}RjfFcK~nFhS^xT==T~<pn7;j_1c5euvHW~Sk@_wnt|~KY{-HUa z!`8!MI+YqFB>n{N$8LL-E8zBc?74{^mck=s`u%&5y)w2Y^OkCVQG!4R&M<i-G!~A< zrtLpGZ*a1KEqA{->F$?n6odH*B|&DiE?b?m)a8evBiAbLEMT+Dynk(IZUsDKdpB+S zX%hrmG@<B)`J?o5pg+c)FF{Y0eEo(pCZACPWslZnjX27kML9k<&Oo`gT;6NCeTmhm zr5!!~H*NbF5(HYI_-MWCY{AN17ugc+DbXlF^h=7Fd`K$mtq^RmS0ROgoIGKOHe1a4 zmm2?@ru`L`AkeP4loaavS)@_WOf>jASdl02e(bU~+G-qS&LUTSju(ce=}C<JT_gT? zQTr<{L7<J1N*{aj#kW@a_frb$d_*$(S-c;+EKRmrdj(QhnB&j!n#<q9_}>NXufPO> z(!(4a^2BDU^i@Bj=++-n_~<glXC|~xOS3I(uX2Q(#V!D6e(C-jlD<5jk->$Ffw&nY zwZ9@01lkz6bI@bNOXTAh6oL9a1z~Pem}466M+jOY_G+w2X=ZwQ`uxT5zbWm@2=Vun zI$BW)0>$>Z&|~<C2lU@kvg})wL%Kofo}H9W>9n+1>uojm3Uhv9LCUegMT`b{wCGea zm}p=3ulX5wXoV#RwBi!#6n=($e1-(Rj1JFId}dS;Y_Qd4%WrY~Z@his_iM`K{S8+B zdz6iOhtfUQD4#OJjA@E80*~-ZdPs@T-%?Id<l~2w+j^U_QRgY$GllnKr==B-dXtwt z>52zd8S_bzKp*334zg|Kx<vacHbEf0)aBzJC^q=#6ny)bqD0pz%dwxiOiT!x;8SZZ zI=^)Bzp8y1VgHd?`cEi%_&x5?`+UZ86rVYR44N0XG5Z2v^m*a;8!X7*F~a|V-n~oF ztt%8hnx^<n6c0#~mo9%9;(u3og23gj&#i#?UK0s1IlavFxfK!LYl0}_>1D3Zt&sTM zjEFLxT;}@Riiz*d2r0wKWv<Vyp!nVlkTRTF=K9<Ui|@?<YKc?JT%TKU@x4o6TJq1! 
zT%TKk@x4o;N_%3N>vJnIzBetXCH}a~^|=)q-@62?G=H97f4Q&qIeGS_yzk13?@fa$ z?Vsk?U+!ytPWY+vzAG=jH!ZG3f7I7s?t6W1MaTCpf|ceE`ufW+*+cqotN{KID_Cz( xTKj5zb!khQD^mWn!sB~W`kMBaWft&Q@$tQBp(Xusnd@`L{x9WyS7dze{|`Wm#zg=C literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin b/Ryujinx.Graphics.Vulkan/Effects/Textures/SmaaSearchTexture.bin new file mode 100644 index 0000000000000000000000000000000000000000..db5bf73f7d5a0b5e436d336849c90bfbc24d76dc GIT binary patch literal 1024 zcmezOkD<Pvf#Dy7Vt@dk2uKi2{R1+90K~@zpc={6kIhU{#3;3&QvIa3l@@A|qY7?5 evLH0#aK#_8QgZae^^nP+)P73!lj-bXqYVIqI9W{q literal 0 HcmV?d00001 diff --git a/Ryujinx.Graphics.Vulkan/NativeArray.cs b/Ryujinx.Graphics.Vulkan/NativeArray.cs index f74074390..3a8512874 100644 --- a/Ryujinx.Graphics.Vulkan/NativeArray.cs +++ b/Ryujinx.Graphics.Vulkan/NativeArray.cs @@ -38,8 +38,11 @@ namespace Ryujinx.Graphics.Vulkan public void Dispose() { - Marshal.FreeHGlobal((IntPtr)Pointer); - Pointer = null; + if (Pointer != null) + { + Marshal.FreeHGlobal((IntPtr)Pointer); + Pointer = null; + } } } } diff --git a/Ryujinx.Graphics.Vulkan/PipelineBase.cs b/Ryujinx.Graphics.Vulkan/PipelineBase.cs index f779305db..583bb9539 100644 --- a/Ryujinx.Graphics.Vulkan/PipelineBase.cs +++ b/Ryujinx.Graphics.Vulkan/PipelineBase.cs @@ -150,6 +150,28 @@ namespace Ryujinx.Graphics.Vulkan null); } + public void ComputeBarrier() + { + MemoryBarrier memoryBarrier = new MemoryBarrier() + { + SType = StructureType.MemoryBarrier, + SrcAccessMask = AccessFlags.MemoryReadBit | AccessFlags.MemoryWriteBit, + DstAccessMask = AccessFlags.MemoryReadBit | AccessFlags.MemoryWriteBit + }; + + Gd.Api.CmdPipelineBarrier( + CommandBuffer, + PipelineStageFlags.ComputeShaderBit, + PipelineStageFlags.AllCommandsBit, + 0, + 1, + new ReadOnlySpan<MemoryBarrier>(memoryBarrier), + 0, + ReadOnlySpan<BufferMemoryBarrier>.Empty, + 0, + ReadOnlySpan<ImageMemoryBarrier>.Empty); + } + public void BeginTransformFeedback(GAL.PrimitiveTopology 
topology) { _tfEnabled = true; @@ -803,6 +825,11 @@ namespace Ryujinx.Graphics.Vulkan _descriptorSetUpdater.SetImage(binding, image, imageFormat); } + public void SetImage(int binding, Auto<DisposableImageView> image) + { + _descriptorSetUpdater.SetImage(binding, image); + } + public void SetIndexBuffer(BufferRange buffer, GAL.IndexType type) { if (buffer.Handle != BufferHandle.Null) diff --git a/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj b/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj index 87f14a6ab..57e2240a7 100644 --- a/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj +++ b/Ryujinx.Graphics.Vulkan/Ryujinx.Graphics.Vulkan.csproj @@ -12,6 +12,17 @@ <AllowUnsafeBlocks>true</AllowUnsafeBlocks> </PropertyGroup> + <ItemGroup> + <EmbeddedResource Include="Effects\Textures\SmaaAreaTexture.bin" /> + <EmbeddedResource Include="Effects\Textures\SmaaSearchTexture.bin" /> + <EmbeddedResource Include="Effects\Shaders\FsrScaling.spv" /> + <EmbeddedResource Include="Effects\Shaders\FsrSharpening.spv" /> + <EmbeddedResource Include="Effects\Shaders\Fxaa.spv" /> + <EmbeddedResource Include="Effects\Shaders\SmaaBlend.spv" /> + <EmbeddedResource Include="Effects\Shaders\SmaaEdge.spv" /> + <EmbeddedResource Include="Effects\Shaders\SmaaNeighbour.spv" /> + </ItemGroup> + <ItemGroup> <PackageReference Include="OpenTK.Windowing.GraphicsLibraryFramework" /> <PackageReference Include="shaderc.net" /> diff --git a/Ryujinx.Graphics.Vulkan/Window.cs b/Ryujinx.Graphics.Vulkan/Window.cs index a90a824df..5d6def3a9 100644 --- a/Ryujinx.Graphics.Vulkan/Window.cs +++ b/Ryujinx.Graphics.Vulkan/Window.cs @@ -1,4 +1,5 @@ using Ryujinx.Graphics.GAL; +using Ryujinx.Graphics.Vulkan.Effects; using Silk.NET.Vulkan; using Silk.NET.Vulkan.Extensions.KHR; using System; @@ -29,6 +30,14 @@ namespace Ryujinx.Graphics.Vulkan private bool _vsyncEnabled; private bool _vsyncModeChanged; private VkFormat _format; + private AntiAliasing _currentAntiAliasing; + private bool _updateEffect; 
+ private IPostProcessingEffect _effect; + private IScalingFilter _scalingFilter; + private bool _isLinear; + private float _scalingFilterLevel; + private bool _updateScalingFilter; + private ScalingFilter _currentScalingFilter; public unsafe Window(VulkanRenderer gd, SurfaceKHR surface, PhysicalDevice physicalDevice, Device device) { @@ -116,7 +125,7 @@ namespace Ryujinx.Graphics.Vulkan ImageFormat = surfaceFormat.Format, ImageColorSpace = surfaceFormat.ColorSpace, ImageExtent = extent, - ImageUsage = ImageUsageFlags.ColorAttachmentBit | ImageUsageFlags.TransferDstBit, + ImageUsage = ImageUsageFlags.ColorAttachmentBit | ImageUsageFlags.TransferDstBit | ImageUsageFlags.StorageBit, ImageSharingMode = SharingMode.Exclusive, ImageArrayLayers = 1, PreTransform = capabilities.CurrentTransform, @@ -280,6 +289,13 @@ namespace Ryujinx.Graphics.Vulkan var view = (TextureView)texture; + UpdateEffect(); + + if (_effect != null) + { + view = _effect.Run(view, cbs, _width, _height); + } + int srcX0, srcX1, srcY0, srcY1; float scale = view.ScaleFactor; @@ -315,6 +331,18 @@ namespace Ryujinx.Graphics.Vulkan if (ScreenCaptureRequested) { + if (_effect != null) + { + _gd.CommandBufferPool.Return( + cbs, + null, + stackalloc[] { PipelineStageFlags.ColorAttachmentOutputBit }, + null); + _gd.FlushAllCommands(); + cbs.GetFence().Wait(); + cbs = _gd.CommandBufferPool.Rent(); + } + CaptureFrame(view, srcX0, srcY0, srcX1 - srcX0, srcY1 - srcY0, view.Info.Format.IsBgr(), crop.FlipX, crop.FlipY); ScreenCaptureRequested = false; @@ -335,20 +363,36 @@ namespace Ryujinx.Graphics.Vulkan int dstY0 = crop.FlipY ? dstPaddingY : _height - dstPaddingY; int dstY1 = crop.FlipY ? 
_height - dstPaddingY : dstPaddingY; - _gd.HelperShader.BlitColor( - _gd, - cbs, - view, - _swapchainImageViews[nextImage], - _width, - _height, - 1, - _format, - false, - new Extents2D(srcX0, srcY0, srcX1, srcY1), - new Extents2D(dstX0, dstY1, dstX1, dstY0), - true, - true); + if (_scalingFilter != null) + { + _scalingFilter.Run( + view, + cbs, + _swapchainImageViews[nextImage], + _format, + _width, + _height, + new Extents2D(srcX0, srcY0, srcX1, srcY1), + new Extents2D(dstX0, dstY0, dstX1, dstY1) + ); + } + else + { + _gd.HelperShader.BlitColor( + _gd, + cbs, + view, + _swapchainImageViews[nextImage], + _width, + _height, + 1, + _format, + false, + new Extents2D(srcX0, srcY0, srcX1, srcY1), + new Extents2D(dstX0, dstY1, dstX1, dstY0), + _isLinear, + true); + } Transition( cbs.CommandBuffer, @@ -387,6 +431,95 @@ namespace Ryujinx.Graphics.Vulkan } } + public override void SetAntiAliasing(AntiAliasing effect) + { + if (_currentAntiAliasing == effect && _effect != null) + { + return; + } + + _currentAntiAliasing = effect; + + _updateEffect = true; + } + + public override void SetScalingFilter(ScalingFilter type) + { + if (_currentScalingFilter == type && _effect != null) + { + return; + } + + _currentScalingFilter = type; + + _updateScalingFilter = true; + } + + private void UpdateEffect() + { + if (_updateEffect) + { + _updateEffect = false; + + switch (_currentAntiAliasing) + { + case AntiAliasing.Fxaa: + _effect?.Dispose(); + _effect = new FxaaPostProcessingEffect(_gd, _device); + break; + case AntiAliasing.None: + _effect?.Dispose(); + _effect = null; + break; + case AntiAliasing.SmaaLow: + case AntiAliasing.SmaaMedium: + case AntiAliasing.SmaaHigh: + case AntiAliasing.SmaaUltra: + var quality = _currentAntiAliasing - AntiAliasing.SmaaLow; + if (_effect is SmaaPostProcessingEffect smaa) + { + smaa.Quality = quality; + } + else + { + _effect?.Dispose(); + _effect = new SmaaPostProcessingEffect(_gd, _device, quality); + } + break; + } + } + + if 
(_updateScalingFilter) + { + _updateScalingFilter = false; + + switch (_currentScalingFilter) + { + case ScalingFilter.Bilinear: + case ScalingFilter.Nearest: + _scalingFilter?.Dispose(); + _scalingFilter = null; + _isLinear = _currentScalingFilter == ScalingFilter.Bilinear; + break; + case ScalingFilter.Fsr: + if (_scalingFilter is not FsrScalingFilter) + { + _scalingFilter?.Dispose(); + _scalingFilter = new FsrScalingFilter(_gd, _device); + } + + _scalingFilter.Level = _scalingFilterLevel; + break; + } + } + } + + public override void SetScalingFilterLevel(float level) + { + _scalingFilterLevel = level; + _updateScalingFilter = true; + } + private unsafe void Transition( CommandBuffer commandBuffer, Image image, @@ -456,8 +589,10 @@ namespace Ryujinx.Graphics.Vulkan } _gd.SwapchainApi.DestroySwapchain(_device, _swapchain, null); - } + + _effect?.Dispose(); + _scalingFilter?.Dispose(); } } diff --git a/Ryujinx.Graphics.Vulkan/WindowBase.cs b/Ryujinx.Graphics.Vulkan/WindowBase.cs index 651fe7c16..0a365e8fb 100644 --- a/Ryujinx.Graphics.Vulkan/WindowBase.cs +++ b/Ryujinx.Graphics.Vulkan/WindowBase.cs @@ -11,5 +11,8 @@ namespace Ryujinx.Graphics.Vulkan public abstract void Present(ITexture texture, ImageCrop crop, Action swapBuffersCallback); public abstract void SetSize(int width, int height); public abstract void ChangeVSyncMode(bool vsyncEnabled); + public abstract void SetAntiAliasing(AntiAliasing effect); + public abstract void SetScalingFilter(ScalingFilter scalerType); + public abstract void SetScalingFilterLevel(float scale); } } \ No newline at end of file diff --git a/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs b/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs index 226b5933b..e9aec04b2 100644 --- a/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs +++ b/Ryujinx.Ui.Common/Configuration/ConfigurationFileFormat.cs @@ -14,7 +14,7 @@ namespace Ryujinx.Ui.Common.Configuration /// <summary> /// The current version of the file 
format /// </summary> - public const int CurrentVersion = 43; + public const int CurrentVersion = 44; /// <summary> /// Version of the configuration file format @@ -51,6 +51,21 @@ namespace Ryujinx.Ui.Common.Configuration /// </summary> public AspectRatio AspectRatio { get; set; } + /// <summary> + /// Applies anti-aliasing to the renderer. + /// </summary> + public AntiAliasing AntiAliasing { get; set; } + + /// <summary> + /// Sets the framebuffer upscaling type. + /// </summary> + public ScalingFilter ScalingFilter { get; set; } + + /// <summary> + /// Sets the framebuffer upscaling level. + /// </summary> + public int ScalingFilterLevel { get; set; } + /// <summary> /// Dumps shaders in this local directory /// </summary> diff --git a/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs b/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs index f193b1570..bcdd2e70a 100644 --- a/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs +++ b/Ryujinx.Ui.Common/Configuration/ConfigurationState.cs @@ -433,6 +433,21 @@ namespace Ryujinx.Ui.Common.Configuration /// </summary> public ReactiveObject<GraphicsBackend> GraphicsBackend { get; private set; } + /// <summary> + /// Applies anti-aliasing to the renderer. + /// </summary> + public ReactiveObject<AntiAliasing> AntiAliasing { get; private set; } + + /// <summary> + /// Sets the framebuffer upscaling type. + /// </summary> + public ReactiveObject<ScalingFilter> ScalingFilter { get; private set; } + + /// <summary> + /// Sets the framebuffer upscaling level. 
+ /// </summary> + public ReactiveObject<int> ScalingFilterLevel { get; private set; } + /// <summary> /// Preferred GPU /// </summary> @@ -463,6 +478,12 @@ namespace Ryujinx.Ui.Common.Configuration PreferredGpu.Event += static (sender, e) => LogValueChange(sender, e, nameof(PreferredGpu)); EnableMacroHLE = new ReactiveObject<bool>(); EnableMacroHLE.Event += static (sender, e) => LogValueChange(sender, e, nameof(EnableMacroHLE)); + AntiAliasing = new ReactiveObject<AntiAliasing>(); + AntiAliasing.Event += static (sender, e) => LogValueChange(sender, e, nameof(AntiAliasing)); + ScalingFilter = new ReactiveObject<ScalingFilter>(); + ScalingFilter.Event += static (sender, e) => LogValueChange(sender, e, nameof(ScalingFilter)); + ScalingFilterLevel = new ReactiveObject<int>(); + ScalingFilterLevel.Event += static (sender, e) => LogValueChange(sender, e, nameof(ScalingFilterLevel)); } } @@ -540,6 +561,9 @@ namespace Ryujinx.Ui.Common.Configuration ResScaleCustom = Graphics.ResScaleCustom, MaxAnisotropy = Graphics.MaxAnisotropy, AspectRatio = Graphics.AspectRatio, + AntiAliasing = Graphics.AntiAliasing, + ScalingFilter = Graphics.ScalingFilter, + ScalingFilterLevel = Graphics.ScalingFilterLevel, GraphicsShadersDumpPath = Graphics.ShadersDumpPath, LoggingEnableDebug = Logger.EnableDebug, LoggingEnableStub = Logger.EnableStub, @@ -651,6 +675,9 @@ namespace Ryujinx.Ui.Common.Configuration Graphics.EnableShaderCache.Value = true; Graphics.EnableTextureRecompression.Value = false; Graphics.EnableMacroHLE.Value = true; + Graphics.AntiAliasing.Value = AntiAliasing.None; + Graphics.ScalingFilter.Value = ScalingFilter.Bilinear; + Graphics.ScalingFilterLevel.Value = 80; System.EnablePtc.Value = true; System.EnableInternetAccess.Value = false; System.EnableFsIntegrityChecks.Value = true; @@ -1208,6 +1235,17 @@ namespace Ryujinx.Ui.Common.Configuration configurationFileFormat.UseHypervisor = true; } + if (configurationFileFormat.Version < 44) + { + 
Ryujinx.Common.Logging.Logger.Warning?.Print(LogClass.Application, $"Outdated configuration version {configurationFileFormat.Version}, migrating to version 42."); + + configurationFileFormat.AntiAliasing = AntiAliasing.None; + configurationFileFormat.ScalingFilter = ScalingFilter.Bilinear; + configurationFileFormat.ScalingFilterLevel = 80; + + configurationFileUpdated = true; + } + Logger.EnableFileLog.Value = configurationFileFormat.EnableFileLog; Graphics.ResScale.Value = configurationFileFormat.ResScale; Graphics.ResScaleCustom.Value = configurationFileFormat.ResScaleCustom; @@ -1217,6 +1255,9 @@ namespace Ryujinx.Ui.Common.Configuration Graphics.BackendThreading.Value = configurationFileFormat.BackendThreading; Graphics.GraphicsBackend.Value = configurationFileFormat.GraphicsBackend; Graphics.PreferredGpu.Value = configurationFileFormat.PreferredGpu; + Graphics.AntiAliasing.Value = configurationFileFormat.AntiAliasing; + Graphics.ScalingFilter.Value = configurationFileFormat.ScalingFilter; + Graphics.ScalingFilterLevel.Value = configurationFileFormat.ScalingFilterLevel; Logger.EnableDebug.Value = configurationFileFormat.LoggingEnableDebug; Logger.EnableStub.Value = configurationFileFormat.LoggingEnableStub; Logger.EnableInfo.Value = configurationFileFormat.LoggingEnableInfo; diff --git a/Ryujinx/Ui/RendererWidgetBase.cs b/Ryujinx/Ui/RendererWidgetBase.cs index 4bf2a70ff..957bbcd55 100644 --- a/Ryujinx/Ui/RendererWidgetBase.cs +++ b/Ryujinx/Ui/RendererWidgetBase.cs @@ -27,6 +27,7 @@ namespace Ryujinx.Ui using Image = SixLabors.ImageSharp.Image; using Key = Input.Key; using Switch = HLE.Switch; + using ScalingFilter = Graphics.GAL.ScalingFilter; public abstract class RendererWidgetBase : DrawingArea { @@ -116,6 +117,21 @@ namespace Ryujinx.Ui _lastCursorMoveTime = Stopwatch.GetTimestamp(); ConfigurationState.Instance.HideCursorOnIdle.Event += HideCursorStateChanged; + ConfigurationState.Instance.Graphics.AntiAliasing.Event += UpdateAnriAliasing; + 
ConfigurationState.Instance.Graphics.ScalingFilter.Event += UpdateScalingFilter; + ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event += UpdateScalingFilterLevel; + } + + private void UpdateScalingFilterLevel(object sender, ReactiveEventArgs<int> e) + { + Renderer.Window.SetScalingFilter((ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); + Renderer.Window.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value); + } + + private void UpdateScalingFilter(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.ScalingFilter> e) + { + Renderer.Window.SetScalingFilter((ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); + Renderer.Window.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value); } public abstract void InitializeRenderer(); @@ -149,11 +165,19 @@ namespace Ryujinx.Ui private void Renderer_Destroyed(object sender, EventArgs e) { ConfigurationState.Instance.HideCursorOnIdle.Event -= HideCursorStateChanged; + ConfigurationState.Instance.Graphics.AntiAliasing.Event -= UpdateAnriAliasing; + ConfigurationState.Instance.Graphics.ScalingFilter.Event -= UpdateScalingFilter; + ConfigurationState.Instance.Graphics.ScalingFilterLevel.Event -= UpdateScalingFilterLevel; NpadManager.Dispose(); Dispose(); } + private void UpdateAnriAliasing(object sender, ReactiveEventArgs<Ryujinx.Common.Configuration.AntiAliasing> e) + { + Renderer?.Window.SetAntiAliasing((Graphics.GAL.AntiAliasing)e.NewValue); + } + protected override bool OnMotionNotifyEvent(EventMotion evnt) { if (_hideCursorOnIdle) @@ -394,6 +418,10 @@ namespace Ryujinx.Ui Device.Gpu.Renderer.Initialize(_glLogLevel); + Renderer.Window.SetAntiAliasing((Graphics.GAL.AntiAliasing)ConfigurationState.Instance.Graphics.AntiAliasing.Value); + Renderer.Window.SetScalingFilter((Graphics.GAL.ScalingFilter)ConfigurationState.Instance.Graphics.ScalingFilter.Value); + 
Renderer.Window.SetScalingFilterLevel(ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value); + _gpuBackendName = GetGpuBackendName(); _gpuVendorName = GetGpuVendorName(); diff --git a/Ryujinx/Ui/Windows/SettingsWindow.cs b/Ryujinx/Ui/Windows/SettingsWindow.cs index 220bb82ae..61af7d397 100644 --- a/Ryujinx/Ui/Windows/SettingsWindow.cs +++ b/Ryujinx/Ui/Windows/SettingsWindow.cs @@ -95,10 +95,14 @@ namespace Ryujinx.Ui.Windows [GUI] Entry _graphicsShadersDumpPath; [GUI] ComboBoxText _anisotropy; [GUI] ComboBoxText _aspectRatio; + [GUI] ComboBoxText _antiAliasing; + [GUI] ComboBoxText _scalingFilter; [GUI] ComboBoxText _graphicsBackend; [GUI] ComboBoxText _preferredGpu; [GUI] ComboBoxText _resScaleCombo; [GUI] Entry _resScaleText; + [GUI] Adjustment _scalingFilterLevel; + [GUI] Scale _scalingFilterSlider; [GUI] ToggleButton _configureController1; [GUI] ToggleButton _configureController2; [GUI] ToggleButton _configureController3; @@ -139,6 +143,7 @@ namespace Ryujinx.Ui.Windows _systemTimeZoneEntry.FocusOutEvent += TimeZoneEntry_FocusOut; _resScaleCombo.Changed += (sender, args) => _resScaleText.Visible = _resScaleCombo.ActiveId == "-1"; + _scalingFilter.Changed += (sender, args) => _scalingFilterSlider.Visible = _scalingFilter.ActiveId == "2"; _galThreading.Changed += (sender, args) => { if (_galThreading.ActiveId != ConfigurationState.Instance.Graphics.BackendThreading.Value.ToString()) @@ -338,6 +343,8 @@ namespace Ryujinx.Ui.Windows _anisotropy.SetActiveId(ConfigurationState.Instance.Graphics.MaxAnisotropy.Value.ToString()); _aspectRatio.SetActiveId(((int)ConfigurationState.Instance.Graphics.AspectRatio.Value).ToString()); _graphicsBackend.SetActiveId(((int)ConfigurationState.Instance.Graphics.GraphicsBackend.Value).ToString()); + _antiAliasing.SetActiveId(((int)ConfigurationState.Instance.Graphics.AntiAliasing.Value).ToString()); + _scalingFilter.SetActiveId(((int)ConfigurationState.Instance.Graphics.ScalingFilter.Value).ToString()); 
UpdatePreferredGpuComboBox(); @@ -345,7 +352,9 @@ namespace Ryujinx.Ui.Windows _custThemePath.Buffer.Text = ConfigurationState.Instance.Ui.CustomThemePath; _resScaleText.Buffer.Text = ConfigurationState.Instance.Graphics.ResScaleCustom.Value.ToString(); + _scalingFilterLevel.Value = ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value; _resScaleText.Visible = _resScaleCombo.ActiveId == "-1"; + _scalingFilterSlider.Visible = _scalingFilter.ActiveId == "2"; _graphicsShadersDumpPath.Buffer.Text = ConfigurationState.Instance.Graphics.ShadersDumpPath; _fsLogSpinAdjustment.Value = ConfigurationState.Instance.System.FsGlobalAccessLogMode; _systemTimeOffset = ConfigurationState.Instance.System.SystemTimeOffset; @@ -605,6 +614,9 @@ namespace Ryujinx.Ui.Windows ConfigurationState.Instance.Graphics.ResScale.Value = int.Parse(_resScaleCombo.ActiveId); ConfigurationState.Instance.Graphics.ResScaleCustom.Value = resScaleCustom; ConfigurationState.Instance.System.AudioVolume.Value = (float)_audioVolumeSlider.Value / 100.0f; + ConfigurationState.Instance.Graphics.AntiAliasing.Value = Enum.Parse<AntiAliasing>(_antiAliasing.ActiveId); + ConfigurationState.Instance.Graphics.ScalingFilter.Value = Enum.Parse<ScalingFilter>(_scalingFilter.ActiveId); + ConfigurationState.Instance.Graphics.ScalingFilterLevel.Value = (int)_scalingFilterLevel.Value; _previousVolumeLevel = ConfigurationState.Instance.System.AudioVolume.Value; diff --git a/Ryujinx/Ui/Windows/SettingsWindow.glade b/Ryujinx/Ui/Windows/SettingsWindow.glade index e39be81a9..c19c1db9f 100644 --- a/Ryujinx/Ui/Windows/SettingsWindow.glade +++ b/Ryujinx/Ui/Windows/SettingsWindow.glade @@ -40,6 +40,13 @@ <property name="inline-completion">True</property> <property name="inline-selection">True</property> </object> + <object class="GtkAdjustment" id="_scalingFilterLevel"> + <property name="lower">0</property> + <property name="upper">101</property> + <property name="step-increment">1</property> + <property 
name="page-increment">5</property> + <property name="page-size">1</property> + </object> <object class="GtkWindow" id="_settingsWin"> <property name="can-focus">False</property> <property name="title" translatable="yes">Ryujinx - Settings</property> @@ -2152,6 +2159,118 @@ <property name="position">3</property> </packing> </child> + <child> + <object class="GtkBox"> + <property name="visible">True</property> + <property name="can-focus">False</property> + <property name="margin-top">5</property> + <property name="margin-bottom">5</property> + <child> + <object class="GtkLabel"> + <property name="visible">True</property> + <property name="can-focus">False</property> + <property name="tooltip-text" translatable="yes">Applies a final effect to the game render</property> + <property name="label" translatable="yes">Post Processing Effect:</property> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property name="padding">5</property> + <property name="position">0</property> + </packing> + </child> + <child> + <object class="GtkComboBoxText" id="_antiAliasing"> + <property name="visible">True</property> + <property name="can-focus">False</property> + <property name="tooltip-text" translatable="yes">Applies anti-aliasing to the game render</property> + <property name="active-id">1</property> + <items> + <item id="0" translatable="yes">None</item> + <item id="1" translatable="yes">FXAA</item> + <item id="2" translatable="yes">SMAA Low</item> + <item id="3" translatable="yes">SMAA Medium</item> + <item id="4" translatable="yes">SMAA High</item> + <item id="5" translatable="yes">SMAA Ultra</item> + </items> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property name="position">1</property> + </packing> + </child> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property 
name="padding">5</property> + <property name="position">4</property> + </packing> + </child> + <child> + <object class="GtkBox"> + <property name="width-request">100</property> + <property name="visible">True</property> + <property name="can-focus">False</property> + <property name="margin-top">5</property> + <property name="margin-bottom">5</property> + <child> + <object class="GtkLabel"> + <property name="visible">True</property> + <property name="can-focus">False</property> + <property name="tooltip-text" translatable="yes">Enables Framebuffer Upscaling</property> + <property name="label" translatable="yes">Upscale: </property> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property name="padding">5</property> + <property name="position">0</property> + </packing> + </child> + <child> + <object class="GtkComboBoxText" id="_scalingFilter"> + <property name="visible">True</property> + <property name="can-focus">False</property> + <property name="tooltip-text" translatable="yes">Enables Framebuffer Upscaling</property> + <property name="active-id">1</property> + <items> + <item id="0" translatable="yes">Bilinear</item> + <item id="1" translatable="yes">Nearest</item> + <item id="2" translatable="yes">FSR</item> + </items> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property name="position">1</property> + </packing> + </child> + <child> + <object class="GtkScale" id="_scalingFilterSlider"> + <property name="width-request">200</property> + <property name="visible">True</property> + <property name="can-focus">True</property> + <property name="margin-start">5</property> + <property name="adjustment">_scalingFilterLevel</property> + <property name="round-digits">1</property> + <property name="value-pos">right</property> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property 
name="position">3</property> + </packing> + </child> + </object> + <packing> + <property name="expand">False</property> + <property name="fill">True</property> + <property name="padding">5</property> + <property name="position">5</property> + </packing> + </child> <child> <object class="GtkBox"> <property name="visible">True</property> @@ -2197,7 +2316,7 @@ <property name="expand">False</property> <property name="fill">True</property> <property name="padding">5</property> - <property name="position">4</property> + <property name="position">6</property> </packing> </child> <child> @@ -2246,7 +2365,7 @@ <property name="expand">False</property> <property name="fill">True</property> <property name="padding">5</property> - <property name="position">5</property> + <property name="position">7</property> </packing> </child> </object> From 9b5a0c388980b16f7adfceb1f57320087bfc6322 Mon Sep 17 00:00:00 2001 From: riperiperi <rhy3756547@hotmail.com> Date: Tue, 28 Feb 2023 03:41:44 +0000 Subject: [PATCH 36/41] Sockets: Properly convert error codes on MacOS (#4491) * Sockets: Properly convert error codes on MacOS The error codes for MacOS are very different to how they are on windows or linux. An alternate mapping is used when the host operating system is MacOS. This PR also defaults IsDhcpEnabled to true when interfaceProperties.DhcpServerAddresses is not available. This change was already in `macos1`. 
* Address feedback --- .../StaticService/Types/IpAddressSetting.cs | 2 +- .../Sockets/Bsd/Impl/WinSockHelper.cs | 147 ++++++++++++------ 2 files changed, 104 insertions(+), 45 deletions(-) diff --git a/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/IpAddressSetting.cs b/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/IpAddressSetting.cs index 5bb046abe..30667b928 100644 --- a/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/IpAddressSetting.cs +++ b/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/IpAddressSetting.cs @@ -15,7 +15,7 @@ namespace Ryujinx.HLE.HOS.Services.Nifm.StaticService.Types public IpAddressSetting(IPInterfaceProperties interfaceProperties, UnicastIPAddressInformation unicastIPAddressInformation) { - IsDhcpEnabled = !OperatingSystem.IsMacOS() && interfaceProperties.DhcpServerAddresses.Count != 0; + IsDhcpEnabled = OperatingSystem.IsMacOS() || interfaceProperties.DhcpServerAddresses.Count != 0; Address = new IpV4Address(unicastIPAddressInformation.Address); IPv4Mask = new IpV4Address(unicastIPAddressInformation.IPv4Mask); GatewayAddress = new IpV4Address(interfaceProperties.GatewayAddresses[0].Address); diff --git a/Ryujinx.HLE/HOS/Services/Sockets/Bsd/Impl/WinSockHelper.cs b/Ryujinx.HLE/HOS/Services/Sockets/Bsd/Impl/WinSockHelper.cs index 48439d7d3..5668d30b0 100644 --- a/Ryujinx.HLE/HOS/Services/Sockets/Bsd/Impl/WinSockHelper.cs +++ b/Ryujinx.HLE/HOS/Services/Sockets/Bsd/Impl/WinSockHelper.cs @@ -1,4 +1,5 @@ -using Ryujinx.HLE.HOS.Services.Sockets.Bsd.Types; +using Ryujinx.HLE.HOS.Services.Sockets.Bsd.Types; +using System; using System.Collections.Generic; using System.Net.Sockets; @@ -9,85 +10,133 @@ namespace Ryujinx.HLE.HOS.Services.Sockets.Bsd.Impl private static readonly Dictionary<WsaError, LinuxError> _errorMap = new() { // WSAEINTR - {WsaError.WSAEINTR, LinuxError.EINTR}, + { WsaError.WSAEINTR, LinuxError.EINTR }, // WSAEWOULDBLOCK - {WsaError.WSAEWOULDBLOCK, LinuxError.EWOULDBLOCK}, + { WsaError.WSAEWOULDBLOCK, 
LinuxError.EWOULDBLOCK }, // WSAEINPROGRESS - {WsaError.WSAEINPROGRESS, LinuxError.EINPROGRESS}, + { WsaError.WSAEINPROGRESS, LinuxError.EINPROGRESS }, // WSAEALREADY - {WsaError.WSAEALREADY, LinuxError.EALREADY}, + { WsaError.WSAEALREADY, LinuxError.EALREADY }, // WSAENOTSOCK - {WsaError.WSAENOTSOCK, LinuxError.ENOTSOCK}, + { WsaError.WSAENOTSOCK, LinuxError.ENOTSOCK }, // WSAEDESTADDRREQ - {WsaError.WSAEDESTADDRREQ, LinuxError.EDESTADDRREQ}, + { WsaError.WSAEDESTADDRREQ, LinuxError.EDESTADDRREQ }, // WSAEMSGSIZE - {WsaError.WSAEMSGSIZE, LinuxError.EMSGSIZE}, + { WsaError.WSAEMSGSIZE, LinuxError.EMSGSIZE }, // WSAEPROTOTYPE - {WsaError.WSAEPROTOTYPE, LinuxError.EPROTOTYPE}, + { WsaError.WSAEPROTOTYPE, LinuxError.EPROTOTYPE }, // WSAENOPROTOOPT - {WsaError.WSAENOPROTOOPT, LinuxError.ENOPROTOOPT}, + { WsaError.WSAENOPROTOOPT, LinuxError.ENOPROTOOPT }, // WSAEPROTONOSUPPORT - {WsaError.WSAEPROTONOSUPPORT, LinuxError.EPROTONOSUPPORT}, + { WsaError.WSAEPROTONOSUPPORT, LinuxError.EPROTONOSUPPORT }, // WSAESOCKTNOSUPPORT - {WsaError.WSAESOCKTNOSUPPORT, LinuxError.ESOCKTNOSUPPORT}, + { WsaError.WSAESOCKTNOSUPPORT, LinuxError.ESOCKTNOSUPPORT }, // WSAEOPNOTSUPP - {WsaError.WSAEOPNOTSUPP, LinuxError.EOPNOTSUPP}, + { WsaError.WSAEOPNOTSUPP, LinuxError.EOPNOTSUPP }, // WSAEPFNOSUPPORT - {WsaError.WSAEPFNOSUPPORT, LinuxError.EPFNOSUPPORT}, + { WsaError.WSAEPFNOSUPPORT, LinuxError.EPFNOSUPPORT }, // WSAEAFNOSUPPORT - {WsaError.WSAEAFNOSUPPORT, LinuxError.EAFNOSUPPORT}, + { WsaError.WSAEAFNOSUPPORT, LinuxError.EAFNOSUPPORT }, // WSAEADDRINUSE - {WsaError.WSAEADDRINUSE, LinuxError.EADDRINUSE}, + { WsaError.WSAEADDRINUSE, LinuxError.EADDRINUSE }, // WSAEADDRNOTAVAIL - {WsaError.WSAEADDRNOTAVAIL, LinuxError.EADDRNOTAVAIL}, + { WsaError.WSAEADDRNOTAVAIL, LinuxError.EADDRNOTAVAIL }, // WSAENETDOWN - {WsaError.WSAENETDOWN, LinuxError.ENETDOWN}, + { WsaError.WSAENETDOWN, LinuxError.ENETDOWN }, // WSAENETUNREACH - {WsaError.WSAENETUNREACH, LinuxError.ENETUNREACH}, + { 
WsaError.WSAENETUNREACH, LinuxError.ENETUNREACH }, // WSAENETRESET - {WsaError.WSAENETRESET, LinuxError.ENETRESET}, + { WsaError.WSAENETRESET, LinuxError.ENETRESET }, // WSAECONNABORTED - {WsaError.WSAECONNABORTED, LinuxError.ECONNABORTED}, + { WsaError.WSAECONNABORTED, LinuxError.ECONNABORTED }, // WSAECONNRESET - {WsaError.WSAECONNRESET, LinuxError.ECONNRESET}, + { WsaError.WSAECONNRESET, LinuxError.ECONNRESET }, // WSAENOBUFS - {WsaError.WSAENOBUFS, LinuxError.ENOBUFS}, + { WsaError.WSAENOBUFS, LinuxError.ENOBUFS }, // WSAEISCONN - {WsaError.WSAEISCONN, LinuxError.EISCONN}, + { WsaError.WSAEISCONN, LinuxError.EISCONN }, // WSAENOTCONN - {WsaError.WSAENOTCONN, LinuxError.ENOTCONN}, + { WsaError.WSAENOTCONN, LinuxError.ENOTCONN }, // WSAESHUTDOWN - {WsaError.WSAESHUTDOWN, LinuxError.ESHUTDOWN}, + { WsaError.WSAESHUTDOWN, LinuxError.ESHUTDOWN }, // WSAETOOMANYREFS - {WsaError.WSAETOOMANYREFS, LinuxError.ETOOMANYREFS}, + { WsaError.WSAETOOMANYREFS, LinuxError.ETOOMANYREFS }, // WSAETIMEDOUT - {WsaError.WSAETIMEDOUT, LinuxError.ETIMEDOUT}, + { WsaError.WSAETIMEDOUT, LinuxError.ETIMEDOUT }, // WSAECONNREFUSED - {WsaError.WSAECONNREFUSED, LinuxError.ECONNREFUSED}, + { WsaError.WSAECONNREFUSED, LinuxError.ECONNREFUSED }, // WSAELOOP - {WsaError.WSAELOOP, LinuxError.ELOOP}, + { WsaError.WSAELOOP, LinuxError.ELOOP }, // WSAENAMETOOLONG - {WsaError.WSAENAMETOOLONG, LinuxError.ENAMETOOLONG}, + { WsaError.WSAENAMETOOLONG, LinuxError.ENAMETOOLONG }, // WSAEHOSTDOWN - {WsaError.WSAEHOSTDOWN, LinuxError.EHOSTDOWN}, + { WsaError.WSAEHOSTDOWN, LinuxError.EHOSTDOWN }, // WSAEHOSTUNREACH - {WsaError.WSAEHOSTUNREACH, LinuxError.EHOSTUNREACH}, + { WsaError.WSAEHOSTUNREACH, LinuxError.EHOSTUNREACH }, // WSAENOTEMPTY - {WsaError.WSAENOTEMPTY, LinuxError.ENOTEMPTY}, + { WsaError.WSAENOTEMPTY, LinuxError.ENOTEMPTY }, // WSAEUSERS - {WsaError.WSAEUSERS, LinuxError.EUSERS}, + { WsaError.WSAEUSERS, LinuxError.EUSERS }, // WSAEDQUOT - {WsaError.WSAEDQUOT, LinuxError.EDQUOT}, + { 
WsaError.WSAEDQUOT, LinuxError.EDQUOT }, // WSAESTALE - {WsaError.WSAESTALE, LinuxError.ESTALE}, + { WsaError.WSAESTALE, LinuxError.ESTALE }, // WSAEREMOTE - {WsaError.WSAEREMOTE, LinuxError.EREMOTE}, + { WsaError.WSAEREMOTE, LinuxError.EREMOTE }, // WSAEINVAL - {WsaError.WSAEINVAL, LinuxError.EINVAL}, + { WsaError.WSAEINVAL, LinuxError.EINVAL }, // WSAEFAULT - {WsaError.WSAEFAULT, LinuxError.EFAULT}, + { WsaError.WSAEFAULT, LinuxError.EFAULT }, // NOERROR - {0, 0} + { 0, 0 } + }; + + private static readonly Dictionary<int, LinuxError> _errorMapMacOs = new() + { + { 35, LinuxError.EAGAIN }, + { 11, LinuxError.EDEADLOCK }, + { 91, LinuxError.ENOMSG }, + { 90, LinuxError.EIDRM }, + { 77, LinuxError.ENOLCK }, + { 70, LinuxError.ESTALE }, + { 36, LinuxError.EINPROGRESS }, + { 37, LinuxError.EALREADY }, + { 38, LinuxError.ENOTSOCK }, + { 39, LinuxError.EDESTADDRREQ }, + { 40, LinuxError.EMSGSIZE }, + { 41, LinuxError.EPROTOTYPE }, + { 42, LinuxError.ENOPROTOOPT }, + { 43, LinuxError.EPROTONOSUPPORT }, + { 44, LinuxError.ESOCKTNOSUPPORT }, + { 45, LinuxError.EOPNOTSUPP }, + { 46, LinuxError.EPFNOSUPPORT }, + { 47, LinuxError.EAFNOSUPPORT }, + { 48, LinuxError.EADDRINUSE }, + { 49, LinuxError.EADDRNOTAVAIL }, + { 50, LinuxError.ENETDOWN }, + { 51, LinuxError.ENETUNREACH }, + { 52, LinuxError.ENETRESET }, + { 53, LinuxError.ECONNABORTED }, + { 54, LinuxError.ECONNRESET }, + { 55, LinuxError.ENOBUFS }, + { 56, LinuxError.EISCONN }, + { 57, LinuxError.ENOTCONN }, + { 58, LinuxError.ESHUTDOWN }, + { 60, LinuxError.ETIMEDOUT }, + { 61, LinuxError.ECONNREFUSED }, + { 64, LinuxError.EHOSTDOWN }, + { 65, LinuxError.EHOSTUNREACH }, + { 68, LinuxError.EUSERS }, + { 62, LinuxError.ELOOP }, + { 63, LinuxError.ENAMETOOLONG }, + { 66, LinuxError.ENOTEMPTY }, + { 69, LinuxError.EDQUOT }, + { 71, LinuxError.EREMOTE }, + { 78, LinuxError.ENOSYS }, + { 59, LinuxError.ETOOMANYREFS }, + { 92, LinuxError.EILSEQ }, + { 89, LinuxError.ECANCELED }, + { 84, LinuxError.EOVERFLOW } }; private 
static readonly Dictionary<BsdSocketOption, SocketOptionName> _soSocketOptionMap = new() @@ -136,12 +185,22 @@ namespace Ryujinx.HLE.HOS.Services.Sockets.Bsd.Impl public static LinuxError ConvertError(WsaError errorCode) { - if (!_errorMap.TryGetValue(errorCode, out LinuxError errno)) + if (OperatingSystem.IsMacOS()) { - errno = (LinuxError)errorCode; + if (_errorMapMacOs.TryGetValue((int)errorCode, out LinuxError errno)) + { + return errno; + } + } + else + { + if (_errorMap.TryGetValue(errorCode, out LinuxError errno)) + { + return errno; + } } - return errno; + return (LinuxError)errorCode; } public static bool TryConvertSocketOption(BsdSocketOption option, SocketOptionLevel level, out SocketOptionName name) From ecee34a50cd8e4266cb2ecc9910d8d33d612c84a Mon Sep 17 00:00:00 2001 From: Steveice10 <1269164+Steveice10@users.noreply.github.com> Date: Wed, 1 Mar 2023 18:42:27 -0800 Subject: [PATCH 37/41] Update LibHac to 0.18.0 (#4414) * Update LibHac to 0.18.0 * Change instance of AsBytes(CreateReadOnlySpan(...)) to AsReadOnlyByteSpan(...) 
--- Directory.Packages.props | 2 +- Ryujinx.Ava/Common/ApplicationHelper.cs | 6 +- .../UI/Controls/NavigationDialogHost.axaml.cs | 2 +- .../UI/ViewModels/AvatarProfileViewModel.cs | 2 +- .../UI/ViewModels/MainWindowViewModel.cs | 3 +- .../UI/ViewModels/TitleUpdateViewModel.cs | 2 +- .../UserFirmwareAvatarSelectorViewModel.cs | 2 +- .../UI/Views/Main/MainMenuBarView.axaml.cs | 2 +- .../Views/User/UserSaveManagerView.axaml.cs | 2 +- .../DownloadableContentManagerWindow.axaml.cs | 4 +- Ryujinx.HLE/FileSystem/ContentManager.cs | 26 ++--- Ryujinx.HLE/FileSystem/LocationEntry.cs | 2 +- Ryujinx.HLE/FileSystem/VirtualFileSystem.cs | 9 +- Ryujinx.HLE/HOS/Applets/Error/ErrorApplet.cs | 2 +- Ryujinx.HLE/HOS/ApplicationLoader.cs | 28 ++--- Ryujinx.HLE/HOS/ModLoader.cs | 4 +- Ryujinx.HLE/HOS/ProgramLoader.cs | 8 +- .../Services/Account/Acc/AccountManager.cs | 2 +- .../ApplicationProxy/IApplicationFunctions.cs | 2 +- .../HOS/Services/Bcat/IServiceCreator.cs | 8 +- .../IDeliveryCacheStorageService.cs | 8 +- .../FileSystemProxy/FileSystemProxyHelper.cs | 18 +-- .../Fs/FileSystemProxy/IFileSystem.cs | 8 +- .../HOS/Services/Fs/IFileSystemProxy.cs | 108 +++++++++--------- .../HOS/Services/Fs/IMultiCommitManager.cs | 2 +- .../HOS/Services/Mii/Types/StoreData.cs | 6 +- .../ILocationResolver.cs | 4 +- .../Nifm/StaticService/Types/ProxySetting.cs | 4 +- .../HOS/Services/Sdb/Pl/SharedFontManager.cs | 2 +- .../Settings/ISystemSettingsServer.cs | 2 +- .../Services/Ssl/BuiltInCertificateManager.cs | 4 +- .../Time/TimeZone/TimeZoneContentManager.cs | 6 +- Ryujinx.Ui.Common/App/ApplicationLibrary.cs | 14 +-- Ryujinx/Ui/MainWindow.cs | 2 +- Ryujinx/Ui/Widgets/GameTableContextMenu.cs | 6 +- Ryujinx/Ui/Windows/AvatarWindow.cs | 2 +- Ryujinx/Ui/Windows/DlcWindow.cs | 4 +- Ryujinx/Ui/Windows/TitleUpdateWindow.cs | 2 +- 38 files changed, 161 insertions(+), 159 deletions(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index 35c98e5a3..b46b77e70 100644 --- 
a/Directory.Packages.props +++ b/Directory.Packages.props @@ -19,7 +19,7 @@ <PackageVersion Include="GtkSharp.Dependencies" Version="1.1.1" /> <PackageVersion Include="GtkSharp.Dependencies.osx" Version="0.0.5" /> <PackageVersion Include="jp2masa.Avalonia.Flexbox" Version="0.2.0" /> - <PackageVersion Include="LibHac" Version="0.17.0" /> + <PackageVersion Include="LibHac" Version="0.18.0" /> <PackageVersion Include="Microsoft.CodeAnalysis.Analyzers" Version="3.3.4" /> <PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.4.0" /> <PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.5.0" /> diff --git a/Ryujinx.Ava/Common/ApplicationHelper.cs b/Ryujinx.Ava/Common/ApplicationHelper.cs index 0b8bd8da1..276d18745 100644 --- a/Ryujinx.Ava/Common/ApplicationHelper.cs +++ b/Ryujinx.Ava/Common/ApplicationHelper.cs @@ -193,7 +193,7 @@ namespace Ryujinx.Ava.Common { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = new(_virtualFileSystem.KeySet, ncaFile.Get.AsStorage()); if (nca.Header.ContentType == NcaContentType.Program) @@ -249,8 +249,8 @@ namespace Ryujinx.Ava.Common using var uniqueSourceFs = new UniqueRef<IFileSystem>(ncaFileSystem); using var uniqueOutputFs = new UniqueRef<IFileSystem>(new LocalFileSystem(destination)); - fsClient.Register(source.ToU8Span(), ref uniqueSourceFs.Ref()); - fsClient.Register(output.ToU8Span(), ref uniqueOutputFs.Ref()); + fsClient.Register(source.ToU8Span(), ref uniqueSourceFs.Ref); + fsClient.Register(output.ToU8Span(), ref uniqueOutputFs.Ref); (Result? 
resultCode, bool canceled) = CopyDirectory(fsClient, $"{source}:/", $"{output}:/", cancellationToken.Token); diff --git a/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs b/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs index 6911a4d4c..741885305 100644 --- a/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs +++ b/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs @@ -121,7 +121,7 @@ namespace Ryujinx.Ava.UI.Controls using var saveDataIterator = new UniqueRef<SaveDataIterator>(); - HorizonClient.Fs.OpenSaveDataIterator(ref saveDataIterator.Ref(), SaveDataSpaceId.User, in saveDataFilter).ThrowIfFailure(); + HorizonClient.Fs.OpenSaveDataIterator(ref saveDataIterator.Ref, SaveDataSpaceId.User, in saveDataFilter).ThrowIfFailure(); Span<SaveDataInfo> saveDataInfo = stackalloc SaveDataInfo[10]; diff --git a/Ryujinx.Ava/UI/ViewModels/AvatarProfileViewModel.cs b/Ryujinx.Ava/UI/ViewModels/AvatarProfileViewModel.cs index 1d0906237..b2b310149 100644 --- a/Ryujinx.Ava/UI/ViewModels/AvatarProfileViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/AvatarProfileViewModel.cs @@ -246,7 +246,7 @@ namespace Ryujinx.Ava.UI.ViewModels { using var file = new UniqueRef<IFile>(); - romfs.OpenFile(ref file.Ref(), ("/" + item.FullPath).ToU8Span(), OpenMode.Read) + romfs.OpenFile(ref file.Ref, ("/" + item.FullPath).ToU8Span(), OpenMode.Read) .ThrowIfFailure(); using (MemoryStream stream = new()) diff --git a/Ryujinx.Ava/UI/ViewModels/MainWindowViewModel.cs b/Ryujinx.Ava/UI/ViewModels/MainWindowViewModel.cs index a4ccac2da..489dfe621 100644 --- a/Ryujinx.Ava/UI/ViewModels/MainWindowViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/MainWindowViewModel.cs @@ -7,8 +7,7 @@ using DynamicData; using DynamicData.Binding; using LibHac.Common; using LibHac.Fs; -using LibHac.FsSystem; -using LibHac.Tools.Fs; +using LibHac.Tools.FsSystem.NcaUtils; using Ryujinx.Ava.Common; using Ryujinx.Ava.Common.Locale; using Ryujinx.Ava.Input; diff --git a/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs 
b/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs index f330006e3..dd9e1b961 100644 --- a/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/TitleUpdateViewModel.cs @@ -170,7 +170,7 @@ public class TitleUpdateViewModel : BaseModel using UniqueRef<IFile> nacpFile = new(); - controlNca.OpenFileSystem(NcaSectionType.Data, IntegrityCheckLevel.None).OpenFile(ref nacpFile.Ref(), "/control.nacp".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlNca.OpenFileSystem(NcaSectionType.Data, IntegrityCheckLevel.None).OpenFile(ref nacpFile.Ref, "/control.nacp".ToU8Span(), OpenMode.Read).ThrowIfFailure(); nacpFile.Get.Read(out _, 0, SpanHelpers.AsByteSpan(ref controlData), ReadOption.None).ThrowIfFailure(); TitleUpdates.Add(new TitleUpdateModel(controlData, path)); diff --git a/Ryujinx.Ava/UI/ViewModels/UserFirmwareAvatarSelectorViewModel.cs b/Ryujinx.Ava/UI/ViewModels/UserFirmwareAvatarSelectorViewModel.cs index 9d981128c..558cad5a7 100644 --- a/Ryujinx.Ava/UI/ViewModels/UserFirmwareAvatarSelectorViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/UserFirmwareAvatarSelectorViewModel.cs @@ -126,7 +126,7 @@ namespace Ryujinx.Ava.UI.ViewModels { using var file = new UniqueRef<IFile>(); - romfs.OpenFile(ref file.Ref(), ("/" + item.FullPath).ToU8Span(), OpenMode.Read).ThrowIfFailure(); + romfs.OpenFile(ref file.Ref, ("/" + item.FullPath).ToU8Span(), OpenMode.Read).ThrowIfFailure(); using (MemoryStream stream = new()) using (MemoryStream streamPng = new()) diff --git a/Ryujinx.Ava/UI/Views/Main/MainMenuBarView.axaml.cs b/Ryujinx.Ava/UI/Views/Main/MainMenuBarView.axaml.cs index 11ecd0fc9..1c6f4265c 100644 --- a/Ryujinx.Ava/UI/Views/Main/MainMenuBarView.axaml.cs +++ b/Ryujinx.Ava/UI/Views/Main/MainMenuBarView.axaml.cs @@ -1,8 +1,8 @@ using Avalonia; using Avalonia.Controls; using Avalonia.Interactivity; -using LibHac.FsSystem; using LibHac.Ncm; +using LibHac.Tools.FsSystem.NcaUtils; using Ryujinx.Ava.Common.Locale; using Ryujinx.Ava.UI.Helpers; using 
Ryujinx.Ava.UI.ViewModels; diff --git a/Ryujinx.Ava/UI/Views/User/UserSaveManagerView.axaml.cs b/Ryujinx.Ava/UI/Views/User/UserSaveManagerView.axaml.cs index 074ca30e9..08fef27d0 100644 --- a/Ryujinx.Ava/UI/Views/User/UserSaveManagerView.axaml.cs +++ b/Ryujinx.Ava/UI/Views/User/UserSaveManagerView.axaml.cs @@ -76,7 +76,7 @@ namespace Ryujinx.Ava.UI.Views.User using var saveDataIterator = new UniqueRef<SaveDataIterator>(); - _horizonClient.Fs.OpenSaveDataIterator(ref saveDataIterator.Ref(), SaveDataSpaceId.User, in saveDataFilter).ThrowIfFailure(); + _horizonClient.Fs.OpenSaveDataIterator(ref saveDataIterator.Ref, SaveDataSpaceId.User, in saveDataFilter).ThrowIfFailure(); Span<SaveDataInfo> saveDataInfo = stackalloc SaveDataInfo[10]; diff --git a/Ryujinx.Ava/UI/Windows/DownloadableContentManagerWindow.axaml.cs b/Ryujinx.Ava/UI/Windows/DownloadableContentManagerWindow.axaml.cs index 47216c489..2dab1d352 100644 --- a/Ryujinx.Ava/UI/Windows/DownloadableContentManagerWindow.axaml.cs +++ b/Ryujinx.Ava/UI/Windows/DownloadableContentManagerWindow.axaml.cs @@ -105,7 +105,7 @@ namespace Ryujinx.Ava.UI.Windows { using UniqueRef<IFile> ncaFile = new(); - partitionFileSystem.OpenFile(ref ncaFile.Ref(), downloadableContentNca.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + partitionFileSystem.OpenFile(ref ncaFile.Ref, downloadableContentNca.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = TryOpenNca(ncaFile.Get.AsStorage(), downloadableContentContainer.ContainerPath); if (nca != null) @@ -158,7 +158,7 @@ namespace Ryujinx.Ava.UI.Windows { using var ncaFile = new UniqueRef<IFile>(); - partitionFileSystem.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + partitionFileSystem.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = TryOpenNca(ncaFile.Get.AsStorage(), path); if (nca == null) diff --git a/Ryujinx.HLE/FileSystem/ContentManager.cs 
b/Ryujinx.HLE/FileSystem/ContentManager.cs index 9f0f3a4ae..4e3940081 100644 --- a/Ryujinx.HLE/FileSystem/ContentManager.cs +++ b/Ryujinx.HLE/FileSystem/ContentManager.cs @@ -209,7 +209,7 @@ namespace Ryujinx.HLE.FileSystem { using var ncaFile = new UniqueRef<IFile>(); - fs.OpenFile(ref ncaFile.Ref(), ncaPath.FullPath.ToU8Span(), OpenMode.Read); + fs.OpenFile(ref ncaFile.Ref, ncaPath.FullPath.ToU8Span(), OpenMode.Read); var nca = new Nca(_virtualFileSystem.KeySet, ncaFile.Get.AsStorage()); if (nca.Header.ContentType != NcaContentType.Meta) { @@ -221,7 +221,7 @@ namespace Ryujinx.HLE.FileSystem using var pfs0 = nca.OpenFileSystem(0, integrityCheckLevel); using var cnmtFile = new UniqueRef<IFile>(); - pfs0.OpenFile(ref cnmtFile.Ref(), pfs0.EnumerateEntries().Single().FullPath.ToU8Span(), OpenMode.Read); + pfs0.OpenFile(ref cnmtFile.Ref, pfs0.EnumerateEntries().Single().FullPath.ToU8Span(), OpenMode.Read); var cnmt = new Cnmt(cnmtFile.Get.AsStream()); if (cnmt.Type != ContentMetaType.AddOnContent || (cnmt.TitleId & 0xFFFFFFFFFFFFE000) != aocBaseId) @@ -276,11 +276,11 @@ namespace Ryujinx.HLE.FileSystem { case ".xci": pfs = new Xci(_virtualFileSystem.KeySet, file.AsStorage()).OpenPartition(XciPartitionType.Secure); - pfs.OpenFile(ref ncaFile.Ref(), aoc.NcaPath.ToU8Span(), OpenMode.Read); + pfs.OpenFile(ref ncaFile.Ref, aoc.NcaPath.ToU8Span(), OpenMode.Read); break; case ".nsp": pfs = new PartitionFileSystem(file.AsStorage()); - pfs.OpenFile(ref ncaFile.Ref(), aoc.NcaPath.ToU8Span(), OpenMode.Read); + pfs.OpenFile(ref ncaFile.Ref, aoc.NcaPath.ToU8Span(), OpenMode.Read); break; default: return false; // Print error? 
@@ -625,11 +625,11 @@ namespace Ryujinx.HLE.FileSystem if (filesystem.FileExists($"{path}/00")) { - filesystem.OpenFile(ref file.Ref(), $"{path}/00".ToU8Span(), mode); + filesystem.OpenFile(ref file.Ref, $"{path}/00".ToU8Span(), mode); } else { - filesystem.OpenFile(ref file.Ref(), path.ToU8Span(), mode); + filesystem.OpenFile(ref file.Ref, path.ToU8Span(), mode); } return file.Release(); @@ -751,7 +751,7 @@ namespace Ryujinx.HLE.FileSystem using var metaFile = new UniqueRef<IFile>(); - if (fs.OpenFile(ref metaFile.Ref(), cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) + if (fs.OpenFile(ref metaFile.Ref, cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) { var meta = new Cnmt(metaFile.Get.AsStream()); @@ -781,7 +781,7 @@ namespace Ryujinx.HLE.FileSystem using var systemVersionFile = new UniqueRef<IFile>(); - if (romfs.OpenFile(ref systemVersionFile.Ref(), "/file".ToU8Span(), OpenMode.Read).IsSuccess()) + if (romfs.OpenFile(ref systemVersionFile.Ref, "/file".ToU8Span(), OpenMode.Read).IsSuccess()) { systemVersion = new SystemVersion(systemVersionFile.Get.AsStream()); } @@ -820,7 +820,7 @@ namespace Ryujinx.HLE.FileSystem using var metaFile = new UniqueRef<IFile>(); - if (fs.OpenFile(ref metaFile.Ref(), cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) + if (fs.OpenFile(ref metaFile.Ref, cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) { var meta = new Cnmt(metaFile.Get.AsStream()); @@ -891,7 +891,7 @@ namespace Ryujinx.HLE.FileSystem using var metaFile = new UniqueRef<IFile>(); - if (fs.OpenFile(ref metaFile.Ref(), cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) + if (fs.OpenFile(ref metaFile.Ref, cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) { var meta = new Cnmt(metaFile.Get.AsStream()); @@ -909,7 +909,7 @@ namespace Ryujinx.HLE.FileSystem using var systemVersionFile = new UniqueRef<IFile>(); - if (romfs.OpenFile(ref systemVersionFile.Ref(), "/file".ToU8Span(), OpenMode.Read).IsSuccess()) + if (romfs.OpenFile(ref systemVersionFile.Ref, "/file".ToU8Span(), 
OpenMode.Read).IsSuccess()) { systemVersion = new SystemVersion(systemVersionFile.Get.AsStream()); } @@ -960,7 +960,7 @@ namespace Ryujinx.HLE.FileSystem using var metaFile = new UniqueRef<IFile>(); - if (fs.OpenFile(ref metaFile.Ref(), cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) + if (fs.OpenFile(ref metaFile.Ref, cnmtPath.ToU8Span(), OpenMode.Read).IsSuccess()) { var meta = new Cnmt(metaFile.Get.AsStream()); @@ -1030,7 +1030,7 @@ namespace Ryujinx.HLE.FileSystem using var systemVersionFile = new UniqueRef<IFile>(); - if (romfs.OpenFile(ref systemVersionFile.Ref(), "/file".ToU8Span(), OpenMode.Read).IsSuccess()) + if (romfs.OpenFile(ref systemVersionFile.Ref, "/file".ToU8Span(), OpenMode.Read).IsSuccess()) { return new SystemVersion(systemVersionFile.Get.AsStream()); } diff --git a/Ryujinx.HLE/FileSystem/LocationEntry.cs b/Ryujinx.HLE/FileSystem/LocationEntry.cs index 45cbc8cd5..a60c28967 100644 --- a/Ryujinx.HLE/FileSystem/LocationEntry.cs +++ b/Ryujinx.HLE/FileSystem/LocationEntry.cs @@ -1,4 +1,4 @@ -using LibHac.FsSystem; +using LibHac.Tools.FsSystem.NcaUtils; namespace Ryujinx.HLE.FileSystem { diff --git a/Ryujinx.HLE/FileSystem/VirtualFileSystem.cs b/Ryujinx.HLE/FileSystem/VirtualFileSystem.cs index 0b91d3a2d..3f94ce61b 100644 --- a/Ryujinx.HLE/FileSystem/VirtualFileSystem.cs +++ b/Ryujinx.HLE/FileSystem/VirtualFileSystem.cs @@ -260,15 +260,16 @@ namespace Ryujinx.HLE.FileSystem { using var ticketFile = new UniqueRef<IFile>(); - Result result = fs.OpenFile(ref ticketFile.Ref(), ticketEntry.FullPath.ToU8Span(), OpenMode.Read); + Result result = fs.OpenFile(ref ticketFile.Ref, ticketEntry.FullPath.ToU8Span(), OpenMode.Read); if (result.IsSuccess()) { Ticket ticket = new Ticket(ticketFile.Get.AsStream()); + var titleKey = ticket.GetTitleKey(KeySet); - if (ticket.TitleKeyType == TitleKeyType.Common) + if (titleKey != null) { - KeySet.ExternalKeySet.Add(new RightsId(ticket.RightsId), new AccessKey(ticket.GetTitleKey(KeySet))); + KeySet.ExternalKeySet.Add(new 
RightsId(ticket.RightsId), new AccessKey(titleKey)); } } } @@ -302,7 +303,7 @@ namespace Ryujinx.HLE.FileSystem using var iterator = new UniqueRef<SaveDataIterator>(); - Result rc = hos.Fs.OpenSaveDataIterator(ref iterator.Ref(), spaceId); + Result rc = hos.Fs.OpenSaveDataIterator(ref iterator.Ref, spaceId); if (rc.IsFailure()) return rc; while (true) diff --git a/Ryujinx.HLE/HOS/Applets/Error/ErrorApplet.cs b/Ryujinx.HLE/HOS/Applets/Error/ErrorApplet.cs index ba7efbd7d..c5c6e8e95 100644 --- a/Ryujinx.HLE/HOS/Applets/Error/ErrorApplet.cs +++ b/Ryujinx.HLE/HOS/Applets/Error/ErrorApplet.cs @@ -122,7 +122,7 @@ namespace Ryujinx.HLE.HOS.Applets.Error { using var binaryFile = new UniqueRef<IFile>(); - romfs.OpenFile(ref binaryFile.Ref(), filePath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + romfs.OpenFile(ref binaryFile.Ref, filePath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); StreamReader reader = new StreamReader(binaryFile.Get.AsStream(), Encoding.Unicode); return CleanText(reader.ReadToEnd()); diff --git a/Ryujinx.HLE/HOS/ApplicationLoader.cs b/Ryujinx.HLE/HOS/ApplicationLoader.cs index 67e0a9c7b..82bd9b312 100644 --- a/Ryujinx.HLE/HOS/ApplicationLoader.cs +++ b/Ryujinx.HLE/HOS/ApplicationLoader.cs @@ -110,7 +110,7 @@ namespace Ryujinx.HLE.HOS { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = new Nca(fileSystem.KeySet, ncaFile.Release().AsStorage()); @@ -154,7 +154,7 @@ namespace Ryujinx.HLE.HOS { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = new Nca(fileSystem.KeySet, ncaFile.Release().AsStorage()); @@ -329,7 +329,7 @@ namespace Ryujinx.HLE.HOS using var npdmFile = 
new UniqueRef<IFile>(); - Result result = codeFs.OpenFile(ref npdmFile.Ref(), "/main.npdm".ToU8Span(), OpenMode.Read); + Result result = codeFs.OpenFile(ref npdmFile.Ref, "/main.npdm".ToU8Span(), OpenMode.Read); MetaLoader metaData; @@ -356,7 +356,7 @@ namespace Ryujinx.HLE.HOS using var nsoFile = new UniqueRef<IFile>(); - codeFs.OpenFile(ref nsoFile.Ref(), $"/{name}".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + codeFs.OpenFile(ref nsoFile.Ref, $"/{name}".ToU8Span(), OpenMode.Read).ThrowIfFailure(); nsos[i] = new NsoExecutable(nsoFile.Release().AsStorage(), name); } @@ -371,10 +371,10 @@ namespace Ryujinx.HLE.HOS ProgramInfo programInfo = new ProgramInfo(in npdm, displayVersion, usePtc, allowCodeMemoryForJit: false); ProgramLoader.LoadNsos(_device.System.KernelContext, metaData, programInfo, executables: programs); - string titleIdText = npdm.Aci.Value.ProgramId.Value.ToString("x16"); - bool titleIs64Bit = (npdm.Meta.Value.Flags & 1) != 0; + string titleIdText = npdm.Aci.ProgramId.Value.ToString("x16"); + bool titleIs64Bit = (npdm.Meta.Flags & 1) != 0; - string programName = Encoding.ASCII.GetString(npdm.Meta.Value.ProgramName).TrimEnd('\0'); + string programName = Encoding.ASCII.GetString(npdm.Meta.ProgramName).TrimEnd('\0'); Logger.Info?.Print(LogClass.Loader, $"Service Loaded: {programName} [{titleIdText}] [{(titleIs64Bit ? 
"64-bit" : "32-bit")}]"); } @@ -520,7 +520,7 @@ namespace Ryujinx.HLE.HOS { using var npdmFile = new UniqueRef<IFile>(); - Result result = fs.OpenFile(ref npdmFile.Ref(), "/main.npdm".ToU8Span(), OpenMode.Read); + Result result = fs.OpenFile(ref npdmFile.Ref, "/main.npdm".ToU8Span(), OpenMode.Read); MetaLoader metaData; @@ -543,8 +543,8 @@ namespace Ryujinx.HLE.HOS metaData.GetNpdm(out var npdm).ThrowIfFailure(); - TitleId = npdm.Aci.Value.ProgramId.Value; - TitleIs64Bit = (npdm.Meta.Value.Flags & 1) != 0; + TitleId = npdm.Aci.ProgramId.Value; + TitleIs64Bit = (npdm.Meta.Flags & 1) != 0; _device.System.LibHacHorizonManager.ArpIReader.ApplicationId = new LibHac.ApplicationId(TitleId); return metaData; @@ -555,7 +555,7 @@ namespace Ryujinx.HLE.HOS using var controlFile = new UniqueRef<IFile>(); IFileSystem controlFs = controlNca.OpenFileSystem(NcaSectionType.Data, device.System.FsIntegrityCheckLevel); - Result result = controlFs.OpenFile(ref controlFile.Ref(), "/control.nacp".ToU8Span(), OpenMode.Read); + Result result = controlFs.OpenFile(ref controlFile.Ref, "/control.nacp".ToU8Span(), OpenMode.Read); if (result.IsSuccess()) { @@ -603,7 +603,7 @@ namespace Ryujinx.HLE.HOS using var nsoFile = new UniqueRef<IFile>(); - codeFs.OpenFile(ref nsoFile.Ref(), $"/{name}".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + codeFs.OpenFile(ref nsoFile.Ref, $"/{name}".ToU8Span(), OpenMode.Read).ThrowIfFailure(); nsos[i] = new NsoExecutable(nsoFile.Release().AsStorage(), name); } @@ -752,7 +752,7 @@ namespace Ryujinx.HLE.HOS _titleName = programInfo.Name; TitleId = programInfo.ProgramId; - TitleIs64Bit = (npdm.Meta.Value.Flags & 1) != 0; + TitleIs64Bit = (npdm.Meta.Flags & 1) != 0; _device.System.LibHacHorizonManager.ArpIReader.ApplicationId = new LibHac.ApplicationId(TitleId); // Explicitly null titleid to disable the shader cache. 
@@ -798,7 +798,7 @@ namespace Ryujinx.HLE.HOS { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = new Nca(fileSystem.KeySet, ncaFile.Release().AsStorage()); diff --git a/Ryujinx.HLE/HOS/ModLoader.cs b/Ryujinx.HLE/HOS/ModLoader.cs index b6c9973f0..bf0f1f891 100644 --- a/Ryujinx.HLE/HOS/ModLoader.cs +++ b/Ryujinx.HLE/HOS/ModLoader.cs @@ -475,7 +475,7 @@ namespace Ryujinx.HLE.HOS { using var file = new UniqueRef<IFile>(); - baseRom.OpenFile(ref file.Ref(), entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + baseRom.OpenFile(ref file.Ref, entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); builder.AddFile(entry.FullPath, file.Release()); } @@ -494,7 +494,7 @@ namespace Ryujinx.HLE.HOS { using var file = new UniqueRef<IFile>(); - fs.OpenFile(ref file.Ref(), entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + fs.OpenFile(ref file.Ref, entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); if (fileSet.Add(entry.FullPath)) { builder.AddFile(entry.FullPath, file.Release()); diff --git a/Ryujinx.HLE/HOS/ProgramLoader.cs b/Ryujinx.HLE/HOS/ProgramLoader.cs index 1f6fd96d7..158ab701f 100644 --- a/Ryujinx.HLE/HOS/ProgramLoader.cs +++ b/Ryujinx.HLE/HOS/ProgramLoader.cs @@ -28,9 +28,9 @@ namespace Ryujinx.HLE.HOS public ProgramInfo(in Npdm npdm, string displayVersion, bool diskCacheEnabled, bool allowCodeMemoryForJit) { - ulong programId = npdm.Aci.Value.ProgramId.Value; + ulong programId = npdm.Aci.ProgramId.Value; - Name = StringUtils.Utf8ZToString(npdm.Meta.Value.ProgramName); + Name = StringUtils.Utf8ZToString(npdm.Meta.ProgramName); ProgramId = programId; TitleIdText = programId.ToString("x16"); DisplayVersion = displayVersion; @@ -193,7 +193,7 @@ namespace Ryujinx.HLE.HOS return ProgramLoadResult.Failed; } - ref readonly var meta = ref 
npdm.Meta.Value; + ref readonly var meta = ref npdm.Meta; ulong argsStart = 0; uint argsSize = 0; @@ -298,7 +298,7 @@ namespace Ryujinx.HLE.HOS KProcess process = new KProcess(context, programInfo.AllowCodeMemoryForJit); - MemoryRegion memoryRegion = (MemoryRegion)((npdm.Acid.Value.Flags >> 2) & 0xf); + MemoryRegion memoryRegion = (MemoryRegion)((npdm.Acid.Flags >> 2) & 0xf); if (memoryRegion > MemoryRegion.NvServices) { diff --git a/Ryujinx.HLE/HOS/Services/Account/Acc/AccountManager.cs b/Ryujinx.HLE/HOS/Services/Account/Acc/AccountManager.cs index 41d5028fb..f5364329d 100644 --- a/Ryujinx.HLE/HOS/Services/Account/Acc/AccountManager.cs +++ b/Ryujinx.HLE/HOS/Services/Account/Acc/AccountManager.cs @@ -183,7 +183,7 @@ namespace Ryujinx.HLE.HOS.Services.Account.Acc using var saveDataIterator = new UniqueRef<SaveDataIterator>(); - _horizonClient.Fs.OpenSaveDataIterator(ref saveDataIterator.Ref(), SaveDataSpaceId.User, in saveDataFilter).ThrowIfFailure(); + _horizonClient.Fs.OpenSaveDataIterator(ref saveDataIterator.Ref, SaveDataSpaceId.User, in saveDataFilter).ThrowIfFailure(); Span<SaveDataInfo> saveDataInfo = stackalloc SaveDataInfo[10]; diff --git a/Ryujinx.HLE/HOS/Services/Am/AppletOE/ApplicationProxyService/ApplicationProxy/IApplicationFunctions.cs b/Ryujinx.HLE/HOS/Services/Am/AppletOE/ApplicationProxyService/ApplicationProxy/IApplicationFunctions.cs index 49331e216..f8f88a1cb 100644 --- a/Ryujinx.HLE/HOS/Services/Am/AppletOE/ApplicationProxyService/ApplicationProxy/IApplicationFunctions.cs +++ b/Ryujinx.HLE/HOS/Services/Am/AppletOE/ApplicationProxyService/ApplicationProxy/IApplicationFunctions.cs @@ -1,9 +1,9 @@ using LibHac.Account; using LibHac.Common; using LibHac.Fs; -using LibHac.FsSystem; using LibHac.Ncm; using LibHac.Ns; +using LibHac.Tools.FsSystem.NcaUtils; using Ryujinx.Common; using Ryujinx.Common.Logging; using Ryujinx.HLE.Exceptions; diff --git a/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs b/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs 
index d4528efa7..b16ea4c18 100644 --- a/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs +++ b/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs @@ -54,11 +54,11 @@ namespace Ryujinx.HLE.HOS.Services.Bcat using var serv = new SharedRef<LibHac.Bcat.Impl.Ipc.IDeliveryCacheStorageService>(); - Result rc = _base.Get.CreateDeliveryCacheStorageService(ref serv.Ref(), pid); + Result rc = _base.Get.CreateDeliveryCacheStorageService(ref serv.Ref, pid); if (rc.IsSuccess()) { - MakeObject(context, new IDeliveryCacheStorageService(context, ref serv.Ref())); + MakeObject(context, new IDeliveryCacheStorageService(context, ref serv.Ref)); } return (ResultCode)rc.Value; @@ -72,11 +72,11 @@ namespace Ryujinx.HLE.HOS.Services.Bcat using var service = new SharedRef<LibHac.Bcat.Impl.Ipc.IDeliveryCacheStorageService>(); - Result rc = _base.Get.CreateDeliveryCacheStorageServiceWithApplicationId(ref service.Ref(), applicationId); + Result rc = _base.Get.CreateDeliveryCacheStorageServiceWithApplicationId(ref service.Ref, applicationId); if (rc.IsSuccess()) { - MakeObject(context, new IDeliveryCacheStorageService(context, ref service.Ref())); + MakeObject(context, new IDeliveryCacheStorageService(context, ref service.Ref)); } return (ResultCode)rc.Value; diff --git a/Ryujinx.HLE/HOS/Services/Bcat/ServiceCreator/IDeliveryCacheStorageService.cs b/Ryujinx.HLE/HOS/Services/Bcat/ServiceCreator/IDeliveryCacheStorageService.cs index 71d7aed70..32dd75d8e 100644 --- a/Ryujinx.HLE/HOS/Services/Bcat/ServiceCreator/IDeliveryCacheStorageService.cs +++ b/Ryujinx.HLE/HOS/Services/Bcat/ServiceCreator/IDeliveryCacheStorageService.cs @@ -20,11 +20,11 @@ namespace Ryujinx.HLE.HOS.Services.Bcat.ServiceCreator { using var service = new SharedRef<LibHac.Bcat.Impl.Ipc.IDeliveryCacheFileService>(); - Result result = _base.Get.CreateFileService(ref service.Ref()); + Result result = _base.Get.CreateFileService(ref service.Ref); if (result.IsSuccess()) { - MakeObject(context, new IDeliveryCacheFileService(ref 
service.Ref())); + MakeObject(context, new IDeliveryCacheFileService(ref service.Ref)); } return (ResultCode)result.Value; @@ -36,11 +36,11 @@ namespace Ryujinx.HLE.HOS.Services.Bcat.ServiceCreator { using var service = new SharedRef<LibHac.Bcat.Impl.Ipc.IDeliveryCacheDirectoryService>(); - Result result = _base.Get.CreateDirectoryService(ref service.Ref()); + Result result = _base.Get.CreateDirectoryService(ref service.Ref); if (result.IsSuccess()) { - MakeObject(context, new IDeliveryCacheDirectoryService(ref service.Ref())); + MakeObject(context, new IDeliveryCacheDirectoryService(ref service.Ref)); } return (ResultCode)result.Value; diff --git a/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/FileSystemProxyHelper.cs b/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/FileSystemProxyHelper.cs index 2afa34807..ba924db83 100644 --- a/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/FileSystemProxyHelper.cs +++ b/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/FileSystemProxyHelper.cs @@ -30,9 +30,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy ImportTitleKeysFromNsp(nsp.Get, context.Device.System.KeySet); - using SharedRef<LibHac.FsSrv.Sf.IFileSystem> adapter = FileSystemInterfaceAdapter.CreateShared(ref nsp.Ref(), true); + using SharedRef<LibHac.FsSrv.Sf.IFileSystem> adapter = FileSystemInterfaceAdapter.CreateShared(ref nsp.Ref, true); - openedFileSystem = new IFileSystem(ref adapter.Ref()); + openedFileSystem = new IFileSystem(ref adapter.Ref); } catch (HorizonResultException ex) { @@ -58,9 +58,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy LibHac.Fs.Fsa.IFileSystem fileSystem = nca.OpenFileSystem(NcaSectionType.Data, context.Device.System.FsIntegrityCheckLevel); using var sharedFs = new SharedRef<LibHac.Fs.Fsa.IFileSystem>(fileSystem); - using SharedRef<LibHac.FsSrv.Sf.IFileSystem> adapter = FileSystemInterfaceAdapter.CreateShared(ref sharedFs.Ref(), true); + using SharedRef<LibHac.FsSrv.Sf.IFileSystem> adapter = FileSystemInterfaceAdapter.CreateShared(ref 
sharedFs.Ref, true); - openedFileSystem = new IFileSystem(ref adapter.Ref()); + openedFileSystem = new IFileSystem(ref adapter.Ref); } catch (HorizonResultException ex) { @@ -98,7 +98,7 @@ namespace Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy using var ncaFile = new UniqueRef<LibHac.Fs.Fsa.IFile>(); - Result result = nsp.OpenFile(ref ncaFile.Ref(), filename.ToU8Span(), OpenMode.Read); + Result result = nsp.OpenFile(ref ncaFile.Ref, filename.ToU8Span(), OpenMode.Read); if (result.IsFailure()) { return (ResultCode)result.Value; @@ -121,13 +121,17 @@ namespace Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy { using var ticketFile = new UniqueRef<LibHac.Fs.Fsa.IFile>(); - Result result = nsp.OpenFile(ref ticketFile.Ref(), ticketEntry.FullPath.ToU8Span(), OpenMode.Read); + Result result = nsp.OpenFile(ref ticketFile.Ref, ticketEntry.FullPath.ToU8Span(), OpenMode.Read); if (result.IsSuccess()) { Ticket ticket = new Ticket(ticketFile.Get.AsStream()); + var titleKey = ticket.GetTitleKey(keySet); - keySet.ExternalKeySet.Add(new RightsId(ticket.RightsId), new AccessKey(ticket.GetTitleKey(keySet))); + if (titleKey != null) + { + keySet.ExternalKeySet.Add(new RightsId(ticket.RightsId), new AccessKey(titleKey)); + } } } } diff --git a/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/IFileSystem.cs b/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/IFileSystem.cs index d68ef3952..623f1371e 100644 --- a/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/IFileSystem.cs +++ b/Ryujinx.HLE/HOS/Services/Fs/FileSystemProxy/IFileSystem.cs @@ -111,11 +111,11 @@ namespace Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy ref readonly Path name = ref FileSystemProxyHelper.GetSfPath(context); using var file = new SharedRef<LibHac.FsSrv.Sf.IFile>(); - Result result = _fileSystem.Get.OpenFile(ref file.Ref(), in name, mode); + Result result = _fileSystem.Get.OpenFile(ref file.Ref, in name, mode); if (result.IsSuccess()) { - IFile fileInterface = new IFile(ref file.Ref()); + IFile fileInterface = new IFile(ref file.Ref); 
MakeObject(context, fileInterface); } @@ -132,11 +132,11 @@ namespace Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy ref readonly Path name = ref FileSystemProxyHelper.GetSfPath(context); using var dir = new SharedRef<LibHac.FsSrv.Sf.IDirectory>(); - Result result = _fileSystem.Get.OpenDirectory(ref dir.Ref(), name, mode); + Result result = _fileSystem.Get.OpenDirectory(ref dir.Ref, name, mode); if (result.IsSuccess()) { - IDirectory dirInterface = new IDirectory(ref dir.Ref()); + IDirectory dirInterface = new IDirectory(ref dir.Ref); MakeObject(context, dirInterface); } diff --git a/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs b/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs index a4bc62540..e43b1cad0 100644 --- a/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs +++ b/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs @@ -109,10 +109,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs ref readonly var path = ref FileSystemProxyHelper.GetFspPath(context); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenBisFileSystem(ref fileSystem.Ref(), in path, bisPartitionId); + Result result = _baseFileSystemProxy.Get.OpenBisFileSystem(ref fileSystem.Ref, in path, bisPartitionId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -124,10 +124,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs BisPartitionId bisPartitionId = (BisPartitionId)context.RequestData.ReadInt32(); using var storage = new SharedRef<IStorage>(); - Result result = _baseFileSystemProxy.Get.OpenBisStorage(ref storage.Ref(), bisPartitionId); + Result result = _baseFileSystemProxy.Get.OpenBisStorage(ref storage.Ref, bisPartitionId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IStorage(ref storage.Ref())); + MakeObject(context, new 
FileSystemProxy.IStorage(ref storage.Ref)); return ResultCode.Success; } @@ -145,10 +145,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs { using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenSdCardFileSystem(ref fileSystem.Ref()); + Result result = _baseFileSystemProxy.Get.OpenSdCardFileSystem(ref fileSystem.Ref); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -247,10 +247,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs GameCardPartitionRaw partitionId = (GameCardPartitionRaw)context.RequestData.ReadInt32(); using var storage = new SharedRef<IStorage>(); - Result result = _baseFileSystemProxy.Get.OpenGameCardStorage(ref storage.Ref(), handle, partitionId); + Result result = _baseFileSystemProxy.Get.OpenGameCardStorage(ref storage.Ref, handle, partitionId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IStorage(ref storage.Ref())); + MakeObject(context, new FileSystemProxy.IStorage(ref storage.Ref)); return ResultCode.Success; } @@ -263,10 +263,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs GameCardPartition partitionId = (GameCardPartition)context.RequestData.ReadInt32(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenGameCardFileSystem(ref fileSystem.Ref(), handle, partitionId); + Result result = _baseFileSystemProxy.Get.OpenGameCardFileSystem(ref fileSystem.Ref, handle, partitionId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -338,10 +338,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs SaveDataAttribute attribute = 
context.RequestData.ReadStruct<SaveDataAttribute>(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataFileSystem(ref fileSystem.Ref(), spaceId, in attribute); + Result result = _baseFileSystemProxy.Get.OpenSaveDataFileSystem(ref fileSystem.Ref, spaceId, in attribute); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -354,10 +354,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs SaveDataAttribute attribute = context.RequestData.ReadStruct<SaveDataAttribute>(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataFileSystemBySystemSaveDataId(ref fileSystem.Ref(), spaceId, in attribute); + Result result = _baseFileSystemProxy.Get.OpenSaveDataFileSystemBySystemSaveDataId(ref fileSystem.Ref, spaceId, in attribute); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -370,10 +370,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs SaveDataAttribute attribute = context.RequestData.ReadStruct<SaveDataAttribute>(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenReadOnlySaveDataFileSystem(ref fileSystem.Ref(), spaceId, in attribute); + Result result = _baseFileSystemProxy.Get.OpenReadOnlySaveDataFileSystem(ref fileSystem.Ref, spaceId, in attribute); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -432,10 +432,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs { 
using var infoReader = new SharedRef<LibHac.FsSrv.Sf.ISaveDataInfoReader>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReader(ref infoReader.Ref()); + Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReader(ref infoReader.Ref); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref())); + MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref)); return ResultCode.Success; } @@ -447,10 +447,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs SaveDataSpaceId spaceId = (SaveDataSpaceId)context.RequestData.ReadByte(); using var infoReader = new SharedRef<LibHac.FsSrv.Sf.ISaveDataInfoReader>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReaderBySaveDataSpaceId(ref infoReader.Ref(), spaceId); + Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReaderBySaveDataSpaceId(ref infoReader.Ref, spaceId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref())); + MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref)); return ResultCode.Success; } @@ -461,10 +461,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs { using var infoReader = new SharedRef<LibHac.FsSrv.Sf.ISaveDataInfoReader>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReaderOnlyCacheStorage(ref infoReader.Ref()); + Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReaderOnlyCacheStorage(ref infoReader.Ref); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref())); + MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref)); return ResultCode.Success; } @@ -477,10 +477,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs ulong saveDataId = context.RequestData.ReadUInt64(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataInternalStorageFileSystem(ref fileSystem.Ref(), spaceId, 
saveDataId); + Result result = _baseFileSystemProxy.Get.OpenSaveDataInternalStorageFileSystem(ref fileSystem.Ref, spaceId, saveDataId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -537,10 +537,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs SaveDataFilter filter = context.RequestData.ReadStruct<SaveDataFilter>(); using var infoReader = new SharedRef<LibHac.FsSrv.Sf.ISaveDataInfoReader>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReaderWithFilter(ref infoReader.Ref(), spaceId, in filter); + Result result = _baseFileSystemProxy.Get.OpenSaveDataInfoReaderWithFilter(ref infoReader.Ref, spaceId, in filter); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref())); + MakeObject(context, new ISaveDataInfoReader(ref infoReader.Ref)); return ResultCode.Success; } @@ -605,10 +605,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs SaveDataAttribute attribute = context.RequestData.ReadStruct<SaveDataAttribute>(); using var file = new SharedRef<LibHac.FsSrv.Sf.IFile>(); - Result result = _baseFileSystemProxy.Get.OpenSaveDataMetaFile(ref file.Ref(), spaceId, in attribute, metaType); + Result result = _baseFileSystemProxy.Get.OpenSaveDataMetaFile(ref file.Ref, spaceId, in attribute, metaType); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new IFile(ref file.Ref())); + MakeObject(context, new IFile(ref file.Ref)); return ResultCode.Success; } @@ -637,10 +637,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs ImageDirectoryId directoryId = (ImageDirectoryId)context.RequestData.ReadInt32(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenImageDirectoryFileSystem(ref fileSystem.Ref(), directoryId); + Result result = 
_baseFileSystemProxy.Get.OpenImageDirectoryFileSystem(ref fileSystem.Ref, directoryId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -651,10 +651,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs BaseFileSystemId fileSystemId = (BaseFileSystemId)context.RequestData.ReadInt32(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenBaseFileSystem(ref fileSystem.Ref(), fileSystemId); + Result result = _baseFileSystemProxy.Get.OpenBaseFileSystem(ref fileSystem.Ref, fileSystemId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -665,10 +665,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs ContentStorageId contentStorageId = (ContentStorageId)context.RequestData.ReadInt32(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenContentStorageFileSystem(ref fileSystem.Ref(), contentStorageId); + Result result = _baseFileSystemProxy.Get.OpenContentStorageFileSystem(ref fileSystem.Ref, contentStorageId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -679,10 +679,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs CloudBackupWorkStorageId storageId = (CloudBackupWorkStorageId)context.RequestData.ReadInt32(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenCloudBackupWorkStorageFileSystem(ref fileSystem.Ref(), storageId); + Result result = 
_baseFileSystemProxy.Get.OpenCloudBackupWorkStorageFileSystem(ref fileSystem.Ref, storageId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -693,10 +693,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs CustomStorageId customStorageId = (CustomStorageId)context.RequestData.ReadInt32(); using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenCustomStorageFileSystem(ref fileSystem.Ref(), customStorageId); + Result result = _baseFileSystemProxy.Get.OpenCustomStorageFileSystem(ref fileSystem.Ref, customStorageId); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -707,9 +707,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs { var storage = context.Device.FileSystem.GetRomFs(_pid).AsStorage(true); using var sharedStorage = new SharedRef<LibHac.Fs.IStorage>(storage); - using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref())); + using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref)); - MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref())); + MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref)); return ResultCode.Success; } @@ -730,9 +730,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs var storage = context.Device.FileSystem.ModLoader.ApplyRomFsMods(titleId, aocStorage); using var sharedStorage = new SharedRef<LibHac.Fs.IStorage>(storage); - using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref())); + using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref)); - 
MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref())); + MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref)); return ResultCode.Success; } @@ -765,9 +765,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs Nca nca = new Nca(context.Device.System.KeySet, ncaStorage); LibHac.Fs.IStorage romfsStorage = nca.OpenStorage(NcaSectionType.Data, context.Device.System.FsIntegrityCheckLevel); using var sharedStorage = new SharedRef<LibHac.Fs.IStorage>(romfsStorage); - using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref())); + using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref)); - MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref())); + MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref)); } catch (HorizonResultException ex) { @@ -796,9 +796,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs { var storage = context.Device.FileSystem.GetRomFs(_pid).AsStorage(true); using var sharedStorage = new SharedRef<LibHac.Fs.IStorage>(storage); - using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref())); + using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref)); - MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref())); + MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref)); return ResultCode.Success; } @@ -816,9 +816,9 @@ namespace Ryujinx.HLE.HOS.Services.Fs var storage = context.Device.FileSystem.GetRomFs(_pid).AsStorage(true); using var sharedStorage = new SharedRef<LibHac.Fs.IStorage>(storage); - using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref())); + using var sfStorage = new SharedRef<IStorage>(new StorageInterfaceAdapter(ref sharedStorage.Ref)); - MakeObject(context, new FileSystemProxy.IStorage(ref sfStorage.Ref())); + MakeObject(context, new FileSystemProxy.IStorage(ref 
sfStorage.Ref)); return ResultCode.Success; } @@ -829,10 +829,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs { using var deviceOperator = new SharedRef<LibHac.FsSrv.Sf.IDeviceOperator>(); - Result result = _baseFileSystemProxy.Get.OpenDeviceOperator(ref deviceOperator.Ref()); + Result result = _baseFileSystemProxy.Get.OpenDeviceOperator(ref deviceOperator.Ref); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new IDeviceOperator(ref deviceOperator.Ref())); + MakeObject(context, new IDeviceOperator(ref deviceOperator.Ref)); return ResultCode.Success; } @@ -1195,10 +1195,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs { using var fileSystem = new SharedRef<IFileSystem>(); - Result result = _baseFileSystemProxy.Get.OpenRegisteredUpdatePartition(ref fileSystem.Ref()); + Result result = _baseFileSystemProxy.Get.OpenRegisteredUpdatePartition(ref fileSystem.Ref); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref())); + MakeObject(context, new FileSystemProxy.IFileSystem(ref fileSystem.Ref)); return ResultCode.Success; } @@ -1290,10 +1290,10 @@ namespace Ryujinx.HLE.HOS.Services.Fs { using var commitManager = new SharedRef<LibHac.FsSrv.Sf.IMultiCommitManager>(); - Result result = _baseFileSystemProxy.Get.OpenMultiCommitManager(ref commitManager.Ref()); + Result result = _baseFileSystemProxy.Get.OpenMultiCommitManager(ref commitManager.Ref); if (result.IsFailure()) return (ResultCode)result.Value; - MakeObject(context, new IMultiCommitManager(ref commitManager.Ref())); + MakeObject(context, new IMultiCommitManager(ref commitManager.Ref)); return ResultCode.Success; } diff --git a/Ryujinx.HLE/HOS/Services/Fs/IMultiCommitManager.cs b/Ryujinx.HLE/HOS/Services/Fs/IMultiCommitManager.cs index 4c28117bd..1a85e1b2d 100644 --- a/Ryujinx.HLE/HOS/Services/Fs/IMultiCommitManager.cs +++ b/Ryujinx.HLE/HOS/Services/Fs/IMultiCommitManager.cs @@ -19,7 +19,7 @@ namespace 
Ryujinx.HLE.HOS.Services.Fs { using SharedRef<LibHac.FsSrv.Sf.IFileSystem> fileSystem = GetObject<IFileSystem>(context, 0).GetBaseFileSystem(); - Result result = _baseCommitManager.Get.Add(ref fileSystem.Ref()); + Result result = _baseCommitManager.Get.Add(ref fileSystem.Ref); return (ResultCode)result.Value; } diff --git a/Ryujinx.HLE/HOS/Services/Mii/Types/StoreData.cs b/Ryujinx.HLE/HOS/Services/Mii/Types/StoreData.cs index 31c46bc09..8411693f4 100644 --- a/Ryujinx.HLE/HOS/Services/Mii/Types/StoreData.cs +++ b/Ryujinx.HLE/HOS/Services/Mii/Types/StoreData.cs @@ -1,7 +1,5 @@ -using LibHac.Common; -using Ryujinx.HLE.Utilities; +using Ryujinx.Common.Utilities; using System; -using System.Diagnostics; using System.Runtime.InteropServices; namespace Ryujinx.HLE.HOS.Services.Mii.Types @@ -78,7 +76,7 @@ namespace Ryujinx.HLE.HOS.Services.Mii.Types private ReadOnlySpan<byte> AsSpan() { - return MemoryMarshal.AsBytes(SpanHelpers.CreateReadOnlySpan(in this, 1)); + return SpanHelpers.AsReadOnlyByteSpan(ref this); } private ReadOnlySpan<byte> AsSpanWithoutDeviceCrc() diff --git a/Ryujinx.HLE/HOS/Services/Ncm/Lr/LocationResolverManager/ILocationResolver.cs b/Ryujinx.HLE/HOS/Services/Ncm/Lr/LocationResolverManager/ILocationResolver.cs index 0767b148f..d97bd009b 100644 --- a/Ryujinx.HLE/HOS/Services/Ncm/Lr/LocationResolverManager/ILocationResolver.cs +++ b/Ryujinx.HLE/HOS/Services/Ncm/Lr/LocationResolverManager/ILocationResolver.cs @@ -1,5 +1,5 @@ -using LibHac.FsSystem; -using LibHac.Ncm; +using LibHac.Ncm; +using LibHac.Tools.FsSystem.NcaUtils; using Ryujinx.HLE.FileSystem; using System.Text; diff --git a/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/ProxySetting.cs b/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/ProxySetting.cs index 827520f15..6e534fe1c 100644 --- a/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/ProxySetting.cs +++ b/Ryujinx.HLE/HOS/Services/Nifm/StaticService/Types/ProxySetting.cs @@ -1,5 +1,5 @@ -using LibHac.Common; -using 
Ryujinx.Common.Memory; +using Ryujinx.Common.Memory; +using Ryujinx.Common.Utilities; using System; using System.Runtime.InteropServices; diff --git a/Ryujinx.HLE/HOS/Services/Sdb/Pl/SharedFontManager.cs b/Ryujinx.HLE/HOS/Services/Sdb/Pl/SharedFontManager.cs index f1ef6a2f0..66a69a8be 100644 --- a/Ryujinx.HLE/HOS/Services/Sdb/Pl/SharedFontManager.cs +++ b/Ryujinx.HLE/HOS/Services/Sdb/Pl/SharedFontManager.cs @@ -77,7 +77,7 @@ namespace Ryujinx.HLE.HOS.Services.Sdb.Pl using var fontFile = new UniqueRef<IFile>(); - romfs.OpenFile(ref fontFile.Ref(), ("/" + fontFilename).ToU8Span(), OpenMode.Read).ThrowIfFailure(); + romfs.OpenFile(ref fontFile.Ref, ("/" + fontFilename).ToU8Span(), OpenMode.Read).ThrowIfFailure(); data = DecryptFont(fontFile.Get.AsStream()); } diff --git a/Ryujinx.HLE/HOS/Services/Settings/ISystemSettingsServer.cs b/Ryujinx.HLE/HOS/Services/Settings/ISystemSettingsServer.cs index 7f32ce6bd..bae10d0b0 100644 --- a/Ryujinx.HLE/HOS/Services/Settings/ISystemSettingsServer.cs +++ b/Ryujinx.HLE/HOS/Services/Settings/ISystemSettingsServer.cs @@ -321,7 +321,7 @@ namespace Ryujinx.HLE.HOS.Services.Settings using var firmwareFile = new UniqueRef<IFile>(); - Result result = firmwareRomFs.OpenFile(ref firmwareFile.Ref(), "/file".ToU8Span(), OpenMode.Read); + Result result = firmwareRomFs.OpenFile(ref firmwareFile.Ref, "/file".ToU8Span(), OpenMode.Read); if (result.IsFailure()) { return null; diff --git a/Ryujinx.HLE/HOS/Services/Ssl/BuiltInCertificateManager.cs b/Ryujinx.HLE/HOS/Services/Ssl/BuiltInCertificateManager.cs index a164c7455..abbc13541 100644 --- a/Ryujinx.HLE/HOS/Services/Ssl/BuiltInCertificateManager.cs +++ b/Ryujinx.HLE/HOS/Services/Ssl/BuiltInCertificateManager.cs @@ -133,14 +133,14 @@ namespace Ryujinx.HLE.HOS.Services.Ssl using var trustedCertsFileRef = new UniqueRef<IFile>(); - Result result = romfs.OpenFile(ref trustedCertsFileRef.Ref(), "/ssl_TrustedCerts.bdf".ToU8Span(), OpenMode.Read); + Result result = romfs.OpenFile(ref 
trustedCertsFileRef.Ref, "/ssl_TrustedCerts.bdf".ToU8Span(), OpenMode.Read); if (!result.IsSuccess()) { // [1.0.0 - 2.3.0] if (ResultFs.PathNotFound.Includes(result)) { - result = romfs.OpenFile(ref trustedCertsFileRef.Ref(), "/ssl_TrustedCerts.tcf".ToU8Span(), OpenMode.Read); + result = romfs.OpenFile(ref trustedCertsFileRef.Ref, "/ssl_TrustedCerts.tcf".ToU8Span(), OpenMode.Read); } if (result.IsFailure()) diff --git a/Ryujinx.HLE/HOS/Services/Time/TimeZone/TimeZoneContentManager.cs b/Ryujinx.HLE/HOS/Services/Time/TimeZone/TimeZoneContentManager.cs index f4b3a9590..69ed56d45 100644 --- a/Ryujinx.HLE/HOS/Services/Time/TimeZone/TimeZoneContentManager.cs +++ b/Ryujinx.HLE/HOS/Services/Time/TimeZone/TimeZoneContentManager.cs @@ -97,7 +97,7 @@ namespace Ryujinx.HLE.HOS.Services.Time.TimeZone using var binaryListFile = new UniqueRef<IFile>(); - romfs.OpenFile(ref binaryListFile.Ref(), "/binaryList.txt".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + romfs.OpenFile(ref binaryListFile.Ref, "/binaryList.txt".ToU8Span(), OpenMode.Read).ThrowIfFailure(); StreamReader reader = new StreamReader(binaryListFile.Get.AsStream()); @@ -143,7 +143,7 @@ namespace Ryujinx.HLE.HOS.Services.Time.TimeZone using var tzif = new UniqueRef<IFile>(); - if (romfs.OpenFile(ref tzif.Ref(), $"/zoneinfo/{locName}".ToU8Span(), OpenMode.Read).IsFailure()) + if (romfs.OpenFile(ref tzif.Ref, $"/zoneinfo/{locName}".ToU8Span(), OpenMode.Read).IsFailure()) { Logger.Error?.Print(LogClass.ServiceTime, $"Error opening /zoneinfo/{locName}"); continue; @@ -273,7 +273,7 @@ namespace Ryujinx.HLE.HOS.Services.Time.TimeZone using var timeZoneBinaryFile = new UniqueRef<IFile>(); - Result result = romfs.OpenFile(ref timeZoneBinaryFile.Ref(), $"/zoneinfo/{locationName}".ToU8Span(), OpenMode.Read); + Result result = romfs.OpenFile(ref timeZoneBinaryFile.Ref, $"/zoneinfo/{locationName}".ToU8Span(), OpenMode.Read); timeZoneBinaryStream = timeZoneBinaryFile.Release().AsStream(); diff --git 
a/Ryujinx.Ui.Common/App/ApplicationLibrary.cs b/Ryujinx.Ui.Common/App/ApplicationLibrary.cs index 951516c08..43510d5ec 100644 --- a/Ryujinx.Ui.Common/App/ApplicationLibrary.cs +++ b/Ryujinx.Ui.Common/App/ApplicationLibrary.cs @@ -72,7 +72,7 @@ namespace Ryujinx.Ui.App.Common { using UniqueRef<IFile> controlFile = new(); - controlFs.OpenFile(ref controlFile.Ref(), "/control.nacp".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlFs.OpenFile(ref controlFile.Ref, "/control.nacp".ToU8Span(), OpenMode.Read).ThrowIfFailure(); controlFile.Get.Read(out _, 0, outProperty, ReadOption.None).ThrowIfFailure(); } @@ -178,7 +178,7 @@ namespace Ryujinx.Ui.App.Common { using UniqueRef<IFile> ncaFile = new(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = new(_virtualFileSystem.KeySet, ncaFile.Get.AsStorage()); int dataIndex = Nca.GetSectionIndexFromType(NcaSectionType.Data, NcaContentType.Program); @@ -211,7 +211,7 @@ namespace Ryujinx.Ui.App.Common using UniqueRef<IFile> npdmFile = new(); - Result result = pfs.OpenFile(ref npdmFile.Ref(), "/main.npdm".ToU8Span(), OpenMode.Read); + Result result = pfs.OpenFile(ref npdmFile.Ref, "/main.npdm".ToU8Span(), OpenMode.Read); if (ResultFs.PathNotFound.Includes(result)) { @@ -241,7 +241,7 @@ namespace Ryujinx.Ui.App.Common { using UniqueRef<IFile> icon = new(); - controlFs.OpenFile(ref icon.Ref(), $"/icon_{_desiredTitleLanguage}.dat".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlFs.OpenFile(ref icon.Ref, $"/icon_{_desiredTitleLanguage}.dat".ToU8Span(), OpenMode.Read).ThrowIfFailure(); using MemoryStream stream = new(); @@ -259,7 +259,7 @@ namespace Ryujinx.Ui.App.Common using var icon = new UniqueRef<IFile>(); - controlFs.OpenFile(ref icon.Ref(), entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlFs.OpenFile(ref icon.Ref, entry.FullPath.ToU8Span(), 
OpenMode.Read).ThrowIfFailure(); using MemoryStream stream = new(); @@ -572,7 +572,7 @@ namespace Ryujinx.Ui.App.Common { using var icon = new UniqueRef<IFile>(); - controlFs.OpenFile(ref icon.Ref(), $"/icon_{_desiredTitleLanguage}.dat".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlFs.OpenFile(ref icon.Ref, $"/icon_{_desiredTitleLanguage}.dat".ToU8Span(), OpenMode.Read).ThrowIfFailure(); using MemoryStream stream = new(); @@ -590,7 +590,7 @@ namespace Ryujinx.Ui.App.Common using var icon = new UniqueRef<IFile>(); - controlFs.OpenFile(ref icon.Ref(), entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlFs.OpenFile(ref icon.Ref, entry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); using (MemoryStream stream = new()) { diff --git a/Ryujinx/Ui/MainWindow.cs b/Ryujinx/Ui/MainWindow.cs index 5051fb5f6..6d3d4aad6 100644 --- a/Ryujinx/Ui/MainWindow.cs +++ b/Ryujinx/Ui/MainWindow.cs @@ -2,10 +2,10 @@ using Gtk; using LibHac.Common; using LibHac.Common.Keys; -using LibHac.FsSystem; using LibHac.Ncm; using LibHac.Ns; using LibHac.Tools.FsSystem; +using LibHac.Tools.FsSystem.NcaUtils; using Ryujinx.Audio.Backends.Dummy; using Ryujinx.Audio.Backends.OpenAL; using Ryujinx.Audio.Backends.SDL2; diff --git a/Ryujinx/Ui/Widgets/GameTableContextMenu.cs b/Ryujinx/Ui/Widgets/GameTableContextMenu.cs index e45509861..a63d68ff2 100644 --- a/Ryujinx/Ui/Widgets/GameTableContextMenu.cs +++ b/Ryujinx/Ui/Widgets/GameTableContextMenu.cs @@ -224,7 +224,7 @@ namespace Ryujinx.Ui.Widgets { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = new Nca(_virtualFileSystem.KeySet, ncaFile.Release().AsStorage()); @@ -280,8 +280,8 @@ namespace Ryujinx.Ui.Widgets using var uniqueSourceFs = new UniqueRef<IFileSystem>(ncaFileSystem); using var uniqueOutputFs = new 
UniqueRef<IFileSystem>(new LocalFileSystem(destination)); - fsClient.Register(source.ToU8Span(), ref uniqueSourceFs.Ref()); - fsClient.Register(output.ToU8Span(), ref uniqueOutputFs.Ref()); + fsClient.Register(source.ToU8Span(), ref uniqueSourceFs.Ref); + fsClient.Register(output.ToU8Span(), ref uniqueOutputFs.Ref); (Result? resultCode, bool canceled) = CopyDirectory(fsClient, $"{source}:/", $"{output}:/"); diff --git a/Ryujinx/Ui/Windows/AvatarWindow.cs b/Ryujinx/Ui/Windows/AvatarWindow.cs index c715907d7..fc928bde2 100644 --- a/Ryujinx/Ui/Windows/AvatarWindow.cs +++ b/Ryujinx/Ui/Windows/AvatarWindow.cs @@ -134,7 +134,7 @@ namespace Ryujinx.Ui.Windows { using var file = new UniqueRef<IFile>(); - romfs.OpenFile(ref file.Ref(), ("/" + item.FullPath).ToU8Span(), OpenMode.Read).ThrowIfFailure(); + romfs.OpenFile(ref file.Ref, ("/" + item.FullPath).ToU8Span(), OpenMode.Read).ThrowIfFailure(); using (MemoryStream stream = new MemoryStream()) using (MemoryStream streamPng = new MemoryStream()) diff --git a/Ryujinx/Ui/Windows/DlcWindow.cs b/Ryujinx/Ui/Windows/DlcWindow.cs index 0a97ac2a2..9fccec195 100644 --- a/Ryujinx/Ui/Windows/DlcWindow.cs +++ b/Ryujinx/Ui/Windows/DlcWindow.cs @@ -93,7 +93,7 @@ namespace Ryujinx.Ui.Windows { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), dlcNca.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, dlcNca.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = TryCreateNca(ncaFile.Get.AsStorage(), dlcContainer.ContainerPath); if (nca != null) @@ -161,7 +161,7 @@ namespace Ryujinx.Ui.Windows { using var ncaFile = new UniqueRef<IFile>(); - pfs.OpenFile(ref ncaFile.Ref(), fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); + pfs.OpenFile(ref ncaFile.Ref, fileEntry.FullPath.ToU8Span(), OpenMode.Read).ThrowIfFailure(); Nca nca = TryCreateNca(ncaFile.Get.AsStorage(), containerPath); diff --git a/Ryujinx/Ui/Windows/TitleUpdateWindow.cs 
b/Ryujinx/Ui/Windows/TitleUpdateWindow.cs index 2618168cd..4aea58955 100644 --- a/Ryujinx/Ui/Windows/TitleUpdateWindow.cs +++ b/Ryujinx/Ui/Windows/TitleUpdateWindow.cs @@ -102,7 +102,7 @@ namespace Ryujinx.Ui.Windows using var nacpFile = new UniqueRef<IFile>(); - controlNca.OpenFileSystem(NcaSectionType.Data, IntegrityCheckLevel.None).OpenFile(ref nacpFile.Ref(), "/control.nacp".ToU8Span(), OpenMode.Read).ThrowIfFailure(); + controlNca.OpenFileSystem(NcaSectionType.Data, IntegrityCheckLevel.None).OpenFile(ref nacpFile.Ref, "/control.nacp".ToU8Span(), OpenMode.Read).ThrowIfFailure(); nacpFile.Get.Read(out _, 0, SpanHelpers.AsByteSpan(ref controlData), ReadOption.None).ThrowIfFailure(); RadioButton radioButton = new RadioButton($"Version {controlData.DisplayVersionString.ToString()} - {path}"); From dba908dc788c639f9c9d2e85108389efc031aac6 Mon Sep 17 00:00:00 2001 From: MetrosexualGarbodor <79612681+MetrosexualGarbodor@users.noreply.github.com> Date: Sat, 4 Mar 2023 02:15:29 +0000 Subject: [PATCH 38/41] Add post processing feature to the readme (#4499) * Add post processing feature to the readme Adds post processing information to the GPU section in the readme. * correct "Anti-Aliasing" --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0f11daf15..fdb29a481 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Ryujinx system files are stored in the `Ryujinx` folder. This folder is located - **GPU** - The GPU emulator emulates the Switch's Maxwell GPU using either the OpenGL (version 4.5 minimum), Vulkan, or Metal (via MoltenVK) APIs through a custom build of OpenTK or Silk.NET respectively. There are currently four graphics enhancements available to the end user in Ryujinx: Disk Shader Caching, Resolution Scaling, Aspect Ratio Adjustment, and Anisotropic Filtering. These enhancements can be adjusted or toggled as desired in the GUI. 
+ The GPU emulator emulates the Switch's Maxwell GPU using either the OpenGL (version 4.5 minimum), Vulkan, or Metal (via MoltenVK) APIs through a custom build of OpenTK or Silk.NET respectively. There are currently six graphics enhancements available to the end user in Ryujinx: Disk Shader Caching, Resolution Scaling, Anti-Aliasing, Scaling Filters (including FSR), Anisotropic Filtering and Aspect Ratio Adjustment. These enhancements can be adjusted or toggled as desired in the GUI. - **Input** From 155736c9863ed90c3ffa177266f67a0bdaa63fd1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Mar 2023 11:32:30 +0000 Subject: [PATCH 39/41] nuget: bump UnicornEngine.Unicorn (#4500) Bumps [UnicornEngine.Unicorn](https://github.com/unicorn-engine/unicorn) from 2.0.2-rc1-a913199 to 2.0.2-rc1-f7c841d. - [Release notes](https://github.com/unicorn-engine/unicorn/releases) - [Changelog](https://github.com/unicorn-engine/unicorn/blob/master/ChangeLog) - [Commits](https://github.com/unicorn-engine/unicorn/commits) --- updated-dependencies: - dependency-name: UnicornEngine.Unicorn dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Directory.Packages.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index b46b77e70..eef982774 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -49,7 +49,7 @@ <PackageVersion Include="System.Management" Version="7.0.0" /> <PackageVersion Include="System.Net.NameResolution" Version="4.3.0" /> <PackageVersion Include="System.Threading.ThreadPool" Version="4.3.0" /> - <PackageVersion Include="UnicornEngine.Unicorn" Version="2.0.2-rc1-a913199" /> + <PackageVersion Include="UnicornEngine.Unicorn" Version="2.0.2-rc1-f7c841d" /> <PackageVersion Include="XamlNameReferenceGenerator" Version="1.5.1" /> </ItemGroup> </Project> From 4f3af839be4134ed63dbd705758714bd0fbba9ef Mon Sep 17 00:00:00 2001 From: gdkchan <gab.dark.100@gmail.com> Date: Sat, 4 Mar 2023 10:43:08 -0300 Subject: [PATCH 40/41] Minor code formatting (#4498) --- ARMeilleure/Instructions/InstEmitHash32.cs | 3 +-- ARMeilleure/Instructions/InstEmitHashHelper.cs | 3 +-- .../SoundIoHardwareDeviceDriver.cs | 3 +-- .../Renderer/Dsp/Command/CompressorCommand.cs | 6 +++--- .../Renderer/Server/Effect/CompressorEffect.cs | 2 +- .../UI/Controls/NavigationDialogHost.axaml.cs | 2 +- Ryujinx.Ava/UI/Helpers/LoggerAdapter.cs | 2 +- Ryujinx.Ava/UI/ViewModels/AmiiboWindowViewModel.cs | 4 +--- .../UI/ViewModels/ControllerSettingsViewModel.cs | 3 --- Ryujinx.Ava/UI/Views/User/UserEditorView.axaml.cs | 2 +- Ryujinx.Common/SystemInfo/LinuxSystemInfo.cs | 2 +- Ryujinx.Common/SystemInfo/MacOSSystemInfo.cs | 2 +- Ryujinx.Common/SystemInfo/SystemInfo.cs | 4 ++-- Ryujinx.Common/SystemInfo/WindowsSystemInfo.cs | 2 +- Ryujinx.Common/SystemInterop/StdErrAdapter.cs | 4 ++-- .../Shader/ShaderSpecializationState.cs | 2 +- Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs | 4 ++-- 
Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs | 4 ++-- Ryujinx.Graphics.OpenGL/PersistentBuffers.cs | 8 ++++---- .../CodeGen/Spirv/SpirvDelegates.cs | 6 +++--- .../CodeGen/Spirv/SpirvGenerator.cs | 3 +-- Ryujinx.Graphics.Texture/BC7Decoder.cs | 2 +- Ryujinx.Graphics.Vulkan/BackgroundResources.cs | 6 +++--- Ryujinx.Graphics.Vulkan/IdList.cs | 2 +- .../MoltenVK/MVKInitialization.cs | 2 +- Ryujinx.Graphics.Vulkan/Queries/CounterQueue.cs | 2 +- .../HOS/Applets/Controller/ControllerApplet.cs | 9 ++++----- .../SoftwareKeyboard/SoftwareKeyboardApplet.cs | 4 ++-- .../SoftwareKeyboardRendererBase.cs | 8 ++++---- .../Diagnostics/Demangler/Ast/IntegerLiteral.cs | 2 +- Ryujinx.HLE/HOS/Diagnostics/Demangler/Demangler.cs | 4 ++-- Ryujinx.HLE/HOS/Kernel/Memory/KPageTableBase.cs | 2 +- .../HOS/Kernel/Process/HleProcessDebugger.cs | 2 +- Ryujinx.HLE/HOS/ModLoader.cs | 8 ++++---- .../Services/Account/Acc/AccountSaveDataManager.cs | 2 +- .../LibraryAppletCreator/ILibraryAppletAccessor.cs | 4 ++-- Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs | 2 +- Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs | 3 +-- Ryujinx.HLE/HOS/Services/IpcService.cs | 2 +- .../HOS/Services/Mii/Types/RandomMiiConstants.cs | 4 ++-- Ryujinx.HLE/HOS/Services/Nv/NvMemoryAllocator.cs | 6 +++--- .../Vi/RootService/IApplicationDisplayService.cs | 6 +++--- Ryujinx.Horizon/Sdk/OsTypes/Impl/MultiWaitImpl.cs | 2 +- Ryujinx.Horizon/Sdk/OsTypes/OsSystemEvent.cs | 4 ++-- Ryujinx.Horizon/Sdk/Sf/Hipc/ServerManagerBase.cs | 4 ++-- Ryujinx.Horizon/Sm/SmMain.cs | 4 ++-- Ryujinx.Input/Assigner/GamepadButtonAssigner.cs | 2 +- Ryujinx/Ui/Helper/MetalHelper.cs | 2 +- Ryujinx/Ui/RendererWidgetBase.cs | 8 ++++---- Ryujinx/Ui/VKRenderer.cs | 2 +- Ryujinx/Ui/Widgets/ProfileDialog.cs | 3 +-- Ryujinx/Ui/Windows/ControllerWindow.cs | 14 ++++++-------- Ryujinx/Ui/Windows/SettingsWindow.cs | 7 +++---- 53 files changed, 95 insertions(+), 110 deletions(-) diff --git a/ARMeilleure/Instructions/InstEmitHash32.cs 
b/ARMeilleure/Instructions/InstEmitHash32.cs index fec782dd8..5d39f8afc 100644 --- a/ARMeilleure/Instructions/InstEmitHash32.cs +++ b/ARMeilleure/Instructions/InstEmitHash32.cs @@ -1,9 +1,8 @@ using ARMeilleure.Decoders; using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; - -using static ARMeilleure.Instructions.InstEmitHelper; using static ARMeilleure.Instructions.InstEmitHashHelper; +using static ARMeilleure.Instructions.InstEmitHelper; namespace ARMeilleure.Instructions { diff --git a/ARMeilleure/Instructions/InstEmitHashHelper.cs b/ARMeilleure/Instructions/InstEmitHashHelper.cs index 1dfe771c4..55a03a4f6 100644 --- a/ARMeilleure/Instructions/InstEmitHashHelper.cs +++ b/ARMeilleure/Instructions/InstEmitHashHelper.cs @@ -4,9 +4,8 @@ using ARMeilleure.IntermediateRepresentation; using ARMeilleure.Translation; using System; using System.Diagnostics; - -using static ARMeilleure.IntermediateRepresentation.Operand.Factory; using static ARMeilleure.Instructions.InstEmitSimdHelper; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; namespace ARMeilleure.Instructions { diff --git a/Ryujinx.Audio.Backends.SoundIo/SoundIoHardwareDeviceDriver.cs b/Ryujinx.Audio.Backends.SoundIo/SoundIoHardwareDeviceDriver.cs index 2eab59086..02da27769 100644 --- a/Ryujinx.Audio.Backends.SoundIo/SoundIoHardwareDeviceDriver.cs +++ b/Ryujinx.Audio.Backends.SoundIo/SoundIoHardwareDeviceDriver.cs @@ -5,9 +5,8 @@ using Ryujinx.Memory; using System; using System.Collections.Concurrent; using System.Threading; - -using static Ryujinx.Audio.Integration.IHardwareDeviceDriver; using static Ryujinx.Audio.Backends.SoundIo.Native.SoundIo; +using static Ryujinx.Audio.Integration.IHardwareDeviceDriver; namespace Ryujinx.Audio.Backends.SoundIo { diff --git a/Ryujinx.Audio/Renderer/Dsp/Command/CompressorCommand.cs b/Ryujinx.Audio/Renderer/Dsp/Command/CompressorCommand.cs index 8c3442935..34231e614 100644 --- a/Ryujinx.Audio/Renderer/Dsp/Command/CompressorCommand.cs 
+++ b/Ryujinx.Audio/Renderer/Dsp/Command/CompressorCommand.cs @@ -1,8 +1,8 @@ -using System; -using System.Diagnostics; -using Ryujinx.Audio.Renderer.Dsp.Effect; +using Ryujinx.Audio.Renderer.Dsp.Effect; using Ryujinx.Audio.Renderer.Dsp.State; using Ryujinx.Audio.Renderer.Parameter.Effect; +using System; +using System.Diagnostics; namespace Ryujinx.Audio.Renderer.Dsp.Command { diff --git a/Ryujinx.Audio/Renderer/Server/Effect/CompressorEffect.cs b/Ryujinx.Audio/Renderer/Server/Effect/CompressorEffect.cs index f4e5ae829..32162abcd 100644 --- a/Ryujinx.Audio/Renderer/Server/Effect/CompressorEffect.cs +++ b/Ryujinx.Audio/Renderer/Server/Effect/CompressorEffect.cs @@ -1,7 +1,7 @@ using Ryujinx.Audio.Renderer.Common; using Ryujinx.Audio.Renderer.Dsp.State; -using Ryujinx.Audio.Renderer.Parameter.Effect; using Ryujinx.Audio.Renderer.Parameter; +using Ryujinx.Audio.Renderer.Parameter.Effect; using Ryujinx.Audio.Renderer.Server.MemoryPool; using System; using System.Diagnostics; diff --git a/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs b/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs index 741885305..1b857fae4 100644 --- a/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs +++ b/Ryujinx.Ava/UI/Controls/NavigationDialogHost.axaml.cs @@ -16,9 +16,9 @@ using Ryujinx.Ava.UI.Views.User; using Ryujinx.HLE.FileSystem; using Ryujinx.HLE.HOS.Services.Account.Acc; using System; -using System.Threading.Tasks; using System.Collections.Generic; using System.Linq; +using System.Threading.Tasks; using UserProfile = Ryujinx.Ava.UI.Models.UserProfile; namespace Ryujinx.Ava.UI.Controls diff --git a/Ryujinx.Ava/UI/Helpers/LoggerAdapter.cs b/Ryujinx.Ava/UI/Helpers/LoggerAdapter.cs index bb9681e22..7a29cc198 100644 --- a/Ryujinx.Ava/UI/Helpers/LoggerAdapter.cs +++ b/Ryujinx.Ava/UI/Helpers/LoggerAdapter.cs @@ -6,8 +6,8 @@ namespace Ryujinx.Ava.UI.Helpers { using AvaLogger = Avalonia.Logging.Logger; using AvaLogLevel = Avalonia.Logging.LogEventLevel; - using RyuLogger = 
Ryujinx.Common.Logging.Logger; using RyuLogClass = Ryujinx.Common.Logging.LogClass; + using RyuLogger = Ryujinx.Common.Logging.Logger; internal class LoggerAdapter : Avalonia.Logging.ILogSink { diff --git a/Ryujinx.Ava/UI/ViewModels/AmiiboWindowViewModel.cs b/Ryujinx.Ava/UI/ViewModels/AmiiboWindowViewModel.cs index 5561a20ca..5311318c5 100644 --- a/Ryujinx.Ava/UI/ViewModels/AmiiboWindowViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/AmiiboWindowViewModel.cs @@ -3,7 +3,6 @@ using Avalonia.Collections; using Avalonia.Media.Imaging; using Avalonia.Threading; using Ryujinx.Ava.Common.Locale; -using Ryujinx.Ava.UI.Controls; using Ryujinx.Ava.UI.Helpers; using Ryujinx.Ava.UI.Models; using Ryujinx.Ava.UI.Windows; @@ -17,7 +16,6 @@ using System.IO; using System.Linq; using System.Net.Http; using System.Text; -using System.Text.Json; using System.Threading.Tasks; namespace Ryujinx.Ava.UI.ViewModels @@ -31,7 +29,7 @@ namespace Ryujinx.Ava.UI.ViewModels private readonly byte[] _amiiboLogoBytes; private readonly HttpClient _httpClient; private readonly StyleableWindow _owner; - + private Bitmap _amiiboImage; private List<Amiibo.AmiiboApi> _amiiboList; private AvaloniaList<Amiibo.AmiiboApi> _amiibos; diff --git a/Ryujinx.Ava/UI/ViewModels/ControllerSettingsViewModel.cs b/Ryujinx.Ava/UI/ViewModels/ControllerSettingsViewModel.cs index f63fc3491..35256b3b5 100644 --- a/Ryujinx.Ava/UI/ViewModels/ControllerSettingsViewModel.cs +++ b/Ryujinx.Ava/UI/ViewModels/ControllerSettingsViewModel.cs @@ -3,11 +3,8 @@ using Avalonia.Controls; using Avalonia.Controls.ApplicationLifetimes; using Avalonia.Svg.Skia; using Avalonia.Threading; -using LibHac.Bcat; -using LibHac.Tools.Fs; using Ryujinx.Ava.Common.Locale; using Ryujinx.Ava.Input; -using Ryujinx.Ava.UI.Controls; using Ryujinx.Ava.UI.Helpers; using Ryujinx.Ava.UI.Models; using Ryujinx.Ava.UI.Windows; diff --git a/Ryujinx.Ava/UI/Views/User/UserEditorView.axaml.cs b/Ryujinx.Ava/UI/Views/User/UserEditorView.axaml.cs index fb33dcf8f..81938d23b 
100644 --- a/Ryujinx.Ava/UI/Views/User/UserEditorView.axaml.cs +++ b/Ryujinx.Ava/UI/Views/User/UserEditorView.axaml.cs @@ -5,8 +5,8 @@ using FluentAvalonia.UI.Controls; using FluentAvalonia.UI.Navigation; using Ryujinx.Ava.Common.Locale; using Ryujinx.Ava.UI.Controls; -using Ryujinx.Ava.UI.Models; using Ryujinx.Ava.UI.Helpers; +using Ryujinx.Ava.UI.Models; using Ryujinx.HLE.HOS.Services.Account.Acc; using System; using UserProfile = Ryujinx.Ava.UI.Models.UserProfile; diff --git a/Ryujinx.Common/SystemInfo/LinuxSystemInfo.cs b/Ryujinx.Common/SystemInfo/LinuxSystemInfo.cs index cd4a3d821..b0c15e491 100644 --- a/Ryujinx.Common/SystemInfo/LinuxSystemInfo.cs +++ b/Ryujinx.Common/SystemInfo/LinuxSystemInfo.cs @@ -1,9 +1,9 @@ +using Ryujinx.Common.Logging; using System; using System.Collections.Generic; using System.Globalization; using System.IO; using System.Runtime.Versioning; -using Ryujinx.Common.Logging; namespace Ryujinx.Common.SystemInfo { diff --git a/Ryujinx.Common/SystemInfo/MacOSSystemInfo.cs b/Ryujinx.Common/SystemInfo/MacOSSystemInfo.cs index ad022bdf0..06324a54c 100644 --- a/Ryujinx.Common/SystemInfo/MacOSSystemInfo.cs +++ b/Ryujinx.Common/SystemInfo/MacOSSystemInfo.cs @@ -1,9 +1,9 @@ +using Ryujinx.Common.Logging; using System; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Versioning; using System.Text; -using Ryujinx.Common.Logging; namespace Ryujinx.Common.SystemInfo { diff --git a/Ryujinx.Common/SystemInfo/SystemInfo.cs b/Ryujinx.Common/SystemInfo/SystemInfo.cs index 1db72d9b9..e9ce3c58a 100644 --- a/Ryujinx.Common/SystemInfo/SystemInfo.cs +++ b/Ryujinx.Common/SystemInfo/SystemInfo.cs @@ -1,8 +1,8 @@ -using System; +using Ryujinx.Common.Logging; +using System; using System.Runtime.InteropServices; using System.Runtime.Intrinsics.X86; using System.Text; -using Ryujinx.Common.Logging; namespace Ryujinx.Common.SystemInfo { diff --git a/Ryujinx.Common/SystemInfo/WindowsSystemInfo.cs 
b/Ryujinx.Common/SystemInfo/WindowsSystemInfo.cs index 11f0785e6..c3598a1eb 100644 --- a/Ryujinx.Common/SystemInfo/WindowsSystemInfo.cs +++ b/Ryujinx.Common/SystemInfo/WindowsSystemInfo.cs @@ -1,8 +1,8 @@ +using Ryujinx.Common.Logging; using System; using System.Management; using System.Runtime.InteropServices; using System.Runtime.Versioning; -using Ryujinx.Common.Logging; namespace Ryujinx.Common.SystemInfo { diff --git a/Ryujinx.Common/SystemInterop/StdErrAdapter.cs b/Ryujinx.Common/SystemInterop/StdErrAdapter.cs index 12e240ad3..efb142184 100644 --- a/Ryujinx.Common/SystemInterop/StdErrAdapter.cs +++ b/Ryujinx.Common/SystemInterop/StdErrAdapter.cs @@ -1,9 +1,9 @@ +using Ryujinx.Common.Logging; using System; using System.IO; +using System.Runtime.InteropServices; using System.Runtime.Versioning; using System.Threading; -using Ryujinx.Common.Logging; -using System.Runtime.InteropServices; namespace Ryujinx.Common.SystemInterop { diff --git a/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs b/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs index a4bf81363..856507cd7 100644 --- a/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs +++ b/Ryujinx.Graphics.Gpu/Shader/ShaderSpecializationState.cs @@ -1,7 +1,7 @@ using Ryujinx.Common.Memory; +using Ryujinx.Graphics.GAL; using Ryujinx.Graphics.Gpu.Image; using Ryujinx.Graphics.Gpu.Memory; -using Ryujinx.Graphics.GAL; using Ryujinx.Graphics.Gpu.Shader.DiskCache; using Ryujinx.Graphics.Shader; using System; diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs index 873f667ac..3fc3c72a7 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs @@ -1,7 +1,7 @@ -using System; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using System; using System.Diagnostics; using System.Runtime.CompilerServices; -using Ryujinx.Graphics.Nvdec.Vp9.Common; using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon; namespace 
Ryujinx.Graphics.Nvdec.Vp9.Dsp diff --git a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs index 5d177b409..050951216 100644 --- a/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs +++ b/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs @@ -1,6 +1,6 @@ -using System; +using Ryujinx.Common.Memory; +using System; using System.Buffers.Binary; -using Ryujinx.Common.Memory; namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { diff --git a/Ryujinx.Graphics.OpenGL/PersistentBuffers.cs b/Ryujinx.Graphics.OpenGL/PersistentBuffers.cs index 872efcc37..654e25b9d 100644 --- a/Ryujinx.Graphics.OpenGL/PersistentBuffers.cs +++ b/Ryujinx.Graphics.OpenGL/PersistentBuffers.cs @@ -1,10 +1,10 @@ -using System; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using OpenTK.Graphics.OpenGL; +using OpenTK.Graphics.OpenGL; using Ryujinx.Common.Logging; using Ryujinx.Graphics.GAL; using Ryujinx.Graphics.OpenGL.Image; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace Ryujinx.Graphics.OpenGL { diff --git a/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvDelegates.cs b/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvDelegates.cs index 04c3be1b8..3ccfd7f55 100644 --- a/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvDelegates.cs +++ b/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvDelegates.cs @@ -1,7 +1,7 @@ -using FuncUnaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; -using FuncBinaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; -using FuncTernaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; +using FuncBinaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; using 
FuncQuaternaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; +using FuncTernaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; +using FuncUnaryInstruction = System.Func<Spv.Generator.Instruction, Spv.Generator.Instruction, Spv.Generator.Instruction>; namespace Ryujinx.Graphics.Shader.CodeGen.Spirv { diff --git a/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs b/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs index 9f08b319d..ca8235383 100644 --- a/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs +++ b/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs @@ -9,9 +9,8 @@ using static Spv.Specification; namespace Ryujinx.Graphics.Shader.CodeGen.Spirv { using SpvInstruction = Spv.Generator.Instruction; - using SpvLiteralInteger = Spv.Generator.LiteralInteger; - using SpvInstructionPool = Spv.Generator.GeneratorPool<Spv.Generator.Instruction>; + using SpvLiteralInteger = Spv.Generator.LiteralInteger; using SpvLiteralIntegerPool = Spv.Generator.GeneratorPool<Spv.Generator.LiteralInteger>; static class SpirvGenerator diff --git a/Ryujinx.Graphics.Texture/BC7Decoder.cs b/Ryujinx.Graphics.Texture/BC7Decoder.cs index 060d1ab85..b865a5593 100644 --- a/Ryujinx.Graphics.Texture/BC7Decoder.cs +++ b/Ryujinx.Graphics.Texture/BC7Decoder.cs @@ -1,6 +1,6 @@ using Ryujinx.Graphics.Texture.Utils; -using System.Diagnostics; using System; +using System.Diagnostics; using System.Numerics; using System.Runtime.InteropServices; diff --git a/Ryujinx.Graphics.Vulkan/BackgroundResources.cs b/Ryujinx.Graphics.Vulkan/BackgroundResources.cs index 30972f923..b93b7a250 100644 --- a/Ryujinx.Graphics.Vulkan/BackgroundResources.cs +++ b/Ryujinx.Graphics.Vulkan/BackgroundResources.cs @@ -1,7 +1,7 @@ -using System.Threading; 
-using System.Collections.Generic; +using Silk.NET.Vulkan; using System; -using Silk.NET.Vulkan; +using System.Collections.Generic; +using System.Threading; namespace Ryujinx.Graphics.Vulkan { diff --git a/Ryujinx.Graphics.Vulkan/IdList.cs b/Ryujinx.Graphics.Vulkan/IdList.cs index 5c0623c3f..9fba9fe99 100644 --- a/Ryujinx.Graphics.Vulkan/IdList.cs +++ b/Ryujinx.Graphics.Vulkan/IdList.cs @@ -1,5 +1,5 @@ -using System.Collections.Generic; using System; +using System.Collections.Generic; namespace Ryujinx.Graphics.Vulkan { diff --git a/Ryujinx.Graphics.Vulkan/MoltenVK/MVKInitialization.cs b/Ryujinx.Graphics.Vulkan/MoltenVK/MVKInitialization.cs index ca2fbfb94..5910d1aac 100644 --- a/Ryujinx.Graphics.Vulkan/MoltenVK/MVKInitialization.cs +++ b/Ryujinx.Graphics.Vulkan/MoltenVK/MVKInitialization.cs @@ -1,7 +1,7 @@ using Silk.NET.Vulkan; using System; -using System.Runtime.Versioning; using System.Runtime.InteropServices; +using System.Runtime.Versioning; namespace Ryujinx.Graphics.Vulkan.MoltenVK { diff --git a/Ryujinx.Graphics.Vulkan/Queries/CounterQueue.cs b/Ryujinx.Graphics.Vulkan/Queries/CounterQueue.cs index c47f95eab..7293b74f9 100644 --- a/Ryujinx.Graphics.Vulkan/Queries/CounterQueue.cs +++ b/Ryujinx.Graphics.Vulkan/Queries/CounterQueue.cs @@ -2,8 +2,8 @@ using Silk.NET.Vulkan; using System; using System.Collections.Generic; -using System.Threading; using System.Linq; +using System.Threading; namespace Ryujinx.Graphics.Vulkan.Queries { diff --git a/Ryujinx.HLE/HOS/Applets/Controller/ControllerApplet.cs b/Ryujinx.HLE/HOS/Applets/Controller/ControllerApplet.cs index e5af5fd98..5cdfb3143 100644 --- a/Ryujinx.HLE/HOS/Applets/Controller/ControllerApplet.cs +++ b/Ryujinx.HLE/HOS/Applets/Controller/ControllerApplet.cs @@ -1,12 +1,11 @@ +using Ryujinx.Common.Logging; +using Ryujinx.HLE.HOS.Services.Am.AppletAE; +using Ryujinx.HLE.HOS.Services.Hid; +using Ryujinx.HLE.HOS.Services.Hid.Types; using System; using System.IO; using System.Runtime.CompilerServices; using 
System.Runtime.InteropServices; -using Ryujinx.Common.Logging; -using Ryujinx.HLE.HOS.Services.Hid; -using Ryujinx.HLE.HOS.Services.Hid.Types; -using Ryujinx.HLE.HOS.Services.Am.AppletAE; - using static Ryujinx.HLE.HOS.Services.Hid.HidServer.HidUtils; namespace Ryujinx.HLE.HOS.Applets diff --git a/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardApplet.cs b/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardApplet.cs index 4077ad420..278ea56c2 100644 --- a/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardApplet.cs +++ b/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardApplet.cs @@ -297,7 +297,7 @@ namespace Ryujinx.HLE.HOS.Applets _foregroundState = SoftwareKeyboardState.Complete; } - else if(_foregroundState == SoftwareKeyboardState.Complete) + else if (_foregroundState == SoftwareKeyboardState.Complete) { // If we have already completed, we push the result text // back on the output buffer and poll the application. @@ -780,7 +780,7 @@ namespace Ryujinx.HLE.HOS.Applets { return null; } - + if (input.Length == 0) { return string.Empty; diff --git a/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardRendererBase.cs b/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardRendererBase.cs index 71835e2da..9a91fa321 100644 --- a/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardRendererBase.cs +++ b/Ryujinx.HLE/HOS/Applets/SoftwareKeyboard/SoftwareKeyboardRendererBase.cs @@ -1,16 +1,16 @@ using Ryujinx.HLE.Ui; using Ryujinx.Memory; -using SixLabors.ImageSharp; -using SixLabors.ImageSharp.Processing; -using SixLabors.ImageSharp.Drawing.Processing; using SixLabors.Fonts; +using SixLabors.ImageSharp; +using SixLabors.ImageSharp.Drawing.Processing; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Processing; using System; using System.Diagnostics; using System.IO; using System.Numerics; using System.Reflection; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.PixelFormats; namespace 
Ryujinx.HLE.HOS.Applets.SoftwareKeyboard { diff --git a/Ryujinx.HLE/HOS/Diagnostics/Demangler/Ast/IntegerLiteral.cs b/Ryujinx.HLE/HOS/Diagnostics/Demangler/Ast/IntegerLiteral.cs index ea048d768..33752d00c 100644 --- a/Ryujinx.HLE/HOS/Diagnostics/Demangler/Ast/IntegerLiteral.cs +++ b/Ryujinx.HLE/HOS/Diagnostics/Demangler/Ast/IntegerLiteral.cs @@ -1,5 +1,5 @@ -using System.IO; using System; +using System.IO; namespace Ryujinx.HLE.HOS.Diagnostics.Demangler.Ast { diff --git a/Ryujinx.HLE/HOS/Diagnostics/Demangler/Demangler.cs b/Ryujinx.HLE/HOS/Diagnostics/Demangler/Demangler.cs index a6618eca4..1bfd7ac07 100644 --- a/Ryujinx.HLE/HOS/Diagnostics/Demangler/Demangler.cs +++ b/Ryujinx.HLE/HOS/Diagnostics/Demangler/Demangler.cs @@ -1,8 +1,8 @@ -using System; +using Ryujinx.HLE.HOS.Diagnostics.Demangler.Ast; +using System; using System.Collections.Generic; using System.IO; using System.Linq; -using Ryujinx.HLE.HOS.Diagnostics.Demangler.Ast; namespace Ryujinx.HLE.HOS.Diagnostics.Demangler { diff --git a/Ryujinx.HLE/HOS/Kernel/Memory/KPageTableBase.cs b/Ryujinx.HLE/HOS/Kernel/Memory/KPageTableBase.cs index bd7d5725b..614eb5271 100644 --- a/Ryujinx.HLE/HOS/Kernel/Memory/KPageTableBase.cs +++ b/Ryujinx.HLE/HOS/Kernel/Memory/KPageTableBase.cs @@ -1,9 +1,9 @@ using Ryujinx.Common; using Ryujinx.HLE.HOS.Kernel.Common; using Ryujinx.HLE.HOS.Kernel.Process; +using Ryujinx.Horizon.Common; using Ryujinx.Memory; using Ryujinx.Memory.Range; -using Ryujinx.Horizon.Common; using System; using System.Collections.Generic; using System.Diagnostics; diff --git a/Ryujinx.HLE/HOS/Kernel/Process/HleProcessDebugger.cs b/Ryujinx.HLE/HOS/Kernel/Process/HleProcessDebugger.cs index 0a78a26dd..8fee5c0d1 100644 --- a/Ryujinx.HLE/HOS/Kernel/Process/HleProcessDebugger.cs +++ b/Ryujinx.HLE/HOS/Kernel/Process/HleProcessDebugger.cs @@ -55,7 +55,7 @@ namespace Ryujinx.HLE.HOS.Kernel.Process void AppendTrace(ulong address) { - if(AnalyzePointer(out PointerInfo info, address, thread)) + if (AnalyzePointer(out 
PointerInfo info, address, thread)) { trace.AppendLine($" 0x{address:x16}\t{info.ImageDisplay}\t{info.SubDisplay}"); } diff --git a/Ryujinx.HLE/HOS/ModLoader.cs b/Ryujinx.HLE/HOS/ModLoader.cs index bf0f1f891..a6dc90135 100644 --- a/Ryujinx.HLE/HOS/ModLoader.cs +++ b/Ryujinx.HLE/HOS/ModLoader.cs @@ -7,15 +7,15 @@ using LibHac.Tools.FsSystem; using LibHac.Tools.FsSystem.RomFs; using Ryujinx.Common.Configuration; using Ryujinx.Common.Logging; -using Ryujinx.HLE.Loaders.Mods; +using Ryujinx.HLE.HOS.Kernel.Process; using Ryujinx.HLE.Loaders.Executables; +using Ryujinx.HLE.Loaders.Mods; using System; using System.Collections.Generic; using System.Collections.Specialized; -using System.Linq; -using System.IO; -using Ryujinx.HLE.HOS.Kernel.Process; using System.Globalization; +using System.IO; +using System.Linq; using Path = System.IO.Path; namespace Ryujinx.HLE.HOS diff --git a/Ryujinx.HLE/HOS/Services/Account/Acc/AccountSaveDataManager.cs b/Ryujinx.HLE/HOS/Services/Account/Acc/AccountSaveDataManager.cs index 3bd0e2da4..ec0b0a10b 100644 --- a/Ryujinx.HLE/HOS/Services/Account/Acc/AccountSaveDataManager.cs +++ b/Ryujinx.HLE/HOS/Services/Account/Acc/AccountSaveDataManager.cs @@ -1,6 +1,6 @@ using Ryujinx.Common.Configuration; -using Ryujinx.Common.Utilities; using Ryujinx.Common.Logging; +using Ryujinx.Common.Utilities; using System; using System.Collections.Concurrent; using System.Collections.Generic; diff --git a/Ryujinx.HLE/HOS/Services/Am/AppletAE/AllSystemAppletProxiesService/LibraryAppletCreator/ILibraryAppletAccessor.cs b/Ryujinx.HLE/HOS/Services/Am/AppletAE/AllSystemAppletProxiesService/LibraryAppletCreator/ILibraryAppletAccessor.cs index 134566d96..4ed502e0e 100644 --- a/Ryujinx.HLE/HOS/Services/Am/AppletAE/AllSystemAppletProxiesService/LibraryAppletCreator/ILibraryAppletAccessor.cs +++ b/Ryujinx.HLE/HOS/Services/Am/AppletAE/AllSystemAppletProxiesService/LibraryAppletCreator/ILibraryAppletAccessor.cs @@ -133,7 +133,7 @@ namespace 
Ryujinx.HLE.HOS.Services.Am.AppletAE.AllSystemAppletProxiesService.Lib // PopOutData() -> object<nn::am::service::IStorage> public ResultCode PopOutData(ServiceCtx context) { - if(_normalSession.TryPop(out byte[] data)) + if (_normalSession.TryPop(out byte[] data)) { MakeObject(context, new IStorage(data)); @@ -160,7 +160,7 @@ namespace Ryujinx.HLE.HOS.Services.Am.AppletAE.AllSystemAppletProxiesService.Lib // PopInteractiveOutData() -> object<nn::am::service::IStorage> public ResultCode PopInteractiveOutData(ServiceCtx context) { - if(_interactiveSession.TryPop(out byte[] data)) + if (_interactiveSession.TryPop(out byte[] data)) { MakeObject(context, new IStorage(data)); diff --git a/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs b/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs index b16ea4c18..937fe76c5 100644 --- a/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs +++ b/Ryujinx.HLE/HOS/Services/Bcat/IServiceCreator.cs @@ -1,8 +1,8 @@ using LibHac; using LibHac.Common; using Ryujinx.Common; -using Ryujinx.HLE.HOS.Services.Bcat.ServiceCreator; using Ryujinx.HLE.HOS.Services.Arp; +using Ryujinx.HLE.HOS.Services.Bcat.ServiceCreator; namespace Ryujinx.HLE.HOS.Services.Bcat { diff --git a/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs b/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs index e43b1cad0..37143a5aa 100644 --- a/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs +++ b/Ryujinx.HLE/HOS/Services/Fs/IFileSystemProxy.cs @@ -14,12 +14,11 @@ using Ryujinx.Common.Logging; using Ryujinx.HLE.HOS.Services.Fs.FileSystemProxy; using System; using System.IO; - using static Ryujinx.HLE.Utilities.StringUtils; +using GameCardHandle = System.UInt32; using IFileSystem = LibHac.FsSrv.Sf.IFileSystem; using IStorage = LibHac.FsSrv.Sf.IStorage; using RightsId = LibHac.Fs.RightsId; -using GameCardHandle = System.UInt32; namespace Ryujinx.HLE.HOS.Services.Fs { diff --git a/Ryujinx.HLE/HOS/Services/IpcService.cs b/Ryujinx.HLE/HOS/Services/IpcService.cs index 526565a58..4c7d83ea6 
100644 --- a/Ryujinx.HLE/HOS/Services/IpcService.cs +++ b/Ryujinx.HLE/HOS/Services/IpcService.cs @@ -4,8 +4,8 @@ using Ryujinx.HLE.HOS.Ipc; using System; using System.Collections.Generic; using System.IO; -using System.Reflection; using System.Linq; +using System.Reflection; namespace Ryujinx.HLE.HOS.Services { diff --git a/Ryujinx.HLE/HOS/Services/Mii/Types/RandomMiiConstants.cs b/Ryujinx.HLE/HOS/Services/Mii/Types/RandomMiiConstants.cs index 16e9289eb..82529450b 100644 --- a/Ryujinx.HLE/HOS/Services/Mii/Types/RandomMiiConstants.cs +++ b/Ryujinx.HLE/HOS/Services/Mii/Types/RandomMiiConstants.cs @@ -1,6 +1,6 @@ -using System; +using Ryujinx.Common.Utilities; +using System; using System.Runtime.InteropServices; -using Ryujinx.Common.Utilities; namespace Ryujinx.HLE.HOS.Services.Mii.Types { diff --git a/Ryujinx.HLE/HOS/Services/Nv/NvMemoryAllocator.cs b/Ryujinx.HLE/HOS/Services/Nv/NvMemoryAllocator.cs index 7369bee5f..341b5e576 100644 --- a/Ryujinx.HLE/HOS/Services/Nv/NvMemoryAllocator.cs +++ b/Ryujinx.HLE/HOS/Services/Nv/NvMemoryAllocator.cs @@ -1,8 +1,8 @@ using Ryujinx.Common.Collections; -using System.Collections.Generic; -using System; -using Ryujinx.Graphics.Gpu.Memory; using Ryujinx.Common.Logging; +using Ryujinx.Graphics.Gpu.Memory; +using System; +using System.Collections.Generic; namespace Ryujinx.HLE.HOS.Services.Nv { diff --git a/Ryujinx.HLE/HOS/Services/Vi/RootService/IApplicationDisplayService.cs b/Ryujinx.HLE/HOS/Services/Vi/RootService/IApplicationDisplayService.cs index d6feb33f4..085d6c519 100644 --- a/Ryujinx.HLE/HOS/Services/Vi/RootService/IApplicationDisplayService.cs +++ b/Ryujinx.HLE/HOS/Services/Vi/RootService/IApplicationDisplayService.cs @@ -6,15 +6,15 @@ using Ryujinx.HLE.HOS.Ipc; using Ryujinx.HLE.HOS.Kernel.Common; using Ryujinx.HLE.HOS.Services.SurfaceFlinger; using Ryujinx.HLE.HOS.Services.Vi.RootService.ApplicationDisplayService; -using Ryujinx.HLE.Ui; using Ryujinx.HLE.HOS.Services.Vi.RootService.ApplicationDisplayService.Types; using 
Ryujinx.HLE.HOS.Services.Vi.Types; +using Ryujinx.HLE.Ui; +using Ryujinx.Horizon.Common; using System; -using System.Diagnostics; using System.Collections.Generic; +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Text; -using Ryujinx.Horizon.Common; namespace Ryujinx.HLE.HOS.Services.Vi.RootService { diff --git a/Ryujinx.Horizon/Sdk/OsTypes/Impl/MultiWaitImpl.cs b/Ryujinx.Horizon/Sdk/OsTypes/Impl/MultiWaitImpl.cs index fd45792d7..a4a671eaf 100644 --- a/Ryujinx.Horizon/Sdk/OsTypes/Impl/MultiWaitImpl.cs +++ b/Ryujinx.Horizon/Sdk/OsTypes/Impl/MultiWaitImpl.cs @@ -1,7 +1,7 @@ using Ryujinx.Common; using Ryujinx.Horizon.Common; -using System.Collections.Generic; using System; +using System.Collections.Generic; namespace Ryujinx.Horizon.Sdk.OsTypes.Impl { diff --git a/Ryujinx.Horizon/Sdk/OsTypes/OsSystemEvent.cs b/Ryujinx.Horizon/Sdk/OsTypes/OsSystemEvent.cs index 061d7a3cd..40723a5cf 100644 --- a/Ryujinx.Horizon/Sdk/OsTypes/OsSystemEvent.cs +++ b/Ryujinx.Horizon/Sdk/OsTypes/OsSystemEvent.cs @@ -1,5 +1,5 @@ -using Ryujinx.Horizon.Sdk.OsTypes.Impl; -using Ryujinx.Horizon.Common; +using Ryujinx.Horizon.Common; +using Ryujinx.Horizon.Sdk.OsTypes.Impl; using System; namespace Ryujinx.Horizon.Sdk.OsTypes diff --git a/Ryujinx.Horizon/Sdk/Sf/Hipc/ServerManagerBase.cs b/Ryujinx.Horizon/Sdk/Sf/Hipc/ServerManagerBase.cs index 9d21290d8..c36cdda26 100644 --- a/Ryujinx.Horizon/Sdk/Sf/Hipc/ServerManagerBase.cs +++ b/Ryujinx.Horizon/Sdk/Sf/Hipc/ServerManagerBase.cs @@ -1,5 +1,5 @@ -using Ryujinx.Horizon.Sdk.OsTypes; -using Ryujinx.Horizon.Common; +using Ryujinx.Horizon.Common; +using Ryujinx.Horizon.Sdk.OsTypes; using Ryujinx.Horizon.Sdk.Sf.Cmif; using Ryujinx.Horizon.Sdk.Sm; using System; diff --git a/Ryujinx.Horizon/Sm/SmMain.cs b/Ryujinx.Horizon/Sm/SmMain.cs index 5656d464f..f0b4d3300 100644 --- a/Ryujinx.Horizon/Sm/SmMain.cs +++ b/Ryujinx.Horizon/Sm/SmMain.cs @@ -1,5 +1,5 @@ -using Ryujinx.Horizon.Prepo.Types; -using Ryujinx.Horizon.Prepo; +using 
Ryujinx.Horizon.Prepo; +using Ryujinx.Horizon.Prepo.Types; using Ryujinx.Horizon.Sdk.Sf.Hipc; using Ryujinx.Horizon.Sdk.Sm; using Ryujinx.Horizon.Sm.Impl; diff --git a/Ryujinx.Input/Assigner/GamepadButtonAssigner.cs b/Ryujinx.Input/Assigner/GamepadButtonAssigner.cs index e3aaf8b1b..8621b3a52 100644 --- a/Ryujinx.Input/Assigner/GamepadButtonAssigner.cs +++ b/Ryujinx.Input/Assigner/GamepadButtonAssigner.cs @@ -1,5 +1,5 @@ -using System.Collections.Generic; using System; +using System.Collections.Generic; using System.IO; using System.Linq; diff --git a/Ryujinx/Ui/Helper/MetalHelper.cs b/Ryujinx/Ui/Helper/MetalHelper.cs index 1e10eb05a..c2d4893e8 100644 --- a/Ryujinx/Ui/Helper/MetalHelper.cs +++ b/Ryujinx/Ui/Helper/MetalHelper.cs @@ -1,7 +1,7 @@ using Gdk; using System; -using System.Runtime.Versioning; using System.Runtime.InteropServices; +using System.Runtime.Versioning; namespace Ryujinx.Ui.Helper { diff --git a/Ryujinx/Ui/RendererWidgetBase.cs b/Ryujinx/Ui/RendererWidgetBase.cs index 957bbcd55..e5d22d65c 100644 --- a/Ryujinx/Ui/RendererWidgetBase.cs +++ b/Ryujinx/Ui/RendererWidgetBase.cs @@ -4,13 +4,13 @@ using Gtk; using Ryujinx.Common; using Ryujinx.Common.Configuration; using Ryujinx.Common.Logging; -using Ryujinx.Ui.Common.Configuration; -using Ryujinx.Graphics.Gpu; using Ryujinx.Graphics.GAL; using Ryujinx.Graphics.GAL.Multithreading; +using Ryujinx.Graphics.Gpu; using Ryujinx.Input; using Ryujinx.Input.GTK3; using Ryujinx.Input.HLE; +using Ryujinx.Ui.Common.Configuration; using Ryujinx.Ui.Widgets; using SixLabors.ImageSharp; using SixLabors.ImageSharp.Formats.Png; @@ -26,8 +26,8 @@ namespace Ryujinx.Ui { using Image = SixLabors.ImageSharp.Image; using Key = Input.Key; - using Switch = HLE.Switch; using ScalingFilter = Graphics.GAL.ScalingFilter; + using Switch = HLE.Switch; public abstract class RendererWidgetBase : DrawingArea { @@ -321,7 +321,7 @@ namespace Ryujinx.Ui Window.Cursor = (cursorMoveDelta >= CursorHideIdleTime * Stopwatch.Frequency) ? 
_invisibleCursor : null; } - if(ConfigurationState.Instance.Hid.EnableMouse && _isMouseInClient) + if (ConfigurationState.Instance.Hid.EnableMouse && _isMouseInClient) { Window.Cursor = _invisibleCursor; } diff --git a/Ryujinx/Ui/VKRenderer.cs b/Ryujinx/Ui/VKRenderer.cs index e49b30c3b..d2106c58f 100644 --- a/Ryujinx/Ui/VKRenderer.cs +++ b/Ryujinx/Ui/VKRenderer.cs @@ -3,9 +3,9 @@ using Ryujinx.Common.Configuration; using Ryujinx.Input.HLE; using Ryujinx.Ui.Helper; using SPB.Graphics.Vulkan; +using SPB.Platform.Metal; using SPB.Platform.Win32; using SPB.Platform.X11; -using SPB.Platform.Metal; using SPB.Windowing; using System; using System.Runtime.InteropServices; diff --git a/Ryujinx/Ui/Widgets/ProfileDialog.cs b/Ryujinx/Ui/Widgets/ProfileDialog.cs index 96b44d240..242e8bd7d 100644 --- a/Ryujinx/Ui/Widgets/ProfileDialog.cs +++ b/Ryujinx/Ui/Widgets/ProfileDialog.cs @@ -1,8 +1,7 @@ using Gtk; +using Ryujinx.Ui.Common.Configuration; using System; using System.Reflection; -using Ryujinx.Ui.Common.Configuration; - using GUI = Gtk.Builder.ObjectAttribute; namespace Ryujinx.Ui.Widgets diff --git a/Ryujinx/Ui/Windows/ControllerWindow.cs b/Ryujinx/Ui/Windows/ControllerWindow.cs index 8c3a43c85..0f0fba0b8 100644 --- a/Ryujinx/Ui/Windows/ControllerWindow.cs +++ b/Ryujinx/Ui/Windows/ControllerWindow.cs @@ -2,11 +2,14 @@ using Gtk; using Ryujinx.Common.Configuration; using Ryujinx.Common.Configuration.Hid; using Ryujinx.Common.Configuration.Hid.Controller; +using Ryujinx.Common.Configuration.Hid.Controller.Motion; using Ryujinx.Common.Configuration.Hid.Keyboard; +using Ryujinx.Common.Logging; using Ryujinx.Common.Utilities; -using Ryujinx.Ui.Common.Configuration; using Ryujinx.Input; +using Ryujinx.Input.Assigner; using Ryujinx.Input.GTK3; +using Ryujinx.Ui.Common.Configuration; using Ryujinx.Ui.Widgets; using System; using System.Collections.Generic; @@ -14,15 +17,10 @@ using System.IO; using System.Reflection; using System.Text.Json; using System.Threading; - -using GUI = 
Gtk.Builder.ObjectAttribute; -using Key = Ryujinx.Common.Configuration.Hid.Key; - using ConfigGamepadInputId = Ryujinx.Common.Configuration.Hid.Controller.GamepadInputId; using ConfigStickInputId = Ryujinx.Common.Configuration.Hid.Controller.StickInputId; -using Ryujinx.Common.Configuration.Hid.Controller.Motion; -using Ryujinx.Common.Logging; -using Ryujinx.Input.Assigner; +using GUI = Gtk.Builder.ObjectAttribute; +using Key = Ryujinx.Common.Configuration.Hid.Key; namespace Ryujinx.Ui.Windows { diff --git a/Ryujinx/Ui/Windows/SettingsWindow.cs b/Ryujinx/Ui/Windows/SettingsWindow.cs index 61af7d397..f049da505 100644 --- a/Ryujinx/Ui/Windows/SettingsWindow.cs +++ b/Ryujinx/Ui/Windows/SettingsWindow.cs @@ -7,9 +7,10 @@ using Ryujinx.Common.Configuration; using Ryujinx.Common.Configuration.Hid; using Ryujinx.Common.GraphicsDriver; using Ryujinx.Graphics.Vulkan; -using Ryujinx.Ui.Common.Configuration; using Ryujinx.HLE.FileSystem; using Ryujinx.HLE.HOS.Services.Time.TimeZone; +using Ryujinx.Ui.Common.Configuration; +using Ryujinx.Ui.Common.Configuration.System; using Ryujinx.Ui.Helper; using Ryujinx.Ui.Widgets; using System; @@ -18,9 +19,7 @@ using System.Globalization; using System.IO; using System.Reflection; using System.Threading.Tasks; - using GUI = Gtk.Builder.ObjectAttribute; -using Ryujinx.Ui.Common.Configuration.System; namespace Ryujinx.Ui.Windows { @@ -702,7 +701,7 @@ namespace Ryujinx.Ui.Windows { break; } - } while(_gameDirsBoxStore.IterNext(ref treeIter)); + } while (_gameDirsBoxStore.IterNext(ref treeIter)); } if (!_directoryChanged) From b8556530f2b160db70ff571adf25ae26d4b8f58f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Mar 2023 15:37:08 +0100 Subject: [PATCH 41/41] nuget: bump Microsoft.CodeAnalysis.CSharp from 4.4.0 to 4.5.0 (#4488) Bumps [Microsoft.CodeAnalysis.CSharp](https://github.com/dotnet/roslyn) from 4.4.0 to 4.5.0. 
- [Release notes](https://github.com/dotnet/roslyn/releases) - [Changelog](https://github.com/dotnet/roslyn/blob/main/docs/Breaking%20API%20Changes.md) - [Commits](https://github.com/dotnet/roslyn/commits) --- updated-dependencies: - dependency-name: Microsoft.CodeAnalysis.CSharp dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Directory.Packages.props | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Directory.Packages.props b/Directory.Packages.props index eef982774..5de9461a1 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -21,7 +21,7 @@ <PackageVersion Include="jp2masa.Avalonia.Flexbox" Version="0.2.0" /> <PackageVersion Include="LibHac" Version="0.18.0" /> <PackageVersion Include="Microsoft.CodeAnalysis.Analyzers" Version="3.3.4" /> - <PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.4.0" /> + <PackageVersion Include="Microsoft.CodeAnalysis.CSharp" Version="4.5.0" /> <PackageVersion Include="Microsoft.NET.Test.Sdk" Version="17.5.0" /> <PackageVersion Include="MsgPack.Cli" Version="1.0.1" /> <PackageVersion Include="NUnit" Version="3.13.3" />