diff --git a/Ryujinx.Graphics.Gpu/GraphicsConfig.cs b/Ryujinx.Graphics.Gpu/GraphicsConfig.cs index 811f41323..d2f98c7f2 100644 --- a/Ryujinx.Graphics.Gpu/GraphicsConfig.cs +++ b/Ryujinx.Graphics.Gpu/GraphicsConfig.cs @@ -61,5 +61,10 @@ namespace Ryujinx.Graphics.Gpu /// Enables or disables shader SPIR-V compilation. /// public static bool EnableSpirvCompilationOnVulkan = true; + + /// + /// Enables or disables recompression of compressed textures that are not natively supported by the host. + /// + public static bool EnableTextureRecompression = false; } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Gpu/Image/Texture.cs b/Ryujinx.Graphics.Gpu/Image/Texture.cs index cb10f456b..2ef76b686 100644 --- a/Ryujinx.Graphics.Gpu/Image/Texture.cs +++ b/Ryujinx.Graphics.Gpu/Image/Texture.cs @@ -826,13 +826,18 @@ namespace Ryujinx.Graphics.Gpu.Image depth, levels, layers, - out Span decoded)) + out byte[] decoded)) { string texInfo = $"{Info.Target} {Info.FormatInfo.Format} {Info.Width}x{Info.Height}x{Info.DepthOrLayers} levels {Info.Levels}"; Logger.Debug?.Print(LogClass.Gpu, $"Invalid ASTC texture at 0x{Info.GpuAddress:X} ({texInfo})."); } + if (GraphicsConfig.EnableTextureRecompression) + { + decoded = BCnEncoder.EncodeBC7(decoded, width, height, depth, levels, layers); + } + data = decoded; } else if (!_context.Capabilities.SupportsR4G4Format && Format == Format.R4G4Unorm) diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs index 61b48dc4d..62cd456db 100644 --- a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs +++ b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs @@ -71,11 +71,15 @@ namespace Ryujinx.Graphics.Gpu.Image { if (info.FormatInfo.Format.IsAstcUnorm()) { - return new FormatInfo(Format.R8G8B8A8Unorm, 1, 1, 4, 4); + return GraphicsConfig.EnableTextureRecompression + ? new FormatInfo(Format.Bc7Unorm, 4, 4, 16, 4) + : new FormatInfo(Format.R8G8B8A8Unorm, 1, 1, 4, 4); } else if (info.FormatInfo.Format.IsAstcSrgb()) { - return new FormatInfo(Format.R8G8B8A8Srgb, 1, 1, 4, 4); + return GraphicsConfig.EnableTextureRecompression + ? new FormatInfo(Format.Bc7Srgb, 4, 4, 16, 4) + : new FormatInfo(Format.R8G8B8A8Srgb, 1, 1, 4, 4); } } diff --git a/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs b/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs index 238f46a07..08738583e 100644 --- a/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs +++ b/Ryujinx.Graphics.Texture/Astc/AstcDecoder.cs @@ -291,7 +291,7 @@ namespace Ryujinx.Graphics.Texture.Astc int depth, int levels, int layers, - out Span decoded) + out byte[] decoded) { byte[] output = new byte[QueryDecompressedSize(width, height, depth, levels, layers)]; diff --git a/Ryujinx.Graphics.Texture/BCnEncoder.cs b/Ryujinx.Graphics.Texture/BCnEncoder.cs new file mode 100644 index 000000000..02b79c1b8 --- /dev/null +++ b/Ryujinx.Graphics.Texture/BCnEncoder.cs @@ -0,0 +1,60 @@ +using Ryujinx.Common; +using Ryujinx.Graphics.Texture.Encoders; +using System; + +namespace Ryujinx.Graphics.Texture +{ + public static class BCnEncoder + { + private const int BlockWidth = 4; + private const int BlockHeight = 4; + + public static byte[] EncodeBC7(byte[] data, int width, int height, int depth, int levels, int layers) + { + int size = 0; + + for (int l = 0; l < levels; l++) + { + int w = BitUtils.DivRoundUp(Math.Max(1, width >> l), BlockWidth); + int h = BitUtils.DivRoundUp(Math.Max(1, height >> l), BlockHeight); + + size += w * h * 16 * Math.Max(1, depth >> l) * layers; + } + + byte[] output = new byte[size]; + + int imageBaseIOffs = 0; + int imageBaseOOffs = 0; + + for (int l = 0; l < levels; l++) + { + int rgba8Size = width * height * depth * layers * 4; + + int w = BitUtils.DivRoundUp(width, BlockWidth); + int h = BitUtils.DivRoundUp(height, BlockHeight); + + for (int l2 = 0; l2 < layers; l2++) + { + for (int z = 0; z < depth; z++) + { + BC7Encoder.Encode( + output.AsMemory().Slice(imageBaseOOffs), + data.AsMemory().Slice(imageBaseIOffs), + width, + height, + EncodeMode.Fast | EncodeMode.Multithreaded); + + imageBaseIOffs += width * height * 4; + imageBaseOOffs += w * h * 16; + } + } + + width = Math.Max(1, width >> 1); + height = Math.Max(1, height >> 1); + depth = Math.Max(1, depth >> 1); + } + + return output; + } + } +} \ No newline at end of file diff --git a/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs b/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs new file mode 100644 index 000000000..896f70468 --- /dev/null +++ b/Ryujinx.Graphics.Texture/Encoders/BC7Encoder.cs @@ -0,0 +1,1073 @@ +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +using System.Threading.Tasks; + +namespace Ryujinx.Graphics.Texture.Encoders +{ + static class BC7Encoder + { + private struct ModeInfo + { + public readonly int SubsetCount; + public readonly int PartitionBitCount; + public readonly int PBits; + public readonly int RotationBitCount; + public readonly int IndexModeBitCount; + public readonly int ColorIndexBitCount; + public readonly int AlphaIndexBitCount; + public readonly int ColorDepth; + public readonly int AlphaDepth; + + public ModeInfo( + int subsetCount, + int partitionBitsCount, + int pBits, + int rotationBitCount, + int indexModeBitCount, + int colorIndexBitCount, + int alphaIndexBitCount, + int colorDepth, + int alphaDepth) + { + SubsetCount = subsetCount; + PartitionBitCount = partitionBitsCount; + PBits = pBits; + RotationBitCount = rotationBitCount; + IndexModeBitCount = indexModeBitCount; + ColorIndexBitCount = colorIndexBitCount; + AlphaIndexBitCount = alphaIndexBitCount; + ColorDepth = colorDepth; + AlphaDepth = alphaDepth; + } + } + + private static readonly ModeInfo[] _modeInfos = new ModeInfo[] + { + new ModeInfo(3, 4, 6, 0, 0, 3, 0, 4, 0), + new ModeInfo(2, 6, 2, 0, 0, 3, 0, 6, 0), + new ModeInfo(3, 6, 0, 0, 0, 2, 0, 5, 0), + new ModeInfo(2, 6, 4, 0, 0, 2, 0, 7, 0), + new ModeInfo(1, 0, 0, 2, 1, 2, 3, 5, 6), + new ModeInfo(1, 0, 0, 2, 0, 2, 2, 7, 8), + new ModeInfo(1, 0, 2, 0, 0, 4, 0, 7, 7), + new ModeInfo(2, 6, 4, 0, 0, 2, 0, 5, 5) + }; + + public static void Encode(Memory outputStorage, ReadOnlyMemory data, int width, int height, EncodeMode mode) + { + int widthInBlocks = (width + 3) / 4; + int heightInBlocks = (height + 3) / 4; + + bool fastMode = (mode & EncodeMode.ModeMask) == EncodeMode.Fast; + + if (mode.HasFlag(EncodeMode.Multithreaded)) + { + Parallel.For(0, heightInBlocks, (yInBlocks) => + { + Span output = MemoryMarshal.Cast(outputStorage.Span); + int y = yInBlocks * 4; + + for (int xInBlocks = 0; xInBlocks < widthInBlocks; xInBlocks++) + { + int x = xInBlocks * 4; + Block block = CompressBlock(data.Span, x, y, width, height, fastMode); + + int offset = (yInBlocks * widthInBlocks + xInBlocks) * 2; + output[offset] = block.Low; + output[offset + 1] = block.High; + } + }); + } + else + { + Span output = MemoryMarshal.Cast(outputStorage.Span); + int offset = 0; + + for (int y = 0; y < height; y += 4) + { + for (int x = 0; x < width; x += 4) + { + Block block = CompressBlock(data.Span, x, y, width, height, fastMode); + + output[offset++] = block.Low; + output[offset++] = block.High; + } + } + } + } + + private static int[] _mostFrequentPartitions = new int[] + { + 0, 13, 2, 1, 15, 14, 10, 23 + }; + + private struct Block + { + public ulong Low; + public ulong High; + + public void Encode(ulong value, ref int offset, int bits) + { + if (offset >= 64) + { + High |= value << (offset - 64); + } + else + { + Low |= value << offset; + + if (offset + bits > 64) + { + int remainder = 64 - offset; + High |= value >> remainder; + } + } + + offset += bits; + } + } + + private static Block CompressBlock(ReadOnlySpan data, int x, int y, int width, int height, bool fastMode) + { + int w = Math.Min(4, width - x); + int h = Math.Min(4, height - y); + + var dataUint = MemoryMarshal.Cast(data); + + int baseOffset = y * width + x; + + Span tile = stackalloc uint[w * h]; + + for (int ty = 0; ty < h; ty++) + { + int rowOffset = baseOffset + ty * width; + + for (int tx = 0; tx < w; tx++) + { + tile[ty * w + tx] = dataUint[rowOffset + tx]; + } + } + + return fastMode ? EncodeFast(tile, w, h) : EncodeExhaustive(tile, w, h); + } + + private static Block EncodeFast(ReadOnlySpan tile, int w, int h) + { + (RgbaColor8 minColor, RgbaColor8 maxColor) = BC7Utils.GetMinMaxColors(tile, w, h); + + bool alphaNotOne = minColor.A != 255 || maxColor.A != 255; + int variance = BC7Utils.SquaredDifference(minColor.GetColor32(), maxColor.GetColor32()); + int selectedMode; + int indexMode = 0; + + if (alphaNotOne) + { + bool constantAlpha = minColor.A == maxColor.A; + if (constantAlpha) + { + selectedMode = variance > 160 ? 7 : 6; + } + else + { + if (variance > 160) + { + Span uniqueRGB = stackalloc uint[16]; + Span uniqueAlpha = stackalloc uint[16]; + + int uniqueRGBCount = 0; + int uniqueAlphaCount = 0; + + uint rgbMask = new RgbaColor8(255, 255, 255, 0).ToUInt32(); + uint alphaMask = new RgbaColor8(0, 0, 0, 255).ToUInt32(); + + for (int i = 0; i < tile.Length; i++) + { + uint c = tile[i]; + + if (!uniqueRGB.Slice(0, uniqueRGBCount).Contains(c & rgbMask)) + { + uniqueRGB[uniqueRGBCount++] = c & rgbMask; + } + + if (!uniqueAlpha.Slice(0, uniqueAlphaCount).Contains(c & alphaMask)) + { + uniqueAlpha[uniqueAlphaCount++] = c & alphaMask; + } + } + + selectedMode = 4; + indexMode = uniqueRGBCount > uniqueAlphaCount ? 1 : 0; + } + else + { + selectedMode = 5; + } + } + } + else + { + if (variance > 160) + { + selectedMode = 1; + } + else + { + selectedMode = 6; + } + } + + int selectedPartition = 0; + + if (selectedMode == 1 || selectedMode == 7) + { + int partitionSelectionLowestError = int.MaxValue; + + for (int i = 0; i < _mostFrequentPartitions.Length; i++) + { + int p = _mostFrequentPartitions[i]; + int error = GetEndPointSelectionErrorFast(tile, 2, p, w, h, partitionSelectionLowestError); + if (error < partitionSelectionLowestError) + { + partitionSelectionLowestError = error; + selectedPartition = p; + } + } + } + + return Encode(selectedMode, selectedPartition, 0, indexMode, fastMode: true, tile, w, h, out _); + } + + private static Block EncodeExhaustive(ReadOnlySpan tile, int w, int h) + { + Block bestBlock = default; + int lowestError = int.MaxValue; + int lowestErrorSubsets = int.MaxValue; + + for (int m = 0; m < 8; m++) + { + for (int r = 0; r < (m == 4 || m == 5 ? 4 : 1); r++) + { + for (int im = 0; im < (m == 4 ? 2 : 1); im++) + { + for (int p = 0; p < 1 << _modeInfos[m].PartitionBitCount; p++) + { + Block block = Encode(m, p, r, im, fastMode: false, tile, w, h, out int maxError); + if (maxError < lowestError || (maxError == lowestError && _modeInfos[m].SubsetCount < lowestErrorSubsets)) + { + lowestError = maxError; + lowestErrorSubsets = _modeInfos[m].SubsetCount; + bestBlock = block; + } + } + } + } + } + + return bestBlock; + } + + private static Block Encode( + int mode, + int partition, + int rotation, + int indexMode, + bool fastMode, + ReadOnlySpan tile, + int w, + int h, + out int errorSum) + { + ModeInfo modeInfo = _modeInfos[mode]; + int subsetCount = modeInfo.SubsetCount; + int partitionBitCount = modeInfo.PartitionBitCount; + int rotationBitCount = modeInfo.RotationBitCount; + int indexModeBitCount = modeInfo.IndexModeBitCount; + int colorDepth = modeInfo.ColorDepth; + int alphaDepth = modeInfo.AlphaDepth; + int pBits = modeInfo.PBits; + int colorIndexBitCount = modeInfo.ColorIndexBitCount; + int alphaIndexBitCount = modeInfo.AlphaIndexBitCount; + bool separateAlphaIndices = alphaIndexBitCount != 0; + + uint alphaMask; + + if (separateAlphaIndices) + { + alphaMask = rotation switch + { + 1 => new RgbaColor8(255, 0, 0, 0).ToUInt32(), + 2 => new RgbaColor8(0, 255, 0, 0).ToUInt32(), + 3 => new RgbaColor8(0, 0, 255, 0).ToUInt32(), + _ => new RgbaColor8(0, 0, 0, 255).ToUInt32() + }; + } + else + { + alphaMask = new RgbaColor8(0, 0, 0, 0).ToUInt32(); + } + + if (indexMode != 0) + { + alphaMask = ~alphaMask; + } + + // + // Select color palette. + // + + Span endPoints0 = stackalloc uint[subsetCount]; + Span endPoints1 = stackalloc uint[subsetCount]; + + SelectEndPoints( + tile, + w, + h, + endPoints0, + endPoints1, + subsetCount, + partition, + colorIndexBitCount, + colorDepth, + alphaDepth, + ~alphaMask, + fastMode); + + if (separateAlphaIndices) + { + SelectEndPoints( + tile, + w, + h, + endPoints0, + endPoints1, + subsetCount, + partition, + alphaIndexBitCount, + colorDepth, + alphaDepth, + alphaMask, + fastMode); + } + + Span pBitValues = stackalloc int[pBits]; + + for (int i = 0; i < pBits; i++) + { + int pBit; + + if (pBits == subsetCount) + { + pBit = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth); + } + else + { + int subset = i >> 1; + uint color = (i & 1) == 0 ? endPoints0[subset] : endPoints1[subset]; + pBit = GetPBit(color, colorDepth, alphaDepth); + } + + pBitValues[i] = pBit; + } + + int colorIndexCount = 1 << colorIndexBitCount; + int alphaIndexCount = 1 << alphaIndexBitCount; + + Span colorIndices = stackalloc byte[16]; + Span alphaIndices = stackalloc byte[16]; + + errorSum = BC7Utils.SelectIndices( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + colorIndices, + subsetCount, + partition, + colorIndexBitCount, + colorIndexCount, + colorDepth, + alphaDepth, + pBits, + alphaMask); + + if (separateAlphaIndices) + { + errorSum += BC7Utils.SelectIndices( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + alphaIndices, + subsetCount, + partition, + alphaIndexBitCount, + alphaIndexCount, + colorDepth, + alphaDepth, + pBits, + ~alphaMask); + } + + Span colorSwapSubset = stackalloc bool[3]; + + for (int i = 0; i < 3; i++) + { + colorSwapSubset[i] = colorIndices[BC7Tables.FixUpIndices[subsetCount - 1][partition][i]] >= (colorIndexCount >> 1); + } + + bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1); + + Block block = new Block(); + + int offset = 0; + + block.Encode(1UL << mode, ref offset, mode + 1); + block.Encode((ulong)partition, ref offset, partitionBitCount); + block.Encode((ulong)rotation, ref offset, rotationBitCount); + block.Encode((ulong)indexMode, ref offset, indexModeBitCount); + + for (int comp = 0; comp < 3; comp++) + { + int rotatedComp = comp; + + if (((comp + 1) & 3) == rotation) + { + rotatedComp = 3; + } + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]); + RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]); + + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + if (indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset) + { + block.Encode(BC7Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth); + block.Encode(BC7Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth); + } + else + { + block.Encode(BC7Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth); + block.Encode(BC7Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth); + } + } + } + + if (alphaDepth != 0) + { + int rotatedComp = (rotation - 1) & 3; + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]); + RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]); + + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + if (separateAlphaIndices && indexMode == 0 ? alphaSwapSubset : colorSwapSubset[subset]) + { + block.Encode(BC7Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth); + block.Encode(BC7Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth); + } + else + { + block.Encode(BC7Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth); + block.Encode(BC7Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth); + } + } + } + + for (int i = 0; i < pBits; i++) + { + block.Encode((ulong)pBitValues[i], ref offset, 1); + } + + byte[] fixUpTable = BC7Tables.FixUpIndices[subsetCount - 1][partition]; + + for (int i = 0; i < 16; i++) + { + int subset = BC7Tables.PartitionTable[subsetCount - 1][partition][i]; + byte index = colorIndices[i]; + + if (colorSwapSubset[subset]) + { + index = (byte)(index ^ (colorIndexCount - 1)); + } + + int finalIndexBitCount = i == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount; + + if (index >= (1 << finalIndexBitCount)) + { + throw new Exception("invalid index " + index); + } + + block.Encode(index, ref offset, finalIndexBitCount); + } + + if (separateAlphaIndices) + { + for (int i = 0; i < 16; i++) + { + byte index = alphaIndices[i]; + + if (alphaSwapSubset) + { + index = (byte)(index ^ (alphaIndexCount - 1)); + } + + int finalIndexBitCount = i == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount; + + if (index >= (1 << finalIndexBitCount)) + { + throw new Exception("invalid alpha index " + index); + } + + block.Encode(index, ref offset, finalIndexBitCount); + } + } + + return block; + } + + private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan tile, int subsetCount, int partition, int w, int h, int maxError) + { + byte[] partitionTable = BC7Tables.PartitionTable[subsetCount - 1][partition]; + + Span minColors = stackalloc RgbaColor8[subsetCount]; + Span maxColors = stackalloc RgbaColor8[subsetCount]; + + BC7Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount); + + Span endPoints0 = stackalloc uint[subsetCount]; + Span endPoints1 = stackalloc uint[subsetCount]; + + SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue); + + Span palette = stackalloc RgbaColor32[8]; + + int errorSum = 0; + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); + int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A; + if (sum != 0) + { + blockDir = (blockDir << 6) / new RgbaColor32(sum); + } + + uint c0 = endPoints0[subset]; + uint c1 = endPoints1[subset]; + + int pBit0 = GetPBit(c0, 6, 0); + int pBit1 = GetPBit(c1, 6, 0); + + c0 = BC7Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32(); + c1 = BC7Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32(); + + if (Sse41.IsSupported) + { + Vector128 c0Rep = Vector128.Create(c0).AsByte(); + Vector128 c1Rep = Vector128.Create(c1).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[1], pInvWeights = BC7Tables.InverseWeights[1]) + { + rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte(); + } + + Vector128 iWeights = Sse2.UnpackLow(rWeights, lWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + + static Vector128 ShiftRoundToNearest(Vector128 x) + { + return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6); + } + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128 pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128 pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + + for (int i = 0; i < tile.Length; i++) + { + if (partitionTable[i] != subset) + { + continue; + } + + uint c = tile[i]; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + Vector128 delta2 = Sse2.Subtract(color, pal2); + Vector128 delta3 = Sse2.Subtract(color, pal3); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128 deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128 deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128 deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + + Vector128 delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + + Vector128 min = Sse41.MinHorizontal(delta); + + errorSum += min.GetElement(0); + } + } + else + { + RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32(); + RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32(); + + palette[0] = e032; + palette[palette.Length - 1] = e132; + + for (int i = 1; i < palette.Length - 1; i++) + { + palette[i] = BC7Utils.Interpolate(e032, e132, i, 3); + } + + for (int i = 0; i < tile.Length; i++) + { + if (partitionTable[i] != subset) + { + continue; + } + + uint c = tile[i]; + RgbaColor32 color = Unsafe.As(ref c).GetColor32(); + + int bestMatchScore = int.MaxValue; + + for (int j = 0; j < palette.Length; j++) + { + int score = BC7Utils.SquaredDifference(color, palette[j]); + + if (score < bestMatchScore) + { + bestMatchScore = score; + } + } + + errorSum += bestMatchScore; + } + } + + // No point in continuing if we are already above maximum. + if (errorSum >= maxError) + { + return int.MaxValue; + } + } + + return errorSum; + } + + private static void SelectEndPoints( + ReadOnlySpan tile, + int w, + int h, + Span endPoints0, + Span endPoints1, + int subsetCount, + int partition, + int indexBitCount, + int colorDepth, + int alphaDepth, + uint writeMask, + bool fastMode) + { + byte[] partitionTable = BC7Tables.PartitionTable[subsetCount - 1][partition]; + + Span minColors = stackalloc RgbaColor8[subsetCount]; + Span maxColors = stackalloc RgbaColor8[subsetCount]; + + BC7Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount); + + uint inverseMask = ~writeMask; + + for (int i = 0; i < subsetCount; i++) + { + Unsafe.As(ref minColors[i]) |= inverseMask; + Unsafe.As(ref maxColors[i]) |= inverseMask; + } + + if (fastMode) + { + SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask); + } + else + { + Span colors = stackalloc RgbaColor8[subsetCount * 16]; + Span counts = stackalloc byte[subsetCount]; + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++) + { + int subset = partitionTable[ty * 4 + tx]; + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++] | inverseMask); + + static void AddIfNew(Span values, RgbaColor8 value, int subset, ref byte count) + { + for (int i = 0; i < count; i++) + { + if (values[subset * 16 + i] == value) + { + return; + } + } + + values[subset * 16 + count++] = value; + } + + AddIfNew(colors, color, subset, ref counts[subset]); + } + } + + for (int subset = 0; subset < subsetCount; subset++) + { + int offset = subset * 16; + + RgbaColor8 minColor = minColors[subset]; + RgbaColor8 maxColor = maxColors[subset]; + + ReadOnlySpan subsetColors = colors.Slice(offset, counts[subset]); + + (RgbaColor8 e0, RgbaColor8 e1) = SelectEndPoints(subsetColors, minColor, maxColor, indexBitCount, colorDepth, alphaDepth, inverseMask); + + endPoints0[subset] = (endPoints0[subset] & inverseMask) | (e0.ToUInt32() & writeMask); + endPoints1[subset] = (endPoints1[subset] & inverseMask) | (e1.ToUInt32() & writeMask); + } + } + } + + private static unsafe void SelectEndPointsFast( + ReadOnlySpan partitionTable, + ReadOnlySpan tile, + int w, + int h, + int subsetCount, + ReadOnlySpan minColors, + ReadOnlySpan maxColors, + Span endPoints0, + Span endPoints1, + uint writeMask) + { + uint inverseMask = ~writeMask; + + if (Sse41.IsSupported && w == 4 && h == 4) + { + Vector128 row0, row1, row2, row3; + Vector128 ones = Vector128.AllBitsSet; + + fixed (uint* pTile = tile) + { + row0 = Sse2.LoadVector128(pTile).AsByte(); + row1 = Sse2.LoadVector128(pTile + 4).AsByte(); + row2 = Sse2.LoadVector128(pTile + 8).AsByte(); + row3 = Sse2.LoadVector128(pTile + 12).AsByte(); + } + + Vector128 partitionMask; + + fixed (byte* pPartitionTable = partitionTable) + { + partitionMask = Sse2.LoadVector128(pPartitionTable); + } + + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); + int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A; + if (sum != 0) + { + blockDir = (blockDir << 6) / new RgbaColor32(sum); + } + + Vector128 bd = Vector128.Create(blockDir.GetColor8().ToUInt32()).AsByte(); + + Vector128 delta0 = Ssse3.MultiplyAddAdjacent(row0, bd.AsSByte()); + Vector128 delta1 = Ssse3.MultiplyAddAdjacent(row1, bd.AsSByte()); + Vector128 delta2 = Ssse3.MultiplyAddAdjacent(row2, bd.AsSByte()); + Vector128 delta3 = Ssse3.MultiplyAddAdjacent(row3, bd.AsSByte()); + + Vector128 delta01 = Ssse3.HorizontalAdd(delta0, delta1); + Vector128 delta23 = Ssse3.HorizontalAdd(delta2, delta3); + + Vector128 subsetMask = Sse2.Xor(Sse2.CompareEqual(partitionMask, Vector128.Create((byte)subset)), ones.AsByte()); + + Vector128 subsetMask01 = Sse2.UnpackLow(subsetMask, subsetMask).AsInt16(); + Vector128 subsetMask23 = Sse2.UnpackHigh(subsetMask, subsetMask).AsInt16(); + + Vector128 min01 = Sse41.MinHorizontal(Sse2.Or(delta01, subsetMask01).AsUInt16()); + Vector128 min23 = Sse41.MinHorizontal(Sse2.Or(delta23, subsetMask23).AsUInt16()); + Vector128 max01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask01, delta01), ones).AsUInt16()); + Vector128 max23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask23, delta23), ones).AsUInt16()); + + uint minPos01 = min01.AsUInt32().GetElement(0); + uint minPos23 = min23.AsUInt32().GetElement(0); + uint maxPos01 = max01.AsUInt32().GetElement(0); + uint maxPos23 = max23.AsUInt32().GetElement(0); + + uint minDistColor = (ushort)minPos23 < (ushort)minPos01 + ? tile[(int)(minPos23 >> 16) + 8] + : tile[(int)(minPos01 >> 16)]; + + // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater. + uint maxDistColor = (ushort)maxPos23 < (ushort)maxPos01 + ? tile[(int)(maxPos23 >> 16) + 8] + : tile[(int)(maxPos01 >> 16)]; + + endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor & writeMask); + endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor & writeMask); + } + } + else + { + for (int subset = 0; subset < subsetCount; subset++) + { + RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32(); + blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0); + + int minDist = int.MaxValue; + int maxDist = int.MinValue; + + RgbaColor8 minDistColor = default; + RgbaColor8 maxDistColor = default; + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + if (partitionTable[ty * 4 + tx] != subset) + { + continue; + } + + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]); + int dist = RgbaColor32.Dot(color.GetColor32(), blockDir); + + if (minDist > dist) + { + minDist = dist; + minDistColor = color; + } + + if (maxDist < dist) + { + maxDist = dist; + maxDistColor = color; + } + } + } + + endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor.ToUInt32() & writeMask); + endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor.ToUInt32() & writeMask); + } + } + } + + private static (RgbaColor8, RgbaColor8) SelectEndPoints( + ReadOnlySpan values, + RgbaColor8 minValue, + RgbaColor8 maxValue, + int indexBitCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + int n = values.Length; + int numInterpolatedColors = 1 << indexBitCount; + int numInterpolatedColorsMinus1 = numInterpolatedColors - 1; + + if (n == 0) + { + return (default, default); + } + + minValue = BC7Utils.Quantize(minValue, colorDepth, alphaDepth); + maxValue = BC7Utils.Quantize(maxValue, colorDepth, alphaDepth); + + RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32(); + blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0); + + int minDist = int.MaxValue; + int maxDist = 0; + + for (int i = 0; i < values.Length; i++) + { + RgbaColor8 color = values[i]; + int dist = RgbaColor32.Dot(BC7Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir); + + if (minDist >= dist) + { + minDist = dist; + } + + if (maxDist <= dist) + { + maxDist = dist; + } + } + + Span palette = stackalloc RgbaColor8[numInterpolatedColors]; + + int distRange = Math.Max(1, maxDist - minDist); + + RgbaColor32 nV = new RgbaColor32(n); + + int bestErrorSum = int.MaxValue; + RgbaColor8 bestE0 = default; + RgbaColor8 bestE1 = default; + + Span indices = stackalloc int[n]; + Span colors = stackalloc RgbaColor32[n]; + + for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--) + { + int sumX = 0; + int sumXX = 0; + int sumXXIncrement = 0; + + for (int i = 0; i < values.Length; i++) + { + RgbaColor32 color = values[i].GetColor32(); + + int dist = RgbaColor32.Dot(color, blockDir); + + int normalizedValue = ((dist - minDist) << 6) / distRange; + int texelIndex = (normalizedValue * maxIndex + 32) >> 6; + + indices[i] = texelIndex; + colors[i] = color; + + sumX += texelIndex; + sumXX += texelIndex * texelIndex; + sumXXIncrement += 1 + texelIndex * 2; + } + + for (int start = 0; start < numInterpolatedColors - maxIndex; start++) + { + RgbaColor32 sumY = new RgbaColor32(0); + RgbaColor32 sumXY = new RgbaColor32(0); + + for (int i = 0; i < indices.Length; i++) + { + RgbaColor32 y = colors[i]; + + sumY += y; + sumXY += new RgbaColor32(start + indices[i]) * y; + } + + RgbaColor32 sumXV = new RgbaColor32(sumX); + RgbaColor32 sumXXV = new RgbaColor32(sumXX); + RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0); + RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV; + + RgbaColor8 candidateE0 = (b >> 6).GetColor8(); + RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8(); + + int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth); + int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth); + + int errorSum = BC7Utils.SelectIndices( + MemoryMarshal.Cast(values), + candidateE0.ToUInt32(), + candidateE1.ToUInt32(), + pBit0, + pBit1, + indexBitCount, + numInterpolatedColors, + colorDepth, + alphaDepth, + alphaMask); + + if (errorSum <= bestErrorSum) + { + bestErrorSum = errorSum; + bestE0 = candidateE0; + bestE1 = candidateE1; + } + + sumX += n; + sumXX += sumXXIncrement; + sumXXIncrement += 2 * n; + } + } + + return (bestE0, bestE1); + } + + private static int GetPBit(uint color, int colorDepth, int alphaDepth) + { + uint mask = 0x808080u >> colorDepth; + + if (alphaDepth != 0) + { + mask |= 0x80000000u >> alphaDepth; + } + + color &= 0x7f7f7f7fu; + color += mask >> 1; + + int onesCount = BitOperations.PopCount(color & mask); + return onesCount >= 2 ? 1 : 0; + } + + private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth) + { + // Giving preference to the first endpoint yields better results, + // might be a side effect of the endpoint selection algorithm? + return GetPBit(c0, colorDepth, alphaDepth); + } + } +} diff --git a/Ryujinx.Graphics.Texture/Encoders/BC7Tables.cs b/Ryujinx.Graphics.Texture/Encoders/BC7Tables.cs new file mode 100644 index 000000000..6dddd283c --- /dev/null +++ b/Ryujinx.Graphics.Texture/Encoders/BC7Tables.cs @@ -0,0 +1,285 @@ +namespace Ryujinx.Graphics.Texture.Encoders +{ + static class BC7Tables + { + public static readonly byte[][] Weights = + { + new byte[] { 0, 21, 43, 64 }, + new byte[] { 0, 9, 18, 27, 37, 46, 55, 64 }, + new byte[] { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 } + }; + + public static readonly byte[][] InverseWeights = + { + new byte[] { 64, 43, 21, 0 }, + new byte[] { 64, 55, 46, 37, 27, 18, 9, 0 }, + new byte[] { 64, 60, 55, 51, 47, 43, 38, 34, 30, 26, 21, 17, 13, 9, 4, 0 } + }; + + public static readonly byte[][][] FixUpIndices = new byte[3][][] + { + new byte[64][] + { + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, + new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 }, new byte[] { 0, 0, 0 } + }, + new byte[64][] + { + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 8, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 8, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 6, 0 }, + new byte[] { 0, 6, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 6, 0 }, new byte[] { 0, 8, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, new byte[] { 0, 15, 0 }, + new byte[] { 0, 15, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 2, 0 }, new byte[] { 0, 15, 0 } + }, + new byte[64][] + { + new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 15, 3 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 8 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 6, 15 }, + new byte[] { 0, 6, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, + new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 }, + new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 }, new byte[] { 0, 6, 15 }, new byte[] { 0, 10, 8 }, + new byte[] { 0, 5, 3 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 8, 6 }, new byte[] { 0, 6, 10 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 15, 10 }, new byte[] { 0, 15, 8 }, + new byte[] { 0, 8, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 5, 10 }, + new byte[] { 0, 6, 10 }, new byte[] { 0, 10, 8 }, new byte[] { 0, 8, 9 }, new byte[] { 0, 15, 10 }, + new byte[] { 0, 15, 6 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 8 }, new byte[] { 0, 5, 15 }, + new byte[] { 0, 15, 3 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 6 }, new byte[] { 0, 15, 8 }, + new byte[] { 0, 3, 15 }, new byte[] { 0, 15, 3 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 5, 15 }, + new byte[] { 0, 5, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 }, + new byte[] { 0, 5, 15 }, new byte[] { 0, 10, 15 }, new byte[] { 0, 8, 15 }, new byte[] { 0, 13, 15 }, + new byte[] { 0, 15, 3 }, new byte[] { 0, 12, 15 }, new byte[] { 0, 3, 15 }, new byte[] { 0, 3, 8 } + } + }; + + public static readonly byte[][][] PartitionTable = new byte[3][][] + { + new byte[64][] + { + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 0 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 1 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 2 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 3 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 4 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 5 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 6 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 7 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 8 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 9 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 10 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 11 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 12 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 13 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 14 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 15 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 16 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 18 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 19 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 21 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 22 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 23 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 24 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 25 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 26 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 27 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 28 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 29 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 30 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 31 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 32 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 33 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 34 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 35 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 36 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 37 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 38 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 39 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 40 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 41 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 42 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 43 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 44 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 45 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 46 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 47 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 48 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 49 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 50 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 51 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 52 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 53 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 54 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 55 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 56 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 57 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 58 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 59 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 60 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 61 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, // 62 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } // 63 + }, + new byte[64][] + { + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 0 + new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, // 1 + new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, // 2 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, // 4 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 5 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 6 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, // 7 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, // 8 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 9 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, // 10 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, // 11 + new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 12 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, // 13 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, // 14 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, // 15 + new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, // 16 + new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 17 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, // 18 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 19 + new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, // 20 + new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, // 21 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 22 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, // 23 + new byte[16] { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, // 24 + new byte[16] { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, // 25 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, // 26 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, // 27 + new byte[16] { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, // 28 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 29 + new byte[16] { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, // 30 + new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 31 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, // 32 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, // 33 + new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, // 34 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, // 35 + new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, // 36 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, // 37 + new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, // 38 + new byte[16] { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, // 39 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, // 40 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, // 41 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, // 42 + new byte[16] { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, // 43 + new byte[16] { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, // 44 + new byte[16] { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, // 45 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, // 46 + new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, // 47 + new byte[16] { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, // 48 + new byte[16] { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, // 49 + new byte[16] { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, // 50 + new byte[16] { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, // 51 + new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, // 52 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 53 + new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, // 54 + new byte[16] { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, // 55 + new byte[16] { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, // 56 + new byte[16] { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, // 57 + new byte[16] { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, // 58 + new byte[16] { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, // 59 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, // 60 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, // 61 + new byte[16] { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, // 62 + new byte[16] { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 } // 63 + }, + new byte[64][] + { + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2 }, // 0 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1 }, // 1 + new byte[16] { 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 2 + new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1 }, // 3 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2 }, // 4 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2 }, // 5 + new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1 }, // 6 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1 }, // 7 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2 }, // 8 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2 }, // 9 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 10 + new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2 }, // 11 + new byte[16] { 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2 }, // 12 + new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2 }, // 13 + new byte[16] { 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 14 + new byte[16] { 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0 }, // 15 + new byte[16] { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2 }, // 16 + new byte[16] { 0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0 }, // 17 + new byte[16] { 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2 }, // 18 + new byte[16] { 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1 }, // 19 + new byte[16] { 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2 }, // 20 + new byte[16] { 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1 }, // 21 + new byte[16] { 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2 }, // 22 + new byte[16] { 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0 }, // 23 + new byte[16] { 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0 }, // 24 + new byte[16] { 0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2 }, // 25 + new byte[16] { 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0 }, // 26 + new byte[16] { 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1 }, // 27 + new byte[16] { 0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2 }, // 28 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2 }, // 29 + new byte[16] { 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1 }, // 30 + new byte[16] { 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1 }, // 31 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2 }, // 32 + new byte[16] { 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1 }, // 33 + new byte[16] { 0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2 }, // 34 + new byte[16] { 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0 }, // 35 + new byte[16] { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0 }, // 36 + new byte[16] { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0 }, // 37 + new byte[16] { 0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0 }, // 38 + new byte[16] { 0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1 }, // 39 + new byte[16] { 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1 }, // 40 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2 }, // 41 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1 }, // 42 + new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2 }, // 43 + new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1 }, // 44 + new byte[16] { 0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1 }, // 45 + new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1 }, // 46 + new byte[16] { 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }, // 47 + new byte[16] { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2 }, // 48 + new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1 }, // 49 + new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2 }, // 50 + new byte[16] { 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2 }, // 51 + new byte[16] { 0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2 }, // 52 + new byte[16] { 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2 }, // 53 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2 }, // 54 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2 }, // 55 + new byte[16] { 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2 }, // 56 + new byte[16] { 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2 }, // 57 + new byte[16] { 0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2 }, // 58 + new byte[16] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2 }, // 59 + new byte[16] { 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1 }, // 60 + new byte[16] { 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2 }, // 61 + new byte[16] { 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 }, // 62 + new byte[16] { 0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0 } // 63 + } + }; + } +} diff --git a/Ryujinx.Graphics.Texture/Encoders/BC7Utils.cs b/Ryujinx.Graphics.Texture/Encoders/BC7Utils.cs new file mode 100644 index 000000000..c43d90fb7 --- /dev/null +++ b/Ryujinx.Graphics.Texture/Encoders/BC7Utils.cs @@ -0,0 +1,1305 @@ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture.Encoders +{ + static class BC7Utils + { + private static byte[][] _quantizationLut; + private static byte[][] _quantizationLutNoPBit; + + static BC7Utils() + { + _quantizationLut = new byte[5][]; + _quantizationLutNoPBit = new byte[5][]; + + for (int depth = 4; depth < 9; depth++) + { + byte[] lut = new byte[512]; + byte[] lutNoPBit = new byte[256]; + + for (int i = 0; i < lut.Length; i++) + { + lut[i] = QuantizeComponentForLut((byte)i, depth, i >> 8); + + if (i < lutNoPBit.Length) + { + lutNoPBit[i] = QuantizeComponentForLut((byte)i, depth); + } + } + + _quantizationLut[depth - 4] = lut; + _quantizationLutNoPBit[depth - 4] = lutNoPBit; + } + } + + public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan tile, int w, int h) + { + if (Sse41.IsSupported && w == 4 && h == 4) + { + GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 minColor, out RgbaColor8 maxColor); + + return (minColor, maxColor); + } + else + { + RgbaColor8 minColor = new RgbaColor8(255, 255, 255, 255); + RgbaColor8 maxColor = default; + + for (int i = 0; i < tile.Length; i++) + { + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]); + + minColor.R = Math.Min(minColor.R, color.R); + minColor.G = Math.Min(minColor.G, color.G); + minColor.B = Math.Min(minColor.B, color.B); + minColor.A = Math.Min(minColor.A, color.A); + + maxColor.R = Math.Max(maxColor.R, color.R); + maxColor.G = Math.Max(maxColor.G, color.G); + maxColor.B = Math.Max(maxColor.B, color.B); + maxColor.A = Math.Max(maxColor.A, color.A); + } + + return (minColor, maxColor); + } + } + + public static void GetMinMaxColors( + ReadOnlySpan partitionTable, + ReadOnlySpan tile, + int w, + int h, + Span minColors, + Span maxColors, + int subsetCount) + { + if (Sse41.IsSupported && w == 4 && h == 4) + { + if (subsetCount == 1) + { + GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]); + return; + } + else if (subsetCount == 2) + { + GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors); + return; + } + } + + minColors.Fill(new RgbaColor8(255, 255, 255, 255)); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++) + { + int subset = partitionTable[ty * w + tx]; + RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]); + + minColors[subset].R = Math.Min(minColors[subset].R, color.R); + minColors[subset].G = Math.Min(minColors[subset].G, color.G); + minColors[subset].B = Math.Min(minColors[subset].B, color.B); + minColors[subset].A = Math.Min(minColors[subset].A, color.A); + + maxColors[subset].R = Math.Max(maxColors[subset].R, color.R); + maxColors[subset].G = Math.Max(maxColors[subset].G, color.G); + maxColors[subset].B = Math.Max(maxColors[subset].B, color.B); + maxColors[subset].A = Math.Max(maxColors[subset].A, color.A); + } + } + } + + private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan tile, out RgbaColor8 minColor, out RgbaColor8 maxColor) + { + Vector128 min = Vector128.AllBitsSet; + Vector128 max = Vector128.Zero; + Vector128 row0, row1, row2, row3; + + fixed (uint* pTile = tile) + { + row0 = Sse2.LoadVector128(pTile).AsByte(); + row1 = Sse2.LoadVector128(pTile + 4).AsByte(); + row2 = Sse2.LoadVector128(pTile + 8).AsByte(); + row3 = Sse2.LoadVector128(pTile + 12).AsByte(); + } + + min = Sse2.Min(min, row0); + max = Sse2.Max(max, row0); + min = Sse2.Min(min, row1); + max = Sse2.Max(max, row1); + min = Sse2.Min(min, row2); + max = Sse2.Max(max, row2); + min = Sse2.Min(min, row3); + max = Sse2.Max(max, row3); + + minColor = HorizontalMin(min); + maxColor = HorizontalMax(max); + } + + private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41( + ReadOnlySpan partitionTable, + ReadOnlySpan tile, + Span minColors, + Span maxColors) + { + Vector128 partitionMask; + + fixed (byte* pPartitionTable = partitionTable) + { + partitionMask = Sse2.LoadVector128(pPartitionTable); + } + + Vector128 subset0Mask = Sse2.CompareEqual(partitionMask, Vector128.Zero); + + Vector128 subset0MaskRep16Low = Sse2.UnpackLow(subset0Mask, subset0Mask); + Vector128 subset0MaskRep16High = Sse2.UnpackHigh(subset0Mask, subset0Mask); + + Vector128 subset0Mask0 = Sse2.UnpackLow(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte(); + Vector128 subset0Mask1 = Sse2.UnpackHigh(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte(); + Vector128 subset0Mask2 = Sse2.UnpackLow(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte(); + Vector128 subset0Mask3 = Sse2.UnpackHigh(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte(); + + Vector128 min0 = Vector128.AllBitsSet; + Vector128 min1 = Vector128.AllBitsSet; + Vector128 max0 = Vector128.Zero; + Vector128 max1 = Vector128.Zero; + + Vector128 row0, row1, row2, row3; + + fixed (uint* pTile = tile) + { + row0 = Sse2.LoadVector128(pTile).AsByte(); + row1 = Sse2.LoadVector128(pTile + 4).AsByte(); + row2 = Sse2.LoadVector128(pTile + 8).AsByte(); + row3 = Sse2.LoadVector128(pTile + 12).AsByte(); + } + + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row0, subset0Mask0)); + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row1, subset0Mask1)); + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row2, subset0Mask2)); + min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row3, subset0Mask3)); + + min1 = Sse2.Min(min1, Sse2.Or(row0, subset0Mask0)); + min1 = Sse2.Min(min1, Sse2.Or(row1, subset0Mask1)); + min1 = Sse2.Min(min1, Sse2.Or(row2, subset0Mask2)); + min1 = Sse2.Min(min1, Sse2.Or(row3, subset0Mask3)); + + max0 = Sse2.Max(max0, Sse2.And(row0, subset0Mask0)); + max0 = Sse2.Max(max0, Sse2.And(row1, subset0Mask1)); + max0 = Sse2.Max(max0, Sse2.And(row2, subset0Mask2)); + max0 = Sse2.Max(max0, Sse2.And(row3, subset0Mask3)); + + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask0, row0)); + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask1, row1)); + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask2, row2)); + max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask3, row3)); + + minColors[0] = HorizontalMin(min0); + minColors[1] = HorizontalMin(min1); + maxColors[0] = HorizontalMax(max0); + maxColors[1] = HorizontalMax(max1); + } + + private static RgbaColor8 HorizontalMin(Vector128 x) + { + x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte()); + x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte()); + return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0)); + } + + private static RgbaColor8 HorizontalMax(Vector128 x) + { + x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte()); + x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte()); + return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0)); + } + + public static int SelectIndices( + ReadOnlySpan values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + if (Sse41.IsSupported) + { + if (indexBitCount == 2) + { + return Select2BitIndicesSse41( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + else if (indexBitCount == 3) + { + return Select3BitIndicesSse41( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + else if (indexBitCount == 4) + { + return Select4BitIndicesOneSubsetSse41( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + } + + return SelectIndicesFallback( + values, + endPoint0, + endPoint1, + pBit0, + pBit1, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + alphaMask); + } + + private static unsafe int Select2BitIndicesSse41( + ReadOnlySpan values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128 c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128 c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[0], pInvWeights = BC7Tables.InverseWeights[0]) + { + rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte(); + } + + Vector128 iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + + for (int i = 0; i < values.Length; i++) + { + uint c = values[i] | alphaMask; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + + Vector128 delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01); + + Vector128 min = Sse41.MinHorizontal(delta); + + ushort error = min.GetElement(0); + + errorSum += error; + } + + return errorSum; + } + + private static unsafe int Select3BitIndicesSse41( + ReadOnlySpan values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128 c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128 c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[1], pInvWeights = BC7Tables.InverseWeights[1]) + { + rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte(); + } + + Vector128 iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128 pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128 pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + + for (int i = 0; i < values.Length; i++) + { + uint c = values[i] | alphaMask; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + Vector128 delta2 = Sse2.Subtract(color, pal2); + Vector128 delta3 = Sse2.Subtract(color, pal3); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128 deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128 deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128 deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + + Vector128 delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + + Vector128 min = Sse41.MinHorizontal(delta); + + ushort error = min.GetElement(0); + + errorSum += error; + } + + return errorSum; + } + + private static unsafe int Select4BitIndicesOneSubsetSse41( + ReadOnlySpan values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128 c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128 c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[2], pInvWeights = BC7Tables.InverseWeights[2]) + { + rWeights = Sse2.LoadVector128(pWeights); + lWeights = Sse2.LoadVector128(pInvWeights); + } + + Vector128 iWeightsLow = Sse2.UnpackLow(lWeights, rWeights); + Vector128 iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128 iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128 iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128 iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128 iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128 iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + Vector128 iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128 pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128 pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + Vector128 pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte())); + Vector128 pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte())); + Vector128 pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte())); + Vector128 pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte())); + + for (int i = 0; i < values.Length; i++) + { + uint c = values[i] | alphaMask; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + Vector128 delta2 = Sse2.Subtract(color, pal2); + Vector128 delta3 = Sse2.Subtract(color, pal3); + Vector128 delta4 = Sse2.Subtract(color, pal4); + Vector128 delta5 = Sse2.Subtract(color, pal5); + Vector128 delta6 = Sse2.Subtract(color, pal6); + Vector128 delta7 = Sse2.Subtract(color, pal7); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128 deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128 deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + Vector128 deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4); + Vector128 deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5); + Vector128 deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6); + Vector128 deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128 deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + Vector128 deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5); + Vector128 deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7); + + Vector128 delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + Vector128 delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67); + + Vector128 min0123 = Sse41.MinHorizontal(delta0123); + Vector128 min4567 = Sse41.MinHorizontal(delta4567); + + ushort minPos0123 = min0123.GetElement(0); + ushort minPos4567 = min4567.GetElement(0); + + if (minPos4567 < minPos0123) + { + errorSum += minPos4567; + } + else + { + errorSum += minPos0123; + } + } + + return errorSum; + } + + private static int SelectIndicesFallback( + ReadOnlySpan values, + uint endPoint0, + uint endPoint1, + int pBit0, + int pBit1, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + uint alphaMask) + { + int errorSum = 0; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + Span palette = stackalloc uint[indexCount]; + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Unsafe.As(ref c0) |= alphaMaskForPalette; + Unsafe.As(ref c1) |= alphaMaskForPalette; + + palette[0] = c0.ToUInt32(); + palette[indexCount - 1] = c1.ToUInt32(); + + for (int j = 1; j < indexCount - 1; j++) + { + palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32(); + } + + for (int i = 0; i < values.Length; i++) + { + uint color = values[i] | alphaMask; + + int bestMatchScore = int.MaxValue; + int bestMatchIndex = 0; + + for (int j = 0; j < indexCount; j++) + { + int score = SquaredDifference( + RgbaColor8.FromUInt32(color).GetColor32(), + RgbaColor8.FromUInt32(palette[j]).GetColor32()); + + if (score < bestMatchScore) + { + bestMatchScore = score; + bestMatchIndex = j; + } + } + + errorSum += bestMatchScore; + } + + return errorSum; + } + + public static int SelectIndices( + ReadOnlySpan tile, + int w, + int h, + ReadOnlySpan endPoints0, + ReadOnlySpan endPoints1, + ReadOnlySpan pBitValues, + Span indices, + int subsetCount, + int partition, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + if (Sse41.IsSupported) + { + if (indexBitCount == 2) + { + return Select2BitIndicesSse41( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + indices, + subsetCount, + partition, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + else if (indexBitCount == 3) + { + return Select3BitIndicesSse41( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + indices, + subsetCount, + partition, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + else if (indexBitCount == 4) + { + Debug.Assert(subsetCount == 1); + + return Select4BitIndicesOneSubsetSse41( + tile, + w, + h, + endPoints0[0], + endPoints1[0], + pBitValues, + indices, + partition, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + } + + return SelectIndicesFallback( + tile, + w, + h, + endPoints0, + endPoints1, + pBitValues, + indices, + subsetCount, + partition, + indexBitCount, + indexCount, + colorDepth, + alphaDepth, + pBits, + alphaMask); + } + + private static unsafe int Select2BitIndicesSse41( + ReadOnlySpan tile, + int w, + int h, + ReadOnlySpan endPoints0, + ReadOnlySpan endPoints1, + ReadOnlySpan pBitValues, + Span indices, + int subsetCount, + int partition, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + byte[] partitionTable = BC7Tables.PartitionTable[subsetCount - 1][partition]; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + for (int subset = 0; subset < subsetCount; subset++) + { + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1); + + Vector128 c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128 c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[0], pInvWeights = BC7Tables.InverseWeights[0]) + { + rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte(); + } + + Vector128 iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + int tileOffset = ty * 4 + tx; + if (partitionTable[tileOffset] != subset) + { + continue; + } + + uint c = tile[i] | alphaMask; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + + Vector128 delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01); + + Vector128 min = Sse41.MinHorizontal(delta); + + uint minPos = min.AsUInt32().GetElement(0); + ushort error = (ushort)minPos; + uint index = minPos >> 16; + + indices[tileOffset] = (byte)index; + errorSum += error; + } + } + } + + return errorSum; + } + + private static unsafe int Select3BitIndicesSse41( + ReadOnlySpan tile, + int w, + int h, + ReadOnlySpan endPoints0, + ReadOnlySpan endPoints1, + ReadOnlySpan pBitValues, + Span indices, + int subsetCount, + int partition, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + byte[] partitionTable = BC7Tables.PartitionTable[subsetCount - 1][partition]; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + for (int subset = 0; subset < subsetCount; subset++) + { + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1); + + Vector128 c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128 c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[1], pInvWeights = BC7Tables.InverseWeights[1]) + { + rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte(); + lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte(); + } + + Vector128 iWeights = Sse2.UnpackLow(lWeights, rWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128 pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128 pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + int tileOffset = ty * 4 + tx; + if (partitionTable[tileOffset] != subset) + { + continue; + } + + uint c = tile[i] | alphaMask; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + Vector128 delta2 = Sse2.Subtract(color, pal2); + Vector128 delta3 = Sse2.Subtract(color, pal3); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128 deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128 deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128 deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + + Vector128 delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + + Vector128 min = Sse41.MinHorizontal(delta); + + uint minPos = min.AsUInt32().GetElement(0); + ushort error = (ushort)minPos; + uint index = minPos >> 16; + + indices[tileOffset] = (byte)index; + errorSum += error; + } + } + } + + return errorSum; + } + + private static unsafe int Select4BitIndicesOneSubsetSse41( + ReadOnlySpan tile, + int w, + int h, + uint endPoint0, + uint endPoint1, + ReadOnlySpan pBitValues, + Span indices, + int partition, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + int errorSum = 0; + + int pBit0 = -1, pBit1 = -1; + + if (pBits != 0) + { + pBit0 = pBitValues[0]; + pBit1 = pBitValues[1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1); + + Vector128 c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte(); + Vector128 c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte(); + + Vector128 c0c1 = Sse2.UnpackLow(c0Rep, c1Rep); + + Vector128 rWeights; + Vector128 lWeights; + + fixed (byte* pWeights = BC7Tables.Weights[2], pInvWeights = BC7Tables.InverseWeights[2]) + { + rWeights = Sse2.LoadVector128(pWeights); + lWeights = Sse2.LoadVector128(pInvWeights); + } + + Vector128 iWeightsLow = Sse2.UnpackLow(lWeights, rWeights); + Vector128 iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights); + Vector128 iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128 iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); + Vector128 iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128 iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); + Vector128 iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); + Vector128 iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); + Vector128 iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128 iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); + Vector128 iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + Vector128 iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); + + Vector128 pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); + Vector128 pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); + Vector128 pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); + Vector128 pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); + Vector128 pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte())); + Vector128 pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte())); + Vector128 pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte())); + Vector128 pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte())); + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++, i++) + { + uint c = tile[i] | alphaMask; + + Vector128 color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); + + Vector128 delta0 = Sse2.Subtract(color, pal0); + Vector128 delta1 = Sse2.Subtract(color, pal1); + Vector128 delta2 = Sse2.Subtract(color, pal2); + Vector128 delta3 = Sse2.Subtract(color, pal3); + Vector128 delta4 = Sse2.Subtract(color, pal4); + Vector128 delta5 = Sse2.Subtract(color, pal5); + Vector128 delta6 = Sse2.Subtract(color, pal6); + Vector128 delta7 = Sse2.Subtract(color, pal7); + + Vector128 deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); + Vector128 deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); + Vector128 deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); + Vector128 deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); + Vector128 deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4); + Vector128 deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5); + Vector128 deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6); + Vector128 deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7); + + Vector128 deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); + Vector128 deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); + Vector128 deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5); + Vector128 deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7); + + Vector128 delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); + Vector128 delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67); + + Vector128 min0123 = Sse41.MinHorizontal(delta0123); + Vector128 min4567 = Sse41.MinHorizontal(delta4567); + + uint minPos0123 = min0123.AsUInt32().GetElement(0); + uint minPos4567 = min4567.AsUInt32().GetElement(0); + + if ((ushort)minPos4567 < (ushort)minPos0123) + { + errorSum += (ushort)minPos4567; + indices[ty * 4 + tx] = (byte)(8 + (minPos4567 >> 16)); + } + else + { + errorSum += (ushort)minPos0123; + indices[ty * 4 + tx] = (byte)(minPos0123 >> 16); + } + } + } + + return errorSum; + } + + private static Vector128 ShiftRoundToNearest(Vector128 x) + { + return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6); + } + + private static int SelectIndicesFallback( + ReadOnlySpan tile, + int w, + int h, + ReadOnlySpan endPoints0, + ReadOnlySpan endPoints1, + ReadOnlySpan pBitValues, + Span indices, + int subsetCount, + int partition, + int indexBitCount, + int indexCount, + int colorDepth, + int alphaDepth, + int pBits, + uint alphaMask) + { + int errorSum = 0; + + uint alphaMaskForPalette = alphaMask; + + if (alphaDepth == 0) + { + alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); + } + + Span palette = stackalloc uint[subsetCount * indexCount]; + + for (int subset = 0; subset < subsetCount; subset++) + { + int palBase = subset * indexCount; + + int pBit0 = -1, pBit1 = -1; + + if (pBits == subsetCount) + { + pBit0 = pBit1 = pBitValues[subset]; + } + else if (pBits != 0) + { + pBit0 = pBitValues[subset * 2]; + pBit1 = pBitValues[subset * 2 + 1]; + } + + RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); + RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1); + + Unsafe.As(ref c0) |= alphaMaskForPalette; + Unsafe.As(ref c1) |= alphaMaskForPalette; + + palette[palBase + 0] = c0.ToUInt32(); + palette[palBase + indexCount - 1] = c1.ToUInt32(); + + for (int j = 1; j < indexCount - 1; j++) + { + palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32(); + } + } + + int i = 0; + for (int ty = 0; ty < h; ty++) + { + for (int tx = 0; tx < w; tx++) + { + int subset = BC7Tables.PartitionTable[subsetCount - 1][partition][ty * 4 + tx]; + uint color = tile[i++] | alphaMask; + + int bestMatchScore = int.MaxValue; + int bestMatchIndex = 0; + + for (int j = 0; j < indexCount; j++) + { + int score = SquaredDifference( + RgbaColor8.FromUInt32(color).GetColor32(), + RgbaColor8.FromUInt32(palette[subset * indexCount + j]).GetColor32()); + + if (score < bestMatchScore) + { + bestMatchScore = score; + bestMatchIndex = j; + } + } + + indices[ty * 4 + tx] = (byte)bestMatchIndex; + errorSum += bestMatchScore; + } + } + + return errorSum; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2) + { + RgbaColor32 delta = color1 - color2; + return RgbaColor32.Dot(delta, delta); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount) + { + return Interpolate(color1.GetColor32(), color2.GetColor32(), weightIndex, indexBitCount).GetColor8(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount) + { + Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4); + + int weight = (((weightIndex << 7) / ((1 << indexBitCount) - 1)) + 1) >> 1; + + RgbaColor32 weightV = new RgbaColor32(weight); + RgbaColor32 invWeightV = new RgbaColor32(64 - weight); + + return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6; + } + + public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1) + { + if (alphaBits == 0) + { + int colorShift = 8 - colorBits; + + uint c; + + if (pBit >= 0) + { + byte[] lutColor = _quantizationLut[colorBits - 4]; + + Debug.Assert(pBit <= 1); + int high = pBit << 8; + uint mask = (0xffu >> (colorBits + 1)) * 0x10101; + + c = lutColor[color.R | high]; + c |= (uint)lutColor[color.G | high] << 8; + c |= (uint)lutColor[color.B | high] << 16; + + c <<= colorShift; + c |= (c >> (colorBits + 1)) & mask; + c |= ((uint)pBit * 0x10101) << (colorShift - 1); + } + else + { + byte[] lutColor = _quantizationLutNoPBit[colorBits - 4]; + + uint mask = (0xffu >> colorBits) * 0x10101; + + c = lutColor[color.R]; + c |= (uint)lutColor[color.G] << 8; + c |= (uint)lutColor[color.B] << 16; + + c <<= colorShift; + c |= (c >> colorBits) & mask; + } + + c |= (uint)color.A << 24; + + return RgbaColor8.FromUInt32(c); + } + + return QuantizeFallback(color, colorBits, alphaBits, pBit); + } + + private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1) + { + byte r = UnquantizeComponent(QuantizeComponent(color.R, colorBits, pBit), colorBits, pBit); + byte g = UnquantizeComponent(QuantizeComponent(color.G, colorBits, pBit), colorBits, pBit); + byte b = UnquantizeComponent(QuantizeComponent(color.B, colorBits, pBit), colorBits, pBit); + byte a = alphaBits == 0 ? color.A : UnquantizeComponent(QuantizeComponent(color.A, alphaBits, pBit), alphaBits, pBit); + return new RgbaColor8(r, g, b, a); + } + + public static byte QuantizeComponent(byte component, int bits, int pBit = -1) + { + return pBit >= 0 ? _quantizationLut[bits - 4][component | (pBit << 8)] : _quantizationLutNoPBit[bits - 4][component]; + } + + private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1) + { + int shift = 8 - bits; + int fill = component >> bits; + + if (pBit >= 0) + { + Debug.Assert(pBit <= 1); + fill >>= 1; + fill |= pBit << (shift - 1); + } + + int q1 = component >> shift; + int q2 = Math.Max(q1 - 1, 0); + int q3 = Math.Min(q1 + 1, (1 << bits) - 1); + + int delta1 = FastAbs(((q1 << shift) | fill) - component); + int delta2 = component - ((q2 << shift) | fill); + int delta3 = ((q3 << shift) | fill) - component; + + if (delta1 < delta2 && delta1 < delta3) + { + return (byte)q1; + } + else if (delta2 < delta3) + { + return (byte)q2; + } + else + { + return (byte)q3; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int FastAbs(int x) + { + int sign = x >> 31; + return (x + sign) ^ sign; + } + + private static byte UnquantizeComponent(byte component, int bits, int pBit) + { + int shift = 8 - bits; + int value = component << shift; + + if (pBit >= 0) + { + Debug.Assert(pBit <= 1); + value |= value >> (bits + 1); + value |= pBit << (shift - 1); + } + else + { + value |= value >> bits; + } + + return (byte)value; + } + } +} diff --git a/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs b/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs new file mode 100644 index 000000000..5734d301e --- /dev/null +++ b/Ryujinx.Graphics.Texture/Encoders/EncodeMode.cs @@ -0,0 +1,10 @@ +namespace Ryujinx.Graphics.Texture.Encoders +{ + enum EncodeMode + { + Fast, + Exhaustive, + ModeMask = 0xff, + Multithreaded = 1 << 8 + } +} diff --git a/Ryujinx.Graphics.Texture/Encoders/RgbaColor32.cs b/Ryujinx.Graphics.Texture/Encoders/RgbaColor32.cs new file mode 100644 index 000000000..ed4b7507e --- /dev/null +++ b/Ryujinx.Graphics.Texture/Encoders/RgbaColor32.cs @@ -0,0 +1,206 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture.Encoders +{ + struct RgbaColor32 : IEquatable + { + private readonly Vector128 _color; + + public int R => _color.GetElement(0); + public int G => _color.GetElement(1); + public int B => _color.GetElement(2); + public int A => _color.GetElement(3); + + public RgbaColor32(Vector128 color) + { + _color = color; + } + + public RgbaColor32(int r, int g, int b, int a) + { + _color = Vector128.Create(r, g, b, a); + } + + public RgbaColor32(int scalar) + { + _color = Vector128.Create(scalar); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator +(RgbaColor32 x, RgbaColor32 y) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.Add(x._color, y._color)); + } + else + { + return new RgbaColor32(x.R + y.R, x.G + y.G, x.B + y.B, x.A + y.A); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator -(RgbaColor32 x, RgbaColor32 y) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.Subtract(x._color, y._color)); + } + else + { + return new RgbaColor32(x.R - y.R, x.G - y.G, x.B - y.B, x.A - y.A); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator *(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + return new RgbaColor32(Sse41.MultiplyLow(x._color, y._color)); + } + else + { + return new RgbaColor32(x.R * y.R, x.G * y.G, x.B * y.B, x.A * y.A); + } + } + + public static RgbaColor32 operator /(RgbaColor32 x, RgbaColor32 y) + { + return new RgbaColor32(x.R / y.R, x.G / y.G, x.B / y.B, x.A / y.A); + } + + public static RgbaColor32 DivideGuarded(RgbaColor32 x, RgbaColor32 y, int resultIfZero) + { + int DivideGuarded(int dividend, int divisor) + { + if (divisor == 0) + { + return resultIfZero; + } + + return dividend / divisor; + } + + return new RgbaColor32(DivideGuarded(x.R, y.R), DivideGuarded(x.G, y.G), DivideGuarded(x.B, y.B), DivideGuarded(x.A, y.A)); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator <<(RgbaColor32 x, int shift) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.ShiftLeftLogical(x._color, (byte)shift)); + } + else + { + return new RgbaColor32(x.R << shift, x.G << shift, x.B << shift, x.A << shift); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 operator >>(RgbaColor32 x, int shift) + { + if (Sse2.IsSupported) + { + return new RgbaColor32(Sse2.ShiftRightLogical(x._color, (byte)shift)); + } + else + { + return new RgbaColor32(x.R >> shift, x.G >> shift, x.B >> shift, x.A >> shift); + } + } + + public static bool operator ==(RgbaColor32 x, RgbaColor32 y) + { + return x.Equals(y); + } + + public static bool operator !=(RgbaColor32 x, RgbaColor32 y) + { + return !x.Equals(y); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int Dot(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + Vector128 product = Sse41.MultiplyLow(x._color, y._color); + Vector128 sum = Ssse3.HorizontalAdd(product, product); + sum = Ssse3.HorizontalAdd(sum, sum); + return sum.GetElement(0); + } + else + { + return x.R * y.R + x.G * y.G + x.B * y.B + x.A * y.A; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Max(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + return new RgbaColor32(Sse41.Max(x._color, y._color)); + } + else + { + return new RgbaColor32(Math.Max(x.R, y.R), Math.Max(x.G, y.G), Math.Max(x.B, y.B), Math.Max(x.A, y.A)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static RgbaColor32 Min(RgbaColor32 x, RgbaColor32 y) + { + if (Sse41.IsSupported) + { + return new RgbaColor32(Sse41.Min(x._color, y._color)); + } + else + { + return new RgbaColor32(Math.Min(x.R, y.R), Math.Min(x.G, y.G), Math.Min(x.B, y.B), Math.Min(x.A, y.A)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public RgbaColor8 GetColor8() + { + if (Sse41.IsSupported) + { + Vector128 temp = _color; + Vector128 color16 = Sse41.PackUnsignedSaturate(temp, temp); + Vector128 color8 = Sse2.PackUnsignedSaturate(color16.AsInt16(), color16.AsInt16()); + uint color = color8.AsUInt32().GetElement(0); + return Unsafe.As(ref color); + } + else + { + return new RgbaColor8(ClampByte(R), ClampByte(G), ClampByte(B), ClampByte(A)); + } + } + + private static byte ClampByte(int value) + { + return (byte)Math.Clamp(value, 0, 255); + } + + public override int GetHashCode() + { + return HashCode.Combine(R, G, B, A); + } + + public override bool Equals(object obj) + { + return obj is RgbaColor32 other && Equals(other); + } + + public bool Equals(RgbaColor32 other) + { + return _color.Equals(other._color); + } + } +} diff --git a/Ryujinx.Graphics.Texture/Encoders/RgbaColor8.cs b/Ryujinx.Graphics.Texture/Encoders/RgbaColor8.cs new file mode 100644 index 000000000..bbf3c086a --- /dev/null +++ b/Ryujinx.Graphics.Texture/Encoders/RgbaColor8.cs @@ -0,0 +1,83 @@ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Texture.Encoders +{ + struct RgbaColor8 : IEquatable + { + public byte R; + public byte G; + public byte B; + public byte A; + + public RgbaColor8(byte r, byte g, byte b, byte a) + { + R = r; + G = g; + B = b; + A = a; + } + + public static RgbaColor8 FromUInt32(uint color) + { + return Unsafe.As(ref color); + } + + public static bool operator ==(RgbaColor8 x, RgbaColor8 y) + { + return x.Equals(y); + } + + public static bool operator !=(RgbaColor8 x, RgbaColor8 y) + { + return !x.Equals(y); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public RgbaColor32 GetColor32() + { + if (Sse41.IsSupported) + { + Vector128 color = Vector128.CreateScalarUnsafe(Unsafe.As(ref this)).AsByte(); + return new RgbaColor32(Sse41.ConvertToVector128Int32(color)); + } + else + { + return new RgbaColor32(R, G, B, A); + } + } + + public uint ToUInt32() + { + return Unsafe.As(ref this); + } + + public override int GetHashCode() + { + return HashCode.Combine(R, G, B, A); + } + + public override bool Equals(object obj) + { + return obj is RgbaColor8 other && Equals(other); + } + + public bool Equals(RgbaColor8 other) + { + return R == other.R && G == other.G && B == other.B && A == other.A; + } + + public byte GetComponent(int index) + { + return index switch + { + 1 => G, + 2 => B, + 3 => A, + _ => R + }; + } + } +}