From 34a583e04580365c97f412c432fac19b5cbed38b Mon Sep 17 00:00:00 2001 From: gdk Date: Sat, 4 Feb 2023 19:44:30 -0300 Subject: [PATCH] Implement VP9 loop filtering --- src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs | 13 +- src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs | 22 +- .../Common/BitUtils.cs | 4 +- .../Common/MemoryAllocator.cs | 9 +- .../Common/MemoryUtil.cs | 2 +- src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs | 53 +- src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs | 47 + src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs | 1080 ++++++++-- src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs | 401 ++-- src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs | 85 +- src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs | 68 +- .../Dsp/Convolve.cs | 162 +- src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs | 2 +- .../Dsp/IntraPred.cs | 369 ++-- src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs | 1068 +++++----- .../Dsp/LoopFilterAuto.cs | 229 ++ .../Dsp/LoopFilterScalar.cs | 1093 ++++++++++ .../Dsp/LoopFilterSse2.cs | 1837 +++++++++++++++++ src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs | 30 +- src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs | 123 +- .../Dsp/TxfmCommon.cs | 72 +- src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs | 623 ++++++ src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs | 400 ++++ src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs | 165 ++ .../FrameBuffers.cs | 79 + src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs | 169 +- .../InternalErrorException.cs | 4 +- .../InternalErrorInfo.cs | 2 +- src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs | 1832 ++++++++++++++-- src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs | 184 +- src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs | 140 +- src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs | 94 + src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs | 245 +-- .../ReadBitBuffer.cs | 96 + src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs | 90 +- src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs | 271 +-- src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs | 2 +- .../TileWorkerData.cs | 13 +- .../Types/BModeInfo.cs | 4 +- .../Types/BitstreamProfile.cs | 11 
+ .../Types/BlockSize.cs | 30 +- src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs | 2 +- .../Types/BufferPool.cs | 18 + .../Types/FrameType.cs | 2 +- .../Types/LoopFilter.cs | 15 +- .../Types/LoopFilterInfoN.cs | 2 +- .../Types/LoopFilterMask.cs | 2 +- .../Types/LoopFilterThresh.cs | 2 +- .../Types/MacroBlockD.cs | 66 +- .../Types/MacroBlockDPlane.cs | 3 +- .../Types/ModeInfo.cs | 51 +- .../Types/MotionVectorContext.cs | 2 +- src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs | 167 +- src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs | 2 +- .../Types/MvClassType.cs | 24 +- .../Types/MvJointType.cs | 10 +- src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs | 2 +- .../Types/PartitionType.cs | 2 +- .../Types/PlaneType.cs | 6 +- .../Types/Position.cs | 2 +- .../Types/PredictionMode.cs | 32 +- .../Types/RefBuffer.cs | 5 +- .../Types/RefCntBuffer.cs | 12 + .../Types/ReferenceMode.cs | 10 +- .../Types/ScaleFactors.cs | 312 +-- .../Types/SegLvlFeatures.cs | 12 +- .../Types/Segmentation.cs | 103 +- .../Types/Surface.cs | 196 +- .../Types/TileInfo.cs | 8 +- .../Types/TxMode.cs | 14 +- .../Types/TxSize.cs | 12 +- .../Types/TxType.cs | 12 +- .../Types/Vp9Common.cs | 724 ++++++- .../Types/Vp9Decoder.cs | 410 ++++ .../Types/VpxCodecFrameBuffer.cs | 10 + .../Types/VpxColorRange.cs | 11 + .../Types/VpxColorSpace.cs | 29 + .../Types/Vp9/PictureInfo.cs | 2 + src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs | 4 +- 79 files changed, 10926 insertions(+), 2595 deletions(-) create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterAuto.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterScalar.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterSse2.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/FrameBuffers.cs create mode 
100644 src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/ReadBitBuffer.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/BitstreamProfile.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/BufferPool.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefCntBuffer.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Decoder.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxCodecFrameBuffer.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorRange.cs create mode 100644 src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorSpace.cs diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs index b7b709536..4b4e7ecfa 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs @@ -2,8 +2,13 @@ { internal enum BitDepth { - Bits8 = 8, /**< 8 bits */ - Bits10 = 10, /**< 10 bits */ - Bits12 = 12, /**< 12 bits */ + // 8 bits + Bits8 = 8, + + // 10 bits + Bits10 = 10, + + // 12 bits + Bits12 = 12 } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs index b695fed5b..82658b4cf 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs @@ -3,26 +3,26 @@ internal enum CodecErr { /*!\brief Operation completed without error */ - CodecOk, + Ok, /*!\brief Unspecified error */ - CodecError, + Error, /*!\brief Memory operation failed */ - CodecMemError, + MemError, /*!\brief ABI version mismatch */ - CodecAbiMismatch, + AbiMismatch, /*!\brief Algorithm does not have required capability */ - CodecIncapable, + Incapable, /*!\brief The given bitstream is not supported. * * The bitstream was unable to be parsed at the highest level. The decoder * is unable to proceed. This error \ref SHOULD be treated as fatal to the * stream. 
*/ - CodecUnsupBitstream, + UnsupBitstream, /*!\brief Encoded bitstream uses an unsupported feature * @@ -31,7 +31,7 @@ * pictures from being properly decoded. This error \ref MAY be treated as * fatal to the stream or \ref MAY be treated as fatal to the current GOP. */ - CodecUnsupFeature, + UnsupFeature, /*!\brief The coded data for this stream is corrupt or incomplete * @@ -41,16 +41,16 @@ * stream or \ref MAY be treated as fatal to the current GOP. If decoding * is continued for the current GOP, artifacts may be present. */ - CodecCorruptFrame, + CorruptFrame, /*!\brief An application-supplied parameter is not valid. * */ - CodecInvalidParam, + InvalidParam, /*!\brief An iterator reached the end of list. * */ - CodecListEnd + ListEnd } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs index 641188f8a..df350ac87 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs @@ -10,7 +10,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common [MethodImpl(MethodImplOptions.AggressiveInlining)] public static byte ClipPixel(int val) { - return (byte)((val > 255) ? 255 : (val < 0) ? 0 : val); + return (byte)(val > 255 ? 255 : val < 0 ? 0 : val); } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -55,4 +55,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common return numValues > 0 ? 
GetMsb(numValues) + 1 : 0; } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs index 473dd904a..af9c67608 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs @@ -16,7 +16,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common public bool InUse; } - private PoolItem[] _pool = new PoolItem[PoolEntries]; + private readonly PoolItem[] _pool = new PoolItem[PoolEntries]; public ArrayPtr Allocate(int length) where T : unmanaged { @@ -51,6 +51,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common { Marshal.FreeHGlobal(item.Pointer); } + item.Pointer = ptr; item.Length = lengthInBytes; break; @@ -58,7 +59,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common } } - return new ArrayPtr(ptr, length); + ArrayPtr allocation = new ArrayPtr(ptr, length); + + allocation.AsSpan().Fill(default); + + return allocation; } public unsafe void Free(ArrayPtr arr) where T : unmanaged diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs index 909a94832..6277590ef 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs @@ -20,4 +20,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common new Span(ptr, length).Fill(value); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs index aaf1d7b98..4d7f919e5 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs @@ -1,8 +1,10 @@ -namespace Ryujinx.Graphics.Nvdec.Vp9 +using Ryujinx.Graphics.Nvdec.Vp9.Types; + +namespace Ryujinx.Graphics.Nvdec.Vp9 { internal static class Constants { - public const int Vp9InterpExtend = 4; + public const int InterpExtend = 4; public const int MaxMbPlane = 3; @@ -14,17 +16,18 @@ 
public const int MaxRefFrames = 4; public const int MiSizeLog2 = 3; - public const int MiBlockSizeLog2 = 6 - MiSizeLog2; // 64 = 2^6 + public const int MiBlockSizeLog2 = 6 - MiSizeLog2; // 64 = 2^6 - public const int MiSize = 1 << MiSizeLog2; // pixels per mi-unit - public const int MiBlockSize = 1 << MiBlockSizeLog2; // mi-units per max block + public const int MiSize = 1 << MiSizeLog2; // pixels per mi-unit + public const int MiBlockSize = 1 << MiBlockSizeLog2; // mi-units per max block public const int MiMask = MiBlockSize - 1; - public const int PartitionPloffset = 4; // number of probability models per block size + public const int PartitionPloffset = 4; // number of probability models per block size /* Segment Feature Masks */ public const int MaxMvRefCandidates = 2; + public const int IntraInterContexts = 4; public const int CompInterContexts = 5; public const int RefContexts = 5; @@ -32,12 +35,26 @@ public const int EightTapSmooth = 1; public const int EightTapSharp = 2; public const int SwitchableFilters = 3; /* Number of switchable filters */ + public const int Bilinear = 3; - public const int Switchable = 4; /* should be the last one */ + + // The codec can operate in four possible inter prediction filter mode: + // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. + public const int SwitchableFilterContexts = SwitchableFilters + 1; + public const int Switchable = 4; /* Should be the last one */ // Frame public const int RefsPerFrame = 3; + public const int RefFramesLog2 = 3; + public const int RefFrames = 1 << RefFramesLog2; + + // 1 scratch frame for the new frame, 3 for scaled references on the encoder. 
+ public const int FrameBuffers = RefFrames + 4; + + public const int FrameContextsLog2 = 2; + public const int FrameContexts = 1 << FrameContextsLog2; + public const int NumPingPongBuffers = 2; public const int Class0Bits = 1; /* bits at integer precision for class 0 */ @@ -48,9 +65,9 @@ public const int MvLow = -(1 << MvInUseBits); // Coefficient token alphabet - public const int ZeroToken = 0; // 0 Extra Bits 0+0 - public const int OneToken = 1; // 1 Extra Bits 0+1 - public const int TwoToken = 2; // 2 Extra Bits 0+1 + public const int ZeroToken = 0; // 0 Extra Bits 0+0 + public const int OneToken = 1; // 1 Extra Bits 0+1 + public const int TwoToken = 2; // 2 Extra Bits 0+1 public const int PivotNode = 2; @@ -65,5 +82,19 @@ public const int SegmentAbsData = 1; public const int MaxSegments = 8; + + public const int PartitionTypes = (int)PartitionType.PartitionTypes; + + public const int PartitionPlOffset = 4; // Number of probability models per block size + public const int PartitionContexts = 4 * PartitionPlOffset; + + public const int PlaneTypes = (int)PlaneType.PlaneTypes; + + public const int IntraModes = (int)PredictionMode.TmPred + 1; + + public const int InterModes = 1 + (int)PredictionMode.NewMv - (int)PredictionMode.NearestMv; + + public const int SkipContexts = 3; + public const int InterModeContexts = 7; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs new file mode 100644 index 000000000..cac1b67f6 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs @@ -0,0 +1,47 @@ +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class DSubExp + { + public static int InvRecenterNonneg(int v, int m) + { + if (v > 2 * m) + { + return v; + } + + return (v & 1) != 0 ? 
m - ((v + 1) >> 1) : m + (v >> 1); + } + + private static readonly byte[] InvMapTable = + { + 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, + 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, + 115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, + 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 181, 182, + 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, + 206, 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, + 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, + 251, 252, 253, 253 + }; + + public static int InvRemapProb(int v, int m) + { + Debug.Assert(v < InvMapTable.Length / sizeof(byte)); + + v = InvMapTable[v]; + m--; + if (m << 1 <= Prob.MaxProb) + { + return 1 + InvRecenterNonneg(v, m); + } + + return Prob.MaxProb - InvRecenterNonneg(v, Prob.MaxProb - 1 - m); + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs index cdd645a38..e61c9567e 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs @@ -9,18 +9,99 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using 
System.Threading.Tasks; -using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv; namespace Ryujinx.Graphics.Nvdec.Vp9 { - static class DecodeFrame + internal static class DecodeFrame { private static bool ReadIsValid(ArrayPtr start, int len) { return len != 0 && len <= start.Length; } - private static void InverseTransformBlockInter(ref MacroBlockD xd, int plane, TxSize txSize, Span dst, int stride, int eob) + private static void ReadTxModeProbs(ref Vp9EntropyProbs txProbs, ref Reader r) + { + for (int i = 0; i < EntropyMode.TxSizeContexts; ++i) + { + for (int j = 0; j < (int)TxSize.TxSizes - 3; ++j) + { + r.DiffUpdateProb(ref txProbs.Tx8x8Prob[i][j]); + } + } + + for (int i = 0; i < EntropyMode.TxSizeContexts; ++i) + { + for (int j = 0; j < (int)TxSize.TxSizes - 2; ++j) + { + r.DiffUpdateProb(ref txProbs.Tx16x16Prob[i][j]); + } + } + + for (int i = 0; i < EntropyMode.TxSizeContexts; ++i) + { + for (int j = 0; j < (int)TxSize.TxSizes - 1; ++j) + { + r.DiffUpdateProb(ref txProbs.Tx32x32Prob[i][j]); + } + } + } + + private static void ReadSwitchableInterpProbs(ref Vp9EntropyProbs fc, ref Reader r) + { + for (int j = 0; j < Constants.SwitchableFilterContexts; ++j) + { + for (int i = 0; i < Constants.SwitchableFilters - 1; ++i) + { + r.DiffUpdateProb(ref fc.SwitchableInterpProb[j][i]); + } + } + } + + private static void ReadInterModeProbs(ref Vp9EntropyProbs fc, ref Reader r) + { + for (int i = 0; i < Constants.InterModeContexts; ++i) + { + for (int j = 0; j < Constants.InterModes - 1; ++j) + { + r.DiffUpdateProb( ref fc.InterModeProb[i][j]); + } + } + } + + private static void ReadMvProbs(ref Vp9EntropyProbs ctx, bool allowHp, ref Reader r) + { + r.UpdateMvProbs(ctx.Joints.AsSpan(), EntropyMv.Joints - 1); + + for (int i = 0; i < 2; ++i) + { + r.UpdateMvProbs(MemoryMarshal.CreateSpan(ref ctx.Sign[i], 1), 1); + r.UpdateMvProbs(ctx.Classes[i].AsSpan(), EntropyMv.Classes - 1); + r.UpdateMvProbs(ctx.Class0[i].AsSpan(), EntropyMv.Class0Size - 1); + 
r.UpdateMvProbs(ctx.Bits[i].AsSpan(), EntropyMv.OffsetBits); + } + + for (int i = 0; i < 2; ++i) + { + for (int j = 0; j < EntropyMv.Class0Size; ++j) + { + r.UpdateMvProbs(ctx.Class0Fp[i][j].AsSpan(), EntropyMv.FpSize - 1); + } + + r.UpdateMvProbs(ctx.Fp[i].AsSpan(), 3); + } + + if (allowHp) + { + for (int i = 0; i < 2; ++i) + { + r.UpdateMvProbs(MemoryMarshal.CreateSpan(ref ctx.Class0Hp[i], 1), 1); + r.UpdateMvProbs(MemoryMarshal.CreateSpan(ref ctx.Hp[i], 1), 1); + } + } + } + + private static void InverseTransformBlockInter(ref MacroBlockD xd, int plane, TxSize txSize, Span dst, + int stride, int eob) { ref MacroBlockDPlane pd = ref xd.Plane[plane]; ArrayPtr dqcoeff = pd.DqCoeff; @@ -48,7 +129,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 case TxSize.Tx32x32: Idct.HighbdIdct32x32Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd); break; - default: Debug.Assert(false, "Invalid transform size"); break; + default: + Debug.Assert(false, "Invalid transform size"); + break; } } } @@ -62,11 +145,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { switch (txSize) { - case TxSize.Tx4x4: Idct.Idct4x4Add(dqcoeff.AsSpan(), dst, stride, eob); break; - case TxSize.Tx8x8: Idct.Idct8x8Add(dqcoeff.AsSpan(), dst, stride, eob); break; - case TxSize.Tx16x16: Idct.Idct16x16Add(dqcoeff.AsSpan(), dst, stride, eob); break; - case TxSize.Tx32x32: Idct.Idct32x32Add(dqcoeff.AsSpan(), dst, stride, eob); break; - default: Debug.Assert(false, "Invalid transform size"); return; + case TxSize.Tx4x4: + Idct.Idct4x4Add(dqcoeff.AsSpan(), dst, stride, eob); + break; + case TxSize.Tx8x8: + Idct.Idct8x8Add(dqcoeff.AsSpan(), dst, stride, eob); + break; + case TxSize.Tx16x16: + Idct.Idct16x16Add(dqcoeff.AsSpan(), dst, stride, eob); + break; + case TxSize.Tx32x32: + Idct.Idct32x32Add(dqcoeff.AsSpan(), dst, stride, eob); + break; + default: + Debug.Assert(false, "Invalid transform size"); + return; } } } @@ -79,15 +172,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { if (txSize <= TxSize.Tx16x16 && eob <= 10) { - 
dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Fill(0); + dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Clear(); } else if (txSize == TxSize.Tx32x32 && eob <= 34) { - dqcoeff.AsSpan().Slice(0, 256).Fill(0); + dqcoeff.AsSpan().Slice(0, 256).Clear(); } else { - dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Fill(0); + dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Clear(); } } } @@ -127,7 +220,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 case TxSize.Tx32x32: Idct.HighbdIdct32x32Add(dqcoeff.AsSpan(), dst16, stride, eob, xd.Bd); break; - default: Debug.Assert(false, "Invalid transform size"); break; + default: + Debug.Assert(false, "Invalid transform size"); + break; } } } @@ -141,11 +236,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { switch (txSize) { - case TxSize.Tx4x4: Idct.Iht4x4Add(txType, dqcoeff.AsSpan(), dst, stride, eob); break; - case TxSize.Tx8x8: Idct.Iht8x8Add(txType, dqcoeff.AsSpan(), dst, stride, eob); break; - case TxSize.Tx16x16: Idct.Iht16x16Add(txType, dqcoeff.AsSpan(), dst, stride, eob); break; - case TxSize.Tx32x32: Idct.Idct32x32Add(dqcoeff.AsSpan(), dst, stride, eob); break; - default: Debug.Assert(false, "Invalid transform size"); return; + case TxSize.Tx4x4: + Idct.Iht4x4Add(txType, dqcoeff.AsSpan(), dst, stride, eob); + break; + case TxSize.Tx8x8: + Idct.Iht8x8Add(txType, dqcoeff.AsSpan(), dst, stride, eob); + break; + case TxSize.Tx16x16: + Idct.Iht16x16Add(txType, dqcoeff.AsSpan(), dst, stride, eob); + break; + case TxSize.Tx32x32: + Idct.Idct32x32Add(dqcoeff.AsSpan(), dst, stride, eob); + break; + default: + Debug.Assert(false, "Invalid transform size"); + return; } } } @@ -158,15 +263,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { if (txType == TxType.DctDct && txSize <= TxSize.Tx16x16 && eob <= 10) { - dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Fill(0); + dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Clear(); } else if (txSize == TxSize.Tx32x32 && eob <= 34) { - dqcoeff.AsSpan().Slice(0, 256).Fill(0); + 
dqcoeff.AsSpan().Slice(0, 256).Clear(); } else { - dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Fill(0); + dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Clear(); } } } @@ -181,8 +286,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { ref MacroBlockD xd = ref twd.Xd; ref MacroBlockDPlane pd = ref xd.Plane[plane]; - PredictionMode mode = (plane == 0) ? mi.Mode : mi.UvMode; - int dstOffset = 4 * row * pd.Dst.Stride + 4 * col; + PredictionMode mode = plane == 0 ? mi.Mode : mi.UvMode; + int dstOffset = (4 * row * pd.Dst.Stride) + (4 * col); byte* dst = &pd.Dst.Buf.ToPointer()[dstOffset]; Span dstSpan = pd.Dst.Buf.AsSpan().Slice(dstOffset); @@ -194,15 +299,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - ReconIntra.PredictIntraBlock(ref xd, pd.N4Wl, txSize, mode, dst, pd.Dst.Stride, dst, pd.Dst.Stride, col, row, plane); + ReconIntra.PredictIntraBlock(ref xd, pd.N4Wl, txSize, mode, dst, pd.Dst.Stride, dst, pd.Dst.Stride, col, + row, plane); if (mi.Skip == 0) { TxType txType = - (plane != 0 || xd.Lossless) ? TxType.DctDct : ReconIntra.IntraModeToTxTypeLookup[(int)mode]; - var sc = (plane != 0 || xd.Lossless) - ? Luts.Vp9DefaultScanOrders[(int)txSize] - : Luts.Vp9ScanOrders[(int)txSize][(int)txType]; + plane != 0 || xd.Lossless ? TxType.DctDct : ReconIntra.IntraModeToTxTypeLookup[(int)mode]; + Luts.ScanOrder sc = plane != 0 || xd.Lossless + ? 
Luts.DefaultScanOrders[(int)txSize] + : Luts.ScanOrders[(int)txSize][(int)txType]; int eob = Detokenize.DecodeBlockTokens(ref twd, plane, sc, col, row, txSize, mi.SegmentId); if (eob > 0) { @@ -221,14 +327,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { ref MacroBlockD xd = ref twd.Xd; ref MacroBlockDPlane pd = ref xd.Plane[plane]; - var sc = Luts.Vp9DefaultScanOrders[(int)txSize]; + Luts.ScanOrder sc = Luts.DefaultScanOrders[(int)txSize]; int eob = Detokenize.DecodeBlockTokens(ref twd, plane, sc, col, row, txSize, mi.SegmentId); - Span dst = pd.Dst.Buf.AsSpan().Slice(4 * row * pd.Dst.Stride + 4 * col); + Span dst = pd.Dst.Buf.AsSpan().Slice((4 * row * pd.Dst.Stride) + (4 * col)); if (eob > 0) { InverseTransformBlockInter(ref xd, plane, txSize, dst, pd.Dst.Stride, eob); } + return eob; } @@ -245,7 +352,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int h) { // Get a pointer to the start of the real data for this row. - byte* refRow = src - x - y * srcStride; + byte* refRow = src - x - (y * srcStride); if (y >= h) { @@ -317,7 +424,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { // Get a pointer to the start of the real data for this row. ushort* src = (ushort*)src8; - ushort* refRow = src - x - y * srcStride; + ushort* refRow = src - x - (y * srcStride); if (y >= h) { @@ -460,9 +567,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int refr) { ref MacroBlockDPlane pd = ref xd.Plane[plane]; - byte* dst = dstBuf.Buf.ToPointer() + dstBuf.Stride * y + x; + byte* dst = dstBuf.Buf.ToPointer() + (dstBuf.Stride * y) + x; Mv32 scaledMv; - int xs, ys, x0, y0, x0_16, y0_16, frameWidth, frameHeight, bufStride, subpelX, subpelY; + int xs, ys, x0, y0, x016, y016, frameWidth, frameHeight, bufStride, subpelX, subpelY; byte* refFrame; byte* bufPtr; @@ -484,16 +591,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { Mv mvQ4 = ReconInter.ClampMvToUmvBorderSb(ref xd, ref mv, bw, bh, pd.SubsamplingX, pd.SubsamplingY); // Co-ordinate of containing block to pixel precision. 
- int xStart = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)); - int yStart = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)); + int xStart = -xd.MbToLeftEdge >> (3 + pd.SubsamplingX); + int yStart = -xd.MbToTopEdge >> (3 + pd.SubsamplingY); // Co-ordinate of the block to 1/16th pixel precision. - x0_16 = (xStart + x) << Filter.SubpelBits; - y0_16 = (yStart + y) << Filter.SubpelBits; + x016 = (xStart + x) << Filter.SubpelBits; + y016 = (yStart + y) << Filter.SubpelBits; // Co-ordinate of current block in reference frame // to 1/16th pixel precision. - x0_16 = sf.ScaleValueX(x0_16); - y0_16 = sf.ScaleValueY(y0_16); + x016 = sf.ScaleValueX(x016); + y016 = sf.ScaleValueY(y016); // Map the top left corner of the block into the reference frame. x0 = sf.ScaleValueX(xStart + x); @@ -512,13 +619,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y; // Co-ordinate of the block to 1/16th pixel precision. - x0_16 = x0 << Filter.SubpelBits; - y0_16 = y0 << Filter.SubpelBits; + x016 = x0 << Filter.SubpelBits; + y016 = y0 << Filter.SubpelBits; scaledMv.Row = mv.Row * (1 << (1 - pd.SubsamplingY)); scaledMv.Col = mv.Col * (1 << (1 - pd.SubsamplingX)); xs = ys = 16; } + subpelX = scaledMv.Col & Filter.SubpelMask; subpelY = scaledMv.Row & Filter.SubpelMask; @@ -526,34 +634,35 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // reference frame. x0 += scaledMv.Col >> Filter.SubpelBits; y0 += scaledMv.Row >> Filter.SubpelBits; - x0_16 += scaledMv.Col; - y0_16 += scaledMv.Row; + x016 += scaledMv.Col; + y016 += scaledMv.Row; // Get reference block pointer. - bufPtr = refFrame + y0 * preBuf.Stride + x0; + bufPtr = refFrame + (y0 * preBuf.Stride) + x0; bufStride = preBuf.Stride; // Do border extension if there is motion or the // width/height is not a multiple of 8 pixels. 
- if (isScaled || scaledMv.Col != 0 || scaledMv.Row != 0 || (frameWidth & 0x7) != 0 || (frameHeight & 0x7) != 0) + if (isScaled || scaledMv.Col != 0 || scaledMv.Row != 0 || (frameWidth & 0x7) != 0 || + (frameHeight & 0x7) != 0) { - int y1 = ((y0_16 + (h - 1) * ys) >> Filter.SubpelBits) + 1; + int y1 = ((y016 + ((h - 1) * ys)) >> Filter.SubpelBits) + 1; // Get reference block bottom right horizontal coordinate. - int x1 = ((x0_16 + (w - 1) * xs) >> Filter.SubpelBits) + 1; + int x1 = ((x016 + ((w - 1) * xs)) >> Filter.SubpelBits) + 1; int xPad = 0, yPad = 0; - if (subpelX != 0 || (sf.XStepQ4 != Filter.SubpelShifts)) + if (subpelX != 0 || sf.XStepQ4 != Filter.SubpelShifts) { - x0 -= Constants.Vp9InterpExtend - 1; - x1 += Constants.Vp9InterpExtend; + x0 -= Constants.InterpExtend - 1; + x1 += Constants.InterpExtend; xPad = 1; } - if (subpelY != 0 || (sf.YStepQ4 != Filter.SubpelShifts)) + if (subpelY != 0 || sf.YStepQ4 != Filter.SubpelShifts) { - y0 -= Constants.Vp9InterpExtend - 1; - y1 += Constants.Vp9InterpExtend; + y0 -= Constants.InterpExtend - 1; + y1 += Constants.InterpExtend; yPad = 1; } @@ -562,10 +671,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 y0 < 0 || y0 > frameHeight - 1 || y1 < 0 || y1 > frameHeight - 1) { // Extend the border. 
- byte* bufPtr1 = refFrame + y0 * bufStride + x0; + byte* bufPtr1 = refFrame + (y0 * bufStride) + x0; int bW = x1 - x0 + 1; int bH = y1 - y0 + 1; - int borderOffset = yPad * 3 * bW + xPad * 3; + int borderOffset = (yPad * 3 * bW) + (xPad * 3); ExtendAndPredict( bufPtr1, @@ -592,6 +701,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return; } } + if (xd.CurBuf.HighBd) { ReconInter.HighbdInterPredictor( @@ -635,7 +745,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int miX = miCol * Constants.MiSize; int miY = miRow * Constants.MiSize; ref ModeInfo mi = ref xd.Mi[0].Value; - Array8[] kernel = Luts.Vp9FilterKernels[mi.InterpFilter]; + Array8[] kernel = Luts.FilterKernels[mi.InterpFilter]; BlockSize sbType = mi.SbType; int isCompound = mi.HasSecondRef() ? 1 : 0; int refr; @@ -650,11 +760,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (!sf.IsValidScale()) { - xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Reference frame has invalid dimensions"); + xd.ErrorInfo.Value.InternalError(CodecErr.UnsupBitstream, + "Reference frame has invalid dimensions"); } isScaled = sf.IsScaled(); - ReconInter.SetupPrePlanes(ref xd, refr, ref refFrameBuf, miRow, miCol, isScaled ? new Ptr(ref sf) : Ptr.Null); + ReconInter.SetupPrePlanes(ref xd, refr, ref refFrameBuf, miRow, miCol, + isScaled ? 
new Ptr(ref sf) : Ptr.Null); xd.BlockRefs[refr] = new Ptr(ref refBuf); if (sbType < BlockSize.Block8x8) @@ -668,10 +780,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int n4Wx4 = 4 * num4x4W; int n4Hx4 = 4 * num4x4H; ref Buf2D preBuf = ref pd.Pre[refr]; - int i = 0, x, y; - for (y = 0; y < num4x4H; ++y) + int i = 0; + for (int y = 0; y < num4x4H; ++y) { - for (x = 0; x < num4x4W; ++x) + for (int x = 0; x < num4x4W; ++x) { Mv mv = ReconInter.AverageSplitMvs(ref pd, ref mi, refr, i++); DecBuildInterPredictors( @@ -733,21 +845,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - private static unsafe void DecResetSkipContext(ref MacroBlockD xd) - { - int i; - for (i = 0; i < Constants.MaxMbPlane; i++) - { - ref MacroBlockDPlane pd = ref xd.Plane[i]; - MemoryUtil.Fill(pd.AboveContext.ToPointer(), (sbyte)0, pd.N4W); - MemoryUtil.Fill(pd.LeftContext.ToPointer(), (sbyte)0, pd.N4H); - } - } - private static void SetPlaneN4(ref MacroBlockD xd, int bw, int bh, int bwl, int bhl) { - int i; - for (i = 0; i < Constants.MaxMbPlane; i++) + for (int i = 0; i < Constants.MaxMbPlane; i++) { xd.Plane[i].N4W = (ushort)((bw << 1) >> xd.Plane[i].SubsamplingX); xd.Plane[i].N4H = (ushort)((bh << 1) >> xd.Plane[i].SubsamplingY); @@ -769,18 +869,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int bwl, int bhl) { - int offset = miRow * cm.MiStride + miCol; - int x, y; + int offset = (miRow * cm.MiStride) + miCol; + ref TileInfo tile = ref xd.Tile; xd.Mi = cm.MiGridVisible.Slice(offset); xd.Mi[0] = new Ptr(ref cm.Mi[offset]); xd.Mi[0].Value.SbType = bsize; - for (y = 0; y < yMis; ++y) + for (int y = 0; y < yMis; ++y) { - for (x = y == 0 ? 1 : 0; x < xMis; ++x) + for (int x = y == 0 ? 
1 : 0; x < xMis; ++x) { - xd.Mi[y * cm.MiStride + x] = xd.Mi[0]; + xd.Mi[(y * cm.MiStride) + x] = xd.Mi[0]; } } @@ -820,7 +920,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 BlockSize uvSubsize = Luts.SsSizeLookup[(int)bsize][cm.SubsamplingX][cm.SubsamplingY]; if (uvSubsize == BlockSize.BlockInvalid) { - xd.ErrorInfo.Value.InternalError(CodecErr.CodecCorruptFrame, "Invalid block size."); + xd.ErrorInfo.Value.InternalError(CodecErr.CorruptFrame, "Invalid block size."); } } @@ -828,7 +928,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (mi.Skip != 0) { - DecResetSkipContext(ref xd); + xd.DecResetSkipContext(); } if (!mi.IsInterBlock()) @@ -842,8 +942,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int num4x4H = pd.N4H; int step = 1 << (int)txSize; int row, col; - int maxBlocksWide = num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX)); - int maxBlocksHigh = num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY)); + int maxBlocksWide = + num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX)); + int maxBlocksHigh = + num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY)); xd.MaxBlocksWide = (uint)(xd.MbToRightEdge >= 0 ? 0 : maxBlocksWide); xd.MaxBlocksHigh = (uint)(xd.MbToBottomEdge >= 0 ? 0 : maxBlocksHigh); @@ -876,8 +978,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int num4x4H = pd.N4H; int step = 1 << (int)txSize; int row, col; - int maxBlocksWide = num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX)); - int maxBlocksHigh = num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY)); + int maxBlocksWide = + num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX)); + int maxBlocksHigh = num4x4H + + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY)); xd.MaxBlocksWide = (uint)(xd.MbToRightEdge >= 0 ? 0 : maxBlocksWide); xd.MaxBlocksHigh = (uint)(xd.MbToBottomEdge >= 0 ? 
0 : maxBlocksHigh); @@ -893,7 +997,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (!less8x8 && eobtotal == 0) { - mi.Skip = 1; // Skip loopfilter + mi.Skip = 1; // Skip loopfilter } } } @@ -906,15 +1010,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - private static int DecPartitionPlaneContext(ref TileWorkerData twd, int miRow, int miCol, int bsl) - { - ref sbyte aboveCtx = ref twd.Xd.AboveSegContext[miCol]; - ref sbyte leftCtx = ref twd.Xd.LeftSegContext[miRow & Constants.MiMask]; - int above = (aboveCtx >> bsl) & 1, left = (leftCtx >> bsl) & 1; - - return (left * 2 + above) + bsl * Constants.PartitionPloffset; - } - private static void DecUpdatePartitionContext( ref TileWorkerData twd, int miRow, @@ -923,7 +1018,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int bw) { Span aboveCtx = twd.Xd.AboveSegContext.Slice(miCol).AsSpan(); - Span leftCtx = MemoryMarshal.CreateSpan(ref twd.Xd.LeftSegContext[miRow & Constants.MiMask], 8 - (miRow & Constants.MiMask)); + Span leftCtx = MemoryMarshal.CreateSpan(ref twd.Xd.LeftSegContext[miRow & Constants.MiMask], + 8 - (miRow & Constants.MiMask)); // Update the partition context at the end notes. 
Set partition bits // of block sizes larger than the current one to be one, and partition @@ -940,14 +1036,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int hasCols, int bsl) { - int ctx = DecPartitionPlaneContext(ref twd, miRow, miCol, bsl); + int ctx = twd.DecPartitionPlaneContext(miRow, miCol, bsl); ReadOnlySpan probs = MemoryMarshal.CreateReadOnlySpan(ref twd.Xd.PartitionProbs[ctx][0], 3); PartitionType p; ref Reader r = ref twd.BitReader; if (hasRows != 0 && hasCols != 0) { - p = (PartitionType)r.ReadTree(Luts.Vp9PartitionTree, probs); + p = (PartitionType)r.ReadTree(Luts.PartitionTree, probs); } else if (hasRows == 0 && hasCols != 0) { @@ -983,8 +1079,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int hbs = num8x8Wh >> 1; PartitionType partition; BlockSize subsize; - bool hasRows = (miRow + hbs) < cm.MiRows; - bool hasCols = (miCol + hbs) < cm.MiCols; + bool hasRows = miRow + hbs < cm.MiRows; + bool hasCols = miCol + hbs < cm.MiCols; ref MacroBlockD xd = ref twd.Xd; if (miRow >= cm.MiRows || miCol >= cm.MiCols) @@ -1030,12 +1126,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 DecodePartition(ref twd, ref cm, miRow + hbs, miCol, subsize, n8x8L2); DecodePartition(ref twd, ref cm, miRow + hbs, miCol + hbs, subsize, n8x8L2); break; - default: Debug.Assert(false, "Invalid partition type"); break; + default: + Debug.Assert(false, "Invalid partition type"); + break; } } // Update partition context - if (bsize >= BlockSize.Block8x8 && (bsize == BlockSize.Block8x8 || partition != PartitionType.PartitionSplit)) + if (bsize >= BlockSize.Block8x8 && + (bsize == BlockSize.Block8x8 || partition != PartitionType.PartitionSplit)) { DecUpdatePartitionContext(ref twd, miRow, miCol, subsize, num8x8Wh); } @@ -1051,15 +1150,257 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // partition can't be fully read then throw an error. 
if (!ReadIsValid(data, readSize)) { - errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length"); + errorInfo.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt tile length"); } if (r.Init(data, readSize)) { - errorInfo.InternalError(CodecErr.CodecMemError, "Failed to allocate bool decoder 1"); + errorInfo.InternalError(CodecErr.MemError, "Failed to allocate bool decoder 1"); } } + private static void ReadCoefProbsCommon(ref Array2>>>> coefProbs, + ref Reader r, int txSize) + { + if (r.ReadBit() != 0) + { + for (int i = 0; i < Constants.PlaneTypes; ++i) + { + for (int j = 0; j < Entropy.RefTypes; ++j) + { + for (int k = 0; k < Entropy.CoefBands; ++k) + { + for (int l = 0; l < Entropy.BAND_COEFF_CONTEXTS(k); ++l) + { + for (int m = 0; m < Entropy.UnconstrainedNodes; ++m) + { + r.DiffUpdateProb( ref coefProbs[i][j][k][l][m]); + } + } + } + } + } + } + } + + private static void ReadCoefProbs(ref Vp9EntropyProbs fc, TxMode txMode, ref Reader r) + { + int maxTxSize = (int)Luts.TxModeToBiggestTxSize[(int)txMode]; + for (int txSize = (int)TxSize.Tx4x4; txSize <= maxTxSize; ++txSize) + { + ReadCoefProbsCommon(ref fc.CoefProbs[txSize], ref r, txSize); + } + } + + private static void SetupLoopfilter(ref Types.LoopFilter lf, ref ReadBitBuffer rb) + { + lf.FilterLevel = rb.ReadLiteral(6); + lf.SharpnessLevel = rb.ReadLiteral(3); + + // Read in loop filter deltas applied at the MB level based on mode or ref + // frame. 
+ lf.ModeRefDeltaUpdate = false; + + lf.ModeRefDeltaEnabled = rb.ReadBit() != 0; + if (lf.ModeRefDeltaEnabled) + { + lf.ModeRefDeltaUpdate = rb.ReadBit() != 0; + if (lf.ModeRefDeltaUpdate) + { + for (int i = 0; i < LoopFilter.MaxRefLfDeltas; i++) + { + if (rb.ReadBit() != 0) + { + lf.RefDeltas[i] = (sbyte)rb.ReadSignedLiteral(6); + } + } + + for (int i = 0; i < LoopFilter.MaxModeLfDeltas; i++) + { + if (rb.ReadBit() != 0) + { + lf.ModeDeltas[i] = (sbyte)rb.ReadSignedLiteral(6); + } + } + } + } + } + + private static void SetupQuantization(ref Vp9Common cm, ref MacroBlockD xd, ref ReadBitBuffer rb) + { + cm.BaseQindex = rb.ReadLiteral(QuantCommon.QindexBits); + cm.YDcDeltaQ = rb.ReadDeltaQ(); + cm.UvDcDeltaQ = rb.ReadDeltaQ(); + cm.UvAcDeltaQ = rb.ReadDeltaQ(); + cm.DequantBitDepth = cm.BitDepth; + xd.Lossless = cm.BaseQindex == 0 && cm.YDcDeltaQ == 0 && cm.UvDcDeltaQ == 0 && cm.UvAcDeltaQ == 0; + + xd.Bd = (int)cm.BitDepth; + } + + private static readonly byte[] LiteralToFilter = + { + Constants.EightTapSmooth, Constants.EightTap, Constants.EightTapSharp, Constants.Bilinear + }; + + private static byte ReadInterpFilter(ref ReadBitBuffer rb) + { + return rb.ReadBit() != 0 + ? 
(byte)Constants.Switchable + : LiteralToFilter[rb.ReadLiteral(2)]; + } + + private static void SetupRenderSize(ref Vp9Common cm, ref ReadBitBuffer rb) + { + cm.RenderWidth = cm.Width; + cm.RenderHeight = cm.Height; + if (rb.ReadBit() != 0) + { + rb.ReadFrameSize(out cm.RenderWidth, out cm.RenderHeight); + } + } + + private static void SetupFrameSize(MemoryAllocator allocator, ref Vp9Common cm, ref ReadBitBuffer rb) + { + int width = 0, height = 0; + ref BufferPool pool = ref cm.BufferPool.Value; + rb.ReadFrameSize(out width, out height); + cm.ResizeContextBuffers(allocator, width, height); + SetupRenderSize(ref cm, ref rb); + + if (cm.GetFrameNewBuffer().ReallocFrameBuffer( + allocator, + cm.Width, + cm.Height, + cm.SubsamplingX, + cm.SubsamplingY, + cm.UseHighBitDepth, + Surface.DecBorderInPixels, + cm.ByteAlignment, + new Ptr(ref pool.FrameBufs[cm.NewFbIdx].RawFrameBuffer), + FrameBuffers.GetFrameBuffer, + pool.CbPriv) != 0) + { + cm.Error.InternalError(CodecErr.MemError, "Failed to allocate frame buffer"); + } + + pool.FrameBufs[cm.NewFbIdx].Released = 0; + pool.FrameBufs[cm.NewFbIdx].Buf.SubsamplingX = cm.SubsamplingX; + pool.FrameBufs[cm.NewFbIdx].Buf.SubsamplingY = cm.SubsamplingY; + pool.FrameBufs[cm.NewFbIdx].Buf.BitDepth = (uint)cm.BitDepth; + pool.FrameBufs[cm.NewFbIdx].Buf.ColorSpace = cm.ColorSpace; + pool.FrameBufs[cm.NewFbIdx].Buf.ColorRange = cm.ColorRange; + pool.FrameBufs[cm.NewFbIdx].Buf.RenderWidth = cm.RenderWidth; + pool.FrameBufs[cm.NewFbIdx].Buf.RenderHeight = cm.RenderHeight; + } + + private static bool ValidRefFrameImgFmt( + BitDepth refBitDepth, + int refXss, int refYss, + BitDepth thisBitDepth, + int thisXss, + int thisYss) + { + return refBitDepth == thisBitDepth && refXss == thisXss && refYss == thisYss; + } + + private static void SetupFrameSizeWithRefs(MemoryAllocator allocator, ref Vp9Common cm, + ref ReadBitBuffer rb) + { + int width = 0, height = 0; + bool found = false; + + bool hasValidRefFrame = false; + ref BufferPool pool = 
ref cm.BufferPool.Value; + for (int i = 0; i < Constants.RefsPerFrame; ++i) + { + if (rb.ReadBit() != 0) + { + if (cm.FrameRefs[i].Idx != RefBuffer.InvalidIdx) + { + ref Surface buf = ref cm.FrameRefs[i].Buf; + width = buf.YCropWidth; + height = buf.YCropHeight; + found = true; + break; + } + + cm.Error.InternalError(CodecErr.CorruptFrame, "Failed to decode frame size"); + } + } + + if (!found) + { + rb.ReadFrameSize(out width, out height); + } + + if (width <= 0 || height <= 0) + { + cm.Error.InternalError(CodecErr.CorruptFrame, "Invalid frame size"); + } + + // Check to make sure at least one of the frames that this frame references + // has valid dimensions. + for (int i = 0; i < Constants.RefsPerFrame; ++i) + { + ref RefBuffer refFrame = ref cm.FrameRefs[i]; + hasValidRefFrame |= + refFrame.Idx != RefBuffer.InvalidIdx && + ScaleFactors.ValidRefFrameSize(refFrame.Buf.YCropWidth, refFrame.Buf.YCropHeight, width, + height); + } + + if (!hasValidRefFrame) + { + cm.Error.InternalError(CodecErr.CorruptFrame, "Referenced frame has invalid size"); + } + + for (int i = 0; i < Constants.RefsPerFrame; ++i) + { + ref RefBuffer refFrame = ref cm.FrameRefs[i]; + if (refFrame.Idx == RefBuffer.InvalidIdx || + !ValidRefFrameImgFmt( + (BitDepth)refFrame.Buf.BitDepth, + refFrame.Buf.SubsamplingX, + refFrame.Buf.SubsamplingY, + cm.BitDepth, + cm.SubsamplingX, + cm.SubsamplingY)) + { + cm.Error.InternalError(CodecErr.CorruptFrame, + "Referenced frame has incompatible color format"); + } + } + + cm.ResizeContextBuffers(allocator, width, height); + SetupRenderSize(ref cm, ref rb); + + if (cm.GetFrameNewBuffer().ReallocFrameBuffer( + allocator, + cm.Width, + cm.Height, + cm.SubsamplingX, + cm.SubsamplingY, + cm.UseHighBitDepth, + Surface.DecBorderInPixels, + cm.ByteAlignment, + new Ptr(ref pool.FrameBufs[cm.NewFbIdx].RawFrameBuffer), + FrameBuffers.GetFrameBuffer, + pool.CbPriv) != 0) + { + cm.Error.InternalError(CodecErr.MemError, "Failed to allocate frame buffer"); + } + + 
pool.FrameBufs[cm.NewFbIdx].Released = 0; + pool.FrameBufs[cm.NewFbIdx].Buf.SubsamplingX = cm.SubsamplingX; + pool.FrameBufs[cm.NewFbIdx].Buf.SubsamplingY = cm.SubsamplingY; + pool.FrameBufs[cm.NewFbIdx].Buf.BitDepth = (uint)cm.BitDepth; + pool.FrameBufs[cm.NewFbIdx].Buf.ColorSpace = cm.ColorSpace; + pool.FrameBufs[cm.NewFbIdx].Buf.ColorRange = cm.ColorRange; + pool.FrameBufs[cm.NewFbIdx].Buf.RenderWidth = cm.RenderWidth; + pool.FrameBufs[cm.NewFbIdx].Buf.RenderHeight = cm.RenderHeight; + } + // Reads the next tile returning its size and adjusting '*data' accordingly // based on 'isLast'. private static void GetTileBuffer( @@ -1074,7 +1415,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { if (!ReadIsValid(data, 4)) { - errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length"); + errorInfo.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt tile length"); } size = BinaryPrimitives.ReadInt32BigEndian(data.AsSpan()); @@ -1082,7 +1423,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (size > data.Length) { - errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile size"); + errorInfo.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt tile size"); } } else @@ -1096,11 +1437,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 data = data.Slice(size); } - private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr data, int tileCols, ref Array64 tileBuffers) + private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr data, int tileCols, + ref Array64 tileBuffers) { - int c; - - for (c = 0; c < tileCols; ++c) + for (int c = 0; c < tileCols; ++c) { bool isLast = c == tileCols - 1; ref TileBuffer buf = ref tileBuffers[c]; @@ -1116,13 +1456,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int tileRows, ref Array4> tileBuffers) { - int r, c; - - for (r = 0; r < tileRows; ++r) + for (int r = 0; r < tileRows; ++r) { - for (c = 0; c < tileCols; ++c) + for (int c = 0; c < tileCols; ++c) { - bool isLast = (r 
== tileRows - 1) && (c == tileCols - 1); + bool isLast = r == tileRows - 1 && c == tileCols - 1; ref TileBuffer buf = ref tileBuffers[r][c]; GetTileBuffer(isLast, ref cm.Error, ref data, ref buf); } @@ -1134,12 +1472,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int alignedCols = TileInfo.MiColsAlignedToSb(cm.MiCols); int tileCols = 1 << cm.Log2TileCols; int tileRows = 1 << cm.Log2TileRows; - Array4> tileBuffers = new Array4>(); + Array4> tileBuffers = new(); int tileRow, tileCol; int miRow, miCol; Debug.Assert(tileRows <= 4); - Debug.Assert(tileCols <= (1 << 6)); + Debug.Assert(tileCols <= 1 << 6); // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. @@ -1155,7 +1493,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 for (tileCol = 0; tileCol < tileCols; ++tileCol) { ref TileBuffer buf = ref tileBuffers[tileRow][tileCol]; - ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + tileCol]; + ref TileWorkerData tileData = ref cm.TileWorkerData[(tileCols * tileRow) + tileCol]; tileData.Xd = cm.Mb; tileData.Xd.Corrupted = false; tileData.Xd.Counts = cm.Counts; @@ -1168,14 +1506,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 for (tileRow = 0; tileRow < tileRows; ++tileRow) { - TileInfo tile = new TileInfo(); + TileInfo tile = new(); tile.SetRow(ref cm, tileRow); for (miRow = tile.MiRowStart; miRow < tile.MiRowEnd; miRow += Constants.MiBlockSize) { for (tileCol = 0; tileCol < tileCols; ++tileCol) { int col = tileCol; - ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + col]; + ref TileWorkerData tileData = ref cm.TileWorkerData[(tileCols * tileRow) + col]; tile.SetCol(ref cm, col); tileData.Xd.LeftContext = new Array3>(); tileData.Xd.LeftSegContext = new Array8(); @@ -1183,20 +1521,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4); } + cm.Mb.Corrupted |= tileData.Xd.Corrupted; if (cm.Mb.Corrupted) { - 
cm.Error.InternalError(CodecErr.CodecCorruptFrame, "Failed to decode tile data"); + cm.Error.InternalError(CodecErr.CorruptFrame, "Failed to decode tile data"); } } } } // Get last tile data. - return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd(); + return cm.TileWorkerData[(tileCols * tileRows) - 1].BitReader.FindEnd(); } - private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, ref Array64 tileBuffers) + private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, + ref Array64 tileBuffers) { ref TileInfo tile = ref tileData.Xd.Tile; int finalCol = (1 << cm.Log2TileCols) - 1; @@ -1237,7 +1577,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return !tileData.Xd.Corrupted; } - public static unsafe ArrayPtr DecodeTilesMt(ref Vp9Common cm, ArrayPtr data, int maxThreads) + public static ArrayPtr DecodeTilesMt(ref Vp9Common cm, ArrayPtr data, int maxThreads) { ArrayPtr bitReaderEnd = ArrayPtr.Null; @@ -1247,11 +1587,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int numWorkers = Math.Min(maxThreads, tileCols); int n; - Debug.Assert(tileCols <= (1 << 6)); + Debug.Assert(tileCols <= 1 << 6); Debug.Assert(tileRows == 1); - cm.AboveContext.AsSpan().Fill(0); - cm.AboveSegContext.AsSpan().Fill(0); + LoopFilter.ResetLfm(ref cm); + + cm.AboveContext.AsSpan().Clear(); + cm.AboveSegContext.AsSpan().Clear(); for (n = 0; n < numWorkers; ++n) { @@ -1262,7 +1604,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 tileData.Counts = new Vp9BackwardUpdates(); } - Array64 tileBuffers = new Array64(); + Array64 tileBuffers = new(); GetTileBuffers(ref cm, data, tileCols, ref tileBuffers); @@ -1298,7 +1640,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 for (n = 0; n < numWorkers; ++n) { - int count = baseVal + (remain + n) / numWorkers; + int count = baseVal + ((remain + n) / numWorkers); ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; tileData.BufStart = bufStart; @@ -1307,9 +1649,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 bufStart 
+= count; } - Ptr cmPtr = new Ptr(ref cm); + Ptr cmPtr = new(ref cm); - Parallel.For(0, numWorkers, (n) => + Parallel.For(0, numWorkers, n => { ref TileWorkerData tileData = ref cmPtr.Value.TileWorkerData[n + totalTiles]; @@ -1353,5 +1695,477 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 a[i] += c[i]; } } + + private static void ErrorHandler(Ptr data) + { + ref Vp9Common cm = ref data.Value; + cm.Error.InternalError(CodecErr.CorruptFrame, "Truncated packet"); + } + + private static void FlushAllFbOnKey(ref Vp9Common cm) + { + if (cm.FrameType == FrameType.KeyFrame && cm.CurrentVideoFrame > 0) + { + ref Array12 frameBufs = ref cm.BufferPool.Value.FrameBufs; + ref BufferPool pool = ref cm.BufferPool.Value; + + for (int i = 0; i < Constants.FrameBuffers; ++i) + { + if (i == cm.NewFbIdx) + { + continue; + } + + frameBufs[i].RefCount = 0; + if (frameBufs[i].Released == 0) + { + FrameBuffers.ReleaseFrameBuffer(pool.CbPriv, ref frameBufs[i].RawFrameBuffer); + frameBufs[i].Released = 1; + } + } + } + } + + private const int SyncCode0 = 0x49; + private const int SyncCode1 = 0x83; + private const int SyncCode2 = 0x42; + + private const int FrameMarker = 0x2; + + private static bool ReadSyncCode(ref ReadBitBuffer rb) + { + return rb.ReadLiteral(8) == SyncCode0 && + rb.ReadLiteral(8) == SyncCode1 && + rb.ReadLiteral(8) == SyncCode2; + } + + private static void RefCntFb(ref Array12 bufs, ref int idx, int newIdx) + { + int refIndex = idx; + + if (refIndex >= 0 && bufs[refIndex].RefCount > 0) + { + bufs[refIndex].RefCount--; + } + + idx = newIdx; + + bufs[newIdx].RefCount++; + } + + private static ulong ReadUncompressedHeader(MemoryAllocator allocator, ref Vp9Decoder pbi, + ref ReadBitBuffer rb) + { + ref Vp9Common cm = ref pbi.Common; + ref BufferPool pool = ref cm.BufferPool.Value; + ref Array12 frameBufs = ref pool.FrameBufs; + int mask, refIndex = 0; + ulong sz; + + cm.LastFrameType = cm.FrameType; + cm.LastIntraOnly = cm.IntraOnly; + + if (rb.ReadLiteral(2) != FrameMarker) + { + 
cm.Error.InternalError(CodecErr.UnsupBitstream, "Invalid frame marker"); + } + + cm.Profile = rb.ReadProfile(); + if (cm.Profile >= BitstreamProfile.MaxProfiles) + { + cm.Error.InternalError(CodecErr.UnsupBitstream, "Unsupported bitstream profile"); + } + + cm.ShowExistingFrame = rb.ReadBit(); + if (cm.ShowExistingFrame != 0) + { + // Show an existing frame directly. + int frameToShow = cm.RefFrameMap[rb.ReadLiteral(3)]; + if (frameToShow < 0 || frameBufs[frameToShow].RefCount < 1) + { + cm.Error.InternalError(CodecErr.UnsupBitstream, + $"Buffer {frameToShow} does not contain a decoded frame"); + } + + RefCntFb(ref frameBufs, ref cm.NewFbIdx, frameToShow); + pbi.RefreshFrameFlags = 0; + cm.Lf.FilterLevel = 0; + cm.ShowFrame = 1; + + return 0; + } + + cm.FrameType = (FrameType)rb.ReadBit(); + cm.ShowFrame = rb.ReadBit(); + cm.ErrorResilientMode = rb.ReadBit(); + + if (cm.FrameType == FrameType.KeyFrame) + { + if (!ReadSyncCode(ref rb)) + { + cm.Error.InternalError(CodecErr.UnsupBitstream, "Invalid frame sync code"); + } + + cm.ReadBitdepthColorspaceSampling(ref rb); + pbi.RefreshFrameFlags = (1 << Constants.RefFrames) - 1; + + for (int i = 0; i < Constants.RefsPerFrame; ++i) + { + cm.FrameRefs[i].Idx = RefBuffer.InvalidIdx; + cm.FrameRefs[i].Buf = default; + } + + SetupFrameSize(allocator, ref cm, ref rb); + if (pbi.NeedResync != 0) + { + cm.RefFrameMap.AsSpan().Fill(-1); + FlushAllFbOnKey(ref cm); + pbi.NeedResync = 0; + } + } + else + { + cm.IntraOnly = (cm.ShowFrame != 0 ? 0 : rb.ReadBit()) != 0; + + cm.ResetFrameContext = cm.ErrorResilientMode != 0 ? 0 : rb.ReadLiteral(2); + + if (cm.IntraOnly) + { + if (!ReadSyncCode(ref rb)) + { + cm.Error.InternalError(CodecErr.UnsupBitstream, "Invalid frame sync code"); + } + + if (cm.Profile > BitstreamProfile.Profile0) + { + cm.ReadBitdepthColorspaceSampling(ref rb); + } + else + { + // NOTE: The intra-only frame header does not include the specification + // of either the color format or color sub-sampling in profile 0. 
VP9 + // specifies that the default color format should be YUV 4:2:0 in this + // case (normative). + cm.ColorSpace = VpxColorSpace.Bt601; + cm.ColorRange = VpxColorRange.Studio; + cm.SubsamplingY = cm.SubsamplingX = 1; + cm.BitDepth = BitDepth.Bits8; + cm.UseHighBitDepth = false; + } + + pbi.RefreshFrameFlags = rb.ReadLiteral(Constants.RefFrames); + SetupFrameSize(allocator, ref cm, ref rb); + if (pbi.NeedResync != 0) + { + cm.RefFrameMap.AsSpan().Fill(-1); + pbi.NeedResync = 0; + } + } + else if (pbi.NeedResync != 1) + { + /* Skip if need resync */ + pbi.RefreshFrameFlags = rb.ReadLiteral(Constants.RefFrames); + for (int i = 0; i < Constants.RefsPerFrame; ++i) + { + int refr = rb.ReadLiteral(Constants.RefFramesLog2); + int idx = cm.RefFrameMap[refr]; + ref RefBuffer refFrame = ref cm.FrameRefs[i]; + refFrame.Idx = idx; + refFrame.Buf = frameBufs[idx].Buf; + cm.RefFrameSignBias[Constants.LastFrame + i] = (sbyte)rb.ReadBit(); + } + + SetupFrameSizeWithRefs(allocator, ref cm, ref rb); + + cm.AllowHighPrecisionMv = rb.ReadBit() != 0; + cm.InterpFilter = ReadInterpFilter(ref rb); + + for (int i = 0; i < Constants.RefsPerFrame; ++i) + { + ref RefBuffer refBuf = ref cm.FrameRefs[i]; + refBuf.Sf.SetupScaleFactorsForFrame( + refBuf.Buf.YCropWidth, + refBuf.Buf.YCropHeight, + cm.Width, + cm.Height); + } + } + } + + cm.GetFrameNewBuffer().BitDepth = (uint)cm.BitDepth; + cm.GetFrameNewBuffer().ColorSpace = cm.ColorSpace; + cm.GetFrameNewBuffer().ColorRange = cm.ColorRange; + cm.GetFrameNewBuffer().RenderWidth = cm.RenderWidth; + cm.GetFrameNewBuffer().RenderHeight = cm.RenderHeight; + + if (pbi.NeedResync != 0) + { + cm.Error.InternalError(CodecErr.CorruptFrame, + "Keyframe / intra-only frame required to reset decoder state"); + } + + if (cm.ErrorResilientMode == 0) + { + cm.RefreshFrameContext = rb.ReadBit(); + cm.FrameParallelDecodingMode = rb.ReadBit(); + if (cm.FrameParallelDecodingMode == 0) + { + cm.Counts.Value = new Vp9BackwardUpdates(); + } + } + else + { + 
cm.RefreshFrameContext = 0; + cm.FrameParallelDecodingMode = 1; + } + + // This flag will be overridden by the call to SetupPastIndependence + // below, forcing the use of context 0 for those frame types. + cm.FrameContextIdx = (uint)rb.ReadLiteral(Constants.FrameContextsLog2); + + // Generate next_ref_frame_map. + for (mask = pbi.RefreshFrameFlags; mask != 0; mask >>= 1) + { + if ((mask & 1) != 0) + { + cm.NextRefFrameMap[refIndex] = cm.NewFbIdx; + ++frameBufs[cm.NewFbIdx].RefCount; + } + else + { + cm.NextRefFrameMap[refIndex] = cm.RefFrameMap[refIndex]; + } + + // Current thread holds the reference frame. + if (cm.RefFrameMap[refIndex] >= 0) + { + ++frameBufs[cm.RefFrameMap[refIndex]].RefCount; + } + + ++refIndex; + } + + for (; refIndex < Constants.RefFrames; ++refIndex) + { + cm.NextRefFrameMap[refIndex] = cm.RefFrameMap[refIndex]; + // Current thread holds the reference frame. + if (cm.RefFrameMap[refIndex] >= 0) + { + ++frameBufs[cm.RefFrameMap[refIndex]].RefCount; + } + } + + pbi.HoldRefBuf = 1; + + if (cm.FrameIsIntraOnly() || cm.ErrorResilientMode != 0) + { + EntropyMode.SetupPastIndependence(ref cm); + } + + SetupLoopfilter(ref cm.Lf, ref rb); + SetupQuantization(ref cm, ref cm.Mb, ref rb); + cm.Seg.SetupSegmentation(ref cm.Fc.Value, ref rb); + cm.SetupSegmentationDequant(); + + cm.SetupTileInfo(ref rb); + sz = (ulong)rb.ReadLiteral(16); + + if (sz == 0) + { + cm.Error.InternalError(CodecErr.CorruptFrame, "Invalid header size"); + } + + return sz; + } + + private static bool ReadCompressedHeader(ref Vp9Decoder pbi, ArrayPtr data, ulong partitionSize) + { + ref Vp9Common cm = ref pbi.Common; + ref MacroBlockD xd = ref cm.Mb; + ref Vp9EntropyProbs fc = ref cm.Fc.Value; + Reader r = new(); + + if (r.Init(data, (int)partitionSize)) + { + cm.Error.InternalError(CodecErr.MemError, "Failed to allocate bool decoder 0"); + } + + cm.TxMode = xd.Lossless ? 
TxMode.Only4x4 : r.ReadTxMode(); + if (cm.TxMode == TxMode.TxModeSelect) + { + ReadTxModeProbs(ref fc, ref r); + } + + ReadCoefProbs(ref fc, cm.TxMode, ref r); + + for (int k = 0; k < Constants.SkipContexts; ++k) + { + r.DiffUpdateProb(ref fc.SkipProb[k]); + } + + if (!cm.FrameIsIntraOnly()) + { + ReadInterModeProbs(ref fc, ref r); + + if (cm.InterpFilter == Constants.Switchable) + { + ReadSwitchableInterpProbs(ref fc, ref r); + } + + for (int i = 0; i < Constants.IntraInterContexts; i++) + { + r.DiffUpdateProb( ref fc.IntraInterProb[i]); + } + + cm.ReferenceMode = cm.ReadFrameReferenceMode(ref r); + if (cm.ReferenceMode != ReferenceMode.Single) + { + cm.SetupCompoundReferenceMode(); + } + + cm.ReadFrameReferenceModeProbs(ref r); + + for (int j = 0; j < EntropyMode.BlockSizeGroups; j++) + { + for (int i = 0; i < Constants.IntraModes - 1; ++i) + { + r.DiffUpdateProb( ref fc.YModeProb[j][i]); + } + } + + for (int j = 0; j < Constants.PartitionContexts; ++j) + { + for (int i = 0; i < Constants.PartitionTypes - 1; ++i) + { + r.DiffUpdateProb( ref fc.PartitionProb[j][i]); + } + } + + ReadMvProbs(ref fc, cm.AllowHighPrecisionMv, ref r); + } + + return r.HasError(); + } + + private static ref ReadBitBuffer InitReadBitBuffer(ref ReadBitBuffer rb, ReadOnlySpan data) + { + rb.BitOffset = 0; + rb.BitBuffer = data; + return ref rb; + } + + public static unsafe void Decode(MemoryAllocator allocator, + ref Vp9Decoder pbi, + ArrayPtr data, + out ArrayPtr pDataEnd, + bool multithreaded = true) + { + ref Vp9Common cm = ref pbi.Common; + ref MacroBlockD xd = ref cm.Mb; + ReadBitBuffer rb = new(); + int contextUpdated = 0; + Span clearData = stackalloc byte[80]; + ulong firstPartitionSize = + ReadUncompressedHeader(allocator, ref pbi, ref InitReadBitBuffer(ref rb, data.AsSpan())); + int tileRows = 1 << cm.Log2TileRows; + int tileCols = 1 << cm.Log2TileCols; + ref Surface newFb = ref cm.GetFrameNewBuffer(); + xd.CurBuf = newFb; + + if (firstPartitionSize == 0) + { + // showing a frame 
directly + pDataEnd = data.Slice(cm.Profile <= BitstreamProfile.Profile2 ? 1 : 2); + return; + } + + data = data.Slice((int)rb.BytesRead()); + if (!ReadIsValid(data, (int)firstPartitionSize)) + { + cm.Error.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt header length"); + } + + cm.UsePrevFrameMvs = + cm.ErrorResilientMode == 0 && + cm.Width == cm.LastWidth && + cm.Height == cm.LastHeight && + !cm.LastIntraOnly && + cm.LastShowFrame != 0 && + cm.LastFrameType != FrameType.KeyFrame; + + xd.SetupBlockPlanes(cm.SubsamplingX, cm.SubsamplingY); + + cm.Fc = new Ptr(ref cm.FrameContexts[(int)cm.FrameContextIdx]); + + xd.Corrupted = false; + newFb.Corrupted = ReadCompressedHeader(ref pbi, data, firstPartitionSize) ? 1 : 0; + if (newFb.Corrupted != 0) + { + cm.Error.InternalError(CodecErr.CorruptFrame, "Decode failed. Frame data header is corrupted."); + } + + if (cm.Lf.FilterLevel != 0 && cm.SkipLoopFilter == 0) + { + LoopFilter.LoopFilterFrameInit(ref cm, cm.Lf.FilterLevel); + } + + int threadCount = multithreaded ? 
Math.Max(1, Environment.ProcessorCount / 2) : 0; + + if (cm.TileWorkerData.IsNull || tileCols * tileRows != cm.TotalTiles) + { + int numTileWorkers = (tileCols * tileRows) + threadCount; + if (!cm.TileWorkerData.IsNull) + { + allocator.Free(cm.TileWorkerData); + } + + cm.CheckMemError( ref cm.TileWorkerData, allocator.Allocate(numTileWorkers)); + cm.TotalTiles = tileRows * tileCols; + } + + if (multithreaded) + { + pDataEnd = DecodeTilesMt(ref pbi.Common, data.Slice((int)firstPartitionSize), threadCount); + + LoopFilter.LoopFilterFrameMt( + ref cm.Mb.CurBuf, + ref cm, + ref cm.Mb, + cm.Lf.FilterLevel, + false, + false, + threadCount); + } + else + { + pDataEnd = DecodeTiles(ref pbi.Common, data.Slice((int)firstPartitionSize)); + + LoopFilter.LoopFilterFrame(ref cm.Mb.CurBuf, ref cm, ref cm.Mb, cm.Lf.FilterLevel, false, false); + } + + if (!xd.Corrupted) + { + if (cm.ErrorResilientMode == 0 && cm.FrameParallelDecodingMode == 0) + { + cm.AdaptCoefProbs(); + + if (!cm.FrameIsIntraOnly()) + { + cm.AdaptModeProbs(); + cm.AdaptMvProbs(cm.AllowHighPrecisionMv); + } + } + } + else + { + cm.Error.InternalError(CodecErr.CorruptFrame, "Decode failed. Frame data is corrupted."); + } + + // Non frame parallel update frame context here. 
+ if (cm.RefreshFrameContext != 0 && contextUpdated == 0) + { + cm.FrameContexts[(int)cm.FrameContextIdx] = cm.Fc.Value; + } + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs index 3281905c1..b77a602b6 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs @@ -5,18 +5,16 @@ using Ryujinx.Graphics.Video; using System; using System.Diagnostics; using System.Runtime.CompilerServices; -using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv; -using MvRef = Ryujinx.Graphics.Nvdec.Vp9.Types.MvRef; namespace Ryujinx.Graphics.Nvdec.Vp9 { internal static class DecodeMv { - private const int MvrefNeighbours = 8; + private const int RefNeighbours = 8; private static PredictionMode ReadIntraMode(ref Reader r, ReadOnlySpan p) { - return (PredictionMode)r.ReadTree(Luts.Vp9IntraModeTree, p); + return (PredictionMode)r.ReadTree(Luts.IntraModeTree, p); } private static PredictionMode ReadIntraModeY(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int sizeGroup) @@ -43,7 +41,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static PredictionMode ReadInterMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int ctx) { - int mode = r.ReadTree(Luts.Vp9InterModeTree, cm.Fc.Value.InterModeProb[ctx].AsSpan()); + int mode = r.ReadTree(Luts.InterModeTree, cm.Fc.Value.InterModeProb[ctx].AsSpan()); if (!xd.Counts.IsNull) { ++xd.Counts.Value.InterMode[ctx][mode]; @@ -54,7 +52,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static int ReadSegmentId(ref Reader r, ref Array7 segTreeProbs) { - return r.ReadTree(Luts.Vp9SegmentTree, segTreeProbs.AsSpan()); + return r.ReadTree(Luts.SegmentTree, segTreeProbs.AsSpan()); } private static ReadOnlySpan GetTxProbs(ref Vp9EntropyProbs fc, TxSize maxTxSize, int ctx) @@ -64,7 +62,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 case TxSize.Tx8x8: return fc.Tx8x8Prob[ctx].AsSpan(); case TxSize.Tx16x16: return 
fc.Tx16x16Prob[ctx].AsSpan(); case TxSize.Tx32x32: return fc.Tx32x32Prob[ctx].AsSpan(); - default: Debug.Assert(false, "Invalid maxTxSize."); return ReadOnlySpan.Empty; + default: + Debug.Assert(false, "Invalid maxTxSize."); + return ReadOnlySpan.Empty; } } @@ -75,7 +75,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 case TxSize.Tx8x8: return counts.Tx8x8[ctx].AsSpan(); case TxSize.Tx16x16: return counts.Tx16x16[ctx].AsSpan(); case TxSize.Tx32x32: return counts.Tx32x32[ctx].AsSpan(); - default: Debug.Assert(false, "Invalid maxTxSize."); return Span.Empty; + default: + Debug.Assert(false, "Invalid maxTxSize."); + return Span.Empty; } } @@ -110,21 +112,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { return ReadSelectedTxSize(ref cm, ref xd, maxTxSize, ref r); } - else - { - return (TxSize)Math.Min((int)maxTxSize, (int)Luts.TxModeToBiggestTxSize[(int)txMode]); - } + + return (TxSize)Math.Min((int)maxTxSize, (int)Luts.TxModeToBiggestTxSize[(int)txMode]); } - private static int DecGetSegmentId(ref Vp9Common cm, ArrayPtr segmentIds, int miOffset, int xMis, int yMis) + private static int DecGetSegmentId(ref Vp9Common cm, ArrayPtr segmentIds, int miOffset, int xMis, + int yMis) { - int x, y, segmentId = int.MaxValue; + int segmentId = int.MaxValue; - for (y = 0; y < yMis; y++) + for (int y = 0; y < yMis; y++) { - for (x = 0; x < xMis; x++) + for (int x = 0; x < xMis; x++) { - segmentId = Math.Min(segmentId, segmentIds[miOffset + y * cm.MiCols + x]); + segmentId = Math.Min(segmentId, segmentIds[miOffset + (y * cm.MiCols) + x]); } } @@ -134,15 +135,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static void SetSegmentId(ref Vp9Common cm, int miOffset, int xMis, int yMis, int segmentId) { - int x, y; - Debug.Assert(segmentId >= 0 && segmentId < Constants.MaxSegments); - for (y = 0; y < yMis; y++) + for (int y = 0; y < yMis; y++) { - for (x = 0; x < xMis; x++) + for (int x = 0; x < xMis; x++) { - cm.CurrentFrameSegMap[miOffset + y * cm.MiCols + x] = (byte)segmentId; + 
cm.CurrentFrameSegMap[miOffset + (y * cm.MiCols) + x] = (byte)segmentId; } } } @@ -155,13 +154,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int xMis, int yMis) { - int x, y; - - for (y = 0; y < yMis; y++) + for (int y = 0; y < yMis; y++) { - for (x = 0; x < xMis; x++) + for (int x = 0; x < xMis; x++) { - currentSegmentIds[miOffset + y * cm.MiCols + x] = (byte)(!lastSegmentIds.IsNull ? lastSegmentIds[miOffset + y * cm.MiCols + x] : 0); + currentSegmentIds[miOffset + (y * cm.MiCols) + x] = (byte)(!lastSegmentIds.IsNull + ? lastSegmentIds[miOffset + (y * cm.MiCols) + x] + : 0); } } } @@ -173,7 +172,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (!seg.Enabled) { - return 0; // Default for disabled segmentation + return 0; // Default for disabled segmentation } if (!seg.UpdateMap) @@ -199,11 +198,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ref Segmentation seg = ref cm.Seg; ref ModeInfo mi = ref xd.Mi[0].Value; int predictedSegmentId, segmentId; - int miOffset = miRow * cm.MiCols + miCol; + int miOffset = (miRow * cm.MiCols) + miCol; if (!seg.Enabled) { - return 0; // Default for disabled segmentation + return 0; // Default for disabled segmentation } predictedSegmentId = !cm.LastFrameSegMap.IsNull @@ -220,41 +219,42 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { byte predProb = Segmentation.GetPredProbSegId(ref cm.Fc.Value.SegPredProb, ref xd); mi.SegIdPredicted = (sbyte)r.Read(predProb); - segmentId = mi.SegIdPredicted != 0 ? predictedSegmentId : ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb); + segmentId = mi.SegIdPredicted != 0 + ? 
predictedSegmentId + : ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb); } else { segmentId = ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb); } + SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId); return segmentId; } private static int ReadSkip(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r) { - if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlSkip) != 0) + if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.Skip) != 0) { return 1; } - else - { - int ctx = xd.GetSkipContext(); - int skip = r.Read(cm.Fc.Value.SkipProb[ctx]); - if (!xd.Counts.IsNull) - { - ++xd.Counts.Value.Skip[ctx][skip]; - } - return skip; + int ctx = xd.GetSkipContext(); + int skip = r.Read(cm.Fc.Value.SkipProb[ctx]); + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.Skip[ctx][skip]; } + + return skip; } - private static int ReadMvComponent(ref Reader r, ref Vp9EntropyProbs fc, int mvcomp, bool usehp) + private static int ReadComponent(ref Reader r, ref Vp9EntropyProbs fc, int mvcomp, bool usehp) { int mag, d, fr, hp; bool sign = r.Read(fc.Sign[mvcomp]) != 0; - MvClassType mvClass = (MvClassType)r.ReadTree(Luts.Vp9MvClassTree, fc.Classes[mvcomp].AsSpan()); - bool class0 = mvClass == MvClassType.MvClass0; + MvClassType mvClass = (MvClassType)r.ReadTree(Luts.MvClassTree, fc.Classes[mvcomp].AsSpan()); + bool class0 = mvClass == MvClassType.Class0; // Integer part if (class0) @@ -264,11 +264,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } else { - int i; - int n = (int)mvClass + Constants.Class0Bits - 1; // Number of bits + int n = (int)mvClass + Constants.Class0Bits - 1; // Number of bits d = 0; - for (i = 0; i < n; ++i) + for (int i = 0; i < n; ++i) { d |= r.Read(fc.Bits[mvcomp][i]) << i; } @@ -277,7 +276,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Fractional part - fr = r.ReadTree(Luts.Vp9MvFPTree, class0 ? fc.Class0Fp[mvcomp][d].AsSpan() : fc.Fp[mvcomp].AsSpan()); + fr = r.ReadTree(Luts.MvFPTree, class0 ? 
fc.Class0Fp[mvcomp][d].AsSpan() : fc.Fp[mvcomp].AsSpan()); // High precision part (if hp is not used, the default value of the hp is 1) hp = usehp ? r.Read(class0 ? fc.Class0Hp[mvcomp] : fc.Hp[mvcomp]) : 1; @@ -287,29 +286,29 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return sign ? -mag : mag; } - private static void ReadMv( + private static void Read( ref Reader r, ref Mv mv, ref Mv refr, ref Vp9EntropyProbs fc, Ptr counts, - bool allowHP) + bool allowHp) { - MvJointType jointType = (MvJointType)r.ReadTree(Luts.Vp9MvJointTree, fc.Joints.AsSpan()); - bool useHP = allowHP && refr.UseMvHp(); - Mv diff = new Mv(); + MvJointType jointType = (MvJointType)r.ReadTree(Luts.MvJointTree, fc.Joints.AsSpan()); + bool useHp = allowHp && refr.UseHp(); + Mv diff = new(); - if (Mv.MvJointVertical(jointType)) + if (Mv.JointVertical(jointType)) { - diff.Row = (short)ReadMvComponent(ref r, ref fc, 0, useHP); + diff.Row = (short)ReadComponent(ref r, ref fc, 0, useHp); } - if (Mv.MvJointHorizontal(jointType)) + if (Mv.JointHorizontal(jointType)) { - diff.Col = (short)ReadMvComponent(ref r, ref fc, 1, useHP); + diff.Col = (short)ReadComponent(ref r, ref fc, 1, useHp); } - diff.IncMv(counts); + diff.Inc(counts); mv.Row = (short)(refr.Row + diff.Row); mv.Col = (short)(refr.Col + diff.Col); @@ -317,7 +316,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static ReferenceMode ReadBlockReferenceMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r) { - if (cm.ReferenceMode == ReferenceMode.ReferenceModeSelect) + if (cm.ReferenceMode == ReferenceMode.Select) { int ctx = PredCommon.GetReferenceModeContext(ref cm, ref xd); ReferenceMode mode = (ReferenceMode)r.Read(cm.Fc.Value.CompInterProb[ctx]); @@ -326,12 +325,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ++xd.Counts.Value.CompInter[ctx][(int)mode]; } - return mode; // SingleReference or CompoundReference - } - else - { - return cm.ReferenceMode; + return mode; // SingleReference or CompoundReference } + + return cm.ReferenceMode; } // Read the 
referncence frame @@ -344,15 +341,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { ref Vp9EntropyProbs fc = ref cm.Fc.Value; - if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0) + if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.RefFrame) != 0) { - refFrame[0] = (sbyte)cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame); + refFrame[0] = (sbyte)cm.Seg.GetSegData(segmentId, SegLvlFeatures.RefFrame); refFrame[1] = Constants.None; } else { ReferenceMode mode = ReadBlockReferenceMode(ref cm, ref xd, ref r); - if (mode == ReferenceMode.CompoundReference) + if (mode == ReferenceMode.Compound) { int idx = cm.RefFrameSignBias[cm.CompFixedRef]; int ctx = PredCommon.GetPredContextCompRefP(ref cm, ref xd); @@ -365,7 +362,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 refFrame[idx] = cm.CompFixedRef; refFrame[idx == 0 ? 1 : 0] = cm.CompVarRef[bit]; } - else if (mode == ReferenceMode.SingleReference) + else if (mode == ReferenceMode.Single) { int ctx0 = PredCommon.GetPredContextSingleRefP1(ref xd); int bit0 = r.Read(fc.SingleRefProb[ctx0][0]); @@ -402,7 +399,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static byte ReadSwitchableInterpFilter(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r) { int ctx = xd.GetPredContextSwitchableInterp(); - byte type = (byte)r.ReadTree(Luts.Vp9SwitchableInterpTree, cm.Fc.Value.SwitchableInterpProb[ctx].AsSpan()); + byte type = (byte)r.ReadTree(Luts.SwitchableInterpTree, cm.Fc.Value.SwitchableInterpProb[ctx].AsSpan()); if (!xd.Counts.IsNull) { ++xd.Counts.Value.SwitchableInterp[ctx][type]; @@ -414,12 +411,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static void ReadIntraBlockModeInfo(ref Vp9Common cm, ref MacroBlockD xd, ref ModeInfo mi, ref Reader r) { BlockSize bsize = mi.SbType; - int i; + switch (bsize) { case BlockSize.Block4x4: - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { mi.Bmi[i].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); } @@ -434,7 +431,9 @@ namespace 
Ryujinx.Graphics.Nvdec.Vp9 mi.Bmi[0].Mode = mi.Bmi[1].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); mi.Bmi[2].Mode = mi.Bmi[3].Mode = mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0); break; - default: mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, Luts.SizeGroupLookup[(int)bsize]); break; + default: + mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, Luts.SizeGroupLookup[(int)bsize]); + break; } mi.UvMode = ReadIntraModeUv(ref cm, ref xd, ref r, (byte)mi.Mode); @@ -447,27 +446,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 mi.RefFrame[1] = Constants.None; } - private static bool IsMvValid(ref Mv mv) - { - return mv.Row > Constants.MvLow && - mv.Row < Constants.MvUpp && - mv.Col > Constants.MvLow && - mv.Col < Constants.MvUpp; - } - - private static void CopyMvPair(ref Array2 dst, ref Array2 src) + private static void CopyPair(ref Array2 dst, ref Array2 src) { dst[0] = src[0]; dst[1] = src[1]; } - private static void ZeroMvPair(ref Array2 dst) + private static void ZeroPair(ref Array2 dst) { dst[0] = new Mv(); dst[1] = new Mv(); } - private static bool AssignMv( + private static bool Assign( ref Vp9Common cm, ref MacroBlockD xd, PredictionMode mode, @@ -475,84 +466,81 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ref Array2 refMv, ref Array2 nearNearestMv, int isCompound, - bool allowHP, + bool allowHp, ref Reader r) { - int i; bool ret = true; switch (mode) { case PredictionMode.NewMv: { - for (i = 0; i < 1 + isCompound; ++i) + for (int i = 0; i < 1 + isCompound; ++i) { - ReadMv(ref r, ref mv[i], ref refMv[i], ref cm.Fc.Value, xd.Counts, allowHP); - ret = ret && IsMvValid(ref mv[i]); + Read(ref r, ref mv[i], ref refMv[i], ref cm.Fc.Value, xd.Counts, allowHp); + ret = ret && mv[i].IsValid(); } + break; } case PredictionMode.NearMv: case PredictionMode.NearestMv: { - CopyMvPair(ref mv, ref nearNearestMv); + CopyPair(ref mv, ref nearNearestMv); break; } case PredictionMode.ZeroMv: { - ZeroMvPair(ref mv); + ZeroPair(ref mv); break; } default: return false; } + return ret; } 
private static bool ReadIsInterBlock(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r) { - if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0) + if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.RefFrame) != 0) { - return cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame) != Constants.IntraFrame; + return cm.Seg.GetSegData(segmentId, SegLvlFeatures.RefFrame) != Constants.IntraFrame; } - else - { - int ctx = xd.GetIntraInterContext(); - bool isInter = r.Read(cm.Fc.Value.IntraInterProb[ctx]) != 0; - if (!xd.Counts.IsNull) - { - ++xd.Counts.Value.IntraInter[ctx][isInter ? 1 : 0]; - } - return isInter; + int ctx = xd.GetIntraInterContext(); + bool isInter = r.Read(cm.Fc.Value.IntraInterProb[ctx]) != 0; + if (!xd.Counts.IsNull) + { + ++xd.Counts.Value.IntraInter[ctx][isInter ? 1 : 0]; } + + return isInter; } - private static void DecFindBestRefMvs(bool allowHP, Span mvlist, ref Mv bestMv, int refmvCount) + private static void DecFindBestRefs(bool allowHp, Span mvlist, ref Mv bestMv, int refmvCount) { - int i; - // Make sure all the candidates are properly clamped etc - for (i = 0; i < refmvCount; ++i) + for (int i = 0; i < refmvCount; ++i) { - mvlist[i].LowerMvPrecision(allowHP); + mvlist[i].LowerPrecision(allowHp); bestMv = mvlist[i]; } } - private static bool AddMvRefListEb(Mv mv, ref int refMvCount, Span mvRefList, bool earlyBreak) + private static bool AddRefListEb(Mv mv, ref int refCount, Span mvRefList, bool earlyBreak) { - if (refMvCount != 0) + if (refCount != 0) { if (Unsafe.As(ref mv) != Unsafe.As(ref mvRefList[0])) { - mvRefList[refMvCount] = mv; - refMvCount++; + mvRefList[refCount] = mv; + refCount++; return true; } } else { - mvRefList[refMvCount++] = mv; + mvRefList[refCount++] = mv; if (earlyBreak) { return true; @@ -562,19 +550,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return false; } - // Performs mv sign inversion if indicated by the reference frame combination. 
- private static Mv ScaleMv(ref ModeInfo mi, int refr, sbyte thisRefFrame, ref Array4 refSignBias) - { - Mv mv = mi.Mv[refr]; - if (refSignBias[mi.RefFrame[refr]] != refSignBias[thisRefFrame]) - { - mv.Row *= -1; - mv.Col *= -1; - } - return mv; - } - - private static bool IsDiffRefFrameAddMvEb( + private static bool IsDiffRefFrameAddEb( ref ModeInfo mbmi, sbyte refFrame, ref Array4 refSignBias, @@ -586,26 +562,30 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { if (mbmi.RefFrame[0] != refFrame) { - if (AddMvRefListEb(ScaleMv(ref mbmi, 0, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak)) - { - return true; - } - } - if (mbmi.HasSecondRef() && mbmi.RefFrame[1] != refFrame && Unsafe.As(ref mbmi.Mv[1]) != Unsafe.As(ref mbmi.Mv[0])) - { - if (AddMvRefListEb(ScaleMv(ref mbmi, 1, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(mbmi.ScaleMv(0, refFrame, ref refSignBias), ref refmvCount, mvRefList, + earlyBreak)) { return true; } } + if (mbmi.HasSecondRef() && mbmi.RefFrame[1] != refFrame && + Unsafe.As(ref mbmi.Mv[1]) != Unsafe.As(ref mbmi.Mv[0])) + { + if (AddRefListEb(mbmi.ScaleMv(1, refFrame, ref refSignBias), ref refmvCount, mvRefList, + earlyBreak)) + { + return true; + } + } } + return false; } // This function searches the neighborhood of a given MB/SB // to try and find candidate reference vectors. - private static unsafe int DecFindMvRefs( + private static int DecFindRefs( ref Vp9Common cm, ref MacroBlockD xd, PredictionMode mode, @@ -615,12 +595,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int miRow, int miCol, int block, - int isSub8X8) + int isSub8x8) { ref Array4 refSignBias = ref cm.RefFrameSignBias; int i, refmvCount = 0; bool differentRefFound = false; - Ptr prevFrameMvs = cm.UsePrevFrameMvs ? new Ptr(ref cm.PrevFrameMvs[miRow * cm.MiCols + miCol]) : Ptr.Null; + Ptr prevFrameMvs = cm.UsePrevFrameMvs + ? 
new Ptr(ref cm.PrevFrameMvs[(miRow * cm.MiCols) + miCol]) + : Ptr.Null; ref TileInfo tile = ref xd.Tile; // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop // searching after the first mv is found. @@ -630,7 +612,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 mvRefList.Slice(0, Constants.MaxMvRefCandidates).Fill(new Mv()); i = 0; - if (isSub8X8 != 0) + if (isSub8x8 != 0) { // If the size < 8x8 we get the mv from the bmi substructure for the // nearest two blocks. @@ -639,19 +621,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ref Position mvRef = ref mvRefSearch[i]; if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) { - ref ModeInfo candidateMi = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + ref ModeInfo candidateMi = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value; differentRefFound = true; if (candidateMi.RefFrame[0] == refFrame) { - if (AddMvRefListEb(candidateMi.GetSubBlockMv(0, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(candidateMi.GetSubBlockMv(0, mvRef.Col, block), ref refmvCount, + mvRefList, earlyBreak)) { goto Done; } } else if (candidateMi.RefFrame[1] == refFrame) { - if (AddMvRefListEb(candidateMi.GetSubBlockMv(1, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(candidateMi.GetSubBlockMv(1, mvRef.Col, block), ref refmvCount, + mvRefList, earlyBreak)) { goto Done; } @@ -663,24 +647,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // Check the rest of the neighbors in much the same way // as before except we don't need to keep track of sub blocks or // mode counts. 
- for (; i < MvrefNeighbours; ++i) + for (; i < RefNeighbours; ++i) { ref Position mvRef = ref mvRefSearch[i]; if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) { - ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + ref ModeInfo candidate = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value; differentRefFound = true; if (candidate.RefFrame[0] == refFrame) { - if (AddMvRefListEb(candidate.Mv[0], ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(candidate.Mv[0], ref refmvCount, mvRefList, earlyBreak)) { goto Done; } } else if (candidate.RefFrame[1] == refFrame) { - if (AddMvRefListEb(candidate.Mv[1], ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(candidate.Mv[1], ref refmvCount, mvRefList, earlyBreak)) { goto Done; } @@ -693,14 +677,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { if (prevFrameMvs.Value.RefFrame[0] == refFrame) { - if (AddMvRefListEb(prevFrameMvs.Value.Mv[0], ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(prevFrameMvs.Value.Mv[0], ref refmvCount, mvRefList, earlyBreak)) { goto Done; } } else if (prevFrameMvs.Value.RefFrame[1] == refFrame) { - if (AddMvRefListEb(prevFrameMvs.Value.Mv[1], ref refmvCount, mvRefList, earlyBreak)) + if (AddRefListEb(prevFrameMvs.Value.Mv[1], ref refmvCount, mvRefList, earlyBreak)) { goto Done; } @@ -712,15 +696,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // different reference frames. if (differentRefFound) { - for (i = 0; i < MvrefNeighbours; ++i) + for (i = 0; i < RefNeighbours; ++i) { ref Position mvRef = ref mvRefSearch[i]; if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) { - ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + ref ModeInfo candidate = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value; // If the candidate is Intra we don't want to consider its mv. 
- if (IsDiffRefFrameAddMvEb(ref candidate, refFrame, ref refSignBias, ref refmvCount, mvRefList, earlyBreak)) + if (IsDiffRefFrameAddEb(ref candidate, refFrame, ref refSignBias, ref refmvCount, mvRefList, + earlyBreak)) { goto Done; } @@ -739,7 +724,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 mv.Row *= -1; mv.Col *= -1; } - if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak)) + + if (AddRefListEb(mv, ref refmvCount, mvRefList, earlyBreak)) { goto Done; } @@ -747,7 +733,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (prevFrameMvs.Value.RefFrame[1] > Constants.IntraFrame && prevFrameMvs.Value.RefFrame[1] != refFrame && - Unsafe.As(ref prevFrameMvs.Value.Mv[1]) != Unsafe.As(ref prevFrameMvs.Value.Mv[0])) + Unsafe.As(ref prevFrameMvs.Value.Mv[1]) != + Unsafe.As(ref prevFrameMvs.Value.Mv[0])) { Mv mv = prevFrameMvs.Value.Mv[1]; if (refSignBias[prevFrameMvs.Value.RefFrame[1]] != refSignBias[refFrame]) @@ -755,7 +742,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 mv.Row *= -1; mv.Col *= -1; } - if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak)) + + if (AddRefListEb(mv, ref refmvCount, mvRefList, earlyBreak)) { goto Done; } @@ -772,17 +760,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 refmvCount = 1; } - Done: + Done: // Clamp vectors for (i = 0; i < refmvCount; ++i) { - mvRefList[i].ClampMvRef(ref xd); + mvRefList[i].ClampRef(ref xd); } return refmvCount; } - private static void AppendSub8x8MvsForIdx( + private static void AppendSub8x8ForIdx( ref Vp9Common cm, ref MacroBlockD xd, Span mvRefSearch, @@ -796,16 +784,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 Span mvList = stackalloc Mv[Constants.MaxMvRefCandidates]; ref ModeInfo mi = ref xd.Mi[0].Value; ref Array4 bmi = ref mi.Bmi; - int n; int refmvCount; Debug.Assert(Constants.MaxMvRefCandidates == 2); - refmvCount = DecFindMvRefs(ref cm, ref xd, bMode, mi.RefFrame[refr], mvRefSearch, mvList, miRow, miCol, block, 1); + refmvCount = DecFindRefs(ref cm, ref xd, bMode, mi.RefFrame[refr], mvRefSearch, mvList, miRow, 
miCol, + block, 1); switch (block) { - case 0: bestSub8x8 = mvList[refmvCount - 1]; break; + case 0: + bestSub8x8 = mvList[refmvCount - 1]; + break; case 1: case 2: if (bMode == PredictionMode.NearestMv) @@ -815,7 +805,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 else { bestSub8x8 = new Mv(); - for (n = 0; n < refmvCount; ++n) + for (int n = 0; n < refmvCount; ++n) { if (Unsafe.As(ref bmi[0].Mv[refr]) != Unsafe.As(ref mvList[n])) { @@ -824,6 +814,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } } + break; case 3: if (bMode == PredictionMode.NearestMv) @@ -838,7 +829,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 candidates[2] = mvList[0]; candidates[3] = mvList[1]; bestSub8x8 = new Mv(); - for (n = 0; n < 2 + Constants.MaxMvRefCandidates; ++n) + for (int n = 0; n < 2 + Constants.MaxMvRefCandidates; ++n) { if (Unsafe.As(ref bmi[2].Mv[refr]) != Unsafe.As(ref candidates[n])) { @@ -847,24 +838,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } } + + break; + default: + Debug.Assert(false, "Invalid block index."); break; - default: Debug.Assert(false, "Invalid block index."); break; } } - private static byte GetModeContext(ref Vp9Common cm, ref MacroBlockD xd, Span mvRefSearch, int miRow, int miCol) + private static byte GetModeContext(ref Vp9Common cm, ref MacroBlockD xd, Span mvRefSearch, int miRow, + int miCol) { - int i; int contextCounter = 0; ref TileInfo tile = ref xd.Tile; // Get mode count from nearest 2 blocks - for (i = 0; i < 2; ++i) + for (int i = 0; i < 2; ++i) { ref Position mvRef = ref mvRefSearch[i]; if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef)) { - ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value; + ref ModeInfo candidate = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value; // Keep counts for entropy encoding. 
contextCounter += Luts.Mode2Counter[(int)candidate.Mode]; } @@ -882,8 +876,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ref Reader r) { BlockSize bsize = mi.SbType; - bool allowHP = cm.AllowHighPrecisionMv; - Array2 bestRefMvs = new Array2(); + bool allowHp = cm.AllowHighPrecisionMv; + Array2 bestRefMvs = new(); int refr, isCompound; byte interModeCtx; Span mvRefSearch = Luts.MvRefBlocks[(int)bsize]; @@ -892,12 +886,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 isCompound = mi.HasSecondRef() ? 1 : 0; interModeCtx = GetModeContext(ref cm, ref xd, mvRefSearch, miRow, miCol); - if (cm.Seg.IsSegFeatureActive(mi.SegmentId, SegLvlFeatures.SegLvlSkip) != 0) + if (cm.Seg.IsSegFeatureActive(mi.SegmentId, SegLvlFeatures.Skip) != 0) { mi.Mode = PredictionMode.ZeroMv; if (bsize < BlockSize.Block8x8) { - xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Invalid usage of segement feature on small blocks"); + xd.ErrorInfo.Value.InternalError(CodecErr.UnsupBitstream, + "Invalid usage of segement feature on small blocks"); return; } } @@ -925,53 +920,58 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 sbyte frame = mi.RefFrame[refr]; int refmvCount; - refmvCount = DecFindMvRefs(ref cm, ref xd, mi.Mode, frame, mvRefSearch, tmpMvs, miRow, miCol, -1, 0); + refmvCount = DecFindRefs(ref cm, ref xd, mi.Mode, frame, mvRefSearch, tmpMvs, miRow, miCol, + -1, 0); - DecFindBestRefMvs(allowHP, tmpMvs, ref bestRefMvs[refr], refmvCount); + DecFindBestRefs(allowHp, tmpMvs, ref bestRefMvs[refr], refmvCount); } } } - mi.InterpFilter = (cm.InterpFilter == Constants.Switchable) ? ReadSwitchableInterpFilter(ref cm, ref xd, ref r) : cm.InterpFilter; + mi.InterpFilter = cm.InterpFilter == Constants.Switchable + ? 
ReadSwitchableInterpFilter(ref cm, ref xd, ref r) + : cm.InterpFilter; if (bsize < BlockSize.Block8x8) { - int num4X4W = 1 << xd.BmodeBlocksWl; - int num4X4H = 1 << xd.BmodeBlocksHl; + int num4x4W = 1 << xd.BmodeBlocksWl; + int num4x4H = 1 << xd.BmodeBlocksHl; int idx, idy; PredictionMode bMode = 0; - Array2 bestSub8x8 = new Array2(); + Array2 bestSub8x8 = new(); const uint invalidMv = 0x80008000; // Initialize the 2nd element as even though it won't be used meaningfully // if isCompound is false. Unsafe.As(ref bestSub8x8[1]) = invalidMv; - for (idy = 0; idy < 2; idy += num4X4H) + for (idy = 0; idy < 2; idy += num4x4H) { - for (idx = 0; idx < 2; idx += num4X4W) + for (idx = 0; idx < 2; idx += num4x4W) { - int j = idy * 2 + idx; + int j = (idy * 2) + idx; bMode = ReadInterMode(ref cm, ref xd, ref r, interModeCtx); if (bMode == PredictionMode.NearestMv || bMode == PredictionMode.NearMv) { for (refr = 0; refr < 1 + isCompound; ++refr) { - AppendSub8x8MvsForIdx(ref cm, ref xd, mvRefSearch, bMode, j, refr, miRow, miCol, ref bestSub8x8[refr]); + AppendSub8x8ForIdx(ref cm, ref xd, mvRefSearch, bMode, j, refr, miRow, miCol, + ref bestSub8x8[refr]); } } - if (!AssignMv(ref cm, ref xd, bMode, ref mi.Bmi[j].Mv, ref bestRefMvs, ref bestSub8x8, isCompound, allowHP, ref r)) + if (!Assign(ref cm, ref xd, bMode, ref mi.Bmi[j].Mv, ref bestRefMvs, ref bestSub8x8, + isCompound, allowHp, ref r)) { xd.Corrupted |= true; break; } - if (num4X4H == 2) + if (num4x4H == 2) { mi.Bmi[j + 2] = mi.Bmi[j]; } - if (num4X4W == 2) + if (num4x4W == 2) { mi.Bmi[j + 1] = mi.Bmi[j]; } @@ -980,11 +980,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 mi.Mode = bMode; - CopyMvPair(ref mi.Mv, ref mi.Bmi[3].Mv); + CopyPair(ref mi.Mv, ref mi.Bmi[3].Mv); } else { - xd.Corrupted |= !AssignMv(ref cm, ref xd, mi.Mode, ref mi.Mv, ref bestRefMvs, ref bestRefMvs, isCompound, allowHP, ref r); + xd.Corrupted |= !Assign(ref cm, ref xd, mi.Mode, ref mi.Mv, ref bestRefMvs, ref bestRefMvs, + isCompound, allowHp, ref r); } } @@ 
-1026,11 +1027,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return leftMi.Value.GetYMode(b + 1); } - else - { - Debug.Assert(b == 1 || b == 3); - return curMi.Value.Bmi[b - 1].Mode; - } + + Debug.Assert(b == 1 || b == 3); + return curMi.Value.Bmi[b - 1].Mode; } private static PredictionMode AboveBlockMode(Ptr curMi, Ptr aboveMi, int b) @@ -1044,11 +1043,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return aboveMi.Value.GetYMode(b + 2); } - else - { - Debug.Assert(b == 2 || b == 3); - return curMi.Value.Bmi[b - 2].Mode; - } + + Debug.Assert(b == 2 || b == 3); + return curMi.Value.Bmi[b - 2].Mode; } private static ReadOnlySpan GetYModeProbs( @@ -1076,8 +1073,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 Ptr aboveMi = xd.AboveMi; Ptr leftMi = xd.LeftMi; BlockSize bsize = mi.Value.SbType; - int i; - int miOffset = miRow * cm.MiCols + miCol; + + int miOffset = (miRow * cm.MiCols) + miCol; mi.Value.SegmentId = (sbyte)ReadIntraSegmentId(ref cm, miOffset, xMis, yMis, ref r); mi.Value.Skip = (sbyte)ReadSkip(ref cm, ref xd, mi.Value.SegmentId, ref r); @@ -1088,7 +1085,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 switch (bsize) { case BlockSize.Block4x4: - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { mi.Value.Bmi[i].Mode = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, i)); @@ -1133,8 +1130,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ref Reader r = ref twd.BitReader; ref MacroBlockD xd = ref twd.Xd; ref ModeInfo mi = ref xd.Mi[0].Value; - ArrayPtr frameMvs = cm.CurFrameMvs.Slice(miRow * cm.MiCols + miCol); - int w, h; + ArrayPtr frameMvs = cm.CurFrameMvs.Slice((miRow * cm.MiCols) + miCol); if (cm.FrameIsIntraOnly()) { @@ -1144,17 +1140,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { ReadInterFrameModeInfo(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis); - for (h = 0; h < yMis; ++h) + for (int h = 0; h < yMis; ++h) { - for (w = 0; w < xMis; ++w) + for (int w = 0; w < xMis; ++w) { ref MvRef mv = ref frameMvs[w]; CopyRefFramePair(ref mv.RefFrame, ref 
mi.RefFrame); - CopyMvPair(ref mv.Mv, ref mi.Mv); + CopyPair(ref mv.Mv, ref mi.Mv); } + frameMvs = frameMvs.Slice(cm.MiCols); } } } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs index acebd8ab9..3a729a54a 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs @@ -3,7 +3,6 @@ using Ryujinx.Graphics.Nvdec.Vp9.Common; using Ryujinx.Graphics.Nvdec.Vp9.Types; using Ryujinx.Graphics.Video; using System; -using Vp9MvRef = Ryujinx.Graphics.Video.Vp9MvRef; namespace Ryujinx.Graphics.Nvdec.Vp9 { @@ -11,16 +10,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { public bool IsHardwareAccelerated => false; - private readonly MemoryAllocator _allocator = new MemoryAllocator(); + private readonly MemoryAllocator _allocator = new(); - public ISurface CreateSurface(int width, int height) => new Surface(width, height); + public ISurface CreateSurface(int width, int height) + { + return new Surface(width, height); + } private static ReadOnlySpan LiteralToFilter => new byte[] { - Constants.EightTapSmooth, - Constants.EightTap, - Constants.EightTapSharp, - Constants.Bilinear + Constants.EightTapSmooth, Constants.EightTap, Constants.EightTapSharp, Constants.Bilinear }; public unsafe bool Decode( @@ -30,7 +29,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ReadOnlySpan mvsIn, Span mvsOut) { - Vp9Common cm = new Vp9Common(); + Vp9Common cm = new(); cm.FrameType = pictureInfo.IsKeyFrame ? 
FrameType.KeyFrame : FrameType.InterFrame; cm.IntraOnly = pictureInfo.IntraOnly; @@ -68,6 +67,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 cm.CompFixedRef = pictureInfo.CompFixedRef; cm.CompVarRef = pictureInfo.CompVarRef; + cm.BitDepth = BitDepth.Bits8; + cm.Log2TileCols = pictureInfo.Log2TileCols; cm.Log2TileRows = pictureInfo.Log2TileRows; @@ -78,6 +79,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 cm.Seg.FeatureMask = pictureInfo.SegmentFeatureEnable; cm.Seg.FeatureData = pictureInfo.SegmentFeatureData; + cm.Lf.FilterLevel = pictureInfo.LoopFilterLevel; + cm.Lf.SharpnessLevel = pictureInfo.LoopFilterSharpnessLevel; cm.Lf.ModeRefDeltaEnabled = pictureInfo.ModeRefDeltaEnabled; cm.Lf.RefDeltas = pictureInfo.RefDeltas; cm.Lf.ModeDeltas = pictureInfo.ModeDeltas; @@ -105,7 +108,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 cm.SetupSegmentationDequant(); cm.SetupScaleFactors(); - SetMvs(ref cm, mvsIn); + cm.SetMvs(mvsIn); fixed (byte* dataPtr = bitstream) { @@ -114,10 +117,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (maxThreads > 1 && tileRows == 1 && tileCols > 1) { DecodeFrame.DecodeTilesMt(ref cm, new ArrayPtr(dataPtr, bitstream.Length), maxThreads); + + LoopFilter.LoopFilterFrameMt( + ref cm.Mb.CurBuf, + ref cm, + ref cm.Mb, + cm.Lf.FilterLevel, + false, + false, + maxThreads); } else { DecodeFrame.DecodeTiles(ref cm, new ArrayPtr(dataPtr, bitstream.Length)); + + LoopFilter.LoopFilterFrame( + ref cm.Mb.CurBuf, + ref cm, + ref cm.Mb, + cm.Lf.FilterLevel, + false, + false); } } catch (InternalErrorException) @@ -126,7 +146,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - GetMvs(ref cm, mvsOut); + cm.GetMvs(mvsOut); cm.FreeTileWorkerData(_allocator); cm.FreeContextBuffers(_allocator); @@ -134,48 +154,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return true; } - private static void SetMvs(ref Vp9Common cm, ReadOnlySpan mvs) + public void Dispose() { - if (mvs.Length > cm.PrevFrameMvs.Length) - { - throw new ArgumentException($"Size mismatch, expected: {cm.PrevFrameMvs.Length}, but 
got: {mvs.Length}."); - } - - for (int i = 0; i < mvs.Length; i++) - { - ref var mv = ref cm.PrevFrameMvs[i]; - - mv.Mv[0].Row = mvs[i].Mvs[0].Row; - mv.Mv[0].Col = mvs[i].Mvs[0].Col; - mv.Mv[1].Row = mvs[i].Mvs[1].Row; - mv.Mv[1].Col = mvs[i].Mvs[1].Col; - - mv.RefFrame[0] = (sbyte)mvs[i].RefFrames[0]; - mv.RefFrame[1] = (sbyte)mvs[i].RefFrames[1]; - } + _allocator.Dispose(); } - - private static void GetMvs(ref Vp9Common cm, Span mvs) - { - if (mvs.Length > cm.CurFrameMvs.Length) - { - throw new ArgumentException($"Size mismatch, expected: {cm.CurFrameMvs.Length}, but got: {mvs.Length}."); - } - - for (int i = 0; i < mvs.Length; i++) - { - ref var mv = ref cm.CurFrameMvs[i]; - - mvs[i].Mvs[0].Row = mv.Mv[0].Row; - mvs[i].Mvs[0].Col = mv.Mv[0].Col; - mvs[i].Mvs[1].Row = mv.Mv[1].Row; - mvs[i].Mvs[1].Col = mv.Mv[1].Col; - - mvs[i].RefFrames[0] = mv.RefFrame[0]; - mvs[i].RefFrames[1] = mv.RefFrame[1]; - } - } - - public void Dispose() => _allocator.Dispose(); } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs index 52b1b3dc4..ebcacd5fd 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs @@ -19,24 +19,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { const int maxNeighbors = 2; - return (1 + tokenCache[neighbors[maxNeighbors * c + 0]] + tokenCache[neighbors[maxNeighbors * c + 1]]) >> 1; - } - - private static int ReadCoeff( - ref Reader r, - ReadOnlySpan probs, - int n, - ref ulong value, - ref int count, - ref uint range) - { - int i, val = 0; - for (i = 0; i < n; ++i) - { - val = (val << 1) | r.ReadBool(probs[i], ref value, ref count, ref range); - } - - return val; + return (1 + tokenCache[neighbors[(maxNeighbors * c) + 0]] + + tokenCache[neighbors[(maxNeighbors * c) + 1]]) >> 1; } private static int DecodeCoefs( @@ -57,14 +41,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int band, c = 0; ref Array6>> coefProbs = ref 
fc.CoefProbs[(int)txSize][(int)type][refr]; Span tokenCache = stackalloc byte[32 * 32]; - ReadOnlySpan bandTranslate = Luts.get_band_translate(txSize); - int dqShift = (txSize == TxSize.Tx32x32) ? 1 : 0; + ReadOnlySpan bandTranslate = Luts.GetBandTranslate(txSize); + int dqShift = txSize == TxSize.Tx32x32 ? 1 : 0; int v; short dqv = dq[0]; - ReadOnlySpan cat6Prob = (xd.Bd == 12) - ? Luts.Vp9Cat6ProbHigh12 - : (xd.Bd == 10) ? Luts.Vp9Cat6ProbHigh12.Slice(2) : Luts.Vp9Cat6Prob; - int cat6Bits = (xd.Bd == 12) ? 18 : (xd.Bd == 10) ? 16 : 14; + ReadOnlySpan cat6Prob = xd.Bd == 12 + ? Luts.Cat6ProbHigh12 + : xd.Bd == 10 + ? Luts.Cat6ProbHigh12.Slice(2) + : Luts.Cat6Prob; + int cat6Bits = xd.Bd == 12 ? 18 : xd.Bd == 10 ? 16 : 14; // Keep value, range, and count as locals. The compiler produces better // results with the locals than using r directly. ulong value = r.Value; @@ -107,8 +93,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 r.Value = value; r.Range = range; r.Count = count; - return c; // Zero tokens at the end (no eob token) + return c; // Zero tokens at the end (no eob token) } + ctx = GetCoefContext(nb, tokenCache, c); band = bandTranslate[0]; bandTranslate = bandTranslate.Slice(1); @@ -117,7 +104,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (r.ReadBool(prob[OneContextNode], ref value, ref count, ref range) != 0) { - ReadOnlySpan p = Luts.Vp9Pareto8Full[prob[Constants.PivotNode] - 1]; + ReadOnlySpan p = Luts.Pareto8Full[prob[Constants.PivotNode] - 1]; if (!xd.Counts.IsNull) { ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.TwoToken]; @@ -132,20 +119,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { if (r.ReadBool(p[7], ref value, ref count, ref range) != 0) { - val = Constants.Cat6MinVal + ReadCoeff(ref r, cat6Prob, cat6Bits, ref value, ref count, ref range); + val = Constants.Cat6MinVal + r.ReadCoeff(cat6Prob, cat6Bits, ref value, + ref count, ref range); } else { - val = Constants.Cat5MinVal + ReadCoeff(ref r, Luts.Vp9Cat5Prob, 5, ref value, ref count, 
ref range); + val = Constants.Cat5MinVal + r.ReadCoeff(Luts.Cat5Prob, 5, ref value, + ref count, ref range); } } else if (r.ReadBool(p[6], ref value, ref count, ref range) != 0) { - val = Constants.Cat4MinVal + ReadCoeff(ref r, Luts.Vp9Cat4Prob, 4, ref value, ref count, ref range); + val = Constants.Cat4MinVal + r.ReadCoeff(Luts.Cat4Prob, 4, ref value, ref count, + ref range); } else { - val = Constants.Cat3MinVal + ReadCoeff(ref r, Luts.Vp9Cat3Prob, 3, ref value, ref count, ref range); + val = Constants.Cat3MinVal + r.ReadCoeff(Luts.Cat3Prob, 3, ref value, ref count, + ref range); } } else @@ -153,13 +144,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 tokenCache[scan[c]] = 4; if (r.ReadBool(p[4], ref value, ref count, ref range) != 0) { - val = Constants.Cat2MinVal + ReadCoeff(ref r, Luts.Vp9Cat2Prob, 2, ref value, ref count, ref range); + val = Constants.Cat2MinVal + r.ReadCoeff(Luts.Cat2Prob, 2, ref value, ref count, + ref range); } else { - val = Constants.Cat1MinVal + ReadCoeff(ref r, Luts.Vp9Cat1Prob, 1, ref value, ref count, ref range); + val = Constants.Cat1MinVal + r.ReadCoeff(Luts.Cat1Prob, 1, ref value, ref count, + ref range); } } + // Val may use 18-bits v = (int)(((long)val * dqv) >> dqShift); } @@ -187,7 +181,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 tokenCache[scan[c]] = 1; v = dqv >> dqShift; } - dqcoeff[scan[c]] = (int)HighbdCheckRange(r.ReadBool(128, ref value, ref count, ref range) != 0 ? -v : v, xd.Bd); + + dqcoeff[scan[c]] = (int)HighbdCheckRange(r.ReadBool(128, ref value, ref count, ref range) != 0 ? 
-v : v, + xd.Bd); ++c; ctx = GetCoefContext(nb, tokenCache, c); dqv = dq[1]; @@ -199,7 +195,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return c; } - private static void GetCtxShift(ref MacroBlockD xd, ref int ctxShiftA, ref int ctxShiftL, int x, int y, uint txSizeInBlocks) + private static void GetCtxShift(ref MacroBlockD xd, ref int ctxShiftA, ref int ctxShiftL, int x, int y, + uint txSizeInBlocks) { if (xd.MaxBlocksWide != 0) { @@ -208,6 +205,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ctxShiftA = (int)(txSizeInBlocks - (xd.MaxBlocksWide - x)) * 8; } } + if (xd.MaxBlocksHigh != 0) { if (txSizeInBlocks + y > xd.MaxBlocksHigh) @@ -322,4 +320,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return eob; } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs index d49a6bf63..8cea09aec 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs @@ -75,17 +75,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Vector128 zero = Vector128.Zero; Vector128 const64 = Vector128.Create(64); - ulong x, y; - src -= SubpelTaps / 2 - 1; + src -= (SubpelTaps / 2) - 1; fixed (Array8* xFilter = xFilters) { - Vector128 vfilter = Sse2.LoadVector128((short*)xFilter + (uint)(x0Q4 & SubpelMask) * 8); + Vector128 vfilter = Sse2.LoadVector128((short*)xFilter + ((uint)(x0Q4 & SubpelMask) * 8)); - for (y = 0; y < (uint)h; ++y) + for (ulong y = 0; y < (uint)h; ++y) { ulong srcOffset = (uint)x0Q4 >> SubpelBits; - for (x = 0; x < (uint)w; x += 4) + for (ulong x = 0; x < (uint)w; x += 4) { Vector128 vsrc0 = Sse41.ConvertToVector128Int16(&src[srcOffset + x]); Vector128 vsrc1 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 1]); @@ -94,8 +93,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Vector128 sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero); - Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), 
zero).AsSingle()); + Sse.StoreScalar((float*)&dst[x], + PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle()); } + src += srcStride; dst += dstStride; } @@ -120,18 +121,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp return; } - int x, y; - src -= SubpelTaps / 2 - 1; + src -= (SubpelTaps / 2) - 1; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { int xQ4 = x0Q4; - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { byte* srcX = &src[xQ4 >> SubpelBits]; ref Array8 xFilter = ref xFilters[xQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcX[k] * xFilter[k]; } @@ -139,6 +139,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)); xQ4 += xStepQ4; } + src += srcStride; dst += dstStride; } @@ -155,25 +156,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int w, int h) { - int x, y; - src -= SubpelTaps / 2 - 1; + src -= (SubpelTaps / 2) - 1; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { int xQ4 = x0Q4; - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { byte* srcX = &src[xQ4 >> SubpelBits]; ref Array8 xFilter = ref xFilters[xQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcX[k] * xFilter[k]; } - dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1); + dst[x] = (byte)BitUtils.RoundPowerOfTwo( + dst[x] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1); xQ4 += xStepQ4; } + src += srcStride; dst += dstStride; } @@ -202,18 +204,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp srcStride * 6, srcStride * 7); - ulong x, y; - src -= srcStride * (SubpelTaps / 2 - 1); + src -= srcStride * ((SubpelTaps / 2) - 1); fixed (Array8* yFilter = yFilters) { - Vector128 vfilter = Sse2.LoadVector128((short*)yFilter + (uint)(y0Q4 & SubpelMask) * 
8); + Vector128 vfilter = Sse2.LoadVector128((short*)yFilter + ((uint)(y0Q4 & SubpelMask) * 8)); ulong srcBaseY = (uint)y0Q4 >> SubpelBits; - for (y = 0; y < (uint)h; ++y) + for (ulong y = 0; y < (uint)h; ++y) { ulong srcOffset = (srcBaseY + y) * (uint)srcStride; - for (x = 0; x < (uint)w; x += 4) + for (ulong x = 0; x < (uint)w; x += 4) { Vector256 vsrc = Avx2.GatherVector256((uint*)&src[srcOffset + x], indices, 1).AsInt32(); @@ -239,8 +240,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Vector128 sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero); - Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle()); + Sse.StoreScalar((float*)&dst[x], + PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle()); } + dst += dstStride; } } @@ -264,18 +267,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp return; } - int x, y; - src -= srcStride * (SubpelTaps / 2 - 1); + src -= srcStride * ((SubpelTaps / 2) - 1); - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { int yQ4 = y0Q4; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride]; ref Array8 yFilter = ref yFilters[yQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcY[k * srcStride] * yFilter[k]; } @@ -283,6 +285,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)); yQ4 += yStepQ4; } + ++src; ++dst; } @@ -299,18 +302,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int w, int h) { - int x, y; - src -= srcStride * (SubpelTaps / 2 - 1); + src -= srcStride * ((SubpelTaps / 2) - 1); - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { int yQ4 = y0Q4; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride]; ref Array8 yFilter = ref yFilters[yQ4 & SubpelMask]; - int k, sum = 0; - for 
(k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcY[k * srcStride] * yFilter[k]; } @@ -319,6 +321,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[y * dstStride] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1); yQ4 += yStepQ4; } + ++src; ++dst; } @@ -418,15 +421,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still // big enough. byte* temp = stackalloc byte[64 * 135]; - int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps; + int intermediateHeight = ((((h - 1) * yStepQ4) + y0Q4) >> SubpelBits) + SubpelTaps; Debug.Assert(w <= 64); Debug.Assert(h <= 64); Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32)); Debug.Assert(xStepQ4 <= 64); - ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight); - ConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h); + ConvolveHoriz(src - (srcStride * ((SubpelTaps / 2) - 1)), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, + intermediateHeight); + ConvolveVert(temp + (64 * ((SubpelTaps / 2) - 1)), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h); } public static unsafe void Convolve8Avg( @@ -487,11 +491,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int w, int h) { - int x, y; - - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1); } @@ -609,18 +611,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int h, int bd) { - int x, y; - src -= SubpelTaps / 2 - 1; + src -= (SubpelTaps / 2) - 1; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { int xQ4 = x0Q4; - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { ushort* srcX = &src[xQ4 >> SubpelBits]; ref Array8 xFilter = ref xFilters[xQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < 
SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcX[k] * xFilter[k]; } @@ -628,6 +629,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[x] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd); xQ4 += xStepQ4; } + src += srcStride; dst += dstStride; } @@ -645,25 +647,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int h, int bd) { - int x, y; - src -= SubpelTaps / 2 - 1; + src -= (SubpelTaps / 2) - 1; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { int xQ4 = x0Q4; - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { ushort* srcX = &src[xQ4 >> SubpelBits]; ref Array8 xFilter = ref xFilters[xQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcX[k] * xFilter[k]; } - dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1); + dst[x] = (ushort)BitUtils.RoundPowerOfTwo( + dst[x] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1); xQ4 += xStepQ4; } + src += srcStride; dst += dstStride; } @@ -681,18 +684,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int h, int bd) { - int x, y; - src -= srcStride * (SubpelTaps / 2 - 1); + src -= srcStride * ((SubpelTaps / 2) - 1); - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { int yQ4 = y0Q4; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride]; ref Array8 yFilter = ref yFilters[yQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcY[k * srcStride] * yFilter[k]; } @@ -700,6 +702,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[y * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd); yQ4 += yStepQ4; } + ++src; ++dst; } @@ -717,26 +720,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int h, int bd) { - int x, y; - 
src -= srcStride * (SubpelTaps / 2 - 1); + src -= srcStride * ((SubpelTaps / 2) - 1); - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { int yQ4 = y0Q4; - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride]; ref Array8 yFilter = ref yFilters[yQ4 & SubpelMask]; - int k, sum = 0; - for (k = 0; k < SubpelTaps; ++k) + int sum = 0; + for (int k = 0; k < SubpelTaps; ++k) { sum += srcY[k * srcStride] * yFilter[k]; } dst[y * dstStride] = (ushort)BitUtils.RoundPowerOfTwo( - dst[y * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1); + dst[y * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), + 1); yQ4 += yStepQ4; } + ++src; ++dst; } @@ -769,15 +773,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // --Require an additional SubpelTaps rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. ushort* temp = stackalloc ushort[64 * 135]; - int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps; + int intermediateHeight = ((((h - 1) * yStepQ4) + y0Q4) >> SubpelBits) + SubpelTaps; Debug.Assert(w <= 64); Debug.Assert(h <= 64); Debug.Assert(yStepQ4 <= 32); Debug.Assert(xStepQ4 <= 32); - HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight, bd); - HighbdConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd); + HighbdConvolveHoriz(src - (srcStride * ((SubpelTaps / 2) - 1)), srcStride, temp, 64, filter, x0Q4, xStepQ4, + w, intermediateHeight, bd); + HighbdConvolveVert(temp + (64 * ((SubpelTaps / 2) - 1)), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h, + bd); } public static unsafe void HighbdConvolve8Horiz( @@ -811,7 +817,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int h, int bd) { - HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd); + 
HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd); } public static unsafe void HighbdConvolve8Vert( @@ -926,11 +932,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp int h, int bd) { - int x, y; - - for (y = 0; y < h; ++y) + for (int y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) + for (int x = 0; x < w; ++x) { dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1); } @@ -940,4 +944,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs index 169628977..e9f038aaf 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs @@ -9,4 +9,4 @@ public const int SubpelShifts = 1 << SubpelBits; public const int SubpelTaps = 8; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs index 62b3a9b14..ce53cccb4 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs @@ -6,22 +6,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { private static unsafe ref byte Dst(byte* dst, int stride, int x, int y) { - return ref dst[x + y * stride]; + return ref dst[x + (y * stride)]; } private static unsafe ref ushort Dst(ushort* dst, int stride, int x, int y) { - return ref dst[x + y * stride]; + return ref dst[x + (y * stride)]; } private static byte Avg3(byte a, byte b, byte c) { - return (byte)((a + 2 * b + c + 2) >> 2); + return (byte)((a + (2 * b) + c + 2) >> 2); } private static ushort Avg3(ushort a, ushort b, ushort c) { - return (ushort)((a + 2 * b + c + 2) >> 2); + return (ushort)((a + (2 * b) + c + 2) >> 2); } private static byte Avg2(byte a, byte b) @@ -51,9 +51,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void D207Predictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r, 
c; // First column - for (r = 0; r < bs - 1; ++r) + for (int r = 0; r < bs - 1; ++r) { dst[r * stride] = Avg2(left[r], left[r + 1]); } @@ -62,7 +61,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst++; // Second column - for (r = 0; r < bs - 2; ++r) + for (int r = 0; r < bs - 2; ++r) { dst[r * stride] = Avg3(left[r], left[r + 1], left[r + 2]); } @@ -72,16 +71,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst++; // Rest of last row - for (c = 0; c < bs - 2; ++c) + for (int c = 0; c < bs - 2; ++c) { - dst[(bs - 1) * stride + c] = left[bs - 1]; + dst[((bs - 1) * stride) + c] = left[bs - 1]; } - for (r = bs - 2; r >= 0; --r) + for (int r = bs - 2; r >= 0; --r) { - for (c = 0; c < bs - 2; ++c) + for (int c = 0; c < bs - 2; ++c) { - dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + dst[(r * stride) + c] = dst[((r + 1) * stride) + c - 2]; } } } @@ -103,19 +102,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void D63Predictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r, c; - int size; - for (c = 0; c < bs; ++c) + for (int c = 0; c < bs; ++c) { dst[c] = Avg2(above[c], above[c + 1]); dst[stride + c] = Avg3(above[c], above[c + 1], above[c + 2]); } - for (r = 2, size = bs - 2; r < bs; r += 2, --size) + + for (int r = 2, size = bs - 2; r < bs; r += 2, --size) { - MemoryUtil.Copy(dst + (r + 0) * stride, dst + (r >> 1), size); - MemoryUtil.Fill(dst + (r + 0) * stride + size, above[bs - 1], bs - size); - MemoryUtil.Copy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); - MemoryUtil.Fill(dst + (r + 1) * stride + size, above[bs - 1], bs - size); + MemoryUtil.Copy(dst + ((r + 0) * stride), dst + (r >> 1), size); + MemoryUtil.Fill(dst + ((r + 0) * stride) + size, above[bs - 1], bs - size); + MemoryUtil.Copy(dst + ((r + 1) * stride), dst + stride + (r >> 1), size); + MemoryUtil.Fill(dst + ((r + 1) * stride) + size, above[bs - 1], bs - size); } } @@ -138,15 +136,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { byte aboveRight = 
above[bs - 1]; byte* dstRow0 = dst; - int x, size; - for (x = 0; x < bs - 1; ++x) + for (int x = 0; x < bs - 1; ++x) { dst[x] = Avg3(above[x], above[x + 1], above[x + 2]); } + dst[bs - 1] = aboveRight; dst += stride; - for (x = 1, size = bs - 2; x < bs; ++x, --size) + for (int x = 1, size = bs - 2; x < bs; ++x, --size) { MemoryUtil.Copy(dst, dstRow0 + x, size); MemoryUtil.Fill(dst + size, aboveRight, x + 1); @@ -171,10 +169,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void D117Predictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r, c; - // First row - for (c = 0; c < bs; c++) + for (int c = 0; c < bs; c++) { dst[c] = Avg2(above[c - 1], above[c]); } @@ -183,7 +179,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // Second row dst[0] = Avg3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) + for (int c = 1; c < bs; c++) { dst[c] = Avg3(above[c - 2], above[c - 1], above[c]); } @@ -192,17 +188,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // The rest of first col dst[0] = Avg3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) + for (int r = 3; r < bs; ++r) { dst[(r - 2) * stride] = Avg3(left[r - 3], left[r - 2], left[r - 1]); } // The rest of the block - for (r = 2; r < bs; ++r) + for (int r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) + for (int c = 1; c < bs; c++) { - dst[c] = dst[-2 * stride + c - 1]; + dst[c] = dst[(-2 * stride) + c - 1]; } dst += stride; @@ -226,26 +222,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void D135Predictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int i; - byte* border = stackalloc byte[32 + 32 - 1]; // outer border from bottom-left to top-right + byte* border = stackalloc byte[32 + 32 - 1]; // outer border from bottom-left to top-right // Dst(dst, stride, bs, bs - 2)[0], i.e., border starting at bottom-left - for (i = 0; i < bs - 2; ++i) + for (int i = 0; i < bs - 2; ++i) { border[i] = Avg3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 
- i]); } + border[bs - 2] = Avg3(above[-1], left[0], left[1]); border[bs - 1] = Avg3(left[0], above[-1], above[0]); border[bs - 0] = Avg3(above[-1], above[0], above[1]); // dst[0][2, size), i.e., remaining top border ascending - for (i = 0; i < bs - 2; ++i) + for (int i = 0; i < bs - 2; ++i) { border[bs + 1 + i] = Avg3(above[i], above[i + 1], above[i + 2]); } - for (i = 0; i < bs; ++i) + for (int i = 0; i < bs; ++i) { - MemoryUtil.Copy(dst + i * stride, border + bs - 1 - i, bs); + MemoryUtil.Copy(dst + (i * stride), border + bs - 1 - i, bs); } } @@ -266,9 +262,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void D153Predictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r, c; dst[0] = Avg2(above[-1], left[0]); - for (r = 1; r < bs; r++) + for (int r = 1; r < bs; r++) { dst[r * stride] = Avg2(left[r - 1], left[r]); } @@ -277,23 +272,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[0] = Avg3(left[0], above[-1], above[0]); dst[stride] = Avg3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) + for (int r = 2; r < bs; r++) { dst[r * stride] = Avg3(left[r - 2], left[r - 1], left[r]); } dst++; - for (c = 0; c < bs - 2; c++) + for (int c = 0; c < bs - 2; c++) { dst[c] = Avg3(above[c - 1], above[c], above[c + 1]); } dst += stride; - for (r = 1; r < bs; ++r) + for (int r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) + for (int c = 0; c < bs - 2; c++) { dst[c] = dst[-stride + c - 2]; } @@ -324,9 +319,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void VPredictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r; - - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Copy(dst, above, bs); dst += stride; @@ -355,43 +348,40 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void HPredictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r; - - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, left[r], bs); dst += 
stride; } } - public static unsafe void TMPredictor4x4(byte* dst, int stride, byte* above, byte* left) + public static unsafe void TmPredictor4x4(byte* dst, int stride, byte* above, byte* left) { - TMPredictor(dst, stride, 4, above, left); + TmPredictor(dst, stride, 4, above, left); } - public static unsafe void TMPredictor8x8(byte* dst, int stride, byte* above, byte* left) + public static unsafe void TmPredictor8x8(byte* dst, int stride, byte* above, byte* left) { - TMPredictor(dst, stride, 8, above, left); + TmPredictor(dst, stride, 8, above, left); } - public static unsafe void TMPredictor16x16(byte* dst, int stride, byte* above, byte* left) + public static unsafe void TmPredictor16x16(byte* dst, int stride, byte* above, byte* left) { - TMPredictor(dst, stride, 16, above, left); + TmPredictor(dst, stride, 16, above, left); } - public static unsafe void TMPredictor32x32(byte* dst, int stride, byte* above, byte* left) + public static unsafe void TmPredictor32x32(byte* dst, int stride, byte* above, byte* left) { - TMPredictor(dst, stride, 32, above, left); + TmPredictor(dst, stride, 32, above, left); } - private static unsafe void TMPredictor(byte* dst, int stride, int bs, byte* above, byte* left) + private static unsafe void TmPredictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r, c; int yTopLeft = above[-1]; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) + for (int c = 0; c < bs; c++) { dst[c] = BitUtils.ClipPixel(left[r] + above[c] - yTopLeft); } @@ -422,9 +412,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void Dc128Predictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int r; - - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (byte)128, bs); dst += stride; @@ -453,16 +441,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void DcLeftPredictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int i, r, 
expectedDc, sum = 0; + int expectedDc, sum = 0; - for (i = 0; i < bs; i++) + for (int i = 0; i < bs; i++) { sum += left[i]; } expectedDc = (sum + (bs >> 1)) / bs; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (byte)expectedDc, bs); dst += stride; @@ -491,16 +479,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void DcTopPredictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int i, r, expectedDc, sum = 0; + int expectedDc, sum = 0; - for (i = 0; i < bs; i++) + for (int i = 0; i < bs; i++) { sum += above[i]; } expectedDc = (sum + (bs >> 1)) / bs; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (byte)expectedDc, bs); dst += stride; @@ -529,10 +517,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp private static unsafe void DcPredictor(byte* dst, int stride, int bs, byte* above, byte* left) { - int i, r, expectedDc, sum = 0; + int expectedDc, sum = 0; int count = 2 * bs; - for (i = 0; i < bs; i++) + for (int i = 0; i < bs; i++) { sum += above[i]; sum += left[i]; @@ -540,7 +528,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp expectedDc = (sum + (count >> 1)) / count; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (byte)expectedDc, bs); dst += stride; @@ -555,10 +543,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp byte k = left[2]; byte l = left[3]; - MemoryUtil.Fill(dst + stride * 0, Avg3(h, I, j), 4); - MemoryUtil.Fill(dst + stride * 1, Avg3(I, j, k), 4); - MemoryUtil.Fill(dst + stride * 2, Avg3(j, k, l), 4); - MemoryUtil.Fill(dst + stride * 3, Avg3(k, l, l), 4); + MemoryUtil.Fill(dst + (stride * 0), Avg3(h, I, j), 4); + MemoryUtil.Fill(dst + (stride * 1), Avg3(I, j, k), 4); + MemoryUtil.Fill(dst + (stride * 2), Avg3(j, k, l), 4); + MemoryUtil.Fill(dst + (stride * 3), Avg3(k, l, l), 4); } public static unsafe void VePredictor4x4(byte* dst, int stride, byte* above, byte* left) @@ -574,9 +562,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp 
dst[1] = Avg3(I, j, k); dst[2] = Avg3(j, k, l); dst[3] = Avg3(k, l, m); - MemoryUtil.Copy(dst + stride * 1, dst, 4); - MemoryUtil.Copy(dst + stride * 2, dst, 4); - MemoryUtil.Copy(dst + stride * 3, dst, 4); + MemoryUtil.Copy(dst + (stride * 1), dst, 4); + MemoryUtil.Copy(dst + (stride * 2), dst, 4); + MemoryUtil.Copy(dst + (stride * 3), dst, 4); } public static unsafe void D207Predictor4x4(byte* dst, int stride, byte* above, byte* left) @@ -591,7 +579,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 1, 0) = Avg3(I, j, k); Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(j, k, l); Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(k, l, l); - Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l; + Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = + Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l; } public static unsafe void D63Predictor4x4(byte* dst, int stride, byte* above, byte* left) @@ -607,16 +596,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 2) = Avg2(b, c); Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 2) = Avg2(c, d); Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 2) = Avg2(d, e); - Dst(dst, stride, 3, 2) = Avg2(e, f); // Differs from vp8 + Dst(dst, stride, 3, 2) = Avg2(e, f); // Differs from vp8 Dst(dst, stride, 0, 1) = Avg3(a, b, c); Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 3) = Avg3(b, c, d); Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 3) = Avg3(c, d, e); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 3) = Avg3(d, e, f); - Dst(dst, stride, 3, 3) = Avg3(e, f, g); // Differs from vp8 + Dst(dst, stride, 3, 3) = Avg3(e, f, g); // Differs from vp8 } - public static unsafe void D63ePredictor4x4(byte* dst, int stride, byte* above, byte* left) + public static unsafe void D63EPredictor4x4(byte* dst, int stride, byte* above, byte* left) { byte a 
= above[0]; byte b = above[1]; @@ -652,13 +641,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 0, 0) = Avg3(a, b, c); Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d); Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e); - Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); + Dst(dst, stride, 3, 0) = + Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g); Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h); - Dst(dst, stride, 3, 3) = h; // differs from vp8 + Dst(dst, stride, 3, 3) = h; // differs from vp8 } - public static unsafe void D45ePredictor4x4(byte* dst, int stride, byte* above, byte* left) + public static unsafe void D45EPredictor4x4(byte* dst, int stride, byte* above, byte* left) { byte a = above[0]; byte b = above[1]; @@ -671,7 +661,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 0, 0) = Avg3(a, b, c); Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d); Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e); - Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); + Dst(dst, stride, 3, 0) = + Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g); Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h); Dst(dst, stride, 3, 3) = Avg3(g, h, h); @@ -714,7 +705,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 0, 3) = Avg3(j, k, l); Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(I, j, k); Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(x, I, j); - Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 
2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I); + Dst(dst, stride, 3, 3) = + Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I); Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(b, a, x); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(c, b, a); Dst(dst, stride, 3, 0) = Avg3(d, c, b); @@ -758,38 +750,39 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdD207Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r, c; - // First column. - for (r = 0; r < bs - 1; ++r) + for (int r = 0; r < bs - 1; ++r) { dst[r * stride] = Avg2(left[r], left[r + 1]); } + dst[(bs - 1) * stride] = left[bs - 1]; dst++; // Second column. - for (r = 0; r < bs - 2; ++r) + for (int r = 0; r < bs - 2; ++r) { dst[r * stride] = Avg3(left[r], left[r + 1], left[r + 2]); } + dst[(bs - 2) * stride] = Avg3(left[bs - 2], left[bs - 1], left[bs - 1]); dst[(bs - 1) * stride] = left[bs - 1]; dst++; // Rest of last row. 
- for (c = 0; c < bs - 2; ++c) + for (int c = 0; c < bs - 2; ++c) { - dst[(bs - 1) * stride + c] = left[bs - 1]; + dst[((bs - 1) * stride) + c] = left[bs - 1]; } - for (r = bs - 2; r >= 0; --r) + for (int r = bs - 2; r >= 0; --r) { - for (c = 0; c < bs - 2; ++c) + for (int c = 0; c < bs - 2; ++c) { - dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + dst[(r * stride) + c] = dst[((r + 1) * stride) + c - 2]; } } } @@ -809,21 +802,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdD63Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r, c; - int size; - for (c = 0; c < bs; ++c) + for (int c = 0; c < bs; ++c) { dst[c] = Avg2(above[c], above[c + 1]); dst[stride + c] = Avg3(above[c], above[c + 1], above[c + 2]); } - for (r = 2, size = bs - 2; r < bs; r += 2, --size) + + for (int r = 2, size = bs - 2; r < bs; r += 2, --size) { - MemoryUtil.Copy(dst + (r + 0) * stride, dst + (r >> 1), size); - MemoryUtil.Fill(dst + (r + 0) * stride + size, above[bs - 1], bs - size); - MemoryUtil.Copy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); - MemoryUtil.Fill(dst + (r + 1) * stride + size, above[bs - 1], bs - size); + MemoryUtil.Copy(dst + ((r + 0) * stride), dst + (r >> 1), size); + MemoryUtil.Fill(dst + ((r + 0) * stride) + size, above[bs - 1], bs - size); + MemoryUtil.Copy(dst + ((r + 1) * stride), dst + stride + (r >> 1), size); + MemoryUtil.Fill(dst + ((r + 1) * stride) + size, above[bs - 1], bs - size); } } @@ -842,19 +835,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdD45Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* 
above, ushort* left, + int bd) { ushort aboveRight = above[bs - 1]; ushort* dstRow0 = dst; - int x, size; - for (x = 0; x < bs - 1; ++x) + for (int x = 0; x < bs - 1; ++x) { dst[x] = Avg3(above[x], above[x + 1], above[x + 2]); } + dst[bs - 1] = aboveRight; dst += stride; - for (x = 1, size = bs - 2; x < bs; ++x, --size) + for (int x = 1, size = bs - 2; x < bs; ++x, --size) { MemoryUtil.Copy(dst, dstRow0 + x, size); MemoryUtil.Fill(dst + size, aboveRight, x + 1); @@ -877,12 +871,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdD117Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r, c; - // First row - for (c = 0; c < bs; c++) + for (int c = 0; c < bs; c++) { dst[c] = Avg2(above[c - 1], above[c]); } @@ -891,7 +884,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // Second row dst[0] = Avg3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) + for (int c = 1; c < bs; c++) { dst[c] = Avg3(above[c - 2], above[c - 1], above[c]); } @@ -900,17 +893,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // The rest of first col dst[0] = Avg3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) + for (int r = 3; r < bs; ++r) { dst[(r - 2) * stride] = Avg3(left[r - 3], left[r - 2], left[r - 1]); } // The rest of the block - for (r = 2; r < bs; ++r) + for (int r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) + for (int c = 1; c < bs; c++) { - dst[c] = dst[-2 * stride + c - 1]; + dst[c] = dst[(-2 * stride) + c - 1]; } dst += stride; @@ -932,28 +925,29 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdD135Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdD135Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdD135Predictor(ushort* dst, int 
stride, int bs, ushort* above, ushort* left, + int bd) { - int i; - ushort* border = stackalloc ushort[32 + 32 - 1]; // Outer border from bottom-left to top-right + ushort* border = stackalloc ushort[32 + 32 - 1]; // Outer border from bottom-left to top-right // Dst(dst, stride, bs, bs - 2)[0], i.e., border starting at bottom-left - for (i = 0; i < bs - 2; ++i) + for (int i = 0; i < bs - 2; ++i) { border[i] = Avg3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); } + border[bs - 2] = Avg3(above[-1], left[0], left[1]); border[bs - 1] = Avg3(left[0], above[-1], above[0]); border[bs - 0] = Avg3(above[-1], above[0], above[1]); // dst[0][2, size), i.e., remaining top border ascending - for (i = 0; i < bs - 2; ++i) + for (int i = 0; i < bs - 2; ++i) { border[bs + 1 + i] = Avg3(above[i], above[i + 1], above[i + 2]); } - for (i = 0; i < bs; ++i) + for (int i = 0; i < bs; ++i) { - MemoryUtil.Copy(dst + i * stride, border + bs - 1 - i, bs); + MemoryUtil.Copy(dst + (i * stride), border + bs - 1 - i, bs); } } @@ -972,11 +966,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdD153Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r, c; dst[0] = Avg2(above[-1], left[0]); - for (r = 1; r < bs; r++) + for (int r = 1; r < bs; r++) { dst[r * stride] = Avg2(left[r - 1], left[r]); } @@ -985,23 +979,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp dst[0] = Avg3(left[0], above[-1], above[0]); dst[stride] = Avg3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) + for (int r = 2; r < bs; r++) { dst[r * stride] = Avg3(left[r - 2], left[r - 1], left[r]); } dst++; - for (c = 0; c < bs - 2; c++) + for (int c = 0; c < bs - 2; c++) { dst[c] = Avg3(above[c - 1], above[c], above[c + 1]); } dst += stride; - for (r = 1; r < bs; ++r) + for (int r = 
1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) + for (int c = 0; c < bs - 2; c++) { dst[c] = dst[-stride + c - 2]; } @@ -1030,10 +1024,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdVPredictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Copy(dst, above, bs); dst += stride; @@ -1060,44 +1054,44 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdHPredictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, left[r], bs); dst += stride; } } - public static unsafe void HighbdTMPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdTmPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) { - HighbdTMPredictor(dst, stride, 4, above, left, bd); + HighbdTmPredictor(dst, stride, 4, above, left, bd); } - public static unsafe void HighbdTMPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdTmPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd) { - HighbdTMPredictor(dst, stride, 8, above, left, bd); + HighbdTmPredictor(dst, stride, 8, above, left, bd); } - public static unsafe void HighbdTMPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdTmPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) { - HighbdTMPredictor(dst, stride, 
16, above, left, bd); + HighbdTmPredictor(dst, stride, 16, above, left, bd); } - public static unsafe void HighbdTMPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdTmPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) { - HighbdTMPredictor(dst, stride, 32, above, left, bd); + HighbdTmPredictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdTMPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdTmPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r, c; int yTopLeft = above[-1]; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) + for (int c = 0; c < bs; c++) { dst[c] = BitUtils.ClipPixelHighbd(left[r] + above[c] - yTopLeft, bd); } @@ -1116,21 +1110,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdDc128Predictor(dst, stride, 8, above, left, bd); } - public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, + int bd) { HighbdDc128Predictor(dst, stride, 16, above, left, bd); } - public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, + int bd) { HighbdDc128Predictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int r; - - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (ushort)(128 << (bd - 8)), bs); dst += stride; @@ -1147,28 
+1142,31 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdDcLeftPredictor(dst, stride, 8, above, left, bd); } - public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, + int bd) { HighbdDcLeftPredictor(dst, stride, 16, above, left, bd); } - public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, + int bd) { HighbdDcLeftPredictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int i, r, expectedDc, sum = 0; + int expectedDc, sum = 0; - for (i = 0; i < bs; i++) + for (int i = 0; i < bs; i++) { sum += left[i]; } expectedDc = (sum + (bs >> 1)) / bs; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (ushort)expectedDc, bs); dst += stride; @@ -1185,28 +1183,31 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdDcTopPredictor(dst, stride, 8, above, left, bd); } - public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, + int bd) { HighbdDcTopPredictor(dst, stride, 16, above, left, bd); } - public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd) + public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, + int bd) { HighbdDcTopPredictor(dst, stride, 32, above, left, bd); } - private static unsafe void 
HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int i, r, expectedDc, sum = 0; + int expectedDc, sum = 0; - for (i = 0; i < bs; i++) + for (int i = 0; i < bs; i++) { sum += above[i]; } expectedDc = (sum + (bs >> 1)) / bs; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (ushort)expectedDc, bs); dst += stride; @@ -1233,12 +1234,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp HighbdDcPredictor(dst, stride, 32, above, left, bd); } - private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd) + private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, + int bd) { - int i, r, expectedDc, sum = 0; + int expectedDc, sum = 0; int count = 2 * bs; - for (i = 0; i < bs; i++) + for (int i = 0; i < bs; i++) { sum += above[i]; sum += left[i]; @@ -1246,7 +1248,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp expectedDc = (sum + (count >> 1)) / count; - for (r = 0; r < bs; r++) + for (int r = 0; r < bs; r++) { MemoryUtil.Fill(dst, (ushort)expectedDc, bs); dst += stride; @@ -1265,7 +1267,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 1, 0) = Avg3(I, j, k); Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(j, k, l); Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(k, l, l); - Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l; + Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = + Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l; } public static unsafe void HighbdD63Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) @@ -1281,13 +1284,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp 
Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 2) = Avg2(b, c); Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 2) = Avg2(c, d); Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 2) = Avg2(d, e); - Dst(dst, stride, 3, 2) = Avg2(e, f); // Differs from vp8 + Dst(dst, stride, 3, 2) = Avg2(e, f); // Differs from vp8 Dst(dst, stride, 0, 1) = Avg3(a, b, c); Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 3) = Avg3(b, c, d); Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 3) = Avg3(c, d, e); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 3) = Avg3(d, e, f); - Dst(dst, stride, 3, 3) = Avg3(e, f, g); // Differs from vp8 + Dst(dst, stride, 3, 3) = Avg3(e, f, g); // Differs from vp8 } public static unsafe void HighbdD45Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) @@ -1303,10 +1306,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 0, 0) = Avg3(a, b, c); Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d); Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e); - Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); + Dst(dst, stride, 3, 0) = + Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g); Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h); - Dst(dst, stride, 3, 3) = h; // Differs from vp8 + Dst(dst, stride, 3, 3) = h; // Differs from vp8 } public static unsafe void HighbdD117Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd) @@ -1346,7 +1350,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 0, 3) = Avg3(j, k, l); Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(I, j, k); Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(x, I, j); - Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = 
Avg3(a, x, I); + Dst(dst, stride, 3, 3) = + Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I); Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(b, a, x); Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(c, b, a); Dst(dst, stride, 3, 0) = Avg3(d, c, b); @@ -1376,4 +1381,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Dst(dst, stride, 1, 3) = Avg3(l, k, j); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs index 3fc3c72a7..d93ff25f7 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs @@ -10,13 +10,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse // transform amplify bits + 1 bit for contingency in rounding and quantizing - private const int HighbdValidTxfmMagnitudeRange = (1 << 25); + private const int HighbdValidTxfmMagnitudeRange = 1 << 25; [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int DetectInvalidHighbdInput(ReadOnlySpan input, int size) { - int i; - for (i = 0; i < size; ++i) + for (int i = 0; i < size; ++i) { if (Math.Abs(input[i]) >= HighbdValidTxfmMagnitudeRange) { @@ -92,13 +91,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. 
*/ - int i; + Span output = stackalloc int[16]; long a1, b1, c1, d1, e1; ReadOnlySpan ip = input; Span op = output; - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { a1 = ip[0] >> UnitQuantShift; c1 = ip[1] >> UnitQuantShift; @@ -120,7 +119,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } Span ip2 = output; - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { a1 = ip2[4 * 0]; c1 = ip2[4 * 1]; @@ -146,7 +145,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Iwht4x41Add(ReadOnlySpan input, Span dest, int stride) { - int i; long a1, e1; Span tmp = stackalloc int[4]; ReadOnlySpan ip = input; @@ -159,7 +157,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp op[1] = op[2] = op[3] = WrapLow(e1); Span ip2 = tmp; - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { e1 = ip2[0] >> 1; a1 = ip2[0] - e1; @@ -182,24 +180,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if ((x0 | x1 | x2 | x3) == 0) { - output.Slice(0, 4).Fill(0); + output.Slice(0, 4).Clear(); return; } // 32-bit result is enough for the following multiplications. - s0 = SinPi1_9 * x0; - s1 = SinPi2_9 * x0; - s2 = SinPi3_9 * x1; - s3 = SinPi4_9 * x2; - s4 = SinPi1_9 * x2; - s5 = SinPi2_9 * x3; - s6 = SinPi4_9 * x3; + s0 = SinPi19 * x0; + s1 = SinPi29 * x0; + s2 = SinPi39 * x1; + s3 = SinPi49 * x2; + s4 = SinPi19 * x2; + s5 = SinPi29 * x3; + s6 = SinPi49 * x3; s7 = WrapLow(x0 - x2 + x3); s0 = s0 + s3 + s5; s1 = s1 - s4 - s6; s3 = s2; - s2 = SinPi3_9 * s7; + s2 = SinPi39 * s7; // 1-D transform scaling factor is sqrt(2). 
// The overall dynamic range is 14b (input) + 14b (multiplication scaling) @@ -218,12 +216,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp long temp1, temp2; // stage 1 - temp1 = ((short)input[0] + (short)input[2]) * CosPi16_64; - temp2 = ((short)input[0] - (short)input[2]) * CosPi16_64; + temp1 = ((short)input[0] + (short)input[2]) * CosPi1664; + temp2 = ((short)input[0] - (short)input[2]) * CosPi1664; step[0] = (short)WrapLow(DctConstRoundShift(temp1)); step[1] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[1] * CosPi24_64 - (short)input[3] * CosPi8_64; - temp2 = (short)input[1] * CosPi8_64 + (short)input[3] * CosPi24_64; + temp1 = ((short)input[1] * CosPi2464) - ((short)input[3] * CosPi864); + temp2 = ((short)input[1] * CosPi864) + ((short)input[3] * CosPi2464); step[2] = (short)WrapLow(DctConstRoundShift(temp1)); step[3] = (short)WrapLow(DctConstRoundShift(temp2)); @@ -237,14 +235,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct4x416Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[4 * 4]; Span outptr = output; Span tempIn = stackalloc int[4]; Span tempOut = stackalloc int[4]; // Rows - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { Idct4(input, outptr); input = input.Slice(4); @@ -252,31 +249,31 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - tempIn[j] = output[j * 4 + i]; + tempIn[j] = output[(j * 4) + i]; } Idct4(tempIn, tempOut); - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); } } } public static void Idct4x41Add(ReadOnlySpan input, Span dest, int stride) { - int i; long a1; - int output = 
WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi1664)); - output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + output = WrapLow(DctConstRoundShift(output * CosPi1664)); a1 = BitUtils.RoundPowerOfTwo(output, 4); - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { dest[0] = ClipPixelAdd(dest[0], a1); dest[1] = ClipPixelAdd(dest[1], a1); @@ -300,19 +297,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) { - output.Slice(0, 8).Fill(0); + output.Slice(0, 8).Clear(); return; } // stage 1 - s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1); - s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1); - s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3); - s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3); - s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5); - s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5); - s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7); - s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7); + s0 = (int)((CosPi264 * x0) + (CosPi3064 * x1)); + s1 = (int)((CosPi3064 * x0) - (CosPi264 * x1)); + s2 = (int)((CosPi1064 * x2) + (CosPi2264 * x3)); + s3 = (int)((CosPi2264 * x2) - (CosPi1064 * x3)); + s4 = (int)((CosPi1864 * x4) + (CosPi1464 * x5)); + s5 = (int)((CosPi1464 * x4) - (CosPi1864 * x5)); + s6 = (int)((CosPi2664 * x6) + (CosPi664 * x7)); + s7 = (int)((CosPi664 * x6) - (CosPi2664 * x7)); x0 = WrapLow(DctConstRoundShift(s0 + s4)); x1 = WrapLow(DctConstRoundShift(s1 + s5)); @@ -328,10 +325,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp s1 = (int)x1; s2 = (int)x2; s3 = (int)x3; - s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5); - s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5); - s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7); - s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7); + s4 = (int)((CosPi864 * x4) + (CosPi2464 * x5)); + s5 = (int)((CosPi2464 * x4) - (CosPi864 * x5)); + s6 = (int)((-CosPi2464 * x6) + (CosPi864 * x7)); + s7 = (int)((CosPi864 * x6) + (CosPi2464 * x7)); 
x0 = WrapLow(s0 + s2); x1 = WrapLow(s1 + s3); @@ -343,10 +340,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp x7 = WrapLow(DctConstRoundShift(s5 - s7)); // stage 3 - s2 = (int)(CosPi16_64 * (x2 + x3)); - s3 = (int)(CosPi16_64 * (x2 - x3)); - s6 = (int)(CosPi16_64 * (x6 + x7)); - s7 = (int)(CosPi16_64 * (x6 - x7)); + s2 = (int)(CosPi1664 * (x2 + x3)); + s3 = (int)(CosPi1664 * (x2 - x3)); + s6 = (int)(CosPi1664 * (x6 + x7)); + s7 = (int)(CosPi1664 * (x6 - x7)); x2 = WrapLow(DctConstRoundShift(s2)); x3 = WrapLow(DctConstRoundShift(s3)); @@ -375,22 +372,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = (short)input[4]; step1[1] = (short)input[2]; step1[3] = (short)input[6]; - temp1 = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64; - temp2 = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64; + temp1 = ((short)input[1] * CosPi2864) - ((short)input[7] * CosPi464); + temp2 = ((short)input[1] * CosPi464) + ((short)input[7] * CosPi2864); step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64; - temp2 = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64; + temp1 = ((short)input[5] * CosPi1264) - ((short)input[3] * CosPi2064); + temp2 = ((short)input[5] * CosPi2064) + ((short)input[3] * CosPi1264); step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); // stage 2 - temp1 = (step1[0] + step1[2]) * CosPi16_64; - temp2 = (step1[0] - step1[2]) * CosPi16_64; + temp1 = (step1[0] + step1[2]) * CosPi1664; + temp2 = (step1[0] - step1[2]) * CosPi1664; step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[1] * CosPi24_64 - step1[3] * CosPi8_64; - temp2 = step1[1] * CosPi8_64 + step1[3] * CosPi24_64; + temp1 = (step1[1] * CosPi2464) - (step1[3] * CosPi864); + temp2 = (step1[1] * CosPi864) + (step1[3] * 
CosPi2464); step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); step2[4] = (short)WrapLow(step1[4] + step1[5]); @@ -404,8 +401,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = (short)WrapLow(step2[1] - step2[2]); step1[3] = (short)WrapLow(step2[0] - step2[3]); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * CosPi16_64; - temp2 = (step2[5] + step2[6]) * CosPi16_64; + temp1 = (step2[6] - step2[5]) * CosPi1664; + temp2 = (step2[5] + step2[6]) * CosPi1664; step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); step1[7] = step2[7]; @@ -424,14 +421,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct8x864Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[8 * 8]; Span outptr = output; Span tempIn = stackalloc int[8]; Span tempOut = stackalloc int[8]; // First transform rows - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { Idct8(input, outptr); input = input.Slice(8); @@ -439,18 +435,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - tempIn[j] = output[j * 8 + i]; + tempIn[j] = output[(j * 8) + i]; } Idct8(tempIn, tempOut); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], - BitUtils.RoundPowerOfTwo(tempOut[j], 5)); + dest[(j * stride) + i] = ClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 5)); } } } @@ -458,17 +454,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct8x812Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[8 * 8]; Span outptr = output; Span tempIn = stackalloc int[8]; Span tempOut = stackalloc int[8]; - output.Fill(0); + output.Clear(); // First 
transform rows // Only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { Idct8(input, outptr); input = input.Slice(8); @@ -476,32 +471,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - tempIn[j] = output[j * 8 + i]; + tempIn[j] = output[(j * 8) + i]; } Idct8(tempIn, tempOut); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); } } } public static void Idct8x81Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; long a1; - int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi1664)); - output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + output = WrapLow(DctConstRoundShift(output * CosPi1664)); a1 = BitUtils.RoundPowerOfTwo(output, 5); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { dest[i] = ClipPixelAdd(dest[i], a1); } @@ -533,27 +528,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) { - output.Slice(0, 16).Fill(0); + output.Slice(0, 16).Clear(); return; } // stage 1 - s0 = x0 * CosPi1_64 + x1 * CosPi31_64; - s1 = x0 * CosPi31_64 - x1 * CosPi1_64; - s2 = x2 * CosPi5_64 + x3 * CosPi27_64; - s3 = x2 * CosPi27_64 - x3 * CosPi5_64; - s4 = x4 * CosPi9_64 + x5 * CosPi23_64; - s5 = x4 * CosPi23_64 - x5 * CosPi9_64; - s6 = x6 * CosPi13_64 + x7 * CosPi19_64; - s7 = x6 * CosPi19_64 - x7 * CosPi13_64; - s8 = x8 * CosPi17_64 + x9 * CosPi15_64; - s9 = x8 * CosPi15_64 - x9 * CosPi17_64; - s10 = x10 * CosPi21_64 + x11 * CosPi11_64; - 
s11 = x10 * CosPi11_64 - x11 * CosPi21_64; - s12 = x12 * CosPi25_64 + x13 * CosPi7_64; - s13 = x12 * CosPi7_64 - x13 * CosPi25_64; - s14 = x14 * CosPi29_64 + x15 * CosPi3_64; - s15 = x14 * CosPi3_64 - x15 * CosPi29_64; + s0 = (x0 * CosPi164) + (x1 * CosPi3164); + s1 = (x0 * CosPi3164) - (x1 * CosPi164); + s2 = (x2 * CosPi564) + (x3 * CosPi2764); + s3 = (x2 * CosPi2764) - (x3 * CosPi564); + s4 = (x4 * CosPi964) + (x5 * CosPi2364); + s5 = (x4 * CosPi2364) - (x5 * CosPi964); + s6 = (x6 * CosPi1364) + (x7 * CosPi1964); + s7 = (x6 * CosPi1964) - (x7 * CosPi1364); + s8 = (x8 * CosPi1764) + (x9 * CosPi1564); + s9 = (x8 * CosPi1564) - (x9 * CosPi1764); + s10 = (x10 * CosPi2164) + (x11 * CosPi1164); + s11 = (x10 * CosPi1164) - (x11 * CosPi2164); + s12 = (x12 * CosPi2564) + (x13 * CosPi764); + s13 = (x12 * CosPi764) - (x13 * CosPi2564); + s14 = (x14 * CosPi2964) + (x15 * CosPi364); + s15 = (x14 * CosPi364) - (x15 * CosPi2964); x0 = WrapLow(DctConstRoundShift(s0 + s8)); x1 = WrapLow(DctConstRoundShift(s1 + s9)); @@ -581,14 +576,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * CosPi4_64 + x9 * CosPi28_64; - s9 = x8 * CosPi28_64 - x9 * CosPi4_64; - s10 = x10 * CosPi20_64 + x11 * CosPi12_64; - s11 = x10 * CosPi12_64 - x11 * CosPi20_64; - s12 = -x12 * CosPi28_64 + x13 * CosPi4_64; - s13 = x12 * CosPi4_64 + x13 * CosPi28_64; - s14 = -x14 * CosPi12_64 + x15 * CosPi20_64; - s15 = x14 * CosPi20_64 + x15 * CosPi12_64; + s8 = (x8 * CosPi464) + (x9 * CosPi2864); + s9 = (x8 * CosPi2864) - (x9 * CosPi464); + s10 = (x10 * CosPi2064) + (x11 * CosPi1264); + s11 = (x10 * CosPi1264) - (x11 * CosPi2064); + s12 = (-x12 * CosPi2864) + (x13 * CosPi464); + s13 = (x12 * CosPi464) + (x13 * CosPi2864); + s14 = (-x14 * CosPi1264) + (x15 * CosPi2064); + s15 = (x14 * CosPi2064) + (x15 * CosPi1264); x0 = WrapLow(s0 + s4); x1 = WrapLow(s1 + s5); @@ -612,18 +607,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * CosPi8_64 + x5 * 
CosPi24_64; - s5 = x4 * CosPi24_64 - x5 * CosPi8_64; - s6 = -x6 * CosPi24_64 + x7 * CosPi8_64; - s7 = x6 * CosPi8_64 + x7 * CosPi24_64; + s4 = (x4 * CosPi864) + (x5 * CosPi2464); + s5 = (x4 * CosPi2464) - (x5 * CosPi864); + s6 = (-x6 * CosPi2464) + (x7 * CosPi864); + s7 = (x6 * CosPi864) + (x7 * CosPi2464); s8 = x8; s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * CosPi8_64 + x13 * CosPi24_64; - s13 = x12 * CosPi24_64 - x13 * CosPi8_64; - s14 = -x14 * CosPi24_64 + x15 * CosPi8_64; - s15 = x14 * CosPi8_64 + x15 * CosPi24_64; + s12 = (x12 * CosPi864) + (x13 * CosPi2464); + s13 = (x12 * CosPi2464) - (x13 * CosPi864); + s14 = (-x14 * CosPi2464) + (x15 * CosPi864); + s15 = (x14 * CosPi864) + (x15 * CosPi2464); x0 = WrapLow(s0 + s2); x1 = WrapLow(s1 + s3); @@ -643,14 +638,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp x15 = WrapLow(DctConstRoundShift(s13 - s15)); // stage 4 - s2 = (-CosPi16_64) * (x2 + x3); - s3 = CosPi16_64 * (x2 - x3); - s6 = CosPi16_64 * (x6 + x7); - s7 = CosPi16_64 * (-x6 + x7); - s10 = CosPi16_64 * (x10 + x11); - s11 = CosPi16_64 * (-x10 + x11); - s14 = (-CosPi16_64) * (x14 + x15); - s15 = CosPi16_64 * (x14 - x15); + s2 = -CosPi1664 * (x2 + x3); + s3 = CosPi1664 * (x2 - x3); + s6 = CosPi1664 * (x6 + x7); + s7 = CosPi1664 * (-x6 + x7); + s10 = CosPi1664 * (x10 + x11); + s11 = CosPi1664 * (-x10 + x11); + s14 = -CosPi1664 * (x14 + x15); + s15 = CosPi1664 * (x14 - x15); x2 = WrapLow(DctConstRoundShift(s2)); x3 = WrapLow(DctConstRoundShift(s3)); @@ -714,23 +709,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64; - temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64; + temp1 = (step1[8] * CosPi3064) - (step1[15] * CosPi264); + temp2 = (step1[8] * CosPi264) + (step1[15] * CosPi3064); step2[8] = (short)WrapLow(DctConstRoundShift(temp1)); step2[15] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64; - temp2 = step1[9] 
* CosPi18_64 + step1[14] * CosPi14_64; + temp1 = (step1[9] * CosPi1464) - (step1[14] * CosPi1864); + temp2 = (step1[9] * CosPi1864) + (step1[14] * CosPi1464); step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64; - temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64; + temp1 = (step1[10] * CosPi2264) - (step1[13] * CosPi1064); + temp2 = (step1[10] * CosPi1064) + (step1[13] * CosPi2264); step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64; - temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64; + temp1 = (step1[11] * CosPi664) - (step1[12] * CosPi2664); + temp2 = (step1[11] * CosPi2664) + (step1[12] * CosPi664); step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); @@ -740,12 +735,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64; - temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64; + temp1 = (step2[4] * CosPi2864) - (step2[7] * CosPi464); + temp2 = (step2[4] * CosPi464) + (step2[7] * CosPi2864); step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64; - temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64; + temp1 = (step2[5] * CosPi1264) - (step2[6] * CosPi2064); + temp2 = (step2[5] * CosPi2064) + (step2[6] * CosPi1264); step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); @@ -759,12 +754,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[15] = (short)WrapLow(step2[14] + step2[15]); // stage 4 - temp1 = (step1[0] + step1[1]) * CosPi16_64; - temp2 = (step1[0] - step1[1]) * CosPi16_64; + temp1 = (step1[0] + step1[1]) * 
CosPi1664; + temp2 = (step1[0] - step1[1]) * CosPi1664; step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64; - temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64; + temp1 = (step1[2] * CosPi2464) - (step1[3] * CosPi864); + temp2 = (step1[2] * CosPi864) + (step1[3] * CosPi2464); step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); step2[4] = (short)WrapLow(step1[4] + step1[5]); @@ -774,12 +769,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64; - temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64; + temp1 = (-step1[9] * CosPi864) + (step1[14] * CosPi2464); + temp2 = (step1[9] * CosPi2464) + (step1[14] * CosPi864); step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64; - temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64; + temp1 = (-step1[10] * CosPi2464) - (step1[13] * CosPi864); + temp2 = (-step1[10] * CosPi864) + (step1[13] * CosPi2464); step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); step2[11] = step1[11]; @@ -791,8 +786,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = (short)WrapLow(step2[1] - step2[2]); step1[3] = (short)WrapLow(step2[0] - step2[3]); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * CosPi16_64; - temp2 = (step2[5] + step2[6]) * CosPi16_64; + temp1 = (step2[6] - step2[5]) * CosPi1664; + temp2 = (step2[5] + step2[6]) * CosPi1664; step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); step1[7] = step2[7]; @@ -817,12 +812,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[7] = (short)WrapLow(step1[0] - step1[7]); step2[8] = step1[8]; 
step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * CosPi16_64; - temp2 = (step1[10] + step1[13]) * CosPi16_64; + temp1 = (-step1[10] + step1[13]) * CosPi1664; + temp2 = (step1[10] + step1[13]) * CosPi1664; step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (-step1[11] + step1[12]) * CosPi16_64; - temp2 = (step1[11] + step1[12]) * CosPi16_64; + temp1 = (-step1[11] + step1[12]) * CosPi1664; + temp2 = (step1[11] + step1[12]) * CosPi1664; step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); step2[14] = step1[14]; @@ -850,14 +845,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct16x16256Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; // First transform rows - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { Idct16(input, outptr); input = input.Slice(16); @@ -865,17 +859,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } Idct16(tempIn, tempOut); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } @@ -883,17 +878,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct16x1638Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; - output.Fill(0); + 
output.Clear(); // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { Idct16(input, outptr); input = input.Slice(16); @@ -901,17 +895,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } Idct16(tempIn, tempOut); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } @@ -919,17 +914,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct16x1610Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; - output.Fill(0); + output.Clear(); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
- for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { Idct16(input, outptr); input = input.Slice(16); @@ -937,32 +931,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } Idct16(tempIn, tempOut); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } public static void Idct16x161Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; long a1; - int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi1664)); - output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + output = WrapLow(DctConstRoundShift(output * CosPi1664)); a1 = BitUtils.RoundPowerOfTwo(output, 6); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { dest[i] = ClipPixelAdd(dest[i], a1); } @@ -996,43 +990,43 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[14] = (short)input[14]; step1[15] = (short)input[30]; - temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64; - temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64; + temp1 = ((short)input[1] * CosPi3164) - ((short)input[31] * CosPi164); + temp2 = ((short)input[1] * CosPi164) + ((short)input[31] * CosPi3164); step1[16] = (short)WrapLow(DctConstRoundShift(temp1)); step1[31] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64; - temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64; + temp1 = ((short)input[17] * CosPi1564) - ((short)input[15] * CosPi1764); 
+ temp2 = ((short)input[17] * CosPi1764) + ((short)input[15] * CosPi1564); step1[17] = (short)WrapLow(DctConstRoundShift(temp1)); step1[30] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64; - temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64; + temp1 = ((short)input[9] * CosPi2364) - ((short)input[23] * CosPi964); + temp2 = ((short)input[9] * CosPi964) + ((short)input[23] * CosPi2364); step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64; - temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64; + temp1 = ((short)input[25] * CosPi764) - ((short)input[7] * CosPi2564); + temp2 = ((short)input[25] * CosPi2564) + ((short)input[7] * CosPi764); step1[19] = (short)WrapLow(DctConstRoundShift(temp1)); step1[28] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64; - temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64; + temp1 = ((short)input[5] * CosPi2764) - ((short)input[27] * CosPi564); + temp2 = ((short)input[5] * CosPi564) + ((short)input[27] * CosPi2764); step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64; - temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64; + temp1 = ((short)input[21] * CosPi1164) - ((short)input[11] * CosPi2164); + temp2 = ((short)input[21] * CosPi2164) + ((short)input[11] * CosPi1164); step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64; - temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64; + temp1 = ((short)input[13] * CosPi1964) - ((short)input[19] * 
CosPi1364); + temp2 = ((short)input[13] * CosPi1364) + ((short)input[19] * CosPi1964); step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64; - temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64; + temp1 = ((short)input[29] * CosPi364) - ((short)input[3] * CosPi2964); + temp2 = ((short)input[29] * CosPi2964) + ((short)input[3] * CosPi364); step1[23] = (short)WrapLow(DctConstRoundShift(temp1)); step1[24] = (short)WrapLow(DctConstRoundShift(temp2)); @@ -1046,23 +1040,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64; - temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64; + temp1 = (step1[8] * CosPi3064) - (step1[15] * CosPi264); + temp2 = (step1[8] * CosPi264) + (step1[15] * CosPi3064); step2[8] = (short)WrapLow(DctConstRoundShift(temp1)); step2[15] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64; - temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64; + temp1 = (step1[9] * CosPi1464) - (step1[14] * CosPi1864); + temp2 = (step1[9] * CosPi1864) + (step1[14] * CosPi1464); step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64; - temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64; + temp1 = (step1[10] * CosPi2264) - (step1[13] * CosPi1064); + temp2 = (step1[10] * CosPi1064) + (step1[13] * CosPi2264); step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64; - temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64; + temp1 = (step1[11] * CosPi664) - (step1[12] * CosPi2664); + temp2 = (step1[11] * CosPi2664) + (step1[12] * CosPi664); step2[11] = 
(short)WrapLow(DctConstRoundShift(temp1)); step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); @@ -1089,12 +1083,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64; - temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64; + temp1 = (step2[4] * CosPi2864) - (step2[7] * CosPi464); + temp2 = (step2[4] * CosPi464) + (step2[7] * CosPi2864); step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64; - temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64; + temp1 = (step2[5] * CosPi1264) - (step2[6] * CosPi2064); + temp2 = (step2[5] * CosPi2064) + (step2[6] * CosPi1264); step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); @@ -1109,22 +1103,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64; - temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64; + temp1 = (-step2[17] * CosPi464) + (step2[30] * CosPi2864); + temp2 = (step2[17] * CosPi2864) + (step2[30] * CosPi464); step1[17] = (short)WrapLow(DctConstRoundShift(temp1)); step1[30] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64; - temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64; + temp1 = (-step2[18] * CosPi2864) - (step2[29] * CosPi464); + temp2 = (-step2[18] * CosPi464) + (step2[29] * CosPi2864); step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64; - temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64; + temp1 = (-step2[21] * CosPi2064) + (step2[26] * CosPi1264); + temp2 = (step2[21] * CosPi1264) + (step2[26] * CosPi2064); step1[21] = 
(short)WrapLow(DctConstRoundShift(temp1)); step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64; - temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64; + temp1 = (-step2[22] * CosPi1264) - (step2[25] * CosPi2064); + temp2 = (-step2[22] * CosPi2064) + (step2[25] * CosPi1264); step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); step1[23] = step2[23]; @@ -1133,12 +1127,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * CosPi16_64; - temp2 = (step1[0] - step1[1]) * CosPi16_64; + temp1 = (step1[0] + step1[1]) * CosPi1664; + temp2 = (step1[0] - step1[1]) * CosPi1664; step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64; - temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64; + temp1 = (step1[2] * CosPi2464) - (step1[3] * CosPi864); + temp2 = (step1[2] * CosPi864) + (step1[3] * CosPi2464); step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); step2[4] = (short)WrapLow(step1[4] + step1[5]); @@ -1148,12 +1142,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64; - temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64; + temp1 = (-step1[9] * CosPi864) + (step1[14] * CosPi2464); + temp2 = (step1[9] * CosPi2464) + (step1[14] * CosPi864); step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64; - temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64; + temp1 = (-step1[10] * CosPi2464) - (step1[13] * CosPi864); + temp2 = (-step1[10] * CosPi864) + (step1[13] * CosPi2464); step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 
step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); step2[11] = step1[11]; @@ -1183,8 +1177,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = (short)WrapLow(step2[1] - step2[2]); step1[3] = (short)WrapLow(step2[0] - step2[3]); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * CosPi16_64; - temp2 = (step2[5] + step2[6]) * CosPi16_64; + temp1 = (step2[6] - step2[5]) * CosPi1664; + temp2 = (step2[5] + step2[6]) * CosPi1664; step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); step1[7] = step2[7]; @@ -1200,20 +1194,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64; - temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64; + temp1 = (-step2[18] * CosPi864) + (step2[29] * CosPi2464); + temp2 = (step2[18] * CosPi2464) + (step2[29] * CosPi864); step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64; - temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64; + temp1 = (-step2[19] * CosPi864) + (step2[28] * CosPi2464); + temp2 = (step2[19] * CosPi2464) + (step2[28] * CosPi864); step1[19] = (short)WrapLow(DctConstRoundShift(temp1)); step1[28] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64; - temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64; + temp1 = (-step2[20] * CosPi2464) - (step2[27] * CosPi864); + temp2 = (-step2[20] * CosPi864) + (step2[27] * CosPi2464); step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64; - temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64; + temp1 = (-step2[21] * CosPi2464) - (step2[26] * CosPi864); + temp2 = (-step2[21] * CosPi864) + (step2[26] * CosPi2464); step1[21] = 
(short)WrapLow(DctConstRoundShift(temp1)); step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); step1[22] = step2[22]; @@ -1234,12 +1228,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[7] = (short)WrapLow(step1[0] - step1[7]); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * CosPi16_64; - temp2 = (step1[10] + step1[13]) * CosPi16_64; + temp1 = (-step1[10] + step1[13]) * CosPi1664; + temp2 = (step1[10] + step1[13]) * CosPi1664; step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (-step1[11] + step1[12]) * CosPi16_64; - temp2 = (step1[11] + step1[12]) * CosPi16_64; + temp1 = (-step1[11] + step1[12]) * CosPi1664; + temp2 = (step1[11] + step1[12]) * CosPi1664; step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); step2[14] = step1[14]; @@ -1285,20 +1279,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * CosPi16_64; - temp2 = (step2[20] + step2[27]) * CosPi16_64; + temp1 = (-step2[20] + step2[27]) * CosPi1664; + temp2 = (step2[20] + step2[27]) * CosPi1664; step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (-step2[21] + step2[26]) * CosPi16_64; - temp2 = (step2[21] + step2[26]) * CosPi16_64; + temp1 = (-step2[21] + step2[26]) * CosPi1664; + temp2 = (step2[21] + step2[26]) * CosPi1664; step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (-step2[22] + step2[25]) * CosPi16_64; - temp2 = (step2[22] + step2[25]) * CosPi16_64; + temp1 = (-step2[22] + step2[25]) * CosPi1664; + temp2 = (step2[22] + step2[25]) * CosPi1664; step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); - temp1 = (-step2[23] + step2[24]) * CosPi16_64; - temp2 
= (step2[23] + step2[24]) * CosPi16_64; + temp1 = (-step2[23] + step2[24]) * CosPi1664; + temp2 = (step2[23] + step2[24]) * CosPi1664; step1[23] = (short)WrapLow(DctConstRoundShift(temp1)); step1[24] = (short)WrapLow(DctConstRoundShift(temp2)); step1[28] = step2[28]; @@ -1344,17 +1338,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct32x321024Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[32 * 32]; Span outptr = output; Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; // Rows - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { short zeroCoeff = 0; - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { zeroCoeff |= (short)input[j]; } @@ -1365,7 +1358,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } else { - outptr.Slice(0, 32).Fill(0); + outptr.Slice(0, 32).Clear(); } input = input.Slice(32); @@ -1373,17 +1366,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - tempIn[j] = output[j * 32 + i]; + tempIn[j] = output[(j * 32) + i]; } Idct32(tempIn, tempOut); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } @@ -1391,17 +1385,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct32x32135Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[32 * 32]; Span outptr = output; Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; - output.Fill(0); + output.Clear(); // Rows // Only upper-left 16x16 has non-zero coeff - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { Idct32(input, outptr); input = input.Slice(32); @@ -1409,17 
+1402,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - tempIn[j] = output[j * 32 + i]; + tempIn[j] = output[(j * 32) + i]; } Idct32(tempIn, tempOut); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } @@ -1427,17 +1421,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void Idct32x3234Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; Span output = stackalloc int[32 * 32]; Span outptr = output; Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; - output.Fill(0); + output.Clear(); // Rows // Only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { Idct32(input, outptr); input = input.Slice(32); @@ -1445,33 +1438,33 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - tempIn[j] = output[j * 32 + i]; + tempIn[j] = output[(j * 32) + i]; } Idct32(tempIn, tempOut); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } public static void Idct32x321Add(ReadOnlySpan input, Span dest, int stride) { - int i, j; long a1; - int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); + int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi1664)); - output = WrapLow(DctConstRoundShift(output * CosPi16_64)); + output = WrapLow(DctConstRoundShift(output * CosPi1664)); a1 = 
BitUtils.RoundPowerOfTwo(output, 6); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { dest[i] = ClipPixelAdd(dest[i], a1); } @@ -1485,13 +1478,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 0.5 shifts per pixel. */ - int i; + Span output = stackalloc int[16]; long a1, b1, c1, d1, e1; ReadOnlySpan ip = input; Span op = output; - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { a1 = ip[0] >> UnitQuantShift; c1 = ip[1] >> UnitQuantShift; @@ -1513,7 +1506,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } ReadOnlySpan ip2 = output; - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { a1 = ip2[4 * 0]; c1 = ip2[4 * 1]; @@ -1539,7 +1532,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIwht4x41Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i; long a1, e1; Span tmp = stackalloc int[4]; ReadOnlySpan ip = input; @@ -1552,7 +1544,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp op[1] = op[2] = op[3] = HighbdWrapLow(e1, bd); ReadOnlySpan ip2 = tmp; - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { e1 = ip2[0] >> 1; a1 = ip2[0] - e1; @@ -1576,29 +1568,29 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 4) != 0) { Debug.Assert(false, "invalid highbd txfm input"); - output.Slice(0, 4).Fill(0); + output.Slice(0, 4).Clear(); return; } if ((x0 | x1 | x2 | x3) == 0) { - output.Slice(0, 4).Fill(0); + output.Slice(0, 4).Clear(); return; } - s0 = (long)SinPi1_9 * x0; - s1 = (long)SinPi2_9 * x0; - s2 = (long)SinPi3_9 * x1; - s3 = (long)SinPi4_9 * x2; - s4 = (long)SinPi1_9 * x2; - s5 = (long)SinPi2_9 * x3; - s6 = (long)SinPi4_9 * x3; + s0 = (long)SinPi19 * x0; + s1 = (long)SinPi29 * x0; + s2 = (long)SinPi39 * x1; + s3 = (long)SinPi49 * x2; + s4 = (long)SinPi19 * x2; + s5 = (long)SinPi29 * x3; + s6 = (long)SinPi49 * x3; s7 = HighbdWrapLow(x0 - x2 + 
x3, bd); s0 = s0 + s3 + s5; s1 = s1 - s4 - s6; s3 = s2; - s2 = SinPi3_9 * s7; + s2 = SinPi39 * s7; // 1-D transform scaling factor is sqrt(2). // The overall dynamic range is 14b (input) + 14b (multiplication scaling) @@ -1619,17 +1611,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 4) != 0) { Debug.Assert(false, "invalid highbd txfm input"); - output.Slice(0, 4).Fill(0); + output.Slice(0, 4).Clear(); return; } // stage 1 - temp1 = (input[0] + input[2]) * (long)CosPi16_64; - temp2 = (input[0] - input[2]) * (long)CosPi16_64; + temp1 = (input[0] + input[2]) * (long)CosPi1664; + temp2 = (input[0] - input[2]) * (long)CosPi1664; step[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64; - temp2 = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64; + temp1 = (input[1] * (long)CosPi2464) - (input[3] * (long)CosPi864); + temp2 = (input[1] * (long)CosPi864) + (input[3] * (long)CosPi2464); step[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); @@ -1643,14 +1635,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct4x416Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[4 * 4]; Span outptr = output; Span tempIn = stackalloc int[4]; Span tempOut = stackalloc int[4]; // Rows - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { HighbdIdct4(input, outptr, bd); input = input.Slice(4); @@ -1658,31 +1649,31 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - tempIn[j] = output[j * 4 + i]; + tempIn[j] = output[(j * 4) + i]; } HighbdIdct4(tempIn, tempOut, bd); - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - dest[j * stride + i] = 
HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd); } } } public static void HighbdIdct4x41Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i; long a1; - int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); + int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi1664), bd); - output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); + output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi1664), bd); a1 = BitUtils.RoundPowerOfTwo(output, 4); - for (i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { dest[0] = HighbdClipPixelAdd(dest[0], a1, bd); dest[1] = HighbdClipPixelAdd(dest[1], a1, bd); @@ -1707,25 +1698,25 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 8) != 0) { Debug.Assert(false, "invalid highbd txfm input"); - output.Slice(0, 8).Fill(0); + output.Slice(0, 8).Clear(); return; } if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) { - output.Slice(0, 8).Fill(0); + output.Slice(0, 8).Clear(); return; } // stage 1 - s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1; - s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1; - s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3; - s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3; - s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5; - s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5; - s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7; - s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7; + s0 = ((long)CosPi264 * x0) + ((long)CosPi3064 * x1); + s1 = ((long)CosPi3064 * x0) - ((long)CosPi264 * x1); + s2 = ((long)CosPi1064 * x2) + ((long)CosPi2264 * x3); + s3 = ((long)CosPi2264 * x2) - ((long)CosPi1064 * x3); + s4 = ((long)CosPi1864 * x4) + ((long)CosPi1464 * x5); + s5 = ((long)CosPi1464 * x4) - ((long)CosPi1864 * x5); + s6 = ((long)CosPi2664 * x6) + 
((long)CosPi664 * x7); + s7 = ((long)CosPi664 * x6) - ((long)CosPi2664 * x7); x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd); x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd); @@ -1741,10 +1732,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp s1 = x1; s2 = x2; s3 = x3; - s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5; - s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5; - s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7; - s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7; + s4 = ((long)CosPi864 * x4) + ((long)CosPi2464 * x5); + s5 = ((long)CosPi2464 * x4) - ((long)CosPi864 * x5); + s6 = ((long)-CosPi2464 * x6) + ((long)CosPi864 * x7); + s7 = ((long)CosPi864 * x6) + ((long)CosPi2464 * x7); x0 = HighbdWrapLow(s0 + s2, bd); x1 = HighbdWrapLow(s1 + s3, bd); @@ -1756,10 +1747,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd); // stage 3 - s2 = (long)CosPi16_64 * (x2 + x3); - s3 = (long)CosPi16_64 * (x2 - x3); - s6 = (long)CosPi16_64 * (x6 + x7); - s7 = (long)CosPi16_64 * (x6 - x7); + s2 = (long)CosPi1664 * (x2 + x3); + s3 = (long)CosPi1664 * (x2 - x3); + s6 = (long)CosPi1664 * (x6 + x7); + s7 = (long)CosPi1664 * (x6 - x7); x2 = HighbdWrapLow(DctConstRoundShift(s2), bd); x3 = HighbdWrapLow(DctConstRoundShift(s3), bd); @@ -1786,7 +1777,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 8) != 0) { Debug.Assert(false, "invalid highbd txfm input"); - output.Slice(0, 8).Fill(0); + output.Slice(0, 8).Clear(); return; } @@ -1795,12 +1786,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = input[4]; step1[1] = input[2]; step1[3] = input[6]; - temp1 = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64; - temp2 = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64; + temp1 = (input[1] * (long)CosPi2864) - (input[7] * (long)CosPi464); + temp2 = (input[1] * (long)CosPi464) + (input[7] * (long)CosPi2864); step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[7] = 
HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64; - temp2 = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64; + temp1 = (input[5] * (long)CosPi1264) - (input[3] * (long)CosPi2064); + temp2 = (input[5] * (long)CosPi2064) + (input[3] * (long)CosPi1264); step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); @@ -1815,8 +1806,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp // stage 3 - odd half step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; - temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; + temp1 = (step2[6] - step2[5]) * (long)CosPi1664; + temp2 = (step2[5] + step2[6]) * (long)CosPi1664; step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[7] = step2[7]; @@ -1835,14 +1826,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct8x864Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[8 * 8]; Span outptr = output; Span tempIn = stackalloc int[8]; Span tempOut = stackalloc int[8]; // First transform rows - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { HighbdIdct8(input, outptr, bd); input = input.Slice(8); @@ -1850,17 +1840,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - tempIn[j] = output[j * 8 + i]; + tempIn[j] = output[(j * 8) + i]; } HighbdIdct8(tempIn, tempOut, bd); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); } } } @@ -1868,17 +1859,16 @@ namespace 
Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct8x812Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[8 * 8]; Span outptr = output; Span tempIn = stackalloc int[8]; Span tempOut = stackalloc int[8]; - output.Fill(0); + output.Clear(); // First transform rows // Only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { HighbdIdct8(input, outptr, bd); input = input.Slice(8); @@ -1886,32 +1876,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - tempIn[j] = output[j * 8 + i]; + tempIn[j] = output[(j * 8) + i]; } HighbdIdct8(tempIn, tempOut, bd); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); } } } - public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan input, Span dest, int stride, int bd) + public static void VpxHighbdidct8x81AddC(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; long a1; - int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); + int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi1664), bd); - output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); + output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi1664), bd); a1 = BitUtils.RoundPowerOfTwo(output, 5); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); } @@ -1943,33 +1933,33 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 16) != 0) { Debug.Assert(false, "invalid highbd txfm input"); 
- output.Slice(0, 16).Fill(0); + output.Slice(0, 16).Clear(); return; } if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) { - output.Slice(0, 16).Fill(0); + output.Slice(0, 16).Clear(); return; } // stage 1 - s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64; - s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64; - s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64; - s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64; - s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64; - s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64; - s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64; - s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64; - s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64; - s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64; - s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64; - s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64; - s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64; - s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64; - s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64; - s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64; + s0 = (x0 * (long)CosPi164) + (x1 * (long)CosPi3164); + s1 = (x0 * (long)CosPi3164) - (x1 * (long)CosPi164); + s2 = (x2 * (long)CosPi564) + (x3 * (long)CosPi2764); + s3 = (x2 * (long)CosPi2764) - (x3 * (long)CosPi564); + s4 = (x4 * (long)CosPi964) + (x5 * (long)CosPi2364); + s5 = (x4 * (long)CosPi2364) - (x5 * (long)CosPi964); + s6 = (x6 * (long)CosPi1364) + (x7 * (long)CosPi1964); + s7 = (x6 * (long)CosPi1964) - (x7 * (long)CosPi1364); + s8 = (x8 * (long)CosPi1764) + (x9 * (long)CosPi1564); + s9 = (x8 * (long)CosPi1564) - (x9 * (long)CosPi1764); + s10 = (x10 * (long)CosPi2164) + (x11 * (long)CosPi1164); + s11 = (x10 * (long)CosPi1164) - (x11 * (long)CosPi2164); + s12 = (x12 * (long)CosPi2564) + (x13 * (long)CosPi764); + s13 = (x12 * (long)CosPi764) - (x13 * (long)CosPi2564); + s14 = (x14 * (long)CosPi2964) + (x15 * (long)CosPi364); + s15 = (x14 * 
(long)CosPi364) - (x15 * (long)CosPi2964); x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd); x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd); @@ -1997,14 +1987,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64; - s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64; - s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64; - s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64; - s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64; - s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64; - s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64; - s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64; + s8 = (x8 * (long)CosPi464) + (x9 * (long)CosPi2864); + s9 = (x8 * (long)CosPi2864) - (x9 * (long)CosPi464); + s10 = (x10 * (long)CosPi2064) + (x11 * (long)CosPi1264); + s11 = (x10 * (long)CosPi1264) - (x11 * (long)CosPi2064); + s12 = (-x12 * (long)CosPi2864) + (x13 * (long)CosPi464); + s13 = (x12 * (long)CosPi464) + (x13 * (long)CosPi2864); + s14 = (-x14 * (long)CosPi1264) + (x15 * (long)CosPi2064); + s15 = (x14 * (long)CosPi2064) + (x15 * (long)CosPi1264); x0 = HighbdWrapLow(s0 + s4, bd); x1 = HighbdWrapLow(s1 + s5, bd); @@ -2028,18 +2018,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64; - s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64; - s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64; - s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64; + s4 = (x4 * (long)CosPi864) + (x5 * (long)CosPi2464); + s5 = (x4 * (long)CosPi2464) - (x5 * (long)CosPi864); + s6 = (-x6 * (long)CosPi2464) + (x7 * (long)CosPi864); + s7 = (x6 * (long)CosPi864) + (x7 * (long)CosPi2464); s8 = x8; s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64; - s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64; - s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64; - s15 = x14 * (long)CosPi8_64 + x15 * 
(long)CosPi24_64; + s12 = (x12 * (long)CosPi864) + (x13 * (long)CosPi2464); + s13 = (x12 * (long)CosPi2464) - (x13 * (long)CosPi864); + s14 = (-x14 * (long)CosPi2464) + (x15 * (long)CosPi864); + s15 = (x14 * (long)CosPi864) + (x15 * (long)CosPi2464); x0 = HighbdWrapLow(s0 + s2, bd); x1 = HighbdWrapLow(s1 + s3, bd); @@ -2059,14 +2049,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd); // stage 4 - s2 = (long)(-CosPi16_64) * (x2 + x3); - s3 = (long)CosPi16_64 * (x2 - x3); - s6 = (long)CosPi16_64 * (x6 + x7); - s7 = (long)CosPi16_64 * (-x6 + x7); - s10 = (long)CosPi16_64 * (x10 + x11); - s11 = (long)CosPi16_64 * (-x10 + x11); - s14 = (long)(-CosPi16_64) * (x14 + x15); - s15 = (long)CosPi16_64 * (x14 - x15); + s2 = (long)-CosPi1664 * (x2 + x3); + s3 = (long)CosPi1664 * (x2 - x3); + s6 = (long)CosPi1664 * (x6 + x7); + s7 = (long)CosPi1664 * (-x6 + x7); + s10 = (long)CosPi1664 * (x10 + x11); + s11 = (long)CosPi1664 * (-x10 + x11); + s14 = (long)-CosPi1664 * (x14 + x15); + s15 = (long)CosPi1664 * (x14 - x15); x2 = HighbdWrapLow(DctConstRoundShift(s2), bd); x3 = HighbdWrapLow(DctConstRoundShift(s3), bd); @@ -2105,7 +2095,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 16) != 0) { Debug.Assert(false, "invalid highbd txfm input"); - output.Slice(0, 16).Fill(0); + output.Slice(0, 16).Clear(); return; } @@ -2137,23 +2127,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64; - temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64; + temp1 = (step1[8] * (long)CosPi3064) - (step1[15] * (long)CosPi264); + temp2 = (step1[8] * (long)CosPi264) + (step1[15] * (long)CosPi3064); step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64; - temp2 = step1[9] * 
(long)CosPi18_64 + step1[14] * (long)CosPi14_64; + temp1 = (step1[9] * (long)CosPi1464) - (step1[14] * (long)CosPi1864); + temp2 = (step1[9] * (long)CosPi1864) + (step1[14] * (long)CosPi1464); step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64; - temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64; + temp1 = (step1[10] * (long)CosPi2264) - (step1[13] * (long)CosPi1064); + temp2 = (step1[10] * (long)CosPi1064) + (step1[13] * (long)CosPi2264); step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64; - temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64; + temp1 = (step1[11] * (long)CosPi664) - (step1[12] * (long)CosPi2664); + temp2 = (step1[11] * (long)CosPi2664) + (step1[12] * (long)CosPi664); step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); @@ -2163,12 +2153,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64; - temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64; + temp1 = (step2[4] * (long)CosPi2864) - (step2[7] * (long)CosPi464); + temp2 = (step2[4] * (long)CosPi464) + (step2[7] * (long)CosPi2864); step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64; - temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64; + temp1 = (step2[5] * (long)CosPi1264) - (step2[6] * (long)CosPi2064); + temp2 = (step2[5] * (long)CosPi2064) + (step2[6] * (long)CosPi1264); step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), 
bd); @@ -2182,12 +2172,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[15] = HighbdWrapLow(step2[14] + step2[15], bd); // stage 4 - temp1 = (step1[0] + step1[1]) * (long)CosPi16_64; - temp2 = (step1[0] - step1[1]) * (long)CosPi16_64; + temp1 = (step1[0] + step1[1]) * (long)CosPi1664; + temp2 = (step1[0] - step1[1]) * (long)CosPi1664; step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64; - temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64; + temp1 = (step1[2] * (long)CosPi2464) - (step1[3] * (long)CosPi864); + temp2 = (step1[2] * (long)CosPi864) + (step1[3] * (long)CosPi2464); step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step2[4] = HighbdWrapLow(step1[4] + step1[5], bd); @@ -2197,12 +2187,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64; - temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; + temp1 = (-step1[9] * (long)CosPi864) + (step1[14] * (long)CosPi2464); + temp2 = (step1[9] * (long)CosPi2464) + (step1[14] * (long)CosPi864); step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64; - temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64; + temp1 = (-step1[10] * (long)CosPi2464) - (step1[13] * (long)CosPi864); + temp2 = (-step1[10] * (long)CosPi864) + (step1[13] * (long)CosPi2464); step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step2[11] = step1[11]; @@ -2214,8 +2204,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = HighbdWrapLow(step2[1] - step2[2], bd); step1[3] = HighbdWrapLow(step2[0] - step2[3], bd); 
step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; - temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; + temp1 = (step2[6] - step2[5]) * (long)CosPi1664; + temp2 = (step2[5] + step2[6]) * (long)CosPi1664; step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[7] = step2[7]; @@ -2240,12 +2230,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[7] = HighbdWrapLow(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64; - temp2 = (step1[10] + step1[13]) * (long)CosPi16_64; + temp1 = (-step1[10] + step1[13]) * (long)CosPi1664; + temp2 = (step1[10] + step1[13]) * (long)CosPi1664; step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64; - temp2 = (step1[11] + step1[12]) * (long)CosPi16_64; + temp1 = (-step1[11] + step1[12]) * (long)CosPi1664; + temp2 = (step1[11] + step1[12]) * (long)CosPi1664; step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step2[14] = step1[14]; @@ -2273,14 +2263,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct16x16256Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; // First transform rows - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { HighbdIdct16(input, outptr, bd); input = input.Slice(16); @@ -2288,17 +2277,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } HighbdIdct16(tempIn, tempOut, bd); - for (j = 
0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); } } } @@ -2306,17 +2296,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct16x1638Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; - output.Fill(0); + output.Clear(); // First transform rows. Since all non-zero dct coefficients are in // upper-left 8x8 area, we only need to calculate first 8 rows here. - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { HighbdIdct16(input, outptr, bd); input = input.Slice(16); @@ -2324,16 +2313,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { Span destT = dest; - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } HighbdIdct16(tempIn, tempOut, bd); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); destT = destT.Slice(stride); @@ -2344,17 +2333,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct16x1610Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; Span tempOut = stackalloc int[16]; - output.Fill(0); + output.Clear(); // First transform rows. Since all non-zero dct coefficients are in // upper-left 4x4 area, we only need to calculate first 4 rows here. 
- for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { HighbdIdct16(input, outptr, bd); input = input.Slice(16); @@ -2362,32 +2350,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Then transform columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } HighbdIdct16(tempIn, tempOut, bd); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); } } } public static void HighbdIdct16x161Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; long a1; - int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); + int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi1664), bd); - output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); + output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi1664), bd); a1 = BitUtils.RoundPowerOfTwo(output, 6); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); } @@ -2406,7 +2394,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp if (DetectInvalidHighbdInput(input, 32) != 0) { Debug.Assert(false, "invalid highbd txfm input"); - output.Slice(0, 32).Fill(0); + output.Slice(0, 32).Clear(); return; } @@ -2428,43 +2416,43 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[14] = input[14]; step1[15] = input[30]; - temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64; - temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64; + temp1 = (input[1] * (long)CosPi3164) - (input[31] * (long)CosPi164); + temp2 = (input[1] * (long)CosPi164) + (input[31] * (long)CosPi3164); 
step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64; - temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64; + temp1 = (input[17] * (long)CosPi1564) - (input[15] * (long)CosPi1764); + temp2 = (input[17] * (long)CosPi1764) + (input[15] * (long)CosPi1564); step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64; - temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64; + temp1 = (input[9] * (long)CosPi2364) - (input[23] * (long)CosPi964); + temp2 = (input[9] * (long)CosPi964) + (input[23] * (long)CosPi2364); step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64; - temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64; + temp1 = (input[25] * (long)CosPi764) - (input[7] * (long)CosPi2564); + temp2 = (input[25] * (long)CosPi2564) + (input[7] * (long)CosPi764); step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64; - temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64; + temp1 = (input[5] * (long)CosPi2764) - (input[27] * (long)CosPi564); + temp2 = (input[5] * (long)CosPi564) + (input[27] * (long)CosPi2764); step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64; - temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64; + temp1 = (input[21] * (long)CosPi1164) - (input[11] * (long)CosPi2164); + temp2 = (input[21] * (long)CosPi2164) + (input[11] * (long)CosPi1164); 
step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64; - temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64; + temp1 = (input[13] * (long)CosPi1964) - (input[19] * (long)CosPi1364); + temp2 = (input[13] * (long)CosPi1364) + (input[19] * (long)CosPi1964); step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64; - temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64; + temp1 = (input[29] * (long)CosPi364) - (input[3] * (long)CosPi2964); + temp2 = (input[29] * (long)CosPi2964) + (input[3] * (long)CosPi364); step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd); @@ -2478,23 +2466,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64; - temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64; + temp1 = (step1[8] * (long)CosPi3064) - (step1[15] * (long)CosPi264); + temp2 = (step1[8] * (long)CosPi264) + (step1[15] * (long)CosPi3064); step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64; - temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64; + temp1 = (step1[9] * (long)CosPi1464) - (step1[14] * (long)CosPi1864); + temp2 = (step1[9] * (long)CosPi1864) + (step1[14] * (long)CosPi1464); step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64; - temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64; + temp1 = (step1[10] * (long)CosPi2264) 
- (step1[13] * (long)CosPi1064); + temp2 = (step1[10] * (long)CosPi1064) + (step1[13] * (long)CosPi2264); step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64; - temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64; + temp1 = (step1[11] * (long)CosPi664) - (step1[12] * (long)CosPi2664); + temp2 = (step1[11] * (long)CosPi2664) + (step1[12] * (long)CosPi664); step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); @@ -2521,12 +2509,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64; - temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64; + temp1 = (step2[4] * (long)CosPi2864) - (step2[7] * (long)CosPi464); + temp2 = (step2[4] * (long)CosPi464) + (step2[7] * (long)CosPi2864); step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64; - temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64; + temp1 = (step2[5] * (long)CosPi1264) - (step2[6] * (long)CosPi2064); + temp2 = (step2[5] * (long)CosPi2064) + (step2[6] * (long)CosPi1264); step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); @@ -2541,22 +2529,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64; - temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64; + temp1 = (-step2[17] * (long)CosPi464) + (step2[30] * (long)CosPi2864); + temp2 = (step2[17] * (long)CosPi2864) + (step2[30] * (long)CosPi464); step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[30] = 
HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64; - temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64; + temp1 = (-step2[18] * (long)CosPi2864) - (step2[29] * (long)CosPi464); + temp2 = (-step2[18] * (long)CosPi464) + (step2[29] * (long)CosPi2864); step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64; - temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64; + temp1 = (-step2[21] * (long)CosPi2064) + (step2[26] * (long)CosPi1264); + temp2 = (step2[21] * (long)CosPi1264) + (step2[26] * (long)CosPi2064); step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64; - temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64; + temp1 = (-step2[22] * (long)CosPi1264) - (step2[25] * (long)CosPi2064); + temp2 = (-step2[22] * (long)CosPi2064) + (step2[25] * (long)CosPi1264); step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[23] = step2[23]; @@ -2565,12 +2553,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * (long)CosPi16_64; - temp2 = (step1[0] - step1[1]) * (long)CosPi16_64; + temp1 = (step1[0] + step1[1]) * (long)CosPi1664; + temp2 = (step1[0] - step1[1]) * (long)CosPi1664; step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64; - temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64; + temp1 = (step1[2] * (long)CosPi2464) - (step1[3] * (long)CosPi864); + temp2 = (step1[2] * (long)CosPi864) + 
(step1[3] * (long)CosPi2464); step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step2[4] = HighbdWrapLow(step1[4] + step1[5], bd); @@ -2580,12 +2568,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64; - temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; + temp1 = (-step1[9] * (long)CosPi864) + (step1[14] * (long)CosPi2464); + temp2 = (step1[9] * (long)CosPi2464) + (step1[14] * (long)CosPi864); step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64; - temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64; + temp1 = (-step1[10] * (long)CosPi2464) - (step1[13] * (long)CosPi864); + temp2 = (-step1[10] * (long)CosPi864) + (step1[13] * (long)CosPi2464); step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step2[11] = step1[11]; @@ -2615,8 +2603,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[2] = HighbdWrapLow(step2[1] - step2[2], bd); step1[3] = HighbdWrapLow(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; - temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; + temp1 = (step2[6] - step2[5]) * (long)CosPi1664; + temp2 = (step2[5] + step2[6]) * (long)CosPi1664; step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[7] = step2[7]; @@ -2632,20 +2620,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64; - temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64; + temp1 = (-step2[18] * (long)CosPi864) + (step2[29] * (long)CosPi2464); + temp2 = 
(step2[18] * (long)CosPi2464) + (step2[29] * (long)CosPi864); step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64; - temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64; + temp1 = (-step2[19] * (long)CosPi864) + (step2[28] * (long)CosPi2464); + temp2 = (step2[19] * (long)CosPi2464) + (step2[28] * (long)CosPi864); step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64; - temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64; + temp1 = (-step2[20] * (long)CosPi2464) - (step2[27] * (long)CosPi864); + temp2 = (-step2[20] * (long)CosPi864) + (step2[27] * (long)CosPi2464); step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64; - temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64; + temp1 = (-step2[21] * (long)CosPi2464) - (step2[26] * (long)CosPi864); + temp2 = (-step2[21] * (long)CosPi864) + (step2[26] * (long)CosPi2464); step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[22] = step2[22]; @@ -2666,12 +2654,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step2[7] = HighbdWrapLow(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64; - temp2 = (step1[10] + step1[13]) * (long)CosPi16_64; + temp1 = (-step1[10] + step1[13]) * (long)CosPi1664; + temp2 = (step1[10] + step1[13]) * (long)CosPi1664; step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64; - temp2 = (step1[11] + step1[12]) * 
(long)CosPi16_64; + temp1 = (-step1[11] + step1[12]) * (long)CosPi1664; + temp2 = (step1[11] + step1[12]) * (long)CosPi1664; step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step2[14] = step1[14]; @@ -2717,20 +2705,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64; - temp2 = (step2[20] + step2[27]) * (long)CosPi16_64; + temp1 = (-step2[20] + step2[27]) * (long)CosPi1664; + temp2 = (step2[20] + step2[27]) * (long)CosPi1664; step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64; - temp2 = (step2[21] + step2[26]) * (long)CosPi16_64; + temp1 = (-step2[21] + step2[26]) * (long)CosPi1664; + temp2 = (step2[21] + step2[26]) * (long)CosPi1664; step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64; - temp2 = (step2[22] + step2[25]) * (long)CosPi16_64; + temp1 = (-step2[22] + step2[25]) * (long)CosPi1664; + temp2 = (step2[22] + step2[25]) * (long)CosPi1664; step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64; - temp2 = (step2[23] + step2[24]) * (long)CosPi16_64; + temp1 = (-step2[23] + step2[24]) * (long)CosPi1664; + temp2 = (step2[23] + step2[24]) * (long)CosPi1664; step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd); step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd); step1[28] = step2[28]; @@ -2776,17 +2764,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct32x321024Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[32 * 32]; Span outptr = output; 
Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; // Rows - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { int zeroCoeff = 0; - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { zeroCoeff |= input[j]; } @@ -2797,7 +2784,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } else { - outptr.Slice(0, 32).Fill(0); + outptr.Slice(0, 32).Clear(); } input = input.Slice(32); @@ -2805,17 +2792,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - tempIn[j] = output[j * 32 + i]; + tempIn[j] = output[(j * 32) + i]; } HighbdIdct32(tempIn, tempOut, bd); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); } } } @@ -2823,17 +2811,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct32x32135Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[32 * 32]; Span outptr = output; Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; - output.Fill(0); + output.Clear(); // Rows // Only upper-left 16x16 has non-zero coeff - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { HighbdIdct32(input, outptr, bd); input = input.Slice(32); @@ -2841,16 +2828,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { Span destT = dest; - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - tempIn[j] = output[j * 32 + i]; + tempIn[j] = output[(j * 32) + i]; } HighbdIdct32(tempIn, tempOut, bd); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 
destT = destT.Slice(stride); @@ -2861,17 +2848,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp [SkipLocalsInit] public static void HighbdIdct32x3234Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; Span output = stackalloc int[32 * 32]; Span outptr = output; Span tempIn = stackalloc int[32]; Span tempOut = stackalloc int[32]; - output.Fill(0); + output.Clear(); // Rows // Only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { HighbdIdct32(input, outptr, bd); input = input.Slice(32); @@ -2879,33 +2865,33 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } // Columns - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - tempIn[j] = output[j * 32 + i]; + tempIn[j] = output[(j * 32) + i]; } HighbdIdct32(tempIn, tempOut, bd); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); } } } public static void HighbdIdct32x321Add(ReadOnlySpan input, Span dest, int stride, int bd) { - int i, j; int a1; - int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); + int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi1664), bd); - output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); + output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi1664), bd); a1 = BitUtils.RoundPowerOfTwo(output, 6); - for (j = 0; j < 32; ++j) + for (int j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) + for (int i = 0; i < 32; ++i) { dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); } @@ -2914,4 +2900,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterAuto.cs 
b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterAuto.cs new file mode 100644 index 000000000..fbd87e17f --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterAuto.cs @@ -0,0 +1,229 @@ +using Ryujinx.Common.Memory; +using System; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ + internal class LoopFilterAuto + { + public static void LpfHorizontal4( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfHorizontal4(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfHorizontal4(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfHorizontal4Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit0, + ReadOnlySpan limit0, + ReadOnlySpan thresh0, + ReadOnlySpan blimit1, + ReadOnlySpan limit1, + ReadOnlySpan thresh1) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfHorizontal4Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1); + } + else + { + LoopFilterScalar.LpfHorizontal4Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0], + thresh1[0]); + } + } + + public static void LpfHorizontal8( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfHorizontal8(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfHorizontal8(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfHorizontal8Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit0, + ReadOnlySpan limit0, + ReadOnlySpan thresh0, + ReadOnlySpan blimit1, + ReadOnlySpan limit1, + ReadOnlySpan thresh1) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfHorizontal8Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1); + } + else + { + LoopFilterScalar.LpfHorizontal8Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0], + thresh1[0]); + } + } + + 
public static void LpfHorizontal16( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfHorizontal16(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfHorizontal16(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfHorizontal16Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfHorizontal16Dual(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfHorizontal16Dual(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfVertical4( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfVertical4(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfVertical4(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfVertical4Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit0, + ReadOnlySpan limit0, + ReadOnlySpan thresh0, + ReadOnlySpan blimit1, + ReadOnlySpan limit1, + ReadOnlySpan thresh1) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfVertical4Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1); + } + else + { + LoopFilterScalar.LpfVertical4Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0], + thresh1[0]); + } + } + + public static void LpfVertical8( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfVertical8(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfVertical8(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfVertical8Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit0, + ReadOnlySpan limit0, + ReadOnlySpan thresh0, + ReadOnlySpan blimit1, + ReadOnlySpan limit1, + 
ReadOnlySpan thresh1) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfVertical8Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1); + } + else + { + LoopFilterScalar.LpfVertical8Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0], + thresh1[0]); + } + } + + public static void LpfVertical16( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfVertical16(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfVertical16(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + + public static void LpfVertical16Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + if (Sse2.IsSupported) + { + LoopFilterSse2.LpfVertical16Dual(s, pitch, blimit, limit, thresh); + } + else + { + LoopFilterScalar.LpfVertical16Dual(s, pitch, blimit[0], limit[0], thresh[0]); + } + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterScalar.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterScalar.cs new file mode 100644 index 000000000..79c3f1235 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterScalar.cs @@ -0,0 +1,1093 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using System; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ + internal static class LoopFilterScalar + { + private static sbyte ClampSbyte(int t) + { + return (sbyte)Math.Clamp(t, -128, 127); + } + + private static short ClampSbyteHigh(int t, int bd) + { + return bd switch + { + 10 => (short)Math.Clamp(t, -128 * 4, (128 * 4) - 1), + 12 => (short)Math.Clamp(t, -128 * 16, (128 * 16) - 1), + _ => (short)Math.Clamp(t, -128, 128 - 1) + }; + } + + // Should we apply any filter at all: 11111111 yes, 00000000 no + private static sbyte FilterMask( + byte limit, + byte blimit, + byte p3, + byte p2, + byte p1, + byte p0, + byte q0, + byte q1, + byte q2, + byte 
q3) + { + int mask = 0; + mask |= Math.Abs(p3 - p2) > limit ? -1 : 0; + mask |= Math.Abs(p2 - p1) > limit ? -1 : 0; + mask |= Math.Abs(p1 - p0) > limit ? -1 : 0; + mask |= Math.Abs(q1 - q0) > limit ? -1 : 0; + mask |= Math.Abs(q2 - q1) > limit ? -1 : 0; + mask |= Math.Abs(q3 - q2) > limit ? -1 : 0; + mask |= (Math.Abs(p0 - q0) * 2) + (Math.Abs(p1 - q1) / 2) > blimit ? -1 : 0; + return (sbyte)~mask; + } + + private static sbyte FlatMask4( + byte thresh, + byte p3, + byte p2, + byte p1, + byte p0, + byte q0, + byte q1, + byte q2, + byte q3) + { + int mask = 0; + mask |= Math.Abs(p1 - p0) > thresh ? -1 : 0; + mask |= Math.Abs(q1 - q0) > thresh ? -1 : 0; + mask |= Math.Abs(p2 - p0) > thresh ? -1 : 0; + mask |= Math.Abs(q2 - q0) > thresh ? -1 : 0; + mask |= Math.Abs(p3 - p0) > thresh ? -1 : 0; + mask |= Math.Abs(q3 - q0) > thresh ? -1 : 0; + return (sbyte)~mask; + } + + private static sbyte FlatMask5( + byte thresh, + byte p4, + byte p3, + byte p2, + byte p1, + byte p0, + byte q0, + byte q1, + byte q2, + byte q3, + byte q4) + { + int mask = ~FlatMask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= Math.Abs(p4 - p0) > thresh ? -1 : 0; + mask |= Math.Abs(q4 - q0) > thresh ? -1 : 0; + return (sbyte)~mask; + } + + // Is there high edge variance internal edge: 11111111 yes, 00000000 no + private static sbyte HevMask( + byte thresh, + byte p1, + byte p0, + byte q0, + byte q1) + { + int hev = 0; + hev |= Math.Abs(p1 - p0) > thresh ? -1 : 0; + hev |= Math.Abs(q1 - q0) > thresh ? 
-1 : 0; + return (sbyte)hev; + } + + private static void Filter4( + sbyte mask, + byte thresh, + ref byte op1, + ref byte op0, + ref byte oq0, + ref byte oq1) + { + sbyte filter1, filter2; + + sbyte ps1 = (sbyte)(op1 ^ 0x80); + sbyte ps0 = (sbyte)(op0 ^ 0x80); + sbyte qs0 = (sbyte)(oq0 ^ 0x80); + sbyte qs1 = (sbyte)(oq1 ^ 0x80); + sbyte hev = HevMask(thresh, op1, op0, oq0, oq1); + + // add outer taps if we have high edge variance + sbyte filter = (sbyte)(ClampSbyte(ps1 - qs1) & hev); + + // inner taps + filter = (sbyte)(ClampSbyte(filter + (3 * (qs0 - ps0))) & mask); + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way + filter1 = (sbyte)(ClampSbyte(filter + 4) >> 3); + filter2 = (sbyte)(ClampSbyte(filter + 3) >> 3); + + oq0 = (byte)(ClampSbyte(qs0 - filter1) ^ 0x80); + op0 = (byte)(ClampSbyte(ps0 + filter2) ^ 0x80); + + // outer tap adjustments + filter = (sbyte)(BitUtils.RoundPowerOfTwo(filter1, 1) & ~hev); + + oq1 = (byte)(ClampSbyte(qs1 - filter) ^ 0x80); + op1 = (byte)(ClampSbyte(ps1 + filter) ^ 0x80); + } + + public static void LpfHorizontal4( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (int i = 0; i < 8; ++i) + { + byte p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch]; + byte q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch]; + sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); + Filter4(mask, thresh, ref s[-2 * pitch], ref s[-1 * pitch], ref s[0], ref s[1 * pitch]); + s = s.Slice(1); + } + } + + public static void LpfHorizontal4Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1) + { + LpfHorizontal4(s, pitch, blimit0, limit0, thresh0); + LpfHorizontal4(s.Slice(8), pitch, blimit1, limit1, thresh1); + } + + public static void LpfVertical4( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (int i = 0; i < 8; ++i) + { + byte p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + byte q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); + Filter4(mask, thresh, ref s[-2], ref s[-1], ref s[0], ref s[1]); + s = s.Slice(pitch); + } + } + + public static void LpfVertical4Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1) + { + LpfVertical4(s, pitch, blimit0, limit0, thresh0); + LpfVertical4(s.Slice(8 * pitch), pitch, blimit1, limit1, thresh1); + } + + private static void Filter8( + sbyte mask, + byte thresh, + bool flat, + ref byte op3, + ref byte op2, + ref byte op1, + ref byte op0, + ref byte oq0, + ref byte oq1, + ref byte oq2, + ref byte oq3) + { + if (flat && mask != 0) + { + byte p3 = op3, p2 = op2, p1 = op1, p0 = op0; + byte q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + op2 = (byte)BitUtils.RoundPowerOfTwo(p3 + p3 + p3 + (2 * p2) + p1 + p0 + q0, 3); + op1 = 
(byte)BitUtils.RoundPowerOfTwo(p3 + p3 + p2 + (2 * p1) + p0 + q0 + q1, 3); + op0 = (byte)BitUtils.RoundPowerOfTwo(p3 + p2 + p1 + (2 * p0) + q0 + q1 + q2, 3); + oq0 = (byte)BitUtils.RoundPowerOfTwo(p2 + p1 + p0 + (2 * q0) + q1 + q2 + q3, 3); + oq1 = (byte)BitUtils.RoundPowerOfTwo(p1 + p0 + q0 + (2 * q1) + q2 + q3 + q3, 3); + oq2 = (byte)BitUtils.RoundPowerOfTwo(p0 + q0 + q1 + (2 * q2) + q3 + q3 + q3, 3); + } + else + { + Filter4(mask, thresh, ref op1, ref op0, ref oq0, ref oq1); + } + } + + public static void LpfHorizontal8( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (int i = 0; i < 8; ++i) + { + byte p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch]; + byte q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch]; + + sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); + sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + Filter8( + mask, + thresh, + flat != 0, + ref s[-4 * pitch], + ref s[-3 * pitch], + ref s[-2 * pitch], + ref s[-1 * pitch], + ref s[0], + ref s[1 * pitch], + ref s[2 * pitch], + ref s[3 * pitch]); + s = s.Slice(1); + } + } + + public static void LpfHorizontal8Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1) + { + LpfHorizontal8(s, pitch, blimit0, limit0, thresh0); + LpfHorizontal8(s.Slice(8), pitch, blimit1, limit1, thresh1); + } + + public static void LpfVertical8( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + for (int i = 0; i < 8; ++i) + { + byte p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + byte q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); + sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + Filter8( + mask, + thresh, + flat != 0, + 
ref s[-4], + ref s[-3], + ref s[-2], + ref s[-1], + ref s[0], + ref s[1], + ref s[2], + ref s[3]); + s = s.Slice(pitch); + } + } + + public static void LpfVertical8Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1) + { + LpfVertical8(s, pitch, blimit0, limit0, thresh0); + LpfVertical8(s.Slice(8 * pitch), pitch, blimit1, limit1, thresh1); + } + + private static void Filter16( + sbyte mask, + byte thresh, + bool flat, + bool flat2, + ref byte op7, + ref byte op6, + ref byte op5, + ref byte op4, + ref byte op3, + ref byte op2, + ref byte op1, + ref byte op0, + ref byte oq0, + ref byte oq1, + ref byte oq2, + ref byte oq3, + ref byte oq4, + ref byte oq5, + ref byte oq6, + ref byte oq7) + { + if (flat2 && flat && mask != 0) + { + byte p7 = op7, p6 = op6, p5 = op5, p4 = op4, p3 = op3, p2 = op2, p1 = op1, p0 = op0; + byte q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3, q4 = oq4, q5 = oq5, q6 = oq6, q7 = oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + op6 = (byte)BitUtils.RoundPowerOfTwo( + (p7 * 7) + (p6 * 2) + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + op5 = (byte)BitUtils.RoundPowerOfTwo( + (p7 * 6) + p6 + (p5 * 2) + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + op4 = (byte)BitUtils.RoundPowerOfTwo( + (p7 * 5) + p6 + p5 + (p4 * 2) + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + op3 = (byte)BitUtils.RoundPowerOfTwo( + (p7 * 4) + p6 + p5 + p4 + (p3 * 2) + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + op2 = (byte)BitUtils.RoundPowerOfTwo( + (p7 * 3) + p6 + p5 + p4 + p3 + (p2 * 2) + p1 + p0 + q0 + q1 + q2 + q3 + q4, 4); + op1 = (byte)BitUtils.RoundPowerOfTwo( + (p7 * 2) + p6 + p5 + p4 + p3 + p2 + (p1 * 2) + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); + op0 = (byte)BitUtils.RoundPowerOfTwo( + p7 + p6 + p5 + p4 + p3 + p2 + p1 + (p0 * 2) + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); + oq0 = (byte)BitUtils.RoundPowerOfTwo( + p6 + p5 + p4 + p3 + p2 + p1 + p0 + (q0 * 2) + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); + oq1 = 
(byte)BitUtils.RoundPowerOfTwo( + p5 + p4 + p3 + p2 + p1 + p0 + q0 + (q1 * 2) + q2 + q3 + q4 + q5 + q6 + (q7 * 2), 4); + oq2 = (byte)BitUtils.RoundPowerOfTwo( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + (q2 * 2) + q3 + q4 + q5 + q6 + (q7 * 3), 4); + oq3 = (byte)BitUtils.RoundPowerOfTwo( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + (q3 * 2) + q4 + q5 + q6 + (q7 * 4), 4); + oq4 = (byte)BitUtils.RoundPowerOfTwo( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + (q4 * 2) + q5 + q6 + (q7 * 5), 4); + oq5 = (byte)BitUtils.RoundPowerOfTwo( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + (q5 * 2) + q6 + (q7 * 6), 4); + oq6 = (byte)BitUtils.RoundPowerOfTwo( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + (q6 * 2) + (q7 * 7), 4); + } + else + { + Filter8(mask, thresh, flat, ref op3, ref op2, ref op1, ref op0, ref oq0, ref oq1, ref oq2, ref oq3); + } + } + + private static void MbLpfHorizontalEdgeW( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int count) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (int i = 0; i < 8 * count; ++i) + { + byte p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch]; + byte q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch]; + sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); + sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + sbyte flat2 = FlatMask5( + 1, + s[-8 * pitch], + s[-7 * pitch], + s[-6 * pitch], + s[-5 * pitch], + p0, + q0, + s[4 * pitch], + s[5 * pitch], + s[6 * pitch], + s[7 * pitch]); + + Filter16( + mask, + thresh, + flat != 0, + flat2 != 0, + ref s[-8 * pitch], + ref s[-7 * pitch], + ref s[-6 * pitch], + ref s[-5 * pitch], + ref s[-4 * pitch], + ref s[-3 * pitch], + ref s[-2 * pitch], + ref s[-1 * pitch], + ref s[0], + ref s[1 * pitch], + ref s[2 * pitch], + ref s[3 * pitch], + ref s[4 * pitch], + ref s[5 * pitch], + ref s[6 * pitch], + ref s[7 * pitch]); + s = s.Slice(1); + } + } + + public static void LpfHorizontal16( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + MbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, 1); + } + + public static void LpfHorizontal16Dual( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + MbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, 2); + } + + private static void MbLpfVerticalEdgeW( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int count) + { + for (int i = 0; i < count; ++i) + { + byte p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + byte q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); + sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + sbyte flat2 = FlatMask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7]); + + Filter16( + mask, + thresh, + flat != 0, + flat2 != 0, + ref s[-8], + ref s[-7], + ref s[-6], + ref s[-5], + ref s[-4], + ref s[-3], + ref s[-2], + ref s[-1], + ref s[0], + ref s[1], + ref s[2], + ref s[3], 
+ ref s[4], + ref s[5], + ref s[6], + ref s[7]); + s = s.Slice(pitch); + } + } + + public static void LpfVertical16( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + MbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, 8); + } + + public static void LpfVertical16Dual( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh) + { + MbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, 16); + } + + // Should we apply any filter at all: 11111111 yes, 00000000 no ? + private static sbyte HighBdFilterMask( + byte limit, + byte blimit, + ushort p3, + ushort p2, + ushort p1, + ushort p0, + ushort q0, + ushort q1, + ushort q2, + ushort q3, + int bd) + { + int mask = 0; + short limit16 = (short)(limit << (bd - 8)); + short blimit16 = (short)(blimit << (bd - 8)); + mask |= Math.Abs(p3 - p2) > limit16 ? -1 : 0; + mask |= Math.Abs(p2 - p1) > limit16 ? -1 : 0; + mask |= Math.Abs(p1 - p0) > limit16 ? -1 : 0; + mask |= Math.Abs(q1 - q0) > limit16 ? -1 : 0; + mask |= Math.Abs(q2 - q1) > limit16 ? -1 : 0; + mask |= Math.Abs(q3 - q2) > limit16 ? -1 : 0; + mask |= (Math.Abs(p0 - q0) * 2) + (Math.Abs(p1 - q1) / 2) > blimit16 ? -1 : 0; + return (sbyte)~mask; + } + + private static sbyte HighBdFlatMask4( + byte thresh, + ushort p3, + ushort p2, + ushort p1, + ushort p0, + ushort q0, + ushort q1, + ushort q2, + ushort q3, + int bd) + { + int mask = 0; + short thresh16 = (short)(thresh << (bd - 8)); + mask |= Math.Abs(p1 - p0) > thresh16 ? -1 : 0; + mask |= Math.Abs(q1 - q0) > thresh16 ? -1 : 0; + mask |= Math.Abs(p2 - p0) > thresh16 ? -1 : 0; + mask |= Math.Abs(q2 - q0) > thresh16 ? -1 : 0; + mask |= Math.Abs(p3 - p0) > thresh16 ? -1 : 0; + mask |= Math.Abs(q3 - q0) > thresh16 ? 
-1 : 0; + return (sbyte)~mask; + } + + private static sbyte HighBdFlatMask5( + byte thresh, + ushort p4, + ushort p3, + ushort p2, + ushort p1, + ushort p0, + ushort q0, + ushort q1, + ushort q2, + ushort q3, + ushort q4, + int bd) + { + int mask = ~HighBdFlatMask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); + short thresh16 = (short)(thresh << (bd - 8)); + mask |= Math.Abs(p4 - p0) > thresh16 ? -1 : 0; + mask |= Math.Abs(q4 - q0) > thresh16 ? -1 : 0; + return (sbyte)~mask; + } + + // Is there high edge variance internal edge: + // 11111111_11111111 yes, 00000000_00000000 no ? + private static short HighBdHevMask( + byte thresh, + ushort p1, + ushort p0, + ushort q0, + ushort q1, + int bd) + { + int hev = 0; + short thresh16 = (short)(thresh << (bd - 8)); + hev |= Math.Abs(p1 - p0) > thresh16 ? -1 : 0; + hev |= Math.Abs(q1 - q0) > thresh16 ? -1 : 0; + return (short)hev; + } + + private static void HighBdFilter4( + sbyte mask, + byte thresh, + ref ushort op1, + ref ushort op0, + ref ushort oq0, + ref ushort oq1, + int bd) + { + short filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + short ps1 = (short)((short)op1 - (0x80 << shift)); + short ps0 = (short)((short)op0 - (0x80 << shift)); + short qs0 = (short)((short)oq0 - (0x80 << shift)); + short qs1 = (short)((short)oq1 - (0x80 << shift)); + short hev = HighBdHevMask(thresh, op1, op0, oq0, oq1, bd); + + // Add outer taps if we have high edge variance. + short filter = (short)(ClampSbyteHigh(ps1 - qs1, bd) & hev); + + // Inner taps. + filter = (short)(ClampSbyteHigh(filter + (3 * (qs0 - ps0)), bd) & mask); + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way. 
+ filter1 = (short)(ClampSbyteHigh(filter + 4, bd) >> 3); + filter2 = (short)(ClampSbyteHigh(filter + 3, bd) >> 3); + + oq0 = (ushort)(ClampSbyteHigh(qs0 - filter1, bd) + (0x80 << shift)); + op0 = (ushort)(ClampSbyteHigh(ps0 + filter2, bd) + (0x80 << shift)); + + // Outer tap adjustments. + filter = (short)(BitUtils.RoundPowerOfTwo(filter1, 1) & ~hev); + + oq1 = (ushort)(ClampSbyteHigh(qs1 - filter, bd) + (0x80 << shift)); + op1 = (ushort)(ClampSbyteHigh(ps1 + filter, bd) + (0x80 << shift)); + } + + public static void HighBdLpfHorizontal4( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (int i = 0; i < 8; ++i) + { + ushort p3 = s[-4 * pitch]; + ushort p2 = s[-3 * pitch]; + ushort p1 = s[-2 * pitch]; + ushort p0 = s[-pitch]; + ushort q0 = s[0 * pitch]; + ushort q1 = s[1 * pitch]; + ushort q2 = s[2 * pitch]; + ushort q3 = s[3 * pitch]; + sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + HighBdFilter4(mask, thresh, ref s[-2 * pitch], ref s[-1 * pitch], ref s[0], ref s[1 * pitch], bd); + s = s.Slice(1); + } + } + + public static void HighBdLpfHorizontal4Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1, + int bd) + { + HighBdLpfHorizontal4(s, pitch, blimit0, limit0, thresh0, bd); + HighBdLpfHorizontal4(s.Slice(8), pitch, blimit1, limit1, thresh1, bd); + } + + public static void HighBdLpfVertical4( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (int i = 0; i < 8; ++i) + { + ushort p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + ushort q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + HighBdFilter4(mask, thresh, ref s[-2], ref s[-1], ref s[0], ref s[1], bd); + s = s.Slice(pitch); + } + } + + public static void HighBdLpfVertical4Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1, + int bd) + { + HighBdLpfVertical4(s, pitch, blimit0, limit0, thresh0, bd); + HighBdLpfVertical4(s.Slice(8 * pitch), pitch, blimit1, limit1, thresh1, bd); + } + + private static void HighBdFilter8( + sbyte mask, + byte thresh, + bool flat, + ref ushort op3, + ref ushort op2, + ref ushort op1, + ref ushort op0, + ref ushort oq0, + ref ushort oq1, + ref ushort oq2, + ref ushort oq3, + int bd) + { + if (flat && mask != 0) + { + ushort p3 = op3, p2 = op2, p1 = op1, p0 = op0; + ushort q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + op2 = (ushort)BitUtils.RoundPowerOfTwo(p3 + p3 + p3 + (2 * p2) + p1 + p0 + q0, 3); + op1 = (ushort)BitUtils.RoundPowerOfTwo(p3 + p3 + p2 + (2 * p1) + p0 + q0 + q1, 3); + op0 = (ushort)BitUtils.RoundPowerOfTwo(p3 + p2 + p1 + (2 * p0) + q0 + q1 + q2, 3); + oq0 = (ushort)BitUtils.RoundPowerOfTwo(p2 + p1 + p0 + (2 * q0) + q1 + q2 + q3, 3); + oq1 = (ushort)BitUtils.RoundPowerOfTwo(p1 + p0 + q0 + (2 * q1) + q2 + q3 + q3, 3); + oq2 = (ushort)BitUtils.RoundPowerOfTwo(p0 + q0 + q1 + (2 * q2) + q3 + q3 + q3, 3); + } + else + { + HighBdFilter4(mask, thresh, ref op1, ref op0, ref oq0, ref oq1, bd); + } + } + + public static void HighBdLpfHorizontal8( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (int i = 0; i < 8; ++i) + { + ushort p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch]; + ushort q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch]; + + sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + sbyte flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + HighBdFilter8( + mask, + thresh, + flat != 0, + ref s[-4 * pitch], + ref s[-3 * pitch], + ref s[-2 * pitch], + ref s[-1 * pitch], + ref s[0], + ref s[1 * pitch], + ref s[2 * pitch], + ref s[3 * pitch], + bd); + s = s.Slice(1); + } + } + + public static void HighBdLpfHorizontal8Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1, + int bd) + { + HighBdLpfHorizontal8(s, pitch, blimit0, limit0, thresh0, bd); + HighBdLpfHorizontal8(s.Slice(8), pitch, blimit1, limit1, thresh1, bd); + } + + public static void HighBdLpfVertical8( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + for (int i = 0; i < 8; ++i) + { + ushort p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + ushort q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + sbyte flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + HighBdFilter8( + mask, + thresh, + flat != 0, + ref s[-4], + ref s[-3], + ref s[-2], + ref s[-1], + ref s[0], + ref s[1], + ref s[2], + ref s[3], + bd); + s = s.Slice(pitch); + } + } + + public static void HighBdLpfVertical8Dual( + ArrayPtr s, + int pitch, + byte blimit0, + byte limit0, + byte thresh0, + byte blimit1, + byte limit1, + byte thresh1, + int bd) + { + HighBdLpfVertical8(s, pitch, blimit0, limit0, thresh0, bd); + HighBdLpfVertical8(s.Slice(8 * pitch), pitch, blimit1, limit1, thresh1, bd); + } + + private static void HighBdFilter16( + sbyte mask, + byte thresh, + bool flat, + bool flat2, + ref ushort op7, + ref ushort op6, + ref 
ushort op5, + ref ushort op4, + ref ushort op3, + ref ushort op2, + ref ushort op1, + ref ushort op0, + ref ushort oq0, + ref ushort oq1, + ref ushort oq2, + ref ushort oq3, + ref ushort oq4, + ref ushort oq5, + ref ushort oq6, + ref ushort oq7, + int bd) + { + if (flat2 && flat && mask != 0) + { + ushort p7 = op7; + ushort p6 = op6; + ushort p5 = op5; + ushort p4 = op4; + ushort p3 = op3; + ushort p2 = op2; + ushort p1 = op1; + ushort p0 = op0; + ushort q0 = oq0; + ushort q1 = oq1; + ushort q2 = oq2; + ushort q3 = oq3; + ushort q4 = oq4; + ushort q5 = oq5; + ushort q6 = oq6; + ushort q7 = oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + op6 = (ushort)BitUtils.RoundPowerOfTwo( + (p7 * 7) + (p6 * 2) + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + op5 = (ushort)BitUtils.RoundPowerOfTwo( + (p7 * 6) + p6 + (p5 * 2) + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + op4 = (ushort)BitUtils.RoundPowerOfTwo( + (p7 * 5) + p6 + p5 + (p4 * 2) + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + op3 = (ushort)BitUtils.RoundPowerOfTwo( + (p7 * 4) + p6 + p5 + p4 + (p3 * 2) + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + op2 = (ushort)BitUtils.RoundPowerOfTwo( + (p7 * 3) + p6 + p5 + p4 + p3 + (p2 * 2) + p1 + p0 + q0 + q1 + q2 + q3 + q4, 4); + op1 = (ushort)BitUtils.RoundPowerOfTwo( + (p7 * 2) + p6 + p5 + p4 + p3 + p2 + (p1 * 2) + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); + op0 = (ushort)BitUtils.RoundPowerOfTwo( + p7 + p6 + p5 + p4 + p3 + p2 + p1 + (p0 * 2) + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); + oq0 = (ushort)BitUtils.RoundPowerOfTwo( + p6 + p5 + p4 + p3 + p2 + p1 + p0 + (q0 * 2) + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); + oq1 = (ushort)BitUtils.RoundPowerOfTwo( + p5 + p4 + p3 + p2 + p1 + p0 + q0 + (q1 * 2) + q2 + q3 + q4 + q5 + q6 + (q7 * 2), 4); + oq2 = (ushort)BitUtils.RoundPowerOfTwo( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + (q2 * 2) + q3 + q4 + q5 + q6 + (q7 * 3), 4); + oq3 = (ushort)BitUtils.RoundPowerOfTwo( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + (q3 * 2) + q4 + q5 + q6 + (q7 * 4), 4); + 
oq4 = (ushort)BitUtils.RoundPowerOfTwo( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + (q4 * 2) + q5 + q6 + (q7 * 5), 4); + oq5 = (ushort)BitUtils.RoundPowerOfTwo( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + (q5 * 2) + q6 + (q7 * 6), 4); + oq6 = (ushort)BitUtils.RoundPowerOfTwo( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + (q6 * 2) + (q7 * 7), 4); + } + else + { + HighBdFilter8(mask, thresh, flat, ref op3, ref op2, ref op1, ref op0, ref oq0, ref oq1, ref oq2, + ref oq3, bd); + } + } + /* Runs HighBdFilter16 on 8 * count consecutive columns of a horizontal edge. For every column it reads the four samples on each side of the edge (rows -4..3, one row = pitch elements), derives mask and flat from them and flat2 from rows -8..-5 / 4..7 together with p0 and q0, filters rows -8..7 in place through refs, then advances s by one element. */ + private static void HighBdMbLpfHorizontalEdgeW( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int count, + int bd) + { + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. /* NOTE(review): the wording above mentions chars / 8 bit SIMD, yet this routine works on ushort samples; it looks carried over from another code path - confirm. */ + for (int i = 0; i < 8 * count; ++i) + { + ushort p3 = s[-4 * pitch]; + ushort p2 = s[-3 * pitch]; + ushort p1 = s[-2 * pitch]; + ushort p0 = s[-pitch]; + ushort q0 = s[0 * pitch]; + ushort q1 = s[1 * pitch]; + ushort q2 = s[2 * pitch]; + ushort q3 = s[3 * pitch]; + sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + sbyte flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + sbyte flat2 = HighBdFlatMask5( + 1, + s[-8 * pitch], + s[-7 * pitch], + s[-6 * pitch], + s[-5 * pitch], + p0, + q0, + s[4 * pitch], + s[5 * pitch], + s[6 * pitch], + s[7 * pitch], + bd); + + HighBdFilter16( + mask, + thresh, + flat != 0, + flat2 != 0, + ref s[-8 * pitch], + ref s[-7 * pitch], + ref s[-6 * pitch], + ref s[-5 * pitch], + ref s[-4 * pitch], + ref s[-3 * pitch], + ref s[-2 * pitch], + ref s[-1 * pitch], + ref s[0], + ref s[1 * pitch], + ref s[2 * pitch], + ref s[3 * pitch], + ref s[4 * pitch], + ref s[5 * pitch], + ref s[6 * pitch], + ref s[7 * pitch], + bd); + s = s.Slice(1); /* step to the next column of the edge */ + } + } + /* Wide filter over a horizontal edge, single unit: forwards to HighBdMbLpfHorizontalEdgeW with count = 1 (8 columns). */ + public static void HighBdLpfHorizontal16( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + HighBdMbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, 1, bd); + } + /* Wide filter over a horizontal edge, dual unit: forwards to HighBdMbLpfHorizontalEdgeW with count = 2 (16 columns). */ + public static void 
HighBdLpfHorizontal16Dual( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + HighBdMbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, 2, bd); + } + /* Runs HighBdFilter16 on count consecutive rows of a vertical edge. For every row it reads the four samples on each side of the edge (s[-4]..s[3]), derives mask and flat from them and flat2 from s[-8]..s[-5] / s[4]..s[7] together with p0 and q0, filters s[-8]..s[7] in place through refs, then advances s by pitch elements to the next row. Note that, unlike the horizontal variant, count is the row count itself (no factor of 8). */ + private static void HighBdMbLpfVerticalEdgeW( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int count, + int bd) + { + for (int i = 0; i < count; ++i) + { + ushort p3 = s[-4]; + ushort p2 = s[-3]; + ushort p1 = s[-2]; + ushort p0 = s[-1]; + ushort q0 = s[0]; + ushort q1 = s[1]; + ushort q2 = s[2]; + ushort q3 = s[3]; + sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + sbyte flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + sbyte flat2 = HighBdFlatMask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7], bd); + + HighBdFilter16( + mask, + thresh, + flat != 0, + flat2 != 0, + ref s[-8], + ref s[-7], + ref s[-6], + ref s[-5], + ref s[-4], + ref s[-3], + ref s[-2], + ref s[-1], + ref s[0], + ref s[1], + ref s[2], + ref s[3], + ref s[4], + ref s[5], + ref s[6], + ref s[7], + bd); + s = s.Slice(pitch); /* step to the next row of the edge */ + } + } + /* Wide filter over a vertical edge, 8 rows: forwards to HighBdMbLpfVerticalEdgeW with count = 8. */ + public static void HighBdLpfVertical16( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + HighBdMbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, 8, bd); + } + /* Wide filter over a vertical edge, 16 rows: forwards to HighBdMbLpfVerticalEdgeW with count = 16. */ + public static void HighBdLpfVertical16Dual( + ArrayPtr s, + int pitch, + byte blimit, + byte limit, + byte thresh, + int bd) + { + HighBdMbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, 16, bd); + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterSse2.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterSse2.cs new file mode 100644 index 000000000..cb4575100 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterSse2.cs @@ -0,0 +1,1837 @@ +using Ryujinx.Common.Memory; +using System; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp +{ /* Loop filter routines for byte samples written with Sse / Sse2 intrinsics. */ + internal static class LoopFilterSse2 
+ { /* AbsDiff: per-lane absolute difference, built as the OR of the two saturating subtractions a - b and b - a (with unsigned saturation one of the two is always zero; NOTE(review): lane type is not visible here, callers pass byte vectors - confirm). */ + private static Vector128 AbsDiff(Vector128 a, Vector128 b) + { + return Sse2.Or(Sse2.SubtractSaturate(a, b), Sse2.SubtractSaturate(b, a)); + } + /* Computes the two control masks used by Filter4. Inputs are pairs of rows packed into one vector each (in the visible callers a name "xY" means row x sits in the upper 64 bits and row Y in the lower 64 bits). limitV is expected to carry blimit in its lower 64 bits and limit in its upper 64 bits, threshV the threshold bytes widened to 16 bits - that is how the visible callers build them. Outputs: hev (set where max(|p1 - p0|, |q1 - q0|) exceeds the threshold) and mask (set where every limit check passes). */ + private static void FilterHevMask( + Vector128 q1P1, + Vector128 q0P0, + Vector128 p3P2, + Vector128 p2P1, + Vector128 p1P0, + Vector128 q3Q2, + Vector128 q2Q1, + Vector128 q1Q0, + Vector128 limitV, + Vector128 threshV, + out Vector128 hev, + out Vector128 mask) + { + /* abs(q1 - q0), abs(p1 - p0) */ + Vector128 flat = AbsDiff(q1P1, q0P0); + /* abs(p1 - q1), abs(p0 - q0) */ + Vector128 absP1Q1P0Q0 = AbsDiff(p1P0, q1Q0); + Vector128 absP0Q0, absP1Q1, work; + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + hev = Sse2.UnpackLow(Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8)), Vector128.Zero); /* fold both halves, then widen bytes to 16 bits for the compare */ + hev = Sse2.CompareGreaterThan(hev.AsInt16(), threshV.AsInt16()).AsByte(); + hev = Sse2.PackSignedSaturate(hev.AsInt16(), hev.AsInt16()).AsByte(); /* back to bytes, same result in both halves */ + + /* const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); */ + absP0Q0 = Sse2.AddSaturate(absP1Q1P0Q0, absP1Q1P0Q0); /* abs(p0 - q0) * 2 */ + absP1Q1 = Sse2.UnpackHigh(absP1Q1P0Q0, absP1Q1P0Q0); /* abs(p1 - q1) */ + absP1Q1 = Sse2.ShiftRightLogical(absP1Q1.AsInt16(), 9).AsByte(); + absP1Q1 = Sse2.PackSignedSaturate(absP1Q1.AsInt16(), absP1Q1.AsInt16()).AsByte(); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = Sse2.AddSaturate(absP0Q0, absP1Q1); + /* abs(p3 - p2), abs(p2 - p1) */ + work = AbsDiff(p3P2, p2P1); + flat = Sse2.Max(work, flat); + /* abs(q3 - q2), abs(q2 - q1) */ + work = AbsDiff(q3Q2, q2Q1); + flat = Sse2.Max(work, flat); + flat = Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8)); + mask = Sse2.UnpackLow(mask.AsInt64(), flat.AsInt64()).AsByte(); /* lower 64 bits: blimit test value, upper 64 bits: limit test value */ + mask = Sse2.SubtractSaturate(mask, limitV); + mask = Sse2.CompareEqual(mask, Vector128.Zero); /* 0xff where the value did not exceed its limit */ + mask = Sse2.And(mask, Sse2.ShiftRightLogical128BitLane(mask, 8)); /* both tests must pass */ + } + /* Applies the 4-tap filter to the packed p1/p0 and q1/q0 rows, steered by the hev and mask vectors from FilterHevMask. ff is used as -1 (subtracting it adds 1); the visible callers pass an all-ones vector. On return ps1Ps0 and qs1Qs0 hold the filtered rows in the same packing as the inputs. */ + private static void Filter4( + Vector128 p1P0, + Vector128 
q1Q0, + Vector128 hev, + Vector128 mask, + Vector128 ff, + out Vector128 ps1Ps0, + out Vector128 qs1Qs0) + { + Vector128 t3T4 = Vector128.Create( + 4, 4, 4, 4, + 4, 4, 4, 4, + 3, 3, 3, 3, + 3, 3, 3, (byte)3); /* bytes 0-7 add 4 (filter1), bytes 8-15 add 3 (filter2) */ + Vector128 t80 = Vector128.Create((byte)0x80); + Vector128 filter, filter2Filter1, work; + + ps1Ps0 = Sse2.Xor(p1P0, t80); /* ^ 0x80 */ + qs1Qs0 = Sse2.Xor(q1Q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = Sse2.SubtractSaturate(ps1Ps0.AsSByte(), qs1Qs0.AsSByte()).AsByte(); + filter = Sse2.And(Sse2.ShiftRightLogical128BitLane(work, 8), hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = Sse2.SubtractSaturate(filter.AsSByte(), work.AsSByte()).AsByte(); + filter = Sse2.SubtractSaturate(filter.AsSByte(), work.AsSByte()).AsByte(); + filter = Sse2.SubtractSaturate(filter.AsSByte(), work.AsSByte()).AsByte(); /* + 3 * (qs0 - ps0) */ + filter = Sse2.And(filter, mask); /* & mask */ + filter = Sse2.UnpackLow(filter.AsInt64(), filter.AsInt64()).AsByte(); /* duplicate the lower 64 bits so both offsets in t3T4 apply to the same value */ + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2Filter1 = Sse2.AddSaturate(filter.AsSByte(), t3T4.AsSByte()).AsByte(); /* signed_char_clamp */ + filter = Sse2.UnpackHigh(filter2Filter1, filter2Filter1); + filter2Filter1 = Sse2.UnpackLow(filter2Filter1, filter2Filter1); + filter2Filter1 = Sse2.ShiftRightArithmetic(filter2Filter1.AsInt16(), 11).AsByte(); /* >> 3 */ + filter = Sse2.ShiftRightArithmetic(filter.AsInt16(), 11).AsByte(); /* >> 3 */ + filter2Filter1 = Sse2.PackSignedSaturate(filter2Filter1.AsInt16(), filter.AsInt16()).AsByte(); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = Sse2.SubtractSaturate(filter2Filter1.AsSByte(), ff.AsSByte()).AsByte(); /* + 1 */ + filter = Sse2.UnpackLow(filter, filter); + filter = Sse2.ShiftRightArithmetic(filter.AsInt16(), 9).AsByte(); /* round */ + filter = Sse2.PackSignedSaturate(filter.AsInt16(), filter.AsInt16()).AsByte(); 
+ filter = Sse2.AndNot(hev, filter); + + hev = Sse2.UnpackHigh(filter2Filter1.AsInt64(), filter.AsInt64()).AsByte(); /* hev is reused as scratch: lower 64 bits = filter2, upper 64 bits = filter */ + filter2Filter1 = Sse2.UnpackLow(filter2Filter1.AsInt64(), filter.AsInt64()).AsByte(); /* lower 64 bits = filter1, upper 64 bits = filter */ + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1Qs0 = Sse2.SubtractSaturate(qs1Qs0.AsSByte(), filter2Filter1.AsSByte()).AsByte(); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1Ps0 = Sse2.AddSaturate(ps1Ps0.AsSByte(), hev.AsSByte()).AsByte(); + qs1Qs0 = Sse2.Xor(qs1Qs0, t80); /* ^ 0x80 */ + ps1Ps0 = Sse2.Xor(ps1Ps0, t80); /* ^ 0x80 */ + } + /* 4-tap filter across a horizontal edge, 8 columns wide. Reads 8 bytes from each of rows -4..3 relative to s (one row = pitch bytes) and the first 8 bytes of blimit, limit and thresh; writes 8 bytes back to rows -2..1. */ + public static unsafe void LpfHorizontal4( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + Vector128 zero = Vector128.Zero; + Vector128 limitV, threshV; + + fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh) + { + limitV = Sse2.UnpackLow( + Sse2.LoadScalarVector128((long*)pBLimit), + Sse2.LoadScalarVector128((long*)pLimit)).AsByte(); /* lower 64 bits = blimit, upper 64 bits = limit */ + threshV = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)pThresh).AsByte(), zero); /* threshold bytes widened to 16 bits */ + } + + Vector128 ff = Sse2.CompareEqual(zero, zero); /* all ones */ + Vector128 q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, ps1Ps0, qs1Qs0; + Vector128 mask, hev; + + p3P2 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))), + Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch)))).AsByte(); + q1P1 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch)))).AsByte(); + q0P0 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (0 * pitch)))).AsByte(); + q3Q2 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch))), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch)))).AsByte(); + p1P0 = Sse2.UnpackLow(q0P0.AsInt64(), q1P1.AsInt64()).AsByte(); + p2P1 = 
Sse2.UnpackLow(q1P1.AsInt64(), p3P2.AsInt64()).AsByte(); + q1Q0 = Sse2.UnpackHigh(q0P0.AsInt64(), q1P1.AsInt64()).AsByte(); + q2Q1 = Sse2.UnpackLow(Sse2.ShiftRightLogical128BitLane(q1P1, 8).AsInt64(), q3Q2.AsInt64()).AsByte(); + + FilterHevMask(q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, limitV, threshV, out hev, out mask); + Filter4(p1P0, q1Q0, hev, mask, ff, out ps1Ps0, out qs1Qs0); + + Sse.StoreHigh((float*)(s.ToPointer() - (2 * pitch)), ps1Ps0.AsSingle()); // *op1 + Sse2.StoreScalar((long*)(s.ToPointer() - (1 * pitch)), ps1Ps0.AsInt64()); // *op0 + Sse2.StoreScalar((long*)(s.ToPointer() + (0 * pitch)), qs1Qs0.AsInt64()); // *oq0 + Sse.StoreHigh((float*)(s.ToPointer() + (1 * pitch)), qs1Qs0.AsSingle()); // *oq1 + } + /* 4-tap filter across a vertical edge, 8 rows tall. Reads 8 bytes starting at s - 4 from each of rows 0..7, transposes them so the shared FilterHevMask / Filter4 helpers can be used, transposes the result back and writes 4 bytes starting at s - 2 to each of the 8 rows. Only the first 8 bytes of blimit, limit and thresh are read. */ + public static unsafe void LpfVertical4( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + Vector128 zero = Vector128.Zero; + Vector128 limitV, threshV; + + fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh) + { + limitV = Sse2.UnpackLow( + Sse2.LoadScalarVector128((long*)pBLimit).AsInt64(), + Sse2.LoadScalarVector128((long*)pLimit).AsInt64()).AsByte(); + threshV = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)pThresh).AsByte(), zero); + } + + Vector128 ff = Sse2.CompareEqual(zero, zero); + Vector128 x0, x1, x2, x3; + Vector128 q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, ps1Ps0, qs1Qs0; + Vector128 mask, hev; + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + q1Q0 = Sse2.UnpackLow( + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (0 * pitch) - 4)).AsByte(), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch) - 4)).AsByte()); + + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = Sse2.UnpackLow( + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch) - 4)).AsByte(), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch) - 4)).AsByte()); + + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = Sse2.UnpackLow( + 
Sse2.LoadScalarVector128((long*)(s.ToPointer() + (4 * pitch) - 4)).AsByte(), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (5 * pitch) - 4)).AsByte()); + + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = Sse2.UnpackLow( + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (6 * pitch) - 4)).AsByte(), + Sse2.LoadScalarVector128((long*)(s.ToPointer() + (7 * pitch) - 4)).AsByte()); + + // Transpose 8x8 + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + p1P0 = Sse2.UnpackLow(q1Q0.AsInt16(), x1.AsInt16()).AsByte(); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x0 = Sse2.UnpackLow(x2.AsInt16(), x3.AsInt16()).AsByte(); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + p3P2 = Sse2.UnpackLow(p1P0.AsInt32(), x0.AsInt32()).AsByte(); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + p1P0 = Sse2.UnpackHigh(p1P0.AsInt32(), x0.AsInt32()).AsByte(); + p3P2 = Sse2.UnpackHigh(p3P2.AsInt64(), Sse2.ShiftLeftLogical128BitLane(p3P2, 8).AsInt64()) + .AsByte(); // swap lo and high + p1P0 = Sse2.UnpackHigh(p1P0.AsInt64(), Sse2.ShiftLeftLogical128BitLane(p1P0, 8).AsInt64()) + .AsByte(); // swap lo and high + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + q1Q0 = Sse2.UnpackHigh(q1Q0.AsInt16(), x1.AsInt16()).AsByte(); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x2 = Sse2.UnpackHigh(x2.AsInt16(), x3.AsInt16()).AsByte(); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + q3Q2 = Sse2.UnpackHigh(q1Q0.AsInt32(), x2.AsInt32()).AsByte(); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + q1Q0 = Sse2.UnpackLow(q1Q0.AsInt32(), x2.AsInt32()).AsByte(); + + q0P0 = Sse2.UnpackLow(p1P0.AsInt64(), q1Q0.AsInt64()).AsByte(); + q1P1 = Sse2.UnpackHigh(p1P0.AsInt64(), q1Q0.AsInt64()).AsByte(); + p1P0 = Sse2.UnpackLow(q0P0.AsInt64(), q1P1.AsInt64()).AsByte(); + p2P1 = Sse2.UnpackLow(q1P1.AsInt64(), p3P2.AsInt64()).AsByte(); + q2Q1 = Sse2.UnpackLow(Sse2.ShiftRightLogical128BitLane(q1P1, 8).AsInt64(), q3Q2.AsInt64()).AsByte(); + + FilterHevMask(q1P1, 
q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, limitV, threshV, out hev, out mask); + Filter4(p1P0, q1Q0, hev, mask, ff, out ps1Ps0, out qs1Qs0); + + // Transpose 8x4 to 4x8 + // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + ps1Ps0 = Sse2.UnpackHigh(ps1Ps0.AsInt64(), Sse2.ShiftLeftLogical128BitLane(ps1Ps0, 8).AsInt64()).AsByte(); + // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 + x0 = Sse2.UnpackHigh(ps1Ps0, qs1Qs0); + // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 + ps1Ps0 = Sse2.UnpackLow(ps1Ps0, qs1Qs0); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + qs1Qs0 = Sse2.UnpackHigh(ps1Ps0, x0); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ps1Ps0 = Sse2.UnpackLow(ps1Ps0, x0); + /* Each 32-bit element now holds the four filtered bytes of one row; rows 0..3 come from ps1Ps0 and rows 4..7 from qs1Qs0, shifting the vector right by 4 bytes between stores. */ + *(int*)(s.ToPointer() + (0 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0); + ps1Ps0 = Sse2.ShiftRightLogical128BitLane(ps1Ps0, 4); + *(int*)(s.ToPointer() + (1 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0); + ps1Ps0 = Sse2.ShiftRightLogical128BitLane(ps1Ps0, 4); + *(int*)(s.ToPointer() + (2 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0); + ps1Ps0 = Sse2.ShiftRightLogical128BitLane(ps1Ps0, 4); + *(int*)(s.ToPointer() + (3 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0); + + *(int*)(s.ToPointer() + (4 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0); + qs1Qs0 = Sse2.ShiftRightLogical128BitLane(qs1Qs0, 4); + *(int*)(s.ToPointer() + (5 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0); + qs1Qs0 = Sse2.ShiftRightLogical128BitLane(qs1Qs0, 4); + *(int*)(s.ToPointer() + (6 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0); + qs1Qs0 = Sse2.ShiftRightLogical128BitLane(qs1Qs0, 4); + *(int*)(s.ToPointer() + (7 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0); + } + /* Wide filter across a horizontal edge, 8 columns wide. Reads 8 bytes from each of rows -8..7 relative to s and 16 bytes from each of blimit, limit and thresh; computes the 4-tap result plus the flat and wide-flat averages and blends them per column according to the mask / flat / flat2 masks; writes 8 bytes back to rows -7..6. */ + public static unsafe void LpfHorizontal16( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + Vector128 zero = Vector128.Zero; + Vector128 one = 
Vector128.Create((byte)1); + Vector128 blimitV, limitV, threshV; + + fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh) + { + blimitV = Sse2.LoadVector128(pBLimit); + limitV = Sse2.LoadVector128(pLimit); + threshV = Sse2.LoadVector128(pThresh); + } + + Vector128 mask, hev, flat, flat2; + Vector128 q7P7, q6P6, q5P5, q4P4, q3P3, q2P2, q1P1, q0P0, p0Q0, p1Q1; + Vector128 absP1P0; + /* Every qNPN vector holds row pN in its lower 8 bytes (scalar load) and row qN in its upper 8 bytes (LoadHigh). */ + q4P4 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (5 * pitch))).AsByte(); + q4P4 = Sse.LoadHigh(q4P4.AsSingle(), (float*)(s.ToPointer() + (4 * pitch))).AsByte(); + q3P3 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch))).AsByte(); + q3P3 = Sse.LoadHigh(q3P3.AsSingle(), (float*)(s.ToPointer() + (3 * pitch))).AsByte(); + q2P2 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))).AsByte(); + q2P2 = Sse.LoadHigh(q2P2.AsSingle(), (float*)(s.ToPointer() + (2 * pitch))).AsByte(); + q1P1 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))).AsByte(); + q1P1 = Sse.LoadHigh(q1P1.AsSingle(), (float*)(s.ToPointer() + (1 * pitch))).AsByte(); + p1Q1 = Sse2.Shuffle(q1P1.AsUInt32(), 78).AsByte(); /* control 78 swaps the two 64-bit halves */ + q0P0 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))).AsByte(); + q0P0 = Sse.LoadHigh(q0P0.AsSingle(), (float*)(s.ToPointer() - (0 * pitch))).AsByte(); + p0Q0 = Sse2.Shuffle(q0P0.AsUInt32(), 78).AsByte(); + + { + Vector128 absP1Q1, absP0Q0, absQ1Q0, fe, ff, work; + absP1P0 = AbsDiff(q1P1, q0P0); + absQ1Q0 = Sse2.ShiftRightLogical128BitLane(absP1P0, 8); + fe = Vector128.Create((byte)0xfe); + ff = Sse2.CompareEqual(absP1P0, absP1P0); + absP0Q0 = AbsDiff(q0P0, p0Q0); + absP1Q1 = AbsDiff(q1P1, p1Q1); + flat = Sse2.Max(absP1P0, absQ1Q0); + hev = Sse2.SubtractSaturate(flat, threshV); + hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff); + + absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0); + absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte(); /* clearing bit 0 first keeps the 16-bit shift from leaking a bit into the neighbouring byte */ + mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimitV); + mask 
= Sse2.Xor(Sse2.CompareEqual(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = Sse2.Max(absP1P0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = Sse2.Max(AbsDiff(q2P2, q1P1), AbsDiff(q3P3, q2P2)); + mask = Sse2.Max(work, mask); + mask = Sse2.Max(mask, Sse2.ShiftRightLogical128BitLane(mask, 8)); + mask = Sse2.SubtractSaturate(mask, limitV); + mask = Sse2.CompareEqual(mask, zero); + } + + // lp filter + { + Vector128 t4 = Vector128.Create((byte)4); + Vector128 t3 = Vector128.Create((byte)3); + Vector128 t80 = Vector128.Create((byte)0x80); + Vector128 t1 = Vector128.Create((ushort)0x1); + Vector128 qs1Ps1 = Sse2.Xor(q1P1, t80); + Vector128 qs0Ps0 = Sse2.Xor(q0P0, t80); + Vector128 qs0 = Sse2.Xor(p0Q0, t80); + Vector128 qs1 = Sse2.Xor(p1Q1, t80); + Vector128 filt; + Vector128 workA; + Vector128 filter1, filter2; + Vector128 flat2Q6P6, flat2Q5P5, flat2Q4P4, flat2Q3P3, flat2Q2P2; + Vector128 flat2Q1P1, flat2Q0P0, flatQ2P2, flatQ1P1, flatQ0P0; + + filt = Sse2.And(Sse2.SubtractSaturate(qs1Ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev); + workA = Sse2.SubtractSaturate(qs0.AsSByte(), qs0Ps0.AsSByte()).AsByte(); + filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte(); + filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte(); + filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte(); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = Sse2.And(filt, mask); + + filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte(); + filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte(); + + filter1 = Sse2.UnpackLow(zero, filter1); + filter1 = Sse2.ShiftRightArithmetic(filter1.AsInt16(), 0xB).AsByte(); + filter2 = Sse2.UnpackLow(zero, filter2); + filter2 = Sse2.ShiftRightArithmetic(filter2.AsInt16(), 0xB).AsByte(); + + // Filter1 >> 3 + filt = Sse2.PackSignedSaturate(filter2.AsInt16(), + Sse2.SubtractSaturate(zero.AsInt16(), 
filter1.AsInt16())).AsByte(); + qs0Ps0 = Sse2.Xor(Sse2.AddSaturate(qs0Ps0.AsSByte(), filt.AsSByte()).AsByte(), t80); + + // filt >> 1 + filt = Sse2.AddSaturate(filter1.AsInt16(), t1.AsInt16()).AsByte(); + filt = Sse2.ShiftRightArithmetic(filt.AsInt16(), 1).AsByte(); + filt = Sse2.AndNot(Sse2.ShiftRightArithmetic(Sse2.UnpackLow(zero, hev).AsInt16(), 0x8), filt.AsInt16()) + .AsByte(); + filt = Sse2.PackSignedSaturate(filt.AsInt16(), Sse2.SubtractSaturate(zero.AsInt16(), filt.AsInt16())) + .AsByte(); + qs1Ps1 = Sse2.Xor(Sse2.AddSaturate(qs1Ps1.AsSByte(), filt.AsSByte()).AsByte(), t80); + // loopfilter done + + { + Vector128 work; + flat = Sse2.Max(AbsDiff(q2P2, q0P0), AbsDiff(q3P3, q0P0)); + flat = Sse2.Max(absP1P0, flat); + flat = Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8)); + flat = Sse2.SubtractSaturate(flat, one); + flat = Sse2.CompareEqual(flat, zero); + flat = Sse2.And(flat, mask); + + q5P5 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (6 * pitch))).AsByte(); + q5P5 = Sse.LoadHigh(q5P5.AsSingle(), (float*)(s.ToPointer() + (5 * pitch))).AsByte(); + + q6P6 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (7 * pitch))).AsByte(); + q6P6 = Sse.LoadHigh(q6P6.AsSingle(), (float*)(s.ToPointer() + (6 * pitch))).AsByte(); + flat2 = Sse2.Max(AbsDiff(q4P4, q0P0), AbsDiff(q5P5, q0P0)); + + q7P7 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (8 * pitch))).AsByte(); + q7P7 = Sse.LoadHigh(q7P7.AsSingle(), (float*)(s.ToPointer() + (7 * pitch))).AsByte(); + work = Sse2.Max(AbsDiff(q6P6, q0P0), AbsDiff(q7P7, q0P0)); + flat2 = Sse2.Max(work, flat2); + flat2 = Sse2.Max(flat2, Sse2.ShiftRightLogical128BitLane(flat2, 8)); + flat2 = Sse2.SubtractSaturate(flat2, one); + flat2 = Sse2.CompareEqual(flat2, zero); + flat2 = Sse2.And(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + Vector128 eight = Vector128.Create((short)8); + Vector128 four = Vector128.Create((short)4); + 
Vector128 p716, p616, p516, p416, p316, p216, p116, p016; + Vector128 q716, q616, q516, q416, q316, q216, q116, q016; + Vector128 pixelFilterP, pixelFilterQ; + Vector128 pixetFilterP2P1P0, pixetFilterQ2Q1Q0; + Vector128 sumP7, sumQ7, sumP3, sumQ3, resP, resQ; + /* Widen every row to 16-bit lanes: the lower 8 bytes give the p row, the upper 8 bytes the q row. */ + p716 = Sse2.UnpackLow(q7P7, zero).AsInt16(); + p616 = Sse2.UnpackLow(q6P6, zero).AsInt16(); + p516 = Sse2.UnpackLow(q5P5, zero).AsInt16(); + p416 = Sse2.UnpackLow(q4P4, zero).AsInt16(); + p316 = Sse2.UnpackLow(q3P3, zero).AsInt16(); + p216 = Sse2.UnpackLow(q2P2, zero).AsInt16(); + p116 = Sse2.UnpackLow(q1P1, zero).AsInt16(); + p016 = Sse2.UnpackLow(q0P0, zero).AsInt16(); + q016 = Sse2.UnpackHigh(q0P0, zero).AsInt16(); + q116 = Sse2.UnpackHigh(q1P1, zero).AsInt16(); + q216 = Sse2.UnpackHigh(q2P2, zero).AsInt16(); + q316 = Sse2.UnpackHigh(q3P3, zero).AsInt16(); + q416 = Sse2.UnpackHigh(q4P4, zero).AsInt16(); + q516 = Sse2.UnpackHigh(q5P5, zero).AsInt16(); + q616 = Sse2.UnpackHigh(q6P6, zero).AsInt16(); + q716 = Sse2.UnpackHigh(q7P7, zero).AsInt16(); + + pixelFilterP = Sse2.Add(Sse2.Add(p616, p516), Sse2.Add(p416, p316)); + pixelFilterQ = Sse2.Add(Sse2.Add(q616, q516), Sse2.Add(q416, q316)); + + pixetFilterP2P1P0 = Sse2.Add(p016, Sse2.Add(p216, p116)); + pixelFilterP = Sse2.Add(pixelFilterP, pixetFilterP2P1P0); + + pixetFilterQ2Q1Q0 = Sse2.Add(q016, Sse2.Add(q216, q116)); + pixelFilterQ = Sse2.Add(pixelFilterQ, pixetFilterQ2Q1Q0); + pixelFilterP = Sse2.Add(eight, Sse2.Add(pixelFilterP, pixelFilterQ)); /* running sum for the >> 4 results, rounding term 8 included */ + pixetFilterP2P1P0 = Sse2.Add(four, Sse2.Add(pixetFilterP2P1P0, pixetFilterQ2Q1Q0)); /* running sum for the >> 3 results, rounding term 4 included */ + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(p716, p016)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(q716, q016)), 4); + flat2Q0P0 = Sse2.PackUnsignedSaturate(resP, resQ); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(p316, p016)), 3); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(q316, q016)), 3); + + flatQ0P0 = 
Sse2.PackUnsignedSaturate(resP, resQ); + + sumP7 = Sse2.Add(p716, p716); + sumQ7 = Sse2.Add(q716, q716); + sumP3 = Sse2.Add(p316, p316); + sumQ3 = Sse2.Add(q316, q316); + /* From here on each step slides the window one tap outwards: drop the farthest sample of the opposite side from the running sum and add one more copy of p7 / q7 (or p3 / q3 for the >> 3 results). */ + pixelFilterQ = Sse2.Subtract(pixelFilterP, p616); + pixelFilterP = Sse2.Subtract(pixelFilterP, q616); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p116)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q116)), 4); + flat2Q1P1 = Sse2.PackUnsignedSaturate(resP, resQ); + + pixetFilterQ2Q1Q0 = Sse2.Subtract(pixetFilterP2P1P0, p216); + pixetFilterP2P1P0 = Sse2.Subtract(pixetFilterP2P1P0, q216); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(sumP3, p116)), 3); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterQ2Q1Q0, Sse2.Add(sumQ3, q116)), 3); + flatQ1P1 = Sse2.PackUnsignedSaturate(resP, resQ); + + sumP7 = Sse2.Add(sumP7, p716); + sumQ7 = Sse2.Add(sumQ7, q716); + sumP3 = Sse2.Add(sumP3, p316); + sumQ3 = Sse2.Add(sumQ3, q316); + + pixelFilterP = Sse2.Subtract(pixelFilterP, q516); + pixelFilterQ = Sse2.Subtract(pixelFilterQ, p516); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p216)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q216)), 4); + flat2Q2P2 = Sse2.PackUnsignedSaturate(resP, resQ); + + pixetFilterP2P1P0 = Sse2.Subtract(pixetFilterP2P1P0, q116); + pixetFilterQ2Q1Q0 = Sse2.Subtract(pixetFilterQ2Q1Q0, p116); + + resP = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(sumP3, p216)), 3); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterQ2Q1Q0, Sse2.Add(sumQ3, q216)), 3); + flatQ2P2 = Sse2.PackUnsignedSaturate(resP, resQ); + + sumP7 = Sse2.Add(sumP7, p716); + sumQ7 = Sse2.Add(sumQ7, q716); + pixelFilterP = Sse2.Subtract(pixelFilterP, q416); + pixelFilterQ = Sse2.Subtract(pixelFilterQ, p416); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p316)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q316)), 
4); + flat2Q3P3 = Sse2.PackUnsignedSaturate(resP, resQ); + + sumP7 = Sse2.Add(sumP7, p716); + sumQ7 = Sse2.Add(sumQ7, q716); + pixelFilterP = Sse2.Subtract(pixelFilterP, q316); + pixelFilterQ = Sse2.Subtract(pixelFilterQ, p316); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p416)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q416)), 4); + flat2Q4P4 = Sse2.PackUnsignedSaturate(resP, resQ); + + sumP7 = Sse2.Add(sumP7, p716); + sumQ7 = Sse2.Add(sumQ7, q716); + pixelFilterP = Sse2.Subtract(pixelFilterP, q216); + pixelFilterQ = Sse2.Subtract(pixelFilterQ, p216); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p516)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q516)), 4); + flat2Q5P5 = Sse2.PackUnsignedSaturate(resP, resQ); + + sumP7 = Sse2.Add(sumP7, p716); + sumQ7 = Sse2.Add(sumQ7, q716); + pixelFilterP = Sse2.Subtract(pixelFilterP, q116); + pixelFilterQ = Sse2.Subtract(pixelFilterQ, p116); + resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p616)), 4); + resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q616)), 4); + flat2Q6P6 = Sse2.PackUnsignedSaturate(resP, resQ); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + /* Control 68 copies the lower 64 bits into both halves, so the same per-column mask selects the p row and the q row. */ + flat = Sse2.Shuffle(flat.AsInt32(), 68).AsByte(); + flat2 = Sse2.Shuffle(flat2.AsInt32(), 68).AsByte(); + /* Blend order: 4-tap (or untouched) result where flat is clear, flat result where flat is set, then the wide-flat result wherever flat2 is set. */ + q2P2 = Sse2.AndNot(flat, q2P2); + flatQ2P2 = Sse2.And(flat, flatQ2P2); + q2P2 = Sse2.Or(q2P2, flatQ2P2); + + qs1Ps1 = Sse2.AndNot(flat, qs1Ps1); + flatQ1P1 = Sse2.And(flat, flatQ1P1); + q1P1 = Sse2.Or(qs1Ps1, flatQ1P1); + + qs0Ps0 = Sse2.AndNot(flat, qs0Ps0); + flatQ0P0 = Sse2.And(flat, flatQ0P0); + q0P0 = Sse2.Or(qs0Ps0, flatQ0P0); + + q6P6 = Sse2.AndNot(flat2, q6P6); + flat2Q6P6 = Sse2.And(flat2, flat2Q6P6); + q6P6 = Sse2.Or(q6P6, flat2Q6P6); + Sse2.StoreScalar((long*)(s.ToPointer() - (7 * pitch)), q6P6.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() + (6 * pitch)), 
q6P6.AsSingle()); + + q5P5 = Sse2.AndNot(flat2, q5P5); + flat2Q5P5 = Sse2.And(flat2, flat2Q5P5); + q5P5 = Sse2.Or(q5P5, flat2Q5P5); + Sse2.StoreScalar((long*)(s.ToPointer() - (6 * pitch)), q5P5.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() + (5 * pitch)), q5P5.AsSingle()); + + q4P4 = Sse2.AndNot(flat2, q4P4); + flat2Q4P4 = Sse2.And(flat2, flat2Q4P4); + q4P4 = Sse2.Or(q4P4, flat2Q4P4); + Sse2.StoreScalar((long*)(s.ToPointer() - (5 * pitch)), q4P4.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() + (4 * pitch)), q4P4.AsSingle()); + + q3P3 = Sse2.AndNot(flat2, q3P3); + flat2Q3P3 = Sse2.And(flat2, flat2Q3P3); + q3P3 = Sse2.Or(q3P3, flat2Q3P3); + Sse2.StoreScalar((long*)(s.ToPointer() - (4 * pitch)), q3P3.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() + (3 * pitch)), q3P3.AsSingle()); + + q2P2 = Sse2.AndNot(flat2, q2P2); + flat2Q2P2 = Sse2.And(flat2, flat2Q2P2); + q2P2 = Sse2.Or(q2P2, flat2Q2P2); + Sse2.StoreScalar((long*)(s.ToPointer() - (3 * pitch)), q2P2.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() + (2 * pitch)), q2P2.AsSingle()); + + q1P1 = Sse2.AndNot(flat2, q1P1); + flat2Q1P1 = Sse2.And(flat2, flat2Q1P1); + q1P1 = Sse2.Or(q1P1, flat2Q1P1); + Sse2.StoreScalar((long*)(s.ToPointer() - (2 * pitch)), q1P1.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() + (1 * pitch)), q1P1.AsSingle()); + + q0P0 = Sse2.AndNot(flat2, q0P0); + flat2Q0P0 = Sse2.And(flat2, flat2Q0P0); + q0P0 = Sse2.Or(q0P0, flat2Q0P0); + Sse2.StoreScalar((long*)(s.ToPointer() - (1 * pitch)), q0P0.AsInt64()); + Sse.StoreHigh((float*)(s.ToPointer() - (0 * pitch)), q0P0.AsSingle()); + } + } + /* Sliding-window update for a running sum: returns total + a1 + a2 - (s1 + s2), lane by lane. */ + private static Vector128 FilterAdd2Sub2( + Vector128 total, + Vector128 a1, + Vector128 a2, + Vector128 s1, + Vector128 s2) + { + Vector128 x = Sse2.Add(a1, total); + x = Sse2.Add(Sse2.Subtract(x, Sse2.Add(s1, s2)), a2); + return x; + } + /* Finishes an 8-tap average and blends it in: shifts the two 16-bit sums right by 3, packs them to bytes with unsigned saturation, and returns that value where flat is set and otherFilt elsewhere. */ + private static Vector128 Filter8Mask( + Vector128 flat, + Vector128 otherFilt, + Vector128 f8Lo, + Vector128 f8Hi) + { + Vector128 f8 = + 
Sse2.PackUnsignedSaturate(Sse2.ShiftRightLogical(f8Lo, 3), Sse2.ShiftRightLogical(f8Hi, 3)); + Vector128 result = Sse2.And(flat, f8); + return Sse2.Or(Sse2.AndNot(flat, otherFilt), result); /* bitwise select: f8 where flat is set, otherFilt elsewhere */ + } + /* Same as Filter8Mask but for the 16-tap average: shifts the two 16-bit sums right by 4, packs them to bytes with unsigned saturation, and returns that value where flat is set and otherFilt elsewhere. */ + private static Vector128 Filter16Mask( + Vector128 flat, + Vector128 otherFilt, + Vector128 fLo, + Vector128 fHi) + { + Vector128 f = + Sse2.PackUnsignedSaturate(Sse2.ShiftRightLogical(fLo, 4), Sse2.ShiftRightLogical(fHi, 4)); + Vector128 result = Sse2.And(flat, f); + return Sse2.Or(Sse2.AndNot(flat, otherFilt), result); /* bitwise select: f where flat is set, otherFilt elsewhere */ + } + + public static unsafe void LpfHorizontal16Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + Vector128 zero = Vector128.Zero; + Vector128 one = Vector128.Create((byte)1); + Vector128 blimitV, limitV, threshV; + + fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh) + { + blimitV = Sse2.LoadVector128(pBLimit); + limitV = Sse2.LoadVector128(pLimit); + threshV = Sse2.LoadVector128(pThresh); + } + + Vector128 mask, hev, flat, flat2; + Vector128 p7, p6, p5; + Vector128 p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + Vector128 q5, q6, q7; + + Vector128 op2, op1, op0, oq0, oq1, oq2; + + Vector128 maxAbsP1P0Q1Q0; + + p7 = Sse2.LoadVector128(s.ToPointer() - (8 * pitch)); + p6 = Sse2.LoadVector128(s.ToPointer() - (7 * pitch)); + p5 = Sse2.LoadVector128(s.ToPointer() - (6 * pitch)); + p4 = Sse2.LoadVector128(s.ToPointer() - (5 * pitch)); + p3 = Sse2.LoadVector128(s.ToPointer() - (4 * pitch)); + p2 = Sse2.LoadVector128(s.ToPointer() - (3 * pitch)); + p1 = Sse2.LoadVector128(s.ToPointer() - (2 * pitch)); + p0 = Sse2.LoadVector128(s.ToPointer() - (1 * pitch)); + q0 = Sse2.LoadVector128(s.ToPointer() - (0 * pitch)); + q1 = Sse2.LoadVector128(s.ToPointer() + (1 * pitch)); + q2 = Sse2.LoadVector128(s.ToPointer() + (2 * pitch)); + q3 = Sse2.LoadVector128(s.ToPointer() + (3 * pitch)); + q4 = Sse2.LoadVector128(s.ToPointer() + (4 * pitch)); + q5 = Sse2.LoadVector128(s.ToPointer() + (5 * pitch)); + q6 = 
Sse2.LoadVector128(s.ToPointer() + (6 * pitch)); + q7 = Sse2.LoadVector128(s.ToPointer() + (7 * pitch)); + + { + Vector128 absP1P0 = AbsDiff(p1, p0); + Vector128 absQ1Q0 = AbsDiff(q1, q0); + Vector128 fe = Vector128.Create((byte)0xfe); + Vector128 ff = Sse2.CompareEqual(zero, zero); + Vector128 absP0Q0 = AbsDiff(p0, q0); + Vector128 absP1Q1 = AbsDiff(p1, q1); + Vector128 work; + maxAbsP1P0Q1Q0 = Sse2.Max(absP1P0, absQ1Q0); + + absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0); + absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte(); + mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimitV); + mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = Sse2.Max(maxAbsP1P0Q1Q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = Sse2.Max(AbsDiff(p2, p1), AbsDiff(p3, p2)); + mask = Sse2.Max(work, mask); + work = Sse2.Max(AbsDiff(q2, q1), AbsDiff(q3, q2)); + mask = Sse2.Max(work, mask); + mask = Sse2.SubtractSaturate(mask, limitV); + mask = Sse2.CompareEqual(mask, zero); + } + + { + Vector128 work; + work = Sse2.Max(AbsDiff(p2, p0), AbsDiff(q2, q0)); + flat = Sse2.Max(work, maxAbsP1P0Q1Q0); + work = Sse2.Max(AbsDiff(p3, p0), AbsDiff(q3, q0)); + flat = Sse2.Max(work, flat); + work = Sse2.Max(AbsDiff(p4, p0), AbsDiff(q4, q0)); + flat = Sse2.SubtractSaturate(flat, one); + flat = Sse2.CompareEqual(flat, zero); + flat = Sse2.And(flat, mask); + flat2 = Sse2.Max(AbsDiff(p5, p0), AbsDiff(q5, q0)); + flat2 = Sse2.Max(work, flat2); + work = Sse2.Max(AbsDiff(p6, p0), AbsDiff(q6, q0)); + flat2 = Sse2.Max(work, flat2); + work = Sse2.Max(AbsDiff(p7, p0), AbsDiff(q7, q0)); + flat2 = Sse2.Max(work, flat2); + flat2 = Sse2.SubtractSaturate(flat2, one); + flat2 = Sse2.CompareEqual(flat2, zero); + flat2 = Sse2.And(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + Vector128 
t4 = Vector128.Create((byte)4); + Vector128 t3 = Vector128.Create((byte)3); + Vector128 t80 = Vector128.Create((byte)0x80); + Vector128 te0 = Vector128.Create((byte)0xe0); + Vector128 t1F = Vector128.Create((byte)0x1f); + Vector128 t1 = Vector128.Create((byte)0x1); + Vector128 t7F = Vector128.Create((byte)0x7f); + Vector128 ff = Sse2.CompareEqual(t4, t4); + + Vector128 filt; + Vector128 workA; + Vector128 filter1, filter2; + + op1 = Sse2.Xor(p1, t80); + op0 = Sse2.Xor(p0, t80); + oq0 = Sse2.Xor(q0, t80); + oq1 = Sse2.Xor(q1, t80); + + hev = Sse2.SubtractSaturate(maxAbsP1P0Q1Q0, threshV); + hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff); + filt = Sse2.And(Sse2.SubtractSaturate(op1.AsSByte(), oq1.AsSByte()).AsByte(), hev); + + workA = Sse2.SubtractSaturate(oq0.AsSByte(), op0.AsSByte()).AsByte(); + filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte(); + filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte(); + filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte(); + // (vpx_filter + 3 * (qs0 - ps0)) & mask + filt = Sse2.And(filt, mask); + filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte(); + filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte(); + + // Filter1 >> 3 + workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter1.AsSByte()).AsByte(); + filter1 = Sse2.ShiftRightLogical(filter1.AsInt16(), 3).AsByte(); + workA = Sse2.And(workA, te0); + filter1 = Sse2.And(filter1, t1F); + filter1 = Sse2.Or(filter1, workA); + oq0 = Sse2.Xor(Sse2.SubtractSaturate(oq0.AsSByte(), filter1.AsSByte()).AsByte(), t80); + + // Filter2 >> 3 + workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter2.AsSByte()).AsByte(); + filter2 = Sse2.ShiftRightLogical(filter2.AsInt16(), 3).AsByte(); + workA = Sse2.And(workA, te0); + filter2 = Sse2.And(filter2, t1F); + filter2 = Sse2.Or(filter2, workA); + op0 = Sse2.Xor(Sse2.AddSaturate(op0.AsSByte(), filter2.AsSByte()).AsByte(), t80); + + // filt >> 1 + filt = 
Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte(); + workA = Sse2.CompareGreaterThan(zero.AsSByte(), filt.AsSByte()).AsByte(); + filt = Sse2.ShiftRightLogical(filt.AsInt16(), 1).AsByte(); + workA = Sse2.And(workA, t80); + filt = Sse2.And(filt, t7F); + filt = Sse2.Or(filt, workA); + filt = Sse2.AndNot(hev, filt); + op1 = Sse2.Xor(Sse2.AddSaturate(op1.AsSByte(), filt.AsSByte()).AsByte(), t80); + oq1 = Sse2.Xor(Sse2.SubtractSaturate(oq1.AsSByte(), filt.AsSByte()).AsByte(), t80); + // loopfilter done + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + { + Vector128 four = Vector128.Create((short)4); + Vector128 p3Lo = Sse2.UnpackLow(p3, zero).AsInt16(); + Vector128 p2Lo = Sse2.UnpackLow(p2, zero).AsInt16(); + Vector128 p1Lo = Sse2.UnpackLow(p1, zero).AsInt16(); + Vector128 p0Lo = Sse2.UnpackLow(p0, zero).AsInt16(); + Vector128 q0Lo = Sse2.UnpackLow(q0, zero).AsInt16(); + Vector128 q1Lo = Sse2.UnpackLow(q1, zero).AsInt16(); + Vector128 q2Lo = Sse2.UnpackLow(q2, zero).AsInt16(); + Vector128 q3Lo = Sse2.UnpackLow(q3, zero).AsInt16(); + + Vector128 p3Hi = Sse2.UnpackHigh(p3, zero).AsInt16(); + Vector128 p2Hi = Sse2.UnpackHigh(p2, zero).AsInt16(); + Vector128 p1Hi = Sse2.UnpackHigh(p1, zero).AsInt16(); + Vector128 p0Hi = Sse2.UnpackHigh(p0, zero).AsInt16(); + Vector128 q0Hi = Sse2.UnpackHigh(q0, zero).AsInt16(); + Vector128 q1Hi = Sse2.UnpackHigh(q1, zero).AsInt16(); + Vector128 q2Hi = Sse2.UnpackHigh(q2, zero).AsInt16(); + Vector128 q3Hi = Sse2.UnpackHigh(q3, zero).AsInt16(); + Vector128 f8Lo, f8Hi; + + f8Lo = Sse2.Add(Sse2.Add(p3Lo, four), Sse2.Add(p3Lo, p2Lo)); + f8Lo = Sse2.Add(Sse2.Add(p3Lo, f8Lo), Sse2.Add(p2Lo, p1Lo)); + f8Lo = Sse2.Add(Sse2.Add(p0Lo, q0Lo), f8Lo); + + f8Hi = Sse2.Add(Sse2.Add(p3Hi, four), Sse2.Add(p3Hi, p2Hi)); + f8Hi = Sse2.Add(Sse2.Add(p3Hi, f8Hi), Sse2.Add(p2Hi, p1Hi)); + f8Hi = Sse2.Add(Sse2.Add(p0Hi, q0Hi), f8Hi); + + op2 = Filter8Mask(flat, p2, f8Lo, f8Hi); + + f8Lo = FilterAdd2Sub2(f8Lo, q1Lo, p1Lo, p2Lo, p3Lo); + 
f8Hi = FilterAdd2Sub2(f8Hi, q1Hi, p1Hi, p2Hi, p3Hi); + op1 = Filter8Mask(flat, op1, f8Lo, f8Hi); + + f8Lo = FilterAdd2Sub2(f8Lo, q2Lo, p0Lo, p1Lo, p3Lo); + f8Hi = FilterAdd2Sub2(f8Hi, q2Hi, p0Hi, p1Hi, p3Hi); + op0 = Filter8Mask(flat, op0, f8Lo, f8Hi); + + f8Lo = FilterAdd2Sub2(f8Lo, q3Lo, q0Lo, p0Lo, p3Lo); + f8Hi = FilterAdd2Sub2(f8Hi, q3Hi, q0Hi, p0Hi, p3Hi); + oq0 = Filter8Mask(flat, oq0, f8Lo, f8Hi); + + f8Lo = FilterAdd2Sub2(f8Lo, q3Lo, q1Lo, q0Lo, p2Lo); + f8Hi = FilterAdd2Sub2(f8Hi, q3Hi, q1Hi, q0Hi, p2Hi); + oq1 = Filter8Mask(flat, oq1, f8Lo, f8Hi); + + f8Lo = FilterAdd2Sub2(f8Lo, q3Lo, q2Lo, q1Lo, p1Lo); + f8Hi = FilterAdd2Sub2(f8Hi, q3Hi, q2Hi, q1Hi, p1Hi); + oq2 = Filter8Mask(flat, q2, f8Lo, f8Hi); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + { + Vector128 eight = Vector128.Create((short)8); + Vector128 p7Lo = Sse2.UnpackLow(p7, zero).AsInt16(); + Vector128 p6Lo = Sse2.UnpackLow(p6, zero).AsInt16(); + Vector128 p5Lo = Sse2.UnpackLow(p5, zero).AsInt16(); + Vector128 p4Lo = Sse2.UnpackLow(p4, zero).AsInt16(); + Vector128 p3Lo = Sse2.UnpackLow(p3, zero).AsInt16(); + Vector128 p2Lo = Sse2.UnpackLow(p2, zero).AsInt16(); + Vector128 p1Lo = Sse2.UnpackLow(p1, zero).AsInt16(); + Vector128 p0Lo = Sse2.UnpackLow(p0, zero).AsInt16(); + Vector128 q0Lo = Sse2.UnpackLow(q0, zero).AsInt16(); + Vector128 q1Lo = Sse2.UnpackLow(q1, zero).AsInt16(); + Vector128 q2Lo = Sse2.UnpackLow(q2, zero).AsInt16(); + Vector128 q3Lo = Sse2.UnpackLow(q3, zero).AsInt16(); + Vector128 q4Lo = Sse2.UnpackLow(q4, zero).AsInt16(); + Vector128 q5Lo = Sse2.UnpackLow(q5, zero).AsInt16(); + Vector128 q6Lo = Sse2.UnpackLow(q6, zero).AsInt16(); + Vector128 q7Lo = Sse2.UnpackLow(q7, zero).AsInt16(); + + Vector128 p7Hi = Sse2.UnpackHigh(p7, zero).AsInt16(); + Vector128 p6Hi = Sse2.UnpackHigh(p6, zero).AsInt16(); + Vector128 p5Hi = Sse2.UnpackHigh(p5, zero).AsInt16(); + Vector128 p4Hi = Sse2.UnpackHigh(p4, zero).AsInt16(); + Vector128 p3Hi = 
Sse2.UnpackHigh(p3, zero).AsInt16(); + Vector128 p2Hi = Sse2.UnpackHigh(p2, zero).AsInt16(); + Vector128 p1Hi = Sse2.UnpackHigh(p1, zero).AsInt16(); + Vector128 p0Hi = Sse2.UnpackHigh(p0, zero).AsInt16(); + Vector128 q0Hi = Sse2.UnpackHigh(q0, zero).AsInt16(); + Vector128 q1Hi = Sse2.UnpackHigh(q1, zero).AsInt16(); + Vector128 q2Hi = Sse2.UnpackHigh(q2, zero).AsInt16(); + Vector128 q3Hi = Sse2.UnpackHigh(q3, zero).AsInt16(); + Vector128 q4Hi = Sse2.UnpackHigh(q4, zero).AsInt16(); + Vector128 q5Hi = Sse2.UnpackHigh(q5, zero).AsInt16(); + Vector128 q6Hi = Sse2.UnpackHigh(q6, zero).AsInt16(); + Vector128 q7Hi = Sse2.UnpackHigh(q7, zero).AsInt16(); + + Vector128 fLo; + Vector128 fHi; + + fLo = Sse2.Subtract(Sse2.ShiftLeftLogical(p7Lo, 3), p7Lo); // p7 * 7 + fLo = Sse2.Add(Sse2.ShiftLeftLogical(p6Lo, 1), Sse2.Add(p4Lo, fLo)); + fLo = Sse2.Add(Sse2.Add(p3Lo, fLo), Sse2.Add(p2Lo, p1Lo)); + fLo = Sse2.Add(Sse2.Add(p0Lo, q0Lo), fLo); + fLo = Sse2.Add(Sse2.Add(p5Lo, eight), fLo); + + fHi = Sse2.Subtract(Sse2.ShiftLeftLogical(p7Hi, 3), p7Hi); // p7 * 7 + fHi = Sse2.Add(Sse2.ShiftLeftLogical(p6Hi, 1), Sse2.Add(p4Hi, fHi)); + fHi = Sse2.Add(Sse2.Add(p3Hi, fHi), Sse2.Add(p2Hi, p1Hi)); + fHi = Sse2.Add(Sse2.Add(p0Hi, q0Hi), fHi); + fHi = Sse2.Add(Sse2.Add(p5Hi, eight), fHi); + + p6 = Filter16Mask(flat2, p6, fLo, fHi); + Sse2.Store(s.ToPointer() - (7 * pitch), p6); + + fLo = FilterAdd2Sub2(fLo, q1Lo, p5Lo, p6Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q1Hi, p5Hi, p6Hi, p7Hi); + p5 = Filter16Mask(flat2, p5, fLo, fHi); + Sse2.Store(s.ToPointer() - (6 * pitch), p5); + + fLo = FilterAdd2Sub2(fLo, q2Lo, p4Lo, p5Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q2Hi, p4Hi, p5Hi, p7Hi); + p4 = Filter16Mask(flat2, p4, fLo, fHi); + Sse2.Store(s.ToPointer() - (5 * pitch), p4); + + fLo = FilterAdd2Sub2(fLo, q3Lo, p3Lo, p4Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q3Hi, p3Hi, p4Hi, p7Hi); + p3 = Filter16Mask(flat2, p3, fLo, fHi); + Sse2.Store(s.ToPointer() - (4 * pitch), p3); + + fLo = FilterAdd2Sub2(fLo, q4Lo, 
p2Lo, p3Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q4Hi, p2Hi, p3Hi, p7Hi); + op2 = Filter16Mask(flat2, op2, fLo, fHi); + Sse2.Store(s.ToPointer() - (3 * pitch), op2); + + fLo = FilterAdd2Sub2(fLo, q5Lo, p1Lo, p2Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q5Hi, p1Hi, p2Hi, p7Hi); + op1 = Filter16Mask(flat2, op1, fLo, fHi); + Sse2.Store(s.ToPointer() - (2 * pitch), op1); + + fLo = FilterAdd2Sub2(fLo, q6Lo, p0Lo, p1Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q6Hi, p0Hi, p1Hi, p7Hi); + op0 = Filter16Mask(flat2, op0, fLo, fHi); + Sse2.Store(s.ToPointer() - (1 * pitch), op0); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q0Lo, p0Lo, p7Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q0Hi, p0Hi, p7Hi); + oq0 = Filter16Mask(flat2, oq0, fLo, fHi); + Sse2.Store(s.ToPointer() - (0 * pitch), oq0); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q1Lo, p6Lo, q0Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q1Hi, p6Hi, q0Hi); + oq1 = Filter16Mask(flat2, oq1, fLo, fHi); + Sse2.Store(s.ToPointer() + (1 * pitch), oq1); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q2Lo, p5Lo, q1Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q2Hi, p5Hi, q1Hi); + oq2 = Filter16Mask(flat2, oq2, fLo, fHi); + Sse2.Store(s.ToPointer() + (2 * pitch), oq2); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q3Lo, p4Lo, q2Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q3Hi, p4Hi, q2Hi); + q3 = Filter16Mask(flat2, q3, fLo, fHi); + Sse2.Store(s.ToPointer() + (3 * pitch), q3); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q4Lo, p3Lo, q3Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q4Hi, p3Hi, q3Hi); + q4 = Filter16Mask(flat2, q4, fLo, fHi); + Sse2.Store(s.ToPointer() + (4 * pitch), q4); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q5Lo, p2Lo, q4Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q5Hi, p2Hi, q4Hi); + q5 = Filter16Mask(flat2, q5, fLo, fHi); + Sse2.Store(s.ToPointer() + (5 * pitch), q5); + + fLo = FilterAdd2Sub2(fLo, q7Lo, q6Lo, p1Lo, q5Lo); + fHi = FilterAdd2Sub2(fHi, q7Hi, q6Hi, p1Hi, q5Hi); + q6 = Filter16Mask(flat2, q6, fLo, fHi); + Sse2.Store(s.ToPointer() + (6 * pitch), q6); + } + // wide flat + // 
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            }
        }

        /// <summary>
        /// Filters 8 pixel columns across a horizontal edge.
        /// Reads 8 bytes from each of the rows <c>s - 4 * pitch</c> .. <c>s + 3 * pitch</c> and writes
        /// 8 bytes back to each of the rows <c>s - 3 * pitch</c> .. <c>s + 2 * pitch</c>.
        /// For every column the stored value is either the smoothed value (rounded 8-sample sum, shifted
        /// right by 3) where the flat mask is set, or the result of the short filter otherwise.
        /// </summary>
        /// <param name="s">Pointer to the q0 row; the p rows are addressed at negative multiples of <paramref name="pitch"/>.</param>
        /// <param name="pitch">Offset between two consecutive rows.</param>
        /// <param name="blimit">Threshold compared against <c>abs(p0 - q0) * 2 + abs(p1 - q1) / 2</c>. Loaded as a full 16-byte vector.</param>
        /// <param name="limit">Threshold compared against the neighbouring-row differences. Loaded as a full 16-byte vector.</param>
        /// <param name="thresh">Threshold used to build the hev mask. Loaded as a full 16-byte vector.</param>
        /// <remarks>
        /// NOTE(review): each threshold span is read with a 16-byte load, so it presumably must expose at
        /// least 16 readable bytes - confirm against the callers.
        /// </remarks>
        public static unsafe void LpfHorizontal8(
            ArrayPtr<byte> s,
            int pitch,
            ReadOnlySpan<byte> blimit,
            ReadOnlySpan<byte> limit,
            ReadOnlySpan<byte> thresh)
        {
            // Scratch storage for the smoothed outputs; only the low 8 bytes of each are written and read.
            Vector128<byte> flatOp2;
            Vector128<byte> flatOp1;
            Vector128<byte> flatOp0;
            Vector128<byte> flatOq2;
            Vector128<byte> flatOq1;
            Vector128<byte> flatOq0;
            Vector128<byte> zero = Vector128<byte>.Zero;
            Vector128<byte> blimitV, limitV, threshV;

            fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh)
            {
                blimitV = Sse2.LoadVector128(pBLimit);
                limitV = Sse2.LoadVector128(pLimit);
                threshV = Sse2.LoadVector128(pThresh);
            }

            Vector128<byte> mask, hev, flat;
            Vector128<byte> p3, p2, p1, p0, q0, q1, q2, q3;
            Vector128<byte> q3P3, q2P2, q1P1, q0P0, p1Q1, p0Q0;

            // Pack each p row (low 8 bytes) together with its mirrored q row (high 8 bytes).
            q3P3 = Sse2.UnpackLow(
                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch))),
                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch)))).AsByte();
            q2P2 = Sse2.UnpackLow(
                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))),
                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch)))).AsByte();
            q1P1 = Sse2.UnpackLow(
                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))),
                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch)))).AsByte();
            q0P0 = Sse2.UnpackLow(
                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))),
                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (0 * pitch)))).AsByte();
            // Shuffle control 78 swaps the two 64-bit halves.
            p1Q1 = Sse2.Shuffle(q1P1.AsInt32(), 78).AsByte();
            p0Q0 = Sse2.Shuffle(q0P0.AsInt32(), 78).AsByte();

            {
                // filter_mask and hev_mask
                Vector128<byte> one = Vector128.Create((byte)1);
                Vector128<byte> fe = Vector128.Create((byte)0xfe);
                Vector128<byte> ff = Sse2.CompareEqual(fe, fe);
                Vector128<byte> absP1Q1, absP0Q0, absQ1Q0, absP1P0, work;
                absP1P0 = AbsDiff(q1P1, q0P0);
                absQ1Q0 = Sse2.ShiftRightLogical128BitLane(absP1P0, 8);

                absP0Q0 = AbsDiff(q0P0, p0Q0);
                absP1Q1 = AbsDiff(q1P1, p1Q1);
                flat = Sse2.Max(absP1P0, absQ1Q0);
                hev = Sse2.SubtractSaturate(flat, threshV);
                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);

                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimitV);
                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
                mask = Sse2.Max(absP1P0, mask);
                // mask |= (abs(p1 - p0) > limit) * -1;
                // mask |= (abs(q1 - q0) > limit) * -1;

                work = Sse2.Max(AbsDiff(q2P2, q1P1), AbsDiff(q3P3, q2P2));
                mask = Sse2.Max(work, mask);
                // Fold the q half (high 8 bytes) onto the p half so the low 8 bytes cover both sides.
                mask = Sse2.Max(mask, Sse2.ShiftRightLogical128BitLane(mask, 8));
                mask = Sse2.SubtractSaturate(mask, limitV);
                mask = Sse2.CompareEqual(mask, zero);

                // flat_mask4

                flat = Sse2.Max(AbsDiff(q2P2, q0P0), AbsDiff(q3P3, q0P0));
                flat = Sse2.Max(absP1P0, flat);
                flat = Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8));
                flat = Sse2.SubtractSaturate(flat, one);
                flat = Sse2.CompareEqual(flat, zero);
                flat = Sse2.And(flat, mask);
            }

            {
                Vector128<short> four = Vector128.Create((short)4);
                {
                    Vector128<short> workpA, workpB, workpShft;
                    // Widen each row to 16-bit lanes (interleave with zero) so the sums cannot overflow.
                    p3 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch))).AsByte(), zero);
                    p2 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))).AsByte(), zero);
                    p1 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))).AsByte(), zero);
                    p0 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))).AsByte(), zero);
                    q0 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (0 * pitch))).AsByte(), zero);
                    q1 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch))).AsByte(), zero);
                    q2 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch))).AsByte(), zero);
                    q3 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch))).AsByte(), zero);

                    // The running sums workpA/workpB are updated incrementally from one output to the next,
                    // so the order of the following groups must not change.
                    workpA = Sse2.Add(Sse2.Add(p3.AsInt16(), p3.AsInt16()), Sse2.Add(p2.AsInt16(), p1.AsInt16()));
                    workpA = Sse2.Add(Sse2.Add(workpA, four), p0.AsInt16());
                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), p2.AsInt16()), p3.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOp2, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), q1.AsInt16()), p1.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOp1, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q2.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, p1.AsInt16()), p0.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOp0, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q3.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, p0.AsInt16()), q0.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOq0, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p2.AsInt16()), q3.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, q0.AsInt16()), q1.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOq1, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p1.AsInt16()), q3.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, q1.AsInt16()), q2.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOq2, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
                }
            }
            // lp filter
            {
                Vector128<byte> t4 = Vector128.Create((byte)4);
                Vector128<byte> t3 = Vector128.Create((byte)3);
                Vector128<byte> t80 = Vector128.Create((byte)0x80);
                Vector128<byte> t1 = Vector128.Create((byte)0x1);
                // XOR with 0x80 maps the unsigned pixels to signed bytes for the saturating signed arithmetic below.
                Vector128<byte> ps1 =
                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))).AsByte(),
                        t80);
                Vector128<byte> ps0 =
                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))).AsByte(),
                        t80);
                Vector128<byte> qs0 =
                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() + (0 * pitch))).AsByte(),
                        t80);
                Vector128<byte> qs1 =
                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch))).AsByte(),
                        t80);
                Vector128<byte> filt;
                Vector128<byte> workA;
                Vector128<byte> filter1, filter2;

                filt = Sse2.And(Sse2.SubtractSaturate(ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
                workA = Sse2.SubtractSaturate(qs0.AsSByte(), ps0.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                // (vpx_filter + 3 * (qs0 - ps0)) & mask
                filt = Sse2.And(filt, mask);

                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();

                // Filter1 >> 3
                // Each byte is placed in the high half of a 16-bit lane, so an arithmetic shift by 8 + 3
                // yields the sign-extended byte shifted right by 3.
                filter1 = Sse2.UnpackLow(zero, filter1);
                filter1 = Sse2.ShiftRightArithmetic(filter1.AsInt16(), 11).AsByte();
                filter1 = Sse2.PackSignedSaturate(filter1.AsInt16(), filter1.AsInt16()).AsByte();

                // Filter2 >> 3
                filter2 = Sse2.UnpackLow(zero, filter2);
                filter2 = Sse2.ShiftRightArithmetic(filter2.AsInt16(), 11).AsByte();
                filter2 = Sse2.PackSignedSaturate(filter2.AsInt16(), zero.AsInt16()).AsByte();

                // filt >> 1
                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
                filt = Sse2.UnpackLow(zero, filt);
                filt = Sse2.ShiftRightArithmetic(filt.AsInt16(), 9).AsByte();
                filt = Sse2.PackSignedSaturate(filt.AsInt16(), zero.AsInt16()).AsByte();

                filt = Sse2.AndNot(hev, filt);

                // For every output row: keep the short-filter result where flat is clear and the smoothed
                // value where flat is set.
                workA = Sse2.Xor(Sse2.SubtractSaturate(qs0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
                q0 = Sse2.LoadScalarVector128((long*)&flatOq0).AsByte();
                workA = Sse2.AndNot(flat, workA);
                q0 = Sse2.And(flat, q0);
                q0 = Sse2.Or(workA, q0);

                workA = Sse2.Xor(Sse2.SubtractSaturate(qs1.AsSByte(), filt.AsSByte()).AsByte(), t80);
                q1 = Sse2.LoadScalarVector128((long*)&flatOq1).AsByte();
                workA = Sse2.AndNot(flat, workA);
                q1 = Sse2.And(flat, q1);
                q1 = Sse2.Or(workA, q1);

                // Only the low 8 bytes are blended and stored, so load just those 8 bytes instead of a
                // full 16-byte vector; this avoids touching memory past the 8-pixel row.
                workA = Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch))).AsByte();
                q2 = Sse2.LoadScalarVector128((long*)&flatOq2).AsByte();
                workA = Sse2.AndNot(flat, workA);
                q2 = Sse2.And(flat, q2);
                q2 = Sse2.Or(workA, q2);

                workA = Sse2.Xor(Sse2.AddSaturate(ps0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
                p0 = Sse2.LoadScalarVector128((long*)&flatOp0).AsByte();
                workA = Sse2.AndNot(flat, workA);
                p0 = Sse2.And(flat, p0);
                p0 = Sse2.Or(workA, p0);

                workA = Sse2.Xor(Sse2.AddSaturate(ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);
                p1 = Sse2.LoadScalarVector128((long*)&flatOp1).AsByte();
                workA = Sse2.AndNot(flat, workA);
                p1 = Sse2.And(flat, p1);
                p1 = Sse2.Or(workA, p1);

                // Same as for the q2 row: an 8-byte load is sufficient.
                workA = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))).AsByte();
                p2 = Sse2.LoadScalarVector128((long*)&flatOp2).AsByte();
                workA = Sse2.AndNot(flat, workA);
                p2 = Sse2.And(flat, p2);
                p2 = Sse2.Or(workA, p2);

                Sse2.StoreScalar((long*)(s.ToPointer() - (3 * pitch)), p2.AsInt64());
                Sse2.StoreScalar((long*)(s.ToPointer() - (2 * pitch)), p1.AsInt64());
                Sse2.StoreScalar((long*)(s.ToPointer() - (1 * pitch)), p0.AsInt64());
                Sse2.StoreScalar((long*)(s.ToPointer() + (0 * pitch)), q0.AsInt64());
                Sse2.StoreScalar((long*)(s.ToPointer() + (1 * pitch)), q1.AsInt64());
                Sse2.StoreScalar((long*)(s.ToPointer() + (2 * pitch)), q2.AsInt64());
            }
        }

        /// <summary>
        /// Filters 16 pixel columns across a horizontal edge, using one set of thresholds for the first
        /// 8 columns and another set for the last 8 columns.
        /// Reads 16 bytes from each of the rows <c>s - 4 * pitch</c> .. <c>s + 3 * pitch</c> and writes
        /// 16 bytes back to each of the rows <c>s - 3 * pitch</c> .. <c>s + 2 * pitch</c>.
        /// </summary>
        /// <param name="s">Pointer to the q0 row; the p rows are addressed at negative multiples of <paramref name="pitch"/>.</param>
        /// <param name="pitch">Offset between two consecutive rows.</param>
        /// <param name="blimit0">Block-edge limit whose low 8 bytes are applied to columns 0-7.</param>
        /// <param name="limit0">Limit whose low 8 bytes are applied to columns 0-7.</param>
        /// <param name="thresh0">Threshold whose low 8 bytes are applied to columns 0-7.</param>
        /// <param name="blimit1">Block-edge limit whose low 8 bytes are applied to columns 8-15.</param>
        /// <param name="limit1">Limit whose low 8 bytes are applied to columns 8-15.</param>
        /// <param name="thresh1">Threshold whose low 8 bytes are applied to columns 8-15.</param>
        /// <remarks>
        /// NOTE(review): each threshold span is read with a 16-byte load even though only its low 8 bytes
        /// are kept, so it presumably must expose at least 16 readable bytes - confirm against the callers.
        /// </remarks>
        public static unsafe void LpfHorizontal8Dual(
            ArrayPtr<byte> s,
            int pitch,
            ReadOnlySpan<byte> blimit0,
            ReadOnlySpan<byte> limit0,
            ReadOnlySpan<byte> thresh0,
            ReadOnlySpan<byte> blimit1,
            ReadOnlySpan<byte> limit1,
            ReadOnlySpan<byte> thresh1)
        {
            // Scratch storage for the smoothed outputs; each half (8 bytes) is filled by one loop iteration below.
            Vector128<byte> flatOp2;
            Vector128<byte> flatOp1;
            Vector128<byte> flatOp0;
            Vector128<byte> flatOq2;
            Vector128<byte> flatOq1;
            Vector128<byte> flatOq0;
            Vector128<byte> zero = Vector128<byte>.Zero;
            Vector128<byte> blimit, limit, thresh;

            fixed (byte* pBLimit0 = blimit0, pLimit0 = limit0, pThresh0 = thresh0,
                   pBLimit1 = blimit1, pLimit1 = limit1, pThresh1 = thresh1)
            {
                // Low 64 bits come from the "0" thresholds, high 64 bits from the "1" thresholds.
                blimit = Sse2.UnpackLow(Sse2.LoadVector128(pBLimit0).AsInt64(), Sse2.LoadVector128(pBLimit1).AsInt64())
                    .AsByte();
                limit = Sse2.UnpackLow(Sse2.LoadVector128(pLimit0).AsInt64(), Sse2.LoadVector128(pLimit1).AsInt64())
                    .AsByte();
                thresh = Sse2.UnpackLow(Sse2.LoadVector128(pThresh0).AsInt64(), Sse2.LoadVector128(pThresh1).AsInt64())
                    .AsByte();
            }

            Vector128<byte> mask, hev, flat;
            Vector128<byte> p3, p2, p1, p0, q0, q1, q2, q3;

            p3 = Sse2.LoadVector128(s.ToPointer() - (4 * pitch));
            p2 = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
            p1 = Sse2.LoadVector128(s.ToPointer() - (2 * pitch));
            p0 = Sse2.LoadVector128(s.ToPointer() - (1 * pitch));
            q0 = Sse2.LoadVector128(s.ToPointer() - (0 * pitch));
            q1 = Sse2.LoadVector128(s.ToPointer() + (1 * pitch));
            q2 = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
            q3 = Sse2.LoadVector128(s.ToPointer() + (3 * pitch));
            {
                // Or of the two saturating differences gives the unsigned absolute difference.
                Vector128<byte> absP1P0 = Sse2.Or(Sse2.SubtractSaturate(p1, p0), Sse2.SubtractSaturate(p0, p1));
                Vector128<byte> absQ1Q0 = Sse2.Or(Sse2.SubtractSaturate(q1, q0), Sse2.SubtractSaturate(q0, q1));
                Vector128<byte> one = Vector128.Create((byte)1);
                Vector128<byte> fe = Vector128.Create((byte)0xfe);
                Vector128<byte> ff = Sse2.CompareEqual(absP1P0, absP1P0);
                Vector128<byte> absP0Q0 = Sse2.Or(Sse2.SubtractSaturate(p0, q0), Sse2.SubtractSaturate(q0, p0));
                Vector128<byte> absP1Q1 = Sse2.Or(Sse2.SubtractSaturate(p1, q1), Sse2.SubtractSaturate(q1, p1));
                Vector128<byte> work;

                // filter_mask and hev_mask
                flat = Sse2.Max(absP1P0, absQ1Q0);
                hev = Sse2.SubtractSaturate(flat, thresh);
                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);

                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimit);
                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
                mask = Sse2.Max(flat, mask);
                // mask |= (abs(p1 - p0) > limit) * -1;
                // mask |= (abs(q1 - q0) > limit) * -1;
                work = Sse2.Max(
                    Sse2.Or(Sse2.SubtractSaturate(p2, p1), Sse2.SubtractSaturate(p1, p2)),
                    Sse2.Or(Sse2.SubtractSaturate(p3, p2), Sse2.SubtractSaturate(p2, p3)));
                mask = Sse2.Max(work, mask);
                work = Sse2.Max(
                    Sse2.Or(Sse2.SubtractSaturate(q2, q1), Sse2.SubtractSaturate(q1, q2)),
                    Sse2.Or(Sse2.SubtractSaturate(q3, q2), Sse2.SubtractSaturate(q2, q3)));
                mask = Sse2.Max(work, mask);
                mask = Sse2.SubtractSaturate(mask, limit);
                mask = Sse2.CompareEqual(mask, zero);

                // flat_mask4
                work = Sse2.Max(
                    Sse2.Or(Sse2.SubtractSaturate(p2, p0), Sse2.SubtractSaturate(p0, p2)),
                    Sse2.Or(Sse2.SubtractSaturate(q2, q0), Sse2.SubtractSaturate(q0, q2)));
                flat = Sse2.Max(work, flat);
                work = Sse2.Max(
                    Sse2.Or(Sse2.SubtractSaturate(p3, p0), Sse2.SubtractSaturate(p0, p3)),
                    Sse2.Or(Sse2.SubtractSaturate(q3, q0), Sse2.SubtractSaturate(q0, q3)));
                flat = Sse2.Max(work, flat);
                flat = Sse2.SubtractSaturate(flat, one);
                flat = Sse2.CompareEqual(flat, zero);
                flat = Sse2.And(flat, mask);
            }
            {
                Vector128<short> four = Vector128.Create((short)4);
                ArrayPtr<byte> src = s;
                int i = 0;

                // Two passes of 8 columns each; pass i writes the i-th 8-byte half of every flatO* buffer.
                do
                {
                    Vector128<short> workpA, workpB, workpShft;
                    // Widen each row to 16-bit lanes (interleave with zero) so the sums cannot overflow.
                    p3 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (4 * pitch))).AsByte(), zero);
                    p2 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (3 * pitch))).AsByte(), zero);
                    p1 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (2 * pitch))).AsByte(), zero);
                    p0 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (1 * pitch))).AsByte(), zero);
                    q0 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (0 * pitch))).AsByte(), zero);
                    q1 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() + (1 * pitch))).AsByte(), zero);
                    q2 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() + (2 * pitch))).AsByte(), zero);
                    q3 = Sse2.UnpackLow(
                        Sse2.LoadScalarVector128((long*)(src.ToPointer() + (3 * pitch))).AsByte(), zero);

                    // The running sums workpA/workpB are updated incrementally from one output to the next,
                    // so the order of the following groups must not change.
                    workpA = Sse2.Add(Sse2.Add(p3.AsInt16(), p3.AsInt16()), Sse2.Add(p2.AsInt16(), p1.AsInt16()));
                    workpA = Sse2.Add(Sse2.Add(workpA, four), p0.AsInt16());
                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), p2.AsInt16()), p3.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOp2 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), q1.AsInt16()), p1.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOp1 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q2.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, p1.AsInt16()), p0.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOp0 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q3.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, p0.AsInt16()), q0.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOq0 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p2.AsInt16()), q3.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, q0.AsInt16()), q1.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOq1 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    workpA = Sse2.Add(Sse2.Subtract(workpA, p1.AsInt16()), q3.AsInt16());
                    workpB = Sse2.Add(Sse2.Subtract(workpB, q1.AsInt16()), q2.AsInt16());
                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
                    Sse2.StoreScalar((long*)&flatOq2 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());

                    src = src.Slice(8);
                } while (++i < 2);
            }
            // lp filter
            {
                Vector128<byte> t4 = Vector128.Create((byte)4);
                Vector128<byte> t3 = Vector128.Create((byte)3);
                Vector128<byte> t80 = Vector128.Create((byte)0x80);
                Vector128<byte> te0 = Vector128.Create((byte)0xe0);
                Vector128<byte> t1F = Vector128.Create((byte)0x1f);
                Vector128<byte> t1 = Vector128.Create((byte)0x1);
                Vector128<byte> t7F = Vector128.Create((byte)0x7f);

                // XOR with 0x80 maps the unsigned pixels to signed bytes for the saturating signed arithmetic below.
                Vector128<byte> ps1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (2 * pitch)), t80);
                Vector128<byte> ps0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (1 * pitch)), t80);
                Vector128<byte> qs0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (0 * pitch)), t80);
                Vector128<byte> qs1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (1 * pitch)), t80);
                Vector128<byte> filt;
                Vector128<byte> workA;
                Vector128<byte> filter1, filter2;

                filt = Sse2.And(Sse2.SubtractSaturate(ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
                workA = Sse2.SubtractSaturate(qs0.AsSByte(), ps0.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                // (vpx_filter + 3 * (qs0 - ps0)) & mask
                filt = Sse2.And(filt, mask);

                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();

                // Filter1 >> 3
                // Per-byte arithmetic shift emulated with a 16-bit logical shift: mask off the bits that
                // leaked in from the neighbouring byte and OR the sign bits back in for negative bytes.
                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter1.AsSByte()).AsByte();
                filter1 = Sse2.ShiftRightLogical(filter1.AsInt16(), 3).AsByte();
                workA = Sse2.And(workA, te0);
                filter1 = Sse2.And(filter1, t1F);
                filter1 = Sse2.Or(filter1, workA);

                // Filter2 >> 3
                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter2.AsSByte()).AsByte();
                filter2 = Sse2.ShiftRightLogical(filter2.AsInt16(), 3).AsByte();
                workA = Sse2.And(workA, te0);
                filter2 = Sse2.And(filter2, t1F);
                filter2 = Sse2.Or(filter2, workA);

                // filt >> 1
                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filt.AsSByte()).AsByte();
                filt = Sse2.ShiftRightLogical(filt.AsInt16(), 1).AsByte();
                workA = Sse2.And(workA, t80);
                filt = Sse2.And(filt, t7F);
                filt = Sse2.Or(filt, workA);

                filt = Sse2.AndNot(hev, filt);

                // For every output row: keep the short-filter result where flat is clear and the smoothed
                // value where flat is set.
                workA = Sse2.Xor(Sse2.SubtractSaturate(qs0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
                q0 = Sse2.LoadVector128((byte*)&flatOq0);
                workA = Sse2.AndNot(flat, workA);
                q0 = Sse2.And(flat, q0);
                q0 = Sse2.Or(workA, q0);

                workA = Sse2.Xor(Sse2.SubtractSaturate(qs1.AsSByte(), filt.AsSByte()).AsByte(), t80);
                q1 = Sse2.LoadVector128((byte*)&flatOq1);
                workA = Sse2.AndNot(flat, workA);
                q1 = Sse2.And(flat, q1);
                q1 = Sse2.Or(workA, q1);

                workA = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
                q2 = Sse2.LoadVector128((byte*)&flatOq2);
                workA = Sse2.AndNot(flat, workA);
                q2 = Sse2.And(flat, q2);
                q2 = Sse2.Or(workA, q2);

                workA = Sse2.Xor(Sse2.AddSaturate(ps0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
                p0 = Sse2.LoadVector128((byte*)&flatOp0);
                workA = Sse2.AndNot(flat, workA);
                p0 = Sse2.And(flat, p0);
                p0 = Sse2.Or(workA, p0);

                workA = Sse2.Xor(Sse2.AddSaturate(ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);
                p1 = Sse2.LoadVector128((byte*)&flatOp1);
                workA = Sse2.AndNot(flat, workA);
                p1 = Sse2.And(flat, p1);
                p1 = Sse2.Or(workA, p1);

                workA = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
                p2 = Sse2.LoadVector128((byte*)&flatOp2);
                workA = Sse2.AndNot(flat, workA);
                p2 = Sse2.And(flat, p2);
                p2 = Sse2.Or(workA, p2);

                Sse2.Store(s.ToPointer() - (3 * pitch), p2);
                Sse2.Store(s.ToPointer() - (2 * pitch), p1);
                Sse2.Store(s.ToPointer() - (1 * pitch), p0);
                Sse2.Store(s.ToPointer() + (0 * pitch), q0);
                Sse2.Store(s.ToPointer() + (1 * pitch), q1);
                Sse2.Store(s.ToPointer() + (2 * pitch), q2);
            }
        }

        /// <summary>
        /// Applies the short filter to 16 pixel columns across a horizontal edge, using one set of
        /// thresholds for the first 8 columns and another set for the last 8 columns.
        /// Reads 16 bytes from each of the rows <c>s - 4 * pitch</c> .. <c>s + 3 * pitch</c> and writes
        /// 16 bytes back to each of the rows <c>s - 2 * pitch</c> .. <c>s + 1 * pitch</c>.
        /// </summary>
        /// <param name="s">Pointer to the q0 row; the p rows are addressed at negative multiples of <paramref name="pitch"/>.</param>
        /// <param name="pitch">Offset between two consecutive rows.</param>
        /// <param name="blimit0">Block-edge limit whose low 8 bytes are applied to columns 0-7.</param>
        /// <param name="limit0">Limit whose low 8 bytes are applied to columns 0-7.</param>
        /// <param name="thresh0">Threshold whose low 8 bytes are applied to columns 0-7.</param>
        /// <param name="blimit1">Block-edge limit whose low 8 bytes are applied to columns 8-15.</param>
        /// <param name="limit1">Limit whose low 8 bytes are applied to columns 8-15.</param>
        /// <param name="thresh1">Threshold whose low 8 bytes are applied to columns 8-15.</param>
        /// <remarks>
        /// NOTE(review): each threshold span is read with a 16-byte load even though only its low 8 bytes
        /// are kept, so it presumably must expose at least 16 readable bytes - confirm against the callers.
        /// </remarks>
        public static unsafe void LpfHorizontal4Dual(
            ArrayPtr<byte> s,
            int pitch,
            ReadOnlySpan<byte> blimit0,
            ReadOnlySpan<byte> limit0,
            ReadOnlySpan<byte> thresh0,
            ReadOnlySpan<byte> blimit1,
            ReadOnlySpan<byte> limit1,
            ReadOnlySpan<byte> thresh1)
        {
            Vector128<byte> blimit, limit, thresh;

            fixed (byte* pBLimit0 = blimit0, pLimit0 = limit0, pThresh0 = thresh0,
                   pBLimit1 = blimit1, pLimit1 = limit1, pThresh1 = thresh1)
            {
                // Low 64 bits come from the "0" thresholds, high 64 bits from the "1" thresholds.
                blimit = Sse2.UnpackLow(Sse2.LoadVector128(pBLimit0).AsInt64(), Sse2.LoadVector128(pBLimit1).AsInt64())
                    .AsByte();
                limit = Sse2.UnpackLow(Sse2.LoadVector128(pLimit0).AsInt64(), Sse2.LoadVector128(pLimit1).AsInt64())
                    .AsByte();
                thresh = Sse2.UnpackLow(Sse2.LoadVector128(pThresh0).AsInt64(), Sse2.LoadVector128(pThresh1).AsInt64())
                    .AsByte();
            }

            Vector128<byte> zero = Vector128<byte>.Zero;
            Vector128<byte> p3, p2, p1, p0, q0, q1, q2, q3;
            Vector128<byte> mask, hev, flat;

            p3 = Sse2.LoadVector128(s.ToPointer() - (4 * pitch));
            p2 = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
            p1 = Sse2.LoadVector128(s.ToPointer() - (2 * pitch));
            p0 = Sse2.LoadVector128(s.ToPointer() - (1 * pitch));
            q0 = Sse2.LoadVector128(s.ToPointer() - (0 * pitch));
            q1 = Sse2.LoadVector128(s.ToPointer() + (1 * pitch));
            q2 = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
            q3 = Sse2.LoadVector128(s.ToPointer() + (3 * pitch));

            // filter_mask and hev_mask
            {
                // Or of the two saturating differences gives the unsigned absolute difference.
                Vector128<byte> absP1P0 = Sse2.Or(Sse2.SubtractSaturate(p1, p0), Sse2.SubtractSaturate(p0, p1));
                Vector128<byte> absQ1Q0 = Sse2.Or(Sse2.SubtractSaturate(q1, q0), Sse2.SubtractSaturate(q0, q1));
                Vector128<byte> fe = Vector128.Create((byte)0xfe);
                Vector128<byte> ff = Sse2.CompareEqual(absP1P0, absP1P0);
                Vector128<byte> absP0Q0 = Sse2.Or(Sse2.SubtractSaturate(p0, q0), Sse2.SubtractSaturate(q0, p0));
                Vector128<byte> absP1Q1 = Sse2.Or(Sse2.SubtractSaturate(p1, q1), Sse2.SubtractSaturate(q1, p1));
                Vector128<byte> work;

                // "flat" only holds max(|p1 - p0|, |q1 - q0|) here; no flat mask is derived in this method.
                flat = Sse2.Max(absP1P0, absQ1Q0);
                hev = Sse2.SubtractSaturate(flat, thresh);
                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);

                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimit);
                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
                mask = Sse2.Max(flat, mask);
                // mask |= (abs(p1 - p0) > limit) * -1;
                // mask |= (abs(q1 - q0) > limit) * -1;
                work = Sse2.Max(
                    Sse2.Or(Sse2.SubtractSaturate(p2, p1), Sse2.SubtractSaturate(p1, p2)),
                    Sse2.Or(Sse2.SubtractSaturate(p3, p2), Sse2.SubtractSaturate(p2, p3)));
                mask = Sse2.Max(work, mask);
                work = Sse2.Max(
                    Sse2.Or(Sse2.SubtractSaturate(q2, q1), Sse2.SubtractSaturate(q1, q2)),
                    Sse2.Or(Sse2.SubtractSaturate(q3, q2), Sse2.SubtractSaturate(q2, q3)));
                mask = Sse2.Max(work, mask);
                mask = Sse2.SubtractSaturate(mask, limit);
                mask = Sse2.CompareEqual(mask, zero);
            }

            // filter4
            {
                Vector128<byte> t4 = Vector128.Create((byte)4);
                Vector128<byte> t3 = Vector128.Create((byte)3);
                Vector128<byte> t80 = Vector128.Create((byte)0x80);
                Vector128<byte> te0 = Vector128.Create((byte)0xe0);
                Vector128<byte> t1F = Vector128.Create((byte)0x1f);
                Vector128<byte> t1 = Vector128.Create((byte)0x1);
                Vector128<byte> t7F = Vector128.Create((byte)0x7f);

                // XOR with 0x80 maps the unsigned pixels to signed bytes for the saturating signed arithmetic below.
                Vector128<byte> ps1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (2 * pitch)), t80);
                Vector128<byte> ps0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (1 * pitch)), t80);
                Vector128<byte> qs0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (0 * pitch)), t80);
                Vector128<byte> qs1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (1 * pitch)), t80);
                Vector128<byte> filt;
                Vector128<byte> workA;
                Vector128<byte> filter1, filter2;

                filt = Sse2.And(Sse2.SubtractSaturate(ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
                workA = Sse2.SubtractSaturate(qs0.AsSByte(), ps0.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
                // (vpx_filter + 3 * (qs0 - ps0)) & mask
                filt = Sse2.And(filt, mask);

                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();

                // Filter1 >> 3
                // Per-byte arithmetic shift emulated with a 16-bit logical shift: mask off the bits that
                // leaked in from the neighbouring byte and OR the sign bits back in for negative bytes.
                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter1.AsSByte()).AsByte();
                filter1 = Sse2.ShiftRightLogical(filter1.AsInt16(), 3).AsByte();
                workA = Sse2.And(workA, te0);
                filter1 = Sse2.And(filter1, t1F);
                filter1 = Sse2.Or(filter1, workA);

                // Filter2 >> 3
                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter2.AsSByte()).AsByte();
                filter2 = Sse2.ShiftRightLogical(filter2.AsInt16(), 3).AsByte();
                workA = Sse2.And(workA, te0);
                filter2 = Sse2.And(filter2, t1F);
                filter2 = Sse2.Or(filter2, workA);

                // filt >> 1
                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filt.AsSByte()).AsByte();
                filt = Sse2.ShiftRightLogical(filt.AsInt16(), 1).AsByte();
                workA = Sse2.And(workA, t80);
                filt = Sse2.And(filt, t7F);
                filt = Sse2.Or(filt, workA);

                filt = Sse2.AndNot(hev, filt);

                q0 = Sse2.Xor(Sse2.SubtractSaturate(qs0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
                q1 = Sse2.Xor(Sse2.SubtractSaturate(qs1.AsSByte(), filt.AsSByte()).AsByte(), t80);
                p0 = Sse2.Xor(Sse2.AddSaturate(ps0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
                p1 = Sse2.Xor(Sse2.AddSaturate(ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);

                Sse2.Store(s.ToPointer() - (2 * pitch), p1);
                Sse2.Store(s.ToPointer() - (1 * pitch), p0);
                Sse2.Store(s.ToPointer() + (0 * pitch), q0);
                Sse2.Store(s.ToPointer() + (1 * pitch), q1);
            }
        }

        /// <summary>
        /// Transposes two 8x8 byte blocks into one block of 8 rows by 16 bytes.
        /// Output row <c>k</c> holds byte <c>k</c> of the 8 rows of <paramref name="in0"/> followed by
        /// byte <c>k</c> of the 8 rows of <paramref name="in1"/>.
        /// </summary>
        /// <param name="in0">First source block; 8 bytes are read from each of 8 rows.</param>
        /// <param name="in1">Second source block; 8 bytes are read from each of 8 rows.</param>
        /// <param name="inP">Offset between two consecutive source rows (shared by both source blocks).</param>
        /// <param name="output">Destination; 16 bytes are written to each of 8 rows.</param>
        /// <param name="outP">Offset between two consecutive destination rows.</param>
        private static unsafe void Transpose8x16(
            ArrayPtr<byte> in0,
            ArrayPtr<byte> in1,
            int inP,
            ArrayPtr<byte> output,
            int outP)
        {
            Vector128<byte> x0, x1, x2, x3, x4, x5, x6, x7;
            Vector128<byte> x8, x9, x10, x11, x12, x13, x14, x15;

            // 2-way interleave w/hoisting of unpacks
            x0 = Sse2.LoadScalarVector128((long*)in0.ToPointer()).AsByte(); // 1
            x1 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + inP)).AsByte(); // 3
            x0 = Sse2.UnpackLow(x0, x1); // 1

            x2 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + (2 * inP))).AsByte(); // 5
            x3 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + (3 * inP))).AsByte(); // 7
            x1 = Sse2.UnpackLow(x2, x3); // 2

            x4 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + (4 * inP))).AsByte(); // 9
            x5 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + (5 * inP))).AsByte(); // 11
            x2 = Sse2.UnpackLow(x4, x5); // 3

            x6 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + (6 * inP))).AsByte(); // 13
            x7 = Sse2.LoadScalarVector128((long*)(in0.ToPointer() + (7 * inP))).AsByte(); // 15
            x3 = Sse2.UnpackLow(x6, x7); // 4
            x4 = Sse2.UnpackLow(x0.AsInt16(), x1.AsInt16()).AsByte(); // 9

            x8 = Sse2.LoadScalarVector128((long*)in1.ToPointer()).AsByte(); // 2
            x9 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + inP)).AsByte(); // 4
            x8 = Sse2.UnpackLow(x8, x9); // 5
            x5 = Sse2.UnpackLow(x2.AsInt16(), x3.AsInt16()).AsByte(); // 10

            x10 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + (2 * inP))).AsByte(); // 6
            x11 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + (3 * inP))).AsByte(); // 8
            x9 = Sse2.UnpackLow(x10, x11); // 6

            x12 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + (4 * inP))).AsByte(); // 10
            x13 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + (5 * inP))).AsByte(); // 12
            x10 = Sse2.UnpackLow(x12, x13); // 7
            x12 = Sse2.UnpackLow(x8.AsInt16(), x9.AsInt16()).AsByte(); // 11

            x14 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + (6 * inP))).AsByte(); // 14
            x15 = Sse2.LoadScalarVector128((long*)(in1.ToPointer() + (7 * inP))).AsByte(); // 16
            x11 = Sse2.UnpackLow(x14, x15); // 8
            x13 = Sse2.UnpackLow(x10.AsInt16(), x11.AsInt16()).AsByte(); // 12

            x6 = Sse2.UnpackLow(x4.AsInt32(), x5.AsInt32()).AsByte(); // 13
            x7 = Sse2.UnpackHigh(x4.AsInt32(), x5.AsInt32()).AsByte(); // 14
            x14 = Sse2.UnpackLow(x12.AsInt32(), x13.AsInt32()).AsByte(); // 15
            x15 = Sse2.UnpackHigh(x12.AsInt32(), x13.AsInt32()).AsByte(); // 16

            // Store first 4-line result
            Sse2.Store(output.ToPointer(), Sse2.UnpackLow(x6.AsInt64(), x14.AsInt64()).AsByte());
            Sse2.Store(output.ToPointer() + outP, Sse2.UnpackHigh(x6.AsInt64(), x14.AsInt64()).AsByte());
            Sse2.Store(output.ToPointer() + (2 * outP), Sse2.UnpackLow(x7.AsInt64(), x15.AsInt64()).AsByte());
            Sse2.Store(output.ToPointer() + (3 * outP), Sse2.UnpackHigh(x7.AsInt64(), x15.AsInt64()).AsByte());

            // The high halves of the 8-bit interleaves carry source bytes 4-7 of every row.
            x4 = Sse2.UnpackHigh(x0.AsInt16(), x1.AsInt16()).AsByte();
            x5 = Sse2.UnpackHigh(x2.AsInt16(), x3.AsInt16()).AsByte();
            x12 = Sse2.UnpackHigh(x8.AsInt16(), x9.AsInt16()).AsByte();
            x13 = Sse2.UnpackHigh(x10.AsInt16(), x11.AsInt16()).AsByte();

            x6 = Sse2.UnpackLow(x4.AsInt32(), x5.AsInt32()).AsByte();
            x7 = Sse2.UnpackHigh(x4.AsInt32(), x5.AsInt32()).AsByte();
            x14 = Sse2.UnpackLow(x12.AsInt32(), x13.AsInt32()).AsByte();
            x15 = Sse2.UnpackHigh(x12.AsInt32(), x13.AsInt32()).AsByte();

            // Store second 4-line result
            Sse2.Store(output.ToPointer() + (4 * outP), Sse2.UnpackLow(x6.AsInt64(), x14.AsInt64()).AsByte());
            Sse2.Store(output.ToPointer() + (5 * outP), Sse2.UnpackHigh(x6.AsInt64(), x14.AsInt64()).AsByte());
            Sse2.Store(output.ToPointer() + (6 * outP), Sse2.UnpackLow(x7.AsInt64(), x15.AsInt64()).AsByte());
            Sse2.Store(output.ToPointer() + (7 * outP), Sse2.UnpackHigh(x7.AsInt64(), x15.AsInt64()).AsByte());
        }

        private static unsafe void Transpose(
            ReadOnlySpan<ArrayPtr<byte>> src,
            int inP,
            ReadOnlySpan<ArrayPtr<byte>> dst,
            int outP,
            int num8x8ToTranspose)
        {
            int idx8x8 = 0;
            Vector128<byte> x0, x1, x2, x3, x4, x5, x6, x7;

            do
            {
                ArrayPtr<byte> input = src[idx8x8];
                ArrayPtr<byte> output = dst[idx8x8];

                x0 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (0 * inP)))
                    .AsByte(); // 00 01 02 03 04 05 06 07
                x1 =
Sse2.LoadScalarVector128((long*)(input.ToPointer() + (1 * inP))) + .AsByte(); // 10 11 12 13 14 15 16 17 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = Sse2.UnpackLow(x0, x1); + + x2 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (2 * inP))) + .AsByte(); // 20 21 22 23 24 25 26 27 + x3 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (3 * inP))) + .AsByte(); // 30 31 32 33 34 35 36 37 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = Sse2.UnpackLow(x2, x3); + + x4 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (4 * inP))) + .AsByte(); // 40 41 42 43 44 45 46 47 + x5 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (5 * inP))) + .AsByte(); // 50 51 52 53 54 55 56 57 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = Sse2.UnpackLow(x4, x5); + + x6 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (6 * inP))) + .AsByte(); // 60 61 62 63 64 65 66 67 + x7 = Sse2.LoadScalarVector128((long*)(input.ToPointer() + (7 * inP))) + .AsByte(); // 70 71 72 73 74 75 76 77 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = Sse2.UnpackLow(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = Sse2.UnpackLow(x0.AsInt16(), x1.AsInt16()).AsByte(); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = Sse2.UnpackLow(x2.AsInt16(), x3.AsInt16()).AsByte(); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = Sse2.UnpackLow(x4.AsInt32(), x5.AsInt32()).AsByte(); + Sse2.StoreScalar((long*)(output.ToPointer() + (0 * outP)), x6.AsInt64()); // 00 10 20 30 40 50 60 70 + Sse2.StoreHigh((double*)(output.ToPointer() + (1 * outP)), x6.AsDouble()); // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = Sse2.UnpackHigh(x4.AsInt32(), x5.AsInt32()).AsByte(); + Sse2.StoreScalar((long*)(output.ToPointer() + (2 * outP)), x7.AsInt64()); // 02 12 22 32 42 52 62 72 + Sse2.StoreHigh((double*)(output.ToPointer() + (3 * outP)), x7.AsDouble()); // 03 13 23 33 43 53 63 73 + + // 04 
14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = Sse2.UnpackHigh(x0.AsInt16(), x1.AsInt16()).AsByte(); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = Sse2.UnpackHigh(x2.AsInt16(), x3.AsInt16()).AsByte(); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = Sse2.UnpackLow(x4.AsInt32(), x5.AsInt32()).AsByte(); + Sse2.StoreScalar((long*)(output.ToPointer() + (4 * outP)), x6.AsInt64()); // 04 14 24 34 44 54 64 74 + Sse2.StoreHigh((double*)(output.ToPointer() + (5 * outP)), x6.AsDouble()); // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = Sse2.UnpackHigh(x4.AsInt32(), x5.AsInt32()).AsByte(); + + Sse2.StoreScalar((long*)(output.ToPointer() + (6 * outP)), x7.AsInt64()); // 06 16 26 36 46 56 66 76 + Sse2.StoreHigh((double*)(output.ToPointer() + (7 * outP)), x7.AsDouble()); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num8x8ToTranspose); + } + + public static unsafe void LpfVertical4Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit0, + ReadOnlySpan limit0, + ReadOnlySpan thresh0, + ReadOnlySpan blimit1, + ReadOnlySpan limit1, + ReadOnlySpan thresh1) + { + ulong* tDstStorage = stackalloc ulong[16]; + ArrayPtr tDst = new((byte*)tDstStorage, 16 * 8); + Span> src = stackalloc ArrayPtr[2]; + Span> dst = stackalloc ArrayPtr[2]; + + // Transpose 8x16 + Transpose8x16(s.Slice(-4), s.Slice(-4 + (pitch * 8)), pitch, tDst, 16); + + // Loop filtering + LpfHorizontal4Dual(tDst.Slice(4 * 16), 16, blimit0, limit0, thresh0, blimit1, limit1, thresh1); + src[0] = tDst; + src[1] = tDst.Slice(8); + dst[0] = s.Slice(-4); + dst[1] = s.Slice(-4 + (pitch * 8)); + + // Transpose back + Transpose(src, 16, dst, pitch, 2); + } + + public static unsafe void LpfVertical8( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + ulong* tDstStorage = stackalloc ulong[8]; + ArrayPtr tDst = new((byte*)tDstStorage, 8 * 8); + Span> src = stackalloc ArrayPtr[1]; + Span> dst = stackalloc ArrayPtr[1]; 
+ + // Transpose 8x8 + src[0] = s.Slice(-4); + dst[0] = tDst; + + Transpose(src, pitch, dst, 8, 1); + + // Loop filtering + LpfHorizontal8(tDst.Slice(4 * 8), 8, blimit, limit, thresh); + + // Transpose back + Transpose(dst, 8, src, pitch, 1); + } + + public static unsafe void LpfVertical8Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit0, + ReadOnlySpan limit0, + ReadOnlySpan thresh0, + ReadOnlySpan blimit1, + ReadOnlySpan limit1, + ReadOnlySpan thresh1) + { + ulong* tDstStorage = stackalloc ulong[16]; + ArrayPtr tDst = new((byte*)tDstStorage, 16 * 8); + Span> src = stackalloc ArrayPtr[2]; + Span> dst = stackalloc ArrayPtr[2]; + + // Transpose 8x16 + Transpose8x16(s.Slice(-4), s.Slice(-4 + (pitch * 8)), pitch, tDst, 16); + + // Loop filtering + LpfHorizontal8Dual(tDst.Slice(4 * 16), 16, blimit0, limit0, thresh0, blimit1, limit1, thresh1); + + src[0] = tDst; + src[1] = tDst.Slice(8); + + dst[0] = s.Slice(-4); + dst[1] = s.Slice(-4 + (pitch * 8)); + + // Transpose back + Transpose(src, 16, dst, pitch, 2); + } + + public static unsafe void LpfVertical16( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + ulong* tDstStorage = stackalloc ulong[16]; + ArrayPtr tDst = new((byte*)tDstStorage, 16 * 8); + Span> src = stackalloc ArrayPtr[2]; + Span> dst = stackalloc ArrayPtr[2]; + + src[0] = s.Slice(-8); + src[1] = s; + dst[0] = tDst; + dst[1] = tDst.Slice(8 * 8); + + // Transpose 16x8 + Transpose(src, pitch, dst, 8, 2); + + // Loop filtering + LpfHorizontal16(tDst.Slice(8 * 8), 8, blimit, limit, thresh); + + // Transpose back + Transpose(dst, 8, src, pitch, 2); + } + + public static unsafe void LpfVertical16Dual( + ArrayPtr s, + int pitch, + ReadOnlySpan blimit, + ReadOnlySpan limit, + ReadOnlySpan thresh) + { + Vector128* tDstStorage = stackalloc Vector128[16]; + ArrayPtr tDst = new((byte*)tDstStorage, 256); + + // Transpose 16x16 + Transpose8x16(s.Slice(-8), s.Slice(-8 + (8 * pitch)), pitch, tDst, 16); + 
Transpose8x16(s, s.Slice(8 * pitch), pitch, tDst.Slice(8 * 16), 16); + + // Loop filtering + LpfHorizontal16Dual(tDst.Slice(8 * 16), 16, blimit, limit, thresh); + + // Transpose back + Transpose8x16(tDst, tDst.Slice(8 * 16), 16, s.Slice(-8), pitch); + Transpose8x16(tDst.Slice(8), tDst.Slice(8 + (8 * 16)), 16, s.Slice(-8 + (8 * pitch)), pitch); + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs index 0d5e8b6e3..4d458303c 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs @@ -12,7 +12,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { Debug.Assert(den != 0); { - int p = (int)(((ulong)num * 256 + (den >> 1)) / den); + int p = (int)((((ulong)num * 256) + (den >> 1)) / den); // (p > 255) ? 255 : (p < 1) ? 1 : p; int clippedProb = p | ((255 - p) >> 23) | (p == 0 ? 1 : 0); return (byte)clippedProb; @@ -22,14 +22,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp /* This function assumes prob1 and prob2 are already within [1,255] range. 
*/ public static byte WeightedProb(int prob1, int prob2, int factor) { - return (byte)BitUtils.RoundPowerOfTwo(prob1 * (256 - factor) + prob2 * factor, 8); + return (byte)BitUtils.RoundPowerOfTwo((prob1 * (256 - factor)) + (prob2 * factor), 8); } // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; - private static readonly uint[] CountToUpdateFactor = new uint[] + private static readonly uint[] CountToUpdateFactor = { - 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, - 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 }; private const int ModeMvCountSat = 20; @@ -41,13 +40,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { return preProb; } - else - { - uint count = Math.Min(den, ModeMvCountSat); - uint factor = CountToUpdateFactor[(int)count]; - byte prob = GetProb(ct0, den); - return WeightedProb(preProb, prob, (int)factor); - } + + uint count = Math.Min(den, ModeMvCountSat); + uint factor = CountToUpdateFactor[(int)count]; + byte prob = GetProb(ct0, den); + return WeightedProb(preProb, prob, (int)factor); } private static uint TreeMergeProbsImpl( @@ -58,16 +55,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Span probs) { int l = tree[i]; - uint leftCount = (l <= 0) ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs); + uint leftCount = l <= 0 ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs); int r = tree[i + 1]; - uint rightCount = (r <= 0) ? counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs); + uint rightCount = r <= 0 ? 
counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs); probs[(int)(i >> 1)] = ModeMvMergeProbs(preProbs[(int)(i >> 1)], leftCount, rightCount); return leftCount + rightCount; } - public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan preProbs, ReadOnlySpan counts, Span probs) + public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan preProbs, ReadOnlySpan counts, + Span probs) { TreeMergeProbsImpl(0, tree, preProbs, counts, probs); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs index 050951216..7e5608c46 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs @@ -1,4 +1,5 @@ using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Types; using System; using System.Buffers.Binary; @@ -6,19 +7,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { internal struct Reader { - private static readonly byte[] Norm = new byte[] + private static readonly byte[] Norm = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + private const int BdValueSize = sizeof(ulong) * 8; // This is meant to be a large, positive constant that can still be efficiently @@ -37,15 +37,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp { return true; } - else - { - _buffer = new ArrayPtr(ref buffer[0], size); - Value = 0; - Count = -8; - Range = 255; - Fill(); - return ReadBit() != 0; // Marker bit - } + + _buffer = new ArrayPtr(ref buffer[0], size); + Value = 0; + Count = -8; + Range = 255; + Fill(); + return ReadBit() != 0; // Marker bit } private void Fill() @@ -124,7 +122,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp ulong bigsplit; int count; uint range; - uint split = (Range * (uint)prob + (256 - (uint)prob)) >> 8; + uint split = ((Range * (uint)prob) + (256 - (uint)prob)) >> 8; if (Count < 0) { @@ -160,7 +158,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp public int ReadBit() { - return Read(128); // vpx_prob_half + return Read(128); // vpx_prob_half } public int ReadLiteral(int bits) @@ -181,7 +179,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp while ((i = tree[i + Read(probs[i >> 1])]) > 0) { - continue; } return -i; @@ -189,7 +186,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp public int ReadBool(int prob, ref ulong value, ref int count, ref uint range) { - uint split = (range * (uint)prob + (256 - (uint)prob)) >> 8; + uint split = ((range * (uint)prob) + (256 - (uint)prob)) >> 8; 
ulong bigsplit = (ulong)split << (BdValueSize - 8); if (count < 0) @@ -213,6 +210,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp } return 1; } + range = split; { int shift = Norm[range]; @@ -231,7 +229,82 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp Count -= 8; _buffer = _buffer.Slice(-1); } + return _buffer; } + + private int DecodeUniform() + { + const int l = 8; + const int m = (1 << l) - 191; + int v = ReadLiteral(l - 1); + return v < m ? v : (v << 1) - m + ReadBit(); + } + + public int DecodeTermSubexp() + { + if (ReadBit() == 0) + { + return ReadLiteral(4); + } + + if (ReadBit() == 0) + { + return ReadLiteral(4) + 16; + } + + if (ReadBit() == 0) + { + return ReadLiteral(5) + 32; + } + + return DecodeUniform() + 64; + } + + public TxMode ReadTxMode() + { + TxMode txMode = (TxMode)ReadLiteral(2); + if (txMode == TxMode.Allow32x32) + { + txMode += ReadBit(); + } + + return txMode; + } + + public int ReadCoeff( + ReadOnlySpan probs, + int n, + ref ulong value, + ref int count, + ref uint range) + { + int val = 0; + for (int i = 0; i < n; ++i) + { + val = (val << 1) | ReadBool(probs[i], ref value, ref count, ref range); + } + + return val; + } + + public void DiffUpdateProb(ref byte p) + { + if (Read(Entropy.DiffUpdateProb) != 0) + { + p = (byte)DSubExp.InvRemapProb(DecodeTermSubexp(), p); + } + } + + public void UpdateMvProbs(Span p, int n) + { + for (int i = 0; i < n; ++i) + { + if (Read(EntropyMv.UpdateProb) != 0) + { + p[i] = (byte)((ReadLiteral(7) << 1) | 1); + } + } + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs index e041f2e0b..eda2dd69c 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs @@ -13,42 +13,42 @@ // for (int i = 1; i < 32; ++i) // Console.WriteLine("public const short CosPi{0}_64 = {1};", i, MathF.Round(16384 * MathF.Cos(i * MathF.PI / 64))); // Note: sin(k * Pi / 64) = cos((32 - 
k) * Pi / 64) - public const short CosPi1_64 = 16364; - public const short CosPi2_64 = 16305; - public const short CosPi3_64 = 16207; - public const short CosPi4_64 = 16069; - public const short CosPi5_64 = 15893; - public const short CosPi6_64 = 15679; - public const short CosPi7_64 = 15426; - public const short CosPi8_64 = 15137; - public const short CosPi9_64 = 14811; - public const short CosPi10_64 = 14449; - public const short CosPi11_64 = 14053; - public const short CosPi12_64 = 13623; - public const short CosPi13_64 = 13160; - public const short CosPi14_64 = 12665; - public const short CosPi15_64 = 12140; - public const short CosPi16_64 = 11585; - public const short CosPi17_64 = 11003; - public const short CosPi18_64 = 10394; - public const short CosPi19_64 = 9760; - public const short CosPi20_64 = 9102; - public const short CosPi21_64 = 8423; - public const short CosPi22_64 = 7723; - public const short CosPi23_64 = 7005; - public const short CosPi24_64 = 6270; - public const short CosPi25_64 = 5520; - public const short CosPi26_64 = 4756; - public const short CosPi27_64 = 3981; - public const short CosPi28_64 = 3196; - public const short CosPi29_64 = 2404; - public const short CosPi30_64 = 1606; - public const short CosPi31_64 = 804; + public const short CosPi164 = 16364; + public const short CosPi264 = 16305; + public const short CosPi364 = 16207; + public const short CosPi464 = 16069; + public const short CosPi564 = 15893; + public const short CosPi664 = 15679; + public const short CosPi764 = 15426; + public const short CosPi864 = 15137; + public const short CosPi964 = 14811; + public const short CosPi1064 = 14449; + public const short CosPi1164 = 14053; + public const short CosPi1264 = 13623; + public const short CosPi1364 = 13160; + public const short CosPi1464 = 12665; + public const short CosPi1564 = 12140; + public const short CosPi1664 = 11585; + public const short CosPi1764 = 11003; + public const short CosPi1864 = 10394; + public const short 
CosPi1964 = 9760; + public const short CosPi2064 = 9102; + public const short CosPi2164 = 8423; + public const short CosPi2264 = 7723; + public const short CosPi2364 = 7005; + public const short CosPi2464 = 6270; + public const short CosPi2564 = 5520; + public const short CosPi2664 = 4756; + public const short CosPi2764 = 3981; + public const short CosPi2864 = 3196; + public const short CosPi2964 = 2404; + public const short CosPi3064 = 1606; + public const short CosPi3164 = 804; // 16384 * sqrt(2) * sin(kPi / 9) * 2 / 3 - public const short SinPi1_9 = 5283; - public const short SinPi2_9 = 9929; - public const short SinPi3_9 = 13377; - public const short SinPi4_9 = 15212; + public const short SinPi19 = 5283; + public const short SinPi29 = 9929; + public const short SinPi39 = 13377; + public const short SinPi49 = 15212; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs new file mode 100644 index 000000000..1e7e74fad --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs @@ -0,0 +1,623 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class Entropy + { + public const int DiffUpdateProb = 252; + + // Coefficient token alphabet + public const int ZeroToken = 0; // 0 Extra Bits 0+0 + public const int OneToken = 1; // 1 Extra Bits 0+1 + public const int TwoToken = 2; // 2 Extra Bits 0+1 + public const int ThreeToken = 3; // 3 Extra Bits 0+1 + public const int FourToken = 4; // 4 Extra Bits 0+1 + public const int Category1Token = 5; // 5-6 Extra Bits 1+1 + public const int Category2Token = 6; // 7-10 Extra Bits 2+1 + public const int Category3Token = 7; // 11-18 Extra Bits 3+1 + public const int Category4Token = 8; // 19-34 Extra Bits 4+1 + public const int Category5Token = 9; // 35-66 Extra Bits 5+1 + public const int Category6Token = 
10; // 67+ Extra Bits 14+1 + public const int EobToken = 11; // EOB Extra Bits 0+0 + + public const int EntropyTokens = 12; + + public const int RefTypes = 2; // intra=0, inter=1 + + /* Middle dimension reflects the coefficient position within the transform. */ + public const int CoefBands = 6; + + /* Inside dimension is a measure of nearby complexity that reflects the energy + of nearby nonzero coefficients. For the first coefficient (DC, unless + block type is 0), we look at the (already encoded) blocks above and to the + left of the current block. The context index is then the number (0, 1, or 2) + of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is determined by the size of the + most recently decoded coefficient. + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). */ + + public const int CoeffContexts = 6; + + public static int BAND_COEFF_CONTEXTS(int band) + { + return band == 0 ? 
3 : CoeffContexts; + } + + public const int UnconstrainedNodes = 3; + + public const int PivotNode = 2; + + public const int Cat1MinVal = 5; + public const int Cat2MinVal = 7; + public const int Cat3MinVal = 11; + public const int Cat4MinVal = 19; + public const int Cat5MinVal = 35; + public const int Cat6MinVal = 67; + + public static readonly byte[] Cat1Prob = { 159 }; + public static readonly byte[] Cat2Prob = { 165, 145 }; + public static readonly byte[] Cat3Prob = { 173, 148, 140 }; + public static readonly byte[] Cat4Prob = { 176, 155, 140, 135 }; + public static readonly byte[] Cat5Prob = { 180, 157, 141, 134, 130 }; + + public static readonly byte[] Cat6Prob = + { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 + }; + + public static readonly byte[] Cat6ProbHigh12 = + { + 255, 255, 255, 255, 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 /* the last 14 entries are the same values as Cat6Prob */ + }; + + public const int EobModelToken = 3; + + private static readonly byte[] CoefbandTrans8x8Plus = + { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, + // beyond MAXBAND_INDEX+1 all values are filled as 5 + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 + }; + + private static readonly byte[] CoefbandTrans4x4 = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5 }; + + public static readonly byte[][] Pareto8Full = + { + new byte[] { 3, 86, 128, 6, 86, 23, 88, 29 }, new byte[] { 6, 86, 128, 11, 87, 42, 91, 52 }, + new byte[] { 9, 86, 129, 17, 88, 61, 94, 76 }, new byte[] { 12, 86, 129, 22, 88, 77, 97, 93 }, + new byte[] { 15, 87, 129, 28, 89, 93, 100, 110 }, new byte[] { 17, 87, 129, 33, 90, 105, 103, 123 }, + new byte[] { 20, 88, 130, 38, 91, 118, 106, 136 }, new byte[] { 23, 88, 130, 43, 91, 128, 108, 146 }, + new byte[] { 26, 89, 131, 48, 92, 139, 111, 156 }, new byte[] { 28, 89, 131, 53, 93, 147, 114, 163 }, + new byte[] { 31, 90, 131, 58, 94, 156, 117, 171 }, new byte[] { 34, 90, 131, 62, 94, 163, 119, 177 }, + new byte[] { 37, 90, 132, 66, 95, 171, 122, 184 }, new byte[] { 39, 90, 132, 70, 96, 177, 124, 189 }, + new byte[] { 42, 91, 132, 75, 97, 183, 127, 194 }, new byte[] { 44, 91, 132, 79, 97, 188, 129, 198 }, + new byte[] { 47, 92, 133, 83, 98, 193, 132, 202 }, new byte[] { 49, 92, 133, 86, 99, 197, 134, 205 }, + new byte[] { 52, 93, 133, 90, 100, 201, 137, 208 }, new byte[] { 54, 93, 133, 94, 100, 204, 139, 211 }, + new byte[] { 57, 94, 134, 98, 101, 208, 142, 214 }, new byte[] { 59, 94, 134, 101, 102, 211, 144, 216 }, + new byte[] { 62, 94, 135, 105, 103, 214, 146, 218 }, + new byte[] { 64, 94, 135, 108, 103, 216, 148, 220 }, + new byte[] { 66, 95, 135, 111, 104, 219, 151, 222 }, + new byte[] { 68, 95, 135, 114, 105, 221, 153, 223 }, + new byte[] { 71, 96, 136, 117, 106, 224, 155, 225 }, + new byte[] { 73, 96, 136, 120, 106, 225, 157, 226 }, + new byte[] { 76, 97, 136, 123, 107, 227, 159, 228 }, + new byte[] { 78, 97, 136, 126, 108, 229, 160, 229 }, + new byte[] { 80, 98, 137, 129, 109, 231, 162, 231 }, + new byte[] { 
82, 98, 137, 131, 109, 232, 164, 232 }, + new byte[] { 84, 98, 138, 134, 110, 234, 166, 233 }, + new byte[] { 86, 98, 138, 137, 111, 235, 168, 234 }, + new byte[] { 89, 99, 138, 140, 112, 236, 170, 235 }, + new byte[] { 91, 99, 138, 142, 112, 237, 171, 235 }, + new byte[] { 93, 100, 139, 145, 113, 238, 173, 236 }, + new byte[] { 95, 100, 139, 147, 114, 239, 174, 237 }, + new byte[] { 97, 101, 140, 149, 115, 240, 176, 238 }, + new byte[] { 99, 101, 140, 151, 115, 241, 177, 238 }, + new byte[] { 101, 102, 140, 154, 116, 242, 179, 239 }, + new byte[] { 103, 102, 140, 156, 117, 242, 180, 239 }, + new byte[] { 105, 103, 141, 158, 118, 243, 182, 240 }, + new byte[] { 107, 103, 141, 160, 118, 243, 183, 240 }, + new byte[] { 109, 104, 141, 162, 119, 244, 185, 241 }, + new byte[] { 111, 104, 141, 164, 119, 244, 186, 241 }, + new byte[] { 113, 104, 142, 166, 120, 245, 187, 242 }, + new byte[] { 114, 104, 142, 168, 121, 245, 188, 242 }, + new byte[] { 116, 105, 143, 170, 122, 246, 190, 243 }, + new byte[] { 118, 105, 143, 171, 122, 246, 191, 243 }, + new byte[] { 120, 106, 143, 173, 123, 247, 192, 244 }, + new byte[] { 121, 106, 143, 175, 124, 247, 193, 244 }, + new byte[] { 123, 107, 144, 177, 125, 248, 195, 244 }, + new byte[] { 125, 107, 144, 178, 125, 248, 196, 244 }, + new byte[] { 127, 108, 145, 180, 126, 249, 197, 245 }, + new byte[] { 128, 108, 145, 181, 127, 249, 198, 245 }, + new byte[] { 130, 109, 145, 183, 128, 249, 199, 245 }, + new byte[] { 132, 109, 145, 184, 128, 249, 200, 245 }, + new byte[] { 134, 110, 146, 186, 129, 250, 201, 246 }, + new byte[] { 135, 110, 146, 187, 130, 250, 202, 246 }, + new byte[] { 137, 111, 147, 189, 131, 251, 203, 246 }, + new byte[] { 138, 111, 147, 190, 131, 251, 204, 246 }, + new byte[] { 140, 112, 147, 192, 132, 251, 205, 247 }, + new byte[] { 141, 112, 147, 193, 132, 251, 206, 247 }, + new byte[] { 143, 113, 148, 194, 133, 251, 207, 247 }, + new byte[] { 144, 113, 148, 195, 134, 251, 207, 247 }, + new byte[] { 146, 114, 149, 
197, 135, 252, 208, 248 }, + new byte[] { 147, 114, 149, 198, 135, 252, 209, 248 }, + new byte[] { 149, 115, 149, 199, 136, 252, 210, 248 }, + new byte[] { 150, 115, 149, 200, 137, 252, 210, 248 }, + new byte[] { 152, 115, 150, 201, 138, 252, 211, 248 }, + new byte[] { 153, 115, 150, 202, 138, 252, 212, 248 }, + new byte[] { 155, 116, 151, 204, 139, 253, 213, 249 }, + new byte[] { 156, 116, 151, 205, 139, 253, 213, 249 }, + new byte[] { 158, 117, 151, 206, 140, 253, 214, 249 }, + new byte[] { 159, 117, 151, 207, 141, 253, 215, 249 }, + new byte[] { 161, 118, 152, 208, 142, 253, 216, 249 }, + new byte[] { 162, 118, 152, 209, 142, 253, 216, 249 }, + new byte[] { 163, 119, 153, 210, 143, 253, 217, 249 }, + new byte[] { 164, 119, 153, 211, 143, 253, 217, 249 }, + new byte[] { 166, 120, 153, 212, 144, 254, 218, 250 }, + new byte[] { 167, 120, 153, 212, 145, 254, 219, 250 }, + new byte[] { 168, 121, 154, 213, 146, 254, 220, 250 }, + new byte[] { 169, 121, 154, 214, 146, 254, 220, 250 }, + new byte[] { 171, 122, 155, 215, 147, 254, 221, 250 }, + new byte[] { 172, 122, 155, 216, 147, 254, 221, 250 }, + new byte[] { 173, 123, 155, 217, 148, 254, 222, 250 }, + new byte[] { 174, 123, 155, 217, 149, 254, 222, 250 }, + new byte[] { 176, 124, 156, 218, 150, 254, 223, 250 }, + new byte[] { 177, 124, 156, 219, 150, 254, 223, 250 }, + new byte[] { 178, 125, 157, 220, 151, 254, 224, 251 }, + new byte[] { 179, 125, 157, 220, 151, 254, 224, 251 }, + new byte[] { 180, 126, 157, 221, 152, 254, 225, 251 }, + new byte[] { 181, 126, 157, 221, 152, 254, 225, 251 }, + new byte[] { 183, 127, 158, 222, 153, 254, 226, 251 }, + new byte[] { 184, 127, 158, 223, 154, 254, 226, 251 }, + new byte[] { 185, 128, 159, 224, 155, 255, 227, 251 }, + new byte[] { 186, 128, 159, 224, 155, 255, 227, 251 }, + new byte[] { 187, 129, 160, 225, 156, 255, 228, 251 }, + new byte[] { 188, 130, 160, 225, 156, 255, 228, 251 }, + new byte[] { 189, 131, 160, 226, 157, 255, 228, 251 }, + new byte[] { 190, 131, 160, 226, 
158, 255, 228, 251 }, + new byte[] { 191, 132, 161, 227, 159, 255, 229, 251 }, + new byte[] { 192, 132, 161, 227, 159, 255, 229, 251 }, + new byte[] { 193, 133, 162, 228, 160, 255, 230, 252 }, + new byte[] { 194, 133, 162, 229, 160, 255, 230, 252 }, + new byte[] { 195, 134, 163, 230, 161, 255, 231, 252 }, + new byte[] { 196, 134, 163, 230, 161, 255, 231, 252 }, + new byte[] { 197, 135, 163, 231, 162, 255, 231, 252 }, + new byte[] { 198, 135, 163, 231, 162, 255, 231, 252 }, + new byte[] { 199, 136, 164, 232, 163, 255, 232, 252 }, + new byte[] { 200, 136, 164, 232, 164, 255, 232, 252 }, + new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 }, + new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 }, + new byte[] { 202, 138, 166, 233, 166, 255, 233, 252 }, + new byte[] { 203, 138, 166, 233, 166, 255, 233, 252 }, + new byte[] { 204, 139, 166, 234, 167, 255, 234, 252 }, + new byte[] { 205, 139, 166, 234, 167, 255, 234, 252 }, + new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 }, + new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 }, + new byte[] { 207, 141, 168, 236, 169, 255, 235, 252 }, + new byte[] { 208, 141, 168, 236, 170, 255, 235, 252 }, + new byte[] { 209, 142, 169, 237, 171, 255, 236, 252 }, + new byte[] { 209, 143, 169, 237, 171, 255, 236, 252 }, + new byte[] { 210, 144, 169, 237, 172, 255, 236, 252 }, + new byte[] { 211, 144, 169, 237, 172, 255, 236, 252 }, + new byte[] { 212, 145, 170, 238, 173, 255, 237, 252 }, + new byte[] { 213, 145, 170, 238, 173, 255, 237, 252 }, + new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 }, + new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 }, + new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 }, + new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 }, + new byte[] { 216, 148, 173, 240, 176, 255, 238, 253 }, + new byte[] { 217, 148, 173, 240, 176, 255, 238, 253 }, + new byte[] { 218, 149, 173, 241, 177, 255, 239, 253 }, + new byte[] { 218, 149, 173, 241, 178, 255, 239, 253 }, + new byte[] { 219, 150, 174, 241, 179, 
255, 239, 253 }, + new byte[] { 219, 151, 174, 241, 179, 255, 239, 253 }, + new byte[] { 220, 152, 175, 242, 180, 255, 240, 253 }, + new byte[] { 221, 152, 175, 242, 180, 255, 240, 253 }, + new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 }, + new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 }, + new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 }, + new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 }, + new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 }, + new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 }, + new byte[] { 225, 156, 178, 244, 184, 255, 241, 253 }, + new byte[] { 225, 157, 178, 244, 184, 255, 241, 253 }, + new byte[] { 226, 158, 179, 244, 185, 255, 242, 253 }, + new byte[] { 227, 158, 179, 244, 185, 255, 242, 253 }, + new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 }, + new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 }, + new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 }, + new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 }, + new byte[] { 230, 161, 182, 246, 188, 255, 243, 253 }, + new byte[] { 230, 162, 182, 246, 188, 255, 243, 253 }, + new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 }, + new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 }, + new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 }, + new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 }, + new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 }, + new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 }, + new byte[] { 234, 166, 185, 247, 192, 255, 244, 253 }, + new byte[] { 234, 167, 185, 247, 192, 255, 244, 253 }, + new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 }, + new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 }, + new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 }, + new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 }, + new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 }, + new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 }, + new byte[] { 237, 171, 189, 249, 196, 255, 245, 254 }, + new byte[] { 237, 172, 189, 249, 196, 255, 
245, 254 }, + new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 }, + new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 }, + new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 }, + new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 }, + new byte[] { 240, 175, 192, 249, 199, 255, 246, 254 }, + new byte[] { 240, 176, 192, 249, 199, 255, 246, 254 }, + new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 }, + new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 }, + new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 }, + new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 }, + new byte[] { 242, 179, 195, 250, 202, 255, 246, 254 }, + new byte[] { 242, 180, 195, 250, 202, 255, 246, 254 }, + new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 }, + new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 }, + new byte[] { 243, 182, 197, 251, 204, 255, 247, 254 }, + new byte[] { 243, 183, 197, 251, 204, 255, 247, 254 }, + new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 }, + new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 }, + new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 }, + new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 }, + new byte[] { 245, 186, 200, 251, 207, 255, 247, 254 }, + new byte[] { 245, 187, 200, 251, 207, 255, 247, 254 }, + new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 }, + new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 }, + new byte[] { 246, 189, 202, 252, 208, 255, 248, 254 }, + new byte[] { 246, 190, 202, 252, 208, 255, 248, 254 }, + new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 }, + new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 }, + new byte[] { 247, 192, 204, 252, 210, 255, 248, 254 }, + new byte[] { 247, 193, 204, 252, 210, 255, 248, 254 }, + new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 }, + new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 }, + new byte[] { 248, 195, 206, 252, 212, 255, 249, 254 }, + new byte[] { 248, 196, 206, 252, 212, 255, 249, 254 }, + new byte[] { 249, 197, 207, 253, 213, 255, 249, 
254 }, + new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 }, + new byte[] { 249, 198, 208, 253, 214, 255, 249, 254 }, + new byte[] { 249, 199, 209, 253, 214, 255, 249, 254 }, + new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 }, + new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 }, + new byte[] { 250, 201, 211, 253, 215, 255, 249, 254 }, + new byte[] { 250, 202, 211, 253, 215, 255, 249, 254 }, + new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 }, + new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 }, + new byte[] { 251, 204, 213, 253, 217, 255, 250, 254 }, + new byte[] { 251, 205, 213, 253, 217, 255, 250, 254 }, + new byte[] { 251, 206, 214, 254, 218, 255, 250, 254 }, + new byte[] { 251, 206, 215, 254, 218, 255, 250, 254 }, + new byte[] { 252, 207, 216, 254, 219, 255, 250, 254 }, + new byte[] { 252, 208, 216, 254, 219, 255, 250, 254 }, + new byte[] { 252, 209, 217, 254, 220, 255, 250, 254 }, + new byte[] { 252, 210, 217, 254, 220, 255, 250, 254 }, + new byte[] { 252, 211, 218, 254, 221, 255, 250, 254 }, + new byte[] { 252, 212, 218, 254, 221, 255, 250, 254 }, + new byte[] { 253, 213, 219, 254, 222, 255, 250, 254 }, + new byte[] { 253, 213, 220, 254, 222, 255, 250, 254 }, + new byte[] { 253, 214, 221, 254, 223, 255, 250, 254 }, + new byte[] { 253, 215, 221, 254, 223, 255, 250, 254 }, + new byte[] { 253, 216, 222, 254, 224, 255, 251, 254 }, + new byte[] { 253, 217, 223, 254, 224, 255, 251, 254 }, + new byte[] { 253, 218, 224, 254, 225, 255, 251, 254 }, + new byte[] { 253, 219, 224, 254, 225, 255, 251, 254 }, + new byte[] { 254, 220, 225, 254, 225, 255, 251, 254 }, + new byte[] { 254, 221, 226, 254, 225, 255, 251, 254 }, + new byte[] { 254, 222, 227, 255, 226, 255, 251, 254 }, + new byte[] { 254, 223, 227, 255, 226, 255, 251, 254 }, + new byte[] { 254, 224, 228, 255, 227, 255, 251, 254 }, + new byte[] { 254, 225, 229, 255, 227, 255, 251, 254 }, + new byte[] { 254, 226, 230, 255, 228, 255, 251, 254 }, + new byte[] { 254, 227, 230, 255, 229, 255, 251, 254 
}, + new byte[] { 255, 228, 231, 255, 230, 255, 251, 254 }, + new byte[] { 255, 229, 232, 255, 230, 255, 251, 254 }, + new byte[] { 255, 230, 233, 255, 231, 255, 252, 254 }, + new byte[] { 255, 231, 234, 255, 231, 255, 252, 254 }, + new byte[] { 255, 232, 235, 255, 232, 255, 252, 254 }, + new byte[] { 255, 233, 236, 255, 232, 255, 252, 254 }, + new byte[] { 255, 235, 237, 255, 233, 255, 252, 254 }, + new byte[] { 255, 236, 238, 255, 234, 255, 252, 254 }, + new byte[] { 255, 238, 240, 255, 235, 255, 252, 255 }, + new byte[] { 255, 239, 241, 255, 235, 255, 252, 254 }, + new byte[] { 255, 241, 243, 255, 236, 255, 252, 254 }, + new byte[] { 255, 243, 245, 255, 237, 255, 252, 254 }, + new byte[] { 255, 246, 247, 255, 239, 255, 253, 255 } + }; + + internal static readonly byte[] DefaultCoefProbs4x4 = + { + // Y plane + // Intra + // Band 0 + 195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32, + // Band 2 + 40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47, + // Band 3 + 69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31, + // Band 4 + 102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29, + // Band 5 + 156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31, + // Inter + // Band 0 + 191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33, + // Band 2 + 41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26, + // Band 3 + 59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34, + // Band 4 + 114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34, + // Band 5 + 149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42, + // UV plane + // Intra + // Band 0 + 214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 
0, 0, 0, + // Band 1 + 85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28, + // Band 2 + 89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33, + // Band 3 + 108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38, + // Band 4 + 124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37, + // Band 5 + 141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47, + // Inter + // Band 0 + 229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57, + // Band 2 + 83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39, + // Band 3 + 99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47, + // Band 4 + 127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52, + // Band 5 + 157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61 + }; + + internal static readonly byte[] DefaultCoefProbs8x8 = + { + // Y plane + // Intra + // Band 0 + 125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29, + // Band 2 + 31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45, + // Band 3 + 23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24, + // Band 4 + 29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23, + // Band 5 + 57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29, + // Inter + // Band 0 + 202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57, + // Band 2 + 33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21, + // Band 3 + 30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20, + // Band 4 + 32, 
186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22, + // Band 5 + 57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32, + // UV plane + // Intra + // Band 0 + 212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25, + // Band 2 + 71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42, + // Band 3 + 64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30, + // Band 4 + 65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30, + // Band 5 + 65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34, + // Inter + // Band 0 + 225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84, + // Band 2 + 75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28, + // Band 3 + 73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29, + // Band 4 + 62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32, + // Band 5 + 61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41 + }; + + internal static readonly byte[] DefaultCoefProbs16x16 = + { + // Y plane + // Intra + // Band 0 + 7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18, + // Band 2 + 43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27, + // Band 3 + 23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23, + // Band 4 + 18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23, + // Band 5 + 15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20, + // Inter + // Band 0 + 19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 41, 166, 207, 104, 153, 199, 31, 123, 
181, 14, 101, 152, 5, 72, 106, 1, 36, 52, + // Band 2 + 35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28, + // Band 3 + 28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20, + // Band 4 + 20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20, + // Band 5 + 13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24, + // UV plane + // Intra + // Band 0 + 211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28, + // Band 2 + 71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29, + // Band 3 + 57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27, + // Band 4 + 47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26, + // Band 5 + 26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28, + // Inter + // Band 0 + 233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97, + // Band 2 + 85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30, + // Band 3 + 65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29, + // Band 4 + 40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26, + // Band 5 + 16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31 + }; + + internal static readonly byte[] DefaultCoefProbs32x32 = + { + // Y plane + // Intra + // Band 0 + 17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16, + // Band 2 + 50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20, + // Band 3 + 40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17, + // Band 4 + 30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18, + // 
Band 5 + 10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17, + // Inter + // Band 0 + 36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61, + // Band 2 + 85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30, + // Band 3 + 72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27, + // Band 4 + 55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22, + // Band 5 + 15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23, + // UV plane + // Intra + // Band 0 + 181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23, + // Band 2 + 62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19, + // Band 3 + 55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16, + // Band 4 + 49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17, + // Band 5 + 13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23, + // Inter + // Band 0 + 197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // Band 1 + 126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137, + // Band 2 + 111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91, + // Band 3 + 104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102, + // Band 4 + 84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79, + // Band 5 + 43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6 + }; + + public static byte[] GetBandTranslate(int txSize) + { + return txSize == (int)TxSize.Tx4x4 ? 
CoefbandTrans4x4 : CoefbandTrans8x8Plus; + } + + public static void CopyProbs(ref T dest, ReadOnlySpan probs) where T : unmanaged + { + if (Unsafe.SizeOf() != probs.Length) + { + throw new Exception("size mismatch expected: " + probs.Length + " got: " + Unsafe.SizeOf()); + } + + probs.CopyTo(MemoryMarshal.Cast(MemoryMarshal.CreateSpan(ref dest, 1))); + } + + internal const int CoefCountSat = 24; + internal const int CoefMaxUpdateFactor = 112; + internal const int CoefCountSatKey = 24; + internal const int CoefMaxUpdateFactorKey = 112; + internal const int CoefCountSatAfterKey = 24; + internal const int CoefMaxUpdateFactorAfterKey = 128; + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs new file mode 100644 index 000000000..3964c5fb1 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs @@ -0,0 +1,400 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using Ryujinx.Graphics.Video; +using System; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal class EntropyMode + { + public const int BlockSizeGroups = 4; + + public const int TxSizeContexts = 2; + + public static readonly byte[][][] KfYModeProb = + { + new[] + { + // above = dc + new byte[] { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc + new byte[] { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v + new byte[] { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h + new byte[] { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45 + new byte[] { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135 + new byte[] { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117 + new byte[] { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153 + new byte[] { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207 + new byte[] { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63 + new byte[] { 59, 67, 44, 140, 161, 202, 
78, 67, 119 } // left = tm + }, + new[] + { + // above = v + new byte[] { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc + new byte[] { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v + new byte[] { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h + new byte[] { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45 + new byte[] { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135 + new byte[] { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117 + new byte[] { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153 + new byte[] { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207 + new byte[] { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63 + new byte[] { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm + }, + new[] + { + // above = h + new byte[] { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc + new byte[] { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v + new byte[] { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h + new byte[] { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45 + new byte[] { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135 + new byte[] { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117 + new byte[] { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153 + new byte[] { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207 + new byte[] { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63 + new byte[] { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm + }, + new[] + { + // above = d45 + new byte[] { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc + new byte[] { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v + new byte[] { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h + new byte[] { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45 + new byte[] { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135 + new byte[] { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117 + new byte[] { 53, 21, 23, 133, 109, 210, 56, 
77, 172 }, // left = d153 + new byte[] { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207 + new byte[] { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63 + new byte[] { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm + }, + new[] + { + // above = d135 + new byte[] { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc + new byte[] { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v + new byte[] { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h + new byte[] { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45 + new byte[] { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135 + new byte[] { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117 + new byte[] { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153 + new byte[] { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207 + new byte[] { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63 + new byte[] { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm + }, + new[] + { + // above = d117 + new byte[] { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc + new byte[] { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v + new byte[] { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h + new byte[] { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45 + new byte[] { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135 + new byte[] { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117 + new byte[] { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153 + new byte[] { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207 + new byte[] { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63 + new byte[] { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm + }, + new[] + { + // above = d153 + new byte[] { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc + new byte[] { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v + new byte[] { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h + new byte[] { 56, 29, 19, 117, 109, 181, 55, 68, 112 
}, // left = d45 + new byte[] { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135 + new byte[] { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117 + new byte[] { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153 + new byte[] { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207 + new byte[] { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63 + new byte[] { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm + }, + new[] + { + // above = d207 + new byte[] { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc + new byte[] { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v + new byte[] { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h + new byte[] { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45 + new byte[] { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135 + new byte[] { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117 + new byte[] { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153 + new byte[] { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207 + new byte[] { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63 + new byte[] { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm + }, + new[] + { + // above = d63 + new byte[] { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc + new byte[] { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v + new byte[] { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h + new byte[] { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45 + new byte[] { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135 + new byte[] { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117 + new byte[] { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153 + new byte[] { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207 + new byte[] { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63 + new byte[] { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm + }, + new[] + { + // above = tm + new byte[] { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, 
// left = dc + new byte[] { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v + new byte[] { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h + new byte[] { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45 + new byte[] { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135 + new byte[] { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117 + new byte[] { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153 + new byte[] { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207 + new byte[] { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63 + new byte[] { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm + } + }; + + public static readonly byte[][] KfUvModeProb = + { + new byte[] { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc + new byte[] { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v + new byte[] { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h + new byte[] { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45 + new byte[] { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y = d135 + new byte[] { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117 + new byte[] { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153 + new byte[] { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207 + new byte[] { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63 + new byte[] { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm + }; + + private static readonly byte[] DefaultIfYProbs = + { + 65, 32, 18, 144, 162, 194, 41, 51, 98, // block_size < 8x8 + 132, 68, 18, 165, 217, 196, 45, 40, 78, // block_size < 16x16 + 173, 80, 19, 176, 240, 193, 64, 35, 46, // block_size < 32x32 + 221, 135, 38, 194, 248, 121, 96, 85, 29 // block_size >= 32x32 + }; + + private static readonly byte[] DefaultIfUvProbs = + { + 120, 7, 76, 176, 208, 126, 28, 54, 103, // y = dc + 48, 12, 154, 155, 139, 90, 34, 117, 119, // y = v + 67, 6, 25, 204, 243, 158, 13, 21, 96, // y = h + 97, 5, 44, 131, 176, 139, 48, 68, 97, // y = d45 + 83, 5, 42, 156, 111, 
152, 26, 49, 152, // y = d135 + 80, 5, 58, 178, 74, 83, 33, 62, 145, // y = d117 + 86, 5, 32, 154, 192, 168, 14, 22, 163, // y = d153 + 85, 5, 32, 156, 216, 148, 19, 29, 73, // y = d207 + 77, 7, 64, 116, 132, 122, 37, 126, 120, // y = d63 + 101, 21, 107, 181, 192, 103, 19, 67, 125 // y = tm + }; + + private static readonly byte[] DefaultPartitionProbs = + { + // 8x8 . 4x4 + 199, 122, 141, // a/l both not split + 147, 63, 159, // a split, l not split + 148, 133, 118, // l split, a not split + 121, 104, 114, // a/l both split + // 16x16 . 8x8 + 174, 73, 87, // a/l both not split + 92, 41, 83, // a split, l not split + 82, 99, 50, // l split, a not split + 53, 39, 39, // a/l both split + // 32x32 . 16x16 + 177, 58, 59, // a/l both not split + 68, 26, 63, // a split, l not split + 52, 79, 25, // l split, a not split + 17, 14, 12, // a/l both split + // 64x64 . 32x32 + 222, 34, 30, // a/l both not split + 72, 16, 44, // a split, l not split + 58, 32, 12, // l split, a not split + 10, 7, 6 // a/l both split + }; + + private static readonly byte[] DefaultInterModeProbs = + { + 2, 173, 34, // 0 = both zero mv + 7, 145, 85, // 1 = one zero mv + one a predicted mv + 7, 166, 63, // 2 = two predicted mvs + 7, 94, 66, // 3 = one predicted/zero and one new mv + 8, 64, 46, // 4 = two new mvs + 17, 81, 31, // 5 = one intra neighbour + x + 25, 29, 30 // 6 = two intra neighbours + }; + + /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. 
*/ + public static readonly sbyte[] IntraModeTree = + { + -(int)PredictionMode.DcPred, 2, /* 0 = DC_NODE */ -(int)PredictionMode.TmPred, 4, /* 1 = TM_NODE */ + -(int)PredictionMode.VPred, 6, /* 2 = V_NODE */ 8, 12, /* 3 = COM_NODE */ -(int)PredictionMode.HPred, + 10, /* 4 = H_NODE */ -(int)PredictionMode.D135Pred, -(int)PredictionMode.D117Pred, /* 5 = D135_NODE */ + -(int)PredictionMode.D45Pred, 14, /* 6 = D45_NODE */ -(int)PredictionMode.D63Pred, + 16, /* 7 = D63_NODE */ -(int)PredictionMode.D153Pred, -(int)PredictionMode.D207Pred /* 8 = D153_NODE */ + }; + + public static readonly sbyte[] InterModeTree = + { + -((int)PredictionMode.ZeroMv - (int)PredictionMode.NearestMv), 2, + -((int)PredictionMode.NearestMv - (int)PredictionMode.NearestMv), 4, + -((int)PredictionMode.NearMv - (int)PredictionMode.NearestMv), + -((int)PredictionMode.NewMv - (int)PredictionMode.NearestMv) + }; + + public static readonly sbyte[] PartitionTree = + { + -(sbyte)PartitionType.PartitionNone, 2, -(sbyte)PartitionType.PartitionHorz, 4, + -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit + }; + + public static readonly sbyte[] SwitchableInterpTree = + { + -Constants.EightTap, 2, -Constants.EightTapSmooth, -Constants.EightTapSharp + }; + + private static readonly byte[] DefaultIntraInterP = { 9, 102, 187, 225 }; + private static readonly byte[] DefaultCompInterP = { 239, 183, 119, 96, 41 }; + private static readonly byte[] DefaultCompRefP = { 50, 126, 123, 221, 226 }; + private static readonly byte[] DefaultSingleRefP = { 33, 16, 77, 74, 142, 142, 172, 170, 238, 247 }; + private static readonly byte[] DefaultTxProbs = { 3, 136, 37, 5, 52, 13, 20, 152, 15, 101, 100, 66 }; + + static EntropyMode() + { + byte[][] KfPartitionProbs = + { + // 8x8 . 4x4 + new byte[] { 158, 97, 94 }, // a/l both not split + new byte[] { 93, 24, 99 }, // a split, l not split + new byte[] { 85, 119, 44 }, // l split, a not split + new byte[] { 62, 59, 67 }, // a/l both split + + // 16x16 . 
8x8 + new byte[] { 149, 53, 53 }, // a/l both not split + new byte[] { 94, 20, 48 }, // a split, l not split + new byte[] { 83, 53, 24 }, // l split, a not split + new byte[] { 52, 18, 18 }, // a/l both split + + // 32x32 . 16x16 + new byte[] { 150, 40, 39 }, // a/l both not split + new byte[] { 78, 12, 26 }, // a split, l not split + new byte[] { 67, 33, 11 }, // l split, a not split + new byte[] { 24, 7, 5 }, // a/l both split + + // 64x64 . 32x32 + new byte[] { 174, 35, 49 }, // a/l both not split + new byte[] { 68, 11, 27 }, // a split, l not split + new byte[] { 57, 15, 9 }, // l split, a not split + new byte[] { 12, 3, 3 } // a/l both split + }; + } + + private static readonly byte[] DefaultSkipProbs = { 192, 128, 64 }; + + private static readonly byte[] DefaultSwitchableInterpProb = { 235, 162, 36, 255, 34, 3, 149, 144 }; + + private static void InitModeProbs(ref Vp9EntropyProbs fc) + { + Entropy.CopyProbs(ref fc.UvModeProb, DefaultIfUvProbs); + Entropy.CopyProbs(ref fc.YModeProb, DefaultIfYProbs); + Entropy.CopyProbs(ref fc.SwitchableInterpProb, DefaultSwitchableInterpProb); + Entropy.CopyProbs(ref fc.PartitionProb, DefaultPartitionProbs); + Entropy.CopyProbs(ref fc.IntraInterProb, DefaultIntraInterP); + Entropy.CopyProbs(ref fc.CompInterProb, DefaultCompInterP); + Entropy.CopyProbs(ref fc.CompRefProb, DefaultCompRefP); + Entropy.CopyProbs(ref fc.SingleRefProb, DefaultSingleRefP); + Entropy.CopyProbs(ref fc.Tx32x32Prob, DefaultTxProbs.AsSpan().Slice(0, 6)); + Entropy.CopyProbs(ref fc.Tx16x16Prob, DefaultTxProbs.AsSpan().Slice(6, 4)); + Entropy.CopyProbs(ref fc.Tx8x8Prob, DefaultTxProbs.AsSpan().Slice(10, 2)); + Entropy.CopyProbs(ref fc.SkipProb, DefaultSkipProbs); + Entropy.CopyProbs(ref fc.InterModeProb, DefaultInterModeProbs); + } + + internal static void TxCountsToBranchCounts32x32(ReadOnlySpan txCount32x32P, + ref Array3> ct32x32P) + { + ct32x32P[0][0] = txCount32x32P[(int)TxSize.Tx4x4]; + ct32x32P[0][1] = txCount32x32P[(int)TxSize.Tx8x8] + 
txCount32x32P[(int)TxSize.Tx16x16] + + txCount32x32P[(int)TxSize.Tx32x32]; + ct32x32P[1][0] = txCount32x32P[(int)TxSize.Tx8x8]; + ct32x32P[1][1] = txCount32x32P[(int)TxSize.Tx16x16] + txCount32x32P[(int)TxSize.Tx32x32]; + ct32x32P[2][0] = txCount32x32P[(int)TxSize.Tx16x16]; + ct32x32P[2][1] = txCount32x32P[(int)TxSize.Tx32x32]; + } + + internal static void TxCountsToBranchCounts16x16(ReadOnlySpan txCount16x16P, + ref Array2> ct16x16P) + { + ct16x16P[0][0] = txCount16x16P[(int)TxSize.Tx4x4]; + ct16x16P[0][1] = txCount16x16P[(int)TxSize.Tx8x8] + txCount16x16P[(int)TxSize.Tx16x16]; + ct16x16P[1][0] = txCount16x16P[(int)TxSize.Tx8x8]; + ct16x16P[1][1] = txCount16x16P[(int)TxSize.Tx16x16]; + } + + internal static void TxCountsToBranchCounts8x8(ReadOnlySpan txCount8x8P, + ref Array1> ct8x8P) + { + ct8x8P[0][0] = txCount8x8P[(int)TxSize.Tx4x4]; + ct8x8P[0][1] = txCount8x8P[(int)TxSize.Tx8x8]; + } + + public static unsafe void SetupPastIndependence(ref Vp9Common cm) + { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). + ref Types.LoopFilter lf = ref cm.Lf; + + cm.Seg.ClearAllSegFeatures(); + cm.Seg.AbsDelta = Segmentation.SegmentDeltadata; + + if (!cm.LastFrameSegMap.IsNull) + { + MemoryUtil.Fill(cm.LastFrameSegMap.ToPointer(), (byte)0, cm.MiRows * cm.MiCols); + } + + if (!cm.CurrentFrameSegMap.IsNull) + { + MemoryUtil.Fill(cm.CurrentFrameSegMap.ToPointer(), (byte)0, cm.MiRows * cm.MiCols); + } + + // Reset the mode ref deltas for loop filter + lf.LastRefDeltas = new Array4(); + lf.LastModeDeltas = new Array2(); + lf.SetDefaultLfDeltas(); + + // To force update of the sharpness + lf.LastSharpnessLevel = -1; + + cm.DefaultCoefProbs(); + InitModeProbs(ref cm.Fc.Value); + cm.InitMvProbs(); + + if (cm.FrameType == FrameType.KeyFrame || cm.ErrorResilientMode != 0 || cm.ResetFrameContext == 3) + { + // Reset all frame contexts. 
+ for (int i = 0; i < Constants.FrameContexts; ++i) + { + cm.FrameContexts[i] = cm.Fc.Value; + } + } + else if (cm.ResetFrameContext == 2) + { + // Reset only the frame context specified in the frame header. + cm.FrameContexts[(int)cm.FrameContextIdx] = cm.Fc.Value; + } + + // prev_mip will only be allocated in encoder. + if (cm.FrameIsIntraOnly() && !cm.PrevMip.IsNull) + { + cm.PrevMi.Value = new ModeInfo(); + } + + cm.RefFrameSignBias = new Array4(); + + cm.FrameContextIdx = 0; + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs new file mode 100644 index 000000000..04568e557 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs @@ -0,0 +1,165 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using Ryujinx.Graphics.Video; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal static class EntropyMv + { + public const int UpdateProb = 252; + + /* Symbols for coding which components are zero jointly */ + public const int Joints = 4; + + + public static readonly sbyte[] JointTree = + { + -(sbyte)MvJointType.Zero, 2, -(sbyte)MvJointType.Hnzvz, 4, + -(sbyte)MvJointType.Hzvnz, -(sbyte)MvJointType.Hnzvnz + }; + + public static readonly sbyte[] ClassTree = + { + -(sbyte)MvClassType.Class0, 2, -(sbyte)MvClassType.Class1, 4, 6, 8, -(sbyte)MvClassType.Class2, + -(sbyte)MvClassType.Class3, 10, 12, -(sbyte)MvClassType.Class4, -(sbyte)MvClassType.Class5, + -(sbyte)MvClassType.Class6, 14, 16, 18, -(sbyte)MvClassType.Class7, -(sbyte)MvClassType.Class8, + -(sbyte)MvClassType.Class9, -(sbyte)MvClassType.Class10 + }; + + public static readonly sbyte[] Class0Tree = { -0, -1 }; + + public static readonly sbyte[] FpTree = { -0, 2, -1, 4, -2, -3 }; + + private static bool JointVertical(MvJointType type) + { + return type == MvJointType.Hzvnz || type == MvJointType.Hnzvnz; + } + + private static bool JointHorizontal(MvJointType type) + { 
+ return type == MvJointType.Hnzvz || type == MvJointType.Hnzvnz; + } + + private static readonly byte[] LogInBase2 = + { + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 + }; + + private static int ClassBase(MvClassType c) + { + return c != 0 ? Class0Size << ((int)c + 2) : 0; + } + + private static MvClassType GetClass(int z, Ptr offset) + { + MvClassType c = z >= Class0Size * 4096 + ? MvClassType.Class10 + : (MvClassType)LogInBase2[z >> 3]; + if (!offset.IsNull) + { + offset.Value = z - ClassBase(c); + } + + return c; + } + + private static void IncComponent(int v, ref Vp9BackwardUpdates compCounts, int compIndex, int incr, int usehp) + { + int s, z, c, o = 0, d, e, f; + Debug.Assert(v != 0); /* should not be zero */ + s = v < 0 ? 1 : 0; + compCounts.Sign[compIndex][s] += (uint)incr; + z = (s != 0 ? 
-v : v) - 1; /* magnitude - 1 */ + + c = (int)GetClass(z, new Ptr(ref o)); + compCounts.Classes[compIndex][c] += (uint)incr; + + d = o >> 3; /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = o & 1; /* high precision mv data */ + + if (c == (int)MvClassType.Class0) + { + compCounts.Class0[compIndex][d] += (uint)incr; + compCounts.Class0Fp[compIndex][d][f] += (uint)incr; + compCounts.Class0Hp[compIndex][e] += (uint)(usehp * incr); + } + else + { + int b = c + Class0Bits - 1; // number of bits + for (int i = 0; i < b; ++i) + { + compCounts.Bits[compIndex][i][(d >> i) & 1] += (uint)incr; + } + + compCounts.Fp[compIndex][f] += (uint)incr; + compCounts.Hp[compIndex][e] += (uint)(usehp * incr); + } + } + + public static void Inc(ref Mv mv, Ptr counts) + { + if (!counts.IsNull) + { + MvJointType j = mv.GetJoint(); + ++counts.Value.Joints[(int)j]; + + if (JointVertical(j)) + { + IncComponent(mv.Row, ref counts.Value, 0, 1, 1); + } + + if (JointHorizontal(j)) + { + IncComponent(mv.Col, ref counts.Value, 1, 1, 1); + } + } + } + + /* Symbols for coding magnitude class of nonzero components */ + public const int Classes = 11; + + public const int Class0Bits = 1; /* bits at integer precision for class 0 */ + public const int Class0Size = 1 << Class0Bits; + public const int OffsetBits = Classes + Class0Bits - 2; + public const int FpSize = 4; + + public const int MaxBits = Classes + Class0Bits + 2; + public const int Max = (1 << MaxBits) - 1; + public const int Vals = (Max << 1) + 1; + + public const int InUseBits = 14; + public const int Upp = (1 << InUseBits) - 1; + public const int Low = -(1 << InUseBits); + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/FrameBuffers.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/FrameBuffers.cs new file mode 100644 index 000000000..108265772 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/FrameBuffers.cs @@ -0,0 +1,79 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using 
Ryujinx.Graphics.Nvdec.Vp9.Types; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + internal struct InternalFrameBuffer + { + public ArrayPtr Data; + public bool InUse; + } + + internal struct InternalFrameBufferList + { + public ArrayPtr IntFb; + } + + internal static class FrameBuffers + { + public static int GetFrameBuffer(MemoryAllocator allocator, Ptr cbPriv, ulong minSize, + ref VpxCodecFrameBuffer fb) + { + int i; + Ptr intFbList = cbPriv; + if (intFbList.IsNull) + { + return -1; + } + + // Find a free frame buffer. + for (i = 0; i < intFbList.Value.IntFb.Length; ++i) + { + if (!intFbList.Value.IntFb[i].InUse) + { + break; + } + } + + if (i == intFbList.Value.IntFb.Length) + { + return -1; + } + + if ((ulong)intFbList.Value.IntFb[i].Data.Length < minSize) + { + if (!intFbList.Value.IntFb[i].Data.IsNull) + { + allocator.Free(intFbList.Value.IntFb[i].Data); + } + + // The data must be zeroed to fix a valgrind error from the C loop filter + // due to access uninitialized memory in frame border. It could be + // skipped if border were totally removed. + intFbList.Value.IntFb[i].Data = allocator.Allocate((int)minSize); + if (intFbList.Value.IntFb[i].Data.IsNull) + { + return -1; + } + } + + fb.Data = intFbList.Value.IntFb[i].Data; + intFbList.Value.IntFb[i].InUse = true; + + // Set the frame buffer's private data to point at the internal frame buffer. 
+ fb.Priv = new Ptr(ref intFbList.Value.IntFb[i]); + return 0; + } + + public static int ReleaseFrameBuffer(Ptr cbPriv, ref VpxCodecFrameBuffer fb) + { + if (!fb.Priv.IsNull) + { + fb.Priv.Value.InUse = false; + } + + return 0; + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs index 9fa5842a6..fc19f76d8 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs @@ -8,11 +8,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 internal static class Idct { private delegate void Transform1D(ReadOnlySpan input, Span output); + private delegate void HighbdTransform1D(ReadOnlySpan input, Span output, int bd); private struct Transform2D { - public Transform1D Cols, Rows; // Vertical and horizontal + public readonly Transform1D Cols; // Vertical and horizontal + public readonly Transform1D Rows; // Vertical and horizontal public Transform2D(Transform1D cols, Transform1D rows) { @@ -23,7 +25,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private struct HighbdTransform2D { - public HighbdTransform1D Cols, Rows; // Vertical and horizontal + public readonly HighbdTransform1D Cols; // Vertical and horizontal + public readonly HighbdTransform1D Rows; // Vertical and horizontal public HighbdTransform2D(HighbdTransform1D cols, HighbdTransform1D rows) { @@ -32,24 +35,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - private static readonly Transform2D[] Iht4 = new Transform2D[] + private static readonly Transform2D[] Iht4 = { - new Transform2D(Idct4, Idct4), // DCT_DCT = 0 - new Transform2D(Iadst4, Idct4), // ADST_DCT = 1 - new Transform2D(Idct4, Iadst4), // DCT_ADST = 2 - new Transform2D(Iadst4, Iadst4) // ADST_ADST = 3 + new(Idct4, Idct4), // DCT_DCT = 0 + new(Iadst4, Idct4), // ADST_DCT = 1 + new(Idct4, Iadst4), // DCT_ADST = 2 + new(Iadst4, Iadst4) // ADST_ADST = 3 }; public static void Iht4x416Add(ReadOnlySpan input, Span dest, int stride, int txType) { - int i, j; Span 
output = stackalloc int[4 * 4]; Span outptr = output; Span tempIn = stackalloc int[4]; Span tempOut = stackalloc int[4]; // Inverse transform row vectors - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { Iht4[txType].Rows(input, outptr); input = input.Slice(4); @@ -57,32 +59,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Inverse transform column vectors - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - tempIn[j] = output[j * 4 + i]; + tempIn[j] = output[(j * 4) + i]; } Iht4[txType].Cols(tempIn, tempOut); - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); } } } - private static readonly Transform2D[] Iht8 = new Transform2D[] + private static readonly Transform2D[] Iht8 = { - new Transform2D(Idct8, Idct8), // DCT_DCT = 0 - new Transform2D(Iadst8, Idct8), // ADST_DCT = 1 - new Transform2D(Idct8, Iadst8), // DCT_ADST = 2 - new Transform2D(Iadst8, Iadst8) // ADST_ADST = 3 + new(Idct8, Idct8), // DCT_DCT = 0 + new(Iadst8, Idct8), // ADST_DCT = 1 + new(Idct8, Iadst8), // DCT_ADST = 2 + new(Iadst8, Iadst8) // ADST_ADST = 3 }; public static void Iht8x864Add(ReadOnlySpan input, Span dest, int stride, int txType) { - int i, j; Span output = stackalloc int[8 * 8]; Span outptr = output; Span tempIn = stackalloc int[8]; @@ -90,7 +92,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 Transform2D ht = Iht8[txType]; // Inverse transform row vectors - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { ht.Rows(input, outptr); input = input.Slice(8); @@ -98,32 +100,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Inverse transform column vectors - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - tempIn[j] = output[j * 8 + i]; + 
tempIn[j] = output[(j * 8) + i]; } ht.Cols(tempIn, tempOut); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); } } } - private static readonly Transform2D[] Iht16 = new Transform2D[] + private static readonly Transform2D[] Iht16 = { - new Transform2D(Idct16, Idct16), // DCT_DCT = 0 - new Transform2D(Iadst16, Idct16), // ADST_DCT = 1 - new Transform2D(Idct16, Iadst16), // DCT_ADST = 2 - new Transform2D(Iadst16, Iadst16) // ADST_ADST = 3 + new(Idct16, Idct16), // DCT_DCT = 0 + new(Iadst16, Idct16), // ADST_DCT = 1 + new(Idct16, Iadst16), // DCT_ADST = 2 + new(Iadst16, Iadst16) // ADST_ADST = 3 }; public static void Iht16x16256Add(ReadOnlySpan input, Span dest, int stride, int txType) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; @@ -131,7 +133,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 Transform2D ht = Iht16[txType]; // Rows - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { ht.Rows(input, outptr); input = input.Slice(16); @@ -139,17 +141,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } ht.Cols(tempIn, tempOut); - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); + dest[(j * stride) + i] = + ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); } } } @@ -271,7 +274,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } public static void Iht16x16Add(TxType txType, ReadOnlySpan input, Span dest, - int stride, int eob) + int stride, int eob) { if (txType == 
TxType.DctDct) { @@ -283,24 +286,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - private static readonly HighbdTransform2D[] HighbdIht4 = new HighbdTransform2D[] + private static readonly HighbdTransform2D[] HighbdIht4 = { - new HighbdTransform2D(HighbdIdct4, HighbdIdct4), // DCT_DCT = 0 - new HighbdTransform2D(HighbdIadst4, HighbdIdct4), // ADST_DCT = 1 - new HighbdTransform2D(HighbdIdct4, HighbdIadst4), // DCT_ADST = 2 - new HighbdTransform2D(HighbdIadst4, HighbdIadst4) // ADST_ADST = 3 + new(HighbdIdct4, HighbdIdct4), // DCT_DCT = 0 + new(HighbdIadst4, HighbdIdct4), // ADST_DCT = 1 + new(HighbdIdct4, HighbdIadst4), // DCT_ADST = 2 + new(HighbdIadst4, HighbdIadst4) // ADST_ADST = 3 }; public static void HighbdIht4x416Add(ReadOnlySpan input, Span dest, int stride, int txType, int bd) { - int i, j; Span output = stackalloc int[4 * 4]; Span outptr = output; Span tempIn = stackalloc int[4]; Span tempOut = stackalloc int[4]; // Inverse transform row vectors. - for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { HighbdIht4[txType].Rows(input, outptr, bd); input = input.Slice(4); @@ -308,32 +310,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Inverse transform column vectors. 
- for (i = 0; i < 4; ++i) + for (int i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - tempIn[j] = output[j * 4 + i]; + tempIn[j] = output[(j * 4) + i]; } HighbdIht4[txType].Cols(tempIn, tempOut, bd); - for (j = 0; j < 4; ++j) + for (int j = 0; j < 4; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd); } } } - private static readonly HighbdTransform2D[] HighIht8 = new HighbdTransform2D[] + private static readonly HighbdTransform2D[] HighIht8 = { - new HighbdTransform2D(HighbdIdct8, HighbdIdct8), // DCT_DCT = 0 - new HighbdTransform2D(HighbdIadst8, HighbdIdct8), // ADST_DCT = 1 - new HighbdTransform2D(HighbdIdct8, HighbdIadst8), // DCT_ADST = 2 - new HighbdTransform2D(HighbdIadst8, HighbdIadst8) // ADST_ADST = 3 + new(HighbdIdct8, HighbdIdct8), // DCT_DCT = 0 + new(HighbdIadst8, HighbdIdct8), // ADST_DCT = 1 + new(HighbdIdct8, HighbdIadst8), // DCT_ADST = 2 + new(HighbdIadst8, HighbdIadst8) // ADST_ADST = 3 }; public static void HighbdIht8x864Add(ReadOnlySpan input, Span dest, int stride, int txType, int bd) { - int i, j; Span output = stackalloc int[8 * 8]; Span outptr = output; Span tempIn = stackalloc int[8]; @@ -341,7 +343,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 HighbdTransform2D ht = HighIht8[txType]; // Inverse transform row vectors. - for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { ht.Rows(input, outptr, bd); input = input.Slice(8); @@ -349,32 +351,33 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Inverse transform column vectors. 
- for (i = 0; i < 8; ++i) + for (int i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - tempIn[j] = output[j * 8 + i]; + tempIn[j] = output[(j * 8) + i]; } ht.Cols(tempIn, tempOut, bd); - for (j = 0; j < 8; ++j) + for (int j = 0; j < 8; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); } } } - private static readonly HighbdTransform2D[] HighIht16 = new HighbdTransform2D[] + private static readonly HighbdTransform2D[] HighIht16 = { - new HighbdTransform2D(HighbdIdct16, HighbdIdct16), // DCT_DCT = 0 - new HighbdTransform2D(HighbdIadst16, HighbdIdct16), // ADST_DCT = 1 - new HighbdTransform2D(HighbdIdct16, HighbdIadst16), // DCT_ADST = 2 - new HighbdTransform2D(HighbdIadst16, HighbdIadst16) // ADST_ADST = 3 + new(HighbdIdct16, HighbdIdct16), // DCT_DCT = 0 + new(HighbdIadst16, HighbdIdct16), // ADST_DCT = 1 + new(HighbdIdct16, HighbdIadst16), // DCT_ADST = 2 + new(HighbdIadst16, HighbdIadst16) // ADST_ADST = 3 }; - public static void HighbdIht16x16256Add(ReadOnlySpan input, Span dest, int stride, int txType, int bd) + public static void HighbdIht16x16256Add(ReadOnlySpan input, Span dest, int stride, int txType, + int bd) { - int i, j; Span output = stackalloc int[16 * 16]; Span outptr = output; Span tempIn = stackalloc int[16]; @@ -382,7 +385,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 HighbdTransform2D ht = HighIht16[txType]; // Rows - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { ht.Rows(input, outptr, bd); input = input.Slice(16); @@ -390,17 +393,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Columns - for (i = 0; i < 16; ++i) + for (int i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) + for (int j = 0; j < 16; ++j) { - tempIn[j] = output[j * 16 + i]; + tempIn[j] = output[(j * 16) + i]; } ht.Cols(tempIn, tempOut, bd); - for (j = 0; j < 16; 
++j) + for (int j = 0; j < 16; ++j) { - dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); + dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i], + BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); } } } @@ -440,7 +444,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // DC only DCT coefficient if (eob == 1) { - vpx_Highbdidct8x8_1_add_c(input, dest, stride, bd); + VpxHighbdidct8x81AddC(input, dest, stride, bd); } else if (eob <= 12) { @@ -497,7 +501,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } // Iht - public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan input, Span dest, int stride, int eob, int bd) + public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan input, Span dest, int stride, + int eob, int bd) { if (txType == TxType.DctDct) { @@ -509,7 +514,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan input, Span dest, int stride, int eob, int bd) + public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan input, Span dest, int stride, + int eob, int bd) { if (txType == TxType.DctDct) { @@ -521,7 +527,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan input, Span dest, int stride, int eob, int bd) + public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan input, Span dest, int stride, + int eob, int bd) { if (txType == TxType.DctDct) { @@ -533,4 +540,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs index baa0ab998..e23eb758c 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs @@ -2,7 +2,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { - class InternalErrorException : Exception + internal class InternalErrorException : 
Exception { public InternalErrorException(string message) : base(message) { @@ -12,4 +12,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs index 68e9cb4bb..5da2a50be 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs @@ -11,4 +11,4 @@ throw new InternalErrorException(message); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs index 9ecccc64e..120d8763c 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs @@ -1,8 +1,12 @@ using Ryujinx.Common.Memory; using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Dsp; using Ryujinx.Graphics.Nvdec.Vp9.Types; using System; +using System.Diagnostics; using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; namespace Ryujinx.Graphics.Nvdec.Vp9 { @@ -13,11 +17,119 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 public const int MaxRefLfDeltas = 4; public const int MaxModeLfDeltas = 2; + private struct LfSync + { + private int[] _curSbCol; + private object[] _syncObjects; + private int _syncRange; + + private static int GetSyncRange(int width) + { + // nsync numbers are picked by testing. For example, for 4k + // video, using 4 gives best performance. 
+ if (width < 640) + { + return 1; + } + + if (width <= 1280) + { + return 2; + } + + if (width <= 4096) + { + return 4; + } + + return 8; + } + + public void Initialize(int width, int sbRows) + { + if (_curSbCol == null || _curSbCol.Length != sbRows) + { + _curSbCol = new int[sbRows]; + _syncObjects = new object[sbRows]; + + for (int i = 0; i < sbRows; i++) + { + _syncObjects[i] = new object(); + } + } + + _syncRange = GetSyncRange(width); + _curSbCol.AsSpan().Fill(-1); + } + + public void SyncRead(int r, int c) + { + if (_curSbCol == null) + { + return; + } + + int nsync = _syncRange; + + if (r != 0 && (c & (nsync - 1)) == 0) + { + object syncObject = _syncObjects[r - 1]; + lock (syncObject) + { + while (c > _curSbCol[r - 1] - nsync) + { + Monitor.Wait(syncObject); + } + } + } + } + + public void SyncWrite(int r, int c, int sbCols) + { + if (_curSbCol == null) + { + return; + } + + int nsync = _syncRange; + + int cur; + // Only signal when there are enough filtered SB for next row to run. + bool sig = true; + + if (c < sbCols - 1) + { + cur = c; + + if (c % nsync != 0) + { + sig = false; + } + } + else + { + cur = sbCols + nsync; + } + + if (sig) + { + object syncObject = _syncObjects[r]; + + lock (syncObject) + { + _curSbCol[r] = cur; + + Monitor.Pulse(syncObject); + } + } + } + } + // 64 bit masks for left transform size. Each 1 represents a position where // we should apply a loop filter across the left border of an 8x8 block // boundary. // - // In the case of TX_16X16 -> ( in low order byte first we end up with + // In the case of (int)TxSize.Tx16x16 . ( in low order byte first we end up with // a mask that looks like this // // 10101010 @@ -30,19 +142,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // 10101010 // // A loopfilter should be applied to every other 8x8 horizontally. 
- private static readonly ulong[] Left64X64TxformMask = new ulong[] + private static readonly ulong[] Left64x64TxformMask = { - 0xffffffffffffffffUL, // TX_4X4 - 0xffffffffffffffffUL, // TX_8x8 - 0x5555555555555555UL, // TX_16x16 - 0x1111111111111111UL, // TX_32x32 + 0xffffffffffffffffUL, // (int)TxSize.Tx4x4 + 0xffffffffffffffffUL, // (int)TxSize.Tx8x8 + 0x5555555555555555UL, // (int)TxSize.Tx16x16 + 0x1111111111111111UL // (int)TxSize.Tx32x32 }; // 64 bit masks for above transform size. Each 1 represents a position where // we should apply a loop filter across the top border of an 8x8 block // boundary. // - // In the case of TX_32x32 -> ( in low order byte first we end up with + // In the case of (int)TxSize.Tx32x32 . ( in low order byte first we end up with // a mask that looks like this // // 11111111 @@ -55,19 +167,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // 00000000 // // A loopfilter should be applied to every other 4 the row vertically. - private static readonly ulong[] Above64X64TxformMask = new ulong[] + private static readonly ulong[] Above64x64TxformMask = { - 0xffffffffffffffffUL, // TX_4X4 - 0xffffffffffffffffUL, // TX_8x8 - 0x00ff00ff00ff00ffUL, // TX_16x16 - 0x000000ff000000ffUL, // TX_32x32 + 0xffffffffffffffffUL, // (int)TxSize.Tx4x4 + 0xffffffffffffffffUL, // (int)TxSize.Tx8x8 + 0x00ff00ff00ff00ffUL, // (int)TxSize.Tx16x16 + 0x000000ff000000ffUL // (int)TxSize.Tx32x32 }; // 64 bit masks for prediction sizes (left). Each 1 represents a position // where left border of an 8x8 block. These are aligned to the right most // appropriate bit, and then shifted into place. // - // In the case of TX_16x32 -> ( low order byte first ) we end up with + // In the case of TX_16x32 . 
( low order byte first ) we end up with // a mask that looks like this : // // 10000000 @@ -78,59 +190,59 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // 00000000 // 00000000 // 00000000 - private static readonly ulong[] LeftPredictionMask = new ulong[] + private static readonly ulong[] LeftPredictionMask = { - 0x0000000000000001UL, // BLOCK_4X4, - 0x0000000000000001UL, // BLOCK_4X8, - 0x0000000000000001UL, // BLOCK_8X4, - 0x0000000000000001UL, // BLOCK_8X8, - 0x0000000000000101UL, // BLOCK_8X16, - 0x0000000000000001UL, // BLOCK_16X8, - 0x0000000000000101UL, // BLOCK_16X16, - 0x0000000001010101UL, // BLOCK_16X32, - 0x0000000000000101UL, // BLOCK_32X16, - 0x0000000001010101UL, // BLOCK_32X32, - 0x0101010101010101UL, // BLOCK_32X64, - 0x0000000001010101UL, // BLOCK_64X32, - 0x0101010101010101UL, // BLOCK_64X64 + 0x0000000000000001UL, // BLOCK_4x4, + 0x0000000000000001UL, // BLOCK_4x8, + 0x0000000000000001UL, // BLOCK_8x4, + 0x0000000000000001UL, // BLOCK_8x8, + 0x0000000000000101UL, // BLOCK_8x16, + 0x0000000000000001UL, // BLOCK_16x8, + 0x0000000000000101UL, // BLOCK_16x16, + 0x0000000001010101UL, // BLOCK_16x32, + 0x0000000000000101UL, // BLOCK_32x16, + 0x0000000001010101UL, // BLOCK_32x32, + 0x0101010101010101UL, // BLOCK_32x64, + 0x0000000001010101UL, // BLOCK_64x32, + 0x0101010101010101UL // BLOCK_64x64 }; // 64 bit mask to shift and set for each prediction size. 
- private static readonly ulong[] AbovePredictionMask = new ulong[] + private static readonly ulong[] AbovePredictionMask = { - 0x0000000000000001UL, // BLOCK_4X4 - 0x0000000000000001UL, // BLOCK_4X8 - 0x0000000000000001UL, // BLOCK_8X4 - 0x0000000000000001UL, // BLOCK_8X8 - 0x0000000000000001UL, // BLOCK_8X16, - 0x0000000000000003UL, // BLOCK_16X8 - 0x0000000000000003UL, // BLOCK_16X16 - 0x0000000000000003UL, // BLOCK_16X32, - 0x000000000000000fUL, // BLOCK_32X16, - 0x000000000000000fUL, // BLOCK_32X32, - 0x000000000000000fUL, // BLOCK_32X64, - 0x00000000000000ffUL, // BLOCK_64X32, - 0x00000000000000ffUL, // BLOCK_64X64 + 0x0000000000000001UL, // BLOCK_4x4 + 0x0000000000000001UL, // BLOCK_4x8 + 0x0000000000000001UL, // BLOCK_8x4 + 0x0000000000000001UL, // BLOCK_8x8 + 0x0000000000000001UL, // BLOCK_8x16, + 0x0000000000000003UL, // BLOCK_16x8 + 0x0000000000000003UL, // BLOCK_16x16 + 0x0000000000000003UL, // BLOCK_16x32, + 0x000000000000000fUL, // BLOCK_32x16, + 0x000000000000000fUL, // BLOCK_32x32, + 0x000000000000000fUL, // BLOCK_32x64, + 0x00000000000000ffUL, // BLOCK_64x32, + 0x00000000000000ffUL // BLOCK_64x64 }; // 64 bit mask to shift and set for each prediction size. A bit is set for // each 8x8 block that would be in the left most block of the given block // size in the 64x64 block. 
- private static readonly ulong[] SizeMask = new ulong[] + private static readonly ulong[] SizeMask = { - 0x0000000000000001UL, // BLOCK_4X4 - 0x0000000000000001UL, // BLOCK_4X8 - 0x0000000000000001UL, // BLOCK_8X4 - 0x0000000000000001UL, // BLOCK_8X8 - 0x0000000000000101UL, // BLOCK_8X16, - 0x0000000000000003UL, // BLOCK_16X8 - 0x0000000000000303UL, // BLOCK_16X16 - 0x0000000003030303UL, // BLOCK_16X32, - 0x0000000000000f0fUL, // BLOCK_32X16, - 0x000000000f0f0f0fUL, // BLOCK_32X32, - 0x0f0f0f0f0f0f0f0fUL, // BLOCK_32X64, - 0x00000000ffffffffUL, // BLOCK_64X32, - 0xffffffffffffffffUL, // BLOCK_64X64 + 0x0000000000000001UL, // BLOCK_4x4 + 0x0000000000000001UL, // BLOCK_4x8 + 0x0000000000000001UL, // BLOCK_8x4 + 0x0000000000000001UL, // BLOCK_8x8 + 0x0000000000000101UL, // BLOCK_8x16, + 0x0000000000000003UL, // BLOCK_16x8 + 0x0000000000000303UL, // BLOCK_16x16 + 0x0000000003030303UL, // BLOCK_16x32, + 0x0000000000000f0fUL, // BLOCK_32x16, + 0x000000000f0f0f0fUL, // BLOCK_32x32, + 0x0f0f0f0f0f0f0f0fUL, // BLOCK_32x64, + 0x00000000ffffffffUL, // BLOCK_64x32, + 0xffffffffffffffffUL // BLOCK_64x64 }; // These are used for masking the left and above borders. @@ -138,83 +250,83 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private const ulong AboveBorder = 0x000000ff000000ffUL; // 16 bit masks for uv transform sizes. 
- private static readonly ushort[] Left64X64TxformMaskUv = new ushort[] + private static readonly ushort[] Left64x64TxformMaskUv = { - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x5555, // TX_16x16 - 0x1111, // TX_32x32 + 0xffff, // (int)TxSize.Tx4x4 + 0xffff, // (int)TxSize.Tx8x8 + 0x5555, // (int)TxSize.Tx16x16 + 0x1111 // (int)TxSize.Tx32x32 }; - private static readonly ushort[] Above64X64TxformMaskUv = new ushort[] + private static readonly ushort[] Above64x64TxformMaskUv = { - 0xffff, // TX_4X4 - 0xffff, // TX_8x8 - 0x0f0f, // TX_16x16 - 0x000f, // TX_32x32 + 0xffff, // (int)TxSize.Tx4x4 + 0xffff, // (int)TxSize.Tx8x8 + 0x0f0f, // (int)TxSize.Tx16x16 + 0x000f // (int)TxSize.Tx32x32 }; // 16 bit left mask to shift and set for each uv prediction size. - private static readonly ushort[] LeftPredictionMaskUv = new ushort[] + private static readonly ushort[] LeftPredictionMaskUv = { - 0x0001, // BLOCK_4X4, - 0x0001, // BLOCK_4X8, - 0x0001, // BLOCK_8X4, - 0x0001, // BLOCK_8X8, - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8, - 0x0001, // BLOCK_16X16, - 0x0011, // BLOCK_16X32, - 0x0001, // BLOCK_32X16, - 0x0011, // BLOCK_32X32, - 0x1111, // BLOCK_32X64 - 0x0011, // BLOCK_64X32, - 0x1111, // BLOCK_64X64 + 0x0001, // BLOCK_4x4, + 0x0001, // BLOCK_4x8, + 0x0001, // BLOCK_8x4, + 0x0001, // BLOCK_8x8, + 0x0001, // BLOCK_8x16, + 0x0001, // BLOCK_16x8, + 0x0001, // BLOCK_16x16, + 0x0011, // BLOCK_16x32, + 0x0001, // BLOCK_32x16, + 0x0011, // BLOCK_32x32, + 0x1111, // BLOCK_32x64 + 0x0011, // BLOCK_64x32, + 0x1111 // BLOCK_64x64 }; // 16 bit above mask to shift and set for uv each prediction size. 
- private static readonly ushort[] AbovePredictionMaskUv = new ushort[] + private static readonly ushort[] AbovePredictionMaskUv = { - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0001, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0003, // BLOCK_32X32, - 0x0003, // BLOCK_32X64, - 0x000f, // BLOCK_64X32, - 0x000f, // BLOCK_64X64 + 0x0001, // BLOCK_4x4 + 0x0001, // BLOCK_4x8 + 0x0001, // BLOCK_8x4 + 0x0001, // BLOCK_8x8 + 0x0001, // BLOCK_8x16, + 0x0001, // BLOCK_16x8 + 0x0001, // BLOCK_16x16 + 0x0001, // BLOCK_16x32, + 0x0003, // BLOCK_32x16, + 0x0003, // BLOCK_32x32, + 0x0003, // BLOCK_32x64, + 0x000f, // BLOCK_64x32, + 0x000f // BLOCK_64x64 }; // 64 bit mask to shift and set for each uv prediction size - private static readonly ushort[] SizeMaskUv = new ushort[] + private static readonly ushort[] SizeMaskUv = { - 0x0001, // BLOCK_4X4 - 0x0001, // BLOCK_4X8 - 0x0001, // BLOCK_8X4 - 0x0001, // BLOCK_8X8 - 0x0001, // BLOCK_8X16, - 0x0001, // BLOCK_16X8 - 0x0001, // BLOCK_16X16 - 0x0011, // BLOCK_16X32, - 0x0003, // BLOCK_32X16, - 0x0033, // BLOCK_32X32, - 0x3333, // BLOCK_32X64, - 0x00ff, // BLOCK_64X32, - 0xffff, // BLOCK_64X64 + 0x0001, // BLOCK_4x4 + 0x0001, // BLOCK_4x8 + 0x0001, // BLOCK_8x4 + 0x0001, // BLOCK_8x8 + 0x0001, // BLOCK_8x16, + 0x0001, // BLOCK_16x8 + 0x0001, // BLOCK_16x16 + 0x0011, // BLOCK_16x32, + 0x0003, // BLOCK_32x16, + 0x0033, // BLOCK_32x32, + 0x3333, // BLOCK_32x64, + 0x00ff, // BLOCK_64x32, + 0xffff // BLOCK_64x64 }; private const ushort LeftBorderUv = 0x1111; private const ushort AboveBorderUv = 0x000f; - private static readonly int[] ModeLfLut = new int[] + private static readonly int[] ModeLfLut = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES - 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES + 1, 1, 0, 1 // INTER_MODES (ZEROMV == 0) }; private static byte 
GetFilterLevel(ref LoopFilterInfoN lfiN, ref ModeInfo mi) @@ -222,14 +334,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return lfiN.Lvl[mi.SegmentId][mi.RefFrame[0]][ModeLfLut[(int)mi.Mode]]; } - private static ref LoopFilterMask GetLfm(ref Types.LoopFilter lf, int miRow, int miCol) + private static Span GetLfm(ref Types.LoopFilter lf, int miRow, int miCol) { - return ref lf.Lfm[(miCol >> 3) + ((miRow >> 3) * lf.LfmStride)]; + return lf.Lfm.AsSpan().Slice((miCol >> 3) + ((miRow >> 3) * lf.LfmStride)); } // 8x8 blocks in a superblock. A "1" represents the first block in a 16x16 // or greater area. - private static readonly byte[][] FirstBlockIn16x16 = new byte[][] + private static readonly byte[][] FirstBlockIn16x16 = { new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }, new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -246,15 +358,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 ref LoopFilterInfoN lfiN = ref cm.LfInfo; int filterLevel = GetFilterLevel(ref lfiN, ref mi); TxSize txSizeUv = Luts.UvTxsizeLookup[(int)blockSize][(int)txSizeY][1][1]; - ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol); + ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol)[0]; ref ulong leftY = ref lfm.LeftY[(int)txSizeY]; ref ulong aboveY = ref lfm.AboveY[(int)txSizeY]; - ref ulong int4X4Y = ref lfm.Int4x4Y; + ref ulong int4x4Y = ref lfm.Int4x4Y; ref ushort leftUv = ref lfm.LeftUv[(int)txSizeUv]; ref ushort aboveUv = ref lfm.AboveUv[(int)txSizeUv]; - ref ushort int4X4Uv = ref lfm.Int4x4Uv; - int rowInSb = (miRow & 7); - int colInSb = (miCol & 7); + ref ushort int4x4Uv = ref lfm.Int4x4Uv; + int rowInSb = miRow & 7; + int colInSb = miCol & 7; int shiftY = colInSb + (rowInSb << 3); int shiftUv = (colInSb >> 1) + ((rowInSb >> 1) << 2); int buildUv = FirstBlockIn16x16[rowInSb][colInSb]; @@ -263,15 +375,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { return; } - else + + int index = shiftY; + + for (int i = 0; i < bh; i++) { - 
int index = shiftY; - int i; - for (i = 0; i < bh; i++) - { - MemoryMarshal.CreateSpan(ref lfm.LflY[index], 64 - index).Slice(0, bw).Fill((byte)filterLevel); - index += 8; - } + MemoryMarshal.CreateSpan(ref lfm.LflY[index], 64 - index).Slice(0, bw).Fill((byte)filterLevel); + index += 8; } // These set 1 in the current block size for the block size edges. @@ -305,13 +415,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // Add a mask for the transform size. The transform size mask is set to // be correct for a 64x64 prediction block size. Mask to match the size of // the block we are working on and then shift it into place. - aboveY |= (SizeMask[(int)blockSize] & Above64X64TxformMask[(int)txSizeY]) << shiftY; - leftY |= (SizeMask[(int)blockSize] & Left64X64TxformMask[(int)txSizeY]) << shiftY; + aboveY |= (SizeMask[(int)blockSize] & Above64x64TxformMask[(int)txSizeY]) << shiftY; + leftY |= (SizeMask[(int)blockSize] & Left64x64TxformMask[(int)txSizeY]) << shiftY; if (buildUv != 0) { - aboveUv |= (ushort)((SizeMaskUv[(int)blockSize] & Above64X64TxformMaskUv[(int)txSizeUv]) << shiftUv); - leftUv |= (ushort)((SizeMaskUv[(int)blockSize] & Left64X64TxformMaskUv[(int)txSizeUv]) << shiftUv); + aboveUv |= (ushort)((SizeMaskUv[(int)blockSize] & Above64x64TxformMaskUv[(int)txSizeUv]) << shiftUv); + leftUv |= (ushort)((SizeMaskUv[(int)blockSize] & Left64x64TxformMaskUv[(int)txSizeUv]) << shiftUv); } // Try to determine what to do with the internal 4x4 block boundaries. These @@ -319,20 +429,154 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // internal ones can be skipped and don't depend on the prediction block size. 
if (txSizeY == TxSize.Tx4x4) { - int4X4Y |= SizeMask[(int)blockSize] << shiftY; + int4x4Y |= SizeMask[(int)blockSize] << shiftY; } if (buildUv != 0 && txSizeUv == TxSize.Tx4x4) { - int4X4Uv |= (ushort)((SizeMaskUv[(int)blockSize] & 0xffff) << shiftUv); + int4x4Uv |= (ushort)((SizeMaskUv[(int)blockSize] & 0xffff) << shiftUv); } } + private static void AdjustMask(ref Vp9Common cm, int miRow, int miCol, ref LoopFilterMask lfm) + { + const ulong leftBorder = 0x1111111111111111UL; + const ulong aboveBorder = 0x000000ff000000ffUL; + const ushort leftBorderUv = 0x1111; + const ushort aboveBorderUv = 0x000f; + + + // The largest loopfilter we have is 16x16 so we use the 16x16 mask + // for 32x32 transforms also. + lfm.LeftY[(int)TxSize.Tx16x16] |= lfm.LeftY[(int)TxSize.Tx32x32]; + lfm.AboveY[(int)TxSize.Tx16x16] |= lfm.AboveY[(int)TxSize.Tx32x32]; + lfm.LeftUv[(int)TxSize.Tx16x16] |= lfm.LeftUv[(int)TxSize.Tx32x32]; + lfm.AboveUv[(int)TxSize.Tx16x16] |= lfm.AboveUv[(int)TxSize.Tx32x32]; + + // We do at least 8 tap filter on every 32x32 even if the transform size + // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and + // remove it from the 4x4. + lfm.LeftY[(int)TxSize.Tx8x8] |= lfm.LeftY[(int)TxSize.Tx4x4] & leftBorder; + lfm.LeftY[(int)TxSize.Tx4x4] &= ~leftBorder; + lfm.AboveY[(int)TxSize.Tx8x8] |= lfm.AboveY[(int)TxSize.Tx4x4] & aboveBorder; + lfm.AboveY[(int)TxSize.Tx4x4] &= ~aboveBorder; + lfm.LeftUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.LeftUv[(int)TxSize.Tx4x4] & leftBorderUv); + lfm.LeftUv[(int)TxSize.Tx4x4] &= unchecked((ushort)~leftBorderUv); + lfm.AboveUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.AboveUv[(int)TxSize.Tx4x4] & aboveBorderUv); + lfm.AboveUv[(int)TxSize.Tx4x4] &= unchecked((ushort)~aboveBorderUv); + + // We do some special edge handling. 
+ if (miRow + Constants.MiBlockSize > cm.MiRows) + { + int rows = cm.MiRows - miRow; + + // Each pixel inside the border gets a 1, + ulong maskY = (1UL << (rows << 3)) - 1; + ushort maskUv = (ushort)((1 << (((rows + 1) >> 1) << 2)) - 1); + + // Remove values completely outside our border. + for (int i = 0; i < (int)TxSize.Tx32x32; i++) + { + lfm.LeftY[i] &= maskY; + lfm.AboveY[i] &= maskY; + lfm.LeftUv[i] &= maskUv; + lfm.AboveUv[i] &= maskUv; + } + + lfm.Int4x4Y &= maskY; + lfm.Int4x4Uv &= maskUv; + + // We don't apply a wide loop filter on the last uv block row. If set + // apply the shorter one instead. + if (rows == 1) + { + lfm.AboveUv[(int)TxSize.Tx8x8] |= lfm.AboveUv[(int)TxSize.Tx16x16]; + lfm.AboveUv[(int)TxSize.Tx16x16] = 0; + } + + if (rows == 5) + { + lfm.AboveUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.AboveUv[(int)TxSize.Tx16x16] & 0xff00); + lfm.AboveUv[(int)TxSize.Tx16x16] &= (ushort)~(lfm.AboveUv[(int)TxSize.Tx16x16] & 0xff00); + } + } + + if (miCol + Constants.MiBlockSize > cm.MiCols) + { + int columns = cm.MiCols - miCol; + + // Each pixel inside the border gets a 1, the multiply copies the border + // to where we need it. + ulong maskY = ((1UL << columns) - 1) * 0x0101010101010101UL; + ushort maskUv = (ushort)(((1 << ((columns + 1) >> 1)) - 1) * 0x1111); + + // Internal edges are not applied on the last column of the image so + // we mask 1 more for the internal edges + ushort maskUvInt = (ushort)(((1 << (columns >> 1)) - 1) * 0x1111); + + // Remove the bits outside the image edge. + for (int i = 0; i < (int)TxSize.Tx32x32; i++) + { + lfm.LeftY[i] &= maskY; + lfm.AboveY[i] &= maskY; + lfm.LeftUv[i] &= maskUv; + lfm.AboveUv[i] &= maskUv; + } + + lfm.Int4x4Y &= maskY; + lfm.Int4x4Uv &= maskUvInt; + + // We don't apply a wide loop filter on the last uv column. If set + // apply the shorter one instead. 
+ if (columns == 1) + { + lfm.LeftUv[(int)TxSize.Tx8x8] |= lfm.LeftUv[(int)TxSize.Tx16x16]; + lfm.LeftUv[(int)TxSize.Tx16x16] = 0; + } + + if (columns == 5) + { + lfm.LeftUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.LeftUv[(int)TxSize.Tx16x16] & 0xcccc); + lfm.LeftUv[(int)TxSize.Tx16x16] &= (ushort)~(lfm.LeftUv[(int)TxSize.Tx16x16] & 0xcccc); + } + } + + // We don't apply a loop filter on the first column in the image, mask that + // out. + if (miCol == 0) + { + for (int i = 0; i < (int)TxSize.Tx32x32; i++) + { + lfm.LeftY[i] &= 0xfefefefefefefefeUL; + lfm.LeftUv[i] &= 0xeeee; + } + } + + // Assert if we try to apply 2 different loop filters at the same position. + Debug.Assert((lfm.LeftY[(int)TxSize.Tx16x16] & lfm.LeftY[(int)TxSize.Tx8x8]) == 0); + Debug.Assert((lfm.LeftY[(int)TxSize.Tx16x16] & lfm.LeftY[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.LeftY[(int)TxSize.Tx8x8] & lfm.LeftY[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.Int4x4Y & lfm.LeftY[(int)TxSize.Tx16x16]) == 0); + Debug.Assert((lfm.LeftUv[(int)TxSize.Tx16x16] & lfm.LeftUv[(int)TxSize.Tx8x8]) == 0); + Debug.Assert((lfm.LeftUv[(int)TxSize.Tx16x16] & lfm.LeftUv[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.LeftUv[(int)TxSize.Tx8x8] & lfm.LeftUv[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.Int4x4Uv & lfm.LeftUv[(int)TxSize.Tx16x16]) == 0); + Debug.Assert((lfm.AboveY[(int)TxSize.Tx16x16] & lfm.AboveY[(int)TxSize.Tx8x8]) == 0); + Debug.Assert((lfm.AboveY[(int)TxSize.Tx16x16] & lfm.AboveY[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.AboveY[(int)TxSize.Tx8x8] & lfm.AboveY[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.Int4x4Y & lfm.AboveY[(int)TxSize.Tx16x16]) == 0); + Debug.Assert((lfm.AboveUv[(int)TxSize.Tx16x16] & lfm.AboveUv[(int)TxSize.Tx8x8]) == 0); + Debug.Assert((lfm.AboveUv[(int)TxSize.Tx16x16] & lfm.AboveUv[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.AboveUv[(int)TxSize.Tx8x8] & lfm.AboveUv[(int)TxSize.Tx4x4]) == 0); + Debug.Assert((lfm.Int4x4Uv & lfm.AboveUv[(int)TxSize.Tx16x16]) == 0); + } 
+ public static unsafe void ResetLfm(ref Vp9Common cm) { if (cm.Lf.FilterLevel != 0) { - MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(), ((cm.MiRows + (Constants.MiBlockSize - 1)) >> 3) * cm.Lf.LfmStride); + MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(), + ((cm.MiRows + (Constants.MiBlockSize - 1)) >> 3) * cm.Lf.LfmStride); } } @@ -348,9 +592,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (sharpnessLvl > 0) { - if (blockInsideLimit > (9 - sharpnessLvl)) + if (blockInsideLimit > 9 - sharpnessLvl) { - blockInsideLimit = (9 - sharpnessLvl); + blockInsideLimit = 9 - sharpnessLvl; } } @@ -360,7 +604,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } lfi.Lfthr[lvl].Lim.AsSpan().Fill((byte)blockInsideLimit); - lfi.Lfthr[lvl].Mblim.AsSpan().Fill((byte)(2 * (lvl + 2) + blockInsideLimit)); + lfi.Lfthr[lvl].Mblim.AsSpan().Fill((byte)((2 * (lvl + 2)) + blockInsideLimit)); } } @@ -385,10 +629,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 for (segId = 0; segId < Constants.MaxSegments; segId++) { int lvlSeg = defaultFiltLvl; - if (seg.IsSegFeatureActive(segId, SegLvlFeatures.SegLvlAltLf) != 0) + if (seg.IsSegFeatureActive(segId, SegLvlFeatures.AltLf) != 0) { - int data = seg.GetSegData(segId, SegLvlFeatures.SegLvlAltLf); - lvlSeg = Math.Clamp(seg.AbsDelta == Constants.SegmentAbsData ? data : defaultFiltLvl + data, 0, MaxLoopFilter); + int data = seg.GetSegData(segId, SegLvlFeatures.AltLf); + lvlSeg = Math.Clamp(seg.AbsDelta == Constants.SegmentAbsData ? 
data : defaultFiltLvl + data, 0, + MaxLoopFilter); } if (!lf.ModeRefDeltaEnabled) @@ -400,19 +645,1322 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 else { int refr, mode; - int intraLvl = lvlSeg + lf.RefDeltas[Constants.IntraFrame] * scale; + int intraLvl = lvlSeg + (lf.RefDeltas[Constants.IntraFrame] * scale); lfi.Lvl[segId][Constants.IntraFrame][0] = (byte)Math.Clamp(intraLvl, 0, MaxLoopFilter); for (refr = Constants.LastFrame; refr < Constants.MaxRefFrames; ++refr) { for (mode = 0; mode < MaxModeLfDeltas; ++mode) { - int interLvl = lvlSeg + lf.RefDeltas[refr] * scale + lf.ModeDeltas[mode] * scale; + int interLvl = lvlSeg + (lf.RefDeltas[refr] * scale) + (lf.ModeDeltas[mode] * scale); lfi.Lvl[segId][refr][mode] = (byte)Math.Clamp(interLvl, 0, MaxLoopFilter); } } } } } + + private static void FilterSelectivelyVertRow2( + int subsamplingFactor, + ArrayPtr s, + int pitch, + uint mask16x16, + uint mask8x8, + uint mask4x4, + uint mask4x4Int, + ReadOnlySpan lfthr, + ReadOnlySpan lfl) + { + uint dualMaskCutoff = subsamplingFactor != 0 ? 0xffu : 0xffffu; + int lflForward = subsamplingFactor != 0 ? 4 : 8; + uint dualOne = 1u | (1u << lflForward); + Span> ss = stackalloc ArrayPtr[2]; + Span lfis = stackalloc LoopFilterThresh[2]; + ss[0] = s; + + for (uint mask = (mask16x16 | mask8x8 | mask4x4 | mask4x4Int) & dualMaskCutoff; + mask != 0; + mask = (mask & ~dualOne) >> 1) + { + if ((mask & dualOne) != 0) + { + lfis[0] = lfthr[lfl[0]]; + lfis[1] = lfthr[lfl[lflForward]]; + ss[1] = ss[0].Slice(8 * pitch); + + if ((mask16x16 & dualOne) != 0) + { + if ((mask16x16 & dualOne) == dualOne) + { + LoopFilterAuto.LpfVertical16Dual(ss[0], pitch, lfis[0].Mblim.AsSpan(), lfis[0].Lim.AsSpan(), + lfis[0].HevThr.AsSpan()); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask16x16 & 1) == 0 ? 1 : 0]; + LoopFilterAuto.LpfVertical16(ss[(mask16x16 & 1) == 0 ? 
1 : 0], pitch, lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + } + + if ((mask8x8 & dualOne) != 0) + { + if ((mask8x8 & dualOne) == dualOne) + { + LoopFilterAuto.LpfVertical8Dual( + ss[0], + pitch, + lfis[0].Mblim.AsSpan(), + lfis[0].Lim.AsSpan(), + lfis[0].HevThr.AsSpan(), + lfis[1].Mblim.AsSpan(), + lfis[1].Lim.AsSpan(), + lfis[1].HevThr.AsSpan()); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask8x8 & 1) == 0 ? 1 : 0]; + LoopFilterAuto.LpfVertical8( + ss[(mask8x8 & 1) == 0 ? 1 : 0], + pitch, + lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + } + + if ((mask4x4 & dualOne) != 0) + { + if ((mask4x4 & dualOne) == dualOne) + { + LoopFilterAuto.LpfVertical4Dual( + ss[0], + pitch, + lfis[0].Mblim.AsSpan(), + lfis[0].Lim.AsSpan(), + lfis[0].HevThr.AsSpan(), + lfis[1].Mblim.AsSpan(), + lfis[1].Lim.AsSpan(), + lfis[1].HevThr.AsSpan()); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask4x4 & 1) == 0 ? 1 : 0]; + LoopFilterAuto.LpfVertical4(ss[(mask4x4 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + } + + if ((mask4x4Int & dualOne) != 0) + { + if ((mask4x4Int & dualOne) == dualOne) + { + LoopFilterAuto.LpfVertical4Dual( + ss[0].Slice(4), + pitch, + lfis[0].Mblim.AsSpan(), + lfis[0].Lim.AsSpan(), + lfis[0].HevThr.AsSpan(), + lfis[1].Mblim.AsSpan(), + lfis[1].Lim.AsSpan(), + lfis[1].HevThr.AsSpan()); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask4x4Int & 1) == 0 ? 1 : 0]; + LoopFilterAuto.LpfVertical4(ss[(mask4x4Int & 1) == 0 ? 
1 : 0].Slice(4), pitch, + lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + } + } + + ss[0] = ss[0].Slice(8); + lfl = lfl.Slice(1); + mask16x16 >>= 1; + mask8x8 >>= 1; + mask4x4 >>= 1; + mask4x4Int >>= 1; + } + } + + private static void HighbdFilterSelectivelyVertRow2( + int subsamplingFactor, + ArrayPtr s, + int pitch, + uint mask16x16, + uint mask8x8, + uint mask4x4, + uint mask4x4Int, + ReadOnlySpan lfthr, + ReadOnlySpan lfl, + int bd) + { + uint dualMaskCutoff = subsamplingFactor != 0 ? 0xffu : 0xffffu; + int lflForward = subsamplingFactor != 0 ? 4 : 8; + uint dualOne = 1u | (1u << lflForward); + Span> ss = stackalloc ArrayPtr[2]; + Span lfis = stackalloc LoopFilterThresh[2]; + ss[0] = s; + + for (uint mask = (mask16x16 | mask8x8 | mask4x4 | mask4x4Int) & dualMaskCutoff; + mask != 0; + mask = (mask & ~dualOne) >> 1) + { + if ((mask & dualOne) != 0) + { + lfis[0] = lfthr[lfl[0]]; + lfis[1] = lfthr[lfl[lflForward]]; + ss[1] = ss[0].Slice(8 * pitch); + + if ((mask16x16 & dualOne) != 0) + { + if ((mask16x16 & dualOne) == dualOne) + { + LoopFilterScalar.HighBdLpfVertical16Dual(ss[0], pitch, lfis[0].Mblim[0], lfis[0].Lim[0], + lfis[0].HevThr[0], bd); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask16x16 & 1) == 0 ? 1 : 0]; + LoopFilterScalar.HighBdLpfVertical16(ss[(mask16x16 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim[0], + lfi.Lim[0], lfi.HevThr[0], bd); + } + } + + if ((mask8x8 & dualOne) != 0) + { + if ((mask8x8 & dualOne) == dualOne) + { + LoopFilterScalar.HighBdLpfVertical8Dual( + ss[0], + pitch, + lfis[0].Mblim[0], + lfis[0].Lim[0], + lfis[0].HevThr[0], + lfis[1].Mblim[0], + lfis[1].Lim[0], + lfis[1].HevThr[0], + bd); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask8x8 & 1) == 0 ? 1 : 0]; + LoopFilterScalar.HighBdLpfVertical8( + ss[(mask8x8 & 1) == 0 ? 
1 : 0], + pitch, + lfi.Mblim[0], + lfi.Lim[0], + lfi.HevThr[0], + bd); + } + } + + if ((mask4x4 & dualOne) != 0) + { + if ((mask4x4 & dualOne) == dualOne) + { + LoopFilterScalar.HighBdLpfVertical4Dual( + ss[0], + pitch, + lfis[0].Mblim[0], + lfis[0].Lim[0], + lfis[0].HevThr[0], + lfis[1].Mblim[0], + lfis[1].Lim[0], + lfis[1].HevThr[0], + bd); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask4x4 & 1) == 0 ? 1 : 0]; + LoopFilterScalar.HighBdLpfVertical4(ss[(mask4x4 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim[0], + lfi.Lim[0], lfi.HevThr[0], bd); + } + } + + if ((mask4x4Int & dualOne) != 0) + { + if ((mask4x4Int & dualOne) == dualOne) + { + LoopFilterScalar.HighBdLpfVertical4Dual( + ss[0].Slice(4), + pitch, + lfis[0].Mblim[0], + lfis[0].Lim[0], + lfis[0].HevThr[0], + lfis[1].Mblim[0], + lfis[1].Lim[0], + lfis[1].HevThr[0], + bd); + } + else + { + ref LoopFilterThresh lfi = ref lfis[(mask4x4Int & 1) == 0 ? 1 : 0]; + LoopFilterScalar.HighBdLpfVertical4(ss[(mask4x4Int & 1) == 0 ? 1 : 0].Slice(4), pitch, + lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd); + } + } + } + + ss[0] = ss[0].Slice(8); + lfl = lfl.Slice(1); + mask16x16 >>= 1; + mask8x8 >>= 1; + mask4x4 >>= 1; + mask4x4Int >>= 1; + } + } + + private static void FilterSelectivelyHoriz( + ArrayPtr s, + int pitch, + uint mask16x16, + uint mask8x8, + uint mask4x4, + uint mask4x4Int, + ReadOnlySpan lfthr, + ReadOnlySpan lfl) + { + int count; + + for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= count) + { + count = 1; + if ((mask & 1) != 0) + { + LoopFilterThresh lfi = lfthr[lfl[0]]; + + if ((mask16x16 & 1) != 0) + { + if ((mask16x16 & 3) == 3) + { + LoopFilterAuto.LpfHorizontal16Dual(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + count = 2; + } + else + { + LoopFilterAuto.LpfHorizontal16(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + } + else if ((mask8x8 & 1) != 0) + { + if ((mask8x8 & 3) == 3) + { + // Next block's thresholds. 
+ LoopFilterThresh lfin = lfthr[lfl[1]]; + + LoopFilterAuto.LpfHorizontal8Dual( + s, + pitch, + lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan(), + lfin.Mblim.AsSpan(), + lfin.Lim.AsSpan(), + lfin.HevThr.AsSpan()); + + if ((mask4x4Int & 3) == 3) + { + LoopFilterAuto.LpfHorizontal4Dual( + s.Slice(4 * pitch), + pitch, + lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan(), + lfin.Mblim.AsSpan(), + lfin.Lim.AsSpan(), + lfin.HevThr.AsSpan()); + } + else if ((mask4x4Int & 1) != 0) + { + LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + else if ((mask4x4Int & 2) != 0) + { + LoopFilterAuto.LpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim.AsSpan(), + lfin.Lim.AsSpan(), lfin.HevThr.AsSpan()); + } + + count = 2; + } + else + { + LoopFilterAuto.LpfHorizontal8(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + + if ((mask4x4Int & 1) != 0) + { + LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + } + } + else if ((mask4x4 & 1) != 0) + { + if ((mask4x4 & 3) == 3) + { + // Next block's thresholds. 
+ LoopFilterThresh lfin = lfthr[lfl[1]]; + + LoopFilterAuto.LpfHorizontal4Dual( + s, + pitch, + lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan(), + lfin.Mblim.AsSpan(), + lfin.Lim.AsSpan(), + lfin.HevThr.AsSpan()); + + if ((mask4x4Int & 3) == 3) + { + LoopFilterAuto.LpfHorizontal4Dual( + s.Slice(4 * pitch), + pitch, + lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan(), + lfin.Mblim.AsSpan(), + lfin.Lim.AsSpan(), + lfin.HevThr.AsSpan()); + } + else if ((mask4x4Int & 1) != 0) + { + LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + else if ((mask4x4Int & 2) != 0) + { + LoopFilterAuto.LpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim.AsSpan(), + lfin.Lim.AsSpan(), lfin.HevThr.AsSpan()); + } + + count = 2; + } + else + { + LoopFilterAuto.LpfHorizontal4(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + + if ((mask4x4Int & 1) != 0) + { + LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), + lfi.Lim.AsSpan(), lfi.HevThr.AsSpan()); + } + } + } + else + { + LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + } + + s = s.Slice(8 * count); + lfl = lfl.Slice(count); + mask16x16 >>= count; + mask8x8 >>= count; + mask4x4 >>= count; + mask4x4Int >>= count; + } + } + + private static void HighbdFilterSelectivelyHoriz( + ArrayPtr s, + int pitch, + uint mask16x16, + uint mask8x8, + uint mask4x4, + uint mask4x4Int, + ReadOnlySpan lfthr, + ReadOnlySpan lfl, + int bd) + { + int count; + + for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= count) + { + count = 1; + if ((mask & 1) != 0) + { + LoopFilterThresh lfi = lfthr[lfl[0]]; + + if ((mask16x16 & 1) != 0) + { + if ((mask16x16 & 3) == 3) + { + LoopFilterScalar.HighBdLpfHorizontal16Dual(s, pitch, lfi.Mblim[0], lfi.Lim[0], + lfi.HevThr[0], bd); + count = 2; + } + else + { + 
LoopFilterScalar.HighBdLpfHorizontal16(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], + bd); + } + } + else if ((mask8x8 & 1) != 0) + { + if ((mask8x8 & 3) == 3) + { + // Next block's thresholds. + LoopFilterThresh lfin = lfthr[lfl[1]]; + + LoopFilterScalar.HighBdLpfHorizontal8Dual( + s, + pitch, + lfi.Mblim[0], + lfi.Lim[0], + lfi.HevThr[0], + lfin.Mblim[0], + lfin.Lim[0], + lfin.HevThr[0], + bd); + + if ((mask4x4Int & 3) == 3) + { + LoopFilterScalar.HighBdLpfHorizontal4Dual( + s.Slice(4 * pitch), + pitch, + lfi.Mblim[0], + lfi.Lim[0], + lfi.HevThr[0], + lfin.Mblim[0], + lfin.Lim[0], + lfin.HevThr[0], + bd); + } + else if ((mask4x4Int & 1) != 0) + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0], + lfi.Lim[0], lfi.HevThr[0], bd); + } + else if ((mask4x4Int & 2) != 0) + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim[0], + lfin.Lim[0], lfin.HevThr[0], bd); + } + + count = 2; + } + else + { + LoopFilterScalar.HighBdLpfHorizontal8(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], + bd); + + if ((mask4x4Int & 1) != 0) + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0], + lfi.Lim[0], lfi.HevThr[0], bd); + } + } + } + else if ((mask4x4 & 1) != 0) + { + if ((mask4x4 & 3) == 3) + { + // Next block's thresholds. 
+ LoopFilterThresh lfin = lfthr[lfl[1]]; + + LoopFilterScalar.HighBdLpfHorizontal4Dual( + s, + pitch, + lfi.Mblim[0], + lfi.Lim[0], + lfi.HevThr[0], + lfin.Mblim[0], + lfin.Lim[0], + lfin.HevThr[0], + bd); + + if ((mask4x4Int & 3) == 3) + { + LoopFilterScalar.HighBdLpfHorizontal4Dual( + s.Slice(4 * pitch), + pitch, + lfi.Mblim[0], + lfi.Lim[0], + lfi.HevThr[0], + lfin.Mblim[0], + lfin.Lim[0], + lfin.HevThr[0], + bd); + } + else if ((mask4x4Int & 1) != 0) + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0], + lfi.Lim[0], lfi.HevThr[0], bd); + } + else if ((mask4x4Int & 2) != 0) + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim[0], + lfin.Lim[0], lfin.HevThr[0], bd); + } + + count = 2; + } + else + { + LoopFilterScalar.HighBdLpfHorizontal4(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], + bd); + + if ((mask4x4Int & 1) != 0) + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0], + lfi.Lim[0], lfi.HevThr[0], bd); + } + } + } + else + { + LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0], lfi.Lim[0], + lfi.HevThr[0], bd); + } + } + + s = s.Slice(8 * count); + lfl = lfl.Slice(count); + mask16x16 >>= count; + mask8x8 >>= count; + mask4x4 >>= count; + mask4x4Int >>= count; + } + } + + private static void FilterSelectivelyVert( + ArrayPtr s, + int pitch, + uint mask16x16, + uint mask8x8, + uint mask4x4, + uint mask4x4Int, + ReadOnlySpan lfthr, + ReadOnlySpan lfl) + { + for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= 1) + { + LoopFilterThresh lfi = lfthr[lfl[0]]; + + if ((mask & 1) != 0) + { + if ((mask16x16 & 1) != 0) + { + LoopFilterAuto.LpfVertical16(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + else if ((mask8x8 & 1) != 0) + { + LoopFilterAuto.LpfVertical8(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + else if ((mask4x4 & 1) != 0) + { + 
LoopFilterAuto.LpfVertical4(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + } + + if ((mask4x4Int & 1) != 0) + { + LoopFilterAuto.LpfVertical4(s.Slice(4), pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), + lfi.HevThr.AsSpan()); + } + + s = s.Slice(8); + lfl = lfl.Slice(1); + mask16x16 >>= 1; + mask8x8 >>= 1; + mask4x4 >>= 1; + mask4x4Int >>= 1; + } + } + + private static void HighbdFilterSelectivelyVert( + ArrayPtr s, + int pitch, + uint mask16x16, + uint mask8x8, + uint mask4x4, + uint mask4x4Int, + ReadOnlySpan lfthr, + ReadOnlySpan lfl, + int bd) + { + for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= 1) + { + LoopFilterThresh lfi = lfthr[lfl[0]]; + + if ((mask & 1) != 0) + { + if ((mask16x16 & 1) != 0) + { + LoopFilterScalar.HighBdLpfVertical16(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd); + } + else if ((mask8x8 & 1) != 0) + { + LoopFilterScalar.HighBdLpfVertical8(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd); + } + else if ((mask4x4 & 1) != 0) + { + LoopFilterScalar.HighBdLpfVertical4(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd); + } + } + + if ((mask4x4Int & 1) != 0) + { + LoopFilterScalar.HighBdLpfVertical4(s.Slice(4), pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd); + } + + s = s.Slice(8); + lfl = lfl.Slice(1); + mask16x16 >>= 1; + mask8x8 >>= 1; + mask4x4 >>= 1; + mask4x4Int >>= 1; + } + } + + private static readonly byte[] Num4x4BlocksWideLookup = { 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16 }; + private static readonly byte[] Num4x4BlocksHighLookup = { 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16 }; + private static readonly byte[] Num8x8BlocksWideLookup = { 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8 }; + private static readonly byte[] Num8x8BlocksHighLookup = { 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8 }; + + private static void FilterBlockPlaneNon420( + ref Vp9Common cm, + ref MacroBlockDPlane plane, + ArrayPtr> mi8x8, + int miRow, + int miCol) + { + int ssX = plane.SubsamplingX; 
+ int ssY = plane.SubsamplingY; + int rowStep = 1 << ssY; + int colStep = 1 << ssX; + int rowStepStride = cm.MiStride * rowStep; + ref Buf2D dst = ref plane.Dst; + ArrayPtr dst0 = dst.Buf; + Span mask16x16 = stackalloc int[Constants.MiBlockSize]; + Span mask8x8 = stackalloc int[Constants.MiBlockSize]; + Span mask4x4 = stackalloc int[Constants.MiBlockSize]; + Span mask4x4Int = stackalloc int[Constants.MiBlockSize]; + Span lfl = stackalloc byte[Constants.MiBlockSize * Constants.MiBlockSize]; + + + for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += rowStep) + { + uint mask16x16C = 0; + uint mask8x8C = 0; + uint mask4x4C = 0; + uint borderMask; + + // Determine the vertical edges that need filtering + for (int c = 0; c < Constants.MiBlockSize && miCol + c < cm.MiCols; c += colStep) + { + ref ModeInfo mi = ref mi8x8[c].Value; + BlockSize sbType = mi.SbType; + bool skipThis = mi.Skip != 0 && mi.IsInterBlock(); + // left edge of current unit is block/partition edge -> no skip + bool blockEdgeLeft = Num4x4BlocksWideLookup[(int)sbType] <= 1 || (c & (Num8x8BlocksWideLookup[(int)sbType] - 1)) == 0; + bool skipThisC = skipThis && !blockEdgeLeft; + // top edge of current unit is block/partition edge -> no skip + bool blockEdgeAbove = Num4x4BlocksHighLookup[(int)sbType] <= 1 || (r & (Num8x8BlocksHighLookup[(int)sbType] - 1)) == 0; + bool skipThisR = skipThis && !blockEdgeAbove; + TxSize txSize = mi.GetUvTxSize(ref plane); + bool skipBorder4x4C = ssX != 0 && miCol + c == cm.MiCols - 1; + bool skipBorder4x4R = ssY != 0 && miRow + r == cm.MiRows - 1; + + // Filter level can vary per MI + if ((lfl[(r << 3) + (c >> ssX)] = GetFilterLevel(ref cm.LfInfo, ref mi)) == 0) + { + continue; + } + + // Build masks based on the transform size of each block + if (txSize == TxSize.Tx32x32) + { + if (!skipThisC && ((c >> ssX) & 3) == 0) + { + if (!skipBorder4x4C) + { + mask16x16C |= 1u << (c >> ssX); + } + else + { + mask8x8C |= 1u << (c >> ssX); + } + } + + if (!skipThisR 
&& ((r >> ssY) & 3) == 0) + { + if (!skipBorder4x4R) + { + mask16x16[r] |= 1 << (c >> ssX); + } + else + { + mask8x8[r] |= 1 << (c >> ssX); + } + } + } + else if (txSize == TxSize.Tx16x16) + { + if (!skipThisC && ((c >> ssX) & 1) == 0) + { + if (!skipBorder4x4C) + { + mask16x16C |= 1u << (c >> ssX); + } + else + { + mask8x8C |= 1u << (c >> ssX); + } + } + + if (!skipThisR && ((r >> ssY) & 1) == 0) + { + if (!skipBorder4x4R) + { + mask16x16[r] |= 1 << (c >> ssX); + } + else + { + mask8x8[r] |= 1 << (c >> ssX); + } + } + } + else + { + // force 8x8 filtering on 32x32 boundaries + if (!skipThisC) + { + if (txSize == TxSize.Tx8x8 || ((c >> ssX) & 3) == 0) + { + mask8x8C |= 1u << (c >> ssX); + } + else + { + mask4x4C |= 1u << (c >> ssX); + } + } + + if (!skipThisR) + { + if (txSize == TxSize.Tx8x8 || ((r >> ssY) & 3) == 0) + { + mask8x8[r] |= 1 << (c >> ssX); + } + else + { + mask4x4[r] |= 1 << (c >> ssX); + } + } + + if (!skipThis && txSize < TxSize.Tx8x8 && !skipBorder4x4C) + { + mask4x4Int[r] |= 1 << (c >> ssX); + } + } + } + + // Disable filtering on the leftmost column + borderMask = ~(miCol == 0 ? 1u : 0u); + + if (cm.UseHighBitDepth) + { + HighbdFilterSelectivelyVert( + ConvertToUshortPtr(dst.Buf), + dst.Stride, + mask16x16C & borderMask, + mask8x8C & borderMask, + mask4x4C & borderMask, + (uint)mask4x4Int[r], + cm.LfInfo.Lfthr.AsSpan(), + lfl.Slice(r << 3), + (int)cm.BitDepth); + } + else + { + FilterSelectivelyVert( + dst.Buf, + dst.Stride, + mask16x16C & borderMask, + mask8x8C & borderMask, + mask4x4C & borderMask, + (uint)mask4x4Int[r], + cm.LfInfo.Lfthr.AsSpan(), + lfl.Slice(r << 3)); + } + + dst.Buf = dst.Buf.Slice(8 * dst.Stride); + mi8x8 = mi8x8.Slice(rowStepStride); + } + + // Now do horizontal pass + dst.Buf = dst0; + for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += rowStep) + { + bool skipBorder4x4R = ssY != 0 && miRow + r == cm.MiRows - 1; + uint mask4x4IntR = skipBorder4x4R ? 
0u : (uint)mask4x4Int[r]; + + uint mask16x16R; + uint mask8x8R; + uint mask4x4R; + + if (miRow + r == 0) + { + mask16x16R = 0; + mask8x8R = 0; + mask4x4R = 0; + } + else + { + mask16x16R = (uint)mask16x16[r]; + mask8x8R = (uint)mask8x8[r]; + mask4x4R = (uint)mask4x4[r]; + } + + if (cm.UseHighBitDepth) + { + HighbdFilterSelectivelyHoriz( + ConvertToUshortPtr(dst.Buf), + dst.Stride, + mask16x16R, + mask8x8R, + mask4x4R, + mask4x4IntR, + cm.LfInfo.Lfthr.AsSpan(), + lfl.Slice(r << 3), + (int)cm.BitDepth); + } + else + { + FilterSelectivelyHoriz( + dst.Buf, + dst.Stride, + mask16x16R, + mask8x8R, + mask4x4R, + mask4x4IntR, + cm.LfInfo.Lfthr.AsSpan(), + lfl.Slice(r << 3)); + } + + dst.Buf = dst.Buf.Slice(8 * dst.Stride); + } + } + + private static void FilterBlockPlaneSs00(ref Vp9Common cm, ref MacroBlockDPlane plane, int miRow, + ref LoopFilterMask lfm) + { + ref Buf2D dst = ref plane.Dst; + ArrayPtr dst0 = dst.Buf; + ulong mask16x16 = lfm.LeftY[(int)TxSize.Tx16x16]; + ulong mask8x8 = lfm.LeftY[(int)TxSize.Tx8x8]; + ulong mask4x4 = lfm.LeftY[(int)TxSize.Tx4x4]; + ulong mask4x4Int = lfm.Int4x4Y; + + Debug.Assert(plane.SubsamplingX == 0 && plane.SubsamplingY == 0); + + // Vertical pass: do 2 rows at one time + for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += 2) + { + if (cm.UseHighBitDepth) + { + // Disable filtering on the leftmost column. + HighbdFilterSelectivelyVertRow2( + plane.SubsamplingX, + ConvertToUshortPtr(dst.Buf), + dst.Stride, + (uint)mask16x16, + (uint)mask8x8, + (uint)mask4x4, + (uint)mask4x4Int, + cm.LfInfo.Lfthr.AsSpan(), + lfm.LflY.AsSpan().Slice(r << 3), + (int)cm.BitDepth); + } + else + { + // Disable filtering on the leftmost column. 
+ FilterSelectivelyVertRow2( + plane.SubsamplingX, + dst.Buf, + dst.Stride, + (uint)mask16x16, + (uint)mask8x8, + (uint)mask4x4, + (uint)mask4x4Int, + cm.LfInfo.Lfthr.AsSpan(), + lfm.LflY.AsSpan().Slice(r << 3)); + } + + dst.Buf = dst.Buf.Slice(16 * dst.Stride); + mask16x16 >>= 16; + mask8x8 >>= 16; + mask4x4 >>= 16; + mask4x4Int >>= 16; + } + + // Horizontal pass + dst.Buf = dst0; + mask16x16 = lfm.AboveY[(int)TxSize.Tx16x16]; + mask8x8 = lfm.AboveY[(int)TxSize.Tx8x8]; + mask4x4 = lfm.AboveY[(int)TxSize.Tx4x4]; + mask4x4Int = lfm.Int4x4Y; + + for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r++) + { + uint mask16x16R; + uint mask8x8R; + uint mask4x4R; + + if (miRow + r == 0) + { + mask16x16R = 0; + mask8x8R = 0; + mask4x4R = 0; + } + else + { + mask16x16R = (uint)mask16x16 & 0xff; + mask8x8R = (uint)mask8x8 & 0xff; + mask4x4R = (uint)mask4x4 & 0xff; + } + + if (cm.UseHighBitDepth) + { + HighbdFilterSelectivelyHoriz( + ConvertToUshortPtr(dst.Buf), + dst.Stride, + mask16x16R, + mask8x8R, + mask4x4R, + (uint)mask4x4Int & 0xff, + cm.LfInfo.Lfthr.AsSpan(), + lfm.LflY.AsSpan().Slice(r << 3), + (int)cm.BitDepth); + } + else + { + FilterSelectivelyHoriz( + dst.Buf, + dst.Stride, + mask16x16R, + mask8x8R, + mask4x4R, + (uint)mask4x4Int & 0xff, + cm.LfInfo.Lfthr.AsSpan(), + lfm.LflY.AsSpan().Slice(r << 3)); + } + + dst.Buf = dst.Buf.Slice(8 * dst.Stride); + mask16x16 >>= 8; + mask8x8 >>= 8; + mask4x4 >>= 8; + mask4x4Int >>= 8; + } + } + + private static void FilterBlockPlaneSs11(ref Vp9Common cm, ref MacroBlockDPlane plane, int miRow, + ref LoopFilterMask lfm) + { + Buf2D dst = plane.Dst; + ArrayPtr dst0 = dst.Buf; + + Span lflUv = stackalloc byte[16]; + + ushort mask16x16 = lfm.LeftUv[(int)TxSize.Tx16x16]; + ushort mask8x8 = lfm.LeftUv[(int)TxSize.Tx8x8]; + ushort mask4x4 = lfm.LeftUv[(int)TxSize.Tx4x4]; + ushort mask4x4Int = lfm.Int4x4Uv; + + Debug.Assert(plane.SubsamplingX == 1 && plane.SubsamplingY == 1); + + // Vertical pass: do 2 rows at one time + 
for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += 4) + { + for (int c = 0; c < Constants.MiBlockSize >> 1; c++) + { + lflUv[(r << 1) + c] = lfm.LflY[(r << 3) + (c << 1)]; + lflUv[((r + 2) << 1) + c] = lfm.LflY[((r + 2) << 3) + (c << 1)]; + } + + if (cm.UseHighBitDepth) + { + // Disable filtering on the leftmost column. + HighbdFilterSelectivelyVertRow2( + plane.SubsamplingX, + ConvertToUshortPtr(dst.Buf), + dst.Stride, + mask16x16, + mask8x8, + mask4x4, + mask4x4Int, + cm.LfInfo.Lfthr.AsSpan(), + lflUv.Slice(r << 1), + (int)cm.BitDepth); + } + else + { + // Disable filtering on the leftmost column. + FilterSelectivelyVertRow2( + plane.SubsamplingX, + dst.Buf, + dst.Stride, + mask16x16, + mask8x8, + mask4x4, + mask4x4Int, + cm.LfInfo.Lfthr.AsSpan(), + lflUv.Slice(r << 1)); + } + + dst.Buf = dst.Buf.Slice(16 * dst.Stride); + mask16x16 >>= 8; + mask8x8 >>= 8; + mask4x4 >>= 8; + mask4x4Int >>= 8; + } + + // Horizontal pass + dst.Buf = dst0; + mask16x16 = lfm.AboveUv[(int)TxSize.Tx16x16]; + mask8x8 = lfm.AboveUv[(int)TxSize.Tx8x8]; + mask4x4 = lfm.AboveUv[(int)TxSize.Tx4x4]; + mask4x4Int = lfm.Int4x4Uv; + + for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += 2) + { + bool skipBorder4x4R = miRow + r == cm.MiRows - 1; + uint mask4x4IntR = skipBorder4x4R ? 
0u : (uint)mask4x4Int & 0xf; + uint mask16x16R; + uint mask8x8R; + uint mask4x4R; + + if (miRow + r == 0) + { + mask16x16R = 0; + mask8x8R = 0; + mask4x4R = 0; + } + else + { + mask16x16R = (uint)mask16x16 & 0xf; + mask8x8R = (uint)mask8x8 & 0xf; + mask4x4R = (uint)mask4x4 & 0xf; + } + + if (cm.UseHighBitDepth) + { + HighbdFilterSelectivelyHoriz( + ConvertToUshortPtr(dst.Buf), + dst.Stride, + mask16x16R, + mask8x8R, + mask4x4R, + mask4x4IntR, + cm.LfInfo.Lfthr.AsSpan(), + lflUv.Slice(r << 1), + (int)cm.BitDepth); + } + else + { + FilterSelectivelyHoriz( + dst.Buf, + dst.Stride, + mask16x16R, + mask8x8R, + mask4x4R, + mask4x4IntR, + cm.LfInfo.Lfthr.AsSpan(), + lflUv.Slice(r << 1)); + } + + dst.Buf = dst.Buf.Slice(8 * dst.Stride); + mask16x16 >>= 4; + mask8x8 >>= 4; + mask4x4 >>= 4; + mask4x4Int >>= 4; + } + } + + private enum LfPath + { + LfPathSlow, + LfPath420, + LfPath444 + } + + private static void LoopFilterRows( + ref Surface frameBuffer, + ref Vp9Common cm, + Array3 planes, + int start, + int stop, + int step, + bool yOnly, + LfSync lfSync) + { + int numPlanes = yOnly ? 
1 : Constants.MaxMbPlane; + int sbCols = TileInfo.MiColsAlignedToSb(cm.MiCols) >> Constants.MiBlockSizeLog2; + LfPath path; + int miRow, miCol; + + if (yOnly) + { + path = LfPath.LfPath444; + } + else if (planes[1].SubsamplingY == 1 && planes[1].SubsamplingX == 1) + { + path = LfPath.LfPath420; + } + else if (planes[1].SubsamplingY == 0 && planes[1].SubsamplingX == 0) + { + path = LfPath.LfPath444; + } + else + { + path = LfPath.LfPathSlow; + } + + for (miRow = start; miRow < stop; miRow += step) + { + ArrayPtr> mi = cm.MiGridVisible.Slice(miRow * cm.MiStride); + Span lfm = GetLfm(ref cm.Lf, miRow, 0); + + for (miCol = 0; miCol < cm.MiCols; miCol += Constants.MiBlockSize, lfm = lfm.Slice(1)) + { + int r = miRow >> Constants.MiBlockSizeLog2; + int c = miCol >> Constants.MiBlockSizeLog2; + int plane; + + lfSync.SyncRead(r, c); + + ReconInter.SetupDstPlanes(ref planes, ref frameBuffer, miRow, miCol); + + AdjustMask(ref cm, miRow, miCol, ref lfm[0]); + + FilterBlockPlaneSs00(ref cm, ref planes[0], miRow, ref lfm[0]); + for (plane = 1; plane < numPlanes; ++plane) + { + switch (path) + { + case LfPath.LfPath420: + FilterBlockPlaneSs11(ref cm, ref planes[plane], miRow, ref lfm[0]); + break; + case LfPath.LfPath444: + FilterBlockPlaneSs00(ref cm, ref planes[plane], miRow, ref lfm[0]); + break; + case LfPath.LfPathSlow: + FilterBlockPlaneNon420(ref cm, ref planes[plane], mi.Slice(miCol), miRow, + miCol); + break; + } + } + + lfSync.SyncWrite(r, c, sbCols); + } + } + } + + public static void LoopFilterFrame( + ref Surface frame, + ref Vp9Common cm, + ref MacroBlockD xd, + int frameFilterLevel, + bool yOnly, + bool partialFrame) + { + if (frameFilterLevel == 0) + { + return; + } + + int startMiRow = 0; + int miRowsToFilter = cm.MiRows; + + if (partialFrame && cm.MiRows > 8) + { + startMiRow = cm.MiRows >> 1; + startMiRow &= ~7; + miRowsToFilter = Math.Max(cm.MiRows / 8, 8); + } + + int endMiRow = startMiRow + miRowsToFilter; + + LoopFilterRows(ref frame, ref cm, xd.Plane, 
startMiRow, endMiRow, Constants.MiBlockSize, yOnly, + default); + } + + private static void LoopFilterRowsMt( + ref Surface frameBuffer, + ref Vp9Common cm, + Array3 planes, + int start, + int stop, + bool yOnly, + int threadCount) + { + int sbRows = TileInfo.MiColsAlignedToSb(cm.MiRows) >> Constants.MiBlockSizeLog2; + int numTileCols = 1 << cm.Log2TileCols; + int numWorkers = Math.Min(threadCount, Math.Min(numTileCols, sbRows)); + + LfSync lfSync = new(); + lfSync.Initialize(cm.Width, sbRows); + + Ptr frameBufferPtr = new(ref frameBuffer); + Ptr cmPtr = new(ref cm); + + Parallel.For(0, numWorkers, n => + { + LoopFilterRows( + ref frameBufferPtr.Value, + ref cmPtr.Value, + planes, + start + (n * Constants.MiBlockSize), + stop, + numWorkers * Constants.MiBlockSize, + yOnly, + lfSync); + }); + } + + public static void LoopFilterFrameMt( + ref Surface frame, + ref Vp9Common cm, + ref MacroBlockD xd, + int frameFilterLevel, + bool yOnly, + bool partialFrame, + int threadCount) + { + if (frameFilterLevel == 0) + { + return; + } + + int startMiRow = 0; + int miRowsToFilter = cm.MiRows; + + if (partialFrame && cm.MiRows > 8) + { + startMiRow = cm.MiRows >> 1; + startMiRow &= ~7; + miRowsToFilter = Math.Max(cm.MiRows / 8, 8); + } + + int endMiRow = startMiRow + miRowsToFilter; + + LoopFilterFrameInit(ref cm, frameFilterLevel); + LoopFilterRowsMt(ref frame, ref cm, xd.Plane, startMiRow, endMiRow, yOnly, threadCount); + } + + private static unsafe ArrayPtr ConvertToUshortPtr(ArrayPtr s) + { + return new ArrayPtr((ushort*)s.ToPointer(), s.Length / 2); + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs index 140181ef8..f21461304 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs @@ -84,7 +84,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_4X8 + // BLOCK_4x8 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new 
TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, @@ -92,7 +92,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_8X4 + // BLOCK_8x4 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, @@ -108,7 +108,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_8X16 + // BLOCK_8x16 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, @@ -116,7 +116,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_16X8 + // BLOCK_16x8 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, @@ -132,7 +132,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_16X32 + // BLOCK_16x32 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, 
TxSize.Tx8x8 } }, @@ -140,7 +140,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_32X16 + // BLOCK_32x16 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } }, @@ -156,7 +156,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_32X64 + // BLOCK_32x64 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, @@ -164,7 +164,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_64X32 + // BLOCK_64x32 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, @@ -172,7 +172,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }, new TxSize[][][] { - // BLOCK_64X64 + // BLOCK_64x64 new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } }, new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } }, new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } }, @@ -198,18 +198,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 public static readonly PartitionContextPair[] PartitionContextLookup = new PartitionContextPair[] { new PartitionContextPair(15, 15), 
// 4X4 - {0b1111, 0b1111} - new PartitionContextPair(15, 14), // 4X8 - {0b1111, 0b1110} - new PartitionContextPair(14, 15), // 8X4 - {0b1110, 0b1111} + new PartitionContextPair(15, 14), // 4x8 - {0b1111, 0b1110} + new PartitionContextPair(14, 15), // 8x4 - {0b1110, 0b1111} new PartitionContextPair(14, 14), // 8X8 - {0b1110, 0b1110} - new PartitionContextPair(14, 12), // 8X16 - {0b1110, 0b1100} - new PartitionContextPair(12, 14), // 16X8 - {0b1100, 0b1110} + new PartitionContextPair(14, 12), // 8x16 - {0b1110, 0b1100} + new PartitionContextPair(12, 14), // 16x8 - {0b1100, 0b1110} new PartitionContextPair(12, 12), // 16X16 - {0b1100, 0b1100} - new PartitionContextPair(12, 8), // 16X32 - {0b1100, 0b1000} - new PartitionContextPair(8, 12), // 32X16 - {0b1000, 0b1100} + new PartitionContextPair(12, 8), // 16x32 - {0b1100, 0b1000} + new PartitionContextPair(8, 12), // 32x16 - {0b1000, 0b1100} new PartitionContextPair(8, 8), // 32X32 - {0b1000, 0b1000} - new PartitionContextPair(8, 0), // 32X64 - {0b1000, 0b0000} - new PartitionContextPair(0, 8), // 64X32 - {0b0000, 0b1000} - new PartitionContextPair(0, 0), // 64X64 - {0b0000, 0b0000} + new PartitionContextPair(8, 0), // 32x64 - {0b1000, 0b0000} + new PartitionContextPair(0, 8), // 64x32 - {0b0000, 0b1000} + new PartitionContextPair(0, 0), // 64x64 - {0b0000, 0b0000} }; // Filter @@ -281,7 +281,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 return output; } - public static readonly Array8[][] Vp9FilterKernels = new Array8[][] + public static readonly Array8[][] FilterKernels = new Array8[][] { SubPelFilters8, SubPelFilters8Lp, SubPelFilters8S, BilinearFilters }; @@ -797,22 +797,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 959, 990, 991, 1022, 0, 0, }; - private static readonly short[] Vp9DefaultIscan4X4 = new short[] + private static readonly short[] DefaultIscan4X4 = new short[] { 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, }; - private static readonly short[] Vp9ColIscan4X4 = new short[] + private static readonly 
short[] ColIscan4X4 = new short[] { 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, }; - private static readonly short[] Vp9RowIscan4X4 = new short[] + private static readonly short[] RowIscan4X4 = new short[] { 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, }; - private static readonly short[] Vp9ColIscan8X8 = new short[] + private static readonly short[] ColIscan8X8 = new short[] { 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, @@ -820,7 +820,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, }; - private static readonly short[] Vp9RowIscan8X8 = new short[] + private static readonly short[] RowIscan8X8 = new short[] { 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, @@ -828,7 +828,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, }; - private static readonly short[] Vp9DefaultIscan8X8 = new short[] + private static readonly short[] DefaultIscan8X8 = new short[] { 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, @@ -836,7 +836,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, }; - private static readonly short[] Vp9ColIscan16X16 = new short[] + private static readonly short[] ColIscan16X16 = new short[] { 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, @@ -856,7 +856,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, }; - private static readonly short[] Vp9RowIscan16X16 = new short[] + private static readonly short[] RowIscan16X16 = new short[] { 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, 86, 3, 5, 7, 11, 15, 19, 
25, 32, 38, 48, 59, 68, 84, 99, @@ -878,7 +878,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 255, }; - private static readonly short[] Vp9DefaultIscan16X16 = new short[] + private static readonly short[] DefaultIscan16X16 = new short[] { 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, @@ -900,7 +900,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 255, }; - private static readonly short[] Vp9DefaultIscan32X32 = new short[] + private static readonly short[] DefaultIscan32X32 = new short[] { 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, @@ -997,94 +997,94 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } - public static readonly ScanOrder[] Vp9DefaultScanOrders = new ScanOrder[] + public static readonly ScanOrder[] DefaultScanOrders = new ScanOrder[] { - new ScanOrder(DefaultScan4X4, Vp9DefaultIscan4X4, DefaultScan4X4Neighbors), - new ScanOrder(DefaultScan8X8, Vp9DefaultIscan8X8, DefaultScan8X8Neighbors), - new ScanOrder(DefaultScan16X16, Vp9DefaultIscan16X16, DefaultScan16X16Neighbors), - new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors) + new ScanOrder(DefaultScan4X4, DefaultIscan4X4, DefaultScan4X4Neighbors), + new ScanOrder(DefaultScan8X8, DefaultIscan8X8, DefaultScan8X8Neighbors), + new ScanOrder(DefaultScan16X16, DefaultIscan16X16, DefaultScan16X16Neighbors), + new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors) }; - public static readonly ScanOrder[][] Vp9ScanOrders = new ScanOrder[][] + public static readonly ScanOrder[][] ScanOrders = new ScanOrder[][] { new ScanOrder[] { // TX_4X4 - new ScanOrder(DefaultScan4X4, Vp9DefaultIscan4X4, DefaultScan4X4Neighbors), - new ScanOrder(RowScan4X4, Vp9RowIscan4X4, RowScan4X4Neighbors), - new ScanOrder(ColScan4X4, Vp9ColIscan4X4, ColScan4X4Neighbors), - new ScanOrder(DefaultScan4X4, Vp9DefaultIscan4X4, DefaultScan4X4Neighbors) + new 
ScanOrder(DefaultScan4X4, DefaultIscan4X4, DefaultScan4X4Neighbors), + new ScanOrder(RowScan4X4, RowIscan4X4, RowScan4X4Neighbors), + new ScanOrder(ColScan4X4, ColIscan4X4, ColScan4X4Neighbors), + new ScanOrder(DefaultScan4X4, DefaultIscan4X4, DefaultScan4X4Neighbors) }, new ScanOrder[] { // TX_8X8 - new ScanOrder(DefaultScan8X8, Vp9DefaultIscan8X8, DefaultScan8X8Neighbors), - new ScanOrder(RowScan8X8, Vp9RowIscan8X8, RowScan8X8Neighbors), - new ScanOrder(ColScan8X8, Vp9ColIscan8X8, ColScan8X8Neighbors), - new ScanOrder(DefaultScan8X8, Vp9DefaultIscan8X8, DefaultScan8X8Neighbors) + new ScanOrder(DefaultScan8X8, DefaultIscan8X8, DefaultScan8X8Neighbors), + new ScanOrder(RowScan8X8, RowIscan8X8, RowScan8X8Neighbors), + new ScanOrder(ColScan8X8, ColIscan8X8, ColScan8X8Neighbors), + new ScanOrder(DefaultScan8X8, DefaultIscan8X8, DefaultScan8X8Neighbors) }, new ScanOrder[] { // TX_16X16 - new ScanOrder(DefaultScan16X16, Vp9DefaultIscan16X16, DefaultScan16X16Neighbors), - new ScanOrder(RowScan16X16, Vp9RowIscan16X16, RowScan16X16Neighbors), - new ScanOrder(ColScan16X16, Vp9ColIscan16X16, ColScan16X16Neighbors), - new ScanOrder(DefaultScan16X16, Vp9DefaultIscan16X16, DefaultScan16X16Neighbors) + new ScanOrder(DefaultScan16X16, DefaultIscan16X16, DefaultScan16X16Neighbors), + new ScanOrder(RowScan16X16, RowIscan16X16, RowScan16X16Neighbors), + new ScanOrder(ColScan16X16, ColIscan16X16, ColScan16X16Neighbors), + new ScanOrder(DefaultScan16X16, DefaultIscan16X16, DefaultScan16X16Neighbors) }, new ScanOrder[] { // TX_32X32 - new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors), - new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors), - new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors), - new ScanOrder(DefaultScan32X32, Vp9DefaultIscan32X32, DefaultScan32X32Neighbors) + new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors), + new ScanOrder(DefaultScan32X32, 
DefaultIscan32X32, DefaultScan32X32Neighbors), + new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors), + new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors) } }; // Entropy MV - public static readonly sbyte[] Vp9MvJointTree = new sbyte[] + public static readonly sbyte[] MvJointTree = new sbyte[] { - -(sbyte)MvJointType.MvJointZero, 2, -(sbyte)MvJointType.MvJointHnzvz, 4, -(sbyte)MvJointType.MvJointHzvnz, -(sbyte)MvJointType.MvJointHnzvnz + -(sbyte)MvJointType.Zero, 2, -(sbyte)MvJointType.Hnzvz, 4, -(sbyte)MvJointType.Hzvnz, -(sbyte)MvJointType.Hnzvnz }; - public static readonly sbyte[] Vp9MvClassTree = new sbyte[] + public static readonly sbyte[] MvClassTree = new sbyte[] { - -(sbyte)MvClassType.MvClass0, + -(sbyte)MvClassType.Class0, 2, - -(sbyte)MvClassType.MvClass1, + -(sbyte)MvClassType.Class1, 4, 6, 8, - -(sbyte)MvClassType.MvClass2, - -(sbyte)MvClassType.MvClass3, + -(sbyte)MvClassType.Class2, + -(sbyte)MvClassType.Class3, 10, 12, - -(sbyte)MvClassType.MvClass4, - -(sbyte)MvClassType.MvClass5, - -(sbyte)MvClassType.MvClass6, + -(sbyte)MvClassType.Class4, + -(sbyte)MvClassType.Class5, + -(sbyte)MvClassType.Class6, 14, 16, 18, - -(sbyte)MvClassType.MvClass7, - -(sbyte)MvClassType.MvClass8, - -(sbyte)MvClassType.MvClass9, - -(sbyte)MvClassType.MvClass10, + -(sbyte)MvClassType.Class7, + -(sbyte)MvClassType.Class8, + -(sbyte)MvClassType.Class9, + -(sbyte)MvClassType.Class10 }; - public static ReadOnlySpan Vp9MvFPTree => new sbyte[] { -0, 2, -1, 4, -2, -3 }; + public static ReadOnlySpan MvFPTree => new sbyte[] { -0, 2, -1, 4, -2, -3 }; // Entropy - public static ReadOnlySpan Vp9Cat1Prob => new byte[] { 159 }; - public static ReadOnlySpan Vp9Cat2Prob => new byte[] { 165, 145 }; - public static ReadOnlySpan Vp9Cat3Prob => new byte[] { 173, 148, 140 }; - public static ReadOnlySpan Vp9Cat4Prob => new byte[] { 176, 155, 140, 135 }; - public static ReadOnlySpan Vp9Cat5Prob => new byte[] { 180, 157, 141, 134, 130 }; - 
public static ReadOnlySpan Vp9Cat6Prob => new byte[] { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; + public static ReadOnlySpan Cat1Prob => new byte[] { 159 }; + public static ReadOnlySpan Cat2Prob => new byte[] { 165, 145 }; + public static ReadOnlySpan Cat3Prob => new byte[] { 173, 148, 140 }; + public static ReadOnlySpan Cat4Prob => new byte[] { 176, 155, 140, 135 }; + public static ReadOnlySpan Cat5Prob => new byte[] { 180, 157, 141, 134, 130 }; + public static ReadOnlySpan Cat6Prob => new byte[] { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; - public static ReadOnlySpan Vp9Cat6ProbHigh12 => new byte[] + public static ReadOnlySpan Cat6ProbHigh12 => new byte[] { 255, 255, 255, 255, 254, 254, 54, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; - private static readonly byte[] Vp9CoefbandTrans8X8Plus = new byte[] + private static readonly byte[] CoefbandTrans8X8Plus = new byte[] { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // Beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -1129,17 +1129,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; - private static ReadOnlySpan Vp9CoefbandTrans4X4 => new byte[] + private static ReadOnlySpan CoefbandTrans4X4 => new byte[] { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, }; - public static ReadOnlySpan get_band_translate(TxSize txSize) + public static ReadOnlySpan GetBandTranslate(TxSize txSize) { - return txSize == TxSize.Tx4x4 ? Vp9CoefbandTrans4X4 : Vp9CoefbandTrans8X8Plus; + return txSize == TxSize.Tx4x4 ? 
CoefbandTrans4X4 : CoefbandTrans8X8Plus; } - public static readonly byte[][] Vp9Pareto8Full = new byte[][] + public static readonly byte[][] Pareto8Full = new byte[][] { new byte[] { 3, 86, 128, 6, 86, 23, 88, 29 }, new byte[] { 6, 86, 128, 11, 87, 42, 91, 52 }, @@ -1399,7 +1399,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 }; /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ - public static readonly sbyte[] Vp9IntraModeTree = new sbyte[] + public static readonly sbyte[] IntraModeTree = new sbyte[] { -(sbyte)PredictionMode.DcPred, 2, /* 0 = DC_NODE */ -(sbyte)PredictionMode.TmPred, 4, /* 1 = TM_NODE */ @@ -1412,7 +1412,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 -(sbyte)PredictionMode.D153Pred, -(sbyte)PredictionMode.D207Pred /* 8 = D153_NODE */ }; - public static readonly sbyte[] Vp9InterModeTree = new sbyte[] + public static readonly sbyte[] InterModeTree = new sbyte[] { -((sbyte)PredictionMode.ZeroMv - (sbyte)PredictionMode. NearestMv), 2, -((sbyte)PredictionMode.NearestMv - (sbyte)PredictionMode.NearestMv), 4, @@ -1420,17 +1420,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 -((sbyte)PredictionMode.NewMv - (sbyte)PredictionMode.NearestMv) }; - public static readonly sbyte[] Vp9PartitionTree = new sbyte[] + public static readonly sbyte[] PartitionTree = new sbyte[] { -(sbyte)PartitionType.PartitionNone, 2, -(sbyte)PartitionType.PartitionHorz, 4, -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit }; - public static readonly sbyte[] Vp9SwitchableInterpTree = new sbyte[] + public static readonly sbyte[] SwitchableInterpTree = new sbyte[] { -Constants.EightTap, 2, -Constants.EightTapSmooth, -Constants.EightTapSharp }; - public static readonly sbyte[] Vp9SegmentTree = new sbyte[] + public static readonly sbyte[] SegmentTree = new sbyte[] { 2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7 }; @@ -1497,7 +1497,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -2, -1 ), new Position( -1, -2 ), new Position( -2, -2 ) }, - 
// 4X8 + // 4x8 new Position[] { new Position( -1, 0 ), new Position( 0, -1 ), new Position( -1, -1 ), @@ -1506,7 +1506,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -2, -1 ), new Position( -1, -2 ), new Position( -2, -2 ) }, - // 8X4 + // 8x4 new Position[] { new Position( -1, 0 ), new Position( 0, -1 ), new Position( -1, -1 ), @@ -1524,7 +1524,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -2, -1 ), new Position( -1, -2 ), new Position( -2, -2 ) }, - // 8X16 + // 8x16 new Position[] { new Position( 0, -1 ), new Position( -1, 0 ), new Position( 1, -1 ), @@ -1533,7 +1533,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -2, 0 ), new Position( -2, -1 ), new Position( -1, -2 ) }, - // 16X8 + // 16x8 new Position[] { new Position( -1, 0 ), new Position( 0, -1 ), new Position( -1, 1 ), @@ -1551,7 +1551,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -3, 0 ), new Position( 0, -3 ), new Position( -3, -3 ) }, - // 16X32 + // 16x32 new Position[] { new Position( 0, -1 ), new Position( -1, 0 ), new Position( 2, -1 ), @@ -1560,7 +1560,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( 0, -3 ), new Position( -3, 0 ), new Position( -3, -3 ) }, - // 32X16 + // 32x16 new Position[] { new Position( -1, 0 ), new Position( 0, -1 ), new Position( -1, 2 ), @@ -1578,7 +1578,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -3, 0 ), new Position( 0, -3 ), new Position( -3, -3 ) }, - // 32X64 + // 32x64 new Position[] { new Position( 0, -1 ), new Position( -1, 0 ), new Position( 4, -1 ), @@ -1587,7 +1587,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( 0, -3 ), new Position( -3, 0 ), new Position( 2, -1 ) }, - // 64X32 + // 64x32 new Position[] { new Position( -1, 0 ), new Position( 0, -1 ), new Position( -1, 4 ), @@ -1596,7 +1596,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 new Position( -3, 0 ), new Position( 0, -3 ), new Position( -1, 2 ) }, - // 64X64 + // 64x64 new Position[] { new Position( -1, 3 ), new Position( 3, -1 ), new 
Position( -1, 4 ), diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs index a9da10425..30cab3b60 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs @@ -13,7 +13,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull) - { // both edges available + { + // both edges available if (!xd.AboveMi.Value.HasSecondRef() && !xd.LeftMi.Value.HasSecondRef()) { // Neither edge uses comp pred (0/1) @@ -23,20 +24,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 else if (!xd.AboveMi.Value.HasSecondRef()) { // One of two edges uses comp pred (2/3) - ctx = 2 + (xd.AboveMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.AboveMi.Value.IsInterBlock() ? 1 : 0); + ctx = 2 + (xd.AboveMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.AboveMi.Value.IsInterBlock() + ? 1 + : 0); } else if (!xd.LeftMi.Value.HasSecondRef()) { // One of two edges uses comp pred (2/3) - ctx = 2 + (xd.LeftMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.LeftMi.Value.IsInterBlock() ? 1 : 0); + ctx = 2 + + (xd.LeftMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.LeftMi.Value.IsInterBlock() ? 1 : 0); } - else // Both edges use comp pred (4) + else // Both edges use comp pred (4) { ctx = 4; } } else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull) - { // One edge available + { + // One edge available ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value; if (!edgeMi.HasSecondRef()) @@ -51,9 +56,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } else - { // No edges available (1) + { + // No edges available (1) ctx = 1; } + Debug.Assert(ctx >= 0 && ctx < Constants.CompInterContexts); return ctx; } @@ -70,29 +77,33 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int varRefIdx = fixRefIdx == 0 ? 
1 : 0; if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull) - { // Both edges available + { + // Both edges available bool aboveIntra = !xd.AboveMi.Value.IsInterBlock(); bool leftIntra = !xd.LeftMi.Value.IsInterBlock(); if (aboveIntra && leftIntra) - { // Intra/Intra (2) + { + // Intra/Intra (2) predContext = 2; } else if (aboveIntra || leftIntra) - { // Intra/Inter + { + // Intra/Inter ref ModeInfo edgeMi = ref aboveIntra ? ref xd.LeftMi.Value : ref xd.AboveMi.Value; - if (!edgeMi.HasSecondRef()) // single pred (1/3) + if (!edgeMi.HasSecondRef()) // single pred (1/3) { - predContext = 1 + 2 * (edgeMi.RefFrame[0] != cm.CompVarRef[1] ? 1 : 0); + predContext = 1 + (2 * (edgeMi.RefFrame[0] != cm.CompVarRef[1] ? 1 : 0)); } - else // Comp pred (1/3) + else // Comp pred (1/3) { - predContext = 1 + 2 * (edgeMi.RefFrame[varRefIdx] != cm.CompVarRef[1] ? 1 : 0); + predContext = 1 + (2 * (edgeMi.RefFrame[varRefIdx] != cm.CompVarRef[1] ? 1 : 0)); } } else - { // Inter/Inter + { + // Inter/Inter bool lSg = !xd.LeftMi.Value.HasSecondRef(); bool aSg = !xd.AboveMi.Value.HasSecondRef(); sbyte vrfa = aSg ? xd.AboveMi.Value.RefFrame[0] : xd.AboveMi.Value.RefFrame[varRefIdx]; @@ -103,7 +114,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 predContext = 0; } else if (lSg && aSg) - { // Single/Single + { + // Single/Single if ((vrfa == cm.CompFixedRef && vrfl == cm.CompVarRef[0]) || (vrfl == cm.CompFixedRef && vrfa == cm.CompVarRef[0])) { @@ -119,7 +131,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } else if (lSg || aSg) - { // Single/Comp + { + // Single/Comp sbyte vrfc = lSg ? vrfa : vrfl; sbyte rfs = aSg ? 
vrfa : vrfl; if (vrfc == cm.CompVarRef[1] && rfs != cm.CompVarRef[1]) @@ -136,7 +149,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } else if (vrfa == vrfl) - { // Comp/Comp + { + // Comp/Comp predContext = 4; } else @@ -146,7 +160,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull) - { // One edge available + { + // One edge available ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value; if (!edgeMi.IsInterBlock()) @@ -166,9 +181,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } else - { // No edges available (2) + { + // No edges available (2) predContext = 2; } + Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts); return predContext; } @@ -181,16 +198,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull) - { // Both edges available + { + // Both edges available bool aboveIntra = !xd.AboveMi.Value.IsInterBlock(); bool leftIntra = !xd.LeftMi.Value.IsInterBlock(); if (aboveIntra && leftIntra) - { // Intra/Intra + { + // Intra/Intra predContext = 2; } else if (aboveIntra || leftIntra) - { // Intra/Inter or Inter/Intra + { + // Intra/Inter or Inter/Intra ref ModeInfo edgeMi = ref aboveIntra ? ref xd.LeftMi.Value : ref xd.AboveMi.Value; if (!edgeMi.HasSecondRef()) { @@ -199,11 +219,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 else { predContext = 1 + (edgeMi.RefFrame[0] == Constants.LastFrame || - edgeMi.RefFrame[1] == Constants.LastFrame ? 1 : 0); + edgeMi.RefFrame[1] == Constants.LastFrame + ? 
1 + : 0); } } else - { // Inter/Inter + { + // Inter/Inter bool aboveHasSecond = xd.AboveMi.Value.HasSecondRef(); bool leftHasSecond = xd.LeftMi.Value.HasSecondRef(); sbyte above0 = xd.AboveMi.Value.RefFrame[0]; @@ -214,7 +237,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (aboveHasSecond && leftHasSecond) { predContext = 1 + (above0 == Constants.LastFrame || above1 == Constants.LastFrame || - left0 == Constants.LastFrame || left1 == Constants.LastFrame ? 1 : 0); + left0 == Constants.LastFrame || left1 == Constants.LastFrame + ? 1 + : 0); } else if (aboveHasSecond || leftHasSecond) { @@ -228,24 +253,28 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } else { - predContext = (crf1 == Constants.LastFrame || crf2 == Constants.LastFrame ? 1 : 0); + predContext = crf1 == Constants.LastFrame || crf2 == Constants.LastFrame ? 1 : 0; } } else { - predContext = 2 * (above0 == Constants.LastFrame ? 1 : 0) + 2 * (left0 == Constants.LastFrame ? 1 : 0); + predContext = (2 * (above0 == Constants.LastFrame ? 1 : 0)) + + (2 * (left0 == Constants.LastFrame ? 1 : 0)); } } } else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull) - { // One edge available + { + // One edge available ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value; if (!edgeMi.IsInterBlock()) - { // Intra + { + // Intra predContext = 2; } else - { // Inter + { + // Inter if (!edgeMi.HasSecondRef()) { predContext = 4 * (edgeMi.RefFrame[0] == Constants.LastFrame ? 1 : 0); @@ -253,14 +282,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 else { predContext = 1 + (edgeMi.RefFrame[0] == Constants.LastFrame || - edgeMi.RefFrame[1] == Constants.LastFrame ? 1 : 0); + edgeMi.RefFrame[1] == Constants.LastFrame + ? 
1 + : 0); } } } else - { // No edges available + { + // No edges available predContext = 2; } + Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts); return predContext; } @@ -274,16 +307,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // left of the entries corresponding to real macroblocks. // The prediction flags in these dummy entries are initialized to 0. if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull) - { // Both edges available + { + // Both edges available bool aboveIntra = !xd.AboveMi.Value.IsInterBlock(); bool leftIntra = !xd.LeftMi.Value.IsInterBlock(); if (aboveIntra && leftIntra) - { // Intra/Intra + { + // Intra/Intra predContext = 2; } else if (aboveIntra || leftIntra) - { // Intra/Inter or Inter/Intra + { + // Intra/Inter or Inter/Intra ref ModeInfo edgeMi = ref aboveIntra ? ref xd.LeftMi.Value : ref xd.AboveMi.Value; if (!edgeMi.HasSecondRef()) { @@ -298,12 +334,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } else { - predContext = 1 + 2 * (edgeMi.RefFrame[0] == Constants.GoldenFrame || - edgeMi.RefFrame[1] == Constants.GoldenFrame ? 1 : 0); + predContext = 1 + (2 * (edgeMi.RefFrame[0] == Constants.GoldenFrame || + edgeMi.RefFrame[1] == Constants.GoldenFrame + ? 1 + : 0)); } } else - { // Inter/Inter + { + // Inter/Inter bool aboveHasSecond = xd.AboveMi.Value.HasSecondRef(); bool leftHasSecond = xd.LeftMi.Value.HasSecondRef(); sbyte above0 = xd.AboveMi.Value.RefFrame[0]; @@ -316,7 +355,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (above0 == left0 && above1 == left1) { predContext = 3 * (above0 == Constants.GoldenFrame || above1 == Constants.GoldenFrame || - left0 == Constants.GoldenFrame || left1 == Constants.GoldenFrame ? 1 : 0); + left0 == Constants.GoldenFrame || left1 == Constants.GoldenFrame + ? 1 + : 0); } else { @@ -339,7 +380,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } else { - predContext = 1 + 2 * (crf1 == Constants.GoldenFrame || crf2 == Constants.GoldenFrame ? 
1 : 0); + predContext = + 1 + (2 * (crf1 == Constants.GoldenFrame || crf2 == Constants.GoldenFrame ? 1 : 0)); } } else @@ -350,18 +392,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } else if (above0 == Constants.LastFrame || left0 == Constants.LastFrame) { - sbyte edge0 = (above0 == Constants.LastFrame) ? left0 : above0; + sbyte edge0 = above0 == Constants.LastFrame ? left0 : above0; predContext = 4 * (edge0 == Constants.GoldenFrame ? 1 : 0); } else { - predContext = 2 * (above0 == Constants.GoldenFrame ? 1 : 0) + 2 * (left0 == Constants.GoldenFrame ? 1 : 0); + predContext = (2 * (above0 == Constants.GoldenFrame ? 1 : 0)) + + (2 * (left0 == Constants.GoldenFrame ? 1 : 0)); } } } } else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull) - { // One edge available + { + // One edge available ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value; if (!edgeMi.IsInterBlock() || (edgeMi.RefFrame[0] == Constants.LastFrame && !edgeMi.HasSecondRef())) @@ -375,15 +419,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 else { predContext = 3 * (edgeMi.RefFrame[0] == Constants.GoldenFrame || - edgeMi.RefFrame[1] == Constants.GoldenFrame ? 1 : 0); + edgeMi.RefFrame[1] == Constants.GoldenFrame + ? 
1 + : 0); } } else - { // No edges available (2) + { + // No edges available (2) predContext = 2; } + Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts); return predContext; } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs new file mode 100644 index 000000000..98e841ce7 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs @@ -0,0 +1,94 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using System; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + public static class Prob + { + public const int MaxProb = 255; + + private static byte GetProb(uint num, uint den) + { + Debug.Assert(den != 0); + { + int p = (int)((((ulong)num * 256) + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + int clippedProb = p | ((255 - p) >> 23) | (p == 0 ? 1 : 0); + return (byte)clippedProb; + } + } + + private static byte GetBinaryProb(uint n0, uint n1) + { + uint den = n0 + n1; + if (den == 0) + { + return 128; + } + + return GetProb(n0, den); + } + + /* This function assumes prob1 and prob2 are already within [1,255] range. 
*/ + public static byte WeightedProb(int prob1, int prob2, int factor) + { + return (byte)BitUtils.RoundPowerOfTwo((prob1 * (256 - factor)) + (prob2 * factor), 8); + } + + public static byte MergeProbs(byte preProb, ref Array2 ct, uint countSat, uint maxUpdateFactor) + { + byte prob = GetBinaryProb(ct[0], ct[1]); + uint count = Math.Min(ct[0] + ct[1], countSat); + uint factor = maxUpdateFactor * count / countSat; + return WeightedProb(preProb, prob, (int)factor); + } + + // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; + private static readonly uint[] CountToUpdateFactor = + { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 + }; + + private const int ModeMvCountSat = 20; + + public static byte ModeMvMergeProbs(byte preProb, ref Array2 ct) + { + uint den = ct[0] + ct[1]; + if (den == 0) + { + return preProb; + } + + uint count = Math.Min(den, ModeMvCountSat); + uint factor = CountToUpdateFactor[(int)count]; + byte prob = GetProb(ct[0], den); + return WeightedProb(preProb, prob, (int)factor); + } + + private static uint TreeMergeProbsImpl( + uint i, + sbyte[] tree, + ReadOnlySpan preProbs, + ReadOnlySpan counts, + Span probs) + { + int l = tree[i]; + uint leftCount = l <= 0 ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs); + int r = tree[i + 1]; + uint rightCount = r <= 0 ? 
counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs); + Array2 ct = new(); + ct[0] = leftCount; + ct[1] = rightCount; + probs[(int)(i >> 1)] = ModeMvMergeProbs(preProbs[(int)(i >> 1)], ref ct); + return leftCount + rightCount; + } + + public static void VpxTreeMergeProbs(sbyte[] tree, ReadOnlySpan preProbs, ReadOnlySpan counts, + Span probs) + { + TreeMergeProbsImpl(0, tree, preProbs, counts, probs); + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs index 5c52c32f5..6a0768802 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs @@ -1,163 +1,116 @@ -using Ryujinx.Graphics.Nvdec.Vp9.Types; -using System; +using System; using System.Diagnostics; namespace Ryujinx.Graphics.Nvdec.Vp9 { internal static class QuantCommon { - public const int MinQ = 0; public const int MaxQ = 255; + public const int QindexBits = 8; - private static readonly short[] DcQlookup = new short[] + private static readonly short[] DcQlookup = { - 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, - 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30, - 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, - 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53, - 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, - 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76, - 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, - 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110, - 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, - 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, - 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202, - 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, - 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, - 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364, - 
369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, - 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, - 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736, - 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139, - 1184, 1232, 1282, 1336, + 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, + 30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, + 52, 53, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 70, 71, 72, + 73, 74, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, 90, 92, 93, 95, 96, 98, 99, + 101, 102, 104, 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134, + 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174, 177, 180, 182, + 185, 187, 190, 192, 195, 199, 202, 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247, + 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, 304, 309, 313, 317, 322, 326, 330, + 335, 340, 344, 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441, + 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590, 602, 614, 626, + 640, 654, 668, 684, 700, 717, 736, 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, + 1139, 1184, 1232, 1282, 1336 }; - private static readonly short[] DcQlookup10 = new short[] + private static readonly short[] DcQlookup10 = { - 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, - 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, - 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, - 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182, - 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, - 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 
276, - 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321, - 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, - 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466, - 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, - 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, - 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831, - 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, - 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, - 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436, - 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, - 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, - 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, - 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823, - 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, + 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82, + 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163, + 166, 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, 233, 237, 241, + 244, 248, 251, 255, 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, + 317, 321, 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412, 418, 424, + 430, 436, 442, 448, 454, 460, 466, 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576, + 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698, 708, 718, 729, 739, 749, 759, 770, 782, + 795, 807, 819, 831, 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, 1015, 1030, 1045, + 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323, + 1342, 1361, 1379, 1398, 1416, 
1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, + 1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, 2123, 2159, + 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, + 3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347 }; - private static readonly short[] DcQlookup12 = new short[] + private static readonly short[] DcQlookup12 = { - 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, - 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, - 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, - 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, - 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, - 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919, - 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, - 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, - 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, - 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, - 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957, - 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, - 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, - 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226, - 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, - 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, - 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153, - 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, - 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966, - 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214, - 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, - 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118, - 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949, - 
19718, 20521, 21387, + 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, 251, + 266, 281, 296, 312, 327, 343, 358, 374, 390, 405, 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580, + 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, + 919, 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, 1136, 1151, 1165, + 1179, 1192, 1206, 1220, 1234, 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, 1444, + 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, 1765, 1789, 1814, 1838, 1862, + 1885, 1909, 1933, 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, 2367, 2400, + 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, + 3127, 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, + 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, + 5083, 5153, 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, 6234, 6319, + 6404, 6495, 6587, 6678, 6769, 6867, 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, + 8214, 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, 10465, 10702, 10946, + 11210, 11482, 11776, 12081, 12409, 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, + 17575, 18237, 18949, 19718, 20521, 21387 }; - private static readonly short[] AcQlookup = new short[] + private static readonly short[] AcQlookup = { - 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, - 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, - 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, - 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, - 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, - 85, 86, 87, 88, 89, 90, 91, 92, 93, 
94, 95, 96, 97, - 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118, - 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, - 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179, - 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, - 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, - 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353, - 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, - 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571, - 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729, - 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, - 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196, - 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537, - 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, + 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, + 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, + 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 155, 158, + 161, 164, 167, 170, 173, 176, 179, 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227, + 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300, 305, 311, 317, 323, 329, + 335, 341, 347, 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, 456, 465, 474, 483, + 492, 501, 510, 520, 530, 540, 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, + 729, 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046, + 1066, 1087, 1108, 1129, 1151, 1173, 
1196, 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, + 1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828 }; - private static readonly short[] AcQlookup10 = new short[] + private static readonly short[] AcQlookup10 = { - 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, - 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, - 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, - 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208, - 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, - 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324, - 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379, - 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, - 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571, - 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, - 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, - 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118, - 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411, - 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, - 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, - 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, - 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731, - 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, - 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, - 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, + 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92, + 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186, + 190, 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, 271, 275, 280, + 284, 289, 293, 297, 302, 306, 
311, 315, 319, 324, 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, + 375, 379, 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498, 506, 514, + 523, 531, 539, 547, 555, 563, 571, 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725, + 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905, 922, 938, 954, 970, 986, 1002, 1018, + 1038, 1058, 1078, 1098, 1118, 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, + 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, 1895, + 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603, + 2651, 2703, 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, + 3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972, + 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768, 6900, + 7036, 7172, 7312 }; - private static readonly short[] AcQlookup12 = new short[] + private static readonly short[] AcQlookup12 = { - 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, - 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, - 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, - 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, - 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, - 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067, - 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, - 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457, - 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693, - 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, - 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411, - 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, - 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, - 3619, 3684, 3748, 3812, 3876, 3941, 
4005, 4069, 4149, 4230, 4310, - 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, - 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, - 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867, - 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, - 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, - 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637, - 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062, - 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, - 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, - 28143, 28687, 29247, + 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280, + 297, 314, 331, 349, 366, 384, 402, 420, 438, 456, 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660, + 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, 884, 902, 920, 939, 957, 976, 994, 1012, 1030, + 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, 1335, + 1352, 1370, 1387, 1405, 1422, 1440, 1457, 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, + 1693, 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, 2150, 2183, 2216, + 2248, 2281, 2313, 2346, 2378, 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943, + 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, 3684, 3748, 3812, 3876, 3941, + 4005, 4069, 4149, 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, 5352, + 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, 6522, 6650, 6778, 6906, 7034, 7162, 7290, + 7435, 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, + 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, 12109, 12333, 12573, 12813, 13053, 13309, + 13565, 13821, 14093, 
14365, 14637, 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, + 18062, 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, 22766, 23214, 23662, + 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247 }; public static short DcQuant(int qindex, int delta, BitDepth bitDepth) @@ -168,7 +121,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 case BitDepth.Bits10: return DcQlookup10[Math.Clamp(qindex + delta, 0, MaxQ)]; case BitDepth.Bits12: return DcQlookup12[Math.Clamp(qindex + delta, 0, MaxQ)]; default: - Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + Debug.Assert(false, "bitDepth should be Bits8, Bits10 or Bits12"); return -1; } } @@ -181,23 +134,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 case BitDepth.Bits10: return AcQlookup10[Math.Clamp(qindex + delta, 0, MaxQ)]; case BitDepth.Bits12: return AcQlookup12[Math.Clamp(qindex + delta, 0, MaxQ)]; default: - Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + Debug.Assert(false, "bitDepth should be Bits8, Bits10 or Bits12"); return -1; } } - - public static int GetQIndex(ref Segmentation seg, int segmentId, int baseQIndex) - { - if (seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlAltQ) != 0) - { - int data = seg.GetSegData(segmentId, SegLvlFeatures.SegLvlAltQ); - int segQIndex = seg.AbsDelta == Constants.SegmentAbsData ? 
data : baseQIndex + data; - return Math.Clamp(segQIndex, 0, MaxQ); - } - else - { - return baseQIndex; - } - } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/ReadBitBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/ReadBitBuffer.cs new file mode 100644 index 000000000..f0333802b --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/ReadBitBuffer.cs @@ -0,0 +1,96 @@ +using Ryujinx.Graphics.Nvdec.Vp9.Types; +using System; +using System.Diagnostics; +using System.Numerics; + +namespace Ryujinx.Graphics.Nvdec.Vp9 +{ + public ref struct ReadBitBuffer + { + public ReadOnlySpan BitBuffer; + public ulong BitOffset; + public object ErrorHandlerData; + + private static int GetMsb(uint n) + { + Debug.Assert(n != 0); + return 31 ^ BitOperations.LeadingZeroCount(n); + } + + private static int GetUnsignedBits(uint numValues) + { + return numValues > 0 ? GetMsb(numValues) + 1 : 0; + } + + public int DecodeUnsignedMax(int max) + { + int data = ReadLiteral(GetUnsignedBits((uint)max)); + return data > max ? max : data; + } + + public ulong BytesRead() + { + return (BitOffset + 7) >> 3; + } + + public int ReadBit() + { + ulong off = BitOffset; + ulong p = off >> 3; + int q = 7 - (int)(off & 0x7); + if (p < (ulong)BitBuffer.Length) + { + int bit = (BitBuffer[(int)p] >> q) & 1; + BitOffset = off + 1; + return bit; + } + + return 0; + } + + public int ReadLiteral(int bits) + { + int value = 0, bit; + for (bit = bits - 1; bit >= 0; bit--) + { + value |= ReadBit() << bit; + } + + return value; + } + + public int ReadSignedLiteral(int bits) + { + int value = ReadLiteral(bits); + return ReadBit() != 0 ? -value : value; + } + + public int ReadInvSignedLiteral(int bits) + { + return ReadSignedLiteral(bits); + } + + public int ReadDeltaQ() + { + return ReadBit() != 0 ? 
ReadSignedLiteral(4) : 0; + } + + public void ReadFrameSize(out int width, out int height) + { + width = ReadLiteral(16) + 1; + height = ReadLiteral(16) + 1; + } + + public BitstreamProfile ReadProfile() + { + int profile = ReadBit(); + profile |= ReadBit() << 1; + if (profile > 2) + { + profile += ReadBit(); + } + + return (BitstreamProfile)profile; + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs index a4c295e5f..35dbf1cab 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs @@ -77,67 +77,38 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 bd); } - private static int RoundMvCompQ4(int value) + public static int RoundMvCompQ4(int value) { return (value < 0 ? value - 2 : value + 2) / 4; } - private static Mv MiMvPredQ4(ref ModeInfo mi, int idx) - { - Mv res = new Mv() - { - Row = (short)RoundMvCompQ4( - mi.Bmi[0].Mv[idx].Row + mi.Bmi[1].Mv[idx].Row + - mi.Bmi[2].Mv[idx].Row + mi.Bmi[3].Mv[idx].Row), - Col = (short)RoundMvCompQ4( - mi.Bmi[0].Mv[idx].Col + mi.Bmi[1].Mv[idx].Col + - mi.Bmi[2].Mv[idx].Col + mi.Bmi[3].Mv[idx].Col) - }; - return res; - } - - private static int RoundMvCompQ2(int value) + public static int RoundMvCompQ2(int value) { return (value < 0 ? value - 1 : value + 1) / 2; } - private static Mv MiMvPredQ2(ref ModeInfo mi, int idx, int block0, int block1) - { - Mv res = new Mv() - { - Row = (short)RoundMvCompQ2( - mi.Bmi[block0].Mv[idx].Row + - mi.Bmi[block1].Mv[idx].Row), - Col = (short)RoundMvCompQ2( - mi.Bmi[block0].Mv[idx].Col + - mi.Bmi[block1].Mv[idx].Col) - }; - return res; - } - public static Mv ClampMvToUmvBorderSb(ref MacroBlockD xd, ref Mv srcMv, int bw, int bh, int ssX, int ssY) { // If the MV points so far into the UMV border that no visible pixels // are used for reconstruction, the subpel part of the MV can be // discarded and the MV limited to 16 pixels with equivalent results. 
- int spelLeft = (Constants.Vp9InterpExtend + bw) << SubpelBits; + int spelLeft = (Constants.InterpExtend + bw) << SubpelBits; int spelRight = spelLeft - SubpelShifts; - int spelTop = (Constants.Vp9InterpExtend + bh) << SubpelBits; + int spelTop = (Constants.InterpExtend + bh) << SubpelBits; int spelBottom = spelTop - SubpelShifts; - Mv clampedMv = new Mv() + Mv clampedMv = new() { - Row = (short)(srcMv.Row * (1 << (1 - ssY))), - Col = (short)(srcMv.Col * (1 << (1 - ssX))) + Row = (short)(srcMv.Row * (1 << (1 - ssY))), Col = (short)(srcMv.Col * (1 << (1 - ssX))) }; Debug.Assert(ssX <= 1); Debug.Assert(ssY <= 1); - clampedMv.ClampMv( - xd.MbToLeftEdge * (1 << (1 - ssX)) - spelLeft, - xd.MbToRightEdge * (1 << (1 - ssX)) + spelRight, - xd.MbToTopEdge * (1 << (1 - ssY)) - spelTop, - xd.MbToBottomEdge * (1 << (1 - ssY)) + spelBottom); + clampedMv.Clamp( + (xd.MbToLeftEdge * (1 << (1 - ssX))) - spelLeft, + (xd.MbToRightEdge * (1 << (1 - ssX))) + spelRight, + (xd.MbToTopEdge * (1 << (1 - ssY))) - spelTop, + (xd.MbToBottomEdge * (1 << (1 - ssY))) + spelBottom); return clampedMv; } @@ -145,15 +116,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 public static Mv AverageSplitMvs(ref MacroBlockDPlane pd, ref ModeInfo mi, int refr, int block) { int ssIdx = ((pd.SubsamplingX > 0 ? 1 : 0) << 1) | (pd.SubsamplingY > 0 ? 
1 : 0); - Mv res = new Mv(); + Mv res = new(); switch (ssIdx) { - case 0: res = mi.Bmi[block].Mv[refr]; break; - case 1: res = MiMvPredQ2(ref mi, refr, block, block + 2); break; - case 2: res = MiMvPredQ2(ref mi, refr, block, block + 1); break; - case 3: res = MiMvPredQ4(ref mi, refr); break; - default: Debug.Assert(ssIdx <= 3 && ssIdx >= 0); break; + case 0: + res = mi.Bmi[block].Mv[refr]; + break; + case 1: + res = mi.MvPredQ2(refr, block, block + 2); + break; + case 2: + res = mi.MvPredQ2(refr, block, block + 1); + break; + case 3: + res = mi.MvPredQ4(refr); + break; + default: + Debug.Assert(ssIdx <= 3 && ssIdx >= 0); + break; } + return res; } @@ -161,7 +143,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { int x = !sf.IsNull ? sf.Value.ScaleValueX(xOffset) : xOffset; int y = !sf.IsNull ? sf.Value.ScaleValueY(yOffset) : yOffset; - return y * stride + x; + return (y * stride) + x; } private static void SetupPredPlanes( @@ -194,12 +176,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 strides[0] = src.Stride; strides[1] = src.UvStride; strides[2] = src.UvStride; - int i; - for (i = 0; i < Constants.MaxMbPlane; ++i) + for (int i = 0; i < Constants.MaxMbPlane; ++i) { ref MacroBlockDPlane pd = ref planes[i]; - SetupPredPlanes(ref pd.Dst, buffers[i], strides[i], miRow, miCol, Ptr.Null, pd.SubsamplingX, pd.SubsamplingY); + SetupPredPlanes(ref pd.Dst, buffers[i], strides[i], miRow, miCol, Ptr.Null, + pd.SubsamplingX, pd.SubsamplingY); } } @@ -221,14 +203,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 strides[0] = src.Stride; strides[1] = src.UvStride; strides[2] = src.UvStride; - int i; - for (i = 0; i < Constants.MaxMbPlane; ++i) + for (int i = 0; i < Constants.MaxMbPlane; ++i) { ref MacroBlockDPlane pd = ref xd.Plane[i]; - SetupPredPlanes(ref pd.Pre[idx], buffers[i], strides[i], miRow, miCol, sf, pd.SubsamplingX, pd.SubsamplingY); + SetupPredPlanes(ref pd.Pre[idx], buffers[i], strides[i], miRow, miCol, sf, pd.SubsamplingX, + pd.SubsamplingY); } } } } -} +} \ No newline at end of 
file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs index e346c01d1..332d9754f 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs @@ -7,18 +7,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { internal static class ReconIntra { - public static readonly TxType[] IntraModeToTxTypeLookup = new TxType[] + public static readonly TxType[] IntraModeToTxTypeLookup = { - TxType.DctDct, // DC - TxType.AdstDct, // V - TxType.DctAdst, // H - TxType.DctDct, // D45 - TxType.AdstAdst, // D135 - TxType.AdstDct, // D117 - TxType.DctAdst, // D153 - TxType.DctAdst, // D207 - TxType.AdstDct, // D63 - TxType.AdstAdst // TM + TxType.DctDct, // DC + TxType.AdstDct, // V + TxType.DctAdst, // H + TxType.DctDct, // D45 + TxType.AdstAdst, // D135 + TxType.AdstDct, // D117 + TxType.DctAdst, // D153 + TxType.DctAdst, // D207 + TxType.AdstDct, // D63 + TxType.AdstAdst // TM }; private const int NeedLeft = 1 << 1; @@ -27,241 +27,129 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 private static ReadOnlySpan ExtendModes => new byte[] { - NeedAbove | NeedLeft, // DC - NeedAbove, // V - NeedLeft, // H - NeedAboveRight, // D45 - NeedLeft | NeedAbove, // D135 - NeedLeft | NeedAbove, // D117 - NeedLeft | NeedAbove, // D153 - NeedLeft, // D207 - NeedAboveRight, // D63 - NeedLeft | NeedAbove, // TM + NeedAbove | NeedLeft, // DC + NeedAbove, // V + NeedLeft, // H + NeedAboveRight, // D45 + NeedLeft | NeedAbove, // D135 + NeedLeft | NeedAbove, // D117 + NeedLeft | NeedAbove, // D153 + NeedLeft, // D207 + NeedAboveRight, // D63 + NeedLeft | NeedAbove // TM }; private unsafe delegate void IntraPredFn(byte* dst, int stride, byte* above, byte* left); - private static unsafe IntraPredFn[][] _pred = new IntraPredFn[][] + private static readonly unsafe IntraPredFn[][] Pred = { - new IntraPredFn[] - { - null, - null, - null, - null - }, - new IntraPredFn[] - { - VPredictor4x4, - VPredictor8x8, - VPredictor16x16, 
- VPredictor32x32 - }, - new IntraPredFn[] - { - HPredictor4x4, - HPredictor8x8, - HPredictor16x16, - HPredictor32x32 - }, - new IntraPredFn[] - { - D45Predictor4x4, - D45Predictor8x8, - D45Predictor16x16, - D45Predictor32x32 - }, - new IntraPredFn[] - { - D135Predictor4x4, - D135Predictor8x8, - D135Predictor16x16, - D135Predictor32x32 - }, - new IntraPredFn[] - { - D117Predictor4x4, - D117Predictor8x8, - D117Predictor16x16, - D117Predictor32x32 - }, - new IntraPredFn[] - { - D153Predictor4x4, - D153Predictor8x8, - D153Predictor16x16, - D153Predictor32x32 - }, - new IntraPredFn[] - { - D207Predictor4x4, - D207Predictor8x8, - D207Predictor16x16, - D207Predictor32x32 - }, - new IntraPredFn[] - { - D63Predictor4x4, - D63Predictor8x8, - D63Predictor16x16, - D63Predictor32x32 - }, - new IntraPredFn[] - { - TMPredictor4x4, - TMPredictor8x8, - TMPredictor16x16, - TMPredictor32x32 - } + new IntraPredFn[] { null, null, null, null }, + new IntraPredFn[] { VPredictor4x4, VPredictor8x8, VPredictor16x16, VPredictor32x32 }, + new IntraPredFn[] { HPredictor4x4, HPredictor8x8, HPredictor16x16, HPredictor32x32 }, + new IntraPredFn[] { D45Predictor4x4, D45Predictor8x8, D45Predictor16x16, D45Predictor32x32 }, + new IntraPredFn[] { D135Predictor4x4, D135Predictor8x8, D135Predictor16x16, D135Predictor32x32 }, + new IntraPredFn[] { D117Predictor4x4, D117Predictor8x8, D117Predictor16x16, D117Predictor32x32 }, + new IntraPredFn[] { D153Predictor4x4, D153Predictor8x8, D153Predictor16x16, D153Predictor32x32 }, + new IntraPredFn[] { D207Predictor4x4, D207Predictor8x8, D207Predictor16x16, D207Predictor32x32 }, + new IntraPredFn[] { D63Predictor4x4, D63Predictor8x8, D63Predictor16x16, D63Predictor32x32 }, + new IntraPredFn[] { TmPredictor4x4, TmPredictor8x8, TmPredictor16x16, TmPredictor32x32 } }; - private static unsafe IntraPredFn[][][] _dcPred = new IntraPredFn[][][] + private static readonly unsafe IntraPredFn[][][] DcPred = { - new IntraPredFn[][] + new[] { new IntraPredFn[] { - 
Dc128Predictor4x4, - Dc128Predictor8x8, - Dc128Predictor16x16, - Dc128Predictor32x32 + Dc128Predictor4x4, Dc128Predictor8x8, Dc128Predictor16x16, Dc128Predictor32x32 }, new IntraPredFn[] { - DcTopPredictor4x4, - DcTopPredictor8x8, - DcTopPredictor16x16, - DcTopPredictor32x32 + DcTopPredictor4x4, DcTopPredictor8x8, DcTopPredictor16x16, DcTopPredictor32x32 } }, - new IntraPredFn[][] + new[] { new IntraPredFn[] { - DcLeftPredictor4x4, - DcLeftPredictor8x8, - DcLeftPredictor16x16, - DcLeftPredictor32x32 + DcLeftPredictor4x4, DcLeftPredictor8x8, DcLeftPredictor16x16, DcLeftPredictor32x32 }, - new IntraPredFn[] - { - DcPredictor4x4, - DcPredictor8x8, - DcPredictor16x16, - DcPredictor32x32 - } + new IntraPredFn[] { DcPredictor4x4, DcPredictor8x8, DcPredictor16x16, DcPredictor32x32 } } }; private unsafe delegate void IntraHighPredFn(ushort* dst, int stride, ushort* above, ushort* left, int bd); - private static unsafe IntraHighPredFn[][] _predHigh = new IntraHighPredFn[][] + private static readonly unsafe IntraHighPredFn[][] PredHigh = { + new IntraHighPredFn[] { null, null, null, null }, new IntraHighPredFn[] { - null, - null, - null, - null + HighbdVPredictor4x4, HighbdVPredictor8x8, HighbdVPredictor16x16, HighbdVPredictor32x32 }, new IntraHighPredFn[] { - HighbdVPredictor4x4, - HighbdVPredictor8x8, - HighbdVPredictor16x16, - HighbdVPredictor32x32 + HighbdHPredictor4x4, HighbdHPredictor8x8, HighbdHPredictor16x16, HighbdHPredictor32x32 }, new IntraHighPredFn[] { - HighbdHPredictor4x4, - HighbdHPredictor8x8, - HighbdHPredictor16x16, - HighbdHPredictor32x32 + HighbdD45Predictor4x4, HighbdD45Predictor8x8, HighbdD45Predictor16x16, HighbdD45Predictor32x32 }, new IntraHighPredFn[] { - HighbdD45Predictor4x4, - HighbdD45Predictor8x8, - HighbdD45Predictor16x16, - HighbdD45Predictor32x32 - }, - new IntraHighPredFn[] - { - HighbdD135Predictor4x4, - HighbdD135Predictor8x8, - HighbdD135Predictor16x16, + HighbdD135Predictor4x4, HighbdD135Predictor8x8, HighbdD135Predictor16x16, 
HighbdD135Predictor32x32 }, new IntraHighPredFn[] { - HighbdD117Predictor4x4, - HighbdD117Predictor8x8, - HighbdD117Predictor16x16, + HighbdD117Predictor4x4, HighbdD117Predictor8x8, HighbdD117Predictor16x16, HighbdD117Predictor32x32 }, new IntraHighPredFn[] { - HighbdD153Predictor4x4, - HighbdD153Predictor8x8, - HighbdD153Predictor16x16, + HighbdD153Predictor4x4, HighbdD153Predictor8x8, HighbdD153Predictor16x16, HighbdD153Predictor32x32 }, new IntraHighPredFn[] { - HighbdD207Predictor4x4, - HighbdD207Predictor8x8, - HighbdD207Predictor16x16, + HighbdD207Predictor4x4, HighbdD207Predictor8x8, HighbdD207Predictor16x16, HighbdD207Predictor32x32 }, new IntraHighPredFn[] { - HighbdD63Predictor4x4, - HighbdD63Predictor8x8, - HighbdD63Predictor16x16, - HighbdD63Predictor32x32 + HighbdD63Predictor4x4, HighbdD63Predictor8x8, HighbdD63Predictor16x16, HighbdD63Predictor32x32 }, new IntraHighPredFn[] { - HighbdTMPredictor4x4, - HighbdTMPredictor8x8, - HighbdTMPredictor16x16, - HighbdTMPredictor32x32 + HighbdTmPredictor4x4, HighbdTmPredictor8x8, HighbdTmPredictor16x16, HighbdTmPredictor32x32 } }; - private static unsafe IntraHighPredFn[][][] _dcPredHigh = new IntraHighPredFn[][][] + private static readonly unsafe IntraHighPredFn[][][] DcPredHigh = { - new IntraHighPredFn[][] + new[] { new IntraHighPredFn[] { - HighbdDc128Predictor4x4, - HighbdDc128Predictor8x8, - HighbdDc128Predictor16x16, + HighbdDc128Predictor4x4, HighbdDc128Predictor8x8, HighbdDc128Predictor16x16, HighbdDc128Predictor32x32 }, new IntraHighPredFn[] { - HighbdDcTopPredictor4x4, - HighbdDcTopPredictor8x8, - HighbdDcTopPredictor16x16, + HighbdDcTopPredictor4x4, HighbdDcTopPredictor8x8, HighbdDcTopPredictor16x16, HighbdDcTopPredictor32x32 } }, - new IntraHighPredFn[][] + new[] { new IntraHighPredFn[] { - HighbdDcLeftPredictor4x4, - HighbdDcLeftPredictor8x8, - HighbdDcLeftPredictor16x16, + HighbdDcLeftPredictor4x4, HighbdDcLeftPredictor8x8, HighbdDcLeftPredictor16x16, HighbdDcLeftPredictor32x32 }, new 
IntraHighPredFn[] { - HighbdDcPredictor4x4, - HighbdDcPredictor8x8, - HighbdDcPredictor16x16, + HighbdDcPredictor4x4, HighbdDcPredictor8x8, HighbdDcPredictor16x16, HighbdDcPredictor32x32 } } @@ -332,7 +220,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { for (i = 0; i < bs; ++i) { - leftCol[i] = refr[i * refStride - 1]; + leftCol[i] = refr[(i * refStride) - 1]; } } else @@ -340,12 +228,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int extendBottom = frameHeight - y0; for (i = 0; i < extendBottom; ++i) { - leftCol[i] = refr[i * refStride - 1]; + leftCol[i] = refr[(i * refStride) - 1]; } for (; i < bs; ++i) { - leftCol[i] = refr[(extendBottom - 1) * refStride - 1]; + leftCol[i] = refr[((extendBottom - 1) * refStride) - 1]; } } } @@ -354,7 +242,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 /* faster path if the block does not need extension */ for (i = 0; i < bs; ++i) { - leftCol[i] = refr[i * refStride - 1]; + leftCol[i] = refr[(i * refStride) - 1]; } } } @@ -396,6 +284,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 MemoryUtil.Copy(aboveRow, aboveRef, bs); } } + aboveRow[-1] = leftAvailable != 0 ? 
aboveRef[-1] : (ushort)(baseVal + 1); } else @@ -414,7 +303,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (xd.MbToRightEdge < 0) { /* slower path if the block needs border extension */ - if (x0 + 2 * bs <= frameWidth) + if (x0 + (2 * bs) <= frameWidth) { if (rightAvailable != 0 && bs == 4) { @@ -432,7 +321,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (rightAvailable != 0 && bs == 4) { MemoryUtil.Copy(aboveRow, aboveRef, r); - MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth); } else { @@ -444,8 +333,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { int r = frameWidth - x0; MemoryUtil.Copy(aboveRow, aboveRef, r); - MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth); } + aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1); } else @@ -481,11 +371,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // Predict if (mode == PredictionMode.DcPred) { - _dcPredHigh[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd); + DcPredHigh[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd); } else { - _predHigh[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd); + PredHigh[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd); } } @@ -549,7 +439,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { for (i = 0; i < bs; ++i) { - leftCol[i] = refr[i * refStride - 1]; + leftCol[i] = refr[(i * refStride) - 1]; } } else @@ -557,12 +447,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int extendBottom = frameHeight - y0; for (i = 0; i < extendBottom; ++i) { - leftCol[i] = refr[i * refStride - 1]; + leftCol[i] = refr[(i * refStride) - 1]; } for (; i < bs; ++i) { - leftCol[i] = refr[(extendBottom - 1) * refStride - 1]; + leftCol[i] = refr[((extendBottom - 1) * refStride) - 1]; } } } @@ -571,7 +461,7 
@@ namespace Ryujinx.Graphics.Nvdec.Vp9 /* Faster path if the block does not need extension */ for (i = 0; i < bs; ++i) { - leftCol[i] = refr[i * refStride - 1]; + leftCol[i] = refr[(i * refStride) - 1]; } } } @@ -613,6 +503,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 MemoryUtil.Copy(aboveRow, aboveRef, bs); } } + aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (byte)129; } else @@ -631,7 +522,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (xd.MbToRightEdge < 0) { /* Slower path if the block needs border extension */ - if (x0 + 2 * bs <= frameWidth) + if (x0 + (2 * bs) <= frameWidth) { if (rightAvailable != 0 && bs == 4) { @@ -649,7 +540,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 if (rightAvailable != 0 && bs == 4) { MemoryUtil.Copy(aboveRow, aboveRef, r); - MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth); } else { @@ -661,7 +552,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 { int r = frameWidth - x0; MemoryUtil.Copy(aboveRow, aboveRef, r); - MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth); + MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth); } } else @@ -684,6 +575,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 } } } + aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (byte)129; } else @@ -696,11 +588,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 // Predict if (mode == PredictionMode.DcPred) { - _dcPred[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol); + DcPred[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol); } else { - _pred[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol); + Pred[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol); } } @@ -721,7 +613,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 int txw = 1 << (int)txSize; int haveTop = loff != 0 || !xd.AboveMi.IsNull ? 1 : 0; int haveLeft = aoff != 0 || !xd.LeftMi.IsNull ? 
1 : 0; - int haveRight = (aoff + txw) < bw ? 1 : 0; + int haveRight = aoff + txw < bw ? 1 : 0; int x = aoff * 4; int y = loff * 4; @@ -743,6 +635,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 plane); return; } + BuildIntraPredictors( ref xd, refr, @@ -759,4 +652,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 plane); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs index c5a25e6bc..48f3d0596 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs @@ -8,4 +8,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 public ArrayPtr Data; public int Size; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs index 333a077a0..3654e6466 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs @@ -12,9 +12,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9 public int BufEnd; public Reader BitReader; public Vp9BackwardUpdates Counts; + public MacroBlockD Xd; + /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ public Array32> Dqcoeff; public InternalErrorInfo ErrorInfo; + + public int DecPartitionPlaneContext(int miRow, int miCol, int bsl) + { + ref sbyte aboveCtx = ref Xd.AboveSegContext[miCol]; + ref sbyte leftCtx = ref Xd.LeftSegContext[miRow & Constants.MiMask]; + int above = (aboveCtx >> bsl) & 1, left = (leftCtx >> bsl) & 1; + + return (left * 2) + above + (bsl * Constants.PartitionPloffset); + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs index 9e1cd8b41..ce653ca0c 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs @@ -5,6 +5,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types internal struct BModeInfo { public PredictionMode Mode; - public Array2 Mv; // First, second inter predictor motion vectors + public Array2 Mv; // First, second inter predictor motion vectors } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BitstreamProfile.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BitstreamProfile.cs new file mode 100644 index 000000000..13293f460 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BitstreamProfile.cs @@ -0,0 +1,11 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + public enum BitstreamProfile + { + Profile0, + Profile1, + Profile2, + Profile3, + MaxProfiles + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs index 22a48e207..9d1dd1d27 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs @@ -2,20 +2,20 @@ { internal enum BlockSize { - Block4x4 = 0, - Block4x8 = 1, - Block8x4 = 2, - Block8x8 = 3, - Block8x16 = 4, - Block16x8 = 5, - Block16x16 = 6, - Block16x32 = 7, - Block32x16 = 8, - Block32x32 = 9, - Block32x64 = 10, - Block64x32 = 11, - Block64x64 
= 12, - BlockSizes = 13, + Block4x4, + Block4x8, + Block8x4, + Block8x8, + Block8x16, + Block16x8, + Block16x16, + Block16x32, + Block32x16, + Block32x32, + Block32x64, + Block64x32, + Block64x64, + BlockSizes, BlockInvalid = BlockSizes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs index 180d5e341..17efeb86b 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs @@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public ArrayPtr Buf; public int Stride; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BufferPool.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BufferPool.cs new file mode 100644 index 000000000..bfe87d5a5 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/BufferPool.cs @@ -0,0 +1,18 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct BufferPool + { + // Private data associated with the frame buffer callbacks. + public Ptr CbPriv; + + // vpx_get_frame_buffer_cb_fn_t get_fb_cb; + // vpx_release_frame_buffer_cb_fn_t release_fb_cb; + + public Array12 FrameBufs; + + // Frame buffers allocated internally by the codec. 
+ public InternalFrameBufferList IntFrameBuffers; + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs index a783999ef..148a10d36 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs @@ -5,4 +5,4 @@ KeyFrame = 0, InterFrame = 1 } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs index 8dc33bda8..ad3de310a 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs @@ -23,5 +23,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public ArrayPtr Lfm; public int LfmStride; + + public void SetDefaultLfDeltas() + { + ModeRefDeltaEnabled = true; + ModeRefDeltaUpdate = true; + + RefDeltas[Constants.IntraFrame] = 1; + RefDeltas[Constants.LastFrame] = 0; + RefDeltas[Constants.GoldenFrame] = -1; + RefDeltas[Constants.AltRefFrame] = -1; + ModeDeltas[0] = 0; + ModeDeltas[1] = 0; + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs index 0ac38a7b9..3573b91cf 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs @@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public Array64 Lfthr; public Array8>> Lvl; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs index 4aff843a1..2c5ecc27b 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs @@ -21,4 +21,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public ushort Int4x4Uv; public Array64 LflY; } -} +} \ No newline at end of file diff --git 
a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs index edd79af4a..e5430015e 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs @@ -12,4 +12,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public Array16 HevThr; #pragma warning restore CS0649 } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs index f1111528e..bc6b7be32 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs @@ -1,4 +1,5 @@ using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; using Ryujinx.Graphics.Video; namespace Ryujinx.Graphics.Nvdec.Vp9.Types @@ -82,18 +83,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { return leftType; } - else if (leftType == Constants.SwitchableFilters) + + if (leftType == Constants.SwitchableFilters) { return aboveType; } - else if (aboveType == Constants.SwitchableFilters) + + if (aboveType == Constants.SwitchableFilters) { return leftType; } - else - { - return Constants.SwitchableFilters; - } + + return Constants.SwitchableFilters; } // The mode info data structure has a one element border above and to the @@ -106,15 +107,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public int GetIntraInterContext() { if (!AboveMi.IsNull && !LeftMi.IsNull) - { // Both edges available + { + // Both edges available bool aboveIntra = !AboveMi.Value.IsInterBlock(); bool leftIntra = !LeftMi.Value.IsInterBlock(); - return leftIntra && aboveIntra ? 3 : (leftIntra || aboveIntra ? 1 : 0); + return leftIntra && aboveIntra ? 3 : leftIntra || aboveIntra ? 1 : 0; } - else if (!AboveMi.IsNull || !LeftMi.IsNull) - { // One edge available + + if (!AboveMi.IsNull || !LeftMi.IsNull) + { + // One edge available return 2 * (!(!AboveMi.IsNull ? 
AboveMi.Value : LeftMi.Value).IsInterBlock() ? 1 : 0); } + return 0; } @@ -125,8 +130,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public int GetTxSizeContext() { int maxTxSize = (int)Luts.MaxTxSizeLookup[(int)Mi[0].Value.SbType]; - int aboveCtx = (!AboveMi.IsNull && AboveMi.Value.Skip == 0) ? (int)AboveMi.Value.TxSize : maxTxSize; - int leftCtx = (!LeftMi.IsNull && LeftMi.Value.Skip == 0) ? (int)LeftMi.Value.TxSize : maxTxSize; + int aboveCtx = !AboveMi.IsNull && AboveMi.Value.Skip == 0 ? (int)AboveMi.Value.TxSize : maxTxSize; + int leftCtx = !LeftMi.IsNull && LeftMi.Value.Skip == 0 ? (int)LeftMi.Value.TxSize : maxTxSize; if (LeftMi.IsNull) { leftCtx = aboveCtx; @@ -137,14 +142,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types aboveCtx = leftCtx; } - return (aboveCtx + leftCtx) > maxTxSize ? 1 : 0; + return aboveCtx + leftCtx > maxTxSize ? 1 : 0; } public void SetupBlockPlanes(int ssX, int ssY) { - int i; - - for (i = 0; i < Constants.MaxMbPlane; i++) + for (int i = 0; i < Constants.MaxMbPlane; i++) { Plane[i].SubsamplingX = i != 0 ? ssX : 0; Plane[i].SubsamplingY = i != 0 ? 
ssY : 0; @@ -155,25 +158,36 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { int aboveIdx = miCol * 2; int leftIdx = (miRow * 2) & 15; - int i; - for (i = 0; i < Constants.MaxMbPlane; ++i) + + for (int i = 0; i < Constants.MaxMbPlane; ++i) { ref MacroBlockDPlane pd = ref Plane[i]; pd.AboveContext = AboveContext[i].Slice(aboveIdx >> pd.SubsamplingX); - pd.LeftContext = new ArrayPtr(ref LeftContext[i][leftIdx >> pd.SubsamplingY], 16 - (leftIdx >> pd.SubsamplingY)); + pd.LeftContext = new ArrayPtr(ref LeftContext[i][leftIdx >> pd.SubsamplingY], + 16 - (leftIdx >> pd.SubsamplingY)); } } internal void SetMiRowCol(ref TileInfo tile, int miRow, int bh, int miCol, int bw, int miRows, int miCols) { - MbToTopEdge = -((miRow * Constants.MiSize) * 8); - MbToBottomEdge = ((miRows - bh - miRow) * Constants.MiSize) * 8; - MbToLeftEdge = -((miCol * Constants.MiSize) * 8); - MbToRightEdge = ((miCols - bw - miCol) * Constants.MiSize) * 8; + MbToTopEdge = -(miRow * Constants.MiSize * 8); + MbToBottomEdge = (miRows - bh - miRow) * Constants.MiSize * 8; + MbToLeftEdge = -(miCol * Constants.MiSize * 8); + MbToRightEdge = (miCols - bw - miCol) * Constants.MiSize * 8; // Are edges available for intra prediction? - AboveMi = (miRow != 0) ? Mi[-MiStride] : Ptr.Null; - LeftMi = (miCol > tile.MiColStart) ? Mi[-1] : Ptr.Null; + AboveMi = miRow != 0 ? Mi[-MiStride] : Ptr.Null; + LeftMi = miCol > tile.MiColStart ? 
Mi[-1] : Ptr.Null; + } + + public unsafe void DecResetSkipContext() + { + for (int i = 0; i < Constants.MaxMbPlane; i++) + { + ref MacroBlockDPlane pd = ref Plane[i]; + MemoryUtil.Fill(pd.AboveContext.ToPointer(), (sbyte)0, pd.N4W); + MemoryUtil.Fill(pd.LeftContext.ToPointer(), (sbyte)0, pd.N4H); + } } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs index ae4ec6f41..2dd97ba85 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs @@ -15,7 +15,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types // Number of 4x4s in current block public ushort N4W, N4H; + // Log2 of N4W, N4H public byte N4Wl, N4Hl; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs index 8ef281d83..52ac48ac0 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs @@ -11,7 +11,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public TxSize TxSize; public sbyte Skip; public sbyte SegmentId; - public sbyte SegIdPredicted; // Valid only when TemporalUpdate is enabled + public sbyte SegIdPredicted; // Valid only when TemporalUpdate is enabled // Only for Intra blocks public PredictionMode UvMode; @@ -35,7 +35,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public TxSize GetUvTxSize(ref MacroBlockDPlane pd) { Debug.Assert(SbType < BlockSize.Block8x8 || - Luts.SsSizeLookup[(int)SbType][pd.SubsamplingX][pd.SubsamplingY] != BlockSize.BlockInvalid); + Luts.SsSizeLookup[(int)SbType][pd.SubsamplingX][pd.SubsamplingY] != BlockSize.BlockInvalid); return Luts.UvTxsizeLookup[(int)SbType][(int)TxSize][pd.SubsamplingX][pd.SubsamplingY]; } @@ -49,9 +49,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types return RefFrame[1] > Constants.IntraFrame; } - private static readonly int[][] 
IdxNColumnToSubblock = new int[][] + private static readonly int[][] IdxNColumnToSubblock = { - new int[] { 1, 2 }, new int[] { 1, 3 }, new int[] { 3, 2 }, new int[] { 3, 3 } + new[] { 1, 2 }, new[] { 1, 3 }, new[] { 3, 2 }, new[] { 3, 3 } }; // This function returns either the appropriate sub block or block's mv @@ -62,5 +62,46 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types ? Bmi[IdxNColumnToSubblock[blockIdx][searchCol == 0 ? 1 : 0]].Mv[whichMv] : Mv[whichMv]; } + + public Mv MvPredQ4(int idx) + { + Mv res = new() + { + Row = (short)ReconInter.RoundMvCompQ4( + Bmi[0].Mv[idx].Row + Bmi[1].Mv[idx].Row + + Bmi[2].Mv[idx].Row + Bmi[3].Mv[idx].Row), + Col = (short)ReconInter.RoundMvCompQ4( + Bmi[0].Mv[idx].Col + Bmi[1].Mv[idx].Col + + Bmi[2].Mv[idx].Col + Bmi[3].Mv[idx].Col) + }; + return res; + } + + public Mv MvPredQ2(int idx, int block0, int block1) + { + Mv res = new() + { + Row = (short)ReconInter.RoundMvCompQ2( + Bmi[block0].Mv[idx].Row + + Bmi[block1].Mv[idx].Row), + Col = (short)ReconInter.RoundMvCompQ2( + Bmi[block0].Mv[idx].Col + + Bmi[block1].Mv[idx].Col) + }; + return res; + } + + // Performs mv sign inversion if indicated by the reference frame combination. 
+ public Mv ScaleMv(int refr, sbyte thisRefFrame, ref Array4 refSignBias) + { + Mv mv = Mv[refr]; + if (refSignBias[RefFrame[refr]] != refSignBias[thisRefFrame]) + { + mv.Row *= -1; + mv.Col *= -1; + } + + return mv; + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs index 319c8dba8..81c95d286 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs @@ -11,4 +11,4 @@ BothIntra = 6, InvalidCase = 9 } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs index 815bbb321..10bd40e7e 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs @@ -12,96 +12,86 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types private static ReadOnlySpan LogInBase2 => new byte[] { - 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, - 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 + 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 10 }; - public bool UseMvHp() + public bool UseHp() { - const int kMvRefThresh = 64; // Threshold for use of high-precision 1/8 mv + const int kMvRefThresh = 64; // Threshold for use of high-precision 1/8 mv return Math.Abs(Row) < kMvRefThresh && Math.Abs(Col) < kMvRefThresh; } - public static bool MvJointVertical(MvJointType type) + public static bool JointVertical(MvJointType type) { - return type == MvJointType.MvJointHzvnz || type == MvJointType.MvJointHnzvnz; + return type == MvJointType.Hzvnz || type == MvJointType.Hnzvnz; } - public static bool MvJointHorizontal(MvJointType type) + public static bool JointHorizontal(MvJointType type) { - return type == MvJointType.MvJointHnzvz || type == 
MvJointType.MvJointHnzvnz; + return type == MvJointType.Hnzvz || type == MvJointType.Hnzvnz; } - private static int MvClassBase(MvClassType c) + private static int ClassBase(MvClassType c) { return c != 0 ? Constants.Class0Size << ((int)c + 2) : 0; } - private static MvClassType GetMvClass(int z, Ptr offset) + private static MvClassType GetClass(int z, Ptr offset) { - MvClassType c = (z >= Constants.Class0Size * 4096) ? MvClassType.MvClass10 : (MvClassType)LogInBase2[z >> 3]; + MvClassType c = z >= Constants.Class0Size * 4096 ? MvClassType.Class10 : (MvClassType)LogInBase2[z >> 3]; if (!offset.IsNull) { - offset.Value = z - MvClassBase(c); + offset.Value = z - ClassBase(c); } return c; } - private static void IncMvComponent(int v, ref Vp9BackwardUpdates counts, int comp, int incr, int usehp) + private static void IncComponent(int v, ref Vp9BackwardUpdates counts, int comp, int incr, int usehp) { - int s, z, c, o = 0, d, e, f; + int o = 0; Debug.Assert(v != 0); /* Should not be zero */ - s = v < 0 ? 1 : 0; + int s = v < 0 ? 1 : 0; counts.Sign[comp][s] += (uint)incr; - z = (s != 0 ? -v : v) - 1; /* Magnitude - 1 */ + int z = (s != 0 ? 
-v : v) - 1 /* Magnitude - 1 */; - c = (int)GetMvClass(z, new Ptr(ref o)); + int c = (int)GetClass(z, new Ptr(ref o)); counts.Classes[comp][c] += (uint)incr; - d = (o >> 3); /* Int mv data */ - f = (o >> 1) & 3; /* Fractional pel mv data */ - e = (o & 1); /* High precision mv data */ + int d = o >> 3 /* Int mv data */; + int f = (o >> 1) & 3 /* Fractional pel mv data */; + int e = o & 1 /* High precision mv data */; - if (c == (int)MvClassType.MvClass0) + if (c == (int)MvClassType.Class0) { counts.Class0[comp][d] += (uint)incr; counts.Class0Fp[comp][d][f] += (uint)incr; @@ -109,11 +99,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types } else { - int i; - int b = c + Constants.Class0Bits - 1; // Number of bits - for (i = 0; i < b; ++i) + int b = c + Constants.Class0Bits - 1; // Number of bits + for (int i = 0; i < b; ++i) { - counts.Bits[comp][i][((d >> i) & 1)] += (uint)incr; + counts.Bits[comp][i][(d >> i) & 1] += (uint)incr; } counts.Fp[comp][f] += (uint)incr; @@ -121,58 +110,56 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types } } - private MvJointType GetMvJoint() + public MvJointType GetJoint() { if (Row == 0) { - return Col == 0 ? MvJointType.MvJointZero : MvJointType.MvJointHnzvz; - } - else - { - return Col == 0 ? MvJointType.MvJointHzvnz : MvJointType.MvJointHnzvnz; + return Col == 0 ? MvJointType.Zero : MvJointType.Hnzvz; } + + return Col == 0 ? 
MvJointType.Hzvnz : MvJointType.Hnzvnz; } - internal void IncMv(Ptr counts) + internal void Inc(Ptr counts) { if (!counts.IsNull) { - MvJointType j = GetMvJoint(); + MvJointType j = GetJoint(); ++counts.Value.Joints[(int)j]; - if (MvJointVertical(j)) + if (JointVertical(j)) { - IncMvComponent(Row, ref counts.Value, 0, 1, 1); + IncComponent(Row, ref counts.Value, 0, 1, 1); } - if (MvJointHorizontal(j)) + if (JointHorizontal(j)) { - IncMvComponent(Col, ref counts.Value, 1, 1, 1); + IncComponent(Col, ref counts.Value, 1, 1, 1); } } } - public void ClampMv(int minCol, int maxCol, int minRow, int maxRow) + public void Clamp(int minCol, int maxCol, int minRow, int maxRow) { Col = (short)Math.Clamp(Col, minCol, maxCol); Row = (short)Math.Clamp(Row, minRow, maxRow); } - private const int MvBorder = (16 << 3); // Allow 16 pels in 1/8th pel units + private const int Border = 16 << 3; // Allow 16 pels in 1/8th pel units - public void ClampMvRef(ref MacroBlockD xd) + public void ClampRef(ref MacroBlockD xd) { - ClampMv( - xd.MbToLeftEdge - MvBorder, - xd.MbToRightEdge + MvBorder, - xd.MbToTopEdge - MvBorder, - xd.MbToBottomEdge + MvBorder); + Clamp( + xd.MbToLeftEdge - Border, + xd.MbToRightEdge + Border, + xd.MbToTopEdge - Border, + xd.MbToBottomEdge + Border); } - public void LowerMvPrecision(bool allowHP) + public void LowerPrecision(bool allowHp) { - bool useHP = allowHP && UseMvHp(); - if (!useHP) + bool useHp = allowHp && UseHp(); + if (!useHp) { if ((Row & 1) != 0) { @@ -185,5 +172,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types } } } + + public bool IsValid() + { + return Row is > Constants.MvLow and < Constants.MvUpp && + Col is > Constants.MvLow and < Constants.MvUpp; + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs index fb25d18e9..f007878d0 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs @@ -5,4 +5,4 @@ public 
int Row; public int Col; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs index 68a0b59af..dc86418ea 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs @@ -2,16 +2,16 @@ { internal enum MvClassType { - MvClass0 = 0, /* (0, 2] integer pel */ - MvClass1 = 1, /* (2, 4] integer pel */ - MvClass2 = 2, /* (4, 8] integer pel */ - MvClass3 = 3, /* (8, 16] integer pel */ - MvClass4 = 4, /* (16, 32] integer pel */ - MvClass5 = 5, /* (32, 64] integer pel */ - MvClass6 = 6, /* (64, 128] integer pel */ - MvClass7 = 7, /* (128, 256] integer pel */ - MvClass8 = 8, /* (256, 512] integer pel */ - MvClass9 = 9, /* (512, 1024] integer pel */ - MvClass10 = 10, /* (1024,2048] integer pel */ + Class0, /* (0, 2] integer pel */ + Class1, /* (2, 4] integer pel */ + Class2, /* (4, 8] integer pel */ + Class3, /* (8, 16] integer pel */ + Class4, /* (16, 32] integer pel */ + Class5, /* (32, 64] integer pel */ + Class6, /* (64, 128] integer pel */ + Class7, /* (128, 256] integer pel */ + Class8, /* (256, 512] integer pel */ + Class9, /* (512, 1024] integer pel */ + Class10 /* (1024,2048] integer pel */ } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs index a20cb6d0b..f1942c01f 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs @@ -2,9 +2,9 @@ { internal enum MvJointType { - MvJointZero = 0, /* Zero vector */ - MvJointHnzvz = 1, /* Vert zero, hor nonzero */ - MvJointHzvnz = 2, /* Hor zero, vert nonzero */ - MvJointHnzvnz = 3, /* Both components nonzero */ + Zero, /* Zero vector */ + Hnzvz, /* Vert zero, hor nonzero */ + Hzvnz, /* Hor zero, vert nonzero */ + Hnzvnz /* Both components nonzero */ } -} +} \ No newline at end of file diff --git 
a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs index 71949a09b..a5978841f 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs @@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public Array2 Mv; public Array2 RefFrame; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs index 096f9818a..238ea1ed8 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs @@ -9,4 +9,4 @@ PartitionTypes, PartitionInvalid = PartitionTypes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs index 790aa2a0c..ced1646e9 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs @@ -2,8 +2,8 @@ { internal enum PlaneType { - Y = 0, - Uv = 1, + Y, + Uv, PlaneTypes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs index 0d3b56f67..732a6df98 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs @@ -11,4 +11,4 @@ Col = col; } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs index bbb9be9ad..a0f15905d 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs @@ -2,20 +2,20 @@ { internal enum PredictionMode { - DcPred = 0, // Average of above and left pixels - VPred = 1, // Vertical - HPred = 2, // Horizontal - D45Pred = 3, // Directional 45 deg = round(arctan(1 / 1) * 180 / pi) - D135Pred = 4, // Directional 135 deg 
= 180 - 45 - D117Pred = 5, // Directional 117 deg = 180 - 63 - D153Pred = 6, // Directional 153 deg = 180 - 27 - D207Pred = 7, // Directional 207 deg = 180 + 27 - D63Pred = 8, // Directional 63 deg = round(arctan(2 / 1) * 180 / pi) - TmPred = 9, // True-motion - NearestMv = 10, - NearMv = 11, - ZeroMv = 12, - NewMv = 13, - MbModeCount = 14 + DcPred, // Average of above and left pixels + VPred, // Vertical + HPred, // Horizontal + D45Pred, // Directional 45 deg = round(arctan(1 / 1) * 180 / pi) + D135Pred, // Directional 135 deg = 180 - 45 + D117Pred, // Directional 117 deg = 180 - 63 + D153Pred, // Directional 153 deg = 180 - 27 + D207Pred, // Directional 207 deg = 180 + 27 + D63Pred, // Directional 63 deg = round(arctan(2 / 1) * 180 / pi) + TmPred, // True-motion + NearestMv, + NearMv, + ZeroMv, + NewMv, + MbModeCount } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs index 9942dd053..4095d6dd1 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs @@ -2,7 +2,10 @@ { internal struct RefBuffer { + public const int InvalidIdx = -1; // Invalid buffer index. 
+ + public int Idx; public Surface Buf; public ScaleFactors Sf; } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefCntBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefCntBuffer.cs new file mode 100644 index 000000000..47febf38d --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefCntBuffer.cs @@ -0,0 +1,12 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct RefCntBuffer + { + public int RefCount; + public int MiRows; + public int MiCols; + public byte Released; + public VpxCodecFrameBuffer RawFrameBuffer; + public Surface Buf; + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs index 7cbf9f4ef..a678f8181 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs @@ -2,9 +2,9 @@ { internal enum ReferenceMode { - SingleReference = 0, - CompoundReference = 1, - ReferenceModeSelect = 2, - ReferenceModes = 3 + Single, + Compound, + Select, + ReferenceModes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs index 970f96801..fc2c42b86 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs @@ -8,7 +8,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types internal struct ScaleFactors { private const int RefScaleShift = 14; - private const int RefNoScale = (1 << RefScaleShift); + private const int RefNoScale = 1 << RefScaleShift; private const int RefInvalidScale = -1; private unsafe delegate void ConvolveFn( @@ -38,248 +38,99 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types int h, int bd); - private static readonly unsafe ConvolveFn[][][] PredictX16Y16 = new ConvolveFn[][][] + private static readonly unsafe ConvolveFn[][][] PredictX16Y16 = { - new ConvolveFn[][] + new[] { - new 
ConvolveFn[] - { - ConvolveCopy, - ConvolveAvg - }, - new ConvolveFn[] - { - Convolve8Vert, - Convolve8AvgVert - } + new ConvolveFn[] { ConvolveCopy, ConvolveAvg }, + new ConvolveFn[] { Convolve8Vert, Convolve8AvgVert } }, - new ConvolveFn[][] + new[] { - new ConvolveFn[] - { - Convolve8Horiz, - Convolve8AvgHoriz - }, - new ConvolveFn[] - { - Convolve8, - Convolve8Avg - } + new ConvolveFn[] { Convolve8Horiz, Convolve8AvgHoriz }, + new ConvolveFn[] { Convolve8, Convolve8Avg } } }; - private static readonly unsafe ConvolveFn[][][] PredictX16 = new ConvolveFn[][][] + private static readonly unsafe ConvolveFn[][][] PredictX16 = { - new ConvolveFn[][] + new[] { - new ConvolveFn[] - { - ScaledVert, - ScaledAvgVert - }, - new ConvolveFn[] - { - ScaledVert, - ScaledAvgVert - } + new ConvolveFn[] { ScaledVert, ScaledAvgVert }, new ConvolveFn[] { ScaledVert, ScaledAvgVert } }, - new ConvolveFn[][] + new[] { new ConvolveFn[] { Scaled2D, ScaledAvg2D }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } } + }; + + private static readonly unsafe ConvolveFn[][][] PredictY16 = + { + new[] { new ConvolveFn[] { ScaledHoriz, ScaledAvgHoriz }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } }, + new[] { new ConvolveFn[] { ScaledHoriz, ScaledAvgHoriz }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } } + }; + + private static readonly unsafe ConvolveFn[][][] Predict = + { + new[] { new ConvolveFn[] { Scaled2D, ScaledAvg2D }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } }, + new[] { new ConvolveFn[] { Scaled2D, ScaledAvg2D }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } } + }; + + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16Y16 = + { + new[] { - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - }, - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - } + new HighbdConvolveFn[] { HighbdConvolveCopy, HighbdConvolveAvg }, + new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert } + }, + new[] + { + new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz }, + 
new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg } } }; - private static readonly unsafe ConvolveFn[][][] PredictY16 = new ConvolveFn[][][] + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16 = { - new ConvolveFn[][] + new[] { - new ConvolveFn[] - { - ScaledHoriz, - ScaledAvgHoriz - }, - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - } + new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert }, + new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert } }, - new ConvolveFn[][] + new[] { - new ConvolveFn[] - { - ScaledHoriz, - ScaledAvgHoriz - }, - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - } + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }, + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg } } }; - private static readonly unsafe ConvolveFn[][][] Predict = new ConvolveFn[][][] + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictY16 = { - new ConvolveFn[][] + new[] { - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - }, - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - } + new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz }, + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg } }, - new ConvolveFn[][] + new[] { - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - }, - new ConvolveFn[] - { - Scaled2D, - ScaledAvg2D - } + new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz }, + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg } } }; - private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16Y16 = new HighbdConvolveFn[][][] + private static readonly unsafe HighbdConvolveFn[][][] HighbdPredict = { - new HighbdConvolveFn[][] + new[] { - new HighbdConvolveFn[] - { - HighbdConvolveCopy, - HighbdConvolveAvg - }, - new HighbdConvolveFn[] - { - HighbdConvolve8Vert, - HighbdConvolve8AvgVert - } + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }, + new HighbdConvolveFn[] { 
HighbdConvolve8, HighbdConvolve8Avg } }, - new HighbdConvolveFn[][] + new[] { - new HighbdConvolveFn[] - { - HighbdConvolve8Horiz, - HighbdConvolve8AvgHoriz - }, - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - } + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }, + new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg } } }; - private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16 = new HighbdConvolveFn[][][] - { - new HighbdConvolveFn[][] - { - new HighbdConvolveFn[] - { - HighbdConvolve8Vert, - HighbdConvolve8AvgVert - }, - new HighbdConvolveFn[] - { - HighbdConvolve8Vert, - HighbdConvolve8AvgVert - } - }, - new HighbdConvolveFn[][] - { - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - }, - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - } - } - }; - - private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictY16 = new HighbdConvolveFn[][][] - { - new HighbdConvolveFn[][] - { - new HighbdConvolveFn[] - { - HighbdConvolve8Horiz, - HighbdConvolve8AvgHoriz - }, - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - } - }, - new HighbdConvolveFn[][] - { - new HighbdConvolveFn[] - { - HighbdConvolve8Horiz, - HighbdConvolve8AvgHoriz - }, - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - } - } - }; - - private static readonly unsafe HighbdConvolveFn[][][] HighbdPredict = new HighbdConvolveFn[][][] - { - new HighbdConvolveFn[][] - { - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - }, - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - } - }, - new HighbdConvolveFn[][] - { - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - }, - new HighbdConvolveFn[] - { - HighbdConvolve8, - HighbdConvolve8Avg - } - } - }; - - public int XScaleFP; // Horizontal fixed point scale factor - public int YScaleFP; // Vertical fixed point scale factor + public int XScaleFp; // Horizontal 
fixed point scale factor + public int YScaleFp; // Vertical fixed point scale factor public int XStepQ4; public int YStepQ4; @@ -315,12 +166,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types if (YStepQ4 == 16) { // No scaling in either direction. - PredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + PredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, + h); } else { // No scaling in x direction. Must always scale in the y direction. - PredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + PredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, + h); } } else @@ -328,7 +181,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types if (YStepQ4 == 16) { // No scaling in the y direction. Must always scale in the x direction. - PredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h); + PredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, + h); } else { @@ -361,12 +215,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types if (YStepQ4 == 16) { // No scaling in either direction. - HighbdPredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + HighbdPredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, + ys, w, h, bd); } else { // No scaling in x direction. Must always scale in the y direction. - HighbdPredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + HighbdPredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, + w, h, bd); } } else @@ -374,24 +230,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types if (YStepQ4 == 16) { // No scaling in the y direction. Must always scale in the x direction. 
- HighbdPredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + HighbdPredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, + w, h, bd); } else { // Must always scale in both directions. - HighbdPredict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd); + HighbdPredict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, + h, bd); } } } private int ScaledX(int val) { - return (int)((long)val * XScaleFP >> RefScaleShift); + return (int)(((long)val * XScaleFp) >> RefScaleShift); } private int ScaledY(int val) { - return (int)((long)val * YScaleFP >> RefScaleShift); + return (int)(((long)val * YScaleFp) >> RefScaleShift); } private static int GetFixedPointScaleFactor(int otherSize, int thisSize) @@ -407,22 +265,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { int xOffQ4 = ScaledX(x << SubpelBits) & SubpelMask; int yOffQ4 = ScaledY(y << SubpelBits) & SubpelMask; - Mv32 res = new Mv32() - { - Row = ScaledY(mv.Row) + yOffQ4, - Col = ScaledX(mv.Col) + xOffQ4 - }; + Mv32 res = new() { Row = ScaledY(mv.Row) + yOffQ4, Col = ScaledX(mv.Col) + xOffQ4 }; return res; } public bool IsValidScale() { - return XScaleFP != RefInvalidScale && YScaleFP != RefInvalidScale; + return XScaleFp != RefInvalidScale && YScaleFp != RefInvalidScale; } public bool IsScaled() { - return IsValidScale() && (XScaleFP != RefNoScale || YScaleFP != RefNoScale); + return IsValidScale() && (XScaleFp != RefNoScale || YScaleFp != RefNoScale); } public static bool ValidRefFrameSize(int refWidth, int refHeight, int thisWidth, int thisHeight) @@ -437,15 +291,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { if (!ValidRefFrameSize(otherW, otherH, thisW, thisH)) { - XScaleFP = RefInvalidScale; - YScaleFP = RefInvalidScale; + XScaleFp = RefInvalidScale; + YScaleFp = RefInvalidScale; return; } - XScaleFP = GetFixedPointScaleFactor(otherW, 
thisW); - YScaleFP = GetFixedPointScaleFactor(otherH, thisH); + XScaleFp = GetFixedPointScaleFactor(otherW, thisW); + YScaleFp = GetFixedPointScaleFactor(otherH, thisH); XStepQ4 = ScaledX(16); YStepQ4 = ScaledY(16); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs index c3ea3fd89..21d57d1d7 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs @@ -2,10 +2,10 @@ { internal enum SegLvlFeatures { - SegLvlAltQ = 0, // Use alternate Quantizer .... - SegLvlAltLf = 1, // Use alternate loop filter value... - SegLvlRefFrame = 2, // Optional Segment reference frame - SegLvlSkip = 3, // Optional Segment (0,0) + skip mode - SegLvlMax = 4 // Number of features supported + AltQ, // Use alternate Quantizer .... + AltLf, // Use alternate loop filter value... + RefFrame, // Optional Segment reference frame + Skip, // Optional Segment (0,0) + skip mode + Max // Number of features supported } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs index 53d1f2ccb..adfd8e31e 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs @@ -1,4 +1,6 @@ using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Video; +using System; using System.Diagnostics; using System.Runtime.InteropServices; @@ -6,8 +8,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { internal struct Segmentation { - private static readonly int[] SegFeatureDataSigned = new int[] { 1, 1, 0, 0 }; - private static readonly int[] SegFeatureDataMax = new int[] { QuantCommon.MaxQ, Vp9.LoopFilter.MaxLoopFilter, 3, 0 }; + public const int SegmentDeltadata = 0; + public const int SegmentAbsdata = 1; + + public const int MaxSegments = 8; + public const int SegTreeProbs = MaxSegments - 
1; + + public const int PredictionProbs = 3; + + private static readonly int[] SegFeatureDataSigned = { 1, 1, 0, 0 }; + private static readonly int[] SegFeatureDataMax = { QuantCommon.MaxQ, Vp9.LoopFilter.MaxLoopFilter, 3, 0 }; public bool Enabled; public bool UpdateMap; @@ -26,8 +36,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public void ClearAllSegFeatures() { - MemoryMarshal.CreateSpan(ref FeatureData[0][0], 8 * 4).Fill(0); - MemoryMarshal.CreateSpan(ref FeatureMask[0], 8).Fill(0); + MemoryMarshal.CreateSpan(ref FeatureData[0][0], 8 * 4).Clear(); + MemoryMarshal.CreateSpan(ref FeatureMask[0], 8).Clear(); AqAvOffset = 0; } @@ -67,5 +77,88 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { return FeatureData[segmentId][(int)featureId]; } + + public int GetQIndex(int segmentId, int baseQIndex) + { + if (IsSegFeatureActive(segmentId, SegLvlFeatures.AltQ) != 0) + { + int data = GetSegData(segmentId, SegLvlFeatures.AltQ); + int segQIndex = AbsDelta == Constants.SegmentAbsData ? data : baseQIndex + data; + return Math.Clamp(segQIndex, 0, QuantCommon.MaxQ); + } + + return baseQIndex; + } + + public void SetupSegmentation(ref Vp9EntropyProbs fc, ref ReadBitBuffer rb) + { + UpdateMap = false; + UpdateData = 0; + + Enabled = rb.ReadBit() != 0; + if (!Enabled) + { + return; + } + + // Segmentation map update + UpdateMap = rb.ReadBit() != 0; + if (UpdateMap) + { + for (int i = 0; i < SegTreeProbs; i++) + { + fc.SegTreeProb[i] = rb.ReadBit() != 0 + ? (byte)rb.ReadLiteral(8) + : (byte)Prob.MaxProb; + } + + TemporalUpdate = rb.ReadBit() != 0; + if (TemporalUpdate) + { + for (int i = 0; i < PredictionProbs; i++) + { + fc.SegPredProb[i] = rb.ReadBit() != 0 + ? 
(byte)rb.ReadLiteral(8) + : (byte)Prob.MaxProb; + } + } + else + { + for (int i = 0; i < PredictionProbs; i++) + { + fc.SegPredProb[i] = Prob.MaxProb; + } + } + } + + // Segmentation data update + UpdateData = (byte)rb.ReadBit(); + if (UpdateData != 0) + { + AbsDelta = (byte)rb.ReadBit(); + + ClearAllSegFeatures(); + + for (int i = 0; i < Constants.MaxSegments; i++) + { + for (int j = 0; j < (int)SegLvlFeatures.Max; j++) + { + int data = 0; + int featureEnabled = rb.ReadBit(); + if (featureEnabled != 0) + { + EnableSegFeature(i, (SegLvlFeatures)j); + data = rb.DecodeUnsignedMax(FeatureDataMax((SegLvlFeatures)j)); + if (IsSegFeatureSigned((SegLvlFeatures)j) != 0) + { + data = rb.ReadBit() != 0 ? -data : data; + } + } + + SetSegData(i, (SegLvlFeatures)j, data); + } + } + } + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs index d5b51bc2f..96ca2dd50 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs @@ -1,33 +1,66 @@ using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; using Ryujinx.Graphics.Video; using System; +using System.Diagnostics; using System.Runtime.InteropServices; namespace Ryujinx.Graphics.Nvdec.Vp9.Types { + internal delegate int VpxGetFrameBufferCbFnT(MemoryAllocator allocator, Ptr cbPriv, + ulong minSize, ref VpxCodecFrameBuffer fb); + internal struct Surface : ISurface { + public const int Innerborderinpixels = 96; + public const int InterpExtend = 4; + public const int EncBorderInPixels = 160; + public const int DecBorderInPixels = 32; + + public const int Yv12FlagHighbitdepth = 8; + public ArrayPtr YBuffer; public ArrayPtr UBuffer; public ArrayPtr VBuffer; - public unsafe Plane YPlane => new Plane((IntPtr)YBuffer.ToPointer(), YBuffer.Length); - public unsafe Plane UPlane => new Plane((IntPtr)UBuffer.ToPointer(), UBuffer.Length); - public unsafe Plane VPlane => new 
Plane((IntPtr)VBuffer.ToPointer(), VBuffer.Length); + public unsafe Plane YPlane => new((IntPtr)YBuffer.ToPointer(), YBuffer.Length); + public unsafe Plane UPlane => new((IntPtr)UBuffer.ToPointer(), UBuffer.Length); + public unsafe Plane VPlane => new((IntPtr)VBuffer.ToPointer(), VBuffer.Length); public FrameField Field => FrameField.Progressive; - public int Width { get; } - public int Height { get; } - public int AlignedWidth { get; } - public int AlignedHeight { get; } - public int Stride { get; } - public int UvWidth { get; } - public int UvHeight { get; } - public int UvAlignedWidth { get; } - public int UvAlignedHeight { get; } - public int UvStride { get; } - public bool HighBd => false; + public int Width { get; private set; } + public int Height { get; private set; } + public int AlignedWidth { get; private set; } + public int AlignedHeight { get; private set; } + public int Stride { get; private set; } + public int UvWidth { get; private set; } + public int UvHeight { get; private set; } + public int UvAlignedWidth { get; private set; } + public int UvAlignedHeight { get; private set; } + public int UvStride { get; private set; } + public bool HighBd { get; private set; } + + public int FrameSize { get; private set; } + public int Border { get; private set; } + + public int YCropWidth => Width; + public int YCropHeight => Height; + public int UvCropWidth => UvWidth; + public int UvCropHeight => UvHeight; + + public ArrayPtr BufferAlloc; + public int BufferAllocSz; + public int SubsamplingX; + public int SubsamplingY; + public uint BitDepth; + public VpxColorSpace ColorSpace; + public VpxColorRange ColorRange; + public int RenderWidth; + public int RenderHeight; + + public int Corrupted; + public int Flags; private readonly IntPtr _pointer; @@ -40,16 +73,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types int alignedWidth = (width + 7) & ~7; int alignedHeight = (height + 7) & ~7; - int yStride = ((alignedWidth + 2 * border) + 31) & ~31; - int yplaneSize = 
(alignedHeight + 2 * border) * yStride; + int yStride = (alignedWidth + (2 * border) + 31) & ~31; + int yplaneSize = (alignedHeight + (2 * border)) * yStride; int uvWidth = alignedWidth >> ssX; int uvHeight = alignedHeight >> ssY; int uvStride = yStride >> ssX; int uvBorderW = border >> ssX; int uvBorderH = border >> ssY; - int uvplaneSize = (uvHeight + 2 * uvBorderH) * uvStride; + int uvplaneSize = (uvHeight + (2 * uvBorderH)) * uvStride; - int frameSize = (highbd ? 2 : 1) * (yplaneSize + 2 * uvplaneSize); + int frameSize = (highbd ? 2 : 1) * (yplaneSize + (2 * uvplaneSize)); IntPtr pointer = Marshal.AllocHGlobal(frameSize); _pointer = pointer; @@ -74,9 +107,134 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types VBuffer = NewPlane(yplaneSize + uvplaneSize, uvplaneSize, (uvBorderH * uvStride) + uvBorderW); } + public unsafe int ReallocFrameBuffer( + MemoryAllocator allocator, + int width, + int height, + int ssX, + int ssY, + bool useHighbitdepth, + int border, + int byteAlignment, + Ptr fb, + VpxGetFrameBufferCbFnT cb, + Ptr cbPriv) + { + int byteAlign = byteAlignment == 0 ? 1 : byteAlignment; // TODO: Is it safe to ignore the alignment? + int alignedWidth = (width + 7) & ~7; + int alignedHeight = (height + 7) & ~7; + int yStride = (alignedWidth + (2 * border) + 31) & ~31; + ulong yplaneSize = + ((ulong)(alignedHeight + (2 * border)) * (ulong)yStride) + (ulong)byteAlignment; + int uvWidth = alignedWidth >> ssX; + int uvHeight = alignedHeight >> ssY; + int uvStride = yStride >> ssX; + int uvBorderW = border >> ssX; + int uvBorderH = border >> ssY; + ulong uvplaneSize = + ((ulong)(uvHeight + (2 * uvBorderH)) * (ulong)uvStride) + (ulong)byteAlignment; + + ulong frameSize = (ulong)(1 + (useHighbitdepth ? 1 : 0)) * (yplaneSize + (2 * uvplaneSize)); + + ArrayPtr buf = ArrayPtr.Null; + + // frame_size is stored in buffer_alloc_sz, which is an int. If it won't + // fit, fail early. 
+ if (frameSize > int.MaxValue) + { + return -1; + } + + if (cb != null) + { + const int alignAddrExtraSize = 31; + ulong externalFrameSize = frameSize + alignAddrExtraSize; + + Debug.Assert(!fb.IsNull); + + // Allocation to hold larger frame, or first allocation. + if (cb(allocator, cbPriv, externalFrameSize, ref fb.Value) < 0) + { + return -1; + } + + if (fb.Value.Data.IsNull || (ulong)fb.Value.Data.Length < externalFrameSize) + { + return -1; + } + + BufferAlloc = fb.Value.Data; + } + else if (frameSize > (ulong)BufferAllocSz) + { + // Allocation to hold larger frame, or first allocation. + allocator.Free(BufferAlloc); + BufferAlloc = ArrayPtr.Null; + + BufferAlloc = allocator.Allocate((int)frameSize); + if (BufferAlloc.IsNull) + { + return -1; + } + + BufferAllocSz = (int)frameSize; + + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + MemoryUtil.Fill(BufferAlloc.ToPointer(), (byte)0, BufferAllocSz); + } + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). 
*/ + if ((border & 0x1f) != 0) + { + return -3; + } + + Width = width; + Height = height; + AlignedWidth = alignedWidth; + AlignedHeight = alignedHeight; + Stride = yStride; + + UvWidth = (width + ssX) >> ssX; + UvHeight = (height + ssY) >> ssY; + UvAlignedWidth = uvWidth; + UvAlignedHeight = uvHeight; + UvStride = uvStride; + + Border = border; + FrameSize = (int)frameSize; + SubsamplingX = ssX; + SubsamplingY = ssY; + + buf = BufferAlloc; + if (useHighbitdepth) + { + // Store uint16 addresses when using 16bit framebuffers + buf = BufferAlloc; + Flags = Yv12FlagHighbitdepth; + } + else + { + Flags = 0; + } + + YBuffer = buf.Slice((border * yStride) + border); + UBuffer = buf.Slice((int)yplaneSize + (uvBorderH * uvStride) + uvBorderW); + VBuffer = buf.Slice((int)yplaneSize + (int)uvplaneSize + (uvBorderH * uvStride) + uvBorderW); + + Corrupted = 0; /* assume not corrupted by errors */ + return 0; + } + public void Dispose() { Marshal.FreeHGlobal(_pointer); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs index 67289c47d..3b16018b7 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs @@ -55,7 +55,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types private static int GetMinLog2TileCols(int sb64Cols) { int minLog2 = 0; - while ((MaxTileWidthB64 << minLog2) < sb64Cols) + while (MaxTileWidthB64 << minLog2 < sb64Cols) { ++minLog2; } @@ -66,7 +66,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types private static int GetMaxLog2TileCols(int sb64Cols) { int maxLog2 = 1; - while ((sb64Cols >> maxLog2) >= MinTileWidthB64) + while (sb64Cols >> maxLog2 >= MinTileWidthB64) { ++maxLog2; } @@ -74,7 +74,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types return maxLog2 - 1; } - public static void GetTileNBits(int miCols, ref int minLog2TileCols, ref int maxLog2TileCols) + public static void GetTileNBits(int miCols, out int 
minLog2TileCols, out int maxLog2TileCols) { int sb64Cols = MiColsAlignedToSb(miCols) >> Constants.MiBlockSizeLog2; minLog2TileCols = GetMinLog2TileCols(sb64Cols); @@ -82,4 +82,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types Debug.Assert(minLog2TileCols <= maxLog2TileCols); } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs index db914525c..a0329d6c7 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs @@ -2,11 +2,11 @@ { public enum TxMode { - Only4X4 = 0, // Only 4x4 transform used - Allow8X8 = 1, // Allow block transform size up to 8x8 - Allow16X16 = 2, // Allow block transform size up to 16x16 - Allow32X32 = 3, // Allow block transform size up to 32x32 - TxModeSelect = 4, // Transform specified for each block - TxModes = 5 + Only4x4, // Only 4x4 transform used + Allow8x8, // Allow block transform size up to 8x8 + Allow16x16, // Allow block transform size up to 16x16 + Allow32x32, // Allow block transform size up to 32x32 + TxModeSelect, // Transform specified for each block + TxModes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs index 994deb2c3..a58154fe6 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs @@ -2,10 +2,10 @@ { public enum TxSize { - Tx4x4 = 0, // 4x4 transform - Tx8x8 = 1, // 8x8 transform - Tx16x16 = 2, // 16x16 transform - Tx32x32 = 3, // 32x32 transform - TxSizes = 4 + Tx4x4, // 4x4 transform + Tx8x8, // 8x8 transform + Tx16x16, // 16x16 transform + Tx32x32, // 32x32 transform + TxSizes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs index dbf7251cd..60c5c9a44 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs +++ 
b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs @@ -2,10 +2,10 @@ { internal enum TxType { - DctDct = 0, // DCT in both horizontal and vertical - AdstDct = 1, // ADST in vertical, DCT in horizontal - DctAdst = 2, // DCT in vertical, ADST in horizontal - AdstAdst = 3, // ADST in both directions - TxTypes = 4 + DctDct, // DCT in both horizontal and vertical + AdstDct, // ADST in vertical, DCT in horizontal + DctAdst, // DCT in vertical, ADST in horizontal + AdstAdst, // ADST in both directions + TxTypes } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs index faadd3498..aa567927b 100644 --- a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs @@ -1,6 +1,8 @@ using Ryujinx.Common.Memory; using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Nvdec.Vp9.Dsp; using Ryujinx.Graphics.Video; +using System; namespace Ryujinx.Graphics.Nvdec.Vp9.Types { @@ -9,27 +11,62 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public MacroBlockD Mb; public ArrayPtr TileWorkerData; + public int TotalTiles; public InternalErrorInfo Error; + public VpxColorSpace ColorSpace; + public VpxColorRange ColorRange; + public int Width; public int Height; + public int RenderWidth; + public int RenderHeight; + + public int LastWidth; + public int LastHeight; + public int SubsamplingX; public int SubsamplingY; + public bool UseHighBitDepth; + public ArrayPtr PrevFrameMvs; public ArrayPtr CurFrameMvs; + public Ptr FrameToShow; + public Ptr PrevFrame; + + public Ptr CurFrame; + + public Array8 RefFrameMap; /* maps fb_idx to reference slot */ + + // Prepare ref_frame_map for the next frame. + // Only used in frame parallel decode. 
+ public Array8 NextRefFrameMap; + public Array3 FrameRefs; + public int NewFbIdx; + + public int CurShowFrameFbIdx; + + public FrameType LastFrameType; public FrameType FrameType; + public int ShowFrame; + public int LastShowFrame; + public int ShowExistingFrame; + // Flag signaling that the frame is encoded using only Intra modes. public bool IntraOnly; + public bool LastIntraOnly; public bool AllowHighPrecisionMv; + public int ResetFrameContext; + // MBs, MbRows/Cols is in 16-pixel units; MiRows/Cols is in // ModeInfo (8-pixel) units. public int MBs; @@ -49,8 +86,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types /* We allocate a ModeInfo struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. */ + public int MiAllocSize; public ArrayPtr Mip; /* Base of allocated array */ - public ArrayPtr Mi; /* Corresponds to upper left visible macroblock */ + public ArrayPtr Mi; /* Corresponds to upper left visible macroblock */ + + // prev_mip and prev_mi will only be allocated in VP9 encoder. 
+ public Ptr PrevMip; /* MODE_INFO array 'mip' from last decoded frame */ + public Ptr PrevMi; /* 'mi' from last frame (points into prev_mip) */ public ArrayPtr> MiGridBase; public ArrayPtr> MiGridVisible; @@ -70,6 +112,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public LoopFilterInfoN LfInfo; + public int RefreshFrameContext; /* Two state 0 = NO, 1 = YES */ + public Array4 RefFrameSignBias; /* Two state 0, 1 */ public LoopFilter Lf; @@ -81,10 +125,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public ReferenceMode ReferenceMode; public Ptr Fc; + public ArrayPtr FrameContexts; // FRAME_CONTEXTS + public uint FrameContextIdx; /* Context to use/update */ public Ptr Counts; + public uint CurrentVideoFrame; + public BitstreamProfile Profile; + + public BitDepth BitDepth; + public BitDepth DequantBitDepth; // bit_depth of current dequantizer + + public int ErrorResilientMode; + public int FrameParallelDecodingMode; + public int Log2TileCols, Log2TileRows; + public int ByteAlignment; + public int SkipLoopFilter; + + public Ptr BufferPool; + public ArrayPtr AboveSegContext; public ArrayPtr AboveContext; @@ -95,8 +155,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public bool CompoundReferenceAllowed() { - int i; - for (i = 1; i < Constants.RefsPerFrame; ++i) + for (int i = 1; i < Constants.RefsPerFrame; ++i) { if (RefFrameSignBias[i + 1] != RefFrameSignBias[1]) { @@ -107,6 +166,47 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types return false; } + public ref Surface GetFrameNewBuffer() + { + return ref BufferPool.Value.FrameBufs[NewFbIdx].Buf; + } + + public int GetFreeFb() + { + ref Array12 frameBufs = ref BufferPool.Value.FrameBufs; + + int i; + + for (i = 0; i < Constants.FrameBuffers; ++i) + { + if (frameBufs[i].RefCount == 0) + { + break; + } + } + + if (i != Constants.FrameBuffers) + { + frameBufs[i].RefCount = 1; + } + else + { + // Reset i to be INVALID_IDX to indicate no free buffer found. 
+ i = RefBuffer.InvalidIdx; + } + + return i; + } + + public void SwapCurrentAndLastSegMap() + { + // Swap indices. + (SegMapIdx, PrevSegMapIdx) = (PrevSegMapIdx, SegMapIdx); + + CurrentFrameSegMap = SegMapArray[SegMapIdx]; + LastFrameSegMap = SegMapArray[PrevSegMapIdx]; + } + private static int CalcMiSize(int len) { // Len is in mi units. @@ -129,7 +229,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows, int maxThreads) { - TileWorkerData = allocator.Allocate(tileCols * tileRows + (maxThreads > 1 ? maxThreads : 0)); + TileWorkerData = + allocator.Allocate((tileCols * tileRows) + (maxThreads > 1 ? maxThreads : 0)); } public void FreeTileWorkerData(MemoryAllocator allocator) @@ -139,9 +240,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types private void AllocSegMap(MemoryAllocator allocator, int segMapSize) { - int i; - - for (i = 0; i < Constants.NumPingPongBuffers; ++i) + for (int i = 0; i < Constants.NumPingPongBuffers; ++i) { SegMapArray[i] = allocator.Allocate(segMapSize); } @@ -156,9 +255,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types private void FreeSegMap(MemoryAllocator allocator) { - int i; - - for (i = 0; i < Constants.NumPingPongBuffers; ++i) + for (int i = 0; i < Constants.NumPingPongBuffers; ++i) { allocator.Free(SegMapArray[i]); SegMapArray[i] = ArrayPtr.Null; @@ -194,6 +291,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types Lf.Lfm = ArrayPtr.Null; allocator.Free(CurFrameMvs); CurFrameMvs = ArrayPtr.Null; + if (UsePrevFrameMvs) { allocator.Free(PrevFrameMvs); @@ -209,7 +307,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types Lf.Lfm = allocator.Allocate(((MiRows + (Constants.MiBlockSize - 1)) >> 3) * Lf.LfmStride); } - public void AllocContextBuffers(MemoryAllocator allocator, int width, int height) + public bool AllocContextBuffers(MemoryAllocator allocator, int width, int height) { SetMbMi(width, height); int newMiSize = MiStride * CalcMiSize(MiRows); @@ -239,6 +337,8 @@ 
namespace Ryujinx.Graphics.Nvdec.Vp9.Types { PrevFrameMvs = allocator.Allocate(MiRows * MiCols); } + + return false; } private unsafe void DecSetupMi() @@ -266,9 +366,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types internal void InitMacroBlockD(ref MacroBlockD xd, ArrayPtr dqcoeff) { - int i; - - for (i = 0; i < Constants.MaxMbPlane; ++i) + for (int i = 0; i < Constants.MaxMbPlane; ++i) { xd.Plane[i].DqCoeff = dqcoeff; xd.AboveContext[i] = AboveContext.Slice(i * 2 * TileInfo.MiColsAlignedToSb(MiCols)); @@ -281,6 +379,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types { MemoryUtil.Copy(ref xd.Plane[i].SegDequant, ref UvDequant); } + xd.Fc = new Ptr(ref Fc.Value); } @@ -293,29 +392,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types public void SetupSegmentationDequant() { - const BitDepth bitDepth = BitDepth.Bits8; // TODO: Configurable // Build y/uv dequant values based on segmentation. if (Seg.Enabled) { - int i; - for (i = 0; i < Constants.MaxSegments; ++i) + for (int i = 0; i < Constants.MaxSegments; ++i) { - int qIndex = QuantCommon.GetQIndex(ref Seg, i, BaseQindex); - YDequant[i][0] = QuantCommon.DcQuant(qIndex, YDcDeltaQ, bitDepth); - YDequant[i][1] = QuantCommon.AcQuant(qIndex, 0, bitDepth); - UvDequant[i][0] = QuantCommon.DcQuant(qIndex, UvDcDeltaQ, bitDepth); - UvDequant[i][1] = QuantCommon.AcQuant(qIndex, UvAcDeltaQ, bitDepth); + int qindex = Seg.GetQIndex(i, BaseQindex); + YDequant[i][0] = QuantCommon.DcQuant(qindex, YDcDeltaQ, BitDepth); + YDequant[i][1] = QuantCommon.AcQuant(qindex, 0, BitDepth); + UvDequant[i][0] = QuantCommon.DcQuant(qindex, UvDcDeltaQ, BitDepth); + UvDequant[i][1] = QuantCommon.AcQuant(qindex, UvAcDeltaQ, BitDepth); } } else { - int qIndex = BaseQindex; + int qindex = BaseQindex; // When segmentation is disabled, only the first value is used. The // remaining are don't cares. 
- YDequant[0][0] = QuantCommon.DcQuant(qIndex, YDcDeltaQ, bitDepth); - YDequant[0][1] = QuantCommon.AcQuant(qIndex, 0, bitDepth); - UvDequant[0][0] = QuantCommon.DcQuant(qIndex, UvDcDeltaQ, bitDepth); - UvDequant[0][1] = QuantCommon.AcQuant(qIndex, UvAcDeltaQ, bitDepth); + YDequant[0][0] = QuantCommon.DcQuant(qindex, YDcDeltaQ, BitDepth); + YDequant[0][1] = QuantCommon.AcQuant(qindex, 0, BitDepth); + UvDequant[0][0] = QuantCommon.DcQuant(qindex, UvDcDeltaQ, BitDepth); + UvDequant[0][1] = QuantCommon.AcQuant(qindex, UvAcDeltaQ, BitDepth); } } @@ -327,5 +424,576 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types refBuf.Sf.SetupScaleFactorsForFrame(refBuf.Buf.Width, refBuf.Buf.Height, Width, Height); } } + + public void ReadFrameReferenceModeProbs(ref Reader r) + { + ref Vp9EntropyProbs fc = ref Fc.Value; + + + if (ReferenceMode == ReferenceMode.Select) + { + for (int i = 0; i < Constants.CompInterContexts; ++i) + { + r.DiffUpdateProb(ref fc.CompInterProb[i]); + } + } + + if (ReferenceMode != ReferenceMode.Compound) + { + for (int i = 0; i < Constants.RefContexts; ++i) + { + r.DiffUpdateProb(ref fc.SingleRefProb[i][0]); + r.DiffUpdateProb(ref fc.SingleRefProb[i][1]); + } + } + + if (ReferenceMode != ReferenceMode.Single) + { + for (int i = 0; i < Constants.RefContexts; ++i) + { + r.DiffUpdateProb(ref fc.CompRefProb[i]); + } + } + } + + public ReferenceMode ReadFrameReferenceMode(ref Reader r) + { + if (CompoundReferenceAllowed()) + { + return r.ReadBit() != 0 + ? r.ReadBit() != 0 ? 
ReferenceMode.Select : ReferenceMode.Compound + : ReferenceMode.Single; + } + + return ReferenceMode.Single; + } + + /* Chooses CompFixedRef and the two CompVarRef entries by comparing the RefFrameSignBias values of the Last, Golden and AltRef frames. */ public void SetupCompoundReferenceMode() + { + if (RefFrameSignBias[Constants.LastFrame] == RefFrameSignBias[Constants.GoldenFrame]) + { + CompFixedRef = Constants.AltRefFrame; + CompVarRef[0] = Constants.LastFrame; + CompVarRef[1] = Constants.GoldenFrame; + } + else if (RefFrameSignBias[Constants.LastFrame] == RefFrameSignBias[Constants.AltRefFrame]) + { + CompFixedRef = Constants.GoldenFrame; + CompVarRef[0] = Constants.LastFrame; + CompVarRef[1] = Constants.AltRefFrame; + } + else + { + CompFixedRef = Constants.LastFrame; + CompVarRef[0] = Constants.GoldenFrame; + CompVarRef[1] = Constants.AltRefFrame; + } + } + + /* Writes hard-coded initial motion-vector probabilities into Fc: the three joint entries first, then the sign/classes/class0/bits/class0Fp/fp/class0Hp/hp entries for index 0 and for index 1. */ public void InitMvProbs() + { + Fc.Value.Joints[0] = 32; + Fc.Value.Joints[1] = 64; + Fc.Value.Joints[2] = 96; + + Fc.Value.Sign[0] = 128; + Fc.Value.Classes[0][0] = 224; + Fc.Value.Classes[0][1] = 144; + Fc.Value.Classes[0][2] = 192; + Fc.Value.Classes[0][3] = 168; + Fc.Value.Classes[0][4] = 192; + Fc.Value.Classes[0][5] = 176; + Fc.Value.Classes[0][6] = 192; + Fc.Value.Classes[0][7] = 198; + Fc.Value.Classes[0][8] = 198; + Fc.Value.Classes[0][9] = 245; + Fc.Value.Class0[0][0] = 216; + Fc.Value.Bits[0][0] = 136; + Fc.Value.Bits[0][1] = 140; + Fc.Value.Bits[0][2] = 148; + Fc.Value.Bits[0][3] = 160; + Fc.Value.Bits[0][4] = 176; + Fc.Value.Bits[0][5] = 192; + Fc.Value.Bits[0][6] = 224; + Fc.Value.Bits[0][7] = 234; + Fc.Value.Bits[0][8] = 234; + Fc.Value.Bits[0][9] = 240; + Fc.Value.Class0Fp[0][0][0] = 128; + Fc.Value.Class0Fp[0][0][1] = 128; + Fc.Value.Class0Fp[0][0][2] = 64; + Fc.Value.Class0Fp[0][1][0] = 96; + Fc.Value.Class0Fp[0][1][1] = 112; + Fc.Value.Class0Fp[0][1][2] = 64; + Fc.Value.Fp[0][0] = 64; + Fc.Value.Fp[0][1] = 96; + Fc.Value.Fp[0][2] = 64; + Fc.Value.Class0Hp[0] = 160; + Fc.Value.Hp[0] = 128; + + Fc.Value.Sign[1] = 128; + Fc.Value.Classes[1][0] = 216; + Fc.Value.Classes[1][1] = 128; + Fc.Value.Classes[1][2] = 176; + 
Fc.Value.Classes[1][3] = 160; + Fc.Value.Classes[1][4] = 176; + Fc.Value.Classes[1][5] = 176; + Fc.Value.Classes[1][6] = 192; + Fc.Value.Classes[1][7] = 198; + Fc.Value.Classes[1][8] = 198; + Fc.Value.Classes[1][9] = 208; + Fc.Value.Class0[1][0] = 208; + Fc.Value.Bits[1][0] = 136; + Fc.Value.Bits[1][1] = 140; + Fc.Value.Bits[1][2] = 148; + Fc.Value.Bits[1][3] = 160; + Fc.Value.Bits[1][4] = 176; + Fc.Value.Bits[1][5] = 192; + Fc.Value.Bits[1][6] = 224; + Fc.Value.Bits[1][7] = 234; + Fc.Value.Bits[1][8] = 234; + Fc.Value.Bits[1][9] = 240; + Fc.Value.Class0Fp[1][0][0] = 128; + Fc.Value.Class0Fp[1][0][1] = 128; + Fc.Value.Class0Fp[1][0][2] = 64; + Fc.Value.Class0Fp[1][1][0] = 96; + Fc.Value.Class0Fp[1][1][1] = 112; + Fc.Value.Class0Fp[1][1][2] = 64; + Fc.Value.Fp[1][0] = 64; + Fc.Value.Fp[1][1] = 96; + Fc.Value.Fp[1][2] = 64; + Fc.Value.Class0Hp[1] = 160; + Fc.Value.Hp[1] = 128; + } + + /* Merges the motion-vector probabilities of the saved frame context (FrameContexts[FrameContextIdx]) with the gathered Counts and stores the results in Fc. The Class0Hp and Hp entries are only updated when allowHp is true. */ public void AdaptMvProbs(bool allowHp) + { + ref Vp9EntropyProbs fc = ref Fc.Value; + ref Vp9EntropyProbs preFc = ref FrameContexts[(int)FrameContextIdx]; + ref Vp9BackwardUpdates counts = ref Counts.Value; + + Prob.VpxTreeMergeProbs( + EntropyMv.JointTree, + preFc.Joints.AsSpan(), + counts.Joints.AsSpan(), + fc.Joints.AsSpan()); + + for (int i = 0; i < 2; ++i) + { + fc.Sign[i] = Prob.ModeMvMergeProbs(preFc.Sign[i], ref counts.Sign[i]); + Prob.VpxTreeMergeProbs( + EntropyMv.ClassTree, + preFc.Classes[i].AsSpan(), + counts.Classes[i].AsSpan(), + fc.Classes[i].AsSpan()); + Prob.VpxTreeMergeProbs( + EntropyMv.Class0Tree, + preFc.Class0[i].AsSpan(), + counts.Class0[i].AsSpan(), + fc.Class0[i].AsSpan()); + + for (int j = 0; j < EntropyMv.OffsetBits; ++j) + { + fc.Bits[i][j] = Prob.ModeMvMergeProbs(preFc.Bits[i][j], ref counts.Bits[i][j]); + } + + for (int j = 0; j < EntropyMv.Class0Size; ++j) + { + Prob.VpxTreeMergeProbs( + EntropyMv.FpTree, + preFc.Class0Fp[i][j].AsSpan(), + counts.Class0Fp[i][j].AsSpan(), + fc.Class0Fp[i][j].AsSpan()); + } + + Prob.VpxTreeMergeProbs(EntropyMv.FpTree, 
preFc.Fp[i].AsSpan(), counts.Fp[i].AsSpan(), + fc.Fp[i].AsSpan()); + + if (allowHp) + { + fc.Class0Hp[i] = Prob.ModeMvMergeProbs(preFc.Class0Hp[i], ref counts.Class0Hp[i]); + fc.Hp[i] = Prob.ModeMvMergeProbs(preFc.Hp[i], ref counts.Hp[i]); + } + } + } + + /* When width/height differ from the stored Width/Height: reallocates through AllocContextBuffers if the new mode-info grid is larger in either dimension (a true return is treated as failure), otherwise only calls SetMbMi; then re-initializes the context buffers and records the new size. Afterwards resizes the MV buffer when CurFrameMvs is null or MiRows/MiCols exceed the values recorded on CurFrame. */ public void ResizeContextBuffers(MemoryAllocator allocator, int width, int height) + { + if (Width != width || Height != height) + { + int newMiRows = BitUtils.AlignPowerOfTwo(height, Constants.MiSizeLog2) >> Constants.MiSizeLog2; + int newMiCols = BitUtils.AlignPowerOfTwo(width, Constants.MiSizeLog2) >> Constants.MiSizeLog2; + + // Allocations in AllocContextBuffers() depend on individual + // dimensions as well as the overall size. + if (newMiCols > MiCols || newMiRows > MiRows) + { + if (AllocContextBuffers(allocator, width, height)) + { + // The Mi* values have been cleared and any existing context + // buffers have been freed. Clear Width and Height to be + // consistent and to force a realloc next time. + Width = 0; + Height = 0; + Error.InternalError(CodecErr.MemError, "Failed to allocate context buffers"); + } + } + else + { + SetMbMi(width, height); + } + + InitContextBuffers(); + Width = width; + Height = height; + } + + if (CurFrameMvs.IsNull || + MiRows > CurFrame.Value.MiRows || + MiCols > CurFrame.Value.MiCols) + { + ResizeMvBuffer(allocator); + } + } + + /* Stores expr in lval and reports CodecErr.MemError through Error.InternalError when the stored value is null. */ public void CheckMemError(ref ArrayPtr lval, ArrayPtr expr) + where T : unmanaged + { + lval = expr; + if (lval.IsNull) + { + Error.InternalError(CodecErr.MemError, "Failed to allocate"); + } + } + + /* Frees CurFrameMvs, records the current MiRows/MiCols on CurFrame and allocates a new buffer of MiRows * MiCols entries. */ private void ResizeMvBuffer(MemoryAllocator allocator) + { + allocator.Free(CurFrameMvs); + CurFrame.Value.MiRows = MiRows; + CurFrame.Value.MiCols = MiCols; + CheckMemError(ref CurFrameMvs, allocator.Allocate(MiRows * MiCols)); + } + + /* Ptr overload: same assignment and null check as the ArrayPtr overload. */ public void CheckMemError(ref Ptr lval, Ptr expr) where T : unmanaged + { + lval = expr; + if (lval.IsNull) + { + Error.InternalError(CodecErr.MemError, "Failed to allocate"); + } + } + + /* Reads the tile layout from rb. Log2TileCols starts at the minimum returned by TileInfo.GetTileNBits and is incremented once per set bit, reading at most (max - min) bits; a value above 6 is reported as CodecErr.CorruptFrame. Log2TileRows is one bit, plus a second bit when the first is set. */ public void SetupTileInfo(ref 
ReadBitBuffer rb) + { + int minLog2TileCols = 0, maxLog2TileCols = 0, maxOnes; + TileInfo.GetTileNBits(MiCols, out minLog2TileCols, out maxLog2TileCols); + + // columns + maxOnes = maxLog2TileCols - minLog2TileCols; + Log2TileCols = minLog2TileCols; + while (maxOnes-- != 0 && rb.ReadBit() != 0) + { + Log2TileCols++; + } + + if (Log2TileCols > 6) + { + Error.InternalError(CodecErr.CorruptFrame, "Invalid number of tile columns"); + } + + // rows + Log2TileRows = rb.ReadBit(); + if (Log2TileRows != 0) + { + Log2TileRows += rb.ReadBit(); + } + } + + /* Reads bit depth (only for Profile2 and above; otherwise 8 bits), color space, color range and chroma subsampling from rb, reporting CodecErr.UnsupBitstream for the combinations rejected below. */ public void ReadBitdepthColorspaceSampling(ref ReadBitBuffer rb) + { + if (Profile >= BitstreamProfile.Profile2) + { + BitDepth = rb.ReadBit() != 0 ? BitDepth.Bits12 : BitDepth.Bits10; + UseHighBitDepth = true; + } + else + { + BitDepth = BitDepth.Bits8; + UseHighBitDepth = false; + } + + ColorSpace = (VpxColorSpace)rb.ReadLiteral(3); + if (ColorSpace != VpxColorSpace.Srgb) + { + ColorRange = (VpxColorRange)rb.ReadBit(); + if (Profile == BitstreamProfile.Profile1 || Profile == BitstreamProfile.Profile3) + { + SubsamplingX = rb.ReadBit(); + SubsamplingY = rb.ReadBit(); + if (SubsamplingX == 1 && SubsamplingY == 1) + { + Error.InternalError(CodecErr.UnsupBitstream, + "4:2:0 color not supported in profile 1 or 3"); + } + + if (rb.ReadBit() != 0) + { + Error.InternalError(CodecErr.UnsupBitstream, "Reserved bit set"); + } + } + else + { + SubsamplingY = SubsamplingX = 1; + } + } + else + { + ColorRange = VpxColorRange.Full; + if (Profile == BitstreamProfile.Profile1 || Profile == BitstreamProfile.Profile3) + { + // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed. + // 4:2:2 or 4:4:0 chroma sampling is not allowed. 
+ SubsamplingY = SubsamplingX = 0; + if (rb.ReadBit() != 0) + { + Error.InternalError(CodecErr.UnsupBitstream, "Reserved bit set"); + } + } + else + { + Error.InternalError(CodecErr.UnsupBitstream, "4:4:4 color not supported in profile 0 or 2"); + } + } + } + + /* Merges each mode-related probability table of the saved frame context (FrameContexts[FrameContextIdx]) with its Counts entry and stores the result in Fc. The switchable-interp tables are only adapted when InterpFilter is Constants.Switchable, and the tx-size tables only when TxMode is TxModeSelect. */ public void AdaptModeProbs() + { + ref Vp9EntropyProbs fc = ref Fc.Value; + ref Vp9EntropyProbs preFc = ref FrameContexts[(int)FrameContextIdx]; + ref Vp9BackwardUpdates counts = ref Counts.Value; + + for (int i = 0; i < Constants.IntraInterContexts; i++) + { + fc.IntraInterProb[i] = Prob.ModeMvMergeProbs(preFc.IntraInterProb[i], ref counts.IntraInter[i]); + } + + for (int i = 0; i < Constants.CompInterContexts; i++) + { + fc.CompInterProb[i] = Prob.ModeMvMergeProbs(preFc.CompInterProb[i], ref counts.CompInter[i]); + } + + for (int i = 0; i < Constants.RefContexts; i++) + { + fc.CompRefProb[i] = Prob.ModeMvMergeProbs(preFc.CompRefProb[i], ref counts.CompRef[i]); + } + + for (int i = 0; i < Constants.RefContexts; i++) + { + for (int j = 0; j < 2; j++) + { + fc.SingleRefProb[i][j] = + Prob.ModeMvMergeProbs(preFc.SingleRefProb[i][j], ref counts.SingleRef[i][j]); + } + } + + for (int i = 0; i < Constants.InterModeContexts; i++) + { + Prob.VpxTreeMergeProbs( + EntropyMode.InterModeTree, + preFc.InterModeProb[i].AsSpan(), + counts.InterMode[i].AsSpan(), + fc.InterModeProb[i].AsSpan()); + } + + for (int i = 0; i < EntropyMode.BlockSizeGroups; i++) + { + Prob.VpxTreeMergeProbs( + EntropyMode.IntraModeTree, + preFc.YModeProb[i].AsSpan(), + counts.YMode[i].AsSpan(), + fc.YModeProb[i].AsSpan()); + } + + for (int i = 0; i < Constants.IntraModes; ++i) + { + Prob.VpxTreeMergeProbs( + EntropyMode.IntraModeTree, + preFc.UvModeProb[i].AsSpan(), + counts.UvMode[i].AsSpan(), + fc.UvModeProb[i].AsSpan()); + } + + for (int i = 0; i < Constants.PartitionContexts; i++) + { + Prob.VpxTreeMergeProbs( + EntropyMode.PartitionTree, + preFc.PartitionProb[i].AsSpan(), + counts.Partition[i].AsSpan(), + fc.PartitionProb[i].AsSpan()); + } + 
+ if (InterpFilter == Constants.Switchable) + { + for (int i = 0; i < Constants.SwitchableFilterContexts; i++) + { + Prob.VpxTreeMergeProbs( + EntropyMode.SwitchableInterpTree, + preFc.SwitchableInterpProb[i].AsSpan(), + counts.SwitchableInterp[i].AsSpan(), + fc.SwitchableInterpProb[i].AsSpan()); + } + } + + if (TxMode == TxMode.TxModeSelect) + { + Array1> branchCt8x8P = new(); + Array2> branchCt16x16P = new(); + Array3> branchCt32x32P = new(); + + for (int i = 0; i < EntropyMode.TxSizeContexts; ++i) + { + EntropyMode.TxCountsToBranchCounts8x8(counts.Tx8x8[i].AsSpan(), ref branchCt8x8P); + for (int j = 0; j < (int)TxSize.TxSizes - 3; ++j) + { + fc.Tx8x8Prob[i][j] = Prob.ModeMvMergeProbs(preFc.Tx8x8Prob[i][j], ref branchCt8x8P[j]); + } + + EntropyMode.TxCountsToBranchCounts16x16(counts.Tx16x16[i].AsSpan(), ref branchCt16x16P); + for (int j = 0; j < (int)TxSize.TxSizes - 2; ++j) + { + fc.Tx16x16Prob[i][j] = + Prob.ModeMvMergeProbs(preFc.Tx16x16Prob[i][j], ref branchCt16x16P[j]); + } + + EntropyMode.TxCountsToBranchCounts32x32(counts.Tx32x32[i].AsSpan(), ref branchCt32x32P); + for (int j = 0; j < (int)TxSize.TxSizes - 1; ++j) + { + fc.Tx32x32Prob[i][j] = + Prob.ModeMvMergeProbs(preFc.Tx32x32Prob[i][j], ref branchCt32x32P[j]); + } + } + } + + for (int i = 0; i < Constants.SkipContexts; ++i) + { + fc.SkipProb[i] = Prob.ModeMvMergeProbs(preFc.SkipProb[i], ref counts.Skip[i]); + } + } + + /* Picks countSat/updateFactor from the Entropy constants depending on whether the frame is intra-only, follows a key frame, or neither, then runs the per-size AdaptCoefProbs overload for every transform size from Tx4x4 to Tx32x32. */ public void AdaptCoefProbs() + { + byte t; + uint countSat, updateFactor; + + if (FrameIsIntraOnly()) + { + updateFactor = Entropy.CoefMaxUpdateFactorKey; + countSat = Entropy.CoefCountSatKey; + } + else if (LastFrameType == FrameType.KeyFrame) + { + updateFactor = Entropy.CoefMaxUpdateFactorAfterKey; /* adapt quickly */ + countSat = Entropy.CoefCountSatAfterKey; + } + else + { + updateFactor = Entropy.CoefMaxUpdateFactor; + countSat = Entropy.CoefCountSat; + } + + for (t = (int)TxSize.Tx4x4; t <= (int)TxSize.Tx32x32; t++) + { + AdaptCoefProbs(t, countSat, updateFactor); + } + } + + 
/* Copies the two motion vectors and two reference-frame values of every element of mvs into PrevFrameMvs; throws ArgumentException when mvs is longer than PrevFrameMvs. */ public void SetMvs(ReadOnlySpan mvs) + { + if (mvs.Length > PrevFrameMvs.Length) + { + throw new ArgumentException( + $"Size mismatch, expected: {PrevFrameMvs.Length}, but got: {mvs.Length}."); + } + + for (int i = 0; i < mvs.Length; i++) + { + ref MvRef mv = ref PrevFrameMvs[i]; + + mv.Mv[0].Row = mvs[i].Mvs[0].Row; + mv.Mv[0].Col = mvs[i].Mvs[0].Col; + mv.Mv[1].Row = mvs[i].Mvs[1].Row; + mv.Mv[1].Col = mvs[i].Mvs[1].Col; + + mv.RefFrame[0] = (sbyte)mvs[i].RefFrames[0]; + mv.RefFrame[1] = (sbyte)mvs[i].RefFrames[1]; + } + } + + /* Copies the two motion vectors and two reference-frame values of CurFrameMvs into every element of mvs; throws ArgumentException when mvs is longer than CurFrameMvs. */ public void GetMvs(Span mvs) + { + if (mvs.Length > CurFrameMvs.Length) + { + throw new ArgumentException( + $"Size mismatch, expected: {CurFrameMvs.Length}, but got: {mvs.Length}."); + } + + for (int i = 0; i < mvs.Length; i++) + { + ref MvRef mv = ref CurFrameMvs[i]; + + mvs[i].Mvs[0].Row = mv.Mv[0].Row; + mvs[i].Mvs[0].Col = mv.Mv[0].Col; + mvs[i].Mvs[1].Row = mv.Mv[1].Row; + mvs[i].Mvs[1].Col = mv.Mv[1].Col; + + mvs[i].RefFrames[0] = mv.RefFrame[0]; + mvs[i].RefFrames[1] = mv.RefFrame[1]; + } + } + + /* Adapts the coefficient probabilities of one transform size. For each (plane type, ref type, band, context) it builds three branch-count pairs from the token counts and the EOB-branch count, then merges each pair with the saved frame context's probability via Prob.MergeProbs and stores the result in Fc. */ private void AdaptCoefProbs(byte txSize, uint countSat, uint updateFactor) + { + ref Vp9EntropyProbs preFc = ref FrameContexts[(int)FrameContextIdx]; + ref Array2>>>> probs = ref Fc.Value.CoefProbs[txSize]; + ref Array2>>>> preProbs = ref preFc.CoefProbs[txSize]; + ref Array2>>>> counts = ref Counts.Value.Coef[txSize]; + ref Array2>>> eobCounts = ref Counts.Value.EobBranch[txSize]; + + for (int i = 0; i < Constants.PlaneTypes; ++i) + { + for (int j = 0; j < Entropy.RefTypes; ++j) + { + for (int k = 0; k < Entropy.CoefBands; ++k) + { + for (int l = 0; l < Entropy.BAND_COEFF_CONTEXTS(k); ++l) + { + int n0 = (int)counts[i][j][k][l][Entropy.ZeroToken]; + int n1 = (int)counts[i][j][k][l][Entropy.OneToken]; + int n2 = (int)counts[i][j][k][l][Entropy.TwoToken]; + int neob = (int)counts[i][j][k][l][Entropy.EobModelToken]; + Array3> branchCt = new(); + branchCt[0][0] = (uint)neob; + branchCt[0][1] = (uint)(eobCounts[i][j][k][l] - neob); + 
branchCt[1][0] = (uint)n0; + branchCt[1][1] = (uint)(n1 + n2); + branchCt[2][0] = (uint)n1; + branchCt[2][1] = (uint)n2; + for (int m = 0; m < Entropy.UnconstrainedNodes; ++m) + { + probs[i][j][k][l][m] = Prob.MergeProbs(preProbs[i][j][k][l][m], ref branchCt[m], + countSat, updateFactor); + } + } + } + } + } + } + + /* Copies the Entropy.DefaultCoefProbs4x4/8x8/16x16/32x32 tables into Fc for the matching transform sizes. */ public void DefaultCoefProbs() + { + Entropy.CopyProbs(ref Fc.Value.CoefProbs[(int)TxSize.Tx4x4], Entropy.DefaultCoefProbs4x4); + Entropy.CopyProbs(ref Fc.Value.CoefProbs[(int)TxSize.Tx8x8], Entropy.DefaultCoefProbs8x8); + Entropy.CopyProbs(ref Fc.Value.CoefProbs[(int)TxSize.Tx16x16], Entropy.DefaultCoefProbs16x16); + Entropy.CopyProbs(ref Fc.Value.CoefProbs[(int)TxSize.Tx32x32], Entropy.DefaultCoefProbs32x32); + } } -} +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Decoder.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Decoder.cs new file mode 100644 index 000000000..c3dfd8b1a --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Decoder.cs @@ -0,0 +1,410 @@ +using Ryujinx.Common.Memory; +using Ryujinx.Graphics.Nvdec.Vp9.Common; +using Ryujinx.Graphics.Video; +using System.Diagnostics; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + /* Decoder state: the shared Vp9Common plus the ReadyForNewData, RefreshFrameFlags, NeedResync and HoldRefBuf fields. */ internal struct Vp9Decoder + { + public Vp9Common Common; + + public int ReadyForNewData; + + public int RefreshFrameFlags; + + public int NeedResync; // Wait for key/intra-only frame. + public int HoldRefBuf; // Hold the reference buffer. + + /* Decrements the ref count of frame buffer idx (no-op when idx is negative or the count is already 0) and releases its raw frame buffer once the count reaches 0, provided it was not already released and Priv is set. */ private static void DecreaseRefCount(int idx, ref Array12 frameBufs, ref BufferPool pool) + { + if (idx >= 0 && frameBufs[idx].RefCount > 0) + { + --frameBufs[idx].RefCount; + // A worker may only get a free framebuffer index when calling GetFreeFb. + // But the private buffer is not set up until finish decoding header. + // So any error happens during decoding header, the frame_bufs will not + // have valid priv buffer. 
+ if (frameBufs[idx].Released == 0 && frameBufs[idx].RefCount == 0 && + !frameBufs[idx].RawFrameBuffer.Priv.IsNull) + { + FrameBuffers.ReleaseFrameBuffer(pool.CbPriv, ref frameBufs[idx].RawFrameBuffer); + frameBufs[idx].Released = 1; + } + } + } + + /* Allocates Fc, FrameContexts and Counts, fills the key-frame Y-mode, UV-mode and partition probabilities in Fc, sets all eight RefFrameMap/NextRefFrameMap entries to -1 and sets the initial decoder state (NeedResync = 1, ReadyForNewData = 1, 8-bit depth, BufferPool pointing at pool). */ public void Create(MemoryAllocator allocator, ref BufferPool pool) + { + ref Vp9Common cm = ref Common; + + cm.CheckMemError(ref cm.Fc, + new Ptr(ref allocator.Allocate(1)[0])); + cm.CheckMemError(ref cm.FrameContexts, + allocator.Allocate(Constants.FrameContexts)); + + for (int i = 0; i < EntropyMode.KfYModeProb.Length; i++) + { + for (int j = 0; j < EntropyMode.KfYModeProb[i].Length; j++) + { + for (int k = 0; k < EntropyMode.KfYModeProb[i][j].Length; k++) + { + cm.Fc.Value.KfYModeProb[i][j][k] = EntropyMode.KfYModeProb[i][j][k]; + } + } + } + + for (int i = 0; i < EntropyMode.KfUvModeProb.Length; i++) + { + for (int j = 0; j < EntropyMode.KfUvModeProb[i].Length; j++) + { + cm.Fc.Value.KfUvModeProb[i][j] = EntropyMode.KfUvModeProb[i][j]; + } + } + + byte[][] KfPartitionProbs = + { + // 8x8 . 4x4 + new byte[] { 158, 97, 94 }, // a/l both not split + new byte[] { 93, 24, 99 }, // a split, l not split + new byte[] { 85, 119, 44 }, // l split, a not split + new byte[] { 62, 59, 67 }, // a/l both split + + // 16x16 . 8x8 + new byte[] { 149, 53, 53 }, // a/l both not split + new byte[] { 94, 20, 48 }, // a split, l not split + new byte[] { 83, 53, 24 }, // l split, a not split + new byte[] { 52, 18, 18 }, // a/l both split + + // 32x32 . 16x16 + new byte[] { 150, 40, 39 }, // a/l both not split + new byte[] { 78, 12, 26 }, // a split, l not split + new byte[] { 67, 33, 11 }, // l split, a not split + new byte[] { 24, 7, 5 }, // a/l both split + + // 64x64 . 
32x32 + new byte[] { 174, 35, 49 }, // a/l both not split + new byte[] { 68, 11, 27 }, // a split, l not split + new byte[] { 57, 15, 9 }, // l split, a not split + new byte[] { 12, 3, 3 } // a/l both split + }; + + for (int i = 0; i < KfPartitionProbs.Length; i++) + { + for (int j = 0; j < KfPartitionProbs[i].Length; j++) + { + cm.Fc.Value.KfPartitionProb[i][j] = KfPartitionProbs[i][j]; + } + } + + cm.Counts = new Ptr(ref allocator.Allocate(1)[0]); + + NeedResync = 1; + + // Initialize the references to not point to any frame buffers. + for (int i = 0; i < 8; i++) + { + cm.RefFrameMap[i] = -1; + cm.NextRefFrameMap[i] = -1; + } + + cm.CurrentVideoFrame = 0; + ReadyForNewData = 1; + Common.BufferPool = new Ptr(ref pool); + + cm.BitDepth = BitDepth.Bits8; + cm.DequantBitDepth = BitDepth.Bits8; + + // vp9_loop_filter_init(ref cm); + } + + /* If any buffer updating is signaled it should be done here. */ /* Walks RefreshFrameFlags bit by bit, dropping the references held on the old RefFrameMap entries and replacing them with NextRefFrameMap; then points FrameToShow at the new frame buffer, decrements that buffer's ref count and invalidates the three FrameRefs indices. */ + private void SwapFrameBuffers() + { + int refIndex = 0, mask; + ref Vp9Common cm = ref Common; + ref BufferPool pool = ref cm.BufferPool.Value; + ref Array12 frameBufs = ref cm.BufferPool.Value.FrameBufs; + + for (mask = RefreshFrameFlags; mask != 0; mask >>= 1) + { + int oldIdx = cm.RefFrameMap[refIndex]; + // Current thread releases the holding of reference frame. + DecreaseRefCount(oldIdx, ref frameBufs, ref pool); + + // Release the reference frame in reference map. + if ((mask & 1) != 0) + { + DecreaseRefCount(oldIdx, ref frameBufs, ref pool); + } + + cm.RefFrameMap[refIndex] = cm.NextRefFrameMap[refIndex]; + ++refIndex; + } + + // Current thread releases the holding of reference frame. 
+ for (; refIndex < Constants.RefFrames && cm.ShowExistingFrame == 0; ++refIndex) + { + int oldIdx = cm.RefFrameMap[refIndex]; + DecreaseRefCount(oldIdx, ref frameBufs, ref pool); + cm.RefFrameMap[refIndex] = cm.NextRefFrameMap[refIndex]; + } + + HoldRefBuf = 0; + cm.FrameToShow = new Ptr(ref cm.GetFrameNewBuffer()); + + --frameBufs[cm.NewFbIdx].RefCount; + + // Invalidate these references until the next frame starts. + for (refIndex = 0; refIndex < 3; refIndex++) + { + cm.FrameRefs[refIndex].Idx = RefBuffer.InvalidIdx; + } + } + + /* Decodes one compressed frame of size bytes from psource: releases the previous unreferenced frame buffer, obtains a free one through GetFreeFb, runs DecodeFrame.Decode (which also writes psource), swaps the reference buffers and updates the previous-frame state. NOTE(review): retcode is initialized to 0 and never reassigned, so apart from the no-free-buffer path the initial value is always returned - confirm decode errors are surfaced elsewhere. */ public CodecErr ReceiveCompressedData(MemoryAllocator allocator, ulong size, ref ArrayPtr psource) + { + ref Vp9Common cm = ref Common; + ref BufferPool pool = ref cm.BufferPool.Value; + ref Array12 frameBufs = ref cm.BufferPool.Value.FrameBufs; + ArrayPtr source = psource; + CodecErr retcode = 0; + cm.Error.ErrorCode = CodecErr.Ok; + + if (size == 0) + { + // This is used to signal that we are missing frames. + // We do not know if the missing frame(s) was supposed to update + // any of the reference buffers, but we act conservative and + // mark only the last buffer as corrupted. + + if (cm.FrameRefs[0].Idx > 0) + { + cm.FrameRefs[0].Buf.Corrupted = 1; + } + } + + ReadyForNewData = 0; + + // Check if the previous frame was a frame without any references to it. + if (cm.NewFbIdx >= 0 && frameBufs[cm.NewFbIdx].RefCount == 0 && + frameBufs[cm.NewFbIdx].Released == 0) + { + FrameBuffers.ReleaseFrameBuffer(pool.CbPriv, ref frameBufs[cm.NewFbIdx].RawFrameBuffer); + frameBufs[cm.NewFbIdx].Released = 1; + } + + // Find a free frame buffer. Return error if can not find any. + cm.NewFbIdx = cm.GetFreeFb(); + if (cm.NewFbIdx == RefBuffer.InvalidIdx) + { + ReadyForNewData = 1; + cm.Error.InternalError(CodecErr.MemError, "Unable to find free frame buffer"); + + return cm.Error.ErrorCode; + } + + // Assign a MV array to the frame buffer. 
+ cm.CurFrame = new Ptr(ref pool.FrameBufs[cm.NewFbIdx]); + + HoldRefBuf = 0; + + DecodeFrame.Decode(allocator, ref this, new ArrayPtr(ref source[0], (int)size), out psource); + + SwapFrameBuffers(); + + // vpx_clear_system_state(); + + if (cm.ShowExistingFrame == 0) + { + cm.LastShowFrame = cm.ShowFrame; + cm.PrevFrame = cm.CurFrame; + + if (cm.PrevFrameMvs.IsNull || cm.PrevFrameMvs.Length != cm.CurFrameMvs.Length) + { + allocator.Free(cm.PrevFrameMvs); + cm.PrevFrameMvs = allocator.Allocate(cm.CurFrameMvs.Length); + } + + cm.CurFrameMvs.AsSpan().CopyTo(cm.PrevFrameMvs.AsSpan()); + if (cm.Seg.Enabled) + { + cm.SwapCurrentAndLastSegMap(); + } + } + + if (cm.ShowFrame != 0) + { + cm.CurShowFrameFbIdx = cm.NewFbIdx; + } + + // Update progress in frame parallel decode. + cm.LastWidth = cm.Width; + cm.LastHeight = cm.Height; + if (cm.ShowFrame != 0) + { + cm.CurrentVideoFrame++; + } + + return retcode; + } + + /* Returns -1 when ReadyForNewData is already 1 or when ShowFrame is 0; otherwise copies FrameToShow into sd and returns 0. ReadyForNewData is set to 1 on every path past the first check (the second assignment repeats the first). */ public int GetRawFrame(ref Surface sd) + { + ref Vp9Common cm = ref Common; + int ret = -1; + + if (ReadyForNewData == 1) + { + return ret; + } + + ReadyForNewData = 1; + + if (cm.ShowFrame == 0) + { + return ret; + } + + ReadyForNewData = 1; + + sd = cm.FrameToShow.Value; + ret = 0; + + return ret; + } + + /* Decodes a data chunk. When ParseSuperframeIndex reports frames, each indexed frame is decoded in turn (a frame size larger than the remaining data yields CorruptFrame); otherwise frames are decoded until the data is consumed, skipping zero bytes between them. Returns the first non-Ok result, or the last result. */ public CodecErr Decode(MemoryAllocator allocator, ArrayPtr data) + { + ArrayPtr dataStart = data; + CodecErr res; + Array8 frameSizes = new(); + int frameCount = 0; + + res = Types.Decoder.ParseSuperframeIndex(data, (ulong)data.Length, ref frameSizes, out frameCount); + if (res != CodecErr.Ok) + { + return res; + } + + // Decode in serial mode. 
+ if (frameCount > 0) + { + for (int i = 0; i < frameCount; ++i) + { + ArrayPtr dataStartCopy = dataStart; + uint frameSize = frameSizes[i]; + if (frameSize > (uint)dataStart.Length) + { + return CodecErr.CorruptFrame; + } + + res = ReceiveCompressedData(allocator, frameSize, ref dataStartCopy); + if (res != CodecErr.Ok) + { + return res; + } + + dataStart = dataStart.Slice((int)frameSize); + } + } + else + { + while (dataStart.Length != 0) + { + uint frameSize = (uint)dataStart.Length; + res = ReceiveCompressedData(allocator, frameSize, ref dataStart); + if (res != CodecErr.Ok) + { + return res; + } + + // Account for suboptimal termination by the encoder. + while (dataStart.Length != 0) + { + byte marker = Types.Decoder.ReadMarker(dataStart); + if (marker != 0) + { + break; + } + + dataStart = dataStart.Slice(1); + } + } + } + + return res; + } + } + + /* Static helpers used by Vp9Decoder.Decode to read the superframe marker and index. */ internal static class Decoder + { + /* Returns the first byte of data. */ public static byte ReadMarker(ArrayPtr data) + { + return data[0]; + } + + /* Looks for a superframe index at the end of data. On success fills sizes with the little-endian frame sizes and sets count to the number of frames; count stays 0 when the last byte is not an index marker. Returns CorruptFrame when the index is larger than dataSz or its leading marker byte does not match. NOTE(review): dataSz == 0 is only guarded by Debug.Assert - confirm callers never pass empty data. */ public static CodecErr ParseSuperframeIndex(ArrayPtr data, ulong dataSz, ref Array8 sizes, out int count) + { + // A chunk ending with a byte matching 0xc0 is an invalid chunk unless + // it is a super frame index. If the last byte of real video compression + // data is 0xc0 the encoder must add a 0 byte. If we have the marker but + // not the associated matching marker byte at the front of the index we have + // an invalid bitstream and need to return an error. + + byte marker; + + Debug.Assert(dataSz != 0); + marker = ReadMarker(data.Slice((int)dataSz - 1)); + count = 0; + + if ((marker & 0xe0) == 0xc0) + { + uint frames = (uint)(marker & 0x7) + 1; + uint mag = (uint)((marker >> 3) & 0x3) + 1; + ulong indexSz = 2 + (mag * frames); + + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. 
+ if (dataSz < indexSz) + { + return CodecErr.CorruptFrame; + } + + { + byte marker2 = ReadMarker(data.Slice((int)(dataSz - indexSz))); + + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) + { + return CodecErr.CorruptFrame; + } + } + + { + // Found a valid superframe index. + ArrayPtr x = data.Slice((int)(dataSz - indexSz + 1)); + + for (int i = 0; i < frames; ++i) + { + uint thisSz = 0; + + for (int j = 0; j < mag; ++j) + { + thisSz |= (uint)x[0] << j * 8; + x = x.Slice(1); + } + + sizes[i] = thisSz; + } + + count = (int)frames; + } + } + + return CodecErr.Ok; + } + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxCodecFrameBuffer.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxCodecFrameBuffer.cs new file mode 100644 index 000000000..358de79dd --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxCodecFrameBuffer.cs @@ -0,0 +1,10 @@ +using Ryujinx.Common.Memory; + +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal struct VpxCodecFrameBuffer + { + public ArrayPtr Data; + public Ptr Priv; + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorRange.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorRange.cs new file mode 100644 index 000000000..9f8c7c53d --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorRange.cs @@ -0,0 +1,11 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal enum VpxColorRange + { + // Y [16..235], UV [16..240] + Studio, + + // YUV/RGB [0..255] + Full + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorSpace.cs b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorSpace.cs new file mode 100644 index 000000000..a1706c0d0 --- /dev/null +++ b/src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorSpace.cs @@ -0,0 +1,29 @@ +namespace Ryujinx.Graphics.Nvdec.Vp9.Types +{ + internal 
enum VpxColorSpace + { + // Unknown + Unknown, + + // BT.601 + Bt601, + + // BT.709 + Bt709, + + // SMPTE.170 + Smpte170, + + // SMPTE.240 + Smpte240, + + // BT.2020 + Bt2020, + + // Reserved + Reserved, + + // sRGB + Srgb + } +} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Nvdec/Types/Vp9/PictureInfo.cs b/src/Ryujinx.Graphics.Nvdec/Types/Vp9/PictureInfo.cs index 7d06f7474..5a090d544 100644 --- a/src/Ryujinx.Graphics.Nvdec/Types/Vp9/PictureInfo.cs +++ b/src/Ryujinx.Graphics.Nvdec/Types/Vp9/PictureInfo.cs @@ -59,6 +59,8 @@ namespace Ryujinx.Graphics.Nvdec.Types.Vp9 Flags.HasFlag(FrameFlags.LastShowFrame) && !Flags.HasFlag(FrameFlags.LastFrameIsKeyFrame), RefFrameSignBias = RefFrameSignBias, + LoopFilterLevel = FirstLevel, + LoopFilterSharpnessLevel = SharpnessLevel, BaseQIndex = BaseQIndex, YDcDeltaQ = YDcDeltaQ, UvDcDeltaQ = UvDcDeltaQ, diff --git a/src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs b/src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs index a5cc2b450..3b201955c 100644 --- a/src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs +++ b/src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs @@ -10,6 +10,8 @@ namespace Ryujinx.Graphics.Video public bool IsKeyFrame; public bool IntraOnly; public Array4 RefFrameSignBias; + public int LoopFilterLevel; + public int LoopFilterSharpnessLevel; public int BaseQIndex; public int YDcDeltaQ; public int UvDcDeltaQ; @@ -36,4 +38,4 @@ namespace Ryujinx.Graphics.Video public Vp9EntropyProbs Entropy; public Vp9BackwardUpdates BackwardUpdateCounts; } -} +} \ No newline at end of file