From 8f6387128ad6fc6a6106d1347f86ea97e549f5a2 Mon Sep 17 00:00:00 2001
From: LDj3SNuD <35856442+LDj3SNuD@users.noreply.github.com>
Date: Tue, 26 Jun 2018 03:32:29 +0200
Subject: [PATCH] Add Sse Opt. for Cmeq_V_2D, Cmgt_V_2D (Reg). Add Sse Opt. for
 Crc32cb, Crc32ch, Crc32cw, Crc32cx. Add 10 simple tests for Fcmgt, Fcmge,
 Fcmeq, Fcmle, Fcmlt (S, V) (Reg, Zero). Add 2 Cnt_V tests. (#183)

* Add files via upload

* Add files via upload

* Add files via upload

* CPE

* Add EmitSse42Crc32()

* Update CpuTestSimdCmp.cs

* Update Pseudocode.cs

* Update Instructions.cs

* Update CpuTestSimd.cs

* Update Instructions.cs
---
 ChocolArm64/AOpCodeTable.cs                   |   6 +-
 ChocolArm64/AOptimizations.cs                 |  12 +-
 ChocolArm64/Instruction/AInstEmitHash.cs      |  52 ++-
 .../Instruction/AInstEmitSimdArithmetic.cs    |  32 +-
 ChocolArm64/Instruction/AInstEmitSimdCmp.cs   |  60 ++-
 .../Instruction/AInstEmitSimdHelper.cs        |  64 +--
 Ryujinx.Tests/Cpu/CpuTest.cs                  |  33 +-
 Ryujinx.Tests/Cpu/CpuTestSimd.cs              |  41 ++
 Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs    |  22 +-
 Ryujinx.Tests/Cpu/CpuTestSimdCmp.cs           | 375 ++++++++++++++++++
 Ryujinx.Tests/Cpu/CpuTestSimdMove.cs          |  62 +--
 Ryujinx.Tests/Cpu/Tester/Instructions.cs      |  41 +-
 Ryujinx.Tests/Cpu/Tester/Pseudocode.cs        |  18 +
 13 files changed, 698 insertions(+), 120 deletions(-)
 create mode 100644 Ryujinx.Tests/Cpu/CpuTestSimdCmp.cs

diff --git a/ChocolArm64/AOpCodeTable.cs b/ChocolArm64/AOpCodeTable.cs
index fcaee3847..e78d0b572 100644
--- a/ChocolArm64/AOpCodeTable.cs
+++ b/ChocolArm64/AOpCodeTable.cs
@@ -225,16 +225,16 @@ namespace ChocolArm64
             SetA64("000111100x1xxxxxxxxx01xxxxx0xxxx", AInstEmit.Fccmp_S,       typeof(AOpCodeSimdFcond));
             SetA64("000111100x1xxxxxxxxx01xxxxx1xxxx", AInstEmit.Fccmpe_S,      typeof(AOpCodeSimdFcond));
             SetA64("010111100x1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmeq_S,       typeof(AOpCodeSimdReg));
-            SetA64("0>0011100<1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmeq_V,       typeof(AOpCodeSimdReg));
             SetA64("010111101x100000110110xxxxxxxxxx", AInstEmit.Fcmeq_S,       typeof(AOpCodeSimd));
+            SetA64("0>0011100<1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmeq_V,       typeof(AOpCodeSimdReg));
             SetA64("0>0011101<100000110110xxxxxxxxxx", AInstEmit.Fcmeq_V,       typeof(AOpCodeSimd));
             SetA64("011111100x1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmge_S,       typeof(AOpCodeSimdReg));
-            SetA64("0>1011100<1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmge_V,       typeof(AOpCodeSimdReg));
             SetA64("011111101x100000110010xxxxxxxxxx", AInstEmit.Fcmge_S,       typeof(AOpCodeSimd));
+            SetA64("0>1011100<1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmge_V,       typeof(AOpCodeSimdReg));
             SetA64("0>1011101<100000110010xxxxxxxxxx", AInstEmit.Fcmge_V,       typeof(AOpCodeSimd));
             SetA64("011111101x1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmgt_S,       typeof(AOpCodeSimdReg));
-            SetA64("0>1011101<1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmgt_V,       typeof(AOpCodeSimdReg));
             SetA64("010111101x100000110010xxxxxxxxxx", AInstEmit.Fcmgt_S,       typeof(AOpCodeSimd));
+            SetA64("0>1011101<1xxxxx111001xxxxxxxxxx", AInstEmit.Fcmgt_V,       typeof(AOpCodeSimdReg));
             SetA64("0>0011101<100000110010xxxxxxxxxx", AInstEmit.Fcmgt_V,       typeof(AOpCodeSimd));
             SetA64("011111101x100000110110xxxxxxxxxx", AInstEmit.Fcmle_S,       typeof(AOpCodeSimd));
             SetA64("0>1011101<100000110110xxxxxxxxxx", AInstEmit.Fcmle_V,       typeof(AOpCodeSimd));
diff --git a/ChocolArm64/AOptimizations.cs b/ChocolArm64/AOptimizations.cs
index e8c1f7c44..800cf363d 100644
--- a/ChocolArm64/AOptimizations.cs
+++ b/ChocolArm64/AOptimizations.cs
@@ -6,7 +6,15 @@ public static class AOptimizations
 
     public static bool GenerateCallStack = true;
 
-    public static bool UseSse2IfAvailable = true;
+    private static bool UseAllSseIfAvailable = true; // Master switch: when false, every UseSse* flag below is false.
 
-    internal static bool UseSse2 = UseSse2IfAvailable && Sse2.IsSupported;
+    private static bool UseSseIfAvailable   = true; // Per-level opt-ins; each one is ANDed with the master switch.
+    private static bool UseSse2IfAvailable  = true;
+    private static bool UseSse41IfAvailable = true;
+    private static bool UseSse42IfAvailable = true;
+
+    internal static bool UseSse   = (UseAllSseIfAvailable && UseSseIfAvailable)   && Sse.IsSupported; // Opt-ins AND the matching IsSupported check.
+    internal static bool UseSse2  = (UseAllSseIfAvailable && UseSse2IfAvailable)  && Sse2.IsSupported;
+    internal static bool UseSse41 = (UseAllSseIfAvailable && UseSse41IfAvailable) && Sse41.IsSupported;
+    internal static bool UseSse42 = (UseAllSseIfAvailable && UseSse42IfAvailable) && Sse42.IsSupported;
 }
\ No newline at end of file
diff --git a/ChocolArm64/Instruction/AInstEmitHash.cs b/ChocolArm64/Instruction/AInstEmitHash.cs
index 94e03f6c1..69bdbc480 100644
--- a/ChocolArm64/Instruction/AInstEmitHash.cs
+++ b/ChocolArm64/Instruction/AInstEmitHash.cs
@@ -1,7 +1,9 @@
 using ChocolArm64.Decoder;
 using ChocolArm64.State;
 using ChocolArm64.Translation;
+using System;
 using System.Reflection.Emit;
+using System.Runtime.Intrinsics.X86;
 
 namespace ChocolArm64.Instruction
 {
@@ -29,22 +31,62 @@ namespace ChocolArm64.Instruction
 
         public static void Crc32cb(AILEmitterCtx Context)
         {
-            EmitCrc32(Context, nameof(ASoftFallback.Crc32cb));
+            if (AOptimizations.UseSse42) // Fast path: the Sse42.Crc32 overload taking (uint, byte).
+            {
+                EmitSse42Crc32(Context, typeof(uint), typeof(byte));
+            }
+            else // SSE4.2 path disabled or unsupported: go through EmitCrc32 with the ASoftFallback.Crc32cb name.
+            {
+                EmitCrc32(Context, nameof(ASoftFallback.Crc32cb));
+            }
         }
 
         public static void Crc32ch(AILEmitterCtx Context)
         {
-            EmitCrc32(Context, nameof(ASoftFallback.Crc32ch));
+            if (AOptimizations.UseSse42) // Fast path: the Sse42.Crc32 overload taking (uint, ushort).
+            {
+                EmitSse42Crc32(Context, typeof(uint), typeof(ushort));
+            }
+            else // SSE4.2 path disabled or unsupported: go through EmitCrc32 with the ASoftFallback.Crc32ch name.
+            {
+                EmitCrc32(Context, nameof(ASoftFallback.Crc32ch));
+            }
         }
 
         public static void Crc32cw(AILEmitterCtx Context)
         {
-            EmitCrc32(Context, nameof(ASoftFallback.Crc32cw));
+            if (AOptimizations.UseSse42) // Fast path: the Sse42.Crc32 overload taking (uint, uint).
+            {
+                EmitSse42Crc32(Context, typeof(uint), typeof(uint));
+            }
+            else // SSE4.2 path disabled or unsupported: go through EmitCrc32 with the ASoftFallback.Crc32cw name.
+            {
+                EmitCrc32(Context, nameof(ASoftFallback.Crc32cw));
+            }
         }
 
         public static void Crc32cx(AILEmitterCtx Context)
         {
-            EmitCrc32(Context, nameof(ASoftFallback.Crc32cx));
+            if (AOptimizations.UseSse42) // Fast path: the Sse42.Crc32 overload taking (ulong, ulong); the only variant with a 64-bit CRC operand.
+            {
+                EmitSse42Crc32(Context, typeof(ulong), typeof(ulong));
+            }
+            else // SSE4.2 path disabled or unsupported: go through EmitCrc32 with the ASoftFallback.Crc32cx name.
+            {
+                EmitCrc32(Context, nameof(ASoftFallback.Crc32cx));
+            }
+        }
+
+        private static void EmitSse42Crc32(AILEmitterCtx Context, Type TCrc, Type TData) // Emits a call to the Sse42.Crc32 overload selected by (TCrc, TData).
+        {
+            AOpCodeAluRs Op = (AOpCodeAluRs)Context.CurrOp;
+
+            Context.EmitLdintzr(Op.Rn); // Loaded first, so Rn lines up with the TCrc parameter of the overload.
+            Context.EmitLdintzr(Op.Rm); // Loaded second, so Rm lines up with the TData parameter.
+
+            Context.EmitCall(typeof(Sse42).GetMethod(nameof(Sse42.Crc32), new Type[] { TCrc, TData })); // NOTE(review): GetMethod returns null if no such overload exists; callers here only pass (uint, byte/ushort/uint) and (ulong, ulong).
+
+            Context.EmitStintzr(Op.Rd); // Presumably stores the call's return value into Rd.
         }
 
         private static void EmitCrc32(AILEmitterCtx Context, string Name)
@@ -70,4 +112,4 @@ namespace ChocolArm64.Instruction
             Context.EmitStintzr(Op.Rd);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
index 8cd4654b7..8b6e234c1 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdArithmetic.cs
@@ -319,9 +319,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fadd_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.AddScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.AddScalar));
             }
             else
             {
@@ -331,9 +331,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fadd_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.Add));
+                EmitSseOrSse2CallF(Context, nameof(Sse.Add));
             }
             else
             {
@@ -389,9 +389,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fdiv_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.DivideScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.DivideScalar));
             }
             else
             {
@@ -401,9 +401,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fdiv_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.Divide));
+                EmitSseOrSse2CallF(Context, nameof(Sse.Divide));
             }
             else
             {
@@ -563,9 +563,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fmul_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.MultiplyScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.MultiplyScalar));
             }
             else
             {
@@ -580,9 +580,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fmul_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.Multiply));
+                EmitSseOrSse2CallF(Context, nameof(Sse.Multiply));
             }
             else
             {
@@ -1019,9 +1019,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fsub_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.SubtractScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.SubtractScalar));
             }
             else
             {
@@ -1031,9 +1031,9 @@ namespace ChocolArm64.Instruction
 
         public static void Fsub_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2)
+            if (AOptimizations.UseSse && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.Subtract));
+                EmitSseOrSse2CallF(Context, nameof(Sse.Subtract));
             }
             else
             {
diff --git a/ChocolArm64/Instruction/AInstEmitSimdCmp.cs b/ChocolArm64/Instruction/AInstEmitSimdCmp.cs
index ba8ac3e2a..68a7ab880 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdCmp.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdCmp.cs
@@ -19,9 +19,20 @@ namespace ChocolArm64.Instruction
 
         public static void Cmeq_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg Op && Op.Size < 3)
+            if (Context.CurrOp is AOpCodeSimdReg Op)
             {
-                EmitSse2Call(Context, nameof(Sse2.CompareEqual));
+                if (Op.Size < 3 && AOptimizations.UseSse2)
+                {
+                    EmitSse2Call(Context, nameof(Sse2.CompareEqual));
+                }
+                else if (Op.Size == 3 && AOptimizations.UseSse41)
+                {
+                    EmitSse41Call(Context, nameof(Sse41.CompareEqual));
+                }
+                else
+                {
+                    EmitCmp(Context, OpCodes.Beq_S, Scalar: false);
+                }
             }
             else
             {
@@ -46,9 +57,20 @@ namespace ChocolArm64.Instruction
 
         public static void Cmgt_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg Op && Op.Size < 3)
+            if (Context.CurrOp is AOpCodeSimdReg Op)
             {
-                EmitSse2Call(Context, nameof(Sse2.CompareGreaterThan));
+                if (Op.Size < 3 && AOptimizations.UseSse2)
+                {
+                    EmitSse2Call(Context, nameof(Sse2.CompareGreaterThan));
+                }
+                else if (Op.Size == 3 && AOptimizations.UseSse42)
+                {
+                    EmitSse42Call(Context, nameof(Sse42.CompareGreaterThan));
+                }
+                else
+                {
+                    EmitCmp(Context, OpCodes.Bgt_S, Scalar: false);
+                }
             }
             else
             {
@@ -133,9 +155,10 @@ namespace ChocolArm64.Instruction
 
         public static void Fcmeq_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg)
+            if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
+                                                 && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.CompareEqualScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.CompareEqualScalar));
             }
             else
             {
@@ -145,9 +168,10 @@ namespace ChocolArm64.Instruction
 
         public static void Fcmeq_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg)
+            if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
+                                                 && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.CompareEqual));
+                EmitSseOrSse2CallF(Context, nameof(Sse.CompareEqual));
             }
             else
             {
@@ -157,9 +181,10 @@ namespace ChocolArm64.Instruction
 
         public static void Fcmge_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg)
+            if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
+                                                 && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.CompareGreaterThanOrEqualScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanOrEqualScalar));
             }
             else
             {
@@ -169,9 +194,10 @@ namespace ChocolArm64.Instruction
 
         public static void Fcmge_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg)
+            if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
+                                                 && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.CompareGreaterThanOrEqual));
+                EmitSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanOrEqual));
             }
             else
             {
@@ -181,9 +207,10 @@ namespace ChocolArm64.Instruction
 
         public static void Fcmgt_S(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg)
+            if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
+                                                 && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.CompareGreaterThanScalar));
+                EmitSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThanScalar));
             }
             else
             {
@@ -193,9 +220,10 @@ namespace ChocolArm64.Instruction
 
         public static void Fcmgt_V(AILEmitterCtx Context)
         {
-            if (AOptimizations.UseSse2 && Context.CurrOp is AOpCodeSimdReg)
+            if (Context.CurrOp is AOpCodeSimdReg && AOptimizations.UseSse
+                                                 && AOptimizations.UseSse2)
             {
-                EmitSse2CallF(Context, nameof(Sse2.CompareGreaterThan));
+                EmitSseOrSse2CallF(Context, nameof(Sse.CompareGreaterThan));
             }
             else
             {
diff --git a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
index 3caf2a3ed..80c6aeb7a 100644
--- a/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
+++ b/ChocolArm64/Instruction/AInstEmitSimdHelper.cs
@@ -3,6 +3,7 @@ using ChocolArm64.State;
 using ChocolArm64.Translation;
 using System;
 using System.Reflection;
+using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
 
@@ -34,11 +35,27 @@ namespace ChocolArm64.Instruction
             return (8 << (Op.Size + 1)) - Op.Imm;
         }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // Thin forwarding wrapper around EmitSseCall.
         public static void EmitSse2Call(AILEmitterCtx Context, string Name)
         {
-            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
+            EmitSseCall(Context, Name, typeof(Sse2)); // Name is looked up as a method on the Sse2 class.
+        }
 
-            int SizeF = Op.Size & 1;
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // Thin forwarding wrapper around EmitSseCall.
+        public static void EmitSse41Call(AILEmitterCtx Context, string Name)
+        {
+            EmitSseCall(Context, Name, typeof(Sse41)); // Name is looked up as a method on the Sse41 class.
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)] // Thin forwarding wrapper around EmitSseCall.
+        public static void EmitSse42Call(AILEmitterCtx Context, string Name)
+        {
+            EmitSseCall(Context, Name, typeof(Sse42)); // Name is looked up as a method on the Sse42 class.
+        }
+
+        private static void EmitSseCall(AILEmitterCtx Context, string Name, Type Type)
+        {
+            AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
 
             void Ldvec(int Reg)
             {
@@ -57,8 +74,6 @@ namespace ChocolArm64.Instruction
 
             Type BaseType = null;
 
-            Type[] Types;
-
             switch (Op.Size)
             {
                 case 0: BaseType = typeof(Vector128<sbyte>); break;
@@ -71,15 +86,13 @@ namespace ChocolArm64.Instruction
             {
                 Ldvec(BinOp.Rm);
 
-                Types = new Type[] { BaseType, BaseType };
+                Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType, BaseType }));
             }
             else
             {
-                Types = new Type[] { BaseType };
+                Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType }));
             }
 
-            Context.EmitCall(typeof(Sse2).GetMethod(Name, Types));
-
             switch (Op.Size)
             {
                 case 0: AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorSByteToSingle)); break;
@@ -96,7 +109,7 @@ namespace ChocolArm64.Instruction
             }
         }
 
-        public static void EmitSse2CallF(AILEmitterCtx Context, string Name)
+        public static void EmitSseOrSse2CallF(AILEmitterCtx Context, string Name)
         {
             AOpCodeSimd Op = (AOpCodeSimd)Context.CurrOp;
 
@@ -114,36 +127,31 @@ namespace ChocolArm64.Instruction
 
             Ldvec(Op.Rn);
 
-            Type BaseType = SizeF == 0
-                ? typeof(Vector128<float>)
-                : typeof(Vector128<double>);
+            Type Type;
+            Type BaseType;
 
-            Type[] Types;
+            if (SizeF == 0)
+            {
+                Type = typeof(Sse);
+                BaseType = typeof(Vector128<float>);
+            }
+            else /* if (SizeF == 1) */
+            {
+                Type = typeof(Sse2);
+                BaseType = typeof(Vector128<double>);
+            }
 
             if (Op is AOpCodeSimdReg BinOp)
             {
                 Ldvec(BinOp.Rm);
 
-                Types = new Type[] { BaseType, BaseType };
+                Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType, BaseType }));
             }
             else
             {
-                Types = new Type[] { BaseType };
+                Context.EmitCall(Type.GetMethod(Name, new Type[] { BaseType }));
             }
 
-            MethodInfo MthdInfo;
-
-            if (SizeF == 0)
-            {
-                MthdInfo = typeof(Sse).GetMethod(Name, Types);
-            }
-            else /* if (SizeF == 1) */
-            {
-                MthdInfo = typeof(Sse2).GetMethod(Name, Types);
-            }
-
-            Context.EmitCall(MthdInfo);
-
             if (SizeF == 1)
             {
                 AVectorHelper.EmitCall(Context, nameof(AVectorHelper.VectorDoubleToSingle));
diff --git a/Ryujinx.Tests/Cpu/CpuTest.cs b/Ryujinx.Tests/Cpu/CpuTest.cs
index e2442ee49..2af50c6c8 100644
--- a/Ryujinx.Tests/Cpu/CpuTest.cs
+++ b/Ryujinx.Tests/Cpu/CpuTest.cs
@@ -113,6 +113,22 @@ namespace Ryujinx.Tests.Cpu
             return GetThreadState();
         }
 
+        protected static Vector128<float> MakeVectorE0(double A0) // Builds a vector holding the raw bits of A0 in 64-bit element 0.
+        {
+            return Sse.StaticCast<long, float>(Sse2.SetVector128(0, BitConverter.DoubleToInt64Bits(A0))); // Element 1 (first argument) is zero; the cast only reinterprets the bits.
+        }
+
+        protected static Vector128<float> MakeVectorE0E1(double A0, double A1) // Builds a vector from the raw bits of two doubles: A0 in element 0, A1 in element 1.
+        {
+            return Sse.StaticCast<long, float>(Sse2.SetVector128(BitConverter.DoubleToInt64Bits(A1), // SetVector128 takes the upper element first.
+                                                                 BitConverter.DoubleToInt64Bits(A0)));
+        }
+
+        protected static Vector128<float> MakeVectorE1(double A1) // Builds a vector holding the raw bits of A1 in 64-bit element 1.
+        {
+            return Sse.StaticCast<long, float>(Sse2.SetVector128(BitConverter.DoubleToInt64Bits(A1), 0)); // Element 0 (second argument) is zero.
+        }
+
         protected static double VectorExtractDouble(Vector128<float> Vector, byte Index)
         {
             long Value = Sse41.Extract(Sse.StaticCast<float, long>(Vector), Index);
@@ -120,24 +136,19 @@ namespace Ryujinx.Tests.Cpu
             return BitConverter.Int64BitsToDouble(Value);
         }
 
-        protected static Vector128<float> MakeVectorE0(double A)
+        protected static Vector128<float> MakeVectorE0(ulong A0) // Builds a vector with A0 in 64-bit element 0.
         {
-            return Sse.StaticCast<long, float>(Sse2.SetVector128(0, BitConverter.DoubleToInt64Bits(A)));
+            return Sse.StaticCast<ulong, float>(Sse2.SetVector128(0, A0)); // Element 1 (first argument) is zero.
         }
 
-        protected static Vector128<float> MakeVectorE0(ulong A)
+        protected static Vector128<float> MakeVectorE0E1(ulong A0, ulong A1) // Builds a vector with A0 in element 0 and A1 in element 1.
         {
-            return Sse.StaticCast<ulong, float>(Sse2.SetVector128(0, A));
+            return Sse.StaticCast<ulong, float>(Sse2.SetVector128(A1, A0)); // SetVector128 takes the upper element first.
         }
 
-        protected static Vector128<float> MakeVectorE0E1(ulong A, ulong B)
+        protected static Vector128<float> MakeVectorE1(ulong A1) // Builds a vector with A1 in 64-bit element 1.
         {
-            return Sse.StaticCast<ulong, float>(Sse2.SetVector128(B, A));
-        }
-
-        protected static Vector128<float> MakeVectorE1(ulong B)
-        {
-            return Sse.StaticCast<ulong, float>(Sse2.SetVector128(B, 0));
+            return Sse.StaticCast<ulong, float>(Sse2.SetVector128(A1, 0)); // Element 0 (second argument) is zero.
         }
 
         protected static ulong GetVectorE0(Vector128<float> Vector)
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimd.cs b/Ryujinx.Tests/Cpu/CpuTestSimd.cs
index 6cc823042..90461728a 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimd.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimd.cs
@@ -628,6 +628,47 @@ namespace Ryujinx.Tests.Cpu
             });
         }
 
+        [Test, Description("CNT <Vd>.<T>, <Vn>.<T>")]
+        public void Cnt_V_8B([ValueSource("_8B_")] [Random(1)] ulong A)
+        {
+            uint Opcode = 0x0E205820; // CNT V0.8B, V1.8B
+            Bits Op = new Bits(Opcode);
+
+            Vector128<float> V0 = MakeVectorE1(TestContext.CurrentContext.Random.NextULong()); // Random upper half in the destination; asserted to be zero after the instruction.
+            Vector128<float> V1 = MakeVectorE0(A);
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1);
+
+            AArch64.V(1, new Bits(A)); // Presumably loads the same input into the reference model's V1.
+            SimdFp.Cnt_V(Op[30], Op[23, 22], Op[9, 5], Op[4, 0]); // Opcode fields by bit position; presumably Q, size, Rn, Rd.
+
+            Assert.Multiple(() =>
+            {
+                Assert.That(GetVectorE0(ThreadState.V0), Is.EqualTo(AArch64.V(64, 0).ToUInt64())); // Emulated low 64 bits vs. the reference result.
+                Assert.That(GetVectorE1(ThreadState.V0), Is.Zero);
+            });
+        }
+
+        [Test, Pairwise, Description("CNT <Vd>.<T>, <Vn>.<T>")]
+        public void Cnt_V_16B([ValueSource("_8B_")] [Random(1)] ulong A0,
+                              [ValueSource("_8B_")] [Random(1)] ulong A1)
+        {
+            uint Opcode = 0x4E205820; // CNT V0.16B, V1.16B
+            Bits Op = new Bits(Opcode);
+
+            Vector128<float> V1 = MakeVectorE0E1(A0, A1); // A0 is the low 64-bit half of the source, A1 the high half.
+            AThreadState ThreadState = SingleOpcode(Opcode, V1: V1);
+
+            AArch64.Vpart(1, 0, new Bits(A0)); // Presumably loads both halves into the reference model's V1.
+            AArch64.Vpart(1, 1, new Bits(A1));
+            SimdFp.Cnt_V(Op[30], Op[23, 22], Op[9, 5], Op[4, 0]); // Opcode fields by bit position; presumably Q, size, Rn, Rd.
+
+            Assert.Multiple(() =>
+            {
+                Assert.That(GetVectorE0(ThreadState.V0), Is.EqualTo(AArch64.Vpart(64, 0, 0).ToUInt64())); // Each emulated half is compared with the matching reference half.
+                Assert.That(GetVectorE1(ThreadState.V0), Is.EqualTo(AArch64.Vpart(64, 0, 1).ToUInt64()));
+            });
+        }
+
         [Test, Description("NEG <V><d>, <V><n>")]
         public void Neg_S_D([ValueSource("_1D_")] [Random(1)] ulong A)
         {
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs b/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
index d04eca7b5..98be2fc5b 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdArithmetic.cs
@@ -176,10 +176,13 @@ namespace Ryujinx.Tests.Cpu
         {
             AThreadState ThreadState = SingleOpcode(0x4EA1D802, V0: Sse.SetAllVector128(A));
 
-            Assert.That(Sse41.Extract(ThreadState.V2, (byte)0), Is.EqualTo(1 / A));
-            Assert.That(Sse41.Extract(ThreadState.V2, (byte)1), Is.EqualTo(1 / A));
-            Assert.That(Sse41.Extract(ThreadState.V2, (byte)2), Is.EqualTo(1 / A));
-            Assert.That(Sse41.Extract(ThreadState.V2, (byte)3), Is.EqualTo(1 / A));
+            Assert.Multiple(() =>
+            {
+                Assert.That(Sse41.Extract(ThreadState.V2, (byte)0), Is.EqualTo(1 / A));
+                Assert.That(Sse41.Extract(ThreadState.V2, (byte)1), Is.EqualTo(1 / A));
+                Assert.That(Sse41.Extract(ThreadState.V2, (byte)2), Is.EqualTo(1 / A));
+                Assert.That(Sse41.Extract(ThreadState.V2, (byte)3), Is.EqualTo(1 / A));
+            });
         }
 
         [Test, Description("FRECPS D0, D1, D2")]
@@ -199,10 +202,13 @@ namespace Ryujinx.Tests.Cpu
                 V2: Sse.SetAllVector128(A),
                 V0: Sse.SetAllVector128(B));
 
-            Assert.That(Sse41.Extract(ThreadState.V4, (byte)0), Is.EqualTo(2 - (A * B)));
-            Assert.That(Sse41.Extract(ThreadState.V4, (byte)1), Is.EqualTo(2 - (A * B)));
-            Assert.That(Sse41.Extract(ThreadState.V4, (byte)2), Is.EqualTo(2 - (A * B)));
-            Assert.That(Sse41.Extract(ThreadState.V4, (byte)3), Is.EqualTo(2 - (A * B)));
+            Assert.Multiple(() =>
+            {
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)0), Is.EqualTo(2 - (A * B)));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)1), Is.EqualTo(2 - (A * B)));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)2), Is.EqualTo(2 - (A * B)));
+                Assert.That(Sse41.Extract(ThreadState.V4, (byte)3), Is.EqualTo(2 - (A * B)));
+            });
         }
 
         [TestCase(0x3FE66666u, false, 0x40000000u)]
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdCmp.cs b/Ryujinx.Tests/Cpu/CpuTestSimdCmp.cs
new file mode 100644
index 000000000..41f5113d6
--- /dev/null
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdCmp.cs
@@ -0,0 +1,375 @@
+using ChocolArm64.State;
+
+using NUnit.Framework;
+
+using System;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Tests.Cpu
+{
+    public class CpuTestSimdCmp : CpuTest
+    {
+#region "ValueSource"
+        private static float[] _floats_() // Referenced by name from [ValueSource("_floats_")] in this class.
+        { // Fixed operands: both infinities, float.MinValue/MaxValue, -1/+1 and the -0f/+0f literals.
+            return new float[] { float.NegativeInfinity, float.MinValue, -1f, -0f,
+                                 +0f, +1f, float.MaxValue, float.PositiveInfinity };
+        }
+
+        private static double[] _doubles_() // Referenced by name from [ValueSource("_doubles_")] in this class.
+        { // Fixed operands: both infinities, double.MinValue/MaxValue, -1/+1 and the -0d/+0d literals.
+            return new double[] { double.NegativeInfinity, double.MinValue, -1d, -0d,
+                                  +0d, +1d, double.MaxValue, double.PositiveInfinity };
+        }
+#endregion
+
+        [Test, Description("FCMEQ D0, D1, D2 | FCMGE D0, D1, D2 | FCMGT D0, D1, D2")]
+        public void Fcmeq_Fcmge_Fcmgt_Reg_S_D([ValueSource("_doubles_")] [Random(8)] double A,
+                                              [ValueSource("_doubles_")] [Random(8)] double B,
+                                              [Values(0u, 1u, 3u)] uint EU) // EQ, GE, GT
+        { // Scalar double compares (register form): element 0 of V0 must become an all-ones/all-zeros mask for A vs. B.
+            uint Opcode = 0x5E62E420 | ((EU & 1) << 29) | ((EU >> 1) << 23); // EU bit 0 -> opcode bit 29, EU bit 1 -> opcode bit 23
+            Vector128<float> V0 = Sse.StaticCast<double, float>(Sse2.SetAllVector128(TestContext.CurrentContext.Random.NextDouble())); // random value in both lanes; presumably so a stale upper lane is noticed
+            Vector128<float> V1 = Sse.StaticCast<double, float>(Sse2.SetScalarVector128(A));
+            Vector128<float> V2 = Sse.StaticCast<double, float>(Sse2.SetScalarVector128(B));
+            // Run the encoded instruction with V0-V2 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1, V2: V2);
+            // Expected 8-byte image of element 0: all 0xFF when the comparison holds, all 0x00 otherwise.
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators (only EU values 0, 1 and 3 are supplied).
+            switch (EU)
+            {
+                case 0: Exp = (A == B ? Ones : Zeros); break;
+                case 1: Exp = (A >= B ? Ones : Zeros); break;
+                case 3: Exp = (A >  B ? Ones : Zeros); break;
+            }
+            // Element 0 must match the mask (order-insensitive match is fine: Exp is uniform); element 1 must be zero.
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(VectorExtractDouble(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(VectorExtractDouble(ThreadState.V0, (byte)1), Is.Zero);
+            });
+        }
+
+        [Test, Description("FCMEQ S0, S1, S2 | FCMGE S0, S1, S2 | FCMGT S0, S1, S2")]
+        public void Fcmeq_Fcmge_Fcmgt_Reg_S_S([ValueSource("_floats_")] [Random(8)] float A,
+                                              [ValueSource("_floats_")] [Random(8)] float B,
+                                              [Values(0u, 1u, 3u)] uint EU) // EQ, GE, GT
+        { // Scalar single compares (register form): element 0 of V0 must become an all-ones/all-zeros mask for A vs. B.
+            uint Opcode = 0x5E22E420 | ((EU & 1) << 29) | ((EU >> 1) << 23); // EU bit 0 -> opcode bit 29, EU bit 1 -> opcode bit 23
+            Vector128<float> V0 = Sse.SetAllVector128(TestContext.CurrentContext.Random.NextFloat()); // random value in all four lanes; presumably so stale upper lanes are noticed
+            Vector128<float> V1 = Sse.SetScalarVector128(A);
+            Vector128<float> V2 = Sse.SetScalarVector128(B);
+            // Run the encoded instruction with V0-V2 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1, V2: V2);
+            // Expected 4-byte image of element 0: all 0xFF when the comparison holds, all 0x00 otherwise.
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators (only EU values 0, 1 and 3 are supplied).
+            switch (EU)
+            {
+                case 0: Exp = (A == B ? Ones : Zeros); break;
+                case 1: Exp = (A >= B ? Ones : Zeros); break;
+                case 3: Exp = (A >  B ? Ones : Zeros); break;
+            }
+            // Element 0 must match the mask (order-insensitive match is fine: Exp is uniform); elements 1-3 must be zero.
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)1), Is.Zero);
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)2), Is.Zero);
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)3), Is.Zero);
+            });
+        }
+
+        [Test, Description("FCMEQ V0.2D, V1.2D, V2.2D | FCMGE V0.2D, V1.2D, V2.2D | FCMGT V0.2D, V1.2D, V2.2D")]
+        public void Fcmeq_Fcmge_Fcmgt_Reg_V_2D([ValueSource("_doubles_")] [Random(8)] double A,
+                                               [ValueSource("_doubles_")] [Random(8)] double B,
+                                               [Values(0u, 1u, 3u)] uint EU) // EQ, GE, GT
+        { // Vector compares on two doubles (register form): both elements of V0 must become the same mask for A vs. B.
+            uint Opcode = 0x4E62E420 | ((EU & 1) << 29) | ((EU >> 1) << 23); // EU bit 0 -> opcode bit 29, EU bit 1 -> opcode bit 23
+            Vector128<float> V1 = Sse.StaticCast<double, float>(Sse2.SetAllVector128(A)); // A in both lanes
+            Vector128<float> V2 = Sse.StaticCast<double, float>(Sse2.SetAllVector128(B)); // B in both lanes
+            // Run the encoded instruction with V1 and V2 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V1: V1, V2: V2);
+            // Expected 8-byte image of each element: all 0xFF when the comparison holds, all 0x00 otherwise.
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators (only EU values 0, 1 and 3 are supplied).
+            switch (EU)
+            {
+                case 0: Exp = (A == B ? Ones : Zeros); break;
+                case 1: Exp = (A >= B ? Ones : Zeros); break;
+                case 3: Exp = (A >  B ? Ones : Zeros); break;
+            }
+            // Both elements must match the mask (order-insensitive match is fine: Exp is uniform).
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(VectorExtractDouble(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(VectorExtractDouble(ThreadState.V0, (byte)1)), Is.EquivalentTo(Exp));
+            });
+        }
+
+        [Test, Description("FCMEQ V0.2S, V1.2S, V2.2S | FCMGE V0.2S, V1.2S, V2.2S | FCMGT V0.2S, V1.2S, V2.2S")]
+        public void Fcmeq_Fcmge_Fcmgt_Reg_V_2S([ValueSource("_floats_")] [Random(8)] float A,
+                                               [ValueSource("_floats_")] [Random(8)] float B,
+                                               [Values(0u, 1u, 3u)] uint EU) // EQ, GE, GT
+        { // Vector compares on two singles (register form): elements 0-1 of V0 must become the mask, elements 2-3 zero.
+            uint Opcode = 0x0E22E420 | ((EU & 1) << 29) | ((EU >> 1) << 23); // EU bit 0 -> opcode bit 29, EU bit 1 -> opcode bit 23
+            Vector128<float> V0 = Sse.SetAllVector128(TestContext.CurrentContext.Random.NextFloat()); // random value in all four lanes; presumably so stale upper lanes are noticed
+            Vector128<float> V1 = Sse.SetVector128(0, 0, A, A); // arguments run from element 3 down to 0: A in elements 0-1, zero above
+            Vector128<float> V2 = Sse.SetVector128(0, 0, B, B); // same layout with B
+            // Run the encoded instruction with V0-V2 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1, V2: V2);
+            // Expected 4-byte image of each compared element: all 0xFF when the comparison holds, all 0x00 otherwise.
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators (only EU values 0, 1 and 3 are supplied).
+            switch (EU)
+            {
+                case 0: Exp = (A == B ? Ones : Zeros); break;
+                case 1: Exp = (A >= B ? Ones : Zeros); break;
+                case 3: Exp = (A >  B ? Ones : Zeros); break;
+            }
+            // Elements 0-1 must match the mask (order-insensitive match is fine: Exp is uniform); elements 2-3 must be zero.
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)1)), Is.EquivalentTo(Exp));
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)2), Is.Zero);
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)3), Is.Zero);
+            });
+        }
+
+        [Test, Description("FCMEQ V0.4S, V1.4S, V2.4S | FCMGE V0.4S, V1.4S, V2.4S | FCMGT V0.4S, V1.4S, V2.4S")]
+        public void Fcmeq_Fcmge_Fcmgt_Reg_V_4S([ValueSource("_floats_")] [Random(8)] float A,
+                                               [ValueSource("_floats_")] [Random(8)] float B,
+                                               [Values(0u, 1u, 3u)] uint EU) // EQ, GE, GT
+        { // Vector compares on four singles (register form): all four elements of V0 must become the same mask for A vs. B.
+            uint Opcode = 0x4E22E420 | ((EU & 1) << 29) | ((EU >> 1) << 23); // EU bit 0 -> opcode bit 29, EU bit 1 -> opcode bit 23
+            Vector128<float> V1 = Sse.SetAllVector128(A); // A in all four lanes
+            Vector128<float> V2 = Sse.SetAllVector128(B); // B in all four lanes
+            // Run the encoded instruction with V1 and V2 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V1: V1, V2: V2);
+            // Expected 4-byte image of each element: all 0xFF when the comparison holds, all 0x00 otherwise.
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators (only EU values 0, 1 and 3 are supplied).
+            switch (EU)
+            {
+                case 0: Exp = (A == B ? Ones : Zeros); break;
+                case 1: Exp = (A >= B ? Ones : Zeros); break;
+                case 3: Exp = (A >  B ? Ones : Zeros); break;
+            }
+            // Every element must match the mask (order-insensitive match is fine: Exp is uniform).
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)1)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)2)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)3)), Is.EquivalentTo(Exp));
+            });
+        }
+
+        [Test, Description("FCMGT D0, D1, #0.0 | FCMGE D0, D1, #0.0 | FCMEQ D0, D1, #0.0 | FCMLE D0, D1, #0.0 | FCMLT D0, D1, #0.0")]
+        public void Fcmgt_Fcmge_Fcmeq_Fcmle_Fcmlt_Zero_S_D([ValueSource("_doubles_")] [Random(8)] double A,
+                                                           [Values(0u, 1u, 2u, 3u)] uint opU, // GT, GE, EQ, LE
+                                                           [Values(0u, 1u)] uint bit13) // "LT"
+        { // Scalar double compares against zero: element 0 of V0 must become an all-ones/all-zeros mask for A vs. 0.
+            uint Opcode = 0x5EE0C820 | (((opU & 1) & ~bit13) << 29) | (bit13 << 13) | (((opU >> 1) & ~bit13) << 12); // opU bit 0 -> bit 29, opU bit 1 -> bit 12 (both cleared when bit13 is 1); bit13 -> bit 13
+            Vector128<float> V0 = Sse.StaticCast<double, float>(Sse2.SetAllVector128(TestContext.CurrentContext.Random.NextDouble())); // random value in both lanes; presumably so a stale upper lane is noticed
+            Vector128<float> V1 = Sse.StaticCast<double, float>(Sse2.SetScalarVector128(A));
+            // Run the encoded instruction with V0 and V1 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1);
+            // Expected 8-byte image of element 0: all 0xFF when the comparison holds, all 0x00 otherwise.
+            double Zero  = +0d;
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators.
+            if (bit13 == 0)
+            {
+                switch (opU)
+                {
+                    case 0: Exp = (A    >  Zero ? Ones : Zeros); break;
+                    case 1: Exp = (A    >= Zero ? Ones : Zeros); break;
+                    case 2: Exp = (A    == Zero ? Ones : Zeros); break;
+                    case 3: Exp = (Zero >= A    ? Ones : Zeros); break;
+                }
+            }
+            else // NOTE(review): with bit13 == 1 the opcode no longer depends on opU, so these cases repeat for every opU value.
+            {
+                Exp = (Zero > A ? Ones : Zeros);
+            }
+            // Element 0 must match the mask (order-insensitive match is fine: Exp is uniform); element 1 must be zero.
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(VectorExtractDouble(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(VectorExtractDouble(ThreadState.V0, (byte)1), Is.Zero);
+            });
+        }
+
+        [Test, Description("FCMGT S0, S1, #0.0 | FCMGE S0, S1, #0.0 | FCMEQ S0, S1, #0.0 | FCMLE S0, S1, #0.0 | FCMLT S0, S1, #0.0")]
+        public void Fcmgt_Fcmge_Fcmeq_Fcmle_Fcmlt_Zero_S_S([ValueSource("_floats_")] [Random(8)] float A,
+                                                           [Values(0u, 1u, 2u, 3u)] uint opU, // GT, GE, EQ, LE
+                                                           [Values(0u, 1u)] uint bit13) // "LT"
+        { // Scalar single compares against zero: element 0 of V0 must become an all-ones/all-zeros mask for A vs. 0.
+            uint Opcode = 0x5EA0C820 | (((opU & 1) & ~bit13) << 29) | (bit13 << 13) | (((opU >> 1) & ~bit13) << 12); // opU bit 0 -> bit 29, opU bit 1 -> bit 12 (both cleared when bit13 is 1); bit13 -> bit 13
+            Vector128<float> V0 = Sse.SetAllVector128(TestContext.CurrentContext.Random.NextFloat()); // random value in all four lanes; presumably so stale upper lanes are noticed
+            Vector128<float> V1 = Sse.SetScalarVector128(A);
+            // Run the encoded instruction with V0 and V1 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1);
+            // Expected 4-byte image of element 0: all 0xFF when the comparison holds, all 0x00 otherwise.
+            float  Zero  = +0f;
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators.
+            if (bit13 == 0)
+            {
+                switch (opU)
+                {
+                    case 0: Exp = (A    >  Zero ? Ones : Zeros); break;
+                    case 1: Exp = (A    >= Zero ? Ones : Zeros); break;
+                    case 2: Exp = (A    == Zero ? Ones : Zeros); break;
+                    case 3: Exp = (Zero >= A    ? Ones : Zeros); break;
+                }
+            }
+            else // NOTE(review): with bit13 == 1 the opcode no longer depends on opU, so these cases repeat for every opU value.
+            {
+                Exp = (Zero > A ? Ones : Zeros);
+            }
+            // Element 0 must match the mask (order-insensitive match is fine: Exp is uniform); elements 1-3 must be zero.
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)1), Is.Zero);
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)2), Is.Zero);
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)3), Is.Zero);
+            });
+        }
+
+        [Test, Description("FCMGT V0.2D, V1.2D, #0.0 | FCMGE V0.2D, V1.2D, #0.0 | FCMEQ V0.2D, V1.2D, #0.0 | FCMLE V0.2D, V1.2D, #0.0 | FCMLT V0.2D, V1.2D, #0.0")]
+        public void Fcmgt_Fcmge_Fcmeq_Fcmle_Fcmlt_Zero_V_2D([ValueSource("_doubles_")] [Random(8)] double A,
+                                                            [Values(0u, 1u, 2u, 3u)] uint opU, // GT, GE, EQ, LE
+                                                            [Values(0u, 1u)] uint bit13) // "LT"
+        { // Vector compares of two doubles against zero: both elements of V0 must become the same mask for A vs. 0.
+            uint Opcode = 0x4EE0C820 | (((opU & 1) & ~bit13) << 29) | (bit13 << 13) | (((opU >> 1) & ~bit13) << 12); // opU bit 0 -> bit 29, opU bit 1 -> bit 12 (both cleared when bit13 is 1); bit13 -> bit 13
+            Vector128<float> V1 = Sse.StaticCast<double, float>(Sse2.SetAllVector128(A)); // A in both lanes
+            // Run the encoded instruction with V1 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V1: V1);
+            // Expected 8-byte image of each element: all 0xFF when the comparison holds, all 0x00 otherwise.
+            double Zero  = +0d;
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators.
+            if (bit13 == 0)
+            {
+                switch (opU)
+                {
+                    case 0: Exp = (A    >  Zero ? Ones : Zeros); break;
+                    case 1: Exp = (A    >= Zero ? Ones : Zeros); break;
+                    case 2: Exp = (A    == Zero ? Ones : Zeros); break;
+                    case 3: Exp = (Zero >= A    ? Ones : Zeros); break;
+                }
+            }
+            else // NOTE(review): with bit13 == 1 the opcode no longer depends on opU, so these cases repeat for every opU value.
+            {
+                Exp = (Zero > A ? Ones : Zeros);
+            }
+            // Both elements must match the mask (order-insensitive match is fine: Exp is uniform).
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(VectorExtractDouble(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(VectorExtractDouble(ThreadState.V0, (byte)1)), Is.EquivalentTo(Exp));
+            });
+        }
+
+        [Test, Description("FCMGT V0.2S, V1.2S, #0.0 | FCMGE V0.2S, V1.2S, #0.0 | FCMEQ V0.2S, V1.2S, #0.0 | FCMLE V0.2S, V1.2S, #0.0 | FCMLT V0.2S, V1.2S, #0.0")]
+        public void Fcmgt_Fcmge_Fcmeq_Fcmle_Fcmlt_Zero_V_2S([ValueSource("_floats_")] [Random(8)] float A,
+                                                            [Values(0u, 1u, 2u, 3u)] uint opU, // GT, GE, EQ, LE
+                                                            [Values(0u, 1u)] uint bit13) // "LT"
+        { // Vector compares of two singles against zero: elements 0-1 of V0 must become the mask, elements 2-3 zero.
+            uint Opcode = 0x0EA0C820 | (((opU & 1) & ~bit13) << 29) | (bit13 << 13) | (((opU >> 1) & ~bit13) << 12); // opU bit 0 -> bit 29, opU bit 1 -> bit 12 (both cleared when bit13 is 1); bit13 -> bit 13
+            Vector128<float> V0 = Sse.SetAllVector128(TestContext.CurrentContext.Random.NextFloat()); // random value in all four lanes; presumably so stale upper lanes are noticed
+            Vector128<float> V1 = Sse.SetVector128(0, 0, A, A); // arguments run from element 3 down to 0: A in elements 0-1, zero above
+            // Run the encoded instruction with V0 and V1 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V0: V0, V1: V1);
+            // Expected 4-byte image of each compared element: all 0xFF when the comparison holds, all 0x00 otherwise.
+            float  Zero  = +0f;
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators.
+            if (bit13 == 0)
+            {
+                switch (opU)
+                {
+                    case 0: Exp = (A    >  Zero ? Ones : Zeros); break;
+                    case 1: Exp = (A    >= Zero ? Ones : Zeros); break;
+                    case 2: Exp = (A    == Zero ? Ones : Zeros); break;
+                    case 3: Exp = (Zero >= A    ? Ones : Zeros); break;
+                }
+            }
+            else // NOTE(review): with bit13 == 1 the opcode no longer depends on opU, so these cases repeat for every opU value.
+            {
+                Exp = (Zero > A ? Ones : Zeros);
+            }
+            // Elements 0-1 must match the mask (order-insensitive match is fine: Exp is uniform); elements 2-3 must be zero.
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)1)), Is.EquivalentTo(Exp));
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)2), Is.Zero);
+                Assert.That(Sse41.Extract(ThreadState.V0, (byte)3), Is.Zero);
+            });
+        }
+
+        [Test, Description("FCMGT V0.4S, V1.4S, #0.0 | FCMGE V0.4S, V1.4S, #0.0 | FCMEQ V0.4S, V1.4S, #0.0 | FCMLE V0.4S, V1.4S, #0.0 | FCMLT V0.4S, V1.4S, #0.0")]
+        public void Fcmgt_Fcmge_Fcmeq_Fcmle_Fcmlt_Zero_V_4S([ValueSource("_floats_")] [Random(8)] float A,
+                                                            [Values(0u, 1u, 2u, 3u)] uint opU, // GT, GE, EQ, LE
+                                                            [Values(0u, 1u)] uint bit13) // "LT"
+        { // Vector compares of four singles against zero: all four elements of V0 must become the same mask for A vs. 0.
+            uint Opcode = 0x4EA0C820 | (((opU & 1) & ~bit13) << 29) | (bit13 << 13) | (((opU >> 1) & ~bit13) << 12); // opU bit 0 -> bit 29, opU bit 1 -> bit 12 (both cleared when bit13 is 1); bit13 -> bit 13
+            Vector128<float> V1 = Sse.SetAllVector128(A); // A in all four lanes
+            // Run the encoded instruction with V1 preloaded, using the inherited SingleOpcode helper.
+            AThreadState ThreadState = SingleOpcode(Opcode, V1: V1);
+            // Expected 4-byte image of each element: all 0xFF when the comparison holds, all 0x00 otherwise.
+            float  Zero  = +0f;
+            byte[] Exp   = default(byte[]);
+            byte[] Ones  = new byte[] {0xFF, 0xFF, 0xFF, 0xFF};
+            byte[] Zeros = new byte[] {0x00, 0x00, 0x00, 0x00};
+            // Model the selected comparison with the host's C# operators.
+            if (bit13 == 0)
+            {
+                switch (opU)
+                {
+                    case 0: Exp = (A    >  Zero ? Ones : Zeros); break;
+                    case 1: Exp = (A    >= Zero ? Ones : Zeros); break;
+                    case 2: Exp = (A    == Zero ? Ones : Zeros); break;
+                    case 3: Exp = (Zero >= A    ? Ones : Zeros); break;
+                }
+            }
+            else // NOTE(review): with bit13 == 1 the opcode no longer depends on opU, so these cases repeat for every opU value.
+            {
+                Exp = (Zero > A ? Ones : Zeros);
+            }
+            // Every element must match the mask (order-insensitive match is fine: Exp is uniform).
+            Assert.Multiple(() =>
+            {
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)0)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)1)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)2)), Is.EquivalentTo(Exp));
+                Assert.That(BitConverter.GetBytes(Sse41.Extract(ThreadState.V0, (byte)3)), Is.EquivalentTo(Exp));
+            });
+        }
+    }
+}
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdMove.cs b/Ryujinx.Tests/Cpu/CpuTestSimdMove.cs
index 498488206..055e08689 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdMove.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdMove.cs
@@ -19,12 +19,13 @@ namespace Ryujinx.Tests.Cpu
 
             AThreadState ThreadState = SingleOpcode(Opcode, V1: V1, V2: V2);
 
-            Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)0);
-
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)0), Is.EqualTo(A0));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)1), Is.EqualTo(B0));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)2), Is.EqualTo(A2));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)3), Is.EqualTo(B2));
+            Assert.Multiple(() =>
+            {
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)0), Is.EqualTo(A0));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)1), Is.EqualTo(B0));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)2), Is.EqualTo(A2));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)3), Is.EqualTo(B2));
+            });
         }
 
         [Test, Description("TRN1 V0.8B, V1.8B, V2.8B")]
@@ -39,14 +40,17 @@ namespace Ryujinx.Tests.Cpu
 
             AThreadState ThreadState = SingleOpcode(Opcode, V1: V1, V2: V2);
 
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)0), Is.EqualTo(A0));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)1), Is.EqualTo(B0));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)2), Is.EqualTo(A2));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)3), Is.EqualTo(B2));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)4), Is.EqualTo(A4));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)5), Is.EqualTo(B4));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)6), Is.EqualTo(A6));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)7), Is.EqualTo(B6));
+            Assert.Multiple(() =>
+            {
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)0), Is.EqualTo(A0));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)1), Is.EqualTo(B0));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)2), Is.EqualTo(A2));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)3), Is.EqualTo(B2));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)4), Is.EqualTo(A4));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)5), Is.EqualTo(B4));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)6), Is.EqualTo(A6));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)7), Is.EqualTo(B6));
+            });
         }
 
         [Test, Description("TRN2 V0.4S, V1.4S, V2.4S")]
@@ -59,10 +63,13 @@ namespace Ryujinx.Tests.Cpu
 
             AThreadState ThreadState = SingleOpcode(Opcode, V1: V1, V2: V2);
 
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)0), Is.EqualTo(A1));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)1), Is.EqualTo(B1));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)2), Is.EqualTo(A3));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)3), Is.EqualTo(B3));
+            Assert.Multiple(() =>
+            {
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)0), Is.EqualTo(A1));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)1), Is.EqualTo(B1));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)2), Is.EqualTo(A3));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, uint>(ThreadState.V0), (byte)3), Is.EqualTo(B3));
+            });
         }
 
         [Test, Description("TRN2 V0.8B, V1.8B, V2.8B")]
@@ -77,14 +84,17 @@ namespace Ryujinx.Tests.Cpu
 
             AThreadState ThreadState = SingleOpcode(Opcode, V1: V1, V2: V2);
 
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)0), Is.EqualTo(A1));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)1), Is.EqualTo(B1));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)2), Is.EqualTo(A3));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)3), Is.EqualTo(B3));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)4), Is.EqualTo(A5));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)5), Is.EqualTo(B5));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)6), Is.EqualTo(A7));
-            Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)7), Is.EqualTo(B7));
+            Assert.Multiple(() =>
+            {
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)0), Is.EqualTo(A1));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)1), Is.EqualTo(B1));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)2), Is.EqualTo(A3));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)3), Is.EqualTo(B3));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)4), Is.EqualTo(A5));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)5), Is.EqualTo(B5));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)6), Is.EqualTo(A7));
+                Assert.That(Sse41.Extract(Sse.StaticCast<float, byte>(ThreadState.V0), (byte)7), Is.EqualTo(B7));
+            });
         }
 
         [TestCase(0u, 0u, 0x2313221221112010ul, 0x0000000000000000ul)]
diff --git a/Ryujinx.Tests/Cpu/Tester/Instructions.cs b/Ryujinx.Tests/Cpu/Tester/Instructions.cs
index aa62ddccd..a4e04e960 100644
--- a/Ryujinx.Tests/Cpu/Tester/Instructions.cs
+++ b/Ryujinx.Tests/Cpu/Tester/Instructions.cs
@@ -1826,7 +1826,7 @@ namespace Ryujinx.Tests.Cpu.Tester
         // addp_advsimd_pair.html
         public static void Addp_S(Bits size, Bits Rn, Bits Rd)
         {
-            /* Decode Scalar */
+            /* Decode */
             int d = (int)UInt(Rd);
             int n = (int)UInt(Rn);
 
@@ -1875,7 +1875,7 @@ namespace Ryujinx.Tests.Cpu.Tester
         {
             const bool U = false;
 
-            /* Decode */
+            /* Decode Vector */
             int d = (int)UInt(Rd);
             int n = (int)UInt(Rn);
 
@@ -1917,7 +1917,7 @@ namespace Ryujinx.Tests.Cpu.Tester
         {
             const bool U = true;
 
-            /* Decode */
+            /* Decode Vector */
             int d = (int)UInt(Rd);
             int n = (int)UInt(Rn);
 
@@ -2654,6 +2654,37 @@ namespace Ryujinx.Tests.Cpu.Tester
             V(d, result);
         }
 
+        // cnt_advsimd.html
+        public static void Cnt_V(bool Q, Bits size, Bits Rn, Bits Rd) // per-element bit count of the operand taken via Rn, delivered via Rd
+        {
+            /* Decode Vector */
+            int d = (int)UInt(Rd);
+            int n = (int)UInt(Rn);
+            // The size check below is kept only as a comment, so the size parameter is not read anywhere in this method.
+            /* if size != '00' then ReservedValue(); */
+            // Elements are always 8 bits wide; Q selects a 128-bit or a 64-bit datasize.
+            int esize = 8;
+            int datasize = (Q ? 128 : 64);
+            int elements = datasize / 8;
+
+            /* Operation */
+            /* CheckFPAdvSIMDEnabled64(); */
+            // V is defined elsewhere; presumably V(datasize, n) reads vector register n - confirm against its definition.
+            Bits result = new Bits(datasize);
+            Bits operand = V(datasize, n);
+
+            BigInteger count;
+            // For each element: count its set bits, then store bits esize-1..0 of that count in the same result element.
+            for (int e = 0; e <= elements - 1; e++)
+            {
+                count = (BigInteger)BitCount(Elem(operand, e, esize));
+
+                Elem(result, e, esize, count.SubBigInteger(esize - 1, 0));
+            }
+            // Hand the assembled result to V for index d (presumably a vector register write - confirm against V).
+            V(d, result);
+        }
+
         // neg_advsimd.html#NEG_asisdmisc_R
         public static void Neg_S(Bits size, Bits Rn, Bits Rd)
         {
@@ -2745,7 +2776,7 @@ namespace Ryujinx.Tests.Cpu.Tester
         // not_advsimd.html
         public static void Not_V(bool Q, Bits Rn, Bits Rd)
         {
-            /* Decode */
+            /* Decode Vector */
             int d = (int)UInt(Rd);
             int n = (int)UInt(Rn);
 
@@ -3095,7 +3126,7 @@ namespace Ryujinx.Tests.Cpu.Tester
         // addp_advsimd_vec.html
         public static void Addp_V(bool Q, Bits size, Bits Rm, Bits Rn, Bits Rd)
         {
-            /* Decode Vector */
+            /* Decode */
             int d = (int)UInt(Rd);
             int n = (int)UInt(Rn);
             int m = (int)UInt(Rm);
diff --git a/Ryujinx.Tests/Cpu/Tester/Pseudocode.cs b/Ryujinx.Tests/Cpu/Tester/Pseudocode.cs
index 363e2de94..3a877fb1a 100644
--- a/Ryujinx.Tests/Cpu/Tester/Pseudocode.cs
+++ b/Ryujinx.Tests/Cpu/Tester/Pseudocode.cs
@@ -586,6 +586,24 @@ namespace Ryujinx.Tests.Cpu.Tester
             return (x >= 0 ? x : -x);
         }
 
+        // shared_pseudocode.html#impl-shared.BitCount.1
+        public static int BitCount(Bits x) // returns the number of entries of x that are true
+        {
+            int N = x.Count;
+            // Running tally of the set bits seen so far.
+            int result = 0;
+            // Visit every index 0..N-1 (inclusive bound form; presumably mirrors the referenced pseudocode).
+            for (int i = 0; i <= N - 1; i++)
+            {
+                if (x[i])
+                {
+                    result = result + 1;
+                }
+            }
+
+            return result;
+        }
+
         // shared_pseudocode.html#impl-shared.CountLeadingSignBits.1
         public static int CountLeadingSignBits(Bits x)
         {