diff --git a/aes-tboxes/aes.cpp b/aes-tboxes/aes.cpp
index 0e2cc68..81f4d2e 100644
--- a/aes-tboxes/aes.cpp
+++ b/aes-tboxes/aes.cpp
@@ -1,5 +1,7 @@
 #include <chrono>
+#include <cstdint>
 #include <cstdio>
+#include <cstring>
 #include <iostream>
 #include <stdint.h>
 #include <stdlib.h>
@@ -14,7 +16,7 @@ http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
 */
 
 #define WORD(byte0, byte1, byte2, byte3) ((((((uint16_t)(byte3 << 8) | byte2) << 8) | byte1) << 8) | byte0)
-#define WBYTE(value, position) ((value >> (position * 8)) & 0xFF)
+#define WBYTE(value, position) (((uint32_t)(value) >> ((position) * 8)) & 0xFF) /* byte `position` (0 = least significant) of `value` converted to uint32_t; arguments are parenthesized so expression arguments expand correctly */
 
 /* AES Constants */
 // AES polynomial
@@ -38,7 +40,13 @@ const uint8_t SBOX[256] = {
     0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
     0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
     0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
-}; 
+};
+
+// T-boxes: four 256-entry tables of 32-bit words, indexed by a single state byte. Each entry packs the S-box value of the index together with its xtime (x2) and x3 multiples; the tables are filled at startup in main() and must be populated before aes() reads them.
+uint32_t T0[256];
+uint32_t T1[256];
+uint32_t T2[256];
+uint32_t T3[256];
 
 const uint8_t rCon[12] = {
     0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
@@ -93,26 +101,6 @@ uint8_t xtime(uint8_t a) {
     return ((a << 1) ^ mask) & 0xFF;
 }
 
-// not mandatory - mix a single column
-uint32_t mixColumn(uint32_t c) {
-    uint32_t result = c;
-    uint8_t *source = (uint8_t*)(&c);
-    uint8_t *target = (uint8_t*)(&result);
-    uint8_t base = *source ^ *(source + 1) ^ *(source + 2) ^ *(source + 3);
-    *target ^= base ^ xtime(*source ^ *(source + 1));
-    *(target + 1) ^= base ^ xtime(*(source + 1) ^ *(source + 2));
-    *(target + 2) ^= base ^ xtime(*(source  + 2) ^ *(source + 3));
-    *(target + 3) ^= base ^ xtime(*(source + 3) ^ *source);
-    return result;
-}
-
-
-void mixColumns(t_state s) {
-    for (uint8_t i = 0; i < 4; i++) {
-        s[i] = mixColumn(s[i]);
-    }
-}
-
 /*
 * Key expansion from 128bits (4*32b)
 * to 11 round keys (11*4*32b)
@@ -159,21 +147,27 @@ void aes(uint8_t *in, uint8_t *out, uint8_t *skey)
     uint32_t expKey[11 * 4];
 
     expandKey(skey, expKey);
-
     addRoundKey(state, expKey, 0);
 
-    for (int i = 1; i <= 10; i++) {
-        subBytes(state);
+    for (int i = 1; i < 10; i++) {
+        t_state tmp;
 
-        shiftRows(state);
-
-        if (i < 10) {
-            mixColumns(state);
+        for (int j = 0; j < 4; j++) {
+            tmp[j] =
+                T0[WBYTE(state[j], 0)] ^
+                T1[WBYTE(state[(j + 1) % 4], 1)] ^
+                T2[WBYTE(state[(j + 2) % 4], 2)] ^
+                T3[WBYTE(state[(j + 3) % 4], 3)];
         }
 
-        addRoundKey(state, expKey, 4*i);
+        memcpy(state, tmp, sizeof(t_state));
+        addRoundKey(state, expKey, 4 * i);
     }
 
+    subBytes(state);
+    shiftRows(state);
+    addRoundKey(state, expKey, 40);
+
     for (int i = 0; i < 16; i++) {
         if (i < 4) out[i] = WBYTE(state[0], i % 4);
         else if (i < 8) out[i] = WBYTE(state[1], i % 4);
@@ -198,6 +192,16 @@ int main(int argc, char* argv[])
         cycles = std::atoi(argv[1]);
     }
 
+    for (int i = 0; i <= 0xFF; i++) {
+        uint8_t a1 = SBOX[i];
+        uint8_t a2 = xtime(a1);
+        uint8_t a3 = a2 ^ a1;
+        T0[i] = WORD(a2, a1, a1, a3);
+        T1[i] = WORD(a3, a2, a1, a1);
+        T2[i] = WORD(a1, a3, a2, a1);
+        T3[i] = WORD(a1, a1, a3, a2);
+    }
+
     const auto start{std::chrono::steady_clock::now()};
     {
         for (int i = 0; i < cycles; i++) {