From a0d79770e968c3159fab2e209bc0b9ed2d1c1e7c Mon Sep 17 00:00:00 2001 From: Manuel Thalmann Date: Tue, 19 Dec 2023 17:55:24 +0100 Subject: [PATCH] Implement intrinsic `AES` --- aes-intrinsic/aes.cpp | 213 ++++++++++++------------------------------ 1 file changed, 62 insertions(+), 151 deletions(-) diff --git a/aes-intrinsic/aes.cpp b/aes-intrinsic/aes.cpp index 484d4ff..ce8b2d3 100644 --- a/aes-intrinsic/aes.cpp +++ b/aes-intrinsic/aes.cpp @@ -1,10 +1,14 @@ #include +#include #include #include #include +#include #include #include #include +#include +#include /* AES-128 simple implementation template and testing */ @@ -15,90 +19,28 @@ AES specification: http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf */ -#define WORD(byte0, byte1, byte2, byte3) ((((((uint16_t)(byte3 << 8) | byte2) << 8) | byte1) << 8) | byte0) -#define WBYTE(value, position) (((uint32_t)value >> (position * 8)) & 0xFF) - /* AES Constants */ // AES polynomial const uint16_t POLYNOMIAL = 0b100011011; -// forward sbox -const uint8_t SBOX[256] = { - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 -}; +__m128i computeKey(__m128i key, __m128i expansionSource) { + __m128i tmp1 = _mm_shuffle_epi32(expansionSource, 0xFF); + __m128i tmp2; -// T-boxes -uint32_t T0[256]; -uint32_t T1[256]; -uint32_t T2[256]; -uint32_t T3[256]; - -const uint8_t rCon[12] = { - 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, -}; - -/* AES state type */ -typedef uint32_t t_state[4]; - -void hexprint16(uint8_t *p) { - for (int i = 0; i < 16; i++) - printf("%02hhx ", p[i]); - puts(""); + _mm_storeu_si128(&tmp2, key); + key = _mm_slli_si128(key, 1 * 4); + tmp2 = _mm_xor_si128(tmp2, key); + key = _mm_slli_si128(key, 1 * 4); + tmp2 = _mm_xor_si128(tmp2, key); + key = _mm_slli_si128(key, 1 * 4); + tmp2 = _mm_xor_si128(tmp2, key); + key = _mm_slli_si128(key, 1 * 4); + tmp2 = _mm_xor_si128(tmp2, key); + return tmp1 ^ tmp2; } -// **************** AES functions **************** -uint32_t subWord(uint32_t w) { - return WORD(SBOX[WBYTE(w, 0)], SBOX[WBYTE(w, 1)], SBOX[WBYTE(w, 2)], SBOX[WBYTE(w, 3)]); -} - -void subBytes(t_state s) { - for (uint8_t i = 0; i < 4; i++) { - s[i] = subWord(s[i]); - } -} - - -void shiftRows(t_state s) { - for (uint8_t i = 0; i < 4; i++) { - uint32_t mask = 0xFF << (i * 8); - - for (uint8_t shiftCount = 0; shiftCount < i; shiftCount++) { - for (uint8_t currentByte = 0; currentByte < 3; currentByte++) { - // Swap s[currentByte] and s[currentByte + 1] - s[currentByte] = s[currentByte] ^ (mask & s[currentByte + 1]); - s[currentByte + 1] = s[currentByte + 1] ^ (mask & s[currentByte]); - s[currentByte] = s[currentByte] ^ (mask & s[currentByte + 1]); - } - } - } -} - -uint8_t xtime(uint8_t a) { - uint8_t mask; - - if (a & 0b10000000) { - mask = POLYNOMIAL & 0xFF; - } - else { - mask = 0x00; - } - - return ((a << 1) ^ mask) & 0xFF; +void addKey(uint8_t index, __m128i expKey[11], __m128i expSource) { + _mm_storeu_si128(&expKey[index], computeKey(expKey[index - 1], expSource)); } /* @@ -107,73 +49,52 @@ uint8_t xtime(uint8_t a) { * each round key is 4*32b */ // Taken from: https://www.brainkart.com/article/AES-Key-Expansion_8410/ -void expandKey(uint8_t k[16], uint32_t ek[44]) { - for (uint8_t i = 0; i < 4; i++) { - ek[i] = WORD(k[i * 4], k[i * 4 + 1], k[i * 4 + 2], k[i * 4 + 3]); - } +void expandKey(__m128i key, __m128i expKey[11]) { + __m128i expSource; + _mm_storeu_si128(&expKey[0], key); - for (uint8_t i = 4; i < 44; i++) { - uint32_t key = ek[i - 1]; - - if (i % 4 == 0) { - key = (key >> 8) | (key << 24); - key = subWord(key) ^ rCon[i / 4]; - } - - ek[i] = ek[i - 4] ^ key; - } + expSource = _mm_aeskeygenassist_si128(expKey[0], 0x01); + addKey(1, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[1], 0x02); + addKey(2, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[2], 0x04); + addKey(3, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[3], 0x08); + addKey(4, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[4], 0x10); + addKey(5, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[5], 0x20); + addKey(6, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[6], 0x40); + addKey(7, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[7], 0x80); + addKey(8, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[8], 0x1B); + addKey(9, expKey, expSource); + expSource = _mm_aeskeygenassist_si128(expKey[9], 0x36); + addKey(10, expKey, expSource); } - -/* Adding expanded round key (prepared before) */ -void addRoundKey(t_state s, uint32_t ek[], short round) { - s[0] ^= ek[round]; - s[1] ^= ek[round + 1]; - s[2] ^= ek[round + 2]; - s[3] ^= ek[round + 3]; -} - -void aes(uint8_t *in, uint8_t *out, uint8_t *skey) +void aes(__m128i *value, __m128i key) { //... Initialize ... - unsigned short round = 0; + __m128i expKey[11]; + __m128i tmp = _mm_load_si128(value); - t_state state; + expandKey(key, expKey); - for (uint8_t i = 0; i < 4; i++) { - state[i] = WORD(in[i * 4], in[i * 4 + 1], in[i * 4 + 2], in[i * 4 + 3]); - } - - uint32_t expKey[11 * 4]; - - expandKey(skey, expKey); - addRoundKey(state, expKey, 0); - - for (int i = 1; i < 10; i++) { - t_state tmp; - - for (int j = 0; j < 4; j++) { - tmp[j] = - T0[WBYTE(state[j], 0)] ^ - T1[WBYTE(state[(j + 1) % 4], 1)] ^ - T2[WBYTE(state[(j + 2) % 4], 2)] ^ - T3[WBYTE(state[(j + 3) % 4], 3)]; - } - - memcpy(state, tmp, sizeof(t_state)); - addRoundKey(state, expKey, 4 * i); - } - - subBytes(state); - shiftRows(state); - addRoundKey(state, expKey, 40); - - for (int i = 0; i < 16; i++) { - if (i < 4) out[i] = WBYTE(state[0], i % 4); - else if (i < 8) out[i] = WBYTE(state[1], i % 4); - else if (i < 12) out[i] = WBYTE(state[2], i % 4); - else out[i] = WBYTE(state[3], i % 4); - } + tmp = _mm_xor_si128(tmp, expKey[0]); + tmp = _mm_aesenc_si128(tmp, expKey[1]); + tmp = _mm_aesenc_si128(tmp, expKey[2]); + tmp = _mm_aesenc_si128(tmp, expKey[3]); + tmp = _mm_aesenc_si128(tmp, expKey[4]); + tmp = _mm_aesenc_si128(tmp, expKey[5]); + tmp = _mm_aesenc_si128(tmp, expKey[6]); + tmp = _mm_aesenc_si128(tmp, expKey[7]); + tmp = _mm_aesenc_si128(tmp, expKey[8]); + tmp = _mm_aesenc_si128(tmp, expKey[9]); + tmp = _mm_aesenclast_si128(tmp, expKey[10]); + _mm_storeu_si128(value, tmp); } //**************************** @@ -182,8 +103,8 @@ void aes(uint8_t *in, uint8_t *out, uint8_t *skey) int main(int argc, char* argv[]) { uint32_t cycles = 1000000; - uint8_t key[16] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }; - uint8_t in[16] = { 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89}; + __m128i key = _mm_setr_epi8(0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff); + __m128i value = _mm_setr_epi8(0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89); if (argc > 2) { std::cerr << "Invalid number of arguments\n"; @@ -192,20 +113,10 @@ int main(int argc, char* argv[]) cycles = std::atoi(argv[1]); } - for (int i = 0; i <= 0xFF; i++) { - uint8_t a1 = SBOX[i]; - uint8_t a2 = xtime(a1); - uint8_t a3 = a2 ^ a1; - T0[i] = WORD(a2, a1, a1, a3); - T1[i] = WORD(a3, a2, a1, a1); - T2[i] = WORD(a1, a3, a2, a1); - T3[i] = WORD(a1, a1, a3, a2); - } - const auto start{std::chrono::steady_clock::now()}; { for (int i = 0; i < cycles; i++) { - aes(in, in, key); + aes(&value, key); } } const auto end{std::chrono::steady_clock::now()}; @@ -214,5 +125,5 @@ int main(int argc, char* argv[]) std::cout << "AES (" << cycles << " runs)\nElapsed time: "; std::cout << milliseconds << "ms\n"; // Before C++20 - exit(in[0]); + exit(value[0]); }