/*
 * Argon2 reference source code package - reference C implementations
 *
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
 * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms
 * of these licenses can be found at:
 *
 * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
 * - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
 *
 * You should have received a copy of both of these licenses along with this
 * software. If not, they may be obtained at the above URLs.
 */

#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H

#include "blake2-impl.h"

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#endif

#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
#include <x86intrin.h>
#endif

#if !defined(__AVX512F__)
#if !defined(__AVX2__)
#if !defined(__XOP__)
#if defined(__SSSE3__)
#define r16 \
    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
    (-(c) == 32) \
        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
        : (-(c) == 24) \
              ? _mm_shuffle_epi8((x), r24) \
              : (-(c) == 16) \
                    ? _mm_shuffle_epi8((x), r16) \
                    : (-(c) == 63) \
                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                          _mm_add_epi64((x), (x))) \
                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                          _mm_slli_epi64((x), 64 - (-(c))))
#else /* defined(__SSE2__) */
#define _mm_roti_epi64(r, c) \
    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#endif
#else
#endif

static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
    const __m128i z = _mm_mul_epu32(x, y);
    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
        \
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
        \
        D0 = _mm_roti_epi64(D0, -32); \
        D1 = _mm_roti_epi64(D1, -32); \
        \
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
        \
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
        \
        B0 = _mm_roti_epi64(B0, -24); \
        B1 = _mm_roti_epi64(B1, -24); \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
        \
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
        \
        D0 = _mm_roti_epi64(D0, -16); \
        D1 = _mm_roti_epi64(D1, -16); \
        \
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
        \
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
        \
        B0 = _mm_roti_epi64(B0, -63); \
        B1 = _mm_roti_epi64(B1, -63); \
    } while ((void)0, 0)

#if defined(__SSSE3__)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
        B0 = t0; \
        B1 = t1; \
        \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        \
        t0 = _mm_alignr_epi8(D1, D0, 8); \
        t1 = _mm_alignr_epi8(D0, D1, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)

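/*
 * DIAGONALIZE above rotates rows B, C and D of the 4x4 matrix of 64-bit
 * words (each row held as a pair of 128-bit registers) so that the second
 * G1/G2 pass of a round mixes diagonals instead of columns: rows B and D are
 * rotated by one 64-bit word in opposite directions via _mm_alignr_epi8, and
 * the two halves of row C are swapped. UNDIAGONALIZE below applies the
 * inverse permutation.
 */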
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
        B0 = t0; \
        B1 = t1; \
        \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        \
        t0 = _mm_alignr_epi8(D0, D1, 8); \
        t1 = _mm_alignr_epi8(D1, D0, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = D0; \
        __m128i t1 = B0; \
        D0 = C0; \
        C0 = C1; \
        C1 = D0; \
        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0, t1; \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        t0 = B0; \
        t1 = D0; \
        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)
#endif

#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)
#else /* __AVX2__ */

#include <immintrin.h>

#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) \
    _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, \
                                            13, 14, 15, 8, 9, 10, 3, 4, 5, 6, \
                                            7, 0, 1, 2, 11, 12, 13, 14, 15, \
                                            8, 9, 10))
#define rotr16(x) \
    _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, \
                                            12, 13, 14, 15, 8, 9, 2, 3, 4, 5, \
                                            6, 7, 0, 1, 10, 11, 12, 13, 14, \
                                            15, 8, 9))
#define rotr63(x) \
    _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while ((void)0, 0);

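/*
 * The multiply-add sequence in G1_AVX2 (and G2_AVX2 below) is the BlaMka
 * mixing function written out inline: _mm256_mul_epu32 multiplies the low 32
 * bits of each 64-bit lane, the following add doubles the product, and the
 * result is folded into a + b. Per 64-bit lane this corresponds to the scalar
 * sketch
 *
 *     a = a + b + 2 * (uint64_t)(uint32_t)a * (uint32_t)b;
 *
 * i.e. the same operation fBlaMka implements on 128-bit registers in the
 * SSE path above.
 */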
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while ((void)0, 0);

#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0);

#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while ((void)0, 0);

#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0);

#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2, 3, 0, 1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2, 3, 0, 1)); \
    } while ((void)0, 0);

#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while ((void)0, 0);

#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while ((void)0, 0);

#endif /* __AVX2__ */

#else /* __AVX512F__ */

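/*
 * AVX-512F path: 64-bit rotations use the native _mm512_ror_epi64, and
 * muladd() is the 512-bit BlaMka multiply-add (a + b + 2 * lo32(a) * lo32(b)
 * per 64-bit lane). The generic BLAKE2_ROUND below is wrapped by
 * BLAKE2_ROUND_1 and BLAKE2_ROUND_2, which differ in how they pre- and
 * post-permute the 64-bit words of the state.
 */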
#include <immintrin.h>

#define ror64(x, n) _mm512_ror_epi64((x), (n))

static __m512i muladd(__m512i x, __m512i y)
{
    __m512i z = _mm512_mul_epu32(x, y);
    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
        \
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
        \
        D0 = ror64(D0, 32); \
        D1 = ror64(D1, 32); \
        \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
        \
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
        \
        B0 = ror64(B0, 24); \
        B1 = ror64(B1, 24); \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
        \
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
        \
        D0 = ror64(D0, 16); \
        D1 = ror64(D1, 16); \
        \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
        \
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
        \
        B0 = ror64(B0, 63); \
        B1 = ror64(B1, 63); \
    } while ((void)0, 0)

#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        \
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        \
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0)

#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)

#define SWAP_HALVES(A0, A1) \
    do { \
        __m512i t0, t1; \
        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
        A0 = t0; \
        A1 = t1; \
    } while ((void)0, 0)

#define SWAP_QUARTERS(A0, A1) \
    do { \
        SWAP_HALVES(A0, A1); \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
    } while ((void)0, 0)

#define UNSWAP_QUARTERS(A0, A1) \
    do { \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
        SWAP_HALVES(A0, A1); \
    } while ((void)0, 0)

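/*
 * BLAKE2_ROUND_1 and BLAKE2_ROUND_2 permute the 64-bit words across each
 * register pair before and after the shared BLAKE2_ROUND macro, and differ
 * only in the permutation used: SWAP_HALVES gathers the low 256-bit halves
 * of a pair into the first register and the high halves into the second,
 * while SWAP_QUARTERS additionally reorders the 128-bit quarters within
 * each register. This lets the same round macro serve two different word
 * layouts of the input state.
 */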
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
    do { \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
    } while ((void)0, 0)

#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        SWAP_QUARTERS(A0, A1); \
        SWAP_QUARTERS(B0, B1); \
        SWAP_QUARTERS(C0, C1); \
        SWAP_QUARTERS(D0, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        UNSWAP_QUARTERS(A0, A1); \
        UNSWAP_QUARTERS(B0, B1); \
        UNSWAP_QUARTERS(C0, C1); \
        UNSWAP_QUARTERS(D0, D1); \
    } while ((void)0, 0)

#endif /* __AVX512F__ */
#endif /* BLAKE_ROUND_MKA_OPT_H */