/*===---- avx10_2niintrin.h - AVX10.2 new instruction intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx10_2niintrin.h> directly; include <immintrin.h> instead."
#endif

#ifdef __SSE2__

#ifndef __AVX10_2NIINTRIN_H
#define __AVX10_2NIINTRIN_H

#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(256)))

/* VNNI FP16 */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_dpph_ps(__m128 __W,
                                                            __m128h __A,
                                                            __m128h __B) {
  return (__m128)__builtin_ia32_vdpphps128((__v4sf)__W, (__v8hf)__A,
                                           (__v8hf)__B);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_dpph_ps(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_dpph_ps(__W, __A, __B), (__v4sf)__W);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_dpph_ps(__mmask8 __U,
                                                                  __m128 __W,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
                                             (__v4sf)_mm_dpph_ps(__W, __A, __B),
                                             (__v4sf)_mm_setzero_ps());
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_dpph_ps(__m256 __W,
                                                               __m256h __A,
                                                               __m256h __B) {
  return (__m256)__builtin_ia32_vdpphps256((__v8sf)__W, (__v16hf)__A,
                                           (__v16hf)__B);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_dpph_ps(__m256 __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B), (__v8sf)__W);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_dpph_ps(__mmask8 __U, __m256 __W, __m256h __A, __m256h __B) {
  return (__m256)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_dpph_ps(__W, __A, __B),
      (__v8sf)_mm256_setzero_ps());
}

/* VMPSADBW */
#define _mm_mask_mpsadbw_epu8(W, U, A, B, imm)                                 \
  ((__m128i)__builtin_ia32_selectw_128(                                        \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)),                \
      (__v8hi)(__m128i)(W)))

#define _mm_maskz_mpsadbw_epu8(U, A, B, imm)                                   \
  ((__m128i)__builtin_ia32_selectw_128(                                        \
      (__mmask8)(U), (__v8hi)_mm_mpsadbw_epu8((A), (B), (imm)),                \
      (__v8hi)_mm_setzero_si128()))

#define _mm256_mask_mpsadbw_epu8(W, U, A, B, imm)                              \
  ((__m256i)__builtin_ia32_selectw_256(                                        \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
      (__v16hi)(__m256i)(W)))

#define _mm256_maskz_mpsadbw_epu8(U, A, B, imm)                                \
  ((__m256i)__builtin_ia32_selectw_256(                                        \
      (__mmask16)(U), (__v16hi)_mm256_mpsadbw_epu8((A), (B), (imm)),           \
      (__v16hi)_mm256_setzero_si256()))

/* VNNI INT8 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssd_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B), (__v4si)__W);
}

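/* Illustrative usage sketch, not part of the upstream header: the masked
 * wrappers above follow the usual AVX-512 convention, i.e. merge-masking
 * keeps the accumulator lanes whose mask bit is clear. Hypothetical example,
 * assuming a compiler and target with AVX10.2/256 enabled:
 *
 *   __m128  acc = _mm_setzero_ps();
 *   __m128h a = _mm_set1_ph((_Float16)1.0);
 *   __m128h b = _mm_set1_ph((_Float16)2.0);
 *   // Lanes 0 and 2 become acc + a[2i]*b[2i] + a[2i+1]*b[2i+1] = 4.0;
 *   // lanes 1 and 3 keep the old acc value (0.0).
 *   acc = _mm_mask_dpph_ps(acc, 0x5, a, b);
 */
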
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssd_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssd_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssd_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbssd_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssd_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbssds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbssds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbssds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbssds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbssds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbssds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbsud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbsuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbsuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbsuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

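/* Illustrative note, not part of the upstream header: _mm_dpbssd_epi32 and
 * the other VNNI INT8 forms accumulate four byte products per 32-bit lane;
 * the *_mask_* wrappers merge with the accumulator, the *_maskz_* wrappers
 * zero the unselected lanes, and the *ds variants saturate. Hypothetical
 * sketch:
 *
 *   __m128i acc = _mm_set1_epi32(10);
 *   __m128i a = _mm_set1_epi8(3), b = _mm_set1_epi8(-2);
 *   // Lanes 0-1: 10 + 4*(3 * -2) = -14; lanes 2-3 keep 10.
 *   __m128i m = _mm_mask_dpbssd_epi32(acc, 0x3, a, b);
 *   // Lanes 0-1: -14; lanes 2-3 are zeroed.
 *   __m128i z = _mm_maskz_dpbssd_epi32(0x3, acc, a, b);
 */
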
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbsuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbsuds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbsuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuud_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuud_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuud_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuud_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbuud_epi32(__mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuud_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbuuds_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B), (__v4si)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbuuds_epi32(__mmask8 __U, __m128i __W, __m128i __A, __m128i __B) {
  return (__m128i)__builtin_ia32_selectd_128(
      __U, (__v4si)_mm_dpbuuds_epi32(__W, __A, __B),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbuuds_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B), (__v8si)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbuuds_epi32(
    __mmask8 __U, __m256i __W, __m256i __A, __m256i __B) {
  return (__m256i)__builtin_ia32_selectd_256(
      __U, (__v8si)_mm256_dpbuuds_epi32(__W, __A, __B),
      (__v8si)_mm256_setzero_si256());
}

/* VNNI INT16 */
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C), (__v4si)__A);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
      (__v4si)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), (__v8si)__A);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_selectd_256( 271 (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C), 272 (__v8si)_mm256_setzero_si256()); 273 } 274 275 static __inline__ __m128i __DEFAULT_FN_ATTRS128 276 _mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 277 return (__m128i)__builtin_ia32_selectd_128( 278 (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), (__v4si)__A); 279 } 280 281 static __inline__ __m128i __DEFAULT_FN_ATTRS128 282 _mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 283 return (__m128i)__builtin_ia32_selectd_128( 284 (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C), 285 (__v4si)_mm_setzero_si128()); 286 } 287 288 static __inline__ __m256i __DEFAULT_FN_ATTRS256 289 _mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 290 return (__m256i)__builtin_ia32_selectd_256( 291 (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), (__v8si)__A); 292 } 293 294 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32( 295 __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 296 return (__m256i)__builtin_ia32_selectd_256( 297 (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C), 298 (__v8si)_mm256_setzero_si256()); 299 } 300 301 static __inline__ __m128i __DEFAULT_FN_ATTRS128 302 _mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 303 return (__m128i)__builtin_ia32_selectd_128( 304 (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), (__v4si)__A); 305 } 306 307 static __inline__ __m128i __DEFAULT_FN_ATTRS128 308 _mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 309 return (__m128i)__builtin_ia32_selectd_128( 310 (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C), 311 (__v4si)_mm_setzero_si128()); 312 } 313 314 static __inline__ __m256i __DEFAULT_FN_ATTRS256 315 _mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 316 return (__m256i)__builtin_ia32_selectd_256( 317 (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), (__v8si)__A); 318 } 319 320 static __inline__ __m256i __DEFAULT_FN_ATTRS256 321 _mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 322 return (__m256i)__builtin_ia32_selectd_256( 323 (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C), 324 (__v8si)_mm256_setzero_si256()); 325 } 326 327 static __inline__ __m128i __DEFAULT_FN_ATTRS128 328 _mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 329 return (__m128i)__builtin_ia32_selectd_128( 330 (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), (__v4si)__A); 331 } 332 333 static __inline__ __m128i __DEFAULT_FN_ATTRS128 334 _mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 335 return (__m128i)__builtin_ia32_selectd_128( 336 (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C), 337 (__v4si)_mm_setzero_si128()); 338 } 339 340 static __inline__ __m256i __DEFAULT_FN_ATTRS256 341 _mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 342 return (__m256i)__builtin_ia32_selectd_256( 343 (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), (__v8si)__A); 344 } 345 346 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32( 347 __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 348 return (__m256i)__builtin_ia32_selectd_256( 349 (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C), 350 (__v8si)_mm256_setzero_si256()); 351 } 352 353 static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 354 _mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 355 return (__m128i)__builtin_ia32_selectd_128( 356 (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), (__v4si)__A); 357 } 358 359 static __inline__ __m128i __DEFAULT_FN_ATTRS128 360 _mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 361 return (__m128i)__builtin_ia32_selectd_128( 362 (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C), 363 (__v4si)_mm_setzero_si128()); 364 } 365 366 static __inline__ __m256i __DEFAULT_FN_ATTRS256 367 _mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 368 return (__m256i)__builtin_ia32_selectd_256( 369 (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), (__v8si)__A); 370 } 371 372 static __inline__ __m256i __DEFAULT_FN_ATTRS256 373 _mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 374 return (__m256i)__builtin_ia32_selectd_256( 375 (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C), 376 (__v8si)_mm256_setzero_si256()); 377 } 378 379 static __inline__ __m128i __DEFAULT_FN_ATTRS128 380 _mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 381 return (__m128i)__builtin_ia32_selectd_128( 382 (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), (__v4si)__A); 383 } 384 385 static __inline__ __m128i __DEFAULT_FN_ATTRS128 386 _mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) { 387 return (__m128i)__builtin_ia32_selectd_128( 388 (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C), 389 (__v4si)_mm_setzero_si128()); 390 } 391 392 static __inline__ __m256i __DEFAULT_FN_ATTRS256 393 _mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 394 return (__m256i)__builtin_ia32_selectd_256( 395 (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), (__v8si)__A); 396 } 397 398 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32( 399 __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) { 400 return (__m256i)__builtin_ia32_selectd_256( 401 (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C), 402 (__v8si)_mm256_setzero_si256()); 403 } 404 405 /* YMM Rounding */ 406 #define _mm256_add_round_pd(A, B, R) \ 407 ((__m256d)__builtin_ia32_vaddpd256_round((__v4df)(__m256d)(A), \ 408 (__v4df)(__m256d)(B), (int)(R))) 409 410 #define _mm256_mask_add_round_pd(W, U, A, B, R) \ 411 ((__m256d)__builtin_ia32_selectpd_256( \ 412 (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \ 413 (__v4df)(__m256d)(W))) 414 415 #define _mm256_maskz_add_round_pd(U, A, B, R) \ 416 ((__m256d)__builtin_ia32_selectpd_256( \ 417 (__mmask8)(U), (__v4df)_mm256_add_round_pd((A), (B), (R)), \ 418 (__v4df)_mm256_setzero_pd())) 419 420 #define _mm256_add_round_ph(A, B, R) \ 421 ((__m256h)__builtin_ia32_vaddph256_round((__v16hf)(__m256h)(A), \ 422 (__v16hf)(__m256h)(B), (int)(R))) 423 424 #define _mm256_mask_add_round_ph(W, U, A, B, R) \ 425 ((__m256h)__builtin_ia32_selectph_256( \ 426 (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \ 427 (__v16hf)(__m256h)(W))) 428 429 #define _mm256_maskz_add_round_ph(U, A, B, R) \ 430 ((__m256h)__builtin_ia32_selectph_256( \ 431 (__mmask16)(U), (__v16hf)_mm256_add_round_ph((A), (B), (R)), \ 432 (__v16hf)_mm256_setzero_ph())) 433 434 #define _mm256_add_round_ps(A, B, R) \ 435 ((__m256)__builtin_ia32_vaddps256_round((__v8sf)(__m256)(A), \ 436 (__v8sf)(__m256)(B), (int)(R))) 437 438 #define _mm256_mask_add_round_ps(W, U, A, B, R) \ 439 
((__m256)__builtin_ia32_selectps_256( \ 440 (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \ 441 (__v8sf)(__m256)(W))) 442 443 #define _mm256_maskz_add_round_ps(U, A, B, R) \ 444 ((__m256)__builtin_ia32_selectps_256( \ 445 (__mmask8)(U), (__v8sf)_mm256_add_round_ps((A), (B), (R)), \ 446 (__v8sf)_mm256_setzero_ps())) 447 448 #define _mm256_cmp_round_pd_mask(A, B, P, R) \ 449 ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \ 450 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)-1, \ 451 (int)(R))) 452 453 #define _mm256_mask_cmp_round_pd_mask(U, A, B, P, R) \ 454 ((__mmask8)__builtin_ia32_vcmppd256_round_mask( \ 455 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(P), (__mmask8)(U), \ 456 (int)(R))) 457 458 #define _mm256_cmp_round_ph_mask(A, B, P, R) \ 459 ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \ 460 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)-1, \ 461 (int)(R))) 462 463 #define _mm256_mask_cmp_round_ph_mask(U, A, B, P, R) \ 464 ((__mmask16)__builtin_ia32_vcmpph256_round_mask( \ 465 (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(P), (__mmask16)(U), \ 466 (int)(R))) 467 468 #define _mm256_cmp_round_ps_mask(A, B, P, R) \ 469 ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \ 470 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)-1, \ 471 (int)(R))) 472 473 #define _mm256_mask_cmp_round_ps_mask(U, A, B, P, R) \ 474 ((__mmask8)__builtin_ia32_vcmpps256_round_mask( \ 475 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(P), (__mmask8)(U), \ 476 (int)(R))) 477 478 #define _mm256_cvt_roundepi32_ph(A, R) \ 479 ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \ 480 (__v8si)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 481 482 #define _mm256_mask_cvt_roundepi32_ph(W, U, A, R) \ 483 ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask((__v8si)(A), (__v8hf)(W), \ 484 (__mmask8)(U), (int)(R))) 485 486 #define _mm256_maskz_cvt_roundepi32_ph(U, A, R) \ 487 ((__m128h)__builtin_ia32_vcvtdq2ph256_round_mask( \ 488 (__v8si)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 489 490 #define _mm256_cvt_roundepi32_ps(A, R) \ 491 ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \ 492 (__v8sf)_mm256_setzero_ps(), \ 493 (__mmask8)-1, (int)(R))) 494 495 #define _mm256_mask_cvt_roundepi32_ps(W, U, A, R) \ 496 ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask( \ 497 (__v8si)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R))) 498 499 #define _mm256_maskz_cvt_roundepi32_ps(U, A, R) \ 500 ((__m256)__builtin_ia32_vcvtdq2ps256_round_mask((__v8si)(__m256i)(A), \ 501 (__v8sf)_mm256_setzero_ps(), \ 502 (__mmask8)(U), (int)(R))) 503 504 #define _mm256_cvt_roundpd_epi32(A, R) \ 505 ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \ 506 (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \ 507 (int)(R))) 508 509 #define _mm256_mask_cvt_roundpd_epi32(W, U, A, R) \ 510 ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \ 511 (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R))) 512 513 #define _mm256_maskz_cvt_roundpd_epi32(U, A, R) \ 514 ((__m128i)__builtin_ia32_vcvtpd2dq256_round_mask( \ 515 (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \ 516 (int)(R))) 517 518 #define _mm256_cvt_roundpd_ph(A, R) \ 519 ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \ 520 (__v4df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 521 522 #define _mm256_mask_cvt_roundpd_ph(W, U, A, R) \ 523 ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask((__v4df)(A), (__v8hf)(W), \ 524 
(__mmask8)(U), (int)(R))) 525 526 #define _mm256_maskz_cvt_roundpd_ph(U, A, R) \ 527 ((__m128h)__builtin_ia32_vcvtpd2ph256_round_mask( \ 528 (__v4df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 529 530 #define _mm256_cvt_roundpd_ps(A, R) \ 531 ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \ 532 (__v4df)(__m256d)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R))) 533 534 #define _mm256_mask_cvt_roundpd_ps(W, U, A, R) \ 535 ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask( \ 536 (__v4df)(__m256d)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) 537 538 #define _mm256_maskz_cvt_roundpd_ps(U, A, R) \ 539 ((__m128)__builtin_ia32_vcvtpd2ps256_round_mask((__v4df)(__m256d)(A), \ 540 (__v4sf)_mm_setzero_ps(), \ 541 (__mmask8)(U), (int)(R))) 542 543 #define _mm256_cvt_roundpd_epi64(A, R) \ 544 ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \ 545 (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ 546 (int)(R))) 547 548 #define _mm256_mask_cvt_roundpd_epi64(W, U, A, R) \ 549 ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \ 550 (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R))) 551 552 #define _mm256_maskz_cvt_roundpd_epi64(U, A, R) \ 553 ((__m256i)__builtin_ia32_vcvtpd2qq256_round_mask( \ 554 (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ 555 (int)(R))) 556 557 #define _mm256_cvt_roundpd_epu32(A, R) \ 558 ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \ 559 (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)-1, \ 560 (int)(R))) 561 562 #define _mm256_mask_cvt_roundpd_epu32(W, U, A, R) \ 563 ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \ 564 (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R))) 565 566 #define _mm256_maskz_cvt_roundpd_epu32(U, A, R) \ 567 ((__m128i)__builtin_ia32_vcvtpd2udq256_round_mask( \ 568 (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \ 569 (int)(R))) 570 571 #define _mm256_cvt_roundpd_epu64(A, R) \ 572 ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \ 573 (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ 574 (int)(R))) 575 576 #define _mm256_mask_cvt_roundpd_epu64(W, U, A, R) \ 577 ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \ 578 (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) 579 580 #define _mm256_maskz_cvt_roundpd_epu64(U, A, R) \ 581 ((__m256i)__builtin_ia32_vcvtpd2uqq256_round_mask( \ 582 (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ 583 (int)(R))) 584 585 #define _mm256_cvt_roundph_epi32(A, R) \ 586 ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \ 587 (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \ 588 (int)(R))) 589 590 #define _mm256_mask_cvt_roundph_epi32(W, U, A, R) \ 591 ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \ 592 (__mmask8)(U), (int)(R))) 593 594 #define _mm256_maskz_cvt_roundph_epi32(U, A, R) \ 595 ((__m256i)__builtin_ia32_vcvtph2dq256_round_mask( \ 596 (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 597 598 #define _mm256_cvt_roundph_pd(A, R) \ 599 ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \ 600 (__v8hf)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)(-1), (int)(R))) 601 602 #define _mm256_mask_cvt_roundph_pd(W, U, A, R) \ 603 ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask((__v8hf)(A), (__v4df)(W), \ 604 (__mmask8)(U), (int)(R))) 605 606 #define _mm256_maskz_cvt_roundph_pd(U, A, R) \ 607 ((__m256d)__builtin_ia32_vcvtph2pd256_round_mask( \ 608 (__v8hf)(A), 
(__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R))) 609 610 #define _mm256_cvtx_roundph_ps(A, R) \ 611 ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \ 612 (__v8hf)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)(-1), (int)(R))) 613 614 #define _mm256_mask_cvtx_roundph_ps(W, U, A, R) \ 615 ((__m256)__builtin_ia32_vcvtph2psx256_round_mask((__v8hf)(A), (__v8sf)(W), \ 616 (__mmask8)(U), (int)(R))) 617 618 #define _mm256_maskz_cvtx_roundph_ps(U, A, R) \ 619 ((__m256)__builtin_ia32_vcvtph2psx256_round_mask( \ 620 (__v8hf)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R))) 621 622 #define _mm256_cvt_roundph_epi64(A, R) \ 623 ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \ 624 (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \ 625 (int)(R))) 626 627 #define _mm256_mask_cvt_roundph_epi64(W, U, A, R) \ 628 ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \ 629 (__mmask8)(U), (int)(R))) 630 631 #define _mm256_maskz_cvt_roundph_epi64(U, A, R) \ 632 ((__m256i)__builtin_ia32_vcvtph2qq256_round_mask( \ 633 (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 634 635 #define _mm256_cvt_roundph_epu32(A, R) \ 636 ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \ 637 (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \ 638 (int)(R))) 639 640 #define _mm256_mask_cvt_roundph_epu32(W, U, A, R) \ 641 ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \ 642 (__mmask8)(U), (int)(R))) 643 644 #define _mm256_maskz_cvt_roundph_epu32(U, A, R) \ 645 ((__m256i)__builtin_ia32_vcvtph2udq256_round_mask( \ 646 (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 647 648 #define _mm256_cvt_roundph_epu64(A, R) \ 649 ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \ 650 (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \ 651 (int)(R))) 652 653 #define _mm256_mask_cvt_roundph_epu64(W, U, A, R) \ 654 ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \ 655 (__mmask8)(U), (int)(R))) 656 657 #define _mm256_maskz_cvt_roundph_epu64(U, A, R) \ 658 ((__m256i)__builtin_ia32_vcvtph2uqq256_round_mask( \ 659 (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 660 661 #define _mm256_cvt_roundph_epu16(A, R) \ 662 ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \ 663 (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \ 664 (int)(R))) 665 666 #define _mm256_mask_cvt_roundph_epu16(W, U, A, R) \ 667 ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask((__v16hf)(A), (__v16hu)(W), \ 668 (__mmask16)(U), (int)(R))) 669 670 #define _mm256_maskz_cvt_roundph_epu16(U, A, R) \ 671 ((__m256i)__builtin_ia32_vcvtph2uw256_round_mask( \ 672 (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ 673 (int)(R))) 674 675 #define _mm256_cvt_roundph_epi16(A, R) \ 676 ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \ 677 (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \ 678 (int)(R))) 679 680 #define _mm256_mask_cvt_roundph_epi16(W, U, A, R) \ 681 ((__m256i)__builtin_ia32_vcvtph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \ 682 (__mmask16)(U), (int)(R))) 683 684 #define _mm256_maskz_cvt_roundph_epi16(U, A, R) \ 685 ((__m256i)__builtin_ia32_vcvtph2w256_round_mask( \ 686 (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \ 687 (int)(R))) 688 689 #define _mm256_cvt_roundps_epi32(A, R) \ 690 ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask( \ 691 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \ 692 
      (int)(R)))

#define _mm256_mask_cvt_roundps_epi32(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
      (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi32(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2dq256_round_mask(                            \
      (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundps_pd(A, R)                                            \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1,        \
      (int)(R)))

#define _mm256_mask_cvt_roundps_pd(W, U, A, R)                                 \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_pd(U, A, R)                                   \
  ((__m256d)__builtin_ia32_vcvtps2pd256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U),         \
      (int)(R)))

#define _mm256_cvt_roundps_ph(A, I)                                            \
  ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I),    \
                                             (__v8hi)_mm_undefined_si128(),    \
                                             (__mmask8)-1))

/* FIXME: We may use these in the future.
#define _mm256_cvt_roundps_ph(A, I)                                            \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_undefined_si128(),            \
      (__mmask8)-1))
#define _mm256_mask_cvt_roundps_ph(U, W, A, I)                                 \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)(__m128i)(U), (__mmask8)(W)))
#define _mm256_maskz_cvt_roundps_ph(W, A, I)                                   \
  ((__m128i)__builtin_ia32_vcvtps2ph256_round_mask(                            \
      (__v8sf)(__m256)(A), (int)(I), (__v8hi)_mm_setzero_si128(),              \
      (__mmask8)(W))) */

#define _mm256_cvtx_roundps_ph(A, R)                                           \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask(                           \
      (__v8sf)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm256_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask((__v8sf)(A), (__v8hf)(W),  \
                                                    (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m128h)__builtin_ia32_vcvtps2phx256_round_mask(                           \
      (__v8sf)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_cvt_roundps_epi64(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epi64(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epi64(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2qq256_round_mask(                            \
      (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundps_epu32(A, R)                                         \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1,       \
      (int)(R)))

#define _mm256_mask_cvt_roundps_epu32(W, U, A, R)                              \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_cvt_roundps_epu32(U, A, R)                                \
  ((__m256i)__builtin_ia32_vcvtps2udq256_round_mask(                           \
      (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U),      \
      (int)(R)))

#define _mm256_cvt_roundps_epu64(A, R)                                         \
((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \ 777 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ 778 (int)(R))) 779 780 #define _mm256_mask_cvt_roundps_epu64(W, U, A, R) \ 781 ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \ 782 (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) 783 784 #define _mm256_maskz_cvt_roundps_epu64(U, A, R) \ 785 ((__m256i)__builtin_ia32_vcvtps2uqq256_round_mask( \ 786 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ 787 (int)(R))) 788 789 #define _mm256_cvt_roundepi64_pd(A, R) \ 790 ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \ 791 (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \ 792 (int)(R))) 793 794 #define _mm256_mask_cvt_roundepi64_pd(W, U, A, R) \ 795 ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \ 796 (__v4di)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) 797 798 #define _mm256_maskz_cvt_roundepi64_pd(U, A, R) \ 799 ((__m256d)__builtin_ia32_vcvtqq2pd256_round_mask( \ 800 (__v4di)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \ 801 (int)(R))) 802 803 #define _mm256_cvt_roundepi64_ph(A, R) \ 804 ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \ 805 (__v4di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 806 807 #define _mm256_mask_cvt_roundepi64_ph(W, U, A, R) \ 808 ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask((__v4di)(A), (__v8hf)(W), \ 809 (__mmask8)(U), (int)(R))) 810 811 #define _mm256_maskz_cvt_roundepi64_ph(U, A, R) \ 812 ((__m128h)__builtin_ia32_vcvtqq2ph256_round_mask( \ 813 (__v4di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 814 815 #define _mm256_cvt_roundepi64_ps(A, R) \ 816 ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \ 817 (__v4di)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R))) 818 819 #define _mm256_mask_cvt_roundepi64_ps(W, U, A, R) \ 820 ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask( \ 821 (__v4di)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) 822 823 #define _mm256_maskz_cvt_roundepi64_ps(U, A, R) \ 824 ((__m128)__builtin_ia32_vcvtqq2ps256_round_mask((__v4di)(__m256i)(A), \ 825 (__v4sf)_mm_setzero_ps(), \ 826 (__mmask8)(U), (int)(R))) 827 828 #define _mm256_cvtt_roundpd_epi32(A, R) \ 829 ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \ 830 (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)-1, \ 831 (int)(R))) 832 833 #define _mm256_mask_cvtt_roundpd_epi32(W, U, A, R) \ 834 ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \ 835 (__v4df)(__m256d)(A), (__v4si)(__m128i)(W), (__mmask8)(U), (int)(R))) 836 837 #define _mm256_maskz_cvtt_roundpd_epi32(U, A, R) \ 838 ((__m128i)__builtin_ia32_vcvttpd2dq256_round_mask( \ 839 (__v4df)(__m256d)(A), (__v4si)_mm_setzero_si128(), (__mmask8)(U), \ 840 (int)(R))) 841 842 #define _mm256_cvtt_roundpd_epi64(A, R) \ 843 ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \ 844 (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ 845 (int)(R))) 846 847 #define _mm256_mask_cvtt_roundpd_epi64(W, U, A, R) \ 848 ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \ 849 (__v4df)(__m256d)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R))) 850 851 #define _mm256_maskz_cvtt_roundpd_epi64(U, A, R) \ 852 ((__m256i)__builtin_ia32_vcvttpd2qq256_round_mask( \ 853 (__v4df)(__m256d)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ 854 (int)(R))) 855 856 #define _mm256_cvtt_roundpd_epu32(A, R) \ 857 ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \ 858 (__v4df)(__m256d)(A), 
(__v4su)_mm_setzero_si128(), (__mmask8)-1, \ 859 (int)(R))) 860 861 #define _mm256_mask_cvtt_roundpd_epu32(W, U, A, R) \ 862 ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \ 863 (__v4df)(__m256d)(A), (__v4su)(__m128i)(W), (__mmask8)(U), (int)(R))) 864 865 #define _mm256_maskz_cvtt_roundpd_epu32(U, A, R) \ 866 ((__m128i)__builtin_ia32_vcvttpd2udq256_round_mask( \ 867 (__v4df)(__m256d)(A), (__v4su)_mm_setzero_si128(), (__mmask8)(U), \ 868 (int)(R))) 869 870 #define _mm256_cvtt_roundpd_epu64(A, R) \ 871 ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \ 872 (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ 873 (int)(R))) 874 875 #define _mm256_mask_cvtt_roundpd_epu64(W, U, A, R) \ 876 ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \ 877 (__v4df)(__m256d)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) 878 879 #define _mm256_maskz_cvtt_roundpd_epu64(U, A, R) \ 880 ((__m256i)__builtin_ia32_vcvttpd2uqq256_round_mask( \ 881 (__v4df)(__m256d)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ 882 (int)(R))) 883 884 #define _mm256_cvtt_roundph_epi32(A, R) \ 885 ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \ 886 (__v8hf)(A), (__v8si)_mm256_undefined_si256(), (__mmask8)(-1), \ 887 (int)(R))) 888 889 #define _mm256_mask_cvtt_roundph_epi32(W, U, A, R) \ 890 ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask((__v8hf)(A), (__v8si)(W), \ 891 (__mmask8)(U), (int)(R))) 892 893 #define _mm256_maskz_cvtt_roundph_epi32(U, A, R) \ 894 ((__m256i)__builtin_ia32_vcvttph2dq256_round_mask( \ 895 (__v8hf)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 896 897 #define _mm256_cvtt_roundph_epi64(A, R) \ 898 ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \ 899 (__v8hf)(A), (__v4di)_mm256_undefined_si256(), (__mmask8)(-1), \ 900 (int)(R))) 901 902 #define _mm256_mask_cvtt_roundph_epi64(W, U, A, R) \ 903 ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask((__v8hf)(A), (__v4di)(W), \ 904 (__mmask8)(U), (int)(R))) 905 906 #define _mm256_maskz_cvtt_roundph_epi64(U, A, R) \ 907 ((__m256i)__builtin_ia32_vcvttph2qq256_round_mask( \ 908 (__v8hf)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 909 910 #define _mm256_cvtt_roundph_epu32(A, R) \ 911 ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \ 912 (__v8hf)(A), (__v8su)_mm256_undefined_si256(), (__mmask8)(-1), \ 913 (int)(R))) 914 915 #define _mm256_mask_cvtt_roundph_epu32(W, U, A, R) \ 916 ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask((__v8hf)(A), (__v8su)(W), \ 917 (__mmask8)(U), (int)(R))) 918 919 #define _mm256_maskz_cvtt_roundph_epu32(U, A, R) \ 920 ((__m256i)__builtin_ia32_vcvttph2udq256_round_mask( \ 921 (__v8hf)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 922 923 #define _mm256_cvtt_roundph_epu64(A, R) \ 924 ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \ 925 (__v8hf)(A), (__v4du)_mm256_undefined_si256(), (__mmask8)(-1), \ 926 (int)(R))) 927 928 #define _mm256_mask_cvtt_roundph_epu64(W, U, A, R) \ 929 ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask((__v8hf)(A), (__v4du)(W), \ 930 (__mmask8)(U), (int)(R))) 931 932 #define _mm256_maskz_cvtt_roundph_epu64(U, A, R) \ 933 ((__m256i)__builtin_ia32_vcvttph2uqq256_round_mask( \ 934 (__v8hf)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), (int)(R))) 935 936 #define _mm256_cvtt_roundph_epu16(A, R) \ 937 ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \ 938 (__v16hf)(A), (__v16hu)_mm256_undefined_si256(), (__mmask16)(-1), \ 939 (int)(R))) 940 941 #define _mm256_mask_cvtt_roundph_epu16(W, U, A, R) \ 942 
((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \ 943 (__v16hf)(A), (__v16hu)(W), (__mmask16)(U), (int)(R))) 944 945 #define _mm256_maskz_cvtt_roundph_epu16(U, A, R) \ 946 ((__m256i)__builtin_ia32_vcvttph2uw256_round_mask( \ 947 (__v16hf)(A), (__v16hu)_mm256_setzero_si256(), (__mmask16)(U), \ 948 (int)(R))) 949 950 #define _mm256_cvtt_roundph_epi16(A, R) \ 951 ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \ 952 (__v16hf)(A), (__v16hi)_mm256_undefined_si256(), (__mmask16)(-1), \ 953 (int)(R))) 954 955 #define _mm256_mask_cvtt_roundph_epi16(W, U, A, R) \ 956 ((__m256i)__builtin_ia32_vcvttph2w256_round_mask((__v16hf)(A), (__v16hi)(W), \ 957 (__mmask16)(U), (int)(R))) 958 959 #define _mm256_maskz_cvtt_roundph_epi16(U, A, R) \ 960 ((__m256i)__builtin_ia32_vcvttph2w256_round_mask( \ 961 (__v16hf)(A), (__v16hi)_mm256_setzero_si256(), (__mmask16)(U), \ 962 (int)(R))) 963 964 #define _mm256_cvtt_roundps_epi32(A, R) \ 965 ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \ 966 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)-1, \ 967 (int)(R))) 968 969 #define _mm256_mask_cvtt_roundps_epi32(W, U, A, R) \ 970 ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \ 971 (__v8sf)(__m256)(A), (__v8si)(__m256i)(W), (__mmask8)(U), (int)(R))) 972 973 #define _mm256_maskz_cvtt_roundps_epi32(U, A, R) \ 974 ((__m256i)__builtin_ia32_vcvttps2dq256_round_mask( \ 975 (__v8sf)(__m256)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \ 976 (int)(R))) 977 978 #define _mm256_cvtt_roundps_epi64(A, R) \ 979 ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \ 980 (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)-1, \ 981 (int)(R))) 982 983 #define _mm256_mask_cvtt_roundps_epi64(W, U, A, R) \ 984 ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \ 985 (__v4sf)(__m128)(A), (__v4di)(__m256i)(W), (__mmask8)(U), (int)(R))) 986 987 #define _mm256_maskz_cvtt_roundps_epi64(U, A, R) \ 988 ((__m256i)__builtin_ia32_vcvttps2qq256_round_mask( \ 989 (__v4sf)(__m128)(A), (__v4di)_mm256_setzero_si256(), (__mmask8)(U), \ 990 (int)(R))) 991 992 #define _mm256_cvtt_roundps_epu32(A, R) \ 993 ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \ 994 (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)-1, \ 995 (int)(R))) 996 997 #define _mm256_mask_cvtt_roundps_epu32(W, U, A, R) \ 998 ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \ 999 (__v8sf)(__m256)(A), (__v8su)(__m256i)(W), (__mmask8)(U), (int)(R))) 1000 1001 #define _mm256_maskz_cvtt_roundps_epu32(U, A, R) \ 1002 ((__m256i)__builtin_ia32_vcvttps2udq256_round_mask( \ 1003 (__v8sf)(__m256)(A), (__v8su)_mm256_setzero_si256(), (__mmask8)(U), \ 1004 (int)(R))) 1005 1006 #define _mm256_cvtt_roundps_epu64(A, R) \ 1007 ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \ 1008 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)-1, \ 1009 (int)(R))) 1010 1011 #define _mm256_mask_cvtt_roundps_epu64(W, U, A, R) \ 1012 ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \ 1013 (__v4sf)(__m128)(A), (__v4du)(__m256i)(W), (__mmask8)(U), (int)(R))) 1014 1015 #define _mm256_maskz_cvtt_roundps_epu64(U, A, R) \ 1016 ((__m256i)__builtin_ia32_vcvttps2uqq256_round_mask( \ 1017 (__v4sf)(__m128)(A), (__v4du)_mm256_setzero_si256(), (__mmask8)(U), \ 1018 (int)(R))) 1019 1020 #define _mm256_cvt_roundepu32_ph(A, R) \ 1021 ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \ 1022 (__v8su)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 1023 1024 #define _mm256_mask_cvt_roundepu32_ph(W, U, A, R) \ 1025 
((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask((__v8su)(A), (__v8hf)(W), \ 1026 (__mmask8)(U), (int)(R))) 1027 1028 #define _mm256_maskz_cvt_roundepu32_ph(U, A, R) \ 1029 ((__m128h)__builtin_ia32_vcvtudq2ph256_round_mask( \ 1030 (__v8su)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 1031 1032 #define _mm256_cvt_roundepu32_ps(A, R) \ 1033 ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \ 1034 (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, \ 1035 (int)(R))) 1036 1037 #define _mm256_mask_cvt_roundepu32_ps(W, U, A, R) \ 1038 ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \ 1039 (__v8su)(__m256i)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R))) 1040 1041 #define _mm256_maskz_cvt_roundepu32_ps(U, A, R) \ 1042 ((__m256)__builtin_ia32_vcvtudq2ps256_round_mask( \ 1043 (__v8su)(__m256i)(A), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), \ 1044 (int)(R))) 1045 1046 #define _mm256_cvt_roundepu64_pd(A, R) \ 1047 ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \ 1048 (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)-1, \ 1049 (int)(R))) 1050 1051 #define _mm256_mask_cvt_roundepu64_pd(W, U, A, R) \ 1052 ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \ 1053 (__v4du)(__m256i)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R))) 1054 1055 #define _mm256_maskz_cvt_roundepu64_pd(U, A, R) \ 1056 ((__m256d)__builtin_ia32_vcvtuqq2pd256_round_mask( \ 1057 (__v4du)(__m256i)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \ 1058 (int)(R))) 1059 1060 #define _mm256_cvt_roundepu64_ph(A, R) \ 1061 ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \ 1062 (__v4du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) 1063 1064 #define _mm256_mask_cvt_roundepu64_ph(W, U, A, R) \ 1065 ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask((__v4du)(A), (__v8hf)(W), \ 1066 (__mmask8)(U), (int)(R))) 1067 1068 #define _mm256_maskz_cvt_roundepu64_ph(U, A, R) \ 1069 ((__m128h)__builtin_ia32_vcvtuqq2ph256_round_mask( \ 1070 (__v4du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) 1071 1072 #define _mm256_cvt_roundepu64_ps(A, R) \ 1073 ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \ 1074 (__v4du)(__m256i)(A), (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R))) 1075 1076 #define _mm256_mask_cvt_roundepu64_ps(W, U, A, R) \ 1077 ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask( \ 1078 (__v4du)(__m256i)(A), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R))) 1079 1080 #define _mm256_maskz_cvt_roundepu64_ps(U, A, R) \ 1081 ((__m128)__builtin_ia32_vcvtuqq2ps256_round_mask((__v4du)(__m256i)(A), \ 1082 (__v4sf)_mm_setzero_ps(), \ 1083 (__mmask8)(U), (int)(R))) 1084 1085 #define _mm256_cvt_roundepu16_ph(A, R) \ 1086 ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \ 1087 (__v16hu)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \ 1088 (int)(R))) 1089 1090 #define _mm256_mask_cvt_roundepu16_ph(W, U, A, R) \ 1091 ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask((__v16hu)(A), (__v16hf)(W), \ 1092 (__mmask16)(U), (int)(R))) 1093 1094 #define _mm256_maskz_cvt_roundepu16_ph(U, A, R) \ 1095 ((__m256h)__builtin_ia32_vcvtuw2ph256_round_mask( \ 1096 (__v16hu)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 1097 1098 #define _mm256_cvt_roundepi16_ph(A, R) \ 1099 ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \ 1100 (__v16hi)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)(-1), \ 1101 (int)(R))) 1102 1103 #define _mm256_mask_cvt_roundepi16_ph(W, U, A, R) \ 1104 ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask((__v16hi)(A), (__v16hf)(W), \ 1105 (__mmask16)(U), (int)(R))) 
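/* Illustrative note, not part of the upstream header: every *_round_* macro
 * takes an explicit rounding/SAE control as its last argument. Hypothetical
 * example converting 16 signed words to FP16 with round-to-nearest and
 * exceptions suppressed:
 *
 *   __m256i w = _mm256_set1_epi16(-3);
 *   __m256h h = _mm256_cvt_roundepi16_ph(
 *       w, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 */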
1106 1107 #define _mm256_maskz_cvt_roundepi16_ph(U, A, R) \ 1108 ((__m256h)__builtin_ia32_vcvtw2ph256_round_mask( \ 1109 (__v16hi)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) 1110 1111 #define _mm256_div_round_pd(A, B, R) \ 1112 ((__m256d)__builtin_ia32_vdivpd256_round((__v4df)(__m256d)(A), \ 1113 (__v4df)(__m256d)(B), (int)(R))) 1114 1115 #define _mm256_mask_div_round_pd(W, U, A, B, R) \ 1116 ((__m256d)__builtin_ia32_selectpd_256( \ 1117 (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \ 1118 (__v4df)(__m256d)(W))) 1119 1120 #define _mm256_maskz_div_round_pd(U, A, B, R) \ 1121 ((__m256d)__builtin_ia32_selectpd_256( \ 1122 (__mmask8)(U), (__v4df)_mm256_div_round_pd((A), (B), (R)), \ 1123 (__v4df)_mm256_setzero_pd())) 1124 1125 #define _mm256_div_round_ph(A, B, R) \ 1126 ((__m256h)__builtin_ia32_vdivph256_round((__v16hf)(__m256h)(A), \ 1127 (__v16hf)(__m256h)(B), (int)(R))) 1128 1129 #define _mm256_mask_div_round_ph(W, U, A, B, R) \ 1130 ((__m256h)__builtin_ia32_selectph_256( \ 1131 (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \ 1132 (__v16hf)(__m256h)(W))) 1133 1134 #define _mm256_maskz_div_round_ph(U, A, B, R) \ 1135 ((__m256h)__builtin_ia32_selectph_256( \ 1136 (__mmask16)(U), (__v16hf)_mm256_div_round_ph((A), (B), (R)), \ 1137 (__v16hf)_mm256_setzero_ph())) 1138 1139 #define _mm256_div_round_ps(A, B, R) \ 1140 ((__m256)__builtin_ia32_vdivps256_round((__v8sf)(__m256)(A), \ 1141 (__v8sf)(__m256)(B), (int)(R))) 1142 1143 #define _mm256_mask_div_round_ps(W, U, A, B, R) \ 1144 ((__m256)__builtin_ia32_selectps_256( \ 1145 (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \ 1146 (__v8sf)(__m256)(W))) 1147 1148 #define _mm256_maskz_div_round_ps(U, A, B, R) \ 1149 ((__m256)__builtin_ia32_selectps_256( \ 1150 (__mmask8)(U), (__v8sf)_mm256_div_round_ps((A), (B), (R)), \ 1151 (__v8sf)_mm256_setzero_ps())) 1152 1153 #define _mm256_fcmadd_round_pch(A, B, C, R) \ 1154 ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \ 1155 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ 1156 (__mmask8)-1, (int)(R))) 1157 1158 #define _mm256_mask_fcmadd_round_pch(A, U, B, C, R) \ 1159 ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask( \ 1160 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ 1161 (__mmask8)(U), (int)(R))) 1162 1163 #define _mm256_mask3_fcmadd_round_pch(A, B, C, U, R) \ 1164 ((__m256h)__builtin_ia32_vfcmaddcph256_round_mask3( \ 1165 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ 1166 (__mmask8)(U), (int)(R))) 1167 1168 #define _mm256_maskz_fcmadd_round_pch(U, A, B, C, R) \ 1169 ((__m256h)__builtin_ia32_vfcmaddcph256_round_maskz( \ 1170 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \ 1171 (__mmask8)(U), (int)(R))) 1172 1173 #define _mm256_cmul_round_pch(A, B, R) \ 1174 ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \ 1175 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \ 1176 (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R))) 1177 1178 #define _mm256_mask_cmul_round_pch(W, U, A, B, R) \ 1179 ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \ 1180 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \ 1181 (__mmask8)(U), (int)(R))) 1182 1183 #define _mm256_maskz_cmul_round_pch(U, A, B, R) \ 1184 ((__m256h)__builtin_ia32_vfcmulcph256_round_mask( \ 1185 (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \ 1186 (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R))) 1187 1188 #define _mm256_fixupimm_round_pd(A, B, C, imm, R) \ 1189 
((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \ 1190 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \ 1191 (int)(imm), (__mmask8)-1, (int)(R))) 1192 1193 #define _mm256_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ 1194 ((__m256d)__builtin_ia32_vfixupimmpd256_round_mask( \ 1195 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \ 1196 (int)(imm), (__mmask8)(U), (int)(R))) 1197 1198 #define _mm256_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ 1199 ((__m256d)__builtin_ia32_vfixupimmpd256_round_maskz( \ 1200 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4di)(__m256i)(C), \ 1201 (int)(imm), (__mmask8)(U), (int)(R))) 1202 1203 #define _mm256_fixupimm_round_ps(A, B, C, imm, R) \ 1204 ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \ 1205 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \ 1206 (int)(imm), (__mmask8)-1, (int)(R))) 1207 1208 #define _mm256_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ 1209 ((__m256)__builtin_ia32_vfixupimmps256_round_mask( \ 1210 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \ 1211 (int)(imm), (__mmask8)(U), (int)(R))) 1212 1213 #define _mm256_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ 1214 ((__m256)__builtin_ia32_vfixupimmps256_round_maskz( \ 1215 (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8si)(__m256i)(C), \ 1216 (int)(imm), (__mmask8)(U), (int)(R))) 1217 1218 #define _mm256_fmadd_round_pd(A, B, C, R) \ 1219 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ 1220 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ 1221 (__mmask8)-1, (int)(R))) 1222 1223 #define _mm256_mask_fmadd_round_pd(A, U, B, C, R) \ 1224 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ 1225 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ 1226 (__mmask8)(U), (int)(R))) 1227 1228 #define _mm256_mask3_fmadd_round_pd(A, B, C, U, R) \ 1229 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \ 1230 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ 1231 (__mmask8)(U), (int)(R))) 1232 1233 #define _mm256_maskz_fmadd_round_pd(U, A, B, C, R) \ 1234 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ 1235 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ 1236 (__mmask8)(U), (int)(R))) 1237 1238 #define _mm256_fmsub_round_pd(A, B, C, R) \ 1239 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ 1240 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ 1241 (__mmask8)-1, (int)(R))) 1242 1243 #define _mm256_mask_fmsub_round_pd(A, U, B, C, R) \ 1244 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ 1245 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ 1246 (__mmask8)(U), (int)(R))) 1247 1248 #define _mm256_maskz_fmsub_round_pd(U, A, B, C, R) \ 1249 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ 1250 (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \ 1251 (__mmask8)(U), (int)(R))) 1252 1253 #define _mm256_fnmadd_round_pd(A, B, C, R) \ 1254 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \ 1255 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ 1256 (__mmask8)-1, (int)(R))) 1257 1258 #define _mm256_mask3_fnmadd_round_pd(A, B, C, U, R) \ 1259 ((__m256d)__builtin_ia32_vfmaddpd256_round_mask3( \ 1260 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \ 1261 (__mmask8)(U), (int)(R))) 1262 1263 #define _mm256_maskz_fnmadd_round_pd(U, A, B, C, R) \ 1264 ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \ 1265 -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), 
      (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fnmsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_maskz( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fnmadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask3( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fnmsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_maskz( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fnmadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask3( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fnmsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_maskz( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmadd_round_pch(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmadd_round_pch(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmadd_round_pch(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_mask3( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmadd_round_pch(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddcph256_round_maskz( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmaddsub_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmsubadd_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddsubpd256_round_maskz( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmaddsub_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmsubadd_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddsubph256_round_maskz( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_fmaddsub_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_fmsubadd_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddsubps256_round_maskz( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubaddpd256_round_mask3( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m256d)__builtin_ia32_vfmaddpd256_round_mask( \
      (__v4df)(__m256d)(A), -(__v4df)(__m256d)(B), -(__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m256d)__builtin_ia32_vfmsubpd256_round_mask3( \
      -(__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmsubadd_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubaddph256_round_mask3( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask_fnmadd_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask_fnmsub_round_ph(A, U, B, C, R) \
  ((__m256h)__builtin_ia32_vfmaddph256_round_mask( \
      (__v16hf)(__m256h)(A), -(__v16hf)(__m256h)(B), -(__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fnmsub_round_ph(A, B, C, U, R) \
  ((__m256h)__builtin_ia32_vfmsubph256_round_mask3( \
      -(__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(C), \
      (__mmask16)(U), (int)(R)))

#define _mm256_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubaddps256_round_mask3( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m256)__builtin_ia32_vfmaddps256_round_mask( \
      (__v8sf)(__m256)(A), -(__v8sf)(__m256)(B), -(__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m256)__builtin_ia32_vfmsubps256_round_mask3( \
      -(__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(C), \
      (__mmask8)(U), (int)(R)))
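
/* Illustrative usage sketch (not part of this header's API surface): the R
 * argument of the *_round_* macros above must be a compile-time rounding
 * constant, either _MM_FROUND_CUR_DIRECTION or a rounding mode combined with
 * _MM_FROUND_NO_EXC, e.g.
 *
 *   __m256 r = _mm256_mask_fmadd_round_ps(
 *       a, m, b, c, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 *
 * where a, b, c (__m256) and m (__mmask8) are caller-provided values.
 */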

#define _mm256_mul_round_pch(A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_mul_round_pch(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), (__v8sf)(__m256h)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_mul_round_pch(U, A, B, R) \
  ((__m256h)__builtin_ia32_vfmulcph256_round_mask( \
      (__v8sf)(__m256h)(A), (__v8sf)(__m256h)(B), \
      (__v8sf)(__m256h)_mm256_setzero_ph(), (__mmask8)(U), (int)(R)))

#define _mm256_getexp_round_pd(A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_undefined_pd(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getexp_round_pd(U, A, R) \
  ((__m256d)__builtin_ia32_vgetexppd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)_mm256_setzero_pd(), (__mmask8)(U), \
      (int)(R)))

#define _mm256_getexp_round_ph(A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(W), (__mmask16)(U), (int)(R)))

#define _mm256_maskz_getexp_round_ph(U, A, R) \
  ((__m256h)__builtin_ia32_vgetexpph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), \
      (int)(R)))

#define _mm256_getexp_round_ps(A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, \
      (int)(R)))

#define _mm256_mask_getexp_round_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getexp_round_ps(U, A, R) \
  ((__m256)__builtin_ia32_vgetexpps256_round_mask((__v8sf)(__m256)(A), \
      (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_getmant_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), (__v4df)(__m256d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vgetmantpd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(((C) << 2) | (B)), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm256_getmant_round_ph(A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_getmant_round_ph(W, U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_getmant_round_ph(U, A, B, C, R) \
  ((__m256h)__builtin_ia32_vgetmantph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_getmant_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
      (__v8sf)_mm256_undefined_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vgetmantps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(((C) << 2) | (B)), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))
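
/* Illustrative usage sketch: the B and C arguments of the getmant macros
 * above are the usual AVX-512 normalization-interval and sign-control
 * enumerators, packed by the macro as ((C) << 2) | (B), e.g.
 *
 *   __m256d m = _mm256_getmant_round_pd(x, _MM_MANT_NORM_1_2,
 *                                       _MM_MANT_SIGN_src,
 *                                       _MM_FROUND_NO_EXC);
 *
 * where x is a caller-provided __m256d value.
 */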

#define _mm256_max_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vmaxpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_max_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_max_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_max_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_max_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vmaxph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_max_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_max_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_max_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_max_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vmaxps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_max_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_max_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_max_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))

#define _mm256_min_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vminpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_min_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_min_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_min_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_min_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vminph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_min_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_min_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_min_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_min_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vminps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_min_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_min_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_min_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))

#define _mm256_mul_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vmulpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_mul_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_mul_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_mul_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_mul_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vmulph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_mul_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_mul_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_mul_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_mul_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vmulps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_mul_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_mul_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_mul_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))
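
/* Illustrative usage sketch: the mask/maskz arithmetic forms above follow
 * the usual AVX-512 write-masking convention; result lanes whose mask bit is
 * clear are taken from W (mask form) or zeroed (maskz form), e.g.
 *
 *   __m256h r = _mm256_maskz_mul_round_ph(
 *       (__mmask16)0x00FF, a, b,
 *       _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *
 * keeps the low eight half-precision products and zeroes the rest; a and b
 * are caller-provided __m256h values.
 */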

#define _mm256_range_round_pd(A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_range_round_pd(W, U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), (__mmask8)(U), (int)(R)))

#define _mm256_maskz_range_round_pd(U, A, B, C, R) \
  ((__m256d)__builtin_ia32_vrangepd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), (__mmask8)(U), (int)(R)))

#define _mm256_range_round_ps(A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_range_round_ps(W, U, A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_range_round_ps(U, A, B, C, R) \
  ((__m256)__builtin_ia32_vrangeps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), (__mmask8)(U), (int)(R)))

#define _mm256_reduce_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_reduce_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_vreducepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_mask_reduce_round_ph(W, U, A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_reduce_round_ph(U, A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(U), (int)(R)))

#define _mm256_reduce_round_ph(A, imm, R) \
  ((__m256h)__builtin_ia32_vreduceph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, (int)(R)))

#define _mm256_reduce_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_reduce_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U), \
      (int)(R)))

#define _mm256_maskz_reduce_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_vreduceps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_roundscale_round_pd(A, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(A), (int)(imm), (__v4df)_mm256_undefined_pd(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(C), (int)(imm), (__v4df)(__m256d)(A), (__mmask8)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m256d)__builtin_ia32_vrndscalepd256_round_mask( \
      (__v4df)(__m256d)(B), (int)(imm), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(A), (int)(R)))

#define _mm256_roundscale_round_ph(A, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_undefined_ph(), \
      (__mmask16)-1, (int)(R)))

#define _mm256_mask_roundscale_round_ph(A, B, C, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(C), (int)(imm), (__v16hf)(__m256h)(A), \
      (__mmask16)(B), (int)(R)))

#define _mm256_maskz_roundscale_round_ph(A, B, imm, R) \
  ((__m256h)__builtin_ia32_vrndscaleph256_round_mask( \
      (__v16hf)(__m256h)(B), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
      (__mmask16)(A), (int)(R)))

#define _mm256_roundscale_round_ps(A, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(A), (int)(imm), (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(C), (int)(imm), (__v8sf)(__m256)(A), (__mmask8)(B), \
      (int)(R)))

#define _mm256_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m256)__builtin_ia32_vrndscaleps256_round_mask( \
      (__v8sf)(__m256)(B), (int)(imm), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(A), (int)(R)))

#define _mm256_scalef_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), \
      (__v4df)_mm256_undefined_pd(), (__mmask8)-1, (int)(R)))

#define _mm256_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)(__m256d)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_scalef_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_vscalefpd256_round_mask( \
      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (__v4df)_mm256_setzero_pd(), \
      (__mmask8)(U), (int)(R)))

#define _mm256_scalef_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
      (__v16hf)_mm256_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm256_mask_scalef_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (__v16hf)(__m256h)(W), \
      (__mmask16)(U), (int)(R)))

#define _mm256_maskz_scalef_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_vscalefph256_round_mask( \
      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

#define _mm256_scalef_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_undefined_ps(), \
      (__mmask8)-1, (int)(R)))

#define _mm256_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)(__m256)(W), \
      (__mmask8)(U), (int)(R)))

#define _mm256_maskz_scalef_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_vscalefps256_round_mask( \
      (__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (__v8sf)_mm256_setzero_ps(), \
      (__mmask8)(U), (int)(R)))
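
/* Illustrative usage sketch: VSCALEF multiplies each element of the first
 * operand by 2 raised to the floor of the corresponding element of the
 * second operand, so
 *
 *   __m256 r = _mm256_scalef_round_ps(
 *       a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *
 * computes a[i] * 2^floor(b[i]) per lane, with a and b caller-provided
 * __m256 values.
 */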

#define _mm256_sqrt_round_pd(A, R) \
  ((__m256d)__builtin_ia32_vsqrtpd256_round((__v4df)(__m256d)(A), (int)(R)))

#define _mm256_mask_sqrt_round_pd(W, U, A, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sqrt_round_pd(U, A, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sqrt_round_pd((A), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_sqrt_round_ph(A, R) \
  ((__m256h)__builtin_ia32_vsqrtph256_round((__v16hf)(__m256h)(A), (int)(R)))

#define _mm256_mask_sqrt_round_ph(W, U, A, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sqrt_round_ph(U, A, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sqrt_round_ph((A), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_sqrt_round_ps(A, R) \
  ((__m256)__builtin_ia32_vsqrtps256_round((__v8sf)(__m256)(A), (int)(R)))

#define _mm256_mask_sqrt_round_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
      (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sqrt_round_ps(U, A, R) \
  ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
      (__v8sf)_mm256_sqrt_round_ps((A), (R)), \
      (__v8sf)_mm256_setzero_ps()))

#define _mm256_sub_round_pd(A, B, R) \
  ((__m256d)__builtin_ia32_vsubpd256_round((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(R)))

#define _mm256_mask_sub_round_pd(W, U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
      (__v4df)(__m256d)(W)))

#define _mm256_maskz_sub_round_pd(U, A, B, R) \
  ((__m256d)__builtin_ia32_selectpd_256( \
      (__mmask8)(U), (__v4df)_mm256_sub_round_pd((A), (B), (R)), \
      (__v4df)_mm256_setzero_pd()))

#define _mm256_sub_round_ph(A, B, R) \
  ((__m256h)__builtin_ia32_vsubph256_round((__v16hf)(__m256h)(A), \
      (__v16hf)(__m256h)(B), (int)(R)))

#define _mm256_mask_sub_round_ph(W, U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
      (__v16hf)(__m256h)(W)))

#define _mm256_maskz_sub_round_ph(U, A, B, R) \
  ((__m256h)__builtin_ia32_selectph_256( \
      (__mmask16)(U), (__v16hf)_mm256_sub_round_ph((A), (B), (R)), \
      (__v16hf)_mm256_setzero_ph()))

#define _mm256_sub_round_ps(A, B, R) \
  ((__m256)__builtin_ia32_vsubps256_round((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(R)))

#define _mm256_mask_sub_round_ps(W, U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
      (__v8sf)(__m256)(W)))

#define _mm256_maskz_sub_round_ps(U, A, B, R) \
  ((__m256)__builtin_ia32_selectps_256( \
      (__mmask8)(U), (__v8sf)_mm256_sub_round_ps((A), (B), (R)), \
      (__v8sf)_mm256_setzero_ps()))

#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128

#endif /* __AVX10_2NIINTRIN_H */
#endif /* __SSE2__ */