/* Copyright (C) 2008-2015 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 11.0.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef _AVXINTRIN_H_INCLUDED
#define _AVXINTRIN_H_INCLUDED

#ifndef __AVX__
#pragma GCC push_options
#pragma GCC target("avx")
#define __DISABLE_AVX__
#endif /* __AVX__ */

/* Internal data types for implementing the intrinsics.  */
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m256 __attribute__ ((__vector_size__ (32),
                                     __may_alias__));
typedef long long __m256i __attribute__ ((__vector_size__ (32),
                                          __may_alias__));
typedef double __m256d __attribute__ ((__vector_size__ (32),
                                       __may_alias__));

/* Compare predicates for scalar and packed compare intrinsics.  */

/* Equal (ordered, non-signaling)  */
#define _CMP_EQ_OQ 0x00
/* Less-than (ordered, signaling)  */
#define _CMP_LT_OS 0x01
/* Less-than-or-equal (ordered, signaling)  */
#define _CMP_LE_OS 0x02
/* Unordered (non-signaling)  */
#define _CMP_UNORD_Q 0x03
/* Not-equal (unordered, non-signaling)  */
#define _CMP_NEQ_UQ 0x04
/* Not-less-than (unordered, signaling)  */
#define _CMP_NLT_US 0x05
/* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_NLE_US 0x06
/* Ordered (non-signaling)  */
#define _CMP_ORD_Q 0x07
/* Equal (unordered, non-signaling)  */
#define _CMP_EQ_UQ 0x08
/* Not-greater-than-or-equal (unordered, signaling)  */
#define _CMP_NGE_US 0x09
/* Not-greater-than (unordered, signaling)  */
#define _CMP_NGT_US 0x0a
/* False (ordered, non-signaling)  */
#define _CMP_FALSE_OQ 0x0b
/* Not-equal (ordered, non-signaling)  */
#define _CMP_NEQ_OQ 0x0c
/* Greater-than-or-equal (ordered, signaling)  */
#define _CMP_GE_OS 0x0d
/* Greater-than (ordered, signaling)  */
#define _CMP_GT_OS 0x0e
/* True (unordered, non-signaling)  */
#define _CMP_TRUE_UQ 0x0f
/* Equal (ordered, signaling)  */
#define _CMP_EQ_OS 0x10
/* Less-than (ordered, non-signaling)  */
#define _CMP_LT_OQ 0x11
/* Less-than-or-equal (ordered, non-signaling)  */
#define _CMP_LE_OQ 0x12
/* Unordered (signaling)  */
#define _CMP_UNORD_S 0x13
/* Not-equal (unordered, signaling)  */
#define _CMP_NEQ_US 0x14
/* Not-less-than (unordered, non-signaling)  */
#define _CMP_NLT_UQ 0x15
/* Not-less-than-or-equal (unordered, non-signaling)  */
#define _CMP_NLE_UQ 0x16
/* Ordered (signaling)  */
#define _CMP_ORD_S 0x17
/* Equal (unordered, signaling)  */
#define _CMP_EQ_US 0x18
/* Not-greater-than-or-equal (unordered, non-signaling)  */
#define _CMP_NGE_UQ 0x19
/* Not-greater-than (unordered, non-signaling)  */
#define _CMP_NGT_UQ 0x1a
/* False (ordered, signaling)  */
#define _CMP_FALSE_OS 0x1b
/* Not-equal (ordered, signaling)  */
#define _CMP_NEQ_OS 0x1c
/* Greater-than-or-equal (ordered, non-signaling)  */
#define _CMP_GE_OQ 0x1d
/* Greater-than (ordered, non-signaling)  */
#define _CMP_GT_OQ 0x1e
/* True (unordered, signaling)  */
#define _CMP_TRUE_US 0x1f
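/* Usage sketch: the predicate macros above are passed as the third
   argument of the _mm_cmp_* and _mm256_cmp_* intrinsics defined later
   in this file.  Each result element is all-ones when the predicate
   holds and all-zeros otherwise, so the result is typically consumed
   by a blend or by movemask, e.g.

     __m256d __lt   = _mm256_cmp_pd (__a, __b, _CMP_LT_OQ);
     int     __bits = _mm256_movemask_pd (__lt);

   where __a and __b stand for arbitrary example inputs and __bits
   holds one bit per element.  */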
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A + (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A + (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_addsub_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
}
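/* Usage sketch: the addsub intrinsics subtract in the even-indexed
   elements and add in the odd-indexed ones.  For example, after

     __m256d __r = _mm256_addsub_pd (__a, __b);

   __r[0] = __a[0] - __b[0], __r[1] = __a[1] + __b[1],
   __r[2] = __a[2] - __b[2], __r[3] = __a[3] + __b[3],
   where __a and __b stand for arbitrary example inputs.  */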
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
}

/* Double/single precision floating point blend instructions - select
   data from 2 sources using constant/variable mask.  */

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
{
  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
                                              (__v4df)__Y,
                                              __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
                                             (__v8sf)__Y,
                                             __M);
}
#else
#define _mm256_blend_pd(X, Y, M) \
  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), (int)(M)))

#define _mm256_blend_ps(X, Y, M) \
  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), (int)(M)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
{
  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
                                               (__v4df)__Y,
                                               (__v4df)__M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
{
  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
                                              (__v8sf)__Y,
                                              (__v8sf)__M);
}
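/* Usage sketch: in the constant-mask forms, mask bit i selects result
   element i from Y when set and from X when clear; in the variable-mask
   (blendv) forms the sign bit of each mask element makes the same
   choice.  For example (with __x and __y standing for arbitrary
   example inputs),

     __m256d __r = _mm256_blend_pd (__x, __y, 0x5);

   takes elements 0 and 2 from __y and elements 1 and 3 from __x.  */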
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A / (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A / (__v8sf)__B);
}

/* Dot product instructions with mask-defined summing and zeroing parts
   of result.  */

#ifdef __OPTIMIZE__
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
{
  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
                                          (__v8sf)__Y,
                                          __M);
}
#else
#define _mm256_dp_ps(X, Y, M) \
  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
                                    (__v8sf)(__m256)(Y), (int)(M)))
#endif
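/* Usage sketch: the high nibble of the mask selects which element pairs
   are multiplied and summed, and the low nibble selects which result
   elements receive the sum (the rest are zeroed); the 256-bit form does
   this independently in each 128-bit lane.  For example (with __x and
   __y standing for arbitrary example inputs),

     __m256 __r = _mm256_dp_ps (__x, __y, 0xf1);

   leaves the dot product of the four floats of each lane in element 0
   of that lane and zeros the other elements of the lane.  */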
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_pd (__m256d __X, __m256d __Y)
{
  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_ps (__m256 __X, __m256 __Y)
{
  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A * (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A * (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
{
  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
                                             __mask);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
{
  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
                                            __mask);
}
#else
#define _mm256_shuffle_pd(A, B, N) \
  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
                                      (__v4df)(__m256d)(B), (int)(N)))

#define _mm256_shuffle_ps(A, B, N) \
  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
                                      (__v8sf)(__m256)(B), (int)(N)))
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
  return (__m256d) ((__v4df)__A - (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
  return (__m256) ((__v8sf)__A - (__v8sf)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
{
  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
                                            __P);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
{
  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
                                           __P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
{
  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
{
  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
}
#else
#define _mm_cmp_pd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ps(X, Y, P) \
  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))

#define _mm256_cmp_pd(X, Y, P) \
  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
                                      (__v4df)(__m256d)(Y), (int)(P)))

#define _mm256_cmp_ps(X, Y, P) \
  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
                                     (__v8sf)(__m256)(Y), (int)(P)))

#define _mm_cmp_sd(X, Y, P) \
  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (int)(P)))

#define _mm_cmp_ss(X, Y, P) \
  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (int)(P)))
#endif
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_pd (__m128i __A)
{
  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_ps (__m256i __A)
{
  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_ps (__m256d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_pd (__m128 __A)
{
  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtpd_epi32 (__m256d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvttps_epi32 (__m256 __A)
{
  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_pd (__m256d __X, const int __N)
{
  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_ps (__m256 __X, const int __N)
{
  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extractf128_si256 (__m256i __X, const int __N)
{
  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi32 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  return _mm_extract_epi32 (__Y, __N % 4);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi16 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  return _mm_extract_epi16 (__Y, __N % 8);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi8 (__m256i __X, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  return _mm_extract_epi8 (__Y, __N % 16);
}

#ifdef __x86_64__
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extract_epi64 (__m256i __X, const int __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  return _mm_extract_epi64 (__Y, __N % 2);
}
#endif
#else
#define _mm256_extractf128_pd(X, N) \
  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
                                                (int)(N)))

#define _mm256_extractf128_ps(X, N) \
  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
                                               (int)(N)))

#define _mm256_extractf128_si256(X, N) \
  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
                                                (int)(N)))

#define _mm256_extract_epi32(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      _mm_extract_epi32 (__Y, (N) % 4); \
    }))

#define _mm256_extract_epi16(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      _mm_extract_epi16 (__Y, (N) % 8); \
    }))

#define _mm256_extract_epi8(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      _mm_extract_epi8 (__Y, (N) % 16); \
    }))

#ifdef __x86_64__
#define _mm256_extract_epi64(X, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      _mm_extract_epi64 (__Y, (N) % 2); \
    }))
#endif
#endif
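/* Usage sketch: the element index given to the 256-bit extract helpers
   above is split into a 128-bit lane selector and a position within
   that lane, so e.g.

     int __e = _mm256_extract_epi32 (__v, 5);

   reads element 1 of the upper 128-bit half of __v (an arbitrary
   example value), i.e. element 5 of the whole vector.  */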
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroall (void)
{
  __builtin_ia32_vzeroall ();
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_zeroupper (void)
{
  __builtin_ia32_vzeroupper ();
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_pd (__m128d __A, __m128i __C)
{
  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
                                                (__v2di)__C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_pd (__m256d __A, __m256i __C)
{
  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
                                                   (__v4di)__C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutevar_ps (__m128 __A, __m128i __C)
{
  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
                                               (__v4si)__C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar_ps (__m256 __A, __m256i __C)
{
  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
                                                  (__v8si)__C);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_pd (__m128d __X, const int __C)
{
  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_pd (__m256d __X, const int __C)
{
  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permute_ps (__m128 __X, const int __C)
{
  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute_ps (__m256 __X, const int __C)
{
  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
}
#else
#define _mm_permute_pd(X, C) \
  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))

#define _mm256_permute_pd(X, C) \
  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))

#define _mm_permute_ps(X, C) \
  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))

#define _mm256_permute_ps(X, C) \
  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
{
  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
                                                    (__v4df)__Y,
                                                    __C);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
{
  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
                                                   (__v8sf)__Y,
                                                   __C);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
{
  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
                                                    (__v8si)__Y,
                                                    __C);
}
#else
#define _mm256_permute2f128_pd(X, Y, C) \
  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
                                              (__v4df)(__m256d)(Y), \
                                              (int)(C)))

#define _mm256_permute2f128_ps(X, Y, C) \
  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
                                             (__v8sf)(__m256)(Y), \
                                             (int)(C)))

#define _mm256_permute2f128_si256(X, Y, C) \
  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
                                              (__v8si)(__m256i)(Y), \
                                              (int)(C)))
#endif
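/* Usage sketch: bits 1:0 of the control byte pick the 128-bit half that
   becomes the low half of the result (0 = low of X, 1 = high of X,
   2 = low of Y, 3 = high of Y), bits 5:4 do the same for the high half,
   and setting bit 3 or bit 7 zeroes the corresponding half instead.
   For example (with __x and __y standing for arbitrary example inputs),

     __m256d __r = _mm256_permute2f128_pd (__x, __y, 0x20);

   concatenates the low half of __x with the low half of __y.  */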
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcast_ss (float const *__X)
{
  return (__m128) __builtin_ia32_vbroadcastss (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_sd (double const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ss (float const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_pd (__m128d const *__X)
{
  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcast_ps (__m128 const *__X)
{
  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
{
  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
                                                     (__v2df)__Y,
                                                     __O);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
{
  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
                                                    (__v4sf)__Y,
                                                    __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
{
  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
                                                     (__v4si)__Y,
                                                     __O);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
}

#ifdef __x86_64__
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
{
  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
}
#endif
#else
#define _mm256_insertf128_pd(X, Y, O) \
  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
                                               (__v2df)(__m128d)(Y), \
                                               (int)(O)))

#define _mm256_insertf128_ps(X, Y, O) \
  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
                                              (__v4sf)(__m128)(Y), \
                                              (int)(O)))

#define _mm256_insertf128_si256(X, Y, O) \
  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
                                               (__v4si)(__m128i)(Y), \
                                               (int)(O)))

#define _mm256_insert_epi32(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
    }))

#define _mm256_insert_epi16(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
    }))

#define _mm256_insert_epi8(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
    }))

#ifdef __x86_64__
#define _mm256_insert_epi64(X, D, N) \
  (__extension__ \
   ({ \
      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
      _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
    }))
#endif
#endif

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_pd (double const *__P)
{
  return *(__m256d *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_pd (double *__P, __m256d __A)
{
  *(__m256d *)__P = __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_ps (float const *__P)
{
  return *(__m256 *)__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_ps (float *__P, __m256 __A)
{
  *(__m256 *)__P = __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_pd (double const *__P)
{
  return (__m256d) __builtin_ia32_loadupd256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_pd (double *__P, __m256d __A)
{
  __builtin_ia32_storeupd256 (__P, (__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_ps (float const *__P)
{
  return (__m256) __builtin_ia32_loadups256 (__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_ps (float *__P, __m256 __A)
{
  __builtin_ia32_storeups256 (__P, (__v8sf)__A);
}
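/* Usage sketch: the plain load/store forms above assume a 32-byte
   aligned address, while the loadu/storeu forms accept any alignment,
   e.g. (with __buf standing for an arbitrary example array)

     double __buf[4];
     _mm256_storeu_pd (__buf, _mm256_loadu_pd (__buf));

   works regardless of how __buf happens to be aligned.  */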
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_load_si256 (__m256i const *__P)
{
  return *__P;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_store_si256 (__m256i *__P, __m256i __A)
{
  *__P = __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i *__P, __m256i __A)
{
  __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_pd (double const *__P, __m128i __M)
{
  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
                                              (__v2di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
{
  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_pd (double const *__P, __m256i __M)
{
  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
                                                 (__v4di)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
{
  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_ps (float const *__P, __m128i __M)
{
  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
                                             (__v4si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
{
  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_ps (float const *__P, __m256i __M)
{
  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
                                                (__v8si)__M);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
{
  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movehdup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_moveldup_ps (__m256 __X)
{
  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movedup_pd (__m256d __X)
{
  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_lddqu_si256 (__m256i const *__P)
{
  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_si256 (__m256i *__A, __m256i __B)
{
  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_pd (double *__A, __m256d __B)
{
  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_ps (float *__P, __m256 __A)
{
  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rcp_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_rsqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_pd (__m256d __A)
{
  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sqrt_ps (__m256 __A)
{
  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
}

#ifdef __OPTIMIZE__
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_pd (__m256d __V, const int __M)
{
  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_round_ps (__m256 __V, const int __M)
{
  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
}
#else
#define _mm256_round_pd(V, M) \
  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))

#define _mm256_round_ps(V, M) \
  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
#endif

#define _mm256_ceil_pd(V)  _mm256_round_pd ((V), _MM_FROUND_CEIL)
#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm256_ceil_ps(V)  _mm256_round_ps ((V), _MM_FROUND_CEIL)
#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_pd (__m256d __A, __m256d __B)
{
  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_ps (__m256 __A, __m256 __B)
{
  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_pd (__m128d __M, __m128d __V)
{
  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_ps (__m128 __M, __m128 __V)
{
  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_pd (__m256d __M, __m256d __V)
{
  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_ps (__m256 __M, __m256 __V)
{
  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testz_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_testnzc_si256 (__m256i __M, __m256i __V)
{
  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
}
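/* Usage sketch: the integer test intrinsics above report on the bitwise
   AND of their operands.  For example (with __a and __b standing for
   arbitrary example inputs),

     int __disjoint = _mm256_testz_si256 (__a, __b);

   yields non-zero exactly when (__a & __b) has no bit set; testc does
   the same for (~__a & __b), and testnzc is non-zero when neither of
   those two quantities is all-zero.  The pd/ps variants apply the same
   tests to the element sign bits only.  */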
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_pd (__m256d __A)
{
  return __builtin_ia32_movmskpd256 ((__v4df)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_ps (__m256 __A)
{
  return __builtin_ia32_movmskps256 ((__v8sf)__A);
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
  __m256d __Y = __Y;
  return __Y;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
  __m256 __Y = __Y;
  return __Y;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
  __m256i __Y = __Y;
  return __Y;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_pd (void)
{
  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_ps (void)
{
  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
                                 0.0, 0.0, 0.0, 0.0 };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setzero_si256 (void)
{
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}

/* Create the vector [A B C D].  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m256d){ __D, __C, __B, __A };
}
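/* Usage sketch: the set functions take their arguments from the most
   significant element down to the least significant one, so

     __m256d __v = _mm256_set_pd (3.0, 2.0, 1.0, 0.0);

   places 0.0 in element 0 (the lowest element in memory order) and
   3.0 in element 3.  */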
/* Create the vector [A B C D E F G H].  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_ps (float __A, float __B, float __C, float __D,
               float __E, float __F, float __G, float __H)
{
  return __extension__ (__m256){ __H, __G, __F, __E,
                                 __D, __C, __B, __A };
}

/* Create the vector [A B C D E F G H].  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi32 (int __A, int __B, int __C, int __D,
                  int __E, int __F, int __G, int __H)
{
  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
                                          __D, __C, __B, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
                  short __q11, short __q10, short __q09, short __q08,
                  short __q07, short __q06, short __q05, short __q04,
                  short __q03, short __q02, short __q01, short __q00)
{
  return __extension__ (__m256i)(__v16hi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
                 char __q27, char __q26, char __q25, char __q24,
                 char __q23, char __q22, char __q21, char __q20,
                 char __q19, char __q18, char __q17, char __q16,
                 char __q15, char __q14, char __q13, char __q12,
                 char __q11, char __q10, char __q09, char __q08,
                 char __q07, char __q06, char __q05, char __q04,
                 char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m256i)(__v32qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
  };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set_epi64x (long long __A, long long __B, long long __C,
                   long long __D)
{
  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_pd (double __A)
{
  return __extension__ (__m256d){ __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_ps (float __A)
{
  return __extension__ (__m256){ __A, __A, __A, __A,
                                 __A, __A, __A, __A };
}

/* Create a vector with all elements equal to A.  */
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi32 (int __A)
{
  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
                                          __A, __A, __A, __A };
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi16 (short __A)
{
  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
                           __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi8 (char __A)
{
  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A,
                          __A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_set1_epi64x (long long __A)
{
  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
}
/* Create vectors of elements in the reversed order from the
   _mm256_set_XXX functions.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_pd (double __A, double __B, double __C, double __D)
{
  return _mm256_set_pd (__D, __C, __B, __A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_ps (float __A, float __B, float __C, float __D,
                float __E, float __F, float __G, float __H)
{
  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
                   int __E, int __F, int __G, int __H)
{
  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
                   short __q11, short __q10, short __q09, short __q08,
                   short __q07, short __q06, short __q05, short __q04,
                   short __q03, short __q02, short __q01, short __q00)
{
  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
                           __q04, __q05, __q06, __q07,
                           __q08, __q09, __q10, __q11,
                           __q12, __q13, __q14, __q15);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
                  char __q27, char __q26, char __q25, char __q24,
                  char __q23, char __q22, char __q21, char __q20,
                  char __q19, char __q18, char __q17, char __q16,
                  char __q15, char __q14, char __q13, char __q12,
                  char __q11, char __q10, char __q09, char __q08,
                  char __q07, char __q06, char __q05, char __q04,
                  char __q03, char __q02, char __q01, char __q00)
{
  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
                          __q04, __q05, __q06, __q07,
                          __q08, __q09, __q10, __q11,
                          __q12, __q13, __q14, __q15,
                          __q16, __q17, __q18, __q19,
                          __q20, __q21, __q22, __q23,
                          __q24, __q25, __q26, __q27,
                          __q28, __q29, __q30, __q31);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_setr_epi64x (long long __A, long long __B, long long __C,
                    long long __D)
{
  return _mm256_set_epi64x (__D, __C, __B, __A);
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_ps (__m256d __A)
{
  return (__m256) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd_si256 (__m256d __A)
{
  return (__m256i) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_pd (__m256 __A)
{
  return (__m256d) __A;
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps_si256 (__m256 __A)
{
  return (__m256i) __A;
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_ps (__m256i __A)
{
  return (__m256) __A;
}

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_pd (__m256i __A)
{
  return (__m256d) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd256_pd128 (__m256d __A)
{
  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps256_ps128 (__m256 __A)
{
  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi256_si128 (__m256i __A)
{
  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
}

/* When cast is done from a 128 to 256-bit type, the low 128 bits of
   the 256-bit result contain source parameter value and the upper 128
   bits of the result are undefined.  Those intrinsics shouldn't
   generate any extra moves.  */

extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castpd128_pd256 (__m128d __A)
{
  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
}

extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castps128_ps256 (__m128 __A)
{
  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
}

extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_castsi128_si256 (__m128i __A)
{
  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
}
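/* Usage sketch: because the upper half is undefined after a 128-to-256
   cast, a full 256-bit value is usually built by following the cast
   with an insert, e.g. (with __lo and __hi standing for arbitrary
   128-bit example values)

     __m256 __v = _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo),
                                        __hi, 1);

   which keeps __lo in the lower half and places __hi in the upper
   half.  */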
#ifdef __DISABLE_AVX__
#undef __DISABLE_AVX__
#pragma GCC pop_options
#endif /* __DISABLE_AVX__ */

#endif /* _AVXINTRIN_H_INCLUDED */