/* Copyright (C) 2011-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* AVX2 intrinsic wrappers (VPMPSADBW, VPABS*, VPACK*).  Each wrapper is
   a thin, always-inlined shim around the corresponding ia32 builtin;
   the casts only reinterpret __m256i as the element-typed vector the
   builtin expects.  */

#ifndef _IMMINTRIN_H_INCLUDED
# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
#endif

/* Sum absolute 8-bit integer difference of adjacent groups of 4
   byte integers in the first 2 operands.  Starting offsets within
   operands are determined by the 3rd mask operand.
*/
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization the inline form cannot guarantee __M is an
   immediate, so fall back to a macro that keeps it a constant
   expression.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif

/* Per-element absolute value: 32 x i8, 16 x i16, 8 x i32.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}

/* Pack wider elements of the two operands into narrower ones with
   signed (packs) or unsigned (packus) saturation.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}
/* Integer addition, element-wise; adds variants saturate (signed for
   _epi*, unsigned for _epu*).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Byte-wise align/shift of concatenated operands; the builtin takes a
   bit count, hence the * 8 on the byte offset __N.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)					\
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),		\
					(__v4di)(__m256i)(B),		\
					(int)(N) * 8))
#endif

/* 256-bit bitwise logic.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

/* Unsigned rounded averages.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Byte blend selected by the sign bit of each byte of __M.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}

/* Word blend selected by the immediate mask __M (must be a constant;
   macro form when not optimizing).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					      __M);
}
#else
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif

/* Element-wise equality compares; result elements are all-ones on
   equality, zero otherwise.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
}

/* Element-wise signed greater-than compares.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
					     (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
					     (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
					     (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
}

/* Horizontal add/subtract of adjacent element pairs; the "s" variants
   saturate.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* Multiply-and-accumulate of adjacent elements (VPMADDUBSW /
   VPMADDWD).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* Element-wise maxima, signed (_epi*) and unsigned (_epu*).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

/* Element-wise minima, signed and unsigned.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}

/* Collect the sign bit of each of the 32 bytes into an int.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}

/* Sign extensions from the low elements of a 128-bit source to a
   256-bit result.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* Zero extensions from the low elements of a 128-bit source.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}

/* Multiplies: widening signed/unsigned 32x32->64, high halves,
   low halves, and rounded-scaled 16-bit.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
}

/* Sum of absolute byte differences (VPSADBW).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* Byte shuffle controlled by a vector (VPSHUFB).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}

/* Immediate-controlled shuffles; macro forms keep the mask a constant
   expression when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif

/* Conditional negate/zero driven by the sign of __Y (VPSIGN*).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}

/* Byte-wise left shift of each 128-bit lane; the builtin takes a bit
   count, hence __N * 8.  _mm256_slli_si256 is the legacy alias of
   _mm256_bslli_epi128.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* Logical left shifts: immediate count (slli) and count in the low
   quadword of a 128-bit operand (sll).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

/* Arithmetic right shifts (16- and 32-bit elements only).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}

/* Byte-wise right shift of each 128-bit lane; _mm256_srli_si256 is
   the legacy alias of _mm256_bsrli_epi128.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif

/* Logical right shifts: immediate count (srli) and count operand
   (srl).  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}

/* Integer subtraction, element-wise; subs variants saturate.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}

/* Interleave elements from the high halves of each 128-bit lane.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

/* Interleave elements from the low halves of each 128-bit lane.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
}

/* Non-temporal aligned load (VMOVNTDQA).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* Broadcast the lowest element across all elements of the result.  */

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

/* Dword blends controlled by the immediate mask __M.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif

/* Broadcast the lowest integer element to all elements; 256- and
   128-bit destinations.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

/* Cross-lane permutes; 4x64 variants take an immediate control.  */

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)					\
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)					\
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif


/* Select/combine 128-bit lanes of the two operands per the immediate
   __M (VPERM2I128).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* Extract the 128-bit lane selected by the immediate __M.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)					\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* Insert __Y into the 128-bit lane of __X selected by the immediate
   __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X),	\
					   (__v2di)(__m128i)(Y),	\
					   (int)(M)))
#endif

/* Masked loads: load each element only where the top bit of the
   corresponding element of __M is set; other elements are zeroed.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}

/* Masked stores: store each element of __Y only where the top bit of
   the corresponding element of __M is set.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq ((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
}

/* Per-element variable shifts: each element of the first operand is
   shifted by the count in the corresponding element of the second.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return
(__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}

#ifdef __OPTIMIZE__
/* Gathers: load elements from base[index[i] * scale].  The non-mask
   forms pass a zero source and an all-ones mask (built by comparing a
   value with itself) so that every element is loaded.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);  /* all-ones */

  return (__m128d) __builtin_ia32_gathersiv2df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
						base,
						(__v4si)index,
						(__v2df)mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);  /* all-ones */

  return (__m256d) __builtin_ia32_gathersiv4df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
			  __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
						base,
						(__v4si)index,
						(__v4df)mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
						base,
						(__v2di)index,
						(__v2df)mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
			  __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
						base,
						(__v4di)index,
						(__v4df)mask,
						scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gathersiv4sf (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
					       base,
					       (__v4si)index,
					       (__v4sf)mask,
					       scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (src,
					       base,
					       (__v8si)index,
					       mask,
					       scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
			  __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
					       base,
					       (__v8si)index,
					       (__v8sf)mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
					       base,
					       (__v2di)index,
					       mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
					       base,
					       (__v2di)index,
					       (__v4sf)mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__,
__artificial__))
/* 64-bit-index gathers of 32-bit data produce only four elements, so
   the 256-bit index forms still return an __m128 / __m128i.  */
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);  /* all-ones */

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
			  __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
						  base,
						  (__v4di)index,
						  (__v4sf)mask,
						  scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  /* Integer gathers build the zero source and all-ones mask directly
     as vector literals.  */
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
			  __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
						base,
						(__v4si)index,
						(__v2di)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
			__m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
			     __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
						base,
						(__v4si)index,
						(__v4di)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
						base,
						(__v2di)index,
						(__v2di)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
			__m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
						base,
						(__v4di)index,
						(__v4di)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
						base,
						(__v4si)index,
						(__v4si)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (src,
						base,
						(__v8si)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
						base,
						(__v8si)index,
						(__v8si)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline
__m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
						base,
						(__v2di)index,
						(__v4si)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
						   base,
						   (__v4di)index,
						   mask,
						   scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
			     __m256i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
						   base,
						   (__v4di)index,
						   (__v4si)mask,
						   scale);
}
#else /* __OPTIMIZE__ */
/* Macro forms for -O0: the immediates must reach the builtin as
   constants, so each gather is spelled out as a macro.  The non-mask
   forms pass a zero source and an all-ones mask.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)_mm_set1_pd( \
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK, \
					 (int)SCALE)

#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
					 (double const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)_mm256_set1_pd( \
					   (double)(long long int) -1), \
					 (int)SCALE)

#define
_mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1685 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ 1686 (double const *)BASE, \ 1687 (__v4si)(__m128i)INDEX, \ 1688 (__v4df)(__m256d)MASK, \ 1689 (int)SCALE) 1690 1691 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \ 1692 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ 1693 (double const *)BASE, \ 1694 (__v2di)(__m128i)INDEX, \ 1695 (__v2df)_mm_set1_pd( \ 1696 (double)(long long int) -1), \ 1697 (int)SCALE) 1698 1699 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1700 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ 1701 (double const *)BASE, \ 1702 (__v2di)(__m128i)INDEX, \ 1703 (__v2df)(__m128d)MASK, \ 1704 (int)SCALE) 1705 1706 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ 1707 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ 1708 (double const *)BASE, \ 1709 (__v4di)(__m256i)INDEX, \ 1710 (__v4df)_mm256_set1_pd( \ 1711 (double)(long long int) -1), \ 1712 (int)SCALE) 1713 1714 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1715 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ 1716 (double const *)BASE, \ 1717 (__v4di)(__m256i)INDEX, \ 1718 (__v4df)(__m256d)MASK, \ 1719 (int)SCALE) 1720 1721 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \ 1722 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ 1723 (float const *)BASE, \ 1724 (__v4si)(__m128i)INDEX, \ 1725 _mm_set1_ps ((float)(int) -1), \ 1726 (int)SCALE) 1727 1728 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1729 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ 1730 (float const *)BASE, \ 1731 (__v4si)(__m128i)INDEX, \ 1732 (__v4sf)(__m128d)MASK, \ 1733 (int)SCALE) 1734 1735 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ 1736 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ 1737 (float const *)BASE, \ 1738 (__v8si)(__m256i)INDEX, \ 1739 (__v8sf)_mm256_set1_ps ( 
\ 1740 (float)(int) -1), \ 1741 (int)SCALE) 1742 1743 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1744 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ 1745 (float const *)BASE, \ 1746 (__v8si)(__m256i)INDEX, \ 1747 (__v8sf)(__m256d)MASK, \ 1748 (int)SCALE) 1749 1750 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \ 1751 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ 1752 (float const *)BASE, \ 1753 (__v2di)(__m128i)INDEX, \ 1754 (__v4sf)_mm_set1_ps ( \ 1755 (float)(int) -1), \ 1756 (int)SCALE) 1757 1758 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1759 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ 1760 (float const *)BASE, \ 1761 (__v2di)(__m128i)INDEX, \ 1762 (__v4sf)(__m128d)MASK, \ 1763 (int)SCALE) 1764 1765 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ 1766 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ 1767 (float const *)BASE, \ 1768 (__v4di)(__m256i)INDEX, \ 1769 (__v4sf)_mm_set1_ps( \ 1770 (float)(int) -1), \ 1771 (int)SCALE) 1772 1773 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1774 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ 1775 (float const *)BASE, \ 1776 (__v4di)(__m256i)INDEX, \ 1777 (__v4sf)(__m128)MASK, \ 1778 (int)SCALE) 1779 1780 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ 1781 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ 1782 (long long const *)BASE, \ 1783 (__v4si)(__m128i)INDEX, \ 1784 (__v2di)_mm_set1_epi64x (-1), \ 1785 (int)SCALE) 1786 1787 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1788 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ 1789 (long long const *)BASE, \ 1790 (__v4si)(__m128i)INDEX, \ 1791 (__v2di)(__m128i)MASK, \ 1792 (int)SCALE) 1793 1794 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ 1795 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ 1796 (long long const *)BASE, \ 1797 
					 (__v4si)(__m128i)INDEX, \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int)SCALE)

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4di)(__m256i)MASK, \
					 (int)SCALE)

#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2di)_mm_set1_epi64x (-1), \
					 (int)SCALE)

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v2di)(__m128i)MASK, \
					 (int)SCALE)

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4di)_mm256_set1_epi64x (-1), \
					 (int)SCALE)

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \
					 (long long const *)BASE, \
					 (__v4di)(__m256i)INDEX, \
					 (__v4di)(__m256i)MASK, \
					 (int)SCALE)

/* 32-bit integer gathers; the non-mask forms use an all-ones mask
   built with _mm_set1_epi32 (-1) / _mm256_set1_epi32 (-1).  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *)BASE, \
					 (__v4si)(__m128i)INDEX, \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int)SCALE)

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
					(int const *)BASE, \
					(__v4si)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK, \
					(int)SCALE)

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *)BASE, \
					 (__v8si)(__m256i)INDEX, \
					 (__v8si)_mm256_set1_epi32 (-1), \
					 (int)SCALE)

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \
					(int const *)BASE, \
					(__v8si)(__m256i)INDEX, \
					(__v8si)(__m256i)MASK, \
					(int)SCALE)

#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
					 (int const *)BASE, \
					 (__v2di)(__m128i)INDEX, \
					 (__v4si)_mm_set1_epi32 (-1), \
					 (int)SCALE)

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
					(int const *)BASE, \
					(__v2di)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK, \
					(int)SCALE)

/* 64-bit indices gathering 32-bit data yield only four elements, so
   the 256-bit-index forms return an __m128i.  */
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *)BASE, \
					    (__v4di)(__m256i)INDEX, \
					    (__v4si)_mm_set1_epi32(-1), \
					    (int)SCALE)

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \
					    (int const *)BASE, \
					    (__v4di)(__m256i)INDEX, \
					    (__v4si)(__m128i)MASK, \
					    (int)SCALE)
#endif /* __OPTIMIZE__ */