xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/i386/avx2intrin.h (revision 4d5abbe83f525258eb479e5fca29f25cb943f379)
1 /* Copyright (C) 2011-2013 Free Software Foundation, Inc.
2 
3    This file is part of GCC.
4 
5    GCC is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3, or (at your option)
8    any later version.
9 
10    GCC is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    Under Section 7 of GPL version 3, you are granted additional
16    permissions described in the GCC Runtime Library Exception, version
17    3.1, as published by the Free Software Foundation.
18 
19    You should have received a copy of the GNU General Public License and
20    a copy of the GCC Runtime Library Exception along with this program;
21    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
22    <http://www.gnu.org/licenses/>.  */
23 
24 #ifndef _IMMINTRIN_H_INCLUDED
25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
26 #endif
27 
28 /* Sum absolute 8-bit integer difference of adjacent groups of 4
29    byte integers in the first 2 operands.  Starting offsets within
30    operands are determined by the 3rd mask operand.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
					      (__v32qi)__Y, __M);
}
#else
/* Without optimization the inline's __M would not be proven a
   compile-time constant, so expand via a macro so the immediate
   reaches the builtin directly.  */
#define _mm256_mpsadbw_epu8(X, Y, M)					\
  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),		\
					(__v32qi)(__m256i)(Y), (int)(M)))
#endif
44 
/* vpabsb: absolute value of each signed 8-bit element.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi8 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
}

/* vpabsw: absolute value of each signed 16-bit element.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi16 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
}

/* vpabsd: absolute value of each signed 32-bit element.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_abs_epi32 (__m256i __A)
{
  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
}
65 
/* vpackssdw: pack 32-bit to 16-bit with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
}

/* vpacksswb: pack 16-bit to 8-bit with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpackusdw: pack 32-bit to 16-bit with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
}

/* vpackuswb: pack 16-bit to 8-bit with unsigned saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_packus_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
}
93 
/* vpaddb: add packed 8-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpaddw: add packed 16-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpaddd: add packed 32-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
}

/* vpaddq: add packed 64-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
}

/* vpaddsb: add packed signed 8-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpaddsw: add packed signed 16-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpaddusb: add packed unsigned 8-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpaddusw: add packed unsigned 16-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_adds_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
}
149 
#ifdef __OPTIMIZE__
/* vpalignr: concatenate pairs of 16-byte lanes and shift right by
   __N bytes.  The builtin's shift operand is in bits, hence * 8.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
{
  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
					      (__v4di)__B,
					      __N * 8);
}
#else
/* In that case (__N*8) will be in vreg, and insn will not be matched. */
/* Use define instead */
#define _mm256_alignr_epi8(A, B, N)				   \
  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),	   \
					(__v4di)(__m256i)(B),	   \
					(int)(N) * 8))
#endif
167 
/* vpand: bitwise AND of the full 256 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
}

/* vpandn: bitwise (NOT __A) AND __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_andnot_si256 (__m256i __A, __m256i __B)
{
  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
}

/* vpavgb: rounded average of packed unsigned 8-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpavgw: rounded average of packed unsigned 16-bit integers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avg_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpblendvb: per-byte select — the high bit of each byte of __M
   chooses __Y over __X.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
{
  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
					       (__v32qi)__Y,
					       (__v32qi)__M);
}
204 
#ifdef __OPTIMIZE__
/* vpblendw: select 16-bit elements from __X/__Y by immediate mask
   __M (the 8-bit mask is applied to each 128-bit lane).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
					      (__v16hi)__Y,
					       __M);
}
#else
/* Macro form so __M stays a compile-time immediate at -O0.  */
#define _mm256_blend_epi16(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),		\
					(__v16hi)(__m256i)(Y), (int)(M)))
#endif
219 
/* vpcmpeqb: per-8-bit-element equality; all-ones where equal.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpcmpeqw: per-16-bit-element equality.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpcmpeqd: per-32-bit-element equality.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
}

/* vpcmpeqq: per-64-bit-element equality.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
}

/* vpcmpgtb: per-element signed greater-than, 8-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
					     (__v32qi)__B);
}

/* vpcmpgtw: per-element signed greater-than, 16-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* vpcmpgtd: per-element signed greater-than, 32-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
					     (__v8si)__B);
}

/* vpcmpgtq: per-element signed greater-than, 64-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
}
278 
/* vphaddw: horizontal add of adjacent 16-bit pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

/* vphaddd: horizontal add of adjacent 32-bit pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
}

/* vphaddsw: horizontal add of 16-bit pairs with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* vphsubw: horizontal subtract of adjacent 16-bit pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
					     (__v16hi)__Y);
}

/* vphsubd: horizontal subtract of adjacent 32-bit pairs.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
}

/* vphsubsw: horizontal subtract of 16-bit pairs with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
					      (__v16hi)__Y);
}

/* vpmaddubsw: multiply unsigned bytes of __X by signed bytes of __Y,
   horizontally add adjacent products with signed saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
						(__v32qi)__Y);
}
332 
/* vpmaddwd: multiply signed 16-bit elements, add adjacent pairs of
   32-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
					     (__v16hi)__B);
}

/* vpmaxsb: per-element signed maximum, 8-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpmaxsw: per-element signed maximum, 16-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmaxsd: per-element signed maximum, 32-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
}

/* vpmaxub: per-element unsigned maximum, 8-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpmaxuw: per-element unsigned maximum, 16-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmaxud: per-element unsigned maximum, 32-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_max_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
}

/* vpminsb: per-element signed minimum, 8-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpminsw: per-element signed minimum, 16-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpminsd: per-element signed minimum, 32-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
}

/* vpminub: per-element unsigned minimum, 8-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpminuw: per-element unsigned minimum, 16-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpminud: per-element unsigned minimum, 32-bit.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_min_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
}
424 
/* vpmovmskb: gather the sign bit of each byte into a 32-bit mask.  */
extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_movemask_epi8 (__m256i __A)
{
  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
}
431 
/* vpmovsxbw: sign-extend 16 bytes to 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
}

/* vpmovsxbd: sign-extend low 8 bytes to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
}

/* vpmovsxbq: sign-extend low 4 bytes to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
}

/* vpmovsxwd: sign-extend 8 words to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
}

/* vpmovsxwq: sign-extend low 4 words to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
}

/* vpmovsxdq: sign-extend 4 dwords to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepi32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
}

/* vpmovzxbw: zero-extend 16 bytes to 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
}

/* vpmovzxbd: zero-extend low 8 bytes to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
}

/* vpmovzxbq: zero-extend low 4 bytes to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu8_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
}

/* vpmovzxwd: zero-extend 8 words to 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
}

/* vpmovzxwq: zero-extend low 4 words to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu16_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
}

/* vpmovzxdq: zero-extend 4 dwords to 64-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtepu32_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
}
515 
/* vpmuldq: multiply the even signed 32-bit elements, producing
   64-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
}

/* vpmulhrsw: multiply signed 16-bit elements, round and keep the
   high 16 bits of the scaled product.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
					       (__v16hi)__Y);
}

/* vpmulhuw: high 16 bits of unsigned 16-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmulhw: high 16 bits of signed 16-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmullw: low 16 bits of 16-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpmulld: low 32 bits of 32-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
}

/* vpmuludq: multiply the even unsigned 32-bit elements, producing
   64-bit products.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_epu32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
}
565 
/* vpor: bitwise OR of the full 256 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
}

/* vpsadbw: sums of absolute differences of unsigned bytes, one
   64-bit sum per 8-byte group.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sad_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpshufb: per-byte shuffle of __X using the low nibbles of __Y as
   indices within each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
					     (__v32qi)__Y);
}
587 
#ifdef __OPTIMIZE__
/* vpshufd: shuffle 32-bit elements within each lane by immediate.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shuffle_epi32 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
}

/* vpshufhw: shuffle the high four 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
}

/* vpshuflw: shuffle the low four 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
{
  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
}
#else
/* Macro forms so the shuffle immediates stay constants at -O0.  */
#define _mm256_shuffle_epi32(A, N) \
  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
#define _mm256_shufflehi_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
#define _mm256_shufflelo_epi16(A, N) \
  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
#endif
617 
/* vpsignb: negate/zero/keep each byte of __X by the sign of the
   corresponding byte of __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi8 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
}

/* vpsignw: as above for 16-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi16 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
}

/* vpsignd: as above for 32-bit elements.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sign_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
}
638 
#ifdef __OPTIMIZE__
/* vpslldq: shift each 128-bit lane left by __N bytes; the builtin's
   count is in bits, hence * 8.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bslli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}

/* Alias of _mm256_bslli_epi128 under the traditional SSE2 name.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
}
#else
/* Macro forms so the shift count stays a constant at -O0.  */
#define _mm256_bslli_epi128(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_slli_si256(A, N) \
  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
659 
/* vpsllw: shift 16-bit elements left by immediate count __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
}

/* vpsllw: shift 16-bit elements left by the count in the low
   64 bits of __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
}

/* vpslld: shift 32-bit elements left by immediate count.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
}

/* vpslld: shift 32-bit elements left by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
}

/* vpsllq: shift 64-bit elements left by immediate count.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_slli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
}

/* vpsllq: shift 64-bit elements left by the count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sll_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
}

/* vpsraw: arithmetic right shift of 16-bit elements by immediate.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
}

/* vpsraw: arithmetic right shift of 16-bit elements by count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
}

/* vpsrad: arithmetic right shift of 32-bit elements by immediate.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srai_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
}

/* vpsrad: arithmetic right shift of 32-bit elements by count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sra_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
}
729 
#ifdef __OPTIMIZE__
/* vpsrldq: shift each 128-bit lane right by __N bytes; the builtin's
   count is in bits, hence * 8.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bsrli_epi128 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}

/* Alias of _mm256_bsrli_epi128 under the traditional SSE2 name.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_si256 (__m256i __A, const int __N)
{
  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
}
#else
/* Macro forms so the shift count stays a constant at -O0.  */
#define _mm256_bsrli_epi128(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#define _mm256_srli_si256(A, N) \
  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
#endif
750 
/* vpsrlw: logical right shift of 16-bit elements by immediate.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi16 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
}

/* vpsrlw: logical right shift of 16-bit elements by count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi16 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
}

/* vpsrld: logical right shift of 32-bit elements by immediate.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi32 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
}

/* vpsrld: logical right shift of 32-bit elements by count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi32 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
}

/* vpsrlq: logical right shift of 64-bit elements by immediate.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srli_epi64 (__m256i __A, int __B)
{
  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
}

/* vpsrlq: logical right shift of 64-bit elements by count in __B.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srl_epi64 (__m256i __A, __m128i __B)
{
  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
}
792 
/* vpsubb: subtract packed 8-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpsubw: subtract packed 16-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpsubd: subtract packed 32-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
}

/* vpsubq: subtract packed 64-bit integers (wrapping).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
}

/* vpsubsb: subtract packed signed 8-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpsubsw: subtract packed signed 16-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpsubusb: subtract packed unsigned 8-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpsubusw: subtract packed unsigned 16-bit integers with saturation.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_subs_epu16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
}
848 
/* vpunpckhbw: interleave the high bytes of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpunpckhwd: interleave the high 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpunpckhdq: interleave the high 32-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
}

/* vpunpckhqdq: interleave the high 64-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
}

/* vpunpcklbw: interleave the low bytes of each 128-bit lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
}

/* vpunpcklwd: interleave the low 16-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
}

/* vpunpckldq: interleave the low 32-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

/* vpunpcklqdq: interleave the low 64-bit elements of each lane.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}
904 
/* vpxor: bitwise XOR of the full 256 bits.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
}

/* vmovntdqa: non-temporal aligned 256-bit load.  NOTE(review): the
   cast to (__v4di *) drops the const qualifier, matching the
   builtin's prototype in this GCC version — intentional here, not a
   write through __X.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}
917 }
918 
919 extern __inline __m128
920 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
921 _mm_broadcastss_ps (__m128 __X)
922 {
923   return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
924 }
925 
926 extern __inline __m256
927 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
928 _mm256_broadcastss_ps (__m128 __X)
929 {
930   return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
931 }
932 
933 extern __inline __m256d
934 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
935 _mm256_broadcastsd_pd (__m128d __X)
936 {
937   return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
938 }
939 
940 extern __inline __m256i
941 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
942 _mm256_broadcastsi128_si256 (__m128i __X)
943 {
944   return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
945 }
946 
/* Select 32-bit elements from __X or __Y according to the bits of the
   immediate __M (vpblendd).  A macro fallback is used when not
   optimizing, since the immediate must be a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

/* 256-bit variant of _mm_blend_epi32 (vpblendd).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif
976 
/* Broadcast the lowest byte of __X to all 32 bytes (vpbroadcastb).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

/* Broadcast the lowest 16-bit word of __X to all 16 words (vpbroadcastw).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

/* Broadcast the lowest 32-bit element of __X to all 8 elements
   (vpbroadcastd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

/* Broadcast the lowest 64-bit element of __X to all 4 elements
   (vpbroadcastq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* 128-bit versions of the above broadcasts.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}
1032 
/* Permute the eight 32-bit elements of __X using the per-element
   indices in __Y (vpermd).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

/* Permute the four doubles of __X by the 2-bit index fields of the
   immediate __M (vpermpd).  Macro fallback when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)			       \
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

/* Permute the eight floats of __X using the indices in __Y (vpermps).  */
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

/* Permute the four 64-bit elements of __X by the 2-bit index fields of
   the immediate __M (vpermq).  Macro fallback when not optimizing.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)			       \
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif
1070 
1071 
/* Select/shuffle 128-bit lanes from __X and __Y under control of the
   immediate __M (vperm2i128).  Macro fallback when not optimizing,
   since the immediate must be a compile-time constant.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* Extract the 128-bit lane of __X selected by __M (vextracti128).  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)				\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* Insert __Y into the 128-bit lane of __X selected by __M
   (vinserti128).  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)			 \
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
					   (__v2di)(__m128i)(Y), \
					   (int)(M)))
#endif
1109 
/* Conditionally load 32-bit elements from __X under control of the
   sign bit of each element of __M (vpmaskmovd); unselected elements
   are zeroed and their memory is not touched.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

/* As above for 64-bit elements (vpmaskmovq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

/* 128-bit masked load of 32-bit elements (vpmaskmovd).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

/* 128-bit masked load of 64-bit elements (vpmaskmovq).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}
1141 
/* Conditionally store the 32-bit elements of __Y to __X under control
   of the sign bit of each element of __M (vpmaskmovd); unselected
   memory locations are left unmodified.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

/* As above for 64-bit elements (vpmaskmovq).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

/* 128-bit masked store of 32-bit elements (vpmaskmovd).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

/* 128-bit masked store of 64-bit elements (vpmaskmovq).  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}
1169 
/* Per-element variable shifts: each element of __X is shifted by the
   count in the corresponding element of __Y.
   sllv = shift left logical (vpsllvd/vpsllvq),
   srav = shift right arithmetic (vpsravd),
   srlv = shift right logical (vpsrlvd/vpsrlvq).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

/* Arithmetic right shift exists only for 32-bit elements in AVX2.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
1239 
1240 #ifdef __OPTIMIZE__
/* Double-precision gathers.  The non-mask forms pass a zero source and
   an all-ones mask (cmpeq of a value with itself), so every element is
   gathered from base + index[i] * scale.  The mask forms gather only
   elements whose mask element has its sign bit set; others come from
   src.  i32/i64 in the name refers to the index element width.  */
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);	/* all bits set */

  return (__m128d) __builtin_ia32_gathersiv2df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
						base,
						(__v4si)index,
						(__v2df)mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);	/* all bits set */

  return (__m256d) __builtin_ia32_gathersiv4df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
			  __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
						base,
						(__v4si)index,
						(__v4df)mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);	/* all bits set */

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
						base,
						(__v2di)index,
						(__v2df)mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);	/* all bits set */

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
			  __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
						base,
						(__v4di)index,
						(__v4df)mask,
						scale);
}
1344 
/* Single-precision gathers; same scheme as the pd versions above.
   Note that _mm256_i64gather_ps returns only an __m128: four 64-bit
   indices can select at most four floats.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);	/* all bits set */

  return (__m128) __builtin_ia32_gathersiv4sf (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
					       base,
					       (__v4si)index,
					       (__v4sf)mask,
					       scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);	/* all bits set */

  return (__m256) __builtin_ia32_gathersiv8sf (src,
					       base,
					       (__v8si)index,
					       mask,
					       scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
			  __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
					       base,
					       (__v8si)index,
					       (__v8sf)mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);	/* all bits set */

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
					       base,
					       (__v2di)index,
					       mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
						base,
						(__v2di)index,
						(__v4sf)mask,
						scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);	/* all bits set */

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
			  __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
						  base,
						  (__v4di)index,
						  (__v4sf)mask,
						  scale);
}
1448 
/* 64-bit integer gathers.  The all-ones mask is built directly as a
   vector literal here (no integer cmpeq builtin is needed).  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };	/* all bits set */

  return (__m128i) __builtin_ia32_gathersiv2di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
			  __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
						base,
						(__v4si)index,
						(__v2di)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
			__m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };	/* all bits set */

  return (__m256i) __builtin_ia32_gathersiv4di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
			     __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
						base,
						(__v4si)index,
						(__v4di)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };	/* all bits set */

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
						base,
						(__v2di)index,
						(__v2di)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
			__m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };	/* all bits set */

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
						base,
						(__v4di)index,
						(__v4di)mask,
						scale);
}
1556 
/* 32-bit integer gathers; same scheme as the epi64 versions above.
   _mm256_i64gather_epi32 returns only an __m128i: four 64-bit indices
   select at most four 32-bit elements.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };	/* all bits set */

  return (__m128i) __builtin_ia32_gathersiv4si (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
						base,
						(__v4si)index,
						(__v4si)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv8si (src,
						base,
						(__v8si)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
						base,
						(__v8si)index,
						(__v8si)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };	/* all bits set */

  return (__m128i) __builtin_ia32_gatherdiv4si (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
						base,
						(__v2di)index,
						(__v4si)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };	/* all bits set */

  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
			     __m256i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
						   base,
						   (__v4di)index,
						   (__v4si)mask,
						   scale);
}
1660 #else /* __OPTIMIZE__ */
/* Non-__OPTIMIZE__ macro forms of the double-precision gathers.  The
   all-ones mask is synthesized as set1 of -1 converted to double: the
   gather hardware only tests the sign bit of each mask element, so a
   negative value selects the element.  */
#define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

#define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4si)(__m128i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)

#define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2df)_mm_set1_pd(		\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v2di)(__m128i)INDEX, \
					 (__v2df)(__m128d)MASK,	 \
					 (int)SCALE)

#define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
					 (double const *)BASE,		\
					 (__v4di)(__m256i)INDEX,	\
					 (__v4df)_mm256_set1_pd(	\
					   (double)(long long int) -1), \
					 (int)SCALE)

#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
					 (double const *)BASE,	 \
					 (__v4di)(__m256i)INDEX, \
					 (__v4df)(__m256d)MASK,	 \
					 (int)SCALE)
1720 
/* Macro form of _mm_i32gather_ps.  The mask -1.0f has its sign bit
   set, which is all the gather instruction inspects per element.  */
#define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v4si)(__m128i)INDEX,		\
					_mm_set1_ps ((float)(int) -1),	\
					(int)SCALE)
1727 
/* Macro form of _mm_mask_i32gather_ps.  SRC and MASK are float vectors
   (__m128); the previous casts routed them through __m128d, which only
   worked because GCC vector casts reinterpret bits.  Use __m128 to
   match the __OPTIMIZE__ inline version.  */
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,	 \
					(float const *)BASE,	 \
					(__v4si)(__m128i)INDEX,	 \
					(__v4sf)(__m128)MASK,	 \
					(int)SCALE)
1734 
/* Macro form of _mm256_i32gather_ps; all-ones mask via set1(-1.0f),
   whose sign bit is what the gather instruction tests.  */
#define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
					(float const *)BASE,	       \
					(__v8si)(__m256i)INDEX,	       \
					(__v8sf)_mm256_set1_ps (       \
					  (float)(int) -1),	       \
					(int)SCALE)
1742 
/* Macro form of _mm256_mask_i32gather_ps.  MASK is a float vector
   (__m256); the previous cast routed it through __m256d, which only
   worked because GCC vector casts reinterpret bits.  Use __m256 to
   match the __OPTIMIZE__ inline version.  */
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
					(float const *)BASE,	\
					(__v8si)(__m256i)INDEX, \
					(__v8sf)(__m256)MASK,	\
					(int)SCALE)
1749 
/* Macro form of _mm_i64gather_ps.  The zero source vector is a float
   vector, so build it with _mm_setzero_ps; the previous
   _mm_setzero_pd only worked because an all-zero bit pattern is the
   same for both types.  Matches the __OPTIMIZE__ inline version.  */
#define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
					(float const *)BASE,		\
					(__v2di)(__m128i)INDEX,		\
					(__v4sf)_mm_set1_ps (		\
					  (float)(int) -1),		\
					(int)SCALE)
1757 
/* Macro form of _mm_mask_i64gather_ps.  MASK is a float vector
   (__m128); the previous cast routed it through __m128d, which only
   worked because GCC vector casts reinterpret bits.  Use __m128 to
   match the __OPTIMIZE__ inline version.  */
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
					(float const *)BASE,	 \
					(__v2di)(__m128i)INDEX,	 \
					(__v4sf)(__m128)MASK,	 \
					(int)SCALE)
1764 
/* Macro forms of the 64-bit-index float gathers; the result is an
   __m128 because four 64-bit indices select at most four floats.  */
#define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
					   (float const *)BASE,		\
					   (__v4di)(__m256i)INDEX,	\
					   (__v4sf)_mm_set1_ps(		\
					     (float)(int) -1),		\
					   (int)SCALE)

#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
					   (float const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4sf)(__m128)MASK,   \
					   (int)SCALE)
1779 
/* Macro forms of the 64-bit integer gathers; all-ones masks are built
   with set1_epi64x (-1).  */
#define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE,	\
					 (__v4si)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,	  \
					 (long long const *)BASE, \
					 (__v4si)(__m128i)INDEX,  \
					 (__v2di)(__m128i)MASK,	  \
					 (int)SCALE)

#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4si)(__m128i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,	   \
					 (long long const *)BASE,  \
					 (__v4si)(__m128i)INDEX,   \
					 (__v4di)(__m256i)MASK,	   \
					 (int)SCALE)

#define _mm_i64gather_epi64(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
					 (long long const *)BASE,	\
					 (__v2di)(__m128i)INDEX,	\
					 (__v2di)_mm_set1_epi64x (-1),	\
					 (int)SCALE)

#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)	  \
  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,	  \
					 (long long const *)BASE, \
					 (__v2di)(__m128i)INDEX,  \
					 (__v2di)(__m128i)MASK,	  \
					 (int)SCALE)

#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
					 (long long const *)BASE,	   \
					 (__v4di)(__m256i)INDEX,	   \
					 (__v4di)_mm256_set1_epi64x (-1),  \
					 (int)SCALE)

#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,	   \
					 (long long const *)BASE,  \
					 (__v4di)(__m256i)INDEX,   \
					 (__v4di)(__m256i)MASK,	   \
					 (int)SCALE)
1835 
/* Macro forms of the 32-bit integer gathers; all-ones masks are built
   with set1_epi32 (-1).  The i64-index variants return __m128i since
   four 64-bit indices select at most four 32-bit elements.  */
#define _mm_i32gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v4si)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC,	\
					(int const *)BASE,	\
					(__v4si)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK,	\
					(int)SCALE)

#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)			   \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
					 (int const *)BASE,		   \
					 (__v8si)(__m256i)INDEX,	   \
					 (__v8si)_mm256_set1_epi32 (-1),   \
					 (int)SCALE)

#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,	   \
					(int const *)BASE,	   \
					(__v8si)(__m256i)INDEX,	   \
					(__v8si)(__m256i)MASK,	   \
					(int)SCALE)

#define _mm_i64gather_epi32(BASE, INDEX, SCALE)				\
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),	\
					 (int const *)BASE,		\
					 (__v2di)(__m128i)INDEX,	\
					 (__v4si)_mm_set1_epi32 (-1),	\
					 (int)SCALE)

#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC,	\
					(int const *)BASE,	\
					(__v2di)(__m128i)INDEX, \
					(__v4si)(__m128i)MASK,	\
					(int)SCALE)

#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)			   \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
					    (int const *)BASE,		   \
					    (__v4di)(__m256i)INDEX,	   \
					    (__v4si)_mm_set1_epi32(-1),	   \
					    (int)SCALE)

#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
					   (int const *)BASE,	   \
					   (__v4di)(__m256i)INDEX, \
					   (__v4si)(__m128i)MASK,  \
					   (int)SCALE)
1891 #endif  /* __OPTIMIZE__ */
1892