/* Copyright (C) 2007-2018 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

24*38fd1498Szrj #ifndef _X86INTRIN_H_INCLUDED
25*38fd1498Szrj # error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
26*38fd1498Szrj #endif
27*38fd1498Szrj
28*38fd1498Szrj #ifndef _FMA4INTRIN_H_INCLUDED
29*38fd1498Szrj #define _FMA4INTRIN_H_INCLUDED
30*38fd1498Szrj
31*38fd1498Szrj /* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */
32*38fd1498Szrj #include <ammintrin.h>
33*38fd1498Szrj
34*38fd1498Szrj #ifndef __FMA4__
35*38fd1498Szrj #pragma GCC push_options
36*38fd1498Szrj #pragma GCC target("fma4")
37*38fd1498Szrj #define __DISABLE_FMA4__
38*38fd1498Szrj #endif /* __FMA4__ */
39*38fd1498Szrj
/* 128b Floating point multiply/add type instructions.  */
41*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ps(__m128 __A,__m128 __B,__m128 __C)42*38fd1498Szrj _mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
43*38fd1498Szrj {
44*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
45*38fd1498Szrj }
46*38fd1498Szrj
47*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_pd(__m128d __A,__m128d __B,__m128d __C)48*38fd1498Szrj _mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
49*38fd1498Szrj {
50*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
51*38fd1498Szrj }
52*38fd1498Szrj
53*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ss(__m128 __A,__m128 __B,__m128 __C)54*38fd1498Szrj _mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
55*38fd1498Szrj {
56*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
57*38fd1498Szrj }
58*38fd1498Szrj
59*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_sd(__m128d __A,__m128d __B,__m128d __C)60*38fd1498Szrj _mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
61*38fd1498Szrj {
62*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
63*38fd1498Szrj }
64*38fd1498Szrj
65*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ps(__m128 __A,__m128 __B,__m128 __C)66*38fd1498Szrj _mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
67*38fd1498Szrj
68*38fd1498Szrj {
69*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
70*38fd1498Szrj }
71*38fd1498Szrj
72*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_pd(__m128d __A,__m128d __B,__m128d __C)73*38fd1498Szrj _mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
74*38fd1498Szrj {
75*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
76*38fd1498Szrj }
77*38fd1498Szrj
78*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ss(__m128 __A,__m128 __B,__m128 __C)79*38fd1498Szrj _mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
80*38fd1498Szrj {
81*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
82*38fd1498Szrj }
83*38fd1498Szrj
84*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_sd(__m128d __A,__m128d __B,__m128d __C)85*38fd1498Szrj _mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
86*38fd1498Szrj {
87*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
88*38fd1498Szrj }
89*38fd1498Szrj
90*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ps(__m128 __A,__m128 __B,__m128 __C)91*38fd1498Szrj _mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
92*38fd1498Szrj {
93*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
94*38fd1498Szrj }
95*38fd1498Szrj
96*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_pd(__m128d __A,__m128d __B,__m128d __C)97*38fd1498Szrj _mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
98*38fd1498Szrj {
99*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
100*38fd1498Szrj }
101*38fd1498Szrj
102*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ss(__m128 __A,__m128 __B,__m128 __C)103*38fd1498Szrj _mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
104*38fd1498Szrj {
105*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
106*38fd1498Szrj }
107*38fd1498Szrj
108*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_sd(__m128d __A,__m128d __B,__m128d __C)109*38fd1498Szrj _mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
110*38fd1498Szrj {
111*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
112*38fd1498Szrj }
113*38fd1498Szrj
114*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ps(__m128 __A,__m128 __B,__m128 __C)115*38fd1498Szrj _mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
116*38fd1498Szrj {
117*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
118*38fd1498Szrj }
119*38fd1498Szrj
120*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_pd(__m128d __A,__m128d __B,__m128d __C)121*38fd1498Szrj _mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
122*38fd1498Szrj {
123*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
124*38fd1498Szrj }
125*38fd1498Szrj
126*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ss(__m128 __A,__m128 __B,__m128 __C)127*38fd1498Szrj _mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
128*38fd1498Szrj {
129*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
130*38fd1498Szrj }
131*38fd1498Szrj
132*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_sd(__m128d __A,__m128d __B,__m128d __C)133*38fd1498Szrj _mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
134*38fd1498Szrj {
135*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
136*38fd1498Szrj }
137*38fd1498Szrj
138*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_ps(__m128 __A,__m128 __B,__m128 __C)139*38fd1498Szrj _mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
140*38fd1498Szrj {
141*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
142*38fd1498Szrj }
143*38fd1498Szrj
144*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_pd(__m128d __A,__m128d __B,__m128d __C)145*38fd1498Szrj _mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
146*38fd1498Szrj {
147*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
148*38fd1498Szrj }
149*38fd1498Szrj
150*38fd1498Szrj extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_ps(__m128 __A,__m128 __B,__m128 __C)151*38fd1498Szrj _mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
152*38fd1498Szrj {
153*38fd1498Szrj return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
154*38fd1498Szrj }
155*38fd1498Szrj
156*38fd1498Szrj extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_pd(__m128d __A,__m128d __B,__m128d __C)157*38fd1498Szrj _mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
158*38fd1498Szrj {
159*38fd1498Szrj return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
160*38fd1498Szrj }
161*38fd1498Szrj
/* 256b Floating point multiply/add type instructions.  */
163*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_ps(__m256 __A,__m256 __B,__m256 __C)164*38fd1498Szrj _mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
165*38fd1498Szrj {
166*38fd1498Szrj return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
167*38fd1498Szrj }
168*38fd1498Szrj
169*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_pd(__m256d __A,__m256d __B,__m256d __C)170*38fd1498Szrj _mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
171*38fd1498Szrj {
172*38fd1498Szrj return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
173*38fd1498Szrj }
174*38fd1498Szrj
175*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_ps(__m256 __A,__m256 __B,__m256 __C)176*38fd1498Szrj _mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
177*38fd1498Szrj
178*38fd1498Szrj {
179*38fd1498Szrj return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
180*38fd1498Szrj }
181*38fd1498Szrj
182*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_pd(__m256d __A,__m256d __B,__m256d __C)183*38fd1498Szrj _mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
184*38fd1498Szrj {
185*38fd1498Szrj return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
186*38fd1498Szrj }
187*38fd1498Szrj
188*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_ps(__m256 __A,__m256 __B,__m256 __C)189*38fd1498Szrj _mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
190*38fd1498Szrj {
191*38fd1498Szrj return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
192*38fd1498Szrj }
193*38fd1498Szrj
194*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_pd(__m256d __A,__m256d __B,__m256d __C)195*38fd1498Szrj _mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
196*38fd1498Szrj {
197*38fd1498Szrj return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C);
198*38fd1498Szrj }
199*38fd1498Szrj
200*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_ps(__m256 __A,__m256 __B,__m256 __C)201*38fd1498Szrj _mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
202*38fd1498Szrj {
203*38fd1498Szrj return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
204*38fd1498Szrj }
205*38fd1498Szrj
206*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_pd(__m256d __A,__m256d __B,__m256d __C)207*38fd1498Szrj _mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
208*38fd1498Szrj {
209*38fd1498Szrj return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
210*38fd1498Szrj }
211*38fd1498Szrj
212*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_ps(__m256 __A,__m256 __B,__m256 __C)213*38fd1498Szrj _mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
214*38fd1498Szrj {
215*38fd1498Szrj return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
216*38fd1498Szrj }
217*38fd1498Szrj
218*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_pd(__m256d __A,__m256d __B,__m256d __C)219*38fd1498Szrj _mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
220*38fd1498Szrj {
221*38fd1498Szrj return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
222*38fd1498Szrj }
223*38fd1498Szrj
224*38fd1498Szrj extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_ps(__m256 __A,__m256 __B,__m256 __C)225*38fd1498Szrj _mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
226*38fd1498Szrj {
227*38fd1498Szrj return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
228*38fd1498Szrj }
229*38fd1498Szrj
230*38fd1498Szrj extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_pd(__m256d __A,__m256d __B,__m256d __C)231*38fd1498Szrj _mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
232*38fd1498Szrj {
233*38fd1498Szrj return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
234*38fd1498Szrj }
235*38fd1498Szrj
236*38fd1498Szrj #ifdef __DISABLE_FMA4__
237*38fd1498Szrj #undef __DISABLE_FMA4__
238*38fd1498Szrj #pragma GCC pop_options
239*38fd1498Szrj #endif /* __DISABLE_FMA4__ */
240*38fd1498Szrj
241*38fd1498Szrj #endif
242