xref: /freebsd-src/contrib/llvm-project/clang/lib/Headers/fmaintrin.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric /*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
20b57cec5SDimitry Andric  *
30b57cec5SDimitry Andric  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric  * See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric  *
70b57cec5SDimitry Andric  *===-----------------------------------------------------------------------===
80b57cec5SDimitry Andric  */
90b57cec5SDimitry Andric 
100b57cec5SDimitry Andric #ifndef __IMMINTRIN_H
110b57cec5SDimitry Andric #error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
120b57cec5SDimitry Andric #endif /* __IMMINTRIN_H */
130b57cec5SDimitry Andric 
140b57cec5SDimitry Andric #ifndef __FMAINTRIN_H
150b57cec5SDimitry Andric #define __FMAINTRIN_H
160b57cec5SDimitry Andric 
170b57cec5SDimitry Andric /* Default attributes for the intrinsics in this file: always inlined, no debug info, compiled with the "fma" target feature, and a minimum vector width of 128 or 256 bits as named by the macro suffix. */
180b57cec5SDimitry Andric #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
190b57cec5SDimitry Andric #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
200b57cec5SDimitry Andric 
2106c3fb27SDimitry Andric /// Computes a multiply-add of 128-bit vectors of [4 x float].
2206c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
2306c3fb27SDimitry Andric ///
2406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
2506c3fb27SDimitry Andric ///
2606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PS instruction.
2706c3fb27SDimitry Andric ///
2806c3fb27SDimitry Andric /// \param __A
2906c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
3006c3fb27SDimitry Andric /// \param __B
3106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
3206c3fb27SDimitry Andric /// \param __C
3306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend.
3406c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
350b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
360b57cec5SDimitry Andric _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
370b57cec5SDimitry Andric {
380b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
390b57cec5SDimitry Andric }
400b57cec5SDimitry Andric 
4106c3fb27SDimitry Andric /// Computes a multiply-add of 128-bit vectors of [2 x double].
4206c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
4306c3fb27SDimitry Andric ///
4406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
4506c3fb27SDimitry Andric ///
4606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PD instruction.
4706c3fb27SDimitry Andric ///
4806c3fb27SDimitry Andric /// \param __A
4906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
5006c3fb27SDimitry Andric /// \param __B
5106c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
5206c3fb27SDimitry Andric /// \param __C
5306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend.
5406c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
550b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
560b57cec5SDimitry Andric _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
570b57cec5SDimitry Andric {
580b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
590b57cec5SDimitry Andric }
600b57cec5SDimitry Andric 
6106c3fb27SDimitry Andric /// Computes a scalar multiply-add of the single-precision values in the
6206c3fb27SDimitry Andric ///    low 32 bits of 128-bit vectors of [4 x float].
63*0fca6ea1SDimitry Andric ///
64*0fca6ea1SDimitry Andric /// \code{.operation}
6506c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
6606c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
6706c3fb27SDimitry Andric /// \endcode
6806c3fb27SDimitry Andric ///
6906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
7006c3fb27SDimitry Andric ///
7106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213SS instruction.
7206c3fb27SDimitry Andric ///
7306c3fb27SDimitry Andric /// \param __A
7406c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
7506c3fb27SDimitry Andric ///    32 bits.
7606c3fb27SDimitry Andric /// \param __B
7706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
7806c3fb27SDimitry Andric ///    32 bits.
7906c3fb27SDimitry Andric /// \param __C
8006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend in the low
8106c3fb27SDimitry Andric ///    32 bits.
8206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
8306c3fb27SDimitry Andric ///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
840b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
850b57cec5SDimitry Andric _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
860b57cec5SDimitry Andric {
870b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
880b57cec5SDimitry Andric }
890b57cec5SDimitry Andric 
9006c3fb27SDimitry Andric /// Computes a scalar multiply-add of the double-precision values in the
9106c3fb27SDimitry Andric ///    low 64 bits of 128-bit vectors of [2 x double].
92*0fca6ea1SDimitry Andric ///
93*0fca6ea1SDimitry Andric /// \code{.operation}
9406c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
9506c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
9606c3fb27SDimitry Andric /// \endcode
9706c3fb27SDimitry Andric ///
9806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
9906c3fb27SDimitry Andric ///
10006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213SD instruction.
10106c3fb27SDimitry Andric ///
10206c3fb27SDimitry Andric /// \param __A
10306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
10406c3fb27SDimitry Andric ///    64 bits.
10506c3fb27SDimitry Andric /// \param __B
10606c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
10706c3fb27SDimitry Andric ///    64 bits.
10806c3fb27SDimitry Andric /// \param __C
10906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend in the low
11006c3fb27SDimitry Andric ///    64 bits.
11106c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
11206c3fb27SDimitry Andric ///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
1130b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
1140b57cec5SDimitry Andric _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
1150b57cec5SDimitry Andric {
1160b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
1170b57cec5SDimitry Andric }
1180b57cec5SDimitry Andric 
11906c3fb27SDimitry Andric /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
12006c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
12106c3fb27SDimitry Andric ///
12206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
12306c3fb27SDimitry Andric ///
12406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
12506c3fb27SDimitry Andric ///
12606c3fb27SDimitry Andric /// \param __A
12706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
12806c3fb27SDimitry Andric /// \param __B
12906c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
13006c3fb27SDimitry Andric /// \param __C
13106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend.
13206c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
1330b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
1340b57cec5SDimitry Andric _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
1350b57cec5SDimitry Andric {
1360b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
1370b57cec5SDimitry Andric }
1380b57cec5SDimitry Andric 
13906c3fb27SDimitry Andric /// Computes a multiply-subtract of 128-bit vectors of [2 x double].
14006c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
14106c3fb27SDimitry Andric ///
14206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
14306c3fb27SDimitry Andric ///
14406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
14506c3fb27SDimitry Andric ///
14606c3fb27SDimitry Andric /// \param __A
14706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
14806c3fb27SDimitry Andric /// \param __B
14906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
15006c3fb27SDimitry Andric /// \param __C
15106c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend.
15206c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
1530b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
1540b57cec5SDimitry Andric _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
1550b57cec5SDimitry Andric {
1560b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
1570b57cec5SDimitry Andric }
1580b57cec5SDimitry Andric 
15906c3fb27SDimitry Andric /// Computes a scalar multiply-subtract of the single-precision values in
16006c3fb27SDimitry Andric ///    the low 32 bits of 128-bit vectors of [4 x float].
161*0fca6ea1SDimitry Andric ///
162*0fca6ea1SDimitry Andric /// \code{.operation}
16306c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
16406c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
16506c3fb27SDimitry Andric /// \endcode
16606c3fb27SDimitry Andric ///
16706c3fb27SDimitry Andric /// \headerfile <immintrin.h>
16806c3fb27SDimitry Andric ///
16906c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213SS instruction.
17006c3fb27SDimitry Andric ///
17106c3fb27SDimitry Andric /// \param __A
17206c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
17306c3fb27SDimitry Andric ///    32 bits.
17406c3fb27SDimitry Andric /// \param __B
17506c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
17606c3fb27SDimitry Andric ///    32 bits.
17706c3fb27SDimitry Andric /// \param __C
17806c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
17906c3fb27SDimitry Andric ///    32 bits.
18006c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
18106c3fb27SDimitry Andric ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
1820b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
1830b57cec5SDimitry Andric _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
1840b57cec5SDimitry Andric {
1850b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
1860b57cec5SDimitry Andric }
1870b57cec5SDimitry Andric 
18806c3fb27SDimitry Andric /// Computes a scalar multiply-subtract of the double-precision values in
18906c3fb27SDimitry Andric ///    the low 64 bits of 128-bit vectors of [2 x double].
190*0fca6ea1SDimitry Andric ///
191*0fca6ea1SDimitry Andric /// \code{.operation}
19206c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
19306c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
19406c3fb27SDimitry Andric /// \endcode
19506c3fb27SDimitry Andric ///
19606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
19706c3fb27SDimitry Andric ///
19806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213SD instruction.
19906c3fb27SDimitry Andric ///
20006c3fb27SDimitry Andric /// \param __A
20106c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
20206c3fb27SDimitry Andric ///    64 bits.
20306c3fb27SDimitry Andric /// \param __B
20406c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
20506c3fb27SDimitry Andric ///    64 bits.
20606c3fb27SDimitry Andric /// \param __C
20706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
20806c3fb27SDimitry Andric ///    64 bits.
20906c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
21006c3fb27SDimitry Andric ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
2110b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
2120b57cec5SDimitry Andric _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
2130b57cec5SDimitry Andric {
2140b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
2150b57cec5SDimitry Andric }
2160b57cec5SDimitry Andric 
21706c3fb27SDimitry Andric /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
21806c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
21906c3fb27SDimitry Andric ///
22006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
22106c3fb27SDimitry Andric ///
22206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
22306c3fb27SDimitry Andric ///
22406c3fb27SDimitry Andric /// \param __A
22506c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
22606c3fb27SDimitry Andric /// \param __B
22706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
22806c3fb27SDimitry Andric /// \param __C
22906c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend.
23006c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
2310b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
2320b57cec5SDimitry Andric _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
2330b57cec5SDimitry Andric {
2340b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
2350b57cec5SDimitry Andric }
2360b57cec5SDimitry Andric 
23706c3fb27SDimitry Andric /// Computes a negated multiply-add of 128-bit vectors of [2 x double].
23806c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
23906c3fb27SDimitry Andric ///
24006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
24106c3fb27SDimitry Andric ///
24206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
24306c3fb27SDimitry Andric ///
24406c3fb27SDimitry Andric /// \param __A
24506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
24606c3fb27SDimitry Andric /// \param __B
24706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
24806c3fb27SDimitry Andric /// \param __C
24906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend.
25006c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
2510b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
2520b57cec5SDimitry Andric _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
2530b57cec5SDimitry Andric {
2540b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
2550b57cec5SDimitry Andric }
2560b57cec5SDimitry Andric 
25706c3fb27SDimitry Andric /// Computes a scalar negated multiply-add of the single-precision values in
25806c3fb27SDimitry Andric ///    the low 32 bits of 128-bit vectors of [4 x float].
259*0fca6ea1SDimitry Andric ///
260*0fca6ea1SDimitry Andric /// \code{.operation}
26106c3fb27SDimitry Andric /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
26206c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
26306c3fb27SDimitry Andric /// \endcode
26406c3fb27SDimitry Andric ///
26506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
26606c3fb27SDimitry Andric ///
26706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213SS instruction.
26806c3fb27SDimitry Andric ///
26906c3fb27SDimitry Andric /// \param __A
27006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
27106c3fb27SDimitry Andric ///    32 bits.
27206c3fb27SDimitry Andric /// \param __B
27306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
27406c3fb27SDimitry Andric ///    32 bits.
27506c3fb27SDimitry Andric /// \param __C
27606c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend in the low
27706c3fb27SDimitry Andric ///    32 bits.
27806c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
27906c3fb27SDimitry Andric ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
2800b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
2810b57cec5SDimitry Andric _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
2820b57cec5SDimitry Andric {
2830b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
2840b57cec5SDimitry Andric }
2850b57cec5SDimitry Andric 
28606c3fb27SDimitry Andric /// Computes a scalar negated multiply-add of the double-precision values
28706c3fb27SDimitry Andric ///    in the low 64 bits of 128-bit vectors of [2 x double].
288*0fca6ea1SDimitry Andric ///
289*0fca6ea1SDimitry Andric /// \code{.operation}
29006c3fb27SDimitry Andric /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
29106c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
29206c3fb27SDimitry Andric /// \endcode
29306c3fb27SDimitry Andric ///
29406c3fb27SDimitry Andric /// \headerfile <immintrin.h>
29506c3fb27SDimitry Andric ///
29606c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213SD instruction.
29706c3fb27SDimitry Andric ///
29806c3fb27SDimitry Andric /// \param __A
29906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
30006c3fb27SDimitry Andric ///    64 bits.
30106c3fb27SDimitry Andric /// \param __B
30206c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
30306c3fb27SDimitry Andric ///    64 bits.
30406c3fb27SDimitry Andric /// \param __C
30506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend in the low
30606c3fb27SDimitry Andric ///    64 bits.
30706c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
30806c3fb27SDimitry Andric ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
3090b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
3100b57cec5SDimitry Andric _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
3110b57cec5SDimitry Andric {
3120b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
3130b57cec5SDimitry Andric }
3140b57cec5SDimitry Andric 
31506c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
31606c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
31706c3fb27SDimitry Andric ///
31806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
31906c3fb27SDimitry Andric ///
32006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
32106c3fb27SDimitry Andric ///
32206c3fb27SDimitry Andric /// \param __A
32306c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
32406c3fb27SDimitry Andric /// \param __B
32506c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
32606c3fb27SDimitry Andric /// \param __C
32706c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend.
32806c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
3290b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
3300b57cec5SDimitry Andric _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
3310b57cec5SDimitry Andric {
3320b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
3330b57cec5SDimitry Andric }
3340b57cec5SDimitry Andric 
33506c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
33606c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
33706c3fb27SDimitry Andric ///
33806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
33906c3fb27SDimitry Andric ///
34006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
34106c3fb27SDimitry Andric ///
34206c3fb27SDimitry Andric /// \param __A
34306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
34406c3fb27SDimitry Andric /// \param __B
34506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
34606c3fb27SDimitry Andric /// \param __C
34706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend.
34806c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
3490b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
3500b57cec5SDimitry Andric _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
3510b57cec5SDimitry Andric {
3520b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
3530b57cec5SDimitry Andric }
3540b57cec5SDimitry Andric 
35506c3fb27SDimitry Andric /// Computes a scalar negated multiply-subtract of the single-precision
35606c3fb27SDimitry Andric ///    values in the low 32 bits of 128-bit vectors of [4 x float].
357*0fca6ea1SDimitry Andric ///
358*0fca6ea1SDimitry Andric /// \code{.operation}
35906c3fb27SDimitry Andric /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
36006c3fb27SDimitry Andric /// result[127:32] = __A[127:32]
36106c3fb27SDimitry Andric /// \endcode
36206c3fb27SDimitry Andric ///
36306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
36406c3fb27SDimitry Andric ///
36506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
36606c3fb27SDimitry Andric ///
36706c3fb27SDimitry Andric /// \param __A
36806c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand in the low
36906c3fb27SDimitry Andric ///    32 bits.
37006c3fb27SDimitry Andric /// \param __B
37106c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier in the low
37206c3fb27SDimitry Andric ///    32 bits.
37306c3fb27SDimitry Andric /// \param __C
37406c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the subtrahend in the low
37506c3fb27SDimitry Andric ///    32 bits.
37606c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result in the low
37706c3fb27SDimitry Andric ///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
3780b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
3790b57cec5SDimitry Andric _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
3800b57cec5SDimitry Andric {
3810b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
3820b57cec5SDimitry Andric }
3830b57cec5SDimitry Andric 
38406c3fb27SDimitry Andric /// Computes a scalar negated multiply-subtract of the double-precision
38506c3fb27SDimitry Andric ///    values in the low 64 bits of 128-bit vectors of [2 x double].
386*0fca6ea1SDimitry Andric ///
387*0fca6ea1SDimitry Andric /// \code{.operation}
38806c3fb27SDimitry Andric /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
38906c3fb27SDimitry Andric /// result[127:64] = __A[127:64]
39006c3fb27SDimitry Andric /// \endcode
39106c3fb27SDimitry Andric ///
39206c3fb27SDimitry Andric /// \headerfile <immintrin.h>
39306c3fb27SDimitry Andric ///
39406c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
39506c3fb27SDimitry Andric ///
39606c3fb27SDimitry Andric /// \param __A
39706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand in the low
39806c3fb27SDimitry Andric ///    64 bits.
39906c3fb27SDimitry Andric /// \param __B
40006c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier in the low
40106c3fb27SDimitry Andric ///    64 bits.
40206c3fb27SDimitry Andric /// \param __C
40306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the subtrahend in the low
40406c3fb27SDimitry Andric ///    64 bits.
40506c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result in the low
40606c3fb27SDimitry Andric ///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
4070b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
4080b57cec5SDimitry Andric _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
4090b57cec5SDimitry Andric {
4100b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
4110b57cec5SDimitry Andric }
4120b57cec5SDimitry Andric 
41306c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
41406c3fb27SDimitry Andric ///    [4 x float].
415*0fca6ea1SDimitry Andric ///
416*0fca6ea1SDimitry Andric /// \code{.operation}
41706c3fb27SDimitry Andric /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
41806c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
41906c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
42006c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
42106c3fb27SDimitry Andric /// \endcode
42206c3fb27SDimitry Andric ///
42306c3fb27SDimitry Andric /// \headerfile <immintrin.h>
42406c3fb27SDimitry Andric ///
42506c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
42606c3fb27SDimitry Andric ///
42706c3fb27SDimitry Andric /// \param __A
42806c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
42906c3fb27SDimitry Andric /// \param __B
43006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
43106c3fb27SDimitry Andric /// \param __C
43206c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
43306c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
4340b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
4350b57cec5SDimitry Andric _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
4360b57cec5SDimitry Andric {
4370b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
4380b57cec5SDimitry Andric }
4390b57cec5SDimitry Andric 
44006c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
44106c3fb27SDimitry Andric ///    [2 x double].
442*0fca6ea1SDimitry Andric ///
443*0fca6ea1SDimitry Andric /// \code{.operation}
44406c3fb27SDimitry Andric /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
44506c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
44606c3fb27SDimitry Andric /// \endcode
44706c3fb27SDimitry Andric ///
44806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
44906c3fb27SDimitry Andric ///
45006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
45106c3fb27SDimitry Andric ///
45206c3fb27SDimitry Andric /// \param __A
45306c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
45406c3fb27SDimitry Andric /// \param __B
45506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
45606c3fb27SDimitry Andric /// \param __C
45706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
45806c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
4590b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
4600b57cec5SDimitry Andric _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
4610b57cec5SDimitry Andric {
4620b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
4630b57cec5SDimitry Andric }
4640b57cec5SDimitry Andric 
46506c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
46606c3fb27SDimitry Andric ///    [4 x float].
467*0fca6ea1SDimitry Andric ///
468*0fca6ea1SDimitry Andric /// \code{.operation}
46906c3fb27SDimitry Andric /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
47006c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
47106c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
47206c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
47306c3fb27SDimitry Andric /// \endcode
47406c3fb27SDimitry Andric ///
47506c3fb27SDimitry Andric /// \headerfile <immintrin.h>
47606c3fb27SDimitry Andric ///
47706c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
47806c3fb27SDimitry Andric ///
47906c3fb27SDimitry Andric /// \param __A
48006c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplicand.
48106c3fb27SDimitry Andric /// \param __B
48206c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the multiplier.
48306c3fb27SDimitry Andric /// \param __C
48406c3fb27SDimitry Andric ///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
48506c3fb27SDimitry Andric /// \returns A 128-bit vector of [4 x float] containing the result.
4860b57cec5SDimitry Andric static __inline__ __m128 __DEFAULT_FN_ATTRS128
4870b57cec5SDimitry Andric _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
4880b57cec5SDimitry Andric {
4890b57cec5SDimitry Andric   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
4900b57cec5SDimitry Andric }
4910b57cec5SDimitry Andric 
49206c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 128-bit vectors of
49306c3fb27SDimitry Andric ///    [2 x double].
494*0fca6ea1SDimitry Andric ///
495*0fca6ea1SDimitry Andric /// \code{.operation}
49606c3fb27SDimitry Andric /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
49706c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
49806c3fb27SDimitry Andric /// \endcode
49906c3fb27SDimitry Andric ///
50006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
50106c3fb27SDimitry Andric ///
50206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
50306c3fb27SDimitry Andric ///
50406c3fb27SDimitry Andric /// \param __A
50506c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplicand.
50606c3fb27SDimitry Andric /// \param __B
50706c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the multiplier.
50806c3fb27SDimitry Andric /// \param __C
50906c3fb27SDimitry Andric ///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
51006c3fb27SDimitry Andric /// \returns A 128-bit vector of [2 x double] containing the result.
5110b57cec5SDimitry Andric static __inline__ __m128d __DEFAULT_FN_ATTRS128
5120b57cec5SDimitry Andric _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
5130b57cec5SDimitry Andric {
5140b57cec5SDimitry Andric   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
5150b57cec5SDimitry Andric }
5160b57cec5SDimitry Andric 
51706c3fb27SDimitry Andric /// Computes a multiply-add of 256-bit vectors of [8 x float].
51806c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
51906c3fb27SDimitry Andric ///
52006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
52106c3fb27SDimitry Andric ///
52206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PS instruction.
52306c3fb27SDimitry Andric ///
52406c3fb27SDimitry Andric /// \param __A
52506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
52606c3fb27SDimitry Andric /// \param __B
52706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
52806c3fb27SDimitry Andric /// \param __C
52906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend.
53006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
5310b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
5320b57cec5SDimitry Andric _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
5330b57cec5SDimitry Andric {
5340b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
5350b57cec5SDimitry Andric }
5360b57cec5SDimitry Andric 
53706c3fb27SDimitry Andric /// Computes a multiply-add of 256-bit vectors of [4 x double].
53806c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) + __C </c>.
53906c3fb27SDimitry Andric ///
54006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
54106c3fb27SDimitry Andric ///
54206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADD213PD instruction.
54306c3fb27SDimitry Andric ///
54406c3fb27SDimitry Andric /// \param __A
54506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
54606c3fb27SDimitry Andric /// \param __B
54706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
54806c3fb27SDimitry Andric /// \param __C
54906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend.
55006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
5510b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
5520b57cec5SDimitry Andric _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
5530b57cec5SDimitry Andric {
5540b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
5550b57cec5SDimitry Andric }
5560b57cec5SDimitry Andric 
55706c3fb27SDimitry Andric /// Computes a multiply-subtract of 256-bit vectors of [8 x float].
55806c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
55906c3fb27SDimitry Andric ///
56006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
56106c3fb27SDimitry Andric ///
56206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PS instruction.
56306c3fb27SDimitry Andric ///
56406c3fb27SDimitry Andric /// \param __A
56506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
56606c3fb27SDimitry Andric /// \param __B
56706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
56806c3fb27SDimitry Andric /// \param __C
56906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the subtrahend.
57006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
5710b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
5720b57cec5SDimitry Andric _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
5730b57cec5SDimitry Andric {
5740b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
5750b57cec5SDimitry Andric }
5760b57cec5SDimitry Andric 
57706c3fb27SDimitry Andric /// Computes a multiply-subtract of 256-bit vectors of [4 x double].
57806c3fb27SDimitry Andric ///    For each element, computes <c> (__A * __B) - __C </c>.
57906c3fb27SDimitry Andric ///
58006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
58106c3fb27SDimitry Andric ///
58206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUB213PD instruction.
58306c3fb27SDimitry Andric ///
58406c3fb27SDimitry Andric /// \param __A
58506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
58606c3fb27SDimitry Andric /// \param __B
58706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
58806c3fb27SDimitry Andric /// \param __C
58906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the subtrahend.
59006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
5910b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
5920b57cec5SDimitry Andric _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
5930b57cec5SDimitry Andric {
5940b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
5950b57cec5SDimitry Andric }
5960b57cec5SDimitry Andric 
59706c3fb27SDimitry Andric /// Computes a negated multiply-add of 256-bit vectors of [8 x float].
59806c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
59906c3fb27SDimitry Andric ///
60006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
60106c3fb27SDimitry Andric ///
60206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PS instruction.
60306c3fb27SDimitry Andric ///
60406c3fb27SDimitry Andric /// \param __A
60506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
60606c3fb27SDimitry Andric /// \param __B
60706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
60806c3fb27SDimitry Andric /// \param __C
60906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend.
61006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
6110b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
6120b57cec5SDimitry Andric _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
6130b57cec5SDimitry Andric {
6140b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
6150b57cec5SDimitry Andric }
6160b57cec5SDimitry Andric 
61706c3fb27SDimitry Andric /// Computes a negated multiply-add of 256-bit vectors of [4 x double].
61806c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) + __C </c>.
61906c3fb27SDimitry Andric ///
62006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
62106c3fb27SDimitry Andric ///
62206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMADD213PD instruction.
62306c3fb27SDimitry Andric ///
62406c3fb27SDimitry Andric /// \param __A
62506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
62606c3fb27SDimitry Andric /// \param __B
62706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
62806c3fb27SDimitry Andric /// \param __C
62906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend.
63006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
6310b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
6320b57cec5SDimitry Andric _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
6330b57cec5SDimitry Andric {
6340b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
6350b57cec5SDimitry Andric }
6360b57cec5SDimitry Andric 
63706c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
63806c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
63906c3fb27SDimitry Andric ///
64006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
64106c3fb27SDimitry Andric ///
64206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
64306c3fb27SDimitry Andric ///
64406c3fb27SDimitry Andric /// \param __A
64506c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
64606c3fb27SDimitry Andric /// \param __B
64706c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
64806c3fb27SDimitry Andric /// \param __C
64906c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the subtrahend.
65006c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
6510b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
6520b57cec5SDimitry Andric _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
6530b57cec5SDimitry Andric {
6540b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
6550b57cec5SDimitry Andric }
6560b57cec5SDimitry Andric 
65706c3fb27SDimitry Andric /// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
65806c3fb27SDimitry Andric ///    For each element, computes <c> -(__A * __B) - __C </c>.
65906c3fb27SDimitry Andric ///
66006c3fb27SDimitry Andric /// \headerfile <immintrin.h>
66106c3fb27SDimitry Andric ///
66206c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
66306c3fb27SDimitry Andric ///
66406c3fb27SDimitry Andric /// \param __A
66506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
66606c3fb27SDimitry Andric /// \param __B
66706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
66806c3fb27SDimitry Andric /// \param __C
66906c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the subtrahend.
67006c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
6710b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
6720b57cec5SDimitry Andric _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
6730b57cec5SDimitry Andric {
6740b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
6750b57cec5SDimitry Andric }
6760b57cec5SDimitry Andric 
67706c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 256-bit vectors of
67806c3fb27SDimitry Andric ///    [8 x float].
679*0fca6ea1SDimitry Andric ///
680*0fca6ea1SDimitry Andric /// \code{.operation}
68106c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
68206c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
68306c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
68406c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
68506c3fb27SDimitry Andric /// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
68606c3fb27SDimitry Andric /// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
68706c3fb27SDimitry Andric /// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
68806c3fb27SDimitry Andric /// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
68906c3fb27SDimitry Andric /// \endcode
69006c3fb27SDimitry Andric ///
69106c3fb27SDimitry Andric /// \headerfile <immintrin.h>
69206c3fb27SDimitry Andric ///
69306c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
69406c3fb27SDimitry Andric ///
69506c3fb27SDimitry Andric /// \param __A
69606c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
69706c3fb27SDimitry Andric /// \param __B
69806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
69906c3fb27SDimitry Andric /// \param __C
70006c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
70106c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
7020b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
7030b57cec5SDimitry Andric _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
7040b57cec5SDimitry Andric {
7050b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
7060b57cec5SDimitry Andric }
7070b57cec5SDimitry Andric 
70806c3fb27SDimitry Andric /// Computes a multiply with alternating add/subtract of 256-bit vectors of
70906c3fb27SDimitry Andric ///    [4 x double].
710*0fca6ea1SDimitry Andric ///
711*0fca6ea1SDimitry Andric /// \code{.operation}
71206c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
71306c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
71406c3fb27SDimitry Andric /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
71506c3fb27SDimitry Andric /// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
71606c3fb27SDimitry Andric /// \endcode
71706c3fb27SDimitry Andric ///
71806c3fb27SDimitry Andric /// \headerfile <immintrin.h>
71906c3fb27SDimitry Andric ///
72006c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
72106c3fb27SDimitry Andric ///
72206c3fb27SDimitry Andric /// \param __A
72306c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
72406c3fb27SDimitry Andric /// \param __B
72506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
72606c3fb27SDimitry Andric /// \param __C
72706c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
72806c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
7290b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
7300b57cec5SDimitry Andric _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
7310b57cec5SDimitry Andric {
7320b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
7330b57cec5SDimitry Andric }
7340b57cec5SDimitry Andric 
73506c3fb27SDimitry Andric /// Computes a vector multiply with alternating add/subtract of 256-bit
73606c3fb27SDimitry Andric ///    vectors of [8 x float].
737*0fca6ea1SDimitry Andric ///
738*0fca6ea1SDimitry Andric /// \code{.operation}
73906c3fb27SDimitry Andric /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
74006c3fb27SDimitry Andric /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
74106c3fb27SDimitry Andric /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
74206c3fb27SDimitry Andric /// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
74306c3fb27SDimitry Andric /// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
74406c3fb27SDimitry Andric /// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
74506c3fb27SDimitry Andric /// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
74606c3fb27SDimitry Andric /// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
74706c3fb27SDimitry Andric /// \endcode
74806c3fb27SDimitry Andric ///
74906c3fb27SDimitry Andric /// \headerfile <immintrin.h>
75006c3fb27SDimitry Andric ///
75106c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
75206c3fb27SDimitry Andric ///
75306c3fb27SDimitry Andric /// \param __A
75406c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplicand.
75506c3fb27SDimitry Andric /// \param __B
75606c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the multiplier.
75706c3fb27SDimitry Andric /// \param __C
75806c3fb27SDimitry Andric ///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
75906c3fb27SDimitry Andric /// \returns A 256-bit vector of [8 x float] containing the result.
7600b57cec5SDimitry Andric static __inline__ __m256 __DEFAULT_FN_ATTRS256
7610b57cec5SDimitry Andric _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
7620b57cec5SDimitry Andric {
7630b57cec5SDimitry Andric   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
7640b57cec5SDimitry Andric }
7650b57cec5SDimitry Andric 
76606c3fb27SDimitry Andric /// Computes a vector multiply with alternating add/subtract of 256-bit
76706c3fb27SDimitry Andric ///    vectors of [4 x double].
768*0fca6ea1SDimitry Andric ///
769*0fca6ea1SDimitry Andric /// \code{.operation}
77006c3fb27SDimitry Andric /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
77106c3fb27SDimitry Andric /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
77206c3fb27SDimitry Andric /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
77306c3fb27SDimitry Andric /// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
77406c3fb27SDimitry Andric /// \endcode
77506c3fb27SDimitry Andric ///
77606c3fb27SDimitry Andric /// \headerfile <immintrin.h>
77706c3fb27SDimitry Andric ///
77806c3fb27SDimitry Andric /// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
77906c3fb27SDimitry Andric ///
78006c3fb27SDimitry Andric /// \param __A
78106c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplicand.
78206c3fb27SDimitry Andric /// \param __B
78306c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the multiplier.
78406c3fb27SDimitry Andric /// \param __C
78506c3fb27SDimitry Andric ///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
78606c3fb27SDimitry Andric /// \returns A 256-bit vector of [4 x double] containing the result.
7870b57cec5SDimitry Andric static __inline__ __m256d __DEFAULT_FN_ATTRS256
7880b57cec5SDimitry Andric _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
7890b57cec5SDimitry Andric {
7900b57cec5SDimitry Andric   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
7910b57cec5SDimitry Andric }
7920b57cec5SDimitry Andric 
7930b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS128
7940b57cec5SDimitry Andric #undef __DEFAULT_FN_ATTRS256
7950b57cec5SDimitry Andric 
7960b57cec5SDimitry Andric #endif /* __FMAINTRIN_H */
797