/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <emmintrin.h>

/* Define the default attributes for the functions in this file.
 *
 * When legacy 512-bit EVEX support is enabled (__EVEX512__) without
 * AVX10.1-512, "no-evex512" is added to the target features so that these
 * 128-bit intrinsics are compiled without requiring 512-bit EVEX state. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse3,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3"),           \
                 __min_vector_width__(128)))
#endif

/* Intrinsics below that are implemented as plain element shuffles are
 * additionally marked constexpr in C++11 and later, so they can appear in
 * constant expressions. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif

/// Loads data from an unaligned memory location to elements in a 128-bit
///    vector.
///
///    If the address of the data is not 16-byte aligned, the instruction may
///    read two adjacent aligned blocks of memory to retrieve the requested
///    data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
///
/// \param __p
///    A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_lddqu_si128(__m128i_u const *__p)
{
  /* The (char const *) cast matches the builtin's prototype; no alignment
   * of __p is assumed. */
  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}

/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing the left source operand.
/// \param __b
///    A 128-bit vector of [4 x float] containing the right source operand.
/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_addsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}

/// Horizontally adds the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal sums of the values are stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
///    both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hadd_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}

/// Horizontally subtracts the adjacent pairs of values contained in two
///    128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the lower
///    bits of the destination.
/// \param __b
///    A 128-bit vector of [4 x float] containing one of the source operands.
///    The horizontal differences between the values are stored in the upper
///    bits of the destination.
/// \returns A 128-bit vector of [4 x float] containing the horizontal
///    differences of both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_hsub_ps(__m128 __a, __m128 __b)
{
  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}

/// Moves and duplicates odd-indexed values from a 128-bit vector
///    of [4 x float] to float values stored in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float]. \n
///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehdup_ps(__m128 __a)
{
  /* Lane selection {1, 1, 3, 3}: each odd-indexed element is duplicated
   * into the even slot below it. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}

/// Duplicates even-indexed values from a 128-bit vector of
///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float] \n
///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
///    the destination. \n
///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
///    destination.
/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
///    values.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_moveldup_ps(__m128 __a)
{
  /* Lane selection {0, 0, 2, 2}: each even-indexed element is duplicated
   * into the odd slot above it. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}

/// Adds the even-indexed values and subtracts the odd-indexed values of
///    two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing the left source operand.
/// \param __b
///    A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the alternating sums
///    and differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_addsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally adds the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the lower bits of the
///    destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal sum of the values is stored in the upper bits of the
///    destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
///    both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hadd_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}

/// Horizontally subtracts the pairs of values contained in two 128-bit
///    vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the lower bits of
///    the destination.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
///    The horizontal difference of the values is stored in the upper bits of
///    the destination.
/// \returns A 128-bit vector of [2 x double] containing the horizontal
///    differences of both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_hsub_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}

/// Moves and duplicates one double-precision value to double-precision
///    values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param dp
///    A pointer to a double-precision value to be moved and duplicated.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
/* Implemented as a plain alias of the SSE2 load-and-broadcast intrinsic. */
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)

/// Moves and duplicates the double-precision value in the lower bits of
///    a 128-bit vector of [2 x double] to double-precision values stored in a
///    128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
///    [127:64] and [63:0] of the destination.
/// \returns A 128-bit vector of [2 x double] containing the moved and
///    duplicated values.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movedup_pd(__m128d __a)
{
  /* Lane selection {0, 0}: broadcast the low element into both slots. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}

/// Establishes a linear address memory range to be monitored and puts
///    the processor in the monitor event pending state. Data stored in the
///    monitored address range causes the processor to exit the pending state.
///
///    The \c MONITOR instruction can be used in kernel mode, and in other modes
///    if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MONITOR instruction.
///
/// \param __p
///    The memory range to be monitored. The size of the range is determined by
///    CPUID function 0000_0005h.
/// \param __extensions
///    Optional extensions for the monitoring state.
/// \param __hints
///    Optional hints for the monitoring state.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_monitor(__p, __extensions, __hints);
}

/// Used with the \c MONITOR instruction to wait while the processor is in
///    the monitor event pending state. Data stored in the monitored address
///    range, or an interrupt, causes the processor to exit the pending state.
///
///    The \c MWAIT instruction can be used in kernel mode, and in other modes
///    if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c MWAIT instruction.
///
/// \param __extensions
///    Optional extensions for the monitoring state, which can vary by
///    processor.
/// \param __hints
///    Optional hints for the monitoring state, which can vary by processor.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_mwait(unsigned __extensions, unsigned __hints)
{
  __builtin_ia32_mwait(__extensions, __hints);
}

/* Keep the attribute macros local to this header. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

#endif /* __PMMINTRIN_H */