/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__

// The CUDA version check only applies to plain CUDA compilation; the OpenMP
// NVPTX device runtime also includes this header and skips the check.
#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif

// __DEVICE__ is a helper macro with common set of attributes for the wrappers
// we implement in this file. We need static in order to avoid emitting unused
// functions and __forceinline__ helps inlining these wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ __forceinline__
#endif

// Warp-vote wrappers around the NVVM vote builtins.
__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); }
__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); }
__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); }
// Bit-reverse wrappers (libdevice).
__DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); }
__DEVICE__ unsigned long long __brevll(unsigned long long __a) {
  return __nv_brevll(__a);
}
// Breakpoint. C++ gets real overloads; in C (the OpenMP NVPTX path) we rely
// on clang's overloadable attribute. Use the __asm__/__volatile__ alternate
// keywords (not plain `asm volatile`) so the C branch also compiles in strict
// C modes such as -std=c99, where `asm` is not a keyword.
#if defined(__cplusplus)
__DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); }
__DEVICE__ void __brkpt(int __a) { __brkpt(); }
#else
__DEVICE__ void __attribute__((overloadable)) __brkpt(void) {
  __asm__ __volatile__("brkpt;");
}
__DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
#endif
__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
                                    unsigned int __c) {
  return __nv_byte_perm(__a, __b, __c);
}
__DEVICE__ int __clz(int __a) { return __nv_clz(__a); }
__DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); }
// Fast-math cosine (maps to the libdevice fast variant).
__DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); }
// Double atomics: default scope, _block (CTA-scoped), _system (system-scoped).
__DEVICE__ double __dAtomicAdd(double *__p, double __v) {
  return __nvvm_atom_add_gen_d(__p, __v);
}
__DEVICE__ double __dAtomicAdd_block(double *__p, double __v) {
  return __nvvm_atom_cta_add_gen_d(__p, __v);
}
__DEVICE__ double __dAtomicAdd_system(double *__p, double __v) {
  return __nvvm_atom_sys_add_gen_d(__p, __v);
}
// Double-precision add/div/mul with explicit rounding (_rd/_rn/_ru/_rz map
// directly onto the same-named libdevice functions).
__DEVICE__ double __dadd_rd(double __a, double __b) {
  return __nv_dadd_rd(__a, __b);
}
__DEVICE__ double __dadd_rn(double __a, double __b) {
  return __nv_dadd_rn(__a, __b);
}
__DEVICE__ double __dadd_ru(double __a, double __b) {
  return __nv_dadd_ru(__a, __b);
}
__DEVICE__ double __dadd_rz(double __a, double __b) {
  return __nv_dadd_rz(__a, __b);
}
__DEVICE__ double __ddiv_rd(double __a, double __b) {
  return __nv_ddiv_rd(__a, __b);
}
__DEVICE__ double __ddiv_rn(double __a, double __b) {
  return __nv_ddiv_rn(__a, __b);
}
__DEVICE__ double __ddiv_ru(double __a, double __b) {
  return __nv_ddiv_ru(__a, __b);
}
__DEVICE__ double __ddiv_rz(double __a, double __b) {
  return __nv_ddiv_rz(__a, __b);
}
__DEVICE__ double __dmul_rd(double __a, double __b) {
  return __nv_dmul_rd(__a, __b);
}
__DEVICE__ double __dmul_rn(double __a, double __b) {
  return __nv_dmul_rn(__a, __b);
}
__DEVICE__ double __dmul_ru(double __a, double __b) {
  return __nv_dmul_ru(__a, __b);
}
__DEVICE__ double __dmul_rz(double __a, double __b) {
  return __nv_dmul_rz(__a, __b);
}
// double -> float / int / long long / unsigned conversions with explicit
// rounding, plus bit-pattern access (hiint/loint/as_longlong).
__DEVICE__ float __double2float_rd(double __a) {
  return __nv_double2float_rd(__a);
}
__DEVICE__ float __double2float_rn(double __a) {
  return __nv_double2float_rn(__a);
}
__DEVICE__ float __double2float_ru(double __a) {
  return __nv_double2float_ru(__a);
}
__DEVICE__ float __double2float_rz(double __a) {
  return __nv_double2float_rz(__a);
}
__DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); }
__DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); }
__DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); }
__DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); }
__DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); }
__DEVICE__ long long __double2ll_rd(double __a) {
  return __nv_double2ll_rd(__a);
}
__DEVICE__ long long __double2ll_rn(double __a) {
  return __nv_double2ll_rn(__a);
}
__DEVICE__ long long __double2ll_ru(double __a) {
  return __nv_double2ll_ru(__a);
}
__DEVICE__ long long __double2ll_rz(double __a) {
  return __nv_double2ll_rz(__a);
}
__DEVICE__ int __double2loint(double __a) { return __nv_double2loint(__a); }
__DEVICE__ unsigned int __double2uint_rd(double __a) {
  return __nv_double2uint_rd(__a);
}
__DEVICE__ unsigned int __double2uint_rn(double __a) {
  return __nv_double2uint_rn(__a);
}
__DEVICE__ unsigned int __double2uint_ru(double __a) {
  return __nv_double2uint_ru(__a);
}
__DEVICE__ unsigned int __double2uint_rz(double __a) {
  return __nv_double2uint_rz(__a);
}
__DEVICE__ unsigned long long __double2ull_rd(double __a) {
  return __nv_double2ull_rd(__a);
}
__DEVICE__ unsigned long long __double2ull_rn(double __a) {
  return __nv_double2ull_rn(__a);
}
__DEVICE__ unsigned long long __double2ull_ru(double __a) {
  return __nv_double2ull_ru(__a);
}
__DEVICE__ unsigned long long __double2ull_rz(double __a) {
  return __nv_double2ull_rz(__a);
}
__DEVICE__ long long __double_as_longlong(double __a) {
  return __nv_double_as_longlong(__a);
}
// Double-precision reciprocal with explicit rounding.
__DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); }
__DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); }
// Remaining double-precision reciprocal / sqrt / sub wrappers with explicit
// rounding modes; each forwards to the same-named libdevice (__nv_*) function.
__DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); }
__DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); }
__DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); }
__DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); }
__DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); }
__DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); }
__DEVICE__ double __dsub_rd(double __a, double __b) {
  return __nv_dsub_rd(__a, __b);
}
__DEVICE__ double __dsub_rn(double __a, double __b) {
  return __nv_dsub_rn(__a, __b);
}
__DEVICE__ double __dsub_ru(double __a, double __b) {
  return __nv_dsub_ru(__a, __b);
}
__DEVICE__ double __dsub_rz(double __a, double __b) {
  return __nv_dsub_rz(__a, __b);
}
// Fast-math single-precision exponentials (map to __nv_fast_*).
__DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); }
__DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); }
// Float atomic add: default scope, _block (CTA-scoped), _system.
__DEVICE__ float __fAtomicAdd(float *__p, float __v) {
  return __nvvm_atom_add_gen_f(__p, __v);
}
__DEVICE__ float __fAtomicAdd_block(float *__p, float __v) {
  return __nvvm_atom_cta_add_gen_f(__p, __v);
}
__DEVICE__ float __fAtomicAdd_system(float *__p, float __v) {
  return __nvvm_atom_sys_add_gen_f(__p, __v);
}
// Float exchange is implemented via the int exchange builtin, bit-casting the
// value with __nv_float_as_int/__nv_int_as_float.
__DEVICE__ float __fAtomicExch(float *__p, float __v) {
  return __nv_int_as_float(
      __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
}
__DEVICE__ float __fAtomicExch_block(float *__p, float __v) {
  return __nv_int_as_float(
      __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
}
__DEVICE__ float __fAtomicExch_system(float *__p, float __v) {
  return __nv_int_as_float(
      __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v)));
}
// Single-precision add/div with explicit rounding modes.
__DEVICE__ float __fadd_rd(float __a, float __b) {
  return __nv_fadd_rd(__a, __b);
}
__DEVICE__ float __fadd_rn(float __a, float __b) {
  return __nv_fadd_rn(__a, __b);
}
__DEVICE__ float __fadd_ru(float __a, float __b) {
  return __nv_fadd_ru(__a, __b);
}
__DEVICE__ float __fadd_rz(float __a, float __b) {
  return __nv_fadd_rz(__a, __b);
}
__DEVICE__ float __fdiv_rd(float __a, float __b) {
  return __nv_fdiv_rd(__a, __b);
}
__DEVICE__ float __fdiv_rn(float __a, float __b) {
  return __nv_fdiv_rn(__a, __b);
}
__DEVICE__ float __fdiv_ru(float __a, float __b) {
  return __nv_fdiv_ru(__a, __b);
}
__DEVICE__ float __fdiv_rz(float __a, float __b) {
  return __nv_fdiv_rz(__a, __b);
}
// Fast approximate division.
__DEVICE__ float __fdividef(float __a, float __b) {
  return __nv_fast_fdividef(__a, __b);
}
// Find-first-set and classification helpers.
__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); }
__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); }
__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); }
__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); }
#ifdef _MSC_VER
// MSVC-only declaration; no definition in this header.
__DEVICE__ int __finitel(long double __a);
#endif
// float -> int / long long / unsigned conversions with explicit rounding.
__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); }
__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); }
__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); }
__DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); }
__DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); }
__DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); }
__DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); }
__DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); }
__DEVICE__ unsigned int __float2uint_rd(float __a) {
  return __nv_float2uint_rd(__a);
}
__DEVICE__ unsigned int __float2uint_rn(float __a) {
  return __nv_float2uint_rn(__a);
}
__DEVICE__ unsigned int __float2uint_ru(float __a) {
  return __nv_float2uint_ru(__a);
}
__DEVICE__ unsigned int __float2uint_rz(float __a) {
  return __nv_float2uint_rz(__a);
}
__DEVICE__ unsigned long long __float2ull_rd(float __a) {
  return __nv_float2ull_rd(__a);
}
__DEVICE__ unsigned long long __float2ull_rn(float __a) {
  return __nv_float2ull_rn(__a);
}
__DEVICE__ unsigned long long __float2ull_ru(float __a) {
  return __nv_float2ull_ru(__a);
}
__DEVICE__ unsigned long long __float2ull_rz(float __a) {
  return __nv_float2ull_rz(__a);
}
// Bit-pattern reinterpretation.
__DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); }
__DEVICE__ unsigned int __float_as_uint(float __a) {
  return __nv_float_as_uint(__a);
}
// Fused multiply-add, double and float, with explicit rounding; the
// __fmaf_ieee_* variants forward to the libdevice ieee-named functions.
__DEVICE__ double __fma_rd(double __a, double __b, double __c) {
  return __nv_fma_rd(__a, __b, __c);
}
__DEVICE__ double __fma_rn(double __a, double __b, double __c) {
  return __nv_fma_rn(__a, __b, __c);
}
__DEVICE__ double __fma_ru(double __a, double __b, double __c) {
  return __nv_fma_ru(__a, __b, __c);
}
__DEVICE__ double __fma_rz(double __a, double __b, double __c) {
  return __nv_fma_rz(__a, __b, __c);
}
__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) {
  return __nv_fmaf_ieee_rd(__a, __b, __c);
}
__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) {
  return __nv_fmaf_ieee_rn(__a, __b, __c);
}
__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) {
  return __nv_fmaf_ieee_ru(__a, __b, __c);
}
__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) {
  return __nv_fmaf_ieee_rz(__a, __b, __c);
}
__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) {
  return __nv_fmaf_rd(__a, __b, __c);
}
__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) {
  return __nv_fmaf_rn(__a, __b, __c);
}
__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) {
  return __nv_fmaf_ru(__a, __b, __c);
}
__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) {
  return __nv_fmaf_rz(__a, __b, __c);
}
// Single-precision mul/rcp/sqrt/sub with explicit rounding.
__DEVICE__ float __fmul_rd(float __a, float __b) {
  return __nv_fmul_rd(__a, __b);
}
__DEVICE__ float __fmul_rn(float __a, float __b) {
  return __nv_fmul_rn(__a, __b);
}
__DEVICE__ float __fmul_ru(float __a, float __b) {
  return __nv_fmul_ru(__a, __b);
}
__DEVICE__ float __fmul_rz(float __a, float __b) {
  return __nv_fmul_rz(__a, __b);
}
__DEVICE__ float __frcp_rd(float __a) { return __nv_frcp_rd(__a); }
__DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); }
__DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); }
__DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); }
__DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); }
__DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); }
__DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); }
__DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); }
__DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); }
__DEVICE__ float __fsub_rd(float __a, float __b) {
  return __nv_fsub_rd(__a, __b);
}
__DEVICE__ float __fsub_rn(float __a, float __b) {
  return __nv_fsub_rn(__a, __b);
}
__DEVICE__ float __fsub_ru(float __a, float __b) {
  return __nv_fsub_ru(__a, __b);
}
__DEVICE__ float __fsub_rz(float __a, float __b) {
  return __nv_fsub_rz(__a, __b);
}
__DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); }
// Assemble a double from separate high/low 32-bit halves.
__DEVICE__ double __hiloint2double(int __a, int __b) {
  return __nv_hiloint2double(__a, __b);
}
// Signed int atomics: default scope, _block (CTA-scoped), _system.
__DEVICE__ int __iAtomicAdd(int *__p, int __v) {
  return __nvvm_atom_add_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) {
  return __nvvm_atom_cta_add_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) {
  return __nvvm_atom_sys_add_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicAnd(int *__p, int __v) {
  return __nvvm_atom_and_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicAnd_block(int *__p, int __v) {
  return __nvvm_atom_cta_and_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicAnd_system(int *__p, int __v) {
  return __nvvm_atom_sys_and_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) {
  return __nvvm_atom_cas_gen_i(__p, __cmp, __v);
}
__DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) {
  return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v);
}
__DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) {
  return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v);
}
__DEVICE__ int __iAtomicExch(int *__p, int __v) {
  return __nvvm_atom_xchg_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicExch_block(int *__p, int __v) {
  return __nvvm_atom_cta_xchg_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicExch_system(int *__p, int __v) {
  return __nvvm_atom_sys_xchg_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicMax(int *__p, int __v) {
  return __nvvm_atom_max_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicMax_block(int *__p, int __v) {
  return __nvvm_atom_cta_max_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicMax_system(int *__p, int __v) {
  return __nvvm_atom_sys_max_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicMin(int *__p, int __v) {
  return __nvvm_atom_min_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicMin_block(int *__p, int __v) {
  return __nvvm_atom_cta_min_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicMin_system(int *__p, int __v) {
  return __nvvm_atom_sys_min_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicOr(int *__p, int __v) {
  return __nvvm_atom_or_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicOr_block(int *__p, int __v) {
  return __nvvm_atom_cta_or_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicOr_system(int *__p, int __v) {
  return __nvvm_atom_sys_or_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicXor(int *__p, int __v) {
  return __nvvm_atom_xor_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicXor_block(int *__p, int __v) {
  return __nvvm_atom_cta_xor_gen_i(__p, __v);
}
__DEVICE__ int __iAtomicXor_system(int *__p, int __v) {
  return __nvvm_atom_sys_xor_gen_i(__p, __v);
}
// Signed long long atomic max/min.
__DEVICE__ long long __illAtomicMax(long long *__p, long long __v) {
  return __nvvm_atom_max_gen_ll(__p, __v);
}
__DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) {
  return __nvvm_atom_cta_max_gen_ll(__p, __v);
}
__DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) {
  return __nvvm_atom_sys_max_gen_ll(__p, __v);
}
__DEVICE__ long long __illAtomicMin(long long *__p, long long __v) {
  return __nvvm_atom_min_gen_ll(__p, __v);
}
__DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) {
  return __nvvm_atom_cta_min_gen_ll(__p, __v);
}
__DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) {
  return __nvvm_atom_sys_min_gen_ll(__p, __v);
}
// int -> double/float conversions and classification helpers.
__DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); }
__DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); }
__DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); }
__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); }
__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); }
__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); }
__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); }
__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); }
__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); }
#ifdef _MSC_VER
// MSVC-only declaration; no definition in this header.
__DEVICE__ int __isinfl(long double __a);
#endif
__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); }
__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); }
#ifdef _MSC_VER
// MSVC-only declaration; no definition in this header.
__DEVICE__ int __isnanl(long double __a);
#endif
// long long -> double/float conversions with explicit rounding.
__DEVICE__ double __ll2double_rd(long long __a) {
  return __nv_ll2double_rd(__a);
}
__DEVICE__ double __ll2double_rn(long long __a) {
  return __nv_ll2double_rn(__a);
}
__DEVICE__ double __ll2double_ru(long long __a) {
  return __nv_ll2double_ru(__a);
}
__DEVICE__ double __ll2double_rz(long long __a) {
  return __nv_ll2double_rz(__a);
}
__DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); }
__DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); }
__DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); }
__DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); }
// Signed long long atomic and/or/xor.
__DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) {
  return __nvvm_atom_and_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) {
  return __nvvm_atom_cta_and_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) {
  return __nvvm_atom_sys_and_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicOr(long long *__p, long long __v) {
  return __nvvm_atom_or_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) {
  return __nvvm_atom_cta_or_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) {
  return __nvvm_atom_sys_or_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicXor(long long *__p, long long __v) {
  return __nvvm_atom_xor_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) {
  return __nvvm_atom_cta_xor_gen_ll(__p, __v);
}
__DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) {
  return __nvvm_atom_sys_xor_gen_ll(__p, __v);
}
// Fast-math logarithms.
__DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); }
__DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); }
__DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); }
__DEVICE__ double __longlong_as_double(long long __a) {
  return __nv_longlong_as_double(__a);
}
// 24-bit and high-half multiplies.
__DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); }
__DEVICE__ long long __mul64hi(long long __a, long long __b) {
  return __nv_mul64hi(__a, __b);
}
__DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); }
// Performance-monitor counter reads (PTX %pm0-%pm3 special registers).
__DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
__DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
__DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
__DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }
__DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
__DEVICE__ float __powf(float __a, float __b) {
  return __nv_fast_powf(__a, __b);
}

// Parameter must have a known integer value.
// Emits a pmevent PTX instruction; the "i" constraint requires a compile-time
// integer constant argument. Use the __asm__/__volatile__ alternate keywords
// so the macro also expands correctly in strict C modes (plain `asm` is not a
// keyword under e.g. -std=c99).
#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a))
__DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
  return __nv_sad(__a, __b, __c);
}
__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); }
__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); }
__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); }
// Fast-math sincos/sin/tan wrappers.
__DEVICE__ void __sincosf(float __a, float *__s, float *__c) {
  return __nv_fast_sincosf(__a, __s, __c);
}
__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); }
// Barrier-with-reduction wrappers around the bar0 builtins.
__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); }
__DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); }
__DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); }
__DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
// Memory fences at device (gl), CTA and system scope. Note: the originals
// had stray top-level semicolons after the _block/_system definitions, which
// are empty declarations and warn under -Wextra-semi/-pedantic; removed.
__DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }
__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }
// Abort the kernel. __asm__/__volatile__ alternate keywords for strict-C
// compatibility, matching __prof_trigger above.
__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
// Unsigned int atomics. Where NVVM only provides a signed builtin the pointer
// is cast to int*; the operation is on the bit pattern, so the result is the
// same.
__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_add_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_add_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_add_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_and_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_and_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_and_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp,
                                     unsigned int __v) {
  return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v);
}
__DEVICE__ unsigned int
__uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
  return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v);
}
__DEVICE__ unsigned int
__uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) {
  return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v);
}
// Inc/Dec have dedicated unsigned builtins (wrap-around semantics live in the
// builtin, not here).
__DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_dec_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_dec_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_dec_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_xchg_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p,
                                             unsigned int __v) {
  return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_inc_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_inc_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_inc_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_max_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_max_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_max_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_min_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_min_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_min_gen_ui(__p, __v);
}
__DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_or_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_cta_or_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_sys_or_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_xor_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p,
                                           unsigned int __v) {
  return __nvvm_atom_cta_xor_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p,
                                            unsigned int __v) {
  return __nvvm_atom_sys_xor_gen_i((int *)__p, __v);
}
__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) {
  return __nv_uhadd(__a, __b);
}
// unsigned int -> double/float conversions.
__DEVICE__ double __uint2double_rn(unsigned int __a) {
  return __nv_uint2double_rn(__a);
}
__DEVICE__ float __uint2float_rd(unsigned int __a) {
  return __nv_uint2float_rd(__a);
}
// Remaining unsigned int -> float conversions with explicit rounding.
__DEVICE__ float __uint2float_rn(unsigned int __a) {
  return __nv_uint2float_rn(__a);
}
__DEVICE__ float __uint2float_ru(unsigned int __a) {
  return __nv_uint2float_ru(__a);
}
__DEVICE__ float __uint2float_rz(unsigned int __a) {
  return __nv_uint2float_rz(__a);
}
__DEVICE__ float __uint_as_float(unsigned int __a) {
  return __nv_uint_as_float(__a);
}
// unsigned long long -> double/float conversions with explicit rounding.
__DEVICE__ double __ull2double_rd(unsigned long long __a) {
  return __nv_ull2double_rd(__a);
}
__DEVICE__ double __ull2double_rn(unsigned long long __a) {
  return __nv_ull2double_rn(__a);
}
__DEVICE__ double __ull2double_ru(unsigned long long __a) {
  return __nv_ull2double_ru(__a);
}
__DEVICE__ double __ull2double_rz(unsigned long long __a) {
  return __nv_ull2double_rz(__a);
}
__DEVICE__ float __ull2float_rd(unsigned long long __a) {
  return __nv_ull2float_rd(__a);
}
__DEVICE__ float __ull2float_rn(unsigned long long __a) {
  return __nv_ull2float_rn(__a);
}
__DEVICE__ float __ull2float_ru(unsigned long long __a) {
  return __nv_ull2float_ru(__a);
}
__DEVICE__ float __ull2float_rz(unsigned long long __a) {
  return __nv_ull2float_rz(__a);
}
// 64-bit unsigned atomics. Where NVVM only has a signed builtin the pointer
// is cast to long long*; max/min use the dedicated _ull builtins.
__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p,
                                             unsigned long long __v) {
  return __nvvm_atom_add_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p,
                                                   unsigned long long __v) {
  return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p,
                                                    unsigned long long __v) {
  return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p,
                                             unsigned long long __v) {
  return __nvvm_atom_and_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p,
                                                   unsigned long long __v) {
  return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p,
                                                    unsigned long long __v) {
  return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p,
                                             unsigned long long __cmp,
                                             unsigned long long __v) {
  return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v);
}
__DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p,
                                                   unsigned long long __cmp,
                                                   unsigned long long __v) {
  return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v);
}
__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p,
                                                    unsigned long long __cmp,
                                                    unsigned long long __v) {
  return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v);
}
__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p,
                                              unsigned long long __v) {
  return __nvvm_atom_xchg_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p,
                                                    unsigned long long __v) {
  return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p,
                                                     unsigned long long __v) {
  return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p,
                                             unsigned long long __v) {
  return __nvvm_atom_max_gen_ull(__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p,
                                                   unsigned long long __v) {
  return __nvvm_atom_cta_max_gen_ull(__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p,
                                                    unsigned long long __v) {
  return __nvvm_atom_sys_max_gen_ull(__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p,
                                             unsigned long long __v) {
  return __nvvm_atom_min_gen_ull(__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p,
                                                   unsigned long long __v) {
  return __nvvm_atom_cta_min_gen_ull(__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p,
                                                    unsigned long long __v) {
  return __nvvm_atom_sys_min_gen_ull(__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p,
                                            unsigned long long __v) {
  return __nvvm_atom_or_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p,
                                                  unsigned long long __v) {
  return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p,
                                                   unsigned long long __v) {
  return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p,
                                             unsigned long long __v) {
  return __nvvm_atom_xor_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p,
                                                   unsigned long long __v) {
  return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v);
}
__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p,
                                                    unsigned long long __v) {
  return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v);
}
// Unsigned 24-bit / high-half multiplies and rounded halving add.
__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) {
  return __nv_umul24(__a, __b);
}
__DEVICE__ unsigned long long __umul64hi(unsigned long long __a,
                                         unsigned long long __b) {
  return __nv_umul64hi(__a, __b);
}
__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) {
  return __nv_umulhi(__a, __b);
}
__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) {
  return __nv_urhadd(__a, __b);
}
__DEVICE__
// __usad (sum of absolute difference, forwarded to libdevice) followed by
// the first group of SIMD-in-a-word intrinsics. For CUDA 9.0..9.1 these are
// thin wrappers over libdevice's __nv_v* functions; the "2" suffix treats
// the 32-bit word as two 16-bit lanes, "4" as four 8-bit lanes; "s"/"u"
// variants forward to signed/unsigned libdevice counterparts and "ss"/"us"
// to the saturating ones.
unsigned int __usad(unsigned int __a, unsigned int __b, 798 unsigned int __c) { 799 return __nv_usad(__a, __b, __c); 800 } 801 802 #if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 803 __DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); } 804 __DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); } 805 __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { 806 return __nv_vabsdiffs2(__a, __b); 807 } 808 __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { 809 return __nv_vabsdiffs4(__a, __b); 810 } 811 __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { 812 return __nv_vabsdiffu2(__a, __b); 813 } 814 __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { 815 return __nv_vabsdiffu4(__a, __b); 816 } 817 __DEVICE__ unsigned int __vabsss2(unsigned int __a) { 818 return __nv_vabsss2(__a); 819 } 820 __DEVICE__ unsigned int __vabsss4(unsigned int __a) { 821 return __nv_vabsss4(__a); 822 } 823 __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { 824 return __nv_vadd2(__a, __b); 825 } 826 __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { 827 return __nv_vadd4(__a, __b); 828 } 829 __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { 830 return __nv_vaddss2(__a, __b); 831 } 832 __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { 833 return __nv_vaddss4(__a, __b); 834 } 835 __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { 836 return __nv_vaddus2(__a, __b); 837 } 838 __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { 839 return __nv_vaddus4(__a, __b); 840 } 841 __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { 842 return __nv_vavgs2(__a, __b); 843 } 844 __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { 845 return __nv_vavgs4(__a, __b); 846 } 847 __DEVICE__ unsigned int __vavgu2(unsigned int
// CUDA 9.0..9.1 continued: unsigned averages (__vavgu*) and the per-lane
// comparison intrinsics (__vcmpeq/ges/geu/gts/gtu/les/leu/lts), all simple
// forwards to the matching libdevice __nv_v* functions.
__a, unsigned int __b) { 848 return __nv_vavgu2(__a, __b); 849 } 850 __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { 851 return __nv_vavgu4(__a, __b); 852 } 853 __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { 854 return __nv_vcmpeq2(__a, __b); 855 } 856 __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { 857 return __nv_vcmpeq4(__a, __b); 858 } 859 __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { 860 return __nv_vcmpges2(__a, __b); 861 } 862 __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { 863 return __nv_vcmpges4(__a, __b); 864 } 865 __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { 866 return __nv_vcmpgeu2(__a, __b); 867 } 868 __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { 869 return __nv_vcmpgeu4(__a, __b); 870 } 871 __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { 872 return __nv_vcmpgts2(__a, __b); 873 } 874 __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { 875 return __nv_vcmpgts4(__a, __b); 876 } 877 __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { 878 return __nv_vcmpgtu2(__a, __b); 879 } 880 __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { 881 return __nv_vcmpgtu4(__a, __b); 882 } 883 __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { 884 return __nv_vcmples2(__a, __b); 885 } 886 __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { 887 return __nv_vcmples4(__a, __b); 888 } 889 __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { 890 return __nv_vcmpleu2(__a, __b); 891 } 892 __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { 893 return __nv_vcmpleu4(__a, __b); 894 } 895 __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { 896 return __nv_vcmplts2(__a, __b); 897 } 898 __DEVICE__ unsigned int
// CUDA 9.0..9.1 continued: remaining compares (__vcmplts4/ltu/ne), unsigned
// halving-adds (__vhaddu*), per-lane signed/unsigned min/max, and per-lane
// negation (__vneg*, plus saturating __vnegss2) — all libdevice forwards.
__vcmplts4(unsigned int __a, unsigned int __b) { 899 return __nv_vcmplts4(__a, __b); 900 } 901 __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { 902 return __nv_vcmpltu2(__a, __b); 903 } 904 __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { 905 return __nv_vcmpltu4(__a, __b); 906 } 907 __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { 908 return __nv_vcmpne2(__a, __b); 909 } 910 __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { 911 return __nv_vcmpne4(__a, __b); 912 } 913 __DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { 914 return __nv_vhaddu2(__a, __b); 915 } 916 __DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { 917 return __nv_vhaddu4(__a, __b); 918 } 919 __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { 920 return __nv_vmaxs2(__a, __b); 921 } 922 __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { 923 return __nv_vmaxs4(__a, __b); 924 } 925 __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { 926 return __nv_vmaxu2(__a, __b); 927 } 928 __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { 929 return __nv_vmaxu4(__a, __b); 930 } 931 __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { 932 return __nv_vmins2(__a, __b); 933 } 934 __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { 935 return __nv_vmins4(__a, __b); 936 } 937 __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { 938 return __nv_vminu2(__a, __b); 939 } 940 __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { 941 return __nv_vminu4(__a, __b); 942 } 943 __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); } 944 __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); } 945 __DEVICE__ unsigned int __vnegss2(unsigned int __a) { 946 return __nv_vnegss2(__a); 947 } 948 __DEVICE__ unsigned
// CUDA 9.0..9.1 continued: saturating negate (__vnegss4), per-lane
// sum-of-absolute-difference (__vsads*/__vsadu*), and the first of the
// per-lane "set" intrinsics (__vseteq/ges/geu/gts/gtu/les) — all forwards
// to the matching libdevice __nv_v* functions.
int __vnegss4(unsigned int __a) { 949 return __nv_vnegss4(__a); 950 } 951 __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { 952 return __nv_vsads2(__a, __b); 953 } 954 __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { 955 return __nv_vsads4(__a, __b); 956 } 957 __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { 958 return __nv_vsadu2(__a, __b); 959 } 960 __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { 961 return __nv_vsadu4(__a, __b); 962 } 963 __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { 964 return __nv_vseteq2(__a, __b); 965 } 966 __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { 967 return __nv_vseteq4(__a, __b); 968 } 969 __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { 970 return __nv_vsetges2(__a, __b); 971 } 972 __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { 973 return __nv_vsetges4(__a, __b); 974 } 975 __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { 976 return __nv_vsetgeu2(__a, __b); 977 } 978 __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { 979 return __nv_vsetgeu4(__a, __b); 980 } 981 __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { 982 return __nv_vsetgts2(__a, __b); 983 } 984 __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { 985 return __nv_vsetgts4(__a, __b); 986 } 987 __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { 988 return __nv_vsetgtu2(__a, __b); 989 } 990 __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { 991 return __nv_vsetgtu4(__a, __b); 992 } 993 __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { 994 return __nv_vsetles2(__a, __b); 995 } 996 __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { 997 return __nv_vsetles4(__a, __b); 998 } 999 __DEVICE__ unsigned int
// Last of the CUDA 9.0..9.1 libdevice wrappers: remaining __vset* compares,
// per-lane subtraction (__vsub*, with signed/unsigned saturating variants).
// Then the #else branch begins: from CUDA 9.2 onward NVIDIA stopped shipping
// these as bitcode/inline-asm, so the header reimplements them below.
__vsetleu2(unsigned int __a, unsigned int __b) { 1000 return __nv_vsetleu2(__a, __b); 1001 } 1002 __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { 1003 return __nv_vsetleu4(__a, __b); 1004 } 1005 __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { 1006 return __nv_vsetlts2(__a, __b); 1007 } 1008 __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { 1009 return __nv_vsetlts4(__a, __b); 1010 } 1011 __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { 1012 return __nv_vsetltu2(__a, __b); 1013 } 1014 __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { 1015 return __nv_vsetltu4(__a, __b); 1016 } 1017 __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { 1018 return __nv_vsetne2(__a, __b); 1019 } 1020 __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { 1021 return __nv_vsetne4(__a, __b); 1022 } 1023 __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { 1024 return __nv_vsub2(__a, __b); 1025 } 1026 __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { 1027 return __nv_vsub4(__a, __b); 1028 } 1029 __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { 1030 return __nv_vsubss2(__a, __b); 1031 } 1032 __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { 1033 return __nv_vsubss4(__a, __b); 1034 } 1035 __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { 1036 return __nv_vsubus2(__a, __b); 1037 } 1038 __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { 1039 return __nv_vsubus4(__a, __b); 1040 } 1041 #else // CUDA_VERSION >= 9020 1042 // CUDA no longer provides inline assembly (or bitcode) implementation of these 1043 // functions, so we have to reimplement them. The implementation is naive and is 1044 // not optimized for performance. 1045 1046 // Helper function to convert N-bit boolean subfields into all-0 or all-1. 1047 // E.g.
// __bool2mask widens per-lane 0/1 booleans into per-lane 0x00.../0xff...
// masks via (a << shift) - a. The __vabs*/__vabsdiff*/__vabsss*/__vadd2
// reimplementations use the PTX "video" instructions directly: abs(x) is
// computed as vabsdiff against 0, ".sat" variants saturate, and the last %3
// operand (always 0) is the instruction's optional accumulate input.
__bool2mask(0x01000100,8) -> 0xff00ff00 1048 // __bool2mask(0x00010000,16) -> 0xffff0000 1049 __DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) { 1050 return (__a << shift) - __a; 1051 } 1052 __DEVICE__ unsigned int __vabs2(unsigned int __a) { 1053 unsigned int r; 1054 asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" 1055 : "=r"(r) 1056 : "r"(__a), "r"(0), "r"(0)); 1057 return r; 1058 } 1059 __DEVICE__ unsigned int __vabs4(unsigned int __a) { 1060 unsigned int r; 1061 asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" 1062 : "=r"(r) 1063 : "r"(__a), "r"(0), "r"(0)); 1064 return r; 1065 } 1066 __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { 1067 unsigned int r; 1068 asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" 1069 : "=r"(r) 1070 : "r"(__a), "r"(__b), "r"(0)); 1071 return r; 1072 } 1073 1074 __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { 1075 unsigned int r; 1076 asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" 1077 : "=r"(r) 1078 : "r"(__a), "r"(__b), "r"(0)); 1079 return r; 1080 } 1081 __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { 1082 unsigned int r; 1083 asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" 1084 : "=r"(r) 1085 : "r"(__a), "r"(__b), "r"(0)); 1086 return r; 1087 } 1088 __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { 1089 unsigned int r; 1090 asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" 1091 : "=r"(r) 1092 : "r"(__a), "r"(__b), "r"(0)); 1093 return r; 1094 } 1095 __DEVICE__ unsigned int __vabsss2(unsigned int __a) { 1096 unsigned int r; 1097 asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" 1098 : "=r"(r) 1099 : "r"(__a), "r"(0), "r"(0)); 1100 return r; 1101 } 1102 __DEVICE__ unsigned int __vabsss4(unsigned int __a) { 1103 unsigned int r; 1104 asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" 1105 : "=r"(r) 1106 : "r"(__a), "r"(0), "r"(0)); 1107 return r; 1108 } 1109 __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { 1110 unsigned int r; 1111
// Per-lane add (vadd2/vadd4, with .s32 ".sat" and .u32 ".sat" saturating
// forms) and per-lane rounded average (vavrg2/vavrg4, signed and unsigned),
// all emitted as single PTX video instructions with a zero accumulate
// operand.
asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1112 return r; 1113 } 1114 __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { 1115 unsigned int r; 1116 asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1117 return r; 1118 } 1119 __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { 1120 unsigned int r; 1121 asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" 1122 : "=r"(r) 1123 : "r"(__a), "r"(__b), "r"(0)); 1124 return r; 1125 } 1126 __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { 1127 unsigned int r; 1128 asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" 1129 : "=r"(r) 1130 : "r"(__a), "r"(__b), "r"(0)); 1131 return r; 1132 } 1133 __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { 1134 unsigned int r; 1135 asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" 1136 : "=r"(r) 1137 : "r"(__a), "r"(__b), "r"(0)); 1138 return r; 1139 } 1140 __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { 1141 unsigned int r; 1142 asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" 1143 : "=r"(r) 1144 : "r"(__a), "r"(__b), "r"(0)); 1145 return r; 1146 } 1147 __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { 1148 unsigned int r; 1149 asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1150 return r; 1151 } 1152 __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { 1153 unsigned int r; 1154 asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1155 return r; 1156 } 1157 __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { 1158 unsigned int r; 1159 asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1160 return r; 1161 } 1162 __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { 1163 unsigned int r; 1164 asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1165 return r; 1166 } 1167 __DEVICE__
// Comparison pairs: each __vsetXX* emits a PTX vset2/vset4 instruction
// (producing per-lane 0/1 results) and the matching __vcmpXX* expands those
// into per-lane all-ones/all-zeros masks with __bool2mask (shift 16 for
// 16-bit lanes, 8 for 8-bit lanes). This group covers eq and signed/unsigned
// ge/gt.
unsigned int __vseteq2(unsigned int __a, unsigned int __b) { 1168 unsigned int r; 1169 asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1170 return r; 1171 } 1172 __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { 1173 return __bool2mask(__vseteq2(__a, __b), 16); 1174 } 1175 __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { 1176 unsigned int r; 1177 asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1178 return r; 1179 } 1180 __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { 1181 return __bool2mask(__vseteq4(__a, __b), 8); 1182 } 1183 __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { 1184 unsigned int r; 1185 asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1186 return r; 1187 } 1188 __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { 1189 return __bool2mask(__vsetges2(__a, __b), 16); 1190 } 1191 __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { 1192 unsigned int r; 1193 asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1194 return r; 1195 } 1196 __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { 1197 return __bool2mask(__vsetges4(__a, __b), 8); 1198 } 1199 __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { 1200 unsigned int r; 1201 asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1202 return r; 1203 } 1204 __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { 1205 return __bool2mask(__vsetgeu2(__a, __b), 16); 1206 } 1207 __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { 1208 unsigned int r; 1209 asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1210 return r; 1211 } 1212 __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { 1213 return __bool2mask(__vsetgeu4(__a, __b),
// Same vset/vcmp pairing as above, continued for signed/unsigned gt and
// signed le comparisons.
8); 1214 } 1215 __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { 1216 unsigned int r; 1217 asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1218 return r; 1219 } 1220 __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { 1221 return __bool2mask(__vsetgts2(__a, __b), 16); 1222 } 1223 __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { 1224 unsigned int r; 1225 asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1226 return r; 1227 } 1228 __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { 1229 return __bool2mask(__vsetgts4(__a, __b), 8); 1230 } 1231 __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { 1232 unsigned int r; 1233 asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1234 return r; 1235 } 1236 __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { 1237 return __bool2mask(__vsetgtu2(__a, __b), 16); 1238 } 1239 __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { 1240 unsigned int r; 1241 asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1242 return r; 1243 } 1244 __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { 1245 return __bool2mask(__vsetgtu4(__a, __b), 8); 1246 } 1247 __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { 1248 unsigned int r; 1249 asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1250 return r; 1251 } 1252 __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { 1253 return __bool2mask(__vsetles2(__a, __b), 16); 1254 } 1255 __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { 1256 unsigned int r; 1257 asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1258 return r; 1259 } 1260 __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { 1261 return
// Same vset/vcmp pairing, continued for unsigned le and signed/unsigned lt
// comparisons.
__bool2mask(__vsetles4(__a, __b), 8); 1262 } 1263 __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { 1264 unsigned int r; 1265 asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1266 return r; 1267 } 1268 __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { 1269 return __bool2mask(__vsetleu2(__a, __b), 16); 1270 } 1271 __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { 1272 unsigned int r; 1273 asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1274 return r; 1275 } 1276 __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { 1277 return __bool2mask(__vsetleu4(__a, __b), 8); 1278 } 1279 __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { 1280 unsigned int r; 1281 asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1282 return r; 1283 } 1284 __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { 1285 return __bool2mask(__vsetlts2(__a, __b), 16); 1286 } 1287 __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { 1288 unsigned int r; 1289 asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1290 return r; 1291 } 1292 __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { 1293 return __bool2mask(__vsetlts4(__a, __b), 8); 1294 } 1295 __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { 1296 unsigned int r; 1297 asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1298 return r; 1299 } 1300 __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { 1301 return __bool2mask(__vsetltu2(__a, __b), 16); 1302 } 1303 __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { 1304 unsigned int r; 1305 asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1306 return r; 1307 } 1308 __DEVICE__ unsigned int __vcmpltu4(unsigned int __a,
// Final vset/vcmp pair (ne), then __vhaddu2/__vhaddu4 computed without a
// hardware instruction using the carry-free identity documented inline
// ((a + b) / 2 == ((a ^ b) >> 1) + (a & b)), masking off the bits the shift
// moves across lane boundaries (0x8000/0x80 per lane). __vmaxs2 begins here;
// its both-low-lanes-negative special case is explained by the inline
// ptxas-bug comment and completes on the next chunk line.
unsigned int __b) { 1309 return __bool2mask(__vsetltu4(__a, __b), 8); 1310 } 1311 __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { 1312 unsigned int r; 1313 asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1314 return r; 1315 } 1316 __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { 1317 return __bool2mask(__vsetne2(__a, __b), 16); 1318 } 1319 __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { 1320 unsigned int r; 1321 asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1322 return r; 1323 } 1324 __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { 1325 return __bool2mask(__vsetne4(__a, __b), 8); 1326 } 1327 1328 // Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086 1329 // (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) => 1330 // (a + b) / 2 = ((a ^ b) >> 1) + (a & b) 1331 // To operate on multiple sub-elements we need to make sure to mask out bits 1332 // that crossed over into adjacent elements during the shift. 1333 __DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { 1334 return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b); 1335 } 1336 __DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { 1337 return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); 1338 } 1339 1340 __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { 1341 unsigned int r; 1342 if ((__a & 0x8000) && (__b & 0x8000)) { 1343 // Work around a bug in ptxas which produces invalid result if low element 1344 // is negative.
// Tail of __vmaxs2: when both low lanes are negative it avoids the buggy
// vmax2 instruction by building the result from the per-lane gt mask
// (select __a where greater, __b elsewhere). The remaining min/max variants
// use vmax/vmin video instructions directly, and __vsads2/__vsads4 use
// vabsdiff with the ".add" modifier to accumulate lane results into a
// single sum.
1345 unsigned mask = __vcmpgts2(__a, __b); 1346 r = (__a & mask) | (__b & ~mask); 1347 } else { 1348 asm("vmax2.s32.s32.s32 %0,%1,%2,%3;" 1349 : "=r"(r) 1350 : "r"(__a), "r"(__b), "r"(0)); 1351 } 1352 return r; 1353 } 1354 __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { 1355 unsigned int r; 1356 asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1357 return r; 1358 } 1359 __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { 1360 unsigned int r; 1361 asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1362 return r; 1363 } 1364 __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { 1365 unsigned int r; 1366 asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1367 return r; 1368 } 1369 __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { 1370 unsigned int r; 1371 asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1372 return r; 1373 } 1374 __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { 1375 unsigned int r; 1376 asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1377 return r; 1378 } 1379 __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { 1380 unsigned int r; 1381 asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1382 return r; 1383 } 1384 __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { 1385 unsigned int r; 1386 asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1387 return r; 1388 } 1389 __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { 1390 unsigned int r; 1391 asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" 1392 : "=r"(r) 1393 : "r"(__a), "r"(__b), "r"(0)); 1394 return r; 1395 } 1396 __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { 1397 unsigned int r; 1398 asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
// Unsigned SAD (vabsdiff ".add" accumulating form), per-lane subtraction
// (vsub2/vsub4, plus the signed/unsigned ".sat" saturating forms), and
// per-lane negation defined arithmetically as 0 - a via the matching
// subtract (saturating negate via the saturating subtract).
1399 : "=r"(r) 1400 : "r"(__a), "r"(__b), "r"(0)); 1401 return r; 1402 } 1403 __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { 1404 unsigned int r; 1405 asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" 1406 : "=r"(r) 1407 : "r"(__a), "r"(__b), "r"(0)); 1408 return r; 1409 } 1410 __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { 1411 unsigned int r; 1412 asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" 1413 : "=r"(r) 1414 : "r"(__a), "r"(__b), "r"(0)); 1415 return r; 1416 } 1417 1418 __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { 1419 unsigned int r; 1420 asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1421 return r; 1422 } 1423 __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } 1424 1425 __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { 1426 unsigned int r; 1427 asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); 1428 return r; 1429 } 1430 __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); } 1431 __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { 1432 unsigned int r; 1433 asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" 1434 : "=r"(r) 1435 : "r"(__a), "r"(__b), "r"(0)); 1436 return r; 1437 } 1438 __DEVICE__ unsigned int __vnegss2(unsigned int __a) { 1439 return __vsubss2(0, __a); 1440 } 1441 __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { 1442 unsigned int r; 1443 asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" 1444 : "=r"(r) 1445 : "r"(__a), "r"(__b), "r"(0)); 1446 return r; 1447 } 1448 __DEVICE__ unsigned int __vnegss4(unsigned int __a) { 1449 return __vsubss4(0, __a); 1450 } 1451 __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { 1452 unsigned int r; 1453 asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" 1454 : "=r"(r) 1455 : "r"(__a), "r"(__b), "r"(0)); 1456 return r; 1457 } 1458 __DEVICE__ unsigned int __vsubus4(unsigned int __a,
// Tail of __vsubus4, then the file epilogue: clock()/clock64() read the PTX
// %clock/%clock64 special registers via NVVM builtins (clock() is omitted
// for OpenMP, where the real clock_t from <time.h> must be used, per the
// inline comment); device-side memcpy/memset forward to the compiler
// builtins (omitted for OpenMP math-resolution includes, per the inline
// comment). Finally __DEVICE__ is restored and the include guard closed.
unsigned int __b) { 1459 unsigned int r; 1460 asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" 1461 : "=r"(r) 1462 : "r"(__a), "r"(__b), "r"(0)); 1463 return r; 1464 } 1465 #endif // CUDA_VERSION >= 9020 1466 1467 // For OpenMP we require the user to include <time.h> as we need to know what 1468 // clock_t is on the system. 1469 #ifndef __OPENMP_NVPTX__ 1470 __DEVICE__ /* clock_t= */ int clock() { return __nvvm_read_ptx_sreg_clock(); } 1471 #endif 1472 __DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); } 1473 1474 // These functions shouldn't be declared when including this header 1475 // for math function resolution purposes. 1476 #ifndef __OPENMP_NVPTX__ 1477 __DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) { 1478 return __builtin_memcpy(__a, __b, __c); 1479 } 1480 __DEVICE__ void *memset(void *__a, int __b, size_t __c) { 1481 return __builtin_memset(__a, __b, __c); 1482 } 1483 #endif 1484 1485 #pragma pop_macro("__DEVICE__") 1486 #endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ 1487