#include <clc/clc.h>

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[2*offset])) = vec; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2*) (&mem[3*offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
    mem[3 * offset + 2] = vec.s2; \
  } \
\
  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4*) (&mem[4*offset])) = vec; \
  } \
\
  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8*) (&mem[8*offset])) = vec; \
  } \
\
  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16*) (&mem[16*offset])) = vec; \
  }

#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global)

VSTORE_ADDR_SPACES(char)
VSTORE_ADDR_SPACES(uchar)
VSTORE_ADDR_SPACES(short)
VSTORE_ADDR_SPACES(ushort)
VSTORE_ADDR_SPACES(int)
VSTORE_ADDR_SPACES(uint)
VSTORE_ADDR_SPACES(long)
VSTORE_ADDR_SPACES(ulong)
VSTORE_ADDR_SPACES(float)

#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
VSTORE_ADDR_SPACES(double)
#endif

#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
VSTORE_ADDR_SPACES(half)
#endif

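/*
 * The less_aligned_* typedefs above relax the natural vector alignment to
 * that of the scalar element, which is what makes the cast-and-assign in
 * vstoreN legal for merely element-aligned pointers. For illustration
 * (this expansion is approximate and not part of the library),
 * VSTORE_VECTORIZE(int, __global) defines among others:
 *
 *   typedef int2 less_aligned___globalint2
 *       __attribute__ ((aligned (sizeof(int))));
 *   _CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset,
 *                                       __global int *mem) {
 *     *((__global less_aligned___globalint2*) (&mem[2*offset])) = vec;
 *   }
 */
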
/* The vstore_half functions are legal even without cl_khr_fp16. */
#if __clang_major__ < 6
#define DECLARE_HELPER(STYPE, AS, builtin) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
#else
#define DECLARE_HELPER(STYPE, AS, __builtin) \
_CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \
{ \
  __builtin(s, d); \
}
#endif

DECLARE_HELPER(float, __private, __builtin_store_halff);
DECLARE_HELPER(float, __global, __builtin_store_halff);
DECLARE_HELPER(float, __local, __builtin_store_halff);

#ifdef cl_khr_fp64
DECLARE_HELPER(double, __private, __builtin_store_half);
DECLARE_HELPER(double, __global, __builtin_store_half);
DECLARE_HELPER(double, __local, __builtin_store_half);
#endif

#define VEC_STORE1(STYPE, AS, val, ROUNDF) __clc_vstore_half_##STYPE##_helper##AS(ROUNDF(val), &mem[offset++]);

#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
  VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
  VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
  VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)

#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, ROUNDF) \
  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= VEC_SIZE; \
    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
  } \
  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= OFFSET; \
    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) \
  }

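/*
 * The rounding helpers below operate on the source type before the store
 * builtin performs the actual conversion. Their magic numbers all come
 * from the binary16 format: half has 10 mantissa bits, so converting from
 * float (23 bits) rounds away the lower 13 and converting from double
 * (52 bits) rounds away the lower 42; exp < -14 marks inputs below the
 * smallest normal half, and 65504 is the largest finite half value.
 */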
_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x)
{
  return x;
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x)
{
  /* Remove the lower 13 bits to make sure the number is rounded down. */
  int mask = 0xffffe000;
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  /* Denormals cannot be flushed, and they use a different bit for rounding. */
  if (exp < -14)
    mask <<= min(-(exp + 14), 10);
  /* RTZ does not produce Inf for large numbers. */
  if (fabs(x) > 65504.0f && !isinf(x))
    return copysign(65504.0f, x);
  /* Handle the NaN corner case. */
  if (isnan(x))
    return x;
  return as_float(as_uint(x) & mask);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x)
{
  const float inf = copysign(INFINITY, x);
  /* Set the lower 13 bits. */
  int mask = (1 << 13) - 1;
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  /* Denormals cannot be flushed, and they use a different bit for rounding. */
  if (exp < -14)
    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
  /* Handle the NaN corner case. */
  if (isnan(x))
    return x;
  const float next = nextafter(as_float(as_uint(x) | mask), inf);
  return ((as_uint(x) & mask) == 0) ? x : next;
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x)
{
  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
{
  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
{
  /* Mantissa + implicit bit. */
  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  int shift = 13;
  if (exp < -14) {
    /* The default assumes the lower 13 bits are rounded away,
     * but it can be more for denormals.
     * Shifting beyond last == 0b and guard/round == 00b is not necessary. */
    shift += min(-(exp + 14), 15);
  }
  int mask = (1 << shift) - 1;
  const uint grs = mantissa & mask;
  const uint last = mantissa & (1 << shift);
  /* The IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
   * exp > 15 should round to Inf. */
  bool roundup = (grs > (1 << (shift - 1))) ||
                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
  return roundup ? __clc_rti(x) : __clc_rtz(x);
}

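/*
 * A worked instance of the round-to-nearest-even rule above (float case,
 * shift == 13): the halfway point is grs == 1 << 12 == 0x1000. A value
 * whose low 13 bits exceed 0x1000 always rounds away from zero via
 * __clc_rti; an exact tie (grs == 0x1000) rounds away only when bit 13
 * ("last") is set, so ties land on an even half mantissa.
 */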
#ifdef cl_khr_fp64
_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
{
  return x;
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x)
{
  /* Remove the lower 42 bits to make sure the number is rounded down. */
  ulong mask = 0xfffffc0000000000UL;
  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
  /* Denormals cannot be flushed, and they use a different bit for rounding. */
  if (exp < -14)
    mask <<= min(-(exp + 14), 10);
  /* RTZ does not produce Inf for large numbers. */
  if (fabs(x) > 65504.0 && !isinf(x))
    return copysign(65504.0, x);
  /* Handle the NaN corner case. */
  if (isnan(x))
    return x;
  return as_double(as_ulong(x) & mask);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x)
{
  const double inf = copysign((double)INFINITY, x);
  /* Set the lower 42 bits. */
  long mask = (1UL << 42UL) - 1UL;
  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
  /* Denormals cannot be flushed, and they use a different bit for rounding. */
  if (exp < -14)
    mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
  /* Handle the NaN corner case. */
  if (isnan(x))
    return x;
  const double next = nextafter(as_double(as_ulong(x) | mask), inf);
  return ((as_ulong(x) & mask) == 0) ? x : next;
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x)
{
  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : __clc_rti(x);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
{
  return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : __clc_rtz(x);
}
_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
{
  /* Mantissa + implicit bit. */
  const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
  const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
  int shift = 42;
  if (exp < -14) {
    /* The default assumes the lower 42 bits are rounded away,
     * but it can be more for denormals.
     * Shifting beyond last == 0b and guard/round == 00b is not necessary. */
    shift += min(-(exp + 14), 15);
  }
  ulong mask = (1UL << shift) - 1UL;
  const ulong grs = mantissa & mask;
  const ulong last = mantissa & (1UL << shift);
  /* The IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
   * exp > 15 should round to Inf. */
  bool roundup = (grs > (1UL << (shift - 1UL))) ||
                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
  return roundup ? __clc_rti(x) : __clc_rtz(x);
}
#endif

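/*
 * The macros below stamp out the vstore_half##SUFFIX and vstorea_half##SUFFIX
 * overloads for every rounding mode: the unsuffixed variant passes the value
 * through __clc_noop and leaves rounding to the store builtin, while
 * _rtz/_rtn/_rtp/_rte pre-round with the helpers above. __FUNC takes separate
 * VEC_SIZE and OFFSET strides because the aligned vstorea_half3 stores at
 * offset * 4 rather than offset * 3 per the OpenCL spec; the concrete values
 * come from vstore_half.inc, included once per type via gentype.inc.
 */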
#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
  __FUNC(SUFFIX ## _rtz, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
  __FUNC(SUFFIX ## _rtn, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
  __FUNC(SUFFIX ## _rtp, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
  __FUNC(SUFFIX ## _rte, VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)

#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)

#define __CLC_BODY "vstore_half.inc"
#include <clc/math/gentype.inc>
#undef __CLC_BODY
#undef FUNC
#undef __XFUNC
#undef __FUNC
#undef VEC_STORE16
#undef VEC_STORE8
#undef VEC_STORE4
#undef VEC_STORE3
#undef VEC_STORE2
#undef VEC_STORE1
#undef DECLARE_HELPER
#undef VSTORE_ADDR_SPACES
#undef VSTORE_VECTORIZE
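
/*
 * Illustrative usage from OpenCL C (a hypothetical kernel, not part of this
 * file):
 *
 *   __kernel void pack(__global const float4 *in, __global half *out) {
 *     size_t gid = get_global_id(0);
 *     vstore_half4_rte(in[gid], gid, out);
 *   }
 *
 * This rounds each lane of in[gid] to the nearest representable half
 * (ties to even) and writes the results to out[4 * gid .. 4 * gid + 3].
 */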