xref: /llvm-project/libclc/generic/lib/shared/vstore.cl (revision 1c570566c39a62fa620085bff03f715c965cfaee)
151441f80STom Stellard#include <clc/clc.h>
251441f80STom Stellard
351441f80STom Stellard#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
451441f80STom Stellard
/*
 * Emits the vstore2/3/4/8/16 overloads for one element type in one address
 * space.
 *
 *   PRIM_TYPE  - scalar element type of the vector being stored
 *   ADDR_SPACE - address-space qualifier of the destination pointer
 *
 * Each overload writes `vec` at element index N * offset of `mem`. The store
 * goes through a vector typedef whose alignment attribute is set to
 * sizeof(PRIM_TYPE) instead of the vector type's own alignment. vstore3 writes
 * lanes 0-1 through the 2-wide typedef and lane 2 as a plain scalar store.
 */
#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
      __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
      __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
      __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
      __attribute__ ((aligned (sizeof(PRIM_TYPE)))); \
\
  _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 *dst = \
        (ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 *)(mem + 2 * offset); \
    *dst = vec; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 *dst = \
        (ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 *)(mem + 3 * offset); \
    *dst = vec.s01; \
    mem[3 * offset + 2] = vec.s2; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 *dst = \
        (ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 *)(mem + 4 * offset); \
    *dst = vec; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 *dst = \
        (ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 *)(mem + 8 * offset); \
    *dst = vec; \
  } \
\
  _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
    ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 *dst = \
        (ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 *)(mem + 16 * offset); \
    *dst = vec; \
  }

/*
 * Expands VSTORE_VECTORIZE for PRIM_TYPE once per supported address space:
 * __private, __local and __global.
 */
#define VSTORE_ADDR_SPACES(PRIM_TYPE) \
  VSTORE_VECTORIZE(PRIM_TYPE, __private) \
  VSTORE_VECTORIZE(PRIM_TYPE, __local) \
  VSTORE_VECTORIZE(PRIM_TYPE, __global)

367ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(char)
377ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(uchar)
387ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(short)
397ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(ushort)
407ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(int)
417ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(uint)
427ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(long)
437ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(ulong)
447ab2d0bdSJan VeselyVSTORE_ADDR_SPACES(float)
4551441f80STom Stellard
4651441f80STom Stellard
4751441f80STom Stellard#ifdef cl_khr_fp64
4851441f80STom Stellard#pragma OPENCL EXTENSION cl_khr_fp64 : enable
4951441f80STom Stellard    VSTORE_ADDR_SPACES(double)
5051441f80STom Stellard#endif
517ab2d0bdSJan Vesely
52661ac03aSJan Vesely#ifdef cl_khr_fp16
53661ac03aSJan Vesely#pragma OPENCL EXTENSION cl_khr_fp16 : enable
54661ac03aSJan Vesely    VSTORE_ADDR_SPACES(half)
550a5aac3fSJan Vesely#endif
560a5aac3fSJan Vesely
57661ac03aSJan Vesely/* vstore_half are legal even without cl_khr_fp16 */
58661ac03aSJan Vesely#if __clang_major__ < 6
59661ac03aSJan Vesely#define DECLARE_HELPER(STYPE, AS, builtin) void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
60661ac03aSJan Vesely#else
61661ac03aSJan Vesely#define DECLARE_HELPER(STYPE, AS, __builtin) \
6287036d27SJan Vesely_CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) \
63661ac03aSJan Vesely{ \
64661ac03aSJan Vesely	__builtin(s, d); \
65661ac03aSJan Vesely}
66661ac03aSJan Vesely#endif
67661ac03aSJan Vesely
68661ac03aSJan VeselyDECLARE_HELPER(float, __private, __builtin_store_halff);
69661ac03aSJan VeselyDECLARE_HELPER(float, __global, __builtin_store_halff);
70661ac03aSJan VeselyDECLARE_HELPER(float, __local, __builtin_store_halff);
71661ac03aSJan Vesely
72661ac03aSJan Vesely#ifdef cl_khr_fp64
73661ac03aSJan VeselyDECLARE_HELPER(double, __private, __builtin_store_half);
74661ac03aSJan VeselyDECLARE_HELPER(double, __global, __builtin_store_half);
75661ac03aSJan VeselyDECLARE_HELPER(double, __local, __builtin_store_half);
76661ac03aSJan Vesely#endif
770a5aac3fSJan Vesely
/*
 * Element-wise half stores.
 *
 * VEC_STORE1 rounds one scalar with ROUNDF, hands it to the helper for
 * (STYPE, AS) and post-increments `offset`. It relies on `mem` and `offset`
 * being in scope where the macro is expanded, and it carries its own trailing
 * semicolon. The wider variants split the vector and recurse, so the lanes
 * are written in ascending order.
 */
#define VEC_STORE1(STYPE, AS, VEC, ROUNDF) \
	__clc_vstore_half_##STYPE##_helper##AS (ROUNDF(VEC), mem + offset++);

#define VEC_STORE2(STYPE, AS, VEC, ROUNDF) \
	VEC_STORE1(STYPE, AS, VEC.s0, ROUNDF) \
	VEC_STORE1(STYPE, AS, VEC.s1, ROUNDF)

#define VEC_STORE3(STYPE, AS, VEC, ROUNDF) \
	VEC_STORE1(STYPE, AS, VEC.s0, ROUNDF) \
	VEC_STORE1(STYPE, AS, VEC.s1, ROUNDF) \
	VEC_STORE1(STYPE, AS, VEC.s2, ROUNDF)

#define VEC_STORE4(STYPE, AS, VEC, ROUNDF) \
	VEC_STORE2(STYPE, AS, VEC.lo, ROUNDF) \
	VEC_STORE2(STYPE, AS, VEC.hi, ROUNDF)

#define VEC_STORE8(STYPE, AS, VEC, ROUNDF) \
	VEC_STORE4(STYPE, AS, VEC.lo, ROUNDF) \
	VEC_STORE4(STYPE, AS, VEC.hi, ROUNDF)

#define VEC_STORE16(STYPE, AS, VEC, ROUNDF) \
	VEC_STORE8(STYPE, AS, VEC.lo, ROUNDF) \
	VEC_STORE8(STYPE, AS, VEC.hi, ROUNDF)
960a5aac3fSJan Vesely
/*
 * Defines vstore_half<NAME_SUFFIX> and vstorea_half<NAME_SUFFIX> for one
 * combination of vector width, source type and address space.
 *
 *   NAME_SUFFIX - text appended to the function names
 *   WIDTH       - number of lanes; selects VEC_STORE<WIDTH> and scales the
 *                 offset of vstore_half
 *   STRIDE      - element stride used to scale the offset of vstorea_half
 *   TYPE        - type of the `vec` argument
 *   STYPE       - scalar source type, selects the store helper
 *   AS          - address space of `mem`
 *   ROUNDF      - function applied to every lane before it is stored
 *
 * The two functions differ only in the factor applied to `offset`.
 */
#define __FUNC(NAME_SUFFIX, WIDTH, STRIDE, TYPE, STYPE, AS, ROUNDF) \
  _CLC_OVERLOAD _CLC_DEF void vstore_half##NAME_SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= WIDTH; \
    VEC_STORE##WIDTH(STYPE, AS, vec, ROUNDF) \
  } \
  _CLC_OVERLOAD _CLC_DEF void vstorea_half##NAME_SUFFIX(TYPE vec, size_t offset, AS half *mem) { \
    offset *= STRIDE; \
    VEC_STORE##WIDTH(STYPE, AS, vec, ROUNDF) \
  }
106ad867272SJan Vesely
107187ec005SJan Vesely_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x)
108187ec005SJan Vesely{
109187ec005SJan Vesely	return x;
110187ec005SJan Vesely}
111d526a2b6SJan Vesely_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x)
112d526a2b6SJan Vesely{
113d526a2b6SJan Vesely	/* Remove lower 13 bits to make sure the number is rounded down */
114d526a2b6SJan Vesely	int mask = 0xffffe000;
115d526a2b6SJan Vesely	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
116d526a2b6SJan Vesely	/* Denormals cannot be flushed, and they use different bit for rounding */
117d526a2b6SJan Vesely	if (exp < -14)
118d526a2b6SJan Vesely		mask <<= min(-(exp + 14), 10);
119d526a2b6SJan Vesely	/* RTZ does not produce Inf for large numbers */
120d526a2b6SJan Vesely	if (fabs(x) > 65504.0f && !isinf(x))
121d526a2b6SJan Vesely		return copysign(65504.0f, x);
122d526a2b6SJan Vesely	/* Handle nan corner case */
123d526a2b6SJan Vesely	if (isnan(x))
124d526a2b6SJan Vesely		return x;
125d526a2b6SJan Vesely	return as_float(as_uint(x) & mask);
126d526a2b6SJan Vesely}
1272655312cSJan Vesely_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x)
1282655312cSJan Vesely{
1292655312cSJan Vesely	const float inf = copysign(INFINITY, x);
1302655312cSJan Vesely	/* Set lower 13 bits */
1312655312cSJan Vesely	int mask = (1 << 13) - 1;
1322655312cSJan Vesely	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
1332655312cSJan Vesely	/* Denormals cannot be flushed, and they use different bit for rounding */
1342655312cSJan Vesely	if (exp < -14)
1352655312cSJan Vesely		mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
1362655312cSJan Vesely	/* Handle nan corner case */
1372655312cSJan Vesely	if (isnan(x))
1382655312cSJan Vesely		return x;
1392655312cSJan Vesely	const float next = nextafter(as_float(as_uint(x) | mask), inf);
1402655312cSJan Vesely	return ((as_uint(x) & mask) == 0) ? x : next;
1412655312cSJan Vesely}
1422655312cSJan Vesely_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x)
1432655312cSJan Vesely{
1442655312cSJan Vesely	return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
1452655312cSJan Vesely}
146f2d876aeSJan Vesely_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x)
147f2d876aeSJan Vesely{
148f2d876aeSJan Vesely	return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
149f2d876aeSJan Vesely}
150*1c570566SJan Vesely_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x)
151*1c570566SJan Vesely{
152*1c570566SJan Vesely	/* Mantisa + implicit bit */
153*1c570566SJan Vesely	const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
154*1c570566SJan Vesely	const int exp = (as_uint(x) >> 23 & 0xff) - 127;
155*1c570566SJan Vesely	int shift = 13;
156*1c570566SJan Vesely	if (exp < -14) {
157*1c570566SJan Vesely		/* The default assumes lower 13 bits are rounded,
158*1c570566SJan Vesely		 * but it might be more for denormals.
159*1c570566SJan Vesely		 * Shifting beyond last == 0b, and qr == 00b is not necessary */
160*1c570566SJan Vesely		shift += min(-(exp + 14), 15);
161*1c570566SJan Vesely	}
162*1c570566SJan Vesely	int mask = (1 << shift) - 1;
163*1c570566SJan Vesely	const uint grs = mantissa & mask;
164*1c570566SJan Vesely	const uint last = mantissa & (1 << shift);
165*1c570566SJan Vesely	/* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
166*1c570566SJan Vesely	 * exp > 15 should round to inf. */
167*1c570566SJan Vesely	bool roundup = (grs > (1 << (shift - 1))) ||
168*1c570566SJan Vesely		(grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
169*1c570566SJan Vesely	return roundup ? __clc_rti(x) : __clc_rtz(x);
170*1c570566SJan Vesely}
1712655312cSJan Vesely
172187ec005SJan Vesely#ifdef cl_khr_fp64
173187ec005SJan Vesely_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x)
174187ec005SJan Vesely{
175187ec005SJan Vesely	return x;
176187ec005SJan Vesely}
177d526a2b6SJan Vesely_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x)
178d526a2b6SJan Vesely{
179d526a2b6SJan Vesely	/* Remove lower 42 bits to make sure the number is rounded down */
180d526a2b6SJan Vesely	ulong mask = 0xfffffc0000000000UL;
181d526a2b6SJan Vesely	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
182d526a2b6SJan Vesely	/* Denormals cannot be flushed, and they use different bit for rounding */
183d526a2b6SJan Vesely	if (exp < -14)
184d526a2b6SJan Vesely		mask <<= min(-(exp + 14), 10);
185d526a2b6SJan Vesely	/* RTZ does not produce Inf for large numbers */
186d526a2b6SJan Vesely	if (fabs(x) > 65504.0 && !isinf(x))
187d526a2b6SJan Vesely		return copysign(65504.0, x);
188d526a2b6SJan Vesely	/* Handle nan corner case */
189d526a2b6SJan Vesely	if (isnan(x))
190d526a2b6SJan Vesely		return x;
191d526a2b6SJan Vesely	return as_double(as_ulong(x) & mask);
192d526a2b6SJan Vesely}
1932655312cSJan Vesely_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x)
1942655312cSJan Vesely{
1952655312cSJan Vesely	const double inf = copysign((double)INFINITY, x);
1962655312cSJan Vesely	/* Set lower 42 bits */
1972655312cSJan Vesely	long mask = (1UL << 42UL) - 1UL;
1982655312cSJan Vesely	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
1992655312cSJan Vesely	/* Denormals cannot be flushed, and they use different bit for rounding */
2002655312cSJan Vesely	if (exp < -14)
2012655312cSJan Vesely		mask = (1UL << (42UL + min(-(exp + 14), 10))) - 1;
2022655312cSJan Vesely	/* Handle nan corner case */
2032655312cSJan Vesely	if (isnan(x))
2042655312cSJan Vesely		return x;
2052655312cSJan Vesely	const double next = nextafter(as_double(as_ulong(x) | mask), inf);
2062655312cSJan Vesely	return ((as_ulong(x) & mask) == 0) ? x : next;
2072655312cSJan Vesely}
2082655312cSJan Vesely_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x)
2092655312cSJan Vesely{
2102655312cSJan Vesely	return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x) : __clc_rti(x);
2112655312cSJan Vesely}
212f2d876aeSJan Vesely_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x)
213f2d876aeSJan Vesely{
214f2d876aeSJan Vesely	return ((as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x) : __clc_rtz(x);
215f2d876aeSJan Vesely}
216*1c570566SJan Vesely_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x)
217*1c570566SJan Vesely{
218*1c570566SJan Vesely	/* Mantisa + implicit bit */
219*1c570566SJan Vesely	const ulong mantissa = (as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
220*1c570566SJan Vesely	const int exp = (as_ulong(x) >> 52 & 0x7ff) - 1023;
221*1c570566SJan Vesely	int shift = 42;
222*1c570566SJan Vesely	if (exp < -14) {
223*1c570566SJan Vesely		/* The default assumes lower 13 bits are rounded,
224*1c570566SJan Vesely		 * but it might be more for denormals.
225*1c570566SJan Vesely		 * Shifting beyond last == 0b, and qr == 00b is not necessary */
226*1c570566SJan Vesely		shift += min(-(exp + 14), 15);
227*1c570566SJan Vesely	}
228*1c570566SJan Vesely	ulong mask = (1UL << shift) - 1UL;
229*1c570566SJan Vesely	const ulong grs = mantissa & mask;
230*1c570566SJan Vesely	const ulong last = mantissa & (1UL << shift);
231*1c570566SJan Vesely	/* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
232*1c570566SJan Vesely	 * exp > 15 should round to inf. */
233*1c570566SJan Vesely	bool roundup = (grs > (1UL << (shift - 1UL))) ||
234*1c570566SJan Vesely		(grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
235*1c570566SJan Vesely	return roundup ? __clc_rti(x) : __clc_rtz(x);
236*1c570566SJan Vesely}
237187ec005SJan Vesely#endif
238187ec005SJan Vesely
/*
 * Expands __FUNC once per rounding flavour. The unsuffixed variant uses
 * __clc_noop (no pre-rounding); _rtz, _rtn, _rtp and _rte each use the
 * matching __clc_* pre-rounding function.
 */
#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
	__FUNC(SUFFIX,          VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_noop) \
	__FUNC(SUFFIX ## _rtz,  VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtz) \
	__FUNC(SUFFIX ## _rtn,  VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtn) \
	__FUNC(SUFFIX ## _rtp,  VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rtp) \
	__FUNC(SUFFIX ## _rte,  VEC_SIZE, OFFSET, TYPE, STYPE, AS, __clc_rte)
245187ec005SJan Vesely
/*
 * Entry point used by the included body file. It forwards every argument to
 * __XFUNC unchanged; the extra layer presumably exists so that macro
 * arguments are expanded before __XFUNC/__FUNC paste them with ##.
 */
#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
	__XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS)
248ad867272SJan Vesely
249ad867272SJan Vesely#define __CLC_BODY "vstore_half.inc"
250ad867272SJan Vesely#include <clc/math/gentype.inc>
251661ac03aSJan Vesely#undef __CLC_BODY
252661ac03aSJan Vesely#undef FUNC
253187ec005SJan Vesely#undef __XFUNC
254661ac03aSJan Vesely#undef __FUNC
255661ac03aSJan Vesely#undef VEC_LOAD16
256661ac03aSJan Vesely#undef VEC_LOAD8
257661ac03aSJan Vesely#undef VEC_LOAD4
258661ac03aSJan Vesely#undef VEC_LOAD3
259661ac03aSJan Vesely#undef VEC_LOAD2
260661ac03aSJan Vesely#undef VEC_LOAD1
261661ac03aSJan Vesely#undef DECLARE_HELPER
262661ac03aSJan Vesely#undef VSTORE_ADDR_SPACES
263661ac03aSJan Vesely#undef VSTORE_VECTORIZE
264