#include <clc/clc.h>

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

#define ROUND_VEC1(out, in, ROUNDF) out = ROUNDF(in);
#define ROUND_VEC2(out, in, ROUNDF) \
  ROUND_VEC1(out.lo, in.lo, ROUNDF); \
  ROUND_VEC1(out.hi, in.hi, ROUNDF);
#define ROUND_VEC3(out, in, ROUNDF) \
  ROUND_VEC1(out.s0, in.s0, ROUNDF); \
  ROUND_VEC1(out.s1, in.s1, ROUNDF); \
  ROUND_VEC1(out.s2, in.s2, ROUNDF);
#define ROUND_VEC4(out, in, ROUNDF) \
  ROUND_VEC2(out.lo, in.lo, ROUNDF); \
  ROUND_VEC2(out.hi, in.hi, ROUNDF);
#define ROUND_VEC8(out, in, ROUNDF) \
  ROUND_VEC4(out.lo, in.lo, ROUNDF); \
  ROUND_VEC4(out.hi, in.hi, ROUNDF);
#define ROUND_VEC16(out, in, ROUNDF) \
  ROUND_VEC8(out.lo, in.lo, ROUNDF); \
  ROUND_VEC8(out.hi, in.hi, ROUNDF);

#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF) \
  void _CLC_OVERLOAD vstore_half_##VEC_SIZE(TYPE, size_t, AS half *); \
  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, \
                                                  AS half *mem) { \
    TYPE rounded_vec; \
    ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \
    vstore_half_##VEC_SIZE(rounded_vec, offset, mem); \
  } \
  void _CLC_OVERLOAD vstorea_half_##VEC_SIZE(TYPE, size_t, AS half *); \
  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, \
                                                   AS half *mem) { \
    TYPE rounded_vec; \
    ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \
    vstorea_half_##VEC_SIZE(rounded_vec, offset, mem); \
  }

_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
  /* Handle NaN corner case */
  if (isnan(x))
    return x;
  /* RTZ does not produce Inf for large numbers */
  if (fabs(x) > 65504.0f && !isinf(x))
    return copysign(65504.0f, x);

  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  /* Manage the range rounded to +- zero explicitly */
  if (exp < -24)
    return copysign(0.0f, x);

  /* Remove the lower 13 bits to make sure the number is rounded down */
  int mask = 0xffffe000;
  /* Denormals cannot be flushed, and they use a different bit for rounding */
  if (exp < -14)
    mask <<= min(-(exp + 14), 10);

  return as_float(as_uint(x) & mask);
}

_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
  /* Handle NaN corner case */
  if (isnan(x))
    return x;

  const float inf = copysign(INFINITY, x);
  uint ux = as_uint(x);

  /* Values above the largest half (65504) round to +- infinity */
  if (as_float(ux & 0x7fffffff) > 0x1.ffcp+15f) {
    return inf;
  }
  /* Manage +- zero explicitly */
  if ((ux & 0x7fffffff) == 0) {
    return copysign(0.0f, x);
  }

  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  /* Manage the range rounded to the smallest half denormal explicitly */
  if (exp < -24) {
    return copysign(0x1.0p-24f, x);
  }

  /* Set the lower 13 bits */
  int mask = (1 << 13) - 1;
  /* Denormals cannot be flushed, and they use a different bit for rounding */
  if (exp < -14) {
    mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
  }

  const float next = nextafter(as_float(ux | mask), inf);
  return ((ux & mask) == 0) ? as_float(ux) : next;
}
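
/* Rounding toward negative or positive infinity reduces to the two helpers
 * above: a value whose sign points away from the target infinity is
 * truncated (__clc_rtz), while one whose sign points toward it is rounded
 * away from zero (__clc_rti). The sign bit selects the helper.
 */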
_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
}

_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
  return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
}

_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
  /* Mantissa + implicit leading bit */
  const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
  const int exp = (as_uint(x) >> 23 & 0xff) - 127;
  int shift = 13;
  if (exp < -14) {
    /* The default assumes the lower 13 bits are rounded off,
     * but it may be more for denormals.
     * Shifting beyond last == 0b and grs == 0b is not necessary. */
    shift += min(-(exp + 14), 15);
  }
  int mask = (1 << shift) - 1;
  const uint grs = mantissa & mask;
  const uint last = mantissa & (1 << shift);
  /* The IEEE round-to-nearest-even rule: round up when grs > 100b, or when
   * grs == 100b and the last kept bit is 1. exp > 15 must round to inf. */
  bool roundup = (grs > (1 << (shift - 1))) ||
                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
  return roundup ? __clc_rti(x) : __clc_rtz(x);
}

#define __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
  __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, __clc_rte) \
  __FUNC(SUFFIX##_rtz, VEC_SIZE, TYPE, AS, __clc_rtz) \
  __FUNC(SUFFIX##_rtn, VEC_SIZE, TYPE, AS, __clc_rtn) \
  __FUNC(SUFFIX##_rtp, VEC_SIZE, TYPE, AS, __clc_rtp) \
  __FUNC(SUFFIX##_rte, VEC_SIZE, TYPE, AS, __clc_rte)

#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)

#define __CLC_BODY "vstore_half.inc"
#include <clc/math/gentype.inc>
#undef __CLC_BODY
#undef FUNC
#undef __XFUNC
#undef __FUNC
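
/* Illustrative usage sketch (not part of this file): the overloads generated
 * above implement the standard OpenCL C vstore_half built-ins, so once the
 * library is linked, a kernel can store floats as halves with an explicit
 * rounding mode. The kernel and argument names below are hypothetical.
 *
 *   __kernel void pack_to_half(__global const float4 *src,
 *                              __global half *dst) {
 *     size_t gid = get_global_id(0);
 *     // Convert float4 -> four halves, rounding toward zero;
 *     // elements land at dst[gid * 4 .. gid * 4 + 3].
 *     vstore_half4_rtz(src[gid], gid, dst);
 *   }
 */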