// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s
// REQUIRES: amdgpu-registered-target

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

typedef unsigned int uint;
typedef unsigned short ushort;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
typedef half __attribute__((ext_vector_type(32))) half32;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
typedef float __attribute__((ext_vector_type(16))) float16;
typedef half __attribute__((ext_vector_type(2))) half2;
typedef float __attribute__((ext_vector_type(2))) float2;
typedef float __attribute__((ext_vector_type(32))) float32;

// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
// CHECK-NEXT: ret void
//
void test_prng_b32(global uint* out, uint a) {
  *out = __builtin_amdgcn_prng_b32(a);
}
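
// The permlane swap builtins return both registers after the exchange: the
// { i32, i32 } result of the intrinsic is repacked into a uint2, and the two
// trailing bool arguments become the i1 immediates on the call.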
// CHECK-LABEL: @test_permlane16_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane16_swap(global uint2* out, uint old, uint src) {
  *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
  *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
  *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}
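
// Same lowering contract as test_permlane16_swap, via llvm.amdgcn.permlane32.swap.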
// CHECK-LABEL: @test_permlane32_swap(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
// CHECK-NEXT: [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
// CHECK-NEXT: ret void
//
void test_permlane32_swap(global uint2* out, uint old, uint src) {
  *out = __builtin_amdgcn_permlane32_swap(old, src, false, false);
  *out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
  *out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
}
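
// Each pk32/2xpk16 builtin scales 32 source elements by the float operand and
// packs the 32 6-bit results into a <6 x i32>, i.e. the uint6 typedef above.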
// CHECK-LABEL: @test_cvt_scalef32_pk(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SRC0F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[SRC1F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: store <16 x float> [[SRC0F32:%.*]], ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: store <16 x float> [[SRC1F32:%.*]], ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 32
// CHECK-NEXT: [[TMP4:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP4]], float [[TMP5]])
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 32
// CHECK-NEXT: [[TMP8:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> [[TMP8]], float [[TMP9]])
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 32
// CHECK-NEXT: [[TMP12:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]])
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32
// CHECK-NEXT: [[TMP16:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> [[TMP16]], <16 x float> [[TMP17]], float [[TMP18]])
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 32
// CHECK-NEXT: [[TMP21:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
// CHECK-NEXT: [[TMP22:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> [[TMP21]], <16 x float> [[TMP22]], float [[TMP23]])
// CHECK-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP24]], ptr addrspace(1) [[TMP25]], align 32
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float16 src0f32, float16 src1f32, float scale)
{
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
  *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
}
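
// The ashr_pk builtins produce two packed 8-bit results in an i16; the zext
// to i32 in the checks comes from storing through 'global int*'.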
// CHECK-LABEL: @test_ashr_pk_i8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
  *out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
}

// CHECK-LABEL: @test_ashr_pk_u8_i32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP3]] to i32
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
  *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
}
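
// The short2 operands are bitcast to <2 x bfloat> before the call, since the
// dot-product intrinsic is typed on bfloat pairs.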
// CHECK-LABEL: @builtins_amdgcn_dl_insts(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x i16> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat>
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: ret void
//
void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
  *out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
}

// CHECK-LABEL: @builtins_amdgcn_dl_dot2c(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: store <2 x bfloat> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: store <2 x bfloat> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.amdgcn.fdot2c.f32.bf16(<2 x bfloat> [[TMP0]], <2 x bfloat> [[TMP1]], float [[TMP2]], i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat2 v2ssB) {
  *out = __builtin_amdgcn_fdot2c_f32_bf16(v2ssA, v2ssB, fC, false);
}
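
// For the f16 <- fp8/bf8 conversions, *out is threaded through as the old-value
// operand; the i32 immediate (0-3) and the trailing bool presumably select the
// source byte and which half of the <2 x half> destination is rewritten.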
// CHECK-LABEL: @test_cvt_scalef32_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f16_fp8(global half2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, true);
}
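
// The scalar f32 <- fp8 form has no passthrough operand; only the source byte
// select immediate (0-3) varies between calls.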
// CHECK-LABEL: @test_cvt_scalef32_f32_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f32_fp8(global float* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 3);
}

// CHECK-LABEL: @test_cvt_scalef32_f16_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f16_bf8(global half2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 0, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 1, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 2, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 3, false);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 0, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 1, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 2, true);
  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 3, true);
}

// CHECK-LABEL: @test_cvt_scalef32_f32_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_f32_bf8(global float* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 3);
}
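
// The pk fp8/bf8 <- f32 builtins pack two converted floats into one 16-bit
// half of the short2 destination; the bool immediate presumably selects the
// low or high word, with *out supplying the preserved bits.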
// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp8_f32(global short2* out, float src0, float src1, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf8_f32(global short2* out, float src0, float src1, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, false);
}
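
// Inverse direction: one packed fp8/bf8 pair from the i32 source is scaled and
// widened to <2 x float>; the bool presumably selects the low or high word.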
// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f32_fp8(global float2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_f32_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp8_f16(global short2* out, half2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp8_bf16(global short2* out, bfloat2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf8_f16(global short2* out, half2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf8_bf16(global short2* out, bfloat2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, false);
}
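
// The fp4 variants switch from a bool to a 0-3 immediate, since an i32 holds
// four packed fp4 pairs rather than two.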
// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f32_fp4(global float2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 3);
}
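
// When converting to fp4, the same 0-3 immediate presumably picks which byte of
// the passthrough i32 (*out) receives the newly packed pair.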
// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP15]], float [[TMP16]], float [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
// CHECK-NEXT: [[TMP23:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP22]], float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 3)
// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp4_f32(global unsigned int* out, float src0, float src1, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 3);
}

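// Illustrative sketch (not part of the autogenerated checks): assuming the
// trailing immediate indexes successive byte pairs as read above, a full
// dword of eight fp4 values can be built by threading the partial result
// through each call. The helper name and shape are hypothetical.
uint pack8_fp4_f32(float2 a, float2 b, float2 c, float2 d, float scale)
{
  uint packed = 0u;
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(packed, a.x, a.y, scale, 0);
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(packed, b.x, b.y, scale, 1);
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(packed, c.x, c.y, scale, 2);
  packed = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(packed, d.x, d.y, scale, 3);
  return packed;
}
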
// CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f16_fp4(global half2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 3);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3)
// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf16_fp4(global bfloat2* out, uint src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 3);
}

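// The fp6/bf6 tests below convert 32 six-bit values at a time: 32 x 6 = 192
// bits, which is why the packed operand is a <6 x i32> (uint6) and the
// results are full 32-element vectors.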
// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp6(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <32 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 128
// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <32 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 128
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f32_fp6(global float32* out, uint6 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(src, scale);
  *out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(src, scale);
}

// CHECK-LABEL: @test_cvt_scalef32_pk32_f16_fpbf6(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <32 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <32 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk32_f16_fpbf6(global half32 *out, uint6 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src, scale);
  *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src, scale);
}

// CHECK-LABEL: @test_cvt_scalef32_pk32_bf16_fpbf6(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <32 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
// CHECK-NEXT: [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <32 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(src, scale);
  *out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(src, scale);
}

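// For the fp8/bf8 unpacking tests below, the i1 immediate is the only
// operand that differs between the paired calls; it presumably selects
// which 16-bit half of the packed source word supplies the two converted
// values.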
// CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f16_fp8(global half2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_f16_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_f16_bf8(global half2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf16_fp8(global bfloat2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(src, scale, false);
}

// CHECK-LABEL: @test_cvt_scalef32_pk_bf16_bf8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_bf16_bf8(global bfloat2* out, unsigned int src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, true);
  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, false);
}

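// Packing from f16/bf16 pairs to fp4 follows the same read-modify-write
// pattern as the f32 variant above: the old dword is loaded from *out and
// only the selected byte pair is rewritten.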
// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i32 0)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i32 1)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP13]], <2 x half> [[TMP14]], float [[TMP15]], i32 2)
// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP19]], <2 x half> [[TMP20]], float [[TMP21]], i32 3)
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 3);
}

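// Note: the bf16 variant below takes a trailing 'old' parameter that is
// stored but never read; the checks show the old dword is instead threaded
// from *out, so the parameter appears to be vestigial.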
// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i32 0)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i32 1)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP13]], <2 x bfloat> [[TMP14]], float [[TMP15]], i32 2)
// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
// CHECK-NEXT: [[TMP20:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP19]], <2 x bfloat> [[TMP20]], float [[TMP21]], i32 3)
// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float scale, uint old)
{
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 3);
}

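// The *_sr_* tests below exercise the stochastic-rounding forms, which take
// an extra i32 seed operand between the source and the scale.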
// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP8]], <2 x half> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP15]], <2 x half> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
// CHECK-NEXT: [[TMP23:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP22]], <2 x half> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk_fp4_f16(global unsigned *out, half2 src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 3);
}

// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP8]], <2 x bfloat> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP15]], <2 x bfloat> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
// CHECK-NEXT: [[TMP23:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP22]], <2 x bfloat> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk_fp4_bf16(global unsigned *out, bfloat2 src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 3);
}

// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x float>, align 8, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x float> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP1]], <2 x float> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP8]], <2 x float> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP15]], <2 x float> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
// CHECK-NEXT: [[TMP23:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP22]], <2 x float> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2);
  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3);
}

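// Illustrative sketch (no CHECK lines attached): with stochastic rounding,
// repacking the same pair can land on different fp4 codes depending on the
// seed, so a caller would typically vary the seed per conversion. The
// helper below is hypothetical and only uses builtins exercised above.
uint sr_pack_fp4_f32(float2 v, uint seed, float scale)
{
  uint packed = 0u;
  packed = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(packed, v, seed, scale, 0);
  packed = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(packed, v, seed + 1u, scale, 1);
  return packed;
}
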
// CHECK-LABEL: @test_cvt_scalef32_sr_pk32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
// CHECK-NEXT: [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
// CHECK-NEXT: [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32
// CHECK-NEXT: [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]])
// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32
// CHECK-NEXT: [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]])
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32
// CHECK-NEXT: [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]])
// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32
// CHECK-NEXT: [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]])
// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32
// CHECK-NEXT: [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
// CHECK-NEXT: [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]])
// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
// CHECK-NEXT: store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float32 srcf32, unsigned src1, float src2)
{
  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(srcbf32, src1, src2);
  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(srch32, src1, src2);
  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(srcf32, src1, src2);
  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(srcbf32, src1, src2);
  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(srch32, src1, src2);
  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(srcf32, src1, src2);
}

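// The remaining tests convert a single scalar to one bf8/fp8 byte with
// stochastic rounding. Only destination byte selects 0-2 are exercised
// here; the trailing immediate is again the sole operand that varies
// between the calls.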
// CHECK-LABEL: @test_cvt_scalef32_sr_bf8_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_bf8_bf16(global unsigned *out, __bf16 src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 2);
}

// CHECK-LABEL: @test_cvt_scalef32_sr_bf8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_bf8_f16(global unsigned *out, half src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 2);
}

// CHECK-LABEL: @test_cvt_scalef32_sr_bf8_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_bf8_f32(global unsigned *out, float src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 2);
}

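// The fp8 scalar variants below mirror the bf8 ones above, differing only
// in the intrinsic's destination format.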
// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_fp8_bf16(global unsigned *out, __bf16 src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 2);
}

// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_fp8_f16(global unsigned *out, half src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 2);
}

// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_fp8_f16(global unsigned *out, half src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 2);
}
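// The fp8_f32 case below mirrors bf8_f32 above but targets the FP8 rather
// than the BF8 encoding (commonly E4M3 vs. E5M2; the test itself only
// checks the builtin-to-intrinsic mapping, not the numeric format).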
// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
// CHECK-NEXT: [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
// CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_fp8_f32(global unsigned *out, float src, uint seed, float scale)
{
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 0);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 1);
  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 2);
}
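// The bitop3 builtins compute a programmable three-input bitwise function:
// each result bit is looked up in the 8-bit immediate truth table, indexed
// by the corresponding bits of a, b and c. With a table value of 1 only the
// all-zeros input yields 1, i.e. the result is ~(a | b | c), whatever the
// ordering of the three index bits.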
// CHECK-LABEL: @test_bitop3_b32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT: store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_bitop3_b32(global uint* out, uint a, uint b, uint c)
{
  *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1);
}

// CHECK-LABEL: @test_bitop3_b16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
// CHECK-NEXT: store i16 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 2
// CHECK-NEXT: store i16 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr addrspace(5) [[B_ADDR]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(5) [[C_ADDR]], align 2
// CHECK-NEXT: [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1)
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
// CHECK-NEXT: ret void
//
void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c)
{
  *out = __builtin_amdgcn_bitop3_b16(a, b, c, 1);
}
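// cvt_sr_bf16_f32 and cvt_sr_f16_f32 convert a single f32 to bf16/f16 with
// stochastic rounding from `seed` and insert the result into one element of
// the packed two-element `old` vector; the trailing 0/1 selects the word,
// as suggested by the i1 operand on the intrinsic calls below.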
// CHECK-LABEL: @test_cvt_sr_bf16_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_sr_bf16_f32(global bfloat2 *out, float src, uint seed)
{
  *out = __builtin_amdgcn_cvt_sr_bf16_f32(*out, src, seed, 0);
  *out = __builtin_amdgcn_cvt_sr_bf16_f32(*out, src, seed, 1);
}

// CHECK-LABEL: @test_cvt_sr_f16_f32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
// CHECK-NEXT: [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
// CHECK-NEXT: ret void
//
void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
{
  *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 0);
  *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
}
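// __builtin_amdgcn_global_load_lds copies `size` bytes from a global
// pointer directly into LDS. Earlier targets accept 1-, 2- and 4-byte
// copies; the 12- and 16-byte (96- and 128-bit) widths exercised below are
// the gfx950 additions, as the test names indicate.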
// CHECK-LABEL: @test_global_load_lds_96(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_global_load_lds_96(global void* src, local void *dst) {
  __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
}

// CHECK-LABEL: @test_global_load_lds_128(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
// CHECK-NEXT: store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
// CHECK-NEXT: call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
// CHECK-NEXT: ret void
//
void test_global_load_lds_128(global void* src, local void *dst) {
  __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
}
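// A hypothetical staging pattern (comments only; the kernel is illustrative
// and not part of the checked output):
//   __kernel void stage(global const float *in, local float *tile) {
//     __builtin_amdgcn_global_load_lds(in + get_global_id(0) * 4,
//                                      tile + get_local_id(0) * 4,
//                                      /*size=*/16, /*offset=*/0, /*aux=*/0);
//     barrier(CLK_LOCAL_MEM_FENCE);
//   }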