xref: /llvm-project/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (revision e0f52538c9739d945e316eac0ddd92d26e4e380a)
1// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s
3// REQUIRES: amdgpu-registered-target
4
5#pragma OPENCL EXTENSION cl_khr_fp16 : enable
6
7typedef unsigned int uint;
8typedef unsigned short ushort;
9typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
10typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
11typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
12typedef half __attribute__((ext_vector_type(32))) half32;
13typedef short __attribute__((ext_vector_type(2))) short2;
14typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
15typedef float __attribute__((ext_vector_type(16))) float16;
16typedef half __attribute__((ext_vector_type(2))) half2;
17typedef float __attribute__((ext_vector_type(2))) float2;
18typedef float __attribute__((ext_vector_type(32))) float32;
19
20// CHECK-LABEL: @test_prng_b32(
21// CHECK-NEXT:  entry:
22// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
23// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
24// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
25// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
26// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
27// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]])
28// CHECK-NEXT:    [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
29// CHECK-NEXT:    store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
30// CHECK-NEXT:    ret void
31//
32void test_prng_b32(global uint* out, uint a) {
33  *out = __builtin_amdgcn_prng_b32(a);
34}
35
36// CHECK-LABEL: @test_permlane16_swap(
37// CHECK-NEXT:  entry:
38// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
39// CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
40// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
41// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
42// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
43// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
44// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
45// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
46// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
47// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
48// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
49// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
50// CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
51// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
52// CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
53// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
54// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
55// CHECK-NEXT:    [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
56// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
57// CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
58// CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
59// CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
60// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
61// CHECK-NEXT:    store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
62// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
63// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
64// CHECK-NEXT:    [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
65// CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
66// CHECK-NEXT:    [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
67// CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
68// CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
69// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
70// CHECK-NEXT:    store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
71// CHECK-NEXT:    ret void
72//
73void test_permlane16_swap(global uint2* out, uint old, uint src) {
74  *out = __builtin_amdgcn_permlane16_swap(old, src, false, false);
75  *out = __builtin_amdgcn_permlane16_swap(old, src, true, false);
76  *out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
77}
78
79// CHECK-LABEL: @test_permlane32_swap(
80// CHECK-NEXT:  entry:
81// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
82// CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
83// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
84// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
85// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
86// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
87// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
88// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
89// CHECK-NEXT:    [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false)
90// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0
91// CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1
92// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0
93// CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1
94// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
95// CHECK-NEXT:    store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
96// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
97// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
98// CHECK-NEXT:    [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false)
99// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0
100// CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1
101// CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0
102// CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1
103// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
104// CHECK-NEXT:    store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
105// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[OLD_ADDR]], align 4
106// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
107// CHECK-NEXT:    [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true)
108// CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0
109// CHECK-NEXT:    [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1
110// CHECK-NEXT:    [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0
111// CHECK-NEXT:    [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1
112// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
113// CHECK-NEXT:    store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
114// CHECK-NEXT:    ret void
115//
116void test_permlane32_swap(global uint2* out, uint old, uint src) {
117  *out = __builtin_amdgcn_permlane32_swap(old, src, false, false);
118  *out = __builtin_amdgcn_permlane32_swap(old, src, true, false);
119  *out = __builtin_amdgcn_permlane32_swap(old, src, false, true);
120}
121
122// CHECK-LABEL: @test_cvt_scalef32_pk(
123// CHECK-NEXT:  entry:
124// CHECK-NEXT:    [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
125// CHECK-NEXT:    [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
126// CHECK-NEXT:    [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
127// CHECK-NEXT:    [[SRC0F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
128// CHECK-NEXT:    [[SRC1F32_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
129// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
130// CHECK-NEXT:    store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
131// CHECK-NEXT:    store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
132// CHECK-NEXT:    store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
133// CHECK-NEXT:    store <16 x float> [[SRC0F32:%.*]], ptr addrspace(5) [[SRC0F32_ADDR]], align 64
134// CHECK-NEXT:    store <16 x float> [[SRC1F32:%.*]], ptr addrspace(5) [[SRC1F32_ADDR]], align 64
135// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
136// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
137// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
138// CHECK-NEXT:    [[TMP2:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], float [[TMP1]])
139// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
140// CHECK-NEXT:    store <6 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 32
141// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
142// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
143// CHECK-NEXT:    [[TMP6:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> [[TMP4]], float [[TMP5]])
144// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
145// CHECK-NEXT:    store <6 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 32
146// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
147// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
148// CHECK-NEXT:    [[TMP10:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> [[TMP8]], float [[TMP9]])
149// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
150// CHECK-NEXT:    store <6 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 32
151// CHECK-NEXT:    [[TMP12:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
152// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
153// CHECK-NEXT:    [[TMP14:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> [[TMP12]], float [[TMP13]])
154// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
155// CHECK-NEXT:    store <6 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 32
156// CHECK-NEXT:    [[TMP16:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
157// CHECK-NEXT:    [[TMP17:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
158// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
159// CHECK-NEXT:    [[TMP19:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> [[TMP16]], <16 x float> [[TMP17]], float [[TMP18]])
160// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
161// CHECK-NEXT:    store <6 x i32> [[TMP19]], ptr addrspace(1) [[TMP20]], align 32
162// CHECK-NEXT:    [[TMP21:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC0F32_ADDR]], align 64
163// CHECK-NEXT:    [[TMP22:%.*]] = load <16 x float>, ptr addrspace(5) [[SRC1F32_ADDR]], align 64
164// CHECK-NEXT:    [[TMP23:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
165// CHECK-NEXT:    [[TMP24:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> [[TMP21]], <16 x float> [[TMP22]], float [[TMP23]])
166// CHECK-NEXT:    [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
167// CHECK-NEXT:    store <6 x i32> [[TMP24]], ptr addrspace(1) [[TMP25]], align 32
168// CHECK-NEXT:    ret void
169//
170void test_cvt_scalef32_pk(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float16 src0f32, float16 src1f32, float scale)
171{
172  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(srcbf32, scale);
173  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(srch32, scale);
174  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(srcbf32, scale);
175  *out6 = __builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(srch32, scale);
176  *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(src0f32, src1f32, scale);
177  *out6 = __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(src0f32, src1f32, scale);
178}
179
180// CHECK-LABEL: @test_ashr_pk_i8_i32(
181// CHECK-NEXT:  entry:
182// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
183// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
184// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
185// CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
186// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
187// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
188// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
189// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
190// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
191// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
192// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
193// CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.i8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
194// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP3]] to i32
195// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
196// CHECK-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
197// CHECK-NEXT:    ret void
198//
199void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
200  *out = __builtin_amdgcn_ashr_pk_i8_i32(src0, src1, src2);
201}
202
203// CHECK-LABEL: @test_ashr_pk_u8_i32(
204// CHECK-NEXT:  entry:
205// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
206// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
207// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
208// CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
209// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
210// CHECK-NEXT:    store i32 [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
211// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
212// CHECK-NEXT:    store i32 [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
213// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4
214// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
215// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4
216// CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.amdgcn.ashr.pk.u8.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
217// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP3]] to i32
218// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
219// CHECK-NEXT:    store i32 [[CONV]], ptr addrspace(1) [[TMP4]], align 4
220// CHECK-NEXT:    ret void
221//
222void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
223  *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
224}
225
226// CHECK-LABEL: @builtins_amdgcn_dl_insts(
227// CHECK-NEXT:  entry:
228// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
229// CHECK-NEXT:    [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
230// CHECK-NEXT:    [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
231// CHECK-NEXT:    [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
232// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
233// CHECK-NEXT:    store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
234// CHECK-NEXT:    store <2 x i16> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
235// CHECK-NEXT:    store <2 x i16> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
236// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
237// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
238// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
239// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat>
240// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
241// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false)
242// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
243// CHECK-NEXT:    store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
244// CHECK-NEXT:    ret void
245//
246void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
247  *out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
248}
249
250// CHECK-LABEL: @builtins_amdgcn_dl_dot2c(
251// CHECK-NEXT:  entry:
252// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
253// CHECK-NEXT:    [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
254// CHECK-NEXT:    [[V2SSA_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
255// CHECK-NEXT:    [[V2SSB_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
256// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
257// CHECK-NEXT:    store float [[FC:%.*]], ptr addrspace(5) [[FC_ADDR]], align 4
258// CHECK-NEXT:    store <2 x bfloat> [[V2SSA:%.*]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
259// CHECK-NEXT:    store <2 x bfloat> [[V2SSB:%.*]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
260// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
261// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
262// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
263// CHECK-NEXT:    [[TMP3:%.*]] = call float @llvm.amdgcn.fdot2c.f32.bf16(<2 x bfloat> [[TMP0]], <2 x bfloat> [[TMP1]], float [[TMP2]], i1 false)
264// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
265// CHECK-NEXT:    store float [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
266// CHECK-NEXT:    ret void
267//
268void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat2 v2ssB) {
269  *out = __builtin_amdgcn_fdot2c_f32_bf16(v2ssA, v2ssB, fC, false);
270}
271
272// CHECK-LABEL: @test_cvt_scalef32_f16_fp8(
273// CHECK-NEXT:  entry:
274// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
275// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
276// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
277// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
278// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
279// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
280// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
281// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
282// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
283// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
284// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
285// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
286// CHECK-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
287// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
288// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
289// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
290// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
291// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
292// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
293// CHECK-NEXT:    store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
294// CHECK-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
295// CHECK-NEXT:    [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
296// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
297// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
298// CHECK-NEXT:    [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
299// CHECK-NEXT:    [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
300// CHECK-NEXT:    store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
301// CHECK-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
302// CHECK-NEXT:    [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
303// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
304// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
305// CHECK-NEXT:    [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
306// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
307// CHECK-NEXT:    store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
308// CHECK-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
309// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
310// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
311// CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
312// CHECK-NEXT:    [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
313// CHECK-NEXT:    [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
314// CHECK-NEXT:    store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
315// CHECK-NEXT:    [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
316// CHECK-NEXT:    [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
317// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
318// CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
319// CHECK-NEXT:    [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
320// CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
321// CHECK-NEXT:    store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
322// CHECK-NEXT:    [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
323// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
324// CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
325// CHECK-NEXT:    [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
326// CHECK-NEXT:    [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
327// CHECK-NEXT:    [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
328// CHECK-NEXT:    store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
329// CHECK-NEXT:    [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
330// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
331// CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
332// CHECK-NEXT:    [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
333// CHECK-NEXT:    [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
334// CHECK-NEXT:    [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
335// CHECK-NEXT:    store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
336// CHECK-NEXT:    ret void
337//
338void test_cvt_scalef32_f16_fp8(global half2* out, uint src, float scale)
339{
340  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, false);
341  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, false);
342  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, false);
343  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, false);
344  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, true);
345  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, true);
346  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, true);
347  *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, true);
348}
349
350// CHECK-LABEL: @test_cvt_scalef32_f32_fp8(
351// CHECK-NEXT:  entry:
352// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
353// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
354// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
355// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
356// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
357// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
358// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
359// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
360// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP0]], float [[TMP1]], i32 0)
361// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
362// CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
363// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
364// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
365// CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP4]], float [[TMP5]], i32 1)
366// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
367// CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
368// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
369// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
370// CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP8]], float [[TMP9]], i32 2)
371// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
372// CHECK-NEXT:    store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
373// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
374// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
375// CHECK-NEXT:    [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP12]], float [[TMP13]], i32 3)
376// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
377// CHECK-NEXT:    store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
378// CHECK-NEXT:    ret void
379//
380void test_cvt_scalef32_f32_fp8(global float* out, uint src, float scale)
381{
382  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 0);
383  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 1);
384  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 2);
385  *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 3);
386}
387
388// CHECK-LABEL: @test_cvt_scalef32_f16_bf8(
389// CHECK-NEXT:  entry:
390// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
391// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
392// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
393// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
394// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
395// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
396// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
397// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
398// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
399// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
400// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
401// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
402// CHECK-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
403// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
404// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
405// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
406// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
407// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
408// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
409// CHECK-NEXT:    store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
410// CHECK-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
411// CHECK-NEXT:    [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
412// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
413// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
414// CHECK-NEXT:    [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
415// CHECK-NEXT:    [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
416// CHECK-NEXT:    store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
417// CHECK-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
418// CHECK-NEXT:    [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
419// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
420// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
421// CHECK-NEXT:    [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
422// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
423// CHECK-NEXT:    store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
424// CHECK-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
425// CHECK-NEXT:    [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
426// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
427// CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
428// CHECK-NEXT:    [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
429// CHECK-NEXT:    [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
430// CHECK-NEXT:    store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
431// CHECK-NEXT:    [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
432// CHECK-NEXT:    [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
433// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
434// CHECK-NEXT:    [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
435// CHECK-NEXT:    [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
436// CHECK-NEXT:    [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
437// CHECK-NEXT:    store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
438// CHECK-NEXT:    [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
439// CHECK-NEXT:    [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
440// CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
441// CHECK-NEXT:    [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
442// CHECK-NEXT:    [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
443// CHECK-NEXT:    [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
444// CHECK-NEXT:    store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
445// CHECK-NEXT:    [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
446// CHECK-NEXT:    [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
447// CHECK-NEXT:    [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
448// CHECK-NEXT:    [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
449// CHECK-NEXT:    [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
450// CHECK-NEXT:    [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
451// CHECK-NEXT:    store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
452// CHECK-NEXT:    ret void
453//
454void test_cvt_scalef32_f16_bf8(global half2* out, uint src, float scale)
455{
456  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 0, false);
457  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 1, false);
458  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 2, false);
459  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 3, false);
460  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 0, true);
461  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 1, true);
462  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 2, true);
463  *out = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out, src, scale, 3, true);
464}
465
466// CHECK-LABEL: @test_cvt_scalef32_f32_bf8(
467// CHECK-NEXT:  entry:
468// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
469// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
470// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
471// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
472// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
473// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
474// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
475// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
476// CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP0]], float [[TMP1]], i32 0)
477// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
478// CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
479// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
480// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
481// CHECK-NEXT:    [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP4]], float [[TMP5]], i32 1)
482// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
483// CHECK-NEXT:    store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
484// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
485// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
486// CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP8]], float [[TMP9]], i32 2)
487// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
488// CHECK-NEXT:    store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
489// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
490// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
491// CHECK-NEXT:    [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 [[TMP12]], float [[TMP13]], i32 3)
492// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
493// CHECK-NEXT:    store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
494// CHECK-NEXT:    ret void
495//
496void test_cvt_scalef32_f32_bf8(global float* out, uint src, float scale)
497{
498  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 0);
499  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 1);
500  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 2);
501  *out = __builtin_amdgcn_cvt_scalef32_f32_bf8(src, scale, 3);
502}
503
504// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f32(
505// CHECK-NEXT:  entry:
506// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
507// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
508// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
509// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
510// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
511// CHECK-NEXT:    store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
512// CHECK-NEXT:    store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
513// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
514// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
515// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
516// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
517// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
518// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
519// CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
520// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
521// CHECK-NEXT:    store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
522// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
523// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
524// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
525// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
526// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
527// CHECK-NEXT:    [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
528// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
529// CHECK-NEXT:    store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
530// CHECK-NEXT:    ret void
531//
532void test_cvt_scalef32_pk_fp8_f32(global short2* out, float src0, float src1, float scale)
533{
534  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, true);
535  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(*out, src0, src1, scale, false);
536}
537
538// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f32(
539// CHECK-NEXT:  entry:
540// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
541// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
542// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
543// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
544// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
545// CHECK-NEXT:    store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
546// CHECK-NEXT:    store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
547// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
548// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
549// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
550// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
551// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
552// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
553// CHECK-NEXT:    [[TMP5:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i1 true)
554// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
555// CHECK-NEXT:    store <2 x i16> [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
556// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
557// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP7]], align 4
558// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
559// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
560// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
561// CHECK-NEXT:    [[TMP12:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i1 false)
562// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
563// CHECK-NEXT:    store <2 x i16> [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
564// CHECK-NEXT:    ret void
565//
566void test_cvt_scalef32_pk_bf8_f32(global short2* out, float src0, float src1, float scale)
567{
568  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, true);
569  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(*out, src0, src1, scale, false);
570}
571
572
573// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp8(
574// CHECK-NEXT:  entry:
575// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
576// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
577// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
578// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
579// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
580// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
581// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
582// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
583// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
584// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
585// CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
586// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
587// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
588// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
589// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
590// CHECK-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
591// CHECK-NEXT:    ret void
592//
593void test_cvt_scalef32_pk_f32_fp8(global float2* out, unsigned int src, float scale)
594{
595  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, true);
596  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(src, scale, false);
597}
598
599// CHECK-LABEL: @test_cvt_scalef32_pk_f32_bf8(
600// CHECK-NEXT:  entry:
601// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
602// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
603// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
604// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
605// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
606// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
607// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
608// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
609// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
610// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
611// CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
612// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
613// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
614// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
615// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
616// CHECK-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
617// CHECK-NEXT:    ret void
618//
619void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float scale)
620{
621  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, true);
622  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(src, scale, false);
623}
624
625// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f16(
626// CHECK-NEXT:  entry:
627// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
628// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
629// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
630// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
631// CHECK-NEXT:    store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
632// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
633// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
634// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
635// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
636// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
637// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
638// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
639// CHECK-NEXT:    store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
640// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
641// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
642// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
643// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
644// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
645// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
646// CHECK-NEXT:    store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
647// CHECK-NEXT:    ret void
648//
649void test_cvt_scalef32_pk_fp8_f16(global short2* out, half2 src, float scale)
650{
651  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, true);
652  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(*out, src, scale, false);
653}
654
655// CHECK-LABEL: @test_cvt_scalef32_pk_fp8_bf16(
656// CHECK-NEXT:  entry:
657// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
658// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
659// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
660// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
661// CHECK-NEXT:    store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
662// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
663// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
664// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
665// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
666// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
667// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
668// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
669// CHECK-NEXT:    store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
670// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
671// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
672// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
673// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
674// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
675// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
676// CHECK-NEXT:    store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
677// CHECK-NEXT:    ret void
678//
679void test_cvt_scalef32_pk_fp8_bf16(global short2* out, bfloat2 src, float scale)
680{
681  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, true);
682  *out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(*out, src, scale, false);
683}
684
685// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f16(
686// CHECK-NEXT:  entry:
687// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
688// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
689// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
690// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
691// CHECK-NEXT:    store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
692// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
693// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
694// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
695// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
696// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
697// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
698// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
699// CHECK-NEXT:    store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
700// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
701// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
702// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
703// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
704// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
705// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
706// CHECK-NEXT:    store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
707// CHECK-NEXT:    ret void
708//
709void test_cvt_scalef32_pk_bf8_f16(global short2* out, half2 src, float scale)
710{
711  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, true);
712  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(*out, src, scale, false);
713}
714
715// CHECK-LABEL: @test_cvt_scalef32_pk_bf8_bf16(
716// CHECK-NEXT:  entry:
717// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
718// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
719// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
720// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
721// CHECK-NEXT:    store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
722// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
723// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
724// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
725// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
726// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
727// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
728// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
729// CHECK-NEXT:    store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
730// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
731// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
732// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
733// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
734// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
735// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
736// CHECK-NEXT:    store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
737// CHECK-NEXT:    ret void
738//
739void test_cvt_scalef32_pk_bf8_bf16(global short2* out, bfloat2 src, float scale)
740{
741  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, true);
742  *out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(*out, src, scale, false);
743}
744
745// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp4(
746// CHECK-NEXT:  entry:
747// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
748// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
749// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
750// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
751// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
752// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
753// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
754// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
755// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
756// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
757// CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
758// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
759// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
760// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP4]], float [[TMP5]], i32 1)
761// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
762// CHECK-NEXT:    store <2 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
763// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
764// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
765// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP8]], float [[TMP9]], i32 2)
766// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
767// CHECK-NEXT:    store <2 x float> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8
768// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
769// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
770// CHECK-NEXT:    [[TMP14:%.*]] = call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 [[TMP12]], float [[TMP13]], i32 3)
771// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
772// CHECK-NEXT:    store <2 x float> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
773// CHECK-NEXT:    ret void
774//
775void test_cvt_scalef32_pk_f32_fp4(global float2* out, uint src, float scale)
776{
777  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 0);
778  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 1);
779  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 2);
780  *out = __builtin_amdgcn_cvt_scalef32_pk_f32_fp4(src, scale, 3);
781}
782
783// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f32(
784// CHECK-NEXT:  entry:
785// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
786// CHECK-NEXT:    [[SRC0_ADDR:%.*]] = alloca float, align 4, addrspace(5)
787// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca float, align 4, addrspace(5)
788// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
789// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
790// CHECK-NEXT:    store float [[SRC0:%.*]], ptr addrspace(5) [[SRC0_ADDR]], align 4
791// CHECK-NEXT:    store float [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
792// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
793// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
794// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
795// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
796// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
797// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
798// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP1]], float [[TMP2]], float [[TMP3]], float [[TMP4]], i32 0)
799// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
800// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
801// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
802// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
803// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
804// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
805// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
806// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP8]], float [[TMP9]], float [[TMP10]], float [[TMP11]], i32 1)
807// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
808// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
809// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
810// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
811// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
812// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
813// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
814// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP15]], float [[TMP16]], float [[TMP17]], float [[TMP18]], i32 2)
815// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
816// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
817// CHECK-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
818// CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
819// CHECK-NEXT:    [[TMP23:%.*]] = load float, ptr addrspace(5) [[SRC0_ADDR]], align 4
820// CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr addrspace(5) [[SRC1_ADDR]], align 4
821// CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
822// CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 [[TMP22]], float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 3)
823// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
824// CHECK-NEXT:    store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
825// CHECK-NEXT:    ret void
826//
827void test_cvt_scalef32_pk_fp4_f32(global unsigned int* out, float src0, float src1, float scale)
828{
829  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 0);
830  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 1);
831  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 2);
832  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f32(*out, src0, src1, scale, 3);
833}
834
835// CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp4(
836// CHECK-NEXT:  entry:
837// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
838// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
839// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
840// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
841// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
842// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
843// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
844// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
845// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
846// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
847// CHECK-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
848// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
849// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
850// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1)
851// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
852// CHECK-NEXT:    store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
853// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
854// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
855// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2)
856// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
857// CHECK-NEXT:    store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
858// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
859// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
860// CHECK-NEXT:    [[TMP14:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3)
861// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
862// CHECK-NEXT:    store <2 x half> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
863// CHECK-NEXT:    ret void
864//
865void test_cvt_scalef32_pk_f16_fp4(global half2* out, uint src, float scale)
866{
867  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 0);
868  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 1);
869  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 2);
870  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp4(src, scale, 3);
871}
872
873// CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp4(
874// CHECK-NEXT:  entry:
875// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
876// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
877// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
878// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
879// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
880// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
881// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
882// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
883// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP0]], float [[TMP1]], i32 0)
884// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
885// CHECK-NEXT:    store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
886// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
887// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
888// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP4]], float [[TMP5]], i32 1)
889// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
890// CHECK-NEXT:    store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
891// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
892// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
893// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP8]], float [[TMP9]], i32 2)
894// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
895// CHECK-NEXT:    store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
896// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
897// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
898// CHECK-NEXT:    [[TMP14:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 [[TMP12]], float [[TMP13]], i32 3)
899// CHECK-NEXT:    [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
900// CHECK-NEXT:    store <2 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
901// CHECK-NEXT:    ret void
902//
903void test_cvt_scalef32_pk_bf16_fp4(global bfloat2* out, uint src, float scale)
904{
905  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 0);
906  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 1);
907  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 2);
908  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp4(src, scale, 3);
909}
910
911// CHECK-LABEL: @test_cvt_scalef32_pk_f32_fp6(
912// CHECK-NEXT:  entry:
913// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
914// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
915// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
916// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
917// CHECK-NEXT:    store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
918// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
919// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
920// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
921// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
922// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
923// CHECK-NEXT:    store <32 x float> [[TMP2]], ptr addrspace(1) [[TMP3]], align 128
924// CHECK-NEXT:    [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
925// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
926// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
927// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
928// CHECK-NEXT:    store <32 x float> [[TMP6]], ptr addrspace(1) [[TMP7]], align 128
929// CHECK-NEXT:    ret void
930//
931void test_cvt_scalef32_pk_f32_fp6(global float32* out, uint6 src, float scale)
932{
933  *out = __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(src, scale);
934  *out = __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(src, scale);
935}
936
937// CHECK-LABEL: @test_cvt_scalef32_pk32_f16_fpbf6(
938// CHECK-NEXT:  entry:
939// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
940// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
941// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
942// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
943// CHECK-NEXT:    store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
944// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
945// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
946// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
947// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
948// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
949// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
950// CHECK-NEXT:    [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
951// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
952// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
953// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
954// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
955// CHECK-NEXT:    ret void
956//
957void test_cvt_scalef32_pk32_f16_fpbf6(global half32 *out, uint6 src, float scale)
958{
959  *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(src, scale);
960  *out = __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(src, scale);
961}
962
963// CHECK-LABEL: @test_cvt_scalef32_pk32_bf16_fpbf6(
964// CHECK-NEXT:  entry:
965// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
966// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <6 x i32>, align 32, addrspace(5)
967// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
968// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
969// CHECK-NEXT:    store <6 x i32> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 32
970// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
971// CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
972// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
973// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> [[TMP0]], float [[TMP1]])
974// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
975// CHECK-NEXT:    store <32 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 64
976// CHECK-NEXT:    [[TMP4:%.*]] = load <6 x i32>, ptr addrspace(5) [[SRC_ADDR]], align 32
977// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
978// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> [[TMP4]], float [[TMP5]])
979// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
980// CHECK-NEXT:    store <32 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 64
981// CHECK-NEXT:    ret void
982//
983void test_cvt_scalef32_pk32_bf16_fpbf6(global bfloat32 *out, uint6 src, float scale)
984{
985  *out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(src, scale);
986  *out = __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(src, scale);
987}
988
989// CHECK-LABEL: @test_cvt_scalef32_pk_f16_fp8(
990// CHECK-NEXT:  entry:
991// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
992// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
993// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
994// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
995// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
996// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
997// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
998// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
999// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
1000// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1001// CHECK-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1002// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1003// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1004// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
1005// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1006// CHECK-NEXT:    store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1007// CHECK-NEXT:    ret void
1008//
1009void test_cvt_scalef32_pk_f16_fp8(global half2* out, unsigned int src, float scale)
1010{
1011  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, true);
1012  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(src, scale, false);
1013}
1014
1015// CHECK-LABEL: @test_cvt_scalef32_pk_f16_bf8(
1016// CHECK-NEXT:  entry:
1017// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1018// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1019// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1020// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1021// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1022// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1023// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1024// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1025// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
1026// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1027// CHECK-NEXT:    store <2 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1028// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1029// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1030// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
1031// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1032// CHECK-NEXT:    store <2 x half> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1033// CHECK-NEXT:    ret void
1034//
1035void test_cvt_scalef32_pk_f16_bf8(global half2* out, unsigned int src, float scale)
1036{
1037  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, true);
1038  *out = __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(src, scale, false);
1039}
1040
1041// CHECK-LABEL: @test_cvt_scalef32_pk_bf16_fp8(
1042// CHECK-NEXT:  entry:
1043// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1044// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1045// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1046// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1047// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1048// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1049// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1050// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1051// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP0]], float [[TMP1]], i1 true)
1052// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1053// CHECK-NEXT:    store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1054// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1055// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1056// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 [[TMP4]], float [[TMP5]], i1 false)
1057// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1058// CHECK-NEXT:    store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1059// CHECK-NEXT:    ret void
1060//
1061void test_cvt_scalef32_pk_bf16_fp8(global bfloat2* out, unsigned int src, float scale)
1062{
1063  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(src, scale, true);
1064  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(src, scale, false);
1065}
1066
1067// CHECK-LABEL: @test_cvt_scalef32_pk_bf16_bf8(
1068// CHECK-NEXT:  entry:
1069// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1070// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1071// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1072// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1073// CHECK-NEXT:    store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1074// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1075// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1076// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1077// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP0]], float [[TMP1]], i1 true)
1078// CHECK-NEXT:    [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1079// CHECK-NEXT:    store <2 x bfloat> [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
1080// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
1081// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1082// CHECK-NEXT:    [[TMP6:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 [[TMP4]], float [[TMP5]], i1 false)
1083// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1084// CHECK-NEXT:    store <2 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
1085// CHECK-NEXT:    ret void
1086//
1087void test_cvt_scalef32_pk_bf16_bf8(global bfloat2* out, unsigned int src, float scale)
1088{
1089  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, true);
1090  *out = __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(src, scale, false);
1091}
1092
1093// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_f16(
1094// CHECK-NEXT:  entry:
1095// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1096// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
1097// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1098// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1099// CHECK-NEXT:    store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1100// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1101// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1102// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1103// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1104// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1105// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i32 0)
1106// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1107// CHECK-NEXT:    store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1108// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1109// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
1110// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1111// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1112// CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i32 1)
1113// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1114// CHECK-NEXT:    store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1115// CHECK-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1116// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
1117// CHECK-NEXT:    [[TMP14:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1118// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1119// CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP13]], <2 x half> [[TMP14]], float [[TMP15]], i32 2)
1120// CHECK-NEXT:    [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1121// CHECK-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
1122// CHECK-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1123// CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
1124// CHECK-NEXT:    [[TMP20:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1125// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1126// CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 [[TMP19]], <2 x half> [[TMP20]], float [[TMP21]], i32 3)
1127// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1128// CHECK-NEXT:    store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
1129// CHECK-NEXT:    ret void
1130//
1131void test_cvt_scalef32_pk_fp4_f16(global unsigned int* out, half2 src, float scale)
1132{
1133  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 0);
1134  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 1);
1135  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 2);
1136  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_f16(*out, src, scale, 3);
1137}
1138
1139// CHECK-LABEL: @test_cvt_scalef32_pk_fp4_bf16(
1140// CHECK-NEXT:  entry:
1141// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1142// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
1143// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1144// CHECK-NEXT:    [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1145// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1146// CHECK-NEXT:    store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1147// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1148// CHECK-NEXT:    store i32 [[OLD:%.*]], ptr addrspace(5) [[OLD_ADDR]], align 4
1149// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1150// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1151// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1152// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1153// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i32 0)
1154// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1155// CHECK-NEXT:    store i32 [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1156// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1157// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
1158// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1159// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1160// CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i32 1)
1161// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1162// CHECK-NEXT:    store i32 [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1163// CHECK-NEXT:    [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1164// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4
1165// CHECK-NEXT:    [[TMP14:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1166// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1167// CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP13]], <2 x bfloat> [[TMP14]], float [[TMP15]], i32 2)
1168// CHECK-NEXT:    [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1169// CHECK-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
1170// CHECK-NEXT:    [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1171// CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr addrspace(1) [[TMP18]], align 4
1172// CHECK-NEXT:    [[TMP20:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1173// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1174// CHECK-NEXT:    [[TMP22:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 [[TMP19]], <2 x bfloat> [[TMP20]], float [[TMP21]], i32 3)
1175// CHECK-NEXT:    [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1176// CHECK-NEXT:    store i32 [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
1177// CHECK-NEXT:    ret void
1178//
1179void test_cvt_scalef32_pk_fp4_bf16(global unsigned int* out, bfloat2 src, float scale, uint old)
1180{
1181  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 0);
1182  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 1);
1183  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 2);
1184  *out = __builtin_amdgcn_cvt_scalef32_pk_fp4_bf16(*out, src, scale, 3);
1185}
1186
1187// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f16(
1188// CHECK-NEXT:  entry:
1189// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1190// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
1191// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1192// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1193// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1194// CHECK-NEXT:    store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1195// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1196// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1197// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1198// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1199// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1200// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1201// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1202// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP1]], <2 x half> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1203// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1204// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1205// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1206// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1207// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1208// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1209// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1210// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP8]], <2 x half> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1211// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1212// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1213// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1214// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1215// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1216// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1217// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1218// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP15]], <2 x half> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1219// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1220// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1221// CHECK-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1222// CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
1223// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
1224// CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1225// CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1226// CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 [[TMP22]], <2 x half> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
1227// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1228// CHECK-NEXT:    store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
1229// CHECK-NEXT:    ret void
1230//
1231void test_cvt_scalef32_sr_pk_fp4_f16(global unsigned *out, half2 src, uint seed, float scale)
1232{
1233  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 0);
1234  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 1);
1235  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 2);
1236  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f16(*out, src, seed, scale, 3);
1237}
1238
1239// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_bf16(
1240// CHECK-NEXT:  entry:
1241// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1242// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
1243// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1244// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1245// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1246// CHECK-NEXT:    store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1247// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1248// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1249// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1250// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1251// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1252// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1253// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1254// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP1]], <2 x bfloat> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1255// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1256// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1257// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1258// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1259// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1260// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1261// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1262// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP8]], <2 x bfloat> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1263// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1264// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1265// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1266// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1267// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1268// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1269// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1270// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP15]], <2 x bfloat> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1271// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1272// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1273// CHECK-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1274// CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
1275// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
1276// CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1277// CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1278// CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 [[TMP22]], <2 x bfloat> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
1279// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1280// CHECK-NEXT:    store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
1281// CHECK-NEXT:    ret void
1282//
1283void test_cvt_scalef32_sr_pk_fp4_bf16(global unsigned *out, bfloat2 src, uint seed, float scale)
1284{
1285  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 0);
1286  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 1);
1287  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 2);
1288  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_bf16(*out, src, seed, scale, 3);
1289}
1290
1291// CHECK-LABEL: @test_cvt_scalef32_sr_pk_fp4_f32(
1292// CHECK-NEXT:  entry:
1293// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1294// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca <2 x float>, align 8, addrspace(5)
1295// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1296// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1297// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1298// CHECK-NEXT:    store <2 x float> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1299// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1300// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1301// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1302// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1303// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
1304// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1305// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1306// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP1]], <2 x float> [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1307// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1308// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1309// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1310// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1311// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
1312// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1313// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1314// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP8]], <2 x float> [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1315// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1316// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1317// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1318// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1319// CHECK-NEXT:    [[TMP16:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
1320// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1321// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1322// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP15]], <2 x float> [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1323// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1324// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1325// CHECK-NEXT:    [[TMP21:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1326// CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr addrspace(1) [[TMP21]], align 4
1327// CHECK-NEXT:    [[TMP23:%.*]] = load <2 x float>, ptr addrspace(5) [[SRC_ADDR]], align 8
1328// CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1329// CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1330// CHECK-NEXT:    [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 [[TMP22]], <2 x float> [[TMP23]], i32 [[TMP24]], float [[TMP25]], i32 3)
1331// CHECK-NEXT:    [[TMP27:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1332// CHECK-NEXT:    store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
1333// CHECK-NEXT:    ret void
1334//
1335void test_cvt_scalef32_sr_pk_fp4_f32(global unsigned *out, float2 src, uint seed, float scale)
1336{
1337  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 0);
1338  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 1);
1339  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 2);
1340  *out = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(*out, src, seed, scale, 3);
1341}
1342
1343// CHECK-LABEL: @test_cvt_scalef32_sr_pk32(
1344// CHECK-NEXT:  entry:
1345// CHECK-NEXT:    [[OUT6_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1346// CHECK-NEXT:    [[SRCBF32_ADDR:%.*]] = alloca <32 x bfloat>, align 64, addrspace(5)
1347// CHECK-NEXT:    [[SRCH32_ADDR:%.*]] = alloca <32 x half>, align 64, addrspace(5)
1348// CHECK-NEXT:    [[SRCF32_ADDR:%.*]] = alloca <32 x float>, align 128, addrspace(5)
1349// CHECK-NEXT:    [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1350// CHECK-NEXT:    [[SRC2_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1351// CHECK-NEXT:    store ptr addrspace(1) [[OUT6:%.*]], ptr addrspace(5) [[OUT6_ADDR]], align 8
1352// CHECK-NEXT:    store <32 x bfloat> [[SRCBF32:%.*]], ptr addrspace(5) [[SRCBF32_ADDR]], align 64
1353// CHECK-NEXT:    store <32 x half> [[SRCH32:%.*]], ptr addrspace(5) [[SRCH32_ADDR]], align 64
1354// CHECK-NEXT:    store <32 x float> [[SRCF32:%.*]], ptr addrspace(5) [[SRCF32_ADDR]], align 128
1355// CHECK-NEXT:    store i32 [[SRC1:%.*]], ptr addrspace(5) [[SRC1_ADDR]], align 4
1356// CHECK-NEXT:    store float [[SRC2:%.*]], ptr addrspace(5) [[SRC2_ADDR]], align 4
1357// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
1358// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
1359// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
1360// CHECK-NEXT:    [[TMP3:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> [[TMP0]], i32 [[TMP1]], float [[TMP2]])
1361// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
1362// CHECK-NEXT:    store <6 x i32> [[TMP3]], ptr addrspace(1) [[TMP4]], align 32
1363// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
1364// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
1365// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
1366// CHECK-NEXT:    [[TMP8:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> [[TMP5]], i32 [[TMP6]], float [[TMP7]])
1367// CHECK-NEXT:    [[TMP9:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
1368// CHECK-NEXT:    store <6 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 32
1369// CHECK-NEXT:    [[TMP10:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
1370// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
1371// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
1372// CHECK-NEXT:    [[TMP13:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> [[TMP10]], i32 [[TMP11]], float [[TMP12]])
1373// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
1374// CHECK-NEXT:    store <6 x i32> [[TMP13]], ptr addrspace(1) [[TMP14]], align 32
1375// CHECK-NEXT:    [[TMP15:%.*]] = load <32 x bfloat>, ptr addrspace(5) [[SRCBF32_ADDR]], align 64
1376// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
1377// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
1378// CHECK-NEXT:    [[TMP18:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> [[TMP15]], i32 [[TMP16]], float [[TMP17]])
1379// CHECK-NEXT:    [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
1380// CHECK-NEXT:    store <6 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 32
1381// CHECK-NEXT:    [[TMP20:%.*]] = load <32 x half>, ptr addrspace(5) [[SRCH32_ADDR]], align 64
1382// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
1383// CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
1384// CHECK-NEXT:    [[TMP23:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> [[TMP20]], i32 [[TMP21]], float [[TMP22]])
1385// CHECK-NEXT:    [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
1386// CHECK-NEXT:    store <6 x i32> [[TMP23]], ptr addrspace(1) [[TMP24]], align 32
1387// CHECK-NEXT:    [[TMP25:%.*]] = load <32 x float>, ptr addrspace(5) [[SRCF32_ADDR]], align 128
1388// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4
1389// CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr addrspace(5) [[SRC2_ADDR]], align 4
1390// CHECK-NEXT:    [[TMP28:%.*]] = call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> [[TMP25]], i32 [[TMP26]], float [[TMP27]])
1391// CHECK-NEXT:    [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT6_ADDR]], align 8
1392// CHECK-NEXT:    store <6 x i32> [[TMP28]], ptr addrspace(1) [[TMP29]], align 32
1393// CHECK-NEXT:    ret void
1394//
1395void test_cvt_scalef32_sr_pk32(global uint6 *out6, bfloat32 srcbf32, half32 srch32, float32 srcf32, unsigned src1, float src2)
1396{
1397  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(srcbf32, src1, src2);
1398  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(srch32, src1, src2);
1399  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(srcf32, src1, src2);
1400  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(srcbf32, src1, src2);
1401  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(srch32, src1, src2);
1402  *out6 = __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(srcf32, src1, src2);
1403}
1404
1405// CHECK-LABEL: @test_cvt_scalef32_sr_bf8_bf16(
1406// CHECK-NEXT:  entry:
1407// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1408// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
1409// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1410// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1411// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1412// CHECK-NEXT:    store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
1413// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1414// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1415// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1416// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1417// CHECK-NEXT:    [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
1418// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1419// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1420// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1421// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1422// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1423// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1424// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1425// CHECK-NEXT:    [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
1426// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1427// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1428// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1429// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1430// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1431// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1432// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1433// CHECK-NEXT:    [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
1434// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1435// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1436// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1437// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1438// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1439// CHECK-NEXT:    ret void
1440//
1441void test_cvt_scalef32_sr_bf8_bf16(global unsigned *out, __bf16 src, uint seed, float scale)
1442{
1443  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 0);
1444  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 1);
1445  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_bf16(*out, src, seed, scale, 2);
1446}
1447
1448// CHECK-LABEL: @test_cvt_scalef32_sr_bf8_f16(
1449// CHECK-NEXT:  entry:
1450// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1451// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5)
1452// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1453// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1454// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1455// CHECK-NEXT:    store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
1456// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1457// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1458// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1459// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1460// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
1461// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1462// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1463// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1464// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1465// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1466// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1467// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1468// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
1469// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1470// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1471// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1472// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1473// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1474// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1475// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1476// CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
1477// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1478// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1479// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1480// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1481// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1482// CHECK-NEXT:    ret void
1483//
1484void test_cvt_scalef32_sr_bf8_f16(global unsigned *out, half src, uint seed, float scale)
1485{
1486  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 0);
1487  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 1);
1488  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f16(*out, src, seed, scale, 2);
1489}
1490
1491// CHECK-LABEL: @test_cvt_scalef32_sr_bf8_f32(
1492// CHECK-NEXT:  entry:
1493// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1494// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1495// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1496// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1497// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1498// CHECK-NEXT:    store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1499// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1500// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1501// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1502// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1503// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1504// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1505// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1506// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1507// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1508// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1509// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1510// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1511// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1512// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1513// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1514// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1515// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1516// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1517// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1518// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1519// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1520// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1521// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1522// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1523// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1524// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1525// CHECK-NEXT:    ret void
1526//
1527void test_cvt_scalef32_sr_bf8_f32(global unsigned *out, float src, uint seed, float scale)
1528{
1529  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 0);
1530  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 1);
1531  *out = __builtin_amdgcn_cvt_scalef32_sr_bf8_f32(*out, src, seed, scale, 2);
1532}
1533
1534// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_bf16(
1535// CHECK-NEXT:  entry:
1536// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1537// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
1538// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1539// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1540// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1541// CHECK-NEXT:    store bfloat [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
1542// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1543// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1544// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1545// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1546// CHECK-NEXT:    [[TMP2:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
1547// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1548// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1549// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP1]], bfloat [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1550// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1551// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1552// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1553// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1554// CHECK-NEXT:    [[TMP9:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
1555// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1556// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1557// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP8]], bfloat [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1558// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1559// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1560// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1561// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1562// CHECK-NEXT:    [[TMP16:%.*]] = load bfloat, ptr addrspace(5) [[SRC_ADDR]], align 2
1563// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1564// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1565// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.bf16(i32 [[TMP15]], bfloat [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1566// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1567// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1568// CHECK-NEXT:    ret void
1569//
1570void test_cvt_scalef32_sr_fp8_bf16(global unsigned *out, __bf16 src, uint seed, float scale)
1571{
1572  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 0);
1573  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 1);
1574  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_bf16(*out, src, seed, scale, 2);
1575}
1576
1577// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f16(
1578// CHECK-NEXT:  entry:
1579// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1580// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca half, align 2, addrspace(5)
1581// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1582// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1583// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1584// CHECK-NEXT:    store half [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 2
1585// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1586// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1587// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1588// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1589// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
1590// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1591// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1592// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP1]], half [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1593// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1594// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1595// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1596// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1597// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
1598// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1599// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1600// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP8]], half [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1601// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1602// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1603// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1604// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1605// CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr addrspace(5) [[SRC_ADDR]], align 2
1606// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1607// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1608// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f16(i32 [[TMP15]], half [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1609// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1610// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1611// CHECK-NEXT:    ret void
1612//
1613void test_cvt_scalef32_sr_fp8_f16(global unsigned *out, half src, uint seed, float scale)
1614{
1615  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 0);
1616  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 1);
1617  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f16(*out, src, seed, scale, 2);
1618}
1619
1620// CHECK-LABEL: @test_cvt_scalef32_sr_fp8_f32(
1621// CHECK-NEXT:  entry:
1622// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1623// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1624// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1625// CHECK-NEXT:    [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1626// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1627// CHECK-NEXT:    store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1628// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1629// CHECK-NEXT:    store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
1630// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1631// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP0]], align 4
1632// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1633// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1634// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1635// CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP1]], float [[TMP2]], i32 [[TMP3]], float [[TMP4]], i32 0)
1636// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1637// CHECK-NEXT:    store i32 [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
1638// CHECK-NEXT:    [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1639// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
1640// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1641// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1642// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1643// CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP8]], float [[TMP9]], i32 [[TMP10]], float [[TMP11]], i32 1)
1644// CHECK-NEXT:    [[TMP13:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1645// CHECK-NEXT:    store i32 [[TMP12]], ptr addrspace(1) [[TMP13]], align 4
1646// CHECK-NEXT:    [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1647// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr addrspace(1) [[TMP14]], align 4
1648// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1649// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1650// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
1651// CHECK-NEXT:    [[TMP19:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.fp8.f32(i32 [[TMP15]], float [[TMP16]], i32 [[TMP17]], float [[TMP18]], i32 2)
1652// CHECK-NEXT:    [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1653// CHECK-NEXT:    store i32 [[TMP19]], ptr addrspace(1) [[TMP20]], align 4
1654// CHECK-NEXT:    ret void
1655//
1656void test_cvt_scalef32_sr_fp8_f32(global unsigned *out, float src, uint seed, float scale)
1657{
1658  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 0);
1659  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 1);
1660  *out = __builtin_amdgcn_cvt_scalef32_sr_fp8_f32(*out, src, seed, scale, 2);
1661}
1662
1663// CHECK-LABEL: @test_bitop3_b32(
1664// CHECK-NEXT:  entry:
1665// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1666// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1667// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1668// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1669// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1670// CHECK-NEXT:    store i32 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 4
1671// CHECK-NEXT:    store i32 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 4
1672// CHECK-NEXT:    store i32 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 4
1673// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A_ADDR]], align 4
1674// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(5) [[B_ADDR]], align 4
1675// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(5) [[C_ADDR]], align 4
1676// CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.bitop3.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i32 1)
1677// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1678// CHECK-NEXT:    store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
1679// CHECK-NEXT:    ret void
1680//
1681void test_bitop3_b32(global uint* out, uint a, uint b, uint c)
1682{
1683  *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1);
1684}
1685
1686// CHECK-LABEL: @test_bitop3_b16(
1687// CHECK-NEXT:  entry:
1688// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1689// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
1690// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
1691// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca i16, align 2, addrspace(5)
1692// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1693// CHECK-NEXT:    store i16 [[A:%.*]], ptr addrspace(5) [[A_ADDR]], align 2
1694// CHECK-NEXT:    store i16 [[B:%.*]], ptr addrspace(5) [[B_ADDR]], align 2
1695// CHECK-NEXT:    store i16 [[C:%.*]], ptr addrspace(5) [[C_ADDR]], align 2
1696// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(5) [[A_ADDR]], align 2
1697// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(5) [[B_ADDR]], align 2
1698// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(5) [[C_ADDR]], align 2
1699// CHECK-NEXT:    [[TMP3:%.*]] = call i16 @llvm.amdgcn.bitop3.i16(i16 [[TMP0]], i16 [[TMP1]], i16 [[TMP2]], i32 1)
1700// CHECK-NEXT:    [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1701// CHECK-NEXT:    store i16 [[TMP3]], ptr addrspace(1) [[TMP4]], align 2
1702// CHECK-NEXT:    ret void
1703//
1704void test_bitop3_b16(global ushort* out, ushort a, ushort b, ushort c)
1705{
1706  *out = __builtin_amdgcn_bitop3_b16(a, b, c, 1);
1707}
1708
1709// CHECK-LABEL: @test_cvt_sr_bf16_f32(
1710// CHECK-NEXT:  entry:
1711// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1712// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1713// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1714// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1715// CHECK-NEXT:    store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1716// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1717// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1718// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP0]], align 4
1719// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1720// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1721// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
1722// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1723// CHECK-NEXT:    store <2 x bfloat> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1724// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1725// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[TMP6]], align 4
1726// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1727// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1728// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
1729// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1730// CHECK-NEXT:    store <2 x bfloat> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1731// CHECK-NEXT:    ret void
1732//
1733void test_cvt_sr_bf16_f32(global bfloat2 *out, float src, uint seed)
1734{
1735  *out = __builtin_amdgcn_cvt_sr_bf16_f32(*out, src, seed, 0);
1736  *out = __builtin_amdgcn_cvt_sr_bf16_f32(*out, src, seed, 1);
1737}
1738
1739// CHECK-LABEL: @test_cvt_sr_f16_f32(
1740// CHECK-NEXT:  entry:
1741// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1742// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
1743// CHECK-NEXT:    [[SEED_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
1744// CHECK-NEXT:    store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
1745// CHECK-NEXT:    store float [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
1746// CHECK-NEXT:    store i32 [[SEED:%.*]], ptr addrspace(5) [[SEED_ADDR]], align 4
1747// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1748// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
1749// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1750// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1751// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP1]], float [[TMP2]], i32 [[TMP3]], i1 false)
1752// CHECK-NEXT:    [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1753// CHECK-NEXT:    store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
1754// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1755// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
1756// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr addrspace(5) [[SRC_ADDR]], align 4
1757// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr addrspace(5) [[SEED_ADDR]], align 4
1758// CHECK-NEXT:    [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half> [[TMP7]], float [[TMP8]], i32 [[TMP9]], i1 true)
1759// CHECK-NEXT:    [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
1760// CHECK-NEXT:    store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
1761// CHECK-NEXT:    ret void
1762//
1763void test_cvt_sr_f16_f32(global half2 *out, float src, uint seed)
1764{
1765  *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 0);
1766  *out = __builtin_amdgcn_cvt_sr_f16_f32(*out, src, seed, 1);
1767}
1768
1769// CHECK-LABEL: @test_global_load_lds_96(
1770// CHECK-NEXT:  entry:
1771// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1772// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
1773// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1774// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
1775// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
1776// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1777// CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 12, i32 0, i32 0)
1778// CHECK-NEXT:    ret void
1779//
1780void test_global_load_lds_96(global void* src, local void *dst) {
1781  __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0);
1782}
1783
1784// CHECK-LABEL: @test_global_load_lds_128(
1785// CHECK-NEXT:  entry:
1786// CHECK-NEXT:    [[SRC_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
1787// CHECK-NEXT:    [[DST_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5)
1788// CHECK-NEXT:    store ptr addrspace(1) [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 8
1789// CHECK-NEXT:    store ptr addrspace(3) [[DST:%.*]], ptr addrspace(5) [[DST_ADDR]], align 4
1790// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[SRC_ADDR]], align 8
1791// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[DST_ADDR]], align 4
1792// CHECK-NEXT:    call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) [[TMP0]], ptr addrspace(3) [[TMP1]], i32 16, i32 0, i32 0)
1793// CHECK-NEXT:    ret void
1794//
1795void test_global_load_lds_128(global void* src, local void *dst) {
1796  __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
1797}
1798