xref: /llvm-project/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
5
6; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
7
; Scalar-input case: converts the i64 kernel argument %in to half with uitofp
; and stores the result to %out.
; For all three check prefixes the expected code counts the leading zeros of
; the high dword (clamped to 32), shifts the 64-bit value left by that amount,
; reduces the low dword to 0 or 1 and ORs it into the high dword, converts that
; dword with v_cvt_f32_u32, rescales with v_ldexp_f32 by (32 - shift amount)
; and finally narrows the float with v_cvt_f16_f32.
8define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 {
9; GFX6-LABEL: s_uint_to_fp_i64_to_f16:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13; GFX6-NEXT:    s_mov_b32 s6, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    s_mov_b32 s4, s0
16; GFX6-NEXT:    s_mov_b32 s5, s1
17; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
18; GFX6-NEXT:    s_min_u32 s8, s0, 32
19; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
20; GFX6-NEXT:    s_min_u32 s0, s0, 1
21; GFX6-NEXT:    s_or_b32 s0, s1, s0
22; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
23; GFX6-NEXT:    s_sub_i32 s0, 32, s8
24; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
25; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
26; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
27; GFX6-NEXT:    s_endpgm
28;
29; GFX8-LABEL: s_uint_to_fp_i64_to_f16:
30; GFX8:       ; %bb.0:
31; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
32; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX8-NEXT:    s_flbit_i32_b32 s4, s3
34; GFX8-NEXT:    s_min_u32 s4, s4, 32
35; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
36; GFX8-NEXT:    s_min_u32 s2, s2, 1
37; GFX8-NEXT:    s_or_b32 s2, s3, s2
38; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
39; GFX8-NEXT:    s_sub_i32 s2, 32, s4
40; GFX8-NEXT:    v_mov_b32_e32 v1, s1
41; GFX8-NEXT:    v_ldexp_f32 v0, v0, s2
42; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v0
43; GFX8-NEXT:    v_mov_b32_e32 v0, s0
44; GFX8-NEXT:    flat_store_short v[0:1], v2
45; GFX8-NEXT:    s_endpgm
46;
47; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
48; GFX11:       ; %bb.0:
49; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
50; GFX11-NEXT:    v_mov_b32_e32 v1, 0
51; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX11-NEXT:    s_clz_i32_u32 s4, s3
53; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
54; GFX11-NEXT:    s_min_u32 s4, s4, 32
55; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
56; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
57; GFX11-NEXT:    s_min_u32 s2, s2, 1
58; GFX11-NEXT:    s_or_b32 s2, s3, s2
59; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
60; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
61; GFX11-NEXT:    s_sub_i32 s2, 32, s4
62; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
63; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
64; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
65; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
66; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
67; GFX11-NEXT:    s_endpgm
68  %result = uitofp i64 %in to half
69  store half %result, ptr addrspace(1) %out
70  ret void
71}
72
; Per-workitem case: each workitem loads the i64 at %in[workitem.id.x],
; converts it to half with uitofp and stores it to %out[workitem.id.x].
; Because the value comes from a load, the expected code performs the
; normalisation with vector instructions (leading-zero count of the high
; dword clamped to 32, 64-bit left shift, low dword reduced to 0 or 1 and ORed
; into the high dword, v_cvt_f32_u32, v_ldexp_f32 by 32 - shift amount), then
; narrows with v_cvt_f16_f32.
73define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
74; GFX6-LABEL: v_uint_to_fp_i64_to_f16:
75; GFX6:       ; %bb.0:
76; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
77; GFX6-NEXT:    s_mov_b32 s7, 0xf000
78; GFX6-NEXT:    s_mov_b32 s6, 0
79; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
80; GFX6-NEXT:    v_mov_b32_e32 v2, 0
81; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
83; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
84; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
85; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
86; GFX6-NEXT:    s_waitcnt vmcnt(0)
87; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
88; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
89; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
90; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
91; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
92; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
93; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
94; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
95; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
96; GFX6-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
97; GFX6-NEXT:    s_endpgm
98;
99; GFX8-LABEL: v_uint_to_fp_i64_to_f16:
100; GFX8:       ; %bb.0:
101; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
102; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
103; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
104; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
105; GFX8-NEXT:    v_mov_b32_e32 v2, s3
106; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
107; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
108; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
109; GFX8-NEXT:    s_waitcnt vmcnt(0)
110; GFX8-NEXT:    v_ffbh_u32_e32 v3, v2
111; GFX8-NEXT:    v_min_u32_e32 v3, 32, v3
112; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v3, v[1:2]
113; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v3
114; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
115; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
116; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
117; GFX8-NEXT:    v_mov_b32_e32 v2, s1
118; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
119; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
120; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v1
121; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
122; GFX8-NEXT:    flat_store_short v[0:1], v3
123; GFX8-NEXT:    s_endpgm
124;
125; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
126; GFX11:       ; %bb.0:
127; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
128; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
130; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
131; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
132; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
133; GFX11-NEXT:    s_waitcnt vmcnt(0)
134; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v1
135; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
136; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
137; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
138; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
139; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
140; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
141; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
142; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
143; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
144; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
145; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
146; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
147; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
148; GFX11-NEXT:    s_endpgm
149  %tid = call i32 @llvm.amdgcn.workitem.id.x()
150  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
151  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
152  %val = load i64, ptr addrspace(1) %in.gep
153  %result = uitofp i64 %val to half
154  store half %result, ptr addrspace(1) %out.gep
155  ret void
156}
157
; Scalar-input case: converts the i64 kernel argument %in to float with uitofp
; and stores the result to %out.
; The expected code counts the leading zeros of the high dword (clamped to
; 32), shifts the 64-bit value left by that amount, reduces the low dword to
; 0 or 1 and ORs it into the high dword, converts that dword with
; v_cvt_f32_u32 and rescales with v_ldexp_f32 by (32 - shift amount). No
; further narrowing follows; the ldexp result is stored as a dword.
158define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 {
159; GFX6-LABEL: s_uint_to_fp_i64_to_f32:
160; GFX6:       ; %bb.0:
161; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
162; GFX6-NEXT:    s_mov_b32 s7, 0xf000
163; GFX6-NEXT:    s_mov_b32 s6, -1
164; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX6-NEXT:    s_mov_b32 s4, s0
166; GFX6-NEXT:    s_mov_b32 s5, s1
167; GFX6-NEXT:    s_flbit_i32_b32 s0, s3
168; GFX6-NEXT:    s_min_u32 s8, s0, 32
169; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
170; GFX6-NEXT:    s_min_u32 s0, s0, 1
171; GFX6-NEXT:    s_or_b32 s0, s1, s0
172; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s0
173; GFX6-NEXT:    s_sub_i32 s0, 32, s8
174; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
175; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
176; GFX6-NEXT:    s_endpgm
177;
178; GFX8-LABEL: s_uint_to_fp_i64_to_f32:
179; GFX8:       ; %bb.0:
180; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
181; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX8-NEXT:    s_flbit_i32_b32 s4, s3
183; GFX8-NEXT:    s_min_u32 s4, s4, 32
184; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
185; GFX8-NEXT:    s_min_u32 s2, s2, 1
186; GFX8-NEXT:    s_or_b32 s2, s3, s2
187; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s2
188; GFX8-NEXT:    v_mov_b32_e32 v0, s0
189; GFX8-NEXT:    s_sub_i32 s0, 32, s4
190; GFX8-NEXT:    v_mov_b32_e32 v1, s1
191; GFX8-NEXT:    v_ldexp_f32 v2, v2, s0
192; GFX8-NEXT:    flat_store_dword v[0:1], v2
193; GFX8-NEXT:    s_endpgm
194;
195; GFX11-LABEL: s_uint_to_fp_i64_to_f32:
196; GFX11:       ; %bb.0:
197; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
198; GFX11-NEXT:    v_mov_b32_e32 v1, 0
199; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
200; GFX11-NEXT:    s_clz_i32_u32 s4, s3
201; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
202; GFX11-NEXT:    s_min_u32 s4, s4, 32
203; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
204; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
205; GFX11-NEXT:    s_min_u32 s2, s2, 1
206; GFX11-NEXT:    s_or_b32 s2, s3, s2
207; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
208; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
209; GFX11-NEXT:    s_sub_i32 s2, 32, s4
210; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
211; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
212; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
213; GFX11-NEXT:    s_endpgm
214  %result = uitofp i64 %in to float
215  store float %result, ptr addrspace(1) %out
216  ret void
217}
218
; Per-workitem case: each workitem loads the i64 at %in[workitem.id.x],
; converts it to float with uitofp and stores it to %out[workitem.id.x].
; The expected code uses vector instructions for the normalisation
; (leading-zero count of the high dword clamped to 32, 64-bit left shift, low
; dword reduced to 0 or 1 and ORed into the high dword, v_cvt_f32_u32,
; v_ldexp_f32 by 32 - shift amount) and stores the ldexp result as a dword.
219define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
220; GFX6-LABEL: v_uint_to_fp_i64_to_f32:
221; GFX6:       ; %bb.0:
222; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
223; GFX6-NEXT:    s_mov_b32 s7, 0xf000
224; GFX6-NEXT:    s_mov_b32 s6, 0
225; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
226; GFX6-NEXT:    v_mov_b32_e32 v2, 0
227; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
228; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
229; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
230; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
231; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
232; GFX6-NEXT:    s_waitcnt vmcnt(0)
233; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
234; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
235; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
236; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
237; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
238; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
239; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
240; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
241; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
242; GFX6-NEXT:    s_endpgm
243;
244; GFX8-LABEL: v_uint_to_fp_i64_to_f32:
245; GFX8:       ; %bb.0:
246; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
247; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
248; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
249; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX8-NEXT:    v_mov_b32_e32 v2, s3
251; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
252; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
253; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
254; GFX8-NEXT:    s_waitcnt vmcnt(0)
255; GFX8-NEXT:    v_ffbh_u32_e32 v0, v2
256; GFX8-NEXT:    v_min_u32_e32 v4, 32, v0
257; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[1:2]
258; GFX8-NEXT:    v_mov_b32_e32 v2, s1
259; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
260; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
261; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v0
262; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
263; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
264; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
265; GFX8-NEXT:    v_ldexp_f32 v2, v5, v2
266; GFX8-NEXT:    flat_store_dword v[0:1], v2
267; GFX8-NEXT:    s_endpgm
268;
269; GFX11-LABEL: v_uint_to_fp_i64_to_f32:
270; GFX11:       ; %bb.0:
271; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
272; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
273; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
274; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
275; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
276; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
278; GFX11-NEXT:    s_waitcnt vmcnt(0)
279; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v1
280; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
281; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
282; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
283; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
284; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
285; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
286; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
287; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
288; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
289; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
290; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
291; GFX11-NEXT:    s_endpgm
292  %tid = call i32 @llvm.amdgcn.workitem.id.x()
293  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
294  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
295  %val = load i64, ptr addrspace(1) %in.gep
296  %result = uitofp i64 %val to float
297  store float %result, ptr addrspace(1) %out.gep
298  ret void
299}
300
; Scalar-input vector case: converts the <2 x i64> kernel argument %in to
; <2 x float> with uitofp and stores the result to %out.
; The expected code applies the i64 -> f32 sequence (leading-zero count,
; clamp to 32, 64-bit shift, low dword reduced to 0 or 1 and ORed into the
; high dword, v_cvt_f32_u32, v_ldexp_f32) to each of the two elements, with
; the two sequences interleaved, and writes both floats with a single 64-bit
; store.
301define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{
302; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f32:
303; GFX6:       ; %bb.0:
304; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
305; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
306; GFX6-NEXT:    s_mov_b32 s7, 0xf000
307; GFX6-NEXT:    s_mov_b32 s6, -1
308; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
309; GFX6-NEXT:    s_flbit_i32_b32 s8, s3
310; GFX6-NEXT:    s_flbit_i32_b32 s9, s1
311; GFX6-NEXT:    s_min_u32 s8, s8, 32
312; GFX6-NEXT:    s_min_u32 s9, s9, 32
313; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s8
314; GFX6-NEXT:    s_sub_i32 s8, 32, s8
315; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], s9
316; GFX6-NEXT:    s_sub_i32 s9, 32, s9
317; GFX6-NEXT:    s_min_u32 s2, s2, 1
318; GFX6-NEXT:    s_min_u32 s0, s0, 1
319; GFX6-NEXT:    s_or_b32 s2, s3, s2
320; GFX6-NEXT:    s_or_b32 s0, s1, s0
321; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
322; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s0
323; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s8
324; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s9
325; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
326; GFX6-NEXT:    s_endpgm
327;
328; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
329; GFX8:       ; %bb.0:
330; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
331; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
332; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX8-NEXT:    s_flbit_i32_b32 s6, s3
334; GFX8-NEXT:    s_flbit_i32_b32 s7, s1
335; GFX8-NEXT:    s_min_u32 s6, s6, 32
336; GFX8-NEXT:    s_min_u32 s7, s7, 32
337; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
338; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
339; GFX8-NEXT:    s_min_u32 s2, s2, 1
340; GFX8-NEXT:    s_or_b32 s2, s3, s2
341; GFX8-NEXT:    s_min_u32 s0, s0, 1
342; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
343; GFX8-NEXT:    s_or_b32 s0, s1, s0
344; GFX8-NEXT:    v_cvt_f32_u32_e32 v2, s0
345; GFX8-NEXT:    s_sub_i32 s0, 32, s6
346; GFX8-NEXT:    v_ldexp_f32 v1, v0, s0
347; GFX8-NEXT:    s_sub_i32 s0, 32, s7
348; GFX8-NEXT:    v_ldexp_f32 v0, v2, s0
349; GFX8-NEXT:    v_mov_b32_e32 v2, s4
350; GFX8-NEXT:    v_mov_b32_e32 v3, s5
351; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
352; GFX8-NEXT:    s_endpgm
353;
354; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32:
355; GFX11:       ; %bb.0:
356; GFX11-NEXT:    s_clause 0x1
357; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
358; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
359; GFX11-NEXT:    v_mov_b32_e32 v3, 0
360; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX11-NEXT:    s_clz_i32_u32 s6, s3
362; GFX11-NEXT:    s_clz_i32_u32 s7, s1
363; GFX11-NEXT:    s_min_u32 s6, s6, 32
364; GFX11-NEXT:    s_min_u32 s7, s7, 32
365; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
366; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
367; GFX11-NEXT:    s_min_u32 s2, s2, 1
368; GFX11-NEXT:    s_min_u32 s0, s0, 1
369; GFX11-NEXT:    s_or_b32 s2, s3, s2
370; GFX11-NEXT:    s_or_b32 s0, s1, s0
371; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
372; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, s0
373; GFX11-NEXT:    s_sub_i32 s0, 32, s6
374; GFX11-NEXT:    s_sub_i32 s1, 32, s7
375; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
376; GFX11-NEXT:    v_ldexp_f32 v1, v0, s0
377; GFX11-NEXT:    v_ldexp_f32 v0, v2, s1
378; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
379; GFX11-NEXT:    s_endpgm
380  %result = uitofp <2 x i64> %in to <2 x float>
381  store <2 x float> %result, ptr addrspace(1) %out
382  ret void
383}
384
; Per-workitem vector case: each workitem loads the <4 x i64> at
; %in[workitem.id.x], converts it to <4 x float> with uitofp and stores it to
; %out[workitem.id.x].
; The expected code fetches the 32-byte input with two 128-bit loads, runs the
; vector i64 -> f32 sequence (leading-zero count clamped to 32, 64-bit shift,
; low dword reduced to 0 or 1 and ORed into the high dword, v_cvt_f32_u32,
; v_ldexp_f32) on all four elements, and writes the four floats with a single
; 128-bit store.
385define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
386; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f32:
387; GFX6:       ; %bb.0:
388; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
389; GFX6-NEXT:    s_mov_b32 s7, 0xf000
390; GFX6-NEXT:    s_mov_b32 s6, 0
391; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
392; GFX6-NEXT:    v_mov_b32_e32 v9, 0
393; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
395; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
396; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
397; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 4, v0
398; GFX6-NEXT:    v_mov_b32_e32 v11, v9
399; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
400; GFX6-NEXT:    s_waitcnt vmcnt(1)
401; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
402; GFX6-NEXT:    v_ffbh_u32_e32 v9, v2
403; GFX6-NEXT:    s_waitcnt vmcnt(0)
404; GFX6-NEXT:    v_ffbh_u32_e32 v12, v8
405; GFX6-NEXT:    v_ffbh_u32_e32 v13, v6
406; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
407; GFX6-NEXT:    v_min_u32_e32 v9, 32, v9
408; GFX6-NEXT:    v_min_u32_e32 v12, 32, v12
409; GFX6-NEXT:    v_min_u32_e32 v13, 32, v13
410; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
411; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
412; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
413; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
414; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
415; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
416; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
417; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
418; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
419; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
420; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
421; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
422; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
423; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
424; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
425; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
426; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
427; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
428; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
429; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v4
430; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
431; GFX6-NEXT:    v_ldexp_f32_e32 v2, v0, v2
432; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
433; GFX6-NEXT:    v_ldexp_f32_e32 v0, v4, v12
434; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
435; GFX6-NEXT:    s_endpgm
436;
437; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32:
438; GFX8:       ; %bb.0:
439; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
440; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
441; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
442; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX8-NEXT:    v_mov_b32_e32 v2, s3
444; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
445; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
446; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
447; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
448; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
449; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
450; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
451; GFX8-NEXT:    v_mov_b32_e32 v10, s1
452; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
453; GFX8-NEXT:    s_waitcnt vmcnt(1)
454; GFX8-NEXT:    v_ffbh_u32_e32 v0, v4
455; GFX8-NEXT:    v_ffbh_u32_e32 v11, v2
456; GFX8-NEXT:    v_min_u32_e32 v0, 32, v0
457; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
458; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
459; GFX8-NEXT:    s_waitcnt vmcnt(0)
460; GFX8-NEXT:    v_ffbh_u32_e32 v12, v8
461; GFX8-NEXT:    v_ffbh_u32_e32 v13, v6
462; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
463; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
464; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
465; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
466; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
467; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
468; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
469; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
470; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
471; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
472; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
473; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
474; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
475; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
476; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
477; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
478; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, v1
479; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
480; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
481; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
482; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
483; GFX8-NEXT:    v_ldexp_f32 v1, v3, v14
484; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
485; GFX8-NEXT:    v_ldexp_f32 v3, v5, v11
486; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
487; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
488; GFX8-NEXT:    s_endpgm
489;
490; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32:
491; GFX11:       ; %bb.0:
492; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
493; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
494; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
495; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
496; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX11-NEXT:    s_clause 0x1
498; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
499; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
500; GFX11-NEXT:    s_waitcnt vmcnt(1)
501; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
502; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
503; GFX11-NEXT:    s_waitcnt vmcnt(0)
504; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
505; GFX11-NEXT:    v_clz_i32_u32_e32 v12, v5
506; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
507; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
508; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
509; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
510; GFX11-NEXT:    v_min_u32_e32 v12, 32, v12
511; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
512; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
513; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
514; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
515; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
516; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
517; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
518; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
519; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
520; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
521; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
522; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
523; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
524; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
525; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
526; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
527; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
528; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
529; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
530; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
531; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
532; GFX11-NEXT:    v_cvt_f32_u32_e32 v5, v3
533; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 4, v8
534; GFX11-NEXT:    v_ldexp_f32 v3, v2, v9
535; GFX11-NEXT:    v_ldexp_f32 v2, v0, v10
536; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
537; GFX11-NEXT:    v_ldexp_f32 v0, v5, v4
538; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
539; GFX11-NEXT:    s_endpgm
540  %tid = call i32 @llvm.amdgcn.workitem.id.x()
541  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
542  %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
543  %value = load <4 x i64>, ptr addrspace(1) %in.gep
544  %result = uitofp <4 x i64> %value to <4 x float>
545  store <4 x float> %result, ptr addrspace(1) %out.gep
546  ret void
547}
548
; Scalar-input vector case: converts the <2 x i64> kernel argument %in to
; <2 x half> with uitofp and stores the result to %out.
; Each element goes through the i64 -> f32 sequence (leading-zero count
; clamped to 32, 64-bit shift, low dword reduced to 0 or 1 and ORed into the
; high dword, v_cvt_f32_u32, v_ldexp_f32) followed by v_cvt_f16_f32.
; The two halves are then packed into one dword before a single dword store;
; the expected packing differs per check prefix:
;   - GFX6 checks:  v_lshlrev_b32 by 16 on one half, then v_or_b32.
;   - GFX8 checks:  v_cvt_f16_f32_sdwa writing WORD_1, then v_or_b32.
;   - GFX11 checks: v_pack_b32_f16.
549define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{
550; GFX6-LABEL: s_uint_to_fp_v2i64_to_v2f16:
551; GFX6:       ; %bb.0:
552; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
553; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
554; GFX6-NEXT:    s_mov_b32 s3, 0xf000
555; GFX6-NEXT:    s_mov_b32 s2, -1
556; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX6-NEXT:    s_flbit_i32_b32 s4, s11
558; GFX6-NEXT:    s_flbit_i32_b32 s5, s9
559; GFX6-NEXT:    s_min_u32 s6, s4, 32
560; GFX6-NEXT:    s_min_u32 s12, s5, 32
561; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s6
562; GFX6-NEXT:    s_sub_i32 s10, 32, s6
563; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s12
564; GFX6-NEXT:    s_sub_i32 s8, 32, s12
565; GFX6-NEXT:    s_min_u32 s4, s4, 1
566; GFX6-NEXT:    s_min_u32 s6, s6, 1
567; GFX6-NEXT:    s_or_b32 s4, s5, s4
568; GFX6-NEXT:    s_or_b32 s5, s7, s6
569; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
570; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s5
571; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s10
572; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s8
573; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
574; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
575; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
576; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
577; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
578; GFX6-NEXT:    s_endpgm
579;
580; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16:
581; GFX8:       ; %bb.0:
582; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
583; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
584; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
585; GFX8-NEXT:    s_flbit_i32_b32 s6, s3
586; GFX8-NEXT:    s_flbit_i32_b32 s7, s1
587; GFX8-NEXT:    s_min_u32 s6, s6, 32
588; GFX8-NEXT:    s_min_u32 s7, s7, 32
589; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
590; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
591; GFX8-NEXT:    s_min_u32 s2, s2, 1
592; GFX8-NEXT:    s_min_u32 s0, s0, 1
593; GFX8-NEXT:    s_or_b32 s2, s3, s2
594; GFX8-NEXT:    s_or_b32 s0, s1, s0
595; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
596; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s0
597; GFX8-NEXT:    s_sub_i32 s6, 32, s6
598; GFX8-NEXT:    s_sub_i32 s0, 32, s7
599; GFX8-NEXT:    v_ldexp_f32 v0, v0, s6
600; GFX8-NEXT:    v_ldexp_f32 v1, v1, s0
601; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
602; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
603; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
604; GFX8-NEXT:    v_mov_b32_e32 v0, s4
605; GFX8-NEXT:    v_mov_b32_e32 v1, s5
606; GFX8-NEXT:    flat_store_dword v[0:1], v2
607; GFX8-NEXT:    s_endpgm
608;
609; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16:
610; GFX11:       ; %bb.0:
611; GFX11-NEXT:    s_clause 0x1
612; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
613; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
614; GFX11-NEXT:    v_mov_b32_e32 v2, 0
615; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
616; GFX11-NEXT:    s_clz_i32_u32 s6, s3
617; GFX11-NEXT:    s_clz_i32_u32 s7, s1
618; GFX11-NEXT:    s_min_u32 s6, s6, 32
619; GFX11-NEXT:    s_min_u32 s7, s7, 32
620; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
621; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
622; GFX11-NEXT:    s_min_u32 s2, s2, 1
623; GFX11-NEXT:    s_min_u32 s0, s0, 1
624; GFX11-NEXT:    s_or_b32 s2, s3, s2
625; GFX11-NEXT:    s_or_b32 s0, s1, s0
626; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, s2
627; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, s0
628; GFX11-NEXT:    s_sub_i32 s0, 32, s6
629; GFX11-NEXT:    s_sub_i32 s1, 32, s7
630; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
631; GFX11-NEXT:    v_ldexp_f32 v0, v0, s0
632; GFX11-NEXT:    v_ldexp_f32 v1, v1, s1
633; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
634; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
635; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
636; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
637; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
638; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
639; GFX11-NEXT:    s_endpgm
640  %result = uitofp <2 x i64> %in to <2 x half>
641  store <2 x half> %result, ptr addrspace(1) %out
642  ret void
643}
644
645define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
646; GFX6-LABEL: v_uint_to_fp_v4i64_to_v4f16:
647; GFX6:       ; %bb.0:
648; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
649; GFX6-NEXT:    s_mov_b32 s7, 0xf000
650; GFX6-NEXT:    s_mov_b32 s6, 0
651; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
652; GFX6-NEXT:    v_mov_b32_e32 v9, 0
653; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
655; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
656; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
657; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
658; GFX6-NEXT:    v_mov_b32_e32 v11, v9
659; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
660; GFX6-NEXT:    s_waitcnt vmcnt(1)
661; GFX6-NEXT:    v_ffbh_u32_e32 v0, v4
662; GFX6-NEXT:    v_ffbh_u32_e32 v9, v2
663; GFX6-NEXT:    s_waitcnt vmcnt(0)
664; GFX6-NEXT:    v_ffbh_u32_e32 v12, v8
665; GFX6-NEXT:    v_ffbh_u32_e32 v13, v6
666; GFX6-NEXT:    v_min_u32_e32 v0, 32, v0
667; GFX6-NEXT:    v_min_u32_e32 v9, 32, v9
668; GFX6-NEXT:    v_min_u32_e32 v12, 32, v12
669; GFX6-NEXT:    v_min_u32_e32 v13, 32, v13
670; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
671; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
672; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
673; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
674; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
675; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
676; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
677; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
678; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
679; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
680; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
681; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
682; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
683; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
684; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
685; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
686; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
687; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
688; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, v1
689; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v4
690; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
691; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
692; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
693; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v12
694; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
695; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
696; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
697; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
698; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
699; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
700; GFX6-NEXT:    v_or_b32_e32 v1, v0, v3
701; GFX6-NEXT:    v_or_b32_e32 v0, v2, v4
702; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64
703; GFX6-NEXT:    s_endpgm
704;
705; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16:
706; GFX8:       ; %bb.0:
707; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
708; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
709; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
710; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
711; GFX8-NEXT:    v_mov_b32_e32 v2, s3
712; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
713; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
714; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
715; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
716; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
717; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
718; GFX8-NEXT:    v_mov_b32_e32 v10, s1
719; GFX8-NEXT:    s_waitcnt vmcnt(1)
720; GFX8-NEXT:    v_ffbh_u32_e32 v0, v4
721; GFX8-NEXT:    v_ffbh_u32_e32 v11, v2
722; GFX8-NEXT:    v_min_u32_e32 v0, 32, v0
723; GFX8-NEXT:    v_min_u32_e32 v11, 32, v11
724; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
725; GFX8-NEXT:    s_waitcnt vmcnt(0)
726; GFX8-NEXT:    v_ffbh_u32_e32 v12, v8
727; GFX8-NEXT:    v_ffbh_u32_e32 v13, v6
728; GFX8-NEXT:    v_min_u32_e32 v12, 32, v12
729; GFX8-NEXT:    v_min_u32_e32 v13, 32, v13
730; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
731; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
732; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
733; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
734; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
735; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
736; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
737; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
738; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
739; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
740; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
741; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
742; GFX8-NEXT:    v_cvt_f32_u32_e32 v3, v3
743; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, v0
744; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, v1
745; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, v4
746; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
747; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
748; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
749; GFX8-NEXT:    v_ldexp_f32 v3, v3, v14
750; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
751; GFX8-NEXT:    v_ldexp_f32 v1, v1, v11
752; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
753; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
754; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v0
755; GFX8-NEXT:    v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
756; GFX8-NEXT:    v_cvt_f16_f32_e32 v6, v2
757; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v9
758; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v10, vcc
759; GFX8-NEXT:    v_or_b32_e32 v2, v4, v3
760; GFX8-NEXT:    v_or_b32_e32 v3, v6, v5
761; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
762; GFX8-NEXT:    s_endpgm
763;
764; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
765; GFX11:       ; %bb.0:
766; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
767; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
768; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
769; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
770; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
771; GFX11-NEXT:    s_clause 0x1
772; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
773; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
774; GFX11-NEXT:    s_waitcnt vmcnt(1)
775; GFX11-NEXT:    v_clz_i32_u32_e32 v9, v3
776; GFX11-NEXT:    v_clz_i32_u32_e32 v10, v1
777; GFX11-NEXT:    s_waitcnt vmcnt(0)
778; GFX11-NEXT:    v_clz_i32_u32_e32 v11, v7
779; GFX11-NEXT:    v_clz_i32_u32_e32 v12, v5
780; GFX11-NEXT:    v_min_u32_e32 v9, 32, v9
781; GFX11-NEXT:    v_min_u32_e32 v10, 32, v10
782; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
783; GFX11-NEXT:    v_min_u32_e32 v11, 32, v11
784; GFX11-NEXT:    v_min_u32_e32 v12, 32, v12
785; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
786; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
787; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
788; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
789; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
790; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
791; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
792; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
793; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
794; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
795; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
796; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
797; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
798; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
799; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
800; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
801; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
802; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
803; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
804; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
805; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
806; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
807; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
808; GFX11-NEXT:    v_ldexp_f32 v2, v2, v9
809; GFX11-NEXT:    v_ldexp_f32 v0, v0, v10
810; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
811; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
812; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
813; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
814; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
815; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
816; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v1
817; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
818; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
819; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
820; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
821; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
822; GFX11-NEXT:    s_endpgm
823  %tid = call i32 @llvm.amdgcn.workitem.id.x()
824  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
825  %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid
826  %value = load <4 x i64>, ptr addrspace(1) %in.gep
827  %result = uitofp <4 x i64> %value to <4 x half>
828  store <4 x half> %result, ptr addrspace(1) %out.gep
829  ret void
830}
831
; Declaration of the work-item id intrinsic called by the test body above; its
; i32 result (%tid) is used there as the index for the %in and %out GEPs.
832declare i32 @llvm.amdgcn.workitem.id.x() #1
833
; Attribute groups referenced by number elsewhere in this file:
;   #0 is attached to the amdgpu_kernel definitions,
;   #1 is attached to the @llvm.amdgcn.workitem.id.x declaration.
834attributes #0 = { nounwind }
835attributes #1 = { nounwind readnone }
836