xref: /llvm-project/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
5
6; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
7
; Kernel-argument case: sitofp of an i64 argument to half, stored to %out.
; For every check prefix below, the expected code shifts the 64-bit value left
; with s_lshl_b64, ORs min(low word, 1) into the high word, converts that word
; with v_cvt_f32_i32, rescales it with v_ldexp_f32 by (32 - shift amount), and
; then narrows the f32 result with v_cvt_f16_f32 before the 16-bit store.
8define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %in) #0 {
9; GFX6-LABEL: s_sint_to_fp_i64_to_f16:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
12; GFX6-NEXT:    s_mov_b32 s7, 0xf000
13; GFX6-NEXT:    s_mov_b32 s6, -1
14; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX6-NEXT:    s_mov_b32 s4, s0
16; GFX6-NEXT:    s_mov_b32 s5, s1
17; GFX6-NEXT:    s_flbit_i32 s0, s3
18; GFX6-NEXT:    s_xor_b32 s1, s2, s3
19; GFX6-NEXT:    s_add_i32 s0, s0, -1
20; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
21; GFX6-NEXT:    s_add_i32 s1, s1, 32
22; GFX6-NEXT:    s_min_u32 s8, s0, s1
23; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
24; GFX6-NEXT:    s_min_u32 s0, s0, 1
25; GFX6-NEXT:    s_or_b32 s0, s1, s0
26; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
27; GFX6-NEXT:    s_sub_i32 s0, 32, s8
28; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
29; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
30; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
31; GFX6-NEXT:    s_endpgm
32;
33; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
34; GFX8:       ; %bb.0:
35; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
36; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX8-NEXT:    s_xor_b32 s5, s2, s3
38; GFX8-NEXT:    s_flbit_i32 s4, s3
39; GFX8-NEXT:    s_ashr_i32 s5, s5, 31
40; GFX8-NEXT:    s_add_i32 s4, s4, -1
41; GFX8-NEXT:    s_add_i32 s5, s5, 32
42; GFX8-NEXT:    s_min_u32 s4, s4, s5
43; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
44; GFX8-NEXT:    s_min_u32 s2, s2, 1
45; GFX8-NEXT:    s_or_b32 s2, s3, s2
46; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, s2
47; GFX8-NEXT:    s_sub_i32 s2, 32, s4
48; GFX8-NEXT:    v_mov_b32_e32 v1, s1
49; GFX8-NEXT:    v_ldexp_f32 v0, v0, s2
50; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v0
51; GFX8-NEXT:    v_mov_b32_e32 v0, s0
52; GFX8-NEXT:    flat_store_short v[0:1], v2
53; GFX8-NEXT:    s_endpgm
54;
55; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
56; GFX11:       ; %bb.0:
57; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
58; GFX11-NEXT:    v_mov_b32_e32 v1, 0
59; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX11-NEXT:    s_xor_b32 s4, s2, s3
61; GFX11-NEXT:    s_cls_i32 s5, s3
62; GFX11-NEXT:    s_ashr_i32 s4, s4, 31
63; GFX11-NEXT:    s_add_i32 s5, s5, -1
64; GFX11-NEXT:    s_add_i32 s4, s4, 32
65; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
66; GFX11-NEXT:    s_min_u32 s4, s5, s4
67; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
68; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
69; GFX11-NEXT:    s_min_u32 s2, s2, 1
70; GFX11-NEXT:    s_or_b32 s2, s3, s2
71; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
72; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
73; GFX11-NEXT:    s_sub_i32 s2, 32, s4
74; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
75; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
76; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
77; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
78; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
79; GFX11-NEXT:    s_endpgm
; Conversion under test: the i64 operand is the kernel argument %in itself.
80  %result = sitofp i64 %in to half
81  store half %result, ptr addrspace(1) %out
82  ret void
83}
84
; Loaded-value case: each invocation loads an i64 from %in at the index given
; by llvm.amdgcn.workitem.id.x, converts it to half with sitofp, and stores the
; result to %out at the same index.
; The expected code performs the whole sequence in vector registers: a 64-bit
; left shift (v_lshl_b64 / v_lshlrev_b64), min(1, low word) ORed into the high
; word, v_cvt_f32_i32, v_ldexp_f32 by (32 - shift amount), then v_cvt_f16_f32.
85define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
86; GFX6-LABEL: v_sint_to_fp_i64_to_f16:
87; GFX6:       ; %bb.0:
88; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
89; GFX6-NEXT:    s_mov_b32 s7, 0xf000
90; GFX6-NEXT:    s_mov_b32 s6, 0
91; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
92; GFX6-NEXT:    v_mov_b32_e32 v2, 0
93; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
95; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
96; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
97; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
98; GFX6-NEXT:    s_waitcnt vmcnt(0)
99; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
100; GFX6-NEXT:    v_ffbh_i32_e32 v5, v4
101; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
102; GFX6-NEXT:    v_add_i32_e32 v5, vcc, -1, v5
103; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
104; GFX6-NEXT:    v_min_u32_e32 v0, v5, v0
105; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
106; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
107; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
108; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
109; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
110; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
111; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
112; GFX6-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
113; GFX6-NEXT:    s_endpgm
114;
115; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
116; GFX8:       ; %bb.0:
117; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
118; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
119; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
120; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX8-NEXT:    v_mov_b32_e32 v2, s3
122; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
123; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
124; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
125; GFX8-NEXT:    s_waitcnt vmcnt(0)
126; GFX8-NEXT:    v_xor_b32_e32 v3, v1, v2
127; GFX8-NEXT:    v_ffbh_i32_e32 v4, v2
128; GFX8-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
129; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
130; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 32, v3
131; GFX8-NEXT:    v_min_u32_e32 v3, v4, v3
132; GFX8-NEXT:    v_lshlrev_b64 v[1:2], v3, v[1:2]
133; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 32, v3
134; GFX8-NEXT:    v_min_u32_e32 v1, 1, v1
135; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
136; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
137; GFX8-NEXT:    v_mov_b32_e32 v2, s1
138; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
139; GFX8-NEXT:    v_ldexp_f32 v1, v1, v3
140; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v1
141; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
142; GFX8-NEXT:    flat_store_short v[0:1], v3
143; GFX8-NEXT:    s_endpgm
144;
145; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
146; GFX11:       ; %bb.0:
147; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
148; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
149; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
150; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
151; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
153; GFX11-NEXT:    s_waitcnt vmcnt(0)
154; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v1
155; GFX11-NEXT:    v_cls_i32_e32 v4, v1
156; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
157; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
158; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
159; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
160; GFX11-NEXT:    v_add_nc_u32_e32 v3, 32, v3
161; GFX11-NEXT:    v_min_u32_e32 v3, v4, v3
162; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
163; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
164; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
165; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
166; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
167; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
168; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
169; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
170; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
171; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v2
172; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
173; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
174; GFX11-NEXT:    s_endpgm
; The same %tid indexes both buffers: i64 elements on the input side, half
; elements on the output side (shifts by 3 and by 1 in the checks above).
175  %tid = call i32 @llvm.amdgcn.workitem.id.x()
176  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
177  %out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
178  %val = load i64, ptr addrspace(1) %in.gep
; Conversion under test, applied to the loaded value.
179  %result = sitofp i64 %val to half
180  store half %result, ptr addrspace(1) %out.gep
181  ret void
182}
183
; Kernel-argument case: sitofp of an i64 argument to float, stored to %out.
; For every check prefix below, the expected code shifts the 64-bit value left
; with s_lshl_b64, ORs min(low word, 1) into the high word, converts that word
; with v_cvt_f32_i32 and rescales it with v_ldexp_f32 by (32 - shift amount).
; The ldexp result is stored directly as a 32-bit value.
184define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %in) #0 {
185; GFX6-LABEL: s_sint_to_fp_i64_to_f32:
186; GFX6:       ; %bb.0:
187; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
188; GFX6-NEXT:    s_mov_b32 s7, 0xf000
189; GFX6-NEXT:    s_mov_b32 s6, -1
190; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
191; GFX6-NEXT:    s_mov_b32 s4, s0
192; GFX6-NEXT:    s_mov_b32 s5, s1
193; GFX6-NEXT:    s_flbit_i32 s0, s3
194; GFX6-NEXT:    s_xor_b32 s1, s2, s3
195; GFX6-NEXT:    s_add_i32 s0, s0, -1
196; GFX6-NEXT:    s_ashr_i32 s1, s1, 31
197; GFX6-NEXT:    s_add_i32 s1, s1, 32
198; GFX6-NEXT:    s_min_u32 s8, s0, s1
199; GFX6-NEXT:    s_lshl_b64 s[0:1], s[2:3], s8
200; GFX6-NEXT:    s_min_u32 s0, s0, 1
201; GFX6-NEXT:    s_or_b32 s0, s1, s0
202; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s0
203; GFX6-NEXT:    s_sub_i32 s0, 32, s8
204; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s0
205; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
206; GFX6-NEXT:    s_endpgm
207;
208; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
209; GFX8:       ; %bb.0:
210; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
211; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX8-NEXT:    s_xor_b32 s5, s2, s3
213; GFX8-NEXT:    s_flbit_i32 s4, s3
214; GFX8-NEXT:    s_ashr_i32 s5, s5, 31
215; GFX8-NEXT:    s_add_i32 s4, s4, -1
216; GFX8-NEXT:    s_add_i32 s5, s5, 32
217; GFX8-NEXT:    s_min_u32 s4, s4, s5
218; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
219; GFX8-NEXT:    s_min_u32 s2, s2, 1
220; GFX8-NEXT:    s_or_b32 s2, s3, s2
221; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, s2
222; GFX8-NEXT:    v_mov_b32_e32 v0, s0
223; GFX8-NEXT:    s_sub_i32 s0, 32, s4
224; GFX8-NEXT:    v_mov_b32_e32 v1, s1
225; GFX8-NEXT:    v_ldexp_f32 v2, v2, s0
226; GFX8-NEXT:    flat_store_dword v[0:1], v2
227; GFX8-NEXT:    s_endpgm
228;
229; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
230; GFX11:       ; %bb.0:
231; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
232; GFX11-NEXT:    v_mov_b32_e32 v1, 0
233; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX11-NEXT:    s_xor_b32 s4, s2, s3
235; GFX11-NEXT:    s_cls_i32 s5, s3
236; GFX11-NEXT:    s_ashr_i32 s4, s4, 31
237; GFX11-NEXT:    s_add_i32 s5, s5, -1
238; GFX11-NEXT:    s_add_i32 s4, s4, 32
239; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
240; GFX11-NEXT:    s_min_u32 s4, s5, s4
241; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
242; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
243; GFX11-NEXT:    s_min_u32 s2, s2, 1
244; GFX11-NEXT:    s_or_b32 s2, s3, s2
245; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
246; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
247; GFX11-NEXT:    s_sub_i32 s2, 32, s4
248; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
249; GFX11-NEXT:    v_ldexp_f32 v0, v0, s2
250; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
251; GFX11-NEXT:    s_endpgm
; Conversion under test: the i64 operand is the kernel argument %in itself.
252  %result = sitofp i64 %in to float
253  store float %result, ptr addrspace(1) %out
254  ret void
255}
256
; Loaded-value case: each invocation loads an i64 from %in at the index given
; by llvm.amdgcn.workitem.id.x, converts it to float with sitofp, and stores
; the result to %out at the same index.
; The expected code performs the whole sequence in vector registers: a 64-bit
; left shift (v_lshl_b64 / v_lshlrev_b64), min(1, low word) ORed into the high
; word, v_cvt_f32_i32, then v_ldexp_f32 by (32 - shift amount).
257define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
258; GFX6-LABEL: v_sint_to_fp_i64_to_f32:
259; GFX6:       ; %bb.0:
260; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
261; GFX6-NEXT:    s_mov_b32 s7, 0xf000
262; GFX6-NEXT:    s_mov_b32 s6, 0
263; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
264; GFX6-NEXT:    v_mov_b32_e32 v2, 0
265; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
267; GFX6-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
268; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
269; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
270; GFX6-NEXT:    s_waitcnt vmcnt(0)
271; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
272; GFX6-NEXT:    v_ffbh_i32_e32 v5, v4
273; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
274; GFX6-NEXT:    v_add_i32_e32 v5, vcc, -1, v5
275; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
276; GFX6-NEXT:    v_min_u32_e32 v0, v5, v0
277; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
278; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
279; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
280; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
281; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, 32, v0
282; GFX6-NEXT:    v_ldexp_f32_e32 v0, v3, v0
283; GFX6-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
284; GFX6-NEXT:    s_endpgm
285;
286; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
287; GFX8:       ; %bb.0:
288; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
289; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
290; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
291; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
292; GFX8-NEXT:    v_mov_b32_e32 v2, s3
293; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
294; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
295; GFX8-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
296; GFX8-NEXT:    s_waitcnt vmcnt(0)
297; GFX8-NEXT:    v_xor_b32_e32 v0, v1, v2
298; GFX8-NEXT:    v_ffbh_i32_e32 v4, v2
299; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
300; GFX8-NEXT:    v_add_u32_e32 v4, vcc, -1, v4
301; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
302; GFX8-NEXT:    v_min_u32_e32 v4, v4, v0
303; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v4, v[1:2]
304; GFX8-NEXT:    v_mov_b32_e32 v2, s1
305; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
306; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
307; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v0
308; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v3
309; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
310; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v4
311; GFX8-NEXT:    v_ldexp_f32 v2, v5, v2
312; GFX8-NEXT:    flat_store_dword v[0:1], v2
313; GFX8-NEXT:    s_endpgm
314;
315; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
316; GFX11:       ; %bb.0:
317; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
318; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
319; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
320; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
321; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
322; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
323; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
324; GFX11-NEXT:    s_waitcnt vmcnt(0)
325; GFX11-NEXT:    v_xor_b32_e32 v3, v0, v1
326; GFX11-NEXT:    v_cls_i32_e32 v4, v1
327; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
328; GFX11-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
329; GFX11-NEXT:    v_add_nc_u32_e32 v4, -1, v4
330; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
331; GFX11-NEXT:    v_add_nc_u32_e32 v3, 32, v3
332; GFX11-NEXT:    v_min_u32_e32 v3, v4, v3
333; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
334; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v3, v[0:1]
335; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
336; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
337; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
338; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v3
339; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
340; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
341; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
342; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
343; GFX11-NEXT:    s_endpgm
; The same %tid indexes both buffers: i64 elements on the input side, float
; elements on the output side (shifts by 3 and by 2 in the checks above).
344  %tid = call i32 @llvm.amdgcn.workitem.id.x()
345  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
346  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
347  %val = load i64, ptr addrspace(1) %in.gep
; Conversion under test, applied to the loaded value.
348  %result = sitofp i64 %val to float
349  store float %result, ptr addrspace(1) %out.gep
350  ret void
351}
352
; Vector kernel-argument case: sitofp of a <2 x i64> argument to <2 x float>,
; stored to %out with a single 64-bit store.
; The expected code handles the two elements independently, each with the same
; sequence: s_lshl_b64, min(low word, 1) ORed into the high word,
; v_cvt_f32_i32, and v_ldexp_f32 by (32 - shift amount). The vector operand and
; the %out pointer are fetched with two separate scalar loads.
353define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2 x i64> %in) #0{
354; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f32:
355; GFX6:       ; %bb.0:
356; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
357; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
358; GFX6-NEXT:    s_mov_b32 s3, 0xf000
359; GFX6-NEXT:    s_mov_b32 s2, -1
360; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX6-NEXT:    s_flbit_i32 s4, s11
362; GFX6-NEXT:    s_xor_b32 s5, s10, s11
363; GFX6-NEXT:    s_flbit_i32 s6, s9
364; GFX6-NEXT:    s_xor_b32 s7, s8, s9
365; GFX6-NEXT:    s_add_i32 s4, s4, -1
366; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
367; GFX6-NEXT:    s_add_i32 s6, s6, -1
368; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
369; GFX6-NEXT:    s_add_i32 s5, s5, 32
370; GFX6-NEXT:    s_add_i32 s7, s7, 32
371; GFX6-NEXT:    s_min_u32 s12, s4, s5
372; GFX6-NEXT:    s_min_u32 s13, s6, s7
373; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s12
374; GFX6-NEXT:    s_sub_i32 s10, 32, s12
375; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s13
376; GFX6-NEXT:    s_sub_i32 s8, 32, s13
377; GFX6-NEXT:    s_min_u32 s4, s4, 1
378; GFX6-NEXT:    s_min_u32 s6, s6, 1
379; GFX6-NEXT:    s_or_b32 s4, s5, s4
380; GFX6-NEXT:    s_or_b32 s5, s7, s6
381; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
382; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s5
383; GFX6-NEXT:    v_ldexp_f32_e64 v1, v0, s10
384; GFX6-NEXT:    v_ldexp_f32_e64 v0, v2, s8
385; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
386; GFX6-NEXT:    s_endpgm
387;
388; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
389; GFX8:       ; %bb.0:
390; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
391; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
392; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX8-NEXT:    s_xor_b32 s7, s2, s3
394; GFX8-NEXT:    s_flbit_i32 s6, s3
395; GFX8-NEXT:    s_ashr_i32 s7, s7, 31
396; GFX8-NEXT:    s_add_i32 s6, s6, -1
397; GFX8-NEXT:    s_add_i32 s7, s7, 32
398; GFX8-NEXT:    s_min_u32 s6, s6, s7
399; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
400; GFX8-NEXT:    s_min_u32 s2, s2, 1
401; GFX8-NEXT:    s_or_b32 s2, s3, s2
402; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, s2
403; GFX8-NEXT:    s_xor_b32 s2, s0, s1
404; GFX8-NEXT:    s_flbit_i32 s8, s1
405; GFX8-NEXT:    s_ashr_i32 s2, s2, 31
406; GFX8-NEXT:    s_add_i32 s8, s8, -1
407; GFX8-NEXT:    s_add_i32 s2, s2, 32
408; GFX8-NEXT:    s_min_u32 s2, s8, s2
409; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
410; GFX8-NEXT:    s_min_u32 s0, s0, 1
411; GFX8-NEXT:    s_or_b32 s0, s1, s0
412; GFX8-NEXT:    v_cvt_f32_i32_e32 v2, s0
413; GFX8-NEXT:    s_sub_i32 s0, 32, s6
414; GFX8-NEXT:    v_ldexp_f32 v1, v0, s0
415; GFX8-NEXT:    s_sub_i32 s0, 32, s2
416; GFX8-NEXT:    v_ldexp_f32 v0, v2, s0
417; GFX8-NEXT:    v_mov_b32_e32 v2, s4
418; GFX8-NEXT:    v_mov_b32_e32 v3, s5
419; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
420; GFX8-NEXT:    s_endpgm
421;
422; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f32:
423; GFX11:       ; %bb.0:
424; GFX11-NEXT:    s_clause 0x1
425; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
426; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
427; GFX11-NEXT:    v_mov_b32_e32 v3, 0
428; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX11-NEXT:    s_xor_b32 s7, s2, s3
430; GFX11-NEXT:    s_xor_b32 s9, s0, s1
431; GFX11-NEXT:    s_cls_i32 s6, s3
432; GFX11-NEXT:    s_cls_i32 s8, s1
433; GFX11-NEXT:    s_ashr_i32 s7, s7, 31
434; GFX11-NEXT:    s_ashr_i32 s9, s9, 31
435; GFX11-NEXT:    s_add_i32 s6, s6, -1
436; GFX11-NEXT:    s_add_i32 s8, s8, -1
437; GFX11-NEXT:    s_add_i32 s7, s7, 32
438; GFX11-NEXT:    s_add_i32 s9, s9, 32
439; GFX11-NEXT:    s_min_u32 s6, s6, s7
440; GFX11-NEXT:    s_min_u32 s7, s8, s9
441; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
442; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
443; GFX11-NEXT:    s_min_u32 s2, s2, 1
444; GFX11-NEXT:    s_min_u32 s0, s0, 1
445; GFX11-NEXT:    s_or_b32 s2, s3, s2
446; GFX11-NEXT:    s_or_b32 s0, s1, s0
447; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
448; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, s0
449; GFX11-NEXT:    s_sub_i32 s0, 32, s6
450; GFX11-NEXT:    s_sub_i32 s1, 32, s7
451; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
452; GFX11-NEXT:    v_ldexp_f32 v1, v0, s0
453; GFX11-NEXT:    v_ldexp_f32 v0, v2, s1
454; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
455; GFX11-NEXT:    s_endpgm
; Conversion under test: both elements come from the <2 x i64> kernel argument.
456  %result = sitofp <2 x i64> %in to <2 x float>
457  store <2 x float> %result, ptr addrspace(1) %out
458  ret void
459}
460
; Loaded vector case: each invocation loads a <4 x i64> from %in at the index
; given by llvm.amdgcn.workitem.id.x, converts it to <4 x float> with sitofp,
; and stores the result to %out at the same index.
; The expected code fetches the input with two 128-bit loads (the second one
; 16 bytes further on) and runs four independent copies of the vector-register
; sequence: 64-bit left shift, min(1, low word) ORed into the high word,
; v_cvt_f32_i32, and v_ldexp_f32 by (32 - shift amount). The four results are
; written with one 128-bit store.
461define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
462; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f32:
463; GFX6:       ; %bb.0:
464; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
465; GFX6-NEXT:    s_mov_b32 s7, 0xf000
466; GFX6-NEXT:    s_mov_b32 s6, 0
467; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
468; GFX6-NEXT:    v_mov_b32_e32 v9, 0
469; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
471; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
472; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
473; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 4, v0
474; GFX6-NEXT:    v_mov_b32_e32 v11, v9
475; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
476; GFX6-NEXT:    s_waitcnt vmcnt(1)
477; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
478; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
479; GFX6-NEXT:    v_xor_b32_e32 v12, v1, v2
480; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
481; GFX6-NEXT:    s_waitcnt vmcnt(0)
482; GFX6-NEXT:    v_xor_b32_e32 v14, v7, v8
483; GFX6-NEXT:    v_ffbh_i32_e32 v15, v8
484; GFX6-NEXT:    v_xor_b32_e32 v16, v5, v6
485; GFX6-NEXT:    v_ffbh_i32_e32 v17, v6
486; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
487; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
488; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
489; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
490; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
491; GFX6-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
492; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
493; GFX6-NEXT:    v_add_i32_e32 v17, vcc, -1, v17
494; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
495; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 32, v12
496; GFX6-NEXT:    v_add_i32_e32 v14, vcc, 32, v14
497; GFX6-NEXT:    v_add_i32_e32 v16, vcc, 32, v16
498; GFX6-NEXT:    v_min_u32_e32 v0, v9, v0
499; GFX6-NEXT:    v_min_u32_e32 v9, v13, v12
500; GFX6-NEXT:    v_min_u32_e32 v12, v15, v14
501; GFX6-NEXT:    v_min_u32_e32 v13, v17, v16
502; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
503; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
504; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
505; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
506; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
507; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
508; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
509; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
510; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
511; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
512; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
513; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
514; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
515; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
516; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
517; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
518; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
519; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
520; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, v1
521; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v4
522; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
523; GFX6-NEXT:    v_ldexp_f32_e32 v2, v0, v2
524; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
525; GFX6-NEXT:    v_ldexp_f32_e32 v0, v4, v12
526; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], v[10:11], s[0:3], 0 addr64
527; GFX6-NEXT:    s_endpgm
528;
529; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
530; GFX8:       ; %bb.0:
531; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
532; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
533; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
534; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
535; GFX8-NEXT:    v_mov_b32_e32 v2, s3
536; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
537; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
538; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
539; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
540; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
541; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
542; GFX8-NEXT:    v_add_u32_e32 v9, vcc, s0, v0
543; GFX8-NEXT:    v_mov_b32_e32 v10, s1
544; GFX8-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
545; GFX8-NEXT:    s_waitcnt vmcnt(1)
546; GFX8-NEXT:    v_xor_b32_e32 v0, v3, v4
547; GFX8-NEXT:    v_xor_b32_e32 v12, v1, v2
548; GFX8-NEXT:    v_ffbh_i32_e32 v11, v4
549; GFX8-NEXT:    v_ffbh_i32_e32 v13, v2
550; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
551; GFX8-NEXT:    s_waitcnt vmcnt(0)
552; GFX8-NEXT:    v_xor_b32_e32 v14, v7, v8
553; GFX8-NEXT:    v_xor_b32_e32 v16, v5, v6
554; GFX8-NEXT:    v_ffbh_i32_e32 v15, v8
555; GFX8-NEXT:    v_ffbh_i32_e32 v17, v6
556; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
557; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
558; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
559; GFX8-NEXT:    v_add_u32_e32 v11, vcc, -1, v11
560; GFX8-NEXT:    v_add_u32_e32 v13, vcc, -1, v13
561; GFX8-NEXT:    v_add_u32_e32 v15, vcc, -1, v15
562; GFX8-NEXT:    v_add_u32_e32 v17, vcc, -1, v17
563; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
564; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 32, v12
565; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 32, v14
566; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 32, v16
567; GFX8-NEXT:    v_min_u32_e32 v0, v11, v0
568; GFX8-NEXT:    v_min_u32_e32 v11, v13, v12
569; GFX8-NEXT:    v_min_u32_e32 v12, v15, v14
570; GFX8-NEXT:    v_min_u32_e32 v13, v17, v16
571; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
572; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
573; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
574; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
575; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
576; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
577; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
578; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
579; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
580; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
581; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
582; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
583; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
584; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
585; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
586; GFX8-NEXT:    v_cvt_f32_i32_e32 v5, v1
587; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
588; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
589; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
590; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
591; GFX8-NEXT:    v_ldexp_f32 v1, v3, v14
592; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
593; GFX8-NEXT:    v_ldexp_f32 v3, v5, v11
594; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
595; GFX8-NEXT:    flat_store_dwordx4 v[9:10], v[0:3]
596; GFX8-NEXT:    s_endpgm
597;
598; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
599; GFX11:       ; %bb.0:
600; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
601; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
602; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
603; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
604; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX11-NEXT:    s_clause 0x1
606; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
607; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
608; GFX11-NEXT:    s_waitcnt vmcnt(1)
609; GFX11-NEXT:    v_xor_b32_e32 v9, v2, v3
610; GFX11-NEXT:    v_xor_b32_e32 v11, v0, v1
611; GFX11-NEXT:    s_waitcnt vmcnt(0)
612; GFX11-NEXT:    v_xor_b32_e32 v13, v6, v7
613; GFX11-NEXT:    v_xor_b32_e32 v15, v4, v5
614; GFX11-NEXT:    v_cls_i32_e32 v10, v3
615; GFX11-NEXT:    v_cls_i32_e32 v12, v1
616; GFX11-NEXT:    v_cls_i32_e32 v14, v7
617; GFX11-NEXT:    v_cls_i32_e32 v16, v5
618; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
619; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
620; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
621; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
622; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
623; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
624; GFX11-NEXT:    v_add_nc_u32_e32 v14, -1, v14
625; GFX11-NEXT:    v_add_nc_u32_e32 v16, -1, v16
626; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
627; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
628; GFX11-NEXT:    v_add_nc_u32_e32 v13, 32, v13
629; GFX11-NEXT:    v_add_nc_u32_e32 v15, 32, v15
630; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
631; GFX11-NEXT:    v_min_u32_e32 v9, v10, v9
632; GFX11-NEXT:    v_min_u32_e32 v10, v12, v11
633; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
634; GFX11-NEXT:    v_min_u32_e32 v11, v14, v13
635; GFX11-NEXT:    v_min_u32_e32 v12, v16, v15
636; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
637; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
638; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
639; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
640; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
641; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
642; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
643; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
644; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
645; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
646; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
647; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
648; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
649; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
650; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
651; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
652; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
653; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
654; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
655; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
656; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
657; GFX11-NEXT:    v_cvt_f32_i32_e32 v5, v3
658; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 4, v8
659; GFX11-NEXT:    v_ldexp_f32 v3, v2, v9
660; GFX11-NEXT:    v_ldexp_f32 v2, v0, v10
661; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
662; GFX11-NEXT:    v_ldexp_f32 v0, v5, v4
663; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
664; GFX11-NEXT:    s_endpgm
; The same %tid indexes both buffers: <4 x i64> elements on the input side,
; <4 x float> elements on the output side (shifts by 5 and by 4 in the checks
; above).
665  %tid = call i32 @llvm.amdgcn.workitem.id.x()
666  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
667  %out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
668  %value = load <4 x i64>, ptr addrspace(1) %in.gep
; Conversion under test, applied to all four loaded elements at once.
669  %result = sitofp <4 x i64> %value to <4 x float>
670  store <4 x float> %result, ptr addrspace(1) %out.gep
671  ret void
672}
673
; Scalar-input test: the <2 x i64> kernel argument %in is converted to
; <2 x half> with sitofp and the packed result is stored through %out.
;
; The GFX6/GFX8/GFX11 check lines are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; Shape of the expected code, as visible in the checks for all three targets:
; each 64-bit element is shifted left with s_lshl_b64, the first register of
; the shifted pair is clamped with "s_min_u32 ..., 1" and ORed into the second
; register, that 32-bit value is converted with v_cvt_f32_i32, rescaled with
; v_ldexp_f32 by (32 - shift amount) and narrowed with v_cvt_f16_f32.
; The two halves are then combined differently per target: shift-by-16 + or on
; GFX6, an SDWA WORD_1 write + or on GFX8, and v_pack_b32_f16 on GFX11.
674define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2 x i64> %in) #0{
675; GFX6-LABEL: s_sint_to_fp_v2i64_to_v2f16:
676; GFX6:       ; %bb.0:
677; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0xd
678; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
679; GFX6-NEXT:    s_mov_b32 s3, 0xf000
680; GFX6-NEXT:    s_mov_b32 s2, -1
681; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
682; GFX6-NEXT:    s_flbit_i32 s4, s11
683; GFX6-NEXT:    s_xor_b32 s5, s10, s11
684; GFX6-NEXT:    s_flbit_i32 s6, s9
685; GFX6-NEXT:    s_xor_b32 s7, s8, s9
686; GFX6-NEXT:    s_add_i32 s4, s4, -1
687; GFX6-NEXT:    s_ashr_i32 s5, s5, 31
688; GFX6-NEXT:    s_add_i32 s6, s6, -1
689; GFX6-NEXT:    s_ashr_i32 s7, s7, 31
690; GFX6-NEXT:    s_add_i32 s5, s5, 32
691; GFX6-NEXT:    s_add_i32 s7, s7, 32
692; GFX6-NEXT:    s_min_u32 s12, s4, s5
693; GFX6-NEXT:    s_min_u32 s13, s6, s7
694; GFX6-NEXT:    s_lshl_b64 s[4:5], s[10:11], s12
695; GFX6-NEXT:    s_sub_i32 s10, 32, s12
696; GFX6-NEXT:    s_lshl_b64 s[6:7], s[8:9], s13
697; GFX6-NEXT:    s_sub_i32 s8, 32, s13
698; GFX6-NEXT:    s_min_u32 s4, s4, 1
699; GFX6-NEXT:    s_min_u32 s6, s6, 1
700; GFX6-NEXT:    s_or_b32 s4, s5, s4
701; GFX6-NEXT:    s_or_b32 s5, s7, s6
702; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s4
703; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s5
704; GFX6-NEXT:    v_ldexp_f32_e64 v0, v0, s10
705; GFX6-NEXT:    v_ldexp_f32_e64 v1, v1, s8
706; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
707; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
708; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
709; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
710; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
711; GFX6-NEXT:    s_endpgm
712;
713; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
714; GFX8:       ; %bb.0:
715; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
716; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
717; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
718; GFX8-NEXT:    s_xor_b32 s7, s2, s3
719; GFX8-NEXT:    s_flbit_i32 s6, s3
720; GFX8-NEXT:    s_ashr_i32 s7, s7, 31
721; GFX8-NEXT:    s_add_i32 s6, s6, -1
722; GFX8-NEXT:    s_add_i32 s7, s7, 32
723; GFX8-NEXT:    s_min_u32 s6, s6, s7
724; GFX8-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
725; GFX8-NEXT:    s_min_u32 s2, s2, 1
726; GFX8-NEXT:    s_or_b32 s2, s3, s2
727; GFX8-NEXT:    s_xor_b32 s3, s0, s1
728; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, s2
729; GFX8-NEXT:    s_flbit_i32 s2, s1
730; GFX8-NEXT:    s_ashr_i32 s3, s3, 31
731; GFX8-NEXT:    s_add_i32 s2, s2, -1
732; GFX8-NEXT:    s_add_i32 s3, s3, 32
733; GFX8-NEXT:    s_min_u32 s2, s2, s3
734; GFX8-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
735; GFX8-NEXT:    s_min_u32 s0, s0, 1
736; GFX8-NEXT:    s_or_b32 s0, s1, s0
737; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, s0
738; GFX8-NEXT:    s_sub_i32 s6, 32, s6
739; GFX8-NEXT:    s_sub_i32 s0, 32, s2
740; GFX8-NEXT:    v_ldexp_f32 v0, v0, s6
741; GFX8-NEXT:    v_ldexp_f32 v1, v1, s0
742; GFX8-NEXT:    v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
743; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
744; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
745; GFX8-NEXT:    v_mov_b32_e32 v0, s4
746; GFX8-NEXT:    v_mov_b32_e32 v1, s5
747; GFX8-NEXT:    flat_store_dword v[0:1], v2
748; GFX8-NEXT:    s_endpgm
749;
750; GFX11-LABEL: s_sint_to_fp_v2i64_to_v2f16:
751; GFX11:       ; %bb.0:
752; GFX11-NEXT:    s_clause 0x1
753; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
754; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
755; GFX11-NEXT:    v_mov_b32_e32 v2, 0
756; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX11-NEXT:    s_xor_b32 s7, s2, s3
758; GFX11-NEXT:    s_xor_b32 s9, s0, s1
759; GFX11-NEXT:    s_cls_i32 s6, s3
760; GFX11-NEXT:    s_cls_i32 s8, s1
761; GFX11-NEXT:    s_ashr_i32 s7, s7, 31
762; GFX11-NEXT:    s_ashr_i32 s9, s9, 31
763; GFX11-NEXT:    s_add_i32 s6, s6, -1
764; GFX11-NEXT:    s_add_i32 s8, s8, -1
765; GFX11-NEXT:    s_add_i32 s7, s7, 32
766; GFX11-NEXT:    s_add_i32 s9, s9, 32
767; GFX11-NEXT:    s_min_u32 s6, s6, s7
768; GFX11-NEXT:    s_min_u32 s7, s8, s9
769; GFX11-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
770; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], s7
771; GFX11-NEXT:    s_min_u32 s2, s2, 1
772; GFX11-NEXT:    s_min_u32 s0, s0, 1
773; GFX11-NEXT:    s_or_b32 s2, s3, s2
774; GFX11-NEXT:    s_or_b32 s0, s1, s0
775; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, s2
776; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, s0
777; GFX11-NEXT:    s_sub_i32 s0, 32, s6
778; GFX11-NEXT:    s_sub_i32 s1, 32, s7
779; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
780; GFX11-NEXT:    v_ldexp_f32 v0, v0, s0
781; GFX11-NEXT:    v_ldexp_f32 v1, v1, s1
782; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
783; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
784; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
785; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
786; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
787; GFX11-NEXT:    global_store_b32 v2, v0, s[4:5]
788; GFX11-NEXT:    s_endpgm
; IR under test: a single signed i64 -> half vector conversion, then the store.
789  %result = sitofp <2 x i64> %in to <2 x half>
790  store <2 x half> %result, ptr addrspace(1) %out
791  ret void
792}
793
; Vector-input test: the value returned by llvm.amdgcn.workitem.id.x indexes
; both %in and %out; a <4 x i64> is loaded from %in, converted to <4 x half>
; with sitofp and stored to %out.
;
; The GFX6/GFX8/GFX11 check lines are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
;
; Shape of the expected code, as visible in the checks for all three targets:
; the four 64-bit elements arrive via two 128-bit loads, each is shifted left
; (v_lshl_b64 on GFX6, v_lshlrev_b64 on GFX8/GFX11), the first register of the
; shifted pair is clamped with "v_min_u32 ..., 1, ..." and ORed into the second
; register, that 32-bit value is converted with v_cvt_f32_i32, rescaled with
; v_ldexp_f32 by (32 - shift amount) and narrowed with v_cvt_f16_f32.
; Pairs of halves are combined with shift-by-16 + or on GFX6, SDWA WORD_1
; writes + or on GFX8, and v_pack_b32_f16 on GFX11 before one 64-bit store.
794define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
795; GFX6-LABEL: v_sint_to_fp_v4i64_to_v4f16:
796; GFX6:       ; %bb.0:
797; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
798; GFX6-NEXT:    s_mov_b32 s7, 0xf000
799; GFX6-NEXT:    s_mov_b32 s6, 0
800; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
801; GFX6-NEXT:    v_mov_b32_e32 v9, 0
802; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
803; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
804; GFX6-NEXT:    buffer_load_dwordx4 v[1:4], v[8:9], s[4:7], 0 addr64 offset:16
805; GFX6-NEXT:    buffer_load_dwordx4 v[5:8], v[8:9], s[4:7], 0 addr64
806; GFX6-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
807; GFX6-NEXT:    v_mov_b32_e32 v11, v9
808; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
809; GFX6-NEXT:    s_waitcnt vmcnt(1)
810; GFX6-NEXT:    v_xor_b32_e32 v0, v3, v4
811; GFX6-NEXT:    v_ffbh_i32_e32 v9, v4
812; GFX6-NEXT:    v_xor_b32_e32 v12, v1, v2
813; GFX6-NEXT:    v_ffbh_i32_e32 v13, v2
814; GFX6-NEXT:    s_waitcnt vmcnt(0)
815; GFX6-NEXT:    v_xor_b32_e32 v14, v7, v8
816; GFX6-NEXT:    v_ffbh_i32_e32 v15, v8
817; GFX6-NEXT:    v_xor_b32_e32 v16, v5, v6
818; GFX6-NEXT:    v_ffbh_i32_e32 v17, v6
819; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
820; GFX6-NEXT:    v_add_i32_e32 v9, vcc, -1, v9
821; GFX6-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
822; GFX6-NEXT:    v_add_i32_e32 v13, vcc, -1, v13
823; GFX6-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
824; GFX6-NEXT:    v_add_i32_e32 v15, vcc, -1, v15
825; GFX6-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
826; GFX6-NEXT:    v_add_i32_e32 v17, vcc, -1, v17
827; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
828; GFX6-NEXT:    v_add_i32_e32 v12, vcc, 32, v12
829; GFX6-NEXT:    v_add_i32_e32 v14, vcc, 32, v14
830; GFX6-NEXT:    v_add_i32_e32 v16, vcc, 32, v16
831; GFX6-NEXT:    v_min_u32_e32 v0, v9, v0
832; GFX6-NEXT:    v_min_u32_e32 v9, v13, v12
833; GFX6-NEXT:    v_min_u32_e32 v12, v15, v14
834; GFX6-NEXT:    v_min_u32_e32 v13, v17, v16
835; GFX6-NEXT:    v_lshl_b64 v[3:4], v[3:4], v0
836; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, 32, v0
837; GFX6-NEXT:    v_lshl_b64 v[0:1], v[1:2], v9
838; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 32, v9
839; GFX6-NEXT:    v_lshl_b64 v[7:8], v[7:8], v12
840; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, 32, v12
841; GFX6-NEXT:    v_lshl_b64 v[5:6], v[5:6], v13
842; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 32, v13
843; GFX6-NEXT:    v_min_u32_e32 v3, 1, v3
844; GFX6-NEXT:    v_min_u32_e32 v0, 1, v0
845; GFX6-NEXT:    v_min_u32_e32 v7, 1, v7
846; GFX6-NEXT:    v_min_u32_e32 v5, 1, v5
847; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
848; GFX6-NEXT:    v_or_b32_e32 v0, v1, v0
849; GFX6-NEXT:    v_or_b32_e32 v1, v8, v7
850; GFX6-NEXT:    v_or_b32_e32 v4, v6, v5
851; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, v3
852; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, v0
853; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, v1
854; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v4
855; GFX6-NEXT:    v_ldexp_f32_e32 v3, v3, v14
856; GFX6-NEXT:    v_ldexp_f32_e32 v0, v0, v2
857; GFX6-NEXT:    v_ldexp_f32_e32 v1, v1, v9
858; GFX6-NEXT:    v_ldexp_f32_e32 v2, v4, v12
859; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
860; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
861; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
862; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
863; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
864; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
865; GFX6-NEXT:    v_or_b32_e32 v1, v0, v3
866; GFX6-NEXT:    v_or_b32_e32 v0, v2, v4
867; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], v[10:11], s[0:3], 0 addr64
868; GFX6-NEXT:    s_endpgm
869;
870; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
871; GFX8:       ; %bb.0:
872; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
873; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
874; GFX8-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
875; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
876; GFX8-NEXT:    v_mov_b32_e32 v2, s3
877; GFX8-NEXT:    v_add_u32_e32 v5, vcc, s2, v1
878; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
879; GFX8-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
880; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
881; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
882; GFX8-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
883; GFX8-NEXT:    v_mov_b32_e32 v10, s1
884; GFX8-NEXT:    s_waitcnt vmcnt(1)
885; GFX8-NEXT:    v_xor_b32_e32 v0, v3, v4
886; GFX8-NEXT:    v_xor_b32_e32 v12, v1, v2
887; GFX8-NEXT:    v_ffbh_i32_e32 v11, v4
888; GFX8-NEXT:    v_ffbh_i32_e32 v13, v2
889; GFX8-NEXT:    v_ashrrev_i32_e32 v0, 31, v0
890; GFX8-NEXT:    s_waitcnt vmcnt(0)
891; GFX8-NEXT:    v_xor_b32_e32 v14, v7, v8
892; GFX8-NEXT:    v_xor_b32_e32 v16, v5, v6
893; GFX8-NEXT:    v_ffbh_i32_e32 v15, v8
894; GFX8-NEXT:    v_ffbh_i32_e32 v17, v6
895; GFX8-NEXT:    v_ashrrev_i32_e32 v12, 31, v12
896; GFX8-NEXT:    v_ashrrev_i32_e32 v14, 31, v14
897; GFX8-NEXT:    v_ashrrev_i32_e32 v16, 31, v16
898; GFX8-NEXT:    v_add_u32_e32 v11, vcc, -1, v11
899; GFX8-NEXT:    v_add_u32_e32 v13, vcc, -1, v13
900; GFX8-NEXT:    v_add_u32_e32 v15, vcc, -1, v15
901; GFX8-NEXT:    v_add_u32_e32 v17, vcc, -1, v17
902; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
903; GFX8-NEXT:    v_add_u32_e32 v12, vcc, 32, v12
904; GFX8-NEXT:    v_add_u32_e32 v14, vcc, 32, v14
905; GFX8-NEXT:    v_add_u32_e32 v16, vcc, 32, v16
906; GFX8-NEXT:    v_min_u32_e32 v0, v11, v0
907; GFX8-NEXT:    v_min_u32_e32 v11, v13, v12
908; GFX8-NEXT:    v_min_u32_e32 v12, v15, v14
909; GFX8-NEXT:    v_min_u32_e32 v13, v17, v16
910; GFX8-NEXT:    v_lshlrev_b64 v[3:4], v0, v[3:4]
911; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, 32, v0
912; GFX8-NEXT:    v_lshlrev_b64 v[0:1], v11, v[1:2]
913; GFX8-NEXT:    v_lshlrev_b64 v[7:8], v12, v[7:8]
914; GFX8-NEXT:    v_lshlrev_b64 v[5:6], v13, v[5:6]
915; GFX8-NEXT:    v_min_u32_e32 v3, 1, v3
916; GFX8-NEXT:    v_min_u32_e32 v0, 1, v0
917; GFX8-NEXT:    v_min_u32_e32 v7, 1, v7
918; GFX8-NEXT:    v_min_u32_e32 v5, 1, v5
919; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
920; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
921; GFX8-NEXT:    v_or_b32_e32 v1, v8, v7
922; GFX8-NEXT:    v_or_b32_e32 v4, v6, v5
923; GFX8-NEXT:    v_cvt_f32_i32_e32 v3, v3
924; GFX8-NEXT:    v_cvt_f32_i32_e32 v0, v0
925; GFX8-NEXT:    v_cvt_f32_i32_e32 v1, v1
926; GFX8-NEXT:    v_cvt_f32_i32_e32 v4, v4
927; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 32, v11
928; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, 32, v12
929; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, 32, v13
930; GFX8-NEXT:    v_ldexp_f32 v3, v3, v14
931; GFX8-NEXT:    v_ldexp_f32 v0, v0, v2
932; GFX8-NEXT:    v_ldexp_f32 v1, v1, v11
933; GFX8-NEXT:    v_ldexp_f32 v2, v4, v12
934; GFX8-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
935; GFX8-NEXT:    v_cvt_f16_f32_e32 v4, v0
936; GFX8-NEXT:    v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
937; GFX8-NEXT:    v_cvt_f16_f32_e32 v6, v2
938; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v9
939; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v10, vcc
940; GFX8-NEXT:    v_or_b32_e32 v2, v4, v3
941; GFX8-NEXT:    v_or_b32_e32 v3, v6, v5
942; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
943; GFX8-NEXT:    s_endpgm
944;
945; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
946; GFX11:       ; %bb.0:
947; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
948; GFX11-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
949; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
950; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 5, v8
951; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
952; GFX11-NEXT:    s_clause 0x1
953; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3] offset:16
954; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[2:3]
955; GFX11-NEXT:    s_waitcnt vmcnt(1)
956; GFX11-NEXT:    v_xor_b32_e32 v9, v2, v3
957; GFX11-NEXT:    v_xor_b32_e32 v11, v0, v1
958; GFX11-NEXT:    s_waitcnt vmcnt(0)
959; GFX11-NEXT:    v_xor_b32_e32 v13, v6, v7
960; GFX11-NEXT:    v_xor_b32_e32 v15, v4, v5
961; GFX11-NEXT:    v_cls_i32_e32 v10, v3
962; GFX11-NEXT:    v_cls_i32_e32 v12, v1
963; GFX11-NEXT:    v_cls_i32_e32 v14, v7
964; GFX11-NEXT:    v_cls_i32_e32 v16, v5
965; GFX11-NEXT:    v_ashrrev_i32_e32 v9, 31, v9
966; GFX11-NEXT:    v_ashrrev_i32_e32 v11, 31, v11
967; GFX11-NEXT:    v_ashrrev_i32_e32 v13, 31, v13
968; GFX11-NEXT:    v_ashrrev_i32_e32 v15, 31, v15
969; GFX11-NEXT:    v_add_nc_u32_e32 v10, -1, v10
970; GFX11-NEXT:    v_add_nc_u32_e32 v12, -1, v12
971; GFX11-NEXT:    v_add_nc_u32_e32 v14, -1, v14
972; GFX11-NEXT:    v_add_nc_u32_e32 v16, -1, v16
973; GFX11-NEXT:    v_add_nc_u32_e32 v9, 32, v9
974; GFX11-NEXT:    v_add_nc_u32_e32 v11, 32, v11
975; GFX11-NEXT:    v_add_nc_u32_e32 v13, 32, v13
976; GFX11-NEXT:    v_add_nc_u32_e32 v15, 32, v15
977; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
978; GFX11-NEXT:    v_min_u32_e32 v9, v10, v9
979; GFX11-NEXT:    v_min_u32_e32 v10, v12, v11
980; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
981; GFX11-NEXT:    v_min_u32_e32 v11, v14, v13
982; GFX11-NEXT:    v_min_u32_e32 v12, v16, v15
983; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
984; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v9, v[2:3]
985; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v10, v[0:1]
986; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
987; GFX11-NEXT:    v_lshlrev_b64 v[6:7], v11, v[6:7]
988; GFX11-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
989; GFX11-NEXT:    v_sub_nc_u32_e32 v9, 32, v9
990; GFX11-NEXT:    v_sub_nc_u32_e32 v10, 32, v10
991; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
992; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
993; GFX11-NEXT:    v_min_u32_e32 v6, 1, v6
994; GFX11-NEXT:    v_min_u32_e32 v4, 1, v4
995; GFX11-NEXT:    v_sub_nc_u32_e32 v11, 32, v11
996; GFX11-NEXT:    v_or_b32_e32 v2, v3, v2
997; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
998; GFX11-NEXT:    v_or_b32_e32 v1, v7, v6
999; GFX11-NEXT:    v_or_b32_e32 v3, v5, v4
1000; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 32, v12
1001; GFX11-NEXT:    v_cvt_f32_i32_e32 v2, v2
1002; GFX11-NEXT:    v_cvt_f32_i32_e32 v0, v0
1003; GFX11-NEXT:    v_cvt_f32_i32_e32 v1, v1
1004; GFX11-NEXT:    v_cvt_f32_i32_e32 v3, v3
1005; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
1006; GFX11-NEXT:    v_ldexp_f32 v2, v2, v9
1007; GFX11-NEXT:    v_ldexp_f32 v0, v0, v10
1008; GFX11-NEXT:    v_ldexp_f32 v1, v1, v11
1009; GFX11-NEXT:    v_ldexp_f32 v3, v3, v4
1010; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1011; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
1012; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
1013; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1014; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v1
1015; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
1016; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1017; GFX11-NEXT:    v_pack_b32_f16 v1, v0, v2
1018; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v4
1019; GFX11-NEXT:    global_store_b64 v5, v[0:1], s[0:1]
1020; GFX11-NEXT:    s_endpgm
; IR under test: %tid selects the <4 x i64> input element and the matching
; <4 x half> output element.
1021  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1022  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
1023  %out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid
; Load, convert all four lanes with one sitofp, and store the packed result.
1024  %value = load <4 x i64>, ptr addrspace(1) %in.gep
1025  %result = sitofp <4 x i64> %value to <4 x half>
1026  store <4 x half> %result, ptr addrspace(1) %out.gep
1027  ret void
1028}
1029
; Intrinsic called by the v_* tests in this file; its i32 result is used as the
; getelementptr index into the input and output buffers.
1030declare i32 @llvm.amdgcn.workitem.id.x() #1
1031
; #0 is attached to the test kernels, #1 to the intrinsic declaration above.
1032attributes #0 = { nounwind }
1033attributes #1 = { nounwind readnone }
1034