; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

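; GCN has no single i64 -> f64 convert instruction, so the conversion is
; expanded into 32-bit halves, roughly (sketch):
;   lo = trunc i64 %val to i32 ; hi = lshr i64 %val, 32
;   result = uitofp(hi) * 2^32 + uitofp(lo)
; which shows up below as two v_cvt_f64_u32, a v_ldexp_f64 by 32 on the high
; half, and a v_add_f64.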
define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_uint_to_fp_i64_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; SI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; SI-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_uint_to_fp_i64_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
  %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %val = load i64, ptr addrspace(1) %gep, align 8
  %result = uitofp i64 %val to double
  store double %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %in) {
; SI-LABEL: s_uint_to_fp_i64_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; SI-NEXT:    v_mov_b32_e32 v4, s0
; SI-NEXT:    v_mov_b32_e32 v5, s1
; SI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; SI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; SI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i64_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; VI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cast = uitofp i64 %in to double
  store double %cast, ptr addrspace(1) %out, align 8
  ret void
}

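; The vector versions below repeat the same hi/lo expansion once per i64
; element: one cvt/cvt/ldexp/add chain per lane, with the results batched
; into dwordx4 stores.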
define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2 x i64> %in) {
; SI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s0
; SI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; SI-NEXT:    v_ldexp_f64 v[8:9], v[2:3], 32
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
; SI-NEXT:    v_add_f64 v[0:1], v[8:9], v[6:7]
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_mov_b32_e32 v5, s5
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; VI-NEXT:    v_ldexp_f64 v[4:5], v[2:3], 32
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; VI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; VI-NEXT:    v_add_f64 v[0:1], v[4:5], v[6:7]
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %cast = uitofp <2 x i64> %in to <2 x double>
  store <2 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s1
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s0
; SI-NEXT:    v_cvt_f64_u32_e32 v[8:9], s7
; SI-NEXT:    v_cvt_f64_u32_e32 v[10:11], s5
; SI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; SI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 32
; SI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT:    v_add_f64 v[0:1], v[4:5], v[6:7]
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s6
; SI-NEXT:    v_cvt_f64_u32_e32 v[12:13], s4
; SI-NEXT:    v_ldexp_f64 v[6:7], v[8:9], 32
; SI-NEXT:    v_ldexp_f64 v[8:9], v[10:11], 32
; SI-NEXT:    s_add_u32 s0, s8, 16
; SI-NEXT:    s_addc_u32 s1, s9, 0
; SI-NEXT:    v_add_f64 v[6:7], v[6:7], v[4:5]
; SI-NEXT:    v_add_f64 v[4:5], v[8:9], v[12:13]
; SI-NEXT:    v_mov_b32_e32 v9, s1
; SI-NEXT:    v_mov_b32_e32 v8, s0
; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT:    s_nop 0
; SI-NEXT:    v_mov_b32_e32 v4, s8
; SI-NEXT:    v_mov_b32_e32 v5, s9
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x20
; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s7
; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s5
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s1
; VI-NEXT:    v_ldexp_f64 v[8:9], v[2:3], 32
; VI-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 32
; VI-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
; VI-NEXT:    v_ldexp_f64 v[10:11], v[6:7], 32
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s6
; VI-NEXT:    v_cvt_f64_u32_e32 v[12:13], s4
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
; VI-NEXT:    v_cvt_f64_u32_e32 v[14:15], s0
; VI-NEXT:    v_add_f64 v[6:7], v[8:9], v[6:7]
; VI-NEXT:    v_add_f64 v[4:5], v[4:5], v[12:13]
; VI-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
; VI-NEXT:    v_add_f64 v[0:1], v[10:11], v[14:15]
; VI-NEXT:    s_add_u32 s0, s8, 16
; VI-NEXT:    s_addc_u32 s1, s9, 0
; VI-NEXT:    v_mov_b32_e32 v11, s1
; VI-NEXT:    v_mov_b32_e32 v8, s8
; VI-NEXT:    v_mov_b32_e32 v10, s0
; VI-NEXT:    v_mov_b32_e32 v9, s9
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    s_endpgm
  %cast = uitofp <4 x i64> %in to <4 x double>
  store <4 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

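; i32 -> f64 needs no expansion: v_cvt_f64_u32 converts directly, and the
; result is exact because every u32 value fits in f64's 53-bit significand.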
define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_uint_to_fp_i32_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i32_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cast = uitofp i32 %in to double
  store double %cast, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2 x i32> %in) {
; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; GCN-NEXT:    v_mov_b32_e32 v5, s1
; GCN-NEXT:    v_mov_b32_e32 v4, s0
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %cast = uitofp <2 x i32> %in to <2 x double>
  store <2 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4 x i32> %in) {
; SI-LABEL: s_uint_to_fp_v4i32_to_v4f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
; SI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
; SI-NEXT:    s_add_u32 s0, s4, 16
; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; SI-NEXT:    s_addc_u32 s1, s5, 0
; SI-NEXT:    v_mov_b32_e32 v9, s1
; SI-NEXT:    v_mov_b32_e32 v8, s0
; SI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT:    s_nop 0
; SI-NEXT:    v_mov_b32_e32 v4, s4
; SI-NEXT:    v_mov_b32_e32 v5, s5
; SI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_v4i32_to_v4f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
; VI-NEXT:    s_add_u32 s0, s4, 16
; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
; VI-NEXT:    s_addc_u32 s1, s5, 0
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT:    s_nop 0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %cast = uitofp <4 x i32> %in to <4 x double>
  store <4 x double> %cast, ptr addrspace(1) %out, align 16
  ret void
}

; We can't fold the SGPRs into v_cndmask_b32_e32, because it already
; uses an SGPR (implicit vcc).
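; With a scalar compare, the uitofp instead folds to an s_cselect_b32 between
; the two possible high dwords of the result (0x3ff00000 for 1.0, 0 for 0.0);
; the low dword is always zero, so only one dword is actually selected.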
define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: uint_to_fp_i1_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uint_to_fp_i1_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %fp = uitofp i1 %cmp to double
  store double %fp, ptr addrspace(1) %out, align 4
  ret void
}

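; When the i1 arrives as a kernel argument rather than a compare, the bit is
; tested with s_bitcmp1_b32, materialized to 0 or 1 with v_cndmask_b32, and
; then converted with a regular v_cvt_f64_u32.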
define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %in) {
; SI-LABEL: uint_to_fp_i1_to_f64_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bitcmp1_b32 s2, 0
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: uint_to_fp_i1_to_f64_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bitcmp1_b32 s2, 0
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %fp = uitofp i1 %in to double
  store double %fp, ptr addrspace(1) %out, align 8
  ret void
}

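; uitofp i8 is just uitofp i32 on a zero-extended value: the byte is masked
; with s_and_b32 against 0xff before the 32-bit convert.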
define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in) {
; SI-LABEL: s_uint_to_fp_i8_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s2, s2, 0xff
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_uint_to_fp_i8_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s2, s2, 0xff
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %fp = uitofp i8 %in to double
  store double %fp, ptr addrspace(1) %out
  ret void
}

; FIXME: Worse on VI: the SDWA and needs an extra v_mov_b32 to materialize
; the 0xffff mask, where SI zero-extends with a single v_and_b32 against an
; inline 0xff.
define double @v_uint_to_fp_i8_to_f64(i8 %in) {
; SI-LABEL: v_uint_to_fp_i8_to_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uint_to_fp_i8_to_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %fp = uitofp i8 %in to double
  ret double %fp
}

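; The tests below check that a select of the constants 1.0/0.0 (or their i64
; bit patterns) compiles to the same code as uitofp i1 above: a select of
; just the high dword, never materializing both full doubles.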
define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_select_uint_to_fp_i1_vals_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_select_uint_to_fp_i1_vals_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 1.0, double 0.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}

define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 1.0, double 0.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}

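; u0x3ff0000000000000 is the IEEE-754 bit pattern of double 1.0, so the i64
; select below should lower identically to the f64 version above.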
define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_select_uint_to_fp_i1_vals_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_select_uint_to_fp_i1_vals_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
  store i64 %select, ptr addrspace(1) %out, align 8
  ret void
}

define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_select_uint_to_fp_i1_vals_i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
  store i64 %select, ptr addrspace(1) %out, align 8
  ret void
}

; TODO: This should swap the select operands / invert the compare and apply
; the same fold.
define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, 0
; SI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 0.0, double 1.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}

define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, 0x3ff00000
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, 0, vcc
; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %cmp = icmp eq i32 %in, 0
  %select = select i1 %cmp, double 0.0, double 1.0
  store double %select, ptr addrspace(1) %out, align 8
  ret void
}