xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll (revision 5d9c717597aef72e4ba27a2b143e9753c513e5c9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
3; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
4
5declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
7
; Test that (uitofp (and x, 255)) selects v_cvt_f32_ubyte0; VI uses the SDWA
; byte-select form instead of a separate mask. CHECK lines are autogenerated.
8define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
9; SI-LABEL: v_uitofp_i32_to_f32_mask255:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
13; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
14; SI-NEXT:    s_setpc_b64 s[30:31]
15;
16; VI-LABEL: v_uitofp_i32_to_f32_mask255:
17; VI:       ; %bb.0:
18; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
20; VI-NEXT:    s_setpc_b64 s[30:31]
21  %masked = and i32 %arg0, 255
22  %cvt = uitofp i32 %masked to float
23  ret float %cvt
24}
25
; Same as the uitofp variant: after (and x, 255) the value is known
; non-negative, so sitofp also selects the unsigned ubyte0 conversion.
26define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
27; SI-LABEL: v_sitofp_i32_to_f32_mask255:
28; SI:       ; %bb.0:
29; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
31; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
32; SI-NEXT:    s_setpc_b64 s[30:31]
33;
34; VI-LABEL: v_sitofp_i32_to_f32_mask255:
35; VI:       ; %bb.0:
36; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
38; VI-NEXT:    s_setpc_b64 s[30:31]
39  %masked = and i32 %arg0, 255
40  %cvt = sitofp i32 %masked to float
41  ret float %cvt
42}
43
; A shift of 7 is not byte-aligned, so no SDWA byte select is possible;
; both targets extract the field with v_bfe_u32 before converting.
44define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
45; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
46; GCN:       ; %bb.0:
47; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
48; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 8
49; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
50; GCN-NEXT:    s_setpc_b64 s[30:31]
51  %lshr.7 = lshr i32 %arg0, 7
52  %masked = and i32 %lshr.7, 255
53  %cvt = uitofp i32 %masked to float
54  ret float %cvt
55}
56
; Byte-aligned shift of 8: VI folds it into an SDWA BYTE_1 select on the
; conversion; SI has no SDWA and uses v_bfe_u32.
57define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
58; SI-LABEL: v_uitofp_to_f32_lshr8_mask255:
59; SI:       ; %bb.0:
60; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
61; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
62; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
63; SI-NEXT:    s_setpc_b64 s[30:31]
64;
65; VI-LABEL: v_uitofp_to_f32_lshr8_mask255:
66; VI:       ; %bb.0:
67; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
69; VI-NEXT:    s_setpc_b64 s[30:31]
70  %lshr.8 = lshr i32 %arg0, 8
71  %masked = and i32 %lshr.8, 255
72  %cvt = uitofp i32 %masked to float
73  ret float %cvt
74}
75
; The lshr result has a second use (the store), so the shift must be emitted
; explicitly; only the mask+convert is folded into the ubyte0 conversion.
76define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
77; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
78; SI:       ; %bb.0:
79; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
81; SI-NEXT:    s_mov_b32 s6, -1
82; SI-NEXT:    s_mov_b32 s7, 0xf000
83; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
84; SI-NEXT:    s_waitcnt expcnt(0)
85; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
86; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
87; SI-NEXT:    s_waitcnt vmcnt(0)
88; SI-NEXT:    s_setpc_b64 s[30:31]
89;
90; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
91; VI:       ; %bb.0:
92; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
94; VI-NEXT:    flat_store_dword v[0:1], v0
95; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
96; VI-NEXT:    s_waitcnt vmcnt(0)
97; VI-NEXT:    s_setpc_b64 s[30:31]
98  %lshr.8 = lshr i32 %arg0, 8
99  store i32 %lshr.8, ptr addrspace(1) undef
100  %masked = and i32 %lshr.8, 255
101  %cvt = uitofp i32 %masked to float
102  ret float %cvt
103}
104
; Byte-aligned shift of 16: VI selects BYTE_2 via SDWA; SI uses v_bfe_u32.
105define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
106; SI-LABEL: v_uitofp_to_f32_lshr16_mask255:
107; SI:       ; %bb.0:
108; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
110; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
111; SI-NEXT:    s_setpc_b64 s[30:31]
112;
113; VI-LABEL: v_uitofp_to_f32_lshr16_mask255:
114; VI:       ; %bb.0:
115; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
116; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
117; VI-NEXT:    s_setpc_b64 s[30:31]
118  %lshr.16 = lshr i32 %arg0, 16
119  %masked = and i32 %lshr.16, 255
120  %cvt = uitofp i32 %masked to float
121  ret float %cvt
122}
123
; Shift of 24 extracts the top byte, so the mask is redundant and both
; targets select the dedicated v_cvt_f32_ubyte3 instruction.
; (Renamed the misleading local %lshr.16 to %lshr.24 to match the shift
; amount; SSA value names do not appear in the autogenerated CHECK lines.)
124define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
125; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
126; GCN:       ; %bb.0:
127; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
128; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
129; GCN-NEXT:    s_setpc_b64 s[30:31]
130  %lshr.24 = lshr i32 %arg0, 24
131  %masked = and i32 %lshr.24, 255
132  %cvt = uitofp i32 %masked to float
133  ret float %cvt
134}
135
; Direct i8 argument: the implicit zero-extend is folded into ubyte0
; conversion (SDWA BYTE_0 on VI, explicit mask on SI).
136define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
137; SI-LABEL: v_uitofp_i8_to_f32:
138; SI:       ; %bb.0:
139; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
140; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
141; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
142; SI-NEXT:    s_setpc_b64 s[30:31]
143;
144; VI-LABEL: v_uitofp_i8_to_f32:
145; VI:       ; %bb.0:
146; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
148; VI-NEXT:    s_setpc_b64 s[30:31]
149  %cvt = uitofp i8 %arg0 to float
150  ret float %cvt
151}
152
; <2 x i8> packed in an i16: each lane should convert from its own byte of
; the source register (BYTE_0/BYTE_1 SDWA selects on VI).
153define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
154; SI-LABEL: v_uitofp_v2i8_to_v2f32:
155; SI:       ; %bb.0:
156; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
157; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
158; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v1
159; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
160; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
161; SI-NEXT:    v_mov_b32_e32 v0, v2
162; SI-NEXT:    s_setpc_b64 s[30:31]
163;
164; VI-LABEL: v_uitofp_v2i8_to_v2f32:
165; VI:       ; %bb.0:
166; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
167; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
168; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
169; VI-NEXT:    v_mov_b32_e32 v0, v2
170; VI-NEXT:    s_setpc_b64 s[30:31]
171  %val = bitcast i16 %arg0 to <2 x i8>
172  %cvt = uitofp <2 x i8> %val to <2 x float>
173  ret <2 x float> %cvt
174}
175
; <3 x i8> packed in the low 24 bits of an i32: lanes convert from bytes
; 0/1/2 of the source (SDWA selects on VI, bfe extracts on SI).
176define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
177; SI-LABEL: v_uitofp_v3i8_to_v3f32:
178; SI:       ; %bb.0:
179; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
181; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
182; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
183; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
184; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
185; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
186; SI-NEXT:    v_mov_b32_e32 v0, v3
187; SI-NEXT:    s_setpc_b64 s[30:31]
188;
189; VI-LABEL: v_uitofp_v3i8_to_v3f32:
190; VI:       ; %bb.0:
191; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
193; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
194; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
195; VI-NEXT:    v_mov_b32_e32 v0, v3
196; VI-NEXT:    s_setpc_b64 s[30:31]
197  %trunc = trunc i32 %arg0 to i24
198  %val = bitcast i24 %trunc to <3 x i8>
199  %cvt = uitofp <3 x i8> %val to <3 x float>
200  ret <3 x float> %cvt
201}
202
; Full <4 x i8> unpack: bytes 0-2 via ubyte0 (SDWA on VI), and the top byte
; via the dedicated v_cvt_f32_ubyte3 on both targets.
203define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
204; SI-LABEL: v_uitofp_v4i8_to_v4f32:
205; SI:       ; %bb.0:
206; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
207; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
208; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
209; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
210; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
211; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
212; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
213; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
214; SI-NEXT:    v_mov_b32_e32 v0, v4
215; SI-NEXT:    s_setpc_b64 s[30:31]
216;
217; VI-LABEL: v_uitofp_v4i8_to_v4f32:
218; VI:       ; %bb.0:
219; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
220; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
221; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
222; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
223; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
224; VI-NEXT:    v_mov_b32_e32 v0, v4
225; VI-NEXT:    s_setpc_b64 s[30:31]
226  %val = bitcast i32 %arg0 to <4 x i8>
227  %cvt = uitofp <4 x i8> %val to <4 x float>
228  ret <4 x float> %cvt
229}
230
; Scalar shift+mask+convert chain for each byte of an i32; should lower to
; the same code as the <4 x i8> bitcast version above.
231define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
232; SI-LABEL: v_uitofp_unpack_i32_to_v4f32:
233; SI:       ; %bb.0:
234; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
236; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v1
237; SI-NEXT:    v_bfe_u32 v1, v0, 8, 8
238; SI-NEXT:    v_bfe_u32 v2, v0, 16, 8
239; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
240; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
241; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
242; SI-NEXT:    v_mov_b32_e32 v0, v4
243; SI-NEXT:    s_setpc_b64 s[30:31]
244;
245; VI-LABEL: v_uitofp_unpack_i32_to_v4f32:
246; VI:       ; %bb.0:
247; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
248; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
249; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
250; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
251; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
252; VI-NEXT:    v_mov_b32_e32 v0, v4
253; VI-NEXT:    s_setpc_b64 s[30:31]
254  %mask.arg0 = and i32 %arg0, 255
255  %cvt0 = uitofp i32 %mask.arg0 to float
256
257  %lshr.8 = lshr i32 %arg0, 8
258  %mask.lshr.8 = and i32 %lshr.8, 255
259  %cvt1 = uitofp i32 %mask.lshr.8 to float
260
261  %lshr.16 = lshr i32 %arg0, 16
262  %mask.lshr.16 = and i32 %lshr.16, 255
263  %cvt2 = uitofp i32 %mask.lshr.16 to float
264
265  %lshr.24 = lshr i32 %arg0, 24
266  %mask.lshr.24 = and i32 %lshr.24, 255
267  %cvt3 = uitofp i32 %mask.lshr.24 to float
268
269  %ins.0 = insertelement <4 x float> undef, float %cvt0, i32 0
270  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
271  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
272  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3
273  ret <4 x float> %ins.3
274}
275
; f16 result: the byte conversion still goes through f32 (ubyte0) followed
; by v_cvt_f16_f32 — there is no direct ubyte-to-f16 conversion.
276define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
277; SI-LABEL: v_uitofp_i32_to_f16_mask255:
278; SI:       ; %bb.0:
279; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
281; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
282; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
283; SI-NEXT:    s_setpc_b64 s[30:31]
284;
285; VI-LABEL: v_uitofp_i32_to_f16_mask255:
286; VI:       ; %bb.0:
287; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
289; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
290; VI-NEXT:    s_setpc_b64 s[30:31]
291  %masked = and i32 %arg0, 255
292  %cvt = uitofp i32 %masked to half
293  ret half %cvt
294}
295
; sitofp of a 255-masked value to f16: known non-negative, so the unsigned
; ubyte0 path is used, followed by f32-to-f16 conversion.
296define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
297; SI-LABEL: v_sitofp_i32_to_f16_mask255:
298; SI:       ; %bb.0:
299; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
300; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
301; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
302; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
303; SI-NEXT:    s_setpc_b64 s[30:31]
304;
305; VI-LABEL: v_sitofp_i32_to_f16_mask255:
306; VI:       ; %bb.0:
307; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
308; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
309; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
310; VI-NEXT:    s_setpc_b64 s[30:31]
311  %masked = and i32 %arg0, 255
312  %cvt = sitofp i32 %masked to half
313  ret half %cvt
314}
315
; Byte 1 to f16: SDWA BYTE_1 select on VI / v_bfe_u32 on SI, then f32->f16.
316define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
317; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
318; SI:       ; %bb.0:
319; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
321; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
322; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
323; SI-NEXT:    s_setpc_b64 s[30:31]
324;
325; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
326; VI:       ; %bb.0:
327; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
328; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
329; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
330; VI-NEXT:    s_setpc_b64 s[30:31]
331  %lshr.8 = lshr i32 %arg0, 8
332  %masked = and i32 %lshr.8, 255
333  %cvt = uitofp i32 %masked to half
334  ret half %cvt
335}
336
; Byte 2 to f16: SDWA BYTE_2 select on VI / v_bfe_u32 on SI, then f32->f16.
337define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
338; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
339; SI:       ; %bb.0:
340; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
341; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
342; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
343; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
344; SI-NEXT:    s_setpc_b64 s[30:31]
345;
346; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
347; VI:       ; %bb.0:
348; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
349; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
350; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
351; VI-NEXT:    s_setpc_b64 s[30:31]
352  %lshr.16 = lshr i32 %arg0, 16
353  %masked = and i32 %lshr.16, 255
354  %cvt = uitofp i32 %masked to half
355  ret half %cvt
356}
357
; Top byte to f16: v_cvt_f32_ubyte3 followed by f32->f16 on both targets.
; (Renamed the misleading local %lshr.16 to %lshr.24 to match the shift
; amount; SSA value names do not appear in the autogenerated CHECK lines.)
358define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
359; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
360; GCN:       ; %bb.0:
361; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
363; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v0
364; GCN-NEXT:    s_setpc_b64 s[30:31]
365  %lshr.24 = lshr i32 %arg0, 24
366  %masked = and i32 %lshr.24, 255
367  %cvt = uitofp i32 %masked to half
368  ret half %cvt
369}
370
; i8 argument to f16: zero-extend folded into ubyte0, then f32->f16.
371define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
372; SI-LABEL: v_uitofp_i8_to_f16:
373; SI:       ; %bb.0:
374; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
375; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
376; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
377; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
378; SI-NEXT:    s_setpc_b64 s[30:31]
379;
380; VI-LABEL: v_uitofp_i8_to_f16:
381; VI:       ; %bb.0:
382; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
384; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
385; VI-NEXT:    s_setpc_b64 s[30:31]
386  %cvt = uitofp i8 %arg0 to half
387  ret half %cvt
388}
389
; f64 result: no ubyte conversion exists for doubles, so the mask stays
; explicit and v_cvt_f64_u32 is used on both targets.
390define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
391; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
392; GCN:       ; %bb.0:
393; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
394; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
395; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
396; GCN-NEXT:    s_setpc_b64 s[30:31]
397  %masked = and i32 %arg0, 255
398  %cvt = uitofp i32 %masked to double
399  ret double %cvt
400}
401
; Byte 1 to f64: shift+mask folds to v_bfe_u32, then v_cvt_f64_u32.
402define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
403; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
404; GCN:       ; %bb.0:
405; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
406; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
407; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
408; GCN-NEXT:    s_setpc_b64 s[30:31]
409  %lshr.8 = lshr i32 %arg0, 8
410  %masked = and i32 %lshr.8, 255
411  %cvt = uitofp i32 %masked to double
412  ret double %cvt
413}
414
; Byte 2 to f64: shift+mask folds to v_bfe_u32, then v_cvt_f64_u32.
415define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
416; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
417; GCN:       ; %bb.0:
418; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
419; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
420; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
421; GCN-NEXT:    s_setpc_b64 s[30:31]
422  %lshr.16 = lshr i32 %arg0, 16
423  %masked = and i32 %lshr.16, 255
424  %cvt = uitofp i32 %masked to double
425  ret double %cvt
426}
427
; Top byte to f64: a shift by 24 needs no mask, so a plain v_lshrrev
; feeds v_cvt_f64_u32 directly.
; (Renamed the misleading local %lshr.16 to %lshr.24 to match the shift
; amount; SSA value names do not appear in the autogenerated CHECK lines.)
428define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
429; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
430; GCN:       ; %bb.0:
431; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
433; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
434; GCN-NEXT:    s_setpc_b64 s[30:31]
435  %lshr.24 = lshr i32 %arg0, 24
436  %masked = and i32 %lshr.24, 255
437  %cvt = uitofp i32 %masked to double
438  ret double %cvt
439}
440
; i8 argument to f64: explicit 0xff mask then v_cvt_f64_u32 on both targets.
441define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
442; GCN-LABEL: v_uitofp_i8_to_f64:
443; GCN:       ; %bb.0:
444; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
445; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
446; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
447; GCN-NEXT:    s_setpc_b64 s[30:31]
448  %cvt = uitofp i8 %arg0 to double
449  ret double %cvt
450}
451
; Kernel variant: a loaded ubyte feeds v_cvt_f32_ubyte0 directly with no
; extra mask, since the load already zero-extends.
452define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
453; SI-LABEL: load_i8_to_f32:
454; SI:       ; %bb.0:
455; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
456; SI-NEXT:    s_mov_b32 s6, 0
457; SI-NEXT:    s_mov_b32 s7, 0xf000
458; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
459; SI-NEXT:    s_waitcnt lgkmcnt(0)
460; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
461; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
462; SI-NEXT:    s_mov_b32 s6, -1
463; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
464; SI-NEXT:    s_waitcnt vmcnt(0)
465; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
466; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
467; SI-NEXT:    s_endpgm
468;
469; VI-LABEL: load_i8_to_f32:
470; VI:       ; %bb.0:
471; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
472; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
473; VI-NEXT:    s_waitcnt lgkmcnt(0)
474; VI-NEXT:    v_mov_b32_e32 v1, s2
475; VI-NEXT:    v_mov_b32_e32 v2, s3
476; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
477; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
478; VI-NEXT:    flat_load_ubyte v0, v[0:1]
479; VI-NEXT:    s_waitcnt vmcnt(0)
480; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
481; VI-NEXT:    v_mov_b32_e32 v0, s0
482; VI-NEXT:    v_mov_b32_e32 v1, s1
483; VI-NEXT:    flat_store_dword v[0:1], v2
484; VI-NEXT:    s_endpgm
485  %tid = call i32 @llvm.amdgcn.workitem.id.x()
486  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
487  %load = load i8, ptr addrspace(1) %gep, align 1
488  %cvt = uitofp i8 %load to float
489  store float %cvt, ptr addrspace(1) %out, align 4
490  ret void
491}
492
; <2 x i8> loaded as a single ushort; each lane converts from its byte of
; the loaded register (SDWA selects on VI, mask/bfe on SI).
493define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
494; SI-LABEL: load_v2i8_to_v2f32:
495; SI:       ; %bb.0:
496; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
497; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
498; SI-NEXT:    v_mov_b32_e32 v1, 0
499; SI-NEXT:    s_mov_b32 s6, 0
500; SI-NEXT:    s_mov_b32 s7, 0xf000
501; SI-NEXT:    s_waitcnt lgkmcnt(0)
502; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
503; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
504; SI-NEXT:    s_mov_b32 s6, -1
505; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
506; SI-NEXT:    s_waitcnt vmcnt(0)
507; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
508; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
509; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
510; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
511; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
512; SI-NEXT:    s_endpgm
513;
514; VI-LABEL: load_v2i8_to_v2f32:
515; VI:       ; %bb.0:
516; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
517; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
518; VI-NEXT:    s_waitcnt lgkmcnt(0)
519; VI-NEXT:    v_mov_b32_e32 v0, s2
520; VI-NEXT:    v_mov_b32_e32 v1, s3
521; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
522; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
523; VI-NEXT:    flat_load_ushort v1, v[0:1]
524; VI-NEXT:    v_mov_b32_e32 v3, s1
525; VI-NEXT:    v_mov_b32_e32 v2, s0
526; VI-NEXT:    s_waitcnt vmcnt(0)
527; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
528; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
529; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
530; VI-NEXT:    s_endpgm
531  %tid = call i32 @llvm.amdgcn.workitem.id.x()
532  %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
533  %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
534  %cvt = uitofp <2 x i8> %load to <2 x float>
535  store <2 x float> %cvt, ptr addrspace(1) %out, align 16
536  ret void
537}
538
; <3 x i8> with align 4: loaded as one dword, then bytes 0-2 converted
; (SDWA selects on VI, mask/bfe on SI).
539define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
540; SI-LABEL: load_v3i8_to_v3f32:
541; SI:       ; %bb.0:
542; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
543; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
544; SI-NEXT:    v_mov_b32_e32 v1, 0
545; SI-NEXT:    s_mov_b32 s6, 0
546; SI-NEXT:    s_mov_b32 s7, 0xf000
547; SI-NEXT:    s_waitcnt lgkmcnt(0)
548; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
549; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
550; SI-NEXT:    s_mov_b32 s6, -1
551; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
552; SI-NEXT:    s_waitcnt vmcnt(0)
553; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
554; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
555; SI-NEXT:    v_bfe_u32 v3, v0, 16, 8
556; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
557; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
558; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
559; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
560; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
561; SI-NEXT:    s_endpgm
562;
563; VI-LABEL: load_v3i8_to_v3f32:
564; VI:       ; %bb.0:
565; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
566; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
567; VI-NEXT:    s_waitcnt lgkmcnt(0)
568; VI-NEXT:    v_mov_b32_e32 v0, s2
569; VI-NEXT:    v_mov_b32_e32 v1, s3
570; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
571; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
572; VI-NEXT:    flat_load_dword v2, v[0:1]
573; VI-NEXT:    v_mov_b32_e32 v4, s1
574; VI-NEXT:    v_mov_b32_e32 v3, s0
575; VI-NEXT:    s_waitcnt vmcnt(0)
576; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
577; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
578; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
579; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
580; VI-NEXT:    s_endpgm
581  %tid = call i32 @llvm.amdgcn.workitem.id.x()
582  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
583  %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
584  %cvt = uitofp <3 x i8> %load to <3 x float>
585  store <3 x float> %cvt, ptr addrspace(1) %out, align 16
586  ret void
587}
588
; <4 x i8> with align 4: one dword load, all four bytes converted without
; re-packing (SDWA + ubyte3 on VI, mask/bfe + ubyte3 on SI).
589define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
590; SI-LABEL: load_v4i8_to_v4f32:
591; SI:       ; %bb.0:
592; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
593; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
594; SI-NEXT:    v_mov_b32_e32 v1, 0
595; SI-NEXT:    s_mov_b32 s6, 0
596; SI-NEXT:    s_mov_b32 s7, 0xf000
597; SI-NEXT:    s_waitcnt lgkmcnt(0)
598; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
599; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
600; SI-NEXT:    s_mov_b32 s6, -1
601; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
602; SI-NEXT:    s_waitcnt vmcnt(0)
603; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
604; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
605; SI-NEXT:    v_bfe_u32 v4, v0, 16, 8
606; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
607; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
608; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
609; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
610; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
611; SI-NEXT:    s_endpgm
612;
613; VI-LABEL: load_v4i8_to_v4f32:
614; VI:       ; %bb.0:
615; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
616; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
617; VI-NEXT:    s_waitcnt lgkmcnt(0)
618; VI-NEXT:    v_mov_b32_e32 v0, s2
619; VI-NEXT:    v_mov_b32_e32 v1, s3
620; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
621; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
622; VI-NEXT:    flat_load_dword v3, v[0:1]
623; VI-NEXT:    v_mov_b32_e32 v5, s1
624; VI-NEXT:    v_mov_b32_e32 v4, s0
625; VI-NEXT:    s_waitcnt vmcnt(0)
626; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
627; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
628; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
629; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
630; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
631; VI-NEXT:    s_endpgm
632  %tid = call i32 @llvm.amdgcn.workitem.id.x()
633  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
634  %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
635  %cvt = uitofp <4 x i8> %load to <4 x float>
636  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
637  ret void
638}
639
640; This should not be adding instructions to shift into the correct
641; position in the word for the component.
642
643; FIXME: Packing bytes
; align 1 forces four separate ubyte loads; the bytes are currently
; re-packed into a dword before the conversions extract them again
; (the FIXME above this function tracks eliminating that re-packing).
644define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
645; SI-LABEL: load_v4i8_to_v4f32_unaligned:
646; SI:       ; %bb.0:
647; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
648; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
649; SI-NEXT:    v_mov_b32_e32 v1, 0
650; SI-NEXT:    s_mov_b32 s6, 0
651; SI-NEXT:    s_mov_b32 s7, 0xf000
652; SI-NEXT:    s_waitcnt lgkmcnt(0)
653; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
654; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
655; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
656; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
657; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
658; SI-NEXT:    s_mov_b32 s6, -1
659; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
660; SI-NEXT:    s_waitcnt vmcnt(3)
661; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
662; SI-NEXT:    s_waitcnt vmcnt(2)
663; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
664; SI-NEXT:    s_waitcnt vmcnt(1)
665; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
666; SI-NEXT:    s_waitcnt vmcnt(0)
667; SI-NEXT:    v_or_b32_e32 v0, v1, v0
668; SI-NEXT:    v_or_b32_e32 v1, v2, v3
669; SI-NEXT:    v_or_b32_e32 v0, v1, v0
670; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
671; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
672; SI-NEXT:    v_bfe_u32 v4, v0, 16, 8
673; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
674; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
675; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
676; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
677; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
678; SI-NEXT:    s_endpgm
679;
680; VI-LABEL: load_v4i8_to_v4f32_unaligned:
681; VI:       ; %bb.0:
682; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
683; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
684; VI-NEXT:    s_waitcnt lgkmcnt(0)
685; VI-NEXT:    v_mov_b32_e32 v0, s2
686; VI-NEXT:    v_mov_b32_e32 v1, s3
687; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
688; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
689; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
690; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
691; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
692; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
693; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
694; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
695; VI-NEXT:    flat_load_ubyte v2, v[2:3]
696; VI-NEXT:    flat_load_ubyte v3, v[6:7]
697; VI-NEXT:    flat_load_ubyte v4, v[4:5]
698; VI-NEXT:    flat_load_ubyte v0, v[0:1]
699; VI-NEXT:    s_waitcnt vmcnt(3)
700; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
701; VI-NEXT:    s_waitcnt vmcnt(2)
702; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
703; VI-NEXT:    s_waitcnt vmcnt(1)
704; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
705; VI-NEXT:    s_waitcnt vmcnt(0)
706; VI-NEXT:    v_or_b32_e32 v0, v1, v0
707; VI-NEXT:    v_or_b32_e32 v1, v2, v3
708; VI-NEXT:    v_or_b32_e32 v3, v1, v0
709; VI-NEXT:    v_mov_b32_e32 v5, s1
710; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
711; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
712; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
713; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
714; VI-NEXT:    v_mov_b32_e32 v4, s0
715; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
716; VI-NEXT:    s_endpgm
717  %tid = call i32 @llvm.amdgcn.workitem.id.x()
718  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
719  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
720  %cvt = uitofp <4 x i8> %load to <4 x float>
721  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
722  ret void
723}
724
; The <4 x i8> load has two uses: a uitofp to <4 x float> and an add of 9 that
; is stored out separately. Checks the byte conversions still select the
; v_cvt_f32_ubyte* forms despite the second use keeping the packed value alive.
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[0:1], s[4:5]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT:    v_and_b32_e32 v5, 0xff, v0
; SI-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_add_i32_e32 v6, vcc, 9, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
; SI-NEXT:    v_and_b32_e32 v5, 0xff, v1
; SI-NEXT:    v_and_b32_e32 v7, 0xff, v2
; SI-NEXT:    v_add_i32_e32 v8, vcc, 9, v1
; SI-NEXT:    v_add_i32_e32 v9, vcc, 9, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
; SI-NEXT:    v_and_b32_e32 v5, 0xff, v8
; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
; SI-NEXT:    v_and_b32_e32 v7, 0xff, v9
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_waitcnt expcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
; SI-NEXT:    v_or_b32_e32 v0, v6, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v6, 9
; VI-NEXT:    v_mov_b32_e32 v7, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v1, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v2, 0xff
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
; VI-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
; VI-NEXT:    v_add_u16_e32 v9, 9, v1
; VI-NEXT:    v_add_u16_sdwa v10, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_add_u16_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    v_add_u16_e32 v8, 9, v8
; VI-NEXT:    v_and_b32_e32 v10, 0xff, v10
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    v_and_b32_e32 v6, 0xff, v6
; VI-NEXT:    v_lshlrev_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v10
; VI-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v6
; VI-NEXT:    v_or_b32_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_e32 v2, v0, v2
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
  %cvt = uitofp <4 x i8> %load to <4 x float>
  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
  store <4 x i8> %add, ptr addrspace(1) %out2, align 4
  ret void
}
820
; Non-power-of-2 vector with align 1: the <7 x i8> load is split into seven
; single-byte loads, each fed directly into v_cvt_f32_ubyte0.
define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v7i8_to_v7f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(6)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT:    s_waitcnt vmcnt(5)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v3
; SI-NEXT:    s_waitcnt vmcnt(4)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v6
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v8
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dword v6, off, s[0:3], 0 offset:24
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v7i8_to_v7f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v10, vcc, 5, v0
; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v12, vcc, 6, v0
; VI-NEXT:    v_addc_u32_e32 v13, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    flat_load_ubyte v1, v[2:3]
; VI-NEXT:    flat_load_ubyte v2, v[4:5]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    flat_load_ubyte v4, v[8:9]
; VI-NEXT:    flat_load_ubyte v5, v[10:11]
; VI-NEXT:    flat_load_ubyte v6, v[12:13]
; VI-NEXT:    v_mov_b32_e32 v8, s1
; VI-NEXT:    v_mov_b32_e32 v7, s0
; VI-NEXT:    s_add_u32 s0, s0, 16
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v10, s1
; VI-NEXT:    v_mov_b32_e32 v9, s0
; VI-NEXT:    s_waitcnt vmcnt(6)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    s_waitcnt vmcnt(5)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; VI-NEXT:    s_waitcnt vmcnt(4)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v4
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v5
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v6
; VI-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
; VI-NEXT:    flat_store_dwordx3 v[9:10], v[4:6]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <7 x i8>, ptr addrspace(1) %gep, align 1
  %cvt = uitofp <7 x i8> %load to <7 x float>
  store <7 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}
917
; Aligned <8 x i8> loads as a single dwordx2; each byte of the two dwords is
; converted with v_cvt_f32_ubyte* (SDWA byte selects on VI, bfe/and on SI).
define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: load_v8i8_to_v8f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0xff, v0
; SI-NEXT:    v_bfe_u32 v4, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v5, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_and_b32_e32 v6, 0xff, v1
; SI-NEXT:    v_bfe_u32 v8, v1, 8, 8
; SI-NEXT:    v_bfe_u32 v9, v1, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v5
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v6
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v9
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT:    s_endpgm
;
; VI-LABEL: load_v8i8_to_v8f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v9, s1
; VI-NEXT:    v_mov_b32_e32 v8, s0
; VI-NEXT:    s_add_u32 s0, s0, 16
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    v_mov_b32_e32 v11, s1
; VI-NEXT:    v_mov_b32_e32 v10, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v6
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v7
; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <8 x i8>, ptr addrspace(1) %gep, align 8
  %cvt = uitofp <8 x i8> %load to <8 x float>
  store <8 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}
985
; uitofp of (and (add x, 2), 255): the low-byte mask is folded into the
; conversion — SDWA BYTE_0 on VI, explicit and + v_cvt_f32_ubyte0 on SI.
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %add = add i32 %load, 2
  %inreg = and i32 %add, 255
  %cvt = uitofp i32 %inreg to float
  store float %cvt, ptr addrspace(1) %out, align 4
  ret void
}
1032
; (and x, 0xff00) >> 8 selects byte 1 of the loaded dword: lowered to a
; v_bfe_u32 + cvt on SI and a single SDWA BYTE_1 conversion on VI.
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: i8_zext_inreg_hi1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_inreg_hi1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %load = load i32, ptr addrspace(1) %gep, align 4
  %inreg = and i32 %load, 65280
  %shr = lshr i32 %inreg, 8
  %cvt = uitofp i32 %shr to float
  store float %cvt, ptr addrspace(1) %out, align 4
  ret void
}
1077
1078; We don't get these ones because of the zext, but instcombine removes
1079; them so it shouldn't really matter.
; zext i8 load -> uitofp: the byte load result feeds v_cvt_f32_ubyte0 directly
; on both targets, with no extra masking.
define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: i8_zext_i32_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: i8_zext_i32_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s2
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v3, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
  %load = load i8, ptr addrspace(1) %gep, align 1
  %ext = zext i8 %load to i32
  %cvt = uitofp i32 %ext to float
  store float %cvt, ptr addrspace(1) %out, align 4
  ret void
}
1121
; Under-aligned (align 1) <4 x i8> load: split into four byte loads that are
; re-packed with shifts/ors, then each byte of the packed dword is converted
; with v_cvt_f32_ubyte* (SDWA byte selects on VI).
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64 offset:1
; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3
; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_or_b32_e32 v1, v2, v3
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_bfe_u32 v2, v0, 8, 8
; SI-NEXT:    v_bfe_u32 v4, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 1, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v6, vcc, 3, v0
; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v3, v[6:7]
; VI-NEXT:    flat_load_ubyte v4, v[4:5]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_or_b32_e32 v1, v2, v3
; VI-NEXT:    v_or_b32_e32 v3, v1, v0
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v3
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
  %ext = zext <4 x i8> %load to <4 x i32>
  %cvt = uitofp <4 x i32> %ext to <4 x float>
  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
  ret void
}
1203
; and x, 255 -> byte 0 conversion. The masked dword load is even narrowed to a
; byte load here, feeding v_cvt_f32_ubyte0 on both targets.
define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: extract_byte0_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte0_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %val = load i32, ptr addrspace(1) %gep
  %and = and i32 %val, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, ptr addrspace(1) %out
  ret void
}
1246
; (x >> 8) & 255 -> byte 1 conversion: v_bfe_u32 + cvt on SI, single
; v_cvt_f32_ubyte0_sdwa with src0_sel:BYTE_1 on VI.
define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: extract_byte1_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte1_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %val = load i32, ptr addrspace(1) %gep
  %srl = lshr i32 %val, 8
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, ptr addrspace(1) %out
  ret void
}
1291
; (x >> 16) & 255 -> byte 2 conversion: v_bfe_u32 + cvt on SI, single
; v_cvt_f32_ubyte0_sdwa with src0_sel:BYTE_2 on VI.
define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: extract_byte2_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_u32 v0, v0, 16, 8
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte2_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %val = load i32, ptr addrspace(1) %gep
  %srl = lshr i32 %val, 16
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, ptr addrspace(1) %out
  ret void
}
1336
; (x >> 24) & 255 -> byte 3 conversion: both targets use the dedicated
; v_cvt_f32_ubyte3 instruction (no SDWA needed).
define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: extract_byte3_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_byte3_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_cvt_f32_ubyte3_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
  %val = load i32, ptr addrspace(1) %gep
  %srl = lshr i32 %val, 24
  %and = and i32 %srl, 255
  %cvt = uitofp i32 %and to float
  store float %cvt, ptr addrspace(1) %out
  ret void
}
1380
; The or result has two uses (a float bitcast and a masked uitofp). Checks the
; byte-0 conversion is still formed for the masked use while the full or
; result stays live for the fadd.
define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT:    v_add_f32_e32 v2, v0, v1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
bb:
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid
  %load = load i32, ptr addrspace(1) %gep
  %or = or i32 %load, -2147483647
  %and = and i32 %or, 255
  %uitofp = uitofp i32 %and to float
  %cast = bitcast i32 %or to float
  %add = fadd float %cast, %uitofp
  store float %add, ptr addrspace(1) %out
  ret void
}
1432
; sitofp of a byte-masked i64: goes through the generic 64-bit int-to-fp
; expansion (ffbh/shift/cvt_f32_i32/ldexp) rather than a byte conversion.
define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_sitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_ffbh_i32_e32 v2, 0
; SI-NEXT:    v_add_i32_e32 v2, vcc, -1, v2
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_min_u32_e32 v2, 32, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT:    v_min_u32_e32 v0, 1, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f32_i32_e32 v0, v0
; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
; SI-NEXT:    v_ldexp_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_i32_e32 v2, 0
; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v2
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT:    v_min_u32_e32 v0, 1, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_i32_e32 v0, v0
; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
; VI-NEXT:    v_ldexp_f32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = sitofp i64 %masked to float
  ret float %itofp
}
1469
; uitofp of a byte-masked i64: uses the generic 64-bit unsigned int-to-fp
; expansion (ffbh_u32/shift/cvt_f32_u32/ldexp) rather than a byte conversion.
define float @v_test_uitofp_i64_byte_to_f32(i64 %arg0) {
; SI-LABEL: v_test_uitofp_i64_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v2, 0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_min_u32_e32 v2, 32, v2
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
; SI-NEXT:    v_min_u32_e32 v0, 1, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; SI-NEXT:    v_sub_i32_e32 v1, vcc, 32, v2
; SI-NEXT:    v_ldexp_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i64_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, 0
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT:    v_min_u32_e32 v0, 1, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
; VI-NEXT:    v_ldexp_f32 v0, v0, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i64 %arg0, 255
  %itofp = uitofp i64 %masked to float
  ret float %itofp
}
1504
; sitofp of a byte-masked i16: SI sign-extends with v_bfe_i32 then converts;
; VI folds the sign-extend into the SDWA conversion (sext(v0), WORD_0).
define float @v_test_sitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_sitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_sitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = sitofp i16 %masked to float
  ret float %itofp
}
1524
; uitofp of a byte-masked i16: the 16-bit zero-extension is done with an
; explicit 0xffff mask (SDWA and on VI) before v_cvt_f32_ubyte0.
define float @v_test_uitofp_i16_byte_to_f32(i16 %arg0) {
; SI-LABEL: v_test_uitofp_i16_byte_to_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_uitofp_i16_byte_to_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
  %masked = and i16 %arg0, 255
  %itofp = uitofp i16 %masked to float
  ret float %itofp
}
1545