xref: /llvm-project/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (revision 7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,SI
3; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,VI
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx908 -start-before=amdgpu-isel -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11
7
8declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
9declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
10
11define float @v_uitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
12; GCN-LABEL: v_uitofp_i32_to_f32_mask255:
13; GCN:       ; %bb.0:
14; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
16; GCN-NEXT:    s_setpc_b64 s[30:31]
17;
18; GFX10-LABEL: v_uitofp_i32_to_f32_mask255:
19; GFX10:       ; %bb.0:
20; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
22; GFX10-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX9-LABEL: v_uitofp_i32_to_f32_mask255:
25; GFX9:       ; %bb.0:
26; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
28; GFX9-NEXT:    s_setpc_b64 s[30:31]
29;
30; GFX11-LABEL: v_uitofp_i32_to_f32_mask255:
31; GFX11:       ; %bb.0:
32; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
33; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
34; GFX11-NEXT:    s_setpc_b64 s[30:31]
35  %masked = and i32 %arg0, 255 ; test input: keep only bits 7:0 of the i32 argument
36  %cvt = uitofp i32 %masked to float ; mask + unsigned convert; every target's checks expect one v_cvt_f32_ubyte0 and no separate mask
37  ret float %cvt
38}
39
40define float @v_sitofp_i32_to_f32_mask255(i32 %arg0) nounwind {
41; GCN-LABEL: v_sitofp_i32_to_f32_mask255:
42; GCN:       ; %bb.0:
43; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
45; GCN-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX10-LABEL: v_sitofp_i32_to_f32_mask255:
48; GFX10:       ; %bb.0:
49; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
51; GFX10-NEXT:    s_setpc_b64 s[30:31]
52;
53; GFX9-LABEL: v_sitofp_i32_to_f32_mask255:
54; GFX9:       ; %bb.0:
55; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
57; GFX9-NEXT:    s_setpc_b64 s[30:31]
58;
59; GFX11-LABEL: v_sitofp_i32_to_f32_mask255:
60; GFX11:       ; %bb.0:
61; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
62; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
63; GFX11-NEXT:    s_setpc_b64 s[30:31]
64  %masked = and i32 %arg0, 255 ; test input: value is limited to 0..255, so its sign bit is always clear
65  %cvt = sitofp i32 %masked to float ; signed convert of a known non-negative byte; checks expect the same v_cvt_f32_ubyte0 as the unsigned case
66  ret float %cvt
67}
68
69define float @v_uitofp_to_f32_lshr7_mask255(i32 %arg0) nounwind {
70; GCN-LABEL: v_uitofp_to_f32_lshr7_mask255:
71; GCN:       ; %bb.0:
72; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; GCN-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
74; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
75; GCN-NEXT:    s_setpc_b64 s[30:31]
76;
77; GFX10-LABEL: v_uitofp_to_f32_lshr7_mask255:
78; GFX10:       ; %bb.0:
79; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
80; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
81; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
82; GFX10-NEXT:    s_setpc_b64 s[30:31]
83;
84; GFX9-LABEL: v_uitofp_to_f32_lshr7_mask255:
85; GFX9:       ; %bb.0:
86; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
87; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
88; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
89; GFX9-NEXT:    s_setpc_b64 s[30:31]
90;
91; GFX11-LABEL: v_uitofp_to_f32_lshr7_mask255:
92; GFX11:       ; %bb.0:
93; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
94; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 7, v0
95; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
96; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
97; GFX11-NEXT:    s_setpc_b64 s[30:31]
98  %lshr.7 = lshr i32 %arg0, 7 ; test input: shift amount 7 is not a multiple of 8, so the field straddles two bytes
99  %masked = and i32 %lshr.7, 255 ; 8-bit field starting at bit 7
100  %cvt = uitofp i32 %masked to float ; checks keep an explicit v_lshrrev_b32 by 7 and fold only the mask into v_cvt_f32_ubyte0
101  ret float %cvt
102}
103
104define float @v_uitofp_to_f32_lshr8_mask255(i32 %arg0) nounwind {
105; GCN-LABEL: v_uitofp_to_f32_lshr8_mask255:
106; GCN:       ; %bb.0:
107; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
108; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
109; GCN-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX10-LABEL: v_uitofp_to_f32_lshr8_mask255:
112; GFX10:       ; %bb.0:
113; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
115; GFX10-NEXT:    s_setpc_b64 s[30:31]
116;
117; GFX9-LABEL: v_uitofp_to_f32_lshr8_mask255:
118; GFX9:       ; %bb.0:
119; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
121; GFX9-NEXT:    s_setpc_b64 s[30:31]
122;
123; GFX11-LABEL: v_uitofp_to_f32_lshr8_mask255:
124; GFX11:       ; %bb.0:
125; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
127; GFX11-NEXT:    s_setpc_b64 s[30:31]
128  %lshr.8 = lshr i32 %arg0, 8 ; test input: byte-aligned shift that brings bits 15:8 down to bits 7:0
129  %masked = and i32 %lshr.8, 255 ; isolate byte 1 of the argument
130  %cvt = uitofp i32 %masked to float ; shift, mask and convert are all expected to fold into one v_cvt_f32_ubyte1
131  ret float %cvt
132}
133
134define float @v_uitofp_to_f32_multi_use_lshr8_mask255(i32 %arg0) nounwind {
135; SI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
136; SI:       ; %bb.0:
137; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
139; SI-NEXT:    s_mov_b32 s7, 0xf000
140; SI-NEXT:    s_mov_b32 s6, -1
141; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
142; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
143; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
144; SI-NEXT:    s_setpc_b64 s[30:31]
145;
146; VI-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
147; VI:       ; %bb.0:
148; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; VI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
150; VI-NEXT:    s_mov_b32 s7, 0xf000
151; VI-NEXT:    s_mov_b32 s6, -1
152; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
153; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
154; VI-NEXT:    s_waitcnt vmcnt(0)
155; VI-NEXT:    s_setpc_b64 s[30:31]
156;
157; GFX10-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
158; GFX10:       ; %bb.0:
159; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
161; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
162; GFX10-NEXT:    global_store_dword v[0:1], v1, off
163; GFX10-NEXT:    s_setpc_b64 s[30:31]
164;
165; GFX9-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
166; GFX9:       ; %bb.0:
167; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
169; GFX9-NEXT:    global_store_dword v[0:1], v1, off
170; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
171; GFX9-NEXT:    s_waitcnt vmcnt(0)
172; GFX9-NEXT:    s_setpc_b64 s[30:31]
173;
174; GFX11-LABEL: v_uitofp_to_f32_multi_use_lshr8_mask255:
175; GFX11:       ; %bb.0:
176; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
178; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
179; GFX11-NEXT:    global_store_b32 v[0:1], v1, off
180; GFX11-NEXT:    s_setpc_b64 s[30:31]
181  %lshr.8 = lshr i32 %arg0, 8 ; test input: shifted value with two users (the store and the mask below)
182  store i32 %lshr.8, ptr addrspace(1) undef ; extra use that keeps the shift alive; the address operand is undef
183  %masked = and i32 %lshr.8, 255 ; byte 1 of the original argument
184  %cvt = uitofp i32 %masked to float ; checks keep v_lshrrev_b32 only for the stored value; the convert still reads byte 1 of the unshifted v0
185  ret float %cvt
186}
187
188define float @v_uitofp_to_f32_lshr16_mask255(i32 %arg0) nounwind {
189; GCN-LABEL: v_uitofp_to_f32_lshr16_mask255:
190; GCN:       ; %bb.0:
191; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
193; GCN-NEXT:    s_setpc_b64 s[30:31]
194;
195; GFX10-LABEL: v_uitofp_to_f32_lshr16_mask255:
196; GFX10:       ; %bb.0:
197; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
199; GFX10-NEXT:    s_setpc_b64 s[30:31]
200;
201; GFX9-LABEL: v_uitofp_to_f32_lshr16_mask255:
202; GFX9:       ; %bb.0:
203; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
204; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
205; GFX9-NEXT:    s_setpc_b64 s[30:31]
206;
207; GFX11-LABEL: v_uitofp_to_f32_lshr16_mask255:
208; GFX11:       ; %bb.0:
209; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
210; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
211; GFX11-NEXT:    s_setpc_b64 s[30:31]
212  %lshr.16 = lshr i32 %arg0, 16 ; test input: byte-aligned shift that brings bits 23:16 down to bits 7:0
213  %masked = and i32 %lshr.16, 255 ; isolate byte 2 of the argument
214  %cvt = uitofp i32 %masked to float ; shift, mask and convert are all expected to fold into one v_cvt_f32_ubyte2
215  ret float %cvt
216}
217
218define float @v_uitofp_to_f32_lshr24_mask255(i32 %arg0) nounwind {
219; GCN-LABEL: v_uitofp_to_f32_lshr24_mask255:
220; GCN:       ; %bb.0:
221; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
222; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
223; GCN-NEXT:    s_setpc_b64 s[30:31]
224;
225; GFX10-LABEL: v_uitofp_to_f32_lshr24_mask255:
226; GFX10:       ; %bb.0:
227; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
229; GFX10-NEXT:    s_setpc_b64 s[30:31]
230;
231; GFX9-LABEL: v_uitofp_to_f32_lshr24_mask255:
232; GFX9:       ; %bb.0:
233; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
234; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
235; GFX9-NEXT:    s_setpc_b64 s[30:31]
236;
237; GFX11-LABEL: v_uitofp_to_f32_lshr24_mask255:
238; GFX11:       ; %bb.0:
239; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
241; GFX11-NEXT:    s_setpc_b64 s[30:31]
242  %lshr.24 = lshr i32 %arg0, 24 ; test input: byte-aligned shift that brings bits 31:24 down to bits 7:0
243  %masked = and i32 %lshr.24, 255 ; isolate byte 3; only 8 bits remain after a logical shift by 24, so the mask changes nothing
244  %cvt = uitofp i32 %masked to float ; shift, mask and convert are all expected to fold into one v_cvt_f32_ubyte3
245  ret float %cvt
246}
247
248define float @v_uitofp_i8_to_f32(i8 %arg0) nounwind {
249; GCN-LABEL: v_uitofp_i8_to_f32:
250; GCN:       ; %bb.0:
251; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
252; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
253; GCN-NEXT:    s_setpc_b64 s[30:31]
254;
255; GFX10-LABEL: v_uitofp_i8_to_f32:
256; GFX10:       ; %bb.0:
257; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
258; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
259; GFX10-NEXT:    s_setpc_b64 s[30:31]
260;
261; GFX9-LABEL: v_uitofp_i8_to_f32:
262; GFX9:       ; %bb.0:
263; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
264; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
265; GFX9-NEXT:    s_setpc_b64 s[30:31]
266;
267; GFX11-LABEL: v_uitofp_i8_to_f32:
268; GFX11:       ; %bb.0:
269; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
270; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
271; GFX11-NEXT:    s_setpc_b64 s[30:31]
272  %cvt = uitofp i8 %arg0 to float ; test input: unsigned convert straight from an i8 argument (no explicit mask in the IR); checks expect one v_cvt_f32_ubyte0
273  ret float %cvt
274}
275
276define <2 x float> @v_uitofp_v2i8_to_v2f32(i16 %arg0) nounwind {
277; GCN-LABEL: v_uitofp_v2i8_to_v2f32:
278; GCN:       ; %bb.0:
279; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
281; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
282; GCN-NEXT:    v_mov_b32_e32 v0, v2
283; GCN-NEXT:    s_setpc_b64 s[30:31]
284;
285; GFX10-LABEL: v_uitofp_v2i8_to_v2f32:
286; GFX10:       ; %bb.0:
287; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
288; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
289; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
290; GFX10-NEXT:    v_mov_b32_e32 v0, v2
291; GFX10-NEXT:    s_setpc_b64 s[30:31]
292;
293; GFX9-LABEL: v_uitofp_v2i8_to_v2f32:
294; GFX9:       ; %bb.0:
295; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
297; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
298; GFX9-NEXT:    v_mov_b32_e32 v0, v2
299; GFX9-NEXT:    s_setpc_b64 s[30:31]
300;
301; GFX11-LABEL: v_uitofp_v2i8_to_v2f32:
302; GFX11:       ; %bb.0:
303; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
304; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
305; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
306; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
307; GFX11-NEXT:    v_mov_b32_e32 v0, v2
308; GFX11-NEXT:    s_setpc_b64 s[30:31]
309  %val = bitcast i16 %arg0 to <2 x i8> ; test input: reinterpret the i16 argument as two packed bytes
310  %cvt = uitofp <2 x i8> %val to <2 x float> ; per-lane convert; checks expect v_cvt_f32_ubyte0 and v_cvt_f32_ubyte1 on the same source register, with lane 0 built in v2 and then moved to v0
311  ret <2 x float> %cvt
312}
313
314define <3 x float> @v_uitofp_v3i8_to_v3f32(i32 %arg0) nounwind {
315; GCN-LABEL: v_uitofp_v3i8_to_v3f32:
316; GCN:       ; %bb.0:
317; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
318; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
319; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
320; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
321; GCN-NEXT:    v_mov_b32_e32 v0, v3
322; GCN-NEXT:    s_setpc_b64 s[30:31]
323;
324; GFX10-LABEL: v_uitofp_v3i8_to_v3f32:
325; GFX10:       ; %bb.0:
326; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
327; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
328; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
329; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
330; GFX10-NEXT:    v_mov_b32_e32 v0, v3
331; GFX10-NEXT:    s_setpc_b64 s[30:31]
332;
333; GFX9-LABEL: v_uitofp_v3i8_to_v3f32:
334; GFX9:       ; %bb.0:
335; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
337; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
338; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
339; GFX9-NEXT:    v_mov_b32_e32 v0, v3
340; GFX9-NEXT:    s_setpc_b64 s[30:31]
341;
342; GFX11-LABEL: v_uitofp_v3i8_to_v3f32:
343; GFX11:       ; %bb.0:
344; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v0
346; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
347; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
348; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
349; GFX11-NEXT:    v_mov_b32_e32 v0, v3
350; GFX11-NEXT:    s_setpc_b64 s[30:31]
351  %trunc = trunc i32 %arg0 to i24 ; test input: drop the top byte so the value can be viewed as three bytes
352  %val = bitcast i24 %trunc to <3 x i8> ; reinterpret the 24 bits as three packed bytes
353  %cvt = uitofp <3 x i8> %val to <3 x float> ; per-lane convert; checks expect v_cvt_f32_ubyte0/1/2 on the same source register, with lane 0 built in v3 and then moved to v0
354  ret <3 x float> %cvt
355}
356
357define <4 x float> @v_uitofp_v4i8_to_v4f32(i32 %arg0) nounwind {
358; GCN-LABEL: v_uitofp_v4i8_to_v4f32:
359; GCN:       ; %bb.0:
360; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
361; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
362; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
363; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
364; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
365; GCN-NEXT:    v_mov_b32_e32 v0, v4
366; GCN-NEXT:    s_setpc_b64 s[30:31]
367;
368; GFX10-LABEL: v_uitofp_v4i8_to_v4f32:
369; GFX10:       ; %bb.0:
370; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
372; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
373; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
374; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
375; GFX10-NEXT:    v_mov_b32_e32 v0, v4
376; GFX10-NEXT:    s_setpc_b64 s[30:31]
377;
378; GFX9-LABEL: v_uitofp_v4i8_to_v4f32:
379; GFX9:       ; %bb.0:
380; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
382; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
383; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
384; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
385; GFX9-NEXT:    v_mov_b32_e32 v0, v4
386; GFX9-NEXT:    s_setpc_b64 s[30:31]
387;
388; GFX11-LABEL: v_uitofp_v4i8_to_v4f32:
389; GFX11:       ; %bb.0:
390; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
391; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
392; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
393; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
394; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
395; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
396; GFX11-NEXT:    v_mov_b32_e32 v0, v4
397; GFX11-NEXT:    s_setpc_b64 s[30:31]
398  %val = bitcast i32 %arg0 to <4 x i8> ; test input: reinterpret the i32 argument as four packed bytes
399  %cvt = uitofp <4 x i8> %val to <4 x float> ; per-lane convert; checks expect v_cvt_f32_ubyte0/1/2/3 on the same source register, with lane 0 built in v4 and then moved to v0
400  ret <4 x float> %cvt
401}
402
403define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
404; GCN-LABEL: v_uitofp_unpack_i32_to_v4f32:
405; GCN:       ; %bb.0:
406; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
408; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
409; GCN-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
410; GCN-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
411; GCN-NEXT:    v_mov_b32_e32 v0, v4
412; GCN-NEXT:    s_setpc_b64 s[30:31]
413;
414; GFX10-LABEL: v_uitofp_unpack_i32_to_v4f32:
415; GFX10:       ; %bb.0:
416; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
417; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
418; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
419; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
420; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
421; GFX10-NEXT:    v_mov_b32_e32 v0, v4
422; GFX10-NEXT:    s_setpc_b64 s[30:31]
423;
424; GFX9-LABEL: v_uitofp_unpack_i32_to_v4f32:
425; GFX9:       ; %bb.0:
426; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
427; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
428; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
429; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
430; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
431; GFX9-NEXT:    v_mov_b32_e32 v0, v4
432; GFX9-NEXT:    s_setpc_b64 s[30:31]
433;
434; GFX11-LABEL: v_uitofp_unpack_i32_to_v4f32:
435; GFX11:       ; %bb.0:
436; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
437; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v0
438; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
439; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
440; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
441; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
442; GFX11-NEXT:    v_mov_b32_e32 v0, v4
443; GFX11-NEXT:    s_setpc_b64 s[30:31]
444  %mask.arg0 = and i32 %arg0, 255 ; test input: all four bytes of the argument are unpacked by hand with shift + mask; this is byte 0
445  %cvt0 = uitofp i32 %mask.arg0 to float ; expected to select v_cvt_f32_ubyte0
446
447  %lshr.8 = lshr i32 %arg0, 8
448  %mask.lshr.8 = and i32 %lshr.8, 255 ; byte 1
449  %cvt1 = uitofp i32 %mask.lshr.8 to float ; expected to select v_cvt_f32_ubyte1
450
451  %lshr.16 = lshr i32 %arg0, 16
452  %mask.lshr.16 = and i32 %lshr.16, 255 ; byte 2
453  %cvt2 = uitofp i32 %mask.lshr.16 to float ; expected to select v_cvt_f32_ubyte2
454
455  %lshr.24 = lshr i32 %arg0, 24
456  %mask.lshr.24 = and i32 %lshr.24, 255 ; byte 3
457  %cvt3 = uitofp i32 %mask.lshr.24 to float ; expected to select v_cvt_f32_ubyte3
458
459  %ins.0 = insertelement <4 x float> poison, float %cvt0, i32 0 ; poison placeholder as the base vector; all four lanes are overwritten below
460  %ins.1 = insertelement <4 x float> %ins.0, float %cvt1, i32 1
461  %ins.2 = insertelement <4 x float> %ins.1, float %cvt2, i32 2
462  %ins.3 = insertelement <4 x float> %ins.2, float %cvt3, i32 3 ; fully populated result; the checks contain only the four byte converts plus one v_mov_b32
463  ret <4 x float> %ins.3
464}
465
466define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
467; SI-LABEL: v_uitofp_i32_to_f16_mask255:
468; SI:       ; %bb.0:
469; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
471; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
472; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
473; SI-NEXT:    s_setpc_b64 s[30:31]
474;
475; VI-LABEL: v_uitofp_i32_to_f16_mask255:
476; VI:       ; %bb.0:
477; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
478; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
479; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
480; VI-NEXT:    s_setpc_b64 s[30:31]
481;
482; GFX10-LABEL: v_uitofp_i32_to_f16_mask255:
483; GFX10:       ; %bb.0:
484; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
486; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
487; GFX10-NEXT:    s_setpc_b64 s[30:31]
488;
489; GFX9-LABEL: v_uitofp_i32_to_f16_mask255:
490; GFX9:       ; %bb.0:
491; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
492; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
493; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
494; GFX9-NEXT:    s_setpc_b64 s[30:31]
495;
496; GFX11-LABEL: v_uitofp_i32_to_f16_mask255:
497; GFX11:       ; %bb.0:
498; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
499; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
500; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
501; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
502; GFX11-NEXT:    s_setpc_b64 s[30:31]
503  %masked = and i32 %arg0, 255 ; test input: keep only bits 7:0 of the i32 argument
504  %cvt = uitofp i32 %masked to half ; half result; checks expect v_cvt_f32_ubyte0 followed by v_cvt_f16_f32, and the SI checks also convert back with v_cvt_f32_f16
505  ret half %cvt
506}
507
508define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
509; SI-LABEL: v_sitofp_i32_to_f16_mask255:
510; SI:       ; %bb.0:
511; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
513; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
514; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
515; SI-NEXT:    s_setpc_b64 s[30:31]
516;
517; VI-LABEL: v_sitofp_i32_to_f16_mask255:
518; VI:       ; %bb.0:
519; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
521; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
522; VI-NEXT:    s_setpc_b64 s[30:31]
523;
524; GFX10-LABEL: v_sitofp_i32_to_f16_mask255:
525; GFX10:       ; %bb.0:
526; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
527; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
528; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
529; GFX10-NEXT:    s_setpc_b64 s[30:31]
530;
531; GFX9-LABEL: v_sitofp_i32_to_f16_mask255:
532; GFX9:       ; %bb.0:
533; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
535; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
536; GFX9-NEXT:    s_setpc_b64 s[30:31]
537;
538; GFX11-LABEL: v_sitofp_i32_to_f16_mask255:
539; GFX11:       ; %bb.0:
540; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
541; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
542; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
543; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
544; GFX11-NEXT:    s_setpc_b64 s[30:31]
545  %masked = and i32 %arg0, 255 ; test input: value is limited to 0..255, so its sign bit is always clear
546  %cvt = sitofp i32 %masked to half ; signed convert of a known non-negative byte; checks expect the same v_cvt_f32_ubyte0 + v_cvt_f16_f32 sequence as the unsigned case
547  ret half %cvt
548}
549
550define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
551; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
552; SI:       ; %bb.0:
553; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
554; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
555; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
556; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
557; SI-NEXT:    s_setpc_b64 s[30:31]
558;
559; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
560; VI:       ; %bb.0:
561; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
562; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
563; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
564; VI-NEXT:    s_setpc_b64 s[30:31]
565;
566; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255:
567; GFX10:       ; %bb.0:
568; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
569; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
570; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
571; GFX10-NEXT:    s_setpc_b64 s[30:31]
572;
573; GFX9-LABEL: v_uitofp_to_f16_lshr8_mask255:
574; GFX9:       ; %bb.0:
575; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
577; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
578; GFX9-NEXT:    s_setpc_b64 s[30:31]
579;
580; GFX11-LABEL: v_uitofp_to_f16_lshr8_mask255:
581; GFX11:       ; %bb.0:
582; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
583; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
584; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
585; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
586; GFX11-NEXT:    s_setpc_b64 s[30:31]
587  %lshr.8 = lshr i32 %arg0, 8 ; test input: byte-aligned shift that brings bits 15:8 down to bits 7:0
588  %masked = and i32 %lshr.8, 255 ; isolate byte 1 of the argument
589  %cvt = uitofp i32 %masked to half ; checks expect v_cvt_f32_ubyte1 followed by v_cvt_f16_f32, and the SI checks also convert back with v_cvt_f32_f16
590  ret half %cvt
591}
592
593define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
594; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
595; SI:       ; %bb.0:
596; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
598; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
599; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
600; SI-NEXT:    s_setpc_b64 s[30:31]
601;
602; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
603; VI:       ; %bb.0:
604; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
605; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
606; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
607; VI-NEXT:    s_setpc_b64 s[30:31]
608;
609; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255:
610; GFX10:       ; %bb.0:
611; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
612; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
613; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
614; GFX10-NEXT:    s_setpc_b64 s[30:31]
615;
616; GFX9-LABEL: v_uitofp_to_f16_lshr16_mask255:
617; GFX9:       ; %bb.0:
618; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
620; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
621; GFX9-NEXT:    s_setpc_b64 s[30:31]
622;
623; GFX11-LABEL: v_uitofp_to_f16_lshr16_mask255:
624; GFX11:       ; %bb.0:
625; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
627; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
628; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
629; GFX11-NEXT:    s_setpc_b64 s[30:31]
630  %lshr.16 = lshr i32 %arg0, 16 ; test input: byte-aligned shift that brings bits 23:16 down to bits 7:0
631  %masked = and i32 %lshr.16, 255 ; isolate byte 2 of the argument
632  %cvt = uitofp i32 %masked to half ; checks expect v_cvt_f32_ubyte2 followed by v_cvt_f16_f32, and the SI checks also convert back with v_cvt_f32_f16
633  ret half %cvt
634}
635
636define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
637; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
638; SI:       ; %bb.0:
639; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
640; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
641; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
642; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
643; SI-NEXT:    s_setpc_b64 s[30:31]
644;
645; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
646; VI:       ; %bb.0:
647; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
648; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
649; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
650; VI-NEXT:    s_setpc_b64 s[30:31]
651;
652; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255:
653; GFX10:       ; %bb.0:
654; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
655; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
656; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
657; GFX10-NEXT:    s_setpc_b64 s[30:31]
658;
659; GFX9-LABEL: v_uitofp_to_f16_lshr24_mask255:
660; GFX9:       ; %bb.0:
661; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
662; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
663; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
664; GFX9-NEXT:    s_setpc_b64 s[30:31]
665;
666; GFX11-LABEL: v_uitofp_to_f16_lshr24_mask255:
667; GFX11:       ; %bb.0:
668; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
669; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
670; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
671; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
672; GFX11-NEXT:    s_setpc_b64 s[30:31]
673  %lshr.24 = lshr i32 %arg0, 24 ; test input: byte-aligned shift that brings bits 31:24 down to bits 7:0
674  %masked = and i32 %lshr.24, 255 ; isolate byte 3; only 8 bits remain after a logical shift by 24, so the mask changes nothing
675  %cvt = uitofp i32 %masked to half ; checks expect v_cvt_f32_ubyte3 followed by v_cvt_f16_f32, and the SI checks also convert back with v_cvt_f32_f16
676  ret half %cvt
677}
678
679define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
680; SI-LABEL: v_uitofp_i8_to_f16:
681; SI:       ; %bb.0:
682; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
684; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
685; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
686; SI-NEXT:    s_setpc_b64 s[30:31]
687;
688; VI-LABEL: v_uitofp_i8_to_f16:
689; VI:       ; %bb.0:
690; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
691; VI-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
692; VI-NEXT:    s_setpc_b64 s[30:31]
693;
694; GFX10-LABEL: v_uitofp_i8_to_f16:
695; GFX10:       ; %bb.0:
696; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
698; GFX10-NEXT:    s_setpc_b64 s[30:31]
699;
700; GFX9-LABEL: v_uitofp_i8_to_f16:
701; GFX9:       ; %bb.0:
702; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703; GFX9-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
704; GFX9-NEXT:    s_setpc_b64 s[30:31]
705;
706; GFX11-LABEL: v_uitofp_i8_to_f16:
707; GFX11:       ; %bb.0:
708; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
710; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
711; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
712; GFX11-NEXT:    s_setpc_b64 s[30:31]
713  %cvt = uitofp i8 %arg0 to half ; test input: unsigned convert straight from i8 to half; SI checks go through v_cvt_f32_ubyte0, the VI, GFX9 and GFX10 checks use v_cvt_f16_u16 in SDWA form selecting BYTE_0, and the GFX11 checks use v_and_b32 then v_cvt_f16_u16
714  ret half %cvt
715}
716
717define double @v_uitofp_i32_to_f64_mask255(i32 %arg0) nounwind {
718; GCN-LABEL: v_uitofp_i32_to_f64_mask255:
719; GCN:       ; %bb.0:
720; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
721; GCN-NEXT:    v_and_b32_e32 v0, 0xff, v0
722; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
723; GCN-NEXT:    s_setpc_b64 s[30:31]
724;
725; GFX10-LABEL: v_uitofp_i32_to_f64_mask255:
726; GFX10:       ; %bb.0:
727; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728; GFX10-NEXT:    v_and_b32_e32 v0, 0xff, v0
729; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
730; GFX10-NEXT:    s_setpc_b64 s[30:31]
731;
732; GFX9-LABEL: v_uitofp_i32_to_f64_mask255:
733; GFX9:       ; %bb.0:
734; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
736; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
737; GFX9-NEXT:    s_setpc_b64 s[30:31]
738;
739; GFX11-LABEL: v_uitofp_i32_to_f64_mask255:
740; GFX11:       ; %bb.0:
741; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
742; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
743; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
744; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
745; GFX11-NEXT:    s_setpc_b64 s[30:31]
746  %masked = and i32 %arg0, 255 ; test input: keep only bits 7:0 of the i32 argument
747  %cvt = uitofp i32 %masked to double ; double result; checks keep an explicit v_and_b32 with 0xff and convert with v_cvt_f64_u32 (no ubyte convert appears)
748  ret double %cvt
749}
750
751define double @v_uitofp_to_f64_lshr8_mask255(i32 %arg0) nounwind {
752; GCN-LABEL: v_uitofp_to_f64_lshr8_mask255:
753; GCN:       ; %bb.0:
754; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755; GCN-NEXT:    v_bfe_u32 v0, v0, 8, 8
756; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
757; GCN-NEXT:    s_setpc_b64 s[30:31]
758;
759; GFX10-LABEL: v_uitofp_to_f64_lshr8_mask255:
760; GFX10:       ; %bb.0:
761; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762; GFX10-NEXT:    v_bfe_u32 v0, v0, 8, 8
763; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
764; GFX10-NEXT:    s_setpc_b64 s[30:31]
765;
766; GFX9-LABEL: v_uitofp_to_f64_lshr8_mask255:
767; GFX9:       ; %bb.0:
768; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
769; GFX9-NEXT:    v_bfe_u32 v0, v0, 8, 8
770; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
771; GFX9-NEXT:    s_setpc_b64 s[30:31]
772;
773; GFX11-LABEL: v_uitofp_to_f64_lshr8_mask255:
774; GFX11:       ; %bb.0:
775; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776; GFX11-NEXT:    v_bfe_u32 v0, v0, 8, 8
777; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
778; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
779; GFX11-NEXT:    s_setpc_b64 s[30:31]
780  %lshr.8 = lshr i32 %arg0, 8 ; test input: byte-aligned shift that brings bits 15:8 down to bits 7:0
781  %masked = and i32 %lshr.8, 255 ; isolate byte 1 of the argument
782  %cvt = uitofp i32 %masked to double ; checks merge shift + mask into v_bfe_u32 (offset 8, width 8) and convert with v_cvt_f64_u32
783  ret double %cvt
784}
785
786define double @v_uitofp_to_f64_lshr16_mask255(i32 %arg0) nounwind {
787; GCN-LABEL: v_uitofp_to_f64_lshr16_mask255:
788; GCN:       ; %bb.0:
789; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
790; GCN-NEXT:    v_bfe_u32 v0, v0, 16, 8
791; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
792; GCN-NEXT:    s_setpc_b64 s[30:31]
793;
794; GFX10-LABEL: v_uitofp_to_f64_lshr16_mask255:
795; GFX10:       ; %bb.0:
796; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
797; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
798; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
799; GFX10-NEXT:    s_setpc_b64 s[30:31]
800;
801; GFX9-LABEL: v_uitofp_to_f64_lshr16_mask255:
802; GFX9:       ; %bb.0:
803; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
804; GFX9-NEXT:    v_bfe_u32 v0, v0, 16, 8
805; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
806; GFX9-NEXT:    s_setpc_b64 s[30:31]
807;
808; GFX11-LABEL: v_uitofp_to_f64_lshr16_mask255:
809; GFX11:       ; %bb.0:
810; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
812; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
813; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
814; GFX11-NEXT:    s_setpc_b64 s[30:31]
815  %lshr.16 = lshr i32 %arg0, 16 ; test input: byte-aligned shift that brings bits 23:16 down to bits 7:0
816  %masked = and i32 %lshr.16, 255 ; isolate byte 2 of the argument
817  %cvt = uitofp i32 %masked to double ; checks merge shift + mask into v_bfe_u32 (offset 16, width 8) and convert with v_cvt_f64_u32
818  ret double %cvt
819}
820
821define double @v_uitofp_to_f64_lshr24_mask255(i32 %arg0) nounwind {
822; GCN-LABEL: v_uitofp_to_f64_lshr24_mask255:
823; GCN:       ; %bb.0:
824; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825; GCN-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
826; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
827; GCN-NEXT:    s_setpc_b64 s[30:31]
828;
829; GFX10-LABEL: v_uitofp_to_f64_lshr24_mask255:
830; GFX10:       ; %bb.0:
831; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
832; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
833; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
834; GFX10-NEXT:    s_setpc_b64 s[30:31]
835;
836; GFX9-LABEL: v_uitofp_to_f64_lshr24_mask255:
837; GFX9:       ; %bb.0:
838; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
840; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
841; GFX9-NEXT:    s_setpc_b64 s[30:31]
842;
843; GFX11-LABEL: v_uitofp_to_f64_lshr24_mask255:
844; GFX11:       ; %bb.0:
845; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
847; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
848; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
849; GFX11-NEXT:    s_setpc_b64 s[30:31]
850  %lshr.24 = lshr i32 %arg0, 24 ; test input: byte-aligned shift that brings bits 31:24 down to bits 7:0
851  %masked = and i32 %lshr.24, 255 ; isolate byte 3; only 8 bits remain after a logical shift by 24, so the mask changes nothing
852  %cvt = uitofp i32 %masked to double ; checks keep only v_lshrrev_b32 by 24 (no mask instruction) and convert with v_cvt_f64_u32
853  ret double %cvt
854}
855
; uitofp of an i8 argument to double. The CHECK lines show the value being
; masked differently per target before v_cvt_f64_u32:
;   SI:              v_and_b32 with 0xff
;   VI/GFX10/GFX9:   v_and_b32_sdwa of 0xffff with src1_sel:BYTE_0 (the
;                    constant is in v1 on VI and GFX10, in s4 on GFX9)
;   GFX11:           two v_and_b32 (0xff, then 0xffff)
; NOTE(review): the second GFX11 mask has no effect after the 0xff mask -
; presumably a missed fold; confirm before treating it as intended output.
856define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind {
857; SI-LABEL: v_uitofp_i8_to_f64:
858; SI:       ; %bb.0:
859; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
861; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
862; SI-NEXT:    s_setpc_b64 s[30:31]
863;
864; VI-LABEL: v_uitofp_i8_to_f64:
865; VI:       ; %bb.0:
866; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
867; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
868; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
869; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
870; VI-NEXT:    s_setpc_b64 s[30:31]
871;
872; GFX10-LABEL: v_uitofp_i8_to_f64:
873; GFX10:       ; %bb.0:
874; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
875; GFX10-NEXT:    v_mov_b32_e32 v1, 0xffff
876; GFX10-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
877; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
878; GFX10-NEXT:    s_setpc_b64 s[30:31]
879;
880; GFX9-LABEL: v_uitofp_i8_to_f64:
881; GFX9:       ; %bb.0:
882; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
883; GFX9-NEXT:    s_mov_b32 s4, 0xffff
884; GFX9-NEXT:    v_and_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
885; GFX9-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
886; GFX9-NEXT:    s_setpc_b64 s[30:31]
887;
888; GFX11-LABEL: v_uitofp_i8_to_f64:
889; GFX11:       ; %bb.0:
890; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
892; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
893; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
894; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
895; GFX11-NEXT:    s_setpc_b64 s[30:31]
896  %cvt = uitofp i8 %arg0 to double
897  ret double %cvt
898}
899
; Kernel: loads the i8 at %in + workitem.id.x, converts it to float with
; uitofp and stores the result to %out.
; Every check prefix expects one unsigned-byte load followed directly by
; v_cvt_f32_ubyte0, with no separate masking of the loaded value. The GFX11
; checks also mask v0 with 0x3ff before it is used as the load offset.
900define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
901; SI-LABEL: load_i8_to_f32:
902; SI:       ; %bb.0:
903; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
904; SI-NEXT:    s_mov_b32 s7, 0xf000
905; SI-NEXT:    v_mov_b32_e32 v1, 0
906; SI-NEXT:    s_mov_b32 s10, 0
907; SI-NEXT:    s_mov_b32 s11, s7
908; SI-NEXT:    s_waitcnt lgkmcnt(0)
909; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
910; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
911; SI-NEXT:    s_mov_b32 s6, -1
912; SI-NEXT:    s_mov_b32 s4, s0
913; SI-NEXT:    s_mov_b32 s5, s1
914; SI-NEXT:    s_waitcnt vmcnt(0)
915; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
916; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
917; SI-NEXT:    s_endpgm
918;
919; VI-LABEL: load_i8_to_f32:
920; VI:       ; %bb.0:
921; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
922; VI-NEXT:    s_waitcnt lgkmcnt(0)
923; VI-NEXT:    v_mov_b32_e32 v1, s3
924; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
925; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
926; VI-NEXT:    flat_load_ubyte v0, v[0:1]
927; VI-NEXT:    s_mov_b32 s3, 0xf000
928; VI-NEXT:    s_mov_b32 s2, -1
929; VI-NEXT:    s_waitcnt vmcnt(0)
930; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
931; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
932; VI-NEXT:    s_endpgm
933;
934; GFX10-LABEL: load_i8_to_f32:
935; GFX10:       ; %bb.0:
936; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
937; GFX10-NEXT:    v_mov_b32_e32 v1, 0
938; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
940; GFX10-NEXT:    s_waitcnt vmcnt(0)
941; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
942; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
943; GFX10-NEXT:    s_endpgm
944;
945; GFX9-LABEL: load_i8_to_f32:
946; GFX9:       ; %bb.0:
947; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
948; GFX9-NEXT:    v_mov_b32_e32 v1, 0
949; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
950; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
951; GFX9-NEXT:    s_waitcnt vmcnt(0)
952; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
953; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
954; GFX9-NEXT:    s_endpgm
955;
956; GFX11-LABEL: load_i8_to_f32:
957; GFX11:       ; %bb.0:
958; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
959; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
960; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
961; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
962; GFX11-NEXT:    s_waitcnt vmcnt(0)
963; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
964; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
965; GFX11-NEXT:    s_endpgm
966  %tid = call i32 @llvm.amdgcn.workitem.id.x()
967  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
968  %load = load i8, ptr addrspace(1) %gep, align 1
969  %cvt = uitofp i8 %load to float
970  store float %cvt, ptr addrspace(1) %out, align 4
971  ret void
972}
973
; Kernel: loads the <2 x i8> at index workitem.id.x of %in (align 2),
; converts both elements to float with uitofp and stores the <2 x float>
; to %out.
; Every check prefix expects a single 16-bit load, then v_cvt_f32_ubyte1 and
; v_cvt_f32_ubyte0 reading both bytes from the same register, and one
; 64-bit store.
974define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
975; SI-LABEL: load_v2i8_to_v2f32:
976; SI:       ; %bb.0:
977; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
978; SI-NEXT:    s_mov_b32 s7, 0xf000
979; SI-NEXT:    s_mov_b32 s10, 0
980; SI-NEXT:    s_mov_b32 s11, s7
981; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
982; SI-NEXT:    s_waitcnt lgkmcnt(0)
983; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
984; SI-NEXT:    v_mov_b32_e32 v1, 0
985; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
986; SI-NEXT:    s_mov_b32 s6, -1
987; SI-NEXT:    s_mov_b32 s4, s0
988; SI-NEXT:    s_mov_b32 s5, s1
989; SI-NEXT:    s_waitcnt vmcnt(0)
990; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
991; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
992; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
993; SI-NEXT:    s_endpgm
994;
995; VI-LABEL: load_v2i8_to_v2f32:
996; VI:       ; %bb.0:
997; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
998; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
999; VI-NEXT:    s_waitcnt lgkmcnt(0)
1000; VI-NEXT:    v_mov_b32_e32 v1, s3
1001; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1002; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1003; VI-NEXT:    flat_load_ushort v0, v[0:1]
1004; VI-NEXT:    s_mov_b32 s3, 0xf000
1005; VI-NEXT:    s_mov_b32 s2, -1
1006; VI-NEXT:    s_waitcnt vmcnt(0)
1007; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1008; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1009; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1010; VI-NEXT:    s_endpgm
1011;
1012; GFX10-LABEL: load_v2i8_to_v2f32:
1013; GFX10:       ; %bb.0:
1014; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1015; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1016; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1017; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1018; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
1019; GFX10-NEXT:    s_waitcnt vmcnt(0)
1020; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1021; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1022; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1023; GFX10-NEXT:    s_endpgm
1024;
1025; GFX9-LABEL: load_v2i8_to_v2f32:
1026; GFX9:       ; %bb.0:
1027; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1028; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1029; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1030; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX9-NEXT:    global_load_ushort v0, v0, s[2:3]
1032; GFX9-NEXT:    s_waitcnt vmcnt(0)
1033; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1034; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1035; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1036; GFX9-NEXT:    s_endpgm
1037;
1038; GFX11-LABEL: load_v2i8_to_v2f32:
1039; GFX11:       ; %bb.0:
1040; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1041; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1042; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1043; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1044; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1045; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1046; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
1047; GFX11-NEXT:    s_waitcnt vmcnt(0)
1048; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1049; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1050; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1051; GFX11-NEXT:    s_endpgm
1052  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1053  %gep = getelementptr <2 x i8>, ptr addrspace(1) %in, i32 %tid
1054  %load = load <2 x i8>, ptr addrspace(1) %gep, align 2
1055  %cvt = uitofp <2 x i8> %load to <2 x float>
1056  store <2 x float> %cvt, ptr addrspace(1) %out, align 16
1057  ret void
1058}
1059
; Kernel: loads the <3 x i8> at index workitem.id.x of %in (align 4),
; converts the three elements to float with uitofp and stores the
; <3 x float> to %out.
; Every check prefix expects the vector to be fetched with one dword load
; and converted with v_cvt_f32_ubyte0/1/2 from that single register.
; The SI checks split the result into a dword store at offset:8 plus a
; dwordx2 store; the other prefixes expect one 96-bit store.
1060define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1061; SI-LABEL: load_v3i8_to_v3f32:
1062; SI:       ; %bb.0:
1063; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1064; SI-NEXT:    s_mov_b32 s7, 0xf000
1065; SI-NEXT:    s_mov_b32 s10, 0
1066; SI-NEXT:    s_mov_b32 s11, s7
1067; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1068; SI-NEXT:    s_waitcnt lgkmcnt(0)
1069; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1070; SI-NEXT:    v_mov_b32_e32 v1, 0
1071; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
1072; SI-NEXT:    s_mov_b32 s6, -1
1073; SI-NEXT:    s_mov_b32 s4, s0
1074; SI-NEXT:    s_mov_b32 s5, s1
1075; SI-NEXT:    s_waitcnt vmcnt(0)
1076; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v2
1077; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
1078; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v2
1079; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0 offset:8
1080; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1081; SI-NEXT:    s_endpgm
1082;
1083; VI-LABEL: load_v3i8_to_v3f32:
1084; VI:       ; %bb.0:
1085; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1086; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1087; VI-NEXT:    s_waitcnt lgkmcnt(0)
1088; VI-NEXT:    v_mov_b32_e32 v1, s3
1089; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1090; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1091; VI-NEXT:    flat_load_dword v0, v[0:1]
1092; VI-NEXT:    s_mov_b32 s3, 0xf000
1093; VI-NEXT:    s_mov_b32 s2, -1
1094; VI-NEXT:    s_waitcnt vmcnt(0)
1095; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1096; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1097; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1098; VI-NEXT:    buffer_store_dwordx3 v[0:2], off, s[0:3], 0
1099; VI-NEXT:    s_endpgm
1100;
1101; GFX10-LABEL: load_v3i8_to_v3f32:
1102; GFX10:       ; %bb.0:
1103; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1104; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1105; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1106; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1107; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1108; GFX10-NEXT:    s_waitcnt vmcnt(0)
1109; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1110; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1111; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1112; GFX10-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
1113; GFX10-NEXT:    s_endpgm
1114;
1115; GFX9-LABEL: load_v3i8_to_v3f32:
1116; GFX9:       ; %bb.0:
1117; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1118; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1119; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1120; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1121; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
1122; GFX9-NEXT:    s_waitcnt vmcnt(0)
1123; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1124; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1125; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1126; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
1127; GFX9-NEXT:    s_endpgm
1128;
1129; GFX11-LABEL: load_v3i8_to_v3f32:
1130; GFX11:       ; %bb.0:
1131; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1132; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1133; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1134; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1135; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1137; GFX11-NEXT:    s_waitcnt vmcnt(0)
1138; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1139; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1140; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1141; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
1142; GFX11-NEXT:    s_endpgm
1143  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1144  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid
1145  %load = load <3 x i8>, ptr addrspace(1) %gep, align 4
1146  %cvt = uitofp <3 x i8> %load to <3 x float>
1147  store <3 x float> %cvt, ptr addrspace(1) %out, align 16
1148  ret void
1149}
1150
; Kernel: loads the <4 x i8> at index workitem.id.x of %in (align 4),
; converts all four elements to float with uitofp and stores the
; <4 x float> to %out.
; Every check prefix expects one dword load, v_cvt_f32_ubyte3..0 reading
; each byte directly from the loaded register (no shifts or masks), and a
; single 128-bit store.
1151define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1152; SI-LABEL: load_v4i8_to_v4f32:
1153; SI:       ; %bb.0:
1154; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1155; SI-NEXT:    s_mov_b32 s7, 0xf000
1156; SI-NEXT:    s_mov_b32 s10, 0
1157; SI-NEXT:    s_mov_b32 s11, s7
1158; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1159; SI-NEXT:    s_waitcnt lgkmcnt(0)
1160; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1161; SI-NEXT:    v_mov_b32_e32 v1, 0
1162; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1163; SI-NEXT:    s_mov_b32 s6, -1
1164; SI-NEXT:    s_mov_b32 s4, s0
1165; SI-NEXT:    s_mov_b32 s5, s1
1166; SI-NEXT:    s_waitcnt vmcnt(0)
1167; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1168; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1169; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1170; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1171; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1172; SI-NEXT:    s_endpgm
1173;
1174; VI-LABEL: load_v4i8_to_v4f32:
1175; VI:       ; %bb.0:
1176; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1177; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1178; VI-NEXT:    s_waitcnt lgkmcnt(0)
1179; VI-NEXT:    v_mov_b32_e32 v1, s3
1180; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1181; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1182; VI-NEXT:    flat_load_dword v0, v[0:1]
1183; VI-NEXT:    s_mov_b32 s3, 0xf000
1184; VI-NEXT:    s_mov_b32 s2, -1
1185; VI-NEXT:    s_waitcnt vmcnt(0)
1186; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1187; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1188; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1189; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1190; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1191; VI-NEXT:    s_endpgm
1192;
1193; GFX10-LABEL: load_v4i8_to_v4f32:
1194; GFX10:       ; %bb.0:
1195; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1196; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1197; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1198; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1199; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1200; GFX10-NEXT:    s_waitcnt vmcnt(0)
1201; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1202; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1203; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1204; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1205; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1206; GFX10-NEXT:    s_endpgm
1207;
1208; GFX9-LABEL: load_v4i8_to_v4f32:
1209; GFX9:       ; %bb.0:
1210; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1211; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1212; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1213; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1214; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
1215; GFX9-NEXT:    s_waitcnt vmcnt(0)
1216; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1217; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1218; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1219; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1220; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1221; GFX9-NEXT:    s_endpgm
1222;
1223; GFX11-LABEL: load_v4i8_to_v4f32:
1224; GFX11:       ; %bb.0:
1225; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1226; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1227; GFX11-NEXT:    v_mov_b32_e32 v4, 0
1228; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1229; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1230; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1231; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1232; GFX11-NEXT:    s_waitcnt vmcnt(0)
1233; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1234; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1235; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1236; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1237; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1238; GFX11-NEXT:    s_endpgm
1239  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1240  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1241  %load = load <4 x i8>, ptr addrspace(1) %gep, align 4
1242  %cvt = uitofp <4 x i8> %load to <4 x float>
1243  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1244  ret void
1245}
1246
1247; This should not be adding instructions to shift into the correct
1248; position in the word for the component.
1249
1250; FIXME: Packing bytes
; Kernel: same conversion as the align-4 <4 x i8> case, but the vector load
; is only align 1.
; Every check prefix expects four separate unsigned-byte loads (offsets 0-3;
; the VI checks form the flat addresses with v_add/v_addc pairs), each
; converted with v_cvt_f32_ubyte0 as soon as its s_waitcnt allows, and a
; single 128-bit store. No shift or or instructions are expected.
1251define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1252; SI-LABEL: load_v4i8_to_v4f32_unaligned:
1253; SI:       ; %bb.0:
1254; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1255; SI-NEXT:    s_mov_b32 s7, 0xf000
1256; SI-NEXT:    s_mov_b32 s10, 0
1257; SI-NEXT:    s_mov_b32 s11, s7
1258; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1259; SI-NEXT:    s_waitcnt lgkmcnt(0)
1260; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1261; SI-NEXT:    v_mov_b32_e32 v1, 0
1262; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3
1263; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2
1264; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1
1265; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1266; SI-NEXT:    s_mov_b32 s6, -1
1267; SI-NEXT:    s_mov_b32 s4, s0
1268; SI-NEXT:    s_mov_b32 s5, s1
1269; SI-NEXT:    s_waitcnt vmcnt(3)
1270; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v2
1271; SI-NEXT:    s_waitcnt vmcnt(2)
1272; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
1273; SI-NEXT:    s_waitcnt vmcnt(1)
1274; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
1275; SI-NEXT:    s_waitcnt vmcnt(0)
1276; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1277; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1278; SI-NEXT:    s_endpgm
1279;
1280; VI-LABEL: load_v4i8_to_v4f32_unaligned:
1281; VI:       ; %bb.0:
1282; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1283; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1284; VI-NEXT:    s_waitcnt lgkmcnt(0)
1285; VI-NEXT:    v_mov_b32_e32 v1, s3
1286; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1287; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1288; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
1289; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1290; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
1291; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1292; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1293; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1294; VI-NEXT:    flat_load_ubyte v4, v[0:1]
1295; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
1296; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1297; VI-NEXT:    flat_load_ubyte v1, v[0:1]
1298; VI-NEXT:    s_mov_b32 s3, 0xf000
1299; VI-NEXT:    s_mov_b32 s2, -1
1300; VI-NEXT:    s_waitcnt vmcnt(3)
1301; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1302; VI-NEXT:    s_waitcnt vmcnt(2)
1303; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
1304; VI-NEXT:    s_waitcnt vmcnt(1)
1305; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
1306; VI-NEXT:    s_waitcnt vmcnt(0)
1307; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1308; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1309; VI-NEXT:    s_endpgm
1310;
1311; GFX10-LABEL: load_v4i8_to_v4f32_unaligned:
1312; GFX10:       ; %bb.0:
1313; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1314; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1315; GFX10-NEXT:    v_mov_b32_e32 v6, 0
1316; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX10-NEXT:    s_clause 0x3
1318; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
1319; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
1320; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
1321; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
1322; GFX10-NEXT:    s_waitcnt vmcnt(3)
1323; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
1324; GFX10-NEXT:    s_waitcnt vmcnt(2)
1325; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1326; GFX10-NEXT:    s_waitcnt vmcnt(1)
1327; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
1328; GFX10-NEXT:    s_waitcnt vmcnt(0)
1329; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
1330; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
1331; GFX10-NEXT:    s_endpgm
1332;
1333; GFX9-LABEL: load_v4i8_to_v4f32_unaligned:
1334; GFX9:       ; %bb.0:
1335; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1336; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1337; GFX9-NEXT:    v_mov_b32_e32 v6, 0
1338; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1339; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
1340; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
1341; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
1342; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
1343; GFX9-NEXT:    s_waitcnt vmcnt(3)
1344; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
1345; GFX9-NEXT:    s_waitcnt vmcnt(2)
1346; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1347; GFX9-NEXT:    s_waitcnt vmcnt(1)
1348; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
1349; GFX9-NEXT:    s_waitcnt vmcnt(0)
1350; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
1351; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
1352; GFX9-NEXT:    s_endpgm
1353;
1354; GFX11-LABEL: load_v4i8_to_v4f32_unaligned:
1355; GFX11:       ; %bb.0:
1356; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1357; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1358; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1359; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1360; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1361; GFX11-NEXT:    s_clause 0x3
1362; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
1363; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
1364; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:1
1365; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
1366; GFX11-NEXT:    s_waitcnt vmcnt(3)
1367; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
1368; GFX11-NEXT:    s_waitcnt vmcnt(2)
1369; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1370; GFX11-NEXT:    s_waitcnt vmcnt(1)
1371; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
1372; GFX11-NEXT:    s_waitcnt vmcnt(0)
1373; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1374; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
1375; GFX11-NEXT:    s_endpgm
1376  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1377  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1378  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
1379  %cvt = uitofp <4 x i8> %load to <4 x float>
1380  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1381  ret void
1382}
1383
1384; The other use of shuffle0_0 makes it profitable to lower into v_perm
1385
; Kernel: loads two align-1 <4 x i8> vectors (from %in and %in1 at index
; workitem.id.x), shuffles them into one <4 x i8>, and uses that shuffle
; twice: converted to <4 x float> and stored to %out, and stored unchanged
; as a packed <4 x i8> to %out1.
; The checks expect the float lanes to come from v_cvt_f32_ubyte0 on the
; individually loaded bytes (the repeated lane is a v_mov copy). The packed
; i8 result is expected to be rebuilt with v_perm_b32 (selector 0x4000405)
; on VI, GFX9, GFX10 and GFX11; the SI checks contain no v_perm_b32 and use
; shift/or/v_alignbit_b32 instead.
1386define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
1387; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1388; SI:       ; %bb.0:
1389; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1390; SI-NEXT:    s_mov_b32 s11, 0xf000
1391; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1392; SI-NEXT:    v_mov_b32_e32 v1, 0
1393; SI-NEXT:    s_mov_b32 s14, 0
1394; SI-NEXT:    s_mov_b32 s15, s11
1395; SI-NEXT:    s_waitcnt lgkmcnt(0)
1396; SI-NEXT:    s_mov_b64 s[12:13], s[4:5]
1397; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3
1398; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
1399; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
1400; SI-NEXT:    buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
1401; SI-NEXT:    s_mov_b32 s10, -1
1402; SI-NEXT:    s_mov_b32 s8, s2
1403; SI-NEXT:    s_mov_b32 s9, s3
1404; SI-NEXT:    s_mov_b32 s2, s10
1405; SI-NEXT:    s_mov_b32 s3, s11
1406; SI-NEXT:    s_waitcnt vmcnt(2)
1407; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v2
1408; SI-NEXT:    s_waitcnt vmcnt(1)
1409; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v4
1410; SI-NEXT:    v_or_b32_e32 v5, v5, v4
1411; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
1412; SI-NEXT:    s_waitcnt vmcnt(0)
1413; SI-NEXT:    v_or_b32_e32 v6, v3, v6
1414; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
1415; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
1416; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
1417; SI-NEXT:    v_mov_b32_e32 v3, v1
1418; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
1419; SI-NEXT:    v_alignbit_b32 v4, v4, v5, 24
1420; SI-NEXT:    v_or_b32_e32 v4, v4, v6
1421; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1422; SI-NEXT:    buffer_store_dword v4, off, s[8:11], 0
1423; SI-NEXT:    s_endpgm
1424;
1425; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1426; VI:       ; %bb.0:
1427; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1428; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1429; VI-NEXT:    s_mov_b32 s8, 0x4000405
1430; VI-NEXT:    s_waitcnt lgkmcnt(0)
1431; VI-NEXT:    v_mov_b32_e32 v1, s5
1432; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v0
1433; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1434; VI-NEXT:    v_mov_b32_e32 v1, s7
1435; VI-NEXT:    v_add_u32_e32 v4, vcc, s6, v0
1436; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1437; VI-NEXT:    v_add_u32_e32 v0, vcc, 3, v2
1438; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1439; VI-NEXT:    flat_load_ubyte v6, v[0:1]
1440; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v2
1441; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
1442; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v4
1443; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
1444; VI-NEXT:    v_add_u32_e32 v4, vcc, 2, v4
1445; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
1446; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1447; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1448; VI-NEXT:    flat_load_ubyte v4, v[0:1]
1449; VI-NEXT:    s_mov_b32 s7, 0xf000
1450; VI-NEXT:    s_mov_b32 s6, -1
1451; VI-NEXT:    s_mov_b32 s4, s2
1452; VI-NEXT:    s_mov_b32 s5, s3
1453; VI-NEXT:    s_mov_b32 s2, s6
1454; VI-NEXT:    s_mov_b32 s3, s7
1455; VI-NEXT:    s_waitcnt vmcnt(3)
1456; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
1457; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
1458; VI-NEXT:    s_waitcnt vmcnt(2)
1459; VI-NEXT:    v_lshlrev_b32_e32 v7, 8, v2
1460; VI-NEXT:    s_waitcnt vmcnt(1)
1461; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v3
1462; VI-NEXT:    s_waitcnt vmcnt(0)
1463; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
1464; VI-NEXT:    v_or_b32_e32 v4, v5, v4
1465; VI-NEXT:    v_or_b32_e32 v5, v7, v3
1466; VI-NEXT:    v_mov_b32_e32 v3, v1
1467; VI-NEXT:    v_perm_b32 v4, v4, v5, s8
1468; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1469; VI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
1470; VI-NEXT:    s_endpgm
1471;
1472; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1473; GFX10:       ; %bb.0:
1474; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1475; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1476; GFX10-NEXT:    v_mov_b32_e32 v7, 0
1477; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1478; GFX10-NEXT:    s_clause 0x3
1479; GFX10-NEXT:    global_load_ubyte v1, v0, s[12:13] offset:2
1480; GFX10-NEXT:    global_load_ubyte v3, v0, s[12:13] offset:3
1481; GFX10-NEXT:    global_load_ubyte v2, v0, s[14:15] offset:3
1482; GFX10-NEXT:    global_load_ubyte v4, v0, s[14:15] offset:2
1483; GFX10-NEXT:    s_waitcnt vmcnt(2)
1484; GFX10-NEXT:    v_lshl_or_b32 v5, v3, 8, v1
1485; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1486; GFX10-NEXT:    s_waitcnt vmcnt(0)
1487; GFX10-NEXT:    v_lshl_or_b32 v6, v2, 8, v4
1488; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
1489; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
1490; GFX10-NEXT:    v_mov_b32_e32 v3, v1
1491; GFX10-NEXT:    v_perm_b32 v4, v5, v6, 0x4000405
1492; GFX10-NEXT:    global_store_dwordx4 v7, v[0:3], s[8:9]
1493; GFX10-NEXT:    global_store_dword v7, v4, s[10:11]
1494; GFX10-NEXT:    s_endpgm
1495;
1496; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1497; GFX9:       ; %bb.0:
1498; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1499; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1500; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1501; GFX9-NEXT:    s_mov_b32 s0, 0x4000405
1502; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1503; GFX9-NEXT:    global_load_ubyte v1, v0, s[12:13] offset:2
1504; GFX9-NEXT:    global_load_ubyte v2, v0, s[14:15] offset:3
1505; GFX9-NEXT:    global_load_ubyte v3, v0, s[12:13] offset:3
1506; GFX9-NEXT:    global_load_ubyte v4, v0, s[14:15] offset:2
1507; GFX9-NEXT:    s_waitcnt vmcnt(1)
1508; GFX9-NEXT:    v_lshl_or_b32 v6, v3, 8, v1
1509; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1510; GFX9-NEXT:    s_waitcnt vmcnt(0)
1511; GFX9-NEXT:    v_lshl_or_b32 v7, v2, 8, v4
1512; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
1513; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
1514; GFX9-NEXT:    v_mov_b32_e32 v3, v1
1515; GFX9-NEXT:    v_perm_b32 v4, v6, v7, s0
1516; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[8:9]
1517; GFX9-NEXT:    global_store_dword v5, v4, s[10:11]
1518; GFX9-NEXT:    s_endpgm
1519;
1520; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
1521; GFX11:       ; %bb.0:
1522; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
1523; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1524; GFX11-NEXT:    v_mov_b32_e32 v6, 0
1525; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1526; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1527; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1528; GFX11-NEXT:    s_clause 0x3
1529; GFX11-NEXT:    global_load_u8 v1, v0, s[4:5] offset:2
1530; GFX11-NEXT:    global_load_u8 v3, v0, s[4:5] offset:3
1531; GFX11-NEXT:    global_load_u8 v2, v0, s[6:7] offset:3
1532; GFX11-NEXT:    global_load_u8 v0, v0, s[6:7] offset:2
1533; GFX11-NEXT:    s_waitcnt vmcnt(2)
1534; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 8, v1
1535; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
1536; GFX11-NEXT:    s_waitcnt vmcnt(0)
1537; GFX11-NEXT:    v_lshl_or_b32 v5, v2, 8, v0
1538; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
1539; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v3
1540; GFX11-NEXT:    v_mov_b32_e32 v3, v1
1541; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
1542; GFX11-NEXT:    v_perm_b32 v4, v4, v5, 0x4000405
1543; GFX11-NEXT:    s_clause 0x1
1544; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
1545; GFX11-NEXT:    global_store_b32 v6, v4, s[2:3]
1546; GFX11-NEXT:    s_endpgm
1547  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1548  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
1549  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
1550  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
1551  %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
  ; Mask indices 0-3 select from %load and 4-7 from %load1, so the result is
  ; <%load[3], %load[2], %load1[2], %load[2]>.
1552  %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
1553  %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
1554  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1555  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
1556  ret void
1557}
1558
1559; FIXME: Need to handle non-uniform case for function below (load without gep).
1560; Instructions still emitted to repack bytes for add use.
; Kernel under test: each workitem loads one <4 x i8> from %in (indexed by
; workitem.id.x), stores its uitofp result as <4 x float> to %out, and also
; stores the same bytes plus 9 to %out2, so the load has two users.
; In the expected code of every run line all four lanes are converted directly
; from the single loaded dword with v_cvt_f32_ubyte0..3, while the byte-wise
; add is carried out by separate mask/add/or instructions.
1561define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out2, ptr addrspace(1) noalias %in) nounwind {
1562; SI-LABEL: load_v4i8_to_v4f32_2_uses:
1563; SI:       ; %bb.0:
1564; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
1565; SI-NEXT:    s_mov_b32 s3, 0xf000
1566; SI-NEXT:    s_mov_b32 s10, 0
1567; SI-NEXT:    s_mov_b32 s11, s3
1568; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1569; SI-NEXT:    v_mov_b32_e32 v1, 0
1570; SI-NEXT:    s_waitcnt lgkmcnt(0)
1571; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64
1572; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
1573; SI-NEXT:    s_mov_b32 s2, -1
1574; SI-NEXT:    s_waitcnt lgkmcnt(0)
1575; SI-NEXT:    s_mov_b32 s0, s6
1576; SI-NEXT:    s_mov_b32 s1, s7
1577; SI-NEXT:    s_mov_b32 s6, s2
1578; SI-NEXT:    s_mov_b32 s7, s3
1579; SI-NEXT:    s_waitcnt vmcnt(0)
1580; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1581; SI-NEXT:    v_and_b32_e32 v6, 0xff00, v4
1582; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
1583; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
1584; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
1585; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
1586; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
1587; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1588; SI-NEXT:    s_waitcnt expcnt(0)
1589; SI-NEXT:    v_and_b32_e32 v0, 0xff, v4
1590; SI-NEXT:    v_add_i32_e32 v2, vcc, 9, v5
1591; SI-NEXT:    v_and_b32_e32 v1, 0xff00, v5
1592; SI-NEXT:    v_or_b32_e32 v0, v6, v0
1593; SI-NEXT:    v_and_b32_e32 v2, 0xff, v2
1594; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x900, v0
1595; SI-NEXT:    v_or_b32_e32 v1, v1, v2
1596; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1597; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1598; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1599; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x9000000, v0
1600; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1601; SI-NEXT:    s_endpgm
1602;
1603; VI-LABEL: load_v4i8_to_v4f32_2_uses:
1604; VI:       ; %bb.0:
1605; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1606; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1607; VI-NEXT:    s_mov_b32 s7, 0xf000
1608; VI-NEXT:    s_mov_b32 s6, -1
1609; VI-NEXT:    v_mov_b32_e32 v5, 0xffffff00
1610; VI-NEXT:    s_waitcnt lgkmcnt(0)
1611; VI-NEXT:    v_mov_b32_e32 v1, s1
1612; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
1613; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1614; VI-NEXT:    flat_load_dword v4, v[0:1]
1615; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1616; VI-NEXT:    v_mov_b32_e32 v6, 9
1617; VI-NEXT:    v_mov_b32_e32 v7, 0x900
1618; VI-NEXT:    s_waitcnt lgkmcnt(0)
1619; VI-NEXT:    s_mov_b32 s4, s2
1620; VI-NEXT:    s_mov_b32 s5, s3
1621; VI-NEXT:    s_mov_b32 s2, s6
1622; VI-NEXT:    s_mov_b32 s3, s7
1623; VI-NEXT:    s_waitcnt vmcnt(0)
1624; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
1625; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
1626; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
1627; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
1628; VI-NEXT:    v_and_b32_e32 v8, 0xffffff00, v4
1629; VI-NEXT:    v_add_u16_e32 v9, 9, v4
1630; VI-NEXT:    v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1631; VI-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1632; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1633; VI-NEXT:    s_nop 0
1634; VI-NEXT:    v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1635; VI-NEXT:    v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1636; VI-NEXT:    v_add_u16_e32 v0, 0x900, v0
1637; VI-NEXT:    v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1638; VI-NEXT:    v_or_b32_e32 v0, v0, v1
1639; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1640; VI-NEXT:    s_endpgm
1641;
1642; GFX10-LABEL: load_v4i8_to_v4f32_2_uses:
1643; GFX10:       ; %bb.0:
1644; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1645; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1646; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1647; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
1648; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
1649; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1650; GFX10-NEXT:    s_waitcnt vmcnt(0)
1651; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1652; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff00, v0
1653; GFX10-NEXT:    v_add_nc_u16 v4, v0, 9
1654; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff00, v1
1655; GFX10-NEXT:    v_add_nc_u16 v1, v1, 9
1656; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1657; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1658; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1659; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1660; GFX10-NEXT:    v_add_nc_u16 v1, 0x900, v1
1661; GFX10-NEXT:    v_add_nc_u16 v5, 0x900, v2
1662; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1663; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
1664; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1665; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1666; GFX10-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1667; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1668; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1669; GFX10-NEXT:    global_store_dword v4, v5, s[2:3]
1670; GFX10-NEXT:    s_endpgm
1671;
1672; GFX9-LABEL: load_v4i8_to_v4f32_2_uses:
1673; GFX9:       ; %bb.0:
1674; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1675; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1676; GFX9-NEXT:    v_mov_b32_e32 v6, 9
1677; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1678; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1679; GFX9-NEXT:    global_load_dword v4, v0, s[0:1]
1680; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1681; GFX9-NEXT:    s_movk_i32 s4, 0xff00
1682; GFX9-NEXT:    s_movk_i32 s5, 0x900
1683; GFX9-NEXT:    s_waitcnt vmcnt(0)
1684; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v4
1685; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v4
1686; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v4
1687; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
1688; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff00, v4
1689; GFX9-NEXT:    v_add_u16_e32 v8, 9, v4
1690; GFX9-NEXT:    v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1691; GFX9-NEXT:    v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1692; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1693; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
1694; GFX9-NEXT:    s_nop 0
1695; GFX9-NEXT:    v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1696; GFX9-NEXT:    v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1697; GFX9-NEXT:    v_add_u16_e32 v0, 0x900, v0
1698; GFX9-NEXT:    v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1699; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
1700; GFX9-NEXT:    global_store_dword v5, v0, s[2:3]
1701; GFX9-NEXT:    s_endpgm
1702;
1703; GFX11-LABEL: load_v4i8_to_v4f32_2_uses:
1704; GFX11:       ; %bb.0:
1705; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
1706; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1708; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1709; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1710; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
1711; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1712; GFX11-NEXT:    s_waitcnt vmcnt(0)
1713; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1714; GFX11-NEXT:    v_add_nc_u16 v2, v0, 9
1715; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff00, v0
1716; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1717; GFX11-NEXT:    v_add_nc_u16 v3, v1, 9
1718; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
1719; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff00, v1
1720; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1721; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
1722; GFX11-NEXT:    v_or_b32_e32 v2, v4, v2
1723; GFX11-NEXT:    v_mov_b32_e32 v4, 0
1724; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1725; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
1726; GFX11-NEXT:    v_add_nc_u16 v2, 0x900, v2
1727; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v0
1728; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1729; GFX11-NEXT:    v_add_nc_u16 v1, 0x900, v1
1730; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff, v2
1731; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v0
1732; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1733; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
1734; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v0
1735; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1736; GFX11-NEXT:    v_or_b32_e32 v5, v5, v6
1737; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1738; GFX11-NEXT:    s_clause 0x1
1739; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1740; GFX11-NEXT:    global_store_b32 v4, v5, s[2:3]
1741; GFX11-NEXT:    s_endpgm
1742  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
1743  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x ; address of this workitem's <4 x i8>
1744  %load = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
1745  %cvt = uitofp <4 x i8> %load to <4 x float> ; First use of %load
1746  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
1747  %add = add <4 x i8> %load, <i8 9, i8 9, i8 9, i8 9> ; Second use of %load
1748  store <4 x i8> %add, ptr addrspace(1) %out2, align 4
1749  ret void
1750}
1751
1752; Make sure this doesn't crash.
; Kernel under test: loads a <7 x i8> with align 1 (one element per workitem),
; converts it with uitofp to <7 x float> and stores the result to %out.
; The expected code builds the vector from byte loads (plus one 16-bit load at
; offset 4 on the gfx908, gfx1010 and gfx1100 runs) and converts every lane
; with v_cvt_f32_ubyte0 or v_cvt_f32_ubyte1. The address shift by 3 in the
; checks shows an 8-byte stride between workitems.
1753define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1754; SI-LABEL: load_v7i8_to_v7f32:
1755; SI:       ; %bb.0:
1756; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1757; SI-NEXT:    s_mov_b32 s7, 0xf000
1758; SI-NEXT:    s_mov_b32 s10, 0
1759; SI-NEXT:    s_mov_b32 s11, s7
1760; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1761; SI-NEXT:    s_waitcnt lgkmcnt(0)
1762; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1763; SI-NEXT:    v_mov_b32_e32 v1, 0
1764; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3
1765; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2
1766; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1
1767; SI-NEXT:    buffer_load_ubyte v6, v[0:1], s[8:11], 0 addr64
1768; SI-NEXT:    buffer_load_ubyte v7, v[0:1], s[8:11], 0 addr64 offset:5
1769; SI-NEXT:    buffer_load_ubyte v8, v[0:1], s[8:11], 0 addr64 offset:4
1770; SI-NEXT:    buffer_load_ubyte v9, v[0:1], s[8:11], 0 addr64 offset:6
1771; SI-NEXT:    s_mov_b32 s6, -1
1772; SI-NEXT:    s_mov_b32 s4, s0
1773; SI-NEXT:    s_mov_b32 s5, s1
1774; SI-NEXT:    s_waitcnt vmcnt(6)
1775; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v2
1776; SI-NEXT:    s_waitcnt vmcnt(5)
1777; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
1778; SI-NEXT:    s_waitcnt vmcnt(4)
1779; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
1780; SI-NEXT:    s_waitcnt vmcnt(3)
1781; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v6
1782; SI-NEXT:    s_waitcnt vmcnt(2)
1783; SI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v7
1784; SI-NEXT:    s_waitcnt vmcnt(1)
1785; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
1786; SI-NEXT:    s_waitcnt vmcnt(0)
1787; SI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v9
1788; SI-NEXT:    buffer_store_dword v6, off, s[4:7], 0 offset:24
1789; SI-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
1790; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1791; SI-NEXT:    s_endpgm
1792;
1793; VI-LABEL: load_v7i8_to_v7f32:
1794; VI:       ; %bb.0:
1795; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1796; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1797; VI-NEXT:    s_waitcnt lgkmcnt(0)
1798; VI-NEXT:    v_mov_b32_e32 v1, s3
1799; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1800; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1801; VI-NEXT:    v_add_u32_e32 v2, vcc, 5, v0
1802; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1803; VI-NEXT:    flat_load_ubyte v10, v[2:3]
1804; VI-NEXT:    v_add_u32_e32 v2, vcc, 6, v0
1805; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
1806; VI-NEXT:    v_add_u32_e32 v4, vcc, 1, v0
1807; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
1808; VI-NEXT:    v_add_u32_e32 v6, vcc, 2, v0
1809; VI-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
1810; VI-NEXT:    v_add_u32_e32 v8, vcc, 3, v0
1811; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
1812; VI-NEXT:    flat_load_ubyte v6, v[6:7]
1813; VI-NEXT:    flat_load_ubyte v7, v[8:9]
1814; VI-NEXT:    flat_load_ubyte v8, v[2:3]
1815; VI-NEXT:    flat_load_ubyte v2, v[0:1]
1816; VI-NEXT:    flat_load_ubyte v4, v[4:5]
1817; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
1818; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1819; VI-NEXT:    flat_load_ubyte v9, v[0:1]
1820; VI-NEXT:    s_mov_b32 s3, 0xf000
1821; VI-NEXT:    s_mov_b32 s2, -1
1822; VI-NEXT:    s_waitcnt vmcnt(6)
1823; VI-NEXT:    v_cvt_f32_ubyte0_e32 v5, v10
1824; VI-NEXT:    s_waitcnt vmcnt(4)
1825; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v7
1826; VI-NEXT:    s_waitcnt vmcnt(2)
1827; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
1828; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v6
1829; VI-NEXT:    s_waitcnt vmcnt(1)
1830; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
1831; VI-NEXT:    v_cvt_f32_ubyte0_e32 v6, v8
1832; VI-NEXT:    s_waitcnt vmcnt(0)
1833; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
1834; VI-NEXT:    buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16
1835; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1836; VI-NEXT:    s_endpgm
1837;
1838; GFX10-LABEL: load_v7i8_to_v7f32:
1839; GFX10:       ; %bb.0:
1840; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1841; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1842; GFX10-NEXT:    v_mov_b32_e32 v8, 0
1843; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1844; GFX10-NEXT:    s_clause 0x5
1845; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:6
1846; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
1847; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
1848; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3] offset:1
1849; GFX10-NEXT:    global_load_short_d16 v7, v0, s[2:3] offset:4
1850; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1851; GFX10-NEXT:    s_waitcnt vmcnt(5)
1852; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
1853; GFX10-NEXT:    s_waitcnt vmcnt(4)
1854; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
1855; GFX10-NEXT:    s_waitcnt vmcnt(3)
1856; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1857; GFX10-NEXT:    s_waitcnt vmcnt(2)
1858; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
1859; GFX10-NEXT:    s_waitcnt vmcnt(1)
1860; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
1861; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
1862; GFX10-NEXT:    s_waitcnt vmcnt(0)
1863; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1864; GFX10-NEXT:    global_store_dwordx3 v8, v[4:6], s[0:1] offset:16
1865; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
1866; GFX10-NEXT:    s_endpgm
1867;
1868; GFX9-LABEL: load_v7i8_to_v7f32:
1869; GFX9:       ; %bb.0:
1870; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1871; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1872; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1873; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1874; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
1875; GFX9-NEXT:    global_load_ushort v2, v0, s[2:3] offset:4
1876; GFX9-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1877; GFX9-NEXT:    global_load_ubyte v7, v0, s[2:3] offset:2
1878; GFX9-NEXT:    global_load_ubyte v8, v0, s[2:3] offset:1
1879; GFX9-NEXT:    global_load_ubyte v9, v0, s[2:3]
1880; GFX9-NEXT:    s_waitcnt vmcnt(5)
1881; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, v1
1882; GFX9-NEXT:    s_waitcnt vmcnt(4)
1883; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v2
1884; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v2
1885; GFX9-NEXT:    s_waitcnt vmcnt(3)
1886; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
1887; GFX9-NEXT:    s_waitcnt vmcnt(2)
1888; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
1889; GFX9-NEXT:    s_waitcnt vmcnt(1)
1890; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v8
1891; GFX9-NEXT:    s_waitcnt vmcnt(0)
1892; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v9
1893; GFX9-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
1894; GFX9-NEXT:    global_store_dwordx3 v10, v[4:6], s[0:1] offset:16
1895; GFX9-NEXT:    s_endpgm
1896;
1897; GFX11-LABEL: load_v7i8_to_v7f32:
1898; GFX11:       ; %bb.0:
1899; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1900; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1901; GFX11-NEXT:    v_mov_b32_e32 v8, 0
1902; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1903; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1904; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1905; GFX11-NEXT:    s_clause 0x5
1906; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:6
1907; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
1908; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
1909; GFX11-NEXT:    global_load_u8 v5, v0, s[2:3] offset:1
1910; GFX11-NEXT:    global_load_d16_b16 v7, v0, s[2:3] offset:4
1911; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
1912; GFX11-NEXT:    s_waitcnt vmcnt(5)
1913; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, v4
1914; GFX11-NEXT:    s_waitcnt vmcnt(4)
1915; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
1916; GFX11-NEXT:    s_waitcnt vmcnt(3)
1917; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
1918; GFX11-NEXT:    s_waitcnt vmcnt(2)
1919; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
1920; GFX11-NEXT:    s_waitcnt vmcnt(1)
1921; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v7
1922; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v7
1923; GFX11-NEXT:    s_waitcnt vmcnt(0)
1924; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
1925; GFX11-NEXT:    s_clause 0x1
1926; GFX11-NEXT:    global_store_b96 v8, v[4:6], s[0:1] offset:16
1927; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
1928; GFX11-NEXT:    s_endpgm
1929  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1930  %gep = getelementptr <7 x i8>, ptr addrspace(1) %in, i32 %tid
1931  %load = load <7 x i8>, ptr addrspace(1) %gep, align 1 ; odd-sized vector, byte aligned only
1932  %cvt = uitofp <7 x i8> %load to <7 x float>
1933  store <7 x float> %cvt, ptr addrspace(1) %out, align 16
1934  ret void
1935}
1936
; Kernel under test: loads an 8-byte-aligned <8 x i8> (one element per
; workitem), converts it with uitofp to <8 x float> and stores it to %out.
; The expected code for every run line is one 64-bit load, eight
; v_cvt_f32_ubyteN conversions (N = 0..3 on each of the two loaded dwords) and
; two 128-bit stores.
1937define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1938; SI-LABEL: load_v8i8_to_v8f32:
1939; SI:       ; %bb.0:
1940; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1941; SI-NEXT:    s_mov_b32 s7, 0xf000
1942; SI-NEXT:    s_mov_b32 s10, 0
1943; SI-NEXT:    s_mov_b32 s11, s7
1944; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1945; SI-NEXT:    s_waitcnt lgkmcnt(0)
1946; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1947; SI-NEXT:    v_mov_b32_e32 v1, 0
1948; SI-NEXT:    buffer_load_dwordx2 v[7:8], v[0:1], s[8:11], 0 addr64
1949; SI-NEXT:    s_mov_b32 s6, -1
1950; SI-NEXT:    s_mov_b32 s4, s0
1951; SI-NEXT:    s_mov_b32 s5, s1
1952; SI-NEXT:    s_waitcnt vmcnt(0)
1953; SI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
1954; SI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
1955; SI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
1956; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
1957; SI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
1958; SI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
1959; SI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
1960; SI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
1961; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
1962; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1963; SI-NEXT:    s_endpgm
1964;
1965; VI-LABEL: load_v8i8_to_v8f32:
1966; VI:       ; %bb.0:
1967; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1968; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1969; VI-NEXT:    s_waitcnt lgkmcnt(0)
1970; VI-NEXT:    v_mov_b32_e32 v1, s3
1971; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1972; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1973; VI-NEXT:    flat_load_dwordx2 v[7:8], v[0:1]
1974; VI-NEXT:    s_mov_b32 s3, 0xf000
1975; VI-NEXT:    s_mov_b32 s2, -1
1976; VI-NEXT:    s_waitcnt vmcnt(0)
1977; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
1978; VI-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
1979; VI-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
1980; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
1981; VI-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
1982; VI-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
1983; VI-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
1984; VI-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
1985; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
1986; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1987; VI-NEXT:    s_endpgm
1988;
1989; GFX10-LABEL: load_v8i8_to_v8f32:
1990; GFX10:       ; %bb.0:
1991; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1992; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1993; GFX10-NEXT:    v_mov_b32_e32 v10, 0
1994; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1995; GFX10-NEXT:    global_load_dwordx2 v[8:9], v0, s[2:3]
1996; GFX10-NEXT:    s_waitcnt vmcnt(0)
1997; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
1998; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
1999; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v5, v9
2000; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
2001; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v3, v8
2002; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
2003; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
2004; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
2005; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
2006; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
2007; GFX10-NEXT:    s_endpgm
2008;
2009; GFX9-LABEL: load_v8i8_to_v8f32:
2010; GFX9:       ; %bb.0:
2011; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2012; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2013; GFX9-NEXT:    v_mov_b32_e32 v9, 0
2014; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2015; GFX9-NEXT:    global_load_dwordx2 v[7:8], v0, s[2:3]
2016; GFX9-NEXT:    s_waitcnt vmcnt(0)
2017; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v3, v7
2018; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v2, v7
2019; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v1, v7
2020; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v7
2021; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v7, v8
2022; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v6, v8
2023; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v5, v8
2024; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v4, v8
2025; GFX9-NEXT:    global_store_dwordx4 v9, v[4:7], s[0:1] offset:16
2026; GFX9-NEXT:    global_store_dwordx4 v9, v[0:3], s[0:1]
2027; GFX9-NEXT:    s_endpgm
2028;
2029; GFX11-LABEL: load_v8i8_to_v8f32:
2030; GFX11:       ; %bb.0:
2031; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2032; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2033; GFX11-NEXT:    v_mov_b32_e32 v10, 0
2034; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2035; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
2036; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2037; GFX11-NEXT:    global_load_b64 v[8:9], v0, s[2:3]
2038; GFX11-NEXT:    s_waitcnt vmcnt(0)
2039; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v7, v9
2040; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v6, v9
2041; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v5, v9
2042; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v4, v9
2043; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v3, v8
2044; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v2, v8
2045; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v1, v8
2046; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v8
2047; GFX11-NEXT:    s_clause 0x1
2048; GFX11-NEXT:    global_store_b128 v10, v[4:7], s[0:1] offset:16
2049; GFX11-NEXT:    global_store_b128 v10, v[0:3], s[0:1]
2050; GFX11-NEXT:    s_endpgm
2051  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2052  %gep = getelementptr <8 x i8>, ptr addrspace(1) %in, i32 %tid
2053  %load = load <8 x i8>, ptr addrspace(1) %gep, align 8 ; naturally aligned, unlike the <7 x i8> case
2054  %cvt = uitofp <8 x i8> %load to <8 x float>
2055  store <8 x float> %cvt, ptr addrspace(1) %out, align 16
2056  ret void
2057}
2058
; Kernel under test: loads an i32 per workitem, adds 2, masks the sum to its
; low 8 bits (and with 255) and converts the result with uitofp.
; The expected code keeps the 32-bit add and feeds it straight into
; v_cvt_f32_ubyte0; no separate 0xff mask instruction appears in the checks.
2059define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2060; SI-LABEL: i8_zext_inreg_i32_to_f32:
2061; SI:       ; %bb.0:
2062; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2063; SI-NEXT:    s_mov_b32 s7, 0xf000
2064; SI-NEXT:    s_mov_b32 s10, 0
2065; SI-NEXT:    s_mov_b32 s11, s7
2066; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2067; SI-NEXT:    s_waitcnt lgkmcnt(0)
2068; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2069; SI-NEXT:    v_mov_b32_e32 v1, 0
2070; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2071; SI-NEXT:    s_mov_b32 s6, -1
2072; SI-NEXT:    s_mov_b32 s4, s0
2073; SI-NEXT:    s_mov_b32 s5, s1
2074; SI-NEXT:    s_waitcnt vmcnt(0)
2075; SI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
2076; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2077; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2078; SI-NEXT:    s_endpgm
2079;
2080; VI-LABEL: i8_zext_inreg_i32_to_f32:
2081; VI:       ; %bb.0:
2082; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2083; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2084; VI-NEXT:    s_waitcnt lgkmcnt(0)
2085; VI-NEXT:    v_mov_b32_e32 v1, s3
2086; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2087; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2088; VI-NEXT:    flat_load_dword v0, v[0:1]
2089; VI-NEXT:    s_mov_b32 s3, 0xf000
2090; VI-NEXT:    s_mov_b32 s2, -1
2091; VI-NEXT:    s_waitcnt vmcnt(0)
2092; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
2093; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2094; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2095; VI-NEXT:    s_endpgm
2096;
2097; GFX10-LABEL: i8_zext_inreg_i32_to_f32:
2098; GFX10:       ; %bb.0:
2099; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2100; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2101; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2102; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2103; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2104; GFX10-NEXT:    s_waitcnt vmcnt(0)
2105; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
2106; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2107; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2108; GFX10-NEXT:    s_endpgm
2109;
2110; GFX9-LABEL: i8_zext_inreg_i32_to_f32:
2111; GFX9:       ; %bb.0:
2112; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2113; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2114; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2115; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2116; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2117; GFX9-NEXT:    s_waitcnt vmcnt(0)
2118; GFX9-NEXT:    v_add_u32_e32 v0, 2, v0
2119; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2120; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2121; GFX9-NEXT:    s_endpgm
2122;
2123; GFX11-LABEL: i8_zext_inreg_i32_to_f32:
2124; GFX11:       ; %bb.0:
2125; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2126; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2127; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2128; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2129; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2130; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2131; GFX11-NEXT:    s_waitcnt vmcnt(0)
2132; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
2133; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2134; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2135; GFX11-NEXT:    s_endpgm
2136  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2137  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2138  %load = load i32, ptr addrspace(1) %gep, align 4
2139  %add = add i32 %load, 2
2140  %inreg = and i32 %add, 255 ; keep only the low byte of the sum
2141  %cvt = uitofp i32 %inreg to float
2142  store float %cvt, ptr addrspace(1) %out, align 4
2143  ret void
2144}
2145
; Kernel under test: loads an i32 per workitem, isolates bits 8 through 15
; (and with 65280, then lshr by 8) and converts that byte with uitofp.
; The expected code for every run line is a single v_cvt_f32_ubyte1 applied
; to the loaded dword, with no separate mask or shift instruction.
2146define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2147; SI-LABEL: i8_zext_inreg_hi1_to_f32:
2148; SI:       ; %bb.0:
2149; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2150; SI-NEXT:    s_mov_b32 s7, 0xf000
2151; SI-NEXT:    s_mov_b32 s10, 0
2152; SI-NEXT:    s_mov_b32 s11, s7
2153; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2154; SI-NEXT:    s_waitcnt lgkmcnt(0)
2155; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2156; SI-NEXT:    v_mov_b32_e32 v1, 0
2157; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2158; SI-NEXT:    s_mov_b32 s6, -1
2159; SI-NEXT:    s_mov_b32 s4, s0
2160; SI-NEXT:    s_mov_b32 s5, s1
2161; SI-NEXT:    s_waitcnt vmcnt(0)
2162; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2163; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2164; SI-NEXT:    s_endpgm
2165;
2166; VI-LABEL: i8_zext_inreg_hi1_to_f32:
2167; VI:       ; %bb.0:
2168; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2169; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2170; VI-NEXT:    s_waitcnt lgkmcnt(0)
2171; VI-NEXT:    v_mov_b32_e32 v1, s3
2172; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2173; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2174; VI-NEXT:    flat_load_dword v0, v[0:1]
2175; VI-NEXT:    s_mov_b32 s3, 0xf000
2176; VI-NEXT:    s_mov_b32 s2, -1
2177; VI-NEXT:    s_waitcnt vmcnt(0)
2178; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2179; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2180; VI-NEXT:    s_endpgm
2181;
2182; GFX10-LABEL: i8_zext_inreg_hi1_to_f32:
2183; GFX10:       ; %bb.0:
2184; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2185; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2186; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2187; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2188; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2189; GFX10-NEXT:    s_waitcnt vmcnt(0)
2190; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2191; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2192; GFX10-NEXT:    s_endpgm
2193;
2194; GFX9-LABEL: i8_zext_inreg_hi1_to_f32:
2195; GFX9:       ; %bb.0:
2196; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2197; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2198; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2199; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2200; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2201; GFX9-NEXT:    s_waitcnt vmcnt(0)
2202; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2203; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2204; GFX9-NEXT:    s_endpgm
2205;
2206; GFX11-LABEL: i8_zext_inreg_hi1_to_f32:
2207; GFX11:       ; %bb.0:
2208; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2209; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2210; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2211; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2212; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2213; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2214; GFX11-NEXT:    s_waitcnt vmcnt(0)
2215; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2216; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2217; GFX11-NEXT:    s_endpgm
2218  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2219  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2220  %load = load i32, ptr addrspace(1) %gep, align 4
2221  %inreg = and i32 %load, 65280 ; 65280 = 0xff00, selects the second byte
2222  %shr = lshr i32 %inreg, 8 ; move that byte down to bit 0
2223  %cvt = uitofp i32 %shr to float
2224  store float %cvt, ptr addrspace(1) %out, align 4
2225  ret void
2226}
2227
2228; We don't get these ones because of the zext, but instcombine removes
2229; them so it shouldn't really matter.
; Kernel under test: loads a single i8 (address is %in plus the workitem id),
; zero-extends it to i32 and converts it with uitofp.
; The expected code for every run line is an unsigned byte load followed by
; v_cvt_f32_ubyte0.
; NOTE(review): since the checks below do select v_cvt_f32_ubyte0, the
; remark above about not getting these looks stale - confirm before relying
; on it.
2230define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2231; SI-LABEL: i8_zext_i32_to_f32:
2232; SI:       ; %bb.0:
2233; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2234; SI-NEXT:    s_mov_b32 s7, 0xf000
2235; SI-NEXT:    v_mov_b32_e32 v1, 0
2236; SI-NEXT:    s_mov_b32 s10, 0
2237; SI-NEXT:    s_mov_b32 s11, s7
2238; SI-NEXT:    s_waitcnt lgkmcnt(0)
2239; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2240; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
2241; SI-NEXT:    s_mov_b32 s6, -1
2242; SI-NEXT:    s_mov_b32 s4, s0
2243; SI-NEXT:    s_mov_b32 s5, s1
2244; SI-NEXT:    s_waitcnt vmcnt(0)
2245; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2246; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2247; SI-NEXT:    s_endpgm
2248;
2249; VI-LABEL: i8_zext_i32_to_f32:
2250; VI:       ; %bb.0:
2251; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2252; VI-NEXT:    s_waitcnt lgkmcnt(0)
2253; VI-NEXT:    v_mov_b32_e32 v1, s3
2254; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2255; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2256; VI-NEXT:    flat_load_ubyte v0, v[0:1]
2257; VI-NEXT:    s_mov_b32 s3, 0xf000
2258; VI-NEXT:    s_mov_b32 s2, -1
2259; VI-NEXT:    s_waitcnt vmcnt(0)
2260; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2261; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2262; VI-NEXT:    s_endpgm
2263;
2264; GFX10-LABEL: i8_zext_i32_to_f32:
2265; GFX10:       ; %bb.0:
2266; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2267; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2268; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2269; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
2270; GFX10-NEXT:    s_waitcnt vmcnt(0)
2271; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2272; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2273; GFX10-NEXT:    s_endpgm
2274;
2275; GFX9-LABEL: i8_zext_i32_to_f32:
2276; GFX9:       ; %bb.0:
2277; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2278; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2279; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2280; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3]
2281; GFX9-NEXT:    s_waitcnt vmcnt(0)
2282; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2283; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2284; GFX9-NEXT:    s_endpgm
2285;
2286; GFX11-LABEL: i8_zext_i32_to_f32:
2287; GFX11:       ; %bb.0:
2288; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2289; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2290; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2291; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
2292; GFX11-NEXT:    s_waitcnt vmcnt(0)
2293; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2294; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2295; GFX11-NEXT:    s_endpgm
2296  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2297  %gep = getelementptr i8, ptr addrspace(1) %in, i32 %tid
2298  %load = load i8, ptr addrspace(1) %gep, align 1
2299  %ext = zext i8 %load to i32 ; explicit zero-extension instead of an and-mask
2300  %cvt = uitofp i32 %ext to float
2301  store float %cvt, ptr addrspace(1) %out, align 4
2302  ret void
2303}
2304
; Vector form of the zext-then-uitofp pattern. Each workitem loads a <4 x i8>
; (declared with align 1), zero-extends it to <4 x i32>, converts it to
; <4 x float> and stores the result to %out.
; The checks for every run line expect four separate byte loads, one
; v_cvt_f32_ubyte0 per loaded byte, and a single 128-bit store.
2305define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2306; SI-LABEL: v4i8_zext_v4i32_to_v4f32:
2307; SI:       ; %bb.0:
2308; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2309; SI-NEXT:    s_mov_b32 s7, 0xf000
2310; SI-NEXT:    s_mov_b32 s10, 0
2311; SI-NEXT:    s_mov_b32 s11, s7
2312; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2313; SI-NEXT:    s_waitcnt lgkmcnt(0)
2314; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2315; SI-NEXT:    v_mov_b32_e32 v1, 0
2316; SI-NEXT:    buffer_load_ubyte v2, v[0:1], s[8:11], 0 addr64 offset:3
2317; SI-NEXT:    buffer_load_ubyte v4, v[0:1], s[8:11], 0 addr64 offset:2
2318; SI-NEXT:    buffer_load_ubyte v5, v[0:1], s[8:11], 0 addr64 offset:1
2319; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
2320; SI-NEXT:    s_mov_b32 s6, -1
2321; SI-NEXT:    s_mov_b32 s4, s0
2322; SI-NEXT:    s_mov_b32 s5, s1
2323; SI-NEXT:    s_waitcnt vmcnt(3)
2324; SI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v2
2325; SI-NEXT:    s_waitcnt vmcnt(2)
2326; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
2327; SI-NEXT:    s_waitcnt vmcnt(1)
2328; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
2329; SI-NEXT:    s_waitcnt vmcnt(0)
2330; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2331; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2332; SI-NEXT:    s_endpgm
2333;
2334; VI-LABEL: v4i8_zext_v4i32_to_v4f32:
2335; VI:       ; %bb.0:
2336; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2337; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2338; VI-NEXT:    s_waitcnt lgkmcnt(0)
2339; VI-NEXT:    v_mov_b32_e32 v1, s3
2340; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2341; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2342; VI-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
2343; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
2344; VI-NEXT:    v_add_u32_e32 v4, vcc, 3, v0
2345; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2346; VI-NEXT:    flat_load_ubyte v2, v[2:3]
2347; VI-NEXT:    flat_load_ubyte v3, v[4:5]
2348; VI-NEXT:    flat_load_ubyte v4, v[0:1]
2349; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
2350; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2351; VI-NEXT:    flat_load_ubyte v1, v[0:1]
2352; VI-NEXT:    s_mov_b32 s3, 0xf000
2353; VI-NEXT:    s_mov_b32 s2, -1
2354; VI-NEXT:    s_waitcnt vmcnt(3)
2355; VI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
2356; VI-NEXT:    s_waitcnt vmcnt(2)
2357; VI-NEXT:    v_cvt_f32_ubyte0_e32 v3, v3
2358; VI-NEXT:    s_waitcnt vmcnt(1)
2359; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v4
2360; VI-NEXT:    s_waitcnt vmcnt(0)
2361; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
2362; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2363; VI-NEXT:    s_endpgm
2364;
2365; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32:
2366; GFX10:       ; %bb.0:
2367; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2368; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2369; GFX10-NEXT:    v_mov_b32_e32 v6, 0
2370; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2371; GFX10-NEXT:    s_clause 0x3
2372; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
2373; GFX10-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
2374; GFX10-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
2375; GFX10-NEXT:    global_load_ubyte v5, v0, s[2:3]
2376; GFX10-NEXT:    s_waitcnt vmcnt(3)
2377; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
2378; GFX10-NEXT:    s_waitcnt vmcnt(2)
2379; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
2380; GFX10-NEXT:    s_waitcnt vmcnt(1)
2381; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
2382; GFX10-NEXT:    s_waitcnt vmcnt(0)
2383; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
2384; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
2385; GFX10-NEXT:    s_endpgm
2386;
2387; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32:
2388; GFX9:       ; %bb.0:
2389; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2390; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2391; GFX9-NEXT:    v_mov_b32_e32 v6, 0
2392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2393; GFX9-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
2394; GFX9-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:2
2395; GFX9-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:1
2396; GFX9-NEXT:    global_load_ubyte v5, v0, s[2:3]
2397; GFX9-NEXT:    s_waitcnt vmcnt(3)
2398; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
2399; GFX9-NEXT:    s_waitcnt vmcnt(2)
2400; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
2401; GFX9-NEXT:    s_waitcnt vmcnt(1)
2402; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
2403; GFX9-NEXT:    s_waitcnt vmcnt(0)
2404; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v5
2405; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[0:1]
2406; GFX9-NEXT:    s_endpgm
2407;
2408; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32:
2409; GFX11:       ; %bb.0:
2410; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2411; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2412; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2413; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2414; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2415; GFX11-NEXT:    s_clause 0x3
2416; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3] offset:3
2417; GFX11-NEXT:    global_load_u8 v2, v0, s[2:3] offset:2
2418; GFX11-NEXT:    global_load_u8 v4, v0, s[2:3] offset:1
2419; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
2420; GFX11-NEXT:    s_waitcnt vmcnt(3)
2421; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v3, v1
2422; GFX11-NEXT:    s_waitcnt vmcnt(2)
2423; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v2, v2
2424; GFX11-NEXT:    s_waitcnt vmcnt(1)
2425; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
2426; GFX11-NEXT:    s_waitcnt vmcnt(0)
2427; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2428; GFX11-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
2429; GFX11-NEXT:    s_endpgm
  ; Element type is <4 x i8>, so the per-workitem stride is four bytes.
2430  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2431  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
  ; The vector load is only byte-aligned (align 1).
2432  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
2433  %ext = zext <4 x i8> %load to <4 x i32>
2434  %cvt = uitofp <4 x i32> %ext to <4 x float>
2435  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
2436  ret void
2437}
2438
; Byte 0 of a loaded i32 is isolated with 'and 255' and converted with uitofp.
; The checks for every run line expect one dword load whose result feeds
; v_cvt_f32_ubyte0 directly, with no mask instruction applied to the loaded value.
2439define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2440; SI-LABEL: extract_byte0_to_f32:
2441; SI:       ; %bb.0:
2442; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2443; SI-NEXT:    s_mov_b32 s7, 0xf000
2444; SI-NEXT:    s_mov_b32 s10, 0
2445; SI-NEXT:    s_mov_b32 s11, s7
2446; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2447; SI-NEXT:    s_waitcnt lgkmcnt(0)
2448; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2449; SI-NEXT:    v_mov_b32_e32 v1, 0
2450; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2451; SI-NEXT:    s_mov_b32 s6, -1
2452; SI-NEXT:    s_mov_b32 s4, s0
2453; SI-NEXT:    s_mov_b32 s5, s1
2454; SI-NEXT:    s_waitcnt vmcnt(0)
2455; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2456; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2457; SI-NEXT:    s_endpgm
2458;
2459; VI-LABEL: extract_byte0_to_f32:
2460; VI:       ; %bb.0:
2461; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2462; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2463; VI-NEXT:    s_waitcnt lgkmcnt(0)
2464; VI-NEXT:    v_mov_b32_e32 v1, s3
2465; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2466; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2467; VI-NEXT:    flat_load_dword v0, v[0:1]
2468; VI-NEXT:    s_mov_b32 s3, 0xf000
2469; VI-NEXT:    s_mov_b32 s2, -1
2470; VI-NEXT:    s_waitcnt vmcnt(0)
2471; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2472; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2473; VI-NEXT:    s_endpgm
2474;
2475; GFX10-LABEL: extract_byte0_to_f32:
2476; GFX10:       ; %bb.0:
2477; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2478; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2479; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2480; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2481; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2482; GFX10-NEXT:    s_waitcnt vmcnt(0)
2483; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2484; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2485; GFX10-NEXT:    s_endpgm
2486;
2487; GFX9-LABEL: extract_byte0_to_f32:
2488; GFX9:       ; %bb.0:
2489; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2490; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2491; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2492; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2493; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2494; GFX9-NEXT:    s_waitcnt vmcnt(0)
2495; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2496; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2497; GFX9-NEXT:    s_endpgm
2498;
2499; GFX11-LABEL: extract_byte0_to_f32:
2500; GFX11:       ; %bb.0:
2501; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2502; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2503; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2504; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2505; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2506; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2507; GFX11-NEXT:    s_waitcnt vmcnt(0)
2508; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2509; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2510; GFX11-NEXT:    s_endpgm
2511  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2512  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2513  %val = load i32, ptr addrspace(1) %gep
  ; Keep only bits 7..0 of the loaded dword.
2514  %and = and i32 %val, 255
2515  %cvt = uitofp i32 %and to float
2516  store float %cvt, ptr addrspace(1) %out
2517  ret void
2518}
2519
; Byte 1 of a loaded i32 is isolated with 'lshr 8' followed by 'and 255' and
; converted with uitofp.
; The checks for every run line expect one dword load whose result feeds
; v_cvt_f32_ubyte1 directly, with no shift or mask applied to the loaded value.
2520define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2521; SI-LABEL: extract_byte1_to_f32:
2522; SI:       ; %bb.0:
2523; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2524; SI-NEXT:    s_mov_b32 s7, 0xf000
2525; SI-NEXT:    s_mov_b32 s10, 0
2526; SI-NEXT:    s_mov_b32 s11, s7
2527; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2528; SI-NEXT:    s_waitcnt lgkmcnt(0)
2529; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2530; SI-NEXT:    v_mov_b32_e32 v1, 0
2531; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2532; SI-NEXT:    s_mov_b32 s6, -1
2533; SI-NEXT:    s_mov_b32 s4, s0
2534; SI-NEXT:    s_mov_b32 s5, s1
2535; SI-NEXT:    s_waitcnt vmcnt(0)
2536; SI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2537; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2538; SI-NEXT:    s_endpgm
2539;
2540; VI-LABEL: extract_byte1_to_f32:
2541; VI:       ; %bb.0:
2542; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2543; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2544; VI-NEXT:    s_waitcnt lgkmcnt(0)
2545; VI-NEXT:    v_mov_b32_e32 v1, s3
2546; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2547; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2548; VI-NEXT:    flat_load_dword v0, v[0:1]
2549; VI-NEXT:    s_mov_b32 s3, 0xf000
2550; VI-NEXT:    s_mov_b32 s2, -1
2551; VI-NEXT:    s_waitcnt vmcnt(0)
2552; VI-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2553; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2554; VI-NEXT:    s_endpgm
2555;
2556; GFX10-LABEL: extract_byte1_to_f32:
2557; GFX10:       ; %bb.0:
2558; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2559; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2560; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2561; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2562; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2563; GFX10-NEXT:    s_waitcnt vmcnt(0)
2564; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2565; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2566; GFX10-NEXT:    s_endpgm
2567;
2568; GFX9-LABEL: extract_byte1_to_f32:
2569; GFX9:       ; %bb.0:
2570; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2571; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2572; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2573; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2574; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2575; GFX9-NEXT:    s_waitcnt vmcnt(0)
2576; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2577; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2578; GFX9-NEXT:    s_endpgm
2579;
2580; GFX11-LABEL: extract_byte1_to_f32:
2581; GFX11:       ; %bb.0:
2582; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2583; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2584; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2585; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2586; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2587; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2588; GFX11-NEXT:    s_waitcnt vmcnt(0)
2589; GFX11-NEXT:    v_cvt_f32_ubyte1_e32 v0, v0
2590; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2591; GFX11-NEXT:    s_endpgm
2592  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2593  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2594  %val = load i32, ptr addrspace(1) %gep
  ; Move bits 15..8 down to bits 7..0, then drop everything above them.
2595  %srl = lshr i32 %val, 8
2596  %and = and i32 %srl, 255
2597  %cvt = uitofp i32 %and to float
2598  store float %cvt, ptr addrspace(1) %out
2599  ret void
2600}
2601
; Byte 2 of a loaded i32 is isolated with 'lshr 16' followed by 'and 255' and
; converted with uitofp.
; The checks for every run line expect one dword load whose result feeds
; v_cvt_f32_ubyte2 directly, with no shift or mask applied to the loaded value.
2602define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2603; SI-LABEL: extract_byte2_to_f32:
2604; SI:       ; %bb.0:
2605; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2606; SI-NEXT:    s_mov_b32 s7, 0xf000
2607; SI-NEXT:    s_mov_b32 s10, 0
2608; SI-NEXT:    s_mov_b32 s11, s7
2609; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2610; SI-NEXT:    s_waitcnt lgkmcnt(0)
2611; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2612; SI-NEXT:    v_mov_b32_e32 v1, 0
2613; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2614; SI-NEXT:    s_mov_b32 s6, -1
2615; SI-NEXT:    s_mov_b32 s4, s0
2616; SI-NEXT:    s_mov_b32 s5, s1
2617; SI-NEXT:    s_waitcnt vmcnt(0)
2618; SI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
2619; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2620; SI-NEXT:    s_endpgm
2621;
2622; VI-LABEL: extract_byte2_to_f32:
2623; VI:       ; %bb.0:
2624; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2625; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2626; VI-NEXT:    s_waitcnt lgkmcnt(0)
2627; VI-NEXT:    v_mov_b32_e32 v1, s3
2628; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2629; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2630; VI-NEXT:    flat_load_dword v0, v[0:1]
2631; VI-NEXT:    s_mov_b32 s3, 0xf000
2632; VI-NEXT:    s_mov_b32 s2, -1
2633; VI-NEXT:    s_waitcnt vmcnt(0)
2634; VI-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
2635; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2636; VI-NEXT:    s_endpgm
2637;
2638; GFX10-LABEL: extract_byte2_to_f32:
2639; GFX10:       ; %bb.0:
2640; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2641; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2642; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2643; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2644; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2645; GFX10-NEXT:    s_waitcnt vmcnt(0)
2646; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
2647; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2648; GFX10-NEXT:    s_endpgm
2649;
2650; GFX9-LABEL: extract_byte2_to_f32:
2651; GFX9:       ; %bb.0:
2652; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2653; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2654; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2655; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2656; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2657; GFX9-NEXT:    s_waitcnt vmcnt(0)
2658; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
2659; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2660; GFX9-NEXT:    s_endpgm
2661;
2662; GFX11-LABEL: extract_byte2_to_f32:
2663; GFX11:       ; %bb.0:
2664; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2665; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2666; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2667; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2668; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2669; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2670; GFX11-NEXT:    s_waitcnt vmcnt(0)
2671; GFX11-NEXT:    v_cvt_f32_ubyte2_e32 v0, v0
2672; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2673; GFX11-NEXT:    s_endpgm
2674  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2675  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2676  %val = load i32, ptr addrspace(1) %gep
  ; Move bits 23..16 down to bits 7..0, then drop everything above them.
2677  %srl = lshr i32 %val, 16
2678  %and = and i32 %srl, 255
2679  %cvt = uitofp i32 %and to float
2680  store float %cvt, ptr addrspace(1) %out
2681  ret void
2682}
2683
; Byte 3 of a loaded i32 is isolated with 'lshr 24' followed by 'and 255' and
; converted with uitofp. After a logical shift right by 24 only eight bits
; remain, so the mask does not change the value.
; The checks for every run line expect one dword load whose result feeds
; v_cvt_f32_ubyte3 directly, with no shift or mask applied to the loaded value.
2684define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
2685; SI-LABEL: extract_byte3_to_f32:
2686; SI:       ; %bb.0:
2687; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2688; SI-NEXT:    s_mov_b32 s7, 0xf000
2689; SI-NEXT:    s_mov_b32 s10, 0
2690; SI-NEXT:    s_mov_b32 s11, s7
2691; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2692; SI-NEXT:    s_waitcnt lgkmcnt(0)
2693; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2694; SI-NEXT:    v_mov_b32_e32 v1, 0
2695; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2696; SI-NEXT:    s_mov_b32 s6, -1
2697; SI-NEXT:    s_mov_b32 s4, s0
2698; SI-NEXT:    s_mov_b32 s5, s1
2699; SI-NEXT:    s_waitcnt vmcnt(0)
2700; SI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
2701; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2702; SI-NEXT:    s_endpgm
2703;
2704; VI-LABEL: extract_byte3_to_f32:
2705; VI:       ; %bb.0:
2706; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2707; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2708; VI-NEXT:    s_waitcnt lgkmcnt(0)
2709; VI-NEXT:    v_mov_b32_e32 v1, s3
2710; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2711; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2712; VI-NEXT:    flat_load_dword v0, v[0:1]
2713; VI-NEXT:    s_mov_b32 s3, 0xf000
2714; VI-NEXT:    s_mov_b32 s2, -1
2715; VI-NEXT:    s_waitcnt vmcnt(0)
2716; VI-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
2717; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2718; VI-NEXT:    s_endpgm
2719;
2720; GFX10-LABEL: extract_byte3_to_f32:
2721; GFX10:       ; %bb.0:
2722; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2723; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2724; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2725; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2726; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
2727; GFX10-NEXT:    s_waitcnt vmcnt(0)
2728; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
2729; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
2730; GFX10-NEXT:    s_endpgm
2731;
2732; GFX9-LABEL: extract_byte3_to_f32:
2733; GFX9:       ; %bb.0:
2734; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2735; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2736; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2737; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2738; GFX9-NEXT:    global_load_dword v0, v0, s[2:3]
2739; GFX9-NEXT:    s_waitcnt vmcnt(0)
2740; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
2741; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
2742; GFX9-NEXT:    s_endpgm
2743;
2744; GFX11-LABEL: extract_byte3_to_f32:
2745; GFX11:       ; %bb.0:
2746; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2747; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
2748; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2749; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2750; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2751; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
2752; GFX11-NEXT:    s_waitcnt vmcnt(0)
2753; GFX11-NEXT:    v_cvt_f32_ubyte3_e32 v0, v0
2754; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
2755; GFX11-NEXT:    s_endpgm
2756  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2757  %gep = getelementptr i32, ptr addrspace(1) %in, i32 %tid
2758  %val = load i32, ptr addrspace(1) %gep
  ; Move bits 31..24 down to bits 7..0; the following mask keeps those eight bits.
2759  %srl = lshr i32 %val, 24
2760  %and = and i32 %srl, 255
2761  %cvt = uitofp i32 %and to float
2762  store float %cvt, ptr addrspace(1) %out
2763  ret void
2764}
2765
; Multi-use case. The result of the 'or' is used twice, once masked to its low
; byte and converted with uitofp, and once bitcast as a whole to float for the
; fadd. Note the argument order here is (%in, %out).
; The checks for every run line expect the v_or_b32 with 0x80000001 to be kept,
; v_cvt_f32_ubyte0 to read the or result, and v_add_f32 to combine the two.
2766define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addrspace(1) %out) {
2767; SI-LABEL: cvt_ubyte0_or_multiuse:
2768; SI:       ; %bb.0: ; %bb
2769; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2770; SI-NEXT:    s_mov_b32 s7, 0xf000
2771; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2772; SI-NEXT:    v_mov_b32_e32 v1, 0
2773; SI-NEXT:    s_mov_b32 s6, -1
2774; SI-NEXT:    s_waitcnt lgkmcnt(0)
2775; SI-NEXT:    s_mov_b32 s4, s2
2776; SI-NEXT:    s_mov_b32 s5, s3
2777; SI-NEXT:    s_mov_b32 s2, 0
2778; SI-NEXT:    s_mov_b32 s3, s7
2779; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
2780; SI-NEXT:    s_waitcnt vmcnt(0)
2781; SI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
2782; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
2783; SI-NEXT:    v_add_f32_e32 v0, v0, v1
2784; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2785; SI-NEXT:    s_endpgm
2786;
2787; VI-LABEL: cvt_ubyte0_or_multiuse:
2788; VI:       ; %bb.0: ; %bb
2789; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2790; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2791; VI-NEXT:    s_mov_b32 s7, 0xf000
2792; VI-NEXT:    s_mov_b32 s6, -1
2793; VI-NEXT:    s_waitcnt lgkmcnt(0)
2794; VI-NEXT:    v_mov_b32_e32 v1, s1
2795; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
2796; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2797; VI-NEXT:    flat_load_dword v0, v[0:1]
2798; VI-NEXT:    s_mov_b32 s4, s2
2799; VI-NEXT:    s_mov_b32 s5, s3
2800; VI-NEXT:    s_waitcnt vmcnt(0)
2801; VI-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
2802; VI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
2803; VI-NEXT:    v_add_f32_e32 v0, v0, v1
2804; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2805; VI-NEXT:    s_endpgm
2806;
2807; GFX10-LABEL: cvt_ubyte0_or_multiuse:
2808; GFX10:       ; %bb.0: ; %bb
2809; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2810; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2811; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2812; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2813; GFX10-NEXT:    global_load_dword v0, v0, s[0:1]
2814; GFX10-NEXT:    s_waitcnt vmcnt(0)
2815; GFX10-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
2816; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
2817; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
2818; GFX10-NEXT:    global_store_dword v2, v0, s[2:3]
2819; GFX10-NEXT:    s_endpgm
2820;
2821; GFX9-LABEL: cvt_ubyte0_or_multiuse:
2822; GFX9:       ; %bb.0: ; %bb
2823; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2824; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2825; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2826; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX9-NEXT:    global_load_dword v0, v0, s[0:1]
2828; GFX9-NEXT:    s_waitcnt vmcnt(0)
2829; GFX9-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
2830; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v0
2831; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
2832; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
2833; GFX9-NEXT:    s_endpgm
2834;
2835; GFX11-LABEL: cvt_ubyte0_or_multiuse:
2836; GFX11:       ; %bb.0: ; %bb
2837; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2838; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
2839; GFX11-NEXT:    v_mov_b32_e32 v2, 0
2840; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
2841; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2842; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2843; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
2844; GFX11-NEXT:    s_waitcnt vmcnt(0)
2845; GFX11-NEXT:    v_or_b32_e32 v0, 0x80000001, v0
2846; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v0
2847; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2848; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
2849; GFX11-NEXT:    global_store_b32 v2, v0, s[2:3]
2850; GFX11-NEXT:    s_endpgm
2851bb:
2852  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
2853  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %lid
2854  %load = load i32, ptr addrspace(1) %gep
  ; -2147483647 is 0x80000001, so this sets bit 31 and bit 0 of the loaded value.
2855  %or = or i32 %load, -2147483647
  ; First use of %or, only its low byte feeds the integer-to-float conversion.
2856  %and = and i32 %or, 255
2857  %uitofp = uitofp i32 %and to float
  ; Second use of %or, all 32 bits reinterpreted as a float.
2858  %cast = bitcast i32 %or to float
2859  %add = fadd float %cast, %uitofp
2860  store float %add, ptr addrspace(1) %out
2861  ret void
2862}
2863
; Struct wrapping a four-byte array; referenced as the getelementptr element
; type in @cvt_f32_ubyte0_vector.
2864%Vec = type { [4 x i8] }
2865
; Kernel with no arguments; every pointer it dereferences is derived from undef.
; It loads an i32 with align 1, converts the top byte (lshr 24, trunc to i8,
; uitofp) to float, multiplies by a loaded float, adds 0.5, converts back to i8
; with fptoui, and rebuilds a 32-bit value from that byte and the three original
; low bytes before storing it with align 1.
; The checks for every run line expect byte-wise loads and stores, with
; v_cvt_f32_ubyte0 applied to the byte loaded at offset 3.
2866define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr {
2867; SI-LABEL: cvt_f32_ubyte0_vector:
2868; SI:       ; %bb.0: ; %entry
2869; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
2870; SI-NEXT:    s_mov_b32 s3, 0xf000
2871; SI-NEXT:    s_mov_b32 s2, -1
2872; SI-NEXT:    s_waitcnt lgkmcnt(0)
2873; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:3
2874; SI-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:2
2875; SI-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:1
2876; SI-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0
2877; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
2878; SI-NEXT:    s_waitcnt vmcnt(3)
2879; SI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2880; SI-NEXT:    s_waitcnt lgkmcnt(0)
2881; SI-NEXT:    v_fma_f32 v0, s0, v0, 0.5
2882; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
2883; SI-NEXT:    s_waitcnt vmcnt(2)
2884; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
2885; SI-NEXT:    s_waitcnt vmcnt(2)
2886; SI-NEXT:    buffer_store_byte v2, off, s[0:3], 0
2887; SI-NEXT:    s_waitcnt vmcnt(2)
2888; SI-NEXT:    buffer_store_byte v3, off, s[0:3], 0
2889; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2890; SI-NEXT:    s_endpgm
2891;
2892; VI-LABEL: cvt_f32_ubyte0_vector:
2893; VI:       ; %bb.0: ; %entry
2894; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
2895; VI-NEXT:    s_mov_b32 s3, 0xf000
2896; VI-NEXT:    s_mov_b32 s2, -1
2897; VI-NEXT:    s_waitcnt lgkmcnt(0)
2898; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:3
2899; VI-NEXT:    buffer_load_ubyte v1, off, s[0:3], 0 offset:2
2900; VI-NEXT:    buffer_load_ubyte v2, off, s[0:3], 0 offset:1
2901; VI-NEXT:    buffer_load_ubyte v3, off, s[0:3], 0
2902; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
2903; VI-NEXT:    s_waitcnt vmcnt(3)
2904; VI-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
2905; VI-NEXT:    s_waitcnt lgkmcnt(0)
2906; VI-NEXT:    v_mul_f32_e32 v0, s0, v0
2907; VI-NEXT:    v_add_f32_e32 v0, 0.5, v0
2908; VI-NEXT:    v_cvt_i32_f32_e32 v0, v0
2909; VI-NEXT:    s_waitcnt vmcnt(2)
2910; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0
2911; VI-NEXT:    s_waitcnt vmcnt(2)
2912; VI-NEXT:    buffer_store_byte v2, off, s[0:3], 0
2913; VI-NEXT:    s_waitcnt vmcnt(2)
2914; VI-NEXT:    buffer_store_byte v3, off, s[0:3], 0
2915; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
2916; VI-NEXT:    s_endpgm
2917;
2918; GFX10-LABEL: cvt_f32_ubyte0_vector:
2919; GFX10:       ; %bb.0: ; %entry
2920; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
2921; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2922; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2923; GFX10-NEXT:    s_clause 0x3
2924; GFX10-NEXT:    global_load_ubyte v1, v0, s[0:1] offset:3
2925; GFX10-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:2
2926; GFX10-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:1
2927; GFX10-NEXT:    global_load_ubyte v4, v0, s[0:1]
2928; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2929; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
2930; GFX10-NEXT:    s_waitcnt vmcnt(3)
2931; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v1
2932; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2933; GFX10-NEXT:    v_fma_f32 v0, s0, v0, 0.5
2934; GFX10-NEXT:    v_cvt_i32_f32_e32 v0, v0
2935; GFX10-NEXT:    s_waitcnt vmcnt(2)
2936; GFX10-NEXT:    global_store_byte v[0:1], v2, off
2937; GFX10-NEXT:    s_waitcnt vmcnt(1)
2938; GFX10-NEXT:    global_store_byte v[0:1], v3, off
2939; GFX10-NEXT:    s_waitcnt vmcnt(0)
2940; GFX10-NEXT:    global_store_byte v[0:1], v4, off
2941; GFX10-NEXT:    global_store_byte v[0:1], v0, off
2942; GFX10-NEXT:    s_endpgm
2943;
2944; GFX9-LABEL: cvt_f32_ubyte0_vector:
2945; GFX9:       ; %bb.0: ; %entry
2946; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
2947; GFX9-NEXT:    s_waitcnt vmcnt(0)
2948; GFX9-NEXT:    global_load_ubyte v2, v[0:1], off offset:3
2949; GFX9-NEXT:    global_load_dword v3, v[0:1], off
2950; GFX9-NEXT:    global_load_ubyte v4, v[0:1], off offset:2
2951; GFX9-NEXT:    global_load_ubyte v5, v[0:1], off offset:1
2952; GFX9-NEXT:    global_load_ubyte v6, v[0:1], off
2953; GFX9-NEXT:    s_waitcnt vmcnt(4)
2954; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v2
2955; GFX9-NEXT:    s_waitcnt vmcnt(3)
2956; GFX9-NEXT:    v_fma_f32 v0, v3, v0, 0.5
2957; GFX9-NEXT:    v_cvt_i32_f32_e32 v0, v0
2958; GFX9-NEXT:    s_waitcnt vmcnt(2)
2959; GFX9-NEXT:    global_store_byte v[0:1], v4, off
2960; GFX9-NEXT:    s_waitcnt vmcnt(2)
2961; GFX9-NEXT:    global_store_byte v[0:1], v5, off
2962; GFX9-NEXT:    s_waitcnt vmcnt(2)
2963; GFX9-NEXT:    global_store_byte v[0:1], v6, off
2964; GFX9-NEXT:    global_store_byte v[0:1], v0, off
2965; GFX9-NEXT:    s_endpgm
2966;
2967; GFX11-LABEL: cvt_f32_ubyte0_vector:
2968; GFX11:       ; %bb.0: ; %entry
2969; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
2970; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2971; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2972; GFX11-NEXT:    s_clause 0x3
2973; GFX11-NEXT:    global_load_u8 v1, v0, s[0:1] offset:3
2974; GFX11-NEXT:    global_load_u8 v2, v0, s[0:1] offset:2
2975; GFX11-NEXT:    global_load_u8 v3, v0, s[0:1] offset:1
2976; GFX11-NEXT:    global_load_u8 v0, v0, s[0:1]
2977; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
2978; GFX11-NEXT:    s_waitcnt vmcnt(3)
2979; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, v1
2980; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2981; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2982; GFX11-NEXT:    v_fma_f32 v1, s0, v1, 0.5
2983; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
2984; GFX11-NEXT:    s_waitcnt vmcnt(0)
2985; GFX11-NEXT:    s_clause 0x3
2986; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
2987; GFX11-NEXT:    global_store_b8 v[0:1], v3, off
2988; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
2989; GFX11-NEXT:    global_store_b8 v[0:1], v1, off
2990; GFX11-NEXT:    s_endpgm
2991entry:
2992  br label %for.body.i
2993
2994for.body.i:                                       ; preds = %entry
  ; The base pointer is loaded through an undef address and cast to addrspace(1).
2995  %retval.sroa.0.0.copyload = load ptr, ptr addrspace(1) undef, align 8
2996  %add.ptr = getelementptr inbounds %Vec, ptr %retval.sroa.0.0.copyload, i64 undef
2997  %retval.sroa.0.0..sroa_cast_adr = addrspacecast ptr %add.ptr to ptr addrspace(1)
  ; Whole 32-bit value read with byte alignment only.
2998  %retval.sroa.0.0.copyload.i = load i32, ptr addrspace(1) %retval.sroa.0.0..sroa_cast_adr, align 1
  ; Top byte (bits 31..24) is extracted and converted to float.
2999  %p1.sroa.6.0.extract.shift = lshr i32 %retval.sroa.0.0.copyload.i, 24
3000  %p1.sroa.6.0.extract.trunc = trunc i32 %p1.sroa.6.0.extract.shift to i8
3001  %conv12 = uitofp i8 %p1.sroa.6.0.extract.trunc to float
  ; Scale by a loaded float and add 0.5; both operations carry the contract flag.
3002  %0 = load float, ptr addrspace(1) undef, align 8
3003  %mul = fmul contract float %0, %conv12
3004  %add = fadd contract float %mul, 5.000000e-01
3005  %conv13 = fptoui float %add to i8
  ; The new top byte is reassembled with bytes 2, 1 and 0 of the original value.
3006  %retval.sroa.4.0.insert.ext = zext i8 %conv13 to i32
3007  %retval.sroa.4.0.insert.shift = shl nuw i32 %retval.sroa.4.0.insert.ext, 24
3008  %retval.sroa.3.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 16711680
3009  %retval.sroa.3.0.insert.insert = or i32 %retval.sroa.4.0.insert.shift, %retval.sroa.3.0.insert.ext
3010  %retval.sroa.2.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 65280
3011  %retval.sroa.2.0.insert.insert = or i32 %retval.sroa.3.0.insert.insert, %retval.sroa.2.0.insert.ext
3012  %retval.sroa.0.0.insert.ext = and i32 %retval.sroa.0.0.copyload.i, 255
3013  %retval.sroa.0.0.insert.insert = or i32 %retval.sroa.2.0.insert.insert, %retval.sroa.0.0.insert.ext
3014  store i32 %retval.sroa.0.0.insert.insert, ptr addrspace(1) undef, align 1
3015  ret void
3016}
3017
; Module flag "amdhsa_code_object_version" with value 500. The leading i32 1 is
; the flag's merge behavior (documented as "Error" in the LLVM LangRef).
3018!llvm.module.flags = !{!0}
3019!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
3020