xref: /llvm-project/llvm/test/CodeGen/AMDGPU/half.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,CI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=CIVI,VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GFX11 %s
5
6; half args should be promoted to float for CI and lower.
7
; Kernel under test: stores the scalar half kernel argument %arg, unmodified,
; through the addrspace(1) pointer %out.
; The checks expect a 16-bit store on every target: flat_store_short on CI/VI
; and global_store_b16 on GFX11, with no conversion instruction in between.
8define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
9; CI-LABEL: load_f16_arg:
10; CI:       ; %bb.0:
11; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
12; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
13; CI-NEXT:    s_waitcnt lgkmcnt(0)
14; CI-NEXT:    v_mov_b32_e32 v0, s0
15; CI-NEXT:    v_mov_b32_e32 v1, s1
16; CI-NEXT:    v_mov_b32_e32 v2, s2
17; CI-NEXT:    flat_store_short v[0:1], v2
18; CI-NEXT:    s_endpgm
19;
20; VI-LABEL: load_f16_arg:
21; VI:       ; %bb.0:
22; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
23; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
24; VI-NEXT:    s_waitcnt lgkmcnt(0)
25; VI-NEXT:    v_mov_b32_e32 v0, s0
26; VI-NEXT:    v_mov_b32_e32 v1, s1
27; VI-NEXT:    v_mov_b32_e32 v2, s2
28; VI-NEXT:    flat_store_short v[0:1], v2
29; VI-NEXT:    s_endpgm
30;
31; GFX11-LABEL: load_f16_arg:
32; GFX11:       ; %bb.0:
33; GFX11-NEXT:    s_clause 0x1
34; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
35; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
36; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
37; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
38; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
39; GFX11-NEXT:    s_endpgm
40  store half %arg, ptr addrspace(1) %out
41  ret void
42}
43
; Kernel under test: stores the <2 x half> kernel argument %arg, unmodified,
; through the addrspace(1) pointer %out.
; The checks expect the vector to be loaded as one 32-bit scalar and written
; with a single 32-bit store (flat_store_dword on CI/VI, global_store_b32 on
; GFX11).
44define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 {
45; CI-LABEL: load_v2f16_arg:
46; CI:       ; %bb.0:
47; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
48; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
49; CI-NEXT:    s_waitcnt lgkmcnt(0)
50; CI-NEXT:    v_mov_b32_e32 v0, s0
51; CI-NEXT:    v_mov_b32_e32 v1, s1
52; CI-NEXT:    v_mov_b32_e32 v2, s2
53; CI-NEXT:    flat_store_dword v[0:1], v2
54; CI-NEXT:    s_endpgm
55;
56; VI-LABEL: load_v2f16_arg:
57; VI:       ; %bb.0:
58; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
59; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
60; VI-NEXT:    s_waitcnt lgkmcnt(0)
61; VI-NEXT:    v_mov_b32_e32 v0, s0
62; VI-NEXT:    v_mov_b32_e32 v1, s1
63; VI-NEXT:    v_mov_b32_e32 v2, s2
64; VI-NEXT:    flat_store_dword v[0:1], v2
65; VI-NEXT:    s_endpgm
66;
67; GFX11-LABEL: load_v2f16_arg:
68; GFX11:       ; %bb.0:
69; GFX11-NEXT:    s_clause 0x1
70; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
71; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
72; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
74; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
75; GFX11-NEXT:    s_endpgm
76  store <2 x half> %arg, ptr addrspace(1) %out
77  ret void
78}
79
; Kernel under test: stores the <3 x half> kernel argument %arg, unmodified,
; through the addrspace(1) pointer %out.
; The checks expect one 4-dword kernarg load covering both %out and %arg, and
; the store split in two: a 32-bit store at %out and a 16-bit store at
; %out + 4 (explicit s_add_u32/s_addc_u32 on CI/VI, "offset:4" on GFX11).
; CI and VI share the CIVI prefix because their expected output is identical.
80define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
81; CIVI-LABEL: load_v3f16_arg:
82; CIVI:       ; %bb.0:
83; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
84; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
85; CIVI-NEXT:    s_add_u32 s4, s0, 4
86; CIVI-NEXT:    s_addc_u32 s5, s1, 0
87; CIVI-NEXT:    v_mov_b32_e32 v2, s4
88; CIVI-NEXT:    v_mov_b32_e32 v4, s3
89; CIVI-NEXT:    v_mov_b32_e32 v0, s0
90; CIVI-NEXT:    v_mov_b32_e32 v3, s5
91; CIVI-NEXT:    v_mov_b32_e32 v1, s1
92; CIVI-NEXT:    v_mov_b32_e32 v5, s2
93; CIVI-NEXT:    flat_store_short v[2:3], v4
94; CIVI-NEXT:    flat_store_dword v[0:1], v5
95; CIVI-NEXT:    s_endpgm
96;
97; GFX11-LABEL: load_v3f16_arg:
98; GFX11:       ; %bb.0:
99; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
100; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
102; GFX11-NEXT:    v_mov_b32_e32 v2, s2
103; GFX11-NEXT:    s_clause 0x1
104; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] offset:4
105; GFX11-NEXT:    global_store_b32 v0, v2, s[0:1]
106; GFX11-NEXT:    s_endpgm
107  store <3 x half> %arg, ptr addrspace(1) %out
108  ret void
109}
110
111
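112; FIXME: Why not one load?
; NOTE(review): the checks for load_v4f16_arg now show a single kernarg load
; (s_load_dwordx4 on CI/VI, s_load_b128 on GFX11) covering both %out and %arg,
; so this FIXME may be stale -- confirm before removing it.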
; Kernel under test: stores the <4 x half> kernel argument %arg, unmodified,
; through the addrspace(1) pointer %out.
; The checks expect one 4-dword kernarg load covering both %out (s[0:1]) and
; %arg (s[2:3]), followed by a single 64-bit store (flat_store_dwordx2 on
; CI/VI, global_store_b64 on GFX11).
113define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
114; CIVI-LABEL: load_v4f16_arg:
115; CIVI:       ; %bb.0:
116; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
117; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
118; CIVI-NEXT:    v_mov_b32_e32 v0, s0
119; CIVI-NEXT:    v_mov_b32_e32 v2, s2
120; CIVI-NEXT:    v_mov_b32_e32 v1, s1
121; CIVI-NEXT:    v_mov_b32_e32 v3, s3
122; CIVI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
123; CIVI-NEXT:    s_endpgm
124;
125; GFX11-LABEL: load_v4f16_arg:
126; GFX11:       ; %bb.0:
127; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
128; GFX11-NEXT:    v_mov_b32_e32 v2, 0
129; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
130; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
131; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
132; GFX11-NEXT:    s_endpgm
133  store <4 x half> %arg, ptr addrspace(1) %out
134  ret void
135}
136
; Kernel under test: stores the <8 x half> kernel argument %arg, unmodified,
; through the addrspace(1) pointer %out.
; The checks expect two kernarg loads (a 2-dword load for %out at 0x0 and a
; 4-dword load for %arg at 0x4 on CI / 0x10 on VI and GFX11), followed by a
; single 128-bit store (flat_store_dwordx4 on CI/VI, global_store_b128 on
; GFX11).
137define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 {
138; CI-LABEL: load_v8f16_arg:
139; CI:       ; %bb.0:
140; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
141; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
142; CI-NEXT:    s_waitcnt lgkmcnt(0)
143; CI-NEXT:    v_mov_b32_e32 v4, s4
144; CI-NEXT:    v_mov_b32_e32 v0, s0
145; CI-NEXT:    v_mov_b32_e32 v5, s5
146; CI-NEXT:    v_mov_b32_e32 v1, s1
147; CI-NEXT:    v_mov_b32_e32 v2, s2
148; CI-NEXT:    v_mov_b32_e32 v3, s3
149; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
150; CI-NEXT:    s_endpgm
151;
152; VI-LABEL: load_v8f16_arg:
153; VI:       ; %bb.0:
154; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
155; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
156; VI-NEXT:    s_waitcnt lgkmcnt(0)
157; VI-NEXT:    v_mov_b32_e32 v4, s4
158; VI-NEXT:    v_mov_b32_e32 v0, s0
159; VI-NEXT:    v_mov_b32_e32 v5, s5
160; VI-NEXT:    v_mov_b32_e32 v1, s1
161; VI-NEXT:    v_mov_b32_e32 v2, s2
162; VI-NEXT:    v_mov_b32_e32 v3, s3
163; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
164; VI-NEXT:    s_endpgm
165;
166; GFX11-LABEL: load_v8f16_arg:
167; GFX11:       ; %bb.0:
168; GFX11-NEXT:    s_clause 0x1
169; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
170; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
171; GFX11-NEXT:    v_mov_b32_e32 v4, 0
172; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
174; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
175; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
176; GFX11-NEXT:    s_endpgm
177  store <8 x half> %arg, ptr addrspace(1) %out
178  ret void
179}
180
; Kernel under test: extends the <2 x half> kernel argument %in to
; <2 x float> with fpext and stores the result through %out.
; The checks expect the packed argument to be loaded as one 32-bit scalar, the
; upper element extracted with s_lshr_b32 ... 16, each element converted with
; v_cvt_f32_f16, and the pair written with a single 64-bit store.
181define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %in) #0 {
182; CI-LABEL: extload_v2f16_arg:
183; CI:       ; %bb.0:
184; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
185; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
186; CI-NEXT:    s_waitcnt lgkmcnt(0)
187; CI-NEXT:    s_lshr_b32 s3, s2, 16
188; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
189; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
190; CI-NEXT:    v_mov_b32_e32 v3, s1
191; CI-NEXT:    v_mov_b32_e32 v2, s0
192; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
193; CI-NEXT:    s_endpgm
194;
195; VI-LABEL: extload_v2f16_arg:
196; VI:       ; %bb.0:
197; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
198; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
199; VI-NEXT:    s_waitcnt lgkmcnt(0)
200; VI-NEXT:    s_lshr_b32 s3, s2, 16
201; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
202; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
203; VI-NEXT:    v_mov_b32_e32 v3, s1
204; VI-NEXT:    v_mov_b32_e32 v2, s0
205; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
206; VI-NEXT:    s_endpgm
207;
208; GFX11-LABEL: extload_v2f16_arg:
209; GFX11:       ; %bb.0:
210; GFX11-NEXT:    s_clause 0x1
211; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
212; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
213; GFX11-NEXT:    v_mov_b32_e32 v2, 0
214; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
215; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
216; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
217; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s3
218; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
219; GFX11-NEXT:    s_endpgm
220  %fpext = fpext <2 x half> %in to <2 x float>
221  store <2 x float> %fpext, ptr addrspace(1) %out
222  ret void
223}
224
; Kernel under test: extends the scalar half kernel argument %arg to float
; with fpext and stores the result through %out.
; The checks expect a single v_cvt_f32_f16 reading the loaded scalar directly,
; followed by one 32-bit store (flat_store_dword on CI/VI, global_store_b32 on
; GFX11).
225define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %arg) #0 {
226; CI-LABEL: extload_f16_to_f32_arg:
227; CI:       ; %bb.0:
228; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
229; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
230; CI-NEXT:    s_waitcnt lgkmcnt(0)
231; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
232; CI-NEXT:    v_mov_b32_e32 v0, s0
233; CI-NEXT:    v_mov_b32_e32 v1, s1
234; CI-NEXT:    flat_store_dword v[0:1], v2
235; CI-NEXT:    s_endpgm
236;
237; VI-LABEL: extload_f16_to_f32_arg:
238; VI:       ; %bb.0:
239; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
240; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
241; VI-NEXT:    s_waitcnt lgkmcnt(0)
242; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
243; VI-NEXT:    v_mov_b32_e32 v0, s0
244; VI-NEXT:    v_mov_b32_e32 v1, s1
245; VI-NEXT:    flat_store_dword v[0:1], v2
246; VI-NEXT:    s_endpgm
247;
248; GFX11-LABEL: extload_f16_to_f32_arg:
249; GFX11:       ; %bb.0:
250; GFX11-NEXT:    s_clause 0x1
251; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
252; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
253; GFX11-NEXT:    v_mov_b32_e32 v0, 0
254; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
255; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s2
256; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
257; GFX11-NEXT:    s_endpgm
258  %ext = fpext half %arg to float
259  store float %ext, ptr addrspace(1) %out
260  ret void
261}
262
; Kernel under test: extends the <2 x half> kernel argument %arg to
; <2 x float> with fpext and stores the result through %out.
; The checks expect one 32-bit scalar load of the packed argument, an
; s_lshr_b32 ... 16 to isolate the upper element, two v_cvt_f32_f16
; conversions, and a single 64-bit store.
263define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 {
264; CI-LABEL: extload_v2f16_to_v2f32_arg:
265; CI:       ; %bb.0:
266; CI-NEXT:    s_load_dword s2, s[8:9], 0x2
267; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
268; CI-NEXT:    s_waitcnt lgkmcnt(0)
269; CI-NEXT:    s_lshr_b32 s3, s2, 16
270; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
271; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
272; CI-NEXT:    v_mov_b32_e32 v3, s1
273; CI-NEXT:    v_mov_b32_e32 v2, s0
274; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
275; CI-NEXT:    s_endpgm
276;
277; VI-LABEL: extload_v2f16_to_v2f32_arg:
278; VI:       ; %bb.0:
279; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
280; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
281; VI-NEXT:    s_waitcnt lgkmcnt(0)
282; VI-NEXT:    s_lshr_b32 s3, s2, 16
283; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
284; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
285; VI-NEXT:    v_mov_b32_e32 v3, s1
286; VI-NEXT:    v_mov_b32_e32 v2, s0
287; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
288; VI-NEXT:    s_endpgm
289;
290; GFX11-LABEL: extload_v2f16_to_v2f32_arg:
291; GFX11:       ; %bb.0:
292; GFX11-NEXT:    s_clause 0x1
293; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
294; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
295; GFX11-NEXT:    v_mov_b32_e32 v2, 0
296; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
298; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
299; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s3
300; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
301; GFX11-NEXT:    s_endpgm
302  %ext = fpext <2 x half> %arg to <2 x float>
303  store <2 x float> %ext, ptr addrspace(1) %out
304  ret void
305}
306
; Kernel under test: extends the <3 x half> kernel argument %arg to
; <3 x float> with fpext and stores the result through %out.
; The checks expect one 4-dword kernarg load covering %out and %arg, one
; s_lshr_b32 ... 16 to isolate the second element, three v_cvt_f32_f16
; conversions, and a single 96-bit store (flat_store_dwordx3 on CI/VI,
; global_store_b96 on GFX11). CI and VI differ only in instruction order.
307define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
308; CI-LABEL: extload_v3f16_to_v3f32_arg:
309; CI:       ; %bb.0:
310; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
311; CI-NEXT:    s_waitcnt lgkmcnt(0)
312; CI-NEXT:    s_lshr_b32 s4, s2, 16
313; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
314; CI-NEXT:    v_cvt_f32_f16_e32 v1, s4
315; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
316; CI-NEXT:    v_mov_b32_e32 v4, s1
317; CI-NEXT:    v_mov_b32_e32 v3, s0
318; CI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
319; CI-NEXT:    s_endpgm
320;
321; VI-LABEL: extload_v3f16_to_v3f32_arg:
322; VI:       ; %bb.0:
323; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
324; VI-NEXT:    s_waitcnt lgkmcnt(0)
325; VI-NEXT:    s_lshr_b32 s4, s2, 16
326; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
327; VI-NEXT:    v_cvt_f32_f16_e32 v1, s4
328; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
329; VI-NEXT:    v_mov_b32_e32 v4, s1
330; VI-NEXT:    v_mov_b32_e32 v3, s0
331; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
332; VI-NEXT:    s_endpgm
333;
334; GFX11-LABEL: extload_v3f16_to_v3f32_arg:
335; GFX11:       ; %bb.0:
336; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
337; GFX11-NEXT:    v_mov_b32_e32 v3, 0
338; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
339; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
340; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
341; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
342; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
343; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
344; GFX11-NEXT:    s_endpgm
345  %ext = fpext <3 x half> %arg to <3 x float>
346  store <3 x float> %ext, ptr addrspace(1) %out
347  ret void
348}
349
; Kernel under test: extends the <4 x half> kernel argument %arg to
; <4 x float> with fpext and stores the result through %out.
; The checks expect one 4-dword kernarg load covering %out and %arg, two
; s_lshr_b32 ... 16 shifts to isolate the odd elements, four v_cvt_f32_f16
; conversions, and a single 128-bit store (flat_store_dwordx4 on CI/VI,
; global_store_b128 on GFX11).
350define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
351; CI-LABEL: extload_v4f16_to_v4f32_arg:
352; CI:       ; %bb.0:
353; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
354; CI-NEXT:    s_waitcnt lgkmcnt(0)
355; CI-NEXT:    s_lshr_b32 s4, s3, 16
356; CI-NEXT:    s_lshr_b32 s5, s2, 16
357; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
358; CI-NEXT:    v_cvt_f32_f16_e32 v3, s4
359; CI-NEXT:    v_cvt_f32_f16_e32 v1, s5
360; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
361; CI-NEXT:    v_mov_b32_e32 v5, s1
362; CI-NEXT:    v_mov_b32_e32 v4, s0
363; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
364; CI-NEXT:    s_endpgm
365;
366; VI-LABEL: extload_v4f16_to_v4f32_arg:
367; VI:       ; %bb.0:
368; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
369; VI-NEXT:    s_waitcnt lgkmcnt(0)
370; VI-NEXT:    s_lshr_b32 s4, s3, 16
371; VI-NEXT:    s_lshr_b32 s5, s2, 16
372; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
373; VI-NEXT:    v_cvt_f32_f16_e32 v3, s4
374; VI-NEXT:    v_cvt_f32_f16_e32 v1, s5
375; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
376; VI-NEXT:    v_mov_b32_e32 v5, s1
377; VI-NEXT:    v_mov_b32_e32 v4, s0
378; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
379; VI-NEXT:    s_endpgm
380;
381; GFX11-LABEL: extload_v4f16_to_v4f32_arg:
382; GFX11:       ; %bb.0:
383; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
384; GFX11-NEXT:    v_mov_b32_e32 v4, 0
385; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
386; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
387; GFX11-NEXT:    s_lshr_b32 s5, s2, 16
388; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
389; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s4
390; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s5
391; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
392; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
393; GFX11-NEXT:    s_endpgm
394  %ext = fpext <4 x half> %arg to <4 x float>
395  store <4 x float> %ext, ptr addrspace(1) %out
396  ret void
397}
398
; Kernel under test: extends the <8 x half> kernel argument %arg to
; <8 x float> with fpext and stores the result through %out.
; The checks expect separate kernarg loads for %arg (4 dwords) and %out
; (2 dwords), four s_lshr_b32 ... 16 shifts to isolate the odd elements, eight
; v_cvt_f32_f16 conversions, and the 32-byte result written as two 128-bit
; stores: one at %out + 16 (s_add_u32/s_addc_u32 on CI/VI, "offset:16" on
; GFX11) and one at %out.
399define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 {
400; CI-LABEL: extload_v8f16_to_v8f32_arg:
401; CI:       ; %bb.0:
402; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
403; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
404; CI-NEXT:    s_waitcnt lgkmcnt(0)
405; CI-NEXT:    s_lshr_b32 s6, s1, 16
406; CI-NEXT:    s_lshr_b32 s7, s0, 16
407; CI-NEXT:    s_lshr_b32 s8, s3, 16
408; CI-NEXT:    v_cvt_f32_f16_e32 v3, s6
409; CI-NEXT:    s_lshr_b32 s6, s2, 16
410; CI-NEXT:    v_cvt_f32_f16_e32 v7, s8
411; CI-NEXT:    v_cvt_f32_f16_e32 v5, s6
412; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
413; CI-NEXT:    v_cvt_f32_f16_e32 v6, s3
414; CI-NEXT:    v_cvt_f32_f16_e32 v4, s2
415; CI-NEXT:    s_add_u32 s0, s4, 16
416; CI-NEXT:    v_cvt_f32_f16_e32 v2, s1
417; CI-NEXT:    s_addc_u32 s1, s5, 0
418; CI-NEXT:    v_cvt_f32_f16_e32 v1, s7
419; CI-NEXT:    v_mov_b32_e32 v9, s1
420; CI-NEXT:    v_mov_b32_e32 v8, s0
421; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
422; CI-NEXT:    s_nop 0
423; CI-NEXT:    v_mov_b32_e32 v4, s4
424; CI-NEXT:    v_mov_b32_e32 v5, s5
425; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
426; CI-NEXT:    s_endpgm
427;
428; VI-LABEL: extload_v8f16_to_v8f32_arg:
429; VI:       ; %bb.0:
430; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
431; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
432; VI-NEXT:    s_waitcnt lgkmcnt(0)
433; VI-NEXT:    s_lshr_b32 s6, s1, 16
434; VI-NEXT:    s_lshr_b32 s7, s0, 16
435; VI-NEXT:    s_lshr_b32 s8, s3, 16
436; VI-NEXT:    v_cvt_f32_f16_e32 v3, s6
437; VI-NEXT:    s_lshr_b32 s6, s2, 16
438; VI-NEXT:    v_cvt_f32_f16_e32 v7, s8
439; VI-NEXT:    v_cvt_f32_f16_e32 v5, s6
440; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
441; VI-NEXT:    v_cvt_f32_f16_e32 v6, s3
442; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
443; VI-NEXT:    s_add_u32 s0, s4, 16
444; VI-NEXT:    v_cvt_f32_f16_e32 v2, s1
445; VI-NEXT:    s_addc_u32 s1, s5, 0
446; VI-NEXT:    v_cvt_f32_f16_e32 v1, s7
447; VI-NEXT:    v_mov_b32_e32 v9, s1
448; VI-NEXT:    v_mov_b32_e32 v8, s0
449; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
450; VI-NEXT:    s_nop 0
451; VI-NEXT:    v_mov_b32_e32 v4, s4
452; VI-NEXT:    v_mov_b32_e32 v5, s5
453; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
454; VI-NEXT:    s_endpgm
455;
456; GFX11-LABEL: extload_v8f16_to_v8f32_arg:
457; GFX11:       ; %bb.0:
458; GFX11-NEXT:    s_clause 0x1
459; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
460; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
461; GFX11-NEXT:    v_mov_b32_e32 v8, 0
462; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX11-NEXT:    s_lshr_b32 s8, s3, 16
464; GFX11-NEXT:    s_lshr_b32 s9, s2, 16
465; GFX11-NEXT:    s_lshr_b32 s6, s1, 16
466; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
467; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s3
468; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, s2
469; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, s8
470; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, s9
471; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s1
472; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s0
473; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s6
474; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s7
475; GFX11-NEXT:    s_clause 0x1
476; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[4:5] offset:16
477; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
478; GFX11-NEXT:    s_endpgm
479  %ext = fpext <8 x half> %arg to <8 x float>
480  store <8 x float> %ext, ptr addrspace(1) %out
481  ret void
482}
483
; Kernel under test: extends the scalar half kernel argument %arg to double
; with fpext and stores the result through %out.
; The checks expect the extension to be done in two steps on every target,
; v_cvt_f32_f16 followed by v_cvt_f64_f32, then a single 64-bit store. The
; load of %out is expected after the first conversion, with its own
; s_waitcnt before the store.
484define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %arg) #0 {
485; CI-LABEL: extload_f16_to_f64_arg:
486; CI:       ; %bb.0:
487; CI-NEXT:    s_load_dword s0, s[8:9], 0x2
488; CI-NEXT:    s_waitcnt lgkmcnt(0)
489; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
490; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
491; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
492; CI-NEXT:    s_waitcnt lgkmcnt(0)
493; CI-NEXT:    v_mov_b32_e32 v3, s1
494; CI-NEXT:    v_mov_b32_e32 v2, s0
495; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
496; CI-NEXT:    s_endpgm
497;
498; VI-LABEL: extload_f16_to_f64_arg:
499; VI:       ; %bb.0:
500; VI-NEXT:    s_load_dword s0, s[8:9], 0x8
501; VI-NEXT:    s_waitcnt lgkmcnt(0)
502; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
503; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
504; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
505; VI-NEXT:    s_waitcnt lgkmcnt(0)
506; VI-NEXT:    v_mov_b32_e32 v3, s1
507; VI-NEXT:    v_mov_b32_e32 v2, s0
508; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
509; VI-NEXT:    s_endpgm
510;
511; GFX11-LABEL: extload_f16_to_f64_arg:
512; GFX11:       ; %bb.0:
513; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
514; GFX11-NEXT:    v_mov_b32_e32 v2, 0
515; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
516; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s0
517; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
518; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
519; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
520; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
521; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
522; GFX11-NEXT:    s_endpgm
523  %ext = fpext half %arg to double
524  store double %ext, ptr addrspace(1) %out
525  ret void
526}
527
; Kernel under test: extends the <2 x half> kernel argument %arg to
; <2 x double> with fpext and stores the result through %out.
; The checks expect each element to be widened in two steps (v_cvt_f32_f16,
; then v_cvt_f64_f32), with s_lshr_b32 ... 16 isolating the upper element, and
; the 16-byte result written with a single 128-bit store.
528define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2 x half> %arg) #0 {
529; CI-LABEL: extload_v2f16_to_v2f64_arg:
530; CI:       ; %bb.0:
531; CI-NEXT:    s_load_dword s0, s[8:9], 0x2
532; CI-NEXT:    s_waitcnt lgkmcnt(0)
533; CI-NEXT:    s_lshr_b32 s1, s0, 16
534; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
535; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
536; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
537; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
538; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
539; CI-NEXT:    s_waitcnt lgkmcnt(0)
540; CI-NEXT:    v_mov_b32_e32 v5, s1
541; CI-NEXT:    v_mov_b32_e32 v4, s0
542; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
543; CI-NEXT:    s_endpgm
544;
545; VI-LABEL: extload_v2f16_to_v2f64_arg:
546; VI:       ; %bb.0:
547; VI-NEXT:    s_load_dword s0, s[8:9], 0x8
548; VI-NEXT:    s_waitcnt lgkmcnt(0)
549; VI-NEXT:    s_lshr_b32 s1, s0, 16
550; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
551; VI-NEXT:    v_cvt_f32_f16_e32 v1, s0
552; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
553; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
554; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
555; VI-NEXT:    s_waitcnt lgkmcnt(0)
556; VI-NEXT:    v_mov_b32_e32 v5, s1
557; VI-NEXT:    v_mov_b32_e32 v4, s0
558; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
559; VI-NEXT:    s_endpgm
560;
561; GFX11-LABEL: extload_v2f16_to_v2f64_arg:
562; GFX11:       ; %bb.0:
563; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x8
564; GFX11-NEXT:    v_mov_b32_e32 v4, 0
565; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
567; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s0
568; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s1
569; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
570; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
571; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
572; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
573; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
574; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
575; GFX11-NEXT:    s_endpgm
576  %ext = fpext <2 x half> %arg to <2 x double>
577  store <2 x double> %ext, ptr addrspace(1) %out
578  ret void
579}
580
; Kernel under test: extends the <3 x half> kernel argument %arg to
; <3 x double> with fpext and stores the result through %out.
; The checks expect one 4-dword kernarg load covering %out and %arg, each
; element widened in two steps (v_cvt_f32_f16, then v_cvt_f64_f32), and the
; 24-byte result written as two stores: a 64-bit store of the third element
; at %out + 16 and a 128-bit store of the first two elements at %out.
581define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 x half> %arg) #0 {
582; CI-LABEL: extload_v3f16_to_v3f64_arg:
583; CI:       ; %bb.0:
584; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
585; CI-NEXT:    s_waitcnt lgkmcnt(0)
586; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
587; CI-NEXT:    s_lshr_b32 s4, s2, 16
588; CI-NEXT:    v_cvt_f32_f16_e32 v1, s2
589; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
590; CI-NEXT:    s_add_u32 s2, s0, 16
591; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
592; CI-NEXT:    s_addc_u32 s3, s1, 0
593; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
594; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
595; CI-NEXT:    v_mov_b32_e32 v7, s3
596; CI-NEXT:    v_mov_b32_e32 v6, s2
597; CI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
598; CI-NEXT:    v_mov_b32_e32 v5, s1
599; CI-NEXT:    v_mov_b32_e32 v4, s0
600; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
601; CI-NEXT:    s_endpgm
602;
603; VI-LABEL: extload_v3f16_to_v3f64_arg:
604; VI:       ; %bb.0:
605; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
606; VI-NEXT:    s_waitcnt lgkmcnt(0)
607; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
608; VI-NEXT:    s_lshr_b32 s4, s2, 16
609; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
610; VI-NEXT:    v_cvt_f32_f16_e32 v2, s4
611; VI-NEXT:    s_add_u32 s2, s0, 16
612; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
613; VI-NEXT:    s_addc_u32 s3, s1, 0
614; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
615; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
616; VI-NEXT:    v_mov_b32_e32 v7, s3
617; VI-NEXT:    v_mov_b32_e32 v6, s2
618; VI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
619; VI-NEXT:    v_mov_b32_e32 v5, s1
620; VI-NEXT:    v_mov_b32_e32 v4, s0
621; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
622; VI-NEXT:    s_endpgm
623;
624; GFX11-LABEL: extload_v3f16_to_v3f64_arg:
625; GFX11:       ; %bb.0:
626; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
627; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
628; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
629; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s3
630; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s4
631; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s2
632; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
633; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v0
634; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v1
635; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
636; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v6
637; GFX11-NEXT:    v_mov_b32_e32 v6, 0
638; GFX11-NEXT:    s_clause 0x1
639; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
640; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
641; GFX11-NEXT:    s_endpgm
642  %ext = fpext <3 x half> %arg to <3 x double>
643  store <3 x double> %ext, ptr addrspace(1) %out
644  ret void
645}
646
; Kernel under test: extends the <4 x half> kernel argument %arg to
; <4 x double> with fpext and stores the result through %out.
; The checks expect one 4-dword kernarg load covering %out and %arg, two
; s_lshr_b32 ... 16 shifts to isolate the odd elements, each element widened
; in two steps (v_cvt_f32_f16, then v_cvt_f64_f32), and the 32-byte result
; written as two 128-bit stores: the upper two elements at %out + 16 and the
; lower two at %out. CI and VI differ only in which scratch SGPR (s4/s5)
; holds each shifted value.
647define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 x half> %arg) #0 {
648; CI-LABEL: extload_v4f16_to_v4f64_arg:
649; CI:       ; %bb.0:
650; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
651; CI-NEXT:    s_waitcnt lgkmcnt(0)
652; CI-NEXT:    s_lshr_b32 s4, s3, 16
653; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
654; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
655; CI-NEXT:    s_lshr_b32 s5, s2, 16
656; CI-NEXT:    v_cvt_f32_f16_e32 v4, s2
657; CI-NEXT:    v_cvt_f32_f16_e32 v6, s5
658; CI-NEXT:    s_add_u32 s2, s0, 16
659; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
660; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
661; CI-NEXT:    s_addc_u32 s3, s1, 0
662; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
663; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
664; CI-NEXT:    v_mov_b32_e32 v9, s3
665; CI-NEXT:    v_mov_b32_e32 v8, s2
666; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
667; CI-NEXT:    s_nop 0
668; CI-NEXT:    v_mov_b32_e32 v0, s0
669; CI-NEXT:    v_mov_b32_e32 v1, s1
670; CI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
671; CI-NEXT:    s_endpgm
672;
673; VI-LABEL: extload_v4f16_to_v4f64_arg:
674; VI:       ; %bb.0:
675; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
676; VI-NEXT:    s_waitcnt lgkmcnt(0)
677; VI-NEXT:    s_lshr_b32 s5, s3, 16
678; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
679; VI-NEXT:    v_cvt_f32_f16_e32 v2, s5
680; VI-NEXT:    s_lshr_b32 s4, s2, 16
681; VI-NEXT:    v_cvt_f32_f16_e32 v4, s2
682; VI-NEXT:    v_cvt_f32_f16_e32 v6, s4
683; VI-NEXT:    s_add_u32 s2, s0, 16
684; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
685; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
686; VI-NEXT:    s_addc_u32 s3, s1, 0
687; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
688; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
689; VI-NEXT:    v_mov_b32_e32 v9, s3
690; VI-NEXT:    v_mov_b32_e32 v8, s2
691; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
692; VI-NEXT:    s_nop 0
693; VI-NEXT:    v_mov_b32_e32 v0, s0
694; VI-NEXT:    v_mov_b32_e32 v1, s1
695; VI-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
696; VI-NEXT:    s_endpgm
697;
698; GFX11-LABEL: extload_v4f16_to_v4f64_arg:
699; GFX11:       ; %bb.0:
700; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
701; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX11-NEXT:    s_lshr_b32 s5, s3, 16
703; GFX11-NEXT:    s_lshr_b32 s4, s2, 16
704; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s3
705; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s5
706; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s2
707; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, s4
708; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
709; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v2
710; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
711; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
712; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
713; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
714; GFX11-NEXT:    v_mov_b32_e32 v8, 0
715; GFX11-NEXT:    s_clause 0x1
716; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
717; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
718; GFX11-NEXT:    s_endpgm
719  %ext = fpext <4 x half> %arg to <4 x double>
720  store <4 x double> %ext, ptr addrspace(1) %out
721  ret void
722}
723
724define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8 x half> %arg) #0 {
725; CI-LABEL: extload_v8f16_to_v8f64_arg:
726; CI:       ; %bb.0:
727; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x4
728; CI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
729; CI-NEXT:    s_waitcnt lgkmcnt(0)
730; CI-NEXT:    s_lshr_b32 s6, s3, 16
731; CI-NEXT:    v_cvt_f32_f16_e32 v0, s6
732; CI-NEXT:    v_cvt_f32_f16_e32 v12, s3
733; CI-NEXT:    s_lshr_b32 s7, s2, 16
734; CI-NEXT:    s_lshr_b32 s8, s1, 16
735; CI-NEXT:    s_lshr_b32 s6, s0, 16
736; CI-NEXT:    v_cvt_f32_f16_e32 v1, s7
737; CI-NEXT:    v_cvt_f32_f16_e32 v8, s2
738; CI-NEXT:    v_cvt_f32_f16_e32 v9, s0
739; CI-NEXT:    s_add_u32 s0, s4, 48
740; CI-NEXT:    v_cvt_f32_f16_e32 v5, s1
741; CI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v0
742; CI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
743; CI-NEXT:    s_addc_u32 s1, s5, 0
744; CI-NEXT:    v_cvt_f32_f16_e32 v4, s8
745; CI-NEXT:    v_mov_b32_e32 v17, s1
746; CI-NEXT:    v_mov_b32_e32 v16, s0
747; CI-NEXT:    s_add_u32 s0, s4, 32
748; CI-NEXT:    v_cvt_f32_f16_e32 v2, s6
749; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v1
750; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
751; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
752; CI-NEXT:    s_addc_u32 s1, s5, 0
753; CI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
754; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v4
755; CI-NEXT:    v_mov_b32_e32 v13, s1
756; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v5
757; CI-NEXT:    v_mov_b32_e32 v12, s0
758; CI-NEXT:    s_add_u32 s0, s4, 16
759; CI-NEXT:    s_addc_u32 s1, s5, 0
760; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
761; CI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
762; CI-NEXT:    s_nop 0
763; CI-NEXT:    v_mov_b32_e32 v9, s1
764; CI-NEXT:    v_mov_b32_e32 v8, s0
765; CI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
766; CI-NEXT:    s_nop 0
767; CI-NEXT:    v_mov_b32_e32 v4, s4
768; CI-NEXT:    v_mov_b32_e32 v5, s5
769; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
770; CI-NEXT:    s_endpgm
771;
772; VI-LABEL: extload_v8f16_to_v8f64_arg:
773; VI:       ; %bb.0:
774; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x10
775; VI-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
776; VI-NEXT:    s_waitcnt lgkmcnt(0)
777; VI-NEXT:    s_lshr_b32 s6, s0, 16
778; VI-NEXT:    s_lshr_b32 s8, s2, 16
779; VI-NEXT:    s_lshr_b32 s9, s3, 16
780; VI-NEXT:    v_cvt_f32_f16_e32 v0, s6
781; VI-NEXT:    v_cvt_f32_f16_e32 v4, s8
782; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
783; VI-NEXT:    v_cvt_f32_f16_e32 v12, s3
784; VI-NEXT:    s_lshr_b32 s7, s1, 16
785; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v0
786; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
787; VI-NEXT:    v_cvt_f32_f16_e32 v8, s2
788; VI-NEXT:    s_add_u32 s0, s4, 48
789; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v4
790; VI-NEXT:    v_cvt_f64_f32_e32 v[14:15], v5
791; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
792; VI-NEXT:    v_cvt_f64_f32_e32 v[12:13], v12
793; VI-NEXT:    s_addc_u32 s1, s5, 0
794; VI-NEXT:    v_cvt_f32_f16_e32 v1, s7
795; VI-NEXT:    v_mov_b32_e32 v17, s1
796; VI-NEXT:    v_mov_b32_e32 v16, s0
797; VI-NEXT:    s_add_u32 s0, s4, 32
798; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
799; VI-NEXT:    s_addc_u32 s1, s5, 0
800; VI-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
801; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v1
802; VI-NEXT:    v_mov_b32_e32 v13, s1
803; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
804; VI-NEXT:    v_mov_b32_e32 v12, s0
805; VI-NEXT:    s_add_u32 s0, s4, 16
806; VI-NEXT:    s_addc_u32 s1, s5, 0
807; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
808; VI-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
809; VI-NEXT:    s_nop 0
810; VI-NEXT:    v_mov_b32_e32 v9, s1
811; VI-NEXT:    v_mov_b32_e32 v8, s0
812; VI-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
813; VI-NEXT:    s_nop 0
814; VI-NEXT:    v_mov_b32_e32 v4, s4
815; VI-NEXT:    v_mov_b32_e32 v5, s5
816; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
817; VI-NEXT:    s_endpgm
818;
819; GFX11-LABEL: extload_v8f16_to_v8f64_arg:
820; GFX11:       ; %bb.0:
821; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x10
822; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX11-NEXT:    s_lshr_b32 s9, s3, 16
824; GFX11-NEXT:    s_lshr_b32 s8, s2, 16
825; GFX11-NEXT:    s_lshr_b32 s7, s1, 16
826; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, s3
827; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, s9
828; GFX11-NEXT:    s_lshr_b32 s6, s0, 16
829; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s2
830; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, s8
831; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s1
832; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, s7
833; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s0
834; GFX11-NEXT:    v_cvt_f32_f16_e32 v16, s6
835; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v6
836; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v11
837; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
838; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
839; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v2
840; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
841; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
842; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
843; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
844; GFX11-NEXT:    v_mov_b32_e32 v16, 0
845; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
846; GFX11-NEXT:    s_clause 0x3
847; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
848; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
849; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
850; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
851; GFX11-NEXT:    s_endpgm
852  %ext = fpext <8 x half> %arg to <8 x double>
853  store <8 x double> %ext, ptr addrspace(1) %out
854  ret void
855}
856
; Copy a single half from %in to %out with no conversion. Every target is
; expected to use one 16-bit load (flat_load_ushort / global_load_u16) and one
; 16-bit store (flat_store_short / global_store_b16). The kaveri and tonga runs
; share the CIVI check prefix for this function.
857define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
858; CIVI-LABEL: global_load_store_f16:
859; CIVI:       ; %bb.0:
860; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
861; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
862; CIVI-NEXT:    v_mov_b32_e32 v0, s2
863; CIVI-NEXT:    v_mov_b32_e32 v1, s3
864; CIVI-NEXT:    flat_load_ushort v2, v[0:1]
865; CIVI-NEXT:    v_mov_b32_e32 v0, s0
866; CIVI-NEXT:    v_mov_b32_e32 v1, s1
867; CIVI-NEXT:    s_waitcnt vmcnt(0)
868; CIVI-NEXT:    flat_store_short v[0:1], v2
869; CIVI-NEXT:    s_endpgm
870;
871; GFX11-LABEL: global_load_store_f16:
872; GFX11:       ; %bb.0:
873; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
874; GFX11-NEXT:    v_mov_b32_e32 v0, 0
875; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
876; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
877; GFX11-NEXT:    s_waitcnt vmcnt(0)
878; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
879; GFX11-NEXT:    s_endpgm
880  %val = load half, ptr addrspace(1) %in
881  store half %val, ptr addrspace(1) %out
882  ret void
883}
884
; Copy a <2 x half> from %in to %out with no conversion. The two halves are
; expected to travel together as one 32-bit value: a single dword load
; (flat_load_dword / global_load_b32) followed by a single dword store.
885define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
886; CIVI-LABEL: global_load_store_v2f16:
887; CIVI:       ; %bb.0:
888; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
889; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
890; CIVI-NEXT:    v_mov_b32_e32 v0, s2
891; CIVI-NEXT:    v_mov_b32_e32 v1, s3
892; CIVI-NEXT:    flat_load_dword v2, v[0:1]
893; CIVI-NEXT:    v_mov_b32_e32 v0, s0
894; CIVI-NEXT:    v_mov_b32_e32 v1, s1
895; CIVI-NEXT:    s_waitcnt vmcnt(0)
896; CIVI-NEXT:    flat_store_dword v[0:1], v2
897; CIVI-NEXT:    s_endpgm
898;
899; GFX11-LABEL: global_load_store_v2f16:
900; GFX11:       ; %bb.0:
901; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
902; GFX11-NEXT:    v_mov_b32_e32 v0, 0
903; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
905; GFX11-NEXT:    s_waitcnt vmcnt(0)
906; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
907; GFX11-NEXT:    s_endpgm
908  %val = load <2 x half>, ptr addrspace(1) %in
909  store <2 x half> %val, ptr addrspace(1) %out
910  ret void
911}
912
; Copy a <4 x half> from %in to %out with no conversion, expected as one
; 64-bit load (flat_load_dwordx2 / global_load_b64) and one 64-bit store.
; Unlike the neighbouring load/store tests, this kernel declares its arguments
; in (%in, %out) order, so the checks load through s[0:1] and store through
; s[2:3] rather than the other way round.
913define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
914; CIVI-LABEL: global_load_store_v4f16:
915; CIVI:       ; %bb.0:
916; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
917; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
918; CIVI-NEXT:    v_mov_b32_e32 v0, s0
919; CIVI-NEXT:    v_mov_b32_e32 v1, s1
920; CIVI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
921; CIVI-NEXT:    v_mov_b32_e32 v2, s2
922; CIVI-NEXT:    v_mov_b32_e32 v3, s3
923; CIVI-NEXT:    s_waitcnt vmcnt(0)
924; CIVI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
925; CIVI-NEXT:    s_endpgm
926;
927; GFX11-LABEL: global_load_store_v4f16:
928; GFX11:       ; %bb.0:
929; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
930; GFX11-NEXT:    v_mov_b32_e32 v2, 0
931; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
932; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[0:1]
933; GFX11-NEXT:    s_waitcnt vmcnt(0)
934; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
935; GFX11-NEXT:    s_endpgm
936  %val = load <4 x half>, ptr addrspace(1) %in
937  store <4 x half> %val, ptr addrspace(1) %out
938  ret void
939}
940
; Copy an <8 x half> from %in to %out with no conversion, expected as one
; 128-bit load (flat_load_dwordx4 / global_load_b128) and one 128-bit store.
941define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
942; CIVI-LABEL: global_load_store_v8f16:
943; CIVI:       ; %bb.0:
944; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
945; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
946; CIVI-NEXT:    v_mov_b32_e32 v0, s2
947; CIVI-NEXT:    v_mov_b32_e32 v1, s3
948; CIVI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
949; CIVI-NEXT:    v_mov_b32_e32 v4, s0
950; CIVI-NEXT:    v_mov_b32_e32 v5, s1
951; CIVI-NEXT:    s_waitcnt vmcnt(0)
952; CIVI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
953; CIVI-NEXT:    s_endpgm
954;
955; GFX11-LABEL: global_load_store_v8f16:
956; GFX11:       ; %bb.0:
957; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
958; GFX11-NEXT:    v_mov_b32_e32 v4, 0
959; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
960; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
961; GFX11-NEXT:    s_waitcnt vmcnt(0)
962; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
963; GFX11-NEXT:    s_endpgm
964  %val = load <8 x half>, ptr addrspace(1) %in
965  store <8 x half> %val, ptr addrspace(1) %out
966  ret void
967}
968
; Load a half from %in, extend it to float, and store the float to %out.
; Expected code on every target: a 16-bit load, a single v_cvt_f32_f16, and a
; 32-bit store.
969define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
970; CIVI-LABEL: global_extload_f16_to_f32:
971; CIVI:       ; %bb.0:
972; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
973; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
974; CIVI-NEXT:    v_mov_b32_e32 v0, s2
975; CIVI-NEXT:    v_mov_b32_e32 v1, s3
976; CIVI-NEXT:    flat_load_ushort v0, v[0:1]
977; CIVI-NEXT:    v_mov_b32_e32 v1, s1
978; CIVI-NEXT:    s_waitcnt vmcnt(0)
979; CIVI-NEXT:    v_cvt_f32_f16_e32 v2, v0
980; CIVI-NEXT:    v_mov_b32_e32 v0, s0
981; CIVI-NEXT:    flat_store_dword v[0:1], v2
982; CIVI-NEXT:    s_endpgm
983;
984; GFX11-LABEL: global_extload_f16_to_f32:
985; GFX11:       ; %bb.0:
986; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
987; GFX11-NEXT:    v_mov_b32_e32 v0, 0
988; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
990; GFX11-NEXT:    s_waitcnt vmcnt(0)
991; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
992; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
993; GFX11-NEXT:    s_endpgm
994  %val = load half, ptr addrspace(1) %in
995  %cvt = fpext half %val to float
996  store float %cvt, ptr addrspace(1) %out
997  ret void
998}
999
; Load a <2 x half> from %in, extend each element to float, and store the
; <2 x float> to %out. Both halves arrive in one dword load; the low half is
; converted directly. For the high half the targets differ: the CI and GFX11
; checks expect a 16-bit right shift (v_lshrrev_b32) before the conversion,
; while the VI checks expect a single SDWA conversion that selects WORD_1 of
; the source and needs no shift.
1000define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1001; CI-LABEL: global_extload_v2f16_to_v2f32:
1002; CI:       ; %bb.0:
1003; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1004; CI-NEXT:    s_waitcnt lgkmcnt(0)
1005; CI-NEXT:    v_mov_b32_e32 v0, s2
1006; CI-NEXT:    v_mov_b32_e32 v1, s3
1007; CI-NEXT:    flat_load_dword v1, v[0:1]
1008; CI-NEXT:    v_mov_b32_e32 v2, s0
1009; CI-NEXT:    v_mov_b32_e32 v3, s1
1010; CI-NEXT:    s_waitcnt vmcnt(0)
1011; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1012; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1013; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1014; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1015; CI-NEXT:    s_endpgm
1016;
1017; VI-LABEL: global_extload_v2f16_to_v2f32:
1018; VI:       ; %bb.0:
1019; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1020; VI-NEXT:    s_waitcnt lgkmcnt(0)
1021; VI-NEXT:    v_mov_b32_e32 v0, s2
1022; VI-NEXT:    v_mov_b32_e32 v1, s3
1023; VI-NEXT:    flat_load_dword v1, v[0:1]
1024; VI-NEXT:    v_mov_b32_e32 v2, s0
1025; VI-NEXT:    v_mov_b32_e32 v3, s1
1026; VI-NEXT:    s_waitcnt vmcnt(0)
1027; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1028; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1029; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1030; VI-NEXT:    s_endpgm
1031;
1032; GFX11-LABEL: global_extload_v2f16_to_v2f32:
1033; GFX11:       ; %bb.0:
1034; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1035; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1036; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1037; GFX11-NEXT:    global_load_b32 v0, v2, s[2:3]
1038; GFX11-NEXT:    s_waitcnt vmcnt(0)
1039; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1040; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1041; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1042; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v1
1043; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1044; GFX11-NEXT:    s_endpgm
1045  %val = load <2 x half>, ptr addrspace(1) %in
1046  %cvt = fpext <2 x half> %val to <2 x float>
1047  store <2 x float> %cvt, ptr addrspace(1) %out
1048  ret void
1049}
1050
; Load a <3 x half> from %in, extend each element to float, and store the
; <3 x float> to %out. The checks expect the three halves to be fetched with
; one 64-bit load, converted with three v_cvt_f32_f16 instructions, and
; written with one 96-bit store (flat_store_dwordx3 / global_store_b96).
; Element 1 sits in the high half of the first dword: the CI and GFX11 checks
; shift it down first, the VI checks use an SDWA conversion selecting WORD_1.
1051define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1052; CI-LABEL: global_extload_v3f16_to_v3f32:
1053; CI:       ; %bb.0:
1054; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1055; CI-NEXT:    s_waitcnt lgkmcnt(0)
1056; CI-NEXT:    v_mov_b32_e32 v0, s2
1057; CI-NEXT:    v_mov_b32_e32 v1, s3
1058; CI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
1059; CI-NEXT:    v_mov_b32_e32 v3, s0
1060; CI-NEXT:    v_mov_b32_e32 v4, s1
1061; CI-NEXT:    s_waitcnt vmcnt(0)
1062; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1063; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1064; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1065; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1066; CI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1067; CI-NEXT:    s_endpgm
1068;
1069; VI-LABEL: global_extload_v3f16_to_v3f32:
1070; VI:       ; %bb.0:
1071; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1072; VI-NEXT:    s_waitcnt lgkmcnt(0)
1073; VI-NEXT:    v_mov_b32_e32 v0, s2
1074; VI-NEXT:    v_mov_b32_e32 v1, s3
1075; VI-NEXT:    flat_load_dwordx2 v[1:2], v[0:1]
1076; VI-NEXT:    v_mov_b32_e32 v3, s0
1077; VI-NEXT:    v_mov_b32_e32 v4, s1
1078; VI-NEXT:    s_waitcnt vmcnt(0)
1079; VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
1080; VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1081; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1082; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1083; VI-NEXT:    s_endpgm
1084;
1085; GFX11-LABEL: global_extload_v3f16_to_v3f32:
1086; GFX11:       ; %bb.0:
1087; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1088; GFX11-NEXT:    v_mov_b32_e32 v3, 0
1089; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1090; GFX11-NEXT:    global_load_b64 v[0:1], v3, s[2:3]
1091; GFX11-NEXT:    s_waitcnt vmcnt(0)
1092; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
1093; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1094; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
1095; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1096; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v4
1097; GFX11-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
1098; GFX11-NEXT:    s_endpgm
1099  %val = load <3 x half>, ptr addrspace(1) %in
1100  %cvt = fpext <3 x half> %val to <3 x float>
1101  store <3 x float> %cvt, ptr addrspace(1) %out
1102  ret void
1103}
1104
; Load a <4 x half> from %in, extend each element to float, and store the
; <4 x float> to %out. The checks expect one 64-bit load, four v_cvt_f32_f16
; conversions, and one 128-bit store. The two odd elements live in the high
; halves of the loaded dwords: the CI and GFX11 checks extract them with
; v_lshrrev_b32 by 16, while the VI checks fold the extraction into SDWA
; conversions that select WORD_1.
1105define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1106; CI-LABEL: global_extload_v4f16_to_v4f32:
1107; CI:       ; %bb.0:
1108; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1109; CI-NEXT:    s_waitcnt lgkmcnt(0)
1110; CI-NEXT:    v_mov_b32_e32 v0, s2
1111; CI-NEXT:    v_mov_b32_e32 v1, s3
1112; CI-NEXT:    flat_load_dwordx2 v[3:4], v[0:1]
1113; CI-NEXT:    v_mov_b32_e32 v5, s1
1114; CI-NEXT:    s_waitcnt vmcnt(0)
1115; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
1116; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
1117; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
1118; CI-NEXT:    v_cvt_f32_f16_e32 v0, v3
1119; CI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1120; CI-NEXT:    v_cvt_f32_f16_e32 v1, v4
1121; CI-NEXT:    v_mov_b32_e32 v4, s0
1122; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1123; CI-NEXT:    s_endpgm
1124;
1125; VI-LABEL: global_extload_v4f16_to_v4f32:
1126; VI:       ; %bb.0:
1127; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1128; VI-NEXT:    s_waitcnt lgkmcnt(0)
1129; VI-NEXT:    v_mov_b32_e32 v0, s2
1130; VI-NEXT:    v_mov_b32_e32 v1, s3
1131; VI-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
1132; VI-NEXT:    s_waitcnt vmcnt(0)
1133; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
1134; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
1135; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1136; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1137; VI-NEXT:    v_mov_b32_e32 v4, s0
1138; VI-NEXT:    v_mov_b32_e32 v5, s1
1139; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1140; VI-NEXT:    s_endpgm
1141;
1142; GFX11-LABEL: global_extload_v4f16_to_v4f32:
1143; GFX11:       ; %bb.0:
1144; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1145; GFX11-NEXT:    v_mov_b32_e32 v4, 0
1146; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1147; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
1148; GFX11-NEXT:    s_waitcnt vmcnt(0)
1149; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
1150; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1151; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1152; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
1153; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1154; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
1155; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v5
1156; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1157; GFX11-NEXT:    s_endpgm
1158  %val = load <4 x half>, ptr addrspace(1) %in
1159  %cvt = fpext <4 x half> %val to <4 x float>
1160  store <4 x float> %cvt, ptr addrspace(1) %out
1161  ret void
1162}
1163
; Load an <8 x half> from %in, extend each element to float, and store the
; <8 x float> to %out. The checks expect one 128-bit load, eight
; v_cvt_f32_f16 conversions, and two 128-bit stores (the upper four floats at
; byte offset 16, the lower four at offset 0). The CI and VI checks compute
; the +16 store address with s_add_u32/s_addc_u32, whereas the GFX11 checks
; use an immediate offset:16 on the store. High halves are extracted by shift
; in the CI and GFX11 checks and by SDWA WORD_1 selection in the VI checks.
1164define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1165; CI-LABEL: global_extload_v8f16_to_v8f32:
1166; CI:       ; %bb.0:
1167; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1168; CI-NEXT:    s_waitcnt lgkmcnt(0)
1169; CI-NEXT:    v_mov_b32_e32 v0, s2
1170; CI-NEXT:    v_mov_b32_e32 v1, s3
1171; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1172; CI-NEXT:    s_add_u32 s2, s0, 16
1173; CI-NEXT:    s_addc_u32 s3, s1, 0
1174; CI-NEXT:    v_mov_b32_e32 v13, s1
1175; CI-NEXT:    v_mov_b32_e32 v12, s0
1176; CI-NEXT:    s_waitcnt vmcnt(0)
1177; CI-NEXT:    v_cvt_f32_f16_e32 v10, v3
1178; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1179; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1180; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1181; CI-NEXT:    v_cvt_f32_f16_e32 v6, v1
1182; CI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1183; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1184; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1185; CI-NEXT:    v_cvt_f32_f16_e32 v11, v3
1186; CI-NEXT:    v_cvt_f32_f16_e32 v9, v2
1187; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
1188; CI-NEXT:    v_cvt_f32_f16_e32 v5, v0
1189; CI-NEXT:    v_mov_b32_e32 v0, s2
1190; CI-NEXT:    v_mov_b32_e32 v1, s3
1191; CI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
1192; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1193; CI-NEXT:    s_endpgm
1194;
1195; VI-LABEL: global_extload_v8f16_to_v8f32:
1196; VI:       ; %bb.0:
1197; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1198; VI-NEXT:    s_waitcnt lgkmcnt(0)
1199; VI-NEXT:    v_mov_b32_e32 v0, s2
1200; VI-NEXT:    v_mov_b32_e32 v1, s3
1201; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1202; VI-NEXT:    s_add_u32 s2, s0, 16
1203; VI-NEXT:    s_addc_u32 s3, s1, 0
1204; VI-NEXT:    v_mov_b32_e32 v13, s1
1205; VI-NEXT:    v_mov_b32_e32 v12, s0
1206; VI-NEXT:    s_waitcnt vmcnt(0)
1207; VI-NEXT:    v_cvt_f32_f16_e32 v10, v3
1208; VI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1209; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1210; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1211; VI-NEXT:    v_cvt_f32_f16_e32 v6, v1
1212; VI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1213; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1214; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1215; VI-NEXT:    v_mov_b32_e32 v0, s2
1216; VI-NEXT:    v_mov_b32_e32 v1, s3
1217; VI-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
1218; VI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1219; VI-NEXT:    s_endpgm
1220;
1221; GFX11-LABEL: global_extload_v8f16_to_v8f32:
1222; GFX11:       ; %bb.0:
1223; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1224; GFX11-NEXT:    v_mov_b32_e32 v12, 0
1225; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1226; GFX11-NEXT:    global_load_b128 v[0:3], v12, s[2:3]
1227; GFX11-NEXT:    s_waitcnt vmcnt(0)
1228; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
1229; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
1230; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v1
1231; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v0
1232; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1233; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1234; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v3
1235; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v2
1236; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v5
1237; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v9
1238; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v1
1239; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v0
1240; GFX11-NEXT:    s_clause 0x1
1241; GFX11-NEXT:    global_store_b128 v12, v[8:11], s[0:1] offset:16
1242; GFX11-NEXT:    global_store_b128 v12, v[4:7], s[0:1]
1243; GFX11-NEXT:    s_endpgm
1244  %val = load <8 x half>, ptr addrspace(1) %in
1245  %cvt = fpext <8 x half> %val to <8 x float>
1246  store <8 x float> %cvt, ptr addrspace(1) %out
1247  ret void
1248}
1249
; Load a <16 x half> from %in, extend each element to float, and store the
; <16 x float> to %out. The checks expect the input to be fetched with two
; 128-bit loads (base and base+16), every element to be converted with
; v_cvt_f32_f16, and the result to be written with four 128-bit stores at
; byte offsets 0, 16, 32 and 48. The CI and VI checks materialise each
; load/store address with s_add_u32/s_addc_u32; the GFX11 checks use
; immediate offsets on the global load/store instructions instead. High
; halves are extracted by v_lshrrev_b32 in the CI and GFX11 checks and by
; SDWA WORD_1 selection in the VI checks.
1250define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1251; CI-LABEL: global_extload_v16f16_to_v16f32:
1252; CI:       ; %bb.0:
1253; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1254; CI-NEXT:    s_waitcnt lgkmcnt(0)
1255; CI-NEXT:    s_add_u32 s4, s2, 16
1256; CI-NEXT:    v_mov_b32_e32 v5, s3
1257; CI-NEXT:    s_addc_u32 s5, s3, 0
1258; CI-NEXT:    v_mov_b32_e32 v0, s4
1259; CI-NEXT:    v_mov_b32_e32 v4, s2
1260; CI-NEXT:    v_mov_b32_e32 v1, s5
1261; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1262; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1263; CI-NEXT:    s_add_u32 s2, s0, 16
1264; CI-NEXT:    s_addc_u32 s3, s1, 0
1265; CI-NEXT:    v_mov_b32_e32 v14, s3
1266; CI-NEXT:    v_mov_b32_e32 v13, s2
1267; CI-NEXT:    s_add_u32 s2, s0, 48
1268; CI-NEXT:    s_addc_u32 s3, s1, 0
1269; CI-NEXT:    s_waitcnt vmcnt(1)
1270; CI-NEXT:    v_cvt_f32_f16_e32 v8, v1
1271; CI-NEXT:    s_waitcnt vmcnt(0)
1272; CI-NEXT:    v_cvt_f32_f16_e32 v11, v7
1273; CI-NEXT:    v_cvt_f32_f16_e32 v9, v6
1274; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1275; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1276; CI-NEXT:    v_cvt_f32_f16_e32 v12, v7
1277; CI-NEXT:    v_cvt_f32_f16_e32 v10, v6
1278; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1279; CI-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
1280; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v4
1281; CI-NEXT:    flat_store_dwordx4 v[13:14], v[9:12]
1282; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
1283; CI-NEXT:    v_cvt_f32_f16_e32 v12, v3
1284; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1285; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
1286; CI-NEXT:    v_cvt_f32_f16_e32 v10, v2
1287; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
1288; CI-NEXT:    v_cvt_f32_f16_e32 v2, v5
1289; CI-NEXT:    v_cvt_f32_f16_e32 v0, v4
1290; CI-NEXT:    v_mov_b32_e32 v5, s1
1291; CI-NEXT:    v_cvt_f32_f16_e32 v9, v1
1292; CI-NEXT:    v_cvt_f32_f16_e32 v13, v3
1293; CI-NEXT:    v_cvt_f32_f16_e32 v3, v16
1294; CI-NEXT:    v_cvt_f32_f16_e32 v1, v17
1295; CI-NEXT:    v_mov_b32_e32 v4, s0
1296; CI-NEXT:    s_add_u32 s0, s0, 32
1297; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
1298; CI-NEXT:    s_addc_u32 s1, s1, 0
1299; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1300; CI-NEXT:    v_mov_b32_e32 v15, s3
1301; CI-NEXT:    v_mov_b32_e32 v17, s1
1302; CI-NEXT:    v_mov_b32_e32 v14, s2
1303; CI-NEXT:    v_mov_b32_e32 v16, s0
1304; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1305; CI-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
1306; CI-NEXT:    flat_store_dwordx4 v[16:17], v[6:9]
1307; CI-NEXT:    s_endpgm
1308;
1309; VI-LABEL: global_extload_v16f16_to_v16f32:
1310; VI:       ; %bb.0:
1311; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1312; VI-NEXT:    s_waitcnt lgkmcnt(0)
1313; VI-NEXT:    v_mov_b32_e32 v0, s2
1314; VI-NEXT:    v_mov_b32_e32 v1, s3
1315; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1316; VI-NEXT:    s_add_u32 s2, s2, 16
1317; VI-NEXT:    s_addc_u32 s3, s3, 0
1318; VI-NEXT:    v_mov_b32_e32 v5, s3
1319; VI-NEXT:    v_mov_b32_e32 v4, s2
1320; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1321; VI-NEXT:    s_add_u32 s2, s0, 16
1322; VI-NEXT:    s_addc_u32 s3, s1, 0
1323; VI-NEXT:    v_mov_b32_e32 v19, s3
1324; VI-NEXT:    v_mov_b32_e32 v18, s2
1325; VI-NEXT:    s_add_u32 s2, s0, 48
1326; VI-NEXT:    v_mov_b32_e32 v17, s1
1327; VI-NEXT:    s_addc_u32 s3, s1, 0
1328; VI-NEXT:    v_mov_b32_e32 v16, s0
1329; VI-NEXT:    s_add_u32 s0, s0, 32
1330; VI-NEXT:    s_addc_u32 s1, s1, 0
1331; VI-NEXT:    v_mov_b32_e32 v21, s3
1332; VI-NEXT:    v_mov_b32_e32 v20, s2
1333; VI-NEXT:    s_waitcnt vmcnt(1)
1334; VI-NEXT:    v_cvt_f32_f16_e32 v14, v3
1335; VI-NEXT:    v_cvt_f32_f16_e32 v12, v2
1336; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1337; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1338; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1339; VI-NEXT:    v_cvt_f32_f16_e32 v8, v0
1340; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1341; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1342; VI-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
1343; VI-NEXT:    s_waitcnt vmcnt(1)
1344; VI-NEXT:    v_cvt_f32_f16_e32 v2, v5
1345; VI-NEXT:    v_cvt_f32_f16_e32 v14, v7
1346; VI-NEXT:    v_cvt_f32_f16_e32 v12, v6
1347; VI-NEXT:    v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1348; VI-NEXT:    v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1349; VI-NEXT:    v_cvt_f32_f16_e32 v0, v4
1350; VI-NEXT:    v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1351; VI-NEXT:    v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1352; VI-NEXT:    v_mov_b32_e32 v5, s1
1353; VI-NEXT:    v_mov_b32_e32 v4, s0
1354; VI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
1355; VI-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
1356; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1357; VI-NEXT:    s_endpgm
1358;
1359; GFX11-LABEL: global_extload_v16f16_to_v16f32:
1360; GFX11:       ; %bb.0:
1361; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1362; GFX11-NEXT:    v_mov_b32_e32 v20, 0
1363; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1364; GFX11-NEXT:    s_clause 0x1
1365; GFX11-NEXT:    global_load_b128 v[0:3], v20, s[2:3]
1366; GFX11-NEXT:    global_load_b128 v[4:7], v20, s[2:3] offset:16
1367; GFX11-NEXT:    s_waitcnt vmcnt(1)
1368; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v1
1369; GFX11-NEXT:    s_waitcnt vmcnt(0)
1370; GFX11-NEXT:    v_cvt_f32_f16_e32 v18, v7
1371; GFX11-NEXT:    v_cvt_f32_f16_e32 v16, v6
1372; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1373; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1374; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v0
1375; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1376; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
1377; GFX11-NEXT:    v_cvt_f32_f16_e32 v14, v3
1378; GFX11-NEXT:    v_cvt_f32_f16_e32 v12, v2
1379; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1380; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
1381; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v5
1382; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v4
1383; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
1384; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
1385; GFX11-NEXT:    v_cvt_f32_f16_e32 v19, v7
1386; GFX11-NEXT:    v_cvt_f32_f16_e32 v17, v6
1387; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v1
1388; GFX11-NEXT:    v_cvt_f32_f16_e32 v15, v3
1389; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v5
1390; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v4
1391; GFX11-NEXT:    v_cvt_f32_f16_e32 v13, v13
1392; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v9
1393; GFX11-NEXT:    s_clause 0x3
1394; GFX11-NEXT:    global_store_b128 v20, v[16:19], s[0:1] offset:48
1395; GFX11-NEXT:    global_store_b128 v20, v[0:3], s[0:1] offset:32
1396; GFX11-NEXT:    global_store_b128 v20, v[12:15], s[0:1] offset:16
1397; GFX11-NEXT:    global_store_b128 v20, v[8:11], s[0:1]
1398; GFX11-NEXT:    s_endpgm
1399  %val = load <16 x half>, ptr addrspace(1) %in
1400  %cvt = fpext <16 x half> %val to <16 x float>
1401  store <16 x float> %cvt, ptr addrspace(1) %out
1402  ret void
1403}
1404
; Load a half from %in, extend it to double, and store the double to %out.
; The checks expect the extension to be done in two steps on every target:
; v_cvt_f32_f16 (half to float) followed by v_cvt_f64_f32 (float to double),
; bracketed by a 16-bit load and a 64-bit store.
1405define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1406; CIVI-LABEL: global_extload_f16_to_f64:
1407; CIVI:       ; %bb.0:
1408; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1409; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
1410; CIVI-NEXT:    v_mov_b32_e32 v0, s2
1411; CIVI-NEXT:    v_mov_b32_e32 v1, s3
1412; CIVI-NEXT:    flat_load_ushort v0, v[0:1]
1413; CIVI-NEXT:    v_mov_b32_e32 v2, s0
1414; CIVI-NEXT:    v_mov_b32_e32 v3, s1
1415; CIVI-NEXT:    s_waitcnt vmcnt(0)
1416; CIVI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1417; CIVI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1418; CIVI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1419; CIVI-NEXT:    s_endpgm
1420;
1421; GFX11-LABEL: global_extload_f16_to_f64:
1422; GFX11:       ; %bb.0:
1423; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1424; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1425; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1426; GFX11-NEXT:    global_load_u16 v0, v2, s[2:3]
1427; GFX11-NEXT:    s_waitcnt vmcnt(0)
1428; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1429; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1430; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1431; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1432; GFX11-NEXT:    s_endpgm
1433  %val = load half, ptr addrspace(1) %in
1434  %cvt = fpext half %val to double
1435  store double %cvt, ptr addrspace(1) %out
1436  ret void
1437}
1438
; Load a <2 x half> from %in, extend each element to double, and store the
; <2 x double> to %out. The checks expect one dword load, a half-to-float
; conversion per element followed by a float-to-double conversion per
; element, and one 128-bit store. The high half is extracted with
; v_lshrrev_b32 in the CI and GFX11 checks and with an SDWA WORD_1 conversion
; in the VI checks.
1439define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1440; CI-LABEL: global_extload_v2f16_to_v2f64:
1441; CI:       ; %bb.0:
1442; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1443; CI-NEXT:    s_waitcnt lgkmcnt(0)
1444; CI-NEXT:    v_mov_b32_e32 v0, s2
1445; CI-NEXT:    v_mov_b32_e32 v1, s3
1446; CI-NEXT:    flat_load_dword v0, v[0:1]
1447; CI-NEXT:    v_mov_b32_e32 v4, s0
1448; CI-NEXT:    v_mov_b32_e32 v5, s1
1449; CI-NEXT:    s_waitcnt vmcnt(0)
1450; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1451; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1452; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
1453; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1454; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1455; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1456; CI-NEXT:    s_endpgm
1457;
1458; VI-LABEL: global_extload_v2f16_to_v2f64:
1459; VI:       ; %bb.0:
1460; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1461; VI-NEXT:    s_waitcnt lgkmcnt(0)
1462; VI-NEXT:    v_mov_b32_e32 v0, s2
1463; VI-NEXT:    v_mov_b32_e32 v1, s3
1464; VI-NEXT:    flat_load_dword v0, v[0:1]
1465; VI-NEXT:    v_mov_b32_e32 v4, s0
1466; VI-NEXT:    v_mov_b32_e32 v5, s1
1467; VI-NEXT:    s_waitcnt vmcnt(0)
1468; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1469; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1470; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1471; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1472; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1473; VI-NEXT:    s_endpgm
1474;
1475; GFX11-LABEL: global_extload_v2f16_to_v2f64:
1476; GFX11:       ; %bb.0:
1477; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1478; GFX11-NEXT:    v_mov_b32_e32 v4, 0
1479; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1480; GFX11-NEXT:    global_load_b32 v0, v4, s[2:3]
1481; GFX11-NEXT:    s_waitcnt vmcnt(0)
1482; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
1483; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1484; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1485; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
1486; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1487; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1488; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1489; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
1490; GFX11-NEXT:    s_endpgm
1491  %val = load <2 x half>, ptr addrspace(1) %in
1492  %cvt = fpext <2 x half> %val to <2 x double>
1493  store <2 x double> %cvt, ptr addrspace(1) %out
1494  ret void
1495}
1496
; Load a <3 x half> from %in, extend each element to double, and store the
; <3 x double> to %out. The checks expect one 64-bit load, then a
; half-to-float and a float-to-double conversion for each of the three
; elements. The 24-byte result is written as two stores: a 128-bit store of
; elements 0-1 at offset 0 and a 64-bit store of element 2 at offset 16. The
; CI and VI checks compute the +16 address with s_add_u32/s_addc_u32; the
; GFX11 checks use an immediate offset:16 on the store.
1497define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1498; CI-LABEL: global_extload_v3f16_to_v3f64:
1499; CI:       ; %bb.0:
1500; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1501; CI-NEXT:    s_waitcnt lgkmcnt(0)
1502; CI-NEXT:    v_mov_b32_e32 v0, s2
1503; CI-NEXT:    v_mov_b32_e32 v1, s3
1504; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1505; CI-NEXT:    s_add_u32 s2, s0, 16
1506; CI-NEXT:    s_addc_u32 s3, s1, 0
1507; CI-NEXT:    v_mov_b32_e32 v7, s3
1508; CI-NEXT:    v_mov_b32_e32 v6, s2
1509; CI-NEXT:    s_waitcnt vmcnt(0)
1510; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1511; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1512; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
1513; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1514; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v1
1515; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1516; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1517; CI-NEXT:    flat_store_dwordx2 v[6:7], v[4:5]
1518; CI-NEXT:    v_mov_b32_e32 v5, s1
1519; CI-NEXT:    v_mov_b32_e32 v4, s0
1520; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1521; CI-NEXT:    s_endpgm
1522;
1523; VI-LABEL: global_extload_v3f16_to_v3f64:
1524; VI:       ; %bb.0:
1525; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1526; VI-NEXT:    s_waitcnt lgkmcnt(0)
1527; VI-NEXT:    v_mov_b32_e32 v0, s2
1528; VI-NEXT:    v_mov_b32_e32 v1, s3
1529; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1530; VI-NEXT:    s_add_u32 s2, s0, 16
1531; VI-NEXT:    s_addc_u32 s3, s1, 0
1532; VI-NEXT:    v_mov_b32_e32 v5, s1
1533; VI-NEXT:    v_mov_b32_e32 v4, s0
1534; VI-NEXT:    s_waitcnt vmcnt(0)
1535; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1536; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1537; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1538; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v3
1539; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
1540; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1541; VI-NEXT:    v_mov_b32_e32 v9, s3
1542; VI-NEXT:    v_mov_b32_e32 v8, s2
1543; VI-NEXT:    flat_store_dwordx2 v[8:9], v[6:7]
1544; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1545; VI-NEXT:    s_endpgm
1546;
1547; GFX11-LABEL: global_extload_v3f16_to_v3f64:
1548; GFX11:       ; %bb.0:
1549; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1550; GFX11-NEXT:    v_mov_b32_e32 v6, 0
1551; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1552; GFX11-NEXT:    global_load_b64 v[0:1], v6, s[2:3]
1553; GFX11-NEXT:    s_waitcnt vmcnt(0)
1554; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
1555; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
1556; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1557; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1558; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
1559; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1560; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1561; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1562; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1563; GFX11-NEXT:    s_clause 0x1
1564; GFX11-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
1565; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
1566; GFX11-NEXT:    s_endpgm
1567  %val = load <3 x half>, ptr addrspace(1) %in
1568  %cvt = fpext <3 x half> %val to <3 x double>
1569  store <3 x double> %cvt, ptr addrspace(1) %out
1570  ret void
1571}
1572
; Extending load: reads <4 x half> from %in, fpext's it to <4 x double>, and
; stores the result to %out. The checks below expect every element to be
; widened in two steps (v_cvt_f32_f16 followed by v_cvt_f64_f32). The upper
; half of each loaded dword is reached with a 16-bit right shift on the kaveri
; and gfx1100 runs and with an SDWA WORD_1 source select on the tonga run. The
; result is written as two 128-bit stores, at byte offsets 0 and 16 from %out.
1573define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1574; CI-LABEL: global_extload_v4f16_to_v4f64:
1575; CI:       ; %bb.0:
1576; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1577; CI-NEXT:    s_waitcnt lgkmcnt(0)
1578; CI-NEXT:    v_mov_b32_e32 v0, s2
1579; CI-NEXT:    v_mov_b32_e32 v1, s3
1580; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1581; CI-NEXT:    s_add_u32 s2, s0, 16
1582; CI-NEXT:    s_addc_u32 s3, s1, 0
1583; CI-NEXT:    v_mov_b32_e32 v9, s1
1584; CI-NEXT:    v_mov_b32_e32 v8, s0
1585; CI-NEXT:    s_waitcnt vmcnt(0)
1586; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
1587; CI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1588; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
1589; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
1590; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1591; CI-NEXT:    v_cvt_f32_f16_e32 v10, v0
1592; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1593; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1594; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
1595; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
1596; CI-NEXT:    v_mov_b32_e32 v11, s3
1597; CI-NEXT:    v_mov_b32_e32 v10, s2
1598; CI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1599; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1600; CI-NEXT:    s_endpgm
1601;
1602; VI-LABEL: global_extload_v4f16_to_v4f64:
1603; VI:       ; %bb.0:
1604; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1605; VI-NEXT:    s_waitcnt lgkmcnt(0)
1606; VI-NEXT:    v_mov_b32_e32 v0, s2
1607; VI-NEXT:    v_mov_b32_e32 v1, s3
1608; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1609; VI-NEXT:    s_add_u32 s2, s0, 16
1610; VI-NEXT:    s_addc_u32 s3, s1, 0
1611; VI-NEXT:    v_mov_b32_e32 v9, s1
1612; VI-NEXT:    v_mov_b32_e32 v8, s0
1613; VI-NEXT:    s_waitcnt vmcnt(0)
1614; VI-NEXT:    v_cvt_f32_f16_e32 v3, v1
1615; VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1616; VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
1617; VI-NEXT:    v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1618; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v3
1619; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v6
1620; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v2
1621; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v10
1622; VI-NEXT:    v_mov_b32_e32 v11, s3
1623; VI-NEXT:    v_mov_b32_e32 v10, s2
1624; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
1625; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
1626; VI-NEXT:    s_endpgm
1627;
1628; GFX11-LABEL: global_extload_v4f16_to_v4f64:
1629; GFX11:       ; %bb.0:
1630; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1631; GFX11-NEXT:    v_mov_b32_e32 v8, 0
1632; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1633; GFX11-NEXT:    global_load_b64 v[0:1], v8, s[2:3]
1634; GFX11-NEXT:    s_waitcnt vmcnt(0)
1635; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
1636; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
1637; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v1
1638; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, v0
1639; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1640; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
1641; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
1642; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1643; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
1644; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v0
1645; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1646; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v2
1647; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v3
1648; GFX11-NEXT:    s_clause 0x1
1649; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
1650; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
1651; GFX11-NEXT:    s_endpgm
1652  %val = load <4 x half>, ptr addrspace(1) %in
1653  %cvt = fpext <4 x half> %val to <4 x double>
1654  store <4 x double> %cvt, ptr addrspace(1) %out
1655  ret void
1656}
1657
; Extending load: reads <8 x half> from %in with a single 128-bit load, fpext's
; it to <8 x double>, and stores the result to %out. The checks below expect
; each element to be widened in two steps (v_cvt_f32_f16 followed by
; v_cvt_f64_f32), with the upper halves selected by a 16-bit right shift on
; the kaveri and gfx1100 runs and by an SDWA WORD_1 source select on the tonga
; run. The result is written as four 128-bit stores at byte offsets 0, 16, 32
; and 48 from %out.
1658define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1659; CI-LABEL: global_extload_v8f16_to_v8f64:
1660; CI:       ; %bb.0:
1661; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1662; CI-NEXT:    s_waitcnt lgkmcnt(0)
1663; CI-NEXT:    v_mov_b32_e32 v0, s2
1664; CI-NEXT:    v_mov_b32_e32 v1, s3
1665; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1666; CI-NEXT:    s_add_u32 s2, s0, 48
1667; CI-NEXT:    s_addc_u32 s3, s1, 0
1668; CI-NEXT:    v_mov_b32_e32 v7, s3
1669; CI-NEXT:    v_mov_b32_e32 v6, s2
1670; CI-NEXT:    s_add_u32 s2, s0, 32
1671; CI-NEXT:    v_mov_b32_e32 v13, s1
1672; CI-NEXT:    s_addc_u32 s3, s1, 0
1673; CI-NEXT:    v_mov_b32_e32 v12, s0
1674; CI-NEXT:    s_add_u32 s0, s0, 16
1675; CI-NEXT:    v_mov_b32_e32 v15, s3
1676; CI-NEXT:    s_addc_u32 s1, s1, 0
1677; CI-NEXT:    v_mov_b32_e32 v14, s2
1678; CI-NEXT:    s_waitcnt vmcnt(0)
1679; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
1680; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1681; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
1682; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
1683; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
1684; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
1685; CI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1686; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
1687; CI-NEXT:    v_cvt_f32_f16_e32 v4, v0
1688; CI-NEXT:    v_cvt_f32_f16_e32 v16, v5
1689; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v3
1690; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1691; CI-NEXT:    v_cvt_f32_f16_e32 v17, v9
1692; CI-NEXT:    v_cvt_f32_f16_e32 v18, v11
1693; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1694; CI-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
1695; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
1696; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v10
1697; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v16
1698; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1699; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v18
1700; CI-NEXT:    v_mov_b32_e32 v17, s1
1701; CI-NEXT:    v_mov_b32_e32 v16, s0
1702; CI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1703; CI-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
1704; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1705; CI-NEXT:    s_endpgm
1706;
1707; VI-LABEL: global_extload_v8f16_to_v8f64:
1708; VI:       ; %bb.0:
1709; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1710; VI-NEXT:    s_waitcnt lgkmcnt(0)
1711; VI-NEXT:    v_mov_b32_e32 v0, s2
1712; VI-NEXT:    v_mov_b32_e32 v1, s3
1713; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1714; VI-NEXT:    s_add_u32 s2, s0, 48
1715; VI-NEXT:    s_addc_u32 s3, s1, 0
1716; VI-NEXT:    v_mov_b32_e32 v8, s3
1717; VI-NEXT:    v_mov_b32_e32 v7, s2
1718; VI-NEXT:    s_add_u32 s2, s0, 32
1719; VI-NEXT:    v_mov_b32_e32 v13, s1
1720; VI-NEXT:    s_addc_u32 s3, s1, 0
1721; VI-NEXT:    v_mov_b32_e32 v12, s0
1722; VI-NEXT:    s_add_u32 s0, s0, 16
1723; VI-NEXT:    v_mov_b32_e32 v15, s3
1724; VI-NEXT:    s_addc_u32 s1, s1, 0
1725; VI-NEXT:    v_mov_b32_e32 v14, s2
1726; VI-NEXT:    s_waitcnt vmcnt(0)
1727; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
1728; VI-NEXT:    v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1729; VI-NEXT:    v_cvt_f32_f16_e32 v0, v3
1730; VI-NEXT:    v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1731; VI-NEXT:    v_cvt_f32_f16_e32 v10, v1
1732; VI-NEXT:    v_cvt_f32_f16_e32 v11, v2
1733; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v0
1734; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v5
1735; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1736; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1737; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
1738; VI-NEXT:    flat_store_dwordx4 v[7:8], v[3:6]
1739; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v11
1740; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
1741; VI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
1742; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v17
1743; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v16
1744; VI-NEXT:    v_mov_b32_e32 v17, s1
1745; VI-NEXT:    v_mov_b32_e32 v16, s0
1746; VI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1747; VI-NEXT:    flat_store_dwordx4 v[16:17], v[4:7]
1748; VI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
1749; VI-NEXT:    s_endpgm
1750;
1751; GFX11-LABEL: global_extload_v8f16_to_v8f64:
1752; GFX11:       ; %bb.0:
1753; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1754; GFX11-NEXT:    v_mov_b32_e32 v16, 0
1755; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1756; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
1757; GFX11-NEXT:    s_waitcnt vmcnt(0)
1758; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v0
1759; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
1760; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v1
1761; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
1762; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v2
1763; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1764; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
1765; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
1766; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v4
1767; GFX11-NEXT:    v_cvt_f32_f16_e32 v17, v5
1768; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
1769; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v9
1770; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
1771; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v7
1772; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v3
1773; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1774; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v6
1775; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
1776; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
1777; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1778; GFX11-NEXT:    s_clause 0x3
1779; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:48
1780; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
1781; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
1782; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
1783; GFX11-NEXT:    s_endpgm
1784  %val = load <8 x half>, ptr addrspace(1) %in
1785  %cvt = fpext <8 x half> %val to <8 x double>
1786  store <8 x double> %cvt, ptr addrspace(1) %out
1787  ret void
1788}
1789
; Extending load: reads <16 x half> from %in, fpext's it to <16 x double>, and
; stores the result to %out. The checks below expect the input to be fetched
; with two 128-bit loads (byte offsets 0 and 16 from %in) and each element to
; be widened in two steps (v_cvt_f32_f16 followed by v_cvt_f64_f32). The upper
; halves are selected by a 16-bit right shift on the kaveri and gfx1100 runs
; and by an SDWA WORD_1 source select on the tonga run. The result is written
; as eight 128-bit stores covering byte offsets 0 through 0x70 from %out.
1790define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1791; CI-LABEL: global_extload_v16f16_to_v16f64:
1792; CI:       ; %bb.0:
1793; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1794; CI-NEXT:    s_waitcnt lgkmcnt(0)
1795; CI-NEXT:    v_mov_b32_e32 v0, s2
1796; CI-NEXT:    v_mov_b32_e32 v1, s3
1797; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1798; CI-NEXT:    s_add_u32 s2, s2, 16
1799; CI-NEXT:    s_addc_u32 s3, s3, 0
1800; CI-NEXT:    v_mov_b32_e32 v5, s3
1801; CI-NEXT:    v_mov_b32_e32 v4, s2
1802; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
1803; CI-NEXT:    s_add_u32 s2, s0, 48
1804; CI-NEXT:    s_addc_u32 s3, s1, 0
1805; CI-NEXT:    v_mov_b32_e32 v15, s3
1806; CI-NEXT:    v_mov_b32_e32 v14, s2
1807; CI-NEXT:    s_add_u32 s2, s0, 32
1808; CI-NEXT:    s_addc_u32 s3, s1, 0
1809; CI-NEXT:    v_mov_b32_e32 v17, s3
1810; CI-NEXT:    v_mov_b32_e32 v16, s2
1811; CI-NEXT:    s_add_u32 s2, s0, 16
1812; CI-NEXT:    s_addc_u32 s3, s1, 0
1813; CI-NEXT:    v_mov_b32_e32 v19, s3
1814; CI-NEXT:    v_mov_b32_e32 v18, s2
1815; CI-NEXT:    s_add_u32 s2, s0, 0x70
1816; CI-NEXT:    s_addc_u32 s3, s1, 0
1817; CI-NEXT:    v_mov_b32_e32 v13, s1
1818; CI-NEXT:    v_mov_b32_e32 v12, s0
1819; CI-NEXT:    s_waitcnt vmcnt(1)
1820; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
1821; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1822; CI-NEXT:    v_cvt_f32_f16_e32 v10, v8
1823; CI-NEXT:    s_waitcnt vmcnt(0)
1824; CI-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
1825; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v3
1826; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
1827; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
1828; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1829; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
1830; CI-NEXT:    v_cvt_f32_f16_e32 v21, v5
1831; CI-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
1832; CI-NEXT:    v_mov_b32_e32 v15, s3
1833; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v2
1834; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v3
1835; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
1836; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
1837; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
1838; CI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
1839; CI-NEXT:    v_mov_b32_e32 v14, s2
1840; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v0
1841; CI-NEXT:    v_cvt_f32_f16_e32 v9, v0
1842; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v1
1843; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v2
1844; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
1845; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
1846; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
1847; CI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
1848; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v6
1849; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v9
1850; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1851; CI-NEXT:    v_cvt_f32_f16_e32 v8, v10
1852; CI-NEXT:    s_add_u32 s2, s0, 0x60
1853; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
1854; CI-NEXT:    v_cvt_f32_f16_e32 v10, v11
1855; CI-NEXT:    s_addc_u32 s3, s1, 0
1856; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
1857; CI-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
1858; CI-NEXT:    v_mov_b32_e32 v17, s3
1859; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
1860; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v8
1861; CI-NEXT:    v_cvt_f32_f16_e32 v7, v20
1862; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
1863; CI-NEXT:    v_cvt_f32_f16_e32 v12, v5
1864; CI-NEXT:    v_mov_b32_e32 v16, s2
1865; CI-NEXT:    s_add_u32 s2, s0, 0x50
1866; CI-NEXT:    s_addc_u32 s3, s1, 0
1867; CI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v6
1868; CI-NEXT:    v_cvt_f64_f32_e32 v[10:11], v10
1869; CI-NEXT:    s_add_u32 s0, s0, 64
1870; CI-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
1871; CI-NEXT:    s_addc_u32 s1, s1, 0
1872; CI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v21
1873; CI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v7
1874; CI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v4
1875; CI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v12
1876; CI-NEXT:    v_mov_b32_e32 v19, s3
1877; CI-NEXT:    v_mov_b32_e32 v13, s1
1878; CI-NEXT:    v_mov_b32_e32 v18, s2
1879; CI-NEXT:    v_mov_b32_e32 v12, s0
1880; CI-NEXT:    flat_store_dwordx4 v[16:17], v[8:11]
1881; CI-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
1882; CI-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
1883; CI-NEXT:    s_endpgm
1884;
1885; VI-LABEL: global_extload_v16f16_to_v16f64:
1886; VI:       ; %bb.0:
1887; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
1888; VI-NEXT:    s_waitcnt lgkmcnt(0)
1889; VI-NEXT:    v_mov_b32_e32 v0, s2
1890; VI-NEXT:    v_mov_b32_e32 v1, s3
1891; VI-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
1892; VI-NEXT:    s_add_u32 s2, s2, 16
1893; VI-NEXT:    s_addc_u32 s3, s3, 0
1894; VI-NEXT:    v_mov_b32_e32 v0, s2
1895; VI-NEXT:    v_mov_b32_e32 v1, s3
1896; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1897; VI-NEXT:    s_add_u32 s2, s0, 48
1898; VI-NEXT:    s_addc_u32 s3, s1, 0
1899; VI-NEXT:    v_mov_b32_e32 v14, s3
1900; VI-NEXT:    v_mov_b32_e32 v13, s2
1901; VI-NEXT:    s_add_u32 s2, s0, 32
1902; VI-NEXT:    s_addc_u32 s3, s1, 0
1903; VI-NEXT:    v_mov_b32_e32 v16, s3
1904; VI-NEXT:    v_mov_b32_e32 v15, s2
1905; VI-NEXT:    s_add_u32 s2, s0, 16
1906; VI-NEXT:    s_addc_u32 s3, s1, 0
1907; VI-NEXT:    v_mov_b32_e32 v18, s3
1908; VI-NEXT:    v_mov_b32_e32 v17, s2
1909; VI-NEXT:    s_add_u32 s2, s0, 0x50
1910; VI-NEXT:    v_mov_b32_e32 v12, s1
1911; VI-NEXT:    s_addc_u32 s3, s1, 0
1912; VI-NEXT:    v_mov_b32_e32 v11, s0
1913; VI-NEXT:    s_waitcnt vmcnt(1)
1914; VI-NEXT:    v_cvt_f32_f16_e32 v8, v7
1915; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1916; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
1917; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
1918; VI-NEXT:    flat_store_dwordx4 v[13:14], v[7:10]
1919; VI-NEXT:    s_nop 0
1920; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
1921; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1922; VI-NEXT:    s_waitcnt vmcnt(1)
1923; VI-NEXT:    v_cvt_f32_f16_e32 v10, v2
1924; VI-NEXT:    v_mov_b32_e32 v14, s3
1925; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
1926; VI-NEXT:    v_cvt_f64_f32_e32 v[8:9], v8
1927; VI-NEXT:    v_mov_b32_e32 v13, s2
1928; VI-NEXT:    s_add_u32 s2, s0, 64
1929; VI-NEXT:    s_addc_u32 s3, s1, 0
1930; VI-NEXT:    flat_store_dwordx4 v[15:16], v[6:9]
1931; VI-NEXT:    v_mov_b32_e32 v16, s3
1932; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
1933; VI-NEXT:    v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1934; VI-NEXT:    v_cvt_f32_f16_e32 v8, v4
1935; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1936; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
1937; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v7
1938; VI-NEXT:    v_mov_b32_e32 v15, s2
1939; VI-NEXT:    s_add_u32 s2, s0, 0x70
1940; VI-NEXT:    s_addc_u32 s3, s1, 0
1941; VI-NEXT:    flat_store_dwordx4 v[17:18], v[4:7]
1942; VI-NEXT:    v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1943; VI-NEXT:    v_cvt_f64_f32_e32 v[4:5], v8
1944; VI-NEXT:    v_cvt_f64_f32_e32 v[6:7], v9
1945; VI-NEXT:    v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1946; VI-NEXT:    v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1947; VI-NEXT:    v_cvt_f32_f16_e32 v2, v1
1948; VI-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
1949; VI-NEXT:    v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
1950; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
1951; VI-NEXT:    v_cvt_f64_f32_e32 v[3:4], v9
1952; VI-NEXT:    v_cvt_f32_f16_e32 v9, v0
1953; VI-NEXT:    v_cvt_f64_f32_e32 v[1:2], v2
1954; VI-NEXT:    v_cvt_f64_f32_e32 v[5:6], v10
1955; VI-NEXT:    v_cvt_f64_f32_e32 v[11:12], v11
1956; VI-NEXT:    v_cvt_f64_f32_e32 v[9:10], v9
1957; VI-NEXT:    s_add_u32 s0, s0, 0x60
1958; VI-NEXT:    flat_store_dwordx4 v[13:14], v[1:4]
1959; VI-NEXT:    s_addc_u32 s1, s1, 0
1960; VI-NEXT:    v_cvt_f64_f32_e32 v[0:1], v7
1961; VI-NEXT:    v_cvt_f64_f32_e32 v[2:3], v17
1962; VI-NEXT:    v_cvt_f64_f32_e32 v[7:8], v8
1963; VI-NEXT:    v_mov_b32_e32 v20, s3
1964; VI-NEXT:    v_mov_b32_e32 v14, s1
1965; VI-NEXT:    v_mov_b32_e32 v19, s2
1966; VI-NEXT:    v_mov_b32_e32 v13, s0
1967; VI-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
1968; VI-NEXT:    flat_store_dwordx4 v[19:20], v[0:3]
1969; VI-NEXT:    flat_store_dwordx4 v[13:14], v[5:8]
1970; VI-NEXT:    s_endpgm
1971;
1972; GFX11-LABEL: global_extload_v16f16_to_v16f64:
1973; GFX11:       ; %bb.0:
1974; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
1975; GFX11-NEXT:    v_mov_b32_e32 v32, 0
1976; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1977; GFX11-NEXT:    s_clause 0x1
1978; GFX11-NEXT:    global_load_b128 v[0:3], v32, s[2:3]
1979; GFX11-NEXT:    global_load_b128 v[4:7], v32, s[2:3] offset:16
1980; GFX11-NEXT:    s_waitcnt vmcnt(1)
1981; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v1
1982; GFX11-NEXT:    s_waitcnt vmcnt(0)
1983; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 16, v5
1984; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v1
1985; GFX11-NEXT:    v_lshrrev_b32_e32 v19, 16, v4
1986; GFX11-NEXT:    v_cvt_f32_f16_e32 v15, v7
1987; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
1988; GFX11-NEXT:    v_cvt_f32_f16_e32 v14, v6
1989; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
1990; GFX11-NEXT:    v_cvt_f32_f16_e32 v13, v3
1991; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
1992; GFX11-NEXT:    v_cvt_f32_f16_e32 v12, v2
1993; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1994; GFX11-NEXT:    v_cvt_f32_f16_e32 v18, v4
1995; GFX11-NEXT:    v_cvt_f32_f16_e32 v22, v5
1996; GFX11-NEXT:    v_cvt_f64_f32_e32 v[4:5], v10
1997; GFX11-NEXT:    v_cvt_f32_f16_e32 v10, v23
1998; GFX11-NEXT:    v_cvt_f32_f16_e32 v34, v11
1999; GFX11-NEXT:    v_cvt_f32_f16_e32 v11, v19
2000; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
2001; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v7
2002; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v6
2003; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v0
2004; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v3
2005; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v2
2006; GFX11-NEXT:    v_cvt_f64_f32_e32 v[28:29], v22
2007; GFX11-NEXT:    v_cvt_f64_f32_e32 v[30:31], v10
2008; GFX11-NEXT:    v_cvt_f64_f32_e32 v[24:25], v18
2009; GFX11-NEXT:    v_cvt_f64_f32_e32 v[26:27], v11
2010; GFX11-NEXT:    v_cvt_f32_f16_e32 v33, v9
2011; GFX11-NEXT:    v_cvt_f64_f32_e32 v[20:21], v15
2012; GFX11-NEXT:    v_cvt_f64_f32_e32 v[22:23], v7
2013; GFX11-NEXT:    v_cvt_f64_f32_e32 v[16:17], v14
2014; GFX11-NEXT:    v_cvt_f64_f32_e32 v[18:19], v6
2015; GFX11-NEXT:    v_cvt_f64_f32_e32 v[0:1], v8
2016; GFX11-NEXT:    v_cvt_f64_f32_e32 v[8:9], v12
2017; GFX11-NEXT:    v_cvt_f64_f32_e32 v[12:13], v13
2018; GFX11-NEXT:    v_cvt_f64_f32_e32 v[14:15], v3
2019; GFX11-NEXT:    v_cvt_f64_f32_e32 v[10:11], v2
2020; GFX11-NEXT:    v_cvt_f64_f32_e32 v[6:7], v34
2021; GFX11-NEXT:    v_cvt_f64_f32_e32 v[2:3], v33
2022; GFX11-NEXT:    s_clause 0x7
2023; GFX11-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:80
2024; GFX11-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:64
2025; GFX11-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:112
2026; GFX11-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:96
2027; GFX11-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:48
2028; GFX11-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:32
2029; GFX11-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:16
2030; GFX11-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
2031; GFX11-NEXT:    s_endpgm
2032  %val = load <16 x half>, ptr addrspace(1) %in
2033  %cvt = fpext <16 x half> %val to <16 x double>
2034  store <16 x double> %cvt, ptr addrspace(1) %out
2035  ret void
2036}
2037
; Truncating store: reads a float from %in, fptrunc's it to half, and stores
; the half to %out. The kaveri and tonga runs share one set of checks here
; (the combined prefix from the RUN lines). All three runs expect a single
; v_cvt_f16_f32 followed by a 16-bit store.
2038define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2039; CIVI-LABEL: global_truncstore_f32_to_f16:
2040; CIVI:       ; %bb.0:
2041; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2042; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
2043; CIVI-NEXT:    v_mov_b32_e32 v0, s2
2044; CIVI-NEXT:    v_mov_b32_e32 v1, s3
2045; CIVI-NEXT:    flat_load_dword v0, v[0:1]
2046; CIVI-NEXT:    v_mov_b32_e32 v1, s1
2047; CIVI-NEXT:    s_waitcnt vmcnt(0)
2048; CIVI-NEXT:    v_cvt_f16_f32_e32 v2, v0
2049; CIVI-NEXT:    v_mov_b32_e32 v0, s0
2050; CIVI-NEXT:    flat_store_short v[0:1], v2
2051; CIVI-NEXT:    s_endpgm
2052;
2053; GFX11-LABEL: global_truncstore_f32_to_f16:
2054; GFX11:       ; %bb.0:
2055; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2056; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2057; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2058; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
2059; GFX11-NEXT:    s_waitcnt vmcnt(0)
2060; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2061; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
2062; GFX11-NEXT:    s_endpgm
2063  %val = load float, ptr addrspace(1) %in
2064  %cvt = fptrunc float %val to half
2065  store half %cvt, ptr addrspace(1) %out
2066  ret void
2067}
2068
; Truncating store: reads <2 x float> from %in, fptrunc's it to <2 x half>, and
; stores the result to %out as one dword. The checks below differ in how the
; two converted halves are packed: shift-left by 16 plus OR on the kaveri run,
; an SDWA conversion writing WORD_1 plus OR on the tonga run, and
; v_pack_b32_f16 on the gfx1100 run.
2069define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2070; CI-LABEL: global_truncstore_v2f32_to_v2f16:
2071; CI:       ; %bb.0:
2072; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2073; CI-NEXT:    s_waitcnt lgkmcnt(0)
2074; CI-NEXT:    v_mov_b32_e32 v0, s2
2075; CI-NEXT:    v_mov_b32_e32 v1, s3
2076; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2077; CI-NEXT:    s_waitcnt vmcnt(0)
2078; CI-NEXT:    v_cvt_f16_f32_e32 v2, v1
2079; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
2080; CI-NEXT:    v_mov_b32_e32 v0, s0
2081; CI-NEXT:    v_mov_b32_e32 v1, s1
2082; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2083; CI-NEXT:    v_or_b32_e32 v2, v3, v2
2084; CI-NEXT:    flat_store_dword v[0:1], v2
2085; CI-NEXT:    s_endpgm
2086;
2087; VI-LABEL: global_truncstore_v2f32_to_v2f16:
2088; VI:       ; %bb.0:
2089; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2090; VI-NEXT:    s_waitcnt lgkmcnt(0)
2091; VI-NEXT:    v_mov_b32_e32 v0, s2
2092; VI-NEXT:    v_mov_b32_e32 v1, s3
2093; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
2094; VI-NEXT:    s_waitcnt vmcnt(0)
2095; VI-NEXT:    v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2096; VI-NEXT:    v_cvt_f16_f32_e32 v3, v0
2097; VI-NEXT:    v_mov_b32_e32 v0, s0
2098; VI-NEXT:    v_mov_b32_e32 v1, s1
2099; VI-NEXT:    v_or_b32_e32 v2, v3, v2
2100; VI-NEXT:    flat_store_dword v[0:1], v2
2101; VI-NEXT:    s_endpgm
2102;
2103; GFX11-LABEL: global_truncstore_v2f32_to_v2f16:
2104; GFX11:       ; %bb.0:
2105; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2106; GFX11-NEXT:    v_mov_b32_e32 v2, 0
2107; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2108; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
2109; GFX11-NEXT:    s_waitcnt vmcnt(0)
2110; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2111; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
2112; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2113; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
2114; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
2115; GFX11-NEXT:    s_endpgm
2116  %val = load <2 x float>, ptr addrspace(1) %in
2117  %cvt = fptrunc <2 x float> %val to <2 x half>
2118  store <2 x half> %cvt, ptr addrspace(1) %out
2119  ret void
2120}
2121
; Truncating store: reads <3 x float> from %in with one 96-bit load, fptrunc's
; it to <3 x half>, and stores the result to %out. The checks below expect the
; store to be split in two: elements 0 and 1 are packed into one dword written
; at byte offset 0, and element 2 is written with a 16-bit store at byte
; offset 4. Packing uses shift plus OR on the kaveri run, an SDWA WORD_1
; destination plus OR on the tonga run, and v_pack_b32_f16 on the gfx1100 run.
2122define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2123; CI-LABEL: global_truncstore_v3f32_to_v3f16:
2124; CI:       ; %bb.0:
2125; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2126; CI-NEXT:    s_waitcnt lgkmcnt(0)
2127; CI-NEXT:    v_mov_b32_e32 v0, s2
2128; CI-NEXT:    v_mov_b32_e32 v1, s3
2129; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
2130; CI-NEXT:    s_add_u32 s2, s0, 4
2131; CI-NEXT:    s_addc_u32 s3, s1, 0
2132; CI-NEXT:    s_waitcnt vmcnt(0)
2133; CI-NEXT:    v_cvt_f16_f32_e32 v3, v1
2134; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2135; CI-NEXT:    v_cvt_f16_f32_e32 v4, v0
2136; CI-NEXT:    v_mov_b32_e32 v0, s2
2137; CI-NEXT:    v_mov_b32_e32 v1, s3
2138; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2139; CI-NEXT:    flat_store_short v[0:1], v2
2140; CI-NEXT:    v_mov_b32_e32 v0, s0
2141; CI-NEXT:    v_or_b32_e32 v2, v4, v3
2142; CI-NEXT:    v_mov_b32_e32 v1, s1
2143; CI-NEXT:    flat_store_dword v[0:1], v2
2144; CI-NEXT:    s_endpgm
2145;
2146; VI-LABEL: global_truncstore_v3f32_to_v3f16:
2147; VI:       ; %bb.0:
2148; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2149; VI-NEXT:    s_waitcnt lgkmcnt(0)
2150; VI-NEXT:    v_mov_b32_e32 v0, s2
2151; VI-NEXT:    v_mov_b32_e32 v1, s3
2152; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
2153; VI-NEXT:    s_add_u32 s2, s0, 4
2154; VI-NEXT:    s_addc_u32 s3, s1, 0
2155; VI-NEXT:    s_waitcnt vmcnt(0)
2156; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2157; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2158; VI-NEXT:    v_cvt_f16_f32_e32 v4, v0
2159; VI-NEXT:    v_mov_b32_e32 v0, s2
2160; VI-NEXT:    v_mov_b32_e32 v1, s3
2161; VI-NEXT:    flat_store_short v[0:1], v2
2162; VI-NEXT:    v_mov_b32_e32 v0, s0
2163; VI-NEXT:    v_or_b32_e32 v3, v4, v3
2164; VI-NEXT:    v_mov_b32_e32 v1, s1
2165; VI-NEXT:    flat_store_dword v[0:1], v3
2166; VI-NEXT:    s_endpgm
2167;
2168; GFX11-LABEL: global_truncstore_v3f32_to_v3f16:
2169; GFX11:       ; %bb.0:
2170; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2171; GFX11-NEXT:    v_mov_b32_e32 v3, 0
2172; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2173; GFX11-NEXT:    global_load_b96 v[0:2], v3, s[2:3]
2174; GFX11-NEXT:    s_waitcnt vmcnt(0)
2175; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2176; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
2177; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
2178; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2179; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
2180; GFX11-NEXT:    s_clause 0x1
2181; GFX11-NEXT:    global_store_b16 v3, v2, s[0:1] offset:4
2182; GFX11-NEXT:    global_store_b32 v3, v0, s[0:1]
2183; GFX11-NEXT:    s_endpgm
2184  %val = load <3 x float>, ptr addrspace(1) %in
2185  %cvt = fptrunc <3 x float> %val to <3 x half>
2186  store <3 x half> %cvt, ptr addrspace(1) %out
2187  ret void
2188}
2189
; Truncating store: reads <4 x float> from %in with one 128-bit load,
; fptrunc's it to <4 x half>, and stores the result to %out. The checks below
; expect the four converted halves to be packed into two dwords and written
; with a single 64-bit store. Packing uses shift plus OR on the kaveri run, an
; SDWA WORD_1 destination plus OR on the tonga run, and v_pack_b32_f16 on the
; gfx1100 run.
2190define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2191; CI-LABEL: global_truncstore_v4f32_to_v4f16:
2192; CI:       ; %bb.0:
2193; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2194; CI-NEXT:    s_waitcnt lgkmcnt(0)
2195; CI-NEXT:    v_mov_b32_e32 v0, s2
2196; CI-NEXT:    v_mov_b32_e32 v1, s3
2197; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2198; CI-NEXT:    v_mov_b32_e32 v4, s0
2199; CI-NEXT:    v_mov_b32_e32 v5, s1
2200; CI-NEXT:    s_waitcnt vmcnt(0)
2201; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2202; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2203; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2204; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2205; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2206; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v1
2207; CI-NEXT:    v_or_b32_e32 v1, v2, v3
2208; CI-NEXT:    v_or_b32_e32 v0, v0, v6
2209; CI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
2210; CI-NEXT:    s_endpgm
2211;
2212; VI-LABEL: global_truncstore_v4f32_to_v4f16:
2213; VI:       ; %bb.0:
2214; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2215; VI-NEXT:    s_waitcnt lgkmcnt(0)
2216; VI-NEXT:    v_mov_b32_e32 v0, s2
2217; VI-NEXT:    v_mov_b32_e32 v1, s3
2218; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2219; VI-NEXT:    s_waitcnt vmcnt(0)
2220; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2221; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2222; VI-NEXT:    v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2223; VI-NEXT:    v_cvt_f16_f32_e32 v5, v0
2224; VI-NEXT:    v_mov_b32_e32 v0, s0
2225; VI-NEXT:    v_mov_b32_e32 v1, s1
2226; VI-NEXT:    v_or_b32_e32 v3, v2, v3
2227; VI-NEXT:    v_or_b32_e32 v2, v5, v4
2228; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2229; VI-NEXT:    s_endpgm
2230;
2231; GFX11-LABEL: global_truncstore_v4f32_to_v4f16:
2232; GFX11:       ; %bb.0:
2233; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2234; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2235; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2236; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
2237; GFX11-NEXT:    s_waitcnt vmcnt(0)
2238; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
2239; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
2240; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v1
2241; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
2242; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2243; GFX11-NEXT:    v_pack_b32_f16 v1, v2, v3
2244; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v5
2245; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
2246; GFX11-NEXT:    s_endpgm
2247  %val = load <4 x float>, ptr addrspace(1) %in
2248  %cvt = fptrunc <4 x float> %val to <4 x half>
2249  store <4 x half> %cvt, ptr addrspace(1) %out
2250  ret void
2251}
2252
; Truncating store of a whole vector, 8 lanes wide. The kernel loads an
; <8 x float> from %in, narrows it with fptrunc to <8 x half> and stores the
; result to %out.
; Expected code per run line (see the assertions below):
;   - kaveri (CI) converts each lane with v_cvt_f16_f32, then packs pairs by
;     shifting the odd lane left by 16 and or-ing it with the even lane.
;   - tonga (VI) writes the odd lane straight into the high word through the
;     SDWA form of the conversion (dst_sel WORD_1), then ors the two halves.
;   - gfx1100 converts each lane and packs pairs with v_pack_b32_f16.
; Every target reads the input with two 128-bit loads and writes the output
; with one 128-bit store.
2253define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2254; CI-LABEL: global_truncstore_v8f32_to_v8f16:
2255; CI:       ; %bb.0:
2256; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2257; CI-NEXT:    s_waitcnt lgkmcnt(0)
2258; CI-NEXT:    v_mov_b32_e32 v0, s2
2259; CI-NEXT:    v_mov_b32_e32 v1, s3
2260; CI-NEXT:    s_add_u32 s2, s2, 16
2261; CI-NEXT:    s_addc_u32 s3, s3, 0
2262; CI-NEXT:    v_mov_b32_e32 v5, s3
2263; CI-NEXT:    v_mov_b32_e32 v4, s2
2264; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2265; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2266; CI-NEXT:    v_mov_b32_e32 v8, s0
2267; CI-NEXT:    v_mov_b32_e32 v9, s1
2268; CI-NEXT:    s_waitcnt vmcnt(1)
2269; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2270; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2271; CI-NEXT:    s_waitcnt vmcnt(0)
2272; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2273; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2274; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2275; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2276; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2277; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2278; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2279; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v1
2280; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
2281; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
2282; CI-NEXT:    v_or_b32_e32 v1, v2, v3
2283; CI-NEXT:    v_or_b32_e32 v0, v0, v10
2284; CI-NEXT:    v_or_b32_e32 v3, v6, v7
2285; CI-NEXT:    v_or_b32_e32 v2, v4, v5
2286; CI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2287; CI-NEXT:    s_endpgm
2288;
2289; VI-LABEL: global_truncstore_v8f32_to_v8f16:
2290; VI:       ; %bb.0:
2291; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2292; VI-NEXT:    s_waitcnt lgkmcnt(0)
2293; VI-NEXT:    v_mov_b32_e32 v0, s2
2294; VI-NEXT:    v_mov_b32_e32 v1, s3
2295; VI-NEXT:    s_add_u32 s2, s2, 16
2296; VI-NEXT:    s_addc_u32 s3, s3, 0
2297; VI-NEXT:    v_mov_b32_e32 v5, s3
2298; VI-NEXT:    v_mov_b32_e32 v4, s2
2299; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2300; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2301; VI-NEXT:    v_mov_b32_e32 v8, s0
2302; VI-NEXT:    v_mov_b32_e32 v9, s1
2303; VI-NEXT:    s_waitcnt vmcnt(1)
2304; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2305; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2306; VI-NEXT:    v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2307; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2308; VI-NEXT:    s_waitcnt vmcnt(0)
2309; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2310; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2311; VI-NEXT:    v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2312; VI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2313; VI-NEXT:    v_or_b32_e32 v1, v2, v3
2314; VI-NEXT:    v_or_b32_e32 v0, v0, v10
2315; VI-NEXT:    v_or_b32_e32 v3, v6, v7
2316; VI-NEXT:    v_or_b32_e32 v2, v4, v5
2317; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2318; VI-NEXT:    s_endpgm
2319;
2320; GFX11-LABEL: global_truncstore_v8f32_to_v8f16:
2321; GFX11:       ; %bb.0:
2322; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2323; GFX11-NEXT:    v_mov_b32_e32 v8, 0
2324; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2325; GFX11-NEXT:    s_clause 0x1
2326; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[2:3] offset:16
2327; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[2:3]
2328; GFX11-NEXT:    s_waitcnt vmcnt(1)
2329; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
2330; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
2331; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2332; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
2333; GFX11-NEXT:    s_waitcnt vmcnt(0)
2334; GFX11-NEXT:    v_cvt_f16_f32_e32 v7, v7
2335; GFX11-NEXT:    v_cvt_f16_f32_e32 v6, v6
2336; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
2337; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
2338; GFX11-NEXT:    v_pack_b32_f16 v3, v2, v3
2339; GFX11-NEXT:    v_pack_b32_f16 v2, v0, v1
2340; GFX11-NEXT:    v_pack_b32_f16 v1, v6, v7
2341; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
2342; GFX11-NEXT:    v_pack_b32_f16 v0, v4, v5
2343; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
2344; GFX11-NEXT:    s_endpgm
2345  %val = load <8 x float>, ptr addrspace(1) %in
2346  %cvt = fptrunc <8 x float> %val to <8 x half>
2347  store <8 x half> %cvt, ptr addrspace(1) %out
2348  ret void
2349}
2350
; Truncating store of a whole vector, 16 lanes wide. The kernel loads a
; <16 x float> from %in, narrows it with fptrunc to <16 x half> and stores the
; result to %out.
; The assertions below expect four 128-bit loads and two 128-bit stores on
; every target. Packing of the converted lanes differs per run line. The
; kaveri (CI) run uses shift-left-by-16 plus or, the tonga (VI) run uses SDWA
; conversions that write the high word plus or, and the gfx1100 run uses
; v_pack_b32_f16.
2351define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2352; CI-LABEL: global_truncstore_v16f32_to_v16f16:
2353; CI:       ; %bb.0:
2354; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2355; CI-NEXT:    s_waitcnt lgkmcnt(0)
2356; CI-NEXT:    s_add_u32 s4, s2, 32
2357; CI-NEXT:    s_addc_u32 s5, s3, 0
2358; CI-NEXT:    v_mov_b32_e32 v0, s4
2359; CI-NEXT:    v_mov_b32_e32 v1, s5
2360; CI-NEXT:    s_add_u32 s4, s2, 48
2361; CI-NEXT:    s_addc_u32 s5, s3, 0
2362; CI-NEXT:    v_mov_b32_e32 v9, s3
2363; CI-NEXT:    v_mov_b32_e32 v4, s4
2364; CI-NEXT:    v_mov_b32_e32 v8, s2
2365; CI-NEXT:    s_add_u32 s2, s2, 16
2366; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2367; CI-NEXT:    v_mov_b32_e32 v5, s5
2368; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2369; CI-NEXT:    s_addc_u32 s3, s3, 0
2370; CI-NEXT:    v_mov_b32_e32 v13, s3
2371; CI-NEXT:    v_mov_b32_e32 v12, s2
2372; CI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
2373; CI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
2374; CI-NEXT:    s_add_u32 s2, s0, 16
2375; CI-NEXT:    s_addc_u32 s3, s1, 0
2376; CI-NEXT:    s_waitcnt vmcnt(3)
2377; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2378; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2379; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2380; CI-NEXT:    s_waitcnt vmcnt(2)
2381; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2382; CI-NEXT:    v_cvt_f16_f32_e32 v16, v5
2383; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2384; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2385; CI-NEXT:    v_cvt_f16_f32_e32 v17, v4
2386; CI-NEXT:    s_waitcnt vmcnt(1)
2387; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
2388; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
2389; CI-NEXT:    s_waitcnt vmcnt(0)
2390; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
2391; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
2392; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2393; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2394; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
2395; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
2396; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2397; CI-NEXT:    v_mov_b32_e32 v5, s3
2398; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
2399; CI-NEXT:    v_or_b32_e32 v1, v2, v3
2400; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
2401; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v16
2402; CI-NEXT:    v_mov_b32_e32 v4, s2
2403; CI-NEXT:    v_or_b32_e32 v0, v0, v18
2404; CI-NEXT:    v_or_b32_e32 v3, v6, v2
2405; CI-NEXT:    v_or_b32_e32 v2, v17, v7
2406; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
2407; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v9
2408; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v15
2409; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v13
2410; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2411; CI-NEXT:    v_mov_b32_e32 v5, s1
2412; CI-NEXT:    v_or_b32_e32 v1, v10, v6
2413; CI-NEXT:    v_or_b32_e32 v0, v8, v7
2414; CI-NEXT:    v_or_b32_e32 v3, v14, v9
2415; CI-NEXT:    v_or_b32_e32 v2, v12, v11
2416; CI-NEXT:    v_mov_b32_e32 v4, s0
2417; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2418; CI-NEXT:    s_endpgm
2419;
2420; VI-LABEL: global_truncstore_v16f32_to_v16f16:
2421; VI:       ; %bb.0:
2422; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2423; VI-NEXT:    s_waitcnt lgkmcnt(0)
2424; VI-NEXT:    s_add_u32 s4, s2, 32
2425; VI-NEXT:    s_addc_u32 s5, s3, 0
2426; VI-NEXT:    v_mov_b32_e32 v0, s4
2427; VI-NEXT:    v_mov_b32_e32 v1, s5
2428; VI-NEXT:    s_add_u32 s4, s2, 48
2429; VI-NEXT:    s_addc_u32 s5, s3, 0
2430; VI-NEXT:    v_mov_b32_e32 v9, s3
2431; VI-NEXT:    v_mov_b32_e32 v4, s4
2432; VI-NEXT:    v_mov_b32_e32 v8, s2
2433; VI-NEXT:    s_add_u32 s2, s2, 16
2434; VI-NEXT:    v_mov_b32_e32 v5, s5
2435; VI-NEXT:    s_addc_u32 s3, s3, 0
2436; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2437; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2438; VI-NEXT:    v_mov_b32_e32 v13, s3
2439; VI-NEXT:    v_mov_b32_e32 v12, s2
2440; VI-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
2441; VI-NEXT:    flat_load_dwordx4 v[12:15], v[12:13]
2442; VI-NEXT:    s_add_u32 s2, s0, 16
2443; VI-NEXT:    s_addc_u32 s3, s1, 0
2444; VI-NEXT:    s_waitcnt vmcnt(3)
2445; VI-NEXT:    v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2446; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2447; VI-NEXT:    v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2448; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2449; VI-NEXT:    s_waitcnt vmcnt(2)
2450; VI-NEXT:    v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2451; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2452; VI-NEXT:    v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2453; VI-NEXT:    v_cvt_f16_f32_e32 v18, v4
2454; VI-NEXT:    s_waitcnt vmcnt(1)
2455; VI-NEXT:    v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2456; VI-NEXT:    v_cvt_f16_f32_e32 v10, v10
2457; VI-NEXT:    v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2458; VI-NEXT:    v_cvt_f16_f32_e32 v8, v8
2459; VI-NEXT:    s_waitcnt vmcnt(0)
2460; VI-NEXT:    v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2461; VI-NEXT:    v_cvt_f16_f32_e32 v14, v14
2462; VI-NEXT:    v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2463; VI-NEXT:    v_cvt_f16_f32_e32 v12, v12
2464; VI-NEXT:    v_mov_b32_e32 v5, s3
2465; VI-NEXT:    v_mov_b32_e32 v4, s2
2466; VI-NEXT:    v_or_b32_e32 v1, v2, v3
2467; VI-NEXT:    v_or_b32_e32 v0, v0, v16
2468; VI-NEXT:    v_or_b32_e32 v3, v6, v7
2469; VI-NEXT:    v_or_b32_e32 v2, v18, v17
2470; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2471; VI-NEXT:    v_mov_b32_e32 v5, s1
2472; VI-NEXT:    v_or_b32_e32 v1, v10, v11
2473; VI-NEXT:    v_or_b32_e32 v0, v8, v9
2474; VI-NEXT:    v_or_b32_e32 v3, v14, v15
2475; VI-NEXT:    v_or_b32_e32 v2, v12, v13
2476; VI-NEXT:    v_mov_b32_e32 v4, s0
2477; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2478; VI-NEXT:    s_endpgm
2479;
2480; GFX11-LABEL: global_truncstore_v16f32_to_v16f16:
2481; GFX11:       ; %bb.0:
2482; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2483; GFX11-NEXT:    v_mov_b32_e32 v16, 0
2484; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2485; GFX11-NEXT:    s_clause 0x3
2486; GFX11-NEXT:    global_load_b128 v[0:3], v16, s[2:3] offset:16
2487; GFX11-NEXT:    global_load_b128 v[4:7], v16, s[2:3]
2488; GFX11-NEXT:    global_load_b128 v[8:11], v16, s[2:3] offset:48
2489; GFX11-NEXT:    global_load_b128 v[12:15], v16, s[2:3] offset:32
2490; GFX11-NEXT:    s_waitcnt vmcnt(3)
2491; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
2492; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
2493; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
2494; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
2495; GFX11-NEXT:    s_waitcnt vmcnt(2)
2496; GFX11-NEXT:    v_cvt_f16_f32_e32 v7, v7
2497; GFX11-NEXT:    v_cvt_f16_f32_e32 v6, v6
2498; GFX11-NEXT:    v_cvt_f16_f32_e32 v17, v5
2499; GFX11-NEXT:    v_cvt_f16_f32_e32 v18, v4
2500; GFX11-NEXT:    s_waitcnt vmcnt(1)
2501; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v11
2502; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v10
2503; GFX11-NEXT:    v_cvt_f16_f32_e32 v9, v9
2504; GFX11-NEXT:    v_cvt_f16_f32_e32 v8, v8
2505; GFX11-NEXT:    s_waitcnt vmcnt(0)
2506; GFX11-NEXT:    v_cvt_f16_f32_e32 v10, v15
2507; GFX11-NEXT:    v_cvt_f16_f32_e32 v11, v14
2508; GFX11-NEXT:    v_cvt_f16_f32_e32 v13, v13
2509; GFX11-NEXT:    v_cvt_f16_f32_e32 v12, v12
2510; GFX11-NEXT:    v_pack_b32_f16 v3, v2, v3
2511; GFX11-NEXT:    v_pack_b32_f16 v2, v0, v1
2512; GFX11-NEXT:    v_pack_b32_f16 v1, v6, v7
2513; GFX11-NEXT:    v_pack_b32_f16 v7, v5, v4
2514; GFX11-NEXT:    v_pack_b32_f16 v6, v8, v9
2515; GFX11-NEXT:    v_pack_b32_f16 v5, v11, v10
2516; GFX11-NEXT:    v_pack_b32_f16 v4, v12, v13
2517; GFX11-NEXT:    v_pack_b32_f16 v0, v18, v17
2518; GFX11-NEXT:    s_clause 0x1
2519; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:16
2520; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
2521; GFX11-NEXT:    s_endpgm
2522  %val = load <16 x float>, ptr addrspace(1) %in
2523  %cvt = fptrunc <16 x float> %val to <16 x half>
2524  store <16 x half> %cvt, ptr addrspace(1) %out
2525  ret void
2526}
2527
2528; FIXME: Unsafe math should fold conversions away
; Scalar half addition of two kernel arguments, stored to %out.
; In the assertions below both operands come out of a single loaded dword
; (the second one is extracted with a 16-bit right shift).
;   - kaveri (CI) extends both operands to f32, adds with v_add_f32 and
;     converts the sum back with v_cvt_f16_f32.
;   - tonga (VI) and gfx1100 add directly with v_add_f16.
2529define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0 {
2530; CI-LABEL: fadd_f16:
2531; CI:       ; %bb.0:
2532; CI-NEXT:    s_load_dword s0, s[8:9], 0x2
2533; CI-NEXT:    s_waitcnt lgkmcnt(0)
2534; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
2535; CI-NEXT:    s_lshr_b32 s0, s0, 16
2536; CI-NEXT:    v_cvt_f32_f16_e32 v1, s0
2537; CI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2538; CI-NEXT:    v_add_f32_e32 v0, v0, v1
2539; CI-NEXT:    v_cvt_f16_f32_e32 v2, v0
2540; CI-NEXT:    s_waitcnt lgkmcnt(0)
2541; CI-NEXT:    v_mov_b32_e32 v0, s0
2542; CI-NEXT:    v_mov_b32_e32 v1, s1
2543; CI-NEXT:    flat_store_short v[0:1], v2
2544; CI-NEXT:    s_endpgm
2545;
2546; VI-LABEL: fadd_f16:
2547; VI:       ; %bb.0:
2548; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
2549; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
2550; VI-NEXT:    s_waitcnt lgkmcnt(0)
2551; VI-NEXT:    s_lshr_b32 s3, s2, 16
2552; VI-NEXT:    v_mov_b32_e32 v0, s3
2553; VI-NEXT:    v_add_f16_e32 v2, s2, v0
2554; VI-NEXT:    v_mov_b32_e32 v0, s0
2555; VI-NEXT:    v_mov_b32_e32 v1, s1
2556; VI-NEXT:    flat_store_short v[0:1], v2
2557; VI-NEXT:    s_endpgm
2558;
2559; GFX11-LABEL: fadd_f16:
2560; GFX11:       ; %bb.0:
2561; GFX11-NEXT:    s_clause 0x1
2562; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
2563; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2564; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2565; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2566; GFX11-NEXT:    s_lshr_b32 s3, s2, 16
2567; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2568; GFX11-NEXT:    v_add_f16_e64 v1, s2, s3
2569; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
2570; GFX11-NEXT:    s_endpgm
2571   %add = fadd half %a, %b
2572   store half %add, ptr addrspace(1) %out, align 4
2573   ret void
2574}
2575
; <2 x half> addition of two kernel arguments, stored to %out.
;   - kaveri (CI) splits each vector into lanes, extends all four values to
;     f32, performs two v_add_f32, converts back and repacks with shift + or.
;   - tonga (VI) adds the low lanes with v_add_f16 and the high lanes with the
;     SDWA form that writes the high word, then ors the halves together.
;   - gfx1100 needs only a single packed v_pk_add_f16.
2576define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x half> %b) #0 {
2577; CI-LABEL: fadd_v2f16:
2578; CI:       ; %bb.0:
2579; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2580; CI-NEXT:    s_waitcnt lgkmcnt(0)
2581; CI-NEXT:    s_lshr_b32 s4, s2, 16
2582; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
2583; CI-NEXT:    s_lshr_b32 s2, s3, 16
2584; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
2585; CI-NEXT:    v_cvt_f32_f16_e32 v2, s4
2586; CI-NEXT:    v_cvt_f32_f16_e32 v3, s2
2587; CI-NEXT:    v_add_f32_e32 v0, v0, v1
2588; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2589; CI-NEXT:    v_add_f32_e32 v1, v2, v3
2590; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2591; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2592; CI-NEXT:    v_or_b32_e32 v2, v0, v1
2593; CI-NEXT:    v_mov_b32_e32 v0, s0
2594; CI-NEXT:    v_mov_b32_e32 v1, s1
2595; CI-NEXT:    flat_store_dword v[0:1], v2
2596; CI-NEXT:    s_endpgm
2597;
2598; VI-LABEL: fadd_v2f16:
2599; VI:       ; %bb.0:
2600; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2601; VI-NEXT:    s_waitcnt lgkmcnt(0)
2602; VI-NEXT:    s_lshr_b32 s4, s3, 16
2603; VI-NEXT:    s_lshr_b32 s5, s2, 16
2604; VI-NEXT:    v_mov_b32_e32 v0, s3
2605; VI-NEXT:    v_mov_b32_e32 v1, s4
2606; VI-NEXT:    v_mov_b32_e32 v2, s5
2607; VI-NEXT:    v_add_f16_e32 v0, s2, v0
2608; VI-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2609; VI-NEXT:    v_or_b32_e32 v2, v0, v1
2610; VI-NEXT:    v_mov_b32_e32 v0, s0
2611; VI-NEXT:    v_mov_b32_e32 v1, s1
2612; VI-NEXT:    flat_store_dword v[0:1], v2
2613; VI-NEXT:    s_endpgm
2614;
2615; GFX11-LABEL: fadd_v2f16:
2616; GFX11:       ; %bb.0:
2617; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2618; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2619; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2620; GFX11-NEXT:    v_pk_add_f16 v1, s2, s3
2621; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2622; GFX11-NEXT:    s_endpgm
2623  %add = fadd <2 x half> %a, %b
2624  store <2 x half> %add, ptr addrspace(1) %out, align 8
2625  ret void
2626}
2627
; <4 x half> addition where both operands come from memory. %a is loaded from
; %in and %b from the next <4 x half> element after it; the sum is stored to
; %out.
; The assertions below expect the two adjacent loads to be fetched with one
; 128-bit load on every target. The add itself is done in f32 on the kaveri
; (CI) run, with v_add_f16 plus its SDWA form on the tonga (VI) run, and with
; two v_pk_add_f16 on the gfx1100 run.
; NOTE(review) both loads are marked align 16 although %b_ptr is %in advanced
; by one <4 x half> element - confirm that alignment claim is intentional.
2628define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2629; CI-LABEL: fadd_v4f16:
2630; CI:       ; %bb.0:
2631; CI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2632; CI-NEXT:    s_waitcnt lgkmcnt(0)
2633; CI-NEXT:    v_mov_b32_e32 v0, s2
2634; CI-NEXT:    v_mov_b32_e32 v1, s3
2635; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2636; CI-NEXT:    v_mov_b32_e32 v4, s0
2637; CI-NEXT:    v_mov_b32_e32 v5, s1
2638; CI-NEXT:    s_waitcnt vmcnt(0)
2639; CI-NEXT:    v_cvt_f32_f16_e32 v6, v0
2640; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
2641; CI-NEXT:    v_cvt_f32_f16_e32 v7, v1
2642; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
2643; CI-NEXT:    v_cvt_f32_f16_e32 v8, v2
2644; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
2645; CI-NEXT:    v_cvt_f32_f16_e32 v9, v3
2646; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
2647; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
2648; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
2649; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
2650; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
2651; CI-NEXT:    v_add_f32_e32 v7, v7, v9
2652; CI-NEXT:    v_add_f32_e32 v6, v6, v8
2653; CI-NEXT:    v_add_f32_e32 v1, v1, v3
2654; CI-NEXT:    v_add_f32_e32 v0, v0, v2
2655; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2656; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2657; CI-NEXT:    v_cvt_f16_f32_e32 v2, v7
2658; CI-NEXT:    v_cvt_f16_f32_e32 v3, v6
2659; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2660; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2661; CI-NEXT:    v_or_b32_e32 v1, v2, v1
2662; CI-NEXT:    v_or_b32_e32 v0, v3, v0
2663; CI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
2664; CI-NEXT:    s_endpgm
2665;
2666; VI-LABEL: fadd_v4f16:
2667; VI:       ; %bb.0:
2668; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2669; VI-NEXT:    s_waitcnt lgkmcnt(0)
2670; VI-NEXT:    v_mov_b32_e32 v0, s2
2671; VI-NEXT:    v_mov_b32_e32 v1, s3
2672; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2673; VI-NEXT:    v_mov_b32_e32 v4, s0
2674; VI-NEXT:    v_mov_b32_e32 v5, s1
2675; VI-NEXT:    s_waitcnt vmcnt(0)
2676; VI-NEXT:    v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2677; VI-NEXT:    v_add_f16_e32 v1, v1, v3
2678; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
2679; VI-NEXT:    v_add_f16_e32 v0, v0, v2
2680; VI-NEXT:    v_or_b32_e32 v1, v1, v6
2681; VI-NEXT:    v_or_b32_e32 v0, v0, v3
2682; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
2683; VI-NEXT:    s_endpgm
2684;
2685; GFX11-LABEL: fadd_v4f16:
2686; GFX11:       ; %bb.0:
2687; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2688; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2689; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
2691; GFX11-NEXT:    s_waitcnt vmcnt(0)
2692; GFX11-NEXT:    v_pk_add_f16 v1, v1, v3
2693; GFX11-NEXT:    v_pk_add_f16 v0, v0, v2
2694; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
2695; GFX11-NEXT:    s_endpgm
2696  %b_ptr = getelementptr <4 x half>, ptr addrspace(1) %in, i32 1
2697  %a = load <4 x half>, ptr addrspace(1) %in, align 16
2698  %b = load <4 x half>, ptr addrspace(1) %b_ptr, align 16
2699  %result = fadd <4 x half> %a, %b
2700  store <4 x half> %result, ptr addrspace(1) %out, align 16
2701  ret void
2702}
2703
; <8 x half> addition of two kernel arguments, stored to %out.
; Both vectors are fetched with one 8-dword scalar load in the assertions
; below.
;   - kaveri (CI) extends all sixteen lanes to f32, performs eight v_add_f32,
;     converts the sums back and repacks pairs with shift + or.
;   - tonga (VI) handles each dword pair with one v_add_f16 for the low lanes
;     and one SDWA v_add_f16 for the high lanes, then ors them together.
;   - gfx1100 needs four packed v_pk_add_f16.
2704define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
2705; CI-LABEL: fadd_v8f16:
2706; CI:       ; %bb.0:
2707; CI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x4
2708; CI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
2709; CI-NEXT:    s_waitcnt lgkmcnt(0)
2710; CI-NEXT:    s_lshr_b32 s10, s0, 16
2711; CI-NEXT:    v_cvt_f32_f16_e32 v4, s0
2712; CI-NEXT:    s_lshr_b32 s0, s4, 16
2713; CI-NEXT:    v_cvt_f32_f16_e32 v8, s0
2714; CI-NEXT:    s_lshr_b32 s0, s5, 16
2715; CI-NEXT:    s_lshr_b32 s11, s1, 16
2716; CI-NEXT:    v_cvt_f32_f16_e32 v9, s0
2717; CI-NEXT:    s_lshr_b32 s0, s6, 16
2718; CI-NEXT:    s_lshr_b32 s12, s2, 16
2719; CI-NEXT:    v_cvt_f32_f16_e32 v0, s10
2720; CI-NEXT:    v_cvt_f32_f16_e32 v1, s11
2721; CI-NEXT:    s_lshr_b32 s10, s3, 16
2722; CI-NEXT:    v_cvt_f32_f16_e32 v10, s0
2723; CI-NEXT:    s_lshr_b32 s0, s7, 16
2724; CI-NEXT:    v_cvt_f32_f16_e32 v2, s12
2725; CI-NEXT:    v_cvt_f32_f16_e32 v3, s10
2726; CI-NEXT:    v_cvt_f32_f16_e32 v5, s1
2727; CI-NEXT:    v_cvt_f32_f16_e32 v11, s0
2728; CI-NEXT:    v_cvt_f32_f16_e32 v12, s4
2729; CI-NEXT:    v_cvt_f32_f16_e32 v13, s5
2730; CI-NEXT:    v_cvt_f32_f16_e32 v6, s2
2731; CI-NEXT:    v_cvt_f32_f16_e32 v7, s3
2732; CI-NEXT:    v_cvt_f32_f16_e32 v14, s7
2733; CI-NEXT:    v_cvt_f32_f16_e32 v15, s6
2734; CI-NEXT:    v_add_f32_e32 v1, v1, v9
2735; CI-NEXT:    v_add_f32_e32 v0, v0, v8
2736; CI-NEXT:    v_add_f32_e32 v3, v3, v11
2737; CI-NEXT:    v_add_f32_e32 v2, v2, v10
2738; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
2739; CI-NEXT:    v_add_f32_e32 v5, v5, v13
2740; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
2741; CI-NEXT:    v_add_f32_e32 v4, v4, v12
2742; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
2743; CI-NEXT:    v_add_f32_e32 v7, v7, v14
2744; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
2745; CI-NEXT:    v_add_f32_e32 v6, v6, v15
2746; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
2747; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
2748; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
2749; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
2750; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2751; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2752; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2753; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2754; CI-NEXT:    v_or_b32_e32 v1, v5, v1
2755; CI-NEXT:    v_or_b32_e32 v0, v4, v0
2756; CI-NEXT:    v_mov_b32_e32 v4, s8
2757; CI-NEXT:    v_or_b32_e32 v3, v7, v3
2758; CI-NEXT:    v_or_b32_e32 v2, v6, v2
2759; CI-NEXT:    v_mov_b32_e32 v5, s9
2760; CI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2761; CI-NEXT:    s_endpgm
2762;
2763; VI-LABEL: fadd_v8f16:
2764; VI:       ; %bb.0:
2765; VI-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x10
2766; VI-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
2767; VI-NEXT:    s_waitcnt lgkmcnt(0)
2768; VI-NEXT:    s_lshr_b32 s10, s7, 16
2769; VI-NEXT:    s_lshr_b32 s11, s3, 16
2770; VI-NEXT:    v_mov_b32_e32 v0, s7
2771; VI-NEXT:    v_mov_b32_e32 v1, s10
2772; VI-NEXT:    v_mov_b32_e32 v2, s11
2773; VI-NEXT:    v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2774; VI-NEXT:    v_add_f16_e32 v0, s3, v0
2775; VI-NEXT:    s_lshr_b32 s3, s6, 16
2776; VI-NEXT:    s_lshr_b32 s7, s2, 16
2777; VI-NEXT:    v_or_b32_e32 v3, v0, v1
2778; VI-NEXT:    v_mov_b32_e32 v0, s3
2779; VI-NEXT:    v_mov_b32_e32 v1, s7
2780; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2781; VI-NEXT:    v_mov_b32_e32 v1, s6
2782; VI-NEXT:    v_add_f16_e32 v1, s2, v1
2783; VI-NEXT:    s_lshr_b32 s2, s5, 16
2784; VI-NEXT:    s_lshr_b32 s3, s1, 16
2785; VI-NEXT:    v_or_b32_e32 v2, v1, v0
2786; VI-NEXT:    v_mov_b32_e32 v0, s2
2787; VI-NEXT:    v_mov_b32_e32 v1, s3
2788; VI-NEXT:    v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2789; VI-NEXT:    v_mov_b32_e32 v1, s5
2790; VI-NEXT:    v_add_f16_e32 v1, s1, v1
2791; VI-NEXT:    s_lshr_b32 s1, s4, 16
2792; VI-NEXT:    s_lshr_b32 s2, s0, 16
2793; VI-NEXT:    v_or_b32_e32 v1, v1, v0
2794; VI-NEXT:    v_mov_b32_e32 v0, s1
2795; VI-NEXT:    v_mov_b32_e32 v4, s2
2796; VI-NEXT:    v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2797; VI-NEXT:    v_mov_b32_e32 v4, s4
2798; VI-NEXT:    v_add_f16_e32 v4, s0, v4
2799; VI-NEXT:    v_or_b32_e32 v0, v4, v0
2800; VI-NEXT:    v_mov_b32_e32 v4, s8
2801; VI-NEXT:    v_mov_b32_e32 v5, s9
2802; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2803; VI-NEXT:    s_endpgm
2804;
2805; GFX11-LABEL: fadd_v8f16:
2806; GFX11:       ; %bb.0:
2807; GFX11-NEXT:    s_clause 0x1
2808; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x10
2809; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
2810; GFX11-NEXT:    v_mov_b32_e32 v4, 0
2811; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2812; GFX11-NEXT:    v_pk_add_f16 v3, s11, s15
2813; GFX11-NEXT:    v_pk_add_f16 v2, s10, s14
2814; GFX11-NEXT:    v_pk_add_f16 v1, s9, s13
2815; GFX11-NEXT:    v_pk_add_f16 v0, s8, s12
2816; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
2817; GFX11-NEXT:    s_endpgm
2818  %add = fadd <8 x half> %a, %b
2819  store <8 x half> %add, ptr addrspace(1) %out, align 32
2820  ret void
2821}
2822
; Bitcast half -> i16. The kernel loads a half from %in, reinterprets its bits
; as i16 and stores that to %out.
; Note the argument order here is (%in, %out), the reverse of the other
; kernels in this part of the file.
; The assertions below expect a plain 16-bit load followed by a 16-bit store
; with no conversion instruction in between. The kaveri and tonga runs share
; one set of checks through the common CIVI prefix.
2823define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
2824; CIVI-LABEL: test_bitcast_from_half:
2825; CIVI:       ; %bb.0:
2826; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2827; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
2828; CIVI-NEXT:    v_mov_b32_e32 v0, s0
2829; CIVI-NEXT:    v_mov_b32_e32 v1, s1
2830; CIVI-NEXT:    flat_load_ushort v2, v[0:1]
2831; CIVI-NEXT:    v_mov_b32_e32 v0, s2
2832; CIVI-NEXT:    v_mov_b32_e32 v1, s3
2833; CIVI-NEXT:    s_waitcnt vmcnt(0)
2834; CIVI-NEXT:    flat_store_short v[0:1], v2
2835; CIVI-NEXT:    s_endpgm
2836;
2837; GFX11-LABEL: test_bitcast_from_half:
2838; GFX11:       ; %bb.0:
2839; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2840; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2841; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2842; GFX11-NEXT:    global_load_u16 v1, v0, s[0:1]
2843; GFX11-NEXT:    s_waitcnt vmcnt(0)
2844; GFX11-NEXT:    global_store_b16 v0, v1, s[2:3]
2845; GFX11-NEXT:    s_endpgm
2846  %val = load half, ptr addrspace(1) %in
2847  %val_int = bitcast half %val to i16
2848  store i16 %val_int, ptr addrspace(1) %out
2849  ret void
2850}
2851
; Bitcast i16 -> half. The kernel loads an i16 from %in, reinterprets its bits
; as half and stores that to %out.
; The assertions below expect a plain 16-bit load followed by a 16-bit store
; with no conversion instruction in between. The kaveri and tonga runs share
; one set of checks through the common CIVI prefix.
2852define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
2853; CIVI-LABEL: test_bitcast_to_half:
2854; CIVI:       ; %bb.0:
2855; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
2856; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
2857; CIVI-NEXT:    v_mov_b32_e32 v0, s2
2858; CIVI-NEXT:    v_mov_b32_e32 v1, s3
2859; CIVI-NEXT:    flat_load_ushort v2, v[0:1]
2860; CIVI-NEXT:    v_mov_b32_e32 v0, s0
2861; CIVI-NEXT:    v_mov_b32_e32 v1, s1
2862; CIVI-NEXT:    s_waitcnt vmcnt(0)
2863; CIVI-NEXT:    flat_store_short v[0:1], v2
2864; CIVI-NEXT:    s_endpgm
2865;
2866; GFX11-LABEL: test_bitcast_to_half:
2867; GFX11:       ; %bb.0:
2868; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
2869; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2870; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2871; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
2872; GFX11-NEXT:    s_waitcnt vmcnt(0)
2873; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
2874; GFX11-NEXT:    s_endpgm
2875  %val = load i16, ptr addrspace(1) %in
2876  %val_fp = bitcast i16 %val to half
2877  store half %val_fp, ptr addrspace(1) %out
2878  ret void
2879}
2880
; Attribute group #0 is referenced by every kernel definition visible in this
; part of the file; it marks them nounwind.
2881attributes #0 = { nounwind }
2882