xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fmul.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
6
; Scalar half multiply: both operands are read with volatile loads from the
; global pointers %a and %b, and the product is stored through %r.
; Per the autogenerated checks below, the tahiti run converts both operands to
; f32, multiplies with v_mul_f32 and converts the result back to f16, while the
; fiji/gfx900 and gfx1100 runs expect a single v_mul_f16. Each load is expected
; with the glc bit set and is followed by its own s_waitcnt vmcnt(0).
7define amdgpu_kernel void @fmul_f16(
8; SI-LABEL: fmul_f16:
9; SI:       ; %bb.0: ; %entry
10; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
11; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s14, s6
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b32 s12, s2
17; SI-NEXT:    s_mov_b32 s13, s3
18; SI-NEXT:    s_mov_b32 s15, s7
19; SI-NEXT:    s_mov_b32 s10, s6
20; SI-NEXT:    s_mov_b32 s11, s7
21; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    s_mov_b32 s4, s0
26; SI-NEXT:    s_mov_b32 s5, s1
27; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
28; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
29; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
30; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
31; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
32; SI-NEXT:    s_endpgm
33;
34; GFX89-LABEL: fmul_f16:
35; GFX89:       ; %bb.0: ; %entry
36; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
37; GFX89-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
38; GFX89-NEXT:    s_mov_b32 s7, 0xf000
39; GFX89-NEXT:    s_mov_b32 s6, -1
40; GFX89-NEXT:    s_mov_b32 s14, s6
41; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX89-NEXT:    s_mov_b32 s12, s2
43; GFX89-NEXT:    s_mov_b32 s13, s3
44; GFX89-NEXT:    s_mov_b32 s15, s7
45; GFX89-NEXT:    s_mov_b32 s10, s6
46; GFX89-NEXT:    s_mov_b32 s11, s7
47; GFX89-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
48; GFX89-NEXT:    s_waitcnt vmcnt(0)
49; GFX89-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
50; GFX89-NEXT:    s_waitcnt vmcnt(0)
51; GFX89-NEXT:    s_mov_b32 s4, s0
52; GFX89-NEXT:    s_mov_b32 s5, s1
53; GFX89-NEXT:    v_mul_f16_e32 v0, v0, v1
54; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
55; GFX89-NEXT:    s_endpgm
56;
57; GFX11-LABEL: fmul_f16:
58; GFX11:       ; %bb.0: ; %entry
59; GFX11-NEXT:    s_clause 0x1
60; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
61; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
62; GFX11-NEXT:    s_mov_b32 s10, -1
63; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
64; GFX11-NEXT:    s_mov_b32 s14, s10
65; GFX11-NEXT:    s_mov_b32 s15, s11
66; GFX11-NEXT:    s_mov_b32 s6, s10
67; GFX11-NEXT:    s_mov_b32 s7, s11
68; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX11-NEXT:    s_mov_b32 s12, s2
70; GFX11-NEXT:    s_mov_b32 s13, s3
71; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
72; GFX11-NEXT:    s_waitcnt vmcnt(0)
73; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
74; GFX11-NEXT:    s_waitcnt vmcnt(0)
75; GFX11-NEXT:    s_mov_b32 s8, s0
76; GFX11-NEXT:    s_mov_b32 s9, s1
77; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
78; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
79; GFX11-NEXT:    s_endpgm
80    ptr addrspace(1) %r,
81    ptr addrspace(1) %a,
82    ptr addrspace(1) %b) {
83entry:
84  %a.val = load volatile half, ptr addrspace(1) %a
85  %b.val = load volatile half, ptr addrspace(1) %b
86  %r.val = fmul half %a.val, %b.val
87  store half %r.val, ptr addrspace(1) %r
88  ret void
89}
90
; Scalar half multiply with the constant 3.0 as the left-hand operand and a
; volatile global load of %b as the right-hand operand; the product is stored
; through %r.
; Per the autogenerated checks below, the constant is expected as a literal
; operand of the multiply: 0x40400000 in the f32 multiply of the tahiti run and
; 0x4200 in the v_mul_f16 of the fiji/gfx900 and gfx1100 runs.
91define amdgpu_kernel void @fmul_f16_imm_a(
92; SI-LABEL: fmul_f16_imm_a:
93; SI:       ; %bb.0: ; %entry
94; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
95; SI-NEXT:    s_mov_b32 s7, 0xf000
96; SI-NEXT:    s_mov_b32 s6, -1
97; SI-NEXT:    s_mov_b32 s10, s6
98; SI-NEXT:    s_mov_b32 s11, s7
99; SI-NEXT:    s_waitcnt lgkmcnt(0)
100; SI-NEXT:    s_mov_b32 s8, s2
101; SI-NEXT:    s_mov_b32 s9, s3
102; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
103; SI-NEXT:    s_waitcnt vmcnt(0)
104; SI-NEXT:    s_mov_b32 s4, s0
105; SI-NEXT:    s_mov_b32 s5, s1
106; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
107; SI-NEXT:    v_mul_f32_e32 v0, 0x40400000, v0
108; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
109; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
110; SI-NEXT:    s_endpgm
111;
112; GFX89-LABEL: fmul_f16_imm_a:
113; GFX89:       ; %bb.0: ; %entry
114; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
115; GFX89-NEXT:    s_mov_b32 s7, 0xf000
116; GFX89-NEXT:    s_mov_b32 s6, -1
117; GFX89-NEXT:    s_mov_b32 s10, s6
118; GFX89-NEXT:    s_mov_b32 s11, s7
119; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX89-NEXT:    s_mov_b32 s8, s2
121; GFX89-NEXT:    s_mov_b32 s9, s3
122; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
123; GFX89-NEXT:    s_waitcnt vmcnt(0)
124; GFX89-NEXT:    s_mov_b32 s4, s0
125; GFX89-NEXT:    s_mov_b32 s5, s1
126; GFX89-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
127; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
128; GFX89-NEXT:    s_endpgm
129;
130; GFX11-LABEL: fmul_f16_imm_a:
131; GFX11:       ; %bb.0: ; %entry
132; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
133; GFX11-NEXT:    s_mov_b32 s6, -1
134; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
135; GFX11-NEXT:    s_mov_b32 s10, s6
136; GFX11-NEXT:    s_mov_b32 s11, s7
137; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX11-NEXT:    s_mov_b32 s8, s2
139; GFX11-NEXT:    s_mov_b32 s9, s3
140; GFX11-NEXT:    s_mov_b32 s4, s0
141; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
142; GFX11-NEXT:    s_waitcnt vmcnt(0)
143; GFX11-NEXT:    s_mov_b32 s5, s1
144; GFX11-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
145; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
146; GFX11-NEXT:    s_endpgm
147    ptr addrspace(1) %r,
148    ptr addrspace(1) %b) {
149entry:
150  %b.val = load volatile half, ptr addrspace(1) %b
151  %r.val = fmul half 3.0, %b.val
152  store half %r.val, ptr addrspace(1) %r
153  ret void
154}
155
; Scalar half multiply with a volatile global load of %a as the left-hand
; operand and the constant 4.0 as the right-hand operand; the product is stored
; through %r.
; Per the autogenerated checks below, every run expects the constant to appear
; as the operand 4.0 directly in the multiply instruction (v_mul_f32 after an
; f16-to-f32 conversion for tahiti, v_mul_f16 for the other targets).
156define amdgpu_kernel void @fmul_f16_imm_b(
157; SI-LABEL: fmul_f16_imm_b:
158; SI:       ; %bb.0: ; %entry
159; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
160; SI-NEXT:    s_mov_b32 s7, 0xf000
161; SI-NEXT:    s_mov_b32 s6, -1
162; SI-NEXT:    s_mov_b32 s10, s6
163; SI-NEXT:    s_mov_b32 s11, s7
164; SI-NEXT:    s_waitcnt lgkmcnt(0)
165; SI-NEXT:    s_mov_b32 s8, s2
166; SI-NEXT:    s_mov_b32 s9, s3
167; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
168; SI-NEXT:    s_waitcnt vmcnt(0)
169; SI-NEXT:    s_mov_b32 s4, s0
170; SI-NEXT:    s_mov_b32 s5, s1
171; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
172; SI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
173; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
174; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
175; SI-NEXT:    s_endpgm
176;
177; GFX89-LABEL: fmul_f16_imm_b:
178; GFX89:       ; %bb.0: ; %entry
179; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
180; GFX89-NEXT:    s_mov_b32 s7, 0xf000
181; GFX89-NEXT:    s_mov_b32 s6, -1
182; GFX89-NEXT:    s_mov_b32 s10, s6
183; GFX89-NEXT:    s_mov_b32 s11, s7
184; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX89-NEXT:    s_mov_b32 s8, s2
186; GFX89-NEXT:    s_mov_b32 s9, s3
187; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
188; GFX89-NEXT:    s_waitcnt vmcnt(0)
189; GFX89-NEXT:    s_mov_b32 s4, s0
190; GFX89-NEXT:    s_mov_b32 s5, s1
191; GFX89-NEXT:    v_mul_f16_e32 v0, 4.0, v0
192; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
193; GFX89-NEXT:    s_endpgm
194;
195; GFX11-LABEL: fmul_f16_imm_b:
196; GFX11:       ; %bb.0: ; %entry
197; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
198; GFX11-NEXT:    s_mov_b32 s6, -1
199; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
200; GFX11-NEXT:    s_mov_b32 s10, s6
201; GFX11-NEXT:    s_mov_b32 s11, s7
202; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX11-NEXT:    s_mov_b32 s8, s2
204; GFX11-NEXT:    s_mov_b32 s9, s3
205; GFX11-NEXT:    s_mov_b32 s4, s0
206; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
207; GFX11-NEXT:    s_waitcnt vmcnt(0)
208; GFX11-NEXT:    s_mov_b32 s5, s1
209; GFX11-NEXT:    v_mul_f16_e32 v0, 4.0, v0
210; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
211; GFX11-NEXT:    s_endpgm
212    ptr addrspace(1) %r,
213    ptr addrspace(1) %a) {
214entry:
215  %a.val = load volatile half, ptr addrspace(1) %a
216  %r.val = fmul half %a.val, 4.0
217  store half %r.val, ptr addrspace(1) %r
218  ret void
219}
220
; <2 x half> multiply of two vectors loaded (non-volatile) from the global
; pointers %a and %b; the product is stored through %r.
; Per the autogenerated checks below:
;  - tahiti splits each dword into its two halves with a 16-bit right shift,
;    converts to f32, multiplies with v_mul_f32, converts back and repacks the
;    result with a 16-bit left shift and v_or_b32;
;  - fiji computes the high half with v_mul_f16_sdwa and the low half with
;    v_mul_f16, then combines them with v_or_b32;
;  - gfx900 and gfx1100 expect a single v_pk_mul_f16.
221define amdgpu_kernel void @fmul_v2f16(
222; SI-LABEL: fmul_v2f16:
223; SI:       ; %bb.0: ; %entry
224; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
225; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
226; SI-NEXT:    s_mov_b32 s7, 0xf000
227; SI-NEXT:    s_mov_b32 s6, -1
228; SI-NEXT:    s_mov_b32 s10, s6
229; SI-NEXT:    s_mov_b32 s11, s7
230; SI-NEXT:    s_waitcnt lgkmcnt(0)
231; SI-NEXT:    s_mov_b32 s12, s2
232; SI-NEXT:    s_mov_b32 s13, s3
233; SI-NEXT:    s_mov_b32 s14, s6
234; SI-NEXT:    s_mov_b32 s15, s7
235; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
236; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
237; SI-NEXT:    s_mov_b32 s4, s0
238; SI-NEXT:    s_mov_b32 s5, s1
239; SI-NEXT:    s_waitcnt vmcnt(1)
240; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
241; SI-NEXT:    s_waitcnt vmcnt(0)
242; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
243; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
244; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
245; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
246; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
247; SI-NEXT:    v_mul_f32_e32 v2, v3, v2
248; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
249; SI-NEXT:    v_mul_f32_e32 v0, v1, v0
250; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
251; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
252; SI-NEXT:    v_or_b32_e32 v0, v0, v1
253; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
254; SI-NEXT:    s_endpgm
255;
256; VI-LABEL: fmul_v2f16:
257; VI:       ; %bb.0: ; %entry
258; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
259; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
260; VI-NEXT:    s_mov_b32 s7, 0xf000
261; VI-NEXT:    s_mov_b32 s6, -1
262; VI-NEXT:    s_mov_b32 s10, s6
263; VI-NEXT:    s_mov_b32 s11, s7
264; VI-NEXT:    s_waitcnt lgkmcnt(0)
265; VI-NEXT:    s_mov_b32 s12, s2
266; VI-NEXT:    s_mov_b32 s13, s3
267; VI-NEXT:    s_mov_b32 s14, s6
268; VI-NEXT:    s_mov_b32 s15, s7
269; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
270; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
271; VI-NEXT:    s_mov_b32 s4, s0
272; VI-NEXT:    s_mov_b32 s5, s1
273; VI-NEXT:    s_waitcnt vmcnt(0)
274; VI-NEXT:    v_mul_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
275; VI-NEXT:    v_mul_f16_e32 v0, v1, v0
276; VI-NEXT:    v_or_b32_e32 v0, v0, v2
277; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
278; VI-NEXT:    s_endpgm
279;
280; GFX9-LABEL: fmul_v2f16:
281; GFX9:       ; %bb.0: ; %entry
282; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
283; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
284; GFX9-NEXT:    s_mov_b32 s7, 0xf000
285; GFX9-NEXT:    s_mov_b32 s6, -1
286; GFX9-NEXT:    s_mov_b32 s14, s6
287; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX9-NEXT:    s_mov_b32 s12, s2
289; GFX9-NEXT:    s_mov_b32 s13, s3
290; GFX9-NEXT:    s_mov_b32 s15, s7
291; GFX9-NEXT:    s_mov_b32 s10, s6
292; GFX9-NEXT:    s_mov_b32 s11, s7
293; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
294; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
295; GFX9-NEXT:    s_mov_b32 s4, s0
296; GFX9-NEXT:    s_mov_b32 s5, s1
297; GFX9-NEXT:    s_waitcnt vmcnt(0)
298; GFX9-NEXT:    v_pk_mul_f16 v0, v0, v1
299; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
300; GFX9-NEXT:    s_endpgm
301;
302; GFX11-LABEL: fmul_v2f16:
303; GFX11:       ; %bb.0: ; %entry
304; GFX11-NEXT:    s_clause 0x1
305; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
306; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
307; GFX11-NEXT:    s_mov_b32 s10, -1
308; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
309; GFX11-NEXT:    s_mov_b32 s14, s10
310; GFX11-NEXT:    s_mov_b32 s15, s11
311; GFX11-NEXT:    s_mov_b32 s6, s10
312; GFX11-NEXT:    s_mov_b32 s7, s11
313; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX11-NEXT:    s_mov_b32 s12, s2
315; GFX11-NEXT:    s_mov_b32 s13, s3
316; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
317; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0
318; GFX11-NEXT:    s_mov_b32 s8, s0
319; GFX11-NEXT:    s_mov_b32 s9, s1
320; GFX11-NEXT:    s_waitcnt vmcnt(0)
321; GFX11-NEXT:    v_pk_mul_f16 v0, v0, v1
322; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
323; GFX11-NEXT:    s_endpgm
324    ptr addrspace(1) %r,
325    ptr addrspace(1) %a,
326    ptr addrspace(1) %b) {
327entry:
328  %a.val = load <2 x half>, ptr addrspace(1) %a
329  %b.val = load <2 x half>, ptr addrspace(1) %b
330  %r.val = fmul <2 x half> %a.val, %b.val
331  store <2 x half> %r.val, ptr addrspace(1) %r
332  ret void
333}
334
; <2 x half> multiply with the constant vector <3.0, 4.0> as the left-hand
; operand and a vector loaded from the global pointer %b as the right-hand
; operand; the product is stored through %r.
; Per the autogenerated checks below:
;  - tahiti multiplies the high element by 4.0 and the low element by the
;    literal 0x40400000, both in f32;
;  - fiji moves 0x4400 into a VGPR for the v_mul_f16_sdwa of the high element
;    and uses the literal 0x4200 for the low element;
;  - gfx900 materializes the packed constant 0x44004200 in an SGPR for
;    v_pk_mul_f16, while gfx1100 uses it directly as a literal operand.
335define amdgpu_kernel void @fmul_v2f16_imm_a(
336; SI-LABEL: fmul_v2f16_imm_a:
337; SI:       ; %bb.0: ; %entry
338; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
339; SI-NEXT:    s_mov_b32 s7, 0xf000
340; SI-NEXT:    s_mov_b32 s6, -1
341; SI-NEXT:    s_mov_b32 s10, s6
342; SI-NEXT:    s_mov_b32 s11, s7
343; SI-NEXT:    s_waitcnt lgkmcnt(0)
344; SI-NEXT:    s_mov_b32 s8, s2
345; SI-NEXT:    s_mov_b32 s9, s3
346; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
347; SI-NEXT:    s_mov_b32 s4, s0
348; SI-NEXT:    s_mov_b32 s5, s1
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
351; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
352; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
353; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v1
354; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
355; SI-NEXT:    v_mul_f32_e32 v0, 0x40400000, v0
356; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
357; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
358; SI-NEXT:    v_or_b32_e32 v0, v0, v1
359; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
360; SI-NEXT:    s_endpgm
361;
362; VI-LABEL: fmul_v2f16_imm_a:
363; VI:       ; %bb.0: ; %entry
364; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
365; VI-NEXT:    s_mov_b32 s7, 0xf000
366; VI-NEXT:    s_mov_b32 s6, -1
367; VI-NEXT:    s_mov_b32 s10, s6
368; VI-NEXT:    s_mov_b32 s11, s7
369; VI-NEXT:    s_waitcnt lgkmcnt(0)
370; VI-NEXT:    s_mov_b32 s8, s2
371; VI-NEXT:    s_mov_b32 s9, s3
372; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
373; VI-NEXT:    v_mov_b32_e32 v1, 0x4400
374; VI-NEXT:    s_mov_b32 s4, s0
375; VI-NEXT:    s_mov_b32 s5, s1
376; VI-NEXT:    s_waitcnt vmcnt(0)
377; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
378; VI-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
379; VI-NEXT:    v_or_b32_e32 v0, v0, v1
380; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
381; VI-NEXT:    s_endpgm
382;
383; GFX9-LABEL: fmul_v2f16_imm_a:
384; GFX9:       ; %bb.0: ; %entry
385; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
386; GFX9-NEXT:    s_mov_b32 s7, 0xf000
387; GFX9-NEXT:    s_mov_b32 s6, -1
388; GFX9-NEXT:    s_mov_b32 s10, s6
389; GFX9-NEXT:    s_mov_b32 s11, s7
390; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
391; GFX9-NEXT:    s_mov_b32 s8, s2
392; GFX9-NEXT:    s_mov_b32 s9, s3
393; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
394; GFX9-NEXT:    s_mov_b32 s4, s0
395; GFX9-NEXT:    s_mov_b32 s0, 0x44004200
396; GFX9-NEXT:    s_mov_b32 s5, s1
397; GFX9-NEXT:    s_waitcnt vmcnt(0)
398; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
399; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
400; GFX9-NEXT:    s_endpgm
401;
402; GFX11-LABEL: fmul_v2f16_imm_a:
403; GFX11:       ; %bb.0: ; %entry
404; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
405; GFX11-NEXT:    s_mov_b32 s6, -1
406; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
407; GFX11-NEXT:    s_mov_b32 s10, s6
408; GFX11-NEXT:    s_mov_b32 s11, s7
409; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX11-NEXT:    s_mov_b32 s8, s2
411; GFX11-NEXT:    s_mov_b32 s9, s3
412; GFX11-NEXT:    s_mov_b32 s4, s0
413; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
414; GFX11-NEXT:    s_mov_b32 s5, s1
415; GFX11-NEXT:    s_waitcnt vmcnt(0)
416; GFX11-NEXT:    v_pk_mul_f16 v0, 0x44004200, v0
417; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
418; GFX11-NEXT:    s_endpgm
419    ptr addrspace(1) %r,
420    ptr addrspace(1) %b) {
421entry:
422  %b.val = load <2 x half>, ptr addrspace(1) %b
423  %r.val = fmul <2 x half> <half 3.0, half 4.0>, %b.val
424  store <2 x half> %r.val, ptr addrspace(1) %r
425  ret void
426}
427
; <2 x half> multiply with a vector loaded from the global pointer %a as the
; left-hand operand and the constant vector <4.0, 3.0> as the right-hand
; operand; the product is stored through %r.
; Per the autogenerated checks below:
;  - tahiti multiplies the high element by the literal 0x40400000 and the low
;    element by 4.0, both in f32;
;  - fiji moves 0x4200 into a VGPR for the v_mul_f16_sdwa of the high element
;    and multiplies the low element by 4.0;
;  - gfx900 materializes the packed constant 0x42004400 in an SGPR for
;    v_pk_mul_f16, while gfx1100 uses it directly as a literal operand.
428define amdgpu_kernel void @fmul_v2f16_imm_b(
429; SI-LABEL: fmul_v2f16_imm_b:
430; SI:       ; %bb.0: ; %entry
431; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
432; SI-NEXT:    s_mov_b32 s7, 0xf000
433; SI-NEXT:    s_mov_b32 s6, -1
434; SI-NEXT:    s_mov_b32 s10, s6
435; SI-NEXT:    s_mov_b32 s11, s7
436; SI-NEXT:    s_waitcnt lgkmcnt(0)
437; SI-NEXT:    s_mov_b32 s8, s2
438; SI-NEXT:    s_mov_b32 s9, s3
439; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
440; SI-NEXT:    s_mov_b32 s4, s0
441; SI-NEXT:    s_mov_b32 s5, s1
442; SI-NEXT:    s_waitcnt vmcnt(0)
443; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
444; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
445; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
446; SI-NEXT:    v_mul_f32_e32 v1, 0x40400000, v1
447; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
448; SI-NEXT:    v_mul_f32_e32 v0, 4.0, v0
449; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
450; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
451; SI-NEXT:    v_or_b32_e32 v0, v0, v1
452; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
453; SI-NEXT:    s_endpgm
454;
455; VI-LABEL: fmul_v2f16_imm_b:
456; VI:       ; %bb.0: ; %entry
457; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
458; VI-NEXT:    s_mov_b32 s7, 0xf000
459; VI-NEXT:    s_mov_b32 s6, -1
460; VI-NEXT:    s_mov_b32 s10, s6
461; VI-NEXT:    s_mov_b32 s11, s7
462; VI-NEXT:    s_waitcnt lgkmcnt(0)
463; VI-NEXT:    s_mov_b32 s8, s2
464; VI-NEXT:    s_mov_b32 s9, s3
465; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
466; VI-NEXT:    v_mov_b32_e32 v1, 0x4200
467; VI-NEXT:    s_mov_b32 s4, s0
468; VI-NEXT:    s_mov_b32 s5, s1
469; VI-NEXT:    s_waitcnt vmcnt(0)
470; VI-NEXT:    v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
471; VI-NEXT:    v_mul_f16_e32 v0, 4.0, v0
472; VI-NEXT:    v_or_b32_e32 v0, v0, v1
473; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
474; VI-NEXT:    s_endpgm
475;
476; GFX9-LABEL: fmul_v2f16_imm_b:
477; GFX9:       ; %bb.0: ; %entry
478; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
479; GFX9-NEXT:    s_mov_b32 s7, 0xf000
480; GFX9-NEXT:    s_mov_b32 s6, -1
481; GFX9-NEXT:    s_mov_b32 s10, s6
482; GFX9-NEXT:    s_mov_b32 s11, s7
483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX9-NEXT:    s_mov_b32 s8, s2
485; GFX9-NEXT:    s_mov_b32 s9, s3
486; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
487; GFX9-NEXT:    s_mov_b32 s4, s0
488; GFX9-NEXT:    s_mov_b32 s0, 0x42004400
489; GFX9-NEXT:    s_mov_b32 s5, s1
490; GFX9-NEXT:    s_waitcnt vmcnt(0)
491; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s0
492; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
493; GFX9-NEXT:    s_endpgm
494;
495; GFX11-LABEL: fmul_v2f16_imm_b:
496; GFX11:       ; %bb.0: ; %entry
497; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
498; GFX11-NEXT:    s_mov_b32 s6, -1
499; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
500; GFX11-NEXT:    s_mov_b32 s10, s6
501; GFX11-NEXT:    s_mov_b32 s11, s7
502; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX11-NEXT:    s_mov_b32 s8, s2
504; GFX11-NEXT:    s_mov_b32 s9, s3
505; GFX11-NEXT:    s_mov_b32 s4, s0
506; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
507; GFX11-NEXT:    s_mov_b32 s5, s1
508; GFX11-NEXT:    s_waitcnt vmcnt(0)
509; GFX11-NEXT:    v_pk_mul_f16 v0, 0x42004400, v0
510; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
511; GFX11-NEXT:    s_endpgm
512    ptr addrspace(1) %r,
513    ptr addrspace(1) %a) {
514entry:
515  %a.val = load <2 x half>, ptr addrspace(1) %a
516  %r.val = fmul <2 x half> %a.val, <half 4.0, half 3.0>
517  store <2 x half> %r.val, ptr addrspace(1) %r
518  ret void
519}
520
; <4 x half> multiply of two vectors loaded (non-volatile) from the global
; pointers %a and %b; the product is stored through %r.
; Per the autogenerated checks below, each operand is loaded as two dwords and:
;  - tahiti converts all eight half elements to f32, performs four v_mul_f32,
;    converts back and repacks each dword with a 16-bit left shift and v_or_b32;
;  - fiji uses one v_mul_f16_sdwa (high half) plus one v_mul_f16 (low half) per
;    dword, combined with v_or_b32;
;  - gfx900 and gfx1100 expect two v_pk_mul_f16, one per dword.
521define amdgpu_kernel void @fmul_v4f16(
522; SI-LABEL: fmul_v4f16:
523; SI:       ; %bb.0: ; %entry
524; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
525; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
526; SI-NEXT:    s_mov_b32 s3, 0xf000
527; SI-NEXT:    s_mov_b32 s2, -1
528; SI-NEXT:    s_mov_b32 s6, s2
529; SI-NEXT:    s_waitcnt lgkmcnt(0)
530; SI-NEXT:    s_mov_b32 s12, s10
531; SI-NEXT:    s_mov_b32 s7, s3
532; SI-NEXT:    s_mov_b32 s13, s11
533; SI-NEXT:    s_mov_b32 s14, s2
534; SI-NEXT:    s_mov_b32 s15, s3
535; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
536; SI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
537; SI-NEXT:    s_mov_b32 s0, s8
538; SI-NEXT:    s_mov_b32 s1, s9
539; SI-NEXT:    s_waitcnt vmcnt(1)
540; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
541; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
542; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
543; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
544; SI-NEXT:    s_waitcnt vmcnt(0)
545; SI-NEXT:    v_cvt_f32_f16_e32 v6, v2
546; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
547; SI-NEXT:    v_cvt_f32_f16_e32 v7, v3
548; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
549; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
550; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
551; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
552; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
553; SI-NEXT:    v_mul_f32_e32 v5, v7, v5
554; SI-NEXT:    v_mul_f32_e32 v4, v6, v4
555; SI-NEXT:    v_mul_f32_e32 v1, v3, v1
556; SI-NEXT:    v_mul_f32_e32 v0, v2, v0
557; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
558; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
559; SI-NEXT:    v_cvt_f16_f32_e32 v2, v5
560; SI-NEXT:    v_cvt_f16_f32_e32 v3, v4
561; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
562; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
563; SI-NEXT:    v_or_b32_e32 v1, v2, v1
564; SI-NEXT:    v_or_b32_e32 v0, v3, v0
565; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
566; SI-NEXT:    s_endpgm
567;
568; VI-LABEL: fmul_v4f16:
569; VI:       ; %bb.0: ; %entry
570; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
571; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
572; VI-NEXT:    s_mov_b32 s7, 0xf000
573; VI-NEXT:    s_mov_b32 s6, -1
574; VI-NEXT:    s_mov_b32 s10, s6
575; VI-NEXT:    s_mov_b32 s11, s7
576; VI-NEXT:    s_waitcnt lgkmcnt(0)
577; VI-NEXT:    s_mov_b32 s12, s2
578; VI-NEXT:    s_mov_b32 s13, s3
579; VI-NEXT:    s_mov_b32 s14, s6
580; VI-NEXT:    s_mov_b32 s15, s7
581; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
582; VI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
583; VI-NEXT:    s_mov_b32 s4, s0
584; VI-NEXT:    s_mov_b32 s5, s1
585; VI-NEXT:    s_waitcnt vmcnt(0)
586; VI-NEXT:    v_mul_f16_sdwa v4, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
587; VI-NEXT:    v_mul_f16_e32 v1, v3, v1
588; VI-NEXT:    v_mul_f16_sdwa v3, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
589; VI-NEXT:    v_mul_f16_e32 v0, v2, v0
590; VI-NEXT:    v_or_b32_e32 v1, v1, v4
591; VI-NEXT:    v_or_b32_e32 v0, v0, v3
592; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
593; VI-NEXT:    s_endpgm
594;
595; GFX9-LABEL: fmul_v4f16:
596; GFX9:       ; %bb.0: ; %entry
597; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
598; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
599; GFX9-NEXT:    s_mov_b32 s7, 0xf000
600; GFX9-NEXT:    s_mov_b32 s6, -1
601; GFX9-NEXT:    s_mov_b32 s10, s6
602; GFX9-NEXT:    s_mov_b32 s11, s7
603; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
604; GFX9-NEXT:    s_mov_b32 s12, s2
605; GFX9-NEXT:    s_mov_b32 s13, s3
606; GFX9-NEXT:    s_mov_b32 s14, s6
607; GFX9-NEXT:    s_mov_b32 s15, s7
608; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
609; GFX9-NEXT:    buffer_load_dwordx2 v[2:3], off, s[12:15], 0
610; GFX9-NEXT:    s_mov_b32 s4, s0
611; GFX9-NEXT:    s_mov_b32 s5, s1
612; GFX9-NEXT:    s_waitcnt vmcnt(0)
613; GFX9-NEXT:    v_pk_mul_f16 v1, v3, v1
614; GFX9-NEXT:    v_pk_mul_f16 v0, v2, v0
615; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
616; GFX9-NEXT:    s_endpgm
617;
618; GFX11-LABEL: fmul_v4f16:
619; GFX11:       ; %bb.0: ; %entry
620; GFX11-NEXT:    s_clause 0x1
621; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
622; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
623; GFX11-NEXT:    s_mov_b32 s10, -1
624; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
625; GFX11-NEXT:    s_mov_b32 s6, s10
626; GFX11-NEXT:    s_mov_b32 s7, s11
627; GFX11-NEXT:    s_mov_b32 s14, s10
628; GFX11-NEXT:    s_mov_b32 s15, s11
629; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX11-NEXT:    s_mov_b32 s12, s2
631; GFX11-NEXT:    s_mov_b32 s13, s3
632; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[4:7], 0
633; GFX11-NEXT:    buffer_load_b64 v[2:3], off, s[12:15], 0
634; GFX11-NEXT:    s_mov_b32 s8, s0
635; GFX11-NEXT:    s_mov_b32 s9, s1
636; GFX11-NEXT:    s_waitcnt vmcnt(0)
637; GFX11-NEXT:    v_pk_mul_f16 v1, v3, v1
638; GFX11-NEXT:    v_pk_mul_f16 v0, v2, v0
639; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[8:11], 0
640; GFX11-NEXT:    s_endpgm
641    ptr addrspace(1) %r,
642    ptr addrspace(1) %a,
643    ptr addrspace(1) %b) {
644entry:
645  %a.val = load <4 x half>, ptr addrspace(1) %a
646  %b.val = load <4 x half>, ptr addrspace(1) %b
647  %r.val = fmul <4 x half> %a.val, %b.val
648  store <4 x half> %r.val, ptr addrspace(1) %r
649  ret void
650}
651
; <4 x half> multiply with the constant vector <8.0, 2.0, 3.0, 4.0> as the
; left-hand operand and a vector loaded from the global pointer %b as the
; right-hand operand; the product is stored through %r.
; Per the autogenerated checks below:
;  - tahiti and fiji expect the multiply by 2.0 (element 1) to be emitted as an
;    add of the value to itself (v_add_f32 / v_add_f16_sdwa); the remaining
;    elements use multiplies with literal or 4.0 operands;
;  - gfx900 materializes the packed constants 0x44004200 and 0x40004800 in
;    SGPRs for two v_pk_mul_f16, while gfx1100 uses them directly as literal
;    operands.
652define amdgpu_kernel void @fmul_v4f16_imm_a(
653; SI-LABEL: fmul_v4f16_imm_a:
654; SI:       ; %bb.0: ; %entry
655; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
656; SI-NEXT:    s_mov_b32 s7, 0xf000
657; SI-NEXT:    s_mov_b32 s6, -1
658; SI-NEXT:    s_mov_b32 s10, s6
659; SI-NEXT:    s_mov_b32 s11, s7
660; SI-NEXT:    s_waitcnt lgkmcnt(0)
661; SI-NEXT:    s_mov_b32 s8, s2
662; SI-NEXT:    s_mov_b32 s9, s3
663; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
664; SI-NEXT:    s_mov_b32 s4, s0
665; SI-NEXT:    s_mov_b32 s5, s1
666; SI-NEXT:    s_waitcnt vmcnt(0)
667; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
668; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
669; SI-NEXT:    v_cvt_f32_f16_e32 v3, v1
670; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
671; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
672; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
673; SI-NEXT:    v_mul_f32_e32 v3, 0x40400000, v3
674; SI-NEXT:    v_mul_f32_e32 v2, 0x41000000, v2
675; SI-NEXT:    v_mul_f32_e32 v1, 4.0, v1
676; SI-NEXT:    v_add_f32_e32 v0, v0, v0
677; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
678; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
679; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
680; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
681; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
682; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
683; SI-NEXT:    v_or_b32_e32 v1, v3, v1
684; SI-NEXT:    v_or_b32_e32 v0, v2, v0
685; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
686; SI-NEXT:    s_endpgm
687;
688; VI-LABEL: fmul_v4f16_imm_a:
689; VI:       ; %bb.0: ; %entry
690; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
691; VI-NEXT:    s_mov_b32 s7, 0xf000
692; VI-NEXT:    s_mov_b32 s6, -1
693; VI-NEXT:    s_mov_b32 s10, s6
694; VI-NEXT:    s_mov_b32 s11, s7
695; VI-NEXT:    s_waitcnt lgkmcnt(0)
696; VI-NEXT:    s_mov_b32 s8, s2
697; VI-NEXT:    s_mov_b32 s9, s3
698; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
699; VI-NEXT:    v_mov_b32_e32 v2, 0x4400
700; VI-NEXT:    s_mov_b32 s4, s0
701; VI-NEXT:    s_mov_b32 s5, s1
702; VI-NEXT:    s_waitcnt vmcnt(0)
703; VI-NEXT:    v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
704; VI-NEXT:    v_mul_f16_e32 v1, 0x4200, v1
705; VI-NEXT:    v_add_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
706; VI-NEXT:    v_mul_f16_e32 v0, 0x4800, v0
707; VI-NEXT:    v_or_b32_e32 v1, v1, v2
708; VI-NEXT:    v_or_b32_e32 v0, v0, v3
709; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
710; VI-NEXT:    s_endpgm
711;
712; GFX9-LABEL: fmul_v4f16_imm_a:
713; GFX9:       ; %bb.0: ; %entry
714; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
715; GFX9-NEXT:    s_mov_b32 s7, 0xf000
716; GFX9-NEXT:    s_mov_b32 s6, -1
717; GFX9-NEXT:    s_mov_b32 s10, s6
718; GFX9-NEXT:    s_mov_b32 s11, s7
719; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
720; GFX9-NEXT:    s_mov_b32 s8, s2
721; GFX9-NEXT:    s_mov_b32 s9, s3
722; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
723; GFX9-NEXT:    s_mov_b32 s2, 0x44004200
724; GFX9-NEXT:    s_mov_b32 s3, 0x40004800
725; GFX9-NEXT:    s_mov_b32 s4, s0
726; GFX9-NEXT:    s_mov_b32 s5, s1
727; GFX9-NEXT:    s_waitcnt vmcnt(0)
728; GFX9-NEXT:    v_pk_mul_f16 v1, v1, s2
729; GFX9-NEXT:    v_pk_mul_f16 v0, v0, s3
730; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
731; GFX9-NEXT:    s_endpgm
732;
733; GFX11-LABEL: fmul_v4f16_imm_a:
734; GFX11:       ; %bb.0: ; %entry
735; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
736; GFX11-NEXT:    s_mov_b32 s6, -1
737; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
738; GFX11-NEXT:    s_mov_b32 s10, s6
739; GFX11-NEXT:    s_mov_b32 s11, s7
740; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
741; GFX11-NEXT:    s_mov_b32 s8, s2
742; GFX11-NEXT:    s_mov_b32 s9, s3
743; GFX11-NEXT:    s_mov_b32 s4, s0
744; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[8:11], 0
745; GFX11-NEXT:    s_mov_b32 s5, s1
746; GFX11-NEXT:    s_waitcnt vmcnt(0)
747; GFX11-NEXT:    v_pk_mul_f16 v1, 0x44004200, v1
748; GFX11-NEXT:    v_pk_mul_f16 v0, 0x40004800, v0
749; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
750; GFX11-NEXT:    s_endpgm
751    ptr addrspace(1) %r,
752    ptr addrspace(1) %b) {
753entry:
754  %b.val = load <4 x half>, ptr addrspace(1) %b
755  %r.val = fmul <4 x half> <half 8.0, half 2.0, half 3.0, half 4.0>, %b.val
756  store <4 x half> %r.val, ptr addrspace(1) %r
757  ret void
758}
759