xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fsub.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
4; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
5; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
6
; Scalar f16 subtraction with both operands loaded from global memory
; (%r = %a - %b). Expected selection, per the autogenerated checks below:
;   - tahiti (SI prefix) widens both inputs with v_cvt_f32_f16, subtracts with
;     v_sub_f32 and narrows the result again with v_cvt_f16_f32;
;   - fiji and gfx900 (shared GFX89 prefix) and gfx1100 (GFX11 prefix) emit a
;     single v_sub_f16.
; The check lines are generated by utils/update_llc_test_checks.py; regenerate
; them with that script instead of editing them by hand.
7define amdgpu_kernel void @fsub_f16(
8; SI-LABEL: fsub_f16:
9; SI:       ; %bb.0: ; %entry
10; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
11; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s14, s6
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_mov_b32 s12, s2
17; SI-NEXT:    s_mov_b32 s13, s3
18; SI-NEXT:    s_mov_b32 s15, s7
19; SI-NEXT:    s_mov_b32 s10, s6
20; SI-NEXT:    s_mov_b32 s11, s7
21; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    s_mov_b32 s4, s0
26; SI-NEXT:    s_mov_b32 s5, s1
27; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
28; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
29; SI-NEXT:    v_sub_f32_e32 v0, v0, v1
30; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
31; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
32; SI-NEXT:    s_endpgm
33;
34; GFX89-LABEL: fsub_f16:
35; GFX89:       ; %bb.0: ; %entry
36; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
37; GFX89-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
38; GFX89-NEXT:    s_mov_b32 s7, 0xf000
39; GFX89-NEXT:    s_mov_b32 s6, -1
40; GFX89-NEXT:    s_mov_b32 s14, s6
41; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX89-NEXT:    s_mov_b32 s12, s2
43; GFX89-NEXT:    s_mov_b32 s13, s3
44; GFX89-NEXT:    s_mov_b32 s15, s7
45; GFX89-NEXT:    s_mov_b32 s10, s6
46; GFX89-NEXT:    s_mov_b32 s11, s7
47; GFX89-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
48; GFX89-NEXT:    s_waitcnt vmcnt(0)
49; GFX89-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
50; GFX89-NEXT:    s_waitcnt vmcnt(0)
51; GFX89-NEXT:    s_mov_b32 s4, s0
52; GFX89-NEXT:    s_mov_b32 s5, s1
53; GFX89-NEXT:    v_sub_f16_e32 v0, v0, v1
54; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
55; GFX89-NEXT:    s_endpgm
56;
57; GFX11-LABEL: fsub_f16:
58; GFX11:       ; %bb.0: ; %entry
59; GFX11-NEXT:    s_clause 0x1
60; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
61; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
62; GFX11-NEXT:    s_mov_b32 s10, -1
63; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
64; GFX11-NEXT:    s_mov_b32 s14, s10
65; GFX11-NEXT:    s_mov_b32 s15, s11
66; GFX11-NEXT:    s_mov_b32 s6, s10
67; GFX11-NEXT:    s_mov_b32 s7, s11
68; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX11-NEXT:    s_mov_b32 s12, s2
70; GFX11-NEXT:    s_mov_b32 s13, s3
71; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
72; GFX11-NEXT:    s_waitcnt vmcnt(0)
73; GFX11-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
74; GFX11-NEXT:    s_waitcnt vmcnt(0)
75; GFX11-NEXT:    s_mov_b32 s8, s0
76; GFX11-NEXT:    s_mov_b32 s9, s1
77; GFX11-NEXT:    v_sub_f16_e32 v0, v0, v1
78; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
79; GFX11-NEXT:    s_endpgm
80    ptr addrspace(1) %r,
81    ptr addrspace(1) %a,
82    ptr addrspace(1) %b) {
83entry:
  ; Both loads are volatile. In the expected output each one is a separate
  ; 16-bit buffer load carrying glc, immediately followed by its own
  ; s_waitcnt vmcnt(0).
84  %a.val = load volatile half, ptr addrspace(1) %a
85  %b.val = load volatile half, ptr addrspace(1) %b
86  %r.val = fsub half %a.val, %b.val
87  store half %r.val, ptr addrspace(1) %r
88  ret void
89}
90
; Scalar f16 subtraction whose left-hand operand is the constant 1.0
; (%r = 1.0 - %b). In the expected output the constant is folded straight into
; the first source operand of the subtract:
;   - tahiti (SI prefix): v_cvt_f32_f16, then v_sub_f32 1.0, v0, then
;     v_cvt_f16_f32;
;   - fiji/gfx900 (GFX89 prefix) and gfx1100 (GFX11 prefix): v_sub_f16 1.0, v0.
; No separate instruction materializes the constant on any target.
91define amdgpu_kernel void @fsub_f16_imm_a(
92; SI-LABEL: fsub_f16_imm_a:
93; SI:       ; %bb.0: ; %entry
94; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
95; SI-NEXT:    s_mov_b32 s7, 0xf000
96; SI-NEXT:    s_mov_b32 s6, -1
97; SI-NEXT:    s_mov_b32 s10, s6
98; SI-NEXT:    s_mov_b32 s11, s7
99; SI-NEXT:    s_waitcnt lgkmcnt(0)
100; SI-NEXT:    s_mov_b32 s8, s2
101; SI-NEXT:    s_mov_b32 s9, s3
102; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
103; SI-NEXT:    s_waitcnt vmcnt(0)
104; SI-NEXT:    s_mov_b32 s4, s0
105; SI-NEXT:    s_mov_b32 s5, s1
106; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
107; SI-NEXT:    v_sub_f32_e32 v0, 1.0, v0
108; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
109; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
110; SI-NEXT:    s_endpgm
111;
112; GFX89-LABEL: fsub_f16_imm_a:
113; GFX89:       ; %bb.0: ; %entry
114; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
115; GFX89-NEXT:    s_mov_b32 s7, 0xf000
116; GFX89-NEXT:    s_mov_b32 s6, -1
117; GFX89-NEXT:    s_mov_b32 s10, s6
118; GFX89-NEXT:    s_mov_b32 s11, s7
119; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX89-NEXT:    s_mov_b32 s8, s2
121; GFX89-NEXT:    s_mov_b32 s9, s3
122; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
123; GFX89-NEXT:    s_waitcnt vmcnt(0)
124; GFX89-NEXT:    s_mov_b32 s4, s0
125; GFX89-NEXT:    s_mov_b32 s5, s1
126; GFX89-NEXT:    v_sub_f16_e32 v0, 1.0, v0
127; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
128; GFX89-NEXT:    s_endpgm
129;
130; GFX11-LABEL: fsub_f16_imm_a:
131; GFX11:       ; %bb.0: ; %entry
132; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
133; GFX11-NEXT:    s_mov_b32 s6, -1
134; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
135; GFX11-NEXT:    s_mov_b32 s10, s6
136; GFX11-NEXT:    s_mov_b32 s11, s7
137; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
138; GFX11-NEXT:    s_mov_b32 s8, s2
139; GFX11-NEXT:    s_mov_b32 s9, s3
140; GFX11-NEXT:    s_mov_b32 s4, s0
141; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
142; GFX11-NEXT:    s_waitcnt vmcnt(0)
143; GFX11-NEXT:    s_mov_b32 s5, s1
144; GFX11-NEXT:    v_sub_f16_e32 v0, 1.0, v0
145; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
146; GFX11-NEXT:    s_endpgm
147    ptr addrspace(1) %r,
148    ptr addrspace(1) %b) {
149entry:
  ; Volatile load: the expected output keeps it as a glc buffer load with an
  ; s_waitcnt vmcnt(0) right after it.
150  %b.val = load volatile half, ptr addrspace(1) %b
151  %r.val = fsub half 1.0, %b.val
152  store half %r.val, ptr addrspace(1) %r
153  ret void
154}
155
; Scalar f16 subtraction whose right-hand operand is the constant 2.0
; (%r = %a - 2.0). The expected output contains no subtract instruction on any
; target; each one adds the negated constant instead:
;   - tahiti (SI prefix): v_cvt_f32_f16, then v_add_f32 -2.0, v0, then
;     v_cvt_f16_f32;
;   - fiji/gfx900 (GFX89 prefix) and gfx1100 (GFX11 prefix): v_add_f16 -2.0, v0.
156define amdgpu_kernel void @fsub_f16_imm_b(
157; SI-LABEL: fsub_f16_imm_b:
158; SI:       ; %bb.0: ; %entry
159; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
160; SI-NEXT:    s_mov_b32 s7, 0xf000
161; SI-NEXT:    s_mov_b32 s6, -1
162; SI-NEXT:    s_mov_b32 s10, s6
163; SI-NEXT:    s_mov_b32 s11, s7
164; SI-NEXT:    s_waitcnt lgkmcnt(0)
165; SI-NEXT:    s_mov_b32 s8, s2
166; SI-NEXT:    s_mov_b32 s9, s3
167; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
168; SI-NEXT:    s_waitcnt vmcnt(0)
169; SI-NEXT:    s_mov_b32 s4, s0
170; SI-NEXT:    s_mov_b32 s5, s1
171; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
172; SI-NEXT:    v_add_f32_e32 v0, -2.0, v0
173; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
174; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
175; SI-NEXT:    s_endpgm
176;
177; GFX89-LABEL: fsub_f16_imm_b:
178; GFX89:       ; %bb.0: ; %entry
179; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
180; GFX89-NEXT:    s_mov_b32 s7, 0xf000
181; GFX89-NEXT:    s_mov_b32 s6, -1
182; GFX89-NEXT:    s_mov_b32 s10, s6
183; GFX89-NEXT:    s_mov_b32 s11, s7
184; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
185; GFX89-NEXT:    s_mov_b32 s8, s2
186; GFX89-NEXT:    s_mov_b32 s9, s3
187; GFX89-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 glc
188; GFX89-NEXT:    s_waitcnt vmcnt(0)
189; GFX89-NEXT:    s_mov_b32 s4, s0
190; GFX89-NEXT:    s_mov_b32 s5, s1
191; GFX89-NEXT:    v_add_f16_e32 v0, -2.0, v0
192; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
193; GFX89-NEXT:    s_endpgm
194;
195; GFX11-LABEL: fsub_f16_imm_b:
196; GFX11:       ; %bb.0: ; %entry
197; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
198; GFX11-NEXT:    s_mov_b32 s6, -1
199; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
200; GFX11-NEXT:    s_mov_b32 s10, s6
201; GFX11-NEXT:    s_mov_b32 s11, s7
202; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX11-NEXT:    s_mov_b32 s8, s2
204; GFX11-NEXT:    s_mov_b32 s9, s3
205; GFX11-NEXT:    s_mov_b32 s4, s0
206; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
207; GFX11-NEXT:    s_waitcnt vmcnt(0)
208; GFX11-NEXT:    s_mov_b32 s5, s1
209; GFX11-NEXT:    v_add_f16_e32 v0, -2.0, v0
210; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
211; GFX11-NEXT:    s_endpgm
212    ptr addrspace(1) %r,
213    ptr addrspace(1) %a) {
214entry:
  ; Volatile load: the expected output keeps it as a glc buffer load with an
  ; s_waitcnt vmcnt(0) right after it.
215  %a.val = load volatile half, ptr addrspace(1) %a
216  %r.val = fsub half %a.val, 2.0
217  store half %r.val, ptr addrspace(1) %r
218  ret void
219}
220
; <2 x half> subtraction with both operands loaded from global memory
; (%r = %a - %b). fiji and gfx900 diverge here, so they are checked under
; their own VI and GFX9 prefixes instead of the shared GFX89 one. Expected
; selection, per the autogenerated checks below:
;   - tahiti (SI prefix): splits each 32-bit value with v_lshrrev_b32 16,
;     converts all four halves to f32, performs two v_sub_f32, converts back
;     and repacks with v_lshlrev_b32 16 + v_or_b32;
;   - fiji (VI prefix): v_sub_f16_sdwa on WORD_1 of both sources (writing
;     WORD_1) for the high element, v_sub_f16 for the low element, then
;     v_or_b32 to combine them;
;   - gfx900 (GFX9 prefix) and gfx1100 (GFX11 prefix): one v_pk_add_f16 with
;     neg_lo:[0,1] neg_hi:[0,1], i.e. negation modifiers set on the second
;     source operand.
221define amdgpu_kernel void @fsub_v2f16(
222; SI-LABEL: fsub_v2f16:
223; SI:       ; %bb.0: ; %entry
224; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
225; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
226; SI-NEXT:    s_mov_b32 s7, 0xf000
227; SI-NEXT:    s_mov_b32 s6, -1
228; SI-NEXT:    s_mov_b32 s10, s6
229; SI-NEXT:    s_mov_b32 s11, s7
230; SI-NEXT:    s_waitcnt lgkmcnt(0)
231; SI-NEXT:    s_mov_b32 s12, s2
232; SI-NEXT:    s_mov_b32 s13, s3
233; SI-NEXT:    s_mov_b32 s14, s6
234; SI-NEXT:    s_mov_b32 s15, s7
235; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
236; SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
237; SI-NEXT:    s_mov_b32 s4, s0
238; SI-NEXT:    s_mov_b32 s5, s1
239; SI-NEXT:    s_waitcnt vmcnt(1)
240; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
241; SI-NEXT:    s_waitcnt vmcnt(0)
242; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
243; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
244; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
245; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
246; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
247; SI-NEXT:    v_sub_f32_e32 v2, v3, v2
248; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
249; SI-NEXT:    v_sub_f32_e32 v0, v1, v0
250; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
251; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
252; SI-NEXT:    v_or_b32_e32 v0, v0, v1
253; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
254; SI-NEXT:    s_endpgm
255;
256; VI-LABEL: fsub_v2f16:
257; VI:       ; %bb.0: ; %entry
258; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
259; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
260; VI-NEXT:    s_mov_b32 s7, 0xf000
261; VI-NEXT:    s_mov_b32 s6, -1
262; VI-NEXT:    s_mov_b32 s10, s6
263; VI-NEXT:    s_mov_b32 s11, s7
264; VI-NEXT:    s_waitcnt lgkmcnt(0)
265; VI-NEXT:    s_mov_b32 s12, s2
266; VI-NEXT:    s_mov_b32 s13, s3
267; VI-NEXT:    s_mov_b32 s14, s6
268; VI-NEXT:    s_mov_b32 s15, s7
269; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
270; VI-NEXT:    buffer_load_dword v1, off, s[12:15], 0
271; VI-NEXT:    s_mov_b32 s4, s0
272; VI-NEXT:    s_mov_b32 s5, s1
273; VI-NEXT:    s_waitcnt vmcnt(0)
274; VI-NEXT:    v_sub_f16_sdwa v2, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
275; VI-NEXT:    v_sub_f16_e32 v0, v1, v0
276; VI-NEXT:    v_or_b32_e32 v0, v0, v2
277; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
278; VI-NEXT:    s_endpgm
279;
280; GFX9-LABEL: fsub_v2f16:
281; GFX9:       ; %bb.0: ; %entry
282; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
283; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
284; GFX9-NEXT:    s_mov_b32 s7, 0xf000
285; GFX9-NEXT:    s_mov_b32 s6, -1
286; GFX9-NEXT:    s_mov_b32 s14, s6
287; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
288; GFX9-NEXT:    s_mov_b32 s12, s2
289; GFX9-NEXT:    s_mov_b32 s13, s3
290; GFX9-NEXT:    s_mov_b32 s15, s7
291; GFX9-NEXT:    s_mov_b32 s10, s6
292; GFX9-NEXT:    s_mov_b32 s11, s7
293; GFX9-NEXT:    buffer_load_dword v0, off, s[12:15], 0
294; GFX9-NEXT:    buffer_load_dword v1, off, s[8:11], 0
295; GFX9-NEXT:    s_mov_b32 s4, s0
296; GFX9-NEXT:    s_mov_b32 s5, s1
297; GFX9-NEXT:    s_waitcnt vmcnt(0)
298; GFX9-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
299; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
300; GFX9-NEXT:    s_endpgm
301;
302; GFX11-LABEL: fsub_v2f16:
303; GFX11:       ; %bb.0: ; %entry
304; GFX11-NEXT:    s_clause 0x1
305; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
306; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
307; GFX11-NEXT:    s_mov_b32 s10, -1
308; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
309; GFX11-NEXT:    s_mov_b32 s14, s10
310; GFX11-NEXT:    s_mov_b32 s15, s11
311; GFX11-NEXT:    s_mov_b32 s6, s10
312; GFX11-NEXT:    s_mov_b32 s7, s11
313; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX11-NEXT:    s_mov_b32 s12, s2
315; GFX11-NEXT:    s_mov_b32 s13, s3
316; GFX11-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
317; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0
318; GFX11-NEXT:    s_mov_b32 s8, s0
319; GFX11-NEXT:    s_mov_b32 s9, s1
320; GFX11-NEXT:    s_waitcnt vmcnt(0)
321; GFX11-NEXT:    v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
322; GFX11-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
323; GFX11-NEXT:    s_endpgm
324    ptr addrspace(1) %r,
325    ptr addrspace(1) %a,
326    ptr addrspace(1) %b) {
327entry:
  ; Unlike the scalar tests in this file, these loads are not volatile. The
  ; expected output fetches each vector with one 32-bit buffer load (no glc)
  ; and issues both loads before waiting on them.
328  %a.val = load <2 x half>, ptr addrspace(1) %a
329  %b.val = load <2 x half>, ptr addrspace(1) %b
330  %r.val = fsub <2 x half> %a.val, %b.val
331  store <2 x half> %r.val, ptr addrspace(1) %r
332  ret void
333}
334
; <2 x half> subtraction whose left-hand operand is the constant vector
; <1.0, 2.0> (%r = <1.0, 2.0> - %b). Expected selection, per the autogenerated
; checks below:
;   - tahiti (SI prefix): per-element f32 math after widening; v_sub_f32 2.0
;     for the high element and v_sub_f32 1.0 for the low element, repacked
;     with v_lshlrev_b32 16 + v_or_b32;
;   - fiji (VI prefix): the high element's constant is first moved into a VGPR
;     (v_mov_b32 0x4000, the IEEE half encoding of 2.0) and fed to
;     v_sub_f16_sdwa; the low element uses v_sub_f16 1.0 directly;
;   - gfx900 (GFX9 prefix): the packed constant 0x40003c00 (2.0 in the high
;     half, 1.0 in the low half) is moved into an SGPR and added with
;     v_pk_add_f16 v0, v0, s0, with neg_lo/neg_hi set on source 0 (the loaded
;     value);
;   - gfx1100 (GFX11 prefix): the same packed constant appears as a literal
;     first source of v_pk_add_f16, with neg_lo/neg_hi set on source 1 (the
;     loaded value).
335define amdgpu_kernel void @fsub_v2f16_imm_a(
336; SI-LABEL: fsub_v2f16_imm_a:
337; SI:       ; %bb.0: ; %entry
338; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
339; SI-NEXT:    s_mov_b32 s7, 0xf000
340; SI-NEXT:    s_mov_b32 s6, -1
341; SI-NEXT:    s_mov_b32 s10, s6
342; SI-NEXT:    s_mov_b32 s11, s7
343; SI-NEXT:    s_waitcnt lgkmcnt(0)
344; SI-NEXT:    s_mov_b32 s8, s2
345; SI-NEXT:    s_mov_b32 s9, s3
346; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
347; SI-NEXT:    s_mov_b32 s4, s0
348; SI-NEXT:    s_mov_b32 s5, s1
349; SI-NEXT:    s_waitcnt vmcnt(0)
350; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
351; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
352; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
353; SI-NEXT:    v_sub_f32_e32 v1, 2.0, v1
354; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
355; SI-NEXT:    v_sub_f32_e32 v0, 1.0, v0
356; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
357; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
358; SI-NEXT:    v_or_b32_e32 v0, v0, v1
359; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
360; SI-NEXT:    s_endpgm
361;
362; VI-LABEL: fsub_v2f16_imm_a:
363; VI:       ; %bb.0: ; %entry
364; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
365; VI-NEXT:    s_mov_b32 s7, 0xf000
366; VI-NEXT:    s_mov_b32 s6, -1
367; VI-NEXT:    s_mov_b32 s10, s6
368; VI-NEXT:    s_mov_b32 s11, s7
369; VI-NEXT:    s_waitcnt lgkmcnt(0)
370; VI-NEXT:    s_mov_b32 s8, s2
371; VI-NEXT:    s_mov_b32 s9, s3
372; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
373; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
374; VI-NEXT:    s_mov_b32 s4, s0
375; VI-NEXT:    s_mov_b32 s5, s1
376; VI-NEXT:    s_waitcnt vmcnt(0)
377; VI-NEXT:    v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
378; VI-NEXT:    v_sub_f16_e32 v0, 1.0, v0
379; VI-NEXT:    v_or_b32_e32 v0, v0, v1
380; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
381; VI-NEXT:    s_endpgm
382;
383; GFX9-LABEL: fsub_v2f16_imm_a:
384; GFX9:       ; %bb.0: ; %entry
385; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
386; GFX9-NEXT:    s_mov_b32 s7, 0xf000
387; GFX9-NEXT:    s_mov_b32 s6, -1
388; GFX9-NEXT:    s_mov_b32 s10, s6
389; GFX9-NEXT:    s_mov_b32 s11, s7
390; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
391; GFX9-NEXT:    s_mov_b32 s8, s2
392; GFX9-NEXT:    s_mov_b32 s9, s3
393; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
394; GFX9-NEXT:    s_mov_b32 s4, s0
395; GFX9-NEXT:    s_mov_b32 s0, 0x40003c00
396; GFX9-NEXT:    s_mov_b32 s5, s1
397; GFX9-NEXT:    s_waitcnt vmcnt(0)
398; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
399; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
400; GFX9-NEXT:    s_endpgm
401;
402; GFX11-LABEL: fsub_v2f16_imm_a:
403; GFX11:       ; %bb.0: ; %entry
404; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
405; GFX11-NEXT:    s_mov_b32 s6, -1
406; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
407; GFX11-NEXT:    s_mov_b32 s10, s6
408; GFX11-NEXT:    s_mov_b32 s11, s7
409; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX11-NEXT:    s_mov_b32 s8, s2
411; GFX11-NEXT:    s_mov_b32 s9, s3
412; GFX11-NEXT:    s_mov_b32 s4, s0
413; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
414; GFX11-NEXT:    s_mov_b32 s5, s1
415; GFX11-NEXT:    s_waitcnt vmcnt(0)
416; GFX11-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
417; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
418; GFX11-NEXT:    s_endpgm
419    ptr addrspace(1) %r,
420    ptr addrspace(1) %b) {
421entry:
  ; Element 0 of the constant vector (1.0) pairs with the low 16 bits of the
  ; loaded dword and element 1 (2.0) with the high 16 bits, as the per-element
  ; constants in the expected output show.
422  %b.val = load <2 x half>, ptr addrspace(1) %b
423  %r.val = fsub <2 x half> <half 1.0, half 2.0>, %b.val
424  store <2 x half> %r.val, ptr addrspace(1) %r
425  ret void
426}
427
; <2 x half> subtraction whose right-hand operand is the constant vector
; <2.0, 1.0> (%r = %a - <2.0, 1.0>). As in the scalar constant-on-the-right
; case, the expected output contains no subtract and no negation modifiers;
; every target adds the negated constants instead:
;   - tahiti (SI prefix): per-element f32 math after widening; v_add_f32 -1.0
;     for the high element and v_add_f32 -2.0 for the low element, repacked
;     with v_lshlrev_b32 16 + v_or_b32;
;   - fiji (VI prefix): v_mov_b32 0xbc00 (the IEEE half encoding of -1.0)
;     feeds v_add_f16_sdwa for the high element; the low element uses
;     v_add_f16 -2.0 directly;
;   - gfx900 (GFX9 prefix): the packed constant 0xbc00c000 (-1.0 in the high
;     half, -2.0 in the low half) is moved into an SGPR and added with a plain
;     v_pk_add_f16;
;   - gfx1100 (GFX11 prefix): the same packed constant appears as a literal
;     operand of v_pk_add_f16.
428define amdgpu_kernel void @fsub_v2f16_imm_b(
429; SI-LABEL: fsub_v2f16_imm_b:
430; SI:       ; %bb.0: ; %entry
431; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
432; SI-NEXT:    s_mov_b32 s7, 0xf000
433; SI-NEXT:    s_mov_b32 s6, -1
434; SI-NEXT:    s_mov_b32 s10, s6
435; SI-NEXT:    s_mov_b32 s11, s7
436; SI-NEXT:    s_waitcnt lgkmcnt(0)
437; SI-NEXT:    s_mov_b32 s8, s2
438; SI-NEXT:    s_mov_b32 s9, s3
439; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
440; SI-NEXT:    s_mov_b32 s4, s0
441; SI-NEXT:    s_mov_b32 s5, s1
442; SI-NEXT:    s_waitcnt vmcnt(0)
443; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
444; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
445; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
446; SI-NEXT:    v_add_f32_e32 v1, -1.0, v1
447; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
448; SI-NEXT:    v_add_f32_e32 v0, -2.0, v0
449; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
450; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
451; SI-NEXT:    v_or_b32_e32 v0, v0, v1
452; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
453; SI-NEXT:    s_endpgm
454;
455; VI-LABEL: fsub_v2f16_imm_b:
456; VI:       ; %bb.0: ; %entry
457; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
458; VI-NEXT:    s_mov_b32 s7, 0xf000
459; VI-NEXT:    s_mov_b32 s6, -1
460; VI-NEXT:    s_mov_b32 s10, s6
461; VI-NEXT:    s_mov_b32 s11, s7
462; VI-NEXT:    s_waitcnt lgkmcnt(0)
463; VI-NEXT:    s_mov_b32 s8, s2
464; VI-NEXT:    s_mov_b32 s9, s3
465; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
466; VI-NEXT:    v_mov_b32_e32 v1, 0xbc00
467; VI-NEXT:    s_mov_b32 s4, s0
468; VI-NEXT:    s_mov_b32 s5, s1
469; VI-NEXT:    s_waitcnt vmcnt(0)
470; VI-NEXT:    v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
471; VI-NEXT:    v_add_f16_e32 v0, -2.0, v0
472; VI-NEXT:    v_or_b32_e32 v0, v0, v1
473; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
474; VI-NEXT:    s_endpgm
475;
476; GFX9-LABEL: fsub_v2f16_imm_b:
477; GFX9:       ; %bb.0: ; %entry
478; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
479; GFX9-NEXT:    s_mov_b32 s7, 0xf000
480; GFX9-NEXT:    s_mov_b32 s6, -1
481; GFX9-NEXT:    s_mov_b32 s10, s6
482; GFX9-NEXT:    s_mov_b32 s11, s7
483; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
484; GFX9-NEXT:    s_mov_b32 s8, s2
485; GFX9-NEXT:    s_mov_b32 s9, s3
486; GFX9-NEXT:    buffer_load_dword v0, off, s[8:11], 0
487; GFX9-NEXT:    s_mov_b32 s4, s0
488; GFX9-NEXT:    s_mov_b32 s0, 0xbc00c000
489; GFX9-NEXT:    s_mov_b32 s5, s1
490; GFX9-NEXT:    s_waitcnt vmcnt(0)
491; GFX9-NEXT:    v_pk_add_f16 v0, v0, s0
492; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
493; GFX9-NEXT:    s_endpgm
494;
495; GFX11-LABEL: fsub_v2f16_imm_b:
496; GFX11:       ; %bb.0: ; %entry
497; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
498; GFX11-NEXT:    s_mov_b32 s6, -1
499; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
500; GFX11-NEXT:    s_mov_b32 s10, s6
501; GFX11-NEXT:    s_mov_b32 s11, s7
502; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
503; GFX11-NEXT:    s_mov_b32 s8, s2
504; GFX11-NEXT:    s_mov_b32 s9, s3
505; GFX11-NEXT:    s_mov_b32 s4, s0
506; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
507; GFX11-NEXT:    s_mov_b32 s5, s1
508; GFX11-NEXT:    s_waitcnt vmcnt(0)
509; GFX11-NEXT:    v_pk_add_f16 v0, 0xbc00c000, v0
510; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
511; GFX11-NEXT:    s_endpgm
512    ptr addrspace(1) %r,
513    ptr addrspace(1) %a) {
514entry:
  ; Element 0 of the constant vector (2.0) pairs with the low 16 bits of the
  ; loaded dword and element 1 (1.0) with the high 16 bits, as the per-element
  ; constants in the expected output show.
515  %a.val = load <2 x half>, ptr addrspace(1) %a
516  %r.val = fsub <2 x half> %a.val, <half 2.0, half 1.0>
517  store <2 x half> %r.val, ptr addrspace(1) %r
518  ret void
519}
520