xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fadd.f16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-real-true16,-flat-for-global -verify-machineinstrs -enable-misched=false < %s | FileCheck -check-prefixes=GFX11-FAKE16-GISEL %s

; Scalar f16 add: volatile-loads one half from %a and one from %b, adds them,
; and stores the sum to %r. The tahiti checks expect the operands to be
; converted to f32, added with v_add_f32 and converted back to f16; the fiji
; and gfx1100 checks expect a native v_add_f16 (operating on 16-bit register
; halves when real-true16 is enabled).
define amdgpu_kernel void @fadd_f16(
; SI-LABEL: fadd_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s0, s2
; SI-NEXT:    s_mov_b32 s1, s3
; SI-NEXT:    s_mov_b32 s2, s10
; SI-NEXT:    s_mov_b32 s3, s11
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, v0, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s10
; VI-NEXT:    s_mov_b32 s3, s11
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_load_ushort v1, off, s[4:7], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-NEXT:    buffer_store_short v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16:
; GFX11-SDAG:       ; %bb.0: ; %entry
; GFX11-SDAG-NEXT:    s_clause 0x1
; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-SDAG-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-SDAG-NEXT:    s_mov_b32 s10, -1
; GFX11-SDAG-NEXT:    s_mov_b32 s7, s11
; GFX11-SDAG-NEXT:    s_mov_b32 s6, s10
; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT:    s_mov_b32 s8, s0
; GFX11-SDAG-NEXT:    s_mov_b32 s9, s1
; GFX11-SDAG-NEXT:    s_mov_b32 s0, s2
; GFX11-SDAG-NEXT:    s_mov_b32 s1, s3
; GFX11-SDAG-NEXT:    s_mov_b32 s2, s10
; GFX11-SDAG-NEXT:    s_mov_b32 s3, s11
; GFX11-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT:    v_mov_b16_e32 v0.h, v1.l
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-SDAG-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16:
; GFX11-GISEL:       ; %bb.0: ; %entry
; GFX11-GISEL-NEXT:    s_clause 0x1
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-GISEL-NEXT:    s_mov_b32 s10, -1
; GFX11-GISEL-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT:    s_mov_b64 s[6:7], s[10:11]
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX11-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
; GFX11-GISEL-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT:    s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16:
; GFX11-FAKE16-SDAG:       ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT:    s_clause 0x1
; GFX11-FAKE16-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s10, -1
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s7, s11
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s6, s10
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s8, s0
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s9, s1
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s0, s2
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s1, s3
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s2, s10
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s3, s11
; GFX11-FAKE16-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16:
; GFX11-FAKE16-GISEL:       ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT:    s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s10, -1
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s11, 0x31016000
; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[6:7], s[10:11]
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
; GFX11-FAKE16-GISEL-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %a.val = load volatile half, ptr addrspace(1) %a
  %b.val = load volatile half, ptr addrspace(1) %b
  %r.val = fadd half %a.val, %b.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar f16 add with the constant 1.0 as the first fadd operand: loads one
; half from %b, computes 1.0 + %b.val, and stores the result to %r. Every
; target is expected to fold 1.0 into the add as an inline constant (an f32
; add between conversions on tahiti, v_add_f16 on fiji and gfx1100).
define amdgpu_kernel void @fadd_f16_imm_a(
; SI-LABEL: fadd_f16_imm_a:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s0, s2
; SI-NEXT:    s_mov_b32 s1, s3
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_f16_imm_a:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s6
; VI-NEXT:    s_mov_b32 s3, s7
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-SDAG:       ; %bb.0: ; %entry
; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-SDAG-NEXT:    s_mov_b32 s6, -1
; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT:    s_mov_b32 s4, s0
; GFX11-SDAG-NEXT:    s_mov_b32 s5, s1
; GFX11-SDAG-NEXT:    s_mov_b32 s0, s2
; GFX11-SDAG-NEXT:    s_mov_b32 s1, s3
; GFX11-SDAG-NEXT:    s_mov_b32 s2, s6
; GFX11-SDAG-NEXT:    s_mov_b32 s3, s7
; GFX11-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-GISEL:       ; %bb.0: ; %entry
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    s_mov_b32 s6, -1
; GFX11-GISEL-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX11-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX11-GISEL-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT:    s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-SDAG:       ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s0, s2
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s1, s3
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s2, s6
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s3, s7
; GFX11-FAKE16-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-GISEL:       ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX11-FAKE16-GISEL-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    v_add_f16_e32 v0, 1.0, v0
; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %b) {
entry:
  %b.val = load half, ptr addrspace(1) %b
  %r.val = fadd half 1.0, %b.val
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Scalar f16 add with the constant 2.0 as the second fadd operand: loads one
; half from %a, computes %a.val + 2.0, and stores the result to %r. The checks
; expect the constant to be emitted as the inline src0 operand of the add on
; every target, even though it is the right-hand operand in the IR.
define amdgpu_kernel void @fadd_f16_imm_b(
; SI-LABEL: fadd_f16_imm_b:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s0, s2
; SI-NEXT:    s_mov_b32 s1, s3
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v0, 2.0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_f16_imm_b:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s0, s2
; VI-NEXT:    s_mov_b32 s1, s3
; VI-NEXT:    s_mov_b32 s2, s6
; VI-NEXT:    s_mov_b32 s3, s7
; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_e32 v0, 2.0, v0
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-SDAG:       ; %bb.0: ; %entry
; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-SDAG-NEXT:    s_mov_b32 s6, -1
; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT:    s_mov_b32 s4, s0
; GFX11-SDAG-NEXT:    s_mov_b32 s5, s1
; GFX11-SDAG-NEXT:    s_mov_b32 s0, s2
; GFX11-SDAG-NEXT:    s_mov_b32 s1, s3
; GFX11-SDAG-NEXT:    s_mov_b32 s2, s6
; GFX11-SDAG-NEXT:    s_mov_b32 s3, s7
; GFX11-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-GISEL:       ; %bb.0: ; %entry
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    s_mov_b32 s6, -1
; GFX11-GISEL-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX11-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX11-GISEL-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT:    s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-SDAG:       ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s0, s2
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s1, s3
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s2, s6
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s3, s7
; GFX11-FAKE16-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    v_add_f16_e32 v0, 2.0, v0
; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-GISEL:       ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
; GFX11-FAKE16-GISEL-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v0
; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a) {
entry:
  %a.val = load half, ptr addrspace(1) %a
  %r.val = fadd half %a.val, 2.0
  store half %r.val, ptr addrspace(1) %r
  ret void
}

; Vector <2 x half> add: each work-item indexes %a and %b by its workitem id,
; loads one <2 x half> from each, adds them, and stores the sum to %r.
; The tahiti checks expect each element to be widened to f32, added, narrowed
; and repacked with a shift/or; fiji expects an SDWA v_add_f16 for the high
; half plus a plain v_add_f16 for the low half; gfx1100 expects a single
; v_pk_add_f16.
define amdgpu_kernel void @fadd_v2f16(
; SI-LABEL: fadd_v2f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_add_f32_e32 v1, v3, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_add_f32_e32 v0, v2, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT:    v_mov_b32_e32 v3, s9
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    flat_load_dword v1, v[2:3]
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_add_f16_e32 v0, v0, v1
; VI-NEXT:    v_or_b32_e32 v0, v0, v2
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16:
; GFX11-SDAG:       ; %bb.0: ; %entry
; GFX11-SDAG-NEXT:    s_clause 0x1
; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-SDAG-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-SDAG-NEXT:    s_mov_b32 s6, -1
; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT:    s_clause 0x1
; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-SDAG-NEXT:    global_load_b32 v0, v0, s[8:9]
; GFX11-SDAG-NEXT:    s_mov_b32 s4, s0
; GFX11-SDAG-NEXT:    s_mov_b32 s5, s1
; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT:    s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16:
; GFX11-GISEL:       ; %bb.0: ; %entry
; GFX11-GISEL-NEXT:    s_clause 0x1
; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT:    s_clause 0x1
; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[4:5]
; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT:    s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16:
; GFX11-FAKE16-SDAG:       ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT:    s_clause 0x1
; GFX11-FAKE16-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-SDAG-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
; GFX11-FAKE16-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s6, -1
; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    s_clause 0x1
; GFX11-FAKE16-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FAKE16-SDAG-NEXT:    global_load_b32 v0, v0, s[8:9]
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s4, s0
; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s5, s1
; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_f16 v0, v1, v0
; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16:
; GFX11-FAKE16-GISEL:       ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT:    s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-FAKE16-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-FAKE16-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-FAKE16-GISEL-NEXT:    global_load_b32 v0, v0, s[4:5]
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s2, -1
; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_f16 v0, v1, v0
; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
  %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
  %a.val = load <2 x half>, ptr addrspace(1) %gep.a
  %b.val = load <2 x half>, ptr addrspace(1) %gep.b
  %r.val = fadd <2 x half> %a.val, %b.val
  store <2 x half> %r.val, ptr addrspace(1) %r
  ret void
}

623define amdgpu_kernel void @fadd_v2f16_imm_a(
624; SI-LABEL: fadd_v2f16_imm_a:
625; SI:       ; %bb.0: ; %entry
626; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
627; SI-NEXT:    s_mov_b32 s7, 0xf000
628; SI-NEXT:    s_mov_b32 s10, 0
629; SI-NEXT:    s_mov_b32 s11, s7
630; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
631; SI-NEXT:    s_waitcnt lgkmcnt(0)
632; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
633; SI-NEXT:    v_mov_b32_e32 v1, 0
634; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
635; SI-NEXT:    s_mov_b32 s6, -1
636; SI-NEXT:    s_mov_b32 s4, s0
637; SI-NEXT:    s_mov_b32 s5, s1
638; SI-NEXT:    s_waitcnt vmcnt(0)
639; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
640; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
641; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
642; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
643; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
644; SI-NEXT:    v_add_f32_e32 v0, 2.0, v0
645; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
646; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
647; SI-NEXT:    v_or_b32_e32 v0, v1, v0
648; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
649; SI-NEXT:    s_endpgm
650;
651; VI-LABEL: fadd_v2f16_imm_a:
652; VI:       ; %bb.0: ; %entry
653; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
654; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
655; VI-NEXT:    s_mov_b32 s7, 0xf000
656; VI-NEXT:    s_mov_b32 s6, -1
657; VI-NEXT:    s_waitcnt lgkmcnt(0)
658; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
659; VI-NEXT:    v_mov_b32_e32 v1, s3
660; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
661; VI-NEXT:    flat_load_dword v0, v[0:1]
662; VI-NEXT:    v_mov_b32_e32 v1, 0x4000
663; VI-NEXT:    s_mov_b32 s4, s0
664; VI-NEXT:    s_mov_b32 s5, s1
665; VI-NEXT:    s_waitcnt vmcnt(0)
666; VI-NEXT:    v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
667; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
668; VI-NEXT:    v_or_b32_e32 v0, v0, v1
669; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
670; VI-NEXT:    s_endpgm
671;
672; GFX11-SDAG-LABEL: fadd_v2f16_imm_a:
673; GFX11-SDAG:       ; %bb.0: ; %entry
674; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
675; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
676; GFX11-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
677; GFX11-SDAG-NEXT:    s_mov_b32 s6, -1
678; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
679; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
680; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
681; GFX11-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3]
682; GFX11-SDAG-NEXT:    s_mov_b32 s4, s0
683; GFX11-SDAG-NEXT:    s_mov_b32 s5, s1
684; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
685; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
686; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
687; GFX11-SDAG-NEXT:    s_endpgm
688;
689; GFX11-GISEL-LABEL: fadd_v2f16_imm_a:
690; GFX11-GISEL:       ; %bb.0: ; %entry
691; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
692; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
693; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
694; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
695; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
697; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
698; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
699; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
700; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
701; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
702; GFX11-GISEL-NEXT:    s_endpgm
703;
704; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a:
705; GFX11-FAKE16-SDAG:       ; %bb.0: ; %entry
706; GFX11-FAKE16-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
707; GFX11-FAKE16-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
708; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
709; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s6, -1
710; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
711; GFX11-FAKE16-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
712; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
713; GFX11-FAKE16-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3]
714; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s4, s0
715; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s5, s1
716; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
717; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
718; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
719; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
720;
721; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a:
722; GFX11-FAKE16-GISEL:       ; %bb.0: ; %entry
723; GFX11-FAKE16-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
724; GFX11-FAKE16-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
725; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
726; GFX11-FAKE16-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
727; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
728; GFX11-FAKE16-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
729; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s2, -1
730; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
731; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
732; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
733; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
734; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
735; GFX11-LABEL: fadd_v2f16_imm_a:
736; GFX11:       ; %bb.0: ; %entry
737; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
738; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
739; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
740; GFX11-NEXT:    s_mov_b32 s6, -1
741; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
743; GFX11-NEXT:    s_mov_b32 s4, s0
744; GFX11-NEXT:    s_mov_b32 s5, s1
745; GFX11-NEXT:    s_waitcnt vmcnt(0)
746; GFX11-NEXT:    v_pk_add_f16 v0, 0x40003c00, v0
747; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
748; GFX11-NEXT:    s_nop 0
749; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
750; GFX11-NEXT:    s_endpgm
751    ptr addrspace(1) %r,
752    ptr addrspace(1) %b) {
753entry:
754  %tid = call i32 @llvm.amdgcn.workitem.id.x()
755  %gep.b = getelementptr inbounds <2 x half>, ptr addrspace(1) %b, i32 %tid
756  %b.val = load <2 x half>, ptr addrspace(1) %gep.b
757  %r.val = fadd <2 x half> <half 1.0, half 2.0>, %b.val
758  store <2 x half> %r.val, ptr addrspace(1) %r
759  ret void
760}
761
; fadd_v2f16_imm_b - packed half add where the constant vector is operand b.
;
; Each work-item loads the <2 x half> at %a[tid], adds <2.0, 1.0> and stores the
; result to %r (the store address is not indexed by tid).
;
; Expected selection per target, as pinned by the autogenerated checks below
;   tahiti  - both halves are widened to f32, added, and narrowed back
;   fiji    - v_add_f16 for the low half, an SDWA v_add_f16 for the high half
;   gfx1100 - one v_pk_add_f16 with the literal 0x3c004000
;             (0x4000 is half 2.0 in the low lane, 0x3c00 is half 1.0 in the high lane)
;
; Check lines are produced by utils/update_llc_test_checks.py, so regenerate
; them rather than editing by hand.
762define amdgpu_kernel void @fadd_v2f16_imm_b(
763; SI-LABEL: fadd_v2f16_imm_b:
764; SI:       ; %bb.0: ; %entry
765; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
766; SI-NEXT:    s_mov_b32 s7, 0xf000
767; SI-NEXT:    s_mov_b32 s10, 0
768; SI-NEXT:    s_mov_b32 s11, s7
769; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
770; SI-NEXT:    s_waitcnt lgkmcnt(0)
771; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
772; SI-NEXT:    v_mov_b32_e32 v1, 0
773; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
774; SI-NEXT:    s_mov_b32 s6, -1
775; SI-NEXT:    s_mov_b32 s4, s0
776; SI-NEXT:    s_mov_b32 s5, s1
777; SI-NEXT:    s_waitcnt vmcnt(0)
778; SI-NEXT:    v_cvt_f32_f16_e32 v1, v0
779; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
780; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
781; SI-NEXT:    v_add_f32_e32 v1, 2.0, v1
782; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
783; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
784; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
785; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
786; SI-NEXT:    v_or_b32_e32 v0, v1, v0
787; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
788; SI-NEXT:    s_endpgm
789;
790; VI-LABEL: fadd_v2f16_imm_b:
791; VI:       ; %bb.0: ; %entry
792; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
793; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
794; VI-NEXT:    s_mov_b32 s7, 0xf000
795; VI-NEXT:    s_mov_b32 s6, -1
796; VI-NEXT:    s_waitcnt lgkmcnt(0)
797; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
798; VI-NEXT:    v_mov_b32_e32 v1, s3
799; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
800; VI-NEXT:    flat_load_dword v0, v[0:1]
801; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
802; VI-NEXT:    s_mov_b32 s4, s0
803; VI-NEXT:    s_mov_b32 s5, s1
804; VI-NEXT:    s_waitcnt vmcnt(0)
805; VI-NEXT:    v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
806; VI-NEXT:    v_add_f16_e32 v0, 2.0, v0
807; VI-NEXT:    v_or_b32_e32 v0, v0, v1
808; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
809; VI-NEXT:    s_endpgm
810;
811; GFX11-SDAG-LABEL: fadd_v2f16_imm_b:
812; GFX11-SDAG:       ; %bb.0: ; %entry
813; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
814; GFX11-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
815; GFX11-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
816; GFX11-SDAG-NEXT:    s_mov_b32 s6, -1
817; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
818; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
819; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
820; GFX11-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3]
821; GFX11-SDAG-NEXT:    s_mov_b32 s4, s0
822; GFX11-SDAG-NEXT:    s_mov_b32 s5, s1
823; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
824; GFX11-SDAG-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
825; GFX11-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
826; GFX11-SDAG-NEXT:    s_endpgm
827;
828; GFX11-GISEL-LABEL: fadd_v2f16_imm_b:
829; GFX11-GISEL:       ; %bb.0: ; %entry
830; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
831; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
832; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
833; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
834; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
836; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
837; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
838; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
839; GFX11-GISEL-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
840; GFX11-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
841; GFX11-GISEL-NEXT:    s_endpgm
842;
843; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b:
844; GFX11-FAKE16-SDAG:       ; %bb.0: ; %entry
845; GFX11-FAKE16-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
846; GFX11-FAKE16-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
847; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s7, 0x31016000
848; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s6, -1
849; GFX11-FAKE16-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
850; GFX11-FAKE16-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
851; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX11-FAKE16-SDAG-NEXT:    global_load_b32 v0, v0, s[2:3]
853; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s4, s0
854; GFX11-FAKE16-SDAG-NEXT:    s_mov_b32 s5, s1
855; GFX11-FAKE16-SDAG-NEXT:    s_waitcnt vmcnt(0)
856; GFX11-FAKE16-SDAG-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
857; GFX11-FAKE16-SDAG-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
858; GFX11-FAKE16-SDAG-NEXT:    s_endpgm
859;
860; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b:
861; GFX11-FAKE16-GISEL:       ; %bb.0: ; %entry
862; GFX11-FAKE16-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
863; GFX11-FAKE16-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
864; GFX11-FAKE16-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
865; GFX11-FAKE16-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
866; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
867; GFX11-FAKE16-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
868; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s2, -1
869; GFX11-FAKE16-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
870; GFX11-FAKE16-GISEL-NEXT:    s_waitcnt vmcnt(0)
871; GFX11-FAKE16-GISEL-NEXT:    v_pk_add_f16 v0, 0x3c004000, v0
872; GFX11-FAKE16-GISEL-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
873; GFX11-FAKE16-GISEL-NEXT:    s_endpgm
890    ptr addrspace(1) %r,
891    ptr addrspace(1) %a) {
892entry:
893  %tid = call i32 @llvm.amdgcn.workitem.id.x()
894  %gep.a = getelementptr inbounds <2 x half>, ptr addrspace(1) %a, i32 %tid
895  %a.val = load <2 x half>, ptr addrspace(1) %gep.a
896  %r.val = fadd <2 x half> %a.val, <half 2.0, half 1.0>
897  store <2 x half> %r.val, ptr addrspace(1) %r
898  ret void
899}
900
; Declaration of the AMDGPU work-item id intrinsic (x dimension). The kernels
; visible above call it to choose which <2 x half> element each work-item loads.
901declare i32 @llvm.amdgcn.workitem.id.x() #1
902
; Attribute groups. Group #1 is attached to the intrinsic declaration above.
; No use of group #0 appears in this part of the file (presumably it is
; referenced by earlier functions, so verify before removing it).
903attributes #0 = { nounwind }
904attributes #1 = { nounwind readnone }
905