xref: /llvm-project/llvm/test/CodeGen/AMDGPU/add.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
8
; Scalar i32 add: loads two adjacent i32 values starting at %in, adds them,
; and stores the sum to %out. The autogenerated checks below expect one
; scalar add per run line (s_add_i32, or s_add_co_i32 for the gfx1200 run).
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
9define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
10; GFX6-LABEL: s_add_i32:
11; GFX6:       ; %bb.0:
12; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
13; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
15; GFX6-NEXT:    s_mov_b32 s3, 0xf000
16; GFX6-NEXT:    s_mov_b32 s2, -1
17; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX6-NEXT:    s_add_i32 s4, s4, s5
19; GFX6-NEXT:    v_mov_b32_e32 v0, s4
20; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
21; GFX6-NEXT:    s_endpgm
22;
23; GFX8-LABEL: s_add_i32:
24; GFX8:       ; %bb.0:
25; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
26; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
28; GFX8-NEXT:    v_mov_b32_e32 v0, s0
29; GFX8-NEXT:    v_mov_b32_e32 v1, s1
30; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX8-NEXT:    s_add_i32 s0, s2, s3
32; GFX8-NEXT:    v_mov_b32_e32 v2, s0
33; GFX8-NEXT:    flat_store_dword v[0:1], v2
34; GFX8-NEXT:    s_endpgm
35;
36; GFX9-LABEL: s_add_i32:
37; GFX9:       ; %bb.0:
38; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
39; GFX9-NEXT:    v_mov_b32_e32 v0, 0
40; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
42; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX9-NEXT:    s_add_i32 s2, s4, s5
44; GFX9-NEXT:    v_mov_b32_e32 v1, s2
45; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
46; GFX9-NEXT:    s_endpgm
47;
48; GFX10-LABEL: s_add_i32:
49; GFX10:       ; %bb.0:
50; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
51; GFX10-NEXT:    v_mov_b32_e32 v0, 0
52; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
54; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX10-NEXT:    s_add_i32 s2, s4, s5
56; GFX10-NEXT:    v_mov_b32_e32 v1, s2
57; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
58; GFX10-NEXT:    s_endpgm
59;
60; GFX11-LABEL: s_add_i32:
61; GFX11:       ; %bb.0:
62; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
63; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
64; GFX11-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
65; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX11-NEXT:    s_add_i32 s2, s2, s3
67; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
68; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
69; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
70; GFX11-NEXT:    s_endpgm
71;
72; GFX12-LABEL: s_add_i32:
73; GFX12:       ; %bb.0:
74; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
75; GFX12-NEXT:    s_wait_kmcnt 0x0
76; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
77; GFX12-NEXT:    s_wait_kmcnt 0x0
78; GFX12-NEXT:    s_add_co_i32 s2, s2, s3
79; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
80; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
81; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
82; GFX12-NEXT:    s_endpgm
  ; %b_ptr addresses the i32 immediately after %in (element index 1), so both
  ; operands come from one contiguous 8-byte region.
83  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
84  %a = load i32, ptr addrspace(1) %in
85  %b = load i32, ptr addrspace(1) %b_ptr
86  %result = add i32 %a, %b
87  store i32 %result, ptr addrspace(1) %out
88  ret void
89}
90
; Scalar <2 x i32> add: loads two adjacent <2 x i32> vectors starting at %in,
; adds them element-wise, and stores the result to %out. The autogenerated
; checks below expect one s_add_i32 (s_add_co_i32 for the gfx1200 run) per
; element and a single 8-byte store.
91define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
92; GFX6-LABEL: s_add_v2i32:
93; GFX6:       ; %bb.0:
94; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
95; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
97; GFX6-NEXT:    s_mov_b32 s3, 0xf000
98; GFX6-NEXT:    s_mov_b32 s2, -1
99; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX6-NEXT:    s_add_i32 s5, s5, s7
101; GFX6-NEXT:    s_add_i32 s4, s4, s6
102; GFX6-NEXT:    v_mov_b32_e32 v0, s4
103; GFX6-NEXT:    v_mov_b32_e32 v1, s5
104; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
105; GFX6-NEXT:    s_endpgm
106;
107; GFX8-LABEL: s_add_v2i32:
108; GFX8:       ; %bb.0:
109; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
110; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
112; GFX8-NEXT:    v_mov_b32_e32 v0, s0
113; GFX8-NEXT:    v_mov_b32_e32 v1, s1
114; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX8-NEXT:    s_add_i32 s0, s5, s7
116; GFX8-NEXT:    s_add_i32 s1, s4, s6
117; GFX8-NEXT:    v_mov_b32_e32 v2, s1
118; GFX8-NEXT:    v_mov_b32_e32 v3, s0
119; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
120; GFX8-NEXT:    s_endpgm
121;
122; GFX9-LABEL: s_add_v2i32:
123; GFX9:       ; %bb.0:
124; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
125; GFX9-NEXT:    v_mov_b32_e32 v2, 0
126; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
128; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX9-NEXT:    s_add_i32 s2, s5, s7
130; GFX9-NEXT:    s_add_i32 s3, s4, s6
131; GFX9-NEXT:    v_mov_b32_e32 v0, s3
132; GFX9-NEXT:    v_mov_b32_e32 v1, s2
133; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
134; GFX9-NEXT:    s_endpgm
135;
136; GFX10-LABEL: s_add_v2i32:
137; GFX10:       ; %bb.0:
138; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
139; GFX10-NEXT:    v_mov_b32_e32 v2, 0
140; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
142; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX10-NEXT:    s_add_i32 s2, s4, s6
144; GFX10-NEXT:    s_add_i32 s3, s5, s7
145; GFX10-NEXT:    v_mov_b32_e32 v0, s2
146; GFX10-NEXT:    v_mov_b32_e32 v1, s3
147; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
148; GFX10-NEXT:    s_endpgm
149;
150; GFX11-LABEL: s_add_v2i32:
151; GFX11:       ; %bb.0:
152; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
153; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
154; GFX11-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
155; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX11-NEXT:    s_add_i32 s2, s4, s6
157; GFX11-NEXT:    s_add_i32 s3, s5, s7
158; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
159; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
160; GFX11-NEXT:    v_mov_b32_e32 v0, s2
161; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
162; GFX11-NEXT:    s_endpgm
163;
164; GFX12-LABEL: s_add_v2i32:
165; GFX12:       ; %bb.0:
166; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
167; GFX12-NEXT:    s_wait_kmcnt 0x0
168; GFX12-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
169; GFX12-NEXT:    s_wait_kmcnt 0x0
170; GFX12-NEXT:    s_add_co_i32 s2, s4, s6
171; GFX12-NEXT:    s_add_co_i32 s3, s5, s7
172; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
173; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
174; GFX12-NEXT:    v_mov_b32_e32 v0, s2
175; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
176; GFX12-NEXT:    s_endpgm
  ; %b_ptr addresses the <2 x i32> vector immediately after %in (index 1 in
  ; units of the vector type).
177  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
178  %a = load <2 x i32>, ptr addrspace(1) %in
179  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
180  %result = add <2 x i32> %a, %b
181  store <2 x i32> %result, ptr addrspace(1) %out
182  ret void
183}
184
; Scalar <4 x i32> add: loads two adjacent <4 x i32> vectors starting at %in,
; adds them element-wise, and stores the result to %out. The autogenerated
; checks below expect one s_add_i32 (s_add_co_i32 for the gfx1200 run) per
; element and a single 16-byte store.
185define amdgpu_kernel void @s_add_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
186; GFX6-LABEL: s_add_v4i32:
187; GFX6:       ; %bb.0:
188; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
189; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
191; GFX6-NEXT:    s_mov_b32 s11, 0xf000
192; GFX6-NEXT:    s_mov_b32 s10, -1
193; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
194; GFX6-NEXT:    s_add_i32 s3, s3, s7
195; GFX6-NEXT:    s_add_i32 s2, s2, s6
196; GFX6-NEXT:    s_add_i32 s1, s1, s5
197; GFX6-NEXT:    s_add_i32 s0, s0, s4
198; GFX6-NEXT:    v_mov_b32_e32 v0, s0
199; GFX6-NEXT:    v_mov_b32_e32 v1, s1
200; GFX6-NEXT:    v_mov_b32_e32 v2, s2
201; GFX6-NEXT:    v_mov_b32_e32 v3, s3
202; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
203; GFX6-NEXT:    s_endpgm
204;
205; GFX8-LABEL: s_add_v4i32:
206; GFX8:       ; %bb.0:
207; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
208; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
210; GFX8-NEXT:    v_mov_b32_e32 v4, s8
211; GFX8-NEXT:    v_mov_b32_e32 v5, s9
212; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX8-NEXT:    s_add_i32 s3, s3, s7
214; GFX8-NEXT:    s_add_i32 s2, s2, s6
215; GFX8-NEXT:    s_add_i32 s1, s1, s5
216; GFX8-NEXT:    s_add_i32 s0, s0, s4
217; GFX8-NEXT:    v_mov_b32_e32 v0, s0
218; GFX8-NEXT:    v_mov_b32_e32 v1, s1
219; GFX8-NEXT:    v_mov_b32_e32 v2, s2
220; GFX8-NEXT:    v_mov_b32_e32 v3, s3
221; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
222; GFX8-NEXT:    s_endpgm
223;
224; GFX9-LABEL: s_add_v4i32:
225; GFX9:       ; %bb.0:
226; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
227; GFX9-NEXT:    v_mov_b32_e32 v4, 0
228; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
229; GFX9-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
230; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
231; GFX9-NEXT:    s_add_i32 s3, s3, s7
232; GFX9-NEXT:    s_add_i32 s2, s2, s6
233; GFX9-NEXT:    s_add_i32 s1, s1, s5
234; GFX9-NEXT:    s_add_i32 s0, s0, s4
235; GFX9-NEXT:    v_mov_b32_e32 v0, s0
236; GFX9-NEXT:    v_mov_b32_e32 v1, s1
237; GFX9-NEXT:    v_mov_b32_e32 v2, s2
238; GFX9-NEXT:    v_mov_b32_e32 v3, s3
239; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
240; GFX9-NEXT:    s_endpgm
241;
242; GFX10-LABEL: s_add_v4i32:
243; GFX10:       ; %bb.0:
244; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
245; GFX10-NEXT:    v_mov_b32_e32 v4, 0
246; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
247; GFX10-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
248; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX10-NEXT:    s_add_i32 s3, s3, s7
250; GFX10-NEXT:    s_add_i32 s2, s2, s6
251; GFX10-NEXT:    s_add_i32 s0, s0, s4
252; GFX10-NEXT:    s_add_i32 s1, s1, s5
253; GFX10-NEXT:    v_mov_b32_e32 v0, s0
254; GFX10-NEXT:    v_mov_b32_e32 v1, s1
255; GFX10-NEXT:    v_mov_b32_e32 v2, s2
256; GFX10-NEXT:    v_mov_b32_e32 v3, s3
257; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
258; GFX10-NEXT:    s_endpgm
259;
260; GFX11-LABEL: s_add_v4i32:
261; GFX11:       ; %bb.0:
262; GFX11-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
263; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX11-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
265; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX11-NEXT:    s_add_i32 s3, s3, s7
267; GFX11-NEXT:    s_add_i32 s2, s2, s6
268; GFX11-NEXT:    s_add_i32 s0, s0, s4
269; GFX11-NEXT:    s_add_i32 s1, s1, s5
270; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
271; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
272; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
273; GFX11-NEXT:    v_mov_b32_e32 v2, s2
274; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
275; GFX11-NEXT:    s_endpgm
276;
277; GFX12-LABEL: s_add_v4i32:
278; GFX12:       ; %bb.0:
279; GFX12-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
280; GFX12-NEXT:    s_wait_kmcnt 0x0
281; GFX12-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
282; GFX12-NEXT:    s_wait_kmcnt 0x0
283; GFX12-NEXT:    s_add_co_i32 s3, s3, s7
284; GFX12-NEXT:    s_add_co_i32 s2, s2, s6
285; GFX12-NEXT:    s_add_co_i32 s0, s0, s4
286; GFX12-NEXT:    s_add_co_i32 s1, s1, s5
287; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
288; GFX12-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
289; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
290; GFX12-NEXT:    v_mov_b32_e32 v2, s2
291; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[8:9]
292; GFX12-NEXT:    s_endpgm
  ; %b_ptr addresses the <4 x i32> vector immediately after %in (index 1 in
  ; units of the vector type).
293  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
294  %a = load <4 x i32>, ptr addrspace(1) %in
295  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
296  %result = add <4 x i32> %a, %b
297  store <4 x i32> %result, ptr addrspace(1) %out
298  ret void
299}
300
; Scalar <8 x i32> add where both operands are passed by value as kernel
; arguments (the IR contains no loads), with the sum stored to %out. The
; autogenerated checks below expect one s_add_i32 (s_add_co_i32 for the
; gfx1200 run) per element and the result written as two 16-byte stores
; (offsets 16 and 0).
; The basic-block label "entry" appears in the checks as "; %entry", so
; renaming it requires regenerating the checks.
301define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) {
302; GFX6-LABEL: s_add_v8i32:
303; GFX6:       ; %bb.0: ; %entry
304; GFX6-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x11
305; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
306; GFX6-NEXT:    s_mov_b32 s3, 0xf000
307; GFX6-NEXT:    s_mov_b32 s2, -1
308; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
309; GFX6-NEXT:    s_add_i32 s4, s11, s19
310; GFX6-NEXT:    s_add_i32 s5, s10, s18
311; GFX6-NEXT:    s_add_i32 s6, s9, s17
312; GFX6-NEXT:    s_add_i32 s7, s8, s16
313; GFX6-NEXT:    s_add_i32 s8, s15, s23
314; GFX6-NEXT:    s_add_i32 s9, s14, s22
315; GFX6-NEXT:    s_add_i32 s10, s13, s21
316; GFX6-NEXT:    s_add_i32 s11, s12, s20
317; GFX6-NEXT:    v_mov_b32_e32 v0, s11
318; GFX6-NEXT:    v_mov_b32_e32 v1, s10
319; GFX6-NEXT:    v_mov_b32_e32 v2, s9
320; GFX6-NEXT:    v_mov_b32_e32 v3, s8
321; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
322; GFX6-NEXT:    s_waitcnt expcnt(0)
323; GFX6-NEXT:    v_mov_b32_e32 v0, s7
324; GFX6-NEXT:    v_mov_b32_e32 v1, s6
325; GFX6-NEXT:    v_mov_b32_e32 v2, s5
326; GFX6-NEXT:    v_mov_b32_e32 v3, s4
327; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
328; GFX6-NEXT:    s_endpgm
329;
330; GFX8-LABEL: s_add_v8i32:
331; GFX8:       ; %bb.0: ; %entry
332; GFX8-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
333; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
334; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
335; GFX8-NEXT:    s_add_i32 s4, s11, s19
336; GFX8-NEXT:    s_add_i32 s5, s10, s18
337; GFX8-NEXT:    s_add_i32 s6, s9, s17
338; GFX8-NEXT:    s_add_i32 s7, s8, s16
339; GFX8-NEXT:    s_add_i32 s2, s15, s23
340; GFX8-NEXT:    s_add_i32 s3, s14, s22
341; GFX8-NEXT:    s_add_i32 s8, s13, s21
342; GFX8-NEXT:    s_add_i32 s9, s12, s20
343; GFX8-NEXT:    v_mov_b32_e32 v3, s2
344; GFX8-NEXT:    s_add_u32 s2, s0, 16
345; GFX8-NEXT:    v_mov_b32_e32 v2, s3
346; GFX8-NEXT:    s_addc_u32 s3, s1, 0
347; GFX8-NEXT:    v_mov_b32_e32 v5, s3
348; GFX8-NEXT:    v_mov_b32_e32 v0, s9
349; GFX8-NEXT:    v_mov_b32_e32 v1, s8
350; GFX8-NEXT:    v_mov_b32_e32 v4, s2
351; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
352; GFX8-NEXT:    v_mov_b32_e32 v5, s1
353; GFX8-NEXT:    v_mov_b32_e32 v0, s7
354; GFX8-NEXT:    v_mov_b32_e32 v1, s6
355; GFX8-NEXT:    v_mov_b32_e32 v2, s5
356; GFX8-NEXT:    v_mov_b32_e32 v3, s4
357; GFX8-NEXT:    v_mov_b32_e32 v4, s0
358; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
359; GFX8-NEXT:    s_endpgm
360;
361; GFX9-LABEL: s_add_v8i32:
362; GFX9:       ; %bb.0: ; %entry
363; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
364; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
365; GFX9-NEXT:    v_mov_b32_e32 v4, 0
366; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
367; GFX9-NEXT:    s_add_i32 s4, s9, s17
368; GFX9-NEXT:    s_add_i32 s5, s8, s16
369; GFX9-NEXT:    s_add_i32 s6, s15, s23
370; GFX9-NEXT:    s_add_i32 s7, s14, s22
371; GFX9-NEXT:    s_add_i32 s8, s13, s21
372; GFX9-NEXT:    s_add_i32 s9, s12, s20
373; GFX9-NEXT:    s_add_i32 s2, s11, s19
374; GFX9-NEXT:    s_add_i32 s3, s10, s18
375; GFX9-NEXT:    v_mov_b32_e32 v0, s9
376; GFX9-NEXT:    v_mov_b32_e32 v1, s8
377; GFX9-NEXT:    v_mov_b32_e32 v2, s7
378; GFX9-NEXT:    v_mov_b32_e32 v3, s6
379; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
380; GFX9-NEXT:    s_nop 0
381; GFX9-NEXT:    v_mov_b32_e32 v0, s5
382; GFX9-NEXT:    v_mov_b32_e32 v1, s4
383; GFX9-NEXT:    v_mov_b32_e32 v2, s3
384; GFX9-NEXT:    v_mov_b32_e32 v3, s2
385; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
386; GFX9-NEXT:    s_endpgm
387;
388; GFX10-LABEL: s_add_v8i32:
389; GFX10:       ; %bb.0: ; %entry
390; GFX10-NEXT:    s_clause 0x1
391; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x44
392; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
393; GFX10-NEXT:    v_mov_b32_e32 v8, 0
394; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX10-NEXT:    s_add_i32 s4, s9, s17
396; GFX10-NEXT:    s_add_i32 s5, s8, s16
397; GFX10-NEXT:    s_add_i32 s6, s15, s23
398; GFX10-NEXT:    s_add_i32 s7, s14, s22
399; GFX10-NEXT:    s_add_i32 s8, s12, s20
400; GFX10-NEXT:    s_add_i32 s9, s13, s21
401; GFX10-NEXT:    s_add_i32 s2, s11, s19
402; GFX10-NEXT:    s_add_i32 s3, s10, s18
403; GFX10-NEXT:    v_mov_b32_e32 v0, s8
404; GFX10-NEXT:    v_mov_b32_e32 v1, s9
405; GFX10-NEXT:    v_mov_b32_e32 v2, s7
406; GFX10-NEXT:    v_mov_b32_e32 v3, s6
407; GFX10-NEXT:    v_mov_b32_e32 v4, s5
408; GFX10-NEXT:    v_mov_b32_e32 v5, s4
409; GFX10-NEXT:    v_mov_b32_e32 v6, s3
410; GFX10-NEXT:    v_mov_b32_e32 v7, s2
411; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
412; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
413; GFX10-NEXT:    s_endpgm
414;
415; GFX11-LABEL: s_add_v8i32:
416; GFX11:       ; %bb.0: ; %entry
417; GFX11-NEXT:    s_clause 0x1
418; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
419; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
420; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX11-NEXT:    s_add_i32 s4, s9, s17
422; GFX11-NEXT:    s_add_i32 s5, s8, s16
423; GFX11-NEXT:    s_add_i32 s6, s15, s23
424; GFX11-NEXT:    s_add_i32 s7, s14, s22
425; GFX11-NEXT:    s_add_i32 s8, s12, s20
426; GFX11-NEXT:    s_add_i32 s9, s13, s21
427; GFX11-NEXT:    s_add_i32 s2, s11, s19
428; GFX11-NEXT:    s_add_i32 s3, s10, s18
429; GFX11-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
430; GFX11-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
431; GFX11-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4
432; GFX11-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2
433; GFX11-NEXT:    v_mov_b32_e32 v6, s3
434; GFX11-NEXT:    s_clause 0x1
435; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
436; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
437; GFX11-NEXT:    s_endpgm
438;
439; GFX12-LABEL: s_add_v8i32:
440; GFX12:       ; %bb.0: ; %entry
441; GFX12-NEXT:    s_clause 0x1
442; GFX12-NEXT:    s_load_b512 s[8:23], s[4:5], 0x44
443; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
444; GFX12-NEXT:    s_wait_kmcnt 0x0
445; GFX12-NEXT:    s_add_co_i32 s4, s9, s17
446; GFX12-NEXT:    s_add_co_i32 s5, s8, s16
447; GFX12-NEXT:    s_add_co_i32 s6, s15, s23
448; GFX12-NEXT:    s_add_co_i32 s7, s14, s22
449; GFX12-NEXT:    s_add_co_i32 s8, s12, s20
450; GFX12-NEXT:    s_add_co_i32 s9, s13, s21
451; GFX12-NEXT:    s_add_co_i32 s2, s11, s19
452; GFX12-NEXT:    s_add_co_i32 s3, s10, s18
453; GFX12-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
454; GFX12-NEXT:    v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
455; GFX12-NEXT:    v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s4
456; GFX12-NEXT:    v_dual_mov_b32 v4, s5 :: v_dual_mov_b32 v7, s2
457; GFX12-NEXT:    v_mov_b32_e32 v6, s3
458; GFX12-NEXT:    s_clause 0x1
459; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
460; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
461; GFX12-NEXT:    s_endpgm
462entry:
463  %0 = add <8 x i32> %a, %b
464  store <8 x i32> %0, ptr addrspace(1) %out
465  ret void
466}
467
; Scalar <16 x i32> add where both operands are passed by value as kernel
; arguments (the IR contains no loads), with the sum stored to %out. The
; autogenerated checks below expect one s_add_i32 (s_add_co_i32 for the
; gfx1200 run) per element and the result written as four 16-byte stores
; (offsets 48, 32, 16 and 0).
; The basic-block label "entry" appears in the checks as "; %entry", so
; renaming it requires regenerating the checks.
468define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <16 x i32> %b) {
469; GFX6-LABEL: s_add_v16i32:
470; GFX6:       ; %bb.0: ; %entry
471; GFX6-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x19
472; GFX6-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x29
473; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
474; GFX6-NEXT:    s_mov_b32 s3, 0xf000
475; GFX6-NEXT:    s_mov_b32 s2, -1
476; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX6-NEXT:    s_add_i32 s6, s11, s39
478; GFX6-NEXT:    s_add_i32 s7, s10, s38
479; GFX6-NEXT:    s_add_i32 s10, s15, s43
480; GFX6-NEXT:    s_add_i32 s11, s14, s42
481; GFX6-NEXT:    s_add_i32 s14, s19, s47
482; GFX6-NEXT:    s_add_i32 s15, s18, s46
483; GFX6-NEXT:    s_add_i32 s18, s23, s51
484; GFX6-NEXT:    s_add_i32 s19, s22, s50
485; GFX6-NEXT:    s_add_i32 s21, s21, s49
486; GFX6-NEXT:    s_add_i32 s20, s20, s48
487; GFX6-NEXT:    s_add_i32 s17, s17, s45
488; GFX6-NEXT:    s_add_i32 s16, s16, s44
489; GFX6-NEXT:    v_mov_b32_e32 v0, s20
490; GFX6-NEXT:    v_mov_b32_e32 v1, s21
491; GFX6-NEXT:    v_mov_b32_e32 v2, s19
492; GFX6-NEXT:    v_mov_b32_e32 v3, s18
493; GFX6-NEXT:    s_add_i32 s13, s13, s41
494; GFX6-NEXT:    s_add_i32 s12, s12, s40
495; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
496; GFX6-NEXT:    s_waitcnt expcnt(0)
497; GFX6-NEXT:    v_mov_b32_e32 v0, s16
498; GFX6-NEXT:    v_mov_b32_e32 v1, s17
499; GFX6-NEXT:    v_mov_b32_e32 v2, s15
500; GFX6-NEXT:    v_mov_b32_e32 v3, s14
501; GFX6-NEXT:    s_add_i32 s9, s9, s37
502; GFX6-NEXT:    s_add_i32 s8, s8, s36
503; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
504; GFX6-NEXT:    s_waitcnt expcnt(0)
505; GFX6-NEXT:    v_mov_b32_e32 v0, s12
506; GFX6-NEXT:    v_mov_b32_e32 v1, s13
507; GFX6-NEXT:    v_mov_b32_e32 v2, s11
508; GFX6-NEXT:    v_mov_b32_e32 v3, s10
509; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
510; GFX6-NEXT:    s_waitcnt expcnt(0)
511; GFX6-NEXT:    v_mov_b32_e32 v0, s8
512; GFX6-NEXT:    v_mov_b32_e32 v1, s9
513; GFX6-NEXT:    v_mov_b32_e32 v2, s7
514; GFX6-NEXT:    v_mov_b32_e32 v3, s6
515; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
516; GFX6-NEXT:    s_endpgm
517;
518; GFX8-LABEL: s_add_v16i32:
519; GFX8:       ; %bb.0: ; %entry
520; GFX8-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
521; GFX8-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
522; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
523; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX8-NEXT:    s_add_i32 s4, s11, s39
525; GFX8-NEXT:    s_add_i32 s5, s10, s38
526; GFX8-NEXT:    s_add_i32 s6, s9, s37
527; GFX8-NEXT:    s_add_i32 s7, s8, s36
528; GFX8-NEXT:    s_add_i32 s8, s15, s43
529; GFX8-NEXT:    s_add_i32 s9, s14, s42
530; GFX8-NEXT:    s_add_i32 s10, s13, s41
531; GFX8-NEXT:    s_add_i32 s11, s12, s40
532; GFX8-NEXT:    s_add_i32 s12, s19, s47
533; GFX8-NEXT:    s_add_i32 s13, s18, s46
534; GFX8-NEXT:    s_add_i32 s14, s17, s45
535; GFX8-NEXT:    s_add_i32 s15, s16, s44
536; GFX8-NEXT:    s_add_i32 s2, s23, s51
537; GFX8-NEXT:    s_add_i32 s3, s22, s50
538; GFX8-NEXT:    s_add_i32 s16, s21, s49
539; GFX8-NEXT:    s_add_i32 s17, s20, s48
540; GFX8-NEXT:    v_mov_b32_e32 v3, s2
541; GFX8-NEXT:    s_add_u32 s2, s0, 48
542; GFX8-NEXT:    v_mov_b32_e32 v2, s3
543; GFX8-NEXT:    s_addc_u32 s3, s1, 0
544; GFX8-NEXT:    v_mov_b32_e32 v5, s3
545; GFX8-NEXT:    v_mov_b32_e32 v4, s2
546; GFX8-NEXT:    s_add_u32 s2, s0, 32
547; GFX8-NEXT:    v_mov_b32_e32 v0, s17
548; GFX8-NEXT:    v_mov_b32_e32 v1, s16
549; GFX8-NEXT:    s_addc_u32 s3, s1, 0
550; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
551; GFX8-NEXT:    v_mov_b32_e32 v5, s3
552; GFX8-NEXT:    v_mov_b32_e32 v4, s2
553; GFX8-NEXT:    s_add_u32 s2, s0, 16
554; GFX8-NEXT:    v_mov_b32_e32 v0, s15
555; GFX8-NEXT:    v_mov_b32_e32 v1, s14
556; GFX8-NEXT:    v_mov_b32_e32 v2, s13
557; GFX8-NEXT:    v_mov_b32_e32 v3, s12
558; GFX8-NEXT:    s_addc_u32 s3, s1, 0
559; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
560; GFX8-NEXT:    v_mov_b32_e32 v5, s3
561; GFX8-NEXT:    v_mov_b32_e32 v0, s11
562; GFX8-NEXT:    v_mov_b32_e32 v1, s10
563; GFX8-NEXT:    v_mov_b32_e32 v2, s9
564; GFX8-NEXT:    v_mov_b32_e32 v3, s8
565; GFX8-NEXT:    v_mov_b32_e32 v4, s2
566; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
567; GFX8-NEXT:    v_mov_b32_e32 v5, s1
568; GFX8-NEXT:    v_mov_b32_e32 v0, s7
569; GFX8-NEXT:    v_mov_b32_e32 v1, s6
570; GFX8-NEXT:    v_mov_b32_e32 v2, s5
571; GFX8-NEXT:    v_mov_b32_e32 v3, s4
572; GFX8-NEXT:    v_mov_b32_e32 v4, s0
573; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
574; GFX8-NEXT:    s_endpgm
575;
576; GFX9-LABEL: s_add_v16i32:
577; GFX9:       ; %bb.0: ; %entry
578; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
579; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
580; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
581; GFX9-NEXT:    v_mov_b32_e32 v4, 0
582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX9-NEXT:    s_add_i32 s4, s9, s37
584; GFX9-NEXT:    s_add_i32 s5, s8, s36
585; GFX9-NEXT:    s_add_i32 s6, s15, s43
586; GFX9-NEXT:    s_add_i32 s7, s14, s42
587; GFX9-NEXT:    s_add_i32 s8, s13, s41
588; GFX9-NEXT:    s_add_i32 s9, s12, s40
589; GFX9-NEXT:    s_add_i32 s12, s17, s45
590; GFX9-NEXT:    s_add_i32 s13, s16, s44
591; GFX9-NEXT:    s_add_i32 s14, s23, s51
592; GFX9-NEXT:    s_add_i32 s15, s22, s50
593; GFX9-NEXT:    s_add_i32 s16, s21, s49
594; GFX9-NEXT:    s_add_i32 s17, s20, s48
595; GFX9-NEXT:    s_add_i32 s2, s11, s39
596; GFX9-NEXT:    s_add_i32 s3, s10, s38
597; GFX9-NEXT:    s_add_i32 s10, s19, s47
598; GFX9-NEXT:    s_add_i32 s11, s18, s46
599; GFX9-NEXT:    v_mov_b32_e32 v0, s17
600; GFX9-NEXT:    v_mov_b32_e32 v1, s16
601; GFX9-NEXT:    v_mov_b32_e32 v2, s15
602; GFX9-NEXT:    v_mov_b32_e32 v3, s14
603; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
604; GFX9-NEXT:    s_nop 0
605; GFX9-NEXT:    v_mov_b32_e32 v0, s13
606; GFX9-NEXT:    v_mov_b32_e32 v1, s12
607; GFX9-NEXT:    v_mov_b32_e32 v2, s11
608; GFX9-NEXT:    v_mov_b32_e32 v3, s10
609; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
610; GFX9-NEXT:    s_nop 0
611; GFX9-NEXT:    v_mov_b32_e32 v0, s9
612; GFX9-NEXT:    v_mov_b32_e32 v1, s8
613; GFX9-NEXT:    v_mov_b32_e32 v2, s7
614; GFX9-NEXT:    v_mov_b32_e32 v3, s6
615; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
616; GFX9-NEXT:    s_nop 0
617; GFX9-NEXT:    v_mov_b32_e32 v0, s5
618; GFX9-NEXT:    v_mov_b32_e32 v1, s4
619; GFX9-NEXT:    v_mov_b32_e32 v2, s3
620; GFX9-NEXT:    v_mov_b32_e32 v3, s2
621; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
622; GFX9-NEXT:    s_endpgm
623;
624; GFX10-LABEL: s_add_v16i32:
625; GFX10:       ; %bb.0: ; %entry
626; GFX10-NEXT:    s_clause 0x2
627; GFX10-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
628; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
629; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
630; GFX10-NEXT:    v_mov_b32_e32 v16, 0
631; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX10-NEXT:    s_add_i32 s4, s9, s37
633; GFX10-NEXT:    s_add_i32 s5, s8, s36
634; GFX10-NEXT:    s_add_i32 s6, s15, s43
635; GFX10-NEXT:    s_add_i32 s7, s14, s42
636; GFX10-NEXT:    s_add_i32 s8, s13, s41
637; GFX10-NEXT:    s_add_i32 s9, s12, s40
638; GFX10-NEXT:    s_add_i32 s12, s17, s45
639; GFX10-NEXT:    s_add_i32 s13, s16, s44
640; GFX10-NEXT:    s_add_i32 s14, s23, s51
641; GFX10-NEXT:    s_add_i32 s15, s22, s50
642; GFX10-NEXT:    s_add_i32 s16, s20, s48
643; GFX10-NEXT:    s_add_i32 s17, s21, s49
644; GFX10-NEXT:    s_add_i32 s2, s11, s39
645; GFX10-NEXT:    s_add_i32 s3, s10, s38
646; GFX10-NEXT:    s_add_i32 s10, s19, s47
647; GFX10-NEXT:    s_add_i32 s11, s18, s46
648; GFX10-NEXT:    v_mov_b32_e32 v0, s16
649; GFX10-NEXT:    v_mov_b32_e32 v1, s17
650; GFX10-NEXT:    v_mov_b32_e32 v2, s15
651; GFX10-NEXT:    v_mov_b32_e32 v3, s14
652; GFX10-NEXT:    v_mov_b32_e32 v4, s13
653; GFX10-NEXT:    v_mov_b32_e32 v5, s12
654; GFX10-NEXT:    v_mov_b32_e32 v6, s11
655; GFX10-NEXT:    v_mov_b32_e32 v7, s10
656; GFX10-NEXT:    v_mov_b32_e32 v8, s9
657; GFX10-NEXT:    v_mov_b32_e32 v9, s8
658; GFX10-NEXT:    v_mov_b32_e32 v10, s7
659; GFX10-NEXT:    v_mov_b32_e32 v11, s6
660; GFX10-NEXT:    v_mov_b32_e32 v12, s5
661; GFX10-NEXT:    v_mov_b32_e32 v13, s4
662; GFX10-NEXT:    v_mov_b32_e32 v14, s3
663; GFX10-NEXT:    v_mov_b32_e32 v15, s2
664; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
665; GFX10-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
666; GFX10-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
667; GFX10-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1]
668; GFX10-NEXT:    s_endpgm
669;
670; GFX11-LABEL: s_add_v16i32:
671; GFX11:       ; %bb.0: ; %entry
672; GFX11-NEXT:    s_clause 0x2
673; GFX11-NEXT:    s_load_b512 s[8:23], s[4:5], 0x64
674; GFX11-NEXT:    s_load_b512 s[36:51], s[4:5], 0xa4
675; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
676; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX11-NEXT:    s_add_i32 s4, s9, s37
678; GFX11-NEXT:    s_add_i32 s5, s8, s36
679; GFX11-NEXT:    s_add_i32 s6, s15, s43
680; GFX11-NEXT:    s_add_i32 s7, s14, s42
681; GFX11-NEXT:    s_add_i32 s8, s13, s41
682; GFX11-NEXT:    s_add_i32 s9, s12, s40
683; GFX11-NEXT:    s_add_i32 s12, s17, s45
684; GFX11-NEXT:    s_add_i32 s13, s16, s44
685; GFX11-NEXT:    s_add_i32 s14, s23, s51
686; GFX11-NEXT:    s_add_i32 s15, s22, s50
687; GFX11-NEXT:    s_add_i32 s16, s20, s48
688; GFX11-NEXT:    s_add_i32 s17, s21, s49
689; GFX11-NEXT:    s_add_i32 s2, s11, s39
690; GFX11-NEXT:    s_add_i32 s3, s10, s38
691; GFX11-NEXT:    s_add_i32 s10, s19, s47
692; GFX11-NEXT:    s_add_i32 s11, s18, s46
693; GFX11-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
694; GFX11-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14
695; GFX11-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12
696; GFX11-NEXT:    v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10
697; GFX11-NEXT:    v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
698; GFX11-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
699; GFX11-NEXT:    v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4
700; GFX11-NEXT:    v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2
701; GFX11-NEXT:    v_mov_b32_e32 v14, s3
702; GFX11-NEXT:    s_clause 0x3
703; GFX11-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
704; GFX11-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
705; GFX11-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
706; GFX11-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
707; GFX11-NEXT:    s_endpgm
708;
709; GFX12-LABEL: s_add_v16i32:
710; GFX12:       ; %bb.0: ; %entry
711; GFX12-NEXT:    s_clause 0x2
712; GFX12-NEXT:    s_load_b512 s[8:23], s[4:5], 0x64
713; GFX12-NEXT:    s_load_b512 s[36:51], s[4:5], 0xa4
714; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
715; GFX12-NEXT:    s_wait_kmcnt 0x0
716; GFX12-NEXT:    s_add_co_i32 s4, s9, s37
717; GFX12-NEXT:    s_add_co_i32 s5, s8, s36
718; GFX12-NEXT:    s_add_co_i32 s6, s15, s43
719; GFX12-NEXT:    s_add_co_i32 s7, s14, s42
720; GFX12-NEXT:    s_add_co_i32 s8, s13, s41
721; GFX12-NEXT:    s_add_co_i32 s9, s12, s40
722; GFX12-NEXT:    s_add_co_i32 s12, s17, s45
723; GFX12-NEXT:    s_add_co_i32 s13, s16, s44
724; GFX12-NEXT:    s_add_co_i32 s14, s23, s51
725; GFX12-NEXT:    s_add_co_i32 s15, s22, s50
726; GFX12-NEXT:    s_add_co_i32 s16, s20, s48
727; GFX12-NEXT:    s_add_co_i32 s17, s21, s49
728; GFX12-NEXT:    s_add_co_i32 s2, s11, s39
729; GFX12-NEXT:    s_add_co_i32 s3, s10, s38
730; GFX12-NEXT:    s_add_co_i32 s10, s19, s47
731; GFX12-NEXT:    s_add_co_i32 s11, s18, s46
732; GFX12-NEXT:    v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
733; GFX12-NEXT:    v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v3, s14
734; GFX12-NEXT:    v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s12
735; GFX12-NEXT:    v_dual_mov_b32 v4, s13 :: v_dual_mov_b32 v7, s10
736; GFX12-NEXT:    v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
737; GFX12-NEXT:    v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
738; GFX12-NEXT:    v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s4
739; GFX12-NEXT:    v_dual_mov_b32 v12, s5 :: v_dual_mov_b32 v15, s2
740; GFX12-NEXT:    v_mov_b32_e32 v14, s3
741; GFX12-NEXT:    s_clause 0x3
742; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
743; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
744; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
745; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
746; GFX12-NEXT:    s_endpgm
747entry:
748  %0 = add <16 x i32> %a, %b
749  store <16 x i32> %0, ptr addrspace(1) %out
750  ret void
751}
752
; i32 add of two values loaded per work-item: each work-item volatile-loads the
; i32 at index %tid of %in and the i32 immediately after it, adds them, and
; stores the sum to %out.
; The checks below expect a vector add on every target: v_add_i32 on GFX6,
; v_add_u32 on GFX8/GFX9 and v_add_nc_u32 on GFX10/GFX11/GFX12.
753define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
754; GFX6-LABEL: v_add_i32:
755; GFX6:       ; %bb.0:
756; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
757; GFX6-NEXT:    s_mov_b32 s7, 0xf000
758; GFX6-NEXT:    s_mov_b32 s10, 0
759; GFX6-NEXT:    s_mov_b32 s11, s7
760; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
761; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
763; GFX6-NEXT:    v_mov_b32_e32 v1, 0
764; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
765; GFX6-NEXT:    s_waitcnt vmcnt(0)
766; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
767; GFX6-NEXT:    s_waitcnt vmcnt(0)
768; GFX6-NEXT:    s_mov_b32 s6, -1
769; GFX6-NEXT:    s_mov_b32 s4, s0
770; GFX6-NEXT:    s_mov_b32 s5, s1
771; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
772; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
773; GFX6-NEXT:    s_endpgm
774;
775; GFX8-LABEL: v_add_i32:
776; GFX8:       ; %bb.0:
777; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
778; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
779; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
780; GFX8-NEXT:    v_mov_b32_e32 v1, s3
781; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
782; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
783; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
784; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
785; GFX8-NEXT:    flat_load_dword v4, v[0:1] glc
786; GFX8-NEXT:    s_waitcnt vmcnt(0)
787; GFX8-NEXT:    flat_load_dword v2, v[2:3] glc
788; GFX8-NEXT:    s_waitcnt vmcnt(0)
789; GFX8-NEXT:    v_mov_b32_e32 v0, s0
790; GFX8-NEXT:    v_mov_b32_e32 v1, s1
791; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v2
792; GFX8-NEXT:    flat_store_dword v[0:1], v2
793; GFX8-NEXT:    s_endpgm
794;
795; GFX9-LABEL: v_add_i32:
796; GFX9:       ; %bb.0:
797; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
798; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
799; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
801; GFX9-NEXT:    s_waitcnt vmcnt(0)
802; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
803; GFX9-NEXT:    s_waitcnt vmcnt(0)
804; GFX9-NEXT:    v_mov_b32_e32 v0, 0
805; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
806; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
807; GFX9-NEXT:    s_endpgm
808;
809; GFX10-LABEL: v_add_i32:
810; GFX10:       ; %bb.0:
811; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
812; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
813; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX10-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
815; GFX10-NEXT:    s_waitcnt vmcnt(0)
816; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
817; GFX10-NEXT:    s_waitcnt vmcnt(0)
818; GFX10-NEXT:    v_mov_b32_e32 v0, 0
819; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
820; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
821; GFX10-NEXT:    s_endpgm
822;
823; GFX11-LABEL: v_add_i32:
824; GFX11:       ; %bb.0:
825; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
826; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
827; GFX11-NEXT:    v_mov_b32_e32 v2, 0
828; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
829; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
830; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
831; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
832; GFX11-NEXT:    s_waitcnt vmcnt(0)
833; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
834; GFX11-NEXT:    s_waitcnt vmcnt(0)
835; GFX11-NEXT:    v_add_nc_u32_e32 v0, v1, v0
836; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
837; GFX11-NEXT:    s_endpgm
838;
839; GFX12-LABEL: v_add_i32:
840; GFX12:       ; %bb.0:
841; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
842; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
843; GFX12-NEXT:    v_mov_b32_e32 v2, 0
844; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
845; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
846; GFX12-NEXT:    s_wait_kmcnt 0x0
847; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
848; GFX12-NEXT:    s_wait_loadcnt 0x0
849; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
850; GFX12-NEXT:    s_wait_loadcnt 0x0
851; GFX12-NEXT:    v_add_nc_u32_e32 v0, v1, v0
852; GFX12-NEXT:    global_store_b32 v2, v0, s[0:1]
853; GFX12-NEXT:    s_endpgm
; %gep addresses element %tid of %in; %b_ptr addresses the element after it.
854  %tid = call i32 @llvm.amdgcn.workitem.id.x()
855  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
856  %b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
; Both loads are volatile; the checks above show two separate load
; instructions, each followed by its own wait.
857  %a = load volatile i32, ptr addrspace(1) %gep
858  %b = load volatile i32, ptr addrspace(1) %b_ptr
859  %result = add i32 %a, %b
860  store i32 %result, ptr addrspace(1) %out
861  ret void
862}
863
; i32 add of a loaded value and an immediate: each work-item volatile-loads the
; i32 at index %tid of %in, adds the constant 123 (0x7b in the checks below),
; and stores the result to %out.
; The checks expect a vector add with the literal as an operand on every
; target (v_add_i32 on GFX6, v_add_u32 on GFX8/GFX9, v_add_nc_u32 on GFX10+).
864define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
865; GFX6-LABEL: v_add_imm_i32:
866; GFX6:       ; %bb.0:
867; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
868; GFX6-NEXT:    s_mov_b32 s7, 0xf000
869; GFX6-NEXT:    s_mov_b32 s10, 0
870; GFX6-NEXT:    s_mov_b32 s11, s7
871; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
872; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
873; GFX6-NEXT:    s_mov_b64 s[8:9], s[2:3]
874; GFX6-NEXT:    v_mov_b32_e32 v1, 0
875; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 glc
876; GFX6-NEXT:    s_waitcnt vmcnt(0)
877; GFX6-NEXT:    s_mov_b32 s6, -1
878; GFX6-NEXT:    s_mov_b32 s4, s0
879; GFX6-NEXT:    s_mov_b32 s5, s1
880; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 0x7b, v0
881; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
882; GFX6-NEXT:    s_endpgm
883;
884; GFX8-LABEL: v_add_imm_i32:
885; GFX8:       ; %bb.0:
886; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
887; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
888; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
889; GFX8-NEXT:    v_mov_b32_e32 v1, s3
890; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
891; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
892; GFX8-NEXT:    flat_load_dword v2, v[0:1] glc
893; GFX8-NEXT:    s_waitcnt vmcnt(0)
894; GFX8-NEXT:    v_mov_b32_e32 v0, s0
895; GFX8-NEXT:    v_mov_b32_e32 v1, s1
896; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x7b, v2
897; GFX8-NEXT:    flat_store_dword v[0:1], v2
898; GFX8-NEXT:    s_endpgm
899;
900; GFX9-LABEL: v_add_imm_i32:
901; GFX9:       ; %bb.0:
902; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
903; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
904; GFX9-NEXT:    v_mov_b32_e32 v1, 0
905; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
907; GFX9-NEXT:    s_waitcnt vmcnt(0)
908; GFX9-NEXT:    v_add_u32_e32 v0, 0x7b, v0
909; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
910; GFX9-NEXT:    s_endpgm
911;
912; GFX10-LABEL: v_add_imm_i32:
913; GFX10:       ; %bb.0:
914; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
915; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
916; GFX10-NEXT:    v_mov_b32_e32 v1, 0
917; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
918; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
919; GFX10-NEXT:    s_waitcnt vmcnt(0)
920; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
921; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
922; GFX10-NEXT:    s_endpgm
923;
924; GFX11-LABEL: v_add_imm_i32:
925; GFX11:       ; %bb.0:
926; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
927; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
928; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
929; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
930; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3] glc dlc
932; GFX11-NEXT:    s_waitcnt vmcnt(0)
933; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
934; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
935; GFX11-NEXT:    s_endpgm
936;
937; GFX12-LABEL: v_add_imm_i32:
938; GFX12:       ; %bb.0:
939; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
940; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
941; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
942; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
943; GFX12-NEXT:    s_wait_kmcnt 0x0
944; GFX12-NEXT:    global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
945; GFX12-NEXT:    s_wait_loadcnt 0x0
946; GFX12-NEXT:    v_add_nc_u32_e32 v0, 0x7b, v0
947; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
948; GFX12-NEXT:    s_endpgm
; Only one element is read here; the unused second-element pointer that used
; to be computed in this function has been dropped. None of the expected
; instruction sequences above reference it (there is no offset:4 access).
949  %tid = call i32 @llvm.amdgcn.workitem.id.x()
950  %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
952  %a = load volatile i32, ptr addrspace(1) %gep
953  %result = add i32 %a, 123
954  store i32 %result, ptr addrspace(1) %out
955  ret void
956}
957
; i64 add of two kernel arguments (%a + %b), stored to %out.
; The checks below expect the add to be done with scalar instructions: an
; s_add_u32 / s_addc_u32 pair on GFX6 through GFX11, and a single
; s_add_nc_u64 on GFX12. The result is then moved to VGPRs for the store.
958define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
959; GFX6-LABEL: add64:
960; GFX6:       ; %bb.0: ; %entry
961; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
962; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
963; GFX6-NEXT:    s_mov_b32 s7, 0xf000
964; GFX6-NEXT:    s_mov_b32 s6, -1
965; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
966; GFX6-NEXT:    s_mov_b32 s4, s0
967; GFX6-NEXT:    s_add_u32 s0, s2, s8
968; GFX6-NEXT:    s_mov_b32 s5, s1
969; GFX6-NEXT:    s_addc_u32 s1, s3, s9
970; GFX6-NEXT:    v_mov_b32_e32 v0, s0
971; GFX6-NEXT:    v_mov_b32_e32 v1, s1
972; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
973; GFX6-NEXT:    s_endpgm
974;
975; GFX8-LABEL: add64:
976; GFX8:       ; %bb.0: ; %entry
977; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
978; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
979; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
980; GFX8-NEXT:    v_mov_b32_e32 v0, s0
981; GFX8-NEXT:    s_add_u32 s0, s2, s4
982; GFX8-NEXT:    v_mov_b32_e32 v1, s1
983; GFX8-NEXT:    s_addc_u32 s1, s3, s5
984; GFX8-NEXT:    v_mov_b32_e32 v3, s1
985; GFX8-NEXT:    v_mov_b32_e32 v2, s0
986; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
987; GFX8-NEXT:    s_endpgm
988;
989; GFX9-LABEL: add64:
990; GFX9:       ; %bb.0: ; %entry
991; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
992; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
993; GFX9-NEXT:    v_mov_b32_e32 v2, 0
994; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
995; GFX9-NEXT:    s_add_u32 s2, s2, s6
996; GFX9-NEXT:    s_addc_u32 s3, s3, s7
997; GFX9-NEXT:    v_mov_b32_e32 v0, s2
998; GFX9-NEXT:    v_mov_b32_e32 v1, s3
999; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1000; GFX9-NEXT:    s_endpgm
1001;
1002; GFX10-LABEL: add64:
1003; GFX10:       ; %bb.0: ; %entry
1004; GFX10-NEXT:    s_clause 0x1
1005; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1006; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1007; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1008; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1009; GFX10-NEXT:    s_add_u32 s2, s2, s6
1010; GFX10-NEXT:    s_addc_u32 s3, s3, s7
1011; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1012; GFX10-NEXT:    v_mov_b32_e32 v1, s3
1013; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1014; GFX10-NEXT:    s_endpgm
1015;
1016; GFX11-LABEL: add64:
1017; GFX11:       ; %bb.0: ; %entry
1018; GFX11-NEXT:    s_clause 0x1
1019; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1020; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1021; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1022; GFX11-NEXT:    s_add_u32 s2, s2, s4
1023; GFX11-NEXT:    s_addc_u32 s3, s3, s5
1024; GFX11-NEXT:    v_mov_b32_e32 v0, s2
1025; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1026; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1027; GFX11-NEXT:    s_endpgm
1028;
1029; GFX12-LABEL: add64:
1030; GFX12:       ; %bb.0: ; %entry
1031; GFX12-NEXT:    s_clause 0x1
1032; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1033; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
1034; GFX12-NEXT:    s_wait_kmcnt 0x0
1035; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
1036; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1037; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1038; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1039; GFX12-NEXT:    s_endpgm
1040entry:
1041  %add = add i64 %a, %b
1042  store i64 %add, ptr addrspace(1) %out
1043  ret void
1044}
1045
1046; The v_addc_u32 and v_add_i32 instructions can't read SGPRs, because they
1047; use VCC.  The test is designed so that %a will be stored in an SGPR and
1048; %0 will be stored in a VGPR, so the compiler will be forced to copy %a
1049; to a VGPR before doing the add.
; i64 add of a kernel argument (%a) and a value loaded (non-volatile) from %in;
; the sum is stored to %out.
; NOTE(review): in the checks below the loaded value arrives via a scalar load
; (s_load_dwordx2 / s_load_b64) and the add is done with scalar instructions
; (s_add_u32 + s_addc_u32, or s_add_nc_u64 on GFX12); no v_add*/v_addc*
; instruction appears. The SGPR-to-VGPR copy described in the comment above
; this function therefore looks out of date for current codegen - confirm.
1050define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr addrspace(1) %in) {
1051; GFX6-LABEL: add64_sgpr_vgpr:
1052; GFX6:       ; %bb.0: ; %entry
1053; GFX6-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
1054; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1055; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1056; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
1057; GFX6-NEXT:    s_mov_b32 s4, s0
1058; GFX6-NEXT:    s_mov_b32 s5, s1
1059; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1060; GFX6-NEXT:    s_mov_b32 s6, -1
1061; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1062; GFX6-NEXT:    s_add_u32 s0, s2, s8
1063; GFX6-NEXT:    s_addc_u32 s1, s3, s9
1064; GFX6-NEXT:    v_mov_b32_e32 v0, s0
1065; GFX6-NEXT:    v_mov_b32_e32 v1, s1
1066; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1067; GFX6-NEXT:    s_endpgm
1068;
1069; GFX8-LABEL: add64_sgpr_vgpr:
1070; GFX8:       ; %bb.0: ; %entry
1071; GFX8-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1072; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1073; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1074; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1075; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1076; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1077; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1078; GFX8-NEXT:    s_add_u32 s0, s2, s4
1079; GFX8-NEXT:    s_addc_u32 s1, s3, s5
1080; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1081; GFX8-NEXT:    v_mov_b32_e32 v2, s0
1082; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1083; GFX8-NEXT:    s_endpgm
1084;
1085; GFX9-LABEL: add64_sgpr_vgpr:
1086; GFX9:       ; %bb.0: ; %entry
1087; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1088; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1089; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1090; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1091; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1092; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX9-NEXT:    s_add_u32 s2, s2, s4
1094; GFX9-NEXT:    s_addc_u32 s3, s3, s5
1095; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1096; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1097; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1098; GFX9-NEXT:    s_endpgm
1099;
1100; GFX10-LABEL: add64_sgpr_vgpr:
1101; GFX10:       ; %bb.0: ; %entry
1102; GFX10-NEXT:    s_clause 0x1
1103; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
1104; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1105; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1106; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1107; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
1108; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX10-NEXT:    s_add_u32 s2, s2, s4
1110; GFX10-NEXT:    s_addc_u32 s3, s3, s5
1111; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1112; GFX10-NEXT:    v_mov_b32_e32 v1, s3
1113; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1114; GFX10-NEXT:    s_endpgm
1115;
1116; GFX11-LABEL: add64_sgpr_vgpr:
1117; GFX11:       ; %bb.0: ; %entry
1118; GFX11-NEXT:    s_clause 0x1
1119; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1120; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1121; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX11-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
1123; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1124; GFX11-NEXT:    s_add_u32 s2, s2, s4
1125; GFX11-NEXT:    s_addc_u32 s3, s3, s5
1126; GFX11-NEXT:    v_mov_b32_e32 v0, s2
1127; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
1128; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1129; GFX11-NEXT:    s_endpgm
1130;
1131; GFX12-LABEL: add64_sgpr_vgpr:
1132; GFX12:       ; %bb.0: ; %entry
1133; GFX12-NEXT:    s_clause 0x1
1134; GFX12-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
1135; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1136; GFX12-NEXT:    s_wait_kmcnt 0x0
1137; GFX12-NEXT:    s_load_b64 s[4:5], s[6:7], 0x0
1138; GFX12-NEXT:    s_wait_kmcnt 0x0
1139; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
1140; GFX12-NEXT:    v_mov_b32_e32 v2, 0
1141; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
1142; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1143; GFX12-NEXT:    s_endpgm
1144entry:
1145  %0 = load i64, ptr addrspace(1) %in
1146  %1 = add i64 %a, %0
1147  store i64 %1, ptr addrspace(1) %out
1148  ret void
1149}
1150
1151; Test i64 add inside a branch.
; Control flow: when %a == 0 the %if block loads an i64 from %in; otherwise the
; %else block computes %a + %b. The phi in %endif selects whichever value was
; produced and it is stored to %out.
; The checks below expect the add in the %else block to stay scalar
; (s_add_u32 + s_addc_u32 on GFX6 through GFX11, s_add_nc_u64 on GFX12).
; The IR block names (%entry, %if, %else, %endif) appear in the expected
; output, so they must not be renamed without regenerating the assertions.
; NOTE(review): kernel argument %c is declared but not used in the body.
1152define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b, i64 %c) {
1153; GFX6-LABEL: add64_in_branch:
1154; GFX6:       ; %bb.0: ; %entry
1155; GFX6-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
1156; GFX6-NEXT:    s_mov_b64 s[8:9], 0
1157; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX6-NEXT:    v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
1159; GFX6-NEXT:    s_and_b64 vcc, exec, s[10:11]
1160; GFX6-NEXT:    s_cbranch_vccz .LBB9_4
1161; GFX6-NEXT:  ; %bb.1: ; %else
1162; GFX6-NEXT:    s_add_u32 s4, s4, s6
1163; GFX6-NEXT:    s_addc_u32 s5, s5, s7
1164; GFX6-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1165; GFX6-NEXT:    s_cbranch_vccnz .LBB9_3
1166; GFX6-NEXT:  .LBB9_2: ; %if
1167; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
1168; GFX6-NEXT:  .LBB9_3: ; %endif
1169; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX6-NEXT:    v_mov_b32_e32 v0, s4
1171; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1172; GFX6-NEXT:    s_mov_b32 s2, -1
1173; GFX6-NEXT:    v_mov_b32_e32 v1, s5
1174; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1175; GFX6-NEXT:    s_endpgm
1176; GFX6-NEXT:  .LBB9_4:
1177; GFX6-NEXT:    ; implicit-def: $sgpr4_sgpr5
1178; GFX6-NEXT:    s_branch .LBB9_2
1179;
1180; GFX8-LABEL: add64_in_branch:
1181; GFX8:       ; %bb.0: ; %entry
1182; GFX8-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
1183; GFX8-NEXT:    s_mov_b64 s[8:9], 0
1184; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1185; GFX8-NEXT:    s_cmp_lg_u64 s[4:5], 0
1186; GFX8-NEXT:    s_cbranch_scc0 .LBB9_4
1187; GFX8-NEXT:  ; %bb.1: ; %else
1188; GFX8-NEXT:    s_add_u32 s4, s4, s6
1189; GFX8-NEXT:    s_addc_u32 s5, s5, s7
1190; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1191; GFX8-NEXT:    s_cbranch_vccnz .LBB9_3
1192; GFX8-NEXT:  .LBB9_2: ; %if
1193; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
1194; GFX8-NEXT:  .LBB9_3: ; %endif
1195; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1196; GFX8-NEXT:    v_mov_b32_e32 v2, s4
1197; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1198; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1199; GFX8-NEXT:    v_mov_b32_e32 v3, s5
1200; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1201; GFX8-NEXT:    s_endpgm
1202; GFX8-NEXT:  .LBB9_4:
1203; GFX8-NEXT:    ; implicit-def: $sgpr4_sgpr5
1204; GFX8-NEXT:    s_branch .LBB9_2
1205;
1206; GFX9-LABEL: add64_in_branch:
1207; GFX9:       ; %bb.0: ; %entry
1208; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1209; GFX9-NEXT:    s_mov_b64 s[2:3], 0
1210; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1211; GFX9-NEXT:    s_cmp_lg_u64 s[12:13], 0
1212; GFX9-NEXT:    s_cbranch_scc0 .LBB9_4
1213; GFX9-NEXT:  ; %bb.1: ; %else
1214; GFX9-NEXT:    s_add_u32 s0, s12, s14
1215; GFX9-NEXT:    s_addc_u32 s1, s13, s15
1216; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
1217; GFX9-NEXT:    s_cbranch_vccnz .LBB9_3
1218; GFX9-NEXT:  .LBB9_2: ; %if
1219; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
1220; GFX9-NEXT:  .LBB9_3: ; %endif
1221; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1222; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1223; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1224; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1225; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
1226; GFX9-NEXT:    s_endpgm
1227; GFX9-NEXT:  .LBB9_4:
1228; GFX9-NEXT:    ; implicit-def: $sgpr0_sgpr1
1229; GFX9-NEXT:    s_branch .LBB9_2
1230;
1231; GFX10-LABEL: add64_in_branch:
1232; GFX10:       ; %bb.0: ; %entry
1233; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
1234; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1235; GFX10-NEXT:    s_cmp_lg_u64 s[12:13], 0
1236; GFX10-NEXT:    s_cbranch_scc0 .LBB9_4
1237; GFX10-NEXT:  ; %bb.1: ; %else
1238; GFX10-NEXT:    s_add_u32 s0, s12, s14
1239; GFX10-NEXT:    s_addc_u32 s1, s13, s15
1240; GFX10-NEXT:    s_cbranch_execnz .LBB9_3
1241; GFX10-NEXT:  .LBB9_2: ; %if
1242; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[10:11], 0x0
1243; GFX10-NEXT:  .LBB9_3: ; %endif
1244; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1246; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1247; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1248; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
1249; GFX10-NEXT:    s_endpgm
1250; GFX10-NEXT:  .LBB9_4:
1251; GFX10-NEXT:    ; implicit-def: $sgpr0_sgpr1
1252; GFX10-NEXT:    s_branch .LBB9_2
1253;
1254; GFX11-LABEL: add64_in_branch:
1255; GFX11:       ; %bb.0: ; %entry
1256; GFX11-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
1257; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1258; GFX11-NEXT:    s_cmp_lg_u64 s[4:5], 0
1259; GFX11-NEXT:    s_cbranch_scc0 .LBB9_4
1260; GFX11-NEXT:  ; %bb.1: ; %else
1261; GFX11-NEXT:    s_add_u32 s4, s4, s6
1262; GFX11-NEXT:    s_addc_u32 s5, s5, s7
1263; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
1264; GFX11-NEXT:  .LBB9_2: ; %if
1265; GFX11-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
1266; GFX11-NEXT:  .LBB9_3: ; %endif
1267; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1268; GFX11-NEXT:    v_mov_b32_e32 v0, s4
1269; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
1270; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1271; GFX11-NEXT:    s_endpgm
1272; GFX11-NEXT:  .LBB9_4:
1273; GFX11-NEXT:    ; implicit-def: $sgpr4_sgpr5
1274; GFX11-NEXT:    s_branch .LBB9_2
1275;
1276; GFX12-LABEL: add64_in_branch:
1277; GFX12:       ; %bb.0: ; %entry
1278; GFX12-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
1279; GFX12-NEXT:    s_wait_kmcnt 0x0
1280; GFX12-NEXT:    s_cmp_lg_u64 s[4:5], 0
1281; GFX12-NEXT:    s_cbranch_scc0 .LBB9_4
1282; GFX12-NEXT:  ; %bb.1: ; %else
1283; GFX12-NEXT:    s_add_nc_u64 s[4:5], s[4:5], s[6:7]
1284; GFX12-NEXT:    s_cbranch_execnz .LBB9_3
1285; GFX12-NEXT:  .LBB9_2: ; %if
1286; GFX12-NEXT:    s_load_b64 s[4:5], s[2:3], 0x0
1287; GFX12-NEXT:  .LBB9_3: ; %endif
1288; GFX12-NEXT:    s_wait_kmcnt 0x0
1289; GFX12-NEXT:    v_mov_b32_e32 v0, s4
1290; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
1291; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
1292; GFX12-NEXT:    s_endpgm
1293; GFX12-NEXT:  .LBB9_4:
1294; GFX12-NEXT:    ; implicit-def: $sgpr4_sgpr5
1295; GFX12-NEXT:    s_branch .LBB9_2
1296entry:
1297  %0 = icmp eq i64 %a, 0
1298  br i1 %0, label %if, label %else
1299
1300if:
1301  %1 = load i64, ptr addrspace(1) %in
1302  br label %endif
1303
1304else:
1305  %2 = add i64 %a, %b
1306  br label %endif
1307
1308endif:
1309  %3 = phi i64 [%1, %if], [%2, %else]
1310  store i64 %3, ptr addrspace(1) %out
1311  ret void
1312}
1313
1314; Make sure the VOP3 form of add is initially selected. Otherwise a pair
1315; of copies from/to VCC would be necessary.
; Adds an SGPR argument (%s, inreg) to a VGPR argument (%v) while vcc is live:
; one inline asm defines vcc before the add and a second one uses it after the
; store, so the add sits between the def and the use of vcc.
; In the checks below, GFX6 and GFX8 use the _e64 form of the add and name
; s[0:1] rather than vcc as its SGPR-pair operand; GFX9 and later use
; v_add_u32 / v_add_nc_u32, which take no such operand.
1316define amdgpu_ps void @add_select_vop3(i32 inreg %s, i32 %v) {
1317; GFX6-LABEL: add_select_vop3:
1318; GFX6:       ; %bb.0:
1319; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], s0, v0
1320; GFX6-NEXT:    s_mov_b32 m0, -1
1321; GFX6-NEXT:    ;;#ASMSTART
1322; GFX6-NEXT:    ; def vcc
1323; GFX6-NEXT:    ;;#ASMEND
1324; GFX6-NEXT:    ds_write_b32 v0, v0
1325; GFX6-NEXT:    ;;#ASMSTART
1326; GFX6-NEXT:    ; use vcc
1327; GFX6-NEXT:    ;;#ASMEND
1328; GFX6-NEXT:    s_endpgm
1329;
1330; GFX8-LABEL: add_select_vop3:
1331; GFX8:       ; %bb.0:
1332; GFX8-NEXT:    v_add_u32_e64 v0, s[0:1], s0, v0
1333; GFX8-NEXT:    s_mov_b32 m0, -1
1334; GFX8-NEXT:    ;;#ASMSTART
1335; GFX8-NEXT:    ; def vcc
1336; GFX8-NEXT:    ;;#ASMEND
1337; GFX8-NEXT:    ds_write_b32 v0, v0
1338; GFX8-NEXT:    ;;#ASMSTART
1339; GFX8-NEXT:    ; use vcc
1340; GFX8-NEXT:    ;;#ASMEND
1341; GFX8-NEXT:    s_endpgm
1342;
1343; GFX9-LABEL: add_select_vop3:
1344; GFX9:       ; %bb.0:
1345; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
1346; GFX9-NEXT:    ;;#ASMSTART
1347; GFX9-NEXT:    ; def vcc
1348; GFX9-NEXT:    ;;#ASMEND
1349; GFX9-NEXT:    ds_write_b32 v0, v0
1350; GFX9-NEXT:    ;;#ASMSTART
1351; GFX9-NEXT:    ; use vcc
1352; GFX9-NEXT:    ;;#ASMEND
1353; GFX9-NEXT:    s_endpgm
1354;
1355; GFX10-LABEL: add_select_vop3:
1356; GFX10:       ; %bb.0:
1357; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1358; GFX10-NEXT:    ;;#ASMSTART
1359; GFX10-NEXT:    ; def vcc
1360; GFX10-NEXT:    ;;#ASMEND
1361; GFX10-NEXT:    ds_write_b32 v0, v0
1362; GFX10-NEXT:    ;;#ASMSTART
1363; GFX10-NEXT:    ; use vcc
1364; GFX10-NEXT:    ;;#ASMEND
1365; GFX10-NEXT:    s_endpgm
1366;
1367; GFX11-LABEL: add_select_vop3:
1368; GFX11:       ; %bb.0:
1369; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1370; GFX11-NEXT:    ;;#ASMSTART
1371; GFX11-NEXT:    ; def vcc
1372; GFX11-NEXT:    ;;#ASMEND
1373; GFX11-NEXT:    ds_store_b32 v0, v0
1374; GFX11-NEXT:    ;;#ASMSTART
1375; GFX11-NEXT:    ; use vcc
1376; GFX11-NEXT:    ;;#ASMEND
1377; GFX11-NEXT:    s_endpgm
1378;
1379; GFX12-LABEL: add_select_vop3:
1380; GFX12:       ; %bb.0:
1381; GFX12-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1382; GFX12-NEXT:    ;;#ASMSTART
1383; GFX12-NEXT:    ; def vcc
1384; GFX12-NEXT:    ;;#ASMEND
1385; GFX12-NEXT:    ds_store_b32 v0, v0
1386; GFX12-NEXT:    ;;#ASMSTART
1387; GFX12-NEXT:    ; use vcc
1388; GFX12-NEXT:    ;;#ASMEND
1389; GFX12-NEXT:    s_endpgm
; The first asm produces an i64 constrained to vcc; the last asm consumes it.
1390  %vcc = call i64 asm sideeffect "; def vcc", "={vcc}"()
; NOTE(review): despite its name, %sub holds the result of an add.
1391  %sub = add i32 %v, %s
; The store address is undef; presumably the store only exists to keep the add
; result alive - confirm.
1392  store i32 %sub, ptr addrspace(3) undef
1393  call void asm sideeffect "; use vcc", "{vcc}"(i64 %vcc)
1394  ret void
1395}
1396
; Declaration of the work-item id (x) intrinsic called by @v_add_i32 and
; @v_add_imm_i32 to index into %in.
1397declare i32 @llvm.amdgcn.workitem.id.x() #1
1398
; Attribute groups: #0 is attached to kernel definitions in this file (for
; example @v_add_i32); #1 is attached to the intrinsic declaration above.
1399attributes #0 = { nounwind }
1400attributes #1 = { nounwind readnone speculatable }
1401