xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
5; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
6
; Intrinsic called at the top of every kernel in this file; its i32 result
; is the variable part of each LDS address computation under test.
7declare i32 @llvm.amdgcn.workitem.id.x() #0
8
; 256 x i32 array in addrspace(3) with an undef initializer. It is the base
; object for the getelementptr chains in the write_ds_sub* kernels.
9@lds.obj = addrspace(3) global [256 x i32] undef, align 4
10
; Stores i32 123 to @lds.obj at element index (0 - workitem.id.x) + 3.
; The checks expect the constant +3 elements (12 bytes) to end up in the
; offset field of the DS store, while the id is shifted left by 2 and
; subtracted from 0 to form the address register.
11define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
12; CI-LABEL: write_ds_sub0_offset0_global:
13; CI:       ; %bb.0: ; %entry
14; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
15; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
16; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
17; CI-NEXT:    s_mov_b32 m0, -1
18; CI-NEXT:    ds_write_b32 v0, v1 offset:12
19; CI-NEXT:    s_endpgm
20;
21; GFX9-LABEL: write_ds_sub0_offset0_global:
22; GFX9:       ; %bb.0: ; %entry
23; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
24; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
25; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
26; GFX9-NEXT:    ds_write_b32 v0, v1 offset:12
27; GFX9-NEXT:    s_endpgm
28;
29; GFX10-LABEL: write_ds_sub0_offset0_global:
30; GFX10:       ; %bb.0: ; %entry
31; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
32; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7b
33; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
34; GFX10-NEXT:    ds_write_b32 v0, v1 offset:12
35; GFX10-NEXT:    s_endpgm
36;
37; GFX11-LABEL: write_ds_sub0_offset0_global:
38; GFX11:       ; %bb.0: ; %entry
39; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
40; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
41; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
42; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
43; GFX11-NEXT:    ds_store_b32 v0, v1 offset:12
44; GFX11-NEXT:    s_endpgm
45entry:
46  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
47  %sub1 = sub i32 0, %x.i
48  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
49  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
50  store i32 123, ptr addrspace(3) %arrayidx
51  ret void
52}
53
; Same LDS store as write_ds_sub0_offset0_global (element index
; (0 - workitem.id.x) + 3, value 123), followed by a div.fmas call on
; %dummy.val whose result is stored volatile to a null addrspace(1) pointer.
; The checks still expect offset:12 on the DS store; they also show vcc
; being set to 0 ahead of v_div_fmas_f32.
; NOTE(review): the "clamp_bit" name suggests the extra vcc user is there to
; exercise a particular encoding of the subtract - confirm against the commit
; that introduced the test.
54define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
55; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
56; CI:       ; %bb.0: ; %entry
57; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
58; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
59; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
60; CI-NEXT:    s_mov_b64 vcc, 0
61; CI-NEXT:    s_waitcnt lgkmcnt(0)
62; CI-NEXT:    v_mov_b32_e32 v1, s0
63; CI-NEXT:    s_mov_b32 s0, 0
64; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
65; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
66; CI-NEXT:    s_mov_b32 m0, -1
67; CI-NEXT:    s_mov_b32 s3, 0xf000
68; CI-NEXT:    s_mov_b32 s2, -1
69; CI-NEXT:    s_mov_b32 s1, s0
70; CI-NEXT:    ds_write_b32 v0, v2 offset:12
71; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
72; CI-NEXT:    s_waitcnt vmcnt(0)
73; CI-NEXT:    s_endpgm
74;
75; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
76; GFX9:       ; %bb.0: ; %entry
77; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
78; GFX9-NEXT:    s_mov_b64 vcc, 0
79; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
80; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v0
81; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
82; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX9-NEXT:    v_mov_b32_e32 v1, s0
84; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
85; GFX9-NEXT:    v_mov_b32_e32 v0, 0
86; GFX9-NEXT:    v_mov_b32_e32 v1, 0
87; GFX9-NEXT:    ds_write_b32 v3, v4 offset:12
88; GFX9-NEXT:    global_store_dword v[0:1], v2, off
89; GFX9-NEXT:    s_waitcnt vmcnt(0)
90; GFX9-NEXT:    s_endpgm
91;
92; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
93; GFX10:       ; %bb.0: ; %entry
94; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
95; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
96; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
97; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
98; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
99; GFX10-NEXT:    v_mov_b32_e32 v0, 0
100; GFX10-NEXT:    v_mov_b32_e32 v1, 0
101; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
102; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
103; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
104; GFX10-NEXT:    global_store_dword v[0:1], v4, off
105; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
106; GFX10-NEXT:    s_endpgm
107;
108; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
109; GFX11:       ; %bb.0: ; %entry
110; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
111; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
112; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
113; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
114; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
115; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
116; GFX11-NEXT:    v_mov_b32_e32 v0, 0
117; GFX11-NEXT:    v_mov_b32_e32 v1, 0
118; GFX11-NEXT:    ds_store_b32 v2, v3 offset:12
119; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
121; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
122; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
123; GFX11-NEXT:    s_endpgm
124entry:
125  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
126  %sub1 = sub i32 0, %x.i
127  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
128  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
129  store i32 123, ptr addrspace(3) %arrayidx
130  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
131  store volatile float %fmas, ptr addrspace(1) null
132  ret void
133}
134
; Stores i32 123 to @lds.obj at element index (-1 - workitem.id.x) + 16383,
; then performs the same div.fmas + volatile global store as the other
; clamp_bit kernels.
; On every target the checks expect the DS store to go through a register
; that was loaded with the constant 0 and to carry no offset field, i.e. no
; shift or subtract of the work-item id survives in the output.
; NOTE(review): why the address computation folds away entirely is not
; visible from this file - confirm before relying on it.
135define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
136; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
137; CI:       ; %bb.0:
138; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
139; CI-NEXT:    s_mov_b64 vcc, 0
140; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
141; CI-NEXT:    v_mov_b32_e32 v2, 0
142; CI-NEXT:    s_mov_b32 m0, -1
143; CI-NEXT:    s_waitcnt lgkmcnt(0)
144; CI-NEXT:    v_mov_b32_e32 v0, s0
145; CI-NEXT:    v_div_fmas_f32 v0, v0, v0, v0
146; CI-NEXT:    s_mov_b32 s0, 0
147; CI-NEXT:    s_mov_b32 s3, 0xf000
148; CI-NEXT:    s_mov_b32 s2, -1
149; CI-NEXT:    s_mov_b32 s1, s0
150; CI-NEXT:    ds_write_b32 v2, v1
151; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
152; CI-NEXT:    s_waitcnt vmcnt(0)
153; CI-NEXT:    s_endpgm
154;
155; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
156; GFX9:       ; %bb.0:
157; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
158; GFX9-NEXT:    s_mov_b64 vcc, 0
159; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7b
160; GFX9-NEXT:    v_mov_b32_e32 v4, 0
161; GFX9-NEXT:    ds_write_b32 v4, v3
162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX9-NEXT:    v_mov_b32_e32 v0, s0
164; GFX9-NEXT:    v_div_fmas_f32 v2, v0, v0, v0
165; GFX9-NEXT:    v_mov_b32_e32 v0, 0
166; GFX9-NEXT:    v_mov_b32_e32 v1, 0
167; GFX9-NEXT:    global_store_dword v[0:1], v2, off
168; GFX9-NEXT:    s_waitcnt vmcnt(0)
169; GFX9-NEXT:    s_endpgm
170;
171; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
172; GFX10:       ; %bb.0:
173; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
174; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
175; GFX10-NEXT:    v_mov_b32_e32 v0, 0
176; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
177; GFX10-NEXT:    v_mov_b32_e32 v3, 0
178; GFX10-NEXT:    v_mov_b32_e32 v1, 0
179; GFX10-NEXT:    ds_write_b32 v3, v2
180; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
181; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
182; GFX10-NEXT:    global_store_dword v[0:1], v4, off
183; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
184; GFX10-NEXT:    s_endpgm
185;
186; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
187; GFX11:       ; %bb.0:
188; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
189; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
190; GFX11-NEXT:    v_mov_b32_e32 v0, 0
191; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7b :: v_dual_mov_b32 v3, 0
192; GFX11-NEXT:    v_mov_b32_e32 v1, 0
193; GFX11-NEXT:    ds_store_b32 v3, v2
194; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
195; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
196; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
197; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
198; GFX11-NEXT:    s_endpgm
199  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
200  %sub1 = sub i32 -1, %x.i
201  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
202  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
203  store i32 123, ptr addrspace(3) %arrayidx
204  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
205  store volatile float %fmas, ptr addrspace(1) null
206  ret void
207}
208
; Stores i8 13 to the LDS address (workitem.id.x << 4) + 65535, built as an
; integer and converted with zext + inttoptr.
; The checks expect only the shift by 4 to remain as a VALU instruction and
; the constant 65535 to appear as the offset field of the byte store.
209define amdgpu_kernel void @add_x_shl_max_offset() #1 {
210; CI-LABEL: add_x_shl_max_offset:
211; CI:       ; %bb.0:
212; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
213; CI-NEXT:    v_mov_b32_e32 v1, 13
214; CI-NEXT:    s_mov_b32 m0, -1
215; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
216; CI-NEXT:    s_endpgm
217;
218; GFX9-LABEL: add_x_shl_max_offset:
219; GFX9:       ; %bb.0:
220; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
221; GFX9-NEXT:    v_mov_b32_e32 v1, 13
222; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
223; GFX9-NEXT:    s_endpgm
224;
225; GFX10-LABEL: add_x_shl_max_offset:
226; GFX10:       ; %bb.0:
227; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
228; GFX10-NEXT:    v_mov_b32_e32 v1, 13
229; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
230; GFX10-NEXT:    s_endpgm
231;
232; GFX11-LABEL: add_x_shl_max_offset:
233; GFX11:       ; %bb.0:
234; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
235; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
236; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
237; GFX11-NEXT:    ds_store_b8 v0, v1 offset:65535
238; GFX11-NEXT:    s_endpgm
239  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
240  %shl = shl i32 %x.i, 4
241  %add = add i32 %shl, 65535
242  %z = zext i32 %add to i64
243  %ptr = inttoptr i64 %z to ptr addrspace(3)
244  store i8 13, ptr addrspace(3) %ptr, align 1
245  ret void
246}
247
248; This could use the DS offset transform, but the sub was turned into an xor, so the 65535 is not folded into the store's offset field.
249
; Stores i8 13 to the LDS address (workitem.id.x * -4) + 65535, with the
; negation and scaling expressed as a single mul by -4.
; The checks expect a shift left by 2 followed by v_xor_b32 with 0xffff, and
; a byte store with no offset field.
250define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
251; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
252; CI:       ; %bb.0:
253; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
254; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
255; CI-NEXT:    v_mov_b32_e32 v1, 13
256; CI-NEXT:    s_mov_b32 m0, -1
257; CI-NEXT:    ds_write_b8 v0, v1
258; CI-NEXT:    s_endpgm
259;
260; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
261; GFX9:       ; %bb.0:
262; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
263; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
264; GFX9-NEXT:    v_mov_b32_e32 v1, 13
265; GFX9-NEXT:    ds_write_b8 v0, v1
266; GFX9-NEXT:    s_endpgm
267;
268; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
269; GFX10:       ; %bb.0:
270; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
271; GFX10-NEXT:    v_mov_b32_e32 v1, 13
272; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
273; GFX10-NEXT:    ds_write_b8 v0, v1
274; GFX10-NEXT:    s_endpgm
275;
276; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
277; GFX11:       ; %bb.0:
278; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
279; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
280; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
281; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
282; GFX11-NEXT:    ds_store_b8 v0, v1
283; GFX11-NEXT:    s_endpgm
284  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
285  %.neg = mul i32 %x.i, -4
286  %add = add i32 %.neg, 65535
287  %z = zext i32 %add to i64
288  %ptr = inttoptr i64 %z to ptr addrspace(3)
289  store i8 13, ptr addrspace(3) %ptr, align 1
290  ret void
291}
292
293; This could use the DS offset transform, but the sub was turned into an xor, so the 65535 is not folded into the store's offset field.
294
; Stores i8 13 to the LDS address 65535 + ((0 - workitem.id.x) << 2). Unlike
; the _alt variant, the negate and shift are separate instructions and the
; constant is the left-hand operand of the add.
; The expected output is the same as for the _alt variant: shift left by 2,
; v_xor_b32 with 0xffff, and a byte store with no offset field.
295define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
296; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
297; CI:       ; %bb.0:
298; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
299; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
300; CI-NEXT:    v_mov_b32_e32 v1, 13
301; CI-NEXT:    s_mov_b32 m0, -1
302; CI-NEXT:    ds_write_b8 v0, v1
303; CI-NEXT:    s_endpgm
304;
305; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
306; GFX9:       ; %bb.0:
307; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
308; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
309; GFX9-NEXT:    v_mov_b32_e32 v1, 13
310; GFX9-NEXT:    ds_write_b8 v0, v1
311; GFX9-NEXT:    s_endpgm
312;
313; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
314; GFX10:       ; %bb.0:
315; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
316; GFX10-NEXT:    v_mov_b32_e32 v1, 13
317; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
318; GFX10-NEXT:    ds_write_b8 v0, v1
319; GFX10-NEXT:    s_endpgm
320;
321; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
322; GFX11:       ; %bb.0:
323; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
324; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
325; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
326; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
327; GFX11-NEXT:    ds_store_b8 v0, v1
328; GFX11-NEXT:    s_endpgm
329  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
330  %neg = sub i32 0, %x.i
331  %shl = shl i32 %neg, 2
332  %add = add i32 65535, %shl
333  %ptr = inttoptr i32 %add to ptr addrspace(3)
334  store i8 13, ptr addrspace(3) %ptr
335  ret void
336}
337
; Stores i8 13 to the LDS address 65536 + ((0 - workitem.id.x) << 2), i.e.
; one more than the 65535 constant used by the neighbouring max_offset tests.
; The checks expect the constant to stay in the VALU subtract (0x10000 minus
; the shifted id) and the byte store to carry no offset field.
; NOTE(review): presumably 65536 is chosen because it no longer fits the DS
; offset field - confirm against the ISA documentation.
338define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
339; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
340; CI:       ; %bb.0:
341; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
342; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x10000, v0
343; CI-NEXT:    v_mov_b32_e32 v1, 13
344; CI-NEXT:    s_mov_b32 m0, -1
345; CI-NEXT:    ds_write_b8 v0, v1
346; CI-NEXT:    s_endpgm
347;
348; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
349; GFX9:       ; %bb.0:
350; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
351; GFX9-NEXT:    v_sub_u32_e32 v0, 0x10000, v0
352; GFX9-NEXT:    v_mov_b32_e32 v1, 13
353; GFX9-NEXT:    ds_write_b8 v0, v1
354; GFX9-NEXT:    s_endpgm
355;
356; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
357; GFX10:       ; %bb.0:
358; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
359; GFX10-NEXT:    v_mov_b32_e32 v1, 13
360; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
361; GFX10-NEXT:    ds_write_b8 v0, v1
362; GFX10-NEXT:    s_endpgm
363;
364; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
365; GFX11:       ; %bb.0:
366; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
367; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
368; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
369; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
370; GFX11-NEXT:    ds_store_b8 v0, v1
371; GFX11-NEXT:    s_endpgm
372  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
373  %neg = sub i32 0, %x.i
374  %shl = shl i32 %neg, 2
375  %add = add i32 65536, %shl
376  %ptr = inttoptr i32 %add to ptr addrspace(3)
377  store i8 13, ptr addrspace(3) %ptr
378  ret void
379}
380
; The negated, shifted work-item id ((0 - id) << 2) feeds two different adds
; (+123 and +456); each result is used as the address of a volatile i32
; store of 13.
; The checks expect one shared subtract from 0 and two DS stores that differ
; only in their offset fields (123 and 456).
381define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
382; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
383; CI:       ; %bb.0:
384; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
385; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
386; CI-NEXT:    v_mov_b32_e32 v1, 13
387; CI-NEXT:    s_mov_b32 m0, -1
388; CI-NEXT:    ds_write_b32 v0, v1 offset:123
389; CI-NEXT:    ds_write_b32 v0, v1 offset:456
390; CI-NEXT:    s_endpgm
391;
392; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
393; GFX9:       ; %bb.0:
394; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
395; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
396; GFX9-NEXT:    v_mov_b32_e32 v1, 13
397; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
398; GFX9-NEXT:    ds_write_b32 v0, v1 offset:456
399; GFX9-NEXT:    s_endpgm
400;
401; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
402; GFX10:       ; %bb.0:
403; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
404; GFX10-NEXT:    v_mov_b32_e32 v1, 13
405; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
406; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
407; GFX10-NEXT:    ds_write_b32 v0, v1 offset:456
408; GFX10-NEXT:    s_endpgm
409;
410; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
411; GFX11:       ; %bb.0:
412; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
413; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
414; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
415; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
416; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
417; GFX11-NEXT:    ds_store_b32 v0, v1 offset:456
418; GFX11-NEXT:    s_endpgm
419  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
420  %neg = sub i32 0, %x.i
421  %shl = shl i32 %neg, 2
422  %add0 = add i32 123, %shl
423  %add1 = add i32 456, %shl
424  %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
425  store volatile i32 13, ptr addrspace(3) %ptr0
426  %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
427  store volatile i32 13, ptr addrspace(3) %ptr1
428  ret void
429}
430
; A single address, 123 + ((0 - workitem.id.x) << 2), is used by two
; volatile i32 stores of 13.
; The checks expect one subtract from 0 and two identical DS stores, both
; with offset 123.
431define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
432; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
433; CI:       ; %bb.0:
434; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
435; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
436; CI-NEXT:    v_mov_b32_e32 v1, 13
437; CI-NEXT:    s_mov_b32 m0, -1
438; CI-NEXT:    ds_write_b32 v0, v1 offset:123
439; CI-NEXT:    ds_write_b32 v0, v1 offset:123
440; CI-NEXT:    s_endpgm
441;
442; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
443; GFX9:       ; %bb.0:
444; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
445; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
446; GFX9-NEXT:    v_mov_b32_e32 v1, 13
447; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
448; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
449; GFX9-NEXT:    s_endpgm
450;
451; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
452; GFX10:       ; %bb.0:
453; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
454; GFX10-NEXT:    v_mov_b32_e32 v1, 13
455; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
456; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
457; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
458; GFX10-NEXT:    s_endpgm
459;
460; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
461; GFX11:       ; %bb.0:
462; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_and_b32 v0, 0x3ff, v0
463; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
464; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
465; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
466; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
467; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
468; GFX11-NEXT:    s_endpgm
469  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
470  %neg = sub i32 0, %x.i
471  %shl = shl i32 %neg, 2
472  %add = add i32 123, %shl
473  %ptr = inttoptr i32 %add to ptr addrspace(3)
474  store volatile i32 13, ptr addrspace(3) %ptr
475  store volatile i32 13, ptr addrspace(3) %ptr
476  ret void
477}
478
; Stores i64 123 with only 4-byte alignment to the LDS address
; 1019 + ((0 - workitem.id.x) << 2).
; Expected output differs by target. The bonaire, gfx900 and gfx1100 runs
; keep the constant in the subtract (0x3fb minus the shifted id) and emit one
; two-dword store with offset1:1. The gfx1010 run subtracts from 0 and emits
; two separate dword stores at offsets 1019 (low half, 0x7b) and 1023 (high
; half, 0).
479define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
480; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
481; CI:       ; %bb.0:
482; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
483; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
484; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
485; CI-NEXT:    v_mov_b32_e32 v2, 0
486; CI-NEXT:    s_mov_b32 m0, -1
487; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
488; CI-NEXT:    s_endpgm
489;
490; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
491; GFX9:       ; %bb.0:
492; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
493; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
494; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
495; GFX9-NEXT:    v_mov_b32_e32 v2, 0
496; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
497; GFX9-NEXT:    s_endpgm
498;
499; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
500; GFX10:       ; %bb.0:
501; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
502; GFX10-NEXT:    v_mov_b32_e32 v1, 0
503; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
504; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
505; GFX10-NEXT:    ds_write_b32 v0, v1 offset:1023
506; GFX10-NEXT:    ds_write_b32 v0, v2 offset:1019
507; GFX10-NEXT:    s_endpgm
508;
509; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
510; GFX11:       ; %bb.0:
511; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
512; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
513; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
514; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
515; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fb, v0
516; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
517; GFX11-NEXT:    s_endpgm
518  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
519  %neg = sub i32 0, %x.i
520  %shl = shl i32 %neg, 2
521  %add = add i32 1019, %shl
522  %ptr = inttoptr i32 %add to ptr addrspace(3)
523  store i64 123, ptr addrspace(3) %ptr, align 4
524  ret void
525}
526
; Same 4-byte-aligned i64 store of 123 at 1019 + ((0 - workitem.id.x) << 2)
; as add_x_shl_neg_to_sub_misaligned_i64_max_offset, followed by a div.fmas
; call on %dummy.val whose result is stored volatile to a null addrspace(1)
; pointer.
; The DS part of the expected output matches the variant without the
; div.fmas: a two-dword store based on 0x3fb minus the shifted id on bonaire,
; gfx900 and gfx1100, and two dword stores at offsets 1019/1023 on gfx1010.
; The checks also show vcc being set to 0 ahead of v_div_fmas_f32.
527define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
528; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
529; CI:       ; %bb.0:
530; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
531; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
532; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
533; CI-NEXT:    s_mov_b64 vcc, 0
534; CI-NEXT:    s_waitcnt lgkmcnt(0)
535; CI-NEXT:    v_mov_b32_e32 v1, s0
536; CI-NEXT:    s_mov_b32 s0, 0
537; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
538; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
539; CI-NEXT:    v_mov_b32_e32 v3, 0
540; CI-NEXT:    s_mov_b32 m0, -1
541; CI-NEXT:    s_mov_b32 s3, 0xf000
542; CI-NEXT:    s_mov_b32 s2, -1
543; CI-NEXT:    s_mov_b32 s1, s0
544; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
545; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
546; CI-NEXT:    s_waitcnt vmcnt(0)
547; CI-NEXT:    s_endpgm
548;
549; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
550; GFX9:       ; %bb.0:
551; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
552; GFX9-NEXT:    s_mov_b64 vcc, 0
553; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
554; GFX9-NEXT:    v_sub_u32_e32 v3, 0x3fb, v0
555; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
556; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX9-NEXT:    v_mov_b32_e32 v1, s0
558; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
559; GFX9-NEXT:    v_mov_b32_e32 v0, 0
560; GFX9-NEXT:    v_mov_b32_e32 v5, 0
561; GFX9-NEXT:    v_mov_b32_e32 v1, 0
562; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
563; GFX9-NEXT:    global_store_dword v[0:1], v2, off
564; GFX9-NEXT:    s_waitcnt vmcnt(0)
565; GFX9-NEXT:    s_endpgm
566;
567; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
568; GFX10:       ; %bb.0:
569; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x0
570; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
571; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
572; GFX10-NEXT:    v_mov_b32_e32 v3, 0
573; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7b
574; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
575; GFX10-NEXT:    v_mov_b32_e32 v0, 0
576; GFX10-NEXT:    v_mov_b32_e32 v1, 0
577; GFX10-NEXT:    ds_write_b32 v2, v3 offset:1023
578; GFX10-NEXT:    ds_write_b32 v2, v4 offset:1019
579; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
581; GFX10-NEXT:    global_store_dword v[0:1], v5, off
582; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
583; GFX10-NEXT:    s_endpgm
584;
585; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
586; GFX11:       ; %bb.0:
587; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x0
588; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
589; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
590; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, 0x7b
591; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
592; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
593; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
594; GFX11-NEXT:    v_mov_b32_e32 v0, 0
595; GFX11-NEXT:    v_mov_b32_e32 v1, 0
596; GFX11-NEXT:    ds_store_2addr_b32 v2, v3, v4 offset1:1
597; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
598; GFX11-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
599; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
600; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
601; GFX11-NEXT:    s_endpgm
602  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
603  %neg = sub i32 0, %x.i
604  %shl = shl i32 %neg, 2
605  %add = add i32 1019, %shl
606  %ptr = inttoptr i32 %add to ptr addrspace(3)
607  store i64 123, ptr addrspace(3) %ptr, align 4
608  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
609  store volatile float %fmas, ptr addrspace(1) null
610  ret void
611}
612
; Stores i64 123 with 4-byte alignment to the LDS address
; 1020 + ((0 - workitem.id.x) << 2), one more than the 1019 used by the
; non-_p1 variant.
; The bonaire, gfx900 and gfx1100 runs expect the constant in the subtract
; (0x3fc minus the shifted id) and a two-dword store with offset1:1. The
; gfx1010 run expects a subtract from 0, an add of 0x200, and a two-dword
; store with offset0:127 offset1:128 (0x200 + 127*4 = 1020).
613define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
614; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
615; CI:       ; %bb.0:
616; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
617; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fc, v0
618; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
619; CI-NEXT:    v_mov_b32_e32 v2, 0
620; CI-NEXT:    s_mov_b32 m0, -1
621; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
622; CI-NEXT:    s_endpgm
623;
624; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
625; GFX9:       ; %bb.0:
626; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
627; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fc, v0
628; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
629; GFX9-NEXT:    v_mov_b32_e32 v2, 0
630; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
631; GFX9-NEXT:    s_endpgm
632;
633; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
634; GFX10:       ; %bb.0:
635; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
636; GFX10-NEXT:    v_mov_b32_e32 v1, 0
637; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
638; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
639; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
640; GFX10-NEXT:    ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
641; GFX10-NEXT:    s_endpgm
642;
643; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
644; GFX11:       ; %bb.0:
645; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
646; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x7b
647; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
648; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
649; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fc, v0
650; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
651; GFX11-NEXT:    s_endpgm
652  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
653  %neg = sub i32 0, %x.i
654  %shl = shl i32 %neg, 2
655  %add = add i32 1020, %shl
656  %ptr = inttoptr i32 %add to ptr addrspace(3)
657  store i64 123, ptr addrspace(3) %ptr, align 4
658  ret void
659}
660
; Intrinsic called by the *_clamp_bit kernels, always with %dummy.val for
; all three float operands and i1 false as the last operand.
661declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
662
; Attribute groups referenced by the declarations, definitions and call
; sites in this file. Group #2 is not referenced anywhere in the visible
; file.
663attributes #0 = { nounwind readnone }
664attributes #1 = { nounwind }
665attributes #2 = { nounwind convergent }
666