; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN:  llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX10 %s
; RUN:  llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX11 %s

; Intrinsic called at the top of every kernel in this file; its i32 result is
; used as the element index into the %aptr / %bptr buffers.
declare i32 @llvm.amdgcn.workitem.id.x()

; A 64-bit multiplication where no arguments were zero extended.
; Both operands are loaded as full i64 values, so nothing is known about their
; bits; the expected code uses three v_mad_u64_u32 instructions on both targets.
;
; NOTE(review): %gep.out below is computed from %bptr, not %out, so %out is
; unused and the product is stored to the same address %b was loaded from (the
; expected store goes through s[2:3], the base of the second load). Confirm
; whether this is intentional before changing it; the autogenerated assertions
; would have to be regenerated.
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT:    v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v7, s[0:1]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v7, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    global_store_dwordx2 v7, v[4:5], s[2:3]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v9, s[0:1]
; GFX11-NEXT:    global_load_b64 v[2:3], v9, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v5, v7
; GFX11-NEXT:    global_store_b64 v9, v[4:5], s[2:3]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  ; Indexes %bptr (same address as %gep.b), not %out -- see the review note above.
  %gep.out = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %mul = mul i64 %a, %b
  store i64 %mul, ptr addrspace(1) %gep.out
  ret void
}

; A 64-bit multiplication where the second argument was zero extended.
; %b is loaded as i32 and widened with zext, so the upper 32 bits of the second
; operand are zero. The expected code uses two v_mad_u64_u32 instructions
; (a.lo * b and a.hi * b) on both targets.
define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_zext_src1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    global_load_dword v4, v3, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s2, v0, v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v1, v4, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v1, s[2:3]
; GFX11-NEXT:    global_load_b32 v5, v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v0, v5, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  ; %bptr is indexed as an i32 array here (4-byte stride), unlike %aptr.
  %gep.b = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i32, ptr addrspace(1) %gep.b
  %b_ext = zext i32 %b to i64
  %mul = mul i64 %a, %b_ext
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; A 64-bit multiplication where the first argument was zero extended.
; %a is loaded as i32 and widened with zext, so the upper 32 bits of the first
; operand are zero. The expected code uses two v_mad_u64_u32 instructions
; (a * b.lo and a * b.hi) on both targets.
define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_zext_src0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v4, v2, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s2, v4, v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v5, v1, s[2:3]
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %aptr is indexed as an i32 array here (4-byte stride), unlike %bptr.
  %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  %a_ext = zext i32 %a to i64
  %mul = mul i64 %a_ext, %b
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where both arguments were zero extended.
; Both operands are i32 loads widened with zext, so the upper 32 bits of each
; are zero. The expected code is a single v_mad_u64_u32 on both targets.
define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_zext_src0_src1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v2, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v1, v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_zext_src0_src1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    global_load_b32 v0, v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, 0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Both buffers are indexed as i32 arrays (4-byte stride).
  %gep.a = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
  %a = load i32, ptr addrspace(1) %gep.a
  %b = load i32, ptr addrspace(1) %gep.b
  %a_ext = zext i32 %a to i64
  %b_ext = zext i32 %b to i64
  %mul = mul i64 %a_ext, %b_ext
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the upper bytes of the first argument were masked.
; The `and` with 0x00000000FFFFFFFF clears the upper 32 bits of %a. In the
; expected code only one dword of %a is loaded, and two v_mad_u64_u32
; instructions are used on both targets.
define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0_hi:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v4, v2, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s2, v4, v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v3
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v4, v1, v[0:1]
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0_hi:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b32 v5, v0, s[2:3]
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mov_b32_e32 v0, v3
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; Keep only the low 32 bits of %a.
  %a_and = and i64 %a, u0x00000000FFFFFFFF
  %mul = mul i64 %a_and, %b
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where lower bytes of first argument were masked.
; The `and` with 0xFFFFFFFF00000000 clears the lower 32 bits of %a. The expected
; code sets the low result dword to 0 and computes the high dword with a single
; v_mul_lo_u32 on both targets.
define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0_lo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX10-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0_lo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v1, v2
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; Keep only the high 32 bits of %a.
  %a_and = and i64 %a, u0xFFFFFFFF00000000
  %mul = mul i64 %a_and, %b
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the lower bytes of the second argument were masked.
; The `and` with 0xFFFFFFFF00000000 clears the lower 32 bits of %b. The expected
; code sets the low result dword to 0 and computes the high dword with a single
; v_mul_lo_u32 on both targets.
define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src1_lo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT:    ; kill: killed $vgpr3
; GFX10-NEXT:    ; kill: killed $sgpr2_sgpr3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[1:2], v3, s[6:7]
; GFX10-NEXT:    ; kill: killed $sgpr6_sgpr7
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src1_lo:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    global_load_b64 v[1:2], v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v0, v2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; Keep only the high 32 bits of %b.
  %b_and = and i64 %b, u0xFFFFFFFF00000000
  %mul = mul i64 %a, %b_and
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the entire first argument is masked.
; The `and` with 0 makes the first operand zero, so the product is zero. The
; expected code contains no buffer loads and no multiply: it only loads the
; %out pointer and stores a zero i64 on both targets.
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; All-zero mask: %a_and is always 0.
  %a_and = and i64 %a, u0x0000000000000000
  %mul = mul i64 %a_and, %b
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication where the parts of the high and low bytes of the first argument are masked.
; The mask 0x0000F00FFFF00000 leaves bits set in both halves of %a; in the
; expected code it appears as 0xfff00000 applied to the low dword and 0xf00f
; applied to the high dword. Neither half is entirely zero, and the expected
; code uses three v_mad_u64_u32 instructions on both targets.
define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_partially_masked_src0:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_and_b32_e32 v6, 0xfff00000, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[4:5], s2, v6, v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, v5
; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s2, v6, v3, v[0:1]
; GFX10-NEXT:    v_and_b32_e32 v0, 0xf00f, v1
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, v2, v[5:6]
; GFX10-NEXT:    v_mov_b32_e32 v5, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    global_store_dwordx2 v0, v[4:5], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    global_load_b64 v[2:3], v2, s[4:5]
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_and_b32_e32 v7, 0xfff00000, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, v7, v2, 0
; GFX11-NEXT:    v_mov_b32_e32 v0, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
; GFX11-NEXT:    v_and_b32_e32 v3, 0xf00f, v1
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
; GFX11-NEXT:    global_store_b64 v0, v[4:5], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; High dword mask is 0x0000F00F, low dword mask is 0xFFF00000.
  %a_and = and i64 %a, u0x0000F00FFFF00000
  %mul = mul i64 %a_and, %b
  ; The result is stored to %out directly, without a per-workitem offset.
  store i64 %mul, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication, where the first argument is masked before a branch
; %a_and is computed in the entry block with an all-zero mask, and both arms of
; the branch multiply by it, so either product is zero. The expected code has
; no branch, loads, or multiply: it only stores a zero i64 to %out on both
; targets.
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; All-zero mask applied before the branch: %a_and is always 0.
  %a_and = and i64 %a, u0x0000000000000000
  %0 = icmp eq i64 %b, 0
  br i1 %0, label %if, label %else

if:
  ; Second operand additionally has its low 32 bits cleared on this path.
  %b_and = and i64 %b, u0xFFFFFFFF00000000
  %1 = mul i64 %a_and, %b_and
  br label %endif

else:
  %2 = mul i64 %a_and, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}

; 64-bit multiplication with both arguments changed in different basic blocks.
; %a is masked to its low 32 bits in the entry block; %b is masked to its high
; 32 bits only in the %if block. In the expected code on both targets the %else
; path uses two v_mad_u64_u32 instructions, while the %if path uses a single
; v_mul_lo_u32 and sets the low result dword to 0.
define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_and_in_branch:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v0, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[4:5], v0, s[6:7]
; GFX10-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
; GFX10-NEXT:    s_xor_b32 s2, exec_lo, s2
; GFX10-NEXT:    s_cbranch_execz .LBB10_2
; GFX10-NEXT:  ; %bb.1: ; %else
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s3, v2, v4, 0
; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s3, v2, v5, v[1:2]
; GFX10-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT:    ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT:  .LBB10_2: ; %Flow
; GFX10-NEXT:    s_andn2_saveexec_b32 s2, s2
; GFX10-NEXT:    s_cbranch_execz .LBB10_4
; GFX10-NEXT:  ; %bb.3: ; %if
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_lo_u32 v1, v2, v5
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:  .LBB10_4: ; %endif
; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_and_in_branch:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b64 v[2:3], v0, s[2:3]
; GFX11-NEXT:    global_load_b64 v[4:5], v0, s[4:5]
; GFX11-NEXT:    s_mov_b32 s2, exec_lo
; GFX11-NEXT:    ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_cmpx_ge_u64_e32 0, v[2:3]
; GFX11-NEXT:    s_xor_b32 s2, exec_lo, s2
; GFX11-NEXT:    s_cbranch_execz .LBB10_2
; GFX11-NEXT:  ; %bb.1: ; %else
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v4, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
; GFX11-NEXT:    ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT:    v_mov_b32_e32 v1, v3
; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT:  .LBB10_2: ; %Flow
; GFX11-NEXT:    s_and_not1_saveexec_b32 s2, s2
; GFX11-NEXT:    s_cbranch_execz .LBB10_4
; GFX11-NEXT:  ; %bb.3: ; %if
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_lo_u32 v1, v2, v5
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:  .LBB10_4: ; %endif
; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.a = getelementptr inbounds i64, ptr addrspace(1) %aptr, i32 %tid
  %gep.b = getelementptr inbounds i64, ptr addrspace(1) %bptr, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.a
  %b = load i64, ptr addrspace(1) %gep.b
  ; Keep only the low 32 bits of %a; this masked value is used on both paths.
  %a_and = and i64 %a, u0x00000000FFFFFFFF
  ; The branch condition tests the unmasked %a (unsigned %a > 0).
  %0 = icmp ugt i64 %a, 0
  br i1 %0, label %if, label %else

if:
  ; Keep only the high 32 bits of %b on this path.
  %b_and = and i64 %b, u0xFFFFFFFF00000000
  %1 = mul i64 %a_and, %b_and
  br label %endif

else:
  %2 = mul i64 %a_and, %b
  br label %endif

endif:
  %3 = phi i64 [%1, %if], [%2, %else]
  store i64 %3, ptr addrspace(1) %out
  ret void
}