xref: /llvm-project/llvm/test/CodeGen/AMDGPU/mad_64_32.ll (revision 225fc4f3562002cc77e68340c7077442ca6d4d20)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s
9
10; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok.
11
; Signed multiply-add: both i32 arguments are sign-extended to i64, multiplied,
; and %arg2 is added with the product as the first operand of the add.
; The checks expect a single v_mad_i64_i32 on CI, GFX9 and GFX11
; (v_mad_co_i64_i32 on GFX12). SI instead gets a v_mul_lo_u32 / v_mul_hi_i32
; pair followed by an add / addc carry chain.
; In the GFX1100 checks the two multiplicands are first copied to v4/v5, while
; the GFX1150 checks feed v0/v1 to the mad directly.
12define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
13; CI-LABEL: mad_i64_i32_sextops:
14; CI:       ; %bb.0:
15; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
17; CI-NEXT:    s_setpc_b64 s[30:31]
18;
19; SI-LABEL: mad_i64_i32_sextops:
20; SI:       ; %bb.0:
21; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
23; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
24; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
25; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
26; SI-NEXT:    s_setpc_b64 s[30:31]
27;
28; GFX9-LABEL: mad_i64_i32_sextops:
29; GFX9:       ; %bb.0:
30; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
31; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
32; GFX9-NEXT:    s_setpc_b64 s[30:31]
33;
34; GFX1100-LABEL: mad_i64_i32_sextops:
35; GFX1100:       ; %bb.0:
36; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
38; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
39; GFX1100-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
40; GFX1100-NEXT:    s_setpc_b64 s[30:31]
41;
42; GFX1150-LABEL: mad_i64_i32_sextops:
43; GFX1150:       ; %bb.0:
44; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
45; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
46; GFX1150-NEXT:    s_setpc_b64 s[30:31]
47;
48; GFX12-LABEL: mad_i64_i32_sextops:
49; GFX12:       ; %bb.0:
50; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
51; GFX12-NEXT:    s_wait_expcnt 0x0
52; GFX12-NEXT:    s_wait_samplecnt 0x0
53; GFX12-NEXT:    s_wait_bvhcnt 0x0
54; GFX12-NEXT:    s_wait_kmcnt 0x0
55; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
56; GFX12-NEXT:    s_setpc_b64 s[30:31]
57  %sext0 = sext i32 %arg0 to i64
58  %sext1 = sext i32 %arg1 to i64
59  %mul = mul i64 %sext0, %sext1
60  %mad = add i64 %mul, %arg2
61  ret i64 %mad
62}
63
; Same signed multiply-add as @mad_i64_i32_sextops, but the add is commuted:
; %arg2 is the first operand and the product is the second.
; The checks still expect a single v_mad_i64_i32 (v_mad_co_i64_i32 on GFX12)
; everywhere except SI. On SI the only visible difference from the
; non-commuted test is the source operand order of the add / addc pair.
64define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
65; CI-LABEL: mad_i64_i32_sextops_commute:
66; CI:       ; %bb.0:
67; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
68; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
69; CI-NEXT:    s_setpc_b64 s[30:31]
70;
71; SI-LABEL: mad_i64_i32_sextops_commute:
72; SI:       ; %bb.0:
73; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
74; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
75; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
76; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
77; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
78; SI-NEXT:    s_setpc_b64 s[30:31]
79;
80; GFX9-LABEL: mad_i64_i32_sextops_commute:
81; GFX9:       ; %bb.0:
82; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
83; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
84; GFX9-NEXT:    s_setpc_b64 s[30:31]
85;
86; GFX1100-LABEL: mad_i64_i32_sextops_commute:
87; GFX1100:       ; %bb.0:
88; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
89; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
90; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
91; GFX1100-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
92; GFX1100-NEXT:    s_setpc_b64 s[30:31]
93;
94; GFX1150-LABEL: mad_i64_i32_sextops_commute:
95; GFX1150:       ; %bb.0:
96; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
98; GFX1150-NEXT:    s_setpc_b64 s[30:31]
99;
100; GFX12-LABEL: mad_i64_i32_sextops_commute:
101; GFX12:       ; %bb.0:
102; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
103; GFX12-NEXT:    s_wait_expcnt 0x0
104; GFX12-NEXT:    s_wait_samplecnt 0x0
105; GFX12-NEXT:    s_wait_bvhcnt 0x0
106; GFX12-NEXT:    s_wait_kmcnt 0x0
107; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
108; GFX12-NEXT:    s_setpc_b64 s[30:31]
109  %sext0 = sext i32 %arg0 to i64
110  %sext1 = sext i32 %arg1 to i64
111  %mul = mul i64 %sext0, %sext1
112  %mad = add i64 %arg2, %mul
113  ret i64 %mad
114}
115
; Unsigned multiply-add: both i32 arguments are zero-extended to i64,
; multiplied, and %arg2 is added with the product as the first operand of the
; add.
; The checks expect a single v_mad_u64_u32 on CI, GFX9 and GFX11
; (v_mad_co_u64_u32 on GFX12). SI instead gets a v_mul_lo_u32 / v_mul_hi_u32
; pair followed by an add / addc carry chain.
; In the GFX1100 checks the two multiplicands are first copied to v4/v5, while
; the GFX1150 checks feed v0/v1 to the mad directly.
; The extended values are named %zext0/%zext1 to match the zext instructions
; that define them; SSA value names do not influence the generated code.
116define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
117; CI-LABEL: mad_u64_u32_zextops:
118; CI:       ; %bb.0:
119; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
120; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
121; CI-NEXT:    s_setpc_b64 s[30:31]
122;
123; SI-LABEL: mad_u64_u32_zextops:
124; SI:       ; %bb.0:
125; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
127; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
128; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
129; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
130; SI-NEXT:    s_setpc_b64 s[30:31]
131;
132; GFX9-LABEL: mad_u64_u32_zextops:
133; GFX9:       ; %bb.0:
134; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
135; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
136; GFX9-NEXT:    s_setpc_b64 s[30:31]
137;
138; GFX1100-LABEL: mad_u64_u32_zextops:
139; GFX1100:       ; %bb.0:
140; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
142; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
143; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
144; GFX1100-NEXT:    s_setpc_b64 s[30:31]
145;
146; GFX1150-LABEL: mad_u64_u32_zextops:
147; GFX1150:       ; %bb.0:
148; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
149; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
150; GFX1150-NEXT:    s_setpc_b64 s[30:31]
151;
152; GFX12-LABEL: mad_u64_u32_zextops:
153; GFX12:       ; %bb.0:
154; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
155; GFX12-NEXT:    s_wait_expcnt 0x0
156; GFX12-NEXT:    s_wait_samplecnt 0x0
157; GFX12-NEXT:    s_wait_bvhcnt 0x0
158; GFX12-NEXT:    s_wait_kmcnt 0x0
159; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
160; GFX12-NEXT:    s_setpc_b64 s[30:31]
161  %zext0 = zext i32 %arg0 to i64
162  %zext1 = zext i32 %arg1 to i64
163  %mul = mul i64 %zext0, %zext1
164  %mad = add i64 %mul, %arg2
165  ret i64 %mad
166}
167
; Same unsigned multiply-add as @mad_u64_u32_zextops, but the add is commuted:
; %arg2 is the first operand and the product is the second.
; The checks still expect a single v_mad_u64_u32 (v_mad_co_u64_u32 on GFX12)
; everywhere except SI. On SI the only visible difference from the
; non-commuted test is the source operand order of the add / addc pair.
; The extended values are named %zext0/%zext1 to match the zext instructions
; that define them; SSA value names do not influence the generated code.
168define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
169; CI-LABEL: mad_u64_u32_zextops_commute:
170; CI:       ; %bb.0:
171; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
173; CI-NEXT:    s_setpc_b64 s[30:31]
174;
175; SI-LABEL: mad_u64_u32_zextops_commute:
176; SI:       ; %bb.0:
177; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
179; SI-NEXT:    v_mul_hi_u32 v1, v0, v1
180; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v4
181; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
182; SI-NEXT:    s_setpc_b64 s[30:31]
183;
184; GFX9-LABEL: mad_u64_u32_zextops_commute:
185; GFX9:       ; %bb.0:
186; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
187; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
188; GFX9-NEXT:    s_setpc_b64 s[30:31]
189;
190; GFX1100-LABEL: mad_u64_u32_zextops_commute:
191; GFX1100:       ; %bb.0:
192; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
193; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
194; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
195; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
196; GFX1100-NEXT:    s_setpc_b64 s[30:31]
197;
198; GFX1150-LABEL: mad_u64_u32_zextops_commute:
199; GFX1150:       ; %bb.0:
200; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3]
202; GFX1150-NEXT:    s_setpc_b64 s[30:31]
203;
204; GFX12-LABEL: mad_u64_u32_zextops_commute:
205; GFX12:       ; %bb.0:
206; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
207; GFX12-NEXT:    s_wait_expcnt 0x0
208; GFX12-NEXT:    s_wait_samplecnt 0x0
209; GFX12-NEXT:    s_wait_bvhcnt 0x0
210; GFX12-NEXT:    s_wait_kmcnt 0x0
211; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3]
212; GFX12-NEXT:    s_setpc_b64 s[30:31]
213  %zext0 = zext i32 %arg0 to i64
214  %zext1 = zext i32 %arg1 to i64
215  %mul = mul i64 %zext0, %zext1
216  %mad = add i64 %arg2, %mul
217  ret i64 %mad
218}
219
; Wide variant: both i32 arguments are sign-extended to i128, multiplied, and
; the i128 %arg2 is added with the product as the first operand of the add.
; No prefix gets a single mad here. The CI, GFX9, GFX11 and GFX12 checks show a
; multi-instruction expansion that mixes v_mad_u64_u32 and v_mad_i64_i32 with
; v_ashrrev_i32 ..., 31 of each source. SI uses a chain of v_mul_lo_u32,
; v_mul_hi_u32, v_mul_hi_i32 and v_mul_i32_i24.
; Every prefix ends with a four-instruction add / addc carry chain that adds
; v2..v5 and writes v0..v3.
220define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
221; CI-LABEL: mad_i64_i32_sextops_i32_i128:
222; CI:       ; %bb.0:
223; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; CI-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
225; CI-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
226; CI-NEXT:    v_mov_b32_e32 v8, 0
227; CI-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v12, v1, v[7:8]
228; CI-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
229; CI-NEXT:    v_mov_b32_e32 v11, v10
230; CI-NEXT:    v_mov_b32_e32 v10, v8
231; CI-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v0, v13, v[9:10]
232; CI-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
233; CI-NEXT:    v_mad_i64_i32 v[10:11], s[4:5], v1, v12, 0
234; CI-NEXT:    v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
235; CI-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v12, v13, v[8:9]
236; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v13, v0, v[10:11]
237; CI-NEXT:    v_add_i32_e32 v8, vcc, v8, v0
238; CI-NEXT:    v_addc_u32_e32 v9, vcc, v9, v1, vcc
239; CI-NEXT:    v_mov_b32_e32 v1, v7
240; CI-NEXT:    v_add_i32_e32 v0, vcc, v6, v2
241; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
242; CI-NEXT:    v_addc_u32_e32 v2, vcc, v8, v4, vcc
243; CI-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
244; CI-NEXT:    s_setpc_b64 s[30:31]
245;
246; SI-LABEL: mad_i64_i32_sextops_i32_i128:
247; SI:       ; %bb.0:
248; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
249; SI-NEXT:    v_ashrrev_i32_e32 v6, 31, v0
250; SI-NEXT:    v_mul_lo_u32 v11, v6, v1
251; SI-NEXT:    v_mul_hi_u32 v12, v0, v1
252; SI-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
253; SI-NEXT:    v_mul_hi_u32 v14, v6, v1
254; SI-NEXT:    v_mul_lo_u32 v13, v0, v7
255; SI-NEXT:    v_mul_hi_u32 v10, v0, v7
256; SI-NEXT:    v_add_i32_e32 v12, vcc, v11, v12
257; SI-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
258; SI-NEXT:    v_mul_hi_u32 v8, v6, v7
259; SI-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
260; SI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
261; SI-NEXT:    v_mul_i32_i24_e32 v9, v6, v7
262; SI-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
263; SI-NEXT:    v_mul_hi_i32 v6, v1, v6
264; SI-NEXT:    v_mul_hi_i32 v7, v7, v0
265; SI-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
266; SI-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
267; SI-NEXT:    v_addc_u32_e32 v8, vcc, v8, v14, vcc
268; SI-NEXT:    v_add_i32_e32 v10, vcc, v13, v11
269; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
270; SI-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
271; SI-NEXT:    v_add_i32_e32 v7, vcc, v9, v10
272; SI-NEXT:    v_addc_u32_e32 v6, vcc, v8, v6, vcc
273; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
274; SI-NEXT:    v_addc_u32_e32 v1, vcc, v12, v3, vcc
275; SI-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
276; SI-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
277; SI-NEXT:    s_setpc_b64 s[30:31]
278;
279; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
280; GFX9:       ; %bb.0:
281; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
282; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
283; GFX9-NEXT:    v_ashrrev_i32_e32 v13, 31, v0
284; GFX9-NEXT:    v_mov_b32_e32 v9, 0
285; GFX9-NEXT:    v_mov_b32_e32 v8, v7
286; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
287; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
288; GFX9-NEXT:    v_mov_b32_e32 v8, v11
289; GFX9-NEXT:    v_mov_b32_e32 v11, v9
290; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
291; GFX9-NEXT:    v_mov_b32_e32 v12, v11
292; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
293; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
294; GFX9-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
295; GFX9-NEXT:    v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
296; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
297; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v0
298; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
299; GFX9-NEXT:    v_mov_b32_e32 v1, v10
300; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v2
301; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
302; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
303; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
304; GFX9-NEXT:    s_setpc_b64 s[30:31]
305;
306; GFX1100-LABEL: mad_i64_i32_sextops_i32_i128:
307; GFX1100:       ; %bb.0:
308; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309; GFX1100-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
310; GFX1100-NEXT:    v_mov_b32_e32 v8, 0
311; GFX1100-NEXT:    v_ashrrev_i32_e32 v14, 31, v0
312; GFX1100-NEXT:    v_ashrrev_i32_e32 v15, 31, v1
313; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
314; GFX1100-NEXT:    v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8]
315; GFX1100-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
316; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
317; GFX1100-NEXT:    v_mad_u64_u32 v[7:8], null, v0, v15, v[9:10]
318; GFX1100-NEXT:    v_mov_b32_e32 v10, v8
319; GFX1100-NEXT:    v_mad_i64_i32 v[8:9], null, v1, v14, 0
320; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
321; GFX1100-NEXT:    v_add_co_u32 v10, s0, v11, v10
322; GFX1100-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
323; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
324; GFX1100-NEXT:    v_mad_i64_i32 v[12:13], null, v15, v0, v[8:9]
325; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v14, v15, v[10:11]
326; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
327; GFX1100-NEXT:    v_add_co_u32 v8, vcc_lo, v0, v12
328; GFX1100-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
329; GFX1100-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
330; GFX1100-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
331; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
332; GFX1100-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
333; GFX1100-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
334; GFX1100-NEXT:    s_setpc_b64 s[30:31]
335;
336; GFX1150-LABEL: mad_i64_i32_sextops_i32_i128:
337; GFX1150:       ; %bb.0:
338; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
339; GFX1150-NEXT:    v_mad_u64_u32 v[6:7], null, v0, v1, 0
340; GFX1150-NEXT:    v_mov_b32_e32 v8, 0
341; GFX1150-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
342; GFX1150-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
343; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
344; GFX1150-NEXT:    v_mad_u64_u32 v[9:10], null, v12, v1, v[7:8]
345; GFX1150-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
346; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
347; GFX1150-NEXT:    v_mad_u64_u32 v[7:8], null, v0, v13, v[9:10]
348; GFX1150-NEXT:    v_mov_b32_e32 v10, v8
349; GFX1150-NEXT:    v_mad_i64_i32 v[8:9], null, v1, v12, 0
350; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
351; GFX1150-NEXT:    v_add_co_u32 v10, s0, v11, v10
352; GFX1150-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
353; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
354; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v13, v0, v[8:9]
355; GFX1150-NEXT:    v_mad_u64_u32 v[8:9], null, v12, v13, v[10:11]
356; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
357; GFX1150-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v0
358; GFX1150-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
359; GFX1150-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
360; GFX1150-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
361; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
362; GFX1150-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
363; GFX1150-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
364; GFX1150-NEXT:    s_setpc_b64 s[30:31]
365;
366; GFX12-LABEL: mad_i64_i32_sextops_i32_i128:
367; GFX12:       ; %bb.0:
368; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
369; GFX12-NEXT:    s_wait_expcnt 0x0
370; GFX12-NEXT:    s_wait_samplecnt 0x0
371; GFX12-NEXT:    s_wait_bvhcnt 0x0
372; GFX12-NEXT:    s_wait_kmcnt 0x0
373; GFX12-NEXT:    v_mad_co_u64_u32 v[6:7], null, v0, v1, 0
374; GFX12-NEXT:    v_mov_b32_e32 v8, 0
375; GFX12-NEXT:    v_ashrrev_i32_e32 v12, 31, v0
376; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v1
377; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
378; GFX12-NEXT:    v_mad_co_u64_u32 v[9:10], null, v12, v1, v[7:8]
379; GFX12-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8
380; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
381; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10]
382; GFX12-NEXT:    v_mov_b32_e32 v10, v8
383; GFX12-NEXT:    v_mad_co_i64_i32 v[8:9], null, v1, v12, 0
384; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
385; GFX12-NEXT:    v_add_co_u32 v10, s0, v11, v10
386; GFX12-NEXT:    v_add_co_ci_u32_e64 v11, null, 0, 0, s0
387; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
388; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9]
389; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11]
390; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
391; GFX12-NEXT:    v_add_co_u32 v8, vcc_lo, v8, v0
392; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo
393; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v2
394; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
395; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
396; GFX12-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
397; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
398; GFX12-NEXT:    s_setpc_b64 s[30:31]
399  %sext0 = sext i32 %arg0 to i128
400  %sext1 = sext i32 %arg1 to i128
401  %mul = mul i128 %sext0, %sext1
402  %mad = add i128 %mul, %arg2
403  ret i128 %mad
404}
405
; Odd-width variant: both i32 arguments are sign-extended to i63, multiplied,
; and the i63 %arg2 is added with the product as the first operand of the add.
; The expected instructions match the plain i64 case: a single v_mad_i64_i32 on
; CI, GFX9 and GFX11 (v_mad_co_i64_i32 on GFX12), and a v_mul_lo_u32 /
; v_mul_hi_i32 / add / addc sequence on SI. No extra masking or sign-extension
; instruction appears in any of the checks.
406define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 {
407; CI-LABEL: mad_i64_i32_sextops_i32_i63:
408; CI:       ; %bb.0:
409; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
411; CI-NEXT:    s_setpc_b64 s[30:31]
412;
413; SI-LABEL: mad_i64_i32_sextops_i32_i63:
414; SI:       ; %bb.0:
415; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
416; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
417; SI-NEXT:    v_mul_hi_i32 v1, v0, v1
418; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v2
419; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
420; SI-NEXT:    s_setpc_b64 s[30:31]
421;
422; GFX9-LABEL: mad_i64_i32_sextops_i32_i63:
423; GFX9:       ; %bb.0:
424; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
426; GFX9-NEXT:    s_setpc_b64 s[30:31]
427;
428; GFX1100-LABEL: mad_i64_i32_sextops_i32_i63:
429; GFX1100:       ; %bb.0:
430; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
431; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
432; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
433; GFX1100-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
434; GFX1100-NEXT:    s_setpc_b64 s[30:31]
435;
436; GFX1150-LABEL: mad_i64_i32_sextops_i32_i63:
437; GFX1150:       ; %bb.0:
438; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
439; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
440; GFX1150-NEXT:    s_setpc_b64 s[30:31]
441;
442; GFX12-LABEL: mad_i64_i32_sextops_i32_i63:
443; GFX12:       ; %bb.0:
444; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
445; GFX12-NEXT:    s_wait_expcnt 0x0
446; GFX12-NEXT:    s_wait_samplecnt 0x0
447; GFX12-NEXT:    s_wait_bvhcnt 0x0
448; GFX12-NEXT:    s_wait_kmcnt 0x0
449; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
450; GFX12-NEXT:    s_setpc_b64 s[30:31]
451  %sext0 = sext i32 %arg0 to i63
452  %sext1 = sext i32 %arg1 to i63
453  %mul = mul i63 %sext0, %sext1
454  %mad = add i63 %mul, %arg2
455  ret i63 %mad
456}
457
; Narrow-source variant: both arguments are i31, sign-extended to i63,
; multiplied, and the i63 %arg2 is added with the product as the first operand
; of the add.
; The CI, GFX9, GFX11 and GFX12 checks show a v_bfe_i32 ..., 0, 31 on each
; source ahead of a single v_mad_i64_i32 (v_mad_co_i64_i32 on GFX12). In the
; GFX1100 checks the v_bfe_i32 results land in v4/v5 rather than v1/v0.
; SI has no mad in its checks: it uses v_lshlrev_b32 by 1 and v_ashr_i64 by 33
; on the sources, then v_mul_lo_u32 / v_mul_hi_i32 and an add / addc pair.
458define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 {
459; CI-LABEL: mad_i64_i32_sextops_i31_i63:
460; CI:       ; %bb.0:
461; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
462; CI-NEXT:    v_bfe_i32 v1, v1, 0, 31
463; CI-NEXT:    v_bfe_i32 v0, v0, 0, 31
464; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
465; CI-NEXT:    s_setpc_b64 s[30:31]
466;
467; SI-LABEL: mad_i64_i32_sextops_i31_i63:
468; SI:       ; %bb.0:
469; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
470; SI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
471; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
472; SI-NEXT:    v_ashr_i64 v[4:5], v[3:4], 33
473; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 33
474; SI-NEXT:    v_mul_lo_u32 v1, v4, v0
475; SI-NEXT:    v_mul_hi_i32 v4, v4, v0
476; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v2
477; SI-NEXT:    v_addc_u32_e32 v1, vcc, v4, v3, vcc
478; SI-NEXT:    s_setpc_b64 s[30:31]
479;
480; GFX9-LABEL: mad_i64_i32_sextops_i31_i63:
481; GFX9:       ; %bb.0:
482; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
483; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 31
484; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 31
485; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
486; GFX9-NEXT:    s_setpc_b64 s[30:31]
487;
488; GFX1100-LABEL: mad_i64_i32_sextops_i31_i63:
489; GFX1100:       ; %bb.0:
490; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491; GFX1100-NEXT:    v_bfe_i32 v4, v1, 0, 31
492; GFX1100-NEXT:    v_bfe_i32 v5, v0, 0, 31
493; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
494; GFX1100-NEXT:    v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3]
495; GFX1100-NEXT:    s_setpc_b64 s[30:31]
496;
497; GFX1150-LABEL: mad_i64_i32_sextops_i31_i63:
498; GFX1150:       ; %bb.0:
499; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
500; GFX1150-NEXT:    v_bfe_i32 v1, v1, 0, 31
501; GFX1150-NEXT:    v_bfe_i32 v0, v0, 0, 31
502; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
503; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, v[2:3]
504; GFX1150-NEXT:    s_setpc_b64 s[30:31]
505;
506; GFX12-LABEL: mad_i64_i32_sextops_i31_i63:
507; GFX12:       ; %bb.0:
508; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
509; GFX12-NEXT:    s_wait_expcnt 0x0
510; GFX12-NEXT:    s_wait_samplecnt 0x0
511; GFX12-NEXT:    s_wait_bvhcnt 0x0
512; GFX12-NEXT:    s_wait_kmcnt 0x0
513; GFX12-NEXT:    v_bfe_i32 v1, v1, 0, 31
514; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 31
515; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
516; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3]
517; GFX12-NEXT:    s_setpc_b64 s[30:31]
518  %sext0 = sext i31 %arg0 to i63
519  %sext1 = sext i31 %arg1 to i63
520  %mul = mul i63 %sext0, %sext1
521  %mad = add i63 %mul, %arg2
522  ret i63 %mad
523}
524
; Mixed-extension variant: %arg0 is sign-extended and %arg1 is zero-extended to
; i64 before the multiply; %arg2 is then added with the product as the first
; operand of the add.
; No prefix gets a single signed mad. The CI, GFX9, GFX11 and GFX12 checks use
; v_mad_u64_u32 on the two sources plus a second step built from
; v_ashrrev_i32 ..., 31 of the first source: a v_mul_lo_u32 and add into v1 on
; CI, and a second v_mad_u64_u32 on GFX9, GFX11 and GFX12.
; SI uses v_mul_hi_u32 / v_mul_lo_u32 with the same v_ashrrev_i32 term and an
; add / addc pair.
525define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
526; CI-LABEL: mad_i64_i32_extops_i32_i64:
527; CI:       ; %bb.0:
528; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529; CI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
530; CI-NEXT:    v_mul_lo_u32 v4, v4, v1
531; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v1, v[2:3]
532; CI-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
533; CI-NEXT:    s_setpc_b64 s[30:31]
534;
535; SI-LABEL: mad_i64_i32_extops_i32_i64:
536; SI:       ; %bb.0:
537; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
538; SI-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
539; SI-NEXT:    v_mul_hi_u32 v5, v0, v1
540; SI-NEXT:    v_mul_lo_u32 v4, v4, v1
541; SI-NEXT:    v_mul_lo_u32 v0, v0, v1
542; SI-NEXT:    v_add_i32_e32 v1, vcc, v5, v4
543; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
544; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
545; SI-NEXT:    s_setpc_b64 s[30:31]
546;
547; GFX9-LABEL: mad_i64_i32_extops_i32_i64:
548; GFX9:       ; %bb.0:
549; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
550; GFX9-NEXT:    v_mov_b32_e32 v4, v1
551; GFX9-NEXT:    v_ashrrev_i32_e32 v5, 31, v0
552; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v4, v[2:3]
553; GFX9-NEXT:    v_mov_b32_e32 v2, v1
554; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v4, v[2:3]
555; GFX9-NEXT:    v_mov_b32_e32 v1, v2
556; GFX9-NEXT:    s_setpc_b64 s[30:31]
557;
558; GFX1100-LABEL: mad_i64_i32_extops_i32_i64:
559; GFX1100:       ; %bb.0:
560; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
561; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
562; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
563; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
564; GFX1100-NEXT:    v_ashrrev_i32_e32 v5, 31, v5
565; GFX1100-NEXT:    v_mov_b32_e32 v3, v1
566; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
567; GFX1100-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4]
568; GFX1100-NEXT:    s_setpc_b64 s[30:31]
569;
570; GFX1150-LABEL: mad_i64_i32_extops_i32_i64:
571; GFX1150:       ; %bb.0:
572; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
573; GFX1150-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
574; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
575; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3]
576; GFX1150-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
577; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v2, v4, v[1:2]
578; GFX1150-NEXT:    s_setpc_b64 s[30:31]
579;
580; GFX12-LABEL: mad_i64_i32_extops_i32_i64:
581; GFX12:       ; %bb.0:
582; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
583; GFX12-NEXT:    s_wait_expcnt 0x0
584; GFX12-NEXT:    s_wait_samplecnt 0x0
585; GFX12-NEXT:    s_wait_bvhcnt 0x0
586; GFX12-NEXT:    s_wait_kmcnt 0x0
587; GFX12-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
588; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
589; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3]
590; GFX12-NEXT:    v_ashrrev_i32_e32 v2, 31, v5
591; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2]
592; GFX12-NEXT:    s_setpc_b64 s[30:31]
593  %ext0 = sext i32 %arg0 to i64
594  %ext1 = zext i32 %arg1 to i64
595  %mul = mul i64 %ext0, %ext1
596  %mad = add i64 %mul, %arg2
597  ret i64 %mad
598}
599
; Mask-based unsigned variant: instead of zext from i32, both i64 arguments are
; ANDed with 4294967295 (0xFFFFFFFF) before the multiply; %arg2 is then added
; with the product as the first operand of the add.
; The checks expect a single v_mad_u64_u32 (v_mad_co_u64_u32 on GFX12) taking
; v0 and v2 as its 32-bit sources and v[4:5] as the addend, with no separate
; AND instruction. SI gets a v_mul_lo_u32 / v_mul_hi_u32 / add / addc sequence.
; In the GFX1100 checks v0 is first copied to v3; GFX1150 uses v0 directly.
600define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
601; CI-LABEL: mad_u64_u32_bitops:
602; CI:       ; %bb.0:
603; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
605; CI-NEXT:    s_setpc_b64 s[30:31]
606;
607; SI-LABEL: mad_u64_u32_bitops:
608; SI:       ; %bb.0:
609; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
611; SI-NEXT:    v_mul_hi_u32 v2, v0, v2
612; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
613; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
614; SI-NEXT:    s_setpc_b64 s[30:31]
615;
616; GFX9-LABEL: mad_u64_u32_bitops:
617; GFX9:       ; %bb.0:
618; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
619; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
620; GFX9-NEXT:    s_setpc_b64 s[30:31]
621;
622; GFX1100-LABEL: mad_u64_u32_bitops:
623; GFX1100:       ; %bb.0:
624; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
625; GFX1100-NEXT:    v_mov_b32_e32 v3, v0
626; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
627; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5]
628; GFX1100-NEXT:    s_setpc_b64 s[30:31]
629;
630; GFX1150-LABEL: mad_u64_u32_bitops:
631; GFX1150:       ; %bb.0:
632; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
633; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v2, v[4:5]
634; GFX1150-NEXT:    s_setpc_b64 s[30:31]
635;
636; GFX12-LABEL: mad_u64_u32_bitops:
637; GFX12:       ; %bb.0:
638; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
639; GFX12-NEXT:    s_wait_expcnt 0x0
640; GFX12-NEXT:    s_wait_samplecnt 0x0
641; GFX12-NEXT:    s_wait_bvhcnt 0x0
642; GFX12-NEXT:    s_wait_kmcnt 0x0
643; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
644; GFX12-NEXT:    s_setpc_b64 s[30:31]
645  %trunc.lhs = and i64 %arg0, 4294967295
646  %trunc.rhs = and i64 %arg1, 4294967295
647  %mul = mul i64 %trunc.lhs, %trunc.rhs
648  %add = add i64 %mul, %arg2
649  ret i64 %add
650}
651
; Like @mad_u64_u32_bitops, except the left-hand mask is 8589934591
; (0x1FFFFFFFF, 33 bits) while the right-hand mask stays 4294967295
; (0xFFFFFFFF). The left multiplicand therefore keeps one bit above bit 31.
; The checks do not collapse this to a single mad: every prefix has a
; v_and_b32 of v1 with 1 and a second multiply step. That step is a
; v_mul_lo_u32 plus an add into v1 on CI and SI, and a second v_mad_u64_u32
; (v_mad_co_u64_u32 on GFX12) on GFX9, GFX11 and GFX12.
652define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
653; CI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
654; CI:       ; %bb.0:
655; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
656; CI-NEXT:    v_and_b32_e32 v3, 1, v1
657; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
658; CI-NEXT:    v_mul_lo_u32 v2, v3, v2
659; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
660; CI-NEXT:    s_setpc_b64 s[30:31]
661;
662; SI-LABEL: mad_u64_u32_bitops_lhs_mask_small:
663; SI:       ; %bb.0:
664; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
665; SI-NEXT:    v_and_b32_e32 v1, 1, v1
666; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
667; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
668; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
669; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
670; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
671; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
672; SI-NEXT:    s_setpc_b64 s[30:31]
673;
674; GFX9-LABEL: mad_u64_u32_bitops_lhs_mask_small:
675; GFX9:       ; %bb.0:
676; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
677; GFX9-NEXT:    v_and_b32_e32 v3, 1, v1
678; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, v2, v[4:5]
679; GFX9-NEXT:    v_mov_b32_e32 v4, v1
680; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v3, v2, v[4:5]
681; GFX9-NEXT:    v_mov_b32_e32 v1, v2
682; GFX9-NEXT:    s_setpc_b64 s[30:31]
683;
684; GFX1100-LABEL: mad_u64_u32_bitops_lhs_mask_small:
685; GFX1100:       ; %bb.0:
686; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
687; GFX1100-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0
688; GFX1100-NEXT:    v_mov_b32_e32 v6, v1
689; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
690; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5]
691; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6
692; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
693; GFX1100-NEXT:    v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5]
694; GFX1100-NEXT:    s_setpc_b64 s[30:31]
695;
696; GFX1150-LABEL: mad_u64_u32_bitops_lhs_mask_small:
697; GFX1150:       ; %bb.0:
698; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
699; GFX1150-NEXT:    v_mov_b32_e32 v3, v1
700; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v0, v2, v[4:5]
701; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
702; GFX1150-NEXT:    v_and_b32_e32 v3, 1, v3
703; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v3, v2, v[1:2]
704; GFX1150-NEXT:    s_setpc_b64 s[30:31]
705;
706; GFX12-LABEL: mad_u64_u32_bitops_lhs_mask_small:
707; GFX12:       ; %bb.0:
708; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
709; GFX12-NEXT:    s_wait_expcnt 0x0
710; GFX12-NEXT:    s_wait_samplecnt 0x0
711; GFX12-NEXT:    s_wait_bvhcnt 0x0
712; GFX12-NEXT:    s_wait_kmcnt 0x0
713; GFX12-NEXT:    v_mov_b32_e32 v3, v1
714; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5]
715; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
716; GFX12-NEXT:    v_and_b32_e32 v3, 1, v3
717; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2]
718; GFX12-NEXT:    s_setpc_b64 s[30:31]
719  %trunc.lhs = and i64 %arg0, 8589934591
720  %trunc.rhs = and i64 %arg1, 4294967295
721  %mul = mul i64 %trunc.lhs, %trunc.rhs
722  %add = add i64 %mul, %arg2
723  ret i64 %add
724}
725
; Unsigned multiply-add where only the left operand fits in 32 bits: %arg0 is
; masked to its low 32 bits (4294967295 = 2^32 - 1) while %arg1 keeps 33 bits
; (8589934591 = 2^33 - 1). The expected code is therefore not a single mad:
; every run line isolates one extra bit with a v_and_b32 against 1 (applied to
; v3, presumably the high dword of %arg1) and adds the resulting partial
; product into the high dword of the result.
726define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
727; CI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
728; CI:       ; %bb.0:
729; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
730; CI-NEXT:    v_mov_b32_e32 v6, v0
731; CI-NEXT:    v_and_b32_e32 v3, 1, v3
732; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
733; CI-NEXT:    v_mul_lo_u32 v2, v6, v3
734; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
735; CI-NEXT:    s_setpc_b64 s[30:31]
736;
737; SI-LABEL: mad_u64_u32_bitops_rhs_mask_small:
738; SI:       ; %bb.0:
739; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
740; SI-NEXT:    v_and_b32_e32 v1, 1, v3
741; SI-NEXT:    v_mul_hi_u32 v3, v0, v2
742; SI-NEXT:    v_mul_lo_u32 v1, v0, v1
743; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
744; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
745; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
746; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
747; SI-NEXT:    s_setpc_b64 s[30:31]
748;
749; GFX9-LABEL: mad_u64_u32_bitops_rhs_mask_small:
750; GFX9:       ; %bb.0:
751; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX9-NEXT:    v_mov_b32_e32 v6, v0
753; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v2, v[4:5]
754; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
755; GFX9-NEXT:    v_mov_b32_e32 v2, v1
756; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v3, v[2:3]
757; GFX9-NEXT:    v_mov_b32_e32 v1, v2
758; GFX9-NEXT:    s_setpc_b64 s[30:31]
759;
760; GFX1100-LABEL: mad_u64_u32_bitops_rhs_mask_small:
761; GFX1100:       ; %bb.0:
762; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
763; GFX1100-NEXT:    v_mov_b32_e32 v6, v0
764; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
765; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
766; GFX1100-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3
767; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
768; GFX1100-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4]
769; GFX1100-NEXT:    s_setpc_b64 s[30:31]
770;
771; GFX1150-LABEL: mad_u64_u32_bitops_rhs_mask_small:
772; GFX1150:       ; %bb.0:
773; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
774; GFX1150-NEXT:    v_mov_b32_e32 v6, v0
775; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
776; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5]
777; GFX1150-NEXT:    v_and_b32_e32 v2, 1, v3
778; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v2, v[1:2]
779; GFX1150-NEXT:    s_setpc_b64 s[30:31]
780;
781; GFX12-LABEL: mad_u64_u32_bitops_rhs_mask_small:
782; GFX12:       ; %bb.0:
783; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
784; GFX12-NEXT:    s_wait_expcnt 0x0
785; GFX12-NEXT:    s_wait_samplecnt 0x0
786; GFX12-NEXT:    s_wait_bvhcnt 0x0
787; GFX12-NEXT:    s_wait_kmcnt 0x0
788; GFX12-NEXT:    v_mov_b32_e32 v6, v0
789; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
790; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5]
791; GFX12-NEXT:    v_and_b32_e32 v2, 1, v3
792; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2]
793; GFX12-NEXT:    s_setpc_b64 s[30:31]
794  %trunc.lhs = and i64 %arg0, 4294967295
795  %trunc.rhs = and i64 %arg1, 8589934591
796  %mul = mul i64 %trunc.lhs, %trunc.rhs
797  %add = add i64 %mul, %arg2
798  ret i64 %add
799}
800
; Signed 32x32->64 multiply-add where the sign extension is spelled with bit
; operations instead of sext: each i64 operand is shifted left by 32 and then
; arithmetically shifted right by 32, which sign-extends its low 32 bits.
; The expected output is a single signed 64-bit mad (v_mad_i64_i32, or
; v_mad_co_i64_i32 on gfx1200) on every run line except tahiti, which uses
; v_mul_lo_u32 + v_mul_hi_i32 followed by an add-with-carry pair.
801define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 {
802; CI-LABEL: mad_i64_i32_bitops:
803; CI:       ; %bb.0:
804; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
805; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
806; CI-NEXT:    s_setpc_b64 s[30:31]
807;
808; SI-LABEL: mad_i64_i32_bitops:
809; SI:       ; %bb.0:
810; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
811; SI-NEXT:    v_mul_lo_u32 v1, v0, v2
812; SI-NEXT:    v_mul_hi_i32 v2, v0, v2
813; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v4
814; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v5, vcc
815; SI-NEXT:    s_setpc_b64 s[30:31]
816;
817; GFX9-LABEL: mad_i64_i32_bitops:
818; GFX9:       ; %bb.0:
819; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
820; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v2, v[4:5]
821; GFX9-NEXT:    s_setpc_b64 s[30:31]
822;
823; GFX1100-LABEL: mad_i64_i32_bitops:
824; GFX1100:       ; %bb.0:
825; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
826; GFX1100-NEXT:    v_mov_b32_e32 v3, v0
827; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
828; GFX1100-NEXT:    v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5]
829; GFX1100-NEXT:    s_setpc_b64 s[30:31]
830;
831; GFX1150-LABEL: mad_i64_i32_bitops:
832; GFX1150:       ; %bb.0:
833; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v2, v[4:5]
835; GFX1150-NEXT:    s_setpc_b64 s[30:31]
836;
837; GFX12-LABEL: mad_i64_i32_bitops:
838; GFX12:       ; %bb.0:
839; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
840; GFX12-NEXT:    s_wait_expcnt 0x0
841; GFX12-NEXT:    s_wait_samplecnt 0x0
842; GFX12-NEXT:    s_wait_bvhcnt 0x0
843; GFX12-NEXT:    s_wait_kmcnt 0x0
844; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5]
845; GFX12-NEXT:    s_setpc_b64 s[30:31]
846  %shl.lhs = shl i64 %arg0, 32
847  %trunc.lhs = ashr i64 %shl.lhs, 32
848  %shl.rhs = shl i64 %arg1, 32
849  %trunc.rhs = ashr i64 %shl.rhs, 32
850  %mul = mul i64 %trunc.lhs, %trunc.rhs
851  %add = add i64 %mul, %arg2
852  ret i64 %add
853}
854
855; Example from bug report
; Multiplies the high 32 bits of %arg0 (lshr by 32) by its low 32 bits (mask
; 4294967295) and adds %arg0 itself, so both multiplicands and the addend come
; from the same 64-bit value. Despite the "i64_i32" in the name, the expected
; instruction is the unsigned v_mad_u64_u32 (v_mad_co_u64_u32 on gfx1200).
; The gfx1100 run lines write the result to a different register pair and copy
; it back, while the other mad-capable subtargets write v[0:1] in place.
856define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 {
857; CI-LABEL: mad_i64_i32_unpack_i64ops:
858; CI:       ; %bb.0:
859; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
860; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
861; CI-NEXT:    s_setpc_b64 s[30:31]
862;
863; SI-LABEL: mad_i64_i32_unpack_i64ops:
864; SI:       ; %bb.0:
865; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
866; SI-NEXT:    v_mul_lo_u32 v2, v1, v0
867; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
868; SI-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
869; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
870; SI-NEXT:    s_setpc_b64 s[30:31]
871;
872; GFX9-LABEL: mad_i64_i32_unpack_i64ops:
873; GFX9:       ; %bb.0:
874; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
875; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, v[0:1]
876; GFX9-NEXT:    s_setpc_b64 s[30:31]
877;
878; GFX1100-LABEL: mad_i64_i32_unpack_i64ops:
879; GFX1100:       ; %bb.0:
880; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
882; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
883; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
884; GFX1100-NEXT:    s_setpc_b64 s[30:31]
885;
886; GFX1150-LABEL: mad_i64_i32_unpack_i64ops:
887; GFX1150:       ; %bb.0:
888; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
889; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, v[0:1]
890; GFX1150-NEXT:    s_setpc_b64 s[30:31]
891;
892; GFX12-LABEL: mad_i64_i32_unpack_i64ops:
893; GFX12:       ; %bb.0:
894; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
895; GFX12-NEXT:    s_wait_expcnt 0x0
896; GFX12-NEXT:    s_wait_samplecnt 0x0
897; GFX12-NEXT:    s_wait_bvhcnt 0x0
898; GFX12-NEXT:    s_wait_kmcnt 0x0
899; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1]
900; GFX12-NEXT:    s_setpc_b64 s[30:31]
901  %tmp4 = lshr i64 %arg0, 32
902  %tmp5 = and i64 %arg0, 4294967295
903  %mul = mul nuw i64 %tmp4, %tmp5
904  %mad = add i64 %mul, %arg0
905  ret i64 %mad
906}
907
; Kernel variant: the operands are kernel arguments and the result is stored
; to %out instead of being returned. Both i32 inputs are zero-extended (despite
; the "i64_i32" in the name), multiplied as i64 and added to %arg2.
; Expected lowering per run line: hawaii uses v_mad_u64_u32 with a scalar
; source operand; tahiti uses s_mul_i32 + v_mul_hi_u32 and a vector
; add-with-carry; gfx90a and the gfx11 run lines compute the value with
; s_mul_i32/s_mul_hi_u32 and s_add_u32/s_addc_u32; gfx1200 uses
; s_mul_u64 + s_add_nc_u64.
908define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, i32 %arg1, i64 %arg2) #0 {
909; CI-LABEL: mad_i64_i32_uniform:
910; CI:       ; %bb.0:
911; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
912; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
913; CI-NEXT:    s_mov_b32 s7, 0xf000
914; CI-NEXT:    s_mov_b32 s6, -1
915; CI-NEXT:    s_waitcnt lgkmcnt(0)
916; CI-NEXT:    v_mov_b32_e32 v2, s3
917; CI-NEXT:    v_mov_b32_e32 v0, s4
918; CI-NEXT:    v_mov_b32_e32 v1, s5
919; CI-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
920; CI-NEXT:    s_mov_b32 s4, s0
921; CI-NEXT:    s_mov_b32 s5, s1
922; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
923; CI-NEXT:    s_endpgm
924;
925; SI-LABEL: mad_i64_i32_uniform:
926; SI:       ; %bb.0:
927; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
928; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
929; SI-NEXT:    s_mov_b32 s7, 0xf000
930; SI-NEXT:    s_mov_b32 s6, -1
931; SI-NEXT:    s_waitcnt lgkmcnt(0)
932; SI-NEXT:    v_mov_b32_e32 v0, s3
933; SI-NEXT:    v_mul_hi_u32 v1, s2, v0
934; SI-NEXT:    s_mov_b32 s4, s0
935; SI-NEXT:    s_mul_i32 s0, s2, s3
936; SI-NEXT:    v_mov_b32_e32 v0, s0
937; SI-NEXT:    v_mov_b32_e32 v2, s9
938; SI-NEXT:    v_add_i32_e32 v0, vcc, s8, v0
939; SI-NEXT:    s_mov_b32 s5, s1
940; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
941; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
942; SI-NEXT:    s_endpgm
943;
944; GFX9-LABEL: mad_i64_i32_uniform:
945; GFX9:       ; %bb.0:
946; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
947; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
948; GFX9-NEXT:    v_mov_b32_e32 v2, 0
949; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
950; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s3
951; GFX9-NEXT:    s_mul_i32 s2, s2, s3
952; GFX9-NEXT:    s_add_u32 s2, s2, s6
953; GFX9-NEXT:    s_addc_u32 s3, s4, s7
954; GFX9-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
955; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
956; GFX9-NEXT:    s_endpgm
957;
958; GFX11-LABEL: mad_i64_i32_uniform:
959; GFX11:       ; %bb.0:
960; GFX11-NEXT:    s_clause 0x1
961; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
962; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
963; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
964; GFX11-NEXT:    s_mul_i32 s6, s2, s3
965; GFX11-NEXT:    s_mul_hi_u32 s3, s2, s3
966; GFX11-NEXT:    s_add_u32 s2, s6, s4
967; GFX11-NEXT:    s_addc_u32 s3, s3, s5
968; GFX11-NEXT:    v_mov_b32_e32 v0, s2
969; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
970; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
971; GFX11-NEXT:    s_endpgm
972;
973; GFX12-LABEL: mad_i64_i32_uniform:
974; GFX12:       ; %bb.0:
975; GFX12-NEXT:    s_clause 0x1
976; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
977; GFX12-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
978; GFX12-NEXT:    s_mov_b32 s7, 0
979; GFX12-NEXT:    s_wait_kmcnt 0x0
980; GFX12-NEXT:    s_mov_b32 s6, s2
981; GFX12-NEXT:    s_mov_b32 s2, s3
982; GFX12-NEXT:    s_mov_b32 s3, s7
983; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
984; GFX12-NEXT:    s_mul_u64 s[2:3], s[6:7], s[2:3]
985; GFX12-NEXT:    s_add_nc_u64 s[2:3], s[2:3], s[4:5]
986; GFX12-NEXT:    v_mov_b32_e32 v2, 0
987; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
988; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
989; GFX12-NEXT:    s_endpgm
990  %ext0 = zext i32 %arg0 to i64
991  %ext1 = zext i32 %arg1 to i64
992  %mul = mul i64 %ext0, %ext1
993  %mad = add i64 %mul, %arg2
994  store i64 %mad, ptr addrspace(1) %out
995  ret void
996}
997
; The same sign-extended 32x32 product feeds two different adds (%arg2 and
; %arg3); the two sums are xor'ed together so that both are used.
; Expected lowering: on the mad-capable run lines each add becomes its own
; signed 64-bit mad (the multiply is repeated rather than shared), while
; tahiti computes v_mul_lo_u32/v_mul_hi_i32 once and reuses them for both
; add-with-carry pairs.
998define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 {
999; CI-LABEL: mad_i64_i32_twice:
1000; CI:       ; %bb.0:
1001; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1002; CI-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
1003; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
1004; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
1005; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
1006; CI-NEXT:    s_setpc_b64 s[30:31]
1007;
1008; SI-LABEL: mad_i64_i32_twice:
1009; SI:       ; %bb.0:
1010; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1011; SI-NEXT:    v_mul_lo_u32 v6, v0, v1
1012; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
1013; SI-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
1014; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
1015; SI-NEXT:    v_add_i32_e32 v3, vcc, v6, v4
1016; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v5, vcc
1017; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
1018; SI-NEXT:    v_xor_b32_e32 v0, v2, v3
1019; SI-NEXT:    s_setpc_b64 s[30:31]
1020;
1021; GFX9-LABEL: mad_i64_i32_twice:
1022; GFX9:       ; %bb.0:
1023; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1024; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
1025; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[4:5]
1026; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
1027; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
1028; GFX9-NEXT:    s_setpc_b64 s[30:31]
1029;
1030; GFX1100-LABEL: mad_i64_i32_twice:
1031; GFX1100:       ; %bb.0:
1032; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1033; GFX1100-NEXT:    v_mad_i64_i32 v[6:7], null, v0, v1, v[2:3]
1034; GFX1100-NEXT:    v_mad_i64_i32 v[2:3], null, v0, v1, v[4:5]
1035; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1036; GFX1100-NEXT:    v_xor_b32_e32 v0, v6, v2
1037; GFX1100-NEXT:    v_xor_b32_e32 v1, v7, v3
1038; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1039;
1040; GFX1150-LABEL: mad_i64_i32_twice:
1041; GFX1150:       ; %bb.0:
1042; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1043; GFX1150-NEXT:    v_mad_i64_i32 v[2:3], null, v0, v1, v[2:3]
1044; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, v[4:5]
1045; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1046; GFX1150-NEXT:    v_xor_b32_e32 v0, v2, v0
1047; GFX1150-NEXT:    v_xor_b32_e32 v1, v3, v1
1048; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1049;
1050; GFX12-LABEL: mad_i64_i32_twice:
1051; GFX12:       ; %bb.0:
1052; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1053; GFX12-NEXT:    s_wait_expcnt 0x0
1054; GFX12-NEXT:    s_wait_samplecnt 0x0
1055; GFX12-NEXT:    s_wait_bvhcnt 0x0
1056; GFX12-NEXT:    s_wait_kmcnt 0x0
1057; GFX12-NEXT:    v_mad_co_i64_i32 v[2:3], null, v0, v1, v[2:3]
1058; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, v[4:5]
1059; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1060; GFX12-NEXT:    v_xor_b32_e32 v0, v2, v0
1061; GFX12-NEXT:    v_xor_b32_e32 v1, v3, v1
1062; GFX12-NEXT:    s_setpc_b64 s[30:31]
1063  %sext0 = sext i32 %arg0 to i64
1064  %sext1 = sext i32 %arg1 to i64
1065  %mul = mul i64 %sext0, %sext1
1066  %mad1 = add i64 %mul, %arg2
1067  %mad2 = add i64 %mul, %arg3
1068  %out = xor i64 %mad1, %mad2
1069  ret i64 %out
1070}
1071
; The sign-extended 32x32 product is added to three different values (%arg2,
; %arg3, %arg4) and the three sums are xor'ed together.
; Expected lowering: hawaii, gfx1100, gfx1150 and gfx1200 compute the product
; once with a signed 64-bit mad whose addend is 0 and then perform three
; separate 64-bit add-with-carry pairs; gfx90a instead emits three
; v_mad_i64_i32 instructions; tahiti shares one v_mul_lo_u32/v_mul_hi_i32 pair
; across the three add-with-carry pairs.
1072define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) #0 {
1073; CI-LABEL: mad_i64_i32_thrice:
1074; CI:       ; %bb.0:
1075; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1076; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
1077; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
1078; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
1079; CI-NEXT:    v_add_i32_e32 v4, vcc, v0, v4
1080; CI-NEXT:    v_addc_u32_e32 v5, vcc, v1, v5, vcc
1081; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
1082; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
1083; CI-NEXT:    v_xor_b32_e32 v3, v3, v5
1084; CI-NEXT:    v_xor_b32_e32 v2, v2, v4
1085; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
1086; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
1087; CI-NEXT:    s_setpc_b64 s[30:31]
1088;
1089; SI-LABEL: mad_i64_i32_thrice:
1090; SI:       ; %bb.0:
1091; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1092; SI-NEXT:    v_mul_lo_u32 v8, v0, v1
1093; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
1094; SI-NEXT:    v_add_i32_e32 v1, vcc, v8, v2
1095; SI-NEXT:    v_addc_u32_e32 v2, vcc, v0, v3, vcc
1096; SI-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
1097; SI-NEXT:    v_addc_u32_e32 v4, vcc, v0, v5, vcc
1098; SI-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
1099; SI-NEXT:    v_addc_u32_e32 v0, vcc, v0, v7, vcc
1100; SI-NEXT:    v_xor_b32_e32 v2, v2, v4
1101; SI-NEXT:    v_xor_b32_e32 v3, v1, v3
1102; SI-NEXT:    v_xor_b32_e32 v1, v2, v0
1103; SI-NEXT:    v_xor_b32_e32 v0, v3, v5
1104; SI-NEXT:    s_setpc_b64 s[30:31]
1105;
1106; GFX9-LABEL: mad_i64_i32_thrice:
1107; GFX9:       ; %bb.0:
1108; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1109; GFX9-NEXT:    v_mad_i64_i32 v[2:3], s[4:5], v0, v1, v[2:3]
1110; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, v[4:5]
1111; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[6:7]
1112; GFX9-NEXT:    v_xor_b32_e32 v3, v3, v5
1113; GFX9-NEXT:    v_xor_b32_e32 v2, v2, v4
1114; GFX9-NEXT:    v_xor_b32_e32 v1, v3, v1
1115; GFX9-NEXT:    v_xor_b32_e32 v0, v2, v0
1116; GFX9-NEXT:    s_setpc_b64 s[30:31]
1117;
1118; GFX1100-LABEL: mad_i64_i32_thrice:
1119; GFX1100:       ; %bb.0:
1120; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1121; GFX1100-NEXT:    v_mad_i64_i32 v[8:9], null, v0, v1, 0
1122; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1123; GFX1100-NEXT:    v_add_co_u32 v0, vcc_lo, v8, v2
1124; GFX1100-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo
1125; GFX1100-NEXT:    v_add_co_u32 v2, vcc_lo, v8, v4
1126; GFX1100-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo
1127; GFX1100-NEXT:    v_add_co_u32 v4, vcc_lo, v8, v6
1128; GFX1100-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo
1129; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1130; GFX1100-NEXT:    v_xor_b32_e32 v0, v0, v2
1131; GFX1100-NEXT:    v_xor_b32_e32 v1, v1, v3
1132; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1133; GFX1100-NEXT:    v_xor_b32_e32 v0, v0, v4
1134; GFX1100-NEXT:    v_xor_b32_e32 v1, v1, v5
1135; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1136;
1137; GFX1150-LABEL: mad_i64_i32_thrice:
1138; GFX1150:       ; %bb.0:
1139; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1140; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, 0
1141; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1142; GFX1150-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
1143; GFX1150-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1144; GFX1150-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v4
1145; GFX1150-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
1146; GFX1150-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6
1147; GFX1150-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
1148; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1149; GFX1150-NEXT:    v_xor_b32_e32 v2, v2, v4
1150; GFX1150-NEXT:    v_xor_b32_e32 v3, v3, v5
1151; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1152; GFX1150-NEXT:    v_xor_b32_e32 v0, v2, v0
1153; GFX1150-NEXT:    v_xor_b32_e32 v1, v3, v1
1154; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1155;
1156; GFX12-LABEL: mad_i64_i32_thrice:
1157; GFX12:       ; %bb.0:
1158; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1159; GFX12-NEXT:    s_wait_expcnt 0x0
1160; GFX12-NEXT:    s_wait_samplecnt 0x0
1161; GFX12-NEXT:    s_wait_bvhcnt 0x0
1162; GFX12-NEXT:    s_wait_kmcnt 0x0
1163; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
1164; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1165; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
1166; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1167; GFX12-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v4
1168; GFX12-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo
1169; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v6
1170; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo
1171; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1172; GFX12-NEXT:    v_xor_b32_e32 v2, v2, v4
1173; GFX12-NEXT:    v_xor_b32_e32 v3, v3, v5
1174; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1175; GFX12-NEXT:    v_xor_b32_e32 v0, v2, v0
1176; GFX12-NEXT:    v_xor_b32_e32 v1, v3, v1
1177; GFX12-NEXT:    s_setpc_b64 s[30:31]
1178  %sext0 = sext i32 %arg0 to i64
1179  %sext1 = sext i32 %arg1 to i64
1180  %mul = mul i64 %sext0, %sext1
1181  %mad1 = add i64 %mul, %arg2
1182  %mad2 = add i64 %mul, %arg3
1183  %mad3 = add i64 %mul, %arg4
1184  %out.p = xor i64 %mad1, %mad2
1185  %out = xor i64 %out.p, %mad3
1186  ret i64 %out
1187}
1188
; The sign-extended 32x32 product has a second use besides the add: it is also
; xor'ed with the sum, so the plain product must be available on its own.
; Expected lowering: gfx90a emits two v_mad_i64_i32 (one with addend 0 for the
; plain product, one with v[2:3] as the addend); the other mad-capable run
; lines emit a single mad with addend 0 followed by one 64-bit add-with-carry
; pair; tahiti uses v_mul_lo_u32/v_mul_hi_i32 plus one add-with-carry pair.
1189define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 {
1190; CI-LABEL: mad_i64_i32_secondary_use:
1191; CI:       ; %bb.0:
1192; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, 0
1194; CI-NEXT:    v_add_i32_e32 v2, vcc, v0, v2
1195; CI-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
1196; CI-NEXT:    v_xor_b32_e32 v1, v3, v1
1197; CI-NEXT:    v_xor_b32_e32 v0, v2, v0
1198; CI-NEXT:    s_setpc_b64 s[30:31]
1199;
1200; SI-LABEL: mad_i64_i32_secondary_use:
1201; SI:       ; %bb.0:
1202; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1203; SI-NEXT:    v_mul_lo_u32 v4, v0, v1
1204; SI-NEXT:    v_mul_hi_i32 v0, v0, v1
1205; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1206; SI-NEXT:    v_addc_u32_e32 v1, vcc, v0, v3, vcc
1207; SI-NEXT:    v_xor_b32_e32 v1, v1, v0
1208; SI-NEXT:    v_xor_b32_e32 v0, v2, v4
1209; SI-NEXT:    s_setpc_b64 s[30:31]
1210;
1211; GFX9-LABEL: mad_i64_i32_secondary_use:
1212; GFX9:       ; %bb.0:
1213; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1214; GFX9-NEXT:    v_mad_i64_i32 v[4:5], s[4:5], v0, v1, 0
1215; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v0, v1, v[2:3]
1216; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v5
1217; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v4
1218; GFX9-NEXT:    s_setpc_b64 s[30:31]
1219;
1220; GFX1100-LABEL: mad_i64_i32_secondary_use:
1221; GFX1100:       ; %bb.0:
1222; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1223; GFX1100-NEXT:    v_mad_i64_i32 v[4:5], null, v0, v1, 0
1224; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1225; GFX1100-NEXT:    v_add_co_u32 v0, vcc_lo, v4, v2
1226; GFX1100-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo
1227; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1228; GFX1100-NEXT:    v_xor_b32_e32 v0, v0, v4
1229; GFX1100-NEXT:    v_xor_b32_e32 v1, v1, v5
1230; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1231;
1232; GFX1150-LABEL: mad_i64_i32_secondary_use:
1233; GFX1150:       ; %bb.0:
1234; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1235; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, v0, v1, 0
1236; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1237; GFX1150-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
1238; GFX1150-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1239; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1240; GFX1150-NEXT:    v_xor_b32_e32 v0, v2, v0
1241; GFX1150-NEXT:    v_xor_b32_e32 v1, v3, v1
1242; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1243;
1244; GFX12-LABEL: mad_i64_i32_secondary_use:
1245; GFX12:       ; %bb.0:
1246; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1247; GFX12-NEXT:    s_wait_expcnt 0x0
1248; GFX12-NEXT:    s_wait_samplecnt 0x0
1249; GFX12-NEXT:    s_wait_bvhcnt 0x0
1250; GFX12-NEXT:    s_wait_kmcnt 0x0
1251; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, v0, v1, 0
1252; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1253; GFX12-NEXT:    v_add_co_u32 v2, vcc_lo, v0, v2
1254; GFX12-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
1255; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1256; GFX12-NEXT:    v_xor_b32_e32 v0, v2, v0
1257; GFX12-NEXT:    v_xor_b32_e32 v1, v3, v1
1258; GFX12-NEXT:    s_setpc_b64 s[30:31]
1259  %sext0 = sext i32 %arg0 to i64
1260  %sext1 = sext i32 %arg1 to i64
1261  %mul = mul i64 %sext0, %sext1
1262  %mad = add i64 %mul, %arg2
1263  %out = xor i64 %mad, %mul
1264  ret i64 %out
1265}
1266
; Multiply-add on i48, a type wider than 32 bits but narrower than 64, with no
; explicit extension or masking in the IR.
; Expected lowering on every run line except tahiti: one v_mad_u64_u32
; (v_mad_co_u64_u32 on gfx1200) forms the product of v0 and v2 (presumably the
; low dwords of the operands) plus the addend, and two v_mul_lo_u32 cross terms
; are added into the high dword of the result (two adds on hawaii, a single
; v_add3_u32 on gfx90a and newer). tahiti expands the whole computation into
; v_mul_lo_u32/v_mul_hi_u32 and adds.
1267define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
1268; CI-LABEL: mad_i48_i48:
1269; CI:       ; %bb.0:
1270; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1271; CI-NEXT:    v_mov_b32_e32 v6, v1
1272; CI-NEXT:    v_mov_b32_e32 v7, v0
1273; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
1274; CI-NEXT:    v_mul_lo_u32 v2, v6, v2
1275; CI-NEXT:    v_mul_lo_u32 v3, v7, v3
1276; CI-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1277; CI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1278; CI-NEXT:    s_setpc_b64 s[30:31]
1279;
1280; SI-LABEL: mad_i48_i48:
1281; SI:       ; %bb.0:
1282; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283; SI-NEXT:    v_mul_lo_u32 v3, v0, v3
1284; SI-NEXT:    v_mul_hi_u32 v6, v0, v2
1285; SI-NEXT:    v_mul_lo_u32 v1, v1, v2
1286; SI-NEXT:    v_mul_lo_u32 v0, v0, v2
1287; SI-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
1288; SI-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1289; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
1290; SI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v5, vcc
1291; SI-NEXT:    s_setpc_b64 s[30:31]
1292;
1293; GFX9-LABEL: mad_i48_i48:
1294; GFX9:       ; %bb.0:
1295; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1296; GFX9-NEXT:    v_mov_b32_e32 v6, v1
1297; GFX9-NEXT:    v_mov_b32_e32 v7, v0
1298; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v7, v2, v[4:5]
1299; GFX9-NEXT:    v_mul_lo_u32 v3, v7, v3
1300; GFX9-NEXT:    v_mul_lo_u32 v2, v6, v2
1301; GFX9-NEXT:    v_add3_u32 v1, v2, v1, v3
1302; GFX9-NEXT:    s_setpc_b64 s[30:31]
1303;
1304; GFX11-LABEL: mad_i48_i48:
1305; GFX11:       ; %bb.0:
1306; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1307; GFX11-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
1308; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1309; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5]
1310; GFX11-NEXT:    v_mul_lo_u32 v3, v7, v3
1311; GFX11-NEXT:    v_mul_lo_u32 v2, v6, v2
1312; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1313; GFX11-NEXT:    v_add3_u32 v1, v2, v1, v3
1314; GFX11-NEXT:    s_setpc_b64 s[30:31]
1315;
1316; GFX12-LABEL: mad_i48_i48:
1317; GFX12:       ; %bb.0:
1318; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1319; GFX12-NEXT:    s_wait_expcnt 0x0
1320; GFX12-NEXT:    s_wait_samplecnt 0x0
1321; GFX12-NEXT:    s_wait_bvhcnt 0x0
1322; GFX12-NEXT:    s_wait_kmcnt 0x0
1323; GFX12-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0
1324; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1325; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v7, v2, v[4:5]
1326; GFX12-NEXT:    v_mul_lo_u32 v3, v7, v3
1327; GFX12-NEXT:    v_mul_lo_u32 v2, v6, v2
1328; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1329; GFX12-NEXT:    v_add3_u32 v1, v2, v1, v3
1330; GFX12-NEXT:    s_setpc_b64 s[30:31]
1331  %m = mul i48 %arg0, %arg1
1332  %a = add i48 %m, %arg2
1333  ret i48 %a
1334}
1335
; Computes (%arg0 >> 32) * 0xfffffffffffffc19 + %arg0. %arg1 is not used by
; the body.
; Modulo 2^64 the constant equals 0xfffffc19 - 2^32, so writing hi/lo for the
; two halves of %arg0 the expression simplifies to hi * 0xfffffc19 + lo.
; The mad-capable run lines match that form: the high dword (v1) is copied to
; another register, v1 is zeroed, and a single unsigned 64-bit mad multiplies
; the copied dword by the 32-bit constant with v[0:1] as the addend.
; tahiti keeps separate v_mul_hi_u32/v_mul_lo_u32, a subtract and an
; add-with-carry.
1336define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
1337; CI-LABEL: lshr_mad_i64_1:
1338; CI:       ; %bb.0:
1339; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1340; CI-NEXT:    v_mov_b32_e32 v2, v1
1341; CI-NEXT:    v_mov_b32_e32 v1, 0
1342; CI-NEXT:    s_movk_i32 s4, 0xfc19
1343; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1344; CI-NEXT:    s_setpc_b64 s[30:31]
1345;
1346; SI-LABEL: lshr_mad_i64_1:
1347; SI:       ; %bb.0:
1348; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349; SI-NEXT:    s_movk_i32 s4, 0xfc19
1350; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
1351; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
1352; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
1353; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
1354; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1355; SI-NEXT:    s_setpc_b64 s[30:31]
1356;
1357; GFX9-LABEL: lshr_mad_i64_1:
1358; GFX9:       ; %bb.0:
1359; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1360; GFX9-NEXT:    v_mov_b32_e32 v2, v1
1361; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1362; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
1363; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1364; GFX9-NEXT:    s_setpc_b64 s[30:31]
1365;
1366; GFX1100-LABEL: lshr_mad_i64_1:
1367; GFX1100:       ; %bb.0:
1368; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1369; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, 0
1370; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1371; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc19, v4, v[0:1]
1372; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
1373; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1374;
1375; GFX1150-LABEL: lshr_mad_i64_1:
1376; GFX1150:       ; %bb.0:
1377; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1378; GFX1150-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
1379; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1380; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc19, v2, v[0:1]
1381; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1382;
1383; GFX12-LABEL: lshr_mad_i64_1:
1384; GFX12:       ; %bb.0:
1385; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1386; GFX12-NEXT:    s_wait_expcnt 0x0
1387; GFX12-NEXT:    s_wait_samplecnt 0x0
1388; GFX12-NEXT:    s_wait_bvhcnt 0x0
1389; GFX12-NEXT:    s_wait_kmcnt 0x0
1390; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
1391; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1392; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xfffffc19, v2, v[0:1]
1393; GFX12-NEXT:    s_setpc_b64 s[30:31]
1394  %lsh = lshr i64 %arg0, 32
1395  %mul = mul i64 %lsh, s0xfffffffffffffc19
1396  %mad = add i64 %mul, %arg0
1397
1398  ret i64 %mad
1399}
1400
; Computes (%arg0 >> 32) * 0xffffffff000000d1 + %arg0.
; Modulo 2^64 the constant equals 0xd1 - 2^32, so writing hi/lo for the two
; halves of %arg0 the expression simplifies to hi * 0xd1 + lo.
; The mad-capable run lines match that form: the high dword (v1) is copied to
; another register, v1 is zeroed, and a single unsigned 64-bit mad multiplies
; the copied dword by 0xd1 with v[0:1] as the addend. tahiti keeps separate
; v_mul_hi_u32/v_mul_lo_u32, a subtract and an add-with-carry.
1401define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
1402; CI-LABEL: lshr_mad_i64_2:
1403; CI:       ; %bb.0:
1404; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1405; CI-NEXT:    v_mov_b32_e32 v2, v1
1406; CI-NEXT:    v_mov_b32_e32 v1, 0
1407; CI-NEXT:    s_movk_i32 s4, 0xd1
1408; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1409; CI-NEXT:    s_setpc_b64 s[30:31]
1410;
1411; SI-LABEL: lshr_mad_i64_2:
1412; SI:       ; %bb.0:
1413; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1414; SI-NEXT:    s_movk_i32 s4, 0xd1
1415; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
1416; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
1417; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
1418; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
1419; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1420; SI-NEXT:    s_setpc_b64 s[30:31]
1421;
1422; GFX9-LABEL: lshr_mad_i64_2:
1423; GFX9:       ; %bb.0:
1424; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1425; GFX9-NEXT:    v_mov_b32_e32 v2, v1
1426; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1427; GFX9-NEXT:    s_movk_i32 s4, 0xd1
1428; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1429; GFX9-NEXT:    s_setpc_b64 s[30:31]
1430;
1431; GFX1100-LABEL: lshr_mad_i64_2:
1432; GFX1100:       ; %bb.0:
1433; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1434; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, 0
1435; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1436; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v4, v[0:1]
1437; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
1438; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1439;
1440; GFX1150-LABEL: lshr_mad_i64_2:
1441; GFX1150:       ; %bb.0:
1442; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443; GFX1150-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
1444; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1445; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xd1, v2, v[0:1]
1446; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1447;
1448; GFX12-LABEL: lshr_mad_i64_2:
1449; GFX12:       ; %bb.0:
1450; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1451; GFX12-NEXT:    s_wait_expcnt 0x0
1452; GFX12-NEXT:    s_wait_samplecnt 0x0
1453; GFX12-NEXT:    s_wait_bvhcnt 0x0
1454; GFX12-NEXT:    s_wait_kmcnt 0x0
1455; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
1456; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1457; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xd1, v2, v[0:1]
1458; GFX12-NEXT:    s_setpc_b64 s[30:31]
1459  %lsh = lshr i64 %arg0, 32
1460  %mul = mul i64 %lsh, s0xffffffff000000d1
1461  %mad = add i64 %mul, %arg0
1462
1463  ret i64 %mad
1464}
1465
; lshr_mad_i64_3 computes 0xfffffffffffffc88 * (%arg0 >> 32) + %arg0.
; Unlike the neighbouring tests, the constant is the first operand of the mul
; and the shifted value the second. The checks for CI, GFX9, GFX11 and GFX12
; show a single unsigned 64-bit mad by the low 32 bits of the constant
; (0xfffffc88, materialised with s_movk_i32 0xfc88 on CI/GFX9) with the high
; register of the addend zeroed. The SI checks use mul_hi/mul_lo and add/addc.
1466define i64 @lshr_mad_i64_3(i64 %arg0) #0 {
1467; CI-LABEL: lshr_mad_i64_3:
1468; CI:       ; %bb.0:
1469; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1470; CI-NEXT:    v_mov_b32_e32 v2, v1
1471; CI-NEXT:    v_mov_b32_e32 v1, 0
1472; CI-NEXT:    s_movk_i32 s4, 0xfc88
1473; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1474; CI-NEXT:    s_setpc_b64 s[30:31]
1475;
1476; SI-LABEL: lshr_mad_i64_3:
1477; SI:       ; %bb.0:
1478; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1479; SI-NEXT:    s_movk_i32 s4, 0xfc88
1480; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
1481; SI-NEXT:    v_mul_lo_u32 v3, v1, s4
1482; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v1
1483; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
1484; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1485; SI-NEXT:    s_setpc_b64 s[30:31]
1486;
1487; GFX9-LABEL: lshr_mad_i64_3:
1488; GFX9:       ; %bb.0:
1489; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1490; GFX9-NEXT:    v_mov_b32_e32 v2, v1
1491; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1492; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
1493; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1494; GFX9-NEXT:    s_setpc_b64 s[30:31]
1495;
1496; GFX1100-LABEL: lshr_mad_i64_3:
1497; GFX1100:       ; %bb.0:
1498; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1499; GFX1100-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, 0
1500; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1501; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, 0xfffffc88, v4, v[0:1]
1502; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
1503; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1504;
1505; GFX1150-LABEL: lshr_mad_i64_3:
1506; GFX1150:       ; %bb.0:
1507; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1508; GFX1150-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
1509; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1510; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc88, v2, v[0:1]
1511; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1512;
1513; GFX12-LABEL: lshr_mad_i64_3:
1514; GFX12:       ; %bb.0:
1515; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1516; GFX12-NEXT:    s_wait_expcnt 0x0
1517; GFX12-NEXT:    s_wait_samplecnt 0x0
1518; GFX12-NEXT:    s_wait_bvhcnt 0x0
1519; GFX12-NEXT:    s_wait_kmcnt 0x0
1520; GFX12-NEXT:    v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0
1521; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1522; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v2, v[0:1]
1523; GFX12-NEXT:    s_setpc_b64 s[30:31]
1524  %lsh = lshr i64 %arg0, 32
1525  %mul = mul i64 s0xfffffffffffffc88, %lsh
1526  %mad = add i64 %mul, %arg0
1527
1528  ret i64 %mad
1529}
1530
; lshr_mad_i64_4 applies the shift-multiply-add pattern to a computed value
; instead of a function argument: %mul1 = %arg1 * zext(%arg0) is shifted right
; by 32, multiplied by 0xfffffffffffffc88 and added back to %mul1.
; The CI, GFX9, GFX11 and GFX12 checks end in an unsigned 64-bit mad by
; 0xfffffc88 whose addend is the low half of %mul1 paired with a zeroed high
; register. The SI checks expand everything into 32-bit multiplies and adds.
1531define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 {
1532; CI-LABEL: lshr_mad_i64_4:
1533; CI:       ; %bb.0:
1534; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1535; CI-NEXT:    v_mul_lo_u32 v2, v2, v0
1536; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v1, v0, 0
1537; CI-NEXT:    s_movk_i32 s4, 0xfc88
1538; CI-NEXT:    v_add_i32_e32 v2, vcc, v1, v2
1539; CI-NEXT:    v_mov_b32_e32 v1, 0
1540; CI-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[0:1]
1541; CI-NEXT:    s_setpc_b64 s[30:31]
1542;
1543; SI-LABEL: lshr_mad_i64_4:
1544; SI:       ; %bb.0:
1545; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1546; SI-NEXT:    v_mul_lo_u32 v2, v2, v0
1547; SI-NEXT:    v_mul_hi_u32 v3, v1, v0
1548; SI-NEXT:    s_movk_i32 s4, 0xfc88
1549; SI-NEXT:    v_mul_lo_u32 v0, v1, v0
1550; SI-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1551; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
1552; SI-NEXT:    v_mul_lo_u32 v1, v2, s4
1553; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v2
1554; SI-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
1555; SI-NEXT:    v_addc_u32_e32 v1, vcc, v3, v2, vcc
1556; SI-NEXT:    s_setpc_b64 s[30:31]
1557;
1558; GFX9-LABEL: lshr_mad_i64_4:
1559; GFX9:       ; %bb.0:
1560; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1561; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
1562; GFX9-NEXT:    v_mov_b32_e32 v6, v5
1563; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, v0, v[6:7]
1564; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1565; GFX9-NEXT:    s_movk_i32 s4, 0xfc88
1566; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v0, s4, v[4:5]
1567; GFX9-NEXT:    s_setpc_b64 s[30:31]
1568;
1569; GFX1100-LABEL: lshr_mad_i64_4:
1570; GFX1100:       ; %bb.0:
1571; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1572; GFX1100-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, 0
1573; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1574; GFX1100-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0
1575; GFX1100-NEXT:    v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2]
1576; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1577; GFX1100-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
1578; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1579;
1580; GFX1150-LABEL: lshr_mad_i64_4:
1581; GFX1150:       ; %bb.0:
1582; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1583; GFX1150-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, 0
1584; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1585; GFX1150-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0
1586; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v0, v[1:2]
1587; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1588; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4]
1589; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1590;
1591; GFX12-LABEL: lshr_mad_i64_4:
1592; GFX12:       ; %bb.0:
1593; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1594; GFX12-NEXT:    s_wait_expcnt 0x0
1595; GFX12-NEXT:    s_wait_samplecnt 0x0
1596; GFX12-NEXT:    s_wait_bvhcnt 0x0
1597; GFX12-NEXT:    s_wait_kmcnt 0x0
1598; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, 0
1599; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1600; GFX12-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, 0
1601; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v2, v0, v[1:2]
1602; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1603; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4]
1604; GFX12-NEXT:    s_setpc_b64 s[30:31]
1605  %ext = zext i32 %arg0 to i64
1606  %mul1 = mul i64 %arg1, %ext
1607  %lsh = lshr i64 %mul1, 32
1608  %mul2 = mul i64 %lsh, s0xfffffffffffffc88
1609  %mad = add i64 %mul2, %mul1
1610  ret i64 %mad
1611}
1612
; lshr_mad_i64_negative_1 is a negative case: the shift amount is 36, not 32,
; so the shifted value is not simply the high 32-bit half of %arg0.
; The checks show the high register shifted right by 4 and then fed to a
; signed mad (v_mad_i64_i32 / v_mad_co_i64_i32) by 0xfffffc19 with the
; unmodified v[0:1] as addend, i.e. the addend's high half is not zeroed.
; The SI checks use mul_lo/mul_hi_i32 followed by add/addc.
1613define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
1614; CI-LABEL: lshr_mad_i64_negative_1:
1615; CI:       ; %bb.0:
1616; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1617; CI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
1618; CI-NEXT:    s_movk_i32 s4, 0xfc19
1619; CI-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
1620; CI-NEXT:    s_setpc_b64 s[30:31]
1621;
1622; SI-LABEL: lshr_mad_i64_negative_1:
1623; SI:       ; %bb.0:
1624; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1625; SI-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
1626; SI-NEXT:    s_movk_i32 s4, 0xfc19
1627; SI-NEXT:    v_mul_lo_u32 v3, v2, s4
1628; SI-NEXT:    v_mul_hi_i32 v2, v2, s4
1629; SI-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
1630; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1631; SI-NEXT:    s_setpc_b64 s[30:31]
1632;
1633; GFX9-LABEL: lshr_mad_i64_negative_1:
1634; GFX9:       ; %bb.0:
1635; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1636; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
1637; GFX9-NEXT:    s_movk_i32 s4, 0xfc19
1638; GFX9-NEXT:    v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
1639; GFX9-NEXT:    s_setpc_b64 s[30:31]
1640;
1641; GFX1100-LABEL: lshr_mad_i64_negative_1:
1642; GFX1100:       ; %bb.0:
1643; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1644; GFX1100-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
1645; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1646; GFX1100-NEXT:    v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1]
1647; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
1648; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1649;
1650; GFX1150-LABEL: lshr_mad_i64_negative_1:
1651; GFX1150:       ; %bb.0:
1652; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1653; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
1654; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1655; GFX1150-NEXT:    v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
1656; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1657;
1658; GFX12-LABEL: lshr_mad_i64_negative_1:
1659; GFX12:       ; %bb.0:
1660; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1661; GFX12-NEXT:    s_wait_expcnt 0x0
1662; GFX12-NEXT:    s_wait_samplecnt 0x0
1663; GFX12-NEXT:    s_wait_bvhcnt 0x0
1664; GFX12-NEXT:    s_wait_kmcnt 0x0
1665; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 4, v1
1666; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1667; GFX12-NEXT:    v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
1668; GFX12-NEXT:    s_setpc_b64 s[30:31]
1669  %lsh = lshr i64 %arg0, 36
1670  %mul = mul i64 %lsh, s0xfffffffffffffc19
1671  %mad = add i64 %mul, %arg0
1672
1673  ret i64 %mad
1674}
1675
; lshr_mad_i64_negative_2 is a negative case: the multiplier is
; 0xffffff00000000d1, whose upper 32 bits are 0xffffff00 rather than all ones,
; so the high-half product does not cancel against the high half of %arg0.
; The CI, GFX9, GFX11 and GFX12 checks keep the full v[0:1] addend in the mad
; by 0xd1 and then correct the high result register with a separate
; "shift left by 8, subtract" pair. The SI checks expand to 32-bit operations.
1676define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
1677; CI-LABEL: lshr_mad_i64_negative_2:
1678; CI:       ; %bb.0:
1679; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1680; CI-NEXT:    s_movk_i32 s4, 0xd1
1681; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
1682; CI-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
1683; CI-NEXT:    v_sub_i32_e32 v1, vcc, v3, v0
1684; CI-NEXT:    v_mov_b32_e32 v0, v2
1685; CI-NEXT:    s_setpc_b64 s[30:31]
1686;
1687; SI-LABEL: lshr_mad_i64_negative_2:
1688; SI:       ; %bb.0:
1689; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1690; SI-NEXT:    s_movk_i32 s4, 0xd1
1691; SI-NEXT:    v_mul_hi_u32 v2, v1, s4
1692; SI-NEXT:    v_mul_lo_u32 v4, v1, s4
1693; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v1
1694; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1695; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
1696; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1697; SI-NEXT:    s_setpc_b64 s[30:31]
1698;
1699; GFX9-LABEL: lshr_mad_i64_negative_2:
1700; GFX9:       ; %bb.0:
1701; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1702; GFX9-NEXT:    s_movk_i32 s4, 0xd1
1703; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
1704; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
1705; GFX9-NEXT:    v_sub_u32_e32 v1, v3, v0
1706; GFX9-NEXT:    v_mov_b32_e32 v0, v2
1707; GFX9-NEXT:    s_setpc_b64 s[30:31]
1708;
1709; GFX11-LABEL: lshr_mad_i64_negative_2:
1710; GFX11:       ; %bb.0:
1711; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1712; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
1713; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
1714; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
1715; GFX11-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
1716; GFX11-NEXT:    v_mov_b32_e32 v0, v2
1717; GFX11-NEXT:    s_setpc_b64 s[30:31]
1718;
1719; GFX12-LABEL: lshr_mad_i64_negative_2:
1720; GFX12:       ; %bb.0:
1721; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1722; GFX12-NEXT:    s_wait_expcnt 0x0
1723; GFX12-NEXT:    s_wait_samplecnt 0x0
1724; GFX12-NEXT:    s_wait_bvhcnt 0x0
1725; GFX12-NEXT:    s_wait_kmcnt 0x0
1726; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
1727; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 8, v1
1728; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
1729; GFX12-NEXT:    v_sub_nc_u32_e32 v1, v3, v0
1730; GFX12-NEXT:    v_mov_b32_e32 v0, v2
1731; GFX12-NEXT:    s_setpc_b64 s[30:31]
1732  %lsh = lshr i64 %arg0, 32
1733  %mul = mul i64 %lsh, s0xffffff00000000d1
1734  %mad = add i64 %mul, %arg0
1735
1736  ret i64 %mad
1737}
1738
; lshr_mad_i64_negative_3 is a negative case: the value added to the product
; is %op = %arg0 + 1, not the shifted value %arg0 itself, and the multiplier
; is 0xfffffffffffffc00.
; None of the checked targets emit a mad here. The checks show a 64-bit shift
; right by 22, a mask with 0xfffffc00, a 64-bit subtract from %arg0 and a
; final 64-bit add of 1.
1739define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
1740; CI-LABEL: lshr_mad_i64_negative_3:
1741; CI:       ; %bb.0:
1742; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1743; CI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
1744; CI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
1745; CI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1746; CI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
1747; CI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
1748; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1749; CI-NEXT:    s_setpc_b64 s[30:31]
1750;
1751; SI-LABEL: lshr_mad_i64_negative_3:
1752; SI:       ; %bb.0:
1753; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1754; SI-NEXT:    v_lshr_b64 v[2:3], v[0:1], 22
1755; SI-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
1756; SI-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
1757; SI-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
1758; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
1759; SI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1760; SI-NEXT:    s_setpc_b64 s[30:31]
1761;
1762; GFX9-LABEL: lshr_mad_i64_negative_3:
1763; GFX9:       ; %bb.0:
1764; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
1766; GFX9-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
1767; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
1768; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
1769; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
1770; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
1771; GFX9-NEXT:    s_setpc_b64 s[30:31]
1772;
1773; GFX11-LABEL: lshr_mad_i64_negative_3:
1774; GFX11:       ; %bb.0:
1775; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1776; GFX11-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
1777; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1778; GFX11-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
1779; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
1780; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1781; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
1782; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
1783; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1784; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1785; GFX11-NEXT:    s_setpc_b64 s[30:31]
1786;
1787; GFX12-LABEL: lshr_mad_i64_negative_3:
1788; GFX12:       ; %bb.0:
1789; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1790; GFX12-NEXT:    s_wait_expcnt 0x0
1791; GFX12-NEXT:    s_wait_samplecnt 0x0
1792; GFX12-NEXT:    s_wait_bvhcnt 0x0
1793; GFX12-NEXT:    s_wait_kmcnt 0x0
1794; GFX12-NEXT:    v_lshrrev_b64 v[2:3], 22, v[0:1]
1795; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1796; GFX12-NEXT:    v_and_b32_e32 v2, 0xfffffc00, v2
1797; GFX12-NEXT:    v_sub_co_u32 v0, vcc_lo, v0, v2
1798; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1799; GFX12-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
1800; GFX12-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 1
1801; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1802; GFX12-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
1803; GFX12-NEXT:    s_setpc_b64 s[30:31]
1804  %op = add i64 %arg0, 1
1805  %lsh = lshr i64 %arg0, 32
1806  %mul = mul i64 %lsh, s0xfffffffffffffc00
1807  %mad = add i64 %mul, %op
1808
1809  ret i64 %mad
1810}
1811
; lshr_mad_i64_negative_4 is a negative case: the multiplier is the variable
; %arg0 itself rather than a constant, giving (%arg0 >> 32) * %arg0 + %arg0.
; The checks keep the full v[0:1] addend in the first mad and need a second
; multiply for the high half: v_mul_lo_u32 plus an add on CI, a second
; 64-bit mad on GFX9, GFX11 and GFX12, and separate 32-bit multiplies on SI.
1812define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
1813; CI-LABEL: lshr_mad_i64_negative_4:
1814; CI:       ; %bb.0:
1815; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1816; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
1817; CI-NEXT:    v_mul_lo_u32 v0, v1, v1
1818; CI-NEXT:    v_add_i32_e32 v1, vcc, v0, v3
1819; CI-NEXT:    v_mov_b32_e32 v0, v2
1820; CI-NEXT:    s_setpc_b64 s[30:31]
1821;
1822; SI-LABEL: lshr_mad_i64_negative_4:
1823; SI:       ; %bb.0:
1824; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1825; SI-NEXT:    v_mul_hi_u32 v2, v1, v0
1826; SI-NEXT:    v_mul_lo_u32 v3, v1, v1
1827; SI-NEXT:    v_mul_lo_u32 v4, v1, v0
1828; SI-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1829; SI-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
1830; SI-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
1831; SI-NEXT:    s_setpc_b64 s[30:31]
1832;
1833; GFX9-LABEL: lshr_mad_i64_negative_4:
1834; GFX9:       ; %bb.0:
1835; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1836; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
1837; GFX9-NEXT:    v_mov_b32_e32 v0, v3
1838; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1]
1839; GFX9-NEXT:    v_mov_b32_e32 v0, v2
1840; GFX9-NEXT:    v_mov_b32_e32 v1, v4
1841; GFX9-NEXT:    s_setpc_b64 s[30:31]
1842;
1843; GFX1100-LABEL: lshr_mad_i64_negative_4:
1844; GFX1100:       ; %bb.0:
1845; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1846; GFX1100-NEXT:    v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
1847; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1848; GFX1100-NEXT:    v_mov_b32_e32 v0, v3
1849; GFX1100-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1]
1850; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1851; GFX1100-NEXT:    v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
1852; GFX1100-NEXT:    s_setpc_b64 s[30:31]
1853;
1854; GFX1150-LABEL: lshr_mad_i64_negative_4:
1855; GFX1150:       ; %bb.0:
1856; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1857; GFX1150-NEXT:    v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1]
1858; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1859; GFX1150-NEXT:    v_mov_b32_e32 v0, v4
1860; GFX1150-NEXT:    v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1]
1861; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1862; GFX1150-NEXT:    v_mov_b32_e32 v0, v3
1863; GFX1150-NEXT:    s_setpc_b64 s[30:31]
1864;
1865; GFX12-LABEL: lshr_mad_i64_negative_4:
1866; GFX12:       ; %bb.0:
1867; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1868; GFX12-NEXT:    s_wait_expcnt 0x0
1869; GFX12-NEXT:    s_wait_samplecnt 0x0
1870; GFX12-NEXT:    s_wait_bvhcnt 0x0
1871; GFX12-NEXT:    s_wait_kmcnt 0x0
1872; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1]
1873; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1874; GFX12-NEXT:    v_mov_b32_e32 v0, v4
1875; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1]
1876; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
1877; GFX12-NEXT:    v_mov_b32_e32 v0, v3
1878; GFX12-NEXT:    s_setpc_b64 s[30:31]
1879  %lsh = lshr i64 %arg0, 32
1880  %mul = mul i64 %lsh, %arg0
1881  %mad = add i64 %mul, %arg0
1882
1883  ret i64 %mad
1884}
1885
; lshr_mad_i64_sgpr is the scalar variant of the shift-multiply-add pattern:
; an amdgpu_ps function whose i64 argument is marked inreg (the checks read it
; from s0/s1), computing (%arg0 >> 32) * 0xffffffffffff1c18 + %arg0.
; Only the CI checks use the vector v_mad_u64_u32, with a zeroed high addend
; register and v_readfirstlane_b32 to move the result back to s0/s1.
; The SI, GFX9 and GFX11 checks use 32-bit scalar multiplies with
; s_add_u32/s_addc_u32 (SI gets the high product from v_mul_hi_u32), and the
; GFX12 checks use s_mul_u64 followed by s_add_nc_u64.
1886define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
1887; CI-LABEL: lshr_mad_i64_sgpr:
1888; CI:       ; %bb.0:
1889; CI-NEXT:    v_mov_b32_e32 v0, s0
1890; CI-NEXT:    v_mov_b32_e32 v1, 0
1891; CI-NEXT:    v_mov_b32_e32 v2, 0xffff1c18
1892; CI-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s1, v2, v[0:1]
1893; CI-NEXT:    v_readfirstlane_b32 s0, v0
1894; CI-NEXT:    v_readfirstlane_b32 s1, v1
1895; CI-NEXT:    ; return to shader part epilog
1896;
1897; SI-LABEL: lshr_mad_i64_sgpr:
1898; SI:       ; %bb.0:
1899; SI-NEXT:    v_mov_b32_e32 v0, 0xffff1c18
1900; SI-NEXT:    v_mul_hi_u32 v0, s1, v0
1901; SI-NEXT:    s_mul_i32 s2, s1, 0xffff1c18
1902; SI-NEXT:    v_readfirstlane_b32 s3, v0
1903; SI-NEXT:    s_sub_i32 s3, s3, s1
1904; SI-NEXT:    s_add_u32 s0, s2, s0
1905; SI-NEXT:    s_addc_u32 s1, s3, s1
1906; SI-NEXT:    ; return to shader part epilog
1907;
1908; GFX9-LABEL: lshr_mad_i64_sgpr:
1909; GFX9:       ; %bb.0:
1910; GFX9-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
1911; GFX9-NEXT:    s_sub_i32 s2, s2, s1
1912; GFX9-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
1913; GFX9-NEXT:    s_add_u32 s0, s3, s0
1914; GFX9-NEXT:    s_addc_u32 s1, s2, s1
1915; GFX9-NEXT:    ; return to shader part epilog
1916;
1917; GFX11-LABEL: lshr_mad_i64_sgpr:
1918; GFX11:       ; %bb.0:
1919; GFX11-NEXT:    s_mul_hi_u32 s2, s1, 0xffff1c18
1920; GFX11-NEXT:    s_mul_i32 s3, s1, 0xffff1c18
1921; GFX11-NEXT:    s_sub_i32 s2, s2, s1
1922; GFX11-NEXT:    s_add_u32 s0, s3, s0
1923; GFX11-NEXT:    s_addc_u32 s1, s2, s1
1924; GFX11-NEXT:    ; return to shader part epilog
1925;
1926; GFX12-LABEL: lshr_mad_i64_sgpr:
1927; GFX12:       ; %bb.0:
1928; GFX12-NEXT:    s_mov_b32 s4, 0xffff1c18
1929; GFX12-NEXT:    s_mov_b32 s3, 0
1930; GFX12-NEXT:    s_mov_b32 s2, s1
1931; GFX12-NEXT:    s_mov_b32 s5, -1
1932; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1933; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], s[4:5]
1934; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
1935; GFX12-NEXT:    ; return to shader part epilog
1936  %lsh = lshr i64 %arg0, 32
1937  %mul = mul i64 %lsh, s0xffffffffffff1c18
1938  %mad = add i64 %mul, %arg0
1939
1940  ret i64 %mad
1941}
1942
; lshr_mad_i64_vec is the <2 x i64> variant: each element is shifted right by
; 32, multiplied by its own constant (0xffffffffffff1c18 for element 0 and
; 0xffffffffffff1118 for element 1) and added back to the original element.
; The CI, GFX9, GFX11 and GFX12 checks show one unsigned 64-bit mad per
; element, each by the low 32 bits of that element's constant and each with
; the high register of its addend zeroed. The SI checks expand both elements
; into mul_hi/mul_lo, sub and add/addc.
1943define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 {
1944; CI-LABEL: lshr_mad_i64_vec:
1945; CI:       ; %bb.0:
1946; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1947; CI-NEXT:    v_mov_b32_e32 v6, v3
1948; CI-NEXT:    v_mov_b32_e32 v3, v1
1949; CI-NEXT:    v_mov_b32_e32 v1, 0
1950; CI-NEXT:    s_mov_b32 s4, 0xffff1c18
1951; CI-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
1952; CI-NEXT:    v_mov_b32_e32 v3, v1
1953; CI-NEXT:    s_mov_b32 s4, 0xffff1118
1954; CI-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
1955; CI-NEXT:    v_mov_b32_e32 v0, v4
1956; CI-NEXT:    v_mov_b32_e32 v1, v5
1957; CI-NEXT:    s_setpc_b64 s[30:31]
1958;
1959; SI-LABEL: lshr_mad_i64_vec:
1960; SI:       ; %bb.0:
1961; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1962; SI-NEXT:    s_mov_b32 s4, 0xffff1118
1963; SI-NEXT:    v_mul_lo_u32 v4, v3, s4
1964; SI-NEXT:    v_mul_hi_u32 v5, v3, s4
1965; SI-NEXT:    s_mov_b32 s4, 0xffff1c18
1966; SI-NEXT:    v_mul_hi_u32 v6, v1, s4
1967; SI-NEXT:    v_mul_lo_u32 v7, v1, s4
1968; SI-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
1969; SI-NEXT:    v_sub_i32_e32 v6, vcc, v6, v1
1970; SI-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
1971; SI-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
1972; SI-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1973; SI-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
1974; SI-NEXT:    s_setpc_b64 s[30:31]
1975;
1976; GFX9-LABEL: lshr_mad_i64_vec:
1977; GFX9:       ; %bb.0:
1978; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1979; GFX9-NEXT:    v_mov_b32_e32 v6, v3
1980; GFX9-NEXT:    v_mov_b32_e32 v3, v1
1981; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1982; GFX9-NEXT:    s_mov_b32 s4, 0xffff1c18
1983; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v3, s4, v[0:1]
1984; GFX9-NEXT:    v_mov_b32_e32 v3, v1
1985; GFX9-NEXT:    s_mov_b32 s4, 0xffff1118
1986; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, s4, v[2:3]
1987; GFX9-NEXT:    v_mov_b32_e32 v0, v4
1988; GFX9-NEXT:    v_mov_b32_e32 v1, v5
1989; GFX9-NEXT:    s_setpc_b64 s[30:31]
1990;
1991; GFX1100-LABEL: lshr_mad_i64_vec:
1992; GFX1100:       ; %bb.0:
1993; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1994; GFX1100-NEXT:    v_mov_b32_e32 v8, v3
1995; GFX1100-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v1, 0
1996; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1997; GFX1100-NEXT:    v_mad_u64_u32 v[4:5], null, 0xffff1c18, v6, v[0:1]
1998; GFX1100-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, v4
1999; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2000; GFX1100-NEXT:    v_mad_u64_u32 v[6:7], null, 0xffff1118, v8, v[2:3]
2001; GFX1100-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v2, v6
2002; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2003; GFX1100-NEXT:    v_mov_b32_e32 v3, v7
2004; GFX1100-NEXT:    s_setpc_b64 s[30:31]
2005;
2006; GFX1150-LABEL: lshr_mad_i64_vec:
2007; GFX1150:       ; %bb.0:
2008; GFX1150-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2009; GFX1150-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
2010; GFX1150-NEXT:    v_mov_b32_e32 v1, 0
2011; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2012; GFX1150-NEXT:    v_mov_b32_e32 v3, v1
2013; GFX1150-NEXT:    v_mad_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
2014; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2015; GFX1150-NEXT:    v_mad_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
2016; GFX1150-NEXT:    s_setpc_b64 s[30:31]
2017;
2018; GFX12-LABEL: lshr_mad_i64_vec:
2019; GFX12:       ; %bb.0:
2020; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2021; GFX12-NEXT:    s_wait_expcnt 0x0
2022; GFX12-NEXT:    s_wait_samplecnt 0x0
2023; GFX12-NEXT:    s_wait_bvhcnt 0x0
2024; GFX12-NEXT:    s_wait_kmcnt 0x0
2025; GFX12-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v5, v1
2026; GFX12-NEXT:    v_mov_b32_e32 v1, 0
2027; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
2028; GFX12-NEXT:    v_mov_b32_e32 v3, v1
2029; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, 0xffff1c18, v5, v[0:1]
2030; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2031; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0xffff1118, v4, v[2:3]
2032; GFX12-NEXT:    s_setpc_b64 s[30:31]
2033  %lsh = lshr <2 x i64> %arg0, <i64 32, i64 32>
2034  %mul = mul <2 x i64> %lsh, <i64 s0xffffffffffff1c18, i64 s0xffffffffffff1118>
2035  %mad = add <2 x i64> %mul, %arg0
2036
2037  ret <2 x i64> %mad
2038}
2039
; Attribute groups referenced by the definitions in this file.
; #0 (nounwind) is the group attached to the lshr_mad_i64_* functions above.
; #1 is not referenced by any of those functions; presumably it is used
; earlier in the file -- TODO confirm before removing.
2040attributes #0 = { nounwind }
2041attributes #1 = { nounwind readnone speculatable }
2042