xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX7 %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
7; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
8; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s
9
; i16 multiply in an amdgpu_ps function whose two operands are passed inreg.
; The checks show the operands in s0/s1 and the product formed by a single
; s_mul_i32. Every target except GFX7 first masks both operands with 0xffff;
; the GFX12 checks additionally expect an s_delay_alu before the multiply.
10define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
11; GFX7-LABEL: s_mul_i16:
12; GFX7:       ; %bb.0:
13; GFX7-NEXT:    s_mul_i32 s0, s0, s1
14; GFX7-NEXT:    ; return to shader part epilog
15;
16; GFX8-LABEL: s_mul_i16:
17; GFX8:       ; %bb.0:
18; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
19; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
20; GFX8-NEXT:    s_mul_i32 s0, s0, s1
21; GFX8-NEXT:    ; return to shader part epilog
22;
23; GFX9-LABEL: s_mul_i16:
24; GFX9:       ; %bb.0:
25; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
26; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
27; GFX9-NEXT:    s_mul_i32 s0, s0, s1
28; GFX9-NEXT:    ; return to shader part epilog
29;
30; GFX10PLUS-LABEL: s_mul_i16:
31; GFX10PLUS:       ; %bb.0:
32; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
33; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
34; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
35; GFX10PLUS-NEXT:    ; return to shader part epilog
36;
37; GFX12-LABEL: s_mul_i16:
38; GFX12:       ; %bb.0:
39; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
40; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
41; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
42; GFX12-NEXT:    s_mul_i32 s0, s0, s1
43; GFX12-NEXT:    ; return to shader part epilog
44  %result = mul i16 %num, %den
45  ret i16 %result
46}
47
; i16 multiply in a function with no explicit calling convention; the checks
; show the operands in v0/v1 and a return through s_setpc_b64.
; GFX7 masks both operands with 0xffff and multiplies with v_mul_u32_u24,
; while the later targets use v_mul_lo_u16 directly. The true16 GFX11 checks
; expect the multiply on the 16-bit halves v0.l/v1.l.
48define i16 @v_mul_i16(i16 %num, i16 %den) {
49; GFX7-LABEL: v_mul_i16:
50; GFX7:       ; %bb.0:
51; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
52; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
53; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
54; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
55; GFX7-NEXT:    s_setpc_b64 s[30:31]
56;
57; GFX8-LABEL: v_mul_i16:
58; GFX8:       ; %bb.0:
59; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
60; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
61; GFX8-NEXT:    s_setpc_b64 s[30:31]
62;
63; GFX9-LABEL: v_mul_i16:
64; GFX9:       ; %bb.0:
65; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
66; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
67; GFX9-NEXT:    s_setpc_b64 s[30:31]
68;
69; GFX10-LABEL: v_mul_i16:
70; GFX10:       ; %bb.0:
71; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
72; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
73; GFX10-NEXT:    s_setpc_b64 s[30:31]
74;
75; GFX11-TRUE16-LABEL: v_mul_i16:
76; GFX11-TRUE16:       ; %bb.0:
77; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
79; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
80;
81; GFX11-FAKE16-LABEL: v_mul_i16:
82; GFX11-FAKE16:       ; %bb.0:
83; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
84; GFX11-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
85; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
86;
87; GFX12-LABEL: v_mul_i16:
88; GFX12:       ; %bb.0:
89; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
90; GFX12-NEXT:    s_wait_expcnt 0x0
91; GFX12-NEXT:    s_wait_samplecnt 0x0
92; GFX12-NEXT:    s_wait_bvhcnt 0x0
93; GFX12-NEXT:    s_wait_kmcnt 0x0
94; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
95; GFX12-NEXT:    s_setpc_b64 s[30:31]
96  %result = mul i16 %num, %den
97  ret i16 %result
98}
99
; i16 multiply in an amdgpu_ps function where both inreg operands and the
; return value carry the zeroext attribute.
; All targets mask the product with 0xffff after s_mul_i32. GFX7 does not
; mask the inputs beforehand, whereas the other targets still do.
100define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
101; GFX7-LABEL: s_mul_i16_zeroext:
102; GFX7:       ; %bb.0:
103; GFX7-NEXT:    s_mul_i32 s0, s0, s1
104; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
105; GFX7-NEXT:    ; return to shader part epilog
106;
107; GFX8-LABEL: s_mul_i16_zeroext:
108; GFX8:       ; %bb.0:
109; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
110; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
111; GFX8-NEXT:    s_mul_i32 s0, s0, s1
112; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
113; GFX8-NEXT:    ; return to shader part epilog
114;
115; GFX9-LABEL: s_mul_i16_zeroext:
116; GFX9:       ; %bb.0:
117; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
118; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
119; GFX9-NEXT:    s_mul_i32 s0, s0, s1
120; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
121; GFX9-NEXT:    ; return to shader part epilog
122;
123; GFX10PLUS-LABEL: s_mul_i16_zeroext:
124; GFX10PLUS:       ; %bb.0:
125; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
126; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
127; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
128; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
129; GFX10PLUS-NEXT:    ; return to shader part epilog
130;
131; GFX12-LABEL: s_mul_i16_zeroext:
132; GFX12:       ; %bb.0:
133; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
134; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
135; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
136; GFX12-NEXT:    s_mul_i32 s0, s0, s1
137; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
138; GFX12-NEXT:    ; return to shader part epilog
139  %result = mul i16 %num, %den
140  ret i16 %result
141}
142
; i16 multiply with zeroext operands and a zeroext return value, in a function
; with no explicit calling convention (operands in v0/v1 per the checks).
; GFX7 multiplies with v_mul_u32_u24 and then masks the product with 0xffff.
; GFX8 and GFX9 expect only v_mul_lo_u16 with no trailing mask, while GFX10
; and later expect v_mul_lo_u16 followed by a 0xffff mask.
143define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
144; GFX7-LABEL: v_mul_i16_zeroext:
145; GFX7:       ; %bb.0:
146; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
147; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
148; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
149; GFX7-NEXT:    s_setpc_b64 s[30:31]
150;
151; GFX8-LABEL: v_mul_i16_zeroext:
152; GFX8:       ; %bb.0:
153; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
155; GFX8-NEXT:    s_setpc_b64 s[30:31]
156;
157; GFX9-LABEL: v_mul_i16_zeroext:
158; GFX9:       ; %bb.0:
159; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
161; GFX9-NEXT:    s_setpc_b64 s[30:31]
162;
163; GFX10-LABEL: v_mul_i16_zeroext:
164; GFX10:       ; %bb.0:
165; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
166; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
167; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
168; GFX10-NEXT:    s_setpc_b64 s[30:31]
169;
170; GFX11-TRUE16-LABEL: v_mul_i16_zeroext:
171; GFX11-TRUE16:       ; %bb.0:
172; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
174; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
175; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
176;
177; GFX11-FAKE16-LABEL: v_mul_i16_zeroext:
178; GFX11-FAKE16:       ; %bb.0:
179; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; GFX11-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
181; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
182; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
183;
184; GFX12-LABEL: v_mul_i16_zeroext:
185; GFX12:       ; %bb.0:
186; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
187; GFX12-NEXT:    s_wait_expcnt 0x0
188; GFX12-NEXT:    s_wait_samplecnt 0x0
189; GFX12-NEXT:    s_wait_bvhcnt 0x0
190; GFX12-NEXT:    s_wait_kmcnt 0x0
191; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
192; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
193; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
194; GFX12-NEXT:    s_setpc_b64 s[30:31]
195  %result = mul i16 %num, %den
196  ret i16 %result
197}
198
; i16 multiply in an amdgpu_ps function where both inreg operands and the
; return value carry the signext attribute.
; All targets sign-extend the product with s_sext_i32_i16 after s_mul_i32.
; GFX7 multiplies the inputs as they arrive; the other targets mask both
; inputs with 0xffff first.
199define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
200; GFX7-LABEL: s_mul_i16_signext:
201; GFX7:       ; %bb.0:
202; GFX7-NEXT:    s_mul_i32 s0, s0, s1
203; GFX7-NEXT:    s_sext_i32_i16 s0, s0
204; GFX7-NEXT:    ; return to shader part epilog
205;
206; GFX8-LABEL: s_mul_i16_signext:
207; GFX8:       ; %bb.0:
208; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
209; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
210; GFX8-NEXT:    s_mul_i32 s0, s0, s1
211; GFX8-NEXT:    s_sext_i32_i16 s0, s0
212; GFX8-NEXT:    ; return to shader part epilog
213;
214; GFX9-LABEL: s_mul_i16_signext:
215; GFX9:       ; %bb.0:
216; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
217; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
218; GFX9-NEXT:    s_mul_i32 s0, s0, s1
219; GFX9-NEXT:    s_sext_i32_i16 s0, s0
220; GFX9-NEXT:    ; return to shader part epilog
221;
222; GFX10PLUS-LABEL: s_mul_i16_signext:
223; GFX10PLUS:       ; %bb.0:
224; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
225; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
226; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
227; GFX10PLUS-NEXT:    s_sext_i32_i16 s0, s0
228; GFX10PLUS-NEXT:    ; return to shader part epilog
229;
230; GFX12-LABEL: s_mul_i16_signext:
231; GFX12:       ; %bb.0:
232; GFX12-NEXT:    s_and_b32 s0, s0, 0xffff
233; GFX12-NEXT:    s_and_b32 s1, s1, 0xffff
234; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
235; GFX12-NEXT:    s_mul_i32 s0, s0, s1
236; GFX12-NEXT:    s_sext_i32_i16 s0, s0
237; GFX12-NEXT:    ; return to shader part epilog
238  %result = mul i16 %num, %den
239  ret i16 %result
240}
241
; i16 multiply with signext operands and a signext return value, in a function
; with no explicit calling convention (operands in v0/v1 per the checks).
; Every target sign-extends the low 16 bits of the product with
; v_bfe_i32 v0, v0, 0, 16. GFX7 masks both inputs and multiplies with
; v_mul_u32_u24; the later targets multiply with v_mul_lo_u16.
242define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
243; GFX7-LABEL: v_mul_i16_signext:
244; GFX7:       ; %bb.0:
245; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
247; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff, v1
248; GFX7-NEXT:    v_mul_u32_u24_e32 v0, v0, v1
249; GFX7-NEXT:    v_bfe_i32 v0, v0, 0, 16
250; GFX7-NEXT:    s_setpc_b64 s[30:31]
251;
252; GFX8-LABEL: v_mul_i16_signext:
253; GFX8:       ; %bb.0:
254; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
255; GFX8-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
256; GFX8-NEXT:    v_bfe_i32 v0, v0, 0, 16
257; GFX8-NEXT:    s_setpc_b64 s[30:31]
258;
259; GFX9-LABEL: v_mul_i16_signext:
260; GFX9:       ; %bb.0:
261; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v1
263; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 16
264; GFX9-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX10-LABEL: v_mul_i16_signext:
267; GFX10:       ; %bb.0:
268; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX10-NEXT:    v_mul_lo_u16 v0, v0, v1
270; GFX10-NEXT:    v_bfe_i32 v0, v0, 0, 16
271; GFX10-NEXT:    s_setpc_b64 s[30:31]
272;
273; GFX11-TRUE16-LABEL: v_mul_i16_signext:
274; GFX11-TRUE16:       ; %bb.0:
275; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
276; GFX11-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v0.l, v1.l
277; GFX11-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
278; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
279;
280; GFX11-FAKE16-LABEL: v_mul_i16_signext:
281; GFX11-FAKE16:       ; %bb.0:
282; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
283; GFX11-FAKE16-NEXT:    v_mul_lo_u16 v0, v0, v1
284; GFX11-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
285; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
286;
287; GFX12-LABEL: v_mul_i16_signext:
288; GFX12:       ; %bb.0:
289; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
290; GFX12-NEXT:    s_wait_expcnt 0x0
291; GFX12-NEXT:    s_wait_samplecnt 0x0
292; GFX12-NEXT:    s_wait_bvhcnt 0x0
293; GFX12-NEXT:    s_wait_kmcnt 0x0
294; GFX12-NEXT:    v_mul_lo_u16 v0, v0, v1
295; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
296; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 16
297; GFX12-NEXT:    s_setpc_b64 s[30:31]
298  %result = mul i16 %num, %den
299  ret i16 %result
300}
301
; i32 multiply in an amdgpu_ps function with inreg operands.
; Every checked target expects exactly one s_mul_i32 on s0/s1.
302define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
303; GCN-LABEL: s_mul_i32:
304; GCN:       ; %bb.0:
305; GCN-NEXT:    s_mul_i32 s0, s0, s1
306; GCN-NEXT:    ; return to shader part epilog
307;
308; GFX10PLUS-LABEL: s_mul_i32:
309; GFX10PLUS:       ; %bb.0:
310; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s1
311; GFX10PLUS-NEXT:    ; return to shader part epilog
312;
313; GFX12-LABEL: s_mul_i32:
314; GFX12:       ; %bb.0:
315; GFX12-NEXT:    s_mul_i32 s0, s0, s1
316; GFX12-NEXT:    ; return to shader part epilog
317  %result = mul i32 %num, %den
318  ret i32 %result
319}
320
; i32 multiply in a function with no explicit calling convention.
; Every checked target expects a single v_mul_lo_u32 on v0/v1; only the
; wait-counter preamble differs for GFX12.
321define i32 @v_mul_i32(i32 %num, i32 %den) {
322; GCN-LABEL: v_mul_i32:
323; GCN:       ; %bb.0:
324; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325; GCN-NEXT:    v_mul_lo_u32 v0, v0, v1
326; GCN-NEXT:    s_setpc_b64 s[30:31]
327;
328; GFX10PLUS-LABEL: v_mul_i32:
329; GFX10PLUS:       ; %bb.0:
330; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
331; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v1
332; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
333;
334; GFX12-LABEL: v_mul_i32:
335; GFX12:       ; %bb.0:
336; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
337; GFX12-NEXT:    s_wait_expcnt 0x0
338; GFX12-NEXT:    s_wait_samplecnt 0x0
339; GFX12-NEXT:    s_wait_bvhcnt 0x0
340; GFX12-NEXT:    s_wait_kmcnt 0x0
341; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v1
342; GFX12-NEXT:    s_setpc_b64 s[30:31]
343  %result = mul i32 %num, %den
344  ret i32 %result
345}
346
; <2 x i32> multiply in an amdgpu_ps function with inreg operands.
; Every checked target expects one s_mul_i32 per element, pairing s0 with s2
; and s1 with s3.
347define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
348; GCN-LABEL: s_mul_v2i32:
349; GCN:       ; %bb.0:
350; GCN-NEXT:    s_mul_i32 s0, s0, s2
351; GCN-NEXT:    s_mul_i32 s1, s1, s3
352; GCN-NEXT:    ; return to shader part epilog
353;
354; GFX10PLUS-LABEL: s_mul_v2i32:
355; GFX10PLUS:       ; %bb.0:
356; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
357; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s3
358; GFX10PLUS-NEXT:    ; return to shader part epilog
359;
360; GFX12-LABEL: s_mul_v2i32:
361; GFX12:       ; %bb.0:
362; GFX12-NEXT:    s_mul_i32 s0, s0, s2
363; GFX12-NEXT:    s_mul_i32 s1, s1, s3
364; GFX12-NEXT:    ; return to shader part epilog
365  %result = mul <2 x i32> %num, %den
366  ret <2 x i32> %result
367}
368
; <2 x i32> multiply in a function with no explicit calling convention.
; Every checked target expects one v_mul_lo_u32 per element, pairing v0 with
; v2 and v1 with v3.
369define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
370; GCN-LABEL: v_mul_v2i32:
371; GCN:       ; %bb.0:
372; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
373; GCN-NEXT:    v_mul_lo_u32 v0, v0, v2
374; GCN-NEXT:    v_mul_lo_u32 v1, v1, v3
375; GCN-NEXT:    s_setpc_b64 s[30:31]
376;
377; GFX10PLUS-LABEL: v_mul_v2i32:
378; GFX10PLUS:       ; %bb.0:
379; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
380; GFX10PLUS-NEXT:    v_mul_lo_u32 v0, v0, v2
381; GFX10PLUS-NEXT:    v_mul_lo_u32 v1, v1, v3
382; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
383;
384; GFX12-LABEL: v_mul_v2i32:
385; GFX12:       ; %bb.0:
386; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
387; GFX12-NEXT:    s_wait_expcnt 0x0
388; GFX12-NEXT:    s_wait_samplecnt 0x0
389; GFX12-NEXT:    s_wait_bvhcnt 0x0
390; GFX12-NEXT:    s_wait_kmcnt 0x0
391; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
392; GFX12-NEXT:    v_mul_lo_u32 v1, v1, v3
393; GFX12-NEXT:    s_setpc_b64 s[30:31]
394  %result = mul <2 x i32> %num, %den
395  ret <2 x i32> %result
396}
397
; i33 multiply with inreg operands; this function uses the amdgpu_cs calling
; convention. The checks show each operand occupying a register pair
; (s[0:1] and s[2:3]).
; GFX7 and GFX8 obtain the high half of the low-word product through
; v_mul_hi_u32 plus v_readfirstlane_b32; GFX9 through GFX11 use s_mul_hi_u32;
; GFX12 expects a single s_mul_u64.
398define amdgpu_cs i33 @s_mul_i33(i33 inreg %num,  i33 inreg %den) {
399; GFX7-LABEL: s_mul_i33:
400; GFX7:       ; %bb.0:
401; GFX7-NEXT:    v_mov_b32_e32 v0, s2
402; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
403; GFX7-NEXT:    s_mul_i32 s4, s0, s2
404; GFX7-NEXT:    s_mul_i32 s0, s0, s3
405; GFX7-NEXT:    s_mul_i32 s1, s1, s2
406; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
407; GFX7-NEXT:    s_add_u32 s0, s0, s5
408; GFX7-NEXT:    s_add_u32 s1, s1, s0
409; GFX7-NEXT:    s_mov_b32 s0, s4
410; GFX7-NEXT:    ; return to shader part epilog
411;
412; GFX8-LABEL: s_mul_i33:
413; GFX8:       ; %bb.0:
414; GFX8-NEXT:    v_mov_b32_e32 v0, s2
415; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
416; GFX8-NEXT:    s_mul_i32 s4, s0, s2
417; GFX8-NEXT:    s_mul_i32 s0, s0, s3
418; GFX8-NEXT:    s_mul_i32 s1, s1, s2
419; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
420; GFX8-NEXT:    s_add_u32 s0, s0, s5
421; GFX8-NEXT:    s_add_u32 s1, s1, s0
422; GFX8-NEXT:    s_mov_b32 s0, s4
423; GFX8-NEXT:    ; return to shader part epilog
424;
425; GFX9-LABEL: s_mul_i33:
426; GFX9:       ; %bb.0:
427; GFX9-NEXT:    s_mul_i32 s4, s0, s2
428; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s2
429; GFX9-NEXT:    s_mul_i32 s0, s0, s3
430; GFX9-NEXT:    s_add_u32 s0, s0, s5
431; GFX9-NEXT:    s_mul_i32 s1, s1, s2
432; GFX9-NEXT:    s_add_u32 s1, s1, s0
433; GFX9-NEXT:    s_mov_b32 s0, s4
434; GFX9-NEXT:    ; return to shader part epilog
435;
436; GFX10PLUS-LABEL: s_mul_i33:
437; GFX10PLUS:       ; %bb.0:
438; GFX10PLUS-NEXT:    s_mul_hi_u32 s4, s0, s2
439; GFX10PLUS-NEXT:    s_mul_i32 s3, s0, s3
440; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s2
441; GFX10PLUS-NEXT:    s_add_i32 s3, s4, s3
442; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
443; GFX10PLUS-NEXT:    s_add_i32 s1, s3, s1
444; GFX10PLUS-NEXT:    ; return to shader part epilog
445;
446; GFX12-LABEL: s_mul_i33:
447; GFX12:       ; %bb.0:
448; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
449; GFX12-NEXT:    ; return to shader part epilog
450  %result = mul i33 %num, %den
451  ret i33 %result
452}
453
; i64 multiply in an amdgpu_ps function with inreg operands, which the checks
; show in the register pairs s[0:1] and s[2:3].
; GFX7 and GFX8 obtain the high half of the low-word product through
; v_mul_hi_u32 plus v_readfirstlane_b32; GFX9 through GFX11 use s_mul_hi_u32;
; GFX12 expects a single s_mul_u64.
454define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
455; GFX7-LABEL: s_mul_i64:
456; GFX7:       ; %bb.0:
457; GFX7-NEXT:    v_mov_b32_e32 v0, s2
458; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
459; GFX7-NEXT:    s_mul_i32 s4, s0, s2
460; GFX7-NEXT:    s_mul_i32 s0, s0, s3
461; GFX7-NEXT:    s_mul_i32 s1, s1, s2
462; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
463; GFX7-NEXT:    s_add_u32 s0, s0, s5
464; GFX7-NEXT:    s_add_u32 s1, s1, s0
465; GFX7-NEXT:    s_mov_b32 s0, s4
466; GFX7-NEXT:    ; return to shader part epilog
467;
468; GFX8-LABEL: s_mul_i64:
469; GFX8:       ; %bb.0:
470; GFX8-NEXT:    v_mov_b32_e32 v0, s2
471; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
472; GFX8-NEXT:    s_mul_i32 s4, s0, s2
473; GFX8-NEXT:    s_mul_i32 s0, s0, s3
474; GFX8-NEXT:    s_mul_i32 s1, s1, s2
475; GFX8-NEXT:    v_readfirstlane_b32 s5, v0
476; GFX8-NEXT:    s_add_u32 s0, s0, s5
477; GFX8-NEXT:    s_add_u32 s1, s1, s0
478; GFX8-NEXT:    s_mov_b32 s0, s4
479; GFX8-NEXT:    ; return to shader part epilog
480;
481; GFX9-LABEL: s_mul_i64:
482; GFX9:       ; %bb.0:
483; GFX9-NEXT:    s_mul_i32 s4, s0, s2
484; GFX9-NEXT:    s_mul_hi_u32 s5, s0, s2
485; GFX9-NEXT:    s_mul_i32 s0, s0, s3
486; GFX9-NEXT:    s_add_u32 s0, s0, s5
487; GFX9-NEXT:    s_mul_i32 s1, s1, s2
488; GFX9-NEXT:    s_add_u32 s1, s1, s0
489; GFX9-NEXT:    s_mov_b32 s0, s4
490; GFX9-NEXT:    ; return to shader part epilog
491;
492; GFX10PLUS-LABEL: s_mul_i64:
493; GFX10PLUS:       ; %bb.0:
494; GFX10PLUS-NEXT:    s_mul_hi_u32 s4, s0, s2
495; GFX10PLUS-NEXT:    s_mul_i32 s3, s0, s3
496; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s2
497; GFX10PLUS-NEXT:    s_add_i32 s3, s4, s3
498; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s2
499; GFX10PLUS-NEXT:    s_add_i32 s1, s3, s1
500; GFX10PLUS-NEXT:    ; return to shader part epilog
501;
502; GFX12-LABEL: s_mul_i64:
503; GFX12:       ; %bb.0:
504; GFX12-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
505; GFX12-NEXT:    ; return to shader part epilog
506  %result = mul i64 %num, %den
507  ret i64 %result
508}
509
; i64 multiply in a function with no explicit calling convention; the checks
; show the operands in v[0:1] and v[2:3].
; GFX7 through GFX11 expect a chain of three v_mad_u64_u32 instructions
; (GFX11 writes the unused carry-out to null). GFX12 expects v_mul_hi_u32 and
; v_mul_lo_u32 combined with two v_mad_co_u64_u32 instructions.
510define i64 @v_mul_i64(i64 %num, i64 %den) {
511; GCN-LABEL: v_mul_i64:
512; GCN:       ; %bb.0:
513; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514; GCN-NEXT:    v_mov_b32_e32 v4, v0
515; GCN-NEXT:    v_mov_b32_e32 v5, v1
516; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0
517; GCN-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2]
518; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4]
519; GCN-NEXT:    s_setpc_b64 s[30:31]
520;
521; GFX10-LABEL: v_mul_i64:
522; GFX10:       ; %bb.0:
523; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
524; GFX10-NEXT:    v_mov_b32_e32 v4, v0
525; GFX10-NEXT:    v_mov_b32_e32 v5, v1
526; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v2, 0
527; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2]
528; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4]
529; GFX10-NEXT:    s_setpc_b64 s[30:31]
530;
531; GFX11-LABEL: v_mul_i64:
532; GFX11:       ; %bb.0:
533; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX11-NEXT:    v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2
535; GFX11-NEXT:    v_mov_b32_e32 v6, v1
536; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v5, 0
537; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
538; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8]
539; GFX11-NEXT:    s_setpc_b64 s[30:31]
540;
541; GFX12-LABEL: v_mul_i64:
542; GFX12:       ; %bb.0:
543; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
544; GFX12-NEXT:    s_wait_expcnt 0x0
545; GFX12-NEXT:    s_wait_samplecnt 0x0
546; GFX12-NEXT:    s_wait_bvhcnt 0x0
547; GFX12-NEXT:    s_wait_kmcnt 0x0
548; GFX12-NEXT:    v_mul_hi_u32 v4, v0, v2
549; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
550; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5]
551; GFX12-NEXT:    v_mul_lo_u32 v0, v0, v2
552; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
553; GFX12-NEXT:    s_setpc_b64 s[30:31]
554  %result = mul i64 %num, %den
555  ret i64 %result
556}
557
; i96 multiply in an amdgpu_ps function with inreg operands; the product is
; bitcast to <3 x i32> before being returned. The checks show the operands in
; s0-s2 and s3-s5 and the result in s0-s2.
; GFX7 and GFX8 compute the needed high halves with v_mul_hi_u32 and move them
; back with v_readfirstlane_b32; GFX9 and later use s_mul_hi_u32. Carries are
; propagated with s_addc_u32 (s_add_co_ci_u32 in the GFX12 checks).
558define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
559; GFX7-LABEL: s_mul_i96:
560; GFX7:       ; %bb.0:
561; GFX7-NEXT:    v_mov_b32_e32 v0, s3
562; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
563; GFX7-NEXT:    v_mov_b32_e32 v1, s4
564; GFX7-NEXT:    v_mul_hi_u32 v1, s0, v1
565; GFX7-NEXT:    s_mul_i32 s5, s0, s5
566; GFX7-NEXT:    v_readfirstlane_b32 s7, v0
567; GFX7-NEXT:    s_mul_i32 s8, s1, s4
568; GFX7-NEXT:    v_mov_b32_e32 v0, s1
569; GFX7-NEXT:    s_add_u32 s5, s8, s5
570; GFX7-NEXT:    s_mul_i32 s2, s2, s3
571; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s3
572; GFX7-NEXT:    s_mul_i32 s6, s0, s3
573; GFX7-NEXT:    s_add_u32 s2, s2, s5
574; GFX7-NEXT:    s_mul_i32 s0, s0, s4
575; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
576; GFX7-NEXT:    s_add_u32 s0, s0, s7
577; GFX7-NEXT:    s_addc_u32 s2, s4, s2
578; GFX7-NEXT:    s_mul_i32 s1, s1, s3
579; GFX7-NEXT:    v_readfirstlane_b32 s3, v0
580; GFX7-NEXT:    s_add_u32 s1, s1, s0
581; GFX7-NEXT:    s_addc_u32 s2, s3, s2
582; GFX7-NEXT:    s_mov_b32 s0, s6
583; GFX7-NEXT:    ; return to shader part epilog
584;
585; GFX8-LABEL: s_mul_i96:
586; GFX8:       ; %bb.0:
587; GFX8-NEXT:    v_mov_b32_e32 v0, s3
588; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
589; GFX8-NEXT:    v_mov_b32_e32 v1, s4
590; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
591; GFX8-NEXT:    s_mul_i32 s5, s0, s5
592; GFX8-NEXT:    v_readfirstlane_b32 s7, v0
593; GFX8-NEXT:    s_mul_i32 s8, s1, s4
594; GFX8-NEXT:    v_mov_b32_e32 v0, s1
595; GFX8-NEXT:    s_add_u32 s5, s8, s5
596; GFX8-NEXT:    s_mul_i32 s2, s2, s3
597; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s3
598; GFX8-NEXT:    s_mul_i32 s6, s0, s3
599; GFX8-NEXT:    s_add_u32 s2, s2, s5
600; GFX8-NEXT:    s_mul_i32 s0, s0, s4
601; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
602; GFX8-NEXT:    s_add_u32 s0, s0, s7
603; GFX8-NEXT:    s_addc_u32 s2, s4, s2
604; GFX8-NEXT:    s_mul_i32 s1, s1, s3
605; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
606; GFX8-NEXT:    s_add_u32 s1, s1, s0
607; GFX8-NEXT:    s_addc_u32 s2, s3, s2
608; GFX8-NEXT:    s_mov_b32 s0, s6
609; GFX8-NEXT:    ; return to shader part epilog
610;
611; GFX9-LABEL: s_mul_i96:
612; GFX9:       ; %bb.0:
613; GFX9-NEXT:    s_mul_i32 s5, s0, s5
614; GFX9-NEXT:    s_mul_i32 s8, s1, s4
615; GFX9-NEXT:    s_add_u32 s5, s8, s5
616; GFX9-NEXT:    s_mul_i32 s2, s2, s3
617; GFX9-NEXT:    s_mul_hi_u32 s7, s0, s3
618; GFX9-NEXT:    s_add_u32 s2, s2, s5
619; GFX9-NEXT:    s_mul_i32 s5, s0, s4
620; GFX9-NEXT:    s_mul_i32 s6, s0, s3
621; GFX9-NEXT:    s_mul_hi_u32 s0, s0, s4
622; GFX9-NEXT:    s_add_u32 s4, s5, s7
623; GFX9-NEXT:    s_addc_u32 s0, s0, s2
624; GFX9-NEXT:    s_mul_i32 s2, s1, s3
625; GFX9-NEXT:    s_mul_hi_u32 s3, s1, s3
626; GFX9-NEXT:    s_add_u32 s1, s2, s4
627; GFX9-NEXT:    s_addc_u32 s2, s3, s0
628; GFX9-NEXT:    s_mov_b32 s0, s6
629; GFX9-NEXT:    ; return to shader part epilog
630;
631; GFX10PLUS-LABEL: s_mul_i96:
632; GFX10PLUS:       ; %bb.0:
633; GFX10PLUS-NEXT:    s_mul_i32 s6, s0, s5
634; GFX10PLUS-NEXT:    s_mul_i32 s7, s1, s4
635; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s3
636; GFX10PLUS-NEXT:    s_add_i32 s6, s6, s7
637; GFX10PLUS-NEXT:    s_mul_hi_u32 s7, s0, s3
638; GFX10PLUS-NEXT:    s_add_i32 s6, s6, s2
639; GFX10PLUS-NEXT:    s_mul_i32 s2, s0, s4
640; GFX10PLUS-NEXT:    s_mul_i32 s5, s0, s3
641; GFX10PLUS-NEXT:    s_mul_hi_u32 s0, s0, s4
642; GFX10PLUS-NEXT:    s_add_u32 s2, s2, s7
643; GFX10PLUS-NEXT:    s_mul_i32 s4, s1, s3
644; GFX10PLUS-NEXT:    s_addc_u32 s0, s0, s6
645; GFX10PLUS-NEXT:    s_mul_hi_u32 s3, s1, s3
646; GFX10PLUS-NEXT:    s_add_u32 s1, s4, s2
647; GFX10PLUS-NEXT:    s_addc_u32 s2, s3, s0
648; GFX10PLUS-NEXT:    s_mov_b32 s0, s5
649; GFX10PLUS-NEXT:    ; return to shader part epilog
650;
651; GFX12-LABEL: s_mul_i96:
652; GFX12:       ; %bb.0:
653; GFX12-NEXT:    s_mul_i32 s6, s0, s5
654; GFX12-NEXT:    s_mul_i32 s7, s1, s4
655; GFX12-NEXT:    s_mul_i32 s2, s2, s3
656; GFX12-NEXT:    s_add_co_i32 s6, s6, s7
657; GFX12-NEXT:    s_mul_hi_u32 s7, s0, s3
658; GFX12-NEXT:    s_add_co_i32 s6, s6, s2
659; GFX12-NEXT:    s_mul_i32 s2, s0, s4
660; GFX12-NEXT:    s_mul_i32 s5, s0, s3
661; GFX12-NEXT:    s_mul_hi_u32 s0, s0, s4
662; GFX12-NEXT:    s_add_co_u32 s2, s2, s7
663; GFX12-NEXT:    s_mul_i32 s4, s1, s3
664; GFX12-NEXT:    s_add_co_ci_u32 s0, s0, s6
665; GFX12-NEXT:    s_mul_hi_u32 s3, s1, s3
666; GFX12-NEXT:    s_add_co_u32 s1, s4, s2
667; GFX12-NEXT:    s_add_co_ci_u32 s2, s3, s0
668; GFX12-NEXT:    s_mov_b32 s0, s5
669; GFX12-NEXT:    ; return to shader part epilog
670  %result = mul i96 %num, %den
671  %cast = bitcast i96 %result to <3 x i32>
672  ret <3 x i32> %cast
673}
674
; i96 multiply in a function with no explicit calling convention; the i96
; result is returned directly. The checks show the operands in v0-v2 and
; v3-v5.
; The GFX7-GFX9 checks start with a v_mad_u64_u32 that has a zero addend,
; whereas GFX10 and later start with v_mul_lo_u32; the remaining partial
; products are accumulated with v_mad_u64_u32 (v_mad_co_u64_u32 on GFX12).
675define i96 @v_mul_i96(i96 %num, i96 %den) {
676; GCN-LABEL: v_mul_i96:
677; GCN:       ; %bb.0:
678; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
679; GCN-NEXT:    v_mov_b32_e32 v6, v0
680; GCN-NEXT:    v_mov_b32_e32 v7, v1
681; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
682; GCN-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1]
683; GCN-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0
684; GCN-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9]
685; GCN-NEXT:    v_mov_b32_e32 v2, v8
686; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2]
687; GCN-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2]
688; GCN-NEXT:    s_setpc_b64 s[30:31]
689;
690; GFX10-LABEL: v_mul_i96:
691; GFX10:       ; %bb.0:
692; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
693; GFX10-NEXT:    v_mov_b32_e32 v6, v0
694; GFX10-NEXT:    v_mov_b32_e32 v7, v1
695; GFX10-NEXT:    v_mul_lo_u32 v0, v6, v5
696; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1]
697; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v6, v3, 0
698; GFX10-NEXT:    v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
699; GFX10-NEXT:    v_mov_b32_e32 v2, v8
700; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
701; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
702; GFX10-NEXT:    s_setpc_b64 s[30:31]
703;
704; GFX11-LABEL: v_mul_i96:
705; GFX11:       ; %bb.0:
706; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
707; GFX11-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
708; GFX11-NEXT:    v_mul_lo_u32 v0, v6, v5
709; GFX11-NEXT:    v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1]
710; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v6, v3, 0
711; GFX11-NEXT:    v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
712; GFX11-NEXT:    v_mov_b32_e32 v2, v9
713; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
714; GFX11-NEXT:    v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
715; GFX11-NEXT:    s_setpc_b64 s[30:31]
716;
717; GFX12-LABEL: v_mul_i96:
718; GFX12:       ; %bb.0:
719; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
720; GFX12-NEXT:    s_wait_expcnt 0x0
721; GFX12-NEXT:    s_wait_samplecnt 0x0
722; GFX12-NEXT:    s_wait_bvhcnt 0x0
723; GFX12-NEXT:    s_wait_kmcnt 0x0
724; GFX12-NEXT:    v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
725; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
726; GFX12-NEXT:    v_mul_lo_u32 v0, v6, v5
727; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v7, v4, v[0:1]
728; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v6, v3, 0
729; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
730; GFX12-NEXT:    v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9]
731; GFX12-NEXT:    v_mov_b32_e32 v2, v8
732; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
733; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2]
734; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2]
735; GFX12-NEXT:    s_setpc_b64 s[30:31]
736  %result = mul i96 %num, %den
737  ret i96 %result
738}
739
740define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
741; GFX7-LABEL: s_mul_i128:
742; GFX7:       ; %bb.0:
743; GFX7-NEXT:    v_mov_b32_e32 v0, s4
744; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
745; GFX7-NEXT:    v_mov_b32_e32 v1, s5
746; GFX7-NEXT:    v_mul_hi_u32 v2, s1, v1
747; GFX7-NEXT:    s_mul_i32 s10, s0, s6
748; GFX7-NEXT:    v_readfirstlane_b32 s9, v0
749; GFX7-NEXT:    v_mov_b32_e32 v0, s6
750; GFX7-NEXT:    v_mul_hi_u32 v0, s0, v0
751; GFX7-NEXT:    v_readfirstlane_b32 s13, v2
752; GFX7-NEXT:    v_mov_b32_e32 v2, s2
753; GFX7-NEXT:    v_mul_hi_u32 v2, v2, s4
754; GFX7-NEXT:    s_mul_i32 s12, s1, s5
755; GFX7-NEXT:    v_readfirstlane_b32 s11, v0
756; GFX7-NEXT:    s_add_u32 s10, s12, s10
757; GFX7-NEXT:    v_mul_hi_u32 v1, s0, v1
758; GFX7-NEXT:    v_mov_b32_e32 v0, s1
759; GFX7-NEXT:    s_addc_u32 s11, s13, s11
760; GFX7-NEXT:    s_mul_i32 s12, s2, s4
761; GFX7-NEXT:    v_readfirstlane_b32 s13, v2
762; GFX7-NEXT:    s_add_u32 s10, s12, s10
763; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s4
764; GFX7-NEXT:    s_addc_u32 s11, s13, s11
765; GFX7-NEXT:    s_mul_i32 s12, s0, s5
766; GFX7-NEXT:    v_readfirstlane_b32 s13, v1
767; GFX7-NEXT:    s_add_u32 s9, s12, s9
768; GFX7-NEXT:    s_addc_u32 s10, s13, s10
769; GFX7-NEXT:    s_mul_i32 s13, s1, s4
770; GFX7-NEXT:    s_cselect_b32 s12, 1, 0
771; GFX7-NEXT:    v_readfirstlane_b32 s14, v0
772; GFX7-NEXT:    s_add_u32 s9, s13, s9
773; GFX7-NEXT:    s_mul_i32 s8, s0, s4
774; GFX7-NEXT:    s_addc_u32 s10, s14, s10
775; GFX7-NEXT:    s_mul_i32 s0, s0, s7
776; GFX7-NEXT:    s_addc_u32 s0, s11, s0
777; GFX7-NEXT:    s_mul_i32 s1, s1, s6
778; GFX7-NEXT:    s_cmp_lg_u32 s12, 0
779; GFX7-NEXT:    s_addc_u32 s0, s0, s1
780; GFX7-NEXT:    s_mul_i32 s2, s2, s5
781; GFX7-NEXT:    s_add_u32 s0, s2, s0
782; GFX7-NEXT:    s_mul_i32 s3, s3, s4
783; GFX7-NEXT:    s_add_u32 s3, s3, s0
784; GFX7-NEXT:    s_mov_b32 s0, s8
785; GFX7-NEXT:    s_mov_b32 s1, s9
786; GFX7-NEXT:    s_mov_b32 s2, s10
787; GFX7-NEXT:    ; return to shader part epilog
788;
789; GFX8-LABEL: s_mul_i128:
790; GFX8:       ; %bb.0:
791; GFX8-NEXT:    v_mov_b32_e32 v0, s4
792; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
793; GFX8-NEXT:    v_mov_b32_e32 v1, s5
794; GFX8-NEXT:    v_mul_hi_u32 v2, s1, v1
795; GFX8-NEXT:    s_mul_i32 s10, s0, s6
796; GFX8-NEXT:    v_readfirstlane_b32 s9, v0
797; GFX8-NEXT:    v_mov_b32_e32 v0, s6
798; GFX8-NEXT:    v_mul_hi_u32 v0, s0, v0
799; GFX8-NEXT:    v_readfirstlane_b32 s13, v2
800; GFX8-NEXT:    v_mov_b32_e32 v2, s2
801; GFX8-NEXT:    v_mul_hi_u32 v2, v2, s4
802; GFX8-NEXT:    s_mul_i32 s12, s1, s5
803; GFX8-NEXT:    v_readfirstlane_b32 s11, v0
804; GFX8-NEXT:    s_add_u32 s10, s12, s10
805; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
806; GFX8-NEXT:    v_mov_b32_e32 v0, s1
807; GFX8-NEXT:    s_addc_u32 s11, s13, s11
808; GFX8-NEXT:    s_mul_i32 s12, s2, s4
809; GFX8-NEXT:    v_readfirstlane_b32 s13, v2
810; GFX8-NEXT:    s_add_u32 s10, s12, s10
811; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s4
812; GFX8-NEXT:    s_addc_u32 s11, s13, s11
813; GFX8-NEXT:    s_mul_i32 s12, s0, s5
814; GFX8-NEXT:    v_readfirstlane_b32 s13, v1
815; GFX8-NEXT:    s_add_u32 s9, s12, s9
816; GFX8-NEXT:    s_addc_u32 s10, s13, s10
817; GFX8-NEXT:    s_mul_i32 s13, s1, s4
818; GFX8-NEXT:    s_cselect_b32 s12, 1, 0
819; GFX8-NEXT:    v_readfirstlane_b32 s14, v0
820; GFX8-NEXT:    s_add_u32 s9, s13, s9
821; GFX8-NEXT:    s_mul_i32 s8, s0, s4
822; GFX8-NEXT:    s_addc_u32 s10, s14, s10
823; GFX8-NEXT:    s_mul_i32 s0, s0, s7
824; GFX8-NEXT:    s_addc_u32 s0, s11, s0
825; GFX8-NEXT:    s_mul_i32 s1, s1, s6
826; GFX8-NEXT:    s_cmp_lg_u32 s12, 0
827; GFX8-NEXT:    s_addc_u32 s0, s0, s1
828; GFX8-NEXT:    s_mul_i32 s2, s2, s5
829; GFX8-NEXT:    s_add_u32 s0, s2, s0
830; GFX8-NEXT:    s_mul_i32 s3, s3, s4
831; GFX8-NEXT:    s_add_u32 s3, s3, s0
832; GFX8-NEXT:    s_mov_b32 s0, s8
833; GFX8-NEXT:    s_mov_b32 s1, s9
834; GFX8-NEXT:    s_mov_b32 s2, s10
835; GFX8-NEXT:    ; return to shader part epilog
836;
837; GFX9-LABEL: s_mul_i128:
838; GFX9:       ; %bb.0:
839; GFX9-NEXT:    s_mul_i32 s10, s0, s6
840; GFX9-NEXT:    s_mul_i32 s12, s1, s5
841; GFX9-NEXT:    s_mul_hi_u32 s11, s0, s6
842; GFX9-NEXT:    s_mul_hi_u32 s13, s1, s5
843; GFX9-NEXT:    s_add_u32 s10, s12, s10
844; GFX9-NEXT:    s_addc_u32 s11, s13, s11
845; GFX9-NEXT:    s_mul_i32 s12, s2, s4
846; GFX9-NEXT:    s_mul_hi_u32 s13, s2, s4
847; GFX9-NEXT:    s_add_u32 s10, s12, s10
848; GFX9-NEXT:    s_mul_hi_u32 s9, s0, s4
849; GFX9-NEXT:    s_addc_u32 s11, s13, s11
850; GFX9-NEXT:    s_mul_i32 s12, s0, s5
851; GFX9-NEXT:    s_mul_hi_u32 s13, s0, s5
852; GFX9-NEXT:    s_add_u32 s9, s12, s9
853; GFX9-NEXT:    s_addc_u32 s10, s13, s10
854; GFX9-NEXT:    s_mul_i32 s13, s1, s4
855; GFX9-NEXT:    s_cselect_b32 s12, 1, 0
856; GFX9-NEXT:    s_mul_hi_u32 s14, s1, s4
857; GFX9-NEXT:    s_add_u32 s9, s13, s9
858; GFX9-NEXT:    s_mul_i32 s8, s0, s4
859; GFX9-NEXT:    s_addc_u32 s10, s14, s10
860; GFX9-NEXT:    s_mul_i32 s0, s0, s7
861; GFX9-NEXT:    s_addc_u32 s0, s11, s0
862; GFX9-NEXT:    s_mul_i32 s1, s1, s6
863; GFX9-NEXT:    s_cmp_lg_u32 s12, 0
864; GFX9-NEXT:    s_addc_u32 s0, s0, s1
865; GFX9-NEXT:    s_mul_i32 s2, s2, s5
866; GFX9-NEXT:    s_add_u32 s0, s2, s0
867; GFX9-NEXT:    s_mul_i32 s3, s3, s4
868; GFX9-NEXT:    s_add_u32 s3, s3, s0
869; GFX9-NEXT:    s_mov_b32 s0, s8
870; GFX9-NEXT:    s_mov_b32 s1, s9
871; GFX9-NEXT:    s_mov_b32 s2, s10
872; GFX9-NEXT:    ; return to shader part epilog
873;
874; GFX10PLUS-LABEL: s_mul_i128:
875; GFX10PLUS:       ; %bb.0:
876; GFX10PLUS-NEXT:    s_mul_i32 s9, s0, s6
877; GFX10PLUS-NEXT:    s_mul_i32 s11, s1, s5
878; GFX10PLUS-NEXT:    s_mul_hi_u32 s10, s0, s6
879; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s1, s5
880; GFX10PLUS-NEXT:    s_add_u32 s9, s11, s9
881; GFX10PLUS-NEXT:    s_mul_i32 s11, s2, s4
882; GFX10PLUS-NEXT:    s_addc_u32 s10, s12, s10
883; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s2, s4
884; GFX10PLUS-NEXT:    s_mul_hi_u32 s8, s0, s4
885; GFX10PLUS-NEXT:    s_add_u32 s9, s11, s9
886; GFX10PLUS-NEXT:    s_mul_i32 s11, s0, s5
887; GFX10PLUS-NEXT:    s_addc_u32 s10, s12, s10
888; GFX10PLUS-NEXT:    s_mul_hi_u32 s12, s0, s5
889; GFX10PLUS-NEXT:    s_add_u32 s8, s11, s8
890; GFX10PLUS-NEXT:    s_addc_u32 s9, s12, s9
891; GFX10PLUS-NEXT:    s_mul_i32 s12, s1, s4
892; GFX10PLUS-NEXT:    s_mul_hi_u32 s13, s1, s4
893; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
894; GFX10PLUS-NEXT:    s_add_u32 s8, s12, s8
895; GFX10PLUS-NEXT:    s_mul_i32 s12, s0, s7
896; GFX10PLUS-NEXT:    s_addc_u32 s7, s13, s9
897; GFX10PLUS-NEXT:    s_addc_u32 s9, s10, s12
898; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s6
899; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
900; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s5
901; GFX10PLUS-NEXT:    s_addc_u32 s1, s9, s1
902; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s4
903; GFX10PLUS-NEXT:    s_add_i32 s1, s1, s2
904; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s4
905; GFX10PLUS-NEXT:    s_add_i32 s3, s1, s3
906; GFX10PLUS-NEXT:    s_mov_b32 s1, s8
907; GFX10PLUS-NEXT:    s_mov_b32 s2, s7
908; GFX10PLUS-NEXT:    ; return to shader part epilog
909;
910; GFX12-LABEL: s_mul_i128:
911; GFX12:       ; %bb.0:
912; GFX12-NEXT:    s_mul_i32 s9, s0, s6
913; GFX12-NEXT:    s_mul_i32 s11, s1, s5
914; GFX12-NEXT:    s_mul_hi_u32 s10, s0, s6
915; GFX12-NEXT:    s_mul_hi_u32 s12, s1, s5
916; GFX12-NEXT:    s_add_co_u32 s9, s11, s9
917; GFX12-NEXT:    s_mul_i32 s11, s2, s4
918; GFX12-NEXT:    s_add_co_ci_u32 s10, s12, s10
919; GFX12-NEXT:    s_mul_hi_u32 s12, s2, s4
920; GFX12-NEXT:    s_mul_hi_u32 s8, s0, s4
921; GFX12-NEXT:    s_add_co_u32 s9, s11, s9
922; GFX12-NEXT:    s_mul_i32 s11, s0, s5
923; GFX12-NEXT:    s_add_co_ci_u32 s10, s12, s10
924; GFX12-NEXT:    s_mul_hi_u32 s12, s0, s5
925; GFX12-NEXT:    s_add_co_u32 s8, s11, s8
926; GFX12-NEXT:    s_add_co_ci_u32 s9, s12, s9
927; GFX12-NEXT:    s_mul_i32 s12, s1, s4
928; GFX12-NEXT:    s_mul_hi_u32 s13, s1, s4
929; GFX12-NEXT:    s_cselect_b32 s11, 1, 0
930; GFX12-NEXT:    s_add_co_u32 s8, s12, s8
931; GFX12-NEXT:    s_mul_i32 s12, s0, s7
932; GFX12-NEXT:    s_add_co_ci_u32 s7, s13, s9
933; GFX12-NEXT:    s_add_co_ci_u32 s9, s10, s12
934; GFX12-NEXT:    s_mul_i32 s1, s1, s6
935; GFX12-NEXT:    s_cmp_lg_u32 s11, 0
936; GFX12-NEXT:    s_mul_i32 s2, s2, s5
937; GFX12-NEXT:    s_add_co_ci_u32 s1, s9, s1
938; GFX12-NEXT:    s_mul_i32 s3, s3, s4
939; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
940; GFX12-NEXT:    s_mul_i32 s0, s0, s4
941; GFX12-NEXT:    s_add_co_i32 s3, s1, s3
942; GFX12-NEXT:    s_mov_b32 s1, s8
943; GFX12-NEXT:    s_mov_b32 s2, s7
944; GFX12-NEXT:    ; return to shader part epilog
945  %result = mul i128 %num, %den
946  %cast = bitcast i128 %result to <4 x i32>
947  ret <4 x i32> %cast
948}
949
; v_mul_i128 - 128-bit integer multiply using the default calling convention
; (no amdgpu_ps / inreg), so the checks below read the operands from v0-v7 and
; end with s_setpc_b64 rather than a shader epilog.
;
; What the expected output shows: the product is assembled from 64-bit
; multiply-accumulates (v_mad_u64_u32, spelled v_mad_co_u64_u32 in the gfx12
; block), two v_mul_lo_u32 partial products that only feed the top result
; dword, and add-with-carry instructions that consume the carry-outs of the
; multiply-accumulates.
;
; Notes for maintainers:
;  * The check lines are autogenerated by utils/update_llc_test_checks.py (see
;    the NOTE at the top of the file); regenerate them instead of hand-editing.
;  * This function has separate GFX10 and GFX11 check blocks instead of a
;    shared GFX10PLUS block, because the two targets produce different code
;    here (e.g. v_dual_mov_b32 and a null carry-out appear only for GFX11).
;  * The gfx1200 run line does not pass -amdgpu-enable-delay-alu=0, so the
;    GFX12 block additionally contains s_delay_alu instructions.
950define i128 @v_mul_i128(i128 %num, i128 %den) {
951; GFX7-LABEL: v_mul_i128:
952; GFX7:       ; %bb.0:
953; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
954; GFX7-NEXT:    v_mov_b32_e32 v8, v0
955; GFX7-NEXT:    v_mov_b32_e32 v9, v1
956; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
957; GFX7-NEXT:    v_mov_b32_e32 v10, v2
958; GFX7-NEXT:    v_mul_lo_u32 v7, v8, v7
959; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
960; GFX7-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
961; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
962; GFX7-NEXT:    v_mul_lo_u32 v6, v9, v6
963; GFX7-NEXT:    v_mov_b32_e32 v2, v11
964; GFX7-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
965; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
966; GFX7-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
967; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
968; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
969; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
970; GFX7-NEXT:    s_setpc_b64 s[30:31]
971;
972; GFX8-LABEL: v_mul_i128:
973; GFX8:       ; %bb.0:
974; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
975; GFX8-NEXT:    v_mov_b32_e32 v8, v0
976; GFX8-NEXT:    v_mov_b32_e32 v9, v1
977; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
978; GFX8-NEXT:    v_mov_b32_e32 v10, v2
979; GFX8-NEXT:    v_mul_lo_u32 v7, v8, v7
980; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
981; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
982; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
983; GFX8-NEXT:    v_mul_lo_u32 v6, v9, v6
984; GFX8-NEXT:    v_mov_b32_e32 v2, v11
985; GFX8-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
986; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
987; GFX8-NEXT:    v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5]
988; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, v7, v6, vcc
989; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
990; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
991; GFX8-NEXT:    s_setpc_b64 s[30:31]
992;
993; GFX9-LABEL: v_mul_i128:
994; GFX9:       ; %bb.0:
995; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
996; GFX9-NEXT:    v_mov_b32_e32 v8, v0
997; GFX9-NEXT:    v_mov_b32_e32 v9, v1
998; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
999; GFX9-NEXT:    v_mov_b32_e32 v10, v2
1000; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
1001; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1]
1002; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0
1003; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12]
1004; GFX9-NEXT:    v_mul_lo_u32 v6, v9, v6
1005; GFX9-NEXT:    v_mov_b32_e32 v2, v11
1006; GFX9-NEXT:    v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2]
1007; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2]
1008; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5]
1009; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v6, vcc
1010; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7]
1011; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6]
1012; GFX9-NEXT:    s_setpc_b64 s[30:31]
1013;
1014; GFX10-LABEL: v_mul_i128:
1015; GFX10:       ; %bb.0:
1016; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1017; GFX10-NEXT:    v_mov_b32_e32 v8, v0
1018; GFX10-NEXT:    v_mov_b32_e32 v9, v1
1019; GFX10-NEXT:    v_mov_b32_e32 v10, v2
1020; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v6, 0
1021; GFX10-NEXT:    v_mul_lo_u32 v7, v8, v7
1022; GFX10-NEXT:    v_mul_lo_u32 v6, v9, v6
1023; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1]
1024; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v8, v4, 0
1025; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12]
1026; GFX10-NEXT:    v_mov_b32_e32 v2, v11
1027; GFX10-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1028; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2]
1029; GFX10-NEXT:    v_add_co_ci_u32_e64 v7, s4, v12, v7, s4
1030; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
1031; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7]
1032; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6]
1033; GFX10-NEXT:    s_setpc_b64 s[30:31]
1034;
1035; GFX11-LABEL: v_mul_i128:
1036; GFX11:       ; %bb.0:
1037; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1038; GFX11-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
1039; GFX11-NEXT:    v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4
1040; GFX11-NEXT:    v_mov_b32_e32 v12, v3
1041; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v6, 0
1042; GFX11-NEXT:    v_mul_lo_u32 v4, v9, v6
1043; GFX11-NEXT:    v_mul_lo_u32 v6, v8, v7
1044; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1]
1045; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v8, v11, 0
1046; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3]
1047; GFX11-NEXT:    v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1048; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2]
1049; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s0, v3, v6, s0
1050; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
1051; GFX11-NEXT:    v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
1052; GFX11-NEXT:    v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
1053; GFX11-NEXT:    s_setpc_b64 s[30:31]
1054;
1055; GFX12-LABEL: v_mul_i128:
1056; GFX12:       ; %bb.0:
1057; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1058; GFX12-NEXT:    s_wait_expcnt 0x0
1059; GFX12-NEXT:    s_wait_samplecnt 0x0
1060; GFX12-NEXT:    s_wait_bvhcnt 0x0
1061; GFX12-NEXT:    s_wait_kmcnt 0x0
1062; GFX12-NEXT:    v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
1063; GFX12-NEXT:    v_mov_b32_e32 v10, v2
1064; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1065; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
1066; GFX12-NEXT:    v_mul_lo_u32 v7, v8, v7
1067; GFX12-NEXT:    v_mul_lo_u32 v6, v9, v6
1068; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1069; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1]
1070; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
1071; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12]
1072; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1073; GFX12-NEXT:    v_mov_b32_e32 v2, v11
1074; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1075; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1076; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
1077; GFX12-NEXT:    v_add_co_ci_u32_e64 v7, s0, v12, v7, s0
1078; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1079; GFX12-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo
1080; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
1081; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1082; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], null, v3, v4, v[5:6]
1083; GFX12-NEXT:    s_setpc_b64 s[30:31]
; IR under test - a single i128 multiply whose result is returned directly.
1084  %result = mul i128 %num, %den
1085  ret i128 %result
1086}
1087
1088define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
1089; GFX7-LABEL: s_mul_i256:
1090; GFX7:       ; %bb.0:
1091; GFX7-NEXT:    s_mov_b32 s16, s0
1092; GFX7-NEXT:    v_mov_b32_e32 v0, s8
1093; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
1094; GFX7-NEXT:    v_mov_b32_e32 v1, s9
1095; GFX7-NEXT:    v_mul_hi_u32 v2, s1, v1
1096; GFX7-NEXT:    v_mul_hi_u32 v1, s16, v1
1097; GFX7-NEXT:    v_readfirstlane_b32 s17, v0
1098; GFX7-NEXT:    v_mov_b32_e32 v0, s10
1099; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
1100; GFX7-NEXT:    v_readfirstlane_b32 s21, v2
1101; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1102; GFX7-NEXT:    v_mul_hi_u32 v3, v2, s8
1103; GFX7-NEXT:    s_mul_i32 s18, s16, s10
1104; GFX7-NEXT:    s_mul_i32 s20, s1, s9
1105; GFX7-NEXT:    v_readfirstlane_b32 s19, v0
1106; GFX7-NEXT:    v_mov_b32_e32 v0, s1
1107; GFX7-NEXT:    s_add_u32 s18, s20, s18
1108; GFX7-NEXT:    s_addc_u32 s19, s21, s19
1109; GFX7-NEXT:    s_mul_i32 s21, s2, s8
1110; GFX7-NEXT:    v_readfirstlane_b32 s23, v1
1111; GFX7-NEXT:    v_mul_hi_u32 v1, v0, s8
1112; GFX7-NEXT:    s_cselect_b32 s20, 1, 0
1113; GFX7-NEXT:    v_readfirstlane_b32 s22, v3
1114; GFX7-NEXT:    s_add_u32 s18, s21, s18
1115; GFX7-NEXT:    s_addc_u32 s19, s22, s19
1116; GFX7-NEXT:    s_mul_i32 s22, s16, s9
1117; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
1118; GFX7-NEXT:    s_add_u32 s17, s22, s17
1119; GFX7-NEXT:    s_addc_u32 s22, s23, s18
1120; GFX7-NEXT:    v_readfirstlane_b32 s23, v1
1121; GFX7-NEXT:    v_mov_b32_e32 v1, s12
1122; GFX7-NEXT:    v_mul_hi_u32 v3, s16, v1
1123; GFX7-NEXT:    s_mul_i32 s18, s1, s8
1124; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
1125; GFX7-NEXT:    s_add_u32 s18, s18, s17
1126; GFX7-NEXT:    s_addc_u32 s17, s23, s22
1127; GFX7-NEXT:    v_mov_b32_e32 v4, s11
1128; GFX7-NEXT:    v_readfirstlane_b32 s23, v3
1129; GFX7-NEXT:    v_mul_hi_u32 v3, v2, s10
1130; GFX7-NEXT:    v_mul_hi_u32 v5, s1, v4
1131; GFX7-NEXT:    s_mul_i32 s22, s16, s12
1132; GFX7-NEXT:    s_mul_i32 s24, s1, s11
1133; GFX7-NEXT:    v_readfirstlane_b32 s28, v3
1134; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1135; GFX7-NEXT:    v_readfirstlane_b32 s27, v5
1136; GFX7-NEXT:    v_mul_hi_u32 v5, v3, s9
1137; GFX7-NEXT:    s_cselect_b32 s26, 1, 0
1138; GFX7-NEXT:    s_add_u32 s24, s24, s22
1139; GFX7-NEXT:    s_addc_u32 s23, s27, s23
1140; GFX7-NEXT:    v_readfirstlane_b32 s29, v5
1141; GFX7-NEXT:    v_mov_b32_e32 v5, s4
1142; GFX7-NEXT:    v_mul_hi_u32 v6, v5, s8
1143; GFX7-NEXT:    s_mul_i32 s27, s2, s10
1144; GFX7-NEXT:    s_cselect_b32 s22, 1, 0
1145; GFX7-NEXT:    s_add_u32 s24, s27, s24
1146; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s10
1147; GFX7-NEXT:    s_addc_u32 s27, s28, s23
1148; GFX7-NEXT:    s_mul_i32 s28, s3, s9
1149; GFX7-NEXT:    s_cselect_b32 s23, 1, 0
1150; GFX7-NEXT:    s_add_u32 s28, s28, s24
1151; GFX7-NEXT:    v_readfirstlane_b32 s30, v6
1152; GFX7-NEXT:    v_mul_hi_u32 v6, s16, v4
1153; GFX7-NEXT:    s_addc_u32 s27, s29, s27
1154; GFX7-NEXT:    s_mul_i32 s29, s4, s8
1155; GFX7-NEXT:    s_cselect_b32 s24, 1, 0
1156; GFX7-NEXT:    s_add_u32 s28, s29, s28
1157; GFX7-NEXT:    v_readfirstlane_b32 s33, v0
1158; GFX7-NEXT:    v_mul_hi_u32 v0, v2, s9
1159; GFX7-NEXT:    s_addc_u32 s27, s30, s27
1160; GFX7-NEXT:    s_mul_i32 s30, s16, s11
1161; GFX7-NEXT:    s_cselect_b32 s29, 1, 0
1162; GFX7-NEXT:    v_readfirstlane_b32 s31, v6
1163; GFX7-NEXT:    s_add_u32 s19, s30, s19
1164; GFX7-NEXT:    s_addc_u32 s28, s31, s28
1165; GFX7-NEXT:    s_mul_i32 s31, s1, s10
1166; GFX7-NEXT:    s_cselect_b32 s30, 1, 0
1167; GFX7-NEXT:    s_add_u32 s19, s31, s19
1168; GFX7-NEXT:    v_readfirstlane_b32 s34, v0
1169; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s8
1170; GFX7-NEXT:    s_addc_u32 s28, s33, s28
1171; GFX7-NEXT:    s_mul_i32 s33, s2, s9
1172; GFX7-NEXT:    s_cselect_b32 s31, 1, 0
1173; GFX7-NEXT:    s_add_u32 s19, s33, s19
1174; GFX7-NEXT:    s_addc_u32 s28, s34, s28
1175; GFX7-NEXT:    s_mul_i32 s34, s3, s8
1176; GFX7-NEXT:    s_cselect_b32 s33, 1, 0
1177; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
1178; GFX7-NEXT:    s_add_u32 s19, s34, s19
1179; GFX7-NEXT:    v_mov_b32_e32 v0, s14
1180; GFX7-NEXT:    s_addc_u32 s28, s35, s28
1181; GFX7-NEXT:    v_mul_hi_u32 v0, s16, v0
1182; GFX7-NEXT:    s_cselect_b32 s34, 1, 0
1183; GFX7-NEXT:    s_cmp_lg_u32 s26, 0
1184; GFX7-NEXT:    s_addc_u32 s19, s25, s19
1185; GFX7-NEXT:    v_mov_b32_e32 v2, s13
1186; GFX7-NEXT:    s_cselect_b32 s25, 1, 0
1187; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
1188; GFX7-NEXT:    v_mul_hi_u32 v6, s1, v2
1189; GFX7-NEXT:    s_addc_u32 s20, s20, 0
1190; GFX7-NEXT:    v_readfirstlane_b32 s26, v0
1191; GFX7-NEXT:    v_mul_hi_u32 v0, s2, v1
1192; GFX7-NEXT:    s_cmp_lg_u32 s25, 0
1193; GFX7-NEXT:    s_addc_u32 s20, s20, s28
1194; GFX7-NEXT:    s_mul_i32 s25, s16, s14
1195; GFX7-NEXT:    s_mul_i32 s28, s1, s13
1196; GFX7-NEXT:    s_cselect_b32 s21, 1, 0
1197; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
1198; GFX7-NEXT:    s_add_u32 s25, s28, s25
1199; GFX7-NEXT:    s_addc_u32 s26, s35, s26
1200; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
1201; GFX7-NEXT:    v_mul_hi_u32 v0, v3, s11
1202; GFX7-NEXT:    s_mul_i32 s28, s2, s12
1203; GFX7-NEXT:    s_add_u32 s25, s28, s25
1204; GFX7-NEXT:    s_addc_u32 s26, s35, s26
1205; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
1206; GFX7-NEXT:    v_mul_hi_u32 v0, v5, s10
1207; GFX7-NEXT:    s_mul_i32 s28, s3, s11
1208; GFX7-NEXT:    s_add_u32 s25, s28, s25
1209; GFX7-NEXT:    s_addc_u32 s26, s35, s26
1210; GFX7-NEXT:    v_readfirstlane_b32 s35, v0
1211; GFX7-NEXT:    v_mov_b32_e32 v0, s5
1212; GFX7-NEXT:    v_mul_hi_u32 v6, v0, s9
1213; GFX7-NEXT:    s_mul_i32 s28, s4, s10
1214; GFX7-NEXT:    s_add_u32 s25, s28, s25
1215; GFX7-NEXT:    v_mul_hi_u32 v1, s1, v1
1216; GFX7-NEXT:    s_addc_u32 s26, s35, s26
1217; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
1218; GFX7-NEXT:    v_mov_b32_e32 v6, s6
1219; GFX7-NEXT:    v_mul_hi_u32 v6, v6, s8
1220; GFX7-NEXT:    s_mul_i32 s28, s5, s9
1221; GFX7-NEXT:    s_add_u32 s25, s28, s25
1222; GFX7-NEXT:    v_mul_hi_u32 v2, s16, v2
1223; GFX7-NEXT:    v_readfirstlane_b32 s36, v1
1224; GFX7-NEXT:    v_mul_hi_u32 v1, s2, v4
1225; GFX7-NEXT:    s_addc_u32 s26, s35, s26
1226; GFX7-NEXT:    s_mul_i32 s28, s6, s8
1227; GFX7-NEXT:    v_readfirstlane_b32 s35, v6
1228; GFX7-NEXT:    s_add_u32 s25, s28, s25
1229; GFX7-NEXT:    s_addc_u32 s26, s35, s26
1230; GFX7-NEXT:    s_mul_i32 s28, s16, s13
1231; GFX7-NEXT:    v_readfirstlane_b32 s35, v2
1232; GFX7-NEXT:    s_add_u32 s27, s28, s27
1233; GFX7-NEXT:    v_readfirstlane_b32 s37, v1
1234; GFX7-NEXT:    v_mul_hi_u32 v1, v3, s10
1235; GFX7-NEXT:    s_addc_u32 s25, s35, s25
1236; GFX7-NEXT:    s_mul_i32 s35, s1, s12
1237; GFX7-NEXT:    s_cselect_b32 s28, 1, 0
1238; GFX7-NEXT:    s_add_u32 s27, s35, s27
1239; GFX7-NEXT:    s_addc_u32 s25, s36, s25
1240; GFX7-NEXT:    s_mul_i32 s36, s2, s11
1241; GFX7-NEXT:    s_cselect_b32 s35, 1, 0
1242; GFX7-NEXT:    s_add_u32 s27, s36, s27
1243; GFX7-NEXT:    v_readfirstlane_b32 s38, v1
1244; GFX7-NEXT:    v_mul_hi_u32 v1, v5, s9
1245; GFX7-NEXT:    s_addc_u32 s25, s37, s25
1246; GFX7-NEXT:    s_mul_i32 s37, s3, s10
1247; GFX7-NEXT:    s_cselect_b32 s36, 1, 0
1248; GFX7-NEXT:    s_add_u32 s27, s37, s27
1249; GFX7-NEXT:    v_mul_hi_u32 v0, v0, s8
1250; GFX7-NEXT:    s_addc_u32 s25, s38, s25
1251; GFX7-NEXT:    s_mul_i32 s38, s4, s9
1252; GFX7-NEXT:    s_cselect_b32 s37, 1, 0
1253; GFX7-NEXT:    v_readfirstlane_b32 s39, v1
1254; GFX7-NEXT:    s_add_u32 s27, s38, s27
1255; GFX7-NEXT:    s_addc_u32 s25, s39, s25
1256; GFX7-NEXT:    s_mul_i32 s39, s5, s8
1257; GFX7-NEXT:    s_cselect_b32 s38, 1, 0
1258; GFX7-NEXT:    v_readfirstlane_b32 s40, v0
1259; GFX7-NEXT:    s_add_u32 s27, s39, s27
1260; GFX7-NEXT:    s_addc_u32 s25, s40, s25
1261; GFX7-NEXT:    s_cselect_b32 s39, 1, 0
1262; GFX7-NEXT:    s_cmp_lg_u32 s31, 0
1263; GFX7-NEXT:    s_addc_u32 s30, s30, 0
1264; GFX7-NEXT:    s_cmp_lg_u32 s33, 0
1265; GFX7-NEXT:    s_addc_u32 s30, s30, 0
1266; GFX7-NEXT:    s_cmp_lg_u32 s34, 0
1267; GFX7-NEXT:    s_addc_u32 s30, s30, 0
1268; GFX7-NEXT:    s_cmp_lg_u32 s21, 0
1269; GFX7-NEXT:    s_addc_u32 s21, s30, s27
1270; GFX7-NEXT:    s_cselect_b32 s27, 1, 0
1271; GFX7-NEXT:    s_cmp_lg_u32 s23, 0
1272; GFX7-NEXT:    s_addc_u32 s22, s22, 0
1273; GFX7-NEXT:    s_cmp_lg_u32 s24, 0
1274; GFX7-NEXT:    s_addc_u32 s22, s22, 0
1275; GFX7-NEXT:    s_cmp_lg_u32 s29, 0
1276; GFX7-NEXT:    s_addc_u32 s22, s22, 0
1277; GFX7-NEXT:    s_cmp_lg_u32 s27, 0
1278; GFX7-NEXT:    s_addc_u32 s22, s22, s25
1279; GFX7-NEXT:    s_mul_i32 s16, s16, s15
1280; GFX7-NEXT:    s_addc_u32 s15, s26, s16
1281; GFX7-NEXT:    s_mul_i32 s1, s1, s14
1282; GFX7-NEXT:    s_cmp_lg_u32 s39, 0
1283; GFX7-NEXT:    s_addc_u32 s1, s15, s1
1284; GFX7-NEXT:    s_mul_i32 s2, s2, s13
1285; GFX7-NEXT:    s_cmp_lg_u32 s38, 0
1286; GFX7-NEXT:    s_addc_u32 s1, s1, s2
1287; GFX7-NEXT:    s_mul_i32 s3, s3, s12
1288; GFX7-NEXT:    s_cmp_lg_u32 s37, 0
1289; GFX7-NEXT:    s_addc_u32 s1, s1, s3
1290; GFX7-NEXT:    s_mul_i32 s4, s4, s11
1291; GFX7-NEXT:    s_cmp_lg_u32 s36, 0
1292; GFX7-NEXT:    s_addc_u32 s1, s1, s4
1293; GFX7-NEXT:    s_mul_i32 s5, s5, s10
1294; GFX7-NEXT:    s_cmp_lg_u32 s35, 0
1295; GFX7-NEXT:    s_addc_u32 s1, s1, s5
1296; GFX7-NEXT:    s_mul_i32 s6, s6, s9
1297; GFX7-NEXT:    s_cmp_lg_u32 s28, 0
1298; GFX7-NEXT:    s_addc_u32 s1, s1, s6
1299; GFX7-NEXT:    s_mul_i32 s7, s7, s8
1300; GFX7-NEXT:    s_mul_i32 s0, s0, s8
1301; GFX7-NEXT:    s_add_u32 s7, s7, s1
1302; GFX7-NEXT:    s_mov_b32 s1, s18
1303; GFX7-NEXT:    s_mov_b32 s2, s17
1304; GFX7-NEXT:    s_mov_b32 s3, s19
1305; GFX7-NEXT:    s_mov_b32 s4, s20
1306; GFX7-NEXT:    s_mov_b32 s5, s21
1307; GFX7-NEXT:    s_mov_b32 s6, s22
1308; GFX7-NEXT:    ; return to shader part epilog
1309;
1310; GFX8-LABEL: s_mul_i256:
1311; GFX8:       ; %bb.0:
1312; GFX8-NEXT:    s_mov_b32 s16, s0
1313; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1314; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
1315; GFX8-NEXT:    v_mov_b32_e32 v1, s9
1316; GFX8-NEXT:    v_mul_hi_u32 v2, s1, v1
1317; GFX8-NEXT:    v_mul_hi_u32 v1, s16, v1
1318; GFX8-NEXT:    v_readfirstlane_b32 s17, v0
1319; GFX8-NEXT:    v_mov_b32_e32 v0, s10
1320; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
1321; GFX8-NEXT:    v_readfirstlane_b32 s21, v2
1322; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1323; GFX8-NEXT:    v_mul_hi_u32 v3, v2, s8
1324; GFX8-NEXT:    s_mul_i32 s18, s16, s10
1325; GFX8-NEXT:    s_mul_i32 s20, s1, s9
1326; GFX8-NEXT:    v_readfirstlane_b32 s19, v0
1327; GFX8-NEXT:    v_mov_b32_e32 v0, s1
1328; GFX8-NEXT:    s_add_u32 s18, s20, s18
1329; GFX8-NEXT:    s_addc_u32 s19, s21, s19
1330; GFX8-NEXT:    s_mul_i32 s21, s2, s8
1331; GFX8-NEXT:    v_readfirstlane_b32 s23, v1
1332; GFX8-NEXT:    v_mul_hi_u32 v1, v0, s8
1333; GFX8-NEXT:    s_cselect_b32 s20, 1, 0
1334; GFX8-NEXT:    v_readfirstlane_b32 s22, v3
1335; GFX8-NEXT:    s_add_u32 s18, s21, s18
1336; GFX8-NEXT:    s_addc_u32 s19, s22, s19
1337; GFX8-NEXT:    s_mul_i32 s22, s16, s9
1338; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
1339; GFX8-NEXT:    s_add_u32 s17, s22, s17
1340; GFX8-NEXT:    s_addc_u32 s22, s23, s18
1341; GFX8-NEXT:    v_readfirstlane_b32 s23, v1
1342; GFX8-NEXT:    v_mov_b32_e32 v1, s12
1343; GFX8-NEXT:    v_mul_hi_u32 v3, s16, v1
1344; GFX8-NEXT:    s_mul_i32 s18, s1, s8
1345; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
1346; GFX8-NEXT:    s_add_u32 s18, s18, s17
1347; GFX8-NEXT:    s_addc_u32 s17, s23, s22
1348; GFX8-NEXT:    v_mov_b32_e32 v4, s11
1349; GFX8-NEXT:    v_readfirstlane_b32 s23, v3
1350; GFX8-NEXT:    v_mul_hi_u32 v3, v2, s10
1351; GFX8-NEXT:    v_mul_hi_u32 v5, s1, v4
1352; GFX8-NEXT:    s_mul_i32 s22, s16, s12
1353; GFX8-NEXT:    s_mul_i32 s24, s1, s11
1354; GFX8-NEXT:    v_readfirstlane_b32 s28, v3
1355; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1356; GFX8-NEXT:    v_readfirstlane_b32 s27, v5
1357; GFX8-NEXT:    v_mul_hi_u32 v5, v3, s9
1358; GFX8-NEXT:    s_cselect_b32 s26, 1, 0
1359; GFX8-NEXT:    s_add_u32 s24, s24, s22
1360; GFX8-NEXT:    s_addc_u32 s23, s27, s23
1361; GFX8-NEXT:    v_readfirstlane_b32 s29, v5
1362; GFX8-NEXT:    v_mov_b32_e32 v5, s4
1363; GFX8-NEXT:    v_mul_hi_u32 v6, v5, s8
1364; GFX8-NEXT:    s_mul_i32 s27, s2, s10
1365; GFX8-NEXT:    s_cselect_b32 s22, 1, 0
1366; GFX8-NEXT:    s_add_u32 s24, s27, s24
1367; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s10
1368; GFX8-NEXT:    s_addc_u32 s27, s28, s23
1369; GFX8-NEXT:    s_mul_i32 s28, s3, s9
1370; GFX8-NEXT:    s_cselect_b32 s23, 1, 0
1371; GFX8-NEXT:    s_add_u32 s28, s28, s24
1372; GFX8-NEXT:    v_readfirstlane_b32 s30, v6
1373; GFX8-NEXT:    v_mul_hi_u32 v6, s16, v4
1374; GFX8-NEXT:    s_addc_u32 s27, s29, s27
1375; GFX8-NEXT:    s_mul_i32 s29, s4, s8
1376; GFX8-NEXT:    s_cselect_b32 s24, 1, 0
1377; GFX8-NEXT:    s_add_u32 s28, s29, s28
1378; GFX8-NEXT:    v_readfirstlane_b32 s33, v0
1379; GFX8-NEXT:    v_mul_hi_u32 v0, v2, s9
1380; GFX8-NEXT:    s_addc_u32 s27, s30, s27
1381; GFX8-NEXT:    s_mul_i32 s30, s16, s11
1382; GFX8-NEXT:    s_cselect_b32 s29, 1, 0
1383; GFX8-NEXT:    v_readfirstlane_b32 s31, v6
1384; GFX8-NEXT:    s_add_u32 s19, s30, s19
1385; GFX8-NEXT:    s_addc_u32 s28, s31, s28
1386; GFX8-NEXT:    s_mul_i32 s31, s1, s10
1387; GFX8-NEXT:    s_cselect_b32 s30, 1, 0
1388; GFX8-NEXT:    s_add_u32 s19, s31, s19
1389; GFX8-NEXT:    v_readfirstlane_b32 s34, v0
1390; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s8
1391; GFX8-NEXT:    s_addc_u32 s28, s33, s28
1392; GFX8-NEXT:    s_mul_i32 s33, s2, s9
1393; GFX8-NEXT:    s_cselect_b32 s31, 1, 0
1394; GFX8-NEXT:    s_add_u32 s19, s33, s19
1395; GFX8-NEXT:    s_addc_u32 s28, s34, s28
1396; GFX8-NEXT:    s_mul_i32 s34, s3, s8
1397; GFX8-NEXT:    s_cselect_b32 s33, 1, 0
1398; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
1399; GFX8-NEXT:    s_add_u32 s19, s34, s19
1400; GFX8-NEXT:    v_mov_b32_e32 v0, s14
1401; GFX8-NEXT:    s_addc_u32 s28, s35, s28
1402; GFX8-NEXT:    v_mul_hi_u32 v0, s16, v0
1403; GFX8-NEXT:    s_cselect_b32 s34, 1, 0
1404; GFX8-NEXT:    s_cmp_lg_u32 s26, 0
1405; GFX8-NEXT:    s_addc_u32 s19, s25, s19
1406; GFX8-NEXT:    v_mov_b32_e32 v2, s13
1407; GFX8-NEXT:    s_cselect_b32 s25, 1, 0
1408; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
1409; GFX8-NEXT:    v_mul_hi_u32 v6, s1, v2
1410; GFX8-NEXT:    s_addc_u32 s20, s20, 0
1411; GFX8-NEXT:    v_readfirstlane_b32 s26, v0
1412; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v1
1413; GFX8-NEXT:    s_cmp_lg_u32 s25, 0
1414; GFX8-NEXT:    s_addc_u32 s20, s20, s28
1415; GFX8-NEXT:    s_mul_i32 s25, s16, s14
1416; GFX8-NEXT:    s_mul_i32 s28, s1, s13
1417; GFX8-NEXT:    s_cselect_b32 s21, 1, 0
1418; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
1419; GFX8-NEXT:    s_add_u32 s25, s28, s25
1420; GFX8-NEXT:    s_addc_u32 s26, s35, s26
1421; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
1422; GFX8-NEXT:    v_mul_hi_u32 v0, v3, s11
1423; GFX8-NEXT:    s_mul_i32 s28, s2, s12
1424; GFX8-NEXT:    s_add_u32 s25, s28, s25
1425; GFX8-NEXT:    s_addc_u32 s26, s35, s26
1426; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
1427; GFX8-NEXT:    v_mul_hi_u32 v0, v5, s10
1428; GFX8-NEXT:    s_mul_i32 s28, s3, s11
1429; GFX8-NEXT:    s_add_u32 s25, s28, s25
1430; GFX8-NEXT:    s_addc_u32 s26, s35, s26
1431; GFX8-NEXT:    v_readfirstlane_b32 s35, v0
1432; GFX8-NEXT:    v_mov_b32_e32 v0, s5
1433; GFX8-NEXT:    v_mul_hi_u32 v6, v0, s9
1434; GFX8-NEXT:    s_mul_i32 s28, s4, s10
1435; GFX8-NEXT:    s_add_u32 s25, s28, s25
1436; GFX8-NEXT:    v_mul_hi_u32 v1, s1, v1
1437; GFX8-NEXT:    s_addc_u32 s26, s35, s26
1438; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
1439; GFX8-NEXT:    v_mov_b32_e32 v6, s6
1440; GFX8-NEXT:    v_mul_hi_u32 v6, v6, s8
1441; GFX8-NEXT:    s_mul_i32 s28, s5, s9
1442; GFX8-NEXT:    s_add_u32 s25, s28, s25
1443; GFX8-NEXT:    v_mul_hi_u32 v2, s16, v2
1444; GFX8-NEXT:    v_readfirstlane_b32 s36, v1
1445; GFX8-NEXT:    v_mul_hi_u32 v1, s2, v4
1446; GFX8-NEXT:    s_addc_u32 s26, s35, s26
1447; GFX8-NEXT:    s_mul_i32 s28, s6, s8
1448; GFX8-NEXT:    v_readfirstlane_b32 s35, v6
1449; GFX8-NEXT:    s_add_u32 s25, s28, s25
1450; GFX8-NEXT:    s_addc_u32 s26, s35, s26
1451; GFX8-NEXT:    s_mul_i32 s28, s16, s13
1452; GFX8-NEXT:    v_readfirstlane_b32 s35, v2
1453; GFX8-NEXT:    s_add_u32 s27, s28, s27
1454; GFX8-NEXT:    v_readfirstlane_b32 s37, v1
1455; GFX8-NEXT:    v_mul_hi_u32 v1, v3, s10
1456; GFX8-NEXT:    s_addc_u32 s25, s35, s25
1457; GFX8-NEXT:    s_mul_i32 s35, s1, s12
1458; GFX8-NEXT:    s_cselect_b32 s28, 1, 0
1459; GFX8-NEXT:    s_add_u32 s27, s35, s27
1460; GFX8-NEXT:    s_addc_u32 s25, s36, s25
1461; GFX8-NEXT:    s_mul_i32 s36, s2, s11
1462; GFX8-NEXT:    s_cselect_b32 s35, 1, 0
1463; GFX8-NEXT:    s_add_u32 s27, s36, s27
1464; GFX8-NEXT:    v_readfirstlane_b32 s38, v1
1465; GFX8-NEXT:    v_mul_hi_u32 v1, v5, s9
1466; GFX8-NEXT:    s_addc_u32 s25, s37, s25
1467; GFX8-NEXT:    s_mul_i32 s37, s3, s10
1468; GFX8-NEXT:    s_cselect_b32 s36, 1, 0
1469; GFX8-NEXT:    s_add_u32 s27, s37, s27
1470; GFX8-NEXT:    v_mul_hi_u32 v0, v0, s8
1471; GFX8-NEXT:    s_addc_u32 s25, s38, s25
1472; GFX8-NEXT:    s_mul_i32 s38, s4, s9
1473; GFX8-NEXT:    s_cselect_b32 s37, 1, 0
1474; GFX8-NEXT:    v_readfirstlane_b32 s39, v1
1475; GFX8-NEXT:    s_add_u32 s27, s38, s27
1476; GFX8-NEXT:    s_addc_u32 s25, s39, s25
1477; GFX8-NEXT:    s_mul_i32 s39, s5, s8
1478; GFX8-NEXT:    s_cselect_b32 s38, 1, 0
1479; GFX8-NEXT:    v_readfirstlane_b32 s40, v0
1480; GFX8-NEXT:    s_add_u32 s27, s39, s27
1481; GFX8-NEXT:    s_addc_u32 s25, s40, s25
1482; GFX8-NEXT:    s_cselect_b32 s39, 1, 0
1483; GFX8-NEXT:    s_cmp_lg_u32 s31, 0
1484; GFX8-NEXT:    s_addc_u32 s30, s30, 0
1485; GFX8-NEXT:    s_cmp_lg_u32 s33, 0
1486; GFX8-NEXT:    s_addc_u32 s30, s30, 0
1487; GFX8-NEXT:    s_cmp_lg_u32 s34, 0
1488; GFX8-NEXT:    s_addc_u32 s30, s30, 0
1489; GFX8-NEXT:    s_cmp_lg_u32 s21, 0
1490; GFX8-NEXT:    s_addc_u32 s21, s30, s27
1491; GFX8-NEXT:    s_cselect_b32 s27, 1, 0
1492; GFX8-NEXT:    s_cmp_lg_u32 s23, 0
1493; GFX8-NEXT:    s_addc_u32 s22, s22, 0
1494; GFX8-NEXT:    s_cmp_lg_u32 s24, 0
1495; GFX8-NEXT:    s_addc_u32 s22, s22, 0
1496; GFX8-NEXT:    s_cmp_lg_u32 s29, 0
1497; GFX8-NEXT:    s_addc_u32 s22, s22, 0
1498; GFX8-NEXT:    s_cmp_lg_u32 s27, 0
1499; GFX8-NEXT:    s_addc_u32 s22, s22, s25
1500; GFX8-NEXT:    s_mul_i32 s16, s16, s15
1501; GFX8-NEXT:    s_addc_u32 s15, s26, s16
1502; GFX8-NEXT:    s_mul_i32 s1, s1, s14
1503; GFX8-NEXT:    s_cmp_lg_u32 s39, 0
1504; GFX8-NEXT:    s_addc_u32 s1, s15, s1
1505; GFX8-NEXT:    s_mul_i32 s2, s2, s13
1506; GFX8-NEXT:    s_cmp_lg_u32 s38, 0
1507; GFX8-NEXT:    s_addc_u32 s1, s1, s2
1508; GFX8-NEXT:    s_mul_i32 s3, s3, s12
1509; GFX8-NEXT:    s_cmp_lg_u32 s37, 0
1510; GFX8-NEXT:    s_addc_u32 s1, s1, s3
1511; GFX8-NEXT:    s_mul_i32 s4, s4, s11
1512; GFX8-NEXT:    s_cmp_lg_u32 s36, 0
1513; GFX8-NEXT:    s_addc_u32 s1, s1, s4
1514; GFX8-NEXT:    s_mul_i32 s5, s5, s10
1515; GFX8-NEXT:    s_cmp_lg_u32 s35, 0
1516; GFX8-NEXT:    s_addc_u32 s1, s1, s5
1517; GFX8-NEXT:    s_mul_i32 s6, s6, s9
1518; GFX8-NEXT:    s_cmp_lg_u32 s28, 0
1519; GFX8-NEXT:    s_addc_u32 s1, s1, s6
1520; GFX8-NEXT:    s_mul_i32 s7, s7, s8
1521; GFX8-NEXT:    s_mul_i32 s0, s0, s8
1522; GFX8-NEXT:    s_add_u32 s7, s7, s1
1523; GFX8-NEXT:    s_mov_b32 s1, s18
1524; GFX8-NEXT:    s_mov_b32 s2, s17
1525; GFX8-NEXT:    s_mov_b32 s3, s19
1526; GFX8-NEXT:    s_mov_b32 s4, s20
1527; GFX8-NEXT:    s_mov_b32 s5, s21
1528; GFX8-NEXT:    s_mov_b32 s6, s22
1529; GFX8-NEXT:    ; return to shader part epilog
1530;
1531; GFX9-LABEL: s_mul_i256:
1532; GFX9:       ; %bb.0:
1533; GFX9-NEXT:    s_mov_b32 s16, s0
1534; GFX9-NEXT:    s_mul_i32 s18, s16, s10
1535; GFX9-NEXT:    s_mul_i32 s20, s1, s9
1536; GFX9-NEXT:    s_mul_hi_u32 s19, s16, s10
1537; GFX9-NEXT:    s_mul_hi_u32 s21, s1, s9
1538; GFX9-NEXT:    s_add_u32 s18, s20, s18
1539; GFX9-NEXT:    s_addc_u32 s19, s21, s19
1540; GFX9-NEXT:    s_mul_i32 s21, s2, s8
1541; GFX9-NEXT:    s_cselect_b32 s20, 1, 0
1542; GFX9-NEXT:    s_mul_hi_u32 s22, s2, s8
1543; GFX9-NEXT:    s_add_u32 s18, s21, s18
1544; GFX9-NEXT:    s_mul_hi_u32 s17, s16, s8
1545; GFX9-NEXT:    s_addc_u32 s19, s22, s19
1546; GFX9-NEXT:    s_mul_i32 s22, s16, s9
1547; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
1548; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s9
1549; GFX9-NEXT:    s_add_u32 s17, s22, s17
1550; GFX9-NEXT:    s_addc_u32 s18, s23, s18
1551; GFX9-NEXT:    s_mul_i32 s23, s1, s8
1552; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
1553; GFX9-NEXT:    s_mul_hi_u32 s24, s1, s8
1554; GFX9-NEXT:    s_add_u32 s17, s23, s17
1555; GFX9-NEXT:    s_addc_u32 s18, s24, s18
1556; GFX9-NEXT:    s_mul_i32 s24, s16, s12
1557; GFX9-NEXT:    s_mul_i32 s26, s1, s11
1558; GFX9-NEXT:    s_cselect_b32 s23, 1, 0
1559; GFX9-NEXT:    s_mul_hi_u32 s25, s16, s12
1560; GFX9-NEXT:    s_mul_hi_u32 s27, s1, s11
1561; GFX9-NEXT:    s_add_u32 s24, s26, s24
1562; GFX9-NEXT:    s_addc_u32 s25, s27, s25
1563; GFX9-NEXT:    s_mul_i32 s27, s2, s10
1564; GFX9-NEXT:    s_cselect_b32 s26, 1, 0
1565; GFX9-NEXT:    s_mul_hi_u32 s28, s2, s10
1566; GFX9-NEXT:    s_add_u32 s24, s27, s24
1567; GFX9-NEXT:    s_addc_u32 s25, s28, s25
1568; GFX9-NEXT:    s_mul_i32 s28, s3, s9
1569; GFX9-NEXT:    s_cselect_b32 s27, 1, 0
1570; GFX9-NEXT:    s_mul_hi_u32 s29, s3, s9
1571; GFX9-NEXT:    s_add_u32 s24, s28, s24
1572; GFX9-NEXT:    s_addc_u32 s25, s29, s25
1573; GFX9-NEXT:    s_mul_i32 s29, s4, s8
1574; GFX9-NEXT:    s_cselect_b32 s28, 1, 0
1575; GFX9-NEXT:    s_mul_hi_u32 s30, s4, s8
1576; GFX9-NEXT:    s_add_u32 s24, s29, s24
1577; GFX9-NEXT:    s_addc_u32 s25, s30, s25
1578; GFX9-NEXT:    s_mul_i32 s30, s16, s11
1579; GFX9-NEXT:    s_cselect_b32 s29, 1, 0
1580; GFX9-NEXT:    s_mul_hi_u32 s31, s16, s11
1581; GFX9-NEXT:    s_add_u32 s19, s30, s19
1582; GFX9-NEXT:    s_addc_u32 s24, s31, s24
1583; GFX9-NEXT:    s_mul_i32 s31, s1, s10
1584; GFX9-NEXT:    s_cselect_b32 s30, 1, 0
1585; GFX9-NEXT:    s_mul_hi_u32 s33, s1, s10
1586; GFX9-NEXT:    s_add_u32 s19, s31, s19
1587; GFX9-NEXT:    s_addc_u32 s24, s33, s24
1588; GFX9-NEXT:    s_mul_i32 s33, s2, s9
1589; GFX9-NEXT:    s_cselect_b32 s31, 1, 0
1590; GFX9-NEXT:    s_mul_hi_u32 s34, s2, s9
1591; GFX9-NEXT:    s_add_u32 s19, s33, s19
1592; GFX9-NEXT:    s_addc_u32 s24, s34, s24
1593; GFX9-NEXT:    s_mul_i32 s34, s3, s8
1594; GFX9-NEXT:    s_cselect_b32 s33, 1, 0
1595; GFX9-NEXT:    s_mul_hi_u32 s35, s3, s8
1596; GFX9-NEXT:    s_add_u32 s19, s34, s19
1597; GFX9-NEXT:    s_addc_u32 s24, s35, s24
1598; GFX9-NEXT:    s_cselect_b32 s34, 1, 0
1599; GFX9-NEXT:    s_cmp_lg_u32 s23, 0
1600; GFX9-NEXT:    s_addc_u32 s19, s22, s19
1601; GFX9-NEXT:    s_cselect_b32 s22, 1, 0
1602; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
1603; GFX9-NEXT:    s_addc_u32 s20, s20, 0
1604; GFX9-NEXT:    s_cmp_lg_u32 s22, 0
1605; GFX9-NEXT:    s_addc_u32 s20, s20, s24
1606; GFX9-NEXT:    s_mul_i32 s22, s16, s14
1607; GFX9-NEXT:    s_mul_i32 s24, s1, s13
1608; GFX9-NEXT:    s_cselect_b32 s21, 1, 0
1609; GFX9-NEXT:    s_mul_hi_u32 s23, s16, s14
1610; GFX9-NEXT:    s_mul_hi_u32 s35, s1, s13
1611; GFX9-NEXT:    s_add_u32 s22, s24, s22
1612; GFX9-NEXT:    s_addc_u32 s23, s35, s23
1613; GFX9-NEXT:    s_mul_i32 s24, s2, s12
1614; GFX9-NEXT:    s_mul_hi_u32 s35, s2, s12
1615; GFX9-NEXT:    s_add_u32 s22, s24, s22
1616; GFX9-NEXT:    s_addc_u32 s23, s35, s23
1617; GFX9-NEXT:    s_mul_i32 s24, s3, s11
1618; GFX9-NEXT:    s_mul_hi_u32 s35, s3, s11
1619; GFX9-NEXT:    s_add_u32 s22, s24, s22
1620; GFX9-NEXT:    s_addc_u32 s23, s35, s23
1621; GFX9-NEXT:    s_mul_i32 s24, s4, s10
1622; GFX9-NEXT:    s_mul_hi_u32 s35, s4, s10
1623; GFX9-NEXT:    s_add_u32 s22, s24, s22
1624; GFX9-NEXT:    s_addc_u32 s23, s35, s23
1625; GFX9-NEXT:    s_mul_i32 s24, s5, s9
1626; GFX9-NEXT:    s_mul_hi_u32 s35, s5, s9
1627; GFX9-NEXT:    s_add_u32 s22, s24, s22
1628; GFX9-NEXT:    s_addc_u32 s23, s35, s23
1629; GFX9-NEXT:    s_mul_i32 s24, s6, s8
1630; GFX9-NEXT:    s_mul_hi_u32 s35, s6, s8
1631; GFX9-NEXT:    s_add_u32 s22, s24, s22
1632; GFX9-NEXT:    s_addc_u32 s23, s35, s23
1633; GFX9-NEXT:    s_mul_i32 s24, s16, s13
1634; GFX9-NEXT:    s_mul_hi_u32 s35, s16, s13
1635; GFX9-NEXT:    s_add_u32 s24, s24, s25
1636; GFX9-NEXT:    s_addc_u32 s22, s35, s22
1637; GFX9-NEXT:    s_mul_i32 s35, s1, s12
1638; GFX9-NEXT:    s_cselect_b32 s25, 1, 0
1639; GFX9-NEXT:    s_mul_hi_u32 s36, s1, s12
1640; GFX9-NEXT:    s_add_u32 s24, s35, s24
1641; GFX9-NEXT:    s_addc_u32 s22, s36, s22
1642; GFX9-NEXT:    s_mul_i32 s36, s2, s11
1643; GFX9-NEXT:    s_cselect_b32 s35, 1, 0
1644; GFX9-NEXT:    s_mul_hi_u32 s37, s2, s11
1645; GFX9-NEXT:    s_add_u32 s24, s36, s24
1646; GFX9-NEXT:    s_addc_u32 s22, s37, s22
1647; GFX9-NEXT:    s_mul_i32 s37, s3, s10
1648; GFX9-NEXT:    s_cselect_b32 s36, 1, 0
1649; GFX9-NEXT:    s_mul_hi_u32 s38, s3, s10
1650; GFX9-NEXT:    s_add_u32 s24, s37, s24
1651; GFX9-NEXT:    s_addc_u32 s22, s38, s22
1652; GFX9-NEXT:    s_mul_i32 s38, s4, s9
1653; GFX9-NEXT:    s_cselect_b32 s37, 1, 0
1654; GFX9-NEXT:    s_mul_hi_u32 s39, s4, s9
1655; GFX9-NEXT:    s_add_u32 s24, s38, s24
1656; GFX9-NEXT:    s_addc_u32 s22, s39, s22
1657; GFX9-NEXT:    s_mul_i32 s39, s5, s8
1658; GFX9-NEXT:    s_cselect_b32 s38, 1, 0
1659; GFX9-NEXT:    s_mul_hi_u32 s40, s5, s8
1660; GFX9-NEXT:    s_add_u32 s24, s39, s24
1661; GFX9-NEXT:    s_addc_u32 s22, s40, s22
1662; GFX9-NEXT:    s_cselect_b32 s39, 1, 0
1663; GFX9-NEXT:    s_cmp_lg_u32 s31, 0
1664; GFX9-NEXT:    s_addc_u32 s30, s30, 0
1665; GFX9-NEXT:    s_cmp_lg_u32 s33, 0
1666; GFX9-NEXT:    s_addc_u32 s30, s30, 0
1667; GFX9-NEXT:    s_cmp_lg_u32 s34, 0
1668; GFX9-NEXT:    s_addc_u32 s30, s30, 0
1669; GFX9-NEXT:    s_cmp_lg_u32 s21, 0
1670; GFX9-NEXT:    s_addc_u32 s21, s30, s24
1671; GFX9-NEXT:    s_cselect_b32 s24, 1, 0
1672; GFX9-NEXT:    s_cmp_lg_u32 s27, 0
1673; GFX9-NEXT:    s_addc_u32 s26, s26, 0
1674; GFX9-NEXT:    s_cmp_lg_u32 s28, 0
1675; GFX9-NEXT:    s_addc_u32 s26, s26, 0
1676; GFX9-NEXT:    s_cmp_lg_u32 s29, 0
1677; GFX9-NEXT:    s_addc_u32 s26, s26, 0
1678; GFX9-NEXT:    s_cmp_lg_u32 s24, 0
1679; GFX9-NEXT:    s_addc_u32 s22, s26, s22
1680; GFX9-NEXT:    s_mul_i32 s16, s16, s15
1681; GFX9-NEXT:    s_addc_u32 s15, s23, s16
1682; GFX9-NEXT:    s_mul_i32 s1, s1, s14
1683; GFX9-NEXT:    s_cmp_lg_u32 s39, 0
1684; GFX9-NEXT:    s_addc_u32 s1, s15, s1
1685; GFX9-NEXT:    s_mul_i32 s2, s2, s13
1686; GFX9-NEXT:    s_cmp_lg_u32 s38, 0
1687; GFX9-NEXT:    s_addc_u32 s1, s1, s2
1688; GFX9-NEXT:    s_mul_i32 s3, s3, s12
1689; GFX9-NEXT:    s_cmp_lg_u32 s37, 0
1690; GFX9-NEXT:    s_addc_u32 s1, s1, s3
1691; GFX9-NEXT:    s_mul_i32 s4, s4, s11
1692; GFX9-NEXT:    s_cmp_lg_u32 s36, 0
1693; GFX9-NEXT:    s_addc_u32 s1, s1, s4
1694; GFX9-NEXT:    s_mul_i32 s5, s5, s10
1695; GFX9-NEXT:    s_cmp_lg_u32 s35, 0
1696; GFX9-NEXT:    s_addc_u32 s1, s1, s5
1697; GFX9-NEXT:    s_mul_i32 s6, s6, s9
1698; GFX9-NEXT:    s_cmp_lg_u32 s25, 0
1699; GFX9-NEXT:    s_addc_u32 s1, s1, s6
1700; GFX9-NEXT:    s_mul_i32 s7, s7, s8
1701; GFX9-NEXT:    s_mul_i32 s0, s0, s8
1702; GFX9-NEXT:    s_add_u32 s7, s7, s1
1703; GFX9-NEXT:    s_mov_b32 s1, s17
1704; GFX9-NEXT:    s_mov_b32 s2, s18
1705; GFX9-NEXT:    s_mov_b32 s3, s19
1706; GFX9-NEXT:    s_mov_b32 s4, s20
1707; GFX9-NEXT:    s_mov_b32 s5, s21
1708; GFX9-NEXT:    s_mov_b32 s6, s22
1709; GFX9-NEXT:    ; return to shader part epilog
1710;
1711; GFX10PLUS-LABEL: s_mul_i256:
1712; GFX10PLUS:       ; %bb.0:
1713; GFX10PLUS-NEXT:    s_mul_i32 s17, s0, s10
1714; GFX10PLUS-NEXT:    s_mul_i32 s19, s1, s9
1715; GFX10PLUS-NEXT:    s_mul_hi_u32 s18, s0, s10
1716; GFX10PLUS-NEXT:    s_mul_hi_u32 s20, s1, s9
1717; GFX10PLUS-NEXT:    s_add_u32 s17, s19, s17
1718; GFX10PLUS-NEXT:    s_addc_u32 s18, s20, s18
1719; GFX10PLUS-NEXT:    s_mul_i32 s20, s2, s8
1720; GFX10PLUS-NEXT:    s_mul_hi_u32 s21, s2, s8
1721; GFX10PLUS-NEXT:    s_cselect_b32 s19, 1, 0
1722; GFX10PLUS-NEXT:    s_add_u32 s17, s20, s17
1723; GFX10PLUS-NEXT:    s_mul_hi_u32 s16, s0, s8
1724; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
1725; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s9
1726; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s9
1727; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
1728; GFX10PLUS-NEXT:    s_add_u32 s16, s21, s16
1729; GFX10PLUS-NEXT:    s_addc_u32 s17, s22, s17
1730; GFX10PLUS-NEXT:    s_mul_i32 s22, s1, s8
1731; GFX10PLUS-NEXT:    s_mul_hi_u32 s23, s1, s8
1732; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
1733; GFX10PLUS-NEXT:    s_add_u32 s16, s22, s16
1734; GFX10PLUS-NEXT:    s_addc_u32 s17, s23, s17
1735; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s12
1736; GFX10PLUS-NEXT:    s_mul_i32 s25, s1, s11
1737; GFX10PLUS-NEXT:    s_mul_hi_u32 s24, s0, s12
1738; GFX10PLUS-NEXT:    s_mul_hi_u32 s26, s1, s11
1739; GFX10PLUS-NEXT:    s_cselect_b32 s22, 1, 0
1740; GFX10PLUS-NEXT:    s_add_u32 s23, s25, s23
1741; GFX10PLUS-NEXT:    s_addc_u32 s24, s26, s24
1742; GFX10PLUS-NEXT:    s_mul_i32 s26, s2, s10
1743; GFX10PLUS-NEXT:    s_mul_hi_u32 s27, s2, s10
1744; GFX10PLUS-NEXT:    s_cselect_b32 s25, 1, 0
1745; GFX10PLUS-NEXT:    s_add_u32 s23, s26, s23
1746; GFX10PLUS-NEXT:    s_addc_u32 s24, s27, s24
1747; GFX10PLUS-NEXT:    s_mul_i32 s27, s3, s9
1748; GFX10PLUS-NEXT:    s_mul_hi_u32 s28, s3, s9
1749; GFX10PLUS-NEXT:    s_cselect_b32 s26, 1, 0
1750; GFX10PLUS-NEXT:    s_add_u32 s23, s27, s23
1751; GFX10PLUS-NEXT:    s_addc_u32 s24, s28, s24
1752; GFX10PLUS-NEXT:    s_mul_i32 s28, s4, s8
1753; GFX10PLUS-NEXT:    s_mul_hi_u32 s29, s4, s8
1754; GFX10PLUS-NEXT:    s_cselect_b32 s27, 1, 0
1755; GFX10PLUS-NEXT:    s_add_u32 s23, s28, s23
1756; GFX10PLUS-NEXT:    s_addc_u32 s24, s29, s24
1757; GFX10PLUS-NEXT:    s_mul_i32 s29, s0, s11
1758; GFX10PLUS-NEXT:    s_mul_hi_u32 s30, s0, s11
1759; GFX10PLUS-NEXT:    s_cselect_b32 s28, 1, 0
1760; GFX10PLUS-NEXT:    s_add_u32 s18, s29, s18
1761; GFX10PLUS-NEXT:    s_addc_u32 s23, s30, s23
1762; GFX10PLUS-NEXT:    s_mul_i32 s30, s1, s10
1763; GFX10PLUS-NEXT:    s_mul_hi_u32 s31, s1, s10
1764; GFX10PLUS-NEXT:    s_cselect_b32 s29, 1, 0
1765; GFX10PLUS-NEXT:    s_add_u32 s18, s30, s18
1766; GFX10PLUS-NEXT:    s_addc_u32 s23, s31, s23
1767; GFX10PLUS-NEXT:    s_mul_i32 s31, s2, s9
1768; GFX10PLUS-NEXT:    s_mul_hi_u32 s33, s2, s9
1769; GFX10PLUS-NEXT:    s_cselect_b32 s30, 1, 0
1770; GFX10PLUS-NEXT:    s_add_u32 s18, s31, s18
1771; GFX10PLUS-NEXT:    s_addc_u32 s23, s33, s23
1772; GFX10PLUS-NEXT:    s_mul_i32 s33, s3, s8
1773; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s8
1774; GFX10PLUS-NEXT:    s_cselect_b32 s31, 1, 0
1775; GFX10PLUS-NEXT:    s_add_u32 s18, s33, s18
1776; GFX10PLUS-NEXT:    s_addc_u32 s23, s34, s23
1777; GFX10PLUS-NEXT:    s_cselect_b32 s33, 1, 0
1778; GFX10PLUS-NEXT:    s_cmp_lg_u32 s22, 0
1779; GFX10PLUS-NEXT:    s_mul_hi_u32 s22, s0, s14
1780; GFX10PLUS-NEXT:    s_addc_u32 s18, s21, s18
1781; GFX10PLUS-NEXT:    s_cselect_b32 s21, 1, 0
1782; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
1783; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s1, s13
1784; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, 0
1785; GFX10PLUS-NEXT:    s_cmp_lg_u32 s21, 0
1786; GFX10PLUS-NEXT:    s_mul_i32 s21, s0, s14
1787; GFX10PLUS-NEXT:    s_addc_u32 s19, s19, s23
1788; GFX10PLUS-NEXT:    s_mul_i32 s23, s1, s13
1789; GFX10PLUS-NEXT:    s_cselect_b32 s20, 1, 0
1790; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
1791; GFX10PLUS-NEXT:    s_mul_i32 s23, s2, s12
1792; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
1793; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s2, s12
1794; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
1795; GFX10PLUS-NEXT:    s_mul_i32 s23, s3, s11
1796; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
1797; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s3, s11
1798; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
1799; GFX10PLUS-NEXT:    s_mul_i32 s23, s4, s10
1800; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
1801; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s4, s10
1802; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
1803; GFX10PLUS-NEXT:    s_mul_i32 s23, s5, s9
1804; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
1805; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s5, s9
1806; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
1807; GFX10PLUS-NEXT:    s_mul_i32 s23, s6, s8
1808; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
1809; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s6, s8
1810; GFX10PLUS-NEXT:    s_add_u32 s21, s23, s21
1811; GFX10PLUS-NEXT:    s_mul_i32 s23, s0, s13
1812; GFX10PLUS-NEXT:    s_addc_u32 s22, s34, s22
1813; GFX10PLUS-NEXT:    s_mul_hi_u32 s34, s0, s13
1814; GFX10PLUS-NEXT:    s_add_u32 s23, s23, s24
1815; GFX10PLUS-NEXT:    s_addc_u32 s21, s34, s21
1816; GFX10PLUS-NEXT:    s_mul_i32 s34, s1, s12
1817; GFX10PLUS-NEXT:    s_mul_hi_u32 s35, s1, s12
1818; GFX10PLUS-NEXT:    s_cselect_b32 s24, 1, 0
1819; GFX10PLUS-NEXT:    s_add_u32 s23, s34, s23
1820; GFX10PLUS-NEXT:    s_addc_u32 s21, s35, s21
1821; GFX10PLUS-NEXT:    s_mul_i32 s35, s2, s11
1822; GFX10PLUS-NEXT:    s_mul_hi_u32 s36, s2, s11
1823; GFX10PLUS-NEXT:    s_cselect_b32 s34, 1, 0
1824; GFX10PLUS-NEXT:    s_add_u32 s23, s35, s23
1825; GFX10PLUS-NEXT:    s_addc_u32 s21, s36, s21
1826; GFX10PLUS-NEXT:    s_mul_i32 s36, s3, s10
1827; GFX10PLUS-NEXT:    s_mul_hi_u32 s37, s3, s10
1828; GFX10PLUS-NEXT:    s_cselect_b32 s35, 1, 0
1829; GFX10PLUS-NEXT:    s_add_u32 s23, s36, s23
1830; GFX10PLUS-NEXT:    s_addc_u32 s21, s37, s21
1831; GFX10PLUS-NEXT:    s_mul_i32 s37, s4, s9
1832; GFX10PLUS-NEXT:    s_mul_hi_u32 s38, s4, s9
1833; GFX10PLUS-NEXT:    s_cselect_b32 s36, 1, 0
1834; GFX10PLUS-NEXT:    s_add_u32 s23, s37, s23
1835; GFX10PLUS-NEXT:    s_addc_u32 s21, s38, s21
1836; GFX10PLUS-NEXT:    s_mul_i32 s38, s5, s8
1837; GFX10PLUS-NEXT:    s_mul_hi_u32 s39, s5, s8
1838; GFX10PLUS-NEXT:    s_cselect_b32 s37, 1, 0
1839; GFX10PLUS-NEXT:    s_add_u32 s23, s38, s23
1840; GFX10PLUS-NEXT:    s_addc_u32 s21, s39, s21
1841; GFX10PLUS-NEXT:    s_cselect_b32 s38, 1, 0
1842; GFX10PLUS-NEXT:    s_cmp_lg_u32 s30, 0
1843; GFX10PLUS-NEXT:    s_mul_i32 s1, s1, s14
1844; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
1845; GFX10PLUS-NEXT:    s_cmp_lg_u32 s31, 0
1846; GFX10PLUS-NEXT:    s_mul_i32 s2, s2, s13
1847; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
1848; GFX10PLUS-NEXT:    s_cmp_lg_u32 s33, 0
1849; GFX10PLUS-NEXT:    s_mul_i32 s3, s3, s12
1850; GFX10PLUS-NEXT:    s_addc_u32 s29, s29, 0
1851; GFX10PLUS-NEXT:    s_cmp_lg_u32 s20, 0
1852; GFX10PLUS-NEXT:    s_mul_i32 s4, s4, s11
1853; GFX10PLUS-NEXT:    s_addc_u32 s20, s29, s23
1854; GFX10PLUS-NEXT:    s_cselect_b32 s23, 1, 0
1855; GFX10PLUS-NEXT:    s_cmp_lg_u32 s26, 0
1856; GFX10PLUS-NEXT:    s_mul_i32 s26, s0, s15
1857; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
1858; GFX10PLUS-NEXT:    s_cmp_lg_u32 s27, 0
1859; GFX10PLUS-NEXT:    s_mul_i32 s5, s5, s10
1860; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
1861; GFX10PLUS-NEXT:    s_cmp_lg_u32 s28, 0
1862; GFX10PLUS-NEXT:    s_mul_i32 s6, s6, s9
1863; GFX10PLUS-NEXT:    s_addc_u32 s25, s25, 0
1864; GFX10PLUS-NEXT:    s_cmp_lg_u32 s23, 0
1865; GFX10PLUS-NEXT:    s_mul_i32 s7, s7, s8
1866; GFX10PLUS-NEXT:    s_addc_u32 s15, s25, s21
1867; GFX10PLUS-NEXT:    s_addc_u32 s21, s22, s26
1868; GFX10PLUS-NEXT:    s_cmp_lg_u32 s38, 0
1869; GFX10PLUS-NEXT:    s_mul_i32 s0, s0, s8
1870; GFX10PLUS-NEXT:    s_addc_u32 s1, s21, s1
1871; GFX10PLUS-NEXT:    s_cmp_lg_u32 s37, 0
1872; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s2
1873; GFX10PLUS-NEXT:    s_cmp_lg_u32 s36, 0
1874; GFX10PLUS-NEXT:    s_mov_b32 s2, s17
1875; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s3
1876; GFX10PLUS-NEXT:    s_cmp_lg_u32 s35, 0
1877; GFX10PLUS-NEXT:    s_mov_b32 s3, s18
1878; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s4
1879; GFX10PLUS-NEXT:    s_cmp_lg_u32 s34, 0
1880; GFX10PLUS-NEXT:    s_mov_b32 s4, s19
1881; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s5
1882; GFX10PLUS-NEXT:    s_cmp_lg_u32 s24, 0
1883; GFX10PLUS-NEXT:    s_mov_b32 s5, s20
1884; GFX10PLUS-NEXT:    s_addc_u32 s1, s1, s6
1885; GFX10PLUS-NEXT:    s_mov_b32 s6, s15
1886; GFX10PLUS-NEXT:    s_add_i32 s7, s1, s7
1887; GFX10PLUS-NEXT:    s_mov_b32 s1, s16
1888; GFX10PLUS-NEXT:    ; return to shader part epilog
1889;
1890; GFX12-LABEL: s_mul_i256:
1891; GFX12:       ; %bb.0:
1892; GFX12-NEXT:    s_mul_i32 s17, s0, s10
1893; GFX12-NEXT:    s_mul_i32 s19, s1, s9
1894; GFX12-NEXT:    s_mul_hi_u32 s18, s0, s10
1895; GFX12-NEXT:    s_mul_hi_u32 s20, s1, s9
1896; GFX12-NEXT:    s_add_co_u32 s17, s19, s17
1897; GFX12-NEXT:    s_add_co_ci_u32 s18, s20, s18
1898; GFX12-NEXT:    s_mul_i32 s20, s2, s8
1899; GFX12-NEXT:    s_mul_hi_u32 s21, s2, s8
1900; GFX12-NEXT:    s_cselect_b32 s19, 1, 0
1901; GFX12-NEXT:    s_add_co_u32 s17, s20, s17
1902; GFX12-NEXT:    s_mul_hi_u32 s16, s0, s8
1903; GFX12-NEXT:    s_add_co_ci_u32 s18, s21, s18
1904; GFX12-NEXT:    s_mul_i32 s21, s0, s9
1905; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s9
1906; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
1907; GFX12-NEXT:    s_add_co_u32 s16, s21, s16
1908; GFX12-NEXT:    s_add_co_ci_u32 s17, s22, s17
1909; GFX12-NEXT:    s_mul_i32 s22, s1, s8
1910; GFX12-NEXT:    s_mul_hi_u32 s23, s1, s8
1911; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
1912; GFX12-NEXT:    s_add_co_u32 s16, s22, s16
1913; GFX12-NEXT:    s_add_co_ci_u32 s17, s23, s17
1914; GFX12-NEXT:    s_mul_i32 s23, s0, s12
1915; GFX12-NEXT:    s_mul_i32 s25, s1, s11
1916; GFX12-NEXT:    s_mul_hi_u32 s24, s0, s12
1917; GFX12-NEXT:    s_mul_hi_u32 s26, s1, s11
1918; GFX12-NEXT:    s_cselect_b32 s22, 1, 0
1919; GFX12-NEXT:    s_add_co_u32 s23, s25, s23
1920; GFX12-NEXT:    s_add_co_ci_u32 s24, s26, s24
1921; GFX12-NEXT:    s_mul_i32 s26, s2, s10
1922; GFX12-NEXT:    s_mul_hi_u32 s27, s2, s10
1923; GFX12-NEXT:    s_cselect_b32 s25, 1, 0
1924; GFX12-NEXT:    s_add_co_u32 s23, s26, s23
1925; GFX12-NEXT:    s_add_co_ci_u32 s24, s27, s24
1926; GFX12-NEXT:    s_mul_i32 s27, s3, s9
1927; GFX12-NEXT:    s_mul_hi_u32 s28, s3, s9
1928; GFX12-NEXT:    s_cselect_b32 s26, 1, 0
1929; GFX12-NEXT:    s_add_co_u32 s23, s27, s23
1930; GFX12-NEXT:    s_add_co_ci_u32 s24, s28, s24
1931; GFX12-NEXT:    s_mul_i32 s28, s4, s8
1932; GFX12-NEXT:    s_mul_hi_u32 s29, s4, s8
1933; GFX12-NEXT:    s_cselect_b32 s27, 1, 0
1934; GFX12-NEXT:    s_add_co_u32 s23, s28, s23
1935; GFX12-NEXT:    s_add_co_ci_u32 s24, s29, s24
1936; GFX12-NEXT:    s_mul_i32 s29, s0, s11
1937; GFX12-NEXT:    s_mul_hi_u32 s30, s0, s11
1938; GFX12-NEXT:    s_cselect_b32 s28, 1, 0
1939; GFX12-NEXT:    s_add_co_u32 s18, s29, s18
1940; GFX12-NEXT:    s_add_co_ci_u32 s23, s30, s23
1941; GFX12-NEXT:    s_mul_i32 s30, s1, s10
1942; GFX12-NEXT:    s_mul_hi_u32 s31, s1, s10
1943; GFX12-NEXT:    s_cselect_b32 s29, 1, 0
1944; GFX12-NEXT:    s_add_co_u32 s18, s30, s18
1945; GFX12-NEXT:    s_add_co_ci_u32 s23, s31, s23
1946; GFX12-NEXT:    s_mul_i32 s31, s2, s9
1947; GFX12-NEXT:    s_mul_hi_u32 s33, s2, s9
1948; GFX12-NEXT:    s_cselect_b32 s30, 1, 0
1949; GFX12-NEXT:    s_add_co_u32 s18, s31, s18
1950; GFX12-NEXT:    s_add_co_ci_u32 s23, s33, s23
1951; GFX12-NEXT:    s_mul_i32 s33, s3, s8
1952; GFX12-NEXT:    s_mul_hi_u32 s34, s3, s8
1953; GFX12-NEXT:    s_cselect_b32 s31, 1, 0
1954; GFX12-NEXT:    s_add_co_u32 s18, s33, s18
1955; GFX12-NEXT:    s_add_co_ci_u32 s23, s34, s23
1956; GFX12-NEXT:    s_cselect_b32 s33, 1, 0
1957; GFX12-NEXT:    s_cmp_lg_u32 s22, 0
1958; GFX12-NEXT:    s_mul_hi_u32 s22, s0, s14
1959; GFX12-NEXT:    s_add_co_ci_u32 s18, s21, s18
1960; GFX12-NEXT:    s_cselect_b32 s21, 1, 0
1961; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
1962; GFX12-NEXT:    s_mul_hi_u32 s34, s1, s13
1963; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, 0
1964; GFX12-NEXT:    s_cmp_lg_u32 s21, 0
1965; GFX12-NEXT:    s_mul_i32 s21, s0, s14
1966; GFX12-NEXT:    s_add_co_ci_u32 s19, s19, s23
1967; GFX12-NEXT:    s_mul_i32 s23, s1, s13
1968; GFX12-NEXT:    s_cselect_b32 s20, 1, 0
1969; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
1970; GFX12-NEXT:    s_mul_i32 s23, s2, s12
1971; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
1972; GFX12-NEXT:    s_mul_hi_u32 s34, s2, s12
1973; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
1974; GFX12-NEXT:    s_mul_i32 s23, s3, s11
1975; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
1976; GFX12-NEXT:    s_mul_hi_u32 s34, s3, s11
1977; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
1978; GFX12-NEXT:    s_mul_i32 s23, s4, s10
1979; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
1980; GFX12-NEXT:    s_mul_hi_u32 s34, s4, s10
1981; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
1982; GFX12-NEXT:    s_mul_i32 s23, s5, s9
1983; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
1984; GFX12-NEXT:    s_mul_hi_u32 s34, s5, s9
1985; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
1986; GFX12-NEXT:    s_mul_i32 s23, s6, s8
1987; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
1988; GFX12-NEXT:    s_mul_hi_u32 s34, s6, s8
1989; GFX12-NEXT:    s_add_co_u32 s21, s23, s21
1990; GFX12-NEXT:    s_mul_i32 s23, s0, s13
1991; GFX12-NEXT:    s_add_co_ci_u32 s22, s34, s22
1992; GFX12-NEXT:    s_mul_hi_u32 s34, s0, s13
1993; GFX12-NEXT:    s_add_co_u32 s23, s23, s24
1994; GFX12-NEXT:    s_add_co_ci_u32 s21, s34, s21
1995; GFX12-NEXT:    s_mul_i32 s34, s1, s12
1996; GFX12-NEXT:    s_mul_hi_u32 s35, s1, s12
1997; GFX12-NEXT:    s_cselect_b32 s24, 1, 0
1998; GFX12-NEXT:    s_add_co_u32 s23, s34, s23
1999; GFX12-NEXT:    s_add_co_ci_u32 s21, s35, s21
2000; GFX12-NEXT:    s_mul_i32 s35, s2, s11
2001; GFX12-NEXT:    s_mul_hi_u32 s36, s2, s11
2002; GFX12-NEXT:    s_cselect_b32 s34, 1, 0
2003; GFX12-NEXT:    s_add_co_u32 s23, s35, s23
2004; GFX12-NEXT:    s_add_co_ci_u32 s21, s36, s21
2005; GFX12-NEXT:    s_mul_i32 s36, s3, s10
2006; GFX12-NEXT:    s_mul_hi_u32 s37, s3, s10
2007; GFX12-NEXT:    s_cselect_b32 s35, 1, 0
2008; GFX12-NEXT:    s_add_co_u32 s23, s36, s23
2009; GFX12-NEXT:    s_add_co_ci_u32 s21, s37, s21
2010; GFX12-NEXT:    s_mul_i32 s37, s4, s9
2011; GFX12-NEXT:    s_mul_hi_u32 s38, s4, s9
2012; GFX12-NEXT:    s_cselect_b32 s36, 1, 0
2013; GFX12-NEXT:    s_add_co_u32 s23, s37, s23
2014; GFX12-NEXT:    s_add_co_ci_u32 s21, s38, s21
2015; GFX12-NEXT:    s_mul_i32 s38, s5, s8
2016; GFX12-NEXT:    s_mul_hi_u32 s39, s5, s8
2017; GFX12-NEXT:    s_cselect_b32 s37, 1, 0
2018; GFX12-NEXT:    s_add_co_u32 s23, s38, s23
2019; GFX12-NEXT:    s_add_co_ci_u32 s21, s39, s21
2020; GFX12-NEXT:    s_cselect_b32 s38, 1, 0
2021; GFX12-NEXT:    s_cmp_lg_u32 s30, 0
2022; GFX12-NEXT:    s_mul_i32 s1, s1, s14
2023; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
2024; GFX12-NEXT:    s_cmp_lg_u32 s31, 0
2025; GFX12-NEXT:    s_mul_i32 s2, s2, s13
2026; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
2027; GFX12-NEXT:    s_cmp_lg_u32 s33, 0
2028; GFX12-NEXT:    s_mul_i32 s3, s3, s12
2029; GFX12-NEXT:    s_add_co_ci_u32 s29, s29, 0
2030; GFX12-NEXT:    s_cmp_lg_u32 s20, 0
2031; GFX12-NEXT:    s_mul_i32 s4, s4, s11
2032; GFX12-NEXT:    s_add_co_ci_u32 s20, s29, s23
2033; GFX12-NEXT:    s_cselect_b32 s23, 1, 0
2034; GFX12-NEXT:    s_cmp_lg_u32 s26, 0
2035; GFX12-NEXT:    s_mul_i32 s26, s0, s15
2036; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
2037; GFX12-NEXT:    s_cmp_lg_u32 s27, 0
2038; GFX12-NEXT:    s_mul_i32 s5, s5, s10
2039; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
2040; GFX12-NEXT:    s_cmp_lg_u32 s28, 0
2041; GFX12-NEXT:    s_mul_i32 s6, s6, s9
2042; GFX12-NEXT:    s_add_co_ci_u32 s25, s25, 0
2043; GFX12-NEXT:    s_cmp_lg_u32 s23, 0
2044; GFX12-NEXT:    s_mul_i32 s7, s7, s8
2045; GFX12-NEXT:    s_add_co_ci_u32 s15, s25, s21
2046; GFX12-NEXT:    s_add_co_ci_u32 s21, s22, s26
2047; GFX12-NEXT:    s_cmp_lg_u32 s38, 0
2048; GFX12-NEXT:    s_mul_i32 s0, s0, s8
2049; GFX12-NEXT:    s_add_co_ci_u32 s1, s21, s1
2050; GFX12-NEXT:    s_cmp_lg_u32 s37, 0
2051; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s2
2052; GFX12-NEXT:    s_cmp_lg_u32 s36, 0
2053; GFX12-NEXT:    s_mov_b32 s2, s17
2054; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s3
2055; GFX12-NEXT:    s_cmp_lg_u32 s35, 0
2056; GFX12-NEXT:    s_mov_b32 s3, s18
2057; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s4
2058; GFX12-NEXT:    s_cmp_lg_u32 s34, 0
2059; GFX12-NEXT:    s_mov_b32 s4, s19
2060; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s5
2061; GFX12-NEXT:    s_cmp_lg_u32 s24, 0
2062; GFX12-NEXT:    s_mov_b32 s5, s20
2063; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, s6
2064; GFX12-NEXT:    s_mov_b32 s6, s15
2065; GFX12-NEXT:    s_add_co_i32 s7, s1, s7
2066; GFX12-NEXT:    s_mov_b32 s1, s16
2067; GFX12-NEXT:    ; return to shader part epilog
2068  %result = mul i256 %num, %den
2069  %cast = bitcast i256 %result to <8 x i32>
2070  ret <8 x i32> %cast
2071}
2072
2073define i256 @v_mul_i256(i256 %num, i256 %den) {
2074; GFX7-LABEL: v_mul_i256:
2075; GFX7:       ; %bb.0:
2076; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2077; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
2078; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
2079; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
2080; GFX7-NEXT:    v_mul_lo_u32 v28, v4, v11
2081; GFX7-NEXT:    v_mul_lo_u32 v27, v5, v10
2082; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
2083; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
2084; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
2085; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
2086; GFX7-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
2087; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
2088; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
2089; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
2090; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
2091; GFX7-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
2092; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
2093; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
2094; GFX7-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
2095; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
2096; GFX7-NEXT:    v_addc_u32_e32 v25, vcc, 0, v20, vcc
2097; GFX7-NEXT:    v_mov_b32_e32 v20, v18
2098; GFX7-NEXT:    v_mov_b32_e32 v18, v19
2099; GFX7-NEXT:    v_mov_b32_e32 v19, v16
2100; GFX7-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
2101; GFX7-NEXT:    v_mul_lo_u32 v16, v6, v9
2102; GFX7-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
2103; GFX7-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
2104; GFX7-NEXT:    v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
2105; GFX7-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
2106; GFX7-NEXT:    v_mov_b32_e32 v19, v22
2107; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
2108; GFX7-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
2109; GFX7-NEXT:    v_mul_lo_u32 v24, v3, v12
2110; GFX7-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
2111; GFX7-NEXT:    v_mul_lo_u32 v22, v2, v13
2112; GFX7-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
2113; GFX7-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
2114; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
2115; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
2116; GFX7-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
2117; GFX7-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
2118; GFX7-NEXT:    v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
2119; GFX7-NEXT:    v_mov_b32_e32 v20, v11
2120; GFX7-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
2121; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
2122; GFX7-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
2123; GFX7-NEXT:    v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
2124; GFX7-NEXT:    v_mul_lo_u32 v9, v1, v14
2125; GFX7-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
2126; GFX7-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
2127; GFX7-NEXT:    v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
2128; GFX7-NEXT:    v_mul_lo_u32 v0, v0, v15
2129; GFX7-NEXT:    v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
2130; GFX7-NEXT:    v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
2131; GFX7-NEXT:    v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
2132; GFX7-NEXT:    v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
2133; GFX7-NEXT:    v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
2134; GFX7-NEXT:    v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
2135; GFX7-NEXT:    v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
2136; GFX7-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
2137; GFX7-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
2138; GFX7-NEXT:    v_addc_u32_e32 v0, vcc, v0, v16, vcc
2139; GFX7-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
2140; GFX7-NEXT:    v_mov_b32_e32 v0, v10
2141; GFX7-NEXT:    s_setpc_b64 s[30:31]
2142;
2143; GFX8-LABEL: v_mul_i256:
2144; GFX8:       ; %bb.0:
2145; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2146; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
2147; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
2148; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
2149; GFX8-NEXT:    v_mul_lo_u32 v28, v4, v11
2150; GFX8-NEXT:    v_mul_lo_u32 v27, v5, v10
2151; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
2152; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
2153; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
2154; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
2155; GFX8-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
2156; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
2157; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
2158; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
2159; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
2160; GFX8-NEXT:    v_addc_u32_e32 v20, vcc, 0, v20, vcc
2161; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
2162; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
2163; GFX8-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
2164; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
2165; GFX8-NEXT:    v_addc_u32_e32 v25, vcc, 0, v20, vcc
2166; GFX8-NEXT:    v_mov_b32_e32 v20, v18
2167; GFX8-NEXT:    v_mov_b32_e32 v18, v19
2168; GFX8-NEXT:    v_mov_b32_e32 v19, v16
2169; GFX8-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
2170; GFX8-NEXT:    v_mul_lo_u32 v16, v6, v9
2171; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
2172; GFX8-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
2173; GFX8-NEXT:    v_addc_u32_e64 v26, s[4:5], 0, v6, s[4:5]
2174; GFX8-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
2175; GFX8-NEXT:    v_mov_b32_e32 v19, v22
2176; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
2177; GFX8-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
2178; GFX8-NEXT:    v_mul_lo_u32 v24, v3, v12
2179; GFX8-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
2180; GFX8-NEXT:    v_mul_lo_u32 v22, v2, v13
2181; GFX8-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
2182; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
2183; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
2184; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], 0, v4, s[12:13]
2185; GFX8-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
2186; GFX8-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
2187; GFX8-NEXT:    v_addc_u32_e64 v2, s[12:13], 0, v4, s[12:13]
2188; GFX8-NEXT:    v_mov_b32_e32 v20, v11
2189; GFX8-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
2190; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
2191; GFX8-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
2192; GFX8-NEXT:    v_addc_u32_e64 v11, s[12:13], 0, v2, s[12:13]
2193; GFX8-NEXT:    v_mul_lo_u32 v9, v1, v14
2194; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
2195; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
2196; GFX8-NEXT:    v_addc_u32_e64 v3, s[12:13], v12, v3, s[12:13]
2197; GFX8-NEXT:    v_mul_lo_u32 v0, v0, v15
2198; GFX8-NEXT:    v_addc_u32_e64 v4, s[12:13], v26, v4, s[12:13]
2199; GFX8-NEXT:    v_addc_u32_e64 v5, s[12:13], v11, v5, s[12:13]
2200; GFX8-NEXT:    v_addc_u32_e64 v6, s[12:13], v25, v6, s[12:13]
2201; GFX8-NEXT:    v_addc_u32_e64 v0, s[12:13], v17, v0, s[12:13]
2202; GFX8-NEXT:    v_addc_u32_e64 v0, s[12:13], v0, v9, s[14:15]
2203; GFX8-NEXT:    v_addc_u32_e64 v0, s[10:11], v0, v22, s[10:11]
2204; GFX8-NEXT:    v_addc_u32_e64 v0, s[8:9], v0, v24, s[8:9]
2205; GFX8-NEXT:    v_addc_u32_e64 v0, s[6:7], v0, v28, s[6:7]
2206; GFX8-NEXT:    v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5]
2207; GFX8-NEXT:    v_addc_u32_e32 v0, vcc, v0, v16, vcc
2208; GFX8-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
2209; GFX8-NEXT:    v_mov_b32_e32 v0, v10
2210; GFX8-NEXT:    s_setpc_b64 s[30:31]
2211;
2212; GFX9-LABEL: v_mul_i256:
2213; GFX9:       ; %bb.0:
2214; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2215; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
2216; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0
2217; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17]
2218; GFX9-NEXT:    v_mul_lo_u32 v28, v4, v11
2219; GFX9-NEXT:    v_mul_lo_u32 v27, v5, v10
2220; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17]
2221; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v3, v11, v[16:17]
2222; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v4, v10, v[16:17]
2223; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19]
2224; GFX9-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s[4:5]
2225; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[16:17]
2226; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19]
2227; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
2228; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19]
2229; GFX9-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v20, vcc
2230; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v0, v10, 0
2231; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19]
2232; GFX9-NEXT:    v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17]
2233; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v1, v9, v[21:22]
2234; GFX9-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v20, vcc
2235; GFX9-NEXT:    v_mov_b32_e32 v20, v18
2236; GFX9-NEXT:    v_mov_b32_e32 v18, v19
2237; GFX9-NEXT:    v_mov_b32_e32 v19, v16
2238; GFX9-NEXT:    v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19]
2239; GFX9-NEXT:    v_mul_lo_u32 v16, v6, v9
2240; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
2241; GFX9-NEXT:    v_mad_u64_u32 v[21:22], s[4:5], v2, v8, v[21:22]
2242; GFX9-NEXT:    v_addc_co_u32_e64 v26, s[4:5], 0, v6, s[4:5]
2243; GFX9-NEXT:    v_mad_u64_u32 v[23:24], s[4:5], v1, v12, v[18:19]
2244; GFX9-NEXT:    v_mov_b32_e32 v19, v22
2245; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v0, v11, v[19:20]
2246; GFX9-NEXT:    v_mad_u64_u32 v[22:23], s[6:7], v2, v11, v[23:24]
2247; GFX9-NEXT:    v_mul_lo_u32 v24, v3, v12
2248; GFX9-NEXT:    v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[22:23]
2249; GFX9-NEXT:    v_mul_lo_u32 v22, v2, v13
2250; GFX9-NEXT:    v_mad_u64_u32 v[12:13], s[10:11], v4, v9, v[11:12]
2251; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[12:13]
2252; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[12:13], v1, v10, v[18:19]
2253; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], 0, v4, s[12:13]
2254; GFX9-NEXT:    v_mad_u64_u32 v[18:19], s[12:13], v2, v9, v[10:11]
2255; GFX9-NEXT:    v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0
2256; GFX9-NEXT:    v_addc_co_u32_e64 v2, s[12:13], 0, v4, s[12:13]
2257; GFX9-NEXT:    v_mov_b32_e32 v20, v11
2258; GFX9-NEXT:    v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21]
2259; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[12:13], v3, v8, v[18:19]
2260; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13]
2261; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[12:13], 0, v2, s[12:13]
2262; GFX9-NEXT:    v_mul_lo_u32 v9, v1, v14
2263; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[16:17]
2264; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[12:13], v1, v8, v[20:21]
2265; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[12:13], v12, v3, s[12:13]
2266; GFX9-NEXT:    v_mul_lo_u32 v0, v0, v15
2267; GFX9-NEXT:    v_addc_co_u32_e64 v4, s[12:13], v26, v4, s[12:13]
2268; GFX9-NEXT:    v_addc_co_u32_e64 v5, s[12:13], v11, v5, s[12:13]
2269; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[12:13], v25, v6, s[12:13]
2270; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[12:13], v17, v0, s[12:13]
2271; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[12:13], v0, v9, s[14:15]
2272; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[10:11], v0, v22, s[10:11]
2273; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[8:9], v0, v24, s[8:9]
2274; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[6:7], v0, v28, s[6:7]
2275; GFX9-NEXT:    v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5]
2276; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v16, vcc
2277; GFX9-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1]
2278; GFX9-NEXT:    v_mov_b32_e32 v0, v10
2279; GFX9-NEXT:    s_setpc_b64 s[30:31]
2280;
2281; GFX10-LABEL: v_mul_i256:
2282; GFX10:       ; %bb.0:
2283; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2284; GFX10-NEXT:    v_mov_b32_e32 v16, v0
2285; GFX10-NEXT:    v_mov_b32_e32 v17, v1
2286; GFX10-NEXT:    v_mul_lo_u32 v27, v6, v9
2287; GFX10-NEXT:    v_mul_lo_u32 v28, v5, v10
2288; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v14, 0
2289; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v16, v12, 0
2290; GFX10-NEXT:    v_mul_lo_u32 v30, v17, v14
2291; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1]
2292; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1]
2293; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19]
2294; GFX10-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s4
2295; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1]
2296; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2297; GFX10-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
2298; GFX10-NEXT:    v_mad_u64_u32 v[20:21], s4, v16, v10, 0
2299; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1]
2300; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2301; GFX10-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
2302; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1]
2303; GFX10-NEXT:    v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2304; GFX10-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
2305; GFX10-NEXT:    v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1]
2306; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21]
2307; GFX10-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s4
2308; GFX10-NEXT:    v_mov_b32_e32 v20, v22
2309; GFX10-NEXT:    v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
2310; GFX10-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
2311; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20]
2312; GFX10-NEXT:    v_mov_b32_e32 v20, v18
2313; GFX10-NEXT:    v_mov_b32_e32 v19, v22
2314; GFX10-NEXT:    v_mul_lo_u32 v22, v16, v15
2315; GFX10-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
2316; GFX10-NEXT:    v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20]
2317; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, v16, v8, 0
2318; GFX10-NEXT:    v_mul_lo_u32 v20, v4, v11
2319; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
2320; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25]
2321; GFX10-NEXT:    v_mul_lo_u32 v25, v3, v12
2322; GFX10-NEXT:    v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15]
2323; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
2324; GFX10-NEXT:    v_mul_lo_u32 v24, v2, v13
2325; GFX10-NEXT:    v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19]
2326; GFX10-NEXT:    v_mov_b32_e32 v13, v1
2327; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12]
2328; GFX10-NEXT:    v_mov_b32_e32 v14, v21
2329; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s6, 0, v6, s6
2330; GFX10-NEXT:    v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19]
2331; GFX10-NEXT:    v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14]
2332; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s8
2333; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2]
2334; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s8, 0, v6, s8
2335; GFX10-NEXT:    v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11]
2336; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13]
2337; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s9, v9, v3, s9
2338; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s9, v29, v4, s9
2339; GFX10-NEXT:    v_add_co_ci_u32_e64 v5, s9, v14, v5, s9
2340; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s9, v26, v6, s9
2341; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s9, v23, v22, s9
2342; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s8, v9, v30, s8
2343; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v24, s6
2344; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s6, v9, v25, s7
2345; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s5, v9, v20, s5
2346; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
2347; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4
2348; GFX10-NEXT:    v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10]
2349; GFX10-NEXT:    s_setpc_b64 s[30:31]
2350;
2351; GFX11-LABEL: v_mul_i256:
2352; GFX11:       ; %bb.0:
2353; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2354; GFX11-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
2355; GFX11-NEXT:    v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7
2356; GFX11-NEXT:    v_mul_lo_u32 v30, v4, v11
2357; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v14, 0
2358; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v16, v12, 0
2359; GFX11-NEXT:    v_mul_lo_u32 v29, v17, v14
2360; GFX11-NEXT:    v_mul_lo_u32 v28, v5, v10
2361; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1]
2362; GFX11-NEXT:    v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8]
2363; GFX11-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
2364; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1]
2365; GFX11-NEXT:    v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8]
2366; GFX11-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
2367; GFX11-NEXT:    v_mad_u64_u32 v[20:21], null, v16, v10, 0
2368; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1]
2369; GFX11-NEXT:    v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8]
2370; GFX11-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
2371; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1]
2372; GFX11-NEXT:    v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8]
2373; GFX11-NEXT:    v_add_co_ci_u32_e32 v27, vcc_lo, 0, v24, vcc_lo
2374; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1]
2375; GFX11-NEXT:    v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1]
2376; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21]
2377; GFX11-NEXT:    v_mov_b32_e32 v20, v8
2378; GFX11-NEXT:    v_cndmask_b32_e64 v26, 0, 1, s0
2379; GFX11-NEXT:    v_mov_b32_e32 v21, v22
2380; GFX11-NEXT:    v_mul_lo_u32 v22, v6, v9
2381; GFX11-NEXT:    v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1]
2382; GFX11-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo
2383; GFX11-NEXT:    v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21]
2384; GFX11-NEXT:    v_mov_b32_e32 v6, v25
2385; GFX11-NEXT:    v_mul_lo_u32 v25, v16, v15
2386; GFX11-NEXT:    v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1]
2387; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7]
2388; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v16, v18, 0
2389; GFX11-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s2
2390; GFX11-NEXT:    v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21]
2391; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7]
2392; GFX11-NEXT:    v_mul_lo_u32 v20, v2, v13
2393; GFX11-NEXT:    v_add_co_ci_u32_e64 v8, s2, 0, v8, s2
2394; GFX11-NEXT:    v_mov_b32_e32 v11, v1
2395; GFX11-NEXT:    v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15]
2396; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7]
2397; GFX11-NEXT:    v_mul_lo_u32 v21, v3, v12
2398; GFX11-NEXT:    v_mov_b32_e32 v12, v24
2399; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, s2, 0, v8, s2
2400; GFX11-NEXT:    v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14]
2401; GFX11-NEXT:    v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12]
2402; GFX11-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s4
2403; GFX11-NEXT:    v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2]
2404; GFX11-NEXT:    v_add_co_ci_u32_e64 v10, s4, 0, v10, s4
2405; GFX11-NEXT:    v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7]
2406; GFX11-NEXT:    v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9]
2407; GFX11-NEXT:    v_add_co_ci_u32_e64 v3, s5, v11, v3, s5
2408; GFX11-NEXT:    v_add_co_ci_u32_e64 v4, s5, v26, v4, s5
2409; GFX11-NEXT:    v_add_co_ci_u32_e64 v5, s5, v10, v5, s5
2410; GFX11-NEXT:    v_add_co_ci_u32_e64 v6, s5, v27, v6, s5
2411; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s5, v23, v25, s5
2412; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s4, v7, v29, s4
2413; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s2, v7, v20, s2
2414; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s2, v7, v21, s3
2415; GFX11-NEXT:    v_add_co_ci_u32_e64 v7, s1, v7, v30, s1
2416; GFX11-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v7, v28, vcc_lo
2417; GFX11-NEXT:    v_add_co_ci_u32_e64 v9, vcc_lo, v7, v22, s0
2418; GFX11-NEXT:    v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10]
2419; GFX11-NEXT:    s_setpc_b64 s[30:31]
2420;
2421; GFX12-LABEL: v_mul_i256:
2422; GFX12:       ; %bb.0:
2423; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2424; GFX12-NEXT:    s_wait_expcnt 0x0
2425; GFX12-NEXT:    s_wait_samplecnt 0x0
2426; GFX12-NEXT:    s_wait_bvhcnt 0x0
2427; GFX12-NEXT:    s_wait_kmcnt 0x0
2428; GFX12-NEXT:    v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
2429; GFX12-NEXT:    v_mul_lo_u32 v27, v6, v9
2430; GFX12-NEXT:    v_mul_lo_u32 v28, v5, v10
2431; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
2432; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v14, 0
2433; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
2434; GFX12-NEXT:    v_mul_lo_u32 v30, v17, v14
2435; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
2436; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2437; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
2438; GFX12-NEXT:    v_cndmask_b32_e64 v20, 0, 1, s0
2439; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2440; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
2441; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
2442; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
2443; GFX12-NEXT:    v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo
2444; GFX12-NEXT:    v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
2445; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
2446; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
2447; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
2448; GFX12-NEXT:    v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo
2449; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2450; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
2451; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
2452; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2453; GFX12-NEXT:    v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo
2454; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
2455; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2456; GFX12-NEXT:    v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
2457; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
2458; GFX12-NEXT:    v_cndmask_b32_e64 v25, 0, 1, s0
2459; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2460; GFX12-NEXT:    v_mov_b32_e32 v20, v22
2461; GFX12-NEXT:    v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
2462; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
2463; GFX12-NEXT:    v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo
2464; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
2465; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
2466; GFX12-NEXT:    v_mov_b32_e32 v19, v22
2467; GFX12-NEXT:    v_mul_lo_u32 v22, v16, v15
2468; GFX12-NEXT:    v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1]
2469; GFX12-NEXT:    v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
2470; GFX12-NEXT:    v_mov_b32_e32 v20, v18
2471; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
2472; GFX12-NEXT:    v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20]
2473; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25]
2474; GFX12-NEXT:    v_mul_lo_u32 v20, v4, v11
2475; GFX12-NEXT:    v_mul_lo_u32 v25, v3, v12
2476; GFX12-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
2477; GFX12-NEXT:    v_mul_lo_u32 v24, v2, v13
2478; GFX12-NEXT:    v_mov_b32_e32 v13, v1
2479; GFX12-NEXT:    v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15]
2480; GFX12-NEXT:    v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19]
2481; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
2482; GFX12-NEXT:    v_mov_b32_e32 v14, v21
2483; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2484; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
2485; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s2, 0, v6, s2
2486; GFX12-NEXT:    v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19]
2487; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
2488; GFX12-NEXT:    v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14]
2489; GFX12-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
2490; GFX12-NEXT:    v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2]
2491; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
2492; GFX12-NEXT:    v_add_co_ci_u32_e64 v14, s4, 0, v6, s4
2493; GFX12-NEXT:    v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
2494; GFX12-NEXT:    v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
2495; GFX12-NEXT:    v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
2496; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2497; GFX12-NEXT:    v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
2498; GFX12-NEXT:    v_add_co_ci_u32_e64 v5, s5, v14, v5, s5
2499; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2500; GFX12-NEXT:    v_add_co_ci_u32_e64 v6, s5, v26, v6, s5
2501; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s5, v23, v22, s5
2502; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2503; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s4, v9, v30, s4
2504; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v24, s2
2505; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2506; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s2, v9, v25, s3
2507; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, s1, v9, v20, s1
2508; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2509; GFX12-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo
2510; GFX12-NEXT:    v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s0
2511; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2512; GFX12-NEXT:    v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
2513; GFX12-NEXT:    s_setpc_b64 s[30:31]
2514  %result = mul i256 %num, %den
2515  ret i256 %result
2516}
2517
; Test: 64-bit multiply of a zero-extended i32 by the constant 80 (0x50).
; The function is an amdgpu_ps shader whose pointer arguments are not marked
; inreg; in the checks below the load address is v[2:3] and the store address
; is v[0:1], so the whole computation is expected to stay on the vector ALU.
; Expected selection, per the checks: one v_mad_u64_u32 with a zero addend on
; GFX7-GFX11 (v_mad_co_u64_u32 on GFX12). GFX7-GFX9 first move 0x50 into a
; VGPR, while GFX10 and later pass 0x50 directly as an operand.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than hand-editing.
2518define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2519; GFX7-LABEL: s_mul_u64_zext_with_vregs:
2520; GFX7:       ; %bb.0:
2521; GFX7-NEXT:    s_mov_b32 s2, 0
2522; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2523; GFX7-NEXT:    s_mov_b64 s[0:1], 0
2524; GFX7-NEXT:    buffer_load_dword v2, v[2:3], s[0:3], 0 addr64
2525; GFX7-NEXT:    v_mov_b32_e32 v3, 0x50
2526; GFX7-NEXT:    s_waitcnt vmcnt(0)
2527; GFX7-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0
2528; GFX7-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
2529; GFX7-NEXT:    s_endpgm
2530;
2531; GFX8-LABEL: s_mul_u64_zext_with_vregs:
2532; GFX8:       ; %bb.0:
2533; GFX8-NEXT:    flat_load_dword v2, v[2:3]
2534; GFX8-NEXT:    v_mov_b32_e32 v3, 0x50
2535; GFX8-NEXT:    s_waitcnt vmcnt(0)
2536; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
2537; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2538; GFX8-NEXT:    s_endpgm
2539;
2540; GFX9-LABEL: s_mul_u64_zext_with_vregs:
2541; GFX9:       ; %bb.0:
2542; GFX9-NEXT:    global_load_dword v2, v[2:3], off
2543; GFX9-NEXT:    v_mov_b32_e32 v3, 0x50
2544; GFX9-NEXT:    s_waitcnt vmcnt(0)
2545; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0
2546; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
2547; GFX9-NEXT:    s_endpgm
2548;
2549; GFX10-LABEL: s_mul_u64_zext_with_vregs:
2550; GFX10:       ; %bb.0:
2551; GFX10-NEXT:    global_load_dword v2, v[2:3], off
2552; GFX10-NEXT:    s_waitcnt vmcnt(0)
2553; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0
2554; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
2555; GFX10-NEXT:    s_endpgm
2556;
2557; GFX11-LABEL: s_mul_u64_zext_with_vregs:
2558; GFX11:       ; %bb.0:
2559; GFX11-NEXT:    global_load_b32 v2, v[2:3], off
2560; GFX11-NEXT:    s_waitcnt vmcnt(0)
2561; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v2, 0
2562; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
2563; GFX11-NEXT:    s_endpgm
2564;
2565; GFX12-LABEL: s_mul_u64_zext_with_vregs:
2566; GFX12:       ; %bb.0:
2567; GFX12-NEXT:    global_load_b32 v2, v[2:3], off
2568; GFX12-NEXT:    s_wait_loadcnt 0x0
2569; GFX12-NEXT:    v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0
2570; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
2571; GFX12-NEXT:    s_endpgm
2572  %val = load i32, ptr addrspace(1) %in, align 4
2573  %ext = zext i32 %val to i64
2574  %mul = mul i64 %ext, 80
2575  store i64 %mul, ptr addrspace(1) %out, align 8
2576  ret void
2577}
2578
; Test: 64-bit multiply of a zero-extended i32 by the constant 80 (0x50) in an
; amdgpu_kernel. In the checks below both pointers are fetched with a scalar
; load (s_load_dwordx4 / s_load_b128) and the i32 operand with a second scalar
; load, so the multiply operates on a value held in an SGPR.
; Expected selection, per the checks:
;   GFX7/GFX8   - low half via s_mul_i32 / s_mulk_i32; high half via
;                 v_mul_hi_u32 followed by v_readfirstlane_b32.
;   GFX9-GFX11  - s_mul_i32 for the low half and s_mul_hi_u32 for the high half.
;   GFX12       - the high word is cleared with s_mov_b32 s3, 0 and a single
;                 s_mul_u64 produces the result.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than hand-editing.
2579define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2580; GFX7-LABEL: s_mul_u64_zext_with_sregs:
2581; GFX7:       ; %bb.0:
2582; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2583; GFX7-NEXT:    v_mov_b32_e32 v0, 0x50
2584; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2585; GFX7-NEXT:    s_load_dword s3, s[2:3], 0x0
2586; GFX7-NEXT:    s_mov_b32 s2, -1
2587; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2588; GFX7-NEXT:    v_mul_hi_u32 v0, s3, v0
2589; GFX7-NEXT:    s_mul_i32 s4, s3, 0x50
2590; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2591; GFX7-NEXT:    v_readfirstlane_b32 s5, v0
2592; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2593; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2594; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2595; GFX7-NEXT:    s_endpgm
2596;
2597; GFX8-LABEL: s_mul_u64_zext_with_sregs:
2598; GFX8:       ; %bb.0:
2599; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2600; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
2601; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2602; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
2603; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2604; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2605; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2606; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
2607; GFX8-NEXT:    s_mulk_i32 s2, 0x50
2608; GFX8-NEXT:    v_readfirstlane_b32 s3, v0
2609; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2610; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2611; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2612; GFX8-NEXT:    s_endpgm
2613;
2614; GFX9-LABEL: s_mul_u64_zext_with_sregs:
2615; GFX9:       ; %bb.0:
2616; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2617; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2619; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
2620; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2621; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
2622; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
2623; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2624; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2625; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2626; GFX9-NEXT:    s_endpgm
2627;
2628; GFX10-LABEL: s_mul_u64_zext_with_sregs:
2629; GFX10:       ; %bb.0:
2630; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2631; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2632; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2633; GFX10-NEXT:    s_load_dword s3, s[2:3], 0x0
2634; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2635; GFX10-NEXT:    s_mul_i32 s2, s3, 0x50
2636; GFX10-NEXT:    s_mul_hi_u32 s3, s3, 0x50
2637; GFX10-NEXT:    v_mov_b32_e32 v0, s2
2638; GFX10-NEXT:    v_mov_b32_e32 v1, s3
2639; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2640; GFX10-NEXT:    s_endpgm
2641;
2642; GFX11-LABEL: s_mul_u64_zext_with_sregs:
2643; GFX11:       ; %bb.0:
2644; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2645; GFX11-NEXT:    v_mov_b32_e32 v2, 0
2646; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2647; GFX11-NEXT:    s_load_b32 s3, s[2:3], 0x0
2648; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2649; GFX11-NEXT:    s_mul_i32 s2, s3, 0x50
2650; GFX11-NEXT:    s_mul_hi_u32 s3, s3, 0x50
2651; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2652; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
2653; GFX11-NEXT:    s_endpgm
2654;
2655; GFX12-LABEL: s_mul_u64_zext_with_sregs:
2656; GFX12:       ; %bb.0:
2657; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2658; GFX12-NEXT:    v_mov_b32_e32 v2, 0
2659; GFX12-NEXT:    s_wait_kmcnt 0x0
2660; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
2661; GFX12-NEXT:    s_mov_b32 s3, 0
2662; GFX12-NEXT:    s_wait_kmcnt 0x0
2663; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
2664; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2665; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2666; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
2667; GFX12-NEXT:    s_endpgm
2668  %val = load i32, ptr addrspace(1) %in, align 4
2669  %ext = zext i32 %val to i64
2670  %mul = mul i64 %ext, 80
2671  store i64 %mul, ptr addrspace(1) %out, align 8
2672  ret void
2673}
2674
; Test: 64-bit multiply of a sign-extended i32 by the constant 80 (0x50).
; The function is an amdgpu_ps shader whose pointer arguments are not marked
; inreg; in the checks below the load address is v[2:3] and the store address
; is v[0:1], so the whole computation is expected to stay on the vector ALU.
; Expected selection, per the checks:
;   GFX7-GFX11 - v_mad_u64_u32 of the loaded word by 0x50 with a zero addend,
;                v_ashrrev_i32 by 31 to form the sign word, then a second
;                v_mad_u64_u32 of the sign word by 0x50 that accumulates into
;                the high result register (GFX11 additionally copies that
;                value into v3 with a v_mov_b32).
;   GFX12      - a single signed v_mad_co_i64_i32 with a zero addend.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than hand-editing.
2675define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2676; GFX7-LABEL: s_mul_u64_sext_with_vregs:
2677; GFX7:       ; %bb.0:
2678; GFX7-NEXT:    s_mov_b32 s2, 0
2679; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2680; GFX7-NEXT:    s_mov_b64 s[0:1], 0
2681; GFX7-NEXT:    buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
2682; GFX7-NEXT:    v_mov_b32_e32 v5, 0x50
2683; GFX7-NEXT:    s_waitcnt vmcnt(0)
2684; GFX7-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
2685; GFX7-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
2686; GFX7-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4]
2687; GFX7-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
2688; GFX7-NEXT:    s_endpgm
2689;
2690; GFX8-LABEL: s_mul_u64_sext_with_vregs:
2691; GFX8:       ; %bb.0:
2692; GFX8-NEXT:    flat_load_dword v4, v[2:3]
2693; GFX8-NEXT:    v_mov_b32_e32 v5, 0x50
2694; GFX8-NEXT:    s_waitcnt vmcnt(0)
2695; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
2696; GFX8-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
2697; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
2698; GFX8-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2699; GFX8-NEXT:    s_endpgm
2700;
2701; GFX9-LABEL: s_mul_u64_sext_with_vregs:
2702; GFX9:       ; %bb.0:
2703; GFX9-NEXT:    global_load_dword v4, v[2:3], off
2704; GFX9-NEXT:    v_mov_b32_e32 v5, 0x50
2705; GFX9-NEXT:    s_waitcnt vmcnt(0)
2706; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
2707; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
2708; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4]
2709; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
2710; GFX9-NEXT:    s_endpgm
2711;
2712; GFX10-LABEL: s_mul_u64_sext_with_vregs:
2713; GFX10:       ; %bb.0:
2714; GFX10-NEXT:    global_load_dword v4, v[2:3], off
2715; GFX10-NEXT:    s_waitcnt vmcnt(0)
2716; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
2717; GFX10-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
2718; GFX10-NEXT:    v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4]
2719; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
2720; GFX10-NEXT:    s_endpgm
2721;
2722; GFX11-LABEL: s_mul_u64_sext_with_vregs:
2723; GFX11:       ; %bb.0:
2724; GFX11-NEXT:    global_load_b32 v4, v[2:3], off
2725; GFX11-NEXT:    s_waitcnt vmcnt(0)
2726; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
2727; GFX11-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
2728; GFX11-NEXT:    v_mad_u64_u32 v[4:5], null, 0x50, v6, v[3:4]
2729; GFX11-NEXT:    v_mov_b32_e32 v3, v4
2730; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
2731; GFX11-NEXT:    s_endpgm
2732;
2733; GFX12-LABEL: s_mul_u64_sext_with_vregs:
2734; GFX12:       ; %bb.0:
2735; GFX12-NEXT:    global_load_b32 v2, v[2:3], off
2736; GFX12-NEXT:    s_wait_loadcnt 0x0
2737; GFX12-NEXT:    v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0
2738; GFX12-NEXT:    global_store_b64 v[0:1], v[2:3], off
2739; GFX12-NEXT:    s_endpgm
2740  %val = load i32, ptr addrspace(1) %in, align 4
2741  %ext = sext i32 %val to i64
2742  %mul = mul i64 %ext, 80
2743  store i64 %mul, ptr addrspace(1) %out, align 8
2744  ret void
2745}
2746
; Test: 64-bit multiply of a sign-extended i32 by the constant 80 (0x50) in an
; amdgpu_kernel. In the checks below both pointers are fetched with a scalar
; load (s_load_dwordx4 / s_load_b128) and the i32 operand with a second scalar
; load, so the multiply operates on a value held in an SGPR.
; Expected selection, per the checks:
;   GFX7/GFX8   - unsigned high half via v_mul_hi_u32 + v_readfirstlane_b32,
;                 sign word via s_ashr_i32 by 31, the low word and the sign
;                 word each multiplied by 0x50 (s_mul_i32 / s_mulk_i32), and
;                 the two high-half contributions combined with s_add_u32.
;   GFX9        - same shape, but the high half comes from s_mul_hi_u32;
;                 combined with s_add_u32.
;   GFX10/GFX11 - as GFX9, but the contributions are combined with s_add_i32.
;   GFX12       - s_ashr_i32 by 31 builds the high input word, then a single
;                 s_mul_u64 produces the result.
; The check lines are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE on the first line of the file); regenerate them rather than hand-editing.
2747define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
2748; GFX7-LABEL: s_mul_u64_sext_with_sregs:
2749; GFX7:       ; %bb.0:
2750; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2751; GFX7-NEXT:    v_mov_b32_e32 v0, 0x50
2752; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2753; GFX7-NEXT:    s_load_dword s3, s[2:3], 0x0
2754; GFX7-NEXT:    s_mov_b32 s2, -1
2755; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2756; GFX7-NEXT:    v_mul_hi_u32 v0, s3, v0
2757; GFX7-NEXT:    s_ashr_i32 s5, s3, 31
2758; GFX7-NEXT:    s_mul_i32 s4, s3, 0x50
2759; GFX7-NEXT:    s_mulk_i32 s5, 0x50
2760; GFX7-NEXT:    v_readfirstlane_b32 s3, v0
2761; GFX7-NEXT:    s_add_u32 s5, s5, s3
2762; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2763; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2764; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2765; GFX7-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2766; GFX7-NEXT:    s_endpgm
2767;
2768; GFX8-LABEL: s_mul_u64_sext_with_sregs:
2769; GFX8:       ; %bb.0:
2770; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2771; GFX8-NEXT:    v_mov_b32_e32 v0, 0x50
2772; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2773; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
2774; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2775; GFX8-NEXT:    v_mov_b32_e32 v2, s0
2776; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2777; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
2778; GFX8-NEXT:    s_ashr_i32 s3, s2, 31
2779; GFX8-NEXT:    s_mulk_i32 s2, 0x50
2780; GFX8-NEXT:    s_mulk_i32 s3, 0x50
2781; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
2782; GFX8-NEXT:    s_add_u32 s3, s3, s4
2783; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2784; GFX8-NEXT:    v_mov_b32_e32 v1, s3
2785; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2786; GFX8-NEXT:    s_endpgm
2787;
2788; GFX9-LABEL: s_mul_u64_sext_with_sregs:
2789; GFX9:       ; %bb.0:
2790; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2791; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2792; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2793; GFX9-NEXT:    s_load_dword s3, s[2:3], 0x0
2794; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2795; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
2796; GFX9-NEXT:    s_mul_i32 s2, s3, 0x50
2797; GFX9-NEXT:    s_mul_hi_u32 s3, s3, 0x50
2798; GFX9-NEXT:    s_mulk_i32 s4, 0x50
2799; GFX9-NEXT:    s_add_u32 s3, s4, s3
2800; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2801; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2802; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2803; GFX9-NEXT:    s_endpgm
2804;
2805; GFX10-LABEL: s_mul_u64_sext_with_sregs:
2806; GFX10:       ; %bb.0:
2807; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2808; GFX10-NEXT:    v_mov_b32_e32 v2, 0
2809; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2810; GFX10-NEXT:    s_load_dword s2, s[2:3], 0x0
2811; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2812; GFX10-NEXT:    s_ashr_i32 s3, s2, 31
2813; GFX10-NEXT:    s_mul_hi_u32 s4, s2, 0x50
2814; GFX10-NEXT:    s_mulk_i32 s3, 0x50
2815; GFX10-NEXT:    s_mulk_i32 s2, 0x50
2816; GFX10-NEXT:    s_add_i32 s3, s4, s3
2817; GFX10-NEXT:    v_mov_b32_e32 v0, s2
2818; GFX10-NEXT:    v_mov_b32_e32 v1, s3
2819; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2820; GFX10-NEXT:    s_endpgm
2821;
2822; GFX11-LABEL: s_mul_u64_sext_with_sregs:
2823; GFX11:       ; %bb.0:
2824; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2825; GFX11-NEXT:    v_mov_b32_e32 v2, 0
2826; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2827; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
2828; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2829; GFX11-NEXT:    s_ashr_i32 s3, s2, 31
2830; GFX11-NEXT:    s_mul_hi_u32 s4, s2, 0x50
2831; GFX11-NEXT:    s_mulk_i32 s3, 0x50
2832; GFX11-NEXT:    s_mulk_i32 s2, 0x50
2833; GFX11-NEXT:    s_add_i32 s3, s4, s3
2834; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2835; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
2836; GFX11-NEXT:    s_endpgm
2837;
2838; GFX12-LABEL: s_mul_u64_sext_with_sregs:
2839; GFX12:       ; %bb.0:
2840; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
2841; GFX12-NEXT:    v_mov_b32_e32 v2, 0
2842; GFX12-NEXT:    s_wait_kmcnt 0x0
2843; GFX12-NEXT:    s_load_b32 s2, s[2:3], 0x0
2844; GFX12-NEXT:    s_wait_kmcnt 0x0
2845; GFX12-NEXT:    s_ashr_i32 s3, s2, 31
2846; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2847; GFX12-NEXT:    s_mul_u64 s[2:3], s[2:3], 0x50
2848; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
2849; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
2850; GFX12-NEXT:    s_endpgm
2851  %val = load i32, ptr addrspace(1) %in, align 4
2852  %ext = sext i32 %val to i64
2853  %mul = mul i64 %ext, 80
2854  store i64 %mul, ptr addrspace(1) %out, align 8
2855  ret void
2856}
2857