xref: /llvm-project/llvm/test/CodeGen/AMDGPU/add3.ll (revision b434051dc83d77c8e8e349ab1992dcb0c795a7ea)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=VI %s
3; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefix=GFX9 %s
4; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
5; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs | FileCheck -check-prefix=GFX10 %s
6
7; ===================================================================================
8; V_ADD3_U32
9; ===================================================================================
10
; Baseline case: two chained i32 adds over three arguments that the checks show
; in v0-v2. The VI checks expect two v_add_u32_e32, while the GFX9 and GFX10
; checks expect the pair to be selected as a single v_add3_u32.
11define amdgpu_ps float @add3(i32 %a, i32 %b, i32 %c) {
12; VI-LABEL: add3:
13; VI:       ; %bb.0:
14; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
15; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
16; VI-NEXT:    ; return to shader part epilog
17;
18; GFX9-LABEL: add3:
19; GFX9:       ; %bb.0:
20; GFX9-NEXT:    v_add3_u32 v0, v0, v1, v2
21; GFX9-NEXT:    ; return to shader part epilog
22;
23; GFX10-LABEL: add3:
24; GFX10:       ; %bb.0:
25; GFX10-NEXT:    v_add3_u32 v0, v0, v1, v2
26; GFX10-NEXT:    ; return to shader part epilog
  ; (%a + %b) + %c, with the inner add as the first operand of the outer add.
27  %x = add i32 %a, %b
28  %result = add i32 %x, %c
  ; The i32 sum is returned through a float bitcast.
29  %bc = bitcast i32 %result to float
30  ret float %bc
31}
32
33; V_MAD_U32_U24 is given higher priority.
; Both products take operands whose upper 8 bits have been cleared (shl 8
; followed by lshr 8), and the three-way sum %mul2 + (%e + %mul1) is expected to
; be emitted as two chained v_mad_u32_u24 on every checked target instead of a
; v_add3_u32.
34define amdgpu_ps float @mad_no_add3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
35; VI-LABEL: mad_no_add3:
36; VI:       ; %bb.0:
37; VI-NEXT:    v_mad_u32_u24 v0, v0, v1, v4
38; VI-NEXT:    v_mad_u32_u24 v0, v2, v3, v0
39; VI-NEXT:    ; return to shader part epilog
40;
41; GFX9-LABEL: mad_no_add3:
42; GFX9:       ; %bb.0:
43; GFX9-NEXT:    v_mad_u32_u24 v0, v0, v1, v4
44; GFX9-NEXT:    v_mad_u32_u24 v0, v2, v3, v0
45; GFX9-NEXT:    ; return to shader part epilog
46;
47; GFX10-LABEL: mad_no_add3:
48; GFX10:       ; %bb.0:
49; GFX10-NEXT:    v_mad_u32_u24 v0, v0, v1, v4
50; GFX10-NEXT:    v_mad_u32_u24 v0, v2, v3, v0
51; GFX10-NEXT:    ; return to shader part epilog
  ; Zero the upper 8 bits of %a and %b, then multiply the two 24-bit values.
52  %a0 = shl i32 %a, 8
53  %a1 = lshr i32 %a0, 8
54  %b0 = shl i32 %b, 8
55  %b1 = lshr i32 %b0, 8
56  %mul1 = mul i32 %a1, %b1
57
  ; Same masking and multiply for %c and %d.
58  %c0 = shl i32 %c, 8
59  %c1 = lshr i32 %c0, 8
60  %d0 = shl i32 %d, 8
61  %d1 = lshr i32 %d0, 8
62  %mul2 = mul i32 %c1, %d1
63
  ; Three-operand sum of %e and the two products.
64  %add0 = add i32 %e, %mul1
65  %add1 = add i32 %mul2, %add0
66
67  %bc = bitcast i32 %add1 to float
68  ret float %bc
69}
70
71; ThreeOp instruction variant not used on GFX9 due to Constant Bus Limitations:
72; %a and %c are inreg (SGPR) inputs and only %b is in a VGPR. The VI and GFX9
; checks expect the adds to be reassociated so that the two SGPR inputs are
; summed with s_add_i32 and a single v_add_u32_e32 adds the VGPR input. The
; GFX10 checks expect one v_add3_u32 that takes both SGPRs directly.
73define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
74; VI-LABEL: add3_vgpr_b:
75; VI:       ; %bb.0:
76; VI-NEXT:    s_add_i32 s3, s3, s2
77; VI-NEXT:    v_add_u32_e32 v0, vcc, s3, v0
78; VI-NEXT:    ; return to shader part epilog
79;
80; GFX9-LABEL: add3_vgpr_b:
81; GFX9:       ; %bb.0:
82; GFX9-NEXT:    s_add_i32 s3, s3, s2
83; GFX9-NEXT:    v_add_u32_e32 v0, s3, v0
84; GFX9-NEXT:    ; return to shader part epilog
85;
86; GFX10-LABEL: add3_vgpr_b:
87; GFX10:       ; %bb.0:
88; GFX10-NEXT:    v_add3_u32 v0, s3, s2, v0
89; GFX10-NEXT:    ; return to shader part epilog
  ; As written the IR computes (%a + %b) + %c, i.e. the inner add mixes an
  ; SGPR input with the VGPR input.
90  %x = add i32 %a, %b
91  %result = add i32 %x, %c
92  %bc = bitcast i32 %result to float
93  ret float %bc
94}
95
; All three inputs are shown in VGPRs by the checks, and the inner add
; (%b + %c) is the second operand of the outer add. The GFX9 and GFX10 checks
; still expect a single v_add3_u32; the VI checks expect two v_add_u32_e32.
96define amdgpu_ps float @add3_vgpr_all2(i32 %a, i32 %b, i32 %c) {
97; VI-LABEL: add3_vgpr_all2:
98; VI:       ; %bb.0:
99; VI-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
100; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
101; VI-NEXT:    ; return to shader part epilog
102;
103; GFX9-LABEL: add3_vgpr_all2:
104; GFX9:       ; %bb.0:
105; GFX9-NEXT:    v_add3_u32 v0, v1, v2, v0
106; GFX9-NEXT:    ; return to shader part epilog
107;
108; GFX10-LABEL: add3_vgpr_all2:
109; GFX10:       ; %bb.0:
110; GFX10-NEXT:    v_add3_u32 v0, v1, v2, v0
111; GFX10-NEXT:    ; return to shader part epilog
  ; %a + (%b + %c): the inner add sits on the right-hand side of the outer add.
112  %x = add i32 %b, %c
113  %result = add i32 %a, %x
114  %bc = bitcast i32 %result to float
115  ret float %bc
116}
117
; Only %a is an inreg input (s2 in the checks); %b and %c are in v0 and v1.
; The GFX9 and GFX10 checks expect one v_add3_u32 with the single SGPR operand;
; the VI checks expect two v_add_u32_e32.
118define amdgpu_ps float @add3_vgpr_bc(i32 inreg %a, i32 %b, i32 %c) {
119; VI-LABEL: add3_vgpr_bc:
120; VI:       ; %bb.0:
121; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
122; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
123; VI-NEXT:    ; return to shader part epilog
124;
125; GFX9-LABEL: add3_vgpr_bc:
126; GFX9:       ; %bb.0:
127; GFX9-NEXT:    v_add3_u32 v0, s2, v0, v1
128; GFX9-NEXT:    ; return to shader part epilog
129;
130; GFX10-LABEL: add3_vgpr_bc:
131; GFX10:       ; %bb.0:
132; GFX10-NEXT:    v_add3_u32 v0, s2, v0, v1
133; GFX10-NEXT:    ; return to shader part epilog
  ; (%a + %b) + %c, returned through a float bitcast.
134  %x = add i32 %a, %b
135  %result = add i32 %x, %c
136  %bc = bitcast i32 %result to float
137  ret float %bc
138}
139
; Two VGPR inputs plus the constant 16. The GFX9 and GFX10 checks expect the
; constant to appear directly as the third operand of v_add3_u32; the VI checks
; expect two v_add_u32_e32 with 16 as an operand of the second one.
140define amdgpu_ps float @add3_vgpr_const(i32 %a, i32 %b) {
141; VI-LABEL: add3_vgpr_const:
142; VI:       ; %bb.0:
143; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
144; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
145; VI-NEXT:    ; return to shader part epilog
146;
147; GFX9-LABEL: add3_vgpr_const:
148; GFX9:       ; %bb.0:
149; GFX9-NEXT:    v_add3_u32 v0, v0, v1, 16
150; GFX9-NEXT:    ; return to shader part epilog
151;
152; GFX10-LABEL: add3_vgpr_const:
153; GFX10:       ; %bb.0:
154; GFX10-NEXT:    v_add3_u32 v0, v0, v1, 16
155; GFX10-NEXT:    ; return to shader part epilog
  ; (%a + %b) + 16, returned through a float bitcast.
156  %x = add i32 %a, %b
157  %result = add i32 %x, 16
158  %bc = bitcast i32 %result to float
159  ret float %bc
160}
161
; The outer add has two uses: it is returned in lane 0 and it also feeds the
; multiply returned in lane 1. The inner add has a single use. The GFX9 and
; GFX10 checks still expect one v_add3_u32 whose result is reused by
; v_mul_lo_u32; the VI checks expect two v_add_u32_e32 followed by the multiply.
162define amdgpu_ps <2 x float> @add3_multiuse_outer(i32 %a, i32 %b, i32 %c, i32 %x) {
163; VI-LABEL: add3_multiuse_outer:
164; VI:       ; %bb.0:
165; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
166; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
167; VI-NEXT:    v_mul_lo_u32 v1, v0, v3
168; VI-NEXT:    ; return to shader part epilog
169;
170; GFX9-LABEL: add3_multiuse_outer:
171; GFX9:       ; %bb.0:
172; GFX9-NEXT:    v_add3_u32 v0, v0, v1, v2
173; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v3
174; GFX9-NEXT:    ; return to shader part epilog
175;
176; GFX10-LABEL: add3_multiuse_outer:
177; GFX10:       ; %bb.0:
178; GFX10-NEXT:    v_add3_u32 v0, v0, v1, v2
179; GFX10-NEXT:    v_mul_lo_u32 v1, v0, v3
180; GFX10-NEXT:    ; return to shader part epilog
181  %inner = add i32 %a, %b
182  %outer = add i32 %inner, %c
183  %x1 = mul i32 %outer, %x
  ; Both lanes are written below, so the placeholder vector is never observed;
  ; poison is used instead of the deprecated undef.
184  %r1 = insertelement <2 x i32> poison, i32 %outer, i32 0
185  %r0 = insertelement <2 x i32> %r1, i32 %x1, i32 1
186  %bc = bitcast <2 x i32> %r0 to <2 x float>
187  ret <2 x float> %bc
188}
189
; The inner add has two uses: it is returned in lane 0 and it also feeds the
; outer add returned in lane 1. Every checked target is expected to emit two
; separate two-operand adds here; no v_add3_u32 appears in the checks.
190define amdgpu_ps <2 x float> @add3_multiuse_inner(i32 %a, i32 %b, i32 %c) {
191; VI-LABEL: add3_multiuse_inner:
192; VI:       ; %bb.0:
193; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
194; VI-NEXT:    v_add_u32_e32 v1, vcc, v0, v2
195; VI-NEXT:    ; return to shader part epilog
196;
197; GFX9-LABEL: add3_multiuse_inner:
198; GFX9:       ; %bb.0:
199; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
200; GFX9-NEXT:    v_add_u32_e32 v1, v0, v2
201; GFX9-NEXT:    ; return to shader part epilog
202;
203; GFX10-LABEL: add3_multiuse_inner:
204; GFX10:       ; %bb.0:
205; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
206; GFX10-NEXT:    v_add_nc_u32_e32 v1, v0, v2
207; GFX10-NEXT:    ; return to shader part epilog
208  %inner = add i32 %a, %b
209  %outer = add i32 %inner, %c
  ; Both lanes are written below, so the placeholder vector is never observed;
  ; poison is used instead of the deprecated undef.
210  %r1 = insertelement <2 x i32> poison, i32 %inner, i32 0
211  %r0 = insertelement <2 x i32> %r1, i32 %outer, i32 1
212  %bc = bitcast <2 x i32> %r0 to <2 x float>
213  ret <2 x float> %bc
214}
215
216; A case where uniform values end up in VGPRs -- we could use v_add3_u32 here,
217; but we don't.
; The three inreg float inputs (s2-s4 in the checks) each go through an fadd
; whose result the checks show in a VGPR, and the results are then summed as
; integers. All checked targets are expected to emit two separate two-operand
; integer adds for that sum.
218define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float inreg %c) {
219; VI-LABEL: add3_uniform_vgpr:
220; VI:       ; %bb.0:
221; VI-NEXT:    v_add_f32_e64 v0, s2, 1.0
222; VI-NEXT:    v_add_f32_e64 v1, s3, 2.0
223; VI-NEXT:    v_mov_b32_e32 v2, 0x40400000
224; VI-NEXT:    v_add_f32_e32 v2, s4, v2
225; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
226; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
227; VI-NEXT:    ; return to shader part epilog
228;
229; GFX9-LABEL: add3_uniform_vgpr:
230; GFX9:       ; %bb.0:
231; GFX9-NEXT:    v_add_f32_e64 v0, s2, 1.0
232; GFX9-NEXT:    v_add_f32_e64 v1, s3, 2.0
233; GFX9-NEXT:    v_mov_b32_e32 v2, 0x40400000
234; GFX9-NEXT:    v_add_f32_e32 v2, s4, v2
235; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
236; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
237; GFX9-NEXT:    ; return to shader part epilog
238;
239; GFX10-LABEL: add3_uniform_vgpr:
240; GFX10:       ; %bb.0:
241; GFX10-NEXT:    v_add_f32_e64 v0, s2, 1.0
242; GFX10-NEXT:    v_add_f32_e64 v1, s3, 2.0
243; GFX10-NEXT:    v_add_f32_e64 v2, 0x40400000, s4
244; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
245; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
246; GFX10-NEXT:    ; return to shader part epilog
  ; Floating-point adds on the uniform inputs; the checks place each result in
  ; a VGPR.
247  %a1 = fadd float %a, 1.0
248  %b2 = fadd float %b, 2.0
249  %c3 = fadd float %c, 3.0
  ; Reinterpret the float results as i32 so they can feed the integer adds.
250  %bc.a = bitcast float %a1 to i32
251  %bc.b = bitcast float %b2 to i32
252  %bc.c = bitcast float %c3 to i32
  ; Same (x + y) + z shape as the other add3 tests.
253  %x = add i32 %bc.a, %bc.b
254  %result = add i32 %x, %bc.c
255  %bc = bitcast i32 %result to float
256  ret float %bc
257}
258