xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll (revision 9e9907f1cfa424366fba58d9520f9305b537cec9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
3
; div_sqrt: computes 1.0 / sqrt(%arg1). Only the fdiv carries the `afn`
; fast-math flag; the llvm.sqrt.f32 call has no fast-math flags.
; The expected code reads the input from s0, expands the square root inline
; around v_sqrt_f32, and then applies a separate v_rcp_f32 for the division.
; No v_rsq_f32 appears in the expected output.
4define amdgpu_cs float @div_sqrt(float inreg %arg1) {
5; GCN-LABEL: div_sqrt:
6; GCN:       ; %bb.0: ; %.entry
7; GCN-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
8; GCN-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0
9; GCN-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
10; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
11; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
12; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
13; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
14; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
15; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
16; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
17; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
18; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
19; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
20; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
21; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
22; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
23; GCN-NEXT:    v_rcp_f32_e32 v0, v0
24; GCN-NEXT:    ; return to shader part epilog
25.entry:
26  %a = call float @llvm.sqrt.f32(float %arg1)
27  %b = fdiv afn float 1.000000e+00, %a
28  ret float %b
29}
30
; sqrt_div: computes sqrt(1.0 / %arg1), the reverse operation order of
; reciprocal-of-sqrt. Only the fdiv carries the `afn` fast-math flag.
; The expected code first takes v_rcp_f32 of the input in s0 and then expands
; the square root of that reciprocal inline around v_sqrt_f32.
; No v_rsq_f32 appears in the expected output.
31define amdgpu_cs float @sqrt_div(float inreg %arg1) {
32; GCN-LABEL: sqrt_div:
33; GCN:       ; %bb.0: ; %.entry
34; GCN-NEXT:    v_rcp_f32_e32 v0, s0
35; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
36; GCN-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
37; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
38; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
39; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
40; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
41; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
42; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
43; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
44; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
45; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
46; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
47; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
48; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
49; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
50; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
51; GCN-NEXT:    ; return to shader part epilog
52.entry:
53  %a = fdiv afn float 1.000000e+00, %arg1
54  %b = call float @llvm.sqrt.f32(float %a)
55  ret float %b
56}
57
; rcp_sqrt: applies the llvm.amdgcn.rcp.f32 intrinsic to the result of
; llvm.sqrt.f32(%arg1). Neither call carries fast-math flags.
; The expected code expands the square root inline around v_sqrt_f32 and then
; emits a separate v_rcp_f32 on the result.
; No v_rsq_f32 appears in the expected output.
58define amdgpu_cs float @rcp_sqrt(float inreg %arg1) {
59; GCN-LABEL: rcp_sqrt:
60; GCN:       ; %bb.0: ; %.entry
61; GCN-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
62; GCN-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0
63; GCN-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
64; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
65; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
66; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
67; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
68; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
69; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
70; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
71; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
72; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
73; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
74; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
75; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
76; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
77; GCN-NEXT:    v_rcp_f32_e32 v0, v0
78; GCN-NEXT:    ; return to shader part epilog
79.entry:
80  %a = call float @llvm.sqrt.f32(float %arg1)
81  %b = call float @llvm.amdgcn.rcp.f32(float %a)
82  ret float %b
83}
84
; sqrt_rcp: applies llvm.sqrt.f32 to the result of the llvm.amdgcn.rcp.f32
; intrinsic on %arg1. Neither call carries fast-math flags.
; The expected code first takes v_rcp_f32 of the input in s0 and then expands
; the square root of that reciprocal inline around v_sqrt_f32.
; No v_rsq_f32 appears in the expected output.
85define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
86; GCN-LABEL: sqrt_rcp:
87; GCN:       ; %bb.0: ; %.entry
88; GCN-NEXT:    v_rcp_f32_e32 v0, s0
89; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
90; GCN-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
91; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
92; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
93; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
94; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
95; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
96; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
97; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
98; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
99; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
100; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
101; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
102; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
103; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
104; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
105; GCN-NEXT:    ; return to shader part epilog
106.entry:
107  %a = call float @llvm.amdgcn.rcp.f32(float %arg1)
108  %b = call float @llvm.sqrt.f32(float %a)
109  ret float %b
110}
111
; div_sqrt_contract: computes 1.0 / sqrt(%arg1) with the `contract` flag on
; both the llvm.sqrt.f32 call and the fdiv; the fdiv additionally has `afn`.
; The expected instruction sequence is the same as for @div_sqrt in this
; file: an inline square-root expansion around v_sqrt_f32 followed by a
; separate v_rcp_f32. No v_rsq_f32 appears in the expected output.
112define amdgpu_cs float @div_sqrt_contract(float inreg %arg1) {
113; GCN-LABEL: div_sqrt_contract:
114; GCN:       ; %bb.0: ; %.entry
115; GCN-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
116; GCN-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0
117; GCN-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
118; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
119; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
120; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
121; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
122; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
123; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
124; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
125; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
126; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
127; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
128; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
129; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
130; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
131; GCN-NEXT:    v_rcp_f32_e32 v0, v0
132; GCN-NEXT:    ; return to shader part epilog
133.entry:
134  %a = call contract float @llvm.sqrt.f32(float %arg1)
135  %b = fdiv afn contract float 1.000000e+00, %a
136  ret float %b
137}
138
; sqrt_div_contract: computes sqrt(1.0 / %arg1) with the `contract` flag on
; both the fdiv and the llvm.sqrt.f32 call; the fdiv additionally has `afn`.
; The expected instruction sequence is the same as for @sqrt_div in this
; file: v_rcp_f32 of the input followed by an inline square-root expansion
; around v_sqrt_f32. No v_rsq_f32 appears in the expected output.
139define amdgpu_cs float @sqrt_div_contract(float inreg %arg1) {
140; GCN-LABEL: sqrt_div_contract:
141; GCN:       ; %bb.0: ; %.entry
142; GCN-NEXT:    v_rcp_f32_e32 v0, s0
143; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
144; GCN-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
145; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
146; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
147; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
148; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
149; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
150; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
151; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
152; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
153; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
154; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
155; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
156; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
157; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
158; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
159; GCN-NEXT:    ; return to shader part epilog
160.entry:
161  %a = fdiv afn contract float 1.000000e+00, %arg1
162  %b = call contract float @llvm.sqrt.f32(float %a)
163  ret float %b
164}
165
; rcp_sqrt_contract: applies llvm.amdgcn.rcp.f32 to llvm.sqrt.f32(%arg1),
; with the `contract` flag on both calls.
; The expected instruction sequence is the same as for @rcp_sqrt in this
; file: an inline square-root expansion around v_sqrt_f32 followed by a
; separate v_rcp_f32. No v_rsq_f32 appears in the expected output.
166define amdgpu_cs float @rcp_sqrt_contract(float inreg %arg1) {
167; GCN-LABEL: rcp_sqrt_contract:
168; GCN:       ; %bb.0: ; %.entry
169; GCN-NEXT:    v_mul_f32_e64 v0, 0x4f800000, s0
170; GCN-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0xf800000, s0
171; GCN-NEXT:    v_cndmask_b32_e32 v0, s0, v0, vcc_lo
172; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
173; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
174; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
175; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
176; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
177; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
178; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
179; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
180; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
181; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
182; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
183; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
184; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
185; GCN-NEXT:    v_rcp_f32_e32 v0, v0
186; GCN-NEXT:    ; return to shader part epilog
187.entry:
188  %a = call contract float @llvm.sqrt.f32(float %arg1)
189  %b = call contract float @llvm.amdgcn.rcp.f32(float %a)
190  ret float %b
191}
192
; sqrt_rcp_contract: applies llvm.sqrt.f32 to llvm.amdgcn.rcp.f32(%arg1),
; with the `contract` flag on both calls.
; The expected instruction sequence is the same as for @sqrt_rcp in this
; file: v_rcp_f32 of the input followed by an inline square-root expansion
; around v_sqrt_f32. No v_rsq_f32 appears in the expected output.
193define amdgpu_cs float @sqrt_rcp_contract(float inreg %arg1) {
194; GCN-LABEL: sqrt_rcp_contract:
195; GCN:       ; %bb.0: ; %.entry
196; GCN-NEXT:    v_rcp_f32_e32 v0, s0
197; GCN-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
198; GCN-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0
199; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
200; GCN-NEXT:    v_sqrt_f32_e32 v1, v0
201; GCN-NEXT:    v_add_nc_u32_e32 v2, -1, v1
202; GCN-NEXT:    v_add_nc_u32_e32 v3, 1, v1
203; GCN-NEXT:    v_fma_f32 v4, -v2, v1, v0
204; GCN-NEXT:    v_fma_f32 v5, -v3, v1, v0
205; GCN-NEXT:    v_cmp_ge_f32_e64 s0, 0, v4
206; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s0
207; GCN-NEXT:    v_cmp_lt_f32_e64 s0, 0, v5
208; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s0
209; GCN-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
210; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
211; GCN-NEXT:    v_cmp_class_f32_e64 vcc_lo, v0, 0x260
212; GCN-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
213; GCN-NEXT:    ; return to shader part epilog
214.entry:
215  %a = call contract float @llvm.amdgcn.rcp.f32(float %arg1)
216  %b = call contract float @llvm.sqrt.f32(float %a)
217  ret float %b
218}
219
; Intrinsics called by the test functions in this file: the f32 square root
; and the AMDGPU-specific f32 reciprocal (emitted as v_rcp_f32 in the checks).
220declare float @llvm.sqrt.f32(float)
221declare float @llvm.amdgcn.rcp.f32(float)
222