xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll (revision b356aa3e2da7d1792412783e2c6247538ead75e8)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
4; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
5
; Test: llvm.amdgcn.rsq.clamp.f32 applied directly to the float argument %src
; (v0 in the checked output).
; Expected output per check prefix:
;   SI    - a single v_rsq_clamp_f32 instruction.
;   VI    - v_rsq_f32 followed by v_min_f32 / v_max_f32 against the literals
;           0x7f7fffff and 0xff7fffff.
;   GFX12 - v_rsq_f32 followed by one v_minmax_num_f32 using the same two
;           literals (0xff7fffff is first materialized into v1).
6define float @v_rsq_clamp_f32(float %src) #0 {
7; SI-LABEL: v_rsq_clamp_f32:
8; SI:       ; %bb.0:
9; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; SI-NEXT:    v_rsq_clamp_f32_e32 v0, v0
11; SI-NEXT:    s_setpc_b64 s[30:31]
12;
13; VI-LABEL: v_rsq_clamp_f32:
14; VI:       ; %bb.0:
15; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16; VI-NEXT:    v_rsq_f32_e32 v0, v0
17; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
18; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
19; VI-NEXT:    s_setpc_b64 s[30:31]
20;
21; GFX12-LABEL: v_rsq_clamp_f32:
22; GFX12:       ; %bb.0:
23; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
24; GFX12-NEXT:    s_wait_expcnt 0x0
25; GFX12-NEXT:    s_wait_samplecnt 0x0
26; GFX12-NEXT:    s_wait_bvhcnt 0x0
27; GFX12-NEXT:    s_wait_kmcnt 0x0
28; GFX12-NEXT:    v_rsq_f32_e32 v0, v0
29; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
30; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
31; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
32; GFX12-NEXT:    s_setpc_b64 s[30:31]
33  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
34  ret float %rsq_clamp
35}
36
; Test: llvm.amdgcn.rsq.clamp.f32 fed by llvm.fabs.f32 of %src.
; In every check prefix the fabs is expected to show up as the |v0| source
; modifier on the rsq instruction (e64 encoding); no separate instruction for
; the fabs appears in the checked sequences. The clamp part of the expected
; output uses the same min/max (VI) or minmax (GFX12) literals 0x7f7fffff and
; 0xff7fffff.
37define float @v_rsq_clamp_fabs_f32(float %src) #0 {
38; SI-LABEL: v_rsq_clamp_fabs_f32:
39; SI:       ; %bb.0:
40; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41; SI-NEXT:    v_rsq_clamp_f32_e64 v0, |v0|
42; SI-NEXT:    s_setpc_b64 s[30:31]
43;
44; VI-LABEL: v_rsq_clamp_fabs_f32:
45; VI:       ; %bb.0:
46; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
47; VI-NEXT:    v_rsq_f32_e64 v0, |v0|
48; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
49; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
50; VI-NEXT:    s_setpc_b64 s[30:31]
51;
52; GFX12-LABEL: v_rsq_clamp_fabs_f32:
53; GFX12:       ; %bb.0:
54; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
55; GFX12-NEXT:    s_wait_expcnt 0x0
56; GFX12-NEXT:    s_wait_samplecnt 0x0
57; GFX12-NEXT:    s_wait_bvhcnt 0x0
58; GFX12-NEXT:    s_wait_kmcnt 0x0
59; GFX12-NEXT:    v_rsq_f32_e64 v0, |v0|
60; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
61; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
62; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
63; GFX12-NEXT:    s_setpc_b64 s[30:31]
64  %fabs.src = call float @llvm.fabs.f32(float %src)
65  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
66  ret float %rsq_clamp
67}
68
; Test: llvm.amdgcn.rsq.clamp.f64 applied directly to the double argument %src
; (v[0:1] in the checked output).
; Expected output per check prefix:
;   SI    - a single v_rsq_clamp_f64 instruction.
;   VI    - v_rsq_f64, then v_min_f64 and v_max_f64 against 64-bit constants
;           that are built in v[2:3] with two v_mov_b32 each
;           (low dword -1, high dword 0x7fefffff for the min and 0xffefffff
;           for the max).
;   GFX12 - same shape as VI but with v_min_num_f64 / v_max_num_f64 and
;           s_delay_alu instructions in between.
69define double @v_rsq_clamp_f64(double %src) #0 {
70; SI-LABEL: v_rsq_clamp_f64:
71; SI:       ; %bb.0:
72; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
73; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], v[0:1]
74; SI-NEXT:    s_setpc_b64 s[30:31]
75;
76; VI-LABEL: v_rsq_clamp_f64:
77; VI:       ; %bb.0:
78; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
80; VI-NEXT:    v_mov_b32_e32 v2, -1
81; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
82; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
83; VI-NEXT:    v_mov_b32_e32 v2, -1
84; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
85; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
86; VI-NEXT:    s_setpc_b64 s[30:31]
87;
88; GFX12-LABEL: v_rsq_clamp_f64:
89; GFX12:       ; %bb.0:
90; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
91; GFX12-NEXT:    s_wait_expcnt 0x0
92; GFX12-NEXT:    s_wait_samplecnt 0x0
93; GFX12-NEXT:    s_wait_bvhcnt 0x0
94; GFX12-NEXT:    s_wait_kmcnt 0x0
95; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
96; GFX12-NEXT:    v_mov_b32_e32 v2, -1
97; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
98; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
99; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
100; GFX12-NEXT:    v_mov_b32_e32 v2, -1
101; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
102; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
103; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
104; GFX12-NEXT:    s_setpc_b64 s[30:31]
105  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
106  ret double %rsq_clamp
107}
108
; Test: llvm.amdgcn.rsq.clamp.f64 fed by llvm.fabs.f64 of %src.
; In every check prefix the fabs is expected to show up as the |v[0:1]| source
; modifier on the rsq instruction (e64 encoding); no separate instruction for
; the fabs appears in the checked sequences. On VI and GFX12 the rsq is
; followed by a min and a max against 64-bit constants built in v[2:3]
; (low dword -1, high dword 0x7fefffff / 0xffefffff).
109define double @v_rsq_clamp_fabs_f64(double %src) #0 {
110; SI-LABEL: v_rsq_clamp_fabs_f64:
111; SI:       ; %bb.0:
112; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
113; SI-NEXT:    v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
114; SI-NEXT:    s_setpc_b64 s[30:31]
115;
116; VI-LABEL: v_rsq_clamp_fabs_f64:
117; VI:       ; %bb.0:
118; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
119; VI-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
120; VI-NEXT:    v_mov_b32_e32 v2, -1
121; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
122; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
123; VI-NEXT:    v_mov_b32_e32 v2, -1
124; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
125; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
126; VI-NEXT:    s_setpc_b64 s[30:31]
127;
128; GFX12-LABEL: v_rsq_clamp_fabs_f64:
129; GFX12:       ; %bb.0:
130; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
131; GFX12-NEXT:    s_wait_expcnt 0x0
132; GFX12-NEXT:    s_wait_samplecnt 0x0
133; GFX12-NEXT:    s_wait_bvhcnt 0x0
134; GFX12-NEXT:    s_wait_kmcnt 0x0
135; GFX12-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
136; GFX12-NEXT:    v_mov_b32_e32 v2, -1
137; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
138; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
139; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
140; GFX12-NEXT:    v_mov_b32_e32 v2, -1
141; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
142; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
143; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
144; GFX12-NEXT:    s_setpc_b64 s[30:31]
145  %fabs.src = call double @llvm.fabs.f64(double %src)
146  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
147  ret double %rsq_clamp
148}
149
; Test: llvm.amdgcn.rsq.clamp.f32 with an undef operand (the function takes no
; arguments).
; In the checked output the rsq source is a scalar register: s4 on SI and VI,
; s0 on GFX12. On GFX12 the rsq itself is expected as v_s_rsq_f32 s0, s0, and
; the following v_minmax_num_f32 reads s0 and writes the result to v0.
; NOTE(review): newer LLVM tests tend to use `poison` rather than `undef` for
; placeholder operands - confirm before migrating; the checks would need to be
; regenerated with update_llc_test_checks.py.
150define float @v_rsq_clamp_undef_f32() #0 {
151; SI-LABEL: v_rsq_clamp_undef_f32:
152; SI:       ; %bb.0:
153; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154; SI-NEXT:    v_rsq_clamp_f32_e32 v0, s4
155; SI-NEXT:    s_setpc_b64 s[30:31]
156;
157; VI-LABEL: v_rsq_clamp_undef_f32:
158; VI:       ; %bb.0:
159; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
160; VI-NEXT:    v_rsq_f32_e32 v0, s4
161; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
162; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
163; VI-NEXT:    s_setpc_b64 s[30:31]
164;
165; GFX12-LABEL: v_rsq_clamp_undef_f32:
166; GFX12:       ; %bb.0:
167; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
168; GFX12-NEXT:    s_wait_expcnt 0x0
169; GFX12-NEXT:    s_wait_samplecnt 0x0
170; GFX12-NEXT:    s_wait_bvhcnt 0x0
171; GFX12-NEXT:    s_wait_kmcnt 0x0
172; GFX12-NEXT:    v_s_rsq_f32 s0, s0
173; GFX12-NEXT:    v_mov_b32_e32 v0, 0xff7fffff
174; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
175; GFX12-NEXT:    v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
176; GFX12-NEXT:    s_setpc_b64 s[30:31]
177  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
178  ret float %rsq_clamp
179}
180
; Test: llvm.amdgcn.rsq.clamp.f64 with an undef operand (the function takes no
; arguments).
; In the checked output the rsq source is a scalar register pair: s[4:5] on SI
; and VI, s[0:1] on GFX12. All three prefixes still expect a vector rsq
; instruction (v_rsq_clamp_f64 on SI, v_rsq_f64 on VI and GFX12) writing
; v[0:1], followed on VI/GFX12 by the min/max against constants in v[2:3].
; NOTE(review): newer LLVM tests tend to use `poison` rather than `undef` for
; placeholder operands - confirm before migrating; the checks would need to be
; regenerated with update_llc_test_checks.py.
181define double @v_rsq_clamp_undef_f64() #0 {
182; SI-LABEL: v_rsq_clamp_undef_f64:
183; SI:       ; %bb.0:
184; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
185; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], s[4:5]
186; SI-NEXT:    s_setpc_b64 s[30:31]
187;
188; VI-LABEL: v_rsq_clamp_undef_f64:
189; VI:       ; %bb.0:
190; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
191; VI-NEXT:    v_rsq_f64_e32 v[0:1], s[4:5]
192; VI-NEXT:    v_mov_b32_e32 v2, -1
193; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
194; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
195; VI-NEXT:    v_mov_b32_e32 v2, -1
196; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
197; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
198; VI-NEXT:    s_setpc_b64 s[30:31]
199;
200; GFX12-LABEL: v_rsq_clamp_undef_f64:
201; GFX12:       ; %bb.0:
202; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
203; GFX12-NEXT:    s_wait_expcnt 0x0
204; GFX12-NEXT:    s_wait_samplecnt 0x0
205; GFX12-NEXT:    s_wait_bvhcnt 0x0
206; GFX12-NEXT:    s_wait_kmcnt 0x0
207; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], s[0:1]
208; GFX12-NEXT:    v_mov_b32_e32 v2, -1
209; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
210; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
211; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
212; GFX12-NEXT:    v_mov_b32_e32 v2, -1
213; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
214; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
215; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
216; GFX12-NEXT:    s_setpc_b64 s[30:31]
217  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
218  ret double %rsq_clamp
219}
220
; Test: llvm.amdgcn.rsq.clamp.f32 on %src in a function that uses attribute
; group #2 instead of #0 (the *_non_ieee variant).
; The checked instruction sequences for SI, VI and GFX12 are the same as those
; of v_rsq_clamp_f32: v_rsq_clamp_f32 on SI, v_rsq_f32 + v_min_f32/v_max_f32 on
; VI, and v_rsq_f32 + v_minmax_num_f32 on GFX12.
221define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
222; SI-LABEL: v_rsq_clamp_f32_non_ieee:
223; SI:       ; %bb.0:
224; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225; SI-NEXT:    v_rsq_clamp_f32_e32 v0, v0
226; SI-NEXT:    s_setpc_b64 s[30:31]
227;
228; VI-LABEL: v_rsq_clamp_f32_non_ieee:
229; VI:       ; %bb.0:
230; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
231; VI-NEXT:    v_rsq_f32_e32 v0, v0
232; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
233; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
234; VI-NEXT:    s_setpc_b64 s[30:31]
235;
236; GFX12-LABEL: v_rsq_clamp_f32_non_ieee:
237; GFX12:       ; %bb.0:
238; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
239; GFX12-NEXT:    s_wait_expcnt 0x0
240; GFX12-NEXT:    s_wait_samplecnt 0x0
241; GFX12-NEXT:    s_wait_bvhcnt 0x0
242; GFX12-NEXT:    s_wait_kmcnt 0x0
243; GFX12-NEXT:    v_rsq_f32_e32 v0, v0
244; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
245; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
246; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
247; GFX12-NEXT:    s_setpc_b64 s[30:31]
248  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
249  ret float %rsq_clamp
250}
251
; Test: llvm.amdgcn.rsq.clamp.f64 on %src in a function that uses attribute
; group #2 instead of #0 (the *_non_ieee variant).
; The checked instruction sequences for SI, VI and GFX12 are the same as those
; of v_rsq_clamp_f64: v_rsq_clamp_f64 on SI, and v_rsq_f64 followed by a min
; and a max against 64-bit constants built in v[2:3] on VI and GFX12.
252define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
253; SI-LABEL: v_rsq_clamp_f64_non_ieee:
254; SI:       ; %bb.0:
255; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
256; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], v[0:1]
257; SI-NEXT:    s_setpc_b64 s[30:31]
258;
259; VI-LABEL: v_rsq_clamp_f64_non_ieee:
260; VI:       ; %bb.0:
261; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
263; VI-NEXT:    v_mov_b32_e32 v2, -1
264; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
265; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
266; VI-NEXT:    v_mov_b32_e32 v2, -1
267; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
268; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
269; VI-NEXT:    s_setpc_b64 s[30:31]
270;
271; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
272; GFX12:       ; %bb.0:
273; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
274; GFX12-NEXT:    s_wait_expcnt 0x0
275; GFX12-NEXT:    s_wait_samplecnt 0x0
276; GFX12-NEXT:    s_wait_bvhcnt 0x0
277; GFX12-NEXT:    s_wait_kmcnt 0x0
278; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
279; GFX12-NEXT:    v_mov_b32_e32 v2, -1
280; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
281; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
282; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
283; GFX12-NEXT:    v_mov_b32_e32 v2, -1
284; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
285; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
286; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
287; GFX12-NEXT:    s_setpc_b64 s[30:31]
288  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
289  ret double %rsq_clamp
290}
291
; Declarations of the intrinsics called by the test functions in this file:
; fabs and amdgcn.rsq.clamp, each in an f32 and an f64 flavor. All four use
; attribute group #1.
292declare float @llvm.fabs.f32(float) #1
293declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
294declare double @llvm.fabs.f64(double) #1
295declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
296
; Attribute groups:
;   #0 - test functions without the "amdgpu-ieee" attribute.
;   #1 - the intrinsic declarations.
;   #2 - the *_non_ieee test functions; same as #0 plus "amdgpu-ieee"="false".
297attributes #0 = { nounwind }
298attributes #1 = { nounwind readnone }
299attributes #2 = { nounwind "amdgpu-ieee"="false" }
300