; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

; GlobalISel code generation checks for the llvm.amdgcn.rsq.clamp intrinsic
; (f32 and f64) on tahiti, tonga and gfx1200. The assertion lines inside each
; function are generated; regenerate them with the script named on the first
; line of this file rather than editing them by hand.

; Plain f32 operand. The tahiti output is a single v_rsq_clamp_f32, while the
; tonga and gfx1200 outputs are v_rsq_f32 followed by a min/max against
; 0x7f7fffff and 0xff7fffff.
define float @v_rsq_clamp_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f32_e32 v0, v0
; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f32_e32 v0, v0
; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
  ret float %rsq_clamp
}

; f32 operand passed through llvm.fabs first. The expected output carries the
; |v0| source modifier on the rsq instruction itself (e64 encoding).
define float @v_rsq_clamp_fabs_f32(float %src) #0 {
; SI-LABEL: v_rsq_clamp_fabs_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f32_e64 v0, |v0|
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_fabs_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f32_e64 v0, |v0|
; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_fabs_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f32_e64 v0, |v0|
; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %fabs.src = call float @llvm.fabs.f32(float %src)
  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %fabs.src)
  ret float %rsq_clamp
}

; Plain f64 operand. On tonga and gfx1200 the 64-bit clamp bounds are
; materialized in v[2:3] with a pair of v_mov_b32 before each min/max.
define double @v_rsq_clamp_f64(double %src) #0 {
; SI-LABEL: v_rsq_clamp_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], v[0:1]
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
  ret double %rsq_clamp
}

; f64 operand passed through llvm.fabs first. The expected output carries the
; |v[0:1]| source modifier on the rsq instruction (e64 encoding).
define double @v_rsq_clamp_fabs_f64(double %src) #0 {
; SI-LABEL: v_rsq_clamp_fabs_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f64_e64 v[0:1], |v[0:1]|
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_fabs_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_fabs_f64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f64_e64 v[0:1], |v[0:1]|
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %fabs.src = call double @llvm.fabs.f64(double %src)
  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %fabs.src)
  ret double %rsq_clamp
}

; undef f32 operand. The expected rsq reads a scalar register (s4 on tahiti
; and tonga); the gfx1200 output uses v_s_rsq_f32 on s0 instead.
define float @v_rsq_clamp_undef_f32() #0 {
; SI-LABEL: v_rsq_clamp_undef_f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f32_e32 v0, s4
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_undef_f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f32_e32 v0, s4
; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_undef_f32:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_s_rsq_f32 s0, s0
; GFX12-NEXT:    v_mov_b32_e32 v0, 0xff7fffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_minmax_num_f32 v0, s0, 0x7f7fffff, v0
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
  ret float %rsq_clamp
}

; undef f64 operand. The expected rsq reads a scalar register pair
; (s[4:5] on tahiti and tonga, s[0:1] on gfx1200).
define double @v_rsq_clamp_undef_f64() #0 {
; SI-LABEL: v_rsq_clamp_undef_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], s[4:5]
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_undef_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f64_e32 v[0:1], s[4:5]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_undef_f64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], s[0:1]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double undef)
  ret double %rsq_clamp
}

; Same IR as @v_rsq_clamp_f32 but using attribute group #2
; ("amdgpu-ieee"="false"). The expected instruction sequence is the same as
; for the default-attribute version.
define float @v_rsq_clamp_f32_non_ieee(float %src) #2 {
; SI-LABEL: v_rsq_clamp_f32_non_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f32_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f32_non_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f32_e32 v0, v0
; VI-NEXT:    v_min_f32_e32 v0, 0x7f7fffff, v0
; VI-NEXT:    v_max_f32_e32 v0, 0xff7fffff, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f32_non_ieee:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f32_e32 v0, v0
; GFX12-NEXT:    v_mov_b32_e32 v1, 0xff7fffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_minmax_num_f32 v0, v0, 0x7f7fffff, v1
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
  ret float %rsq_clamp
}

; Same IR as @v_rsq_clamp_f64 but using attribute group #2
; ("amdgpu-ieee"="false"). The expected instruction sequence is the same as
; for the default-attribute version.
define double @v_rsq_clamp_f64_non_ieee(double %src) #2 {
; SI-LABEL: v_rsq_clamp_f64_non_ieee:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_rsq_clamp_f64_e32 v[0:1], v[0:1]
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_rsq_clamp_f64_non_ieee:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; VI-NEXT:    v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v2, -1
; VI-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; VI-NEXT:    v_max_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_rsq_clamp_f64_non_ieee:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT:    s_wait_expcnt 0x0
; GFX12-NEXT:    s_wait_samplecnt 0x0
; GFX12-NEXT:    s_wait_bvhcnt 0x0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    v_rsq_f64_e32 v[0:1], v[0:1]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0x7fefffff
; GFX12-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT:    v_min_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    v_mov_b32_e32 v2, -1
; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffefffff
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[2:3]
; GFX12-NEXT:    s_setpc_b64 s[30:31]
  %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
  ret double %rsq_clamp
}

; Intrinsics exercised by the functions above.
declare float @llvm.fabs.f32(float) #1
declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
declare double @llvm.fabs.f64(double) #1
declare double @llvm.amdgcn.rsq.clamp.f64(double) #1

; #0: default test functions; #1: intrinsic declarations;
; #2: the *_non_ieee variants.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-ieee"="false" }