1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 2; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s 4; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=R600,EG %s 5; RUN: llc -mtriple=r600 -mcpu=cayman < %s | FileCheck -check-prefixes=R600,CM %s 6 7define float @v_rcp_f32_ieee(float %x) #3 { 8; SI-LABEL: v_rcp_f32_ieee: 9; SI: ; %bb.0: 10; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 11; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 12; SI-NEXT: v_rcp_f32_e32 v2, v1 13; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 14; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 15; SI-NEXT: v_fma_f32 v2, v4, v2, v2 16; SI-NEXT: v_mul_f32_e32 v4, v3, v2 17; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 18; SI-NEXT: v_fma_f32 v4, v5, v2, v4 19; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 20; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 21; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 22; SI-NEXT: s_setpc_b64 s[30:31] 23; 24; VI-LABEL: v_rcp_f32_ieee: 25; VI: ; %bb.0: 26; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 27; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 28; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 29; VI-NEXT: v_rcp_f32_e32 v3, v1 30; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 31; VI-NEXT: v_fma_f32 v3, v4, v3, v3 32; VI-NEXT: v_mul_f32_e32 v4, v2, v3 33; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 34; VI-NEXT: v_fma_f32 v4, v5, v3, v4 35; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 36; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 37; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 38; VI-NEXT: s_setpc_b64 s[30:31] 39; 40; R600-LABEL: v_rcp_f32_ieee: 41; R600: ; %bb.0: 42; R600-NEXT: CF_END 43; R600-NEXT: PAD 44 %rcp = fdiv float 1.0, %x 45 ret float %rcp 46} 47 48define float @v_rcp_f32_ieee_unsafe(float %x) #4 { 49; GCN-LABEL: v_rcp_f32_ieee_unsafe: 50; GCN: ; %bb.0: 51; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 52; GCN-NEXT: 
v_rcp_f32_e32 v0, v0 53; GCN-NEXT: s_setpc_b64 s[30:31] 54; 55; R600-LABEL: v_rcp_f32_ieee_unsafe: 56; R600: ; %bb.0: 57; R600-NEXT: CF_END 58; R600-NEXT: PAD 59 %rcp = fdiv float 1.0, %x 60 ret float %rcp 61} 62 63define float @v_rcp_f32_ieee_known_not_denormal(float nofpclass(sub) %x) #3 { 64; SI-LABEL: v_rcp_f32_ieee_known_not_denormal: 65; SI: ; %bb.0: 66; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 67; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 68; SI-NEXT: v_rcp_f32_e32 v2, v1 69; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 70; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 71; SI-NEXT: v_fma_f32 v2, v4, v2, v2 72; SI-NEXT: v_mul_f32_e32 v4, v3, v2 73; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 74; SI-NEXT: v_fma_f32 v4, v5, v2, v4 75; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 76; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 77; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 78; SI-NEXT: s_setpc_b64 s[30:31] 79; 80; VI-LABEL: v_rcp_f32_ieee_known_not_denormal: 81; VI: ; %bb.0: 82; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 83; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 84; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 85; VI-NEXT: v_rcp_f32_e32 v3, v1 86; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 87; VI-NEXT: v_fma_f32 v3, v4, v3, v3 88; VI-NEXT: v_mul_f32_e32 v4, v2, v3 89; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 90; VI-NEXT: v_fma_f32 v4, v5, v3, v4 91; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 92; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 93; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 94; VI-NEXT: s_setpc_b64 s[30:31] 95; 96; R600-LABEL: v_rcp_f32_ieee_known_not_denormal: 97; R600: ; %bb.0: 98; R600-NEXT: CF_END 99; R600-NEXT: PAD 100 %rcp = fdiv float 1.0, %x 101 ret float %rcp 102} 103 104define float @v_rcp_f32_ieee_nnan_ninf(float %x) #3 { 105; SI-LABEL: v_rcp_f32_ieee_nnan_ninf: 106; SI: ; %bb.0: 107; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 108; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 109; SI-NEXT: v_rcp_f32_e32 v2, v1 110; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 
1.0 111; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 112; SI-NEXT: v_fma_f32 v2, v4, v2, v2 113; SI-NEXT: v_mul_f32_e32 v4, v3, v2 114; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 115; SI-NEXT: v_fma_f32 v4, v5, v2, v4 116; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 117; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 118; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 119; SI-NEXT: s_setpc_b64 s[30:31] 120; 121; VI-LABEL: v_rcp_f32_ieee_nnan_ninf: 122; VI: ; %bb.0: 123; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 124; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 125; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 126; VI-NEXT: v_rcp_f32_e32 v3, v1 127; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 128; VI-NEXT: v_fma_f32 v3, v4, v3, v3 129; VI-NEXT: v_mul_f32_e32 v4, v2, v3 130; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 131; VI-NEXT: v_fma_f32 v4, v5, v3, v4 132; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 133; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 134; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 135; VI-NEXT: s_setpc_b64 s[30:31] 136; 137; R600-LABEL: v_rcp_f32_ieee_nnan_ninf: 138; R600: ; %bb.0: 139; R600-NEXT: CF_END 140; R600-NEXT: PAD 141 %rcp = fdiv nnan ninf float 1.0, %x 142 ret float %rcp 143} 144 145define float @v_neg_rcp_f32_ieee(float %x) #3 { 146; SI-LABEL: v_neg_rcp_f32_ieee: 147; SI: ; %bb.0: 148; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 149; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 150; SI-NEXT: v_rcp_f32_e32 v2, v1 151; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 152; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 153; SI-NEXT: v_fma_f32 v2, v4, v2, v2 154; SI-NEXT: v_mul_f32_e32 v4, v3, v2 155; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 156; SI-NEXT: v_fma_f32 v4, v5, v2, v4 157; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 158; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 159; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 160; SI-NEXT: s_setpc_b64 s[30:31] 161; 162; VI-LABEL: v_neg_rcp_f32_ieee: 163; VI: ; %bb.0: 164; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 165; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 
-1.0 166; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 167; VI-NEXT: v_rcp_f32_e32 v3, v1 168; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 169; VI-NEXT: v_fma_f32 v3, v4, v3, v3 170; VI-NEXT: v_mul_f32_e32 v4, v2, v3 171; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 172; VI-NEXT: v_fma_f32 v4, v5, v3, v4 173; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 174; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 175; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 176; VI-NEXT: s_setpc_b64 s[30:31] 177; 178; R600-LABEL: v_neg_rcp_f32_ieee: 179; R600: ; %bb.0: 180; R600-NEXT: CF_END 181; R600-NEXT: PAD 182 %rcp = fdiv float -1.0, %x 183 ret float %rcp 184} 185 186define float @v_rcp_f32_daz(float %x) #0 { 187; SI-LABEL: v_rcp_f32_daz: 188; SI: ; %bb.0: 189; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 190; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 191; SI-NEXT: v_rcp_f32_e32 v2, v1 192; SI-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 193; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 194; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 195; SI-NEXT: v_fma_f32 v2, v4, v2, v2 196; SI-NEXT: v_mul_f32_e32 v4, v3, v2 197; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 198; SI-NEXT: v_fma_f32 v4, v5, v2, v4 199; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 200; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 201; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 202; SI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 203; SI-NEXT: s_setpc_b64 s[30:31] 204; 205; VI-LABEL: v_rcp_f32_daz: 206; VI: ; %bb.0: 207; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 208; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0 209; VI-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0 210; VI-NEXT: v_rcp_f32_e32 v3, v1 211; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 212; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 213; VI-NEXT: v_fma_f32 v3, v4, v3, v3 214; VI-NEXT: v_mul_f32_e32 v4, v2, v3 215; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 216; VI-NEXT: v_fma_f32 v4, v5, v3, v4 217; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 218; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 
2), 0 219; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 220; VI-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 221; VI-NEXT: s_setpc_b64 s[30:31] 222; 223; R600-LABEL: v_rcp_f32_daz: 224; R600: ; %bb.0: 225; R600-NEXT: CF_END 226; R600-NEXT: PAD 227 %rcp = fdiv float 1.0, %x 228 ret float %rcp 229} 230 231define float @v_neg_rcp_f32_daz(float %x) #0 { 232; SI-LABEL: v_neg_rcp_f32_daz: 233; SI: ; %bb.0: 234; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 235; SI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 236; SI-NEXT: v_rcp_f32_e32 v2, v1 237; SI-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0 238; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 239; SI-NEXT: v_fma_f32 v4, -v1, v2, 1.0 240; SI-NEXT: v_fma_f32 v2, v4, v2, v2 241; SI-NEXT: v_mul_f32_e32 v4, v3, v2 242; SI-NEXT: v_fma_f32 v5, -v1, v4, v3 243; SI-NEXT: v_fma_f32 v4, v5, v2, v4 244; SI-NEXT: v_fma_f32 v1, -v1, v4, v3 245; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 246; SI-NEXT: v_div_fmas_f32 v1, v1, v2, v4 247; SI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 248; SI-NEXT: s_setpc_b64 s[30:31] 249; 250; VI-LABEL: v_neg_rcp_f32_daz: 251; VI: ; %bb.0: 252; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 253; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0 254; VI-NEXT: v_div_scale_f32 v2, vcc, -1.0, v0, -1.0 255; VI-NEXT: v_rcp_f32_e32 v3, v1 256; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 257; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 258; VI-NEXT: v_fma_f32 v3, v4, v3, v3 259; VI-NEXT: v_mul_f32_e32 v4, v2, v3 260; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 261; VI-NEXT: v_fma_f32 v4, v5, v3, v4 262; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 263; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 264; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 265; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0 266; VI-NEXT: s_setpc_b64 s[30:31] 267; 268; R600-LABEL: v_neg_rcp_f32_daz: 269; R600: ; %bb.0: 270; R600-NEXT: CF_END 271; R600-NEXT: PAD 272 %rcp = fdiv float -1.0, %x 273 ret float %rcp 274} 275 276define float 
@v_rcp_f32_ieee_ulp25(float %x) #3 { 277; SI-LABEL: v_rcp_f32_ieee_ulp25: 278; SI: ; %bb.0: 279; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 280; SI-NEXT: s_mov_b32 s4, 0x7f800000 281; SI-NEXT: v_frexp_mant_f32_e32 v1, v0 282; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 283; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 284; SI-NEXT: v_rcp_f32_e32 v1, v1 285; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 286; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 287; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 288; SI-NEXT: s_setpc_b64 s[30:31] 289; 290; VI-LABEL: v_rcp_f32_ieee_ulp25: 291; VI: ; %bb.0: 292; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 293; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 294; VI-NEXT: v_rcp_f32_e32 v1, v1 295; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 296; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 297; VI-NEXT: v_ldexp_f32 v0, v1, v0 298; VI-NEXT: s_setpc_b64 s[30:31] 299; 300; R600-LABEL: v_rcp_f32_ieee_ulp25: 301; R600: ; %bb.0: 302; R600-NEXT: CF_END 303; R600-NEXT: PAD 304 %rcp = fdiv float 1.0, %x, !fpmath !0 305 ret float %rcp 306} 307 308define float @v_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 { 309; SI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: 310; SI: ; %bb.0: 311; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 312; SI-NEXT: s_mov_b32 s4, 0x7f800000 313; SI-NEXT: v_frexp_mant_f32_e32 v1, v0 314; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 315; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 316; SI-NEXT: v_rcp_f32_e32 v1, v1 317; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 318; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 319; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 320; SI-NEXT: s_setpc_b64 s[30:31] 321; 322; VI-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: 323; VI: ; %bb.0: 324; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 325; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 326; VI-NEXT: v_rcp_f32_e32 v1, v1 327; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 328; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 329; VI-NEXT: v_ldexp_f32 v0, v1, v0 330; VI-NEXT: s_setpc_b64 
s[30:31] 331; 332; R600-LABEL: v_rcp_f32_ieee_ulp25_known_not_denormal: 333; R600: ; %bb.0: 334; R600-NEXT: CF_END 335; R600-NEXT: PAD 336 %rcp = fdiv float 1.0, %x, !fpmath !0 337 ret float %rcp 338} 339 340define float @v_neg_rcp_f32_ieee_ulp25_known_not_denormal(float nofpclass(sub) %x) #3 { 341; SI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: 342; SI: ; %bb.0: 343; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 344; SI-NEXT: s_mov_b32 s4, 0x7f800000 345; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0 346; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 347; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] 348; SI-NEXT: v_rcp_f32_e32 v1, v1 349; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 350; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 351; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 352; SI-NEXT: s_setpc_b64 s[30:31] 353; 354; VI-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: 355; VI: ; %bb.0: 356; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 357; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0 358; VI-NEXT: v_rcp_f32_e32 v1, v1 359; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 360; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 361; VI-NEXT: v_ldexp_f32 v0, v1, v0 362; VI-NEXT: s_setpc_b64 s[30:31] 363; 364; R600-LABEL: v_neg_rcp_f32_ieee_ulp25_known_not_denormal: 365; R600: ; %bb.0: 366; R600-NEXT: CF_END 367; R600-NEXT: PAD 368 %rcp = fdiv float -1.0, %x, !fpmath !0 369 ret float %rcp 370} 371 372define float @v_rcp_f32_ieee_ulp25_ninf_nnan(float %x) #3 { 373; SI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: 374; SI: ; %bb.0: 375; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 376; SI-NEXT: s_mov_b32 s4, 0x7f800000 377; SI-NEXT: v_frexp_mant_f32_e32 v1, v0 378; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 379; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 380; SI-NEXT: v_rcp_f32_e32 v1, v1 381; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 382; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 383; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 384; SI-NEXT: s_setpc_b64 s[30:31] 385; 386; VI-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: 387; 
VI: ; %bb.0: 388; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 389; VI-NEXT: v_frexp_mant_f32_e32 v1, v0 390; VI-NEXT: v_rcp_f32_e32 v1, v1 391; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 392; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 393; VI-NEXT: v_ldexp_f32 v0, v1, v0 394; VI-NEXT: s_setpc_b64 s[30:31] 395; 396; R600-LABEL: v_rcp_f32_ieee_ulp25_ninf_nnan: 397; R600: ; %bb.0: 398; R600-NEXT: CF_END 399; R600-NEXT: PAD 400 %rcp = fdiv ninf nnan float 1.0, %x, !fpmath !0 401 ret float %rcp 402} 403 404define float @v_rcp_f32_daz_ulp25(float %x) #0 { 405; GCN-LABEL: v_rcp_f32_daz_ulp25: 406; GCN: ; %bb.0: 407; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 408; GCN-NEXT: v_rcp_f32_e32 v0, v0 409; GCN-NEXT: s_setpc_b64 s[30:31] 410; 411; R600-LABEL: v_rcp_f32_daz_ulp25: 412; R600: ; %bb.0: 413; R600-NEXT: CF_END 414; R600-NEXT: PAD 415 %rcp = fdiv float 1.0, %x, !fpmath !0 416 ret float %rcp 417} 418 419define float @v_neg_rcp_f32_ieee_ulp25(float %x) #3 { 420; SI-LABEL: v_neg_rcp_f32_ieee_ulp25: 421; SI: ; %bb.0: 422; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 423; SI-NEXT: s_mov_b32 s4, 0x7f800000 424; SI-NEXT: v_frexp_mant_f32_e64 v1, -v0 425; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 426; SI-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5] 427; SI-NEXT: v_rcp_f32_e32 v1, v1 428; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 429; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 430; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 431; SI-NEXT: s_setpc_b64 s[30:31] 432; 433; VI-LABEL: v_neg_rcp_f32_ieee_ulp25: 434; VI: ; %bb.0: 435; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 436; VI-NEXT: v_frexp_mant_f32_e64 v1, -v0 437; VI-NEXT: v_rcp_f32_e32 v1, v1 438; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 439; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 440; VI-NEXT: v_ldexp_f32 v0, v1, v0 441; VI-NEXT: s_setpc_b64 s[30:31] 442; 443; R600-LABEL: v_neg_rcp_f32_ieee_ulp25: 444; R600: ; %bb.0: 445; R600-NEXT: CF_END 446; R600-NEXT: PAD 447 %rcp = fdiv float -1.0, %x, !fpmath !0 448 ret float %rcp 449} 
450 451define float @v_neg_rcp_f32_daz_ulp25(float %x) #0 { 452; GCN-LABEL: v_neg_rcp_f32_daz_ulp25: 453; GCN: ; %bb.0: 454; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 455; GCN-NEXT: v_rcp_f32_e64 v0, -v0 456; GCN-NEXT: s_setpc_b64 s[30:31] 457; 458; R600-LABEL: v_neg_rcp_f32_daz_ulp25: 459; R600: ; %bb.0: 460; R600-NEXT: CF_END 461; R600-NEXT: PAD 462 %rcp = fdiv float -1.0, %x, !fpmath !0 463 ret float %rcp 464} 465 466define float @v_rcp_fabs_f32_ieee(float %x) #3 { 467; SI-LABEL: v_rcp_fabs_f32_ieee: 468; SI: ; %bb.0: 469; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 470; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 471; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 472; SI-NEXT: v_rcp_f32_e32 v3, v2 473; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 474; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 475; SI-NEXT: v_fma_f32 v3, v4, v3, v3 476; SI-NEXT: v_mul_f32_e32 v4, v1, v3 477; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 478; SI-NEXT: v_fma_f32 v4, v5, v3, v4 479; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 480; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 481; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 482; SI-NEXT: s_setpc_b64 s[30:31] 483; 484; VI-LABEL: v_rcp_fabs_f32_ieee: 485; VI: ; %bb.0: 486; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 487; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 488; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 489; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 490; VI-NEXT: v_rcp_f32_e32 v3, v2 491; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 492; VI-NEXT: v_fma_f32 v3, v4, v3, v3 493; VI-NEXT: v_mul_f32_e32 v4, v1, v3 494; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 495; VI-NEXT: v_fma_f32 v4, v5, v3, v4 496; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 497; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 498; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 499; VI-NEXT: s_setpc_b64 s[30:31] 500; 501; R600-LABEL: v_rcp_fabs_f32_ieee: 502; R600: ; %bb.0: 503; R600-NEXT: CF_END 504; R600-NEXT: PAD 505 %fabs.x = call float @llvm.fabs.f32(float %x) 506 %rcp = fdiv float 1.0, %fabs.x 
507 ret float %rcp 508} 509 510define float @v_rcp_fabs_f32_daz(float %x) #0 { 511; SI-LABEL: v_rcp_fabs_f32_daz: 512; SI: ; %bb.0: 513; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 514; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 515; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 516; SI-NEXT: v_rcp_f32_e32 v3, v2 517; SI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 518; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 519; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 520; SI-NEXT: v_fma_f32 v3, v4, v3, v3 521; SI-NEXT: v_mul_f32_e32 v4, v1, v3 522; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 523; SI-NEXT: v_fma_f32 v4, v5, v3, v4 524; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 525; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 526; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 527; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 528; SI-NEXT: s_setpc_b64 s[30:31] 529; 530; VI-LABEL: v_rcp_fabs_f32_daz: 531; VI: ; %bb.0: 532; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 533; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 534; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0 535; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, v1, 1.0 536; VI-NEXT: v_rcp_f32_e32 v3, v2 537; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 538; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 539; VI-NEXT: v_fma_f32 v3, v4, v3, v3 540; VI-NEXT: v_mul_f32_e32 v4, v1, v3 541; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 542; VI-NEXT: v_fma_f32 v4, v5, v3, v4 543; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 544; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 545; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 546; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, 1.0 547; VI-NEXT: s_setpc_b64 s[30:31] 548; 549; R600-LABEL: v_rcp_fabs_f32_daz: 550; R600: ; %bb.0: 551; R600-NEXT: CF_END 552; R600-NEXT: PAD 553 %fabs.x = call float @llvm.fabs.f32(float %x) 554 %rcp = fdiv float 1.0, %fabs.x 555 ret float %rcp 556} 557 558define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 { 559; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25: 560; SI: ; %bb.0: 561; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 562; SI-NEXT: s_mov_b32 s4, 0x7f800000 563; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0| 564; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 565; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5] 566; SI-NEXT: v_rcp_f32_e32 v1, v1 567; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 568; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 569; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 570; SI-NEXT: s_setpc_b64 s[30:31] 571; 572; VI-LABEL: v_rcp_fabs_f32_ieee_ulp25: 573; VI: ; %bb.0: 574; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 575; VI-NEXT: v_frexp_mant_f32_e64 v1, |v0| 576; VI-NEXT: v_rcp_f32_e32 v1, v1 577; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 578; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 579; VI-NEXT: v_ldexp_f32 v0, v1, v0 580; VI-NEXT: s_setpc_b64 s[30:31] 581; 582; R600-LABEL: v_rcp_fabs_f32_ieee_ulp25: 583; R600: ; %bb.0: 584; R600-NEXT: CF_END 585; R600-NEXT: PAD 586 %fabs.x = call float @llvm.fabs.f32(float %x) 587 %rcp = fdiv float 1.0, %fabs.x, !fpmath !0 588 ret float %rcp 589} 590 591define float @v_rcp_fabs_f32_daz_ulp25(float %x) #0 { 592; GCN-LABEL: v_rcp_fabs_f32_daz_ulp25: 593; GCN: ; %bb.0: 594; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 595; GCN-NEXT: v_rcp_f32_e64 v0, |v0| 596; GCN-NEXT: s_setpc_b64 s[30:31] 597; 598; R600-LABEL: v_rcp_fabs_f32_daz_ulp25: 599; R600: ; %bb.0: 600; R600-NEXT: CF_END 601; R600-NEXT: PAD 602 %fabs.x = call float @llvm.fabs.f32(float %x) 603 %rcp = fdiv float 1.0, %fabs.x, !fpmath !0 604 ret float %rcp 605} 606 607define float @v_rcp_neg_fabs_f32_ieee(float %x) #3 { 608; SI-LABEL: v_rcp_neg_fabs_f32_ieee: 609; SI: ; %bb.0: 610; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 611; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 612; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 613; SI-NEXT: v_rcp_f32_e32 v3, v2 614; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 615; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 616; SI-NEXT: v_fma_f32 v3, v4, v3, v3 617; SI-NEXT: v_mul_f32_e32 v4, v1, v3 618; SI-NEXT: 
v_fma_f32 v5, -v2, v4, v1 619; SI-NEXT: v_fma_f32 v4, v5, v3, v4 620; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 621; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 622; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 623; SI-NEXT: s_setpc_b64 s[30:31] 624; 625; VI-LABEL: v_rcp_neg_fabs_f32_ieee: 626; VI: ; %bb.0: 627; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 628; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 629; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 630; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 631; VI-NEXT: v_rcp_f32_e32 v3, v2 632; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 633; VI-NEXT: v_fma_f32 v3, v4, v3, v3 634; VI-NEXT: v_mul_f32_e32 v4, v1, v3 635; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 636; VI-NEXT: v_fma_f32 v4, v5, v3, v4 637; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 638; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 639; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 640; VI-NEXT: s_setpc_b64 s[30:31] 641; 642; R600-LABEL: v_rcp_neg_fabs_f32_ieee: 643; R600: ; %bb.0: 644; R600-NEXT: CF_END 645; R600-NEXT: PAD 646 %fabs.x = call float @llvm.fabs.f32(float %x) 647 %rcp = fdiv float -1.0, %fabs.x 648 ret float %rcp 649} 650 651define float @v_rcp_neg_fabs_f32_daz(float %x) #0 { 652; SI-LABEL: v_rcp_neg_fabs_f32_daz: 653; SI: ; %bb.0: 654; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 655; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 656; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 657; SI-NEXT: v_rcp_f32_e32 v3, v2 658; SI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 659; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 660; SI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 661; SI-NEXT: v_fma_f32 v3, v4, v3, v3 662; SI-NEXT: v_mul_f32_e32 v4, v1, v3 663; SI-NEXT: v_fma_f32 v5, -v2, v4, v1 664; SI-NEXT: v_fma_f32 v4, v5, v3, v4 665; SI-NEXT: v_fma_f32 v1, -v2, v4, v1 666; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 667; SI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 668; SI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 669; SI-NEXT: s_setpc_b64 s[30:31] 670; 671; VI-LABEL: 
v_rcp_neg_fabs_f32_daz: 672; VI: ; %bb.0: 673; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 674; VI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 675; VI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0 676; VI-NEXT: v_div_scale_f32 v1, vcc, -1.0, v1, -1.0 677; VI-NEXT: v_rcp_f32_e32 v3, v2 678; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 679; VI-NEXT: v_fma_f32 v4, -v2, v3, 1.0 680; VI-NEXT: v_fma_f32 v3, v4, v3, v3 681; VI-NEXT: v_mul_f32_e32 v4, v1, v3 682; VI-NEXT: v_fma_f32 v5, -v2, v4, v1 683; VI-NEXT: v_fma_f32 v4, v5, v3, v4 684; VI-NEXT: v_fma_f32 v1, -v2, v4, v1 685; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 686; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 687; VI-NEXT: v_div_fixup_f32 v0, v1, |v0|, -1.0 688; VI-NEXT: s_setpc_b64 s[30:31] 689; 690; R600-LABEL: v_rcp_neg_fabs_f32_daz: 691; R600: ; %bb.0: 692; R600-NEXT: CF_END 693; R600-NEXT: PAD 694 %fabs.x = call float @llvm.fabs.f32(float %x) 695 %rcp = fdiv float -1.0, %fabs.x 696 ret float %rcp 697} 698 699define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 { 700; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: 701; SI: ; %bb.0: 702; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 703; SI-NEXT: s_mov_b32 s4, 0x7f800000 704; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0| 705; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 706; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5] 707; SI-NEXT: v_rcp_f32_e32 v1, v1 708; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 709; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0 710; SI-NEXT: v_ldexp_f32_e32 v0, v1, v0 711; SI-NEXT: s_setpc_b64 s[30:31] 712; 713; VI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: 714; VI: ; %bb.0: 715; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 716; VI-NEXT: v_frexp_mant_f32_e64 v1, -|v0| 717; VI-NEXT: v_rcp_f32_e32 v1, v1 718; VI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 719; VI-NEXT: v_sub_u32_e32 v0, vcc, 0, v0 720; VI-NEXT: v_ldexp_f32 v0, v1, v0 721; VI-NEXT: s_setpc_b64 s[30:31] 722; 723; R600-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25: 724; R600: ; %bb.0: 
725; R600-NEXT: CF_END 726; R600-NEXT: PAD 727 %fabs.x = call float @llvm.fabs.f32(float %x) 728 %rcp = fdiv float -1.0, %fabs.x, !fpmath !0 729 ret float %rcp 730} 731 732define float @v_rcp_neg_fabs_f32_daz_ulp25(float %x) #0 { 733; GCN-LABEL: v_rcp_neg_fabs_f32_daz_ulp25: 734; GCN: ; %bb.0: 735; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 736; GCN-NEXT: v_rcp_f32_e64 v0, -|v0| 737; GCN-NEXT: s_setpc_b64 s[30:31] 738; 739; R600-LABEL: v_rcp_neg_fabs_f32_daz_ulp25: 740; R600: ; %bb.0: 741; R600-NEXT: CF_END 742; R600-NEXT: PAD 743 %fabs.x = call float @llvm.fabs.f32(float %x) 744 %rcp = fdiv float -1.0, %fabs.x, !fpmath !0 745 ret float %rcp 746} 747 748define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 749; SI-LABEL: s_rcp_pat_f32_daz: 750; SI: ; %bb.0: 751; SI-NEXT: s_load_dword s2, s[4:5], 0xb 752; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 753; SI-NEXT: s_mov_b32 s3, 0xf000 754; SI-NEXT: s_waitcnt lgkmcnt(0) 755; SI-NEXT: v_rcp_f32_e32 v0, s2 756; SI-NEXT: s_mov_b32 s2, -1 757; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 758; SI-NEXT: s_endpgm 759; 760; VI-LABEL: s_rcp_pat_f32_daz: 761; VI: ; %bb.0: 762; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 763; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 764; VI-NEXT: s_waitcnt lgkmcnt(0) 765; VI-NEXT: v_rcp_f32_e32 v2, s2 766; VI-NEXT: v_mov_b32_e32 v0, s0 767; VI-NEXT: v_mov_b32_e32 v1, s1 768; VI-NEXT: flat_store_dword v[0:1], v2 769; VI-NEXT: s_endpgm 770; 771; EG-LABEL: s_rcp_pat_f32_daz: 772; EG: ; %bb.0: 773; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 774; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 775; EG-NEXT: CF_END 776; EG-NEXT: PAD 777; EG-NEXT: ALU clause starting at 4: 778; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 779; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, 780; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 781; 782; CM-LABEL: s_rcp_pat_f32_daz: 783; CM: ; %bb.0: 784; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 785; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 
786; CM-NEXT: CF_END 787; CM-NEXT: PAD 788; CM-NEXT: ALU clause starting at 4: 789; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 790; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 791; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, 792; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, 793; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, 794; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, 795 %rcp = fdiv float 1.0, %src, !fpmath !0 796 store float %rcp, ptr addrspace(1) %out, align 4 797 ret void 798} 799 800define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 801; SI-LABEL: s_rcp_ulp25_pat_f32_daz: 802; SI: ; %bb.0: 803; SI-NEXT: s_load_dword s2, s[4:5], 0xb 804; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 805; SI-NEXT: s_mov_b32 s3, 0xf000 806; SI-NEXT: s_waitcnt lgkmcnt(0) 807; SI-NEXT: v_rcp_f32_e32 v0, s2 808; SI-NEXT: s_mov_b32 s2, -1 809; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 810; SI-NEXT: s_endpgm 811; 812; VI-LABEL: s_rcp_ulp25_pat_f32_daz: 813; VI: ; %bb.0: 814; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 815; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 816; VI-NEXT: s_waitcnt lgkmcnt(0) 817; VI-NEXT: v_rcp_f32_e32 v2, s2 818; VI-NEXT: v_mov_b32_e32 v0, s0 819; VI-NEXT: v_mov_b32_e32 v1, s1 820; VI-NEXT: flat_store_dword v[0:1], v2 821; VI-NEXT: s_endpgm 822; 823; EG-LABEL: s_rcp_ulp25_pat_f32_daz: 824; EG: ; %bb.0: 825; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 826; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 827; EG-NEXT: CF_END 828; EG-NEXT: PAD 829; EG-NEXT: ALU clause starting at 4: 830; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 831; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, 832; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 833; 834; CM-LABEL: s_rcp_ulp25_pat_f32_daz: 835; CM: ; %bb.0: 836; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 837; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 838; CM-NEXT: CF_END 839; CM-NEXT: PAD 840; CM-NEXT: ALU clause starting at 4: 841; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 842; CM-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 843; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, 844; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, 845; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, 846; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, 847 %rcp = fdiv float 1.0, %src, !fpmath !0 848 store float %rcp, ptr addrspace(1) %out, align 4 849 ret void 850} 851 852define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 853; SI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: 854; SI: ; %bb.0: 855; SI-NEXT: s_load_dword s2, s[4:5], 0xb 856; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 857; SI-NEXT: s_mov_b32 s3, 0xf000 858; SI-NEXT: s_waitcnt lgkmcnt(0) 859; SI-NEXT: v_rcp_f32_e32 v0, s2 860; SI-NEXT: s_mov_b32 s2, -1 861; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 862; SI-NEXT: s_endpgm 863; 864; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz: 865; VI: ; %bb.0: 866; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 867; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 868; VI-NEXT: s_waitcnt lgkmcnt(0) 869; VI-NEXT: v_rcp_f32_e32 v2, s2 870; VI-NEXT: v_mov_b32_e32 v0, s0 871; VI-NEXT: v_mov_b32_e32 v1, s1 872; VI-NEXT: flat_store_dword v[0:1], v2 873; VI-NEXT: s_endpgm 874; 875; EG-LABEL: s_rcp_fast_ulp25_pat_f32_daz: 876; EG: ; %bb.0: 877; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 878; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 879; EG-NEXT: CF_END 880; EG-NEXT: PAD 881; EG-NEXT: ALU clause starting at 4: 882; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 883; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, 884; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 885; 886; CM-LABEL: s_rcp_fast_ulp25_pat_f32_daz: 887; CM: ; %bb.0: 888; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 889; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 890; CM-NEXT: CF_END 891; CM-NEXT: PAD 892; CM-NEXT: ALU clause starting at 4: 893; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 894; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 895; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, 896; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, 
; NOTE(review): the CHECK lines in this test are autogenerated by
; utils/update_llc_test_checks.py (see the UTC_ARGS header at the top of the
; file); do not hand-edit them -- change the IR and regenerate instead.
897; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, 898; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, 899 %rcp = fdiv fast float 1.0, %src, !fpmath !0 900 store float %rcp, ptr addrspace(1) %out, align 4 901 ret void 902} 903 904define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 905; SI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: 906; SI: ; %bb.0: 907; SI-NEXT: s_load_dword s2, s[4:5], 0xb 908; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 909; SI-NEXT: s_mov_b32 s3, 0xf000 910; SI-NEXT: s_waitcnt lgkmcnt(0) 911; SI-NEXT: v_rcp_f32_e32 v0, s2 912; SI-NEXT: s_mov_b32 s2, -1 913; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 914; SI-NEXT: s_endpgm 915; 916; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: 917; VI: ; %bb.0: 918; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 919; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 920; VI-NEXT: s_waitcnt lgkmcnt(0) 921; VI-NEXT: v_rcp_f32_e32 v2, s2 922; VI-NEXT: v_mov_b32_e32 v0, s0 923; VI-NEXT: v_mov_b32_e32 v1, s1 924; VI-NEXT: flat_store_dword v[0:1], v2 925; VI-NEXT: s_endpgm 926; 927; EG-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: 928; EG: ; %bb.0: 929; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 930; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 931; EG-NEXT: CF_END 932; EG-NEXT: PAD 933; EG-NEXT: ALU clause starting at 4: 934; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 935; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, 936; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 937; 938; CM-LABEL: s_rcp_arcp_ulp25_pat_f32_daz: 939; CM: ; %bb.0: 940; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 941; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 942; CM-NEXT: CF_END 943; CM-NEXT: PAD 944; CM-NEXT: ALU clause starting at 4: 945; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 946; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 947; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, 948; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, 949; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, 950; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, 951 %rcp = fdiv arcp 
float 1.0, %src, !fpmath !0 952 store float %rcp, ptr addrspace(1) %out, align 4 953 ret void 954} 955 956define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, float %src) #2 { 957; SI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: 958; SI: ; %bb.0: 959; SI-NEXT: s_load_dword s2, s[4:5], 0xb 960; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 961; SI-NEXT: s_mov_b32 s3, 0xf000 962; SI-NEXT: s_waitcnt lgkmcnt(0) 963; SI-NEXT: v_rcp_f32_e32 v0, s2 964; SI-NEXT: s_mov_b32 s2, -1 965; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 966; SI-NEXT: s_endpgm 967; 968; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: 969; VI: ; %bb.0: 970; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 971; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 972; VI-NEXT: s_waitcnt lgkmcnt(0) 973; VI-NEXT: v_rcp_f32_e32 v2, s2 974; VI-NEXT: v_mov_b32_e32 v0, s0 975; VI-NEXT: v_mov_b32_e32 v1, s1 976; VI-NEXT: flat_store_dword v[0:1], v2 977; VI-NEXT: s_endpgm 978; 979; EG-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: 980; EG: ; %bb.0: 981; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 982; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 983; EG-NEXT: CF_END 984; EG-NEXT: PAD 985; EG-NEXT: ALU clause starting at 4: 986; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 987; EG-NEXT: RECIP_IEEE * T1.X, KC0[2].Z, 988; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 989; 990; CM-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz: 991; CM: ; %bb.0: 992; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 993; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 994; CM-NEXT: CF_END 995; CM-NEXT: PAD 996; CM-NEXT: ALU clause starting at 4: 997; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 998; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 999; CM-NEXT: RECIP_IEEE T1.X, KC0[2].Z, 1000; CM-NEXT: RECIP_IEEE T1.Y (MASKED), KC0[2].Z, 1001; CM-NEXT: RECIP_IEEE T1.Z (MASKED), KC0[2].Z, 1002; CM-NEXT: RECIP_IEEE * T1.W (MASKED), KC0[2].Z, 1003 %rcp = fdiv float 1.0, %src, !fpmath !0 1004 store float %rcp, ptr addrspace(1) %out, align 4 1005 
ret void 1006} 1007 1008define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 1009; SI-LABEL: s_rcp_fabs_pat_f32_daz: 1010; SI: ; %bb.0: 1011; SI-NEXT: s_load_dword s2, s[4:5], 0xb 1012; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1013; SI-NEXT: s_mov_b32 s3, 0xf000 1014; SI-NEXT: s_waitcnt lgkmcnt(0) 1015; SI-NEXT: v_rcp_f32_e64 v0, |s2| 1016; SI-NEXT: s_mov_b32 s2, -1 1017; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1018; SI-NEXT: s_endpgm 1019; 1020; VI-LABEL: s_rcp_fabs_pat_f32_daz: 1021; VI: ; %bb.0: 1022; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1023; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1024; VI-NEXT: s_waitcnt lgkmcnt(0) 1025; VI-NEXT: v_rcp_f32_e64 v2, |s2| 1026; VI-NEXT: v_mov_b32_e32 v0, s0 1027; VI-NEXT: v_mov_b32_e32 v1, s1 1028; VI-NEXT: flat_store_dword v[0:1], v2 1029; VI-NEXT: s_endpgm 1030; 1031; EG-LABEL: s_rcp_fabs_pat_f32_daz: 1032; EG: ; %bb.0: 1033; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 1034; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 1035; EG-NEXT: CF_END 1036; EG-NEXT: PAD 1037; EG-NEXT: ALU clause starting at 4: 1038; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, 1039; EG-NEXT: RECIP_IEEE * T1.X, |KC0[2].Z|, 1040; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1041; 1042; CM-LABEL: s_rcp_fabs_pat_f32_daz: 1043; CM: ; %bb.0: 1044; CM-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 1045; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X 1046; CM-NEXT: CF_END 1047; CM-NEXT: PAD 1048; CM-NEXT: ALU clause starting at 4: 1049; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1050; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1051; CM-NEXT: RECIP_IEEE T1.X, |KC0[2].Z|, 1052; CM-NEXT: RECIP_IEEE T1.Y (MASKED), |KC0[2].Z|, 1053; CM-NEXT: RECIP_IEEE T1.Z (MASKED), |KC0[2].Z|, 1054; CM-NEXT: RECIP_IEEE * T1.W (MASKED), |KC0[2].Z|, 1055 %src.fabs = call float @llvm.fabs.f32(float %src) 1056 %rcp = fdiv float 1.0, %src.fabs, !fpmath !0 1057 store float %rcp, ptr addrspace(1) %out, align 4 1058 ret void 1059} 1060 1061define 
amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 1062; SI-LABEL: s_neg_rcp_pat_f32_daz: 1063; SI: ; %bb.0: 1064; SI-NEXT: s_load_dword s2, s[4:5], 0xb 1065; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1066; SI-NEXT: s_mov_b32 s3, 0xf000 1067; SI-NEXT: s_waitcnt lgkmcnt(0) 1068; SI-NEXT: v_rcp_f32_e64 v0, -s2 1069; SI-NEXT: s_mov_b32 s2, -1 1070; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1071; SI-NEXT: s_endpgm 1072; 1073; VI-LABEL: s_neg_rcp_pat_f32_daz: 1074; VI: ; %bb.0: 1075; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1076; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1077; VI-NEXT: s_waitcnt lgkmcnt(0) 1078; VI-NEXT: v_rcp_f32_e64 v2, -s2 1079; VI-NEXT: v_mov_b32_e32 v0, s0 1080; VI-NEXT: v_mov_b32_e32 v1, s1 1081; VI-NEXT: flat_store_dword v[0:1], v2 1082; VI-NEXT: s_endpgm 1083; 1084; EG-LABEL: s_neg_rcp_pat_f32_daz: 1085; EG: ; %bb.0: 1086; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1087; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1088; EG-NEXT: CF_END 1089; EG-NEXT: PAD 1090; EG-NEXT: ALU clause starting at 4: 1091; EG-NEXT: RECIP_IEEE * T0.X, KC0[2].Z, 1092; EG-NEXT: MUL_IEEE T0.X, literal.x, PS, 1093; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1094; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) 1095; 1096; CM-LABEL: s_neg_rcp_pat_f32_daz: 1097; CM: ; %bb.0: 1098; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1099; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 1100; CM-NEXT: CF_END 1101; CM-NEXT: PAD 1102; CM-NEXT: ALU clause starting at 4: 1103; CM-NEXT: RECIP_IEEE T0.X, KC0[2].Z, 1104; CM-NEXT: RECIP_IEEE T0.Y (MASKED), KC0[2].Z, 1105; CM-NEXT: RECIP_IEEE T0.Z (MASKED), KC0[2].Z, 1106; CM-NEXT: RECIP_IEEE * T0.W (MASKED), KC0[2].Z, 1107; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X, 1108; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) 1109; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1110; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1111 %rcp = fdiv float -1.0, %src, !fpmath !0 1112 store float %rcp, ptr 
addrspace(1) %out, align 4 1113 ret void 1114} 1115 1116define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, float %src) #0 { 1117; SI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: 1118; SI: ; %bb.0: 1119; SI-NEXT: s_load_dword s2, s[4:5], 0xb 1120; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1121; SI-NEXT: s_mov_b32 s3, 0xf000 1122; SI-NEXT: s_waitcnt lgkmcnt(0) 1123; SI-NEXT: v_rcp_f32_e64 v0, -|s2| 1124; SI-NEXT: s_mov_b32 s2, -1 1125; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1126; SI-NEXT: s_endpgm 1127; 1128; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz: 1129; VI: ; %bb.0: 1130; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1131; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1132; VI-NEXT: s_waitcnt lgkmcnt(0) 1133; VI-NEXT: v_rcp_f32_e64 v2, -|s2| 1134; VI-NEXT: v_mov_b32_e32 v0, s0 1135; VI-NEXT: v_mov_b32_e32 v1, s1 1136; VI-NEXT: flat_store_dword v[0:1], v2 1137; VI-NEXT: s_endpgm 1138; 1139; EG-LABEL: s_rcp_fabs_fneg_pat_f32_daz: 1140; EG: ; %bb.0: 1141; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1142; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1143; EG-NEXT: CF_END 1144; EG-NEXT: PAD 1145; EG-NEXT: ALU clause starting at 4: 1146; EG-NEXT: RECIP_IEEE * T0.X, |KC0[2].Z|, 1147; EG-NEXT: MUL_IEEE T0.X, literal.x, PS, 1148; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1149; EG-NEXT: -1082130432(-1.000000e+00), 2(2.802597e-45) 1150; 1151; CM-LABEL: s_rcp_fabs_fneg_pat_f32_daz: 1152; CM: ; %bb.0: 1153; CM-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1154; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 1155; CM-NEXT: CF_END 1156; CM-NEXT: PAD 1157; CM-NEXT: ALU clause starting at 4: 1158; CM-NEXT: RECIP_IEEE T0.X, |KC0[2].Z|, 1159; CM-NEXT: RECIP_IEEE T0.Y (MASKED), |KC0[2].Z|, 1160; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|, 1161; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|, 1162; CM-NEXT: MUL_IEEE * T0.X, literal.x, PV.X, 1163; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) 1164; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1165; CM-NEXT: 
2(2.802597e-45), 0(0.000000e+00) 1166 %src.fabs = call float @llvm.fabs.f32(float %src) 1167 %src.fabs.fneg = fneg float %src.fabs 1168 %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 1169 store float %rcp, ptr addrspace(1) %out, align 4 1170 ret void 1171} 1172 1173define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1) %out, float %src) #0 { 1174; SI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: 1175; SI: ; %bb.0: 1176; SI-NEXT: s_load_dword s6, s[4:5], 0xb 1177; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1178; SI-NEXT: s_mov_b32 s3, 0xf000 1179; SI-NEXT: s_mov_b32 s2, -1 1180; SI-NEXT: s_waitcnt lgkmcnt(0) 1181; SI-NEXT: v_rcp_f32_e64 v0, -|s6| 1182; SI-NEXT: v_mul_f32_e64 v1, s6, -|s6| 1183; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1184; SI-NEXT: s_waitcnt vmcnt(0) 1185; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0 1186; SI-NEXT: s_waitcnt vmcnt(0) 1187; SI-NEXT: s_endpgm 1188; 1189; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: 1190; VI: ; %bb.0: 1191; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 1192; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1193; VI-NEXT: s_waitcnt lgkmcnt(0) 1194; VI-NEXT: v_rcp_f32_e64 v2, -|s2| 1195; VI-NEXT: v_mov_b32_e32 v0, s0 1196; VI-NEXT: v_mov_b32_e32 v1, s1 1197; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2| 1198; VI-NEXT: flat_store_dword v[0:1], v2 1199; VI-NEXT: s_waitcnt vmcnt(0) 1200; VI-NEXT: flat_store_dword v[0:1], v3 1201; VI-NEXT: s_waitcnt vmcnt(0) 1202; VI-NEXT: s_endpgm 1203; 1204; EG-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: 1205; EG: ; %bb.0: 1206; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 1207; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 1208; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 1209; EG-NEXT: CF_END 1210; EG-NEXT: ALU clause starting at 4: 1211; EG-NEXT: MUL_IEEE T0.X, KC0[2].Z, -|KC0[2].Z|, 1212; EG-NEXT: RECIP_IEEE * T0.Y, |KC0[2].Z|, 1213; EG-NEXT: MUL_IEEE T1.X, literal.x, PS, 1214; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.y, 1215; EG-NEXT: 
-1082130432(-1.000000e+00), 2(2.802597e-45) 1216; 1217; CM-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz: 1218; CM: ; %bb.0: 1219; CM-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 1220; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X 1221; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X 1222; CM-NEXT: CF_END 1223; CM-NEXT: ALU clause starting at 4: 1224; CM-NEXT: MUL_IEEE * T0.X, KC0[2].Z, -|KC0[2].Z|, 1225; CM-NEXT: RECIP_IEEE T0.X (MASKED), |KC0[2].Z|, 1226; CM-NEXT: RECIP_IEEE T0.Y, |KC0[2].Z|, 1227; CM-NEXT: RECIP_IEEE T0.Z (MASKED), |KC0[2].Z|, 1228; CM-NEXT: RECIP_IEEE * T0.W (MASKED), |KC0[2].Z|, 1229; CM-NEXT: MUL_IEEE * T1.X, literal.x, PV.Y, 1230; CM-NEXT: -1082130432(-1.000000e+00), 0(0.000000e+00) 1231; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, 1232; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1233 %src.fabs = call float @llvm.fabs.f32(float %src) 1234 %src.fabs.fneg = fneg float %src.fabs 1235 %rcp = fdiv float 1.0, %src.fabs.fneg, !fpmath !0 1236 store volatile float %rcp, ptr addrspace(1) %out, align 4 1237 1238 %other = fmul float %src, %src.fabs.fneg 1239 store volatile float %other, ptr addrspace(1) %out, align 4 1240 ret void 1241} 1242 1243define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0 { 1244; SI-LABEL: s_div_arcp_2_x_pat_f32_daz: 1245; SI: ; %bb.0: 1246; SI-NEXT: s_load_dword s6, s[0:1], 0x0 1247; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1248; SI-NEXT: s_mov_b32 s3, 0xf000 1249; SI-NEXT: s_mov_b32 s2, -1 1250; SI-NEXT: s_waitcnt lgkmcnt(0) 1251; SI-NEXT: v_mul_f32_e64 v0, s6, 0.5 1252; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1253; SI-NEXT: s_endpgm 1254; 1255; VI-LABEL: s_div_arcp_2_x_pat_f32_daz: 1256; VI: ; %bb.0: 1257; VI-NEXT: s_load_dword s2, s[0:1], 0x0 1258; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1259; VI-NEXT: s_waitcnt lgkmcnt(0) 1260; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5 1261; VI-NEXT: v_mov_b32_e32 v0, s0 1262; VI-NEXT: v_mov_b32_e32 v1, s1 1263; VI-NEXT: flat_store_dword v[0:1], v2 1264; 
VI-NEXT: s_endpgm 1265; 1266; EG-LABEL: s_div_arcp_2_x_pat_f32_daz: 1267; EG: ; %bb.0: 1268; EG-NEXT: TEX 0 @4 1269; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] 1270; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1271; EG-NEXT: CF_END 1272; EG-NEXT: Fetch clause starting at 4: 1273; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1274; EG-NEXT: ALU clause starting at 6: 1275; EG-NEXT: MUL_IEEE T0.X, T0.X, 0.5, 1276; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1277; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1278; 1279; CM-LABEL: s_div_arcp_2_x_pat_f32_daz: 1280; CM: ; %bb.0: 1281; CM-NEXT: TEX 0 @4 1282; CM-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] 1283; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 1284; CM-NEXT: CF_END 1285; CM-NEXT: Fetch clause starting at 4: 1286; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1287; CM-NEXT: ALU clause starting at 6: 1288; CM-NEXT: MUL_IEEE * T0.X, T0.X, 0.5, 1289; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1290; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1291 %x = load float, ptr addrspace(1) undef 1292 %rcp = fdiv arcp float %x, 2.0 1293 store float %rcp, ptr addrspace(1) %out, align 4 1294 ret void 1295} 1296 1297define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { 1298; SI-LABEL: s_div_arcp_k_x_pat_f32_daz: 1299; SI: ; %bb.0: 1300; SI-NEXT: s_load_dword s6, s[0:1], 0x0 1301; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1302; SI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd 1303; SI-NEXT: s_mov_b32 s3, 0xf000 1304; SI-NEXT: s_mov_b32 s2, -1 1305; SI-NEXT: s_waitcnt lgkmcnt(0) 1306; SI-NEXT: v_mul_f32_e32 v0, s6, v0 1307; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1308; SI-NEXT: s_endpgm 1309; 1310; VI-LABEL: s_div_arcp_k_x_pat_f32_daz: 1311; VI: ; %bb.0: 1312; VI-NEXT: s_load_dword s2, s[0:1], 0x0 1313; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1314; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd 1315; VI-NEXT: s_waitcnt lgkmcnt(0) 1316; VI-NEXT: v_mul_f32_e32 v2, s2, v0 1317; VI-NEXT: v_mov_b32_e32 v0, s0 1318; VI-NEXT: 
v_mov_b32_e32 v1, s1 1319; VI-NEXT: flat_store_dword v[0:1], v2 1320; VI-NEXT: s_endpgm 1321; 1322; EG-LABEL: s_div_arcp_k_x_pat_f32_daz: 1323; EG: ; %bb.0: 1324; EG-NEXT: TEX 0 @4 1325; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] 1326; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1327; EG-NEXT: CF_END 1328; EG-NEXT: Fetch clause starting at 4: 1329; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1330; EG-NEXT: ALU clause starting at 6: 1331; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x, 1332; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1333; EG-NEXT: 1036831949(1.000000e-01), 2(2.802597e-45) 1334; 1335; CM-LABEL: s_div_arcp_k_x_pat_f32_daz: 1336; CM: ; %bb.0: 1337; CM-NEXT: TEX 0 @4 1338; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[] 1339; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 1340; CM-NEXT: CF_END 1341; CM-NEXT: Fetch clause starting at 4: 1342; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1343; CM-NEXT: ALU clause starting at 6: 1344; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x, 1345; CM-NEXT: 1036831949(1.000000e-01), 0(0.000000e+00) 1346; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1347; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1348 %x = load float, ptr addrspace(1) undef 1349 %rcp = fdiv arcp float %x, 10.0 1350 store float %rcp, ptr addrspace(1) %out, align 4 1351 ret void 1352} 1353 1354define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out) #0 { 1355; SI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: 1356; SI: ; %bb.0: 1357; SI-NEXT: s_load_dword s6, s[0:1], 0x0 1358; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1359; SI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd 1360; SI-NEXT: s_mov_b32 s3, 0xf000 1361; SI-NEXT: s_mov_b32 s2, -1 1362; SI-NEXT: s_waitcnt lgkmcnt(0) 1363; SI-NEXT: v_mul_f32_e32 v0, s6, v0 1364; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1365; SI-NEXT: s_endpgm 1366; 1367; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: 1368; VI: ; %bb.0: 1369; VI-NEXT: s_load_dword s2, s[0:1], 0x0 1370; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1371; VI-NEXT: 
v_mov_b32_e32 v0, 0xbdcccccd 1372; VI-NEXT: s_waitcnt lgkmcnt(0) 1373; VI-NEXT: v_mul_f32_e32 v2, s2, v0 1374; VI-NEXT: v_mov_b32_e32 v0, s0 1375; VI-NEXT: v_mov_b32_e32 v1, s1 1376; VI-NEXT: flat_store_dword v[0:1], v2 1377; VI-NEXT: s_endpgm 1378; 1379; EG-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: 1380; EG: ; %bb.0: 1381; EG-NEXT: TEX 0 @4 1382; EG-NEXT: ALU 2, @6, KC0[CB0:0-32], KC1[] 1383; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1384; EG-NEXT: CF_END 1385; EG-NEXT: Fetch clause starting at 4: 1386; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1387; EG-NEXT: ALU clause starting at 6: 1388; EG-NEXT: MUL_IEEE T0.X, T0.X, literal.x, 1389; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1390; EG-NEXT: -1110651699(-1.000000e-01), 2(2.802597e-45) 1391; 1392; CM-LABEL: s_div_arcp_neg_k_x_pat_f32_daz: 1393; CM: ; %bb.0: 1394; CM-NEXT: TEX 0 @4 1395; CM-NEXT: ALU 3, @6, KC0[CB0:0-32], KC1[] 1396; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X 1397; CM-NEXT: CF_END 1398; CM-NEXT: Fetch clause starting at 4: 1399; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1400; CM-NEXT: ALU clause starting at 6: 1401; CM-NEXT: MUL_IEEE * T0.X, T0.X, literal.x, 1402; CM-NEXT: -1110651699(-1.000000e-01), 0(0.000000e+00) 1403; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1404; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1405 %x = load float, ptr addrspace(1) undef 1406 %rcp = fdiv arcp float %x, -10.0 1407 store float %rcp, ptr addrspace(1) %out, align 4 1408 ret void 1409} 1410 1411declare float @llvm.fabs.f32(float) #1 1412declare float @llvm.sqrt.f32(float) #1 1413 1414attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1415attributes #1 = { nounwind readnone } 1416attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } 1417attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } 1418attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" } 1419 1420!0 = 
!{float 2.500000e+00} 1421
; NOTE(review): !0 (!{float 2.500000e+00}) is the !fpmath metadata attached to
; the fdiv instructions above, permitting up to 2.5 ulp of error -- this is what
; allows the backend to select the v_rcp_f32 / RECIP_IEEE approximation instead
; of the full div_scale/div_fmas/div_fixup expansion checked earlier in the file.