; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; Make sure fdiv is promoted to f32.

; Half division with no fast-math flags. The expected output below shows the
; tahiti run expanding the promoted f32 divide through v_div_scale_f32,
; v_div_fmas_f32 and v_div_fixup_f32, while the fiji and newer runs refine a
; v_rcp_f32 estimate and finish with v_div_fixup_f16.
define amdgpu_kernel void @v_fdiv_f16(
; SI-LABEL: v_fdiv_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, v2
; SI-NEXT:    v_rcp_f32_e32 v5, v4
; SI-NEXT:    v_div_scale_f32 v6, vcc, v2, v3, v2
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
; SI-NEXT:    v_mul_f32_e32 v7, v6, v5
; SI-NEXT:    v_fma_f32 v8, -v4, v7, v6
; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
; SI-NEXT:    v_fma_f32 v4, -v4, v7, v6
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
; SI-NEXT:    v_div_fixup_f32 v2, v4, v3, v2
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_fdiv_f16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v6, s1
; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v5
; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v2
; GFX8-NEXT:    v_rcp_f32_e32 v3, v0
; GFX8-NEXT:    v_mul_f32_e32 v7, v1, v3
; GFX8-NEXT:    v_mad_f32 v8, -v0, v7, v1
; GFX8-NEXT:    v_mac_f32_e32 v7, v8, v3
; GFX8-NEXT:    v_mad_f32 v0, -v0, v7, v1
; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v3
; GFX8-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
; GFX8-NEXT:    v_add_f32_e32 v0, v0, v7
; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
; GFX8-NEXT:    v_div_fixup_f16 v2, v3, v2, v5
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_fdiv_f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v1
; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v3
; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX9-NEXT:    v_mac_f32_e32 v4, v5, v3
; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_fdiv_f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_load_ushort v2, v0, s[6:7] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v2
; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
; GFX10-NEXT:    v_mul_f32_e32 v6, v5, v4
; GFX10-NEXT:    v_mad_f32 v7, -v3, v6, v5
; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v4
; GFX10-NEXT:    v_mad_f32 v3, -v3, v6, v5
; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_fdiv_f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_load_u16 v2, v0, s[4:5] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v1
; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v3
; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f32_e32 v3, v5, v3
; GFX11-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
    ptr addrspace(1) %r,
    ptr addrspace(1) %a,
    ptr addrspace(1) %b) #0 {
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %a.ptr = getelementptr inbounds half, ptr addrspace(1) %a, i64 %idx
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %num = load volatile half, ptr addrspace(1) %a.ptr
  %den = load volatile half, ptr addrspace(1) %b.ptr
  %quot = fdiv half %num, %den
  store half %quot, ptr addrspace(1) %r.ptr
  ret void
}

; 1.0 / x with no fast-math flags. The fiji and newer runs expect a single
; v_rcp_f16; the tahiti run still expects the full promoted f32 division
; sequence.
define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rcp_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rcp_f16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rcp_f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rcp_f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_rcp_f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %den = load volatile half, ptr addrspace(1) %b.ptr
  %recip = fdiv half 1.0, %den
  store half %recip, ptr addrspace(1) %r.ptr
  ret void
}

; 1.0 / fabs(x). The expected output folds the fabs into a |v| source
; modifier, on v_rcp_f16 for fiji and newer and on the f16-to-f32 conversion
; for tahiti.
define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rcp_f16_abs:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e64 v2, |v2|
; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rcp_f16_abs:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rcp_f16_e64 v3, |v0|
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rcp_f16_abs:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e64 v1, |v1|
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rcp_f16_abs:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e64 v1, |v1|
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_rcp_f16_abs:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e64 v1, |v1|
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %den = load volatile half, ptr addrspace(1) %b.ptr
  %den.abs = call half @llvm.fabs.f16(half %den)
  %recip = fdiv half 1.0, %den.abs
  store half %recip, ptr addrspace(1) %r.ptr
  ret void
}

; We could not do 1/b -> rcp_f32(b) under !fpmath < 1ulp.
; The IR in this kernel is the same plain 1.0 / x as in @v_rcp_f16 (no
; fast-math flags, no metadata), and the expected output is the same on every
; run line.
define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: reciprocal_f16_rounded:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: reciprocal_f16_rounded:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: reciprocal_f16_rounded:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: reciprocal_f16_rounded:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: reciprocal_f16_rounded:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %den = load volatile half, ptr addrspace(1) %b.ptr
  %recip = fdiv half 1.0, %den
  store half %recip, ptr addrspace(1) %r.ptr
  ret void
}

; 1.0 / x with the afn flag on the fdiv. The tahiti run drops the division
; expansion and expects a bare v_rcp_f32 between the f16/f32 conversions;
; fiji and newer still expect v_rcp_f16.
define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rcp_f16_afn:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_rcp_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rcp_f16_afn:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rcp_f16_e32 v3, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rcp_f16_afn:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rcp_f16_afn:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_rcp_f16_afn:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %den = load volatile half, ptr addrspace(1) %b.ptr
  %recip = fdiv afn half 1.0, %den
  store half %recip, ptr addrspace(1) %r.ptr
  ret void
}

; -1.0 / x. Fiji and newer expect the negation folded into the source
; modifier of v_rcp_f16; tahiti expects -1.0 as the numerator of the promoted
; f32 division.
define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rcp_f16_neg:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, -1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rcp_f16_neg:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rcp_f16_e64 v3, -v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rcp_f16_neg:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rcp_f16_neg:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_rcp_f16_neg:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %den = load volatile half, ptr addrspace(1) %b.ptr
  %recip = fdiv half -1.0, %den
  store half %recip, ptr addrspace(1) %r.ptr
  ret void
}

; 1.0 / sqrt(x) with contract on both the sqrt call and the fdiv. Fiji and
; newer expect a single v_rsq_f16; tahiti expects v_sqrt_f32 followed by the
; full promoted f32 division sequence.
define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rsq_f16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_sqrt_f32_e32 v2, v2
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rsq_f16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rsq_f16_e32 v3, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rsq_f16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rsq_f16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_rsq_f16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %arg = load volatile half, ptr addrspace(1) %b.ptr
  %arg.sqrt = call contract half @llvm.sqrt.f16(half %arg)
  %rsq = fdiv contract half 1.0, %arg.sqrt
  store half %rsq, ptr addrspace(1) %r.ptr
  ret void
}

; -1.0 / sqrt(x) with contract on both operations. Fiji and newer expect
; v_rsq_f16 followed by a v_xor_b32 with 0x8000 that flips the sign bit of the
; half result; tahiti expects sqrt plus a division with a -1.0 numerator.
define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rsq_f16_neg:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_sqrt_f32_e32 v2, v2
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, -1.0
; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rsq_f16_neg:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_rsq_f16_e32 v3, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_xor_b32_e32 v2, 0x8000, v3
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rsq_f16_neg:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
; GFX9-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rsq_f16_neg:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
; GFX10-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_rsq_f16_neg:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %idx = sext i32 %wi to i64
  %b.ptr = getelementptr inbounds half, ptr addrspace(1) %b, i64 %idx
  %r.ptr = getelementptr inbounds half, ptr addrspace(1) %r, i64 %idx
  %arg = load volatile half, ptr addrspace(1) %b.ptr
  %arg.sqrt = call contract half @llvm.sqrt.f16(half %arg)
  %rsq = fdiv contract half -1.0, %arg.sqrt
  store half %rsq, ptr addrspace(1) %r.ptr
  ret void
}

define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: v_rsq_f16_multi_use:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
; SI-NEXT:    v_sqrt_f32_e32 v3, v3
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
; SI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, 1.0
; SI-NEXT:    v_rcp_f32_e32 v5, v4
; SI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
; SI-NEXT:    v_mul_f32_e32 v7, v6, v5
; SI-NEXT:    v_fma_f32 v8, -v4, v7, v6
; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
; SI-NEXT:    v_fma_f32 v4, -v4, v7, v6
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
; SI-NEXT:    v_div_fixup_f32 v3, v4, v3, 1.0
; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: v_rsq_f16_multi_use:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s3
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_ushort v3, v[0:1] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_rsq_f16_e32 v4, v3
; GFX8-NEXT:    flat_store_short v[0:1], v3
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    flat_store_short v[0:1], v4
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: v_rsq_f16_multi_use:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_rsq_f16_e32 v2, v1
; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: v_rsq_f16_multi_use:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_rsq_f16_e32 v2, v1
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:
s_waitcnt_vscnt null, 0x0 921; GFX10-NEXT: global_store_short v0, v2, s[0:1] 922; GFX10-NEXT: s_endpgm 923; 924; GFX11-LABEL: v_rsq_f16_multi_use: 925; GFX11: ; %bb.0: ; %entry 926; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 927; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 928; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 929; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 930; GFX11-NEXT: s_waitcnt lgkmcnt(0) 931; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc 932; GFX11-NEXT: s_waitcnt vmcnt(0) 933; GFX11-NEXT: v_rsq_f16_e32 v2, v1 934; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc 935; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 936; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] 937; GFX11-NEXT: s_endpgm 938entry: 939 %tid = call i32 @llvm.amdgcn.workitem.id.x() 940 %tid.ext = sext i32 %tid to i64 941 %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext 942 %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext 943 %b.val = load volatile half, ptr addrspace(1) %gep.b 944 store volatile half %b.val, ptr addrspace(1) %gep.r 945 %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val) 946 %r.val = fdiv contract half 1.0, %b.sqrt 947 store half %r.val, ptr addrspace(1) %gep.r 948 ret void 949} 950 951define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { 952; SI-LABEL: v_rsq_f16_missing_contract0: 953; SI: ; %bb.0: ; %entry 954; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 955; SI-NEXT: s_mov_b32 s7, 0xf000 956; SI-NEXT: s_mov_b32 s6, 0 957; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 958; SI-NEXT: v_mov_b32_e32 v1, 0 959; SI-NEXT: s_waitcnt lgkmcnt(0) 960; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 961; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 962; SI-NEXT: s_waitcnt vmcnt(0) 963; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 964; SI-NEXT: v_sqrt_f32_e32 v2, v2 965; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 966; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 967; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 968; 
SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0 969; SI-NEXT: v_rcp_f32_e32 v4, v3 970; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 971; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 972; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0 973; SI-NEXT: v_fma_f32 v4, v6, v4, v4 974; SI-NEXT: v_mul_f32_e32 v6, v5, v4 975; SI-NEXT: v_fma_f32 v7, -v3, v6, v5 976; SI-NEXT: v_fma_f32 v6, v7, v4, v6 977; SI-NEXT: v_fma_f32 v3, -v3, v6, v5 978; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 979; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 980; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 981; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 982; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 983; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 984; SI-NEXT: s_endpgm 985; 986; GFX8-LABEL: v_rsq_f16_missing_contract0: 987; GFX8: ; %bb.0: ; %entry 988; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 989; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 990; GFX8-NEXT: s_waitcnt lgkmcnt(0) 991; GFX8-NEXT: v_mov_b32_e32 v1, s3 992; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 993; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 994; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc 995; GFX8-NEXT: s_waitcnt vmcnt(0) 996; GFX8-NEXT: v_mov_b32_e32 v1, s1 997; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 998; GFX8-NEXT: v_rcp_f16_e32 v3, v0 999; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1000; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1001; GFX8-NEXT: flat_store_short v[0:1], v3 1002; GFX8-NEXT: s_endpgm 1003; 1004; GFX9-LABEL: v_rsq_f16_missing_contract0: 1005; GFX9: ; %bb.0: ; %entry 1006; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1007; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1008; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1009; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc 1010; GFX9-NEXT: s_waitcnt vmcnt(0) 1011; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 1012; GFX9-NEXT: v_rcp_f16_e32 v1, v1 1013; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1014; GFX9-NEXT: s_endpgm 1015; 1016; GFX10-LABEL: v_rsq_f16_missing_contract0: 1017; GFX10: ; %bb.0: 
; %entry 1018; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1019; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1020; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1021; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 1022; GFX10-NEXT: s_waitcnt vmcnt(0) 1023; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 1024; GFX10-NEXT: v_rcp_f16_e32 v1, v1 1025; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1026; GFX10-NEXT: s_endpgm 1027; 1028; GFX11-LABEL: v_rsq_f16_missing_contract0: 1029; GFX11: ; %bb.0: ; %entry 1030; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1031; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1032; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1033; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1034; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1035; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc 1036; GFX11-NEXT: s_waitcnt vmcnt(0) 1037; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 1038; GFX11-NEXT: s_waitcnt_depctr 0xfff 1039; GFX11-NEXT: v_rcp_f16_e32 v1, v1 1040; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1041; GFX11-NEXT: s_endpgm 1042entry: 1043 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1044 %tid.ext = sext i32 %tid to i64 1045 %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext 1046 %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext 1047 %b.val = load volatile half, ptr addrspace(1) %gep.b 1048 %b.sqrt = call half @llvm.sqrt.f16(half %b.val) 1049 %r.val = fdiv contract half 1.0, %b.sqrt 1050 store half %r.val, ptr addrspace(1) %gep.r 1051 ret void 1052} 1053 1054define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { 1055; SI-LABEL: v_rsq_f16_missing_contract1: 1056; SI: ; %bb.0: ; %entry 1057; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1058; SI-NEXT: s_mov_b32 s7, 0xf000 1059; SI-NEXT: s_mov_b32 s6, 0 1060; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1061; SI-NEXT: v_mov_b32_e32 v1, 0 1062; SI-NEXT: s_waitcnt lgkmcnt(0) 1063; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1064; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 
addr64 glc 1065; SI-NEXT: s_waitcnt vmcnt(0) 1066; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1067; SI-NEXT: v_sqrt_f32_e32 v2, v2 1068; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1069; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1070; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1071; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0 1072; SI-NEXT: v_rcp_f32_e32 v4, v3 1073; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0 1074; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1075; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0 1076; SI-NEXT: v_fma_f32 v4, v6, v4, v4 1077; SI-NEXT: v_mul_f32_e32 v6, v5, v4 1078; SI-NEXT: v_fma_f32 v7, -v3, v6, v5 1079; SI-NEXT: v_fma_f32 v6, v7, v4, v6 1080; SI-NEXT: v_fma_f32 v3, -v3, v6, v5 1081; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1082; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 1083; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0 1084; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1085; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1086; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 1087; SI-NEXT: s_endpgm 1088; 1089; GFX8-LABEL: v_rsq_f16_missing_contract1: 1090; GFX8: ; %bb.0: ; %entry 1091; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1092; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1093; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1094; GFX8-NEXT: v_mov_b32_e32 v1, s3 1095; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1096; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1097; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc 1098; GFX8-NEXT: s_waitcnt vmcnt(0) 1099; GFX8-NEXT: v_mov_b32_e32 v1, s1 1100; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 1101; GFX8-NEXT: v_rcp_f16_e32 v3, v0 1102; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1103; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1104; GFX8-NEXT: flat_store_short v[0:1], v3 1105; GFX8-NEXT: s_endpgm 1106; 1107; GFX9-LABEL: v_rsq_f16_missing_contract1: 1108; GFX9: ; %bb.0: ; %entry 1109; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1110; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1111; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1112; GFX9-NEXT: 
global_load_ushort v1, v0, s[2:3] glc 1113; GFX9-NEXT: s_waitcnt vmcnt(0) 1114; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 1115; GFX9-NEXT: v_rcp_f16_e32 v1, v1 1116; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1117; GFX9-NEXT: s_endpgm 1118; 1119; GFX10-LABEL: v_rsq_f16_missing_contract1: 1120; GFX10: ; %bb.0: ; %entry 1121; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1122; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1123; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1124; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 1125; GFX10-NEXT: s_waitcnt vmcnt(0) 1126; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 1127; GFX10-NEXT: v_rcp_f16_e32 v1, v1 1128; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1129; GFX10-NEXT: s_endpgm 1130; 1131; GFX11-LABEL: v_rsq_f16_missing_contract1: 1132; GFX11: ; %bb.0: ; %entry 1133; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1134; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1135; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1136; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1137; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc 1139; GFX11-NEXT: s_waitcnt vmcnt(0) 1140; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 1141; GFX11-NEXT: s_waitcnt_depctr 0xfff 1142; GFX11-NEXT: v_rcp_f16_e32 v1, v1 1143; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1144; GFX11-NEXT: s_endpgm 1145entry: 1146 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1147 %tid.ext = sext i32 %tid to i64 1148 %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext 1149 %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext 1150 %b.val = load volatile half, ptr addrspace(1) %gep.b 1151 %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val) 1152 %r.val = fdiv half 1.0, %b.sqrt 1153 store half %r.val, ptr addrspace(1) %gep.r 1154 ret void 1155} 1156 1157define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { 1158; SI-LABEL: v_neg_rsq_f16_missing_contract1: 1159; SI: ; %bb.0: ; %entry 1160; SI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 1161; SI-NEXT: s_mov_b32 s7, 0xf000 1162; SI-NEXT: s_mov_b32 s6, 0 1163; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1164; SI-NEXT: v_mov_b32_e32 v1, 0 1165; SI-NEXT: s_waitcnt lgkmcnt(0) 1166; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1167; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 1168; SI-NEXT: s_waitcnt vmcnt(0) 1169; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1170; SI-NEXT: v_sqrt_f32_e32 v2, v2 1171; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 1172; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1173; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1174; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, -1.0 1175; SI-NEXT: v_rcp_f32_e32 v4, v3 1176; SI-NEXT: v_div_scale_f32 v5, vcc, -1.0, v2, -1.0 1177; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 1178; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0 1179; SI-NEXT: v_fma_f32 v4, v6, v4, v4 1180; SI-NEXT: v_mul_f32_e32 v6, v5, v4 1181; SI-NEXT: v_fma_f32 v7, -v3, v6, v5 1182; SI-NEXT: v_fma_f32 v6, v7, v4, v6 1183; SI-NEXT: v_fma_f32 v3, -v3, v6, v5 1184; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 1185; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6 1186; SI-NEXT: v_div_fixup_f32 v2, v3, v2, -1.0 1187; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1188; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1189; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 1190; SI-NEXT: s_endpgm 1191; 1192; GFX8-LABEL: v_neg_rsq_f16_missing_contract1: 1193; GFX8: ; %bb.0: ; %entry 1194; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1195; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1196; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1197; GFX8-NEXT: v_mov_b32_e32 v1, s3 1198; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1199; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1200; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc 1201; GFX8-NEXT: s_waitcnt vmcnt(0) 1202; GFX8-NEXT: v_mov_b32_e32 v1, s1 1203; GFX8-NEXT: v_sqrt_f16_e32 v0, v0 1204; GFX8-NEXT: v_rcp_f16_e64 v3, -v0 1205; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1206; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 
0, v1, vcc 1207; GFX8-NEXT: flat_store_short v[0:1], v3 1208; GFX8-NEXT: s_endpgm 1209; 1210; GFX9-LABEL: v_neg_rsq_f16_missing_contract1: 1211; GFX9: ; %bb.0: ; %entry 1212; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1213; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1215; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc 1216; GFX9-NEXT: s_waitcnt vmcnt(0) 1217; GFX9-NEXT: v_sqrt_f16_e32 v1, v1 1218; GFX9-NEXT: v_rcp_f16_e64 v1, -v1 1219; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1220; GFX9-NEXT: s_endpgm 1221; 1222; GFX10-LABEL: v_neg_rsq_f16_missing_contract1: 1223; GFX10: ; %bb.0: ; %entry 1224; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1225; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1226; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1227; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 1228; GFX10-NEXT: s_waitcnt vmcnt(0) 1229; GFX10-NEXT: v_sqrt_f16_e32 v1, v1 1230; GFX10-NEXT: v_rcp_f16_e64 v1, -v1 1231; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1232; GFX10-NEXT: s_endpgm 1233; 1234; GFX11-LABEL: v_neg_rsq_f16_missing_contract1: 1235; GFX11: ; %bb.0: ; %entry 1236; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1237; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1238; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1239; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1240; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1241; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc 1242; GFX11-NEXT: s_waitcnt vmcnt(0) 1243; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 1244; GFX11-NEXT: s_waitcnt_depctr 0xfff 1245; GFX11-NEXT: v_rcp_f16_e64 v1, -v1 1246; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1247; GFX11-NEXT: s_endpgm 1248entry: 1249 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1250 %tid.ext = sext i32 %tid to i64 1251 %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext 1252 %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext 1253 %b.val = load volatile half, ptr addrspace(1) %gep.b 1254 %b.sqrt = call contract half 
@llvm.sqrt.f16(half %b.val) 1255 %r.val = fdiv half -1.0, %b.sqrt 1256 store half %r.val, ptr addrspace(1) %gep.r 1257 ret void 1258} 1259 1260define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { 1261; SI-LABEL: v_fdiv_f16_afn: 1262; SI: ; %bb.0: ; %entry 1263; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1264; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1265; SI-NEXT: s_mov_b32 s7, 0xf000 1266; SI-NEXT: s_mov_b32 s6, 0 1267; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1268; SI-NEXT: s_waitcnt lgkmcnt(0) 1269; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1270; SI-NEXT: v_mov_b32_e32 v1, 0 1271; SI-NEXT: s_mov_b64 s[10:11], s[6:7] 1272; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 1273; SI-NEXT: s_waitcnt vmcnt(0) 1274; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc 1275; SI-NEXT: s_waitcnt vmcnt(0) 1276; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1277; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1278; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1279; SI-NEXT: v_rcp_f32_e32 v3, v3 1280; SI-NEXT: v_mul_f32_e32 v2, v2, v3 1281; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1282; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 1283; SI-NEXT: s_endpgm 1284; 1285; GFX8-LABEL: v_fdiv_f16_afn: 1286; GFX8: ; %bb.0: ; %entry 1287; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1288; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1289; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 1290; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1291; GFX8-NEXT: v_mov_b32_e32 v1, s3 1292; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1293; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1294; GFX8-NEXT: v_mov_b32_e32 v3, s5 1295; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1296; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1297; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc 1298; GFX8-NEXT: s_waitcnt vmcnt(0) 1299; GFX8-NEXT: flat_load_ushort v0, v[2:3] glc 1300; GFX8-NEXT: s_waitcnt vmcnt(0) 1301; GFX8-NEXT: v_mov_b32_e32 v1, s1 1302; GFX8-NEXT: v_rcp_f16_e32 v2, v0 1303; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, s0, v4 1304; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1305; GFX8-NEXT: v_mul_f16_e32 v2, v5, v2 1306; GFX8-NEXT: flat_store_short v[0:1], v2 1307; GFX8-NEXT: s_endpgm 1308; 1309; GFX9-LABEL: v_fdiv_f16_afn: 1310; GFX9: ; %bb.0: ; %entry 1311; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1312; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1313; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1314; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1315; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc 1316; GFX9-NEXT: s_waitcnt vmcnt(0) 1317; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc 1318; GFX9-NEXT: s_waitcnt vmcnt(0) 1319; GFX9-NEXT: v_rcp_f16_e32 v2, v2 1320; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 1321; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1322; GFX9-NEXT: s_endpgm 1323; 1324; GFX10-LABEL: v_fdiv_f16_afn: 1325; GFX10: ; %bb.0: ; %entry 1326; GFX10-NEXT: s_clause 0x1 1327; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1328; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1329; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1330; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 1332; GFX10-NEXT: s_waitcnt vmcnt(0) 1333; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc 1334; GFX10-NEXT: s_waitcnt vmcnt(0) 1335; GFX10-NEXT: v_rcp_f16_e32 v2, v2 1336; GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 1337; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1338; GFX10-NEXT: s_endpgm 1339; 1340; GFX11-LABEL: v_fdiv_f16_afn: 1341; GFX11: ; %bb.0: ; %entry 1342; GFX11-NEXT: s_clause 0x1 1343; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1344; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1345; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1346; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1347; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1348; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1349; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc 1350; GFX11-NEXT: s_waitcnt vmcnt(0) 1351; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc 1352; GFX11-NEXT: 
s_waitcnt vmcnt(0) 1353; GFX11-NEXT: v_rcp_f16_e32 v2, v2 1354; GFX11-NEXT: s_waitcnt_depctr 0xfff 1355; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 1356; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1357; GFX11-NEXT: s_endpgm 1358entry: 1359 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1360 %tid.ext = sext i32 %tid to i64 1361 %gep.a = getelementptr inbounds half, ptr addrspace(1) %a, i64 %tid.ext 1362 %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext 1363 %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext 1364 %a.val = load volatile half, ptr addrspace(1) %gep.a 1365 %b.val = load volatile half, ptr addrspace(1) %gep.b 1366 %r.val = fdiv afn half %a.val, %b.val 1367 store half %r.val, ptr addrspace(1) %gep.r 1368 ret void 1369} 1370 1371define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #2 { 1372; SI-LABEL: v_fdiv_f16_unsafe: 1373; SI: ; %bb.0: ; %entry 1374; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1375; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd 1376; SI-NEXT: s_mov_b32 s7, 0xf000 1377; SI-NEXT: s_mov_b32 s6, 0 1378; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1379; SI-NEXT: s_waitcnt lgkmcnt(0) 1380; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1381; SI-NEXT: v_mov_b32_e32 v1, 0 1382; SI-NEXT: s_mov_b64 s[10:11], s[6:7] 1383; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc 1384; SI-NEXT: s_waitcnt vmcnt(0) 1385; SI-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc 1386; SI-NEXT: s_waitcnt vmcnt(0) 1387; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1388; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1389; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1390; SI-NEXT: v_rcp_f32_e32 v3, v3 1391; SI-NEXT: v_mul_f32_e32 v2, v2, v3 1392; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 1393; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 1394; SI-NEXT: s_endpgm 1395; 1396; GFX8-LABEL: v_fdiv_f16_unsafe: 1397; GFX8: ; %bb.0: ; %entry 1398; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1399; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x34 1400; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0 1401; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1402; GFX8-NEXT: v_mov_b32_e32 v1, s3 1403; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1404; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1405; GFX8-NEXT: v_mov_b32_e32 v3, s5 1406; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1407; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1408; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc 1409; GFX8-NEXT: s_waitcnt vmcnt(0) 1410; GFX8-NEXT: flat_load_ushort v0, v[2:3] glc 1411; GFX8-NEXT: s_waitcnt vmcnt(0) 1412; GFX8-NEXT: v_mov_b32_e32 v1, s1 1413; GFX8-NEXT: v_rcp_f16_e32 v2, v0 1414; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1415; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1416; GFX8-NEXT: v_mul_f16_e32 v2, v5, v2 1417; GFX8-NEXT: flat_store_short v[0:1], v2 1418; GFX8-NEXT: s_endpgm 1419; 1420; GFX9-LABEL: v_fdiv_f16_unsafe: 1421; GFX9: ; %bb.0: ; %entry 1422; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1423; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1424; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1425; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1426; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc 1427; GFX9-NEXT: s_waitcnt vmcnt(0) 1428; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc 1429; GFX9-NEXT: s_waitcnt vmcnt(0) 1430; GFX9-NEXT: v_rcp_f16_e32 v2, v2 1431; GFX9-NEXT: v_mul_f16_e32 v1, v1, v2 1432; GFX9-NEXT: global_store_short v0, v1, s[0:1] 1433; GFX9-NEXT: s_endpgm 1434; 1435; GFX10-LABEL: v_fdiv_f16_unsafe: 1436; GFX10: ; %bb.0: ; %entry 1437; GFX10-NEXT: s_clause 0x1 1438; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1439; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1440; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1441; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1442; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc 1443; GFX10-NEXT: s_waitcnt vmcnt(0) 1444; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc 1445; GFX10-NEXT: s_waitcnt vmcnt(0) 1446; GFX10-NEXT: v_rcp_f16_e32 v2, v2 1447; 
GFX10-NEXT: v_mul_f16_e32 v1, v1, v2 1448; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1449; GFX10-NEXT: s_endpgm 1450; 1451; GFX11-LABEL: v_fdiv_f16_unsafe: 1452; GFX11: ; %bb.0: ; %entry 1453; GFX11-NEXT: s_clause 0x1 1454; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1455; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1456; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1457; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1458; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1459; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1460; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc 1461; GFX11-NEXT: s_waitcnt vmcnt(0) 1462; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc 1463; GFX11-NEXT: s_waitcnt vmcnt(0) 1464; GFX11-NEXT: v_rcp_f16_e32 v2, v2 1465; GFX11-NEXT: s_waitcnt_depctr 0xfff 1466; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2 1467; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1468; GFX11-NEXT: s_endpgm 1469entry: 1470 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1471 %tid.ext = sext i32 %tid to i64 1472 %gep.a = getelementptr inbounds half, ptr addrspace(1) %a, i64 %tid.ext 1473 %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext 1474 %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext 1475 %a.val = load volatile half, ptr addrspace(1) %gep.a 1476 %b.val = load volatile half, ptr addrspace(1) %gep.b 1477 %r.val = fdiv half %a.val, %b.val 1478 store half %r.val, ptr addrspace(1) %gep.r 1479 ret void 1480} 1481 1482define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 { 1483; SI-LABEL: div_afn_2_x_pat_f16: 1484; SI: ; %bb.0: 1485; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1486; SI-NEXT: s_mov_b32 s3, 0xf000 1487; SI-NEXT: s_mov_b32 s2, -1 1488; SI-NEXT: s_waitcnt lgkmcnt(0) 1489; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 1490; SI-NEXT: s_waitcnt vmcnt(0) 1491; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1492; SI-NEXT: v_mul_f32_e32 v0, 0.5, v0 1493; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1494; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 
1495; SI-NEXT: s_endpgm 1496; 1497; GFX8-LABEL: div_afn_2_x_pat_f16: 1498; GFX8: ; %bb.0: 1499; GFX8-NEXT: flat_load_ushort v0, v[0:1] 1500; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1501; GFX8-NEXT: s_waitcnt vmcnt(0) 1502; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0 1503; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1504; GFX8-NEXT: v_mov_b32_e32 v0, s0 1505; GFX8-NEXT: v_mov_b32_e32 v1, s1 1506; GFX8-NEXT: flat_store_short v[0:1], v2 1507; GFX8-NEXT: s_endpgm 1508; 1509; GFX9-LABEL: div_afn_2_x_pat_f16: 1510; GFX9: ; %bb.0: 1511; GFX9-NEXT: global_load_ushort v0, v[0:1], off 1512; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1513; GFX9-NEXT: v_mov_b32_e32 v1, 0 1514; GFX9-NEXT: s_waitcnt vmcnt(0) 1515; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0 1516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1517; GFX9-NEXT: global_store_short v1, v0, s[0:1] 1518; GFX9-NEXT: s_endpgm 1519; 1520; GFX10-LABEL: div_afn_2_x_pat_f16: 1521; GFX10: ; %bb.0: 1522; GFX10-NEXT: global_load_ushort v0, v[0:1], off 1523; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1524; GFX10-NEXT: v_mov_b32_e32 v1, 0 1525; GFX10-NEXT: s_waitcnt vmcnt(0) 1526; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0 1527; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1528; GFX10-NEXT: global_store_short v1, v0, s[0:1] 1529; GFX10-NEXT: s_endpgm 1530; 1531; GFX11-LABEL: div_afn_2_x_pat_f16: 1532; GFX11: ; %bb.0: 1533; GFX11-NEXT: global_load_u16 v0, v[0:1], off 1534; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1535; GFX11-NEXT: v_mov_b32_e32 v1, 0 1536; GFX11-NEXT: s_waitcnt vmcnt(0) 1537; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0 1538; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1539; GFX11-NEXT: global_store_b16 v1, v0, s[0:1] 1540; GFX11-NEXT: s_endpgm 1541 %x = load half, ptr addrspace(1) undef 1542 %rcp = fdiv afn half %x, 2.0 1543 store half %rcp, ptr addrspace(1) %out, align 4 1544 ret void 1545} 1546 1547define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 { 1548; SI-LABEL: div_afn_k_x_pat_f16: 1549; SI: ; %bb.0: 1550; SI-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x9 1551; SI-NEXT: s_mov_b32 s3, 0xf000 1552; SI-NEXT: s_mov_b32 s2, -1 1553; SI-NEXT: s_waitcnt lgkmcnt(0) 1554; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 1555; SI-NEXT: s_waitcnt vmcnt(0) 1556; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1557; SI-NEXT: v_mul_f32_e32 v0, 0x3dcccccd, v0 1558; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 1559; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1560; SI-NEXT: s_endpgm 1561; 1562; GFX8-LABEL: div_afn_k_x_pat_f16: 1563; GFX8: ; %bb.0: 1564; GFX8-NEXT: flat_load_ushort v0, v[0:1] 1565; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1566; GFX8-NEXT: s_waitcnt vmcnt(0) 1567; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0 1568; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1569; GFX8-NEXT: v_mov_b32_e32 v0, s0 1570; GFX8-NEXT: v_mov_b32_e32 v1, s1 1571; GFX8-NEXT: flat_store_short v[0:1], v2 1572; GFX8-NEXT: s_endpgm 1573; 1574; GFX9-LABEL: div_afn_k_x_pat_f16: 1575; GFX9: ; %bb.0: 1576; GFX9-NEXT: global_load_ushort v0, v[0:1], off 1577; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1578; GFX9-NEXT: v_mov_b32_e32 v1, 0 1579; GFX9-NEXT: s_waitcnt vmcnt(0) 1580; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 1581; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1582; GFX9-NEXT: global_store_short v1, v0, s[0:1] 1583; GFX9-NEXT: s_endpgm 1584; 1585; GFX10-LABEL: div_afn_k_x_pat_f16: 1586; GFX10: ; %bb.0: 1587; GFX10-NEXT: global_load_ushort v0, v[0:1], off 1588; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1589; GFX10-NEXT: v_mov_b32_e32 v1, 0 1590; GFX10-NEXT: s_waitcnt vmcnt(0) 1591; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 1592; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1593; GFX10-NEXT: global_store_short v1, v0, s[0:1] 1594; GFX10-NEXT: s_endpgm 1595; 1596; GFX11-LABEL: div_afn_k_x_pat_f16: 1597; GFX11: ; %bb.0: 1598; GFX11-NEXT: global_load_u16 v0, v[0:1], off 1599; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 1600; GFX11-NEXT: v_mov_b32_e32 v1, 0 1601; GFX11-NEXT: s_waitcnt vmcnt(0) 1602; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0 1603; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1604; 
GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %x = load half, ptr addrspace(1) undef
  %rcp = fdiv afn half %x, 10.0
  store half %rcp, ptr addrspace(1) %out, align 4
  ret void
}

; fdiv afn of a loaded half by the constant -10.0.
; Every check block expects a single multiply by a literal: 0xbdcccccd as an
; f32 multiply in the SI checks (between f16<->f32 conversions), and 0xae66 as
; an f16 multiply in the GFX8 and newer checks.
define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; SI-LABEL: div_afn_neg_k_x_pat_f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_mul_f32_e32 v0, 0xbdcccccd, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: div_afn_neg_k_x_pat_f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_mul_f16_e32 v2, 0xae66, v0
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: div_afn_neg_k_x_pat_f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: div_afn_neg_k_x_pat_f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_store_short v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mul_f16_e32 v0, 0xae66, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %val = load half, ptr addrspace(1) undef
  %quot = fdiv afn half %val, -10.0
  store half %quot, ptr addrspace(1) %out, align 4
  ret void
}

; fdiv carrying only the arcp flag.
; The GFX8 and newer checks expect v_rcp_f16 followed by v_mul_f16, while the
; SI checks still expect the full f32 expansion (div_scale, fma refinement,
; div_fmas, div_fixup).
define half @v_fdiv_f16_arcp(half %x, half %y) {
; SI-LABEL: v_fdiv_f16_arcp:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
; SI-NEXT:    v_rcp_f32_e32 v3, v2
; SI-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
; SI-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_f16_arcp:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_rcp_f16_e32 v1, v1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_f16_arcp:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_f16_arcp:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_f16_arcp:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %quot = fdiv arcp half %x, %y
  ret half %quot
}

; fdiv carrying the afn and nsz flags.
; Every check block expects a reciprocal followed by a multiply: in f32 for the
; SI checks, in f16 for the GFX8 and newer checks.
define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
; SI-LABEL: v_fdiv_f16_afn_nsz:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_rcp_f32_e32 v1, v1
; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_f16_afn_nsz:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_rcp_f16_e32 v1, v1
; GFX8-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_f16_afn_nsz:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_rcp_f16_e32 v1, v1
; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_f16_afn_nsz:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_rcp_f16_e32 v1, v1
; GFX10-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_f16_afn_nsz:
; GFX11:       ; %bb.0:
; GFX11-NEXT:
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %fdiv = fdiv afn nsz half %x, %y
  ret half %fdiv
}

; 1.0 / sqrt(a) on <2 x half>, with the contract flag on both the sqrt call and
; the fdiv.
; The GFX8 and newer checks expect one v_rsq_f16 per lane and a repack of the
; two halves. The SI checks expect an f32 sqrt per lane, a round trip through
; f16, and the full f32 division expansion against the constant 1.0.
; Only the prefixes enabled by the RUN lines (SI, GFX8, GFX9, GFX10, GFX11)
; carry assertions here.
define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; SI-LABEL: v_rsq_v2f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sqrt_f32_e32 v0, v0
; SI-NEXT:    v_sqrt_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
; SI-NEXT:    v_rcp_f32_e32 v3, v2
; SI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
; SI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v1, 1.0
; SI-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v2, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v2, v2, v4, v4
; SI-NEXT:    v_mul_f32_e32 v4, v5, v2
; SI-NEXT:    v_fma_f32 v6, -v3, v4, v5
; SI-NEXT:    v_fma_f32 v4, v6, v2, v4
; SI-NEXT:    v_fma_f32 v3, -v3, v4, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v4
; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rsq_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT:    v_rsq_f16_e32 v0, v0
; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_rsq_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rsq_v2f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rsq_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
  %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
  ret <2 x half> %fdiv
}

; -1.0 / sqrt(a) on <2 x half>, with the contract flag on both the sqrt call
; and the fdiv.
; The GFX8 checks expect v_rsq_f16 per lane followed by an xor with 0x8000 on
; each half. The GFX9 and newer checks expect the negation as source modifiers
; on v_pack_b32_f16. The SI checks expect the full f32 division expansion
; against the constant -1.0.
; Only the prefixes enabled by the RUN lines (SI, GFX8, GFX9, GFX10, GFX11)
; carry assertions here.
define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; SI-LABEL: v_neg_rsq_v2f16:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_sqrt_f32_e32 v0, v0
; SI-NEXT:    v_sqrt_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
; SI-NEXT:    v_rcp_f32_e32 v3, v2
; SI-NEXT:    v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
; SI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, -1.0
; SI-NEXT:    v_rcp_f32_e32 v4, v3
; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v1, -1.0
; SI-NEXT:    v_div_fixup_f32 v0, v2, v0, -1.0
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT:    v_fma_f32 v2, -v3, v4, 1.0
; SI-NEXT:    v_fma_f32 v2, v2, v4, v4
; SI-NEXT:    v_mul_f32_e32 v4, v5, v2
; SI-NEXT:    v_fma_f32 v6, -v3, v4, v5
; SI-NEXT:    v_fma_f32 v4, v6, v2, v4
; SI-NEXT:    v_fma_f32 v3, -v3, v4, v5
; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v4
; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, -1.0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_neg_rsq_v2f16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_rsq_f16_e32 v1, v0
; GFX8-NEXT:    v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT:    v_mov_b32_e32 v2, 0x8000
; GFX8-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
; GFX8-NEXT:    v_xor_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_neg_rsq_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
; GFX9-NEXT:    v_pack_b32_f16 v0, -v0, -v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rsq_v2f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
; GFX10-NEXT:    v_pack_b32_f16 v0, -v0, -v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_neg_rsq_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
; GFX11-NEXT:    s_waitcnt_depctr 0xfff
; GFX11-NEXT:    v_pack_b32_f16 v0, -v0, -v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
  %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
  ret <2 x half> %fdiv
}

; Intrinsic declarations used by the tests in this file.
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare half @llvm.sqrt.f16(half) #2
declare half @llvm.fabs.f16(half) #2
declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) #2

; Attribute groups referenced by the definitions and declarations above.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }