; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; Every function below multiplies or divides a floating-point constant by an
; integer of the form (C << cnt) converted with uitofp, or calls llvm.ldexp
; directly. The expected assembly is listed per target (fiji, gfx1010, gfx1100).
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

declare i16 @llvm.umax.i16(i16, i16)
declare i64 @llvm.umin.i64(i64, i64)

declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)

; 9.0 * uitofp(1 << %i) on <4 x i32>. The expected code keeps the per-element
; shift, the u32 -> f32 conversion and the multiply by 0x41100000 (9.0f).
define <4 x float> @fmul_pow2_4xfloat(<4 x i32> %i) {
; VI-LABEL: fmul_pow2_4xfloat:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
; VI-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v1, v1
; VI-NEXT:    v_cvt_f32_u32_e32 v2, v2
; VI-NEXT:    v_cvt_f32_u32_e32 v3, v3
; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
; VI-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
; VI-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_4xfloat:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41100000, v1
; GFX10-NEXT:    v_mul_f32_e32 v2, 0x41100000, v2
; GFX10-NEXT:    v_mul_f32_e32 v3, 0x41100000, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_4xfloat:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
; GFX11-NEXT:    v_lshlrev_b32_e64 v1, v1, 1
; GFX11-NEXT:    v_lshlrev_b32_e64 v2, v2, 1
; GFX11-NEXT:    v_lshlrev_b32_e64 v3, v3, 1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT:    v_cvt_f32_u32_e32 v3, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41100000, v0 :: v_dual_mul_f32 v1, 0x41100000, v1
; GFX11-NEXT:    v_dual_mul_f32 v2, 0x41100000, v2 :: v_dual_mul_f32 v3, 0x41100000, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
  %p2_f = uitofp <4 x i32> %p2 to <4 x float>
  %r = fmul <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
  ret <4 x float> %r
}

; Direct llvm.ldexp call on <4 x float> with a splat of 9.0; the expected code
; is one v_ldexp_f32 per element.
define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; VI-LABEL: fmul_pow2_ldexp_4xfloat:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, 0x41100000
; VI-NEXT:    v_ldexp_f32 v0, s4, v0
; VI-NEXT:    v_ldexp_f32 v1, s4, v1
; VI-NEXT:    v_ldexp_f32 v2, s4, v2
; VI-NEXT:    v_ldexp_f32 v3, s4, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_ldexp_4xfloat:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
; GFX10-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
; GFX10-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
; GFX10-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_ldexp_4xfloat:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_ldexp_f32 v0, 0x41100000, v0
; GFX11-NEXT:    v_ldexp_f32 v1, 0x41100000, v1
; GFX11-NEXT:    v_ldexp_f32 v2, 0x41100000, v2
; GFX11-NEXT:    v_ldexp_f32 v3, 0x41100000, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
  ret <4 x float> %r
}

; 9.0 / uitofp(1 << %i) on <4 x i32>. The expected code contains no float
; instruction: %i is shifted left by 23 (the position of the f32 exponent
; field) and subtracted from 0x41100000, the bit pattern of 9.0f.
define <4 x float> @fdiv_pow2_4xfloat(<4 x i32> %i) {
; VI-LABEL: fdiv_pow2_4xfloat:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
; VI-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; VI-NEXT:    v_lshlrev_b32_e32 v3, 23, v3
; VI-NEXT:    v_sub_u32_e32 v0, vcc, 0x41100000, v0
; VI-NEXT:    v_sub_u32_e32 v1, vcc, 0x41100000, v1
; VI-NEXT:    v_sub_u32_e32 v2, vcc, 0x41100000, v2
; VI-NEXT:    v_sub_u32_e32 v3, vcc, 0x41100000, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow2_4xfloat:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 23, v3
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x41100000, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x41100000, v1
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0x41100000, v2
; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 0x41100000, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow2_4xfloat:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 23, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 23, v2
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 23, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x41100000, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x41100000, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x41100000, v2
; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 0x41100000, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %p2 = shl <4 x i32> <i32 1, i32 1, i32 1, i32 1>, %i
  %p2_f = uitofp <4 x i32> %p2 to <4 x float>
  %r = fdiv <4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, %p2_f
  ret <4 x float> %r
}

declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)

; half 0xH7000 (8192.0) * uitofp(1 << %i) on <8 x i16>. The expected code keeps
; the shift, the u16 -> f16 conversion and the multiply by 0x7000.
define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
; VI-LABEL: fmul_pow2_8xhalf:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, 1
; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, 1
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e64 v6, v2, 1
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e64 v7, v1, 1
; VI-NEXT:    v_lshlrev_b16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b16_e64 v8, v0, 1
; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
; VI-NEXT:    v_cvt_f16_u16_e32 v5, v8
; VI-NEXT:    v_cvt_f16_u16_e32 v1, v1
; VI-NEXT:    v_cvt_f16_u16_e32 v7, v7
; VI-NEXT:    v_cvt_f16_u16_e32 v2, v2
; VI-NEXT:    v_cvt_f16_u16_e32 v6, v6
; VI-NEXT:    v_cvt_f16_u16_e32 v3, v3
; VI-NEXT:    v_cvt_f16_u16_e32 v4, v4
; VI-NEXT:    v_mov_b32_e32 v8, 0x7000
; VI-NEXT:    v_mul_f16_e32 v4, 0x7000, v4
; VI-NEXT:    v_mul_f16_sdwa v3, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v6, 0x7000, v6
; VI-NEXT:    v_mul_f16_sdwa v2, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v7, 0x7000, v7
; VI-NEXT:    v_mul_f16_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_mul_f16_e32 v5, 0x7000, v5
; VI-NEXT:    v_mul_f16_sdwa v0, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v0, v5, v0
; VI-NEXT:    v_or_b32_e32 v1, v7, v1
; VI-NEXT:    v_or_b32_e32 v2, v6, v2
; VI-NEXT:    v_or_b32_e32 v3, v4, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_8xhalf:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
; GFX10-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
; GFX10-NEXT:    v_cvt_f16_u16_e32 v4, v3
; GFX10-NEXT:    v_cvt_f16_u16_e32 v5, v2
; GFX10-NEXT:    v_cvt_f16_u16_e32 v6, v1
; GFX10-NEXT:    v_cvt_f16_u16_e32 v7, v0
; GFX10-NEXT:    v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT:    v_cvt_f16_u16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT:    v_cvt_f16_u16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT:    v_cvt_f16_u16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT:    v_pack_b32_f16 v0, v7, v0
; GFX10-NEXT:    v_pack_b32_f16 v1, v6, v1
; GFX10-NEXT:    v_pack_b32_f16 v2, v5, v2
; GFX10-NEXT:    v_pack_b32_f16 v3, v4, v3
; GFX10-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_8xhalf:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v3, v3, 1 op_sel_hi:[1,0]
; GFX11-NEXT:    v_pk_lshlrev_b16 v2, v2, 1 op_sel_hi:[1,0]
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v0, 1 op_sel_hi:[1,0]
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 1 op_sel_hi:[1,0]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT:    v_cvt_f16_u16_e32 v4, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT:    v_cvt_f16_u16_e32 v5, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT:    v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT:    v_cvt_f16_u16_e32 v6, v6
; GFX11-NEXT:    v_cvt_f16_u16_e32 v7, v7
; GFX11-NEXT:    v_cvt_f16_u16_e32 v2, v2
; GFX11-NEXT:    v_cvt_f16_u16_e32 v3, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pk_mul_f16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_mul_f16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pk_mul_f16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_mul_f16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
  %p2_f = uitofp <8 x i16> %p2 to <8 x half>
  %r = fmul <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
  ret <8 x half> %r
}

; Direct llvm.ldexp call on <8 x half> with a splat of 0xH7000; the expected
; code is one v_ldexp_f16 per element followed by repacking into dwords.
define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
; VI-LABEL: fmul_pow2_ldexp_8xhalf:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v5, 0x7000
; VI-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
; VI-NEXT:    v_ldexp_f16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
; VI-NEXT:    v_ldexp_f16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
; VI-NEXT:    v_ldexp_f16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
; VI-NEXT:    v_ldexp_f16_sdwa v0, v5, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v8, v0
; VI-NEXT:    v_or_b32_e32 v1, v7, v1
; VI-NEXT:    v_or_b32_e32 v2, v6, v2
; VI-NEXT:    v_or_b32_e32 v3, v4, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow2_ldexp_8xhalf:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7000
; GFX10-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v3
; GFX10-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v2
; GFX10-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v1
; GFX10-NEXT:    v_ldexp_f16_e32 v8, 0x7000, v0
; GFX10-NEXT:    v_ldexp_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_ldexp_f16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_ldexp_f16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_ldexp_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT:    v_pack_b32_f16 v0, v8, v0
; GFX10-NEXT:    v_pack_b32_f16 v1, v7, v1
; GFX10-NEXT:    v_pack_b32_f16 v2, v6, v2
; GFX10-NEXT:    v_pack_b32_f16 v3, v5, v3
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow2_ldexp_8xhalf:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_ldexp_f16_e32 v4, 0x7000, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT:    v_ldexp_f16_e32 v5, 0x7000, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT:    v_ldexp_f16_e32 v1, 0x7000, v1
; GFX11-NEXT:    v_ldexp_f16_e32 v0, 0x7000, v0
; GFX11-NEXT:    v_ldexp_f16_e32 v6, 0x7000, v6
; GFX11-NEXT:    v_ldexp_f16_e32 v7, 0x7000, v7
; GFX11-NEXT:    v_ldexp_f16_e32 v2, 0x7000, v2
; GFX11-NEXT:    v_ldexp_f16_e32 v3, 0x7000, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v6
; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pack_b32_f16 v2, v5, v2
; GFX11-NEXT:    v_pack_b32_f16 v3, v4, v3
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
  ret <8 x half> %r
}

; half 0xH7000 / uitofp(1 << %i) on <8 x i16>. As in the f32 divide case the
; expected code is integer-only: %i is shifted left by 10 (the position of the
; f16 exponent field) and subtracted from 0x7000.
define <8 x half> @fdiv_pow2_8xhalf(<8 x i16> %i) {
; VI-LABEL: fdiv_pow2_8xhalf:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v4, 10
; VI-NEXT:    v_lshlrev_b16_sdwa v5, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_mov_b32_e32 v6, 0x7000
; VI-NEXT:    v_lshlrev_b16_e32 v3, 10, v3
; VI-NEXT:    v_lshlrev_b16_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v2, 10, v2
; VI-NEXT:    v_lshlrev_b16_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v1, 10, v1
; VI-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v0, 10, v0
; VI-NEXT:    v_sub_u16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_sub_u16_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_sub_u16_sdwa v8, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_sub_u16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    v_sub_u16_e32 v0, 0x7000, v0
; VI-NEXT:    v_sub_u16_e32 v1, 0x7000, v1
; VI-NEXT:    v_sub_u16_e32 v2, 0x7000, v2
; VI-NEXT:    v_sub_u16_e32 v3, 0x7000, v3
; VI-NEXT:    v_or_b32_e32 v0, v0, v4
; VI-NEXT:    v_or_b32_e32 v1, v1, v8
; VI-NEXT:    v_or_b32_e32 v2, v2, v7
; VI-NEXT:    v_or_b32_e32 v3, v3, v5
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow2_8xhalf:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow2_8xhalf:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 10, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 10, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v2, 10, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v3, 10, v3 op_sel_hi:[0,1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pk_sub_i16 v0, 0x7000, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_sub_i16 v1, 0x7000, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_pk_sub_i16 v2, 0x7000, v2 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_sub_i16 v3, 0x7000, v3 op_sel_hi:[0,1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %p2 = shl <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>, %i
  %p2_f = uitofp <8 x i16> %p2 to <8 x half>
  %r = fdiv <8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, %p2_f
  ret <8 x half> %r
}

; double 9.0 * uitofp(i64 1 << %cnt). The expected code performs the 64-bit
; shift, converts both 32-bit halves, recombines them (ldexp by 32 plus add)
; and then multiplies by 9.0 (high dword 0x40220000).
define double @fmul_pow_shl_cnt(i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT:    s_mov_b32 s5, 0x40220000
; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl nuw i64 1, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double 9.000000e+00, %conv
  ret double %mul
}

; Same shape as fmul_pow_shl_cnt but with a shifted constant of 2 and a
; negative multiplier (-9.0, high dword 0xc0220000).
define double @fmul_pow_shl_cnt2(i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt2:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT:    s_mov_b32 s5, 0xc0220000
; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f64 v[0:1], 0xc0220000, v[0:1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl nuw i64 2, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double -9.000000e+00, %conv
  ret double %mul
}

; The integer operand is a select between (1 << %cnt) and (2 << %cnt). The
; expected code selects the constant 1 or 2 first and shifts once, then
; converts and multiplies by 9.0f.
define float @fmul_pow_select(i32 %cnt, i1 %c) nounwind {
; VI-LABEL: fmul_pow_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 1, v1
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
; VI-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc
; VI-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_select:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_select:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX11-NEXT:    v_cndmask_b32_e64 v1, 2, 1, vcc_lo
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl2 = shl nuw i32 2, %cnt
  %shl1 = shl nuw i32 1, %cnt
  %shl = select i1 %c, i32 %shl1, i32 %shl2
  %conv = uitofp i32 %shl to float
  %mul = fmul float 9.000000e+00, %conv
  ret float %mul
}

; The integer operand is umin(8 << %cnt, 8192) as i64, converted to float and
; multiplied by 9.0f. The expected code materialises the compare/select for the
; umin and the expanded u64 -> f32 conversion.
define float @fmul_fly_pow_mul_min_pow2(i64 %cnt) nounwind {
; VI-LABEL: fmul_fly_pow_mul_min_pow2:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
; VI-NEXT:    s_mov_b64 s[4:5], 0x2000
; VI-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v2, 0x2000
; VI-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
; VI-NEXT:    v_ffbh_u32_e32 v2, v1
; VI-NEXT:    v_min_u32_e32 v2, 32, v2
; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT:    v_min_u32_e32 v0, 1, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_sub_u32_e32 v1, vcc, 32, v2
; VI-NEXT:    v_ldexp_f32 v0, v0, v1
; VI-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_fly_pow_mul_min_pow2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
; GFX10-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT:    v_ldexp_f32 v0, v0, v1
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_fly_pow_mul_min_pow2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 8
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cmp_gt_u64_e32 vcc_lo, 0x2000, v[0:1]
; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x2000, v0, vcc_lo
; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 32, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f32_e32 v0, 0x41100000, v0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl8 = shl nuw i64 8, %cnt
  %shl = call i64 @llvm.umin.i64(i64 %shl8, i64 8192)
  %conv = uitofp i64 %shl to float
  %mul = fmul float 9.000000e+00, %conv
  ret float %mul
}

; The integer operand is umax(1 << %cnt, 2 << %cnt) as i16, converted to double
; and multiplied by 3.0 (high dword 0x40080000). The expected code contains no
; max instruction, only the shift of the constant 2.
define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind {
; VI-LABEL: fmul_pow_mul_max_pow2:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 2
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    s_mov_b32 s5, 0x40080000
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_mul_max_pow2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 2
; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_mul_max_pow2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40080000, v[0:1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl2 = shl nuw i16 2, %cnt
  %shl1 = shl nuw i16 1, %cnt
  %shl = call i16 @llvm.umax.i16(i16 %shl1, i16 %shl2)
  %conv = uitofp i16 %shl to double
  %mul = fmul double 3.000000e+00, %conv
  ret double %mul
}

; Here the shifted value is the argument %v rather than a constant, so it is
; not known to be a power of two. Going by the function name this is a
; negative case; the expected code keeps the full shift, convert and multiply.
define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; VI-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; VI-NEXT:    s_mov_b32 s5, 0x40220000
; VI-NEXT:    v_ldexp_f64 v[1:2], v[1:2], 32
; VI-NEXT:    v_add_f64 v[0:1], v[1:2], v[3:4]
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; GFX10-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; GFX10-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX10-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[1:2], v1
; GFX11-NEXT:    v_cvt_f64_u32_e32 v[3:4], v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_ldexp_f64 v[0:1], v[1:2], 32
; GFX11-NEXT:    v_add_f64 v[0:1], v[0:1], v[3:4]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_mul_f64 v[0:1], 0x40220000, v[0:1]
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl nuw i64 %v, %cnt
  %conv = uitofp i64 %shl to double
  %mul = fmul double 9.000000e+00, %conv
  ret double %mul
}

; <2 x i64> (2 << %cnt) converted to <2 x float> and multiplied by 15.0f
; (0x41700000). The expected code keeps the expanded u64 -> f32 conversion and
; the multiply for both lanes.
define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b64 v[1:2], v2, 2
; VI-NEXT:    v_ffbh_u32_e32 v3, v2
; VI-NEXT:    v_min_u32_e32 v5, 32, v3
; VI-NEXT:    v_lshlrev_b64 v[1:2], v5, v[1:2]
; VI-NEXT:    v_lshlrev_b64 v[3:4], v0, 2
; VI-NEXT:    v_min_u32_e32 v0, 1, v1
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v2, v0
; VI-NEXT:    v_ffbh_u32_e32 v0, v4
; VI-NEXT:    v_min_u32_e32 v6, 32, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], v6, v[3:4]
; VI-NEXT:    v_sub_u32_e32 v3, vcc, 32, v5
; VI-NEXT:    v_min_u32_e32 v0, 1, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
; VI-NEXT:    v_ldexp_f32 v1, v2, v3
; VI-NEXT:    v_sub_u32_e32 v2, vcc, 32, v6
; VI-NEXT:    v_ldexp_f32 v0, v0, v2
; VI-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
; VI-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
; GFX10-NEXT:    v_ffbh_u32_e32 v4, v1
; GFX10-NEXT:    v_ffbh_u32_e32 v5, v3
; GFX10-NEXT:    v_min_u32_e32 v4, 32, v4
; GFX10-NEXT:    v_min_u32_e32 v5, 32, v5
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX10-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX10-NEXT:    v_min_u32_e32 v2, 1, v2
; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX10-NEXT:    v_or_b32_e32 v1, v3, v2
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 32, v5
; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 32, v4
; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT:    v_ldexp_f32 v0, v0, v3
; GFX10-NEXT:    v_ldexp_f32 v1, v1, v2
; GFX10-NEXT:    v_mul_f32_e32 v0, 0x41700000, v0
; GFX10-NEXT:    v_mul_f32_e32 v1, 0x41700000, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_clz_i32_u32_e32 v4, v1
; GFX11-NEXT:    v_clz_i32_u32_e32 v5, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_min_u32_e32 v4, 32, v4
; GFX11-NEXT:    v_min_u32_e32 v5, 32, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_min_u32_e32 v0, 1, v0
; GFX11-NEXT:    v_min_u32_e32 v2, 1, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX11-NEXT:    v_or_b32_e32 v1, v3, v2
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 32, v5
; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 32, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT:    v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_ldexp_f32 v0, v0, v3
; GFX11-NEXT:    v_ldexp_f32 v1, v1, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mul_f32 v0, 0x41700000, v0 :: v_dual_mul_f32 v1, 0x41700000, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt
  %conv = uitofp <2 x i64> %shl to <2 x float>
  %mul = fmul <2 x float> <float 15.000000e+00, float 15.000000e+00>, %conv
  ret <2 x float> %mul
}

define <2 x double> @fmul_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
; VI-LABEL: fmul_pow_shl_cnt_vec:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
; VI-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; VI-NEXT:    s_mov_b32 s4, 0
; VI-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
; VI-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
; VI-NEXT:    v_cvt_f64_u32_e32 v[7:8], v2
; VI-NEXT:    s_mov_b32 s5, 0x402e0000
; VI-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
; VI-NEXT:    v_add_f64 v[2:3], v[5:6], v[7:8]
; VI-NEXT:    v_mul_f64 v[0:1], v[0:1], s[4:5]
; VI-NEXT:    v_mul_f64 v[2:3], v[2:3], s[4:5]
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fmul_pow_shl_cnt_vec:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v0, 2
; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v2, 2
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[4:5], v1
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[6:7], v3
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
; GFX10-NEXT:    v_cvt_f64_u32_e32 v[8:9], v2
; GFX10-NEXT:    v_ldexp_f64 v[3:4], v[4:5], 32
; GFX10-NEXT:    v_ldexp_f64 v[5:6], v[6:7], 32
; GFX10-NEXT:    v_add_f64 v[0:1], v[3:4], v[0:1]
; GFX10-NEXT:    v_add_f64 v[2:3], v[5:6],
v[8:9] 776; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] 777; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] 778; GFX10-NEXT: s_setpc_b64 s[30:31] 779; 780; GFX11-LABEL: fmul_pow_shl_cnt_vec: 781; GFX11: ; %bb.0: 782; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 783; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 784; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 785; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 786; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 787; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 788; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 789; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 790; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 791; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 792; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 793; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 794; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 795; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] 796; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] 797; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 798; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] 799; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] 800; GFX11-NEXT: s_setpc_b64 s[30:31] 801 %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt 802 %conv = uitofp <2 x i64> %shl to <2 x double> 803 %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv 804 ret <2 x double> %mul 805} 806 807define <4 x float> @fmul_pow_shl_cnt_vec_preserve_fma(<4 x i32> %cnt, <4 x float> %add) nounwind { 808; VI-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: 809; VI: ; %bb.0: 810; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 811; VI-NEXT: v_lshlrev_b32_e64 v3, v3, 2 812; VI-NEXT: v_lshlrev_b32_e64 v2, v2, 2 813; VI-NEXT: v_lshlrev_b32_e64 v1, v1, 2 814; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 2 815; VI-NEXT: v_cvt_f32_u32_e32 v3, v3 816; VI-NEXT: 
v_cvt_f32_u32_e32 v2, v2 817; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 818; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 819; VI-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3 820; VI-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2 821; VI-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1 822; VI-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0 823; VI-NEXT: v_add_f32_e32 v0, v0, v4 824; VI-NEXT: v_add_f32_e32 v1, v1, v5 825; VI-NEXT: v_add_f32_e32 v2, v2, v6 826; VI-NEXT: v_add_f32_e32 v3, v3, v7 827; VI-NEXT: s_setpc_b64 s[30:31] 828; 829; GFX10-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: 830; GFX10: ; %bb.0: 831; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 832; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 2 833; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, 2 834; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, 2 835; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, 2 836; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 837; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1 838; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2 839; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3 840; GFX10-NEXT: v_mul_f32_e32 v0, 0x40a00000, v0 841; GFX10-NEXT: v_mul_f32_e32 v1, 0x40a00000, v1 842; GFX10-NEXT: v_mul_f32_e32 v2, 0x40a00000, v2 843; GFX10-NEXT: v_mul_f32_e32 v3, 0x40a00000, v3 844; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 845; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 846; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 847; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 848; GFX10-NEXT: s_setpc_b64 s[30:31] 849; 850; GFX11-LABEL: fmul_pow_shl_cnt_vec_preserve_fma: 851; GFX11: ; %bb.0: 852; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 853; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 2 854; GFX11-NEXT: v_lshlrev_b32_e64 v1, v1, 2 855; GFX11-NEXT: v_lshlrev_b32_e64 v2, v2, 2 856; GFX11-NEXT: v_lshlrev_b32_e64 v3, v3, 2 857; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 858; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 859; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 860; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 861; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2 862; 
GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3 863; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) 864; GFX11-NEXT: v_dual_mul_f32 v0, 0x40a00000, v0 :: v_dual_mul_f32 v1, 0x40a00000, v1 865; GFX11-NEXT: v_dual_mul_f32 v2, 0x40a00000, v2 :: v_dual_mul_f32 v3, 0x40a00000, v3 866; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 867; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 868; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 869; GFX11-NEXT: s_setpc_b64 s[30:31] 870 %shl = shl nsw nuw <4 x i32> <i32 2, i32 2, i32 2, i32 2>, %cnt 871 %conv = uitofp <4 x i32> %shl to <4 x float> 872 %mul = fmul <4 x float> <float 5.000000e+00, float 5.000000e+00, float 5.000000e+00, float 5.000000e+00>, %conv 873 %res = fadd <4 x float> %mul, %add 874 ret <4 x float> %res 875} 876 877define <2 x double> @fmul_pow_shl_cnt_vec_non_splat_todo(<2 x i64> %cnt) nounwind { 878; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: 879; VI: ; %bb.0: 880; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 881; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 882; VI-NEXT: s_mov_b32 s4, 0 883; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v1 884; VI-NEXT: v_lshlrev_b64 v[1:2], v2, 2 885; VI-NEXT: s_mov_b32 s5, 0x402e0000 886; VI-NEXT: v_cvt_f64_u32_e32 v[5:6], v2 887; VI-NEXT: v_ldexp_f64 v[2:3], v[3:4], 32 888; VI-NEXT: v_ldexp_f64 v[4:5], v[5:6], 32 889; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v0 890; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v1 891; VI-NEXT: v_add_f64 v[2:3], v[2:3], v[6:7] 892; VI-NEXT: v_add_f64 v[4:5], v[4:5], v[0:1] 893; VI-NEXT: v_mul_f64 v[0:1], v[2:3], s[4:5] 894; VI-NEXT: s_mov_b32 s4, 0 895; VI-NEXT: s_mov_b32 s5, 0x402c0000 896; VI-NEXT: v_mul_f64 v[2:3], v[4:5], s[4:5] 897; VI-NEXT: s_setpc_b64 s[30:31] 898; 899; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: 900; GFX10: ; %bb.0: 901; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 902; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 903; GFX10-NEXT: 
v_lshlrev_b64 v[2:3], v2, 2 904; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 905; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 906; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 907; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 908; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 909; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 910; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] 911; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] 912; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] 913; GFX10-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3] 914; GFX10-NEXT: s_setpc_b64 s[30:31] 915; 916; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat_todo: 917; GFX11: ; %bb.0: 918; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 919; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 920; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 2 921; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 922; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 923; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 924; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 925; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 926; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 927; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 928; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 929; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 930; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 931; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] 932; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] 933; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 934; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] 935; GFX11-NEXT: v_mul_f64 v[2:3], 0x402c0000, v[2:3] 936; GFX11-NEXT: s_setpc_b64 s[30:31] 937 %shl = shl nsw nuw <2 x i64> <i64 2, i64 2>, %cnt 938 %conv = uitofp <2 x i64> %shl to <2 x double> 939 %mul = fmul <2 x double> <double 15.000000e+00, double 14.000000e+00>, %conv 940 ret <2 x double> %mul 941} 942 943define <2 x double> 
@fmul_pow_shl_cnt_vec_non_splat2_todo(<2 x i64> %cnt) nounwind { 944; VI-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: 945; VI: ; %bb.0: 946; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 947; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 2 948; VI-NEXT: v_lshlrev_b64 v[2:3], v2, 1 949; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 950; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 951; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 952; VI-NEXT: s_mov_b32 s4, 0 953; VI-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 954; VI-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 955; VI-NEXT: v_cvt_f64_u32_e32 v[7:8], v2 956; VI-NEXT: s_mov_b32 s5, 0x402e0000 957; VI-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] 958; VI-NEXT: v_add_f64 v[2:3], v[5:6], v[7:8] 959; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] 960; VI-NEXT: v_mul_f64 v[2:3], v[2:3], s[4:5] 961; VI-NEXT: s_setpc_b64 s[30:31] 962; 963; GFX10-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: 964; GFX10: ; %bb.0: 965; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 966; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 2 967; GFX10-NEXT: v_lshlrev_b64 v[2:3], v2, 1 968; GFX10-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 969; GFX10-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 970; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 971; GFX10-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 972; GFX10-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 973; GFX10-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 974; GFX10-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] 975; GFX10-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] 976; GFX10-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] 977; GFX10-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] 978; GFX10-NEXT: s_setpc_b64 s[30:31] 979; 980; GFX11-LABEL: fmul_pow_shl_cnt_vec_non_splat2_todo: 981; GFX11: ; %bb.0: 982; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 983; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 2 984; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 1 985; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 986; GFX11-NEXT: v_cvt_f64_u32_e32 v[4:5], v1 987; GFX11-NEXT: v_cvt_f64_u32_e32 v[6:7], v3 988; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 989; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 990; GFX11-NEXT: v_cvt_f64_u32_e32 v[8:9], v2 991; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 992; GFX11-NEXT: v_ldexp_f64 v[3:4], v[4:5], 32 993; GFX11-NEXT: v_ldexp_f64 v[5:6], v[6:7], 32 994; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 995; GFX11-NEXT: v_add_f64 v[0:1], v[3:4], v[0:1] 996; GFX11-NEXT: v_add_f64 v[2:3], v[5:6], v[8:9] 997; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 998; GFX11-NEXT: v_mul_f64 v[0:1], 0x402e0000, v[0:1] 999; GFX11-NEXT: v_mul_f64 v[2:3], 0x402e0000, v[2:3] 1000; GFX11-NEXT: s_setpc_b64 s[30:31] 1001 %shl = shl nsw nuw <2 x i64> <i64 2, i64 1>, %cnt 1002 %conv = uitofp <2 x i64> %shl to <2 x double> 1003 %mul = fmul <2 x double> <double 15.000000e+00, double 15.000000e+00>, %conv 1004 ret <2 x double> %mul 1005} 1006 1007define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { 1008; VI-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: 1009; VI: ; %bb.0: 1010; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1011; VI-NEXT: v_mov_b32_e32 v1, 2 1012; VI-NEXT: v_lshlrev_b16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1013; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 2 1014; VI-NEXT: v_cvt_f16_u16_e32 v0, v0 1015; VI-NEXT: v_cvt_f16_u16_e32 v1, v1 1016; VI-NEXT: v_mov_b32_e32 v2, 0x4b80 1017; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1018; VI-NEXT: v_mul_f16_e32 v0, 0x4b80, v0 1019; VI-NEXT: v_or_b32_e32 v0, v0, v1 1020; VI-NEXT: s_setpc_b64 s[30:31] 1021; 1022; GFX10-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: 1023; GFX10: ; %bb.0: 1024; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1025; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] 1026; GFX10-NEXT: v_cvt_f16_u16_e32 v1, 
v0 1027; GFX10-NEXT: v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1028; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0 1029; GFX10-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] 1030; GFX10-NEXT: s_setpc_b64 s[30:31] 1031; 1032; GFX11-LABEL: fmul_pow_shl_cnt_vec_fail_to_large: 1033; GFX11: ; %bb.0: 1034; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1035; GFX11-NEXT: v_pk_lshlrev_b16 v0, v0, 2 op_sel_hi:[1,0] 1036; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1037; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1038; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 1039; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 1040; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1041; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 1042; GFX11-NEXT: v_pk_mul_f16 v0, 0x4b80, v0 op_sel_hi:[0,1] 1043; GFX11-NEXT: s_setpc_b64 s[30:31] 1044 %shl = shl nsw nuw <2 x i16> <i16 2, i16 2>, %cnt 1045 %conv = uitofp <2 x i16> %shl to <2 x half> 1046 %mul = fmul <2 x half> <half 15.000000e+00, half 15.000000e+00>, %conv 1047 ret <2 x half> %mul 1048} 1049 1050define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { 1051; VI-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: 1052; VI: ; %bb.0: 1053; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1054; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 1 1055; VI-NEXT: s_mov_b32 s4, 0xff5f3992 1056; VI-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 1057; VI-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 1058; VI-NEXT: s_mov_b32 s5, 0x7befffff 1059; VI-NEXT: v_ldexp_f64 v[1:2], v[1:2], 32 1060; VI-NEXT: v_add_f64 v[0:1], v[1:2], v[3:4] 1061; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] 1062; VI-NEXT: s_setpc_b64 s[30:31] 1063; 1064; GFX10-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: 1065; GFX10: ; %bb.0: 1066; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1067; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 1 1068; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992 1069; GFX10-NEXT: s_mov_b32 s5, 0x7befffff 1070; 
GFX10-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 1071; GFX10-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 1072; GFX10-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 1073; GFX10-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] 1074; GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] 1075; GFX10-NEXT: s_setpc_b64 s[30:31] 1076; 1077; GFX11-LABEL: fmul_pow_shl_cnt_fail_maybe_bad_exp: 1078; GFX11: ; %bb.0: 1079; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1080; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 1 1081; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992 1082; GFX11-NEXT: s_mov_b32 s1, 0x7befffff 1083; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) 1084; GFX11-NEXT: v_cvt_f64_u32_e32 v[1:2], v1 1085; GFX11-NEXT: v_cvt_f64_u32_e32 v[3:4], v0 1086; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1087; GFX11-NEXT: v_ldexp_f64 v[0:1], v[1:2], 32 1088; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[3:4] 1089; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1090; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] 1091; GFX11-NEXT: s_setpc_b64 s[30:31] 1092 %shl = shl nuw i64 1, %cnt 1093 %conv = uitofp i64 %shl to double 1094 %mul = fmul double 9.745314e+288, %conv 1095 ret double %mul 1096} 1097 1098define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { 1099; VI-LABEL: fmul_pow_shl_cnt_safe: 1100; VI: ; %bb.0: 1101; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1102; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1 1103; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 1104; VI-NEXT: s_mov_b32 s4, 0xff5f3992 1105; VI-NEXT: s_mov_b32 s5, 0x7befffff 1106; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] 1107; VI-NEXT: s_setpc_b64 s[30:31] 1108; 1109; GFX10-LABEL: fmul_pow_shl_cnt_safe: 1110; GFX10: ; %bb.0: 1111; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1112; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1 1113; GFX10-NEXT: s_mov_b32 s4, 0xff5f3992 1114; GFX10-NEXT: s_mov_b32 s5, 0x7befffff 1115; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 1116; GFX10-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 1117; 
GFX10-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] 1118; GFX10-NEXT: s_setpc_b64 s[30:31] 1119; 1120; GFX11-LABEL: fmul_pow_shl_cnt_safe: 1121; GFX11: ; %bb.0: 1122; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1123; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1 1124; GFX11-NEXT: s_mov_b32 s0, 0xff5f3992 1125; GFX11-NEXT: s_mov_b32 s1, 0x7befffff 1126; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1127; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 1128; GFX11-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 1129; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1130; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] 1131; GFX11-NEXT: s_setpc_b64 s[30:31] 1132 %shl = shl nuw i16 1, %cnt 1133 %conv = uitofp i16 %shl to double 1134 %mul = fmul double 9.745314e+288, %conv 1135 ret double %mul 1136} 1137 1138define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind { 1139; VI-LABEL: fdiv_pow_shl_cnt_vec: 1140; VI: ; %bb.0: 1141; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1142; VI-NEXT: v_lshlrev_b32_e32 v1, 20, v0 1143; VI-NEXT: v_mov_b32_e32 v3, 0x3ff00000 1144; VI-NEXT: v_sub_u32_e64 v0, vcc, 0, 0 1145; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc 1146; VI-NEXT: v_lshlrev_b32_e32 v4, 20, v2 1147; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0 1148; VI-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 1149; VI-NEXT: s_setpc_b64 s[30:31] 1150; 1151; GFX10-LABEL: fdiv_pow_shl_cnt_vec: 1152; GFX10: ; %bb.0: 1153; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1154; GFX10-NEXT: v_lshlrev_b32_e32 v1, 20, v0 1155; GFX10-NEXT: v_lshlrev_b32_e32 v3, 20, v2 1156; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0 1157; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo 1158; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0 1159; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo 1160; GFX10-NEXT: s_setpc_b64 s[30:31] 1161; 1162; GFX11-LABEL: fdiv_pow_shl_cnt_vec: 1163; GFX11: ; %bb.0: 1164; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1165; GFX11-NEXT: 
v_lshlrev_b32_e32 v1, 20, v0 1166; GFX11-NEXT: v_lshlrev_b32_e32 v3, 20, v2 1167; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, 0, 0 1168; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) 1169; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo 1170; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, 0, 0 1171; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo 1172; GFX11-NEXT: s_setpc_b64 s[30:31] 1173 %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt 1174 %conv = uitofp <2 x i64> %shl to <2 x double> 1175 %mul = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %conv 1176 ret <2 x double> %mul 1177} 1178 1179define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nounwind { 1180; VI-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: 1181; VI: ; %bb.0: 1182; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1183; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1184; VI-NEXT: v_lshlrev_b32_e32 v1, 23, v2 1185; VI-NEXT: v_sub_u32_e32 v0, vcc, 1.0, v0 1186; VI-NEXT: v_sub_u32_e32 v1, vcc, 1.0, v1 1187; VI-NEXT: s_setpc_b64 s[30:31] 1188; 1189; GFX10-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: 1190; GFX10: ; %bb.0: 1191; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1192; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1193; GFX10-NEXT: v_lshlrev_b32_e32 v1, 23, v2 1194; GFX10-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0 1195; GFX10-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1 1196; GFX10-NEXT: s_setpc_b64 s[30:31] 1197; 1198; GFX11-LABEL: fdiv_pow_shl_cnt_vec_with_expensive_cast: 1199; GFX11: ; %bb.0: 1200; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1201; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1202; GFX11-NEXT: v_lshlrev_b32_e32 v1, 23, v2 1203; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1204; GFX11-NEXT: v_sub_nc_u32_e32 v0, 1.0, v0 1205; GFX11-NEXT: v_sub_nc_u32_e32 v1, 1.0, v1 1206; GFX11-NEXT: s_setpc_b64 s[30:31] 1207 %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt 
1208 %conv = uitofp <2 x i64> %shl to <2 x float> 1209 %mul = fdiv <2 x float> <float 1.000000e+00, float 1.000000e+00>, %conv 1210 ret <2 x float> %mul 1211} 1212 1213define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { 1214; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: 1215; VI: ; %bb.0: 1216; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1217; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 1218; VI-NEXT: s_mov_b32 s6, 0xc1100000 1219; VI-NEXT: v_ffbh_u32_e32 v2, v1 1220; VI-NEXT: v_min_u32_e32 v2, 32, v2 1221; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1222; VI-NEXT: v_min_u32_e32 v0, 1, v0 1223; VI-NEXT: v_or_b32_e32 v0, v1, v0 1224; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 1225; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 1226; VI-NEXT: v_ldexp_f32 v0, v0, v1 1227; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 1228; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 1229; VI-NEXT: v_rcp_f32_e32 v3, v1 1230; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 1231; VI-NEXT: v_fma_f32 v3, v4, v3, v3 1232; VI-NEXT: v_mul_f32_e32 v4, v2, v3 1233; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 1234; VI-NEXT: v_fma_f32 v4, v5, v3, v4 1235; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 1236; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 1237; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6 1238; VI-NEXT: s_setpc_b64 s[30:31] 1239; 1240; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: 1241; GFX10: ; %bb.0: 1242; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1243; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 1244; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 1245; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 1246; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1247; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 1248; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 1249; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 1250; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0 1251; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 1252; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000 1253; GFX10-NEXT: v_rcp_f32_e32 v2, v1 1254; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 1255; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 1256; 
GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 1257; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 1258; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 1259; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 1260; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 1261; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 1262; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 1263; GFX10-NEXT: s_setpc_b64 s[30:31] 1264; 1265; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z: 1266; GFX11: ; %bb.0: 1267; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1268; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 1269; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1270; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 1271; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 1272; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1273; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1274; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 1275; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1276; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 1277; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 1278; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 1279; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1280; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 1281; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000 1282; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 1283; GFX11-NEXT: v_rcp_f32_e32 v2, v1 1284; GFX11-NEXT: s_waitcnt_depctr 0xfff 1285; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 1286; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 1287; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 1288; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1289; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 1290; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 1291; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1292; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 1293; GFX11-NEXT: 
v_fma_f32 v1, -v1, v4, v3 1294; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1295; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 1296; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 1297; GFX11-NEXT: s_setpc_b64 s[30:31] 1298 %shl = shl i64 8, %cnt 1299 %conv = uitofp i64 %shl to float 1300 %mul = fdiv float -9.000000e+00, %conv 1301 ret float %mul 1302} 1303 1304define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { 1305; VI-LABEL: fdiv_pow_shl_cnt_fail_neg_int: 1306; VI: ; %bb.0: 1307; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1308; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8 1309; VI-NEXT: s_mov_b32 s6, 0xc1100000 1310; VI-NEXT: v_xor_b32_e32 v2, v0, v1 1311; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v2 1312; VI-NEXT: v_ffbh_i32_e32 v3, v1 1313; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 1314; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3 1315; VI-NEXT: v_min_u32_e32 v2, v3, v2 1316; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1317; VI-NEXT: v_min_u32_e32 v0, 1, v0 1318; VI-NEXT: v_or_b32_e32 v0, v1, v0 1319; VI-NEXT: v_cvt_f32_i32_e32 v0, v0 1320; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2 1321; VI-NEXT: v_ldexp_f32 v0, v0, v1 1322; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6 1323; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6 1324; VI-NEXT: v_rcp_f32_e32 v3, v1 1325; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0 1326; VI-NEXT: v_fma_f32 v3, v4, v3, v3 1327; VI-NEXT: v_mul_f32_e32 v4, v2, v3 1328; VI-NEXT: v_fma_f32 v5, -v1, v4, v2 1329; VI-NEXT: v_fma_f32 v4, v5, v3, v4 1330; VI-NEXT: v_fma_f32 v1, -v1, v4, v2 1331; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4 1332; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6 1333; VI-NEXT: s_setpc_b64 s[30:31] 1334; 1335; GFX10-LABEL: fdiv_pow_shl_cnt_fail_neg_int: 1336; GFX10: ; %bb.0: 1337; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1338; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8 1339; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1 1340; GFX10-NEXT: v_ffbh_i32_e32 v3, v1 1341; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2 
1342; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3 1343; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2 1344; GFX10-NEXT: v_min_u32_e32 v2, v3, v2 1345; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1346; GFX10-NEXT: v_min_u32_e32 v0, 1, v0 1347; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 1348; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2 1349; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0 1350; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 1351; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000 1352; GFX10-NEXT: v_rcp_f32_e32 v2, v1 1353; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0 1354; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2 1355; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 1356; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2 1357; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3 1358; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2 1359; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3 1360; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4 1361; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 1362; GFX10-NEXT: s_setpc_b64 s[30:31] 1363; 1364; GFX11-LABEL: fdiv_pow_shl_cnt_fail_neg_int: 1365; GFX11: ; %bb.0: 1366; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1367; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8 1368; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1369; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1 1370; GFX11-NEXT: v_cls_i32_e32 v3, v1 1371; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2 1372; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1373; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3 1374; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2 1375; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1376; GFX11-NEXT: v_min_u32_e32 v2, v3, v2 1377; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] 1378; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1379; GFX11-NEXT: v_min_u32_e32 v0, 1, v0 1380; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 1381; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2 1382; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1383; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 1384; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 1385; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1386; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000 1387; GFX11-NEXT: v_rcp_f32_e32 v2, v1 1388; GFX11-NEXT: s_waitcnt_depctr 0xfff 1389; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0 1390; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) 1391; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2 1392; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000 1393; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2 1394; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1395; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3 1396; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2 1397; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1398; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3 1399; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4 1400; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1401; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000 1402; GFX11-NEXT: s_setpc_b64 s[30:31] 1403 %shl = shl i64 8, %cnt 1404 %conv = sitofp i64 %shl to float 1405 %mul = fdiv float -9.000000e+00, %conv 1406 ret float %mul 1407} 1408 1409define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind { 1410; VI-LABEL: fdiv_pow_shl_cnt: 1411; VI: ; %bb.0: 1412; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1413; VI-NEXT: v_and_b32_e32 v0, 31, v0 1414; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1415; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbd800000, v0 1416; VI-NEXT: s_setpc_b64 s[30:31] 1417; 1418; GFX10-LABEL: fdiv_pow_shl_cnt: 1419; GFX10: ; %bb.0: 1420; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1421; GFX10-NEXT: v_and_b32_e32 v0, 31, v0 1422; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0 1423; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0 1424; GFX10-NEXT: s_setpc_b64 s[30:31] 1425; 1426; GFX11-LABEL: 
fdiv_pow_shl_cnt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %cnt = and i64 %cnt_in, 31
  %shl = shl i64 8, %cnt
  %conv = sitofp i64 %shl to float
  %mul = fdiv float -0.500000e+00, %conv
  ret float %mul
}

; Negative test. Divides the half constant 0xH7000 (8192.0) by
; uitofp(1 << %cnt), where the shift is done in i32 and the result is half.
; The autogenerated assertions below still contain the full division
; expansion (v_rcp_f32 ... v_div_fixup_f16), i.e. the fdiv is not turned into
; integer shift/subtract.
; NOTE(review): presumably rejected because an i32 shift count can produce a
; power of two beyond what the half exponent can absorb - confirm against the
; bounds check in the combine.
define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, 0x46000000
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v2, v1
; VI-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
; VI-NEXT: v_mad_f32 v4, -v1, v3, s4
; VI-NEXT: v_mac_f32_e32 v3, v4, v2
; VI-NEXT: v_mad_f32 v1, -v1, v3, s4
; VI-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: s_movk_i32 s4, 0x7000
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 0x46000000
; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2
; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 0x46000000
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX11-NEXT: s_mov_b32 s0, 0x46000000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i32 1, %cnt
  %conv = uitofp i32 %shl to half
  %mul = fdiv half 0xH7000, %conv
  ret half %mul
}

; Positive counterpart of fdiv_pow_shl_cnt_fail_out_of_bounds: same dividend
; 0xH7000 (8192.0), but the shift is done in i16. The assertions show the
; fold on every target: %cnt is shifted left by 10 (into the half exponent
; field) and subtracted from the dividend's bit pattern 0x7000, with no
; floating-point instructions left.
define half @fdiv_pow_shl_cnt_in_bounds(i16 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_in_bounds:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
; VI-NEXT: v_sub_u16_e32 v0, 0x7000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX10-NEXT: v_sub_nc_u16 v0, 0x7000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u16 v0, 0x7000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i16 1, %cnt
  %conv = uitofp i16 %shl to half
  %mul = fdiv half 0xH7000, %conv
  ret half %mul
}

; Same i16 pattern as fdiv_pow_shl_cnt_in_bounds with a smaller dividend,
; 0xH4800 (8.0). The assertions show it is still folded to a shift by 10 and
; an integer subtract from 0x4800.
define half @fdiv_pow_shl_cnt_in_bounds2(i16 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v0, 10, v0
; VI-NEXT: v_sub_u16_e32 v0, 0x4800, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX10-NEXT: v_sub_nc_u16 v0, 0x4800, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_in_bounds2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, 10, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u16 v0, 0x4800, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i16 1, %cnt
  %conv = uitofp i16 %shl to half
  %mul = fdiv half 0xH4800, %conv
  ret half %mul
}

; Negative test. Same i16 pattern again, but with dividend 0xH4000 (2.0). The
; assertions show the full division expansion is kept (v_rcp_f32 ...
; v_div_fixup_f16), so the fold does not fire for this constant.
; NOTE(review): going by the function name, 2.0 is treated as leaving the
; result exponent out of bounds for the transform - confirm the exact limit
; in the combine.
define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_rcp_f32_e32 v2, v1
; VI-NEXT: v_add_f32_e32 v3, v2, v2
; VI-NEXT: v_mad_f32 v4, -v1, v3, 2.0
; VI-NEXT: v_mac_f32_e32 v3, v4, v2
; VI-NEXT: v_mad_f32 v1, -v1, v3, 2.0
; VI-NEXT: v_mul_f32_e32 v1, v1, v2
; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_add_f32_e32 v3, v2, v2
; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 2.0
; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2
; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 2.0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX11-NEXT: s_mov_b32 s0, 2.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_add_f32_e32 v2, v1, v1
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i16 1, %cnt
  %conv = uitofp i16 %shl to half
  %mul = fdiv half 0xH4000, %conv
  ret half %mul
}

; Positive test with an i32 shift count and a double result. The dividend is
; 0x36A0000000000000 (2^-149). The assertions show the fold as a 64-bit
; integer subtract: %cnt is shifted left by 20 within the high 32-bit word
; and subtracted (with borrow) from 0x36a00000, while the low word of the
; result is the constant 0.
define double @fdiv_pow_shl_cnt32_to_dbl_okay(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x36a00000
; VI-NEXT: v_sub_u32_e64 v2, vcc, 0, 0
; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; GFX10-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_to_dbl_okay:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 20, v0
; GFX11-NEXT: v_sub_co_u32 v1, vcc_lo, 0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, 0x36a00000, v0, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i32 1, %cnt
  %conv = uitofp i32 %shl to double
  %mul = fdiv double 0x36A0000000000000, %conv
  ret double %mul
}

; Negative test with an i32 shift count and a float result. The dividend
; appears in the assertions as the f32 bit pattern 0x10fffff8, which is just
; below the 0x11000000 used by fdiv_pow_shl_cnt32_okay. The assertions show
; the full f32 division sequence (v_div_scale_f32 / v_rcp_f32 /
; v_div_fmas_f32 / v_div_fixup_f32), so no integer fold happens here.
; NOTE(review): together with fdiv_pow_shl_cnt32_okay this looks like a
; boundary pair for the fold's exponent limit - confirm against the combine.
define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_mov_b32 s6, 0x10fffff8
; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
; VI-NEXT: v_fma_f32 v3, v4, v3, v3
; VI-NEXT: v_mul_f32_e32 v4, v2, v3
; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
; VI-NEXT: v_fma_f32 v4, v5, v3, v4
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0x10fffff8
; GFX10-NEXT: v_rcp_f32_e32 v2, v1
; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_out_of_bounds2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0x10fffff8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v2, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0x10fffff8, v0, 0x10fffff8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0x10fffff8
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i32 1, %cnt
  %conv = uitofp i32 %shl to float
  %mul = fdiv float 0x3a1fffff00000000, %conv
  ret float %mul
}

; Positive counterpart of fdiv_pow_shl_cnt32_out_of_bounds2: the dividend is
; 2^-93, which appears in the assertions as the f32 bit pattern 0x11000000.
; The assertions show the fold on every target: %cnt is shifted left by 23
; (into the float exponent field) and subtracted from 0x11000000.
define float @fdiv_pow_shl_cnt32_okay(i32 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt32_okay:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x11000000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt32_okay:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt32_okay:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0x11000000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %shl = shl nuw i32 1, %cnt
  %conv = uitofp i32 %shl to float
  %mul = fdiv float 0x3a20000000000000, %conv
  ret float %mul
}