; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
;
; The five llc invocations above compile this file for the default amdgcn
; target and for tonga, gfx1010, gfx1100 and gfx1200 (the last three with
; +wavefrontsize64). Each invocation is matched against its own check prefix,
; so every function below carries one block of expected assembly per target.
; Do not edit the expected assembly by hand; regenerate it with the script
; named in the first line.

; Intrinsics referenced by the tests. Of these, only
; llvm.amdgcn.workitem.id.x is called by the function that follows; the fabs
; declarations are presumably used further down the file (TODO confirm).
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fabs.f16(half)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)

; All nan values are converted to 0xffffffff
;
; Selects between a NaN float constant and a float loaded from %fptr (indexed
; by the workitem id), keyed on %c != 0, and stores the result to %out. The
; expected assembly for every target materializes the NaN as the inline
; constant -1 in a v_cndmask_b32, with the condition computed by
; s_cmp_eq_u32 / s_cselect_b64 into vcc.
define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cnd_nan_nosgpr:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s8, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_cmp_eq_u32 s8, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cnd_nan_nosgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_cnd_nan_nosgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s2, 0
; GFX10-NEXT: s_cselect_b64 vcc, -1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cnd_nan_nosgpr:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s2, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_cnd_nan_nosgpr:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v0, v0, s[0:1]
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_eq_u32 s2, 0
; GFX12-NEXT: s_cselect_b64 vcc, -1, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NEXT: s_endpgm
  ; Load the float at %fptr[workitem.id.x]. Because the value comes from
  ; memory, the expected assembly above holds it in a VGPR.
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx
  %f = load float, ptr addrspace(1) %f.gep
  ; Pick the NaN constant when %c is non-zero, otherwise the loaded value.
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
  store float %select, ptr addrspace(1) %out
  ret void
}

; This requires slightly trickier SGPR operand legalization since the
; single constant bus SGPR usage is the last operand, and it should
; never be moved.
; However on GFX10 constant bus is limited to 2 scalar operands, not one.
125; All nan values are converted to 0xffffffff 126define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 { 127; SI-LABEL: v_cnd_nan: 128; SI: ; %bb.0: 129; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 130; SI-NEXT: s_mov_b32 s7, 0xf000 131; SI-NEXT: s_mov_b32 s6, -1 132; SI-NEXT: s_waitcnt lgkmcnt(0) 133; SI-NEXT: s_mov_b32 s4, s0 134; SI-NEXT: s_mov_b32 s5, s1 135; SI-NEXT: s_cmp_eq_u32 s2, 0 136; SI-NEXT: v_mov_b32_e32 v0, s3 137; SI-NEXT: s_cselect_b64 vcc, -1, 0 138; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 139; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 140; SI-NEXT: s_endpgm 141; 142; VI-LABEL: v_cnd_nan: 143; VI: ; %bb.0: 144; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 145; VI-NEXT: s_waitcnt lgkmcnt(0) 146; VI-NEXT: s_cmp_eq_u32 s2, 0 147; VI-NEXT: v_mov_b32_e32 v0, s3 148; VI-NEXT: s_cselect_b64 vcc, -1, 0 149; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc 150; VI-NEXT: v_mov_b32_e32 v0, s0 151; VI-NEXT: v_mov_b32_e32 v1, s1 152; VI-NEXT: flat_store_dword v[0:1], v2 153; VI-NEXT: s_endpgm 154; 155; GFX10-LABEL: v_cnd_nan: 156; GFX10: ; %bb.0: 157; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 158; GFX10-NEXT: v_mov_b32_e32 v0, 0 159; GFX10-NEXT: s_waitcnt lgkmcnt(0) 160; GFX10-NEXT: s_cmp_eq_u32 s2, 0 161; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 162; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] 163; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 164; GFX10-NEXT: s_endpgm 165; 166; GFX11-LABEL: v_cnd_nan: 167; GFX11: ; %bb.0: 168; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 169; GFX11-NEXT: v_mov_b32_e32 v0, 0 170; GFX11-NEXT: s_waitcnt lgkmcnt(0) 171; GFX11-NEXT: s_cmp_eq_u32 s2, 0 172; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 173; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 174; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] 175; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 176; GFX11-NEXT: s_endpgm 177; 178; GFX12-LABEL: v_cnd_nan: 179; GFX12: ; %bb.0: 180; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 181; 
GFX12-NEXT: v_mov_b32_e32 v0, 0 182; GFX12-NEXT: s_wait_kmcnt 0x0 183; GFX12-NEXT: s_cmp_eq_u32 s2, 0 184; GFX12-NEXT: s_cselect_b32 s2, s3, -1 185; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 186; GFX12-NEXT: v_mov_b32_e32 v1, s2 187; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 188; GFX12-NEXT: s_endpgm 189 %setcc = icmp ne i32 %c, 0 190 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f 191 store float %select, ptr addrspace(1) %out 192 ret void 193} 194 195; Test different compare and select operand types for optimal code 196; shrinking. 197; (select (cmp (sgprX, constant)), constant, sgprZ) 198define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { 199; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: 200; SI: ; %bb.0: 201; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 202; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 203; SI-NEXT: s_mov_b32 s3, 0xf000 204; SI-NEXT: s_mov_b32 s2, 0 205; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 206; SI-NEXT: v_mov_b32_e32 v1, 0 207; SI-NEXT: s_waitcnt lgkmcnt(0) 208; SI-NEXT: v_mov_b32_e32 v2, s5 209; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 210; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc 211; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 212; SI-NEXT: s_endpgm 213; 214; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: 215; VI: ; %bb.0: 216; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 217; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c 218; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 219; VI-NEXT: s_waitcnt lgkmcnt(0) 220; VI-NEXT: v_mov_b32_e32 v1, s1 221; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 222; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 223; VI-NEXT: v_mov_b32_e32 v2, s3 224; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 225; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc 226; VI-NEXT: flat_store_dword v[0:1], v2 227; VI-NEXT: s_endpgm 228; 229; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: 230; GFX10: ; %bb.0: 231; GFX10-NEXT: s_clause 0x1 232; GFX10-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x4c 233; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 234; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 235; GFX10-NEXT: s_waitcnt lgkmcnt(0) 236; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 237; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] 238; GFX10-NEXT: global_store_dword v0, v1, s[2:3] 239; GFX10-NEXT: s_endpgm 240; 241; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: 242; GFX11: ; %bb.0: 243; GFX11-NEXT: s_clause 0x1 244; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c 245; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 246; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 247; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 248; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 249; GFX11-NEXT: s_waitcnt lgkmcnt(0) 250; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 251; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s1, s[4:5] 252; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] 253; GFX11-NEXT: s_endpgm 254; 255; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32: 256; GFX12: ; %bb.0: 257; GFX12-NEXT: s_clause 0x1 258; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c 259; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 260; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 261; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 262; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 263; GFX12-NEXT: s_wait_kmcnt 0x0 264; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 265; GFX12-NEXT: s_cselect_b32 s0, s1, 1.0 266; GFX12-NEXT: v_mov_b32_e32 v1, s0 267; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] 268; GFX12-NEXT: s_endpgm 269 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 270 %tid.ext = sext i32 %tid to i64 271 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 272 %setcc = fcmp one float %x, 0.0 273 %select = select i1 %setcc, float 1.0, float %z 274 store float %select, ptr addrspace(1) %out.gep 275 ret void 276} 277 278define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %out, float %x) 
#0 { 279; SI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: 280; SI: ; %bb.0: 281; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 282; SI-NEXT: s_load_dword s4, s[4:5], 0xb 283; SI-NEXT: s_mov_b32 s3, 0xf000 284; SI-NEXT: s_mov_b32 s2, 0 285; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 286; SI-NEXT: v_mov_b32_e32 v1, 0 287; SI-NEXT: s_waitcnt lgkmcnt(0) 288; SI-NEXT: v_mov_b32_e32 v2, s4 289; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 290; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc 291; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 292; SI-NEXT: s_endpgm 293; 294; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: 295; VI: ; %bb.0: 296; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 297; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 298; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 299; VI-NEXT: s_waitcnt lgkmcnt(0) 300; VI-NEXT: v_mov_b32_e32 v1, s1 301; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 302; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 303; VI-NEXT: v_mov_b32_e32 v2, s2 304; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 305; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc 306; VI-NEXT: flat_store_dword v[0:1], v2 307; VI-NEXT: s_endpgm 308; 309; GFX10-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: 310; GFX10: ; %bb.0: 311; GFX10-NEXT: s_clause 0x1 312; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c 313; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 314; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 315; GFX10-NEXT: s_waitcnt lgkmcnt(0) 316; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 317; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] 318; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 319; GFX10-NEXT: s_endpgm 320; 321; GFX11-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: 322; GFX11: ; %bb.0: 323; GFX11-NEXT: s_clause 0x1 324; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c 325; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 326; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 327; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 328; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 329; GFX11-NEXT: s_waitcnt 
lgkmcnt(0) 330; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 331; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s6, s[2:3] 332; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 333; GFX11-NEXT: s_endpgm 334; 335; GFX12-LABEL: fcmp_sgprX_k0_select_k1_sgprX_f32: 336; GFX12: ; %bb.0: 337; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 338; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 339; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 340; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 341; GFX12-NEXT: s_wait_kmcnt 0x0 342; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 343; GFX12-NEXT: s_cselect_b32 s2, s2, 1.0 344; GFX12-NEXT: v_mov_b32_e32 v1, s2 345; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 346; GFX12-NEXT: s_endpgm 347 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 348 %tid.ext = sext i32 %tid to i64 349 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 350 %setcc = fcmp one float %x, 0.0 351 %select = select i1 %setcc, float 1.0, float %x 352 store float %select, ptr addrspace(1) %out.gep 353 ret void 354} 355 356define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %out, [8 x i32], float %x, float %z) #0 { 357; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: 358; SI: ; %bb.0: 359; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 360; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 361; SI-NEXT: s_mov_b32 s3, 0xf000 362; SI-NEXT: s_mov_b32 s2, 0 363; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 364; SI-NEXT: v_mov_b32_e32 v1, 0 365; SI-NEXT: s_waitcnt lgkmcnt(0) 366; SI-NEXT: v_mov_b32_e32 v2, s5 367; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 368; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 369; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 370; SI-NEXT: s_endpgm 371; 372; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: 373; VI: ; %bb.0: 374; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 375; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x4c 376; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 377; VI-NEXT: s_waitcnt lgkmcnt(0) 378; VI-NEXT: 
v_mov_b32_e32 v1, s1 379; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 380; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 381; VI-NEXT: v_mov_b32_e32 v2, s3 382; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 383; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 384; VI-NEXT: flat_store_dword v[0:1], v2 385; VI-NEXT: s_endpgm 386; 387; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: 388; GFX10: ; %bb.0: 389; GFX10-NEXT: s_clause 0x1 390; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c 391; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 392; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 393; GFX10-NEXT: s_waitcnt lgkmcnt(0) 394; GFX10-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 395; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] 396; GFX10-NEXT: global_store_dword v0, v1, s[2:3] 397; GFX10-NEXT: s_endpgm 398; 399; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: 400; GFX11: ; %bb.0: 401; GFX11-NEXT: s_clause 0x1 402; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c 403; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 404; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 405; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 406; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 407; GFX11-NEXT: s_waitcnt lgkmcnt(0) 408; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s0, 0 409; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s1, s[4:5] 410; GFX11-NEXT: global_store_b32 v0, v1, s[2:3] 411; GFX11-NEXT: s_endpgm 412; 413; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32: 414; GFX12: ; %bb.0: 415; GFX12-NEXT: s_clause 0x1 416; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c 417; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 418; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 419; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 420; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 421; GFX12-NEXT: s_wait_kmcnt 0x0 422; GFX12-NEXT: s_cmp_nlg_f32 s0, 0 423; GFX12-NEXT: s_cselect_b32 s0, s1, 0 424; GFX12-NEXT: v_mov_b32_e32 v1, s0 425; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] 426; GFX12-NEXT: s_endpgm 
427 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 428 %tid.ext = sext i32 %tid to i64 429 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 430 %setcc = fcmp one float %x, 0.0 431 %select = select i1 %setcc, float 0.0, float %z 432 store float %select, ptr addrspace(1) %out.gep 433 ret void 434} 435 436define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %out, float %x) #0 { 437; SI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: 438; SI: ; %bb.0: 439; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 440; SI-NEXT: s_load_dword s4, s[4:5], 0xb 441; SI-NEXT: s_mov_b32 s3, 0xf000 442; SI-NEXT: s_mov_b32 s2, 0 443; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 444; SI-NEXT: v_mov_b32_e32 v1, 0 445; SI-NEXT: s_waitcnt lgkmcnt(0) 446; SI-NEXT: v_mov_b32_e32 v2, s4 447; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0 448; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 449; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 450; SI-NEXT: s_endpgm 451; 452; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: 453; VI: ; %bb.0: 454; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 455; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 456; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 457; VI-NEXT: s_waitcnt lgkmcnt(0) 458; VI-NEXT: v_mov_b32_e32 v1, s1 459; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 460; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 461; VI-NEXT: v_mov_b32_e32 v2, s2 462; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 463; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 464; VI-NEXT: flat_store_dword v[0:1], v2 465; VI-NEXT: s_endpgm 466; 467; GFX10-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: 468; GFX10: ; %bb.0: 469; GFX10-NEXT: s_clause 0x1 470; GFX10-NEXT: s_load_dword s6, s[4:5], 0x2c 471; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 472; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 473; GFX10-NEXT: s_waitcnt lgkmcnt(0) 474; GFX10-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 475; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] 476; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 477; 
GFX10-NEXT: s_endpgm 478; 479; GFX11-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: 480; GFX11: ; %bb.0: 481; GFX11-NEXT: s_clause 0x1 482; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x2c 483; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 484; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 485; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) 486; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 487; GFX11-NEXT: s_waitcnt lgkmcnt(0) 488; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s6, 0 489; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s6, s[2:3] 490; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 491; GFX11-NEXT: s_endpgm 492; 493; GFX12-LABEL: fcmp_sgprX_k0_select_k0_sgprX_f32: 494; GFX12: ; %bb.0: 495; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 496; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 497; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) 498; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 499; GFX12-NEXT: s_wait_kmcnt 0x0 500; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 501; GFX12-NEXT: s_cselect_b32 s2, s2, 0 502; GFX12-NEXT: v_mov_b32_e32 v1, s2 503; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 504; GFX12-NEXT: s_endpgm 505 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 506 %tid.ext = sext i32 %tid to i64 507 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 508 %setcc = fcmp one float %x, 0.0 509 %select = select i1 %setcc, float 0.0, float %x 510 store float %select, ptr addrspace(1) %out.gep 511 ret void 512} 513 514define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { 515; SI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: 516; SI: ; %bb.0: 517; SI-NEXT: s_load_dword s6, s[4:5], 0xb 518; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 519; SI-NEXT: s_mov_b32 s3, 0xf000 520; SI-NEXT: s_mov_b32 s2, 0 521; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 522; SI-NEXT: v_mov_b32_e32 v1, 0 523; SI-NEXT: s_waitcnt lgkmcnt(0) 524; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 
0 addr64 525; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 526; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0 527; SI-NEXT: s_waitcnt vmcnt(0) 528; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc 529; SI-NEXT: s_waitcnt lgkmcnt(0) 530; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 531; SI-NEXT: s_endpgm 532; 533; VI-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: 534; VI: ; %bb.0: 535; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 536; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 537; VI-NEXT: s_waitcnt lgkmcnt(0) 538; VI-NEXT: v_mov_b32_e32 v1, s1 539; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 540; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 541; VI-NEXT: flat_load_dword v3, v[0:1] 542; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 543; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 544; VI-NEXT: s_waitcnt lgkmcnt(0) 545; VI-NEXT: v_mov_b32_e32 v1, s1 546; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 547; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 548; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 549; VI-NEXT: s_waitcnt vmcnt(0) 550; VI-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc 551; VI-NEXT: flat_store_dword v[0:1], v2 552; VI-NEXT: s_endpgm 553; 554; GFX10-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: 555; GFX10: ; %bb.0: 556; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 557; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 558; GFX10-NEXT: s_waitcnt lgkmcnt(0) 559; GFX10-NEXT: global_load_dword v1, v0, s[0:1] 560; GFX10-NEXT: s_clause 0x1 561; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 562; GFX10-NEXT: s_waitcnt_depctr 0xffe3 563; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 564; GFX10-NEXT: s_waitcnt lgkmcnt(0) 565; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 566; GFX10-NEXT: s_waitcnt vmcnt(0) 567; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 568; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 569; GFX10-NEXT: s_endpgm 570; 571; GFX11-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: 572; GFX11: ; %bb.0: 573; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 574; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 575; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) 576; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 577; GFX11-NEXT: s_waitcnt lgkmcnt(0) 578; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] 579; GFX11-NEXT: s_clause 0x1 580; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 581; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 582; GFX11-NEXT: s_waitcnt lgkmcnt(0) 583; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 584; GFX11-NEXT: s_waitcnt vmcnt(0) 585; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 586; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 587; GFX11-NEXT: s_endpgm 588; 589; GFX12-LABEL: fcmp_sgprX_k0_select_k0_vgprZ_f32: 590; GFX12: ; %bb.0: 591; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 592; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 593; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 594; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 595; GFX12-NEXT: s_wait_kmcnt 0x0 596; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] 597; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 598; GFX12-NEXT: s_wait_kmcnt 0x0 599; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 600; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 601; GFX12-NEXT: s_wait_loadcnt 0x0 602; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 603; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 604; GFX12-NEXT: s_endpgm 605 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 606 %tid.ext = sext i32 %tid to i64 607 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext 608 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 609 %z = load float, ptr addrspace(1) %z.gep 610 %setcc = fcmp one float %x, 0.0 611 %select = select i1 %setcc, float 0.0, float %z 612 store float %select, ptr addrspace(1) %out.gep 613 ret void 614} 615 616define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, float %x, ptr addrspace(1) %z.ptr) #0 { 617; SI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: 618; SI: ; %bb.0: 619; SI-NEXT: s_load_dword s6, s[4:5], 0xb 620; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 621; SI-NEXT: s_mov_b32 s3, 0xf000 622; SI-NEXT: s_mov_b32 s2, 0 
623; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 624; SI-NEXT: v_mov_b32_e32 v1, 0 625; SI-NEXT: s_waitcnt lgkmcnt(0) 626; SI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 627; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 628; SI-NEXT: v_cmp_nlg_f32_e64 vcc, s6, 0 629; SI-NEXT: s_waitcnt vmcnt(0) 630; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc 631; SI-NEXT: s_waitcnt lgkmcnt(0) 632; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 633; SI-NEXT: s_endpgm 634; 635; VI-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: 636; VI: ; %bb.0: 637; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 638; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 639; VI-NEXT: s_waitcnt lgkmcnt(0) 640; VI-NEXT: v_mov_b32_e32 v1, s1 641; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 642; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 643; VI-NEXT: flat_load_dword v3, v[0:1] 644; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 645; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 646; VI-NEXT: s_waitcnt lgkmcnt(0) 647; VI-NEXT: v_mov_b32_e32 v1, s1 648; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 649; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 650; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 651; VI-NEXT: s_waitcnt vmcnt(0) 652; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc 653; VI-NEXT: flat_store_dword v[0:1], v2 654; VI-NEXT: s_endpgm 655; 656; GFX10-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: 657; GFX10: ; %bb.0: 658; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 659; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 660; GFX10-NEXT: s_waitcnt lgkmcnt(0) 661; GFX10-NEXT: global_load_dword v1, v0, s[0:1] 662; GFX10-NEXT: s_clause 0x1 663; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 664; GFX10-NEXT: s_waitcnt_depctr 0xffe3 665; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 666; GFX10-NEXT: s_waitcnt lgkmcnt(0) 667; GFX10-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 668; GFX10-NEXT: s_waitcnt vmcnt(0) 669; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc 670; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 671; GFX10-NEXT: s_endpgm 672; 673; GFX11-LABEL: 
fcmp_sgprX_k0_select_k1_vgprZ_f32: 674; GFX11: ; %bb.0: 675; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 676; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 677; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 678; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 679; GFX11-NEXT: s_waitcnt lgkmcnt(0) 680; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] 681; GFX11-NEXT: s_clause 0x1 682; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 683; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 684; GFX11-NEXT: s_waitcnt lgkmcnt(0) 685; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0 686; GFX11-NEXT: s_waitcnt vmcnt(0) 687; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc 688; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 689; GFX11-NEXT: s_endpgm 690; 691; GFX12-LABEL: fcmp_sgprX_k0_select_k1_vgprZ_f32: 692; GFX12: ; %bb.0: 693; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 694; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 695; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 696; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 697; GFX12-NEXT: s_wait_kmcnt 0x0 698; GFX12-NEXT: global_load_b32 v1, v0, s[0:1] 699; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 700; GFX12-NEXT: s_wait_kmcnt 0x0 701; GFX12-NEXT: s_cmp_nlg_f32 s2, 0 702; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 703; GFX12-NEXT: s_wait_loadcnt 0x0 704; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc 705; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 706; GFX12-NEXT: s_endpgm 707 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 708 %tid.ext = sext i32 %tid to i64 709 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext 710 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 711 %z = load float, ptr addrspace(1) %z.gep 712 %setcc = fcmp one float %x, 0.0 713 %select = select i1 %setcc, float 1.0, float %z 714 store float %select, ptr addrspace(1) %out.gep 715 ret void 716} 717 718define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, float %z) #0 { 719; SI-LABEL: 
fcmp_vgprX_k0_select_k1_sgprZ_f32: 720; SI: ; %bb.0: 721; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 722; SI-NEXT: s_load_dword s8, s[4:5], 0xd 723; SI-NEXT: s_mov_b32 s7, 0xf000 724; SI-NEXT: s_mov_b32 s6, 0 725; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 726; SI-NEXT: v_mov_b32_e32 v1, 0 727; SI-NEXT: s_waitcnt lgkmcnt(0) 728; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 729; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 730; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 731; SI-NEXT: v_mov_b32_e32 v3, s8 732; SI-NEXT: s_waitcnt vmcnt(0) 733; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v2 734; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc 735; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 736; SI-NEXT: s_endpgm 737; 738; VI-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: 739; VI: ; %bb.0: 740; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 741; VI-NEXT: s_load_dword s4, s[4:5], 0x34 742; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 743; VI-NEXT: s_waitcnt lgkmcnt(0) 744; VI-NEXT: v_mov_b32_e32 v1, s3 745; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 746; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 747; VI-NEXT: flat_load_dword v3, v[0:1] 748; VI-NEXT: v_mov_b32_e32 v1, s1 749; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 750; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 751; VI-NEXT: v_mov_b32_e32 v4, s4 752; VI-NEXT: s_waitcnt vmcnt(0) 753; VI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v3 754; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v4, vcc 755; VI-NEXT: flat_store_dword v[0:1], v2 756; VI-NEXT: s_endpgm 757; 758; GFX10-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: 759; GFX10: ; %bb.0: 760; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 761; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 762; GFX10-NEXT: s_load_dword s4, s[4:5], 0x34 763; GFX10-NEXT: s_waitcnt lgkmcnt(0) 764; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 765; GFX10-NEXT: s_waitcnt vmcnt(0) 766; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 767; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc 768; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 769; GFX10-NEXT: s_endpgm 770; 
771; GFX11-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: 772; GFX11: ; %bb.0: 773; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 774; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 775; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34 776; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 777; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 778; GFX11-NEXT: s_waitcnt lgkmcnt(0) 779; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 780; GFX11-NEXT: s_waitcnt vmcnt(0) 781; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 782; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc 783; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 784; GFX11-NEXT: s_endpgm 785; 786; GFX12-LABEL: fcmp_vgprX_k0_select_k1_sgprZ_f32: 787; GFX12: ; %bb.0: 788; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 789; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 790; GFX12-NEXT: s_load_b32 s4, s[4:5], 0x34 791; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 792; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 793; GFX12-NEXT: s_wait_kmcnt 0x0 794; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] 795; GFX12-NEXT: s_wait_loadcnt 0x0 796; GFX12-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 797; GFX12-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, vcc 798; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 799; GFX12-NEXT: s_endpgm 800 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 801 %tid.ext = sext i32 %tid to i64 802 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext 803 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 804 %x = load float, ptr addrspace(1) %x.gep 805 %setcc = fcmp olt float %x, 0.0 806 %select = select i1 %setcc, float 1.0, float %z 807 store float %select, ptr addrspace(1) %out.gep 808 ret void 809} 810 811define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 812; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: 813; SI: ; %bb.0: 814; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 815; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 816; SI-NEXT: s_mov_b32 s11, 
0xf000 817; SI-NEXT: s_mov_b32 s10, 0 818; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 819; SI-NEXT: v_mov_b32_e32 v1, 0 820; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 821; SI-NEXT: s_waitcnt lgkmcnt(0) 822; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 823; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc 824; SI-NEXT: s_waitcnt vmcnt(0) 825; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc 826; SI-NEXT: s_waitcnt vmcnt(0) 827; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 828; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2 829; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc 830; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 831; SI-NEXT: s_endpgm 832; 833; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: 834; VI: ; %bb.0: 835; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 836; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 837; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 838; VI-NEXT: s_waitcnt lgkmcnt(0) 839; VI-NEXT: v_mov_b32_e32 v1, s3 840; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 841; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 842; VI-NEXT: v_mov_b32_e32 v3, s5 843; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 844; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 845; VI-NEXT: flat_load_dword v5, v[0:1] glc 846; VI-NEXT: s_waitcnt vmcnt(0) 847; VI-NEXT: flat_load_dword v2, v[2:3] glc 848; VI-NEXT: s_waitcnt vmcnt(0) 849; VI-NEXT: v_mov_b32_e32 v1, s1 850; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 851; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 852; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v5 853; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc 854; VI-NEXT: flat_store_dword v[0:1], v2 855; VI-NEXT: s_endpgm 856; 857; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: 858; GFX10: ; %bb.0: 859; GFX10-NEXT: s_clause 0x1 860; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 861; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 862; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 863; GFX10-NEXT: s_waitcnt lgkmcnt(0) 864; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 865; GFX10-NEXT: s_waitcnt vmcnt(0) 866; GFX10-NEXT: 
global_load_dword v2, v0, s[6:7] glc dlc 867; GFX10-NEXT: s_waitcnt vmcnt(0) 868; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 869; GFX10-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc 870; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 871; GFX10-NEXT: s_endpgm 872; 873; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: 874; GFX11: ; %bb.0: 875; GFX11-NEXT: s_clause 0x1 876; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 877; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 878; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 879; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 880; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 881; GFX11-NEXT: s_waitcnt lgkmcnt(0) 882; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 883; GFX11-NEXT: s_waitcnt vmcnt(0) 884; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 885; GFX11-NEXT: s_waitcnt vmcnt(0) 886; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 887; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc 888; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 889; GFX11-NEXT: s_endpgm 890; 891; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32: 892; GFX12: ; %bb.0: 893; GFX12-NEXT: s_clause 0x1 894; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 895; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 896; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 897; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 898; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 899; GFX12-NEXT: s_wait_kmcnt 0x0 900; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS 901; GFX12-NEXT: s_wait_loadcnt 0x0 902; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS 903; GFX12-NEXT: s_wait_loadcnt 0x0 904; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 905; GFX12-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc 906; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 907; GFX12-NEXT: s_endpgm 908 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 909 %tid.ext = sext i32 %tid to i64 910 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext 911 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext 912 
%out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 913 %x = load volatile float, ptr addrspace(1) %x.gep 914 %z = load volatile float, ptr addrspace(1) %z.gep 915 %setcc = fcmp ult float %x, 0.0 916 %select = select i1 %setcc, float 1.0, float %z 917 store float %select, ptr addrspace(1) %out.gep 918 ret void 919} 920 921define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 922; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: 923; SI: ; %bb.0: 924; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 925; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 926; SI-NEXT: s_mov_b32 s11, 0xf000 927; SI-NEXT: s_mov_b32 s10, 0 928; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 929; SI-NEXT: v_mov_b32_e32 v1, 0 930; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 931; SI-NEXT: s_waitcnt lgkmcnt(0) 932; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 933; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc 934; SI-NEXT: s_waitcnt vmcnt(0) 935; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc 936; SI-NEXT: s_waitcnt vmcnt(0) 937; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 938; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 939; SI-NEXT: v_cndmask_b32_e32 v2, 2, v3, vcc 940; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 941; SI-NEXT: s_endpgm 942; 943; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: 944; VI: ; %bb.0: 945; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 946; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 947; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 948; VI-NEXT: s_waitcnt lgkmcnt(0) 949; VI-NEXT: v_mov_b32_e32 v1, s3 950; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 951; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 952; VI-NEXT: v_mov_b32_e32 v3, s5 953; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 954; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 955; VI-NEXT: flat_load_dword v5, v[0:1] glc 956; VI-NEXT: s_waitcnt vmcnt(0) 957; VI-NEXT: flat_load_dword v2, v[2:3] glc 958; VI-NEXT: s_waitcnt vmcnt(0) 959; 
VI-NEXT: v_mov_b32_e32 v1, s1 960; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 961; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 962; VI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v5 963; VI-NEXT: v_cndmask_b32_e32 v2, 2, v2, vcc 964; VI-NEXT: flat_store_dword v[0:1], v2 965; VI-NEXT: s_endpgm 966; 967; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: 968; GFX10: ; %bb.0: 969; GFX10-NEXT: s_clause 0x1 970; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 971; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 972; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 973; GFX10-NEXT: s_waitcnt lgkmcnt(0) 974; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 975; GFX10-NEXT: s_waitcnt vmcnt(0) 976; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 977; GFX10-NEXT: s_waitcnt vmcnt(0) 978; GFX10-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 979; GFX10-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc 980; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 981; GFX10-NEXT: s_endpgm 982; 983; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: 984; GFX11: ; %bb.0: 985; GFX11-NEXT: s_clause 0x1 986; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 987; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 988; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 989; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 990; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 991; GFX11-NEXT: s_waitcnt lgkmcnt(0) 992; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 993; GFX11-NEXT: s_waitcnt vmcnt(0) 994; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 995; GFX11-NEXT: s_waitcnt vmcnt(0) 996; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 997; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc 998; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 999; GFX11-NEXT: s_endpgm 1000; 1001; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32: 1002; GFX12: ; %bb.0: 1003; GFX12-NEXT: s_clause 0x1 1004; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1005; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1006; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1007; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1008; 
GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1009; GFX12-NEXT: s_wait_kmcnt 0x0 1010; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS 1011; GFX12-NEXT: s_wait_loadcnt 0x0 1012; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS 1013; GFX12-NEXT: s_wait_loadcnt 0x0 1014; GFX12-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 1015; GFX12-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc 1016; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 1017; GFX12-NEXT: s_endpgm 1018 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1019 %tid.ext = sext i32 %tid to i64 1020 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext 1021 %z.gep = getelementptr inbounds i32, ptr addrspace(1) %z.ptr, i64 %tid.ext 1022 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %tid.ext 1023 %x = load volatile i32, ptr addrspace(1) %x.gep 1024 %z = load volatile i32, ptr addrspace(1) %z.gep 1025 %setcc = icmp slt i32 %x, 0 1026 %select = select i1 %setcc, i32 2, i32 %z 1027 store i32 %select, ptr addrspace(1) %out.gep 1028 ret void 1029} 1030 1031define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 1032; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: 1033; SI: ; %bb.0: 1034; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1035; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1036; SI-NEXT: s_mov_b32 s11, 0xf000 1037; SI-NEXT: s_mov_b32 s10, 0 1038; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1039; SI-NEXT: v_mov_b32_e32 v1, 0 1040; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 1041; SI-NEXT: s_waitcnt lgkmcnt(0) 1042; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1043; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 glc 1044; SI-NEXT: s_waitcnt vmcnt(0) 1045; SI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 glc 1046; SI-NEXT: s_waitcnt vmcnt(0) 1047; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 1048; SI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[2:3] 1049; SI-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc 1050; SI-NEXT: 
v_cndmask_b32_e32 v2, 2, v4, vcc 1051; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 1052; SI-NEXT: s_endpgm 1053; 1054; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: 1055; VI: ; %bb.0: 1056; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1057; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1058; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1059; VI-NEXT: s_waitcnt lgkmcnt(0) 1060; VI-NEXT: v_mov_b32_e32 v1, s3 1061; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1062; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1063; VI-NEXT: v_mov_b32_e32 v3, s5 1064; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1065; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1066; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc 1067; VI-NEXT: s_waitcnt vmcnt(0) 1068; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc 1069; VI-NEXT: s_waitcnt vmcnt(0) 1070; VI-NEXT: v_mov_b32_e32 v5, s1 1071; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 1072; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 1073; VI-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] 1074; VI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc 1075; VI-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc 1076; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 1077; VI-NEXT: s_endpgm 1078; 1079; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: 1080; GFX10: ; %bb.0: 1081; GFX10-NEXT: s_clause 0x1 1082; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1083; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1084; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1085; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1086; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc 1087; GFX10-NEXT: s_waitcnt vmcnt(0) 1088; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7] glc dlc 1089; GFX10-NEXT: s_waitcnt vmcnt(0) 1090; GFX10-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] 1091; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc 1092; GFX10-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc 1093; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] 1094; GFX10-NEXT: s_endpgm 1095; 1096; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: 1097; GFX11: ; %bb.0: 
1098; GFX11-NEXT: s_clause 0x1 1099; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1100; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1101; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1102; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1103; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1104; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1105; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3] glc dlc 1106; GFX11-NEXT: s_waitcnt vmcnt(0) 1107; GFX11-NEXT: global_load_b64 v[2:3], v4, s[4:5] glc dlc 1108; GFX11-NEXT: s_waitcnt vmcnt(0) 1109; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] 1110; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc 1111; GFX11-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc 1112; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1] 1113; GFX11-NEXT: s_endpgm 1114; 1115; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64: 1116; GFX12: ; %bb.0: 1117; GFX12-NEXT: s_clause 0x1 1118; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1119; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1120; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1121; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1122; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1123; GFX12-NEXT: s_wait_kmcnt 0x0 1124; GFX12-NEXT: global_load_b64 v[0:1], v4, s[2:3] scope:SCOPE_SYS 1125; GFX12-NEXT: s_wait_loadcnt 0x0 1126; GFX12-NEXT: global_load_b64 v[2:3], v4, s[4:5] scope:SCOPE_SYS 1127; GFX12-NEXT: s_wait_loadcnt 0x0 1128; GFX12-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1] 1129; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc 1130; GFX12-NEXT: v_cndmask_b32_e32 v0, 2, v2, vcc 1131; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1] 1132; GFX12-NEXT: s_endpgm 1133 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1134 %tid.ext = sext i32 %tid to i64 1135 %x.gep = getelementptr inbounds i64, ptr addrspace(1) %x.ptr, i64 %tid.ext 1136 %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext 1137 %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext 1138 %x = load volatile i64, ptr addrspace(1) %x.gep 1139 %z = load volatile i64, ptr 
addrspace(1) %z.gep 1140 %setcc = icmp slt i64 %x, 0 1141 %select = select i1 %setcc, i64 2, i64 %z 1142 store i64 %select, ptr addrspace(1) %out.gep 1143 ret void 1144} 1145 1146define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 1147; SI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: 1148; SI: ; %bb.0: 1149; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1150; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1151; SI-NEXT: s_mov_b32 s11, 0xf000 1152; SI-NEXT: s_mov_b32 s10, 0 1153; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1154; SI-NEXT: v_mov_b32_e32 v2, 0 1155; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1156; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 1157; SI-NEXT: v_mov_b32_e32 v5, v2 1158; SI-NEXT: s_waitcnt lgkmcnt(0) 1159; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1160; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc 1161; SI-NEXT: s_waitcnt vmcnt(0) 1162; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc 1163; SI-NEXT: s_waitcnt vmcnt(0) 1164; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 1165; SI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 1166; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1167; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1168; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1169; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1170; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 1171; SI-NEXT: s_endpgm 1172; 1173; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: 1174; VI: ; %bb.0: 1175; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1176; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1177; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1178; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 1179; VI-NEXT: s_waitcnt lgkmcnt(0) 1180; VI-NEXT: v_mov_b32_e32 v2, s3 1181; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1182; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1183; VI-NEXT: v_mov_b32_e32 v0, s5 1184; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 1185; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc 1186; 
VI-NEXT: flat_load_dword v6, v[1:2] glc 1187; VI-NEXT: s_waitcnt vmcnt(0) 1188; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc 1189; VI-NEXT: s_waitcnt vmcnt(0) 1190; VI-NEXT: v_mov_b32_e32 v7, s1 1191; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 1192; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 1193; VI-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 1194; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1195; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1196; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1197; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1198; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1199; VI-NEXT: s_endpgm 1200; 1201; GFX10-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: 1202; GFX10: ; %bb.0: 1203; GFX10-NEXT: s_clause 0x1 1204; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1205; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1206; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1207; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 1208; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1209; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc 1210; GFX10-NEXT: s_waitcnt vmcnt(0) 1211; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc 1212; GFX10-NEXT: s_waitcnt vmcnt(0) 1213; GFX10-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v6 1214; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1215; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1216; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1217; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1218; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 1219; GFX10-NEXT: s_endpgm 1220; 1221; GFX11-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: 1222; GFX11: ; %bb.0: 1223; GFX11-NEXT: s_clause 0x1 1224; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1225; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1226; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1227; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1228; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1229; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1230; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1231; GFX11-NEXT: global_load_b32 v5, v1, 
s[2:3] glc dlc 1232; GFX11-NEXT: s_waitcnt vmcnt(0) 1233; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc 1234; GFX11-NEXT: s_waitcnt vmcnt(0) 1235; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 1236; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1237; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1238; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1239; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1240; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1241; GFX11-NEXT: s_endpgm 1242; 1243; GFX12-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32: 1244; GFX12: ; %bb.0: 1245; GFX12-NEXT: s_clause 0x1 1246; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1247; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1248; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1249; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1250; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1251; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1252; GFX12-NEXT: s_wait_kmcnt 0x0 1253; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS 1254; GFX12-NEXT: s_wait_loadcnt 0x0 1255; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS 1256; GFX12-NEXT: s_wait_loadcnt 0x0 1257; GFX12-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5 1258; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1259; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1260; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1261; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1262; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1263; GFX12-NEXT: s_endpgm 1264 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1265 %tid.ext = sext i32 %tid to i64 1266 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext 1267 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext 1268 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext 1269 %x = load volatile float, ptr addrspace(1) %x.gep 1270 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep 1271 %setcc = fcmp ugt float %x, 4.0 1272 
%select = select i1 %setcc, <4 x float> %z, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0> 1273 store <4 x float> %select, ptr addrspace(1) %out.gep 1274 ret void 1275} 1276 1277define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 1278; SI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: 1279; SI: ; %bb.0: 1280; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1281; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1282; SI-NEXT: s_mov_b32 s11, 0xf000 1283; SI-NEXT: s_mov_b32 s10, 0 1284; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1285; SI-NEXT: v_mov_b32_e32 v2, 0 1286; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1287; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 1288; SI-NEXT: v_mov_b32_e32 v5, v2 1289; SI-NEXT: s_waitcnt lgkmcnt(0) 1290; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1291; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc 1292; SI-NEXT: s_waitcnt vmcnt(0) 1293; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc 1294; SI-NEXT: s_waitcnt vmcnt(0) 1295; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 1296; SI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 1297; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1298; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1299; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1300; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1301; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 1302; SI-NEXT: s_endpgm 1303; 1304; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: 1305; VI: ; %bb.0: 1306; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1307; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1308; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1309; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 1310; VI-NEXT: s_waitcnt lgkmcnt(0) 1311; VI-NEXT: v_mov_b32_e32 v2, s3 1312; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1313; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1314; VI-NEXT: v_mov_b32_e32 v0, s5 1315; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5 1316; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc 
1317; VI-NEXT: flat_load_dword v6, v[1:2] glc 1318; VI-NEXT: s_waitcnt vmcnt(0) 1319; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc 1320; VI-NEXT: s_waitcnt vmcnt(0) 1321; VI-NEXT: v_mov_b32_e32 v7, s1 1322; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 1323; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 1324; VI-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 1325; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1326; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1327; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1328; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1329; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1330; VI-NEXT: s_endpgm 1331; 1332; GFX10-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: 1333; GFX10: ; %bb.0: 1334; GFX10-NEXT: s_clause 0x1 1335; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1336; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1337; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1338; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 1339; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1340; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc 1341; GFX10-NEXT: s_waitcnt vmcnt(0) 1342; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc 1343; GFX10-NEXT: s_waitcnt vmcnt(0) 1344; GFX10-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v6 1345; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1346; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1347; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1348; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1349; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 1350; GFX10-NEXT: s_endpgm 1351; 1352; GFX11-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: 1353; GFX11: ; %bb.0: 1354; GFX11-NEXT: s_clause 0x1 1355; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1356; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1357; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1358; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1359; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1360; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1361; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1362; GFX11-NEXT: global_load_b32 v5, v1, 
s[2:3] glc dlc 1363; GFX11-NEXT: s_waitcnt vmcnt(0) 1364; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc 1365; GFX11-NEXT: s_waitcnt vmcnt(0) 1366; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 1367; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1368; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1369; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1370; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1371; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1372; GFX11-NEXT: s_endpgm 1373; 1374; GFX12-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32: 1375; GFX12: ; %bb.0: 1376; GFX12-NEXT: s_clause 0x1 1377; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1378; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1379; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1380; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1381; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1382; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1383; GFX12-NEXT: s_wait_kmcnt 0x0 1384; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS 1385; GFX12-NEXT: s_wait_loadcnt 0x0 1386; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS 1387; GFX12-NEXT: s_wait_loadcnt 0x0 1388; GFX12-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5 1389; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1390; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1391; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1392; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1393; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1394; GFX12-NEXT: s_endpgm 1395 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1396 %tid.ext = sext i32 %tid to i64 1397 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext 1398 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext 1399 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext 1400 %x = load volatile float, ptr addrspace(1) %x.gep 1401 %z = load volatile <4 x float>, ptr addrspace(1) %z.gep 1402 %setcc = fcmp ugt float %x, 4.0 1403 %select 
= select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z 1404 store <4 x float> %select, ptr addrspace(1) %out.gep 1405 ret void 1406} 1407 1408; The select's operands must be swapped (and the compare inverted) while the select is still a vector type, before the condition has 1409; multiple uses. 1410define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 1411; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: 1412; SI: ; %bb.0: 1413; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1414; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1415; SI-NEXT: s_mov_b32 s11, 0xf000 1416; SI-NEXT: s_mov_b32 s10, 0 1417; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1418; SI-NEXT: v_mov_b32_e32 v2, 0 1419; SI-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1420; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 1421; SI-NEXT: v_mov_b32_e32 v5, v2 1422; SI-NEXT: s_waitcnt lgkmcnt(0) 1423; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1424; SI-NEXT: buffer_load_dword v6, v[1:2], s[8:11], 0 addr64 glc 1425; SI-NEXT: s_waitcnt vmcnt(0) 1426; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 glc 1427; SI-NEXT: s_waitcnt vmcnt(0) 1428; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 1429; SI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 1430; SI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1431; SI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1432; SI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1433; SI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1434; SI-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64 1435; SI-NEXT: s_endpgm 1436; 1437; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: 1438; VI: ; %bb.0: 1439; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1440; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1441; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1442; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0 1443; VI-NEXT: s_waitcnt lgkmcnt(0) 1444; VI-NEXT: v_mov_b32_e32 v2, s3 1445; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1446; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1447; VI-NEXT: v_mov_b32_e32 v0, s5 1448; VI-NEXT:
v_add_u32_e32 v3, vcc, s4, v5 1449; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc 1450; VI-NEXT: flat_load_dword v6, v[1:2] glc 1451; VI-NEXT: s_waitcnt vmcnt(0) 1452; VI-NEXT: flat_load_dwordx4 v[0:3], v[3:4] glc 1453; VI-NEXT: s_waitcnt vmcnt(0) 1454; VI-NEXT: v_mov_b32_e32 v7, s1 1455; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v5 1456; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 1457; VI-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 1458; VI-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1459; VI-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1460; VI-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1461; VI-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1462; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1463; VI-NEXT: s_endpgm 1464; 1465; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: 1466; GFX10: ; %bb.0: 1467; GFX10-NEXT: s_clause 0x1 1468; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1469; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1470; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1471; GFX10-NEXT: v_lshlrev_b32_e32 v5, 4, v0 1472; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1473; GFX10-NEXT: global_load_dword v6, v4, s[2:3] glc dlc 1474; GFX10-NEXT: s_waitcnt vmcnt(0) 1475; GFX10-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] glc dlc 1476; GFX10-NEXT: s_waitcnt vmcnt(0) 1477; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v6 1478; GFX10-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1479; GFX10-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1480; GFX10-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1481; GFX10-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1482; GFX10-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 1483; GFX10-NEXT: s_endpgm 1484; 1485; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: 1486; GFX11: ; %bb.0: 1487; GFX11-NEXT: s_clause 0x1 1488; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1489; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1490; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1491; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1492; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1493; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0 
1494; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1495; GFX11-NEXT: global_load_b32 v5, v1, s[2:3] glc dlc 1496; GFX11-NEXT: s_waitcnt vmcnt(0) 1497; GFX11-NEXT: global_load_b128 v[0:3], v4, s[4:5] glc dlc 1498; GFX11-NEXT: s_waitcnt vmcnt(0) 1499; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 1500; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1501; GFX11-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1502; GFX11-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1503; GFX11-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1504; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1505; GFX11-NEXT: s_endpgm 1506; 1507; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32: 1508; GFX12: ; %bb.0: 1509; GFX12-NEXT: s_clause 0x1 1510; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1511; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1512; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1513; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1514; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1515; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0 1516; GFX12-NEXT: s_wait_kmcnt 0x0 1517; GFX12-NEXT: global_load_b32 v5, v1, s[2:3] scope:SCOPE_SYS 1518; GFX12-NEXT: s_wait_loadcnt 0x0 1519; GFX12-NEXT: global_load_b128 v[0:3], v4, s[4:5] scope:SCOPE_SYS 1520; GFX12-NEXT: s_wait_loadcnt 0x0 1521; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5 1522; GFX12-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc 1523; GFX12-NEXT: v_cndmask_b32_e32 v2, -0.5, v2, vcc 1524; GFX12-NEXT: v_cndmask_b32_e32 v1, 2.0, v1, vcc 1525; GFX12-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc 1526; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] 1527; GFX12-NEXT: s_endpgm 1528 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1529 %tid.ext = sext i32 %tid to i64 1530 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext 1531 %z.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %z.ptr, i64 %tid.ext 1532 %out.gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %out, i64 %tid.ext 1533 %x = load volatile float, ptr addrspace(1) %x.gep 1534 %z = load volatile <4 x 
float>, ptr addrspace(1) %z.gep 1535 %setcc = fcmp ugt float 4.0, %x 1536 %select = select i1 %setcc, <4 x float> <float 1.0, float 2.0, float -0.5, float 4.0>, <4 x float> %z 1537 store <4 x float> %select, ptr addrspace(1) %out.gep 1538 ret void 1539} 1540 1541define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 1542; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: 1543; SI: ; %bb.0: 1544; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1545; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1546; SI-NEXT: s_mov_b32 s6, 0 1547; SI-NEXT: v_mov_b32_e32 v1, 0 1548; SI-NEXT: s_mov_b32 s7, 0xf000 1549; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1550; SI-NEXT: v_mov_b32_e32 v3, v1 1551; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1552; SI-NEXT: s_waitcnt lgkmcnt(0) 1553; SI-NEXT: s_mov_b64 s[4:5], s[10:11] 1554; SI-NEXT: buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc 1555; SI-NEXT: s_waitcnt vmcnt(0) 1556; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc 1557; SI-NEXT: s_waitcnt vmcnt(0) 1558; SI-NEXT: s_mov_b64 s[10:11], s[6:7] 1559; SI-NEXT: v_and_b32_e32 v3, 1, v3 1560; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 1561; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 1562; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1563; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 1564; SI-NEXT: buffer_store_byte v2, v[0:1], s[8:11], 0 addr64 1565; SI-NEXT: s_endpgm 1566; 1567; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: 1568; VI: ; %bb.0: 1569; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1570; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1571; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1572; VI-NEXT: s_waitcnt lgkmcnt(0) 1573; VI-NEXT: v_mov_b32_e32 v2, s3 1574; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 1575; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 1576; VI-NEXT: v_mov_b32_e32 v4, s5 1577; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 1578; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 1579; VI-NEXT: flat_load_dword v2, v[1:2] glc 1580; VI-NEXT: 
s_waitcnt vmcnt(0) 1581; VI-NEXT: flat_load_ubyte v3, v[3:4] glc 1582; VI-NEXT: s_waitcnt vmcnt(0) 1583; VI-NEXT: v_mov_b32_e32 v1, s1 1584; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 1585; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1586; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 1587; VI-NEXT: v_and_b32_e32 v3, 1, v3 1588; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 1589; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1590; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] 1591; VI-NEXT: flat_store_byte v[0:1], v2 1592; VI-NEXT: s_endpgm 1593; 1594; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: 1595; GFX10: ; %bb.0: 1596; GFX10-NEXT: s_clause 0x1 1597; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 1598; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 1599; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1600; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1601; GFX10-NEXT: global_load_dword v2, v1, s[10:11] glc dlc 1602; GFX10-NEXT: s_waitcnt vmcnt(0) 1603; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc 1604; GFX10-NEXT: s_waitcnt vmcnt(0) 1605; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 1606; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 1607; GFX10-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 1608; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1609; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 1610; GFX10-NEXT: global_store_byte v0, v1, s[8:9] 1611; GFX10-NEXT: s_endpgm 1612; 1613; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: 1614; GFX11: ; %bb.0: 1615; GFX11-NEXT: s_clause 0x1 1616; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 1617; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 1618; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1619; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1620; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1621; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1622; GFX11-NEXT: global_load_b32 v1, v1, s[10:11] glc dlc 1623; GFX11-NEXT: s_waitcnt vmcnt(0) 1624; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc 1625; GFX11-NEXT: s_waitcnt vmcnt(0) 1626; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 1627; GFX11-NEXT: v_and_b32_e32 v2, 
1, v2 1628; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1629; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 1630; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1631; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1632; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 1633; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] 1634; GFX11-NEXT: s_endpgm 1635; 1636; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: 1637; GFX12: ; %bb.0: 1638; GFX12-NEXT: s_clause 0x1 1639; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 1640; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 1641; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1642; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 1643; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1644; GFX12-NEXT: s_wait_kmcnt 0x0 1645; GFX12-NEXT: global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS 1646; GFX12-NEXT: s_wait_loadcnt 0x0 1647; GFX12-NEXT: global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS 1648; GFX12-NEXT: s_wait_loadcnt 0x0 1649; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 1650; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 1651; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 1652; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 1653; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] 1654; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 1655; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] 1656; GFX12-NEXT: global_store_b8 v0, v1, s[8:9] 1657; GFX12-NEXT: s_endpgm 1658 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 1659 %tid.ext = sext i32 %tid to i64 1660 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext 1661 %z.gep = getelementptr inbounds i1, ptr addrspace(1) %z.ptr, i64 %tid.ext 1662 %out.gep = getelementptr inbounds i1, ptr addrspace(1) %out, i64 %tid.ext 1663 %x = load volatile i32, ptr addrspace(1) %x.gep 1664 %z = load volatile i1, ptr addrspace(1) %z.gep 1665 %setcc = icmp slt i32 %x, 0 1666 %select = select i1 %setcc, i1 true, i1 %z 1667 store i1 %select, ptr addrspace(1) %out.gep 1668 ret void 
}

; Different types compared vs. selected
;
; The select condition is a 32-bit compare (`fcmp ult float %x, 0.0`) while the
; selected values are 64-bit (`double 1.0` vs. the loaded %z). In the expected
; assembly for every run line the compare appears as v_cmp_le_f32 and the
; double select as two v_cndmask_b32, one per 32-bit register of v[0:1];
; 0x3ff00000 is the upper 32 bits of double 1.0.
; Each workitem reads its own %x/%z element and writes its own %out element,
; indexed by llvm.amdgcn.workitem.id.x.
define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v4, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v4, 0x3ff00000
; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_le_f32_e32 vcc, 0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
  ; Per-workitem element index, sign-extended for the 64-bit GEP index.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  ; %x is addressed as float (4-byte stride); %z and %out as double (8-byte stride).
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds double, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i64 %tid.ext
  ; NOTE(review): the volatile loads presumably keep both loads, in this order,
  ; in the generated code - confirm against the LangRef before relying on it.
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile double, ptr addrspace(1) %z.gep
  ; f32 compare feeding an f64 select: constant 1.0 when %x is unordered or < 0.0.
  %setcc = fcmp ult float %x, 0.0
  %select = select i1 %setcc, double 1.0, double %z
  store double %select, ptr addrspace(1) %out.gep
  ret void
}

; Different types compared vs.
; selected
;
; The select condition is a 32-bit float compare (`fcmp one float %x, 0.0`)
; while the selected values are 64-bit integers (`i64 3` vs. the loaded %z).
; In the expected assembly for every run line the compare appears as
; v_cmp_nlg_f32 and the i64 select as two v_cndmask_b32 on v[0:1], with the
; constant 3 split into 3 for v0 and 0 for v1.
; Each workitem reads its own %x/%z element and writes its own %out element,
; indexed by llvm.amdgcn.workitem.id.x.
define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 {
; SI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; SI-NEXT: s_mov_b64 s[6:7], s[10:11]
; SI-NEXT: v_mov_b32_e32 v4, v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[1:2], s[8:11], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[3:4], s[4:7], 0 addr64 glc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 s[2:3], s[10:11]
; SI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], v[3:4], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dwordx2 v[0:1], v[3:4] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v6
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v2, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX10-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX11-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_load_b32 v3, v1, s[2:3] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX12-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX12-NEXT: v_cndmask_b32_e32 v0, 3, v0, vcc
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
  ; Per-workitem element index, sign-extended for the 64-bit GEP index.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
  %tid.ext = sext i32 %tid to i64
  ; %x is addressed as float (4-byte stride); %z and %out as i64 (8-byte stride).
  %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext
  %z.gep = getelementptr inbounds i64, ptr addrspace(1) %z.ptr, i64 %tid.ext
  %out.gep = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %tid.ext
  ; NOTE(review): the volatile loads presumably keep both loads, in this order,
  ; in the generated code - confirm against the LangRef before relying on it.
  %x = load volatile float, ptr addrspace(1) %x.gep
  %z = load volatile i64, ptr addrspace(1) %z.gep
  ; f32 compare feeding an i64 select: constant 3 when %x is ordered and != 0.0.
  %setcc = fcmp one float %x, 0.0
  %select = select i1 %setcc, i64 3, i64 %z
  store i64 %select, ptr addrspace(1) %out.gep
  ret void
}

; Different types compared vs.
selected 1918define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 1919; SI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: 1920; SI: ; %bb.0: 1921; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1922; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1923; SI-NEXT: s_mov_b32 s11, 0xf000 1924; SI-NEXT: s_mov_b32 s10, 0 1925; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1926; SI-NEXT: v_mov_b32_e32 v1, 0 1927; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 1928; SI-NEXT: s_waitcnt lgkmcnt(0) 1929; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1930; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc 1931; SI-NEXT: s_waitcnt vmcnt(0) 1932; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc 1933; SI-NEXT: s_waitcnt vmcnt(0) 1934; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 1935; SI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 1936; SI-NEXT: v_cndmask_b32_e32 v2, 4.0, v3, vcc 1937; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 1938; SI-NEXT: s_endpgm 1939; 1940; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: 1941; VI: ; %bb.0: 1942; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1943; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1944; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 1945; VI-NEXT: s_waitcnt lgkmcnt(0) 1946; VI-NEXT: v_mov_b32_e32 v1, s3 1947; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 1948; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1949; VI-NEXT: v_mov_b32_e32 v3, s5 1950; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 1951; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1952; VI-NEXT: flat_load_dword v5, v[0:1] glc 1953; VI-NEXT: s_waitcnt vmcnt(0) 1954; VI-NEXT: flat_load_dword v2, v[2:3] glc 1955; VI-NEXT: s_waitcnt vmcnt(0) 1956; VI-NEXT: v_mov_b32_e32 v1, s1 1957; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 1958; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1959; VI-NEXT: v_cmp_gt_u32_e32 vcc, 2, v5 1960; VI-NEXT: v_cndmask_b32_e32 v2, 4.0, v2, vcc 1961; VI-NEXT: flat_store_dword v[0:1], v2 1962; VI-NEXT: s_endpgm 1963; 
1964; GFX10-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: 1965; GFX10: ; %bb.0: 1966; GFX10-NEXT: s_clause 0x1 1967; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1968; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1969; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1970; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1971; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 1972; GFX10-NEXT: s_waitcnt vmcnt(0) 1973; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 1974; GFX10-NEXT: s_waitcnt vmcnt(0) 1975; GFX10-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 1976; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc 1977; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 1978; GFX10-NEXT: s_endpgm 1979; 1980; GFX11-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: 1981; GFX11: ; %bb.0: 1982; GFX11-NEXT: s_clause 0x1 1983; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1984; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1985; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1986; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1987; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1988; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1989; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 1990; GFX11-NEXT: s_waitcnt vmcnt(0) 1991; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 1992; GFX11-NEXT: s_waitcnt vmcnt(0) 1993; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 1994; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc 1995; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 1996; GFX11-NEXT: s_endpgm 1997; 1998; GFX12-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32: 1999; GFX12: ; %bb.0: 2000; GFX12-NEXT: s_clause 0x1 2001; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2002; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2003; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2004; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2005; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2006; GFX12-NEXT: s_wait_kmcnt 0x0 2007; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS 2008; GFX12-NEXT: s_wait_loadcnt 0x0 2009; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS 
2010; GFX12-NEXT: s_wait_loadcnt 0x0 2011; GFX12-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 2012; GFX12-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc 2013; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] 2014; GFX12-NEXT: s_endpgm 2015 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2016 %tid.ext = sext i32 %tid to i64 2017 %x.gep = getelementptr inbounds i32, ptr addrspace(1) %x.ptr, i64 %tid.ext 2018 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext 2019 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2020 %x = load volatile i32, ptr addrspace(1) %x.gep 2021 %z = load volatile float, ptr addrspace(1) %z.gep 2022 %setcc = icmp ugt i32 %x, 1 2023 %select = select i1 %setcc, float 4.0, float %z 2024 store float %select, ptr addrspace(1) %out.gep 2025 ret void 2026} 2027 2028; FIXME: Should be able to handle multiple uses 2029define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { 2030; SI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: 2031; SI: ; %bb.0: 2032; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2033; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2034; SI-NEXT: s_mov_b32 s11, 0xf000 2035; SI-NEXT: s_mov_b32 s10, 0 2036; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2037; SI-NEXT: v_mov_b32_e32 v1, 0 2038; SI-NEXT: s_mov_b64 s[6:7], s[10:11] 2039; SI-NEXT: s_waitcnt lgkmcnt(0) 2040; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2041; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc 2042; SI-NEXT: s_waitcnt vmcnt(0) 2043; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc 2044; SI-NEXT: s_waitcnt vmcnt(0) 2045; SI-NEXT: s_mov_b64 s[2:3], s[10:11] 2046; SI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v2 2047; SI-NEXT: v_cndmask_b32_e64 v2, v3, -1.0, vcc 2048; SI-NEXT: v_cndmask_b32_e64 v3, v3, -2.0, vcc 2049; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 2050; SI-NEXT: s_waitcnt vmcnt(0) 2051; SI-NEXT: buffer_store_dword v3, 
v[0:1], s[0:3], 0 addr64 2052; SI-NEXT: s_waitcnt vmcnt(0) 2053; SI-NEXT: s_endpgm 2054; 2055; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: 2056; VI: ; %bb.0: 2057; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2058; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2059; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 2060; VI-NEXT: s_waitcnt lgkmcnt(0) 2061; VI-NEXT: v_mov_b32_e32 v1, s3 2062; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 2063; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2064; VI-NEXT: v_mov_b32_e32 v3, s5 2065; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4 2066; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 2067; VI-NEXT: flat_load_dword v5, v[0:1] glc 2068; VI-NEXT: s_waitcnt vmcnt(0) 2069; VI-NEXT: flat_load_dword v2, v[2:3] glc 2070; VI-NEXT: s_waitcnt vmcnt(0) 2071; VI-NEXT: v_mov_b32_e32 v1, s1 2072; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 2073; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2074; VI-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v5 2075; VI-NEXT: v_cndmask_b32_e64 v3, v2, -1.0, vcc 2076; VI-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc 2077; VI-NEXT: flat_store_dword v[0:1], v3 2078; VI-NEXT: s_waitcnt vmcnt(0) 2079; VI-NEXT: flat_store_dword v[0:1], v2 2080; VI-NEXT: s_waitcnt vmcnt(0) 2081; VI-NEXT: s_endpgm 2082; 2083; GFX10-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: 2084; GFX10: ; %bb.0: 2085; GFX10-NEXT: s_clause 0x1 2086; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2087; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2088; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2089; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2090; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 2091; GFX10-NEXT: s_waitcnt vmcnt(0) 2092; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 2093; GFX10-NEXT: s_waitcnt vmcnt(0) 2094; GFX10-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 2095; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc 2096; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc 2097; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 2098; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2099; 
GFX10-NEXT: global_store_dword v0, v2, s[0:1] 2100; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2101; GFX10-NEXT: s_endpgm 2102; 2103; GFX11-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: 2104; GFX11: ; %bb.0: 2105; GFX11-NEXT: s_clause 0x1 2106; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2107; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2108; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2109; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2110; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2111; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2112; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 2113; GFX11-NEXT: s_waitcnt vmcnt(0) 2114; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc 2115; GFX11-NEXT: s_waitcnt vmcnt(0) 2116; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 2117; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc 2118; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc 2119; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc 2120; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2121; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc 2122; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 2123; GFX11-NEXT: s_endpgm 2124; 2125; GFX12-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2: 2126; GFX12: ; %bb.0: 2127; GFX12-NEXT: s_clause 0x1 2128; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2129; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 2130; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2131; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2132; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2133; GFX12-NEXT: s_wait_kmcnt 0x0 2134; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS 2135; GFX12-NEXT: s_wait_loadcnt 0x0 2136; GFX12-NEXT: global_load_b32 v2, v0, s[4:5] scope:SCOPE_SYS 2137; GFX12-NEXT: s_wait_loadcnt 0x0 2138; GFX12-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1 2139; GFX12-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc 2140; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, -2.0, vcc 2141; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS 2142; GFX12-NEXT: s_wait_storecnt 0x0 2143; GFX12-NEXT: global_store_b32 v0, 
v2, s[0:1] scope:SCOPE_SYS 2144; GFX12-NEXT: s_wait_storecnt 0x0 2145; GFX12-NEXT: s_endpgm 2146 %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 2147 %tid.ext = sext i32 %tid to i64 2148 %x.gep = getelementptr inbounds float, ptr addrspace(1) %x.ptr, i64 %tid.ext 2149 %z.gep = getelementptr inbounds float, ptr addrspace(1) %z.ptr, i64 %tid.ext 2150 %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext 2151 %x = load volatile float, ptr addrspace(1) %x.gep 2152 %z = load volatile float, ptr addrspace(1) %z.gep 2153 %setcc = fcmp ugt float 4.0, %x 2154 %select0 = select i1 %setcc, float -1.0, float %z 2155 %select1 = select i1 %setcc, float -2.0, float %z 2156 store volatile float %select0, ptr addrspace(1) %out.gep 2157 store volatile float %select1, ptr addrspace(1) %out.gep 2158 ret void 2159} 2160 2161; Source modifiers abs/neg only work for f32 2162define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { 2163; SI-LABEL: v_cndmask_abs_neg_f16: 2164; SI: ; %bb.0: 2165; SI-NEXT: s_load_dword s8, s[4:5], 0xb 2166; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 2167; SI-NEXT: s_mov_b32 s7, 0xf000 2168; SI-NEXT: s_mov_b32 s2, 0 2169; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2170; SI-NEXT: v_mov_b32_e32 v1, 0 2171; SI-NEXT: s_mov_b32 s3, s7 2172; SI-NEXT: s_waitcnt lgkmcnt(0) 2173; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 2174; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 2175; SI-NEXT: s_mov_b32 s6, -1 2176; SI-NEXT: s_cmp_lg_u32 s8, 0 2177; SI-NEXT: s_waitcnt vmcnt(0) 2178; SI-NEXT: v_cvt_f32_f16_e64 v1, |v0| 2179; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 2180; SI-NEXT: s_cselect_b64 vcc, -1, 0 2181; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2182; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 2183; SI-NEXT: s_waitcnt lgkmcnt(0) 2184; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 2185; SI-NEXT: s_endpgm 2186; 2187; VI-LABEL: v_cndmask_abs_neg_f16: 2188; VI: ; %bb.0: 2189; VI-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x34 2190; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2191; VI-NEXT: s_waitcnt lgkmcnt(0) 2192; VI-NEXT: v_mov_b32_e32 v1, s1 2193; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2194; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2195; VI-NEXT: flat_load_ushort v0, v[0:1] 2196; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 2197; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2198; VI-NEXT: s_waitcnt lgkmcnt(0) 2199; VI-NEXT: s_cmp_lg_u32 s2, 0 2200; VI-NEXT: s_cselect_b64 vcc, -1, 0 2201; VI-NEXT: s_waitcnt vmcnt(0) 2202; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 2203; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2204; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 2205; VI-NEXT: v_mov_b32_e32 v0, s0 2206; VI-NEXT: v_mov_b32_e32 v1, s1 2207; VI-NEXT: flat_store_short v[0:1], v2 2208; VI-NEXT: s_endpgm 2209; 2210; GFX10-LABEL: v_cndmask_abs_neg_f16: 2211; GFX10: ; %bb.0: 2212; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2213; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2214; GFX10-NEXT: v_mov_b32_e32 v2, 0 2215; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2216; GFX10-NEXT: global_load_ushort v0, v0, s[0:1] 2217; GFX10-NEXT: s_clause 0x1 2218; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 2219; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2220; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2221; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2222; GFX10-NEXT: s_cmp_lg_u32 s2, 0 2223; GFX10-NEXT: s_cselect_b64 vcc, -1, 0 2224; GFX10-NEXT: s_waitcnt vmcnt(0) 2225; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff, v0 2226; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2227; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2228; GFX10-NEXT: global_store_short v2, v0, s[0:1] 2229; GFX10-NEXT: s_endpgm 2230; 2231; GFX11-LABEL: v_cndmask_abs_neg_f16: 2232; GFX11: ; %bb.0: 2233; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 2234; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2235; GFX11-NEXT: v_mov_b32_e32 v2, 0 2236; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2237; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2238; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2239; 
GFX11-NEXT: global_load_u16 v0, v0, s[0:1] 2240; GFX11-NEXT: s_clause 0x1 2241; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 2242; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2243; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2244; GFX11-NEXT: s_cmp_lg_u32 s2, 0 2245; GFX11-NEXT: s_cselect_b64 vcc, -1, 0 2246; GFX11-NEXT: s_waitcnt vmcnt(0) 2247; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0 2248; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2249; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 2250; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2251; GFX11-NEXT: global_store_b16 v2, v0, s[0:1] 2252; GFX11-NEXT: s_endpgm 2253; 2254; GFX12-LABEL: v_cndmask_abs_neg_f16: 2255; GFX12: ; %bb.0: 2256; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 2257; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2258; GFX12-NEXT: v_mov_b32_e32 v2, 0 2259; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 2260; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0 2261; GFX12-NEXT: s_wait_kmcnt 0x0 2262; GFX12-NEXT: global_load_u16 v0, v0, s[0:1] 2263; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 2264; GFX12-NEXT: s_wait_kmcnt 0x0 2265; GFX12-NEXT: s_cmp_lg_u32 s2, 0 2266; GFX12-NEXT: s_cselect_b64 vcc, -1, 0 2267; GFX12-NEXT: s_wait_loadcnt 0x0 2268; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff, v0 2269; GFX12-NEXT: v_xor_b32_e32 v0, 0x8000, v0 2270; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) 2271; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 2272; GFX12-NEXT: global_store_b16 v2, v0, s[0:1] 2273; GFX12-NEXT: s_endpgm 2274 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 2275 %f.gep = getelementptr half, ptr addrspace(1) %fptr, i32 %idx 2276 %f = load half, ptr addrspace(1) %f.gep 2277 %f.abs = call half @llvm.fabs.f16(half %f) 2278 %f.neg = fneg half %f 2279 %setcc = icmp ne i32 %c, 0 2280 %select = select i1 %setcc, half %f.abs, half %f.neg 2281 store half %select, ptr addrspace(1) %out 2282 ret void 2283} 2284 2285define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 { 2286; SI-LABEL: 
v_cndmask_abs_neg_f32: 2287; SI: ; %bb.0: 2288; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2289; SI-NEXT: s_load_dword s8, s[4:5], 0xb 2290; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2291; SI-NEXT: s_mov_b32 s3, 0xf000 2292; SI-NEXT: s_mov_b32 s6, 0 2293; SI-NEXT: s_mov_b32 s7, s3 2294; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2295; SI-NEXT: v_mov_b32_e32 v1, 0 2296; SI-NEXT: s_waitcnt lgkmcnt(0) 2297; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2298; SI-NEXT: s_mov_b32 s2, -1 2299; SI-NEXT: s_cmp_lg_u32 s8, 0 2300; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 2301; SI-NEXT: s_waitcnt vmcnt(0) 2302; SI-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[4:5] 2303; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2304; SI-NEXT: s_endpgm 2305; 2306; VI-LABEL: v_cndmask_abs_neg_f32: 2307; VI: ; %bb.0: 2308; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2309; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2310; VI-NEXT: s_waitcnt lgkmcnt(0) 2311; VI-NEXT: v_mov_b32_e32 v1, s1 2312; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 2313; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2314; VI-NEXT: flat_load_dword v0, v[0:1] 2315; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 2316; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2317; VI-NEXT: s_waitcnt lgkmcnt(0) 2318; VI-NEXT: s_cmp_lg_u32 s2, 0 2319; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 2320; VI-NEXT: s_waitcnt vmcnt(0) 2321; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3] 2322; VI-NEXT: v_mov_b32_e32 v0, s0 2323; VI-NEXT: v_mov_b32_e32 v1, s1 2324; VI-NEXT: flat_store_dword v[0:1], v2 2325; VI-NEXT: s_endpgm 2326; 2327; GFX10-LABEL: v_cndmask_abs_neg_f32: 2328; GFX10: ; %bb.0: 2329; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 2330; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2331; GFX10-NEXT: v_mov_b32_e32 v1, 0 2332; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2333; GFX10-NEXT: global_load_dword v0, v0, s[0:1] 2334; GFX10-NEXT: s_clause 0x1 2335; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 2336; GFX10-NEXT: s_waitcnt_depctr 0xffe3 2337; GFX10-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x24 2338; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2339; GFX10-NEXT: s_cmp_lg_u32 s2, 0 2340; GFX10-NEXT: s_cselect_b64 s[2:3], -1, 0 2341; GFX10-NEXT: s_waitcnt vmcnt(0) 2342; GFX10-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] 2343; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 2344; GFX10-NEXT: s_endpgm 2345; 2346; GFX11-LABEL: v_cndmask_abs_neg_f32: 2347; GFX11: ; %bb.0: 2348; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 2349; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2350; GFX11-NEXT: v_mov_b32_e32 v1, 0 2351; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 2352; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2353; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2354; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] 2355; GFX11-NEXT: s_clause 0x1 2356; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 2357; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 2358; GFX11-NEXT: s_waitcnt lgkmcnt(0) 2359; GFX11-NEXT: s_cmp_lg_u32 s2, 0 2360; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 2361; GFX11-NEXT: s_waitcnt vmcnt(0) 2362; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] 2363; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 2364; GFX11-NEXT: s_endpgm 2365; 2366; GFX12-LABEL: v_cndmask_abs_neg_f32: 2367; GFX12: ; %bb.0: 2368; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 2369; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2370; GFX12-NEXT: v_mov_b32_e32 v1, 0 2371; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) 2372; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2373; GFX12-NEXT: s_wait_kmcnt 0x0 2374; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] 2375; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 2376; GFX12-NEXT: s_wait_kmcnt 0x0 2377; GFX12-NEXT: s_cmp_lg_u32 s2, 0 2378; GFX12-NEXT: s_cselect_b64 s[2:3], -1, 0 2379; GFX12-NEXT: s_wait_loadcnt 0x0 2380; GFX12-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3] 2381; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] 2382; GFX12-NEXT: s_endpgm 2383 %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 2384 %f.gep = getelementptr float, ptr addrspace(1) %fptr, i32 %idx 2385 %f = load float, ptr 
addrspace(1) %f.gep
  %f.abs = call float @llvm.fabs.f32(float %f)
  %f.neg = fneg float %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, float %f.abs, float %f.neg
  store float %select, ptr addrspace(1) %out
  ret void
}

; Select between fabs(%f) and fneg(%f) of a double loaded from %fptr (indexed
; by the workitem id), keyed on %c != 0, and store the result to %out.
; Per the assertions below, every target lowers this to an explicit v_and_b32
; (0x7fffffff) / v_xor_b32 (0x80000000) on the high dword followed by two
; v_cndmask_b32, unlike the f32 variant where abs/neg are folded into the
; v_cndmask_b32_e64 source modifiers.
; NOTE(review): the low-dword "v_cndmask_b32 v0, v0, v0" selects between
; identical operands; this looks like a missed fold -- confirm it is intended
; before treating the current output as the desired codegen.
define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c, ptr addrspace(1) %fptr) #0 {
; SI-LABEL: v_cndmask_abs_neg_f64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; SI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; SI-NEXT:    s_cselect_b64 vcc, -1, 0
; SI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cndmask_abs_neg_f64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s2, 0
; VI-NEXT:    s_cselect_b64 vcc, -1, 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_cndmask_abs_neg_f64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[0:1]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
; GFX10-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX10-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_cndmask_abs_neg_f64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    v_mov_b32_e32 v3, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_cmp_lg_u32 s2, 0
; GFX11-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX11-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
;
; GFX12-LABEL: v_cndmask_abs_neg_f64:
; GFX12:       ; %bb.0:
; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX12-NEXT:    v_mov_b32_e32 v3, 0
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-NEXT:    s_wait_kmcnt 0x0
; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
; GFX12-NEXT:    s_cselect_b64 vcc, -1, 0
; GFX12-NEXT:    s_wait_loadcnt 0x0
; GFX12-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
; GFX12-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
; GFX12-NEXT:    v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX12-NEXT:    global_store_b64 v3, v[0:1], s[0:1]
; GFX12-NEXT:    s_endpgm
  %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
  %f.gep = getelementptr double, ptr addrspace(1) %fptr, i32 %idx
  %f = load double, ptr addrspace(1) %f.gep
  %f.abs = call double @llvm.fabs.f64(double %f)
  %f.neg = fneg double %f
  %setcc = icmp ne i32 %c, 0
  %select = select i1 %setcc, double %f.abs, double %f.neg
  store double %select, ptr addrspace(1) %out
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }