1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI 3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI 4; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG 5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10 6; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL 7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11 8 9declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone 10declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone 11declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone 12 13declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone 14declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone 15declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone 16 17declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone 18declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone 19declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone 20 21declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 22 23define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { 24; SI-LABEL: s_ctlz_i32: 25; SI: ; %bb.0: 26; SI-NEXT: s_load_dword s2, s[4:5], 0xb 27; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 28; SI-NEXT: s_mov_b32 s3, 0xf000 29; SI-NEXT: s_waitcnt lgkmcnt(0) 30; SI-NEXT: s_flbit_i32_b32 s2, s2 31; SI-NEXT: s_min_u32 s4, s2, 32 32; SI-NEXT: s_mov_b32 s2, -1 33; SI-NEXT: v_mov_b32_e32 v0, s4 34; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 35; SI-NEXT: s_endpgm 36; 37; VI-LABEL: s_ctlz_i32: 38; VI: 
; %bb.0: 39; VI-NEXT: s_load_dword s6, s[4:5], 0x2c 40; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 41; VI-NEXT: s_mov_b32 s3, 0xf000 42; VI-NEXT: s_mov_b32 s2, -1 43; VI-NEXT: s_waitcnt lgkmcnt(0) 44; VI-NEXT: s_flbit_i32_b32 s4, s6 45; VI-NEXT: s_min_u32 s4, s4, 32 46; VI-NEXT: v_mov_b32_e32 v0, s4 47; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 48; VI-NEXT: s_endpgm 49; 50; EG-LABEL: s_ctlz_i32: 51; EG: ; %bb.0: 52; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 53; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 54; EG-NEXT: CF_END 55; EG-NEXT: PAD 56; EG-NEXT: ALU clause starting at 4: 57; EG-NEXT: FFBH_UINT * T0.W, KC0[2].Z, 58; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W, 59; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 60; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 61; 62; GFX10-LABEL: s_ctlz_i32: 63; GFX10: ; %bb.0: 64; GFX10-NEXT: s_clause 0x1 65; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 66; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 67; GFX10-NEXT: v_mov_b32_e32 v0, 0 68; GFX10-NEXT: s_waitcnt lgkmcnt(0) 69; GFX10-NEXT: s_flbit_i32_b32 s2, s2 70; GFX10-NEXT: s_min_u32 s2, s2, 32 71; GFX10-NEXT: v_mov_b32_e32 v1, s2 72; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 73; GFX10-NEXT: s_endpgm 74; 75; GFX10-GISEL-LABEL: s_ctlz_i32: 76; GFX10-GISEL: ; %bb.0: 77; GFX10-GISEL-NEXT: s_clause 0x1 78; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 79; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 80; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 81; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 82; GFX10-GISEL-NEXT: s_flbit_i32_b32 s2, s2 83; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 84; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 85; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 86; GFX10-GISEL-NEXT: s_endpgm 87; 88; GFX11-LABEL: s_ctlz_i32: 89; GFX11: ; %bb.0: 90; GFX11-NEXT: s_clause 0x1 91; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c 92; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 93; GFX11-NEXT: s_waitcnt lgkmcnt(0) 94; GFX11-NEXT: s_clz_i32_u32 s2, s2 95; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 96; GFX11-NEXT: s_min_u32 s2, s2, 32 97; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 98; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 99; GFX11-NEXT: s_endpgm 100 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 101 store i32 %ctlz, ptr addrspace(1) %out, align 4 102 ret void 103} 104 105define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 106; SI-LABEL: v_ctlz_i32: 107; SI: ; %bb.0: 108; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 109; SI-NEXT: s_mov_b32 s7, 0xf000 110; SI-NEXT: s_mov_b32 s10, 0 111; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 112; SI-NEXT: v_mov_b32_e32 v1, 0 113; SI-NEXT: s_mov_b32 s11, s7 114; SI-NEXT: s_waitcnt lgkmcnt(0) 115; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 116; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 117; SI-NEXT: s_mov_b32 s6, -1 118; SI-NEXT: s_mov_b32 s4, s0 119; SI-NEXT: s_mov_b32 s5, s1 120; SI-NEXT: s_waitcnt vmcnt(0) 121; SI-NEXT: v_ffbh_u32_e32 v0, v0 122; SI-NEXT: v_min_u32_e32 v0, 32, v0 123; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 124; SI-NEXT: s_endpgm 125; 126; VI-LABEL: v_ctlz_i32: 127; VI: ; %bb.0: 128; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 129; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 130; VI-NEXT: s_waitcnt lgkmcnt(0) 131; VI-NEXT: v_mov_b32_e32 v1, s3 132; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 133; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 134; VI-NEXT: flat_load_dword v0, v[0:1] 135; VI-NEXT: s_mov_b32 s3, 0xf000 136; VI-NEXT: s_mov_b32 s2, -1 137; VI-NEXT: s_waitcnt vmcnt(0) 138; VI-NEXT: v_ffbh_u32_e32 v0, v0 139; VI-NEXT: v_min_u32_e32 v0, 32, v0 140; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 141; VI-NEXT: s_endpgm 142; 143; EG-LABEL: v_ctlz_i32: 144; EG: ; %bb.0: 145; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 146; EG-NEXT: TEX 0 @6 147; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 148; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.X, T1.X, 1 149; EG-NEXT: CF_END 150; EG-NEXT: PAD 151; EG-NEXT: Fetch clause starting at 6: 152; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 153; EG-NEXT: ALU clause starting at 8: 154; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 155; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 156; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 157; EG-NEXT: ALU clause starting at 11: 158; EG-NEXT: FFBH_UINT * T0.W, T0.X, 159; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 160; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 161; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 162; 163; GFX10-LABEL: v_ctlz_i32: 164; GFX10: ; %bb.0: 165; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 166; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 167; GFX10-NEXT: v_mov_b32_e32 v1, 0 168; GFX10-NEXT: s_waitcnt lgkmcnt(0) 169; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 170; GFX10-NEXT: s_waitcnt vmcnt(0) 171; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 172; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 173; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 174; GFX10-NEXT: s_endpgm 175; 176; GFX10-GISEL-LABEL: v_ctlz_i32: 177; GFX10-GISEL: ; %bb.0: 178; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 179; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 180; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 181; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 182; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 183; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 184; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 185; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 186; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 187; GFX10-GISEL-NEXT: s_endpgm 188; 189; GFX11-LABEL: v_ctlz_i32: 190; GFX11: ; %bb.0: 191; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 192; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 193; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 194; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 195; GFX11-NEXT: s_waitcnt lgkmcnt(0) 196; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 197; GFX11-NEXT: s_waitcnt 
vmcnt(0) 198; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 199; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 200; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 201; GFX11-NEXT: s_endpgm 202 %tid = call i32 @llvm.amdgcn.workitem.id.x() 203 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 204 %val = load i32, ptr addrspace(1) %in.gep, align 4 205 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 206 store i32 %ctlz, ptr addrspace(1) %out, align 4 207 ret void 208} 209 210define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 211; SI-LABEL: v_ctlz_v2i32: 212; SI: ; %bb.0: 213; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 214; SI-NEXT: s_mov_b32 s7, 0xf000 215; SI-NEXT: s_mov_b32 s10, 0 216; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 217; SI-NEXT: v_mov_b32_e32 v1, 0 218; SI-NEXT: s_mov_b32 s11, s7 219; SI-NEXT: s_waitcnt lgkmcnt(0) 220; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 221; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 222; SI-NEXT: s_mov_b32 s6, -1 223; SI-NEXT: s_mov_b32 s4, s0 224; SI-NEXT: s_mov_b32 s5, s1 225; SI-NEXT: s_waitcnt vmcnt(0) 226; SI-NEXT: v_ffbh_u32_e32 v1, v1 227; SI-NEXT: v_ffbh_u32_e32 v0, v0 228; SI-NEXT: v_min_u32_e32 v1, 32, v1 229; SI-NEXT: v_min_u32_e32 v0, 32, v0 230; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 231; SI-NEXT: s_endpgm 232; 233; VI-LABEL: v_ctlz_v2i32: 234; VI: ; %bb.0: 235; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 236; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 237; VI-NEXT: s_waitcnt lgkmcnt(0) 238; VI-NEXT: v_mov_b32_e32 v1, s3 239; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 240; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 241; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 242; VI-NEXT: s_mov_b32 s3, 0xf000 243; VI-NEXT: s_mov_b32 s2, -1 244; VI-NEXT: s_waitcnt vmcnt(0) 245; VI-NEXT: v_ffbh_u32_e32 v1, v1 246; VI-NEXT: v_ffbh_u32_e32 v0, v0 247; VI-NEXT: v_min_u32_e32 v1, 32, v1 248; VI-NEXT: v_min_u32_e32 v0, 32, v0 249; VI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 250; VI-NEXT: s_endpgm 251; 252; EG-LABEL: v_ctlz_v2i32: 253; EG: ; %bb.0: 254; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 255; EG-NEXT: TEX 0 @6 256; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] 257; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 258; EG-NEXT: CF_END 259; EG-NEXT: PAD 260; EG-NEXT: Fetch clause starting at 6: 261; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 262; EG-NEXT: ALU clause starting at 8: 263; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 264; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 265; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 266; EG-NEXT: ALU clause starting at 11: 267; EG-NEXT: FFBH_UINT * T0.W, T0.Y, 268; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 269; EG-NEXT: FFBH_UINT * T0.W, T0.X, 270; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 271; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 272; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 273; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 274; 275; GFX10-LABEL: v_ctlz_v2i32: 276; GFX10: ; %bb.0: 277; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 278; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 279; GFX10-NEXT: v_mov_b32_e32 v2, 0 280; GFX10-NEXT: s_waitcnt lgkmcnt(0) 281; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 282; GFX10-NEXT: s_waitcnt vmcnt(0) 283; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 284; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 285; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 286; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 287; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 288; GFX10-NEXT: s_endpgm 289; 290; GFX10-GISEL-LABEL: v_ctlz_v2i32: 291; GFX10-GISEL: ; %bb.0: 292; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 293; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 294; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 295; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 296; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 297; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 298; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 299; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 300; 
GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 301; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 302; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 303; GFX10-GISEL-NEXT: s_endpgm 304; 305; GFX11-LABEL: v_ctlz_v2i32: 306; GFX11: ; %bb.0: 307; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 308; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 309; GFX11-NEXT: v_mov_b32_e32 v2, 0 310; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 311; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 312; GFX11-NEXT: s_waitcnt lgkmcnt(0) 313; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] 314; GFX11-NEXT: s_waitcnt vmcnt(0) 315; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 316; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 317; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 318; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 319; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 320; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 321; GFX11-NEXT: s_endpgm 322 %tid = call i32 @llvm.amdgcn.workitem.id.x() 323 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid 324 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8 325 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone 326 store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8 327 ret void 328} 329 330define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 331; SI-LABEL: v_ctlz_v4i32: 332; SI: ; %bb.0: 333; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 334; SI-NEXT: s_mov_b32 s7, 0xf000 335; SI-NEXT: s_mov_b32 s10, 0 336; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 337; SI-NEXT: v_mov_b32_e32 v1, 0 338; SI-NEXT: s_mov_b32 s11, s7 339; SI-NEXT: s_waitcnt lgkmcnt(0) 340; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 341; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 342; SI-NEXT: s_mov_b32 s6, -1 343; SI-NEXT: s_mov_b32 s4, s0 344; SI-NEXT: s_mov_b32 s5, s1 345; SI-NEXT: s_waitcnt vmcnt(0) 346; SI-NEXT: v_ffbh_u32_e32 v3, v3 347; SI-NEXT: 
v_ffbh_u32_e32 v2, v2 348; SI-NEXT: v_ffbh_u32_e32 v1, v1 349; SI-NEXT: v_ffbh_u32_e32 v0, v0 350; SI-NEXT: v_min_u32_e32 v3, 32, v3 351; SI-NEXT: v_min_u32_e32 v2, 32, v2 352; SI-NEXT: v_min_u32_e32 v1, 32, v1 353; SI-NEXT: v_min_u32_e32 v0, 32, v0 354; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 355; SI-NEXT: s_endpgm 356; 357; VI-LABEL: v_ctlz_v4i32: 358; VI: ; %bb.0: 359; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 360; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 361; VI-NEXT: s_waitcnt lgkmcnt(0) 362; VI-NEXT: v_mov_b32_e32 v1, s3 363; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 364; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 365; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 366; VI-NEXT: s_mov_b32 s3, 0xf000 367; VI-NEXT: s_mov_b32 s2, -1 368; VI-NEXT: s_waitcnt vmcnt(0) 369; VI-NEXT: v_ffbh_u32_e32 v3, v3 370; VI-NEXT: v_ffbh_u32_e32 v2, v2 371; VI-NEXT: v_ffbh_u32_e32 v1, v1 372; VI-NEXT: v_ffbh_u32_e32 v0, v0 373; VI-NEXT: v_min_u32_e32 v3, 32, v3 374; VI-NEXT: v_min_u32_e32 v2, 32, v2 375; VI-NEXT: v_min_u32_e32 v1, 32, v1 376; VI-NEXT: v_min_u32_e32 v0, 32, v0 377; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 378; VI-NEXT: s_endpgm 379; 380; EG-LABEL: v_ctlz_v4i32: 381; EG: ; %bb.0: 382; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 383; EG-NEXT: TEX 0 @6 384; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 385; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 386; EG-NEXT: CF_END 387; EG-NEXT: PAD 388; EG-NEXT: Fetch clause starting at 6: 389; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 390; EG-NEXT: ALU clause starting at 8: 391; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 392; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 393; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 394; EG-NEXT: ALU clause starting at 11: 395; EG-NEXT: FFBH_UINT * T1.W, T0.W, 396; EG-NEXT: FFBH_UINT T2.W, T0.Z, 397; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122 398; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 399; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W, 400; 
EG-NEXT: FFBH_UINT * T1.W, T0.Y, 401; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 402; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 403; EG-NEXT: FFBH_UINT * T1.W, T0.X, 404; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 405; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 406; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 407; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 408; 409; GFX10-LABEL: v_ctlz_v4i32: 410; GFX10: ; %bb.0: 411; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 412; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 413; GFX10-NEXT: v_mov_b32_e32 v4, 0 414; GFX10-NEXT: s_waitcnt lgkmcnt(0) 415; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 416; GFX10-NEXT: s_waitcnt vmcnt(0) 417; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 418; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 419; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 420; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 421; GFX10-NEXT: v_min_u32_e32 v3, 32, v3 422; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 423; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 424; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 425; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 426; GFX10-NEXT: s_endpgm 427; 428; GFX10-GISEL-LABEL: v_ctlz_v4i32: 429; GFX10-GISEL: ; %bb.0: 430; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 431; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 432; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 433; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 434; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 435; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 436; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 437; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 438; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 439; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 440; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 441; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 442; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 443; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 444; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 445; GFX10-GISEL-NEXT: s_endpgm 446; 447; GFX11-LABEL: v_ctlz_v4i32: 448; GFX11: ; %bb.0: 449; 
GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 450; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 451; GFX11-NEXT: v_mov_b32_e32 v4, 0 452; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 453; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 454; GFX11-NEXT: s_waitcnt lgkmcnt(0) 455; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] 456; GFX11-NEXT: s_waitcnt vmcnt(0) 457; GFX11-NEXT: v_clz_i32_u32_e32 v3, v3 458; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 459; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 460; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 461; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 462; GFX11-NEXT: v_min_u32_e32 v3, 32, v3 463; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 464; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 465; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 466; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 467; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 468; GFX11-NEXT: s_endpgm 469 %tid = call i32 @llvm.amdgcn.workitem.id.x() 470 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid 471 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16 472 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone 473 store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16 474 ret void 475} 476 477define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 478; SI-LABEL: v_ctlz_i8: 479; SI: ; %bb.0: 480; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 481; SI-NEXT: s_mov_b32 s7, 0xf000 482; SI-NEXT: s_mov_b32 s6, -1 483; SI-NEXT: s_mov_b32 s10, s6 484; SI-NEXT: s_mov_b32 s11, s7 485; SI-NEXT: s_waitcnt lgkmcnt(0) 486; SI-NEXT: s_mov_b32 s8, s2 487; SI-NEXT: s_mov_b32 s9, s3 488; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 489; SI-NEXT: s_mov_b32 s4, s0 490; SI-NEXT: s_mov_b32 s5, s1 491; SI-NEXT: s_waitcnt vmcnt(0) 492; SI-NEXT: v_ffbh_u32_e32 v0, v0 493; SI-NEXT: v_min_u32_e32 v0, 32, v0 494; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 
495; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 496; SI-NEXT: s_endpgm 497; 498; VI-LABEL: v_ctlz_i8: 499; VI: ; %bb.0: 500; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 501; VI-NEXT: s_mov_b32 s7, 0xf000 502; VI-NEXT: s_mov_b32 s6, -1 503; VI-NEXT: s_mov_b32 s10, s6 504; VI-NEXT: s_mov_b32 s11, s7 505; VI-NEXT: s_waitcnt lgkmcnt(0) 506; VI-NEXT: s_mov_b32 s8, s2 507; VI-NEXT: s_mov_b32 s9, s3 508; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 509; VI-NEXT: s_mov_b32 s4, s0 510; VI-NEXT: s_mov_b32 s5, s1 511; VI-NEXT: s_waitcnt vmcnt(0) 512; VI-NEXT: v_ffbh_u32_e32 v0, v0 513; VI-NEXT: v_min_u32_e32 v0, 32, v0 514; VI-NEXT: v_subrev_u32_e32 v0, vcc, 24, v0 515; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 516; VI-NEXT: s_endpgm 517; 518; EG-LABEL: v_ctlz_i8: 519; EG: ; %bb.0: 520; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 521; EG-NEXT: TEX 0 @6 522; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] 523; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 524; EG-NEXT: CF_END 525; EG-NEXT: PAD 526; EG-NEXT: Fetch clause starting at 6: 527; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 528; EG-NEXT: ALU clause starting at 8: 529; EG-NEXT: MOV * T0.X, KC0[2].Z, 530; EG-NEXT: ALU clause starting at 9: 531; EG-NEXT: FFBH_UINT * T0.W, T0.X, 532; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W, 533; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 534; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45) 535; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 536; EG-NEXT: -24(nan), 0(0.000000e+00) 537; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 538; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 539; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 540; EG-NEXT: LSHL T0.X, PV.W, PS, 541; EG-NEXT: LSHL * T0.W, literal.x, PS, 542; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 543; EG-NEXT: MOV T0.Y, 0.0, 544; EG-NEXT: MOV * T0.Z, 0.0, 545; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 546; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 547; 548; GFX10-LABEL: v_ctlz_i8: 549; GFX10: ; %bb.0: 550; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 551; 
GFX10-NEXT: v_mov_b32_e32 v0, 0 552; GFX10-NEXT: s_waitcnt lgkmcnt(0) 553; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] 554; GFX10-NEXT: s_waitcnt vmcnt(0) 555; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 556; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 557; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 558; GFX10-NEXT: global_store_byte v0, v1, s[0:1] 559; GFX10-NEXT: s_endpgm 560; 561; GFX10-GISEL-LABEL: v_ctlz_i8: 562; GFX10-GISEL: ; %bb.0: 563; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 564; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 565; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 566; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 567; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 568; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 569; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 570; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v1 571; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 572; GFX10-GISEL-NEXT: s_endpgm 573; 574; GFX11-LABEL: v_ctlz_i8: 575; GFX11: ; %bb.0: 576; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 577; GFX11-NEXT: v_mov_b32_e32 v0, 0 578; GFX11-NEXT: s_waitcnt lgkmcnt(0) 579; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] 580; GFX11-NEXT: s_waitcnt vmcnt(0) 581; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 582; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 583; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 584; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 585; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] 586; GFX11-NEXT: s_endpgm 587 %val = load i8, ptr addrspace(1) %valptr 588 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone 589 store i8 %ctlz, ptr addrspace(1) %out 590 ret void 591} 592 593define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { 594; SI-LABEL: s_ctlz_i64: 595; SI: ; %bb.0: 596; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 597; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 598; SI-NEXT: s_mov_b32 s3, 0xf000 599; SI-NEXT: s_mov_b32 s2, -1 600; SI-NEXT: s_waitcnt lgkmcnt(0) 
601; SI-NEXT: s_flbit_i32_b64 s4, s[6:7] 602; SI-NEXT: s_min_u32 s4, s4, 64 603; SI-NEXT: v_mov_b32_e32 v1, 0 604; SI-NEXT: v_mov_b32_e32 v0, s4 605; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 606; SI-NEXT: s_endpgm 607; 608; VI-LABEL: s_ctlz_i64: 609; VI: ; %bb.0: 610; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c 611; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 612; VI-NEXT: s_mov_b32 s3, 0xf000 613; VI-NEXT: s_mov_b32 s2, -1 614; VI-NEXT: v_mov_b32_e32 v1, 0 615; VI-NEXT: s_waitcnt lgkmcnt(0) 616; VI-NEXT: s_flbit_i32_b64 s4, s[6:7] 617; VI-NEXT: s_min_u32 s4, s4, 64 618; VI-NEXT: v_mov_b32_e32 v0, s4 619; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 620; VI-NEXT: s_endpgm 621; 622; EG-LABEL: s_ctlz_i64: 623; EG: ; %bb.0: 624; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 625; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 626; EG-NEXT: CF_END 627; EG-NEXT: PAD 628; EG-NEXT: ALU clause starting at 4: 629; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W, 630; EG-NEXT: CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W, 631; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 632; EG-NEXT: FFBH_UINT T1.W, KC0[5].X, 633; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 634; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 635; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W, 636; EG-NEXT: MOV T0.Y, 0.0, 637; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 638; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 639; 640; GFX10-LABEL: s_ctlz_i64: 641; GFX10: ; %bb.0: 642; GFX10-NEXT: s_clause 0x1 643; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c 644; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 645; GFX10-NEXT: v_mov_b32_e32 v1, 0 646; GFX10-NEXT: s_waitcnt lgkmcnt(0) 647; GFX10-NEXT: s_flbit_i32_b64 s0, s[0:1] 648; GFX10-NEXT: s_min_u32 s0, s0, 64 649; GFX10-NEXT: v_mov_b32_e32 v0, s0 650; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] 651; GFX10-NEXT: s_endpgm 652; 653; GFX10-GISEL-LABEL: s_ctlz_i64: 654; GFX10-GISEL: ; %bb.0: 655; GFX10-GISEL-NEXT: s_clause 0x1 656; GFX10-GISEL-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x4c 657; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 658; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 659; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 660; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[0:1] 661; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 662; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 663; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 664; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 665; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 666; GFX10-GISEL-NEXT: s_endpgm 667; 668; GFX11-LABEL: s_ctlz_i64: 669; GFX11: ; %bb.0: 670; GFX11-NEXT: s_clause 0x1 671; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x4c 672; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 673; GFX11-NEXT: s_waitcnt lgkmcnt(0) 674; GFX11-NEXT: s_clz_i32_u64 s0, s[0:1] 675; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 676; GFX11-NEXT: s_min_u32 s0, s0, 64 677; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 678; GFX11-NEXT: global_store_b64 v1, v[0:1], s[2:3] 679; GFX11-NEXT: s_endpgm 680 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 681 store i64 %ctlz, ptr addrspace(1) %out 682 ret void 683} 684 685define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { 686; SI-LABEL: s_ctlz_i64_trunc: 687; SI: ; %bb.0: 688; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 689; SI-NEXT: s_mov_b32 s7, 0xf000 690; SI-NEXT: s_waitcnt lgkmcnt(0) 691; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] 692; SI-NEXT: s_min_u32 s2, s2, 64 693; SI-NEXT: s_mov_b32 s6, -1 694; SI-NEXT: s_mov_b32 s4, s0 695; SI-NEXT: s_mov_b32 s5, s1 696; SI-NEXT: v_mov_b32_e32 v0, s2 697; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 698; SI-NEXT: s_endpgm 699; 700; VI-LABEL: s_ctlz_i64_trunc: 701; VI: ; %bb.0: 702; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 703; VI-NEXT: s_mov_b32 s7, 0xf000 704; VI-NEXT: s_mov_b32 s6, -1 705; VI-NEXT: s_waitcnt lgkmcnt(0) 706; VI-NEXT: s_mov_b32 s4, s0 707; VI-NEXT: s_flbit_i32_b64 s0, s[2:3] 708; VI-NEXT: 
s_min_u32 s0, s0, 64 709; VI-NEXT: s_mov_b32 s5, s1 710; VI-NEXT: v_mov_b32_e32 v0, s0 711; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 712; VI-NEXT: s_endpgm 713; 714; EG-LABEL: s_ctlz_i64_trunc: 715; EG: ; %bb.0: 716; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 717; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 718; EG-NEXT: CF_END 719; EG-NEXT: PAD 720; EG-NEXT: ALU clause starting at 4: 721; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W, 722; EG-NEXT: CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W, 723; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 724; EG-NEXT: FFBH_UINT T1.W, KC0[3].X, 725; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 726; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 727; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W, 728; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 729; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 730; 731; GFX10-LABEL: s_ctlz_i64_trunc: 732; GFX10: ; %bb.0: 733; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 734; GFX10-NEXT: v_mov_b32_e32 v0, 0 735; GFX10-NEXT: s_waitcnt lgkmcnt(0) 736; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] 737; GFX10-NEXT: s_min_u32 s2, s2, 64 738; GFX10-NEXT: v_mov_b32_e32 v1, s2 739; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 740; GFX10-NEXT: s_endpgm 741; 742; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: 743; GFX10-GISEL: ; %bb.0: 744; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 745; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 746; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 747; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] 748; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 749; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 750; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 751; GFX10-GISEL-NEXT: s_endpgm 752; 753; GFX11-LABEL: s_ctlz_i64_trunc: 754; GFX11: ; %bb.0: 755; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 756; GFX11-NEXT: s_waitcnt lgkmcnt(0) 757; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] 758; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 759; GFX11-NEXT: s_min_u32 s2, s2, 64 760; GFX11-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 761; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] 762; GFX11-NEXT: s_endpgm 763 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 764 %trunc = trunc i64 %ctlz to i32 765 store i32 %trunc, ptr addrspace(1) %out 766 ret void 767} 768 769define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 770; SI-LABEL: v_ctlz_i64: 771; SI: ; %bb.0: 772; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 773; SI-NEXT: s_mov_b32 s7, 0xf000 774; SI-NEXT: s_mov_b32 s6, 0 775; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 776; SI-NEXT: v_mov_b32_e32 v1, 0 777; SI-NEXT: s_waitcnt lgkmcnt(0) 778; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 779; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 780; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 781; SI-NEXT: s_waitcnt vmcnt(0) 782; SI-NEXT: v_ffbh_u32_e32 v2, v2 783; SI-NEXT: v_min_u32_e32 v2, 0xffffffdf, v2 784; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 785; SI-NEXT: v_ffbh_u32_e32 v3, v3 786; SI-NEXT: v_min3_u32 v2, v2, v3, 64 787; SI-NEXT: v_mov_b32_e32 v3, v1 788; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 789; SI-NEXT: s_endpgm 790; 791; VI-LABEL: v_ctlz_i64: 792; VI: ; %bb.0: 793; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 794; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 795; VI-NEXT: v_mov_b32_e32 v2, 0 796; VI-NEXT: s_waitcnt lgkmcnt(0) 797; VI-NEXT: v_mov_b32_e32 v1, s3 798; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 799; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 800; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 801; VI-NEXT: v_mov_b32_e32 v4, s1 802; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 803; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 804; VI-NEXT: s_waitcnt vmcnt(0) 805; VI-NEXT: v_ffbh_u32_e32 v0, v0 806; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 807; VI-NEXT: v_ffbh_u32_e32 v1, v1 808; VI-NEXT: v_min3_u32 v1, v0, v1, 64 809; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] 810; VI-NEXT: s_endpgm 811; 812; EG-LABEL: v_ctlz_i64: 813; EG: ; 
%bb.0: 814; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 815; EG-NEXT: TEX 0 @6 816; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 817; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 818; EG-NEXT: CF_END 819; EG-NEXT: PAD 820; EG-NEXT: Fetch clause starting at 6: 821; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 822; EG-NEXT: ALU clause starting at 8: 823; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 824; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 825; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 826; EG-NEXT: ALU clause starting at 11: 827; EG-NEXT: FFBH_UINT * T1.W, T0.X, 828; EG-NEXT: CNDE_INT * T1.W, T0.X, literal.x, PV.W, 829; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 830; EG-NEXT: FFBH_UINT T2.W, T0.Y, 831; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, 832; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 833; EG-NEXT: CNDE_INT T0.X, T0.Y, PS, PV.W, 834; EG-NEXT: MOV T0.Y, 0.0, 835; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 836; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 837; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 838; 839; GFX10-LABEL: v_ctlz_i64: 840; GFX10: ; %bb.0: 841; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 842; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 843; GFX10-NEXT: s_waitcnt lgkmcnt(0) 844; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 845; GFX10-NEXT: s_waitcnt vmcnt(0) 846; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 847; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 848; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 849; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 850; GFX10-NEXT: v_mov_b32_e32 v1, 0 851; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 852; GFX10-NEXT: s_endpgm 853; 854; GFX10-GISEL-LABEL: v_ctlz_i64: 855; GFX10-GISEL: ; %bb.0: 856; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 857; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 858; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 859; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 860; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 861; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 862; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 
863; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 864; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 865; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 866; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 867; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 868; GFX10-GISEL-NEXT: s_endpgm 869; 870; GFX11-LABEL: v_ctlz_i64: 871; GFX11: ; %bb.0: 872; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 873; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 874; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 875; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 876; GFX11-NEXT: s_waitcnt lgkmcnt(0) 877; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] 878; GFX11-NEXT: s_waitcnt vmcnt(0) 879; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 880; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 881; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 882; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 883; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 884; GFX11-NEXT: v_mov_b32_e32 v1, 0 885; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 886; GFX11-NEXT: s_endpgm 887 %tid = call i32 @llvm.amdgcn.workitem.id.x() 888 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid 889 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid 890 %val = load i64, ptr addrspace(1) %in.gep 891 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 892 store i64 %ctlz, ptr addrspace(1) %out.gep 893 ret void 894} 895; i64 ctlz of a value loaded at a workitem-id index; the result is truncated to i32 before the store. 896define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 897; SI-LABEL: v_ctlz_i64_trunc: 898; SI: ; %bb.0: 899; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 900; SI-NEXT: s_mov_b32 s7, 0xf000 901; SI-NEXT: s_mov_b32 s6, 0 902; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 903; SI-NEXT: v_mov_b32_e32 v2, 0 904; SI-NEXT: s_waitcnt lgkmcnt(0) 905; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 906; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 907; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 908; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 909; SI-NEXT: s_waitcnt 
vmcnt(0) 910; SI-NEXT: v_ffbh_u32_e32 v0, v3 911; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 912; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 913; SI-NEXT: v_ffbh_u32_e32 v3, v4 914; SI-NEXT: v_min3_u32 v0, v0, v3, 64 915; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 916; SI-NEXT: s_endpgm 917; 918; VI-LABEL: v_ctlz_i64_trunc: 919; VI: ; %bb.0: 920; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 921; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 922; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 923; VI-NEXT: s_waitcnt lgkmcnt(0) 924; VI-NEXT: v_mov_b32_e32 v2, s3 925; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 926; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 927; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] 928; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 929; VI-NEXT: v_mov_b32_e32 v4, s1 930; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 931; VI-NEXT: s_waitcnt vmcnt(0) 932; VI-NEXT: v_ffbh_u32_e32 v0, v1 933; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 934; VI-NEXT: v_ffbh_u32_e32 v1, v2 935; VI-NEXT: v_min3_u32 v0, v0, v1, 64 936; VI-NEXT: flat_store_dword v[3:4], v0 937; VI-NEXT: s_endpgm 938; 939; EG-LABEL: v_ctlz_i64_trunc: 940; EG: ; %bb.0: 941; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 942; EG-NEXT: TEX 0 @6 943; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 944; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 945; EG-NEXT: CF_END 946; EG-NEXT: PAD 947; EG-NEXT: Fetch clause starting at 6: 948; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 949; EG-NEXT: ALU clause starting at 8: 950; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 951; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 952; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W, 953; EG-NEXT: ALU clause starting at 11: 954; EG-NEXT: FFBH_UINT * T0.W, T1.X, 955; EG-NEXT: CNDE_INT * T0.W, T1.X, literal.x, PV.W, 956; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 957; EG-NEXT: LSHL T0.Z, T0.X, literal.x, 958; EG-NEXT: FFBH_UINT T1.W, T1.Y, 959; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y, 960; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 961; EG-NEXT: CNDE_INT 
T0.X, T1.Y, PS, PV.W, 962; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z, 963; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 964; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 965; 966; GFX10-LABEL: v_ctlz_i64_trunc: 967; GFX10: ; %bb.0: 968; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 969; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 970; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 971; GFX10-NEXT: s_waitcnt lgkmcnt(0) 972; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 973; GFX10-NEXT: s_waitcnt vmcnt(0) 974; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 975; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 976; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 977; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 978; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 979; GFX10-NEXT: s_endpgm 980; 981; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: 982; GFX10-GISEL: ; %bb.0: 983; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 984; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 985; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 986; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 987; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 988; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 989; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 990; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 991; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 992; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 993; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 994; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 995; GFX10-GISEL-NEXT: s_endpgm 996; 997; GFX11-LABEL: v_ctlz_i64_trunc: 998; GFX11: ; %bb.0: 999; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1000; GFX11-NEXT: v_and_b32_e32 v2, 0x3ff, v0 1001; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1002; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v2 1003; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v2 1004; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1005; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] 1006; GFX11-NEXT: s_waitcnt vmcnt(0) 1007; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1008; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 1009; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1010; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp 1011; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 1012; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] 1013; GFX11-NEXT: s_endpgm 1014 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1015 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid 1016 %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid 1017 %val = load i64, ptr addrspace(1) %in.gep 1018 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) 1019 %trunc = trunc i64 %ctlz to i32 1020 store i32 %trunc, ptr addrspace(1) %out.gep 1021 ret void 1022} 1023; select(%val == 0, -1, ctlz(%val)). The SI/VI/GFX10/GFX11 checks below contain only ffbh/clz with no compare or select; GFX10-GISEL still checks v_cmp_eq + v_min_u32 + v_cndmask. 1024define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1025; SI-LABEL: v_ctlz_i32_sel_eq_neg1: 1026; SI: ; %bb.0: 1027; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1028; SI-NEXT: s_mov_b32 s7, 0xf000 1029; SI-NEXT: s_mov_b32 s10, 0 1030; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1031; SI-NEXT: v_mov_b32_e32 v1, 0 1032; SI-NEXT: s_mov_b32 s11, s7 1033; SI-NEXT: s_waitcnt lgkmcnt(0) 1034; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1035; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1036; SI-NEXT: s_mov_b32 s6, -1 1037; SI-NEXT: s_mov_b32 s4, s0 1038; SI-NEXT: s_mov_b32 s5, s1 1039; SI-NEXT: s_waitcnt vmcnt(0) 1040; SI-NEXT: v_ffbh_u32_e32 v0, v0 1041; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1042; SI-NEXT: s_endpgm 1043; 1044; VI-LABEL: v_ctlz_i32_sel_eq_neg1: 1045; VI: ; %bb.0: 1046; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1047; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1048; VI-NEXT: s_waitcnt lgkmcnt(0) 1049; VI-NEXT: v_mov_b32_e32 v1, s3 1050; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1051; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1052; VI-NEXT: flat_load_dword v0, v[0:1] 1053; VI-NEXT: s_mov_b32 s3, 0xf000 1054; VI-NEXT: s_mov_b32 s2, -1 1055; VI-NEXT: s_waitcnt vmcnt(0) 1056; VI-NEXT: v_ffbh_u32_e32 v0, v0 1057; VI-NEXT: buffer_store_dword v0, off, s[0:3], 
0 1058; VI-NEXT: s_endpgm 1059; 1060; EG-LABEL: v_ctlz_i32_sel_eq_neg1: 1061; EG: ; %bb.0: 1062; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1063; EG-NEXT: TEX 0 @6 1064; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 1065; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1066; EG-NEXT: CF_END 1067; EG-NEXT: PAD 1068; EG-NEXT: Fetch clause starting at 6: 1069; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1070; EG-NEXT: ALU clause starting at 8: 1071; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1072; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1073; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1074; EG-NEXT: ALU clause starting at 11: 1075; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1076; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1077; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1078; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1079; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1080; EG-NEXT: -1(nan), 2(2.802597e-45) 1081; 1082; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: 1083; GFX10: ; %bb.0: 1084; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1085; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1086; GFX10-NEXT: v_mov_b32_e32 v1, 0 1087; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1088; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1089; GFX10-NEXT: s_waitcnt vmcnt(0) 1090; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1091; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1092; GFX10-NEXT: s_endpgm 1093; 1094; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: 1095; GFX10-GISEL: ; %bb.0: 1096; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1097; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1098; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1099; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1100; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1101; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1102; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1103; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1104; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo 1105; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1106; GFX10-GISEL-NEXT: global_store_dword 
v1, v0, s[0:1] 1107; GFX10-GISEL-NEXT: s_endpgm 1108; 1109; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: 1110; GFX11: ; %bb.0: 1111; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1112; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1113; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1114; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1115; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1116; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1117; GFX11-NEXT: s_waitcnt vmcnt(0) 1118; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1119; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1120; GFX11-NEXT: s_endpgm 1121 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1122 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1123 %val = load i32, ptr addrspace(1) %in.gep 1124 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1125 %cmp = icmp eq i32 %val, 0 1126 %sel = select i1 %cmp, i32 -1, i32 %ctlz 1127 store i32 %sel, ptr addrspace(1) %out 1128 ret void 1129} 1130; Inverted-predicate form of the previous test: select(%val != 0, ctlz(%val), -1). 1131define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1132; SI-LABEL: v_ctlz_i32_sel_ne_neg1: 1133; SI: ; %bb.0: 1134; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1135; SI-NEXT: s_mov_b32 s7, 0xf000 1136; SI-NEXT: s_mov_b32 s10, 0 1137; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1138; SI-NEXT: v_mov_b32_e32 v1, 0 1139; SI-NEXT: s_mov_b32 s11, s7 1140; SI-NEXT: s_waitcnt lgkmcnt(0) 1141; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1142; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1143; SI-NEXT: s_mov_b32 s6, -1 1144; SI-NEXT: s_mov_b32 s4, s0 1145; SI-NEXT: s_mov_b32 s5, s1 1146; SI-NEXT: s_waitcnt vmcnt(0) 1147; SI-NEXT: v_ffbh_u32_e32 v0, v0 1148; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1149; SI-NEXT: s_endpgm 1150; 1151; VI-LABEL: v_ctlz_i32_sel_ne_neg1: 1152; VI: ; %bb.0: 1153; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1154; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1155; VI-NEXT: s_waitcnt lgkmcnt(0) 1156; VI-NEXT: v_mov_b32_e32 v1, 
s3 1157; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1158; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1159; VI-NEXT: flat_load_dword v0, v[0:1] 1160; VI-NEXT: s_mov_b32 s3, 0xf000 1161; VI-NEXT: s_mov_b32 s2, -1 1162; VI-NEXT: s_waitcnt vmcnt(0) 1163; VI-NEXT: v_ffbh_u32_e32 v0, v0 1164; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1165; VI-NEXT: s_endpgm 1166; 1167; EG-LABEL: v_ctlz_i32_sel_ne_neg1: 1168; EG: ; %bb.0: 1169; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1170; EG-NEXT: TEX 0 @6 1171; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 1172; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1173; EG-NEXT: CF_END 1174; EG-NEXT: PAD 1175; EG-NEXT: Fetch clause starting at 6: 1176; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1177; EG-NEXT: ALU clause starting at 8: 1178; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1179; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1180; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1181; EG-NEXT: ALU clause starting at 11: 1182; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1183; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1184; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1185; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1186; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1187; EG-NEXT: -1(nan), 2(2.802597e-45) 1188; 1189; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: 1190; GFX10: ; %bb.0: 1191; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1192; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1193; GFX10-NEXT: v_mov_b32_e32 v1, 0 1194; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1195; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1196; GFX10-NEXT: s_waitcnt vmcnt(0) 1197; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1198; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1199; GFX10-NEXT: s_endpgm 1200; 1201; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: 1202; GFX10-GISEL: ; %bb.0: 1203; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1204; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1205; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1207; 
GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1208; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1209; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 1210; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1211; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo 1212; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1213; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1214; GFX10-GISEL-NEXT: s_endpgm 1215; 1216; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: 1217; GFX11: ; %bb.0: 1218; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1219; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1220; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1221; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1222; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1223; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1224; GFX11-NEXT: s_waitcnt vmcnt(0) 1225; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1226; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1227; GFX11-NEXT: s_endpgm 1228 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1229 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1230 %val = load i32, ptr addrspace(1) %in.gep 1231 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1232 %cmp = icmp ne i32 %val, 0 1233 %sel = select i1 %cmp, i32 %ctlz, i32 -1 1234 store i32 %sel, ptr addrspace(1) %out 1235 ret void 1236} 1237 1238; TODO: Should be able to eliminate select here as well. 
1239define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1240; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1241; SI: ; %bb.0: 1242; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1243; SI-NEXT: s_mov_b32 s7, 0xf000 1244; SI-NEXT: s_mov_b32 s10, 0 1245; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1246; SI-NEXT: v_mov_b32_e32 v1, 0 1247; SI-NEXT: s_mov_b32 s11, s7 1248; SI-NEXT: s_waitcnt lgkmcnt(0) 1249; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1250; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1251; SI-NEXT: s_mov_b32 s6, -1 1252; SI-NEXT: s_mov_b32 s4, s0 1253; SI-NEXT: s_mov_b32 s5, s1 1254; SI-NEXT: s_waitcnt vmcnt(0) 1255; SI-NEXT: v_ffbh_u32_e32 v0, v0 1256; SI-NEXT: v_min_u32_e32 v0, 32, v0 1257; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1258; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1259; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1260; SI-NEXT: s_endpgm 1261; 1262; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1263; VI: ; %bb.0: 1264; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1265; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1266; VI-NEXT: s_waitcnt lgkmcnt(0) 1267; VI-NEXT: v_mov_b32_e32 v1, s3 1268; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1269; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1270; VI-NEXT: flat_load_dword v0, v[0:1] 1271; VI-NEXT: s_mov_b32 s3, 0xf000 1272; VI-NEXT: s_mov_b32 s2, -1 1273; VI-NEXT: s_waitcnt vmcnt(0) 1274; VI-NEXT: v_ffbh_u32_e32 v0, v0 1275; VI-NEXT: v_min_u32_e32 v0, 32, v0 1276; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1277; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1278; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1279; VI-NEXT: s_endpgm 1280; 1281; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1282; EG: ; %bb.0: 1283; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1284; EG-NEXT: TEX 0 @6 1285; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 1286; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1287; EG-NEXT: CF_END 1288; EG-NEXT: PAD 1289; EG-NEXT: Fetch clause starting at 6: 1290; 
EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1291; EG-NEXT: ALU clause starting at 8: 1292; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1293; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1294; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1295; EG-NEXT: ALU clause starting at 11: 1296; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1297; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1298; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1299; EG-NEXT: SETE_INT * T1.W, PV.W, literal.x, 1300; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1301; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, literal.x, 1302; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1303; EG-NEXT: -1(nan), 2(2.802597e-45) 1304; 1305; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1306; GFX10: ; %bb.0: 1307; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1308; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1309; GFX10-NEXT: v_mov_b32_e32 v1, 0 1310; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1311; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1312; GFX10-NEXT: s_waitcnt vmcnt(0) 1313; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1314; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 1315; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1316; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1317; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1318; GFX10-NEXT: s_endpgm 1319; 1320; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: 1321; GFX10-GISEL: ; %bb.0: 1322; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1323; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1324; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1325; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1326; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1327; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1328; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 1329; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 1330; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 1331; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo 1332; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1333; GFX10-GISEL-NEXT: s_endpgm 1334; 1335; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: 
1336; GFX11: ; %bb.0: 1337; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1338; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1339; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1340; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1341; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1342; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1343; GFX11-NEXT: s_waitcnt vmcnt(0) 1344; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1345; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 1346; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1347; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1348; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1349; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1350; GFX11-NEXT: s_endpgm 1351 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1352 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1353 %val = load i32, ptr addrspace(1) %in.gep 1354 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1355 %cmp = icmp eq i32 %ctlz, 32 ; the compare is on the ctlz result (against the i32 bit width), not on %val 1356 %sel = select i1 %cmp, i32 -1, i32 %ctlz 1357 store i32 %sel, ptr addrspace(1) %out 1358 ret void 1359} 1360; select(ctlz(%val) != bitwidth, ctlz(%val), -1). Every amdgcn prefix below still checks v_min_u32 + v_cmp + v_cndmask after the ffbh/clz. 1361define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1362; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1363; SI: ; %bb.0: 1364; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1365; SI-NEXT: s_mov_b32 s7, 0xf000 1366; SI-NEXT: s_mov_b32 s10, 0 1367; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1368; SI-NEXT: v_mov_b32_e32 v1, 0 1369; SI-NEXT: s_mov_b32 s11, s7 1370; SI-NEXT: s_waitcnt lgkmcnt(0) 1371; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1372; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1373; SI-NEXT: s_mov_b32 s6, -1 1374; SI-NEXT: s_mov_b32 s4, s0 1375; SI-NEXT: s_mov_b32 s5, s1 1376; SI-NEXT: s_waitcnt vmcnt(0) 1377; SI-NEXT: v_ffbh_u32_e32 v0, v0 1378; SI-NEXT: v_min_u32_e32 v0, 32, v0 1379; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1380; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1381; SI-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 1382; SI-NEXT: s_endpgm 1383; 1384; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1385; VI: ; %bb.0: 1386; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1387; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1388; VI-NEXT: s_waitcnt lgkmcnt(0) 1389; VI-NEXT: v_mov_b32_e32 v1, s3 1390; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1391; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1392; VI-NEXT: flat_load_dword v0, v[0:1] 1393; VI-NEXT: s_mov_b32 s3, 0xf000 1394; VI-NEXT: s_mov_b32 s2, -1 1395; VI-NEXT: s_waitcnt vmcnt(0) 1396; VI-NEXT: v_ffbh_u32_e32 v0, v0 1397; VI-NEXT: v_min_u32_e32 v0, 32, v0 1398; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 1399; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc 1400; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1401; VI-NEXT: s_endpgm 1402; 1403; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1404; EG: ; %bb.0: 1405; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1406; EG-NEXT: TEX 0 @6 1407; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] 1408; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1409; EG-NEXT: CF_END 1410; EG-NEXT: PAD 1411; EG-NEXT: Fetch clause starting at 6: 1412; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1413; EG-NEXT: ALU clause starting at 8: 1414; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1415; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1416; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1417; EG-NEXT: ALU clause starting at 11: 1418; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1419; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 1420; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1421; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x, 1422; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1423; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W, 1424; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 1425; EG-NEXT: -1(nan), 2(2.802597e-45) 1426; 1427; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1428; GFX10: ; %bb.0: 1429; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1430; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1431; GFX10-NEXT: v_mov_b32_e32 v1, 0 1432; GFX10-NEXT: 
s_waitcnt lgkmcnt(0) 1433; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 1434; GFX10-NEXT: s_waitcnt vmcnt(0) 1435; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1436; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 1437; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1438; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1439; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 1440; GFX10-NEXT: s_endpgm 1441; 1442; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1443; GFX10-GISEL: ; %bb.0: 1444; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1445; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1446; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1447; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1448; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1449; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1450; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 1451; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 1452; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1453; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1454; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1455; GFX10-GISEL-NEXT: s_endpgm 1456; 1457; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: 1458; GFX11: ; %bb.0: 1459; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1460; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1461; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) 1462; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1463; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1464; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] 1465; GFX11-NEXT: s_waitcnt vmcnt(0) 1466; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1467; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 1468; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1469; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 1470; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo 1471; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] 1472; GFX11-NEXT: s_endpgm 1473 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1474 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1475 %val = load i32, ptr 
addrspace(1) %in.gep 1476 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone 1477 %cmp = icmp ne i32 %ctlz, 32 1478 %sel = select i1 %cmp, i32 %ctlz, i32 -1 1479 store i32 %sel, ptr addrspace(1) %out 1480 ret void 1481} 1482 1483 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1484; SI-LABEL: v_ctlz_i8_sel_eq_neg1: 1485; SI: ; %bb.0: 1486; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1487; SI-NEXT: s_mov_b32 s7, 0xf000 1488; SI-NEXT: v_mov_b32_e32 v1, 0 1489; SI-NEXT: s_mov_b32 s10, 0 1490; SI-NEXT: s_mov_b32 s11, s7 1491; SI-NEXT: s_waitcnt lgkmcnt(0) 1492; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1493; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 1494; SI-NEXT: s_mov_b32 s6, -1 1495; SI-NEXT: s_mov_b32 s4, s0 1496; SI-NEXT: s_mov_b32 s5, s1 1497; SI-NEXT: s_waitcnt vmcnt(0) 1498; SI-NEXT: v_ffbh_u32_e32 v0, v0 1499; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1500; SI-NEXT: s_endpgm 1501; 1502; VI-LABEL: v_ctlz_i8_sel_eq_neg1: 1503; VI: ; %bb.0: 1504; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1505; VI-NEXT: s_waitcnt lgkmcnt(0) 1506; VI-NEXT: v_mov_b32_e32 v1, s3 1507; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1508; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1509; VI-NEXT: flat_load_ubyte v0, v[0:1] 1510; VI-NEXT: s_mov_b32 s3, 0xf000 1511; VI-NEXT: s_mov_b32 s2, -1 1512; VI-NEXT: s_waitcnt vmcnt(0) 1513; VI-NEXT: v_ffbh_u32_e32 v0, v0 1514; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 1515; VI-NEXT: s_endpgm 1516; 1517; EG-LABEL: v_ctlz_i8_sel_eq_neg1: 1518; EG: ; %bb.0: 1519; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1520; EG-NEXT: TEX 0 @6 1521; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1522; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1523; EG-NEXT: CF_END 1524; EG-NEXT: PAD 1525; EG-NEXT: Fetch clause starting at 6: 1526; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1527; EG-NEXT: ALU clause starting at 8: 1528; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1529; EG-NEXT: ALU 
clause starting at 9: 1530; EG-NEXT: FFBH_UINT T0.W, T0.X, 1531; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1532; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1533; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1534; EG-NEXT: LSHL * T1.W, PS, literal.y, 1535; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1536; EG-NEXT: LSHL T0.X, PV.W, PS, 1537; EG-NEXT: LSHL * T0.W, literal.x, PS, 1538; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1539; EG-NEXT: MOV T0.Y, 0.0, 1540; EG-NEXT: MOV * T0.Z, 0.0, 1541; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1542; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1543; 1544; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: 1545; GFX10: ; %bb.0: 1546; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1547; GFX10-NEXT: v_mov_b32_e32 v1, 0 1548; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1549; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] 1550; GFX10-NEXT: s_waitcnt vmcnt(0) 1551; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 1552; GFX10-NEXT: global_store_byte v1, v0, s[0:1] 1553; GFX10-NEXT: s_endpgm 1554; 1555; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: 1556; GFX10-GISEL: ; %bb.0: 1557; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1558; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1559; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1560; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 1561; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 1562; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 1563; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo 1564; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1565; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1566; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1567; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 1568; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 1569; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe8, v1 1570; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo 1571; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 1572; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 1573; GFX10-GISEL-NEXT: s_endpgm 1574; 1575; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: 
1576; GFX11: ; %bb.0: 1577; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1578; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 1579; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1580; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] 1581; GFX11-NEXT: s_waitcnt vmcnt(0) 1582; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 1583; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] 1584; GFX11-NEXT: s_endpgm 1585 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1586 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid 1587 %val = load i8, ptr addrspace(1) %valptr.gep 1588 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone 1589 %cmp = icmp eq i8 %val, 0 1590 %sel = select i1 %cmp, i8 -1, i8 %ctlz ; i8 form of the eq_neg1 pattern: -1 when %val is zero, otherwise the ctlz result 1591 store i8 %sel, ptr addrspace(1) %out 1592 ret void 1593} 1594; i16 form of select(%val == 0, -1, ctlz(%val)); %valptr is loaded directly, with no workitem-id gep. 1595 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1596; SI-LABEL: v_ctlz_i16_sel_eq_neg1: 1597; SI: ; %bb.0: 1598; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1599; SI-NEXT: s_mov_b32 s7, 0xf000 1600; SI-NEXT: s_mov_b32 s6, -1 1601; SI-NEXT: s_mov_b32 s10, s6 1602; SI-NEXT: s_mov_b32 s11, s7 1603; SI-NEXT: s_waitcnt lgkmcnt(0) 1604; SI-NEXT: s_mov_b32 s8, s2 1605; SI-NEXT: s_mov_b32 s9, s3 1606; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 1607; SI-NEXT: s_mov_b32 s4, s0 1608; SI-NEXT: s_mov_b32 s5, s1 1609; SI-NEXT: s_waitcnt vmcnt(0) 1610; SI-NEXT: v_ffbh_u32_e32 v0, v0 1611; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1612; SI-NEXT: s_endpgm 1613; 1614; VI-LABEL: v_ctlz_i16_sel_eq_neg1: 1615; VI: ; %bb.0: 1616; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1617; VI-NEXT: s_mov_b32 s7, 0xf000 1618; VI-NEXT: s_mov_b32 s6, -1 1619; VI-NEXT: s_mov_b32 s10, s6 1620; VI-NEXT: s_mov_b32 s11, s7 1621; VI-NEXT: s_waitcnt lgkmcnt(0) 1622; VI-NEXT: s_mov_b32 s8, s2 1623; VI-NEXT: s_mov_b32 s9, s3 1624; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 1625; VI-NEXT: v_mov_b32_e32 v1, 0xffff 1626; VI-NEXT: s_mov_b32 s4, s0 1627; 
VI-NEXT: s_mov_b32 s5, s1 1628; VI-NEXT: s_waitcnt vmcnt(0) 1629; VI-NEXT: v_ffbh_u32_e32 v2, v0 1630; VI-NEXT: v_min_u32_e32 v2, 32, v2 1631; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 1632; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1633; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc 1634; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 1635; VI-NEXT: s_endpgm 1636; 1637; EG-LABEL: v_ctlz_i16_sel_eq_neg1: 1638; EG: ; %bb.0: 1639; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1640; EG-NEXT: TEX 0 @6 1641; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1642; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1643; EG-NEXT: CF_END 1644; EG-NEXT: PAD 1645; EG-NEXT: Fetch clause starting at 6: 1646; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1647; EG-NEXT: ALU clause starting at 8: 1648; EG-NEXT: MOV * T0.X, KC0[2].Z, 1649; EG-NEXT: ALU clause starting at 9: 1650; EG-NEXT: FFBH_UINT T0.W, T0.X, 1651; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1652; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1653; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1654; EG-NEXT: LSHL * T1.W, PS, literal.y, 1655; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1656; EG-NEXT: LSHL T0.X, PV.W, PS, 1657; EG-NEXT: LSHL * T0.W, literal.x, PS, 1658; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1659; EG-NEXT: MOV T0.Y, 0.0, 1660; EG-NEXT: MOV * T0.Z, 0.0, 1661; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1662; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1663; 1664; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: 1665; GFX10: ; %bb.0: 1666; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1667; GFX10-NEXT: v_mov_b32_e32 v0, 0 1668; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] 1670; GFX10-NEXT: s_waitcnt vmcnt(0) 1671; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 1672; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 1673; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 1674; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 1675; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo 1676; GFX10-NEXT: global_store_short v0, v1, s[0:1] 1677; GFX10-NEXT: 
s_endpgm 1678; 1679; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: 1680; GFX10-GISEL: ; %bb.0: 1681; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1682; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 1683; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1684; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] 1685; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 1686; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 1687; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 1688; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 1689; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, -16, v2 1690; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 1691; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo 1692; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] 1693; GFX10-GISEL-NEXT: s_endpgm 1694; 1695; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: 1696; GFX11: ; %bb.0: 1697; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1698; GFX11-NEXT: v_mov_b32_e32 v0, 0 1699; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1700; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] 1701; GFX11-NEXT: s_waitcnt vmcnt(0) 1702; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 1703; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 1704; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1705; GFX11-NEXT: v_min_u32_e32 v2, 32, v2 1706; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 1707; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 1708; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo 1709; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] 1710; GFX11-NEXT: s_endpgm 1711 %val = load i16, ptr addrspace(1) %valptr 1712 %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone 1713 %cmp = icmp eq i16 %val, 0 1714 %sel = select i1 %cmp, i16 -1, i16 %ctlz 1715 store i16 %sel, ptr addrspace(1) %out 1716 ret void 1717} 1718; NOTE(review): v_ctlz_i16_sel_eq_neg1 above is the function that loads without a gep - confirm which function the FIXME below is meant for. 1719; FIXME: Need to handle non-uniform case for function below (load without gep). 
; Kernel under test: loads the i7 element of %valptr selected by the value of
; @llvm.amdgcn.workitem.id.x(), counts its leading zeros with llvm.ctlz.i7
; (second operand `i1 false`), and substitutes -1 for the count when the
; loaded value is 0. The selected i7 is stored to %out.
; NOTE(review): the FIXME that precedes this kernel refers to a "load without
; gep", yet the load below goes through %valptr.gep; the i16 variant above is
; the one that loads %valptr directly. Confirm whether that FIXME is stale or
; was meant for a different kernel.
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v0
; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBH_UINT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 127(1.779649e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_add_nc_u16 v1, 0xffe7, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u8 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX11-NEXT: global_store_b8 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  ; The element index comes from the workitem id intrinsic, so the load
  ; address is computed per call to it rather than taken from %valptr as-is.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
  %val = load i7, ptr addrspace(1) %valptr.gep
  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
  ; A zero input selects the all-ones i7 constant instead of the ctlz result.
  %cmp = icmp eq i7 %val, 0
  %sel = select i1 %cmp, i7 -1, i7 %ctlz
  store i7 %sel, ptr addrspace(1) %out
  ret void
}
