; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s

; Codegen tests for llvm.ctlz called with its second (i1) operand set to true.
; Each kernel is checked against the four llc configurations listed above.
; Kernels named s_* take the operand as a kernel argument; kernels named v_*
; load it from global memory.

declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone

declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone

declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone

declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone

declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

; i32 operand passed as a kernel argument; the ctlz result is stored to %out.
define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s4, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    FFBH_UINT * T1.X, KC0[2].Z,
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %ctlz, ptr addrspace(1) %out, align 4
  ret void
}

; i32 operand loaded from %valptr at the index given by the workitem id.
define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 2, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT T0.X, T0.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
  %val = load i32, ptr addrspace(1) %in.gep, align 4
  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %ctlz, ptr addrspace(1) %out, align 4
  ret void
}

; <2 x i32> operand loaded from %valptr at the index given by the workitem id.
define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v1, v1
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.Y, T0.Y,
; EG-NEXT:    FFBH_UINT T0.X, T0.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
  %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
  store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
  ret void
}

; <4 x i32> operand loaded from %valptr at the index given by the workitem id.
define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v3, v3
; SI-NEXT:    v_ffbh_u32_e32 v2, v2
; SI-NEXT:    v_ffbh_u32_e32 v1, v1
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_mov_b32_e32 v4, s0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v3, v3
; VI-NEXT:    v_ffbh_u32_e32 v2, v2
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.W,
; EG-NEXT:    FFBH_UINT * T0.Z, T0.Z,
; EG-NEXT:    FFBH_UINT * T0.Y, T0.Y,
; EG-NEXT:    FFBH_UINT T0.X, T0.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
  %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
  store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
  ret void
}

; i8 operand passed as a kernel argument.
; NOTE(review) - %ret (the select that yields 32 when %val is zero) is never
; used; the store writes %ctlz, so the checks below contain no compare/select.
; Confirm whether that is intended. Storing %ret instead would require
; regenerating the assertions with update_llc_test_checks.py.
define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i8_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s2, s2, 24
; SI-NEXT:    s_flbit_i32_b32 s4, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s2, s2, 24
; VI-NEXT:    s_flbit_i32_b32 s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i8_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 40, #3
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT:    FFBH_UINT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    s_lshl_b32 s2, s2, 24
; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i8 %val, 0
  %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
  store i8 %ctlz, ptr addrspace(1) %out, align 4
  ret void
}

; i16 operand passed as a kernel argument.
; NOTE(review) - %ret is computed but unused; the store writes %ctlz, so the
; checks below contain no compare/select. Confirm whether that is intended.
define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i16_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s2, s2, 16
; SI-NEXT:    s_flbit_i32_b32 s4, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s2, s2, 16
; VI-NEXT:    s_flbit_i32_b32 s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i16_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, 0.0,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    FFBH_UINT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    s_lshl_b32 s2, s2, 16
; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT:    global_store_short v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i16 %val, 0
  %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
  store i16 %ctlz, ptr addrspace(1) %out, align 4
  ret void
}

; i32 operand passed as a kernel argument.
; NOTE(review) - %ret is computed but unused; the store writes %ctlz, so the
; checks below contain no compare/select. Confirm whether that is intended.
define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i32_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b32 s4, s2
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b32 s2, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i32_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    LSHR * T0.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    FFBH_UINT * T1.X, KC0[2].Z,
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i32 %val, 0
  %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
  store i32 %ctlz, ptr addrspace(1) %out, align 4
  ret void
}

; i64 operand passed as a kernel argument.
; NOTE(review) - %ret is computed but unused; the store writes %ctlz, so the
; checks below contain no select on the input. Confirm whether that is intended.
define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i64_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    FFBH_UINT * T0.W, KC0[2].W,
; EG-NEXT:    FFBH_UINT T1.W, KC0[3].X,
; EG-NEXT:    ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, KC0[3].X, PS, PV.W,
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i64 %val, 0
  %ret = select i1 %ctlz_ret, i64 %ctlz, i64 32
  store i64 %ctlz, ptr addrspace(1) %out, align 4
  ret void
}

; i8 operand loaded from %arrayidx; the stored value is the select result
; (%ret), which is 32 when the loaded value is zero.
define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i8_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
; SI-NEXT:    v_ffbh_u32_e32 v1, v1
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i8_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT:    FFBH_UINT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %val = load i8, ptr addrspace(1) %arrayidx, align 1
  %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i8 %val, 0
  %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
  store i8 %ret, ptr addrspace(1) %out, align 4
  ret void
}

; i16 operand loaded from %arrayidx with align 1 (the amdgcn checks assemble it
; from two byte loads); the stored value is the select result (%ret).
define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i16_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v0, v0, v1
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_ffbh_u32_e32 v1, v1
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s4, s2, 1
; VI-NEXT:    s_addc_u32 s5, s3, 0
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_readfirstlane_b32 s2, v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s3, v0
; VI-NEXT:    s_lshl_b32 s2, s2, 8
; VI-NEXT:    s_or_b32 s2, s2, s3
; VI-NEXT:    s_lshl_b32 s3, s2, 16
; VI-NEXT:    s_and_b32 s2, s2, 0xffff
; VI-NEXT:    s_flbit_i32_b32 s3, s3
; VI-NEXT:    s_cmp_lg_u32 s2, 0
; VI-NEXT:    s_cselect_b32 s2, s3, 32
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_short v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i16_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    FFBH_UINT T0.W, PV.W,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, T1.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %val = load i16, ptr addrspace(1) %arrayidx, align 1
  %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i16 %val, 0
  %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
  store i16 %ret, ptr addrspace(1) %out, align 4
  ret void
}

; i32 operand loaded from %arrayidx with align 1 (the amdgcn checks assemble it
; from four byte loads); the stored value is the select result (%ret). The SI
; and VI checks show a v_min_u32 against 32 where the other configurations show
; a compare/conditional-select.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_with_select:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:3
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:2
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_or_b32_e32 v1, v1, v3
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_add_u32 s4, s2, 3
; VI-NEXT:    s_addc_u32 s5, s3, 0
; VI-NEXT:    v_mov_b32_e32 v2, s4
; VI-NEXT:    v_mov_b32_e32 v3, s5
; VI-NEXT:    s_add_u32 s4, s2, 2
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_addc_u32 s5, s3, 0
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_add_u32 s2, s2, 1
; VI-NEXT:    s_addc_u32 s3, s3, 0
; VI-NEXT:    v_mov_b32_e32 v4, s4
; VI-NEXT:    v_mov_b32_e32 v7, s3
; VI-NEXT:    v_mov_b32_e32 v5, s5
; VI-NEXT:    v_mov_b32_e32 v6, s2
; VI-NEXT:    flat_load_ubyte v2, v[2:3]
; VI-NEXT:    flat_load_ubyte v3, v[4:5]
; VI-NEXT:    flat_load_ubyte v4, v[6:7]
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v0, v2, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v2, 32, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_with_select:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    LSHL * T0.W, T1.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT:    FFBH_UINT * T1.W, PV.W,
; EG-NEXT:    CNDE_INT T0.X, T0.W, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
  %ctlz_ret = icmp ne i32 %val, 0
  %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
  store i32 %ret, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL:
v_ctlz_zero_undef_i64_with_select: 906; SI: ; %bb.0: 907; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 908; SI-NEXT: s_mov_b32 s3, 0xf000 909; SI-NEXT: s_mov_b32 s2, -1 910; SI-NEXT: s_mov_b32 s10, s2 911; SI-NEXT: s_mov_b32 s11, s3 912; SI-NEXT: s_waitcnt lgkmcnt(0) 913; SI-NEXT: s_mov_b32 s8, s6 914; SI-NEXT: s_mov_b32 s9, s7 915; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5 916; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7 917; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 918; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1 919; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 920; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3 921; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 922; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 923; SI-NEXT: s_mov_b32 s0, s4 924; SI-NEXT: s_mov_b32 s1, s5 925; SI-NEXT: s_waitcnt vmcnt(7) 926; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 927; SI-NEXT: s_waitcnt vmcnt(6) 928; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 929; SI-NEXT: s_waitcnt vmcnt(4) 930; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 931; SI-NEXT: s_waitcnt vmcnt(2) 932; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 933; SI-NEXT: s_waitcnt vmcnt(1) 934; SI-NEXT: v_or_b32_e32 v0, v0, v6 935; SI-NEXT: s_waitcnt vmcnt(0) 936; SI-NEXT: v_or_b32_e32 v1, v1, v7 937; SI-NEXT: v_or_b32_e32 v2, v3, v2 938; SI-NEXT: v_or_b32_e32 v3, v5, v4 939; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 940; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 941; SI-NEXT: v_or_b32_e32 v0, v1, v0 942; SI-NEXT: v_or_b32_e32 v1, v3, v2 943; SI-NEXT: v_ffbh_u32_e32 v1, v1 944; SI-NEXT: v_ffbh_u32_e32 v0, v0 945; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 946; SI-NEXT: v_min_u32_e32 v0, v1, v0 947; SI-NEXT: v_min_u32_e32 v0, 64, v0 948; SI-NEXT: v_mov_b32_e32 v1, 0 949; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 950; SI-NEXT: s_endpgm 951; 952; VI-LABEL: v_ctlz_zero_undef_i64_with_select: 953; VI: ; %bb.0: 954; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 955; VI-NEXT: 
s_waitcnt lgkmcnt(0) 956; VI-NEXT: s_add_u32 s4, s2, 5 957; VI-NEXT: s_addc_u32 s5, s3, 0 958; VI-NEXT: v_mov_b32_e32 v0, s4 959; VI-NEXT: v_mov_b32_e32 v1, s5 960; VI-NEXT: s_add_u32 s4, s2, 4 961; VI-NEXT: s_addc_u32 s5, s3, 0 962; VI-NEXT: v_mov_b32_e32 v2, s4 963; VI-NEXT: v_mov_b32_e32 v3, s5 964; VI-NEXT: s_add_u32 s4, s2, 7 965; VI-NEXT: s_addc_u32 s5, s3, 0 966; VI-NEXT: v_mov_b32_e32 v4, s4 967; VI-NEXT: v_mov_b32_e32 v5, s5 968; VI-NEXT: s_add_u32 s4, s2, 6 969; VI-NEXT: s_addc_u32 s5, s3, 0 970; VI-NEXT: v_mov_b32_e32 v7, s5 971; VI-NEXT: v_mov_b32_e32 v6, s4 972; VI-NEXT: s_add_u32 s4, s2, 3 973; VI-NEXT: s_addc_u32 s5, s3, 0 974; VI-NEXT: v_mov_b32_e32 v9, s5 975; VI-NEXT: v_mov_b32_e32 v8, s4 976; VI-NEXT: s_add_u32 s4, s2, 2 977; VI-NEXT: s_addc_u32 s5, s3, 0 978; VI-NEXT: v_mov_b32_e32 v11, s5 979; VI-NEXT: v_mov_b32_e32 v10, s4 980; VI-NEXT: s_add_u32 s4, s2, 1 981; VI-NEXT: flat_load_ubyte v12, v[0:1] 982; VI-NEXT: flat_load_ubyte v13, v[2:3] 983; VI-NEXT: flat_load_ubyte v4, v[4:5] 984; VI-NEXT: flat_load_ubyte v5, v[6:7] 985; VI-NEXT: s_addc_u32 s5, s3, 0 986; VI-NEXT: v_mov_b32_e32 v0, s4 987; VI-NEXT: flat_load_ubyte v6, v[8:9] 988; VI-NEXT: v_mov_b32_e32 v2, s2 989; VI-NEXT: v_mov_b32_e32 v1, s5 990; VI-NEXT: v_mov_b32_e32 v3, s3 991; VI-NEXT: flat_load_ubyte v7, v[10:11] 992; VI-NEXT: flat_load_ubyte v0, v[0:1] 993; VI-NEXT: flat_load_ubyte v2, v[2:3] 994; VI-NEXT: v_mov_b32_e32 v1, 0 995; VI-NEXT: s_waitcnt vmcnt(7) 996; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 997; VI-NEXT: s_waitcnt vmcnt(6) 998; VI-NEXT: v_or_b32_e32 v3, v3, v13 999; VI-NEXT: s_waitcnt vmcnt(5) 1000; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 1001; VI-NEXT: s_waitcnt vmcnt(4) 1002; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1003; VI-NEXT: v_or_b32_e32 v3, v4, v3 1004; VI-NEXT: v_ffbh_u32_e32 v3, v3 1005; VI-NEXT: s_waitcnt vmcnt(3) 1006; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 1007; VI-NEXT: s_waitcnt vmcnt(2) 1008; VI-NEXT: 
v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1009; VI-NEXT: s_waitcnt vmcnt(1) 1010; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1011; VI-NEXT: s_waitcnt vmcnt(0) 1012; VI-NEXT: v_or_b32_e32 v0, v0, v2 1013; VI-NEXT: v_or_b32_e32 v0, v4, v0 1014; VI-NEXT: v_ffbh_u32_e32 v0, v0 1015; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 1016; VI-NEXT: v_min_u32_e32 v0, v0, v3 1017; VI-NEXT: v_mov_b32_e32 v3, s1 1018; VI-NEXT: v_min_u32_e32 v0, 64, v0 1019; VI-NEXT: v_mov_b32_e32 v2, s0 1020; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1021; VI-NEXT: s_endpgm 1022; 1023; EG-LABEL: v_ctlz_zero_undef_i64_with_select: 1024; EG: ; %bb.0: 1025; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1026; EG-NEXT: TEX 3 @6 1027; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 1028; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1029; EG-NEXT: CF_END 1030; EG-NEXT: PAD 1031; EG-NEXT: Fetch clause starting at 6: 1032; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 1033; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1 1034; EG-NEXT: VTX_READ_16 T3.X, T0.X, 6, #1 1035; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1036; EG-NEXT: ALU clause starting at 14: 1037; EG-NEXT: MOV * T0.X, KC0[2].Z, 1038; EG-NEXT: ALU clause starting at 15: 1039; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1040; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1041; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 1042; EG-NEXT: FFBH_UINT T1.W, PV.W, 1043; EG-NEXT: LSHL * T2.W, T3.X, literal.x, 1044; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1045; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W, 1046; EG-NEXT: OR_INT * T1.W, PS, T2.X, 1047; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1048; EG-NEXT: FFBH_UINT T2.W, PS, 1049; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 1050; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1051; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W, 1052; EG-NEXT: MOV T0.Y, 0.0, 1053; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1054; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1055; 1056; GFX9-GISEL-LABEL: 
v_ctlz_zero_undef_i64_with_select: 1057; GFX9-GISEL: ; %bb.0: 1058; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1059; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1060; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1061; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] 1062; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 1063; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 1064; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 1065; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 1066; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 1067; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 1068; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 1069; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) 1070; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 1071; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) 1072; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1073; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) 1074; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 1075; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0 1076; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 1077; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5 1078; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 1079; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7 1080; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1081; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5 1082; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0 1083; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v2 1084; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v3 1085; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 1086; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] 1087; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 1088; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc 1089; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 1090; GFX9-GISEL-NEXT: s_endpgm 1091 %val = load i64, ptr addrspace(1) %arrayidx, align 1 1092 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone 1093 %ctlz_ret = icmp ne i64 %val, 0 1094 %ret = select i1 %ctlz_ret, i64 %ctlz, i64 64 1095 
store i64 %ret, ptr addrspace(1) %out, align 4 1096 ret void 1097} 1098 1099define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1100; SI-LABEL: v_ctlz_zero_undef_i8: 1101; SI: ; %bb.0: 1102; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1103; SI-NEXT: s_mov_b32 s7, 0xf000 1104; SI-NEXT: v_mov_b32_e32 v1, 0 1105; SI-NEXT: s_mov_b32 s10, 0 1106; SI-NEXT: s_mov_b32 s11, s7 1107; SI-NEXT: s_waitcnt lgkmcnt(0) 1108; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1109; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 1110; SI-NEXT: s_mov_b32 s6, -1 1111; SI-NEXT: s_mov_b32 s4, s0 1112; SI-NEXT: s_mov_b32 s5, s1 1113; SI-NEXT: s_waitcnt vmcnt(0) 1114; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 1115; SI-NEXT: v_ffbh_u32_e32 v0, v0 1116; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1117; SI-NEXT: s_endpgm 1118; 1119; VI-LABEL: v_ctlz_zero_undef_i8: 1120; VI: ; %bb.0: 1121; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1122; VI-NEXT: s_waitcnt lgkmcnt(0) 1123; VI-NEXT: v_mov_b32_e32 v1, s3 1124; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1125; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1126; VI-NEXT: flat_load_ubyte v0, v[0:1] 1127; VI-NEXT: s_waitcnt vmcnt(0) 1128; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 1129; VI-NEXT: v_ffbh_u32_e32 v2, v0 1130; VI-NEXT: v_mov_b32_e32 v0, s0 1131; VI-NEXT: v_mov_b32_e32 v1, s1 1132; VI-NEXT: flat_store_byte v[0:1], v2 1133; VI-NEXT: s_endpgm 1134; 1135; EG-LABEL: v_ctlz_zero_undef_i8: 1136; EG: ; %bb.0: 1137; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1138; EG-NEXT: TEX 0 @6 1139; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 1140; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1141; EG-NEXT: CF_END 1142; EG-NEXT: PAD 1143; EG-NEXT: Fetch clause starting at 6: 1144; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1145; EG-NEXT: ALU clause starting at 8: 1146; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1147; EG-NEXT: ALU clause starting at 9: 1148; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1149; EG-NEXT: 
24(3.363116e-44), 0(0.000000e+00) 1150; EG-NEXT: FFBH_UINT T0.W, PV.W, 1151; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1152; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1153; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1154; EG-NEXT: LSHL * T1.W, PS, literal.y, 1155; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1156; EG-NEXT: LSHL T0.X, PV.W, PS, 1157; EG-NEXT: LSHL * T0.W, literal.x, PS, 1158; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1159; EG-NEXT: MOV T0.Y, 0.0, 1160; EG-NEXT: MOV * T0.Z, 0.0, 1161; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1162; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1163; 1164; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: 1165; GFX9-GISEL: ; %bb.0: 1166; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1167; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1168; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1169; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 1170; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 1171; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 1172; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc 1173; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1174; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1175; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1176; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 1177; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 1178; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 1179; GFX9-GISEL-NEXT: s_endpgm 1180 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1181 %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid 1182 %val = load i8, ptr addrspace(1) %in.gep 1183 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone 1184 store i8 %ctlz, ptr addrspace(1) %out 1185 ret void 1186} 1187 1188define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { 1189; SI-LABEL: s_ctlz_zero_undef_i64: 1190; SI: ; %bb.0: 1191; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 1192; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1193; SI-NEXT: s_mov_b32 s3, 0xf000 1194; SI-NEXT: 
s_mov_b32 s2, -1 1195; SI-NEXT: s_waitcnt lgkmcnt(0) 1196; SI-NEXT: s_flbit_i32_b64 s4, s[6:7] 1197; SI-NEXT: v_mov_b32_e32 v1, 0 1198; SI-NEXT: v_mov_b32_e32 v0, s4 1199; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1200; SI-NEXT: s_endpgm 1201; 1202; VI-LABEL: s_ctlz_zero_undef_i64: 1203; VI: ; %bb.0: 1204; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c 1205; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1206; VI-NEXT: v_mov_b32_e32 v1, 0 1207; VI-NEXT: s_waitcnt lgkmcnt(0) 1208; VI-NEXT: s_flbit_i32_b64 s0, s[0:1] 1209; VI-NEXT: v_mov_b32_e32 v2, s2 1210; VI-NEXT: v_mov_b32_e32 v0, s0 1211; VI-NEXT: v_mov_b32_e32 v3, s3 1212; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1213; VI-NEXT: s_endpgm 1214; 1215; EG-LABEL: s_ctlz_zero_undef_i64: 1216; EG: ; %bb.0: 1217; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1218; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1219; EG-NEXT: CF_END 1220; EG-NEXT: PAD 1221; EG-NEXT: ALU clause starting at 4: 1222; EG-NEXT: FFBH_UINT * T0.W, KC0[4].W, 1223; EG-NEXT: FFBH_UINT T1.W, KC0[5].X, 1224; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 1225; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1226; EG-NEXT: CNDE_INT T0.X, KC0[5].X, PS, PV.W, 1227; EG-NEXT: MOV T0.Y, 0.0, 1228; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1229; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1230; 1231; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64: 1232; GFX9-GISEL: ; %bb.0: 1233; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c 1234; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 1235; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 1236; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 1237; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1238; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[0:1] 1239; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 1240; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 1241; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 1242; GFX9-GISEL-NEXT: s_endpgm 1243 %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) 1244 store i64 %ctlz, ptr addrspace(1) %out 1245 ret void 1246} 1247 
; Uniform i64 input passed as a kernel argument: count leading zeros with the
; second ctlz operand set to true (zero input is poison), keep only the low
; 32 bits of the count, and store that i32 to %out.
define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    FFBH_UINT * T0.W, KC0[2].W,
; EG-NEXT:    FFBH_UINT T1.W, KC0[3].X,
; EG-NEXT:    ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, KC0[3].X, PS, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %lz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
  %lz.lo = trunc i64 %lz to i32
  store i32 %lz.lo, ptr addrspace(1) %out
  ret void
}

; Per-workitem i64 input: each workitem loads element [workitem.id.x] of %in,
; counts leading zeros (zero input is poison) and stores the full i64 count to
; the matching element of %out. The GCN check blocks expect two 32-bit
; v_ffbh_u32 results combined with an add of 32 and a v_min_u32.
define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v2, v2
; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
; SI-NEXT:    v_ffbh_u32_e32 v3, v3
; SI-NEXT:    v_min_u32_e32 v2, v2, v3
; SI-NEXT:    v_mov_b32_e32 v3, v1
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT:    v_mov_b32_e32 v2, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v0
; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT:    v_ffbh_u32_e32 v1, v1
; VI-NEXT:    v_min_u32_e32 v1, v0, v1
; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T1.W, T0.X,
; EG-NEXT:    FFBH_UINT T2.W, T0.Y,
; EG-NEXT:    ADD_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.Y, PS, PV.W,
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr i64, ptr addrspace(1) %in, i32 %lane
  %dst = getelementptr i64, ptr addrspace(1) %out, i32 %lane
  %x = load i64, ptr addrspace(1) %src
  %lz = call i64 @llvm.ctlz.i64(i64 %x, i1 true)
  store i64 %lz, ptr addrspace(1) %dst
  ret void
}

; Same per-workitem i64 count as v_ctlz_zero_undef_i64, but the result is
; truncated to i32 before the store, so %out is indexed as an i32 array while
; %in is indexed as an i64 array.
define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT:    v_mov_b32_e32 v2, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v3
; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT:    v_ffbh_u32_e32 v3, v4
; SI-NEXT:    v_min_u32_e32 v0, v0, v3
; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT:    v_mov_b32_e32 v4, s1
; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v0, v1
; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT:    v_ffbh_u32_e32 v1, v2
; VI-NEXT:    v_min_u32_e32 v0, v0, v1
; VI-NEXT:    flat_store_dword v[3:4], v0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T1.XY, T1.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T1.X,
; EG-NEXT:    LSHL T0.Z, T0.X, literal.x,
; EG-NEXT:    FFBH_UINT T1.W, T1.Y,
; EG-NEXT:    ADD_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT:    CNDE_INT T0.X, T1.Y, PS, PV.W,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT:    LSHR * T1.X, PV.W, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr i64, ptr addrspace(1) %in, i32 %lane
  %dst = getelementptr i32, ptr addrspace(1) %out, i32 %lane
  %x = load i64, ptr addrspace(1) %src
  %lz = call i64 @llvm.ctlz.i64(i64 %x, i1 true)
  %lz.lo = trunc i64 %lz to i32
  store i32 %lz.lo, ptr addrspace(1) %dst
  ret void
}

; Per-workitem i32 input with the pattern "x == 0 ? -1 : ctlz(x)". In the
; SI and VI check blocks the compare and select are gone and only v_ffbh_u32
; remains (presumably because that instruction already yields -1 for a zero
; input - confirm against the ISA manual). The GlobalISel check block still
; expects an explicit v_cmp_eq_u32 plus v_cndmask_b32.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr i32, ptr addrspace(1) %valptr, i32 %lane
  %x = load i32, ptr addrspace(1) %src
  %lz = call i32 @llvm.ctlz.i32(i32 %x, i1 true) nounwind readnone
  %is.zero = icmp eq i32 %x, 0
  %res = select i1 %is.zero, i32 -1, i32 %lz
  store i32 %res, ptr addrspace(1) %out
  ret void
}

; Mirror image of v_ctlz_zero_undef_i32_sel_eq_neg1 with the predicate and the
; select arms swapped: "x != 0 ? ctlz(x) : -1". The SI, VI and EG check
; blocks are the same instruction sequences as for the eq form; the GlobalISel
; block uses v_cmp_ne_u32 with the v_cndmask_b32 operands swapped.
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbh_u32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbh_u32_e32 v2, v0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBH_UINT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
; GFX9-GISEL:       ; %bb.0:
; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-GISEL-NEXT:    s_endpgm
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %src = getelementptr i32, ptr addrspace(1) %valptr, i32 %lane
  %x = load i32, ptr addrspace(1) %src
  %lz = call i32 @llvm.ctlz.i32(i32 %x, i1 true) nounwind readnone
  %is.nonzero = icmp ne i32 %x, 0
  %res = select i1 %is.nonzero, i32 %lz, i32 -1
  store i32 %res, ptr addrspace(1) %out
  ret void
}
1641 1642define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1643; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: 1644; SI: ; %bb.0: 1645; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1646; SI-NEXT: s_mov_b32 s7, 0xf000 1647; SI-NEXT: v_mov_b32_e32 v1, 0 1648; SI-NEXT: s_mov_b32 s10, 0 1649; SI-NEXT: s_mov_b32 s11, s7 1650; SI-NEXT: s_waitcnt lgkmcnt(0) 1651; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1652; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64 1653; SI-NEXT: s_mov_b32 s6, -1 1654; SI-NEXT: s_mov_b32 s4, s0 1655; SI-NEXT: s_mov_b32 s5, s1 1656; SI-NEXT: s_waitcnt vmcnt(0) 1657; SI-NEXT: v_ffbh_u32_e32 v0, v0 1658; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1659; SI-NEXT: s_endpgm 1660; 1661; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: 1662; VI: ; %bb.0: 1663; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1664; VI-NEXT: s_waitcnt lgkmcnt(0) 1665; VI-NEXT: v_mov_b32_e32 v1, s3 1666; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1667; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1668; VI-NEXT: flat_load_ubyte v0, v[0:1] 1669; VI-NEXT: s_waitcnt vmcnt(0) 1670; VI-NEXT: v_ffbh_u32_e32 v2, v0 1671; VI-NEXT: v_mov_b32_e32 v0, s0 1672; VI-NEXT: v_mov_b32_e32 v1, s1 1673; VI-NEXT: flat_store_byte v[0:1], v2 1674; VI-NEXT: s_endpgm 1675; 1676; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: 1677; EG: ; %bb.0: 1678; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1679; EG-NEXT: TEX 0 @6 1680; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1681; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1682; EG-NEXT: CF_END 1683; EG-NEXT: PAD 1684; EG-NEXT: Fetch clause starting at 6: 1685; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 1686; EG-NEXT: ALU clause starting at 8: 1687; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, 1688; EG-NEXT: ALU clause starting at 9: 1689; EG-NEXT: FFBH_UINT T0.W, T0.X, 1690; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1691; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1692; EG-NEXT: AND_INT T0.W, PV.W, 
literal.x, 1693; EG-NEXT: LSHL * T1.W, PS, literal.y, 1694; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 1695; EG-NEXT: LSHL T0.X, PV.W, PS, 1696; EG-NEXT: LSHL * T0.W, literal.x, PS, 1697; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1698; EG-NEXT: MOV T0.Y, 0.0, 1699; EG-NEXT: MOV * T0.Z, 0.0, 1700; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1701; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1702; 1703; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: 1704; GFX9-GISEL: ; %bb.0: 1705; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1706; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 1707; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1708; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 1709; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 1710; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 1711; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc 1712; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off 1713; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1714; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff 1715; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1716; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v0 1717; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 1718; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD 1719; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc 1720; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 1721; GFX9-GISEL-NEXT: s_endpgm 1722 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1723 %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid 1724 %val = load i8, ptr addrspace(1) %valptr.gep 1725 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone 1726 %cmp = icmp eq i8 %val, 0 1727 %sel = select i1 %cmp, i8 -1, i8 %ctlz 1728 store i8 %sel, ptr addrspace(1) %out 1729 ret void 1730} 1731 1732define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1733; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 1734; SI: ; %bb.0: 1735; SI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 1736; SI-NEXT: s_mov_b32 s7, 0xf000 1737; SI-NEXT: s_mov_b32 s10, 0 1738; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1739; SI-NEXT: v_mov_b32_e32 v1, 0 1740; SI-NEXT: s_mov_b32 s11, s7 1741; SI-NEXT: s_waitcnt lgkmcnt(0) 1742; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1743; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1744; SI-NEXT: s_mov_b32 s6, -1 1745; SI-NEXT: s_mov_b32 s4, s0 1746; SI-NEXT: s_mov_b32 s5, s1 1747; SI-NEXT: s_waitcnt vmcnt(0) 1748; SI-NEXT: v_ffbh_u32_e32 v1, v0 1749; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1750; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc 1751; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 1752; SI-NEXT: s_waitcnt vmcnt(0) 1753; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 1754; SI-NEXT: s_waitcnt vmcnt(0) 1755; SI-NEXT: s_endpgm 1756; 1757; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 1758; VI: ; %bb.0: 1759; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1760; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1761; VI-NEXT: s_waitcnt lgkmcnt(0) 1762; VI-NEXT: v_mov_b32_e32 v1, s3 1763; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1764; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1765; VI-NEXT: flat_load_dword v2, v[0:1] 1766; VI-NEXT: v_mov_b32_e32 v0, s0 1767; VI-NEXT: v_mov_b32_e32 v1, s1 1768; VI-NEXT: s_waitcnt vmcnt(0) 1769; VI-NEXT: v_ffbh_u32_e32 v3, v2 1770; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 1771; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 1772; VI-NEXT: flat_store_dword v[0:1], v3 1773; VI-NEXT: s_waitcnt vmcnt(0) 1774; VI-NEXT: flat_store_byte v[0:1], v2 1775; VI-NEXT: s_waitcnt vmcnt(0) 1776; VI-NEXT: s_endpgm 1777; 1778; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 1779; EG: ; %bb.0: 1780; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1781; EG-NEXT: TEX 0 @6 1782; EG-NEXT: ALU 11, @11, KC0[CB0:0-32], KC1[] 1783; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0 1784; EG-NEXT: MEM_RAT MSKOR T1.XW, T2.X 1785; EG-NEXT: CF_END 1786; EG-NEXT: Fetch clause starting at 6: 1787; EG-NEXT: VTX_READ_32 
T0.X, T0.X, 0, #1 1788; EG-NEXT: ALU clause starting at 8: 1789; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1790; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1791; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1792; EG-NEXT: ALU clause starting at 11: 1793; EG-NEXT: SETE_INT * T0.W, T0.X, 0.0, 1794; EG-NEXT: AND_INT T1.X, PV.W, 1, 1795; EG-NEXT: MOV * T1.W, literal.x, 1796; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 1797; EG-NEXT: MOV T1.Y, 0.0, 1798; EG-NEXT: MOV * T1.Z, 0.0, 1799; EG-NEXT: MOV T2.X, literal.x, 1800; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1801; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1802; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 1803; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.y, 1804; EG-NEXT: -1(nan), 2(2.802597e-45) 1805; 1806; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: 1807; GFX9-GISEL: ; %bb.0: 1808; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1809; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1810; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1811; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1812; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1813; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1814; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 1815; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1816; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc 1817; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc 1818; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1819; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1820; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off 1821; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1822; GFX9-GISEL-NEXT: s_endpgm 1823 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1824 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1825 %val = load i32, ptr addrspace(1) %in.gep 1826 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1827 %cmp = icmp eq i32 %val, 0 1828 %sel = select i1 %cmp, i32 -1, i32 %ctlz 1829 store volatile i32 %sel, ptr addrspace(1) %out 1830 store volatile i1 %cmp, ptr addrspace(1) undef 1831 
ret void 1832} 1833 1834; Selected on wrong constant 1835define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1836; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1837; SI: ; %bb.0: 1838; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1839; SI-NEXT: s_mov_b32 s7, 0xf000 1840; SI-NEXT: s_mov_b32 s10, 0 1841; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1842; SI-NEXT: v_mov_b32_e32 v1, 0 1843; SI-NEXT: s_mov_b32 s11, s7 1844; SI-NEXT: s_waitcnt lgkmcnt(0) 1845; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1846; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1847; SI-NEXT: s_mov_b32 s6, -1 1848; SI-NEXT: s_mov_b32 s4, s0 1849; SI-NEXT: s_mov_b32 s5, s1 1850; SI-NEXT: s_waitcnt vmcnt(0) 1851; SI-NEXT: v_ffbh_u32_e32 v1, v0 1852; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1853; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1854; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1855; SI-NEXT: s_endpgm 1856; 1857; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1858; VI: ; %bb.0: 1859; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1860; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1861; VI-NEXT: s_waitcnt lgkmcnt(0) 1862; VI-NEXT: v_mov_b32_e32 v1, s3 1863; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1864; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1865; VI-NEXT: flat_load_dword v0, v[0:1] 1866; VI-NEXT: s_waitcnt vmcnt(0) 1867; VI-NEXT: v_ffbh_u32_e32 v1, v0 1868; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1869; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 1870; VI-NEXT: v_mov_b32_e32 v0, s0 1871; VI-NEXT: v_mov_b32_e32 v1, s1 1872; VI-NEXT: flat_store_dword v[0:1], v2 1873; VI-NEXT: s_endpgm 1874; 1875; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1876; EG: ; %bb.0: 1877; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1878; EG-NEXT: TEX 0 @6 1879; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 1880; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1881; EG-NEXT: CF_END 1882; EG-NEXT: PAD 1883; EG-NEXT: Fetch clause starting at 6: 1884; EG-NEXT: VTX_READ_32 
T0.X, T0.X, 0, #1 1885; EG-NEXT: ALU clause starting at 8: 1886; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1887; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1888; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1889; EG-NEXT: ALU clause starting at 11: 1890; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1891; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W, 1892; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1893; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1894; 1895; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: 1896; GFX9-GISEL: ; %bb.0: 1897; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1898; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1899; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1900; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1901; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1902; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1903; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 1904; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc 1905; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1906; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1907; GFX9-GISEL-NEXT: s_endpgm 1908 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1909 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1910 %val = load i32, ptr addrspace(1) %in.gep 1911 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1912 %cmp = icmp eq i32 %val, 0 1913 %sel = select i1 %cmp, i32 0, i32 %ctlz 1914 store i32 %sel, ptr addrspace(1) %out 1915 ret void 1916} 1917 1918; Selected on wrong constant 1919define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 1920; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1921; SI: ; %bb.0: 1922; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1923; SI-NEXT: s_mov_b32 s7, 0xf000 1924; SI-NEXT: s_mov_b32 s10, 0 1925; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1926; SI-NEXT: v_mov_b32_e32 v1, 0 1927; SI-NEXT: s_mov_b32 s11, s7 1928; SI-NEXT: s_waitcnt lgkmcnt(0) 1929; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1930; SI-NEXT: 
buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1931; SI-NEXT: s_mov_b32 s6, -1 1932; SI-NEXT: s_mov_b32 s4, s0 1933; SI-NEXT: s_mov_b32 s5, s1 1934; SI-NEXT: s_waitcnt vmcnt(0) 1935; SI-NEXT: v_ffbh_u32_e32 v1, v0 1936; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1937; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1938; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 1939; SI-NEXT: s_endpgm 1940; 1941; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1942; VI: ; %bb.0: 1943; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1944; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1945; VI-NEXT: s_waitcnt lgkmcnt(0) 1946; VI-NEXT: v_mov_b32_e32 v1, s3 1947; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1948; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1949; VI-NEXT: flat_load_dword v0, v[0:1] 1950; VI-NEXT: s_waitcnt vmcnt(0) 1951; VI-NEXT: v_ffbh_u32_e32 v1, v0 1952; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1953; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 1954; VI-NEXT: v_mov_b32_e32 v0, s0 1955; VI-NEXT: v_mov_b32_e32 v1, s1 1956; VI-NEXT: flat_store_dword v[0:1], v2 1957; VI-NEXT: s_endpgm 1958; 1959; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1960; EG: ; %bb.0: 1961; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1962; EG-NEXT: TEX 0 @6 1963; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 1964; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 1965; EG-NEXT: CF_END 1966; EG-NEXT: PAD 1967; EG-NEXT: Fetch clause starting at 6: 1968; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1969; EG-NEXT: ALU clause starting at 8: 1970; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 1971; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1972; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1973; EG-NEXT: ALU clause starting at 11: 1974; EG-NEXT: FFBH_UINT * T0.W, T0.X, 1975; EG-NEXT: CNDE_INT T0.X, T0.X, 0.0, PV.W, 1976; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1977; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1978; 1979; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: 1980; GFX9-GISEL: ; %bb.0: 1981; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1982; 
GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1983; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 1984; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 1985; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 1986; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 1987; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 1988; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 1989; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 1990; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 1991; GFX9-GISEL-NEXT: s_endpgm 1992 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1993 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 1994 %val = load i32, ptr addrspace(1) %in.gep 1995 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 1996 %cmp = icmp ne i32 %val, 0 1997 %sel = select i1 %cmp, i32 %ctlz, i32 0 1998 store i32 %sel, ptr addrspace(1) %out 1999 ret void 2000} 2001 2002; Compare on wrong constant 2003define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 2004; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 2005; SI: ; %bb.0: 2006; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2007; SI-NEXT: s_mov_b32 s7, 0xf000 2008; SI-NEXT: s_mov_b32 s10, 0 2009; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2010; SI-NEXT: v_mov_b32_e32 v1, 0 2011; SI-NEXT: s_mov_b32 s11, s7 2012; SI-NEXT: s_waitcnt lgkmcnt(0) 2013; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2014; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2015; SI-NEXT: s_mov_b32 s6, -1 2016; SI-NEXT: s_mov_b32 s4, s0 2017; SI-NEXT: s_mov_b32 s5, s1 2018; SI-NEXT: s_waitcnt vmcnt(0) 2019; SI-NEXT: v_ffbh_u32_e32 v1, v0 2020; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 2021; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 2022; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2023; SI-NEXT: s_endpgm 2024; 2025; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 2026; VI: ; %bb.0: 2027; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2028; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2029; VI-NEXT: 
s_waitcnt lgkmcnt(0) 2030; VI-NEXT: v_mov_b32_e32 v1, s3 2031; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2032; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2033; VI-NEXT: flat_load_dword v0, v[0:1] 2034; VI-NEXT: s_waitcnt vmcnt(0) 2035; VI-NEXT: v_ffbh_u32_e32 v1, v0 2036; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 2037; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 2038; VI-NEXT: v_mov_b32_e32 v0, s0 2039; VI-NEXT: v_mov_b32_e32 v1, s1 2040; VI-NEXT: flat_store_dword v[0:1], v2 2041; VI-NEXT: s_endpgm 2042; 2043; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 2044; EG: ; %bb.0: 2045; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 2046; EG-NEXT: TEX 0 @6 2047; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] 2048; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2049; EG-NEXT: CF_END 2050; EG-NEXT: PAD 2051; EG-NEXT: Fetch clause starting at 6: 2052; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 2053; EG-NEXT: ALU clause starting at 8: 2054; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 2055; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2056; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 2057; EG-NEXT: ALU clause starting at 11: 2058; EG-NEXT: FFBH_UINT T0.W, T0.X, 2059; EG-NEXT: SETE_INT * T1.W, T0.X, 1, 2060; EG-NEXT: CNDE_INT T0.X, PS, PV.W, 0.0, 2061; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2062; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2063; 2064; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: 2065; GFX9-GISEL: ; %bb.0: 2066; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2067; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2068; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 2069; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 2070; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 2071; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 2072; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 2073; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc 2074; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 2075; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 2076; GFX9-GISEL-NEXT: s_endpgm 2077 %tid = call i32 
@llvm.amdgcn.workitem.id.x() 2078 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 2079 %val = load i32, ptr addrspace(1) %in.gep 2080 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 2081 %cmp = icmp eq i32 %val, 1 2082 %sel = select i1 %cmp, i32 0, i32 %ctlz 2083 store i32 %sel, ptr addrspace(1) %out 2084 ret void 2085} 2086 2087; Compare on wrong constant 2088define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 2089; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 2090; SI: ; %bb.0: 2091; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2092; SI-NEXT: s_mov_b32 s7, 0xf000 2093; SI-NEXT: s_mov_b32 s10, 0 2094; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2095; SI-NEXT: v_mov_b32_e32 v1, 0 2096; SI-NEXT: s_mov_b32 s11, s7 2097; SI-NEXT: s_waitcnt lgkmcnt(0) 2098; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 2099; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2100; SI-NEXT: s_mov_b32 s6, -1 2101; SI-NEXT: s_mov_b32 s4, s0 2102; SI-NEXT: s_mov_b32 s5, s1 2103; SI-NEXT: s_waitcnt vmcnt(0) 2104; SI-NEXT: v_ffbh_u32_e32 v1, v0 2105; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 2106; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 2107; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 2108; SI-NEXT: s_endpgm 2109; 2110; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 2111; VI: ; %bb.0: 2112; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2113; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2114; VI-NEXT: s_waitcnt lgkmcnt(0) 2115; VI-NEXT: v_mov_b32_e32 v1, s3 2116; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 2117; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2118; VI-NEXT: flat_load_dword v0, v[0:1] 2119; VI-NEXT: s_waitcnt vmcnt(0) 2120; VI-NEXT: v_ffbh_u32_e32 v1, v0 2121; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 2122; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc 2123; VI-NEXT: v_mov_b32_e32 v0, s0 2124; VI-NEXT: v_mov_b32_e32 v1, s1 2125; VI-NEXT: flat_store_dword v[0:1], v2 2126; VI-NEXT: 
s_endpgm 2127; 2128; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 2129; EG: ; %bb.0: 2130; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 2131; EG-NEXT: TEX 0 @6 2132; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] 2133; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2134; EG-NEXT: CF_END 2135; EG-NEXT: PAD 2136; EG-NEXT: Fetch clause starting at 6: 2137; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 2138; EG-NEXT: ALU clause starting at 8: 2139; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 2140; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2141; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 2142; EG-NEXT: ALU clause starting at 11: 2143; EG-NEXT: FFBH_UINT T0.W, T0.X, 2144; EG-NEXT: SETNE_INT * T1.W, T0.X, 1, 2145; EG-NEXT: CNDE_INT T0.X, PS, 0.0, PV.W, 2146; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2147; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2148; 2149; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: 2150; GFX9-GISEL: ; %bb.0: 2151; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2152; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2153; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 2154; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 2155; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 2156; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 2157; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 2158; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc 2159; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 2160; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 2161; GFX9-GISEL-NEXT: s_endpgm 2162 %tid = call i32 @llvm.amdgcn.workitem.id.x() 2163 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 2164 %val = load i32, ptr addrspace(1) %in.gep 2165 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone 2166 %cmp = icmp ne i32 %val, 1 2167 %sel = select i1 %cmp, i32 %ctlz, i32 0 2168 store i32 %sel, ptr addrspace(1) %out 2169 ret void 2170} 2171 2172define i7 @v_ctlz_zero_undef_i7(i7 %val) { 2173; SI-LABEL: v_ctlz_zero_undef_i7: 2174; SI: ; %bb.0: 2175; SI-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 2176; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 2177; SI-NEXT: v_ffbh_u32_e32 v0, v0 2178; SI-NEXT: s_setpc_b64 s[30:31] 2179; 2180; VI-LABEL: v_ctlz_zero_undef_i7: 2181; VI: ; %bb.0: 2182; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2183; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 2184; VI-NEXT: v_ffbh_u32_e32 v0, v0 2185; VI-NEXT: s_setpc_b64 s[30:31] 2186; 2187; EG-LABEL: v_ctlz_zero_undef_i7: 2188; EG: ; %bb.0: 2189; EG-NEXT: CF_END 2190; EG-NEXT: PAD 2191; 2192; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7: 2193; GFX9-GISEL: ; %bb.0: 2194; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2195; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0 2196; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2197; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2198 %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true) 2199 ret i7 %ctlz 2200} 2201 2202define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind { 2203; SI-LABEL: s_ctlz_zero_undef_i18: 2204; SI: ; %bb.0: 2205; SI-NEXT: s_load_dword s2, s[4:5], 0xb 2206; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2207; SI-NEXT: s_mov_b32 s3, 0xf000 2208; SI-NEXT: s_waitcnt lgkmcnt(0) 2209; SI-NEXT: s_lshl_b32 s2, s2, 14 2210; SI-NEXT: s_flbit_i32_b32 s4, s2 2211; SI-NEXT: s_mov_b32 s2, -1 2212; SI-NEXT: v_mov_b32_e32 v0, s4 2213; SI-NEXT: s_bfe_u32 s4, s4, 0x20010 2214; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 2215; SI-NEXT: s_waitcnt expcnt(0) 2216; SI-NEXT: v_mov_b32_e32 v0, s4 2217; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 2218; SI-NEXT: s_endpgm 2219; 2220; VI-LABEL: s_ctlz_zero_undef_i18: 2221; VI: ; %bb.0: 2222; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 2223; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2224; VI-NEXT: s_waitcnt lgkmcnt(0) 2225; VI-NEXT: s_lshl_b32 s2, s2, 14 2226; VI-NEXT: v_mov_b32_e32 v0, s0 2227; VI-NEXT: s_flbit_i32_b32 s2, s2 2228; VI-NEXT: v_mov_b32_e32 v1, s1 2229; VI-NEXT: s_add_u32 s0, s0, 2 2230; VI-NEXT: v_mov_b32_e32 v2, s2 2231; VI-NEXT: 
s_addc_u32 s1, s1, 0 2232; VI-NEXT: flat_store_short v[0:1], v2 2233; VI-NEXT: s_bfe_u32 s2, s2, 0x20010 2234; VI-NEXT: v_mov_b32_e32 v0, s0 2235; VI-NEXT: v_mov_b32_e32 v1, s1 2236; VI-NEXT: v_mov_b32_e32 v2, s2 2237; VI-NEXT: flat_store_byte v[0:1], v2 2238; VI-NEXT: s_endpgm 2239; 2240; EG-LABEL: s_ctlz_zero_undef_i18: 2241; EG: ; %bb.0: 2242; EG-NEXT: ALU 28, @4, KC0[CB0:0-32], KC1[] 2243; EG-NEXT: MEM_RAT MSKOR T1.XW, T3.X 2244; EG-NEXT: MEM_RAT MSKOR T0.XW, T2.X 2245; EG-NEXT: CF_END 2246; EG-NEXT: ALU clause starting at 4: 2247; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x, 2248; EG-NEXT: 14(1.961818e-44), 0(0.000000e+00) 2249; EG-NEXT: FFBH_UINT T0.W, PV.W, 2250; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 2251; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 2252; EG-NEXT: AND_INT T2.W, PV.W, literal.x, 2253; EG-NEXT: LSHL * T1.W, PS, literal.y, 2254; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 2255; EG-NEXT: LSHL T1.X, PV.W, PS, 2256; EG-NEXT: LSHL * T1.W, literal.x, PS, 2257; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 2258; EG-NEXT: MOV T1.Y, 0.0, 2259; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, 2260; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2261; EG-NEXT: AND_INT T3.W, PV.W, literal.x, 2262; EG-NEXT: MOV * T4.W, literal.y, 2263; EG-NEXT: 3(4.203895e-45), 2(2.802597e-45) 2264; EG-NEXT: BFE_UINT T0.W, T0.W, literal.x, PS, 2265; EG-NEXT: LSHL * T3.W, PV.W, literal.y, 2266; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45) 2267; EG-NEXT: LSHL T0.X, PV.W, PS, 2268; EG-NEXT: LSHL * T0.W, literal.x, PS, 2269; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 2270; EG-NEXT: MOV T0.Y, 0.0, 2271; EG-NEXT: MOV T1.Z, 0.0, 2272; EG-NEXT: MOV * T0.Z, 0.0, 2273; EG-NEXT: LSHR T2.X, T2.W, literal.x, 2274; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2275; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2276; 2277; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18: 2278; GFX9-GISEL: ; %bb.0: 2279; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 2280; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x24 2281; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 2282; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 2283; GFX9-GISEL-NEXT: s_lshl_b32 s2, s2, 14 2284; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2 2285; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0x3ffff 2286; GFX9-GISEL-NEXT: s_lshr_b32 s3, s2, 16 2287; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 2288; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] 2289; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3 2290; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] offset:2 2291; GFX9-GISEL-NEXT: s_endpgm 2292 %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone 2293 store i18 %ctlz, ptr addrspace(1) %out, align 4 2294 ret void 2295} 2296 2297define i18 @v_ctlz_zero_undef_i18(i18 %val) { 2298; SI-LABEL: v_ctlz_zero_undef_i18: 2299; SI: ; %bb.0: 2300; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2301; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 2302; SI-NEXT: v_ffbh_u32_e32 v0, v0 2303; SI-NEXT: s_setpc_b64 s[30:31] 2304; 2305; VI-LABEL: v_ctlz_zero_undef_i18: 2306; VI: ; %bb.0: 2307; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2308; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 2309; VI-NEXT: v_ffbh_u32_e32 v0, v0 2310; VI-NEXT: s_setpc_b64 s[30:31] 2311; 2312; EG-LABEL: v_ctlz_zero_undef_i18: 2313; EG: ; %bb.0: 2314; EG-NEXT: CF_END 2315; EG-NEXT: PAD 2316; 2317; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18: 2318; GFX9-GISEL: ; %bb.0: 2319; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2320; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0 2321; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2322; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2323 %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) 2324 ret i18 %ctlz 2325} 2326 2327define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { 2328; SI-LABEL: v_ctlz_zero_undef_v2i18: 2329; SI: ; %bb.0: 2330; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2331; SI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 2332; SI-NEXT: v_lshlrev_b32_e32 v1, 14, v1 2333; SI-NEXT: v_ffbh_u32_e32 v0, v0 2334; SI-NEXT: v_ffbh_u32_e32 v1, 
v1 2335; SI-NEXT: s_setpc_b64 s[30:31] 2336; 2337; VI-LABEL: v_ctlz_zero_undef_v2i18: 2338; VI: ; %bb.0: 2339; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2340; VI-NEXT: v_lshlrev_b32_e32 v0, 14, v0 2341; VI-NEXT: v_lshlrev_b32_e32 v1, 14, v1 2342; VI-NEXT: v_ffbh_u32_e32 v0, v0 2343; VI-NEXT: v_ffbh_u32_e32 v1, v1 2344; VI-NEXT: s_setpc_b64 s[30:31] 2345; 2346; EG-LABEL: v_ctlz_zero_undef_v2i18: 2347; EG: ; %bb.0: 2348; EG-NEXT: CF_END 2349; EG-NEXT: PAD 2350; 2351; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18: 2352; GFX9-GISEL: ; %bb.0: 2353; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2354; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0 2355; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 14, v1 2356; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2357; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 2358; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2359 %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true) 2360 ret <2 x i18> %ctlz 2361} 2362 2363define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { 2364; SI-LABEL: v_ctlz_zero_undef_v2i16: 2365; SI: ; %bb.0: 2366; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2367; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2368; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2369; SI-NEXT: v_ffbh_u32_e32 v1, v1 2370; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 2371; SI-NEXT: v_ffbh_u32_e32 v0, v0 2372; SI-NEXT: v_or_b32_e32 v0, v0, v2 2373; SI-NEXT: s_setpc_b64 s[30:31] 2374; 2375; VI-LABEL: v_ctlz_zero_undef_v2i16: 2376; VI: ; %bb.0: 2377; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2378; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 2379; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2380; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2381; VI-NEXT: v_ffbh_u32_e32 v0, v0 2382; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2383; VI-NEXT: s_setpc_b64 s[30:31] 2384; 2385; EG-LABEL: v_ctlz_zero_undef_v2i16: 2386; EG: ; %bb.0: 2387; EG-NEXT: CF_END 2388; EG-NEXT: 
PAD 2389; 2390; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16: 2391; GFX9-GISEL: ; %bb.0: 2392; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2393; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 2394; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2395; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2396; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2397; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 2398; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 2399; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 2400; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2401 %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true) 2402 ret <2 x i16> %ctlz 2403} 2404 2405define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { 2406; SI-LABEL: v_ctlz_zero_undef_v3i16: 2407; SI: ; %bb.0: 2408; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2409; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2410; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2411; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2412; SI-NEXT: v_ffbh_u32_e32 v1, v1 2413; SI-NEXT: v_ffbh_u32_e32 v0, v0 2414; SI-NEXT: v_ffbh_u32_e32 v3, v2 2415; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2416; SI-NEXT: v_or_b32_e32 v0, v0, v1 2417; SI-NEXT: v_or_b32_e32 v2, 0x200000, v3 2418; SI-NEXT: v_alignbit_b32 v1, v3, v0, 16 2419; SI-NEXT: s_setpc_b64 s[30:31] 2420; 2421; VI-LABEL: v_ctlz_zero_undef_v3i16: 2422; VI: ; %bb.0: 2423; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2424; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v0 2425; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2426; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2427; VI-NEXT: v_ffbh_u32_e32 v2, v2 2428; VI-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2429; VI-NEXT: v_ffbh_u32_e32 v1, v1 2430; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2431; VI-NEXT: s_setpc_b64 s[30:31] 2432; 2433; EG-LABEL: v_ctlz_zero_undef_v3i16: 2434; EG: ; %bb.0: 2435; EG-NEXT: CF_END 2436; EG-NEXT: PAD 2437; 2438; GFX9-GISEL-LABEL: 
v_ctlz_zero_undef_v3i16: 2439; GFX9-GISEL: ; %bb.0: 2440; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2441; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2442; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2443; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2444; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2445; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 2446; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2447; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 2448; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 2449; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 2450; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2451 %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true) 2452 ret <3 x i16> %ctlz 2453} 2454 ; The v4i16 case below calls llvm.ctlz.v4i16 with the second operand i1 true on a <4 x i16> function argument and returns the resulting vector. In the SI, VI and GFX9-GISEL expectations every 16-bit element is first placed in the upper half of a 32-bit register (v_lshlrev_b32 by 16, or on VI a v_and_b32 with 0xffff0000 for the high element) before v_ffbh_u32 is applied, and the per-element counts are then repacked. The EG expectations contain only CF_END and PAD. These assertions are autogenerated (see the NOTE at the top of the file) and should be regenerated rather than edited by hand. 2455define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { 2456; SI-LABEL: v_ctlz_zero_undef_v4i16: 2457; SI: ; %bb.0: 2458; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2459; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2460; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2461; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2462; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2463; SI-NEXT: v_ffbh_u32_e32 v3, v3 2464; SI-NEXT: v_ffbh_u32_e32 v2, v2 2465; SI-NEXT: v_ffbh_u32_e32 v1, v1 2466; SI-NEXT: v_ffbh_u32_e32 v0, v0 2467; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2468; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2469; SI-NEXT: v_or_b32_e32 v2, v2, v3 2470; SI-NEXT: v_or_b32_e32 v0, v0, v1 2471; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 2472; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 2473; SI-NEXT: s_setpc_b64 s[30:31] 2474; 2475; VI-LABEL: v_ctlz_zero_undef_v4i16: 2476; VI: ; %bb.0: 2477; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2478; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 2479; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 2480; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v0 2481; VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 2482; VI-NEXT: v_ffbh_u32_e32 v2, v2 2483; VI-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2484; VI-NEXT: v_ffbh_u32_e32 v3, v3 2485; VI-NEXT: 
v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 2486; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2487; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 2488; VI-NEXT: s_setpc_b64 s[30:31] 2489; 2490; EG-LABEL: v_ctlz_zero_undef_v4i16: 2491; EG: ; %bb.0: 2492; EG-NEXT: CF_END 2493; EG-NEXT: PAD 2494; 2495; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16: 2496; GFX9-GISEL: ; %bb.0: 2497; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2498; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 2499; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 2500; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2501; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2502; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2503; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2504; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 2505; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2506; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 2507; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 2508; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 2509; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 2510; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 2511; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1 2512; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2513 %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true) 2514 ret <4 x i16> %ctlz 2515} 2516 ; The v2i8 case below calls llvm.ctlz.v2i8 with the second operand i1 true on a <2 x i8> function argument and returns the resulting vector. The SI, VI and GFX9-GISEL expectations shift v0 and v1 left by 24 before applying the 32-bit v_ffbh_u32. The SI and VI expectations additionally OR the count in v0 with the count from v1 shifted left by 8 (and VI masks v1 with 0xff), while the GFX9-GISEL expectations leave the two counts in v0 and v1 untouched. The EG expectations contain only CF_END and PAD. 2517define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { 2518; SI-LABEL: v_ctlz_zero_undef_v2i8: 2519; SI: ; %bb.0: 2520; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2521; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 2522; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 2523; SI-NEXT: v_ffbh_u32_e32 v1, v1 2524; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1 2525; SI-NEXT: v_ffbh_u32_e32 v0, v0 2526; SI-NEXT: v_or_b32_e32 v0, v0, v2 2527; SI-NEXT: s_setpc_b64 s[30:31] 2528; 2529; VI-LABEL: v_ctlz_zero_undef_v2i8: 2530; VI: ; %bb.0: 2531; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 2532; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 2533; VI-NEXT: v_ffbh_u32_e32 v1, v1 2534; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 2535; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1 2536; VI-NEXT: v_ffbh_u32_e32 v0, v0 2537; VI-NEXT: v_or_b32_e32 v0, v0, v2 2538; VI-NEXT: v_and_b32_e32 v1, 0xff, v1 2539; VI-NEXT: s_setpc_b64 s[30:31] 2540; 2541; EG-LABEL: v_ctlz_zero_undef_v2i8: 2542; EG: ; %bb.0: 2543; EG-NEXT: CF_END 2544; EG-NEXT: PAD 2545; 2546; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8: 2547; GFX9-GISEL: ; %bb.0: 2548; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2549; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 2550; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1 2551; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 2552; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 2553; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2554 %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true) 2555 ret <2 x i8> %ctlz 2556} 2557 ; The v2i7 case below calls llvm.ctlz.v2i7 with the second operand i1 true on a <2 x i7> function argument and returns the resulting vector. The SI, VI and GFX9-GISEL expectations list the same instruction sequence, shifting v0 and v1 left by 25 and then applying v_ffbh_u32 to each register. The EG expectations contain only CF_END and PAD. 2558define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { 2559; SI-LABEL: v_ctlz_zero_undef_v2i7: 2560; SI: ; %bb.0: 2561; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2562; SI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 2563; SI-NEXT: v_lshlrev_b32_e32 v1, 25, v1 2564; SI-NEXT: v_ffbh_u32_e32 v0, v0 2565; SI-NEXT: v_ffbh_u32_e32 v1, v1 2566; SI-NEXT: s_setpc_b64 s[30:31] 2567; 2568; VI-LABEL: v_ctlz_zero_undef_v2i7: 2569; VI: ; %bb.0: 2570; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2571; VI-NEXT: v_lshlrev_b32_e32 v0, 25, v0 2572; VI-NEXT: v_lshlrev_b32_e32 v1, 25, v1 2573; VI-NEXT: v_ffbh_u32_e32 v0, v0 2574; VI-NEXT: v_ffbh_u32_e32 v1, v1 2575; VI-NEXT: s_setpc_b64 s[30:31] 2576; 2577; EG-LABEL: v_ctlz_zero_undef_v2i7: 2578; EG: ; %bb.0: 2579; EG-NEXT: CF_END 2580; EG-NEXT: PAD 2581; 2582; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7: 2583; GFX9-GISEL: ; %bb.0: 2584; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2585; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0 2586; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 25, v1 2587; GFX9-GISEL-NEXT: 
v_ffbh_u32_e32 v0, v0 2588; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 2589; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] 2590 %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true) 2591 ret <2 x i7> %ctlz 2592} 2593