1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s 4; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s 5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s 6 7declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone 8declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone 9declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone 10declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone 11declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone 12declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone 13declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone 14declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 15 16define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { 17; SI-LABEL: s_cttz_zero_undef_i32: 18; SI: ; %bb.0: 19; SI-NEXT: s_load_dword s2, s[4:5], 0xb 20; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 21; SI-NEXT: s_mov_b32 s3, 0xf000 22; SI-NEXT: s_waitcnt lgkmcnt(0) 23; SI-NEXT: s_ff1_i32_b32 s4, s2 24; SI-NEXT: s_mov_b32 s2, -1 25; SI-NEXT: v_mov_b32_e32 v0, s4 26; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 27; SI-NEXT: s_endpgm 28; 29; VI-LABEL: s_cttz_zero_undef_i32: 30; VI: ; %bb.0: 31; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 32; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 33; VI-NEXT: s_waitcnt lgkmcnt(0) 34; VI-NEXT: s_ff1_i32_b32 s2, s2 35; VI-NEXT: v_mov_b32_e32 v0, s0 36; VI-NEXT: v_mov_b32_e32 v1, s1 37; VI-NEXT: v_mov_b32_e32 v2, s2 38; VI-NEXT: flat_store_dword v[0:1], v2 39; VI-NEXT: s_endpgm 40; 41; EG-LABEL: s_cttz_zero_undef_i32: 42; EG: ; %bb.0: 43; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 44; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 
45; EG-NEXT: CF_END 46; EG-NEXT: PAD 47; EG-NEXT: ALU clause starting at 4: 48; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 49; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 50; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z, 51; 52; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32: 53; GFX9-GISEL: ; %bb.0: 54; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 55; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 56; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 57; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 58; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 59; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 60; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 61; GFX9-GISEL-NEXT: s_endpgm 62 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 63 store i32 %cttz, ptr addrspace(1) %out, align 4 64 ret void 65} 66 67define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 68; SI-LABEL: v_cttz_zero_undef_i32: 69; SI: ; %bb.0: 70; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 71; SI-NEXT: s_mov_b32 s7, 0xf000 72; SI-NEXT: s_mov_b32 s10, 0 73; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 74; SI-NEXT: v_mov_b32_e32 v1, 0 75; SI-NEXT: s_mov_b32 s11, s7 76; SI-NEXT: s_waitcnt lgkmcnt(0) 77; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 78; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 79; SI-NEXT: s_mov_b32 s6, -1 80; SI-NEXT: s_mov_b32 s4, s0 81; SI-NEXT: s_mov_b32 s5, s1 82; SI-NEXT: s_waitcnt vmcnt(0) 83; SI-NEXT: v_ffbl_b32_e32 v0, v0 84; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 85; SI-NEXT: s_endpgm 86; 87; VI-LABEL: v_cttz_zero_undef_i32: 88; VI: ; %bb.0: 89; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 90; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 91; VI-NEXT: s_waitcnt lgkmcnt(0) 92; VI-NEXT: v_mov_b32_e32 v1, s3 93; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 94; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 95; VI-NEXT: flat_load_dword v0, v[0:1] 96; VI-NEXT: s_waitcnt vmcnt(0) 97; VI-NEXT: v_ffbl_b32_e32 v2, v0 98; VI-NEXT: v_mov_b32_e32 v0, s0 99; 
VI-NEXT: v_mov_b32_e32 v1, s1 100; VI-NEXT: flat_store_dword v[0:1], v2 101; VI-NEXT: s_endpgm 102; 103; EG-LABEL: v_cttz_zero_undef_i32: 104; EG: ; %bb.0: 105; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 106; EG-NEXT: TEX 0 @6 107; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] 108; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 109; EG-NEXT: CF_END 110; EG-NEXT: PAD 111; EG-NEXT: Fetch clause starting at 6: 112; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 113; EG-NEXT: ALU clause starting at 8: 114; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 115; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 116; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 117; EG-NEXT: ALU clause starting at 11: 118; EG-NEXT: FFBL_INT T0.X, T0.X, 119; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 120; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 121; 122; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: 123; GFX9-GISEL: ; %bb.0: 124; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 125; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 126; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 127; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 128; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 129; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 130; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 131; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 132; GFX9-GISEL-NEXT: s_endpgm 133 %tid = call i32 @llvm.amdgcn.workitem.id.x() 134 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 135 %val = load i32, ptr addrspace(1) %in.gep, align 4 136 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 137 store i32 %cttz, ptr addrspace(1) %out, align 4 138 ret void 139} 140 141define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 142; SI-LABEL: v_cttz_zero_undef_v2i32: 143; SI: ; %bb.0: 144; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 145; SI-NEXT: s_mov_b32 s7, 0xf000 146; SI-NEXT: s_mov_b32 s10, 0 147; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 148; SI-NEXT: v_mov_b32_e32 v1, 0 
149; SI-NEXT: s_mov_b32 s11, s7 150; SI-NEXT: s_waitcnt lgkmcnt(0) 151; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 152; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 153; SI-NEXT: s_mov_b32 s6, -1 154; SI-NEXT: s_mov_b32 s4, s0 155; SI-NEXT: s_mov_b32 s5, s1 156; SI-NEXT: s_waitcnt vmcnt(0) 157; SI-NEXT: v_ffbl_b32_e32 v1, v1 158; SI-NEXT: v_ffbl_b32_e32 v0, v0 159; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 160; SI-NEXT: s_endpgm 161; 162; VI-LABEL: v_cttz_zero_undef_v2i32: 163; VI: ; %bb.0: 164; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 165; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 166; VI-NEXT: s_waitcnt lgkmcnt(0) 167; VI-NEXT: v_mov_b32_e32 v1, s3 168; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 169; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 170; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 171; VI-NEXT: v_mov_b32_e32 v3, s1 172; VI-NEXT: v_mov_b32_e32 v2, s0 173; VI-NEXT: s_waitcnt vmcnt(0) 174; VI-NEXT: v_ffbl_b32_e32 v1, v1 175; VI-NEXT: v_ffbl_b32_e32 v0, v0 176; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 177; VI-NEXT: s_endpgm 178; 179; EG-LABEL: v_cttz_zero_undef_v2i32: 180; EG: ; %bb.0: 181; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 182; EG-NEXT: TEX 0 @6 183; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 184; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 185; EG-NEXT: CF_END 186; EG-NEXT: PAD 187; EG-NEXT: Fetch clause starting at 6: 188; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 189; EG-NEXT: ALU clause starting at 8: 190; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 191; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 192; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 193; EG-NEXT: ALU clause starting at 11: 194; EG-NEXT: FFBL_INT * T0.Y, T0.Y, 195; EG-NEXT: FFBL_INT T0.X, T0.X, 196; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 197; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 198; 199; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: 200; GFX9-GISEL: ; %bb.0: 201; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 202; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 
203; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 204; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 205; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 206; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 207; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 208; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 209; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 210; GFX9-GISEL-NEXT: s_endpgm 211 %tid = call i32 @llvm.amdgcn.workitem.id.x() 212 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid 213 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8 214 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone 215 store <2 x i32> %cttz, ptr addrspace(1) %out, align 8 216 ret void 217} 218 219define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 220; SI-LABEL: v_cttz_zero_undef_v4i32: 221; SI: ; %bb.0: 222; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 223; SI-NEXT: s_mov_b32 s7, 0xf000 224; SI-NEXT: s_mov_b32 s10, 0 225; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 226; SI-NEXT: v_mov_b32_e32 v1, 0 227; SI-NEXT: s_mov_b32 s11, s7 228; SI-NEXT: s_waitcnt lgkmcnt(0) 229; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 230; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 231; SI-NEXT: s_mov_b32 s6, -1 232; SI-NEXT: s_mov_b32 s4, s0 233; SI-NEXT: s_mov_b32 s5, s1 234; SI-NEXT: s_waitcnt vmcnt(0) 235; SI-NEXT: v_ffbl_b32_e32 v3, v3 236; SI-NEXT: v_ffbl_b32_e32 v2, v2 237; SI-NEXT: v_ffbl_b32_e32 v1, v1 238; SI-NEXT: v_ffbl_b32_e32 v0, v0 239; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 240; SI-NEXT: s_endpgm 241; 242; VI-LABEL: v_cttz_zero_undef_v4i32: 243; VI: ; %bb.0: 244; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 245; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 246; VI-NEXT: s_waitcnt lgkmcnt(0) 247; VI-NEXT: v_mov_b32_e32 v1, s3 248; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 249; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 250; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 251; 
VI-NEXT: v_mov_b32_e32 v5, s1 252; VI-NEXT: v_mov_b32_e32 v4, s0 253; VI-NEXT: s_waitcnt vmcnt(0) 254; VI-NEXT: v_ffbl_b32_e32 v3, v3 255; VI-NEXT: v_ffbl_b32_e32 v2, v2 256; VI-NEXT: v_ffbl_b32_e32 v1, v1 257; VI-NEXT: v_ffbl_b32_e32 v0, v0 258; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 259; VI-NEXT: s_endpgm 260; 261; EG-LABEL: v_cttz_zero_undef_v4i32: 262; EG: ; %bb.0: 263; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 264; EG-NEXT: TEX 0 @6 265; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 266; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 267; EG-NEXT: CF_END 268; EG-NEXT: PAD 269; EG-NEXT: Fetch clause starting at 6: 270; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 271; EG-NEXT: ALU clause starting at 8: 272; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 273; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 274; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 275; EG-NEXT: ALU clause starting at 11: 276; EG-NEXT: FFBL_INT * T0.W, T0.W, 277; EG-NEXT: FFBL_INT * T0.Z, T0.Z, 278; EG-NEXT: FFBL_INT * T0.Y, T0.Y, 279; EG-NEXT: FFBL_INT T0.X, T0.X, 280; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 281; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 282; 283; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: 284; GFX9-GISEL: ; %bb.0: 285; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 286; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 287; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 288; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 289; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 290; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 291; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 292; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 293; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 294; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 295; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 296; GFX9-GISEL-NEXT: s_endpgm 297 %tid = call i32 @llvm.amdgcn.workitem.id.x() 298 %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid 299 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16 300 %cttz = call <4 x i32> 
@llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone 301 store <4 x i32> %cttz, ptr addrspace(1) %out, align 16 302 ret void 303} 304 305define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { 306; SI-LABEL: s_cttz_zero_undef_i8_with_select: 307; SI: ; %bb.0: 308; SI-NEXT: s_load_dword s2, s[4:5], 0xb 309; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 310; SI-NEXT: s_mov_b32 s3, 0xf000 311; SI-NEXT: s_waitcnt lgkmcnt(0) 312; SI-NEXT: s_ff1_i32_b32 s4, s2 313; SI-NEXT: s_mov_b32 s2, -1 314; SI-NEXT: v_mov_b32_e32 v0, s4 315; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 316; SI-NEXT: s_endpgm 317; 318; VI-LABEL: s_cttz_zero_undef_i8_with_select: 319; VI: ; %bb.0: 320; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 321; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 322; VI-NEXT: s_waitcnt lgkmcnt(0) 323; VI-NEXT: s_ff1_i32_b32 s2, s2 324; VI-NEXT: v_mov_b32_e32 v0, s0 325; VI-NEXT: v_mov_b32_e32 v1, s1 326; VI-NEXT: v_mov_b32_e32 v2, s2 327; VI-NEXT: flat_store_byte v[0:1], v2 328; VI-NEXT: s_endpgm 329; 330; EG-LABEL: s_cttz_zero_undef_i8_with_select: 331; EG: ; %bb.0: 332; EG-NEXT: ALU 0, @8, KC0[], KC1[] 333; EG-NEXT: TEX 0 @6 334; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 335; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 336; EG-NEXT: CF_END 337; EG-NEXT: PAD 338; EG-NEXT: Fetch clause starting at 6: 339; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 340; EG-NEXT: ALU clause starting at 8: 341; EG-NEXT: MOV * T0.X, 0.0, 342; EG-NEXT: ALU clause starting at 9: 343; EG-NEXT: FFBL_INT T0.W, T0.X, 344; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 345; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 346; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 347; EG-NEXT: LSHL * T1.W, PS, literal.y, 348; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 349; EG-NEXT: LSHL T0.X, PV.W, PS, 350; EG-NEXT: LSHL * T0.W, literal.x, PS, 351; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 352; EG-NEXT: MOV T0.Y, 0.0, 353; EG-NEXT: MOV * T0.Z, 0.0, 354; EG-NEXT: LSHR * 
T1.X, KC0[2].Y, literal.x, 355; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 356; 357; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select: 358; GFX9-GISEL: ; %bb.0: 359; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 360; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 361; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 362; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 363; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 364; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 365; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] 366; GFX9-GISEL-NEXT: s_endpgm 367 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone 368 %cttz_ret = icmp ne i8 %val, 0 369 %ret = select i1 %cttz_ret, i8 %cttz, i8 32 370 store i8 %cttz, ptr addrspace(1) %out, align 4 371 ret void 372} 373 374define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { 375; SI-LABEL: s_cttz_zero_undef_i16_with_select: 376; SI: ; %bb.0: 377; SI-NEXT: s_load_dword s2, s[4:5], 0xb 378; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 379; SI-NEXT: s_mov_b32 s3, 0xf000 380; SI-NEXT: s_waitcnt lgkmcnt(0) 381; SI-NEXT: s_ff1_i32_b32 s4, s2 382; SI-NEXT: s_mov_b32 s2, -1 383; SI-NEXT: v_mov_b32_e32 v0, s4 384; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 385; SI-NEXT: s_endpgm 386; 387; VI-LABEL: s_cttz_zero_undef_i16_with_select: 388; VI: ; %bb.0: 389; VI-NEXT: s_load_dword s2, s[4:5], 0x2c 390; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 391; VI-NEXT: s_waitcnt lgkmcnt(0) 392; VI-NEXT: s_ff1_i32_b32 s2, s2 393; VI-NEXT: v_mov_b32_e32 v0, s0 394; VI-NEXT: v_mov_b32_e32 v1, s1 395; VI-NEXT: v_mov_b32_e32 v2, s2 396; VI-NEXT: flat_store_short v[0:1], v2 397; VI-NEXT: s_endpgm 398; 399; EG-LABEL: s_cttz_zero_undef_i16_with_select: 400; EG: ; %bb.0: 401; EG-NEXT: ALU 0, @8, KC0[], KC1[] 402; EG-NEXT: TEX 0 @6 403; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 404; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 405; EG-NEXT: CF_END 406; EG-NEXT: PAD 407; EG-NEXT: Fetch clause starting at 6: 408; 
EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 409; EG-NEXT: ALU clause starting at 8: 410; EG-NEXT: MOV * T0.X, 0.0, 411; EG-NEXT: ALU clause starting at 9: 412; EG-NEXT: FFBL_INT T0.W, T0.X, 413; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 414; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 415; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 416; EG-NEXT: LSHL * T1.W, PS, literal.y, 417; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 418; EG-NEXT: LSHL T0.X, PV.W, PS, 419; EG-NEXT: LSHL * T0.W, literal.x, PS, 420; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 421; EG-NEXT: MOV T0.Y, 0.0, 422; EG-NEXT: MOV * T0.Z, 0.0, 423; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 424; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 425; 426; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select: 427; GFX9-GISEL: ; %bb.0: 428; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 429; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 430; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 431; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 432; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 433; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 434; GFX9-GISEL-NEXT: global_store_short v1, v0, s[0:1] 435; GFX9-GISEL-NEXT: s_endpgm 436 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone 437 %cttz_ret = icmp ne i16 %val, 0 438 %ret = select i1 %cttz_ret, i16 %cttz, i16 32 439 store i16 %cttz, ptr addrspace(1) %out, align 4 440 ret void 441} 442 443define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { 444; SI-LABEL: s_cttz_zero_undef_i32_with_select: 445; SI: ; %bb.0: 446; SI-NEXT: s_load_dword s2, s[4:5], 0xb 447; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 448; SI-NEXT: s_mov_b32 s3, 0xf000 449; SI-NEXT: s_waitcnt lgkmcnt(0) 450; SI-NEXT: s_ff1_i32_b32 s4, s2 451; SI-NEXT: s_mov_b32 s2, -1 452; SI-NEXT: v_mov_b32_e32 v0, s4 453; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 454; SI-NEXT: s_endpgm 455; 456; VI-LABEL: s_cttz_zero_undef_i32_with_select: 457; VI: ; %bb.0: 458; 
VI-NEXT: s_load_dword s2, s[4:5], 0x2c 459; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 460; VI-NEXT: s_waitcnt lgkmcnt(0) 461; VI-NEXT: s_ff1_i32_b32 s2, s2 462; VI-NEXT: v_mov_b32_e32 v0, s0 463; VI-NEXT: v_mov_b32_e32 v1, s1 464; VI-NEXT: v_mov_b32_e32 v2, s2 465; VI-NEXT: flat_store_dword v[0:1], v2 466; VI-NEXT: s_endpgm 467; 468; EG-LABEL: s_cttz_zero_undef_i32_with_select: 469; EG: ; %bb.0: 470; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 471; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 472; EG-NEXT: CF_END 473; EG-NEXT: PAD 474; EG-NEXT: ALU clause starting at 4: 475; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 476; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 477; EG-NEXT: FFBL_INT * T1.X, KC0[2].Z, 478; 479; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select: 480; GFX9-GISEL: ; %bb.0: 481; GFX9-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 482; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 483; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 484; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 485; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2 486; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 487; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 488; GFX9-GISEL-NEXT: s_endpgm 489 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone 490 %cttz_ret = icmp ne i32 %val, 0 491 %ret = select i1 %cttz_ret, i32 %cttz, i32 32 492 store i32 %cttz, ptr addrspace(1) %out, align 4 493 ret void 494} 495 496define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { 497; SI-LABEL: s_cttz_zero_undef_i64_with_select: 498; SI: ; %bb.0: 499; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 500; SI-NEXT: s_mov_b32 s7, 0xf000 501; SI-NEXT: s_mov_b32 s6, -1 502; SI-NEXT: s_waitcnt lgkmcnt(0) 503; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] 504; SI-NEXT: v_mov_b32_e32 v1, 0 505; SI-NEXT: s_mov_b32 s4, s0 506; SI-NEXT: s_mov_b32 s5, s1 507; SI-NEXT: v_mov_b32_e32 v0, s2 508; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 509; SI-NEXT: s_endpgm 
510; 511; VI-LABEL: s_cttz_zero_undef_i64_with_select: 512; VI: ; %bb.0: 513; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 514; VI-NEXT: v_mov_b32_e32 v1, 0 515; VI-NEXT: s_waitcnt lgkmcnt(0) 516; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] 517; VI-NEXT: v_mov_b32_e32 v3, s1 518; VI-NEXT: v_mov_b32_e32 v0, s2 519; VI-NEXT: v_mov_b32_e32 v2, s0 520; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 521; VI-NEXT: s_endpgm 522; 523; EG-LABEL: s_cttz_zero_undef_i64_with_select: 524; EG: ; %bb.0: 525; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 526; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 527; EG-NEXT: CF_END 528; EG-NEXT: PAD 529; EG-NEXT: ALU clause starting at 4: 530; EG-NEXT: FFBL_INT * T0.W, KC0[3].X, 531; EG-NEXT: FFBL_INT T1.W, KC0[2].W, 532; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 533; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 534; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W, 535; EG-NEXT: MOV T0.Y, 0.0, 536; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 537; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 538; 539; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: 540; GFX9-GISEL: ; %bb.0: 541; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 542; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 543; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 544; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 545; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] 546; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 547; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 548; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 549; GFX9-GISEL-NEXT: s_endpgm 550 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone 551 %cttz_ret = icmp ne i64 %val, 0 552 %ret = select i1 %cttz_ret, i64 %cttz, i64 32 553 store i64 %cttz, ptr addrspace(1) %out, align 4 554 ret void 555} 556 557define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { 558; SI-LABEL: v_cttz_zero_undef_i8_with_select: 559; SI: ; %bb.0: 560; SI-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x9 561; SI-NEXT: s_mov_b32 s7, 0xf000 562; SI-NEXT: s_mov_b32 s6, -1 563; SI-NEXT: s_mov_b32 s10, s6 564; SI-NEXT: s_mov_b32 s11, s7 565; SI-NEXT: s_waitcnt lgkmcnt(0) 566; SI-NEXT: s_mov_b32 s8, s2 567; SI-NEXT: s_mov_b32 s9, s3 568; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 569; SI-NEXT: s_mov_b32 s4, s0 570; SI-NEXT: s_mov_b32 s5, s1 571; SI-NEXT: s_waitcnt vmcnt(0) 572; SI-NEXT: v_ffbl_b32_e32 v1, v0 573; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 574; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc 575; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 576; SI-NEXT: s_endpgm 577; 578; VI-LABEL: v_cttz_zero_undef_i8_with_select: 579; VI: ; %bb.0: 580; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 581; VI-NEXT: s_waitcnt lgkmcnt(0) 582; VI-NEXT: v_mov_b32_e32 v0, s2 583; VI-NEXT: v_mov_b32_e32 v1, s3 584; VI-NEXT: flat_load_ubyte v0, v[0:1] 585; VI-NEXT: s_waitcnt vmcnt(0) 586; VI-NEXT: v_ffbl_b32_e32 v1, v0 587; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 588; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 589; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc 590; VI-NEXT: v_mov_b32_e32 v0, s0 591; VI-NEXT: v_mov_b32_e32 v1, s1 592; VI-NEXT: flat_store_byte v[0:1], v2 593; VI-NEXT: s_endpgm 594; 595; EG-LABEL: v_cttz_zero_undef_i8_with_select: 596; EG: ; %bb.0: 597; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 598; EG-NEXT: TEX 0 @6 599; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 600; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 601; EG-NEXT: CF_END 602; EG-NEXT: PAD 603; EG-NEXT: Fetch clause starting at 6: 604; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 605; EG-NEXT: ALU clause starting at 8: 606; EG-NEXT: MOV * T0.X, KC0[2].Z, 607; EG-NEXT: ALU clause starting at 9: 608; EG-NEXT: FFBL_INT T0.W, T0.X, 609; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 610; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 611; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 612; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 613; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 614; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 615; EG-NEXT: 
255(3.573311e-43), 3(4.203895e-45) 616; EG-NEXT: LSHL T0.X, PV.W, PS, 617; EG-NEXT: LSHL * T0.W, literal.x, PS, 618; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 619; EG-NEXT: MOV T0.Y, 0.0, 620; EG-NEXT: MOV * T0.Z, 0.0, 621; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 622; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 623; 624; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: 625; GFX9-GISEL: ; %bb.0: 626; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 627; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 628; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 629; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 630; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 631; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 632; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 633; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 634; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 635; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 636; GFX9-GISEL-NEXT: s_endpgm 637 %val = load i8, ptr addrspace(1) %arrayidx, align 1 638 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone 639 %cttz_ret = icmp ne i8 %val, 0 640 %ret = select i1 %cttz_ret, i8 %cttz, i8 32 641 store i8 %ret, ptr addrspace(1) %out, align 4 642 ret void 643} 644 645define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { 646; SI-LABEL: v_cttz_zero_undef_i16_with_select: 647; SI: ; %bb.0: 648; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 649; SI-NEXT: s_mov_b32 s7, 0xf000 650; SI-NEXT: s_mov_b32 s6, -1 651; SI-NEXT: s_mov_b32 s10, s6 652; SI-NEXT: s_mov_b32 s11, s7 653; SI-NEXT: s_waitcnt lgkmcnt(0) 654; SI-NEXT: s_mov_b32 s8, s2 655; SI-NEXT: s_mov_b32 s9, s3 656; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 657; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 658; SI-NEXT: s_mov_b32 s4, s0 659; SI-NEXT: s_mov_b32 s5, s1 660; SI-NEXT: s_waitcnt vmcnt(1) 661; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 662; SI-NEXT: s_waitcnt vmcnt(0) 663; 
SI-NEXT: v_or_b32_e32 v0, v0, v1 664; SI-NEXT: v_ffbl_b32_e32 v1, v0 665; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 666; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc 667; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 668; SI-NEXT: s_endpgm 669; 670; VI-LABEL: v_cttz_zero_undef_i16_with_select: 671; VI: ; %bb.0: 672; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 673; VI-NEXT: s_waitcnt lgkmcnt(0) 674; VI-NEXT: s_add_u32 s4, s2, 1 675; VI-NEXT: s_addc_u32 s5, s3, 0 676; VI-NEXT: v_mov_b32_e32 v2, s4 677; VI-NEXT: v_mov_b32_e32 v0, s2 678; VI-NEXT: v_mov_b32_e32 v3, s5 679; VI-NEXT: v_mov_b32_e32 v1, s3 680; VI-NEXT: flat_load_ubyte v2, v[2:3] 681; VI-NEXT: flat_load_ubyte v0, v[0:1] 682; VI-NEXT: s_waitcnt vmcnt(1) 683; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 684; VI-NEXT: s_waitcnt vmcnt(0) 685; VI-NEXT: v_or_b32_e32 v0, v1, v0 686; VI-NEXT: v_ffbl_b32_e32 v1, v0 687; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 688; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 689; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc 690; VI-NEXT: v_mov_b32_e32 v0, s0 691; VI-NEXT: v_mov_b32_e32 v1, s1 692; VI-NEXT: flat_store_short v[0:1], v2 693; VI-NEXT: s_endpgm 694; 695; EG-LABEL: v_cttz_zero_undef_i16_with_select: 696; EG: ; %bb.0: 697; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 698; EG-NEXT: TEX 0 @6 699; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 700; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 701; EG-NEXT: CF_END 702; EG-NEXT: PAD 703; EG-NEXT: Fetch clause starting at 6: 704; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 705; EG-NEXT: ALU clause starting at 8: 706; EG-NEXT: MOV * T0.X, KC0[2].Z, 707; EG-NEXT: ALU clause starting at 9: 708; EG-NEXT: FFBL_INT T0.W, T0.X, 709; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 710; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 711; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, 712; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 713; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 714; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 715; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 716; EG-NEXT: LSHL 
T0.X, PV.W, PS, 717; EG-NEXT: LSHL * T0.W, literal.x, PS, 718; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 719; EG-NEXT: MOV T0.Y, 0.0, 720; EG-NEXT: MOV * T0.Z, 0.0, 721; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 722; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 723; 724; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: 725; GFX9-GISEL: ; %bb.0: 726; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 727; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 728; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 729; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 730; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 731; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 732; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 733; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 734; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 735; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 736; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 737; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] 738; GFX9-GISEL-NEXT: s_endpgm 739 %val = load i16, ptr addrspace(1) %arrayidx, align 1 740 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone 741 %cttz_ret = icmp ne i16 %val, 0 742 %ret = select i1 %cttz_ret, i16 %cttz, i16 32 743 store i16 %ret, ptr addrspace(1) %out, align 4 744 ret void 745} 746 747define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { 748; SI-LABEL: v_cttz_zero_undef_i32_with_select: 749; SI: ; %bb.0: 750; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 751; SI-NEXT: s_mov_b32 s7, 0xf000 752; SI-NEXT: s_mov_b32 s6, -1 753; SI-NEXT: s_mov_b32 s10, s6 754; SI-NEXT: s_mov_b32 s11, s7 755; SI-NEXT: s_waitcnt lgkmcnt(0) 756; SI-NEXT: s_mov_b32 s8, s2 757; SI-NEXT: s_mov_b32 s9, s3 758; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 759; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3 760; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 761; SI-NEXT: buffer_load_ubyte v3, off, 
s[8:11], 0 offset:2 762; SI-NEXT: s_mov_b32 s4, s0 763; SI-NEXT: s_mov_b32 s5, s1 764; SI-NEXT: s_waitcnt vmcnt(3) 765; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 766; SI-NEXT: s_waitcnt vmcnt(2) 767; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 768; SI-NEXT: s_waitcnt vmcnt(1) 769; SI-NEXT: v_or_b32_e32 v0, v0, v2 770; SI-NEXT: s_waitcnt vmcnt(0) 771; SI-NEXT: v_or_b32_e32 v1, v1, v3 772; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 773; SI-NEXT: v_or_b32_e32 v0, v1, v0 774; SI-NEXT: v_ffbl_b32_e32 v0, v0 775; SI-NEXT: v_min_u32_e32 v0, 32, v0 776; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 777; SI-NEXT: s_endpgm 778; 779; VI-LABEL: v_cttz_zero_undef_i32_with_select: 780; VI: ; %bb.0: 781; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 782; VI-NEXT: s_waitcnt lgkmcnt(0) 783; VI-NEXT: s_add_u32 s4, s2, 3 784; VI-NEXT: s_addc_u32 s5, s3, 0 785; VI-NEXT: v_mov_b32_e32 v2, s4 786; VI-NEXT: v_mov_b32_e32 v3, s5 787; VI-NEXT: s_add_u32 s4, s2, 2 788; VI-NEXT: v_mov_b32_e32 v0, s2 789; VI-NEXT: s_addc_u32 s5, s3, 0 790; VI-NEXT: v_mov_b32_e32 v1, s3 791; VI-NEXT: s_add_u32 s2, s2, 1 792; VI-NEXT: s_addc_u32 s3, s3, 0 793; VI-NEXT: v_mov_b32_e32 v4, s4 794; VI-NEXT: v_mov_b32_e32 v7, s3 795; VI-NEXT: v_mov_b32_e32 v5, s5 796; VI-NEXT: v_mov_b32_e32 v6, s2 797; VI-NEXT: flat_load_ubyte v2, v[2:3] 798; VI-NEXT: flat_load_ubyte v3, v[4:5] 799; VI-NEXT: flat_load_ubyte v4, v[6:7] 800; VI-NEXT: flat_load_ubyte v0, v[0:1] 801; VI-NEXT: s_waitcnt vmcnt(3) 802; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 803; VI-NEXT: s_waitcnt vmcnt(2) 804; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 805; VI-NEXT: s_waitcnt vmcnt(1) 806; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 807; VI-NEXT: s_waitcnt vmcnt(0) 808; VI-NEXT: v_or_b32_e32 v0, v2, v0 809; VI-NEXT: v_or_b32_e32 v0, v1, v0 810; VI-NEXT: v_ffbl_b32_e32 v0, v0 811; VI-NEXT: v_min_u32_e32 v2, 32, v0 812; VI-NEXT: v_mov_b32_e32 v0, s0 813; VI-NEXT: v_mov_b32_e32 v1, s1 814; VI-NEXT: flat_store_dword v[0:1], v2 
815; VI-NEXT: s_endpgm 816; 817; EG-LABEL: v_cttz_zero_undef_i32_with_select: 818; EG: ; %bb.0: 819; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 820; EG-NEXT: TEX 1 @6 821; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] 822; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 823; EG-NEXT: CF_END 824; EG-NEXT: PAD 825; EG-NEXT: Fetch clause starting at 6: 826; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 827; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 828; EG-NEXT: ALU clause starting at 10: 829; EG-NEXT: MOV * T0.X, KC0[2].Z, 830; EG-NEXT: ALU clause starting at 11: 831; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 832; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 833; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, 834; EG-NEXT: FFBL_INT * T1.W, PV.W, 835; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, 836; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 837; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 838; 839; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: 840; GFX9-GISEL: ; %bb.0: 841; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 842; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 843; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 844; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 845; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 846; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 847; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 848; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) 849; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 850; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) 851; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 852; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) 853; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 854; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 855; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 856; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 857; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc 858; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 859; GFX9-GISEL-NEXT: s_endpgm 860 %val = load i32, ptr addrspace(1) %arrayidx, align 1 861 %cttz = tail call i32 
@llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i32 %val, 0
  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
  store i32 %ret, ptr addrspace(1) %out, align 4
  ret void
}

; i64 cttz with the zero-is-poison flag set, fed by a byte-aligned (align 1)
; load. The IR select substitutes 64 when the loaded value is zero.
define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_cttz_zero_undef_i64_with_select:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5
; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7
; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1
; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2
; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3
; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4
; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(7)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v6
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v1, v1, v7
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_or_b32_e32 v3, v5, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v2
; SI-NEXT: v_ffbl_b32_e32 v1, v1
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT: v_min_u32_e32 v0, v0, v1
; SI-NEXT: v_min_u32_e32 v0, 64, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_zero_undef_i64_with_select:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 5
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_add_u32 s4, s2, 4
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 7
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_u32 s4, s2, 6
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_mov_b32_e32 v8, s4
; VI-NEXT: s_add_u32 s4, s2, 2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v11, s5
; VI-NEXT: v_mov_b32_e32 v10, s4
; VI-NEXT: flat_load_ubyte v12, v[0:1]
; VI-NEXT: flat_load_ubyte v13, v[2:3]
; VI-NEXT: flat_load_ubyte v4, v[4:5]
; VI-NEXT: flat_load_ubyte v5, v[6:7]
; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: flat_load_ubyte v6, v[8:9]
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_ubyte v7, v[10:11]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt vmcnt(7)
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12
; VI-NEXT: s_waitcnt vmcnt(6)
; VI-NEXT: v_or_b32_e32 v3, v3, v13
; VI-NEXT: s_waitcnt vmcnt(5)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
; VI-NEXT: s_waitcnt vmcnt(4)
; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6
; VI-NEXT: v_ffbl_b32_e32 v3, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, v3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_min_u32_e32 v0, 64, v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_zero_undef_i64_with_select:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
; EG-NEXT: VTX_READ_16 T3.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT: FFBL_INT T1.W, PV.W,
; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
; EG-NEXT: OR_INT * T1.W, PS, T2.X,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: FFBL_INT T2.W, PS,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W,
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2
; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6
; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5
; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2
; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4
; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %val = load i64, ptr addrspace(1) %arrayidx, align 1
  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
  %cttz_ret = icmp ne i64 %val, 0
  %ret = select i1 %cttz_ret, i64 %cttz, i64 64
  store i64 %ret, ptr addrspace(1) %out, align 4
  ret void
}

; i32 cttz with a defined result for zero (flag is false), fed by an align 1
; load. The select yields -1 when the input compares equal to zero.
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
; VI-NEXT: flat_load_ubyte v4, v[6:7]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT: FFBL_INT * T1.W, PV.W,
; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; Same computation as the eq_neg1 variant, written with the inverted compare
; (icmp ne) and the select operands swapped.
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
; VI-NEXT: flat_load_ubyte v4, v[6:7]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT: FFBL_INT * T1.W, PV.W,
; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; Here the compare is on the cttz result against the bit width (32) instead of
; on the input against zero; the select yields -1 when the result equals 32.
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3
; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0
; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 3
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_add_u32 s4, s2, 2
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 1
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
; VI-NEXT: flat_load_ubyte v4, v[6:7]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT: FFBL_INT * T1.W, PV.W,
; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
;
; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %cttz, 32
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; i8 version of the eq_neg1 pattern: cttz with a defined zero result, and the
; select yields -1 when the loaded byte is zero.
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_cttz_i8_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i8_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0xff
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x100, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: v_ffbl_b32_e32 v2, v2
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i8_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %val = load i8, ptr addrspace(1) %arrayidx, align 1
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %cttz
  store i8 %sel, ptr addrspace(1) %out
  ret void
}

; i16 version of the eq_neg1 pattern, with the i16 value loaded at align 1.
define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
; SI-LABEL: v_cttz_i16_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_cttz_i16_sel_eq_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 1
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_readfirstlane_b32 s2, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_readfirstlane_b32 s3, v0
; VI-NEXT: s_lshl_b32 s2, s2, 8
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_or_b32 s3, s2, 0x10000
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_ff1_i32_b32 s3, s3
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, s3, 0xffff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_cttz_i16_sel_eq_neg1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: FFBL_INT T0.W, T0.X,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
  %val = load i16, ptr addrspace(1) %arrayidx, align 1
  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %cttz
  store i16 %sel, ptr addrspace(1) %out
  ret void
}