1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI 3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI 4; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG 5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10 6; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL 7 8declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone 9declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone 10declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone 11 12declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone 13declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone 14declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone 15 16declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone 17declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone 18declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone 19 20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 21 22define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind { 23; SI-LABEL: s_cttz_i32: 24; SI: ; %bb.0: 25; SI-NEXT: s_load_dword s2, s[4:5], 0xb 26; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 27; SI-NEXT: s_mov_b32 s3, 0xf000 28; SI-NEXT: s_waitcnt lgkmcnt(0) 29; SI-NEXT: s_ff1_i32_b32 s2, s2 30; SI-NEXT: s_min_u32 s4, s2, 32 31; SI-NEXT: s_mov_b32 s2, -1 32; SI-NEXT: v_mov_b32_e32 v0, s4 33; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 34; SI-NEXT: s_endpgm 35; 36; VI-LABEL: s_cttz_i32: 37; VI: ; %bb.0: 38; VI-NEXT: s_load_dword s6, s[4:5], 0x2c 39; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 40; VI-NEXT: s_mov_b32 
s3, 0xf000 41; VI-NEXT: s_mov_b32 s2, -1 42; VI-NEXT: s_waitcnt lgkmcnt(0) 43; VI-NEXT: s_ff1_i32_b32 s4, s6 44; VI-NEXT: s_min_u32 s4, s4, 32 45; VI-NEXT: v_mov_b32_e32 v0, s4 46; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 47; VI-NEXT: s_endpgm 48; 49; EG-LABEL: s_cttz_i32: 50; EG: ; %bb.0: 51; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 52; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 53; EG-NEXT: CF_END 54; EG-NEXT: PAD 55; EG-NEXT: ALU clause starting at 4: 56; EG-NEXT: FFBL_INT * T0.W, KC0[2].Z, 57; EG-NEXT: CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W, 58; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 59; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 60; 61; GFX10-LABEL: s_cttz_i32: 62; GFX10: ; %bb.0: 63; GFX10-NEXT: s_clause 0x1 64; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c 65; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 66; GFX10-NEXT: v_mov_b32_e32 v0, 0 67; GFX10-NEXT: s_waitcnt lgkmcnt(0) 68; GFX10-NEXT: s_ff1_i32_b32 s2, s2 69; GFX10-NEXT: s_min_u32 s2, s2, 32 70; GFX10-NEXT: v_mov_b32_e32 v1, s2 71; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 72; GFX10-NEXT: s_endpgm 73; 74; GFX10-GISEL-LABEL: s_cttz_i32: 75; GFX10-GISEL: ; %bb.0: 76; GFX10-GISEL-NEXT: s_clause 0x1 77; GFX10-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c 78; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 79; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 80; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 81; GFX10-GISEL-NEXT: s_ff1_i32_b32 s2, s2 82; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 32 83; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 84; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 85; GFX10-GISEL-NEXT: s_endpgm 86 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 87 store i32 %cttz, ptr addrspace(1) %out, align 4 88 ret void 89} 90 91define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 92; SI-LABEL: v_cttz_i32: 93; SI: ; %bb.0: 94; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 95; SI-NEXT: s_mov_b32 s7, 0xf000 96; 
SI-NEXT: s_mov_b32 s10, 0 97; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 98; SI-NEXT: v_mov_b32_e32 v1, 0 99; SI-NEXT: s_mov_b32 s11, s7 100; SI-NEXT: s_waitcnt lgkmcnt(0) 101; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 102; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 103; SI-NEXT: s_mov_b32 s6, -1 104; SI-NEXT: s_mov_b32 s4, s0 105; SI-NEXT: s_mov_b32 s5, s1 106; SI-NEXT: s_waitcnt vmcnt(0) 107; SI-NEXT: v_ffbl_b32_e32 v0, v0 108; SI-NEXT: v_min_u32_e32 v0, 32, v0 109; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 110; SI-NEXT: s_endpgm 111; 112; VI-LABEL: v_cttz_i32: 113; VI: ; %bb.0: 114; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 115; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 116; VI-NEXT: s_waitcnt lgkmcnt(0) 117; VI-NEXT: v_mov_b32_e32 v1, s3 118; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 119; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 120; VI-NEXT: flat_load_dword v0, v[0:1] 121; VI-NEXT: s_mov_b32 s3, 0xf000 122; VI-NEXT: s_mov_b32 s2, -1 123; VI-NEXT: s_waitcnt vmcnt(0) 124; VI-NEXT: v_ffbl_b32_e32 v0, v0 125; VI-NEXT: v_min_u32_e32 v0, 32, v0 126; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 127; VI-NEXT: s_endpgm 128; 129; EG-LABEL: v_cttz_i32: 130; EG: ; %bb.0: 131; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 132; EG-NEXT: TEX 0 @6 133; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] 134; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 135; EG-NEXT: CF_END 136; EG-NEXT: PAD 137; EG-NEXT: Fetch clause starting at 6: 138; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 139; EG-NEXT: ALU clause starting at 8: 140; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 141; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 142; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 143; EG-NEXT: ALU clause starting at 11: 144; EG-NEXT: FFBL_INT * T0.W, T0.X, 145; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 146; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 147; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 148; 149; GFX10-LABEL: v_cttz_i32: 150; GFX10: ; %bb.0: 151; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
152; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 153; GFX10-NEXT: v_mov_b32_e32 v1, 0 154; GFX10-NEXT: s_waitcnt lgkmcnt(0) 155; GFX10-NEXT: global_load_dword v0, v0, s[2:3] 156; GFX10-NEXT: s_waitcnt vmcnt(0) 157; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 158; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 159; GFX10-NEXT: global_store_dword v1, v0, s[0:1] 160; GFX10-NEXT: s_endpgm 161; 162; GFX10-GISEL-LABEL: v_cttz_i32: 163; GFX10-GISEL: ; %bb.0: 164; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 165; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 166; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 167; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 168; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] 169; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 170; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 171; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 172; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 173; GFX10-GISEL-NEXT: s_endpgm 174 %tid = call i32 @llvm.amdgcn.workitem.id.x() 175 %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid 176 %val = load i32, ptr addrspace(1) %in.gep, align 4 177 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone 178 store i32 %cttz, ptr addrspace(1) %out, align 4 179 ret void 180} 181 182define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 183; SI-LABEL: v_cttz_v2i32: 184; SI: ; %bb.0: 185; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 186; SI-NEXT: s_mov_b32 s7, 0xf000 187; SI-NEXT: s_mov_b32 s10, 0 188; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 189; SI-NEXT: v_mov_b32_e32 v1, 0 190; SI-NEXT: s_mov_b32 s11, s7 191; SI-NEXT: s_waitcnt lgkmcnt(0) 192; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 193; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 194; SI-NEXT: s_mov_b32 s6, -1 195; SI-NEXT: s_mov_b32 s4, s0 196; SI-NEXT: s_mov_b32 s5, s1 197; SI-NEXT: s_waitcnt vmcnt(0) 198; SI-NEXT: v_ffbl_b32_e32 v1, v1 199; SI-NEXT: v_ffbl_b32_e32 v0, v0 200; SI-NEXT: v_min_u32_e32 v1, 32, v1 
201; SI-NEXT: v_min_u32_e32 v0, 32, v0 202; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 203; SI-NEXT: s_endpgm 204; 205; VI-LABEL: v_cttz_v2i32: 206; VI: ; %bb.0: 207; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 208; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 209; VI-NEXT: s_waitcnt lgkmcnt(0) 210; VI-NEXT: v_mov_b32_e32 v1, s3 211; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 212; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 213; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 214; VI-NEXT: s_mov_b32 s3, 0xf000 215; VI-NEXT: s_mov_b32 s2, -1 216; VI-NEXT: s_waitcnt vmcnt(0) 217; VI-NEXT: v_ffbl_b32_e32 v1, v1 218; VI-NEXT: v_ffbl_b32_e32 v0, v0 219; VI-NEXT: v_min_u32_e32 v1, 32, v1 220; VI-NEXT: v_min_u32_e32 v0, 32, v0 221; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 222; VI-NEXT: s_endpgm 223; 224; EG-LABEL: v_cttz_v2i32: 225; EG: ; %bb.0: 226; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 227; EG-NEXT: TEX 0 @6 228; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] 229; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 230; EG-NEXT: CF_END 231; EG-NEXT: PAD 232; EG-NEXT: Fetch clause starting at 6: 233; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 234; EG-NEXT: ALU clause starting at 8: 235; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 236; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 237; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 238; EG-NEXT: ALU clause starting at 11: 239; EG-NEXT: FFBL_INT * T0.W, T0.Y, 240; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 241; EG-NEXT: FFBL_INT * T0.W, T0.X, 242; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 243; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 244; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 245; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 246; 247; GFX10-LABEL: v_cttz_v2i32: 248; GFX10: ; %bb.0: 249; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 250; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 251; GFX10-NEXT: v_mov_b32_e32 v2, 0 252; GFX10-NEXT: s_waitcnt lgkmcnt(0) 253; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 254; GFX10-NEXT: 
s_waitcnt vmcnt(0) 255; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 256; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 257; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 258; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 259; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 260; GFX10-NEXT: s_endpgm 261; 262; GFX10-GISEL-LABEL: v_cttz_v2i32: 263; GFX10-GISEL: ; %bb.0: 264; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 265; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 266; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 267; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 268; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] 269; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 270; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 271; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 272; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 273; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 274; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 275; GFX10-GISEL-NEXT: s_endpgm 276 %tid = call i32 @llvm.amdgcn.workitem.id.x() 277 %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid 278 %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8 279 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone 280 store <2 x i32> %cttz, ptr addrspace(1) %out, align 8 281 ret void 282} 283 284define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 285; SI-LABEL: v_cttz_v4i32: 286; SI: ; %bb.0: 287; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 288; SI-NEXT: s_mov_b32 s7, 0xf000 289; SI-NEXT: s_mov_b32 s10, 0 290; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 291; SI-NEXT: v_mov_b32_e32 v1, 0 292; SI-NEXT: s_mov_b32 s11, s7 293; SI-NEXT: s_waitcnt lgkmcnt(0) 294; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 295; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 296; SI-NEXT: s_mov_b32 s6, -1 297; SI-NEXT: s_mov_b32 s4, s0 298; SI-NEXT: s_mov_b32 s5, s1 299; SI-NEXT: s_waitcnt vmcnt(0) 300; SI-NEXT: v_ffbl_b32_e32 v3, v3 301; SI-NEXT: v_ffbl_b32_e32 v2, v2 
302; SI-NEXT: v_ffbl_b32_e32 v1, v1 303; SI-NEXT: v_ffbl_b32_e32 v0, v0 304; SI-NEXT: v_min_u32_e32 v3, 32, v3 305; SI-NEXT: v_min_u32_e32 v2, 32, v2 306; SI-NEXT: v_min_u32_e32 v1, 32, v1 307; SI-NEXT: v_min_u32_e32 v0, 32, v0 308; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 309; SI-NEXT: s_endpgm 310; 311; VI-LABEL: v_cttz_v4i32: 312; VI: ; %bb.0: 313; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 314; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 315; VI-NEXT: s_waitcnt lgkmcnt(0) 316; VI-NEXT: v_mov_b32_e32 v1, s3 317; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 318; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 319; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 320; VI-NEXT: s_mov_b32 s3, 0xf000 321; VI-NEXT: s_mov_b32 s2, -1 322; VI-NEXT: s_waitcnt vmcnt(0) 323; VI-NEXT: v_ffbl_b32_e32 v3, v3 324; VI-NEXT: v_ffbl_b32_e32 v2, v2 325; VI-NEXT: v_ffbl_b32_e32 v1, v1 326; VI-NEXT: v_ffbl_b32_e32 v0, v0 327; VI-NEXT: v_min_u32_e32 v3, 32, v3 328; VI-NEXT: v_min_u32_e32 v2, 32, v2 329; VI-NEXT: v_min_u32_e32 v1, 32, v1 330; VI-NEXT: v_min_u32_e32 v0, 32, v0 331; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 332; VI-NEXT: s_endpgm 333; 334; EG-LABEL: v_cttz_v4i32: 335; EG: ; %bb.0: 336; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 337; EG-NEXT: TEX 0 @6 338; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 339; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 340; EG-NEXT: CF_END 341; EG-NEXT: PAD 342; EG-NEXT: Fetch clause starting at 6: 343; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 344; EG-NEXT: ALU clause starting at 8: 345; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 346; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 347; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 348; EG-NEXT: ALU clause starting at 11: 349; EG-NEXT: FFBL_INT * T1.W, T0.W, 350; EG-NEXT: FFBL_INT T2.W, T0.Z, 351; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122 352; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 353; EG-NEXT: CNDE_INT T0.Z, T0.Z, literal.x, PV.W, 354; EG-NEXT: FFBL_INT * T1.W, T0.Y, 
355; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 356; EG-NEXT: CNDE_INT T0.Y, T0.Y, literal.x, PV.W, 357; EG-NEXT: FFBL_INT * T1.W, T0.X, 358; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 359; EG-NEXT: CNDE_INT T0.X, T0.X, literal.x, PV.W, 360; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 361; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) 362; 363; GFX10-LABEL: v_cttz_v4i32: 364; GFX10: ; %bb.0: 365; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 366; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 367; GFX10-NEXT: v_mov_b32_e32 v4, 0 368; GFX10-NEXT: s_waitcnt lgkmcnt(0) 369; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 370; GFX10-NEXT: s_waitcnt vmcnt(0) 371; GFX10-NEXT: v_ffbl_b32_e32 v3, v3 372; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 373; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 374; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 375; GFX10-NEXT: v_min_u32_e32 v3, 32, v3 376; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 377; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 378; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 379; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 380; GFX10-NEXT: s_endpgm 381; 382; GFX10-GISEL-LABEL: v_cttz_v4i32: 383; GFX10-GISEL: ; %bb.0: 384; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 385; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 386; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 387; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 388; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] 389; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 390; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 391; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 392; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 393; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 394; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 395; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 396; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 397; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 398; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 399; GFX10-GISEL-NEXT: s_endpgm 400 %tid = call i32 @llvm.amdgcn.workitem.id.x() 401 %in.gep = getelementptr <4 x i32>, ptr 
addrspace(1) %valptr, i32 %tid 402 %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16 403 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone 404 store <4 x i32> %cttz, ptr addrspace(1) %out, align 16 405 ret void 406} 407 408define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { 409; SI-LABEL: v_cttz_i8: 410; SI: ; %bb.0: 411; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 412; SI-NEXT: s_mov_b32 s7, 0xf000 413; SI-NEXT: s_mov_b32 s6, -1 414; SI-NEXT: s_mov_b32 s10, s6 415; SI-NEXT: s_mov_b32 s11, s7 416; SI-NEXT: s_waitcnt lgkmcnt(0) 417; SI-NEXT: s_mov_b32 s8, s2 418; SI-NEXT: s_mov_b32 s9, s3 419; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 420; SI-NEXT: s_mov_b32 s4, s0 421; SI-NEXT: s_mov_b32 s5, s1 422; SI-NEXT: s_waitcnt vmcnt(0) 423; SI-NEXT: v_or_b32_e32 v0, 0x100, v0 424; SI-NEXT: v_ffbl_b32_e32 v0, v0 425; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 426; SI-NEXT: s_endpgm 427; 428; VI-LABEL: v_cttz_i8: 429; VI: ; %bb.0: 430; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 431; VI-NEXT: s_mov_b32 s7, 0xf000 432; VI-NEXT: s_mov_b32 s6, -1 433; VI-NEXT: s_mov_b32 s10, s6 434; VI-NEXT: s_mov_b32 s11, s7 435; VI-NEXT: s_waitcnt lgkmcnt(0) 436; VI-NEXT: s_mov_b32 s8, s2 437; VI-NEXT: s_mov_b32 s9, s3 438; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 439; VI-NEXT: s_mov_b32 s4, s0 440; VI-NEXT: s_mov_b32 s5, s1 441; VI-NEXT: s_waitcnt vmcnt(0) 442; VI-NEXT: v_or_b32_e32 v0, 0x100, v0 443; VI-NEXT: v_ffbl_b32_e32 v0, v0 444; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 445; VI-NEXT: s_endpgm 446; 447; EG-LABEL: v_cttz_i8: 448; EG: ; %bb.0: 449; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 450; EG-NEXT: TEX 0 @6 451; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 452; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 453; EG-NEXT: CF_END 454; EG-NEXT: PAD 455; EG-NEXT: Fetch clause starting at 6: 456; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 457; EG-NEXT: ALU clause starting at 8: 458; 
EG-NEXT: MOV * T0.X, KC0[2].Z, 459; EG-NEXT: ALU clause starting at 9: 460; EG-NEXT: OR_INT * T0.W, T0.X, literal.x, 461; EG-NEXT: 256(3.587324e-43), 0(0.000000e+00) 462; EG-NEXT: FFBL_INT T0.W, PV.W, 463; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 464; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 465; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 466; EG-NEXT: LSHL * T1.W, PS, literal.y, 467; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) 468; EG-NEXT: LSHL T0.X, PV.W, PS, 469; EG-NEXT: LSHL * T0.W, literal.x, PS, 470; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) 471; EG-NEXT: MOV T0.Y, 0.0, 472; EG-NEXT: MOV * T0.Z, 0.0, 473; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 474; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 475; 476; GFX10-LABEL: v_cttz_i8: 477; GFX10: ; %bb.0: 478; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 479; GFX10-NEXT: v_mov_b32_e32 v0, 0 480; GFX10-NEXT: s_waitcnt lgkmcnt(0) 481; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] 482; GFX10-NEXT: s_waitcnt vmcnt(0) 483; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1 484; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 485; GFX10-NEXT: global_store_byte v0, v1, s[0:1] 486; GFX10-NEXT: s_endpgm 487; 488; GFX10-GISEL-LABEL: v_cttz_i8: 489; GFX10-GISEL: ; %bb.0: 490; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 491; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 492; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 493; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] 494; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 495; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1 496; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 497; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] 498; GFX10-GISEL-NEXT: s_endpgm 499 %val = load i8, ptr addrspace(1) %valptr 500 %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone 501 store i8 %cttz, ptr addrspace(1) %out 502 ret void 503} 504 505define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind { 506; SI-LABEL: s_cttz_i64: 507; SI: ; %bb.0: 508; SI-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x13 509; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 510; SI-NEXT: s_mov_b32 s3, 0xf000 511; SI-NEXT: s_mov_b32 s2, -1 512; SI-NEXT: s_waitcnt lgkmcnt(0) 513; SI-NEXT: s_ff1_i32_b64 s4, s[6:7] 514; SI-NEXT: s_min_u32 s4, s4, 64 515; SI-NEXT: v_mov_b32_e32 v1, 0 516; SI-NEXT: v_mov_b32_e32 v0, s4 517; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 518; SI-NEXT: s_endpgm 519; 520; VI-LABEL: s_cttz_i64: 521; VI: ; %bb.0: 522; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4c 523; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 524; VI-NEXT: s_mov_b32 s3, 0xf000 525; VI-NEXT: s_mov_b32 s2, -1 526; VI-NEXT: v_mov_b32_e32 v1, 0 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: s_ff1_i32_b64 s4, s[6:7] 529; VI-NEXT: s_min_u32 s4, s4, 64 530; VI-NEXT: v_mov_b32_e32 v0, s4 531; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 532; VI-NEXT: s_endpgm 533; 534; EG-LABEL: s_cttz_i64: 535; EG: ; %bb.0: 536; EG-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[] 537; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 538; EG-NEXT: CF_END 539; EG-NEXT: PAD 540; EG-NEXT: ALU clause starting at 4: 541; EG-NEXT: FFBL_INT * T0.W, KC0[5].X, 542; EG-NEXT: CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W, 543; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 544; EG-NEXT: FFBL_INT T1.W, KC0[4].W, 545; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 546; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 547; EG-NEXT: CNDE_INT T0.X, KC0[4].W, PS, PV.W, 548; EG-NEXT: MOV T0.Y, 0.0, 549; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 550; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 551; 552; GFX10-LABEL: s_cttz_i64: 553; GFX10: ; %bb.0: 554; GFX10-NEXT: s_clause 0x1 555; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c 556; GFX10-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 557; GFX10-NEXT: v_mov_b32_e32 v1, 0 558; GFX10-NEXT: s_waitcnt lgkmcnt(0) 559; GFX10-NEXT: s_ff1_i32_b64 s0, s[0:1] 560; GFX10-NEXT: s_min_u32 s0, s0, 64 561; GFX10-NEXT: v_mov_b32_e32 v0, s0 562; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[2:3] 
563; GFX10-NEXT: s_endpgm 564; 565; GFX10-GISEL-LABEL: s_cttz_i64: 566; GFX10-GISEL: ; %bb.0: 567; GFX10-GISEL-NEXT: s_clause 0x1 568; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c 569; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 570; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 571; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 572; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[0:1] 573; GFX10-GISEL-NEXT: s_mov_b32 s1, 0 574; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 575; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 576; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1 577; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 578; GFX10-GISEL-NEXT: s_endpgm 579 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) 580 store i64 %cttz, ptr addrspace(1) %out 581 ret void 582} 583 584define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind { 585; SI-LABEL: s_cttz_i64_trunc: 586; SI: ; %bb.0: 587; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 588; SI-NEXT: s_mov_b32 s7, 0xf000 589; SI-NEXT: s_waitcnt lgkmcnt(0) 590; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] 591; SI-NEXT: s_min_u32 s2, s2, 64 592; SI-NEXT: s_mov_b32 s6, -1 593; SI-NEXT: s_mov_b32 s4, s0 594; SI-NEXT: s_mov_b32 s5, s1 595; SI-NEXT: v_mov_b32_e32 v0, s2 596; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 597; SI-NEXT: s_endpgm 598; 599; VI-LABEL: s_cttz_i64_trunc: 600; VI: ; %bb.0: 601; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 602; VI-NEXT: s_mov_b32 s7, 0xf000 603; VI-NEXT: s_mov_b32 s6, -1 604; VI-NEXT: s_waitcnt lgkmcnt(0) 605; VI-NEXT: s_mov_b32 s4, s0 606; VI-NEXT: s_ff1_i32_b64 s0, s[2:3] 607; VI-NEXT: s_min_u32 s0, s0, 64 608; VI-NEXT: s_mov_b32 s5, s1 609; VI-NEXT: v_mov_b32_e32 v0, s0 610; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 611; VI-NEXT: s_endpgm 612; 613; EG-LABEL: s_cttz_i64_trunc: 614; EG: ; %bb.0: 615; EG-NEXT: ALU 8, @4, KC0[CB0:0-32], KC1[] 616; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 617; EG-NEXT: CF_END 618; EG-NEXT: PAD 619; EG-NEXT: ALU clause starting 
at 4: 620; EG-NEXT: FFBL_INT * T0.W, KC0[3].X, 621; EG-NEXT: CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W, 622; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 623; EG-NEXT: FFBL_INT T1.W, KC0[2].W, 624; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, 625; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 626; EG-NEXT: CNDE_INT T0.X, KC0[2].W, PS, PV.W, 627; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 628; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 629; 630; GFX10-LABEL: s_cttz_i64_trunc: 631; GFX10: ; %bb.0: 632; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 633; GFX10-NEXT: v_mov_b32_e32 v0, 0 634; GFX10-NEXT: s_waitcnt lgkmcnt(0) 635; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] 636; GFX10-NEXT: s_min_u32 s2, s2, 64 637; GFX10-NEXT: v_mov_b32_e32 v1, s2 638; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 639; GFX10-NEXT: s_endpgm 640; 641; GFX10-GISEL-LABEL: s_cttz_i64_trunc: 642; GFX10-GISEL: ; %bb.0: 643; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 644; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 645; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 646; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] 647; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 648; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 649; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] 650; GFX10-GISEL-NEXT: s_endpgm 651 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) 652 %trunc = trunc i64 %cttz to i32 653 store i32 %trunc, ptr addrspace(1) %out 654 ret void 655} 656 657define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 658; SI-LABEL: v_cttz_i64: 659; SI: ; %bb.0: 660; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 661; SI-NEXT: s_mov_b32 s7, 0xf000 662; SI-NEXT: s_mov_b32 s6, 0 663; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 664; SI-NEXT: v_mov_b32_e32 v1, 0 665; SI-NEXT: s_waitcnt lgkmcnt(0) 666; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 667; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 668; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 669; SI-NEXT: s_waitcnt vmcnt(0) 670; SI-NEXT: 
v_ffbl_b32_e32 v3, v3 671; SI-NEXT: v_min_u32_e32 v3, 0xffffffdf, v3 672; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v3 673; SI-NEXT: v_ffbl_b32_e32 v2, v2 674; SI-NEXT: v_min3_u32 v2, v2, v3, 64 675; SI-NEXT: v_mov_b32_e32 v3, v1 676; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 677; SI-NEXT: s_endpgm 678; 679; VI-LABEL: v_cttz_i64: 680; VI: ; %bb.0: 681; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 682; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 683; VI-NEXT: v_mov_b32_e32 v2, 0 684; VI-NEXT: s_waitcnt lgkmcnt(0) 685; VI-NEXT: v_mov_b32_e32 v1, s3 686; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 687; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 688; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 689; VI-NEXT: v_mov_b32_e32 v4, s1 690; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 691; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 692; VI-NEXT: s_waitcnt vmcnt(0) 693; VI-NEXT: v_ffbl_b32_e32 v1, v1 694; VI-NEXT: v_add_u32_e64 v1, s[0:1], v1, 32 clamp 695; VI-NEXT: v_ffbl_b32_e32 v0, v0 696; VI-NEXT: v_min3_u32 v1, v0, v1, 64 697; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2] 698; VI-NEXT: s_endpgm 699; 700; EG-LABEL: v_cttz_i64: 701; EG: ; %bb.0: 702; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 703; EG-NEXT: TEX 0 @6 704; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 705; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 706; EG-NEXT: CF_END 707; EG-NEXT: PAD 708; EG-NEXT: Fetch clause starting at 6: 709; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 710; EG-NEXT: ALU clause starting at 8: 711; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 712; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 713; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 714; EG-NEXT: ALU clause starting at 11: 715; EG-NEXT: FFBL_INT * T1.W, T0.Y, 716; EG-NEXT: CNDE_INT * T1.W, T0.Y, literal.x, PV.W, 717; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 718; EG-NEXT: FFBL_INT T2.W, T0.X, 719; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, 720; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 721; EG-NEXT: CNDE_INT T0.X, T0.X, PS, PV.W, 722; EG-NEXT: MOV 
T0.Y, 0.0, 723; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 724; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 725; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 726; 727; GFX10-LABEL: v_cttz_i64: 728; GFX10: ; %bb.0: 729; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 730; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 731; GFX10-NEXT: s_waitcnt lgkmcnt(0) 732; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 733; GFX10-NEXT: s_waitcnt vmcnt(0) 734; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 735; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 736; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 737; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 738; GFX10-NEXT: v_mov_b32_e32 v1, 0 739; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 740; GFX10-NEXT: s_endpgm 741; 742; GFX10-GISEL-LABEL: v_cttz_i64: 743; GFX10-GISEL: ; %bb.0: 744; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 745; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 746; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 747; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] 748; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 749; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 750; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 751; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp 752; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 753; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 754; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 755; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 756; GFX10-GISEL-NEXT: s_endpgm 757 %tid = call i32 @llvm.amdgcn.workitem.id.x() 758 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid 759 %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid 760 %val = load i64, ptr addrspace(1) %in.gep 761 %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) 762 store i64 %cttz, ptr addrspace(1) %out.gep 763 ret void 764} 765 766define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 767; SI-LABEL: v_cttz_i64_trunc: 768; SI: ; %bb.0: 769; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 
770; SI-NEXT: s_mov_b32 s7, 0xf000 771; SI-NEXT: s_mov_b32 s6, 0 772; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 773; SI-NEXT: v_mov_b32_e32 v2, 0 774; SI-NEXT: s_waitcnt lgkmcnt(0) 775; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 776; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 777; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 778; SI-NEXT: s_mov_b64 s[2:3], s[6:7] 779; SI-NEXT: s_waitcnt vmcnt(0) 780; SI-NEXT: v_ffbl_b32_e32 v0, v4 781; SI-NEXT: v_min_u32_e32 v0, 0xffffffdf, v0 782; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 783; SI-NEXT: v_ffbl_b32_e32 v3, v3 784; SI-NEXT: v_min3_u32 v0, v3, v0, 64 785; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 786; SI-NEXT: s_endpgm 787; 788; VI-LABEL: v_cttz_i64_trunc: 789; VI: ; %bb.0: 790; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 791; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 792; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 793; VI-NEXT: s_waitcnt lgkmcnt(0) 794; VI-NEXT: v_mov_b32_e32 v2, s3 795; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 796; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 797; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] 798; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 799; VI-NEXT: v_mov_b32_e32 v4, s1 800; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 801; VI-NEXT: s_waitcnt vmcnt(0) 802; VI-NEXT: v_ffbl_b32_e32 v0, v2 803; VI-NEXT: v_add_u32_e64 v0, s[0:1], v0, 32 clamp 804; VI-NEXT: v_ffbl_b32_e32 v1, v1 805; VI-NEXT: v_min3_u32 v0, v1, v0, 64 806; VI-NEXT: flat_store_dword v[3:4], v0 807; VI-NEXT: s_endpgm 808; 809; EG-LABEL: v_cttz_i64_trunc: 810; EG: ; %bb.0: 811; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 812; EG-NEXT: TEX 0 @6 813; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 814; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 815; EG-NEXT: CF_END 816; EG-NEXT: PAD 817; EG-NEXT: Fetch clause starting at 6: 818; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 819; EG-NEXT: ALU clause starting at 8: 820; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 821; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 822; EG-NEXT: ADD_INT * T1.X, 
KC0[2].Z, PV.W, 823; EG-NEXT: ALU clause starting at 11: 824; EG-NEXT: FFBL_INT * T0.W, T1.Y, 825; EG-NEXT: CNDE_INT * T0.W, T1.Y, literal.x, PV.W, 826; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 827; EG-NEXT: LSHL T0.Z, T0.X, literal.x, 828; EG-NEXT: FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212 829; EG-NEXT: ADD_INT * T0.W, PV.W, literal.y, 830; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44) 831; EG-NEXT: CNDE_INT T0.X, T1.X, PS, PV.W, 832; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z, 833; EG-NEXT: LSHR * T1.X, PV.W, literal.x, 834; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 835; 836; GFX10-LABEL: v_cttz_i64_trunc: 837; GFX10: ; %bb.0: 838; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 839; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 840; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 841; GFX10-NEXT: s_waitcnt lgkmcnt(0) 842; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 843; GFX10-NEXT: s_waitcnt vmcnt(0) 844; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 845; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 846; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp 847; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 848; GFX10-NEXT: global_store_dword v0, v1, s[0:1] 849; GFX10-NEXT: s_endpgm 850; 851; GFX10-GISEL-LABEL: v_cttz_i64_trunc: 852; GFX10-GISEL: ; %bb.0: 853; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 854; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 855; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 856; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) 857; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] 858; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) 859; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 860; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 861; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp 862; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 863; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 864; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] 865; GFX10-GISEL-NEXT: s_endpgm 866 %tid = call i32 @llvm.amdgcn.workitem.id.x() 867 %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid 868 %out.gep = 
getelementptr i32, ptr addrspace(1) %out, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep
  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
  %trunc = trunc i64 %cttz to i32
  store i32 %trunc, ptr addrspace(1) %out.gep
  ret void
}

; cttz(i32) combined with a zero test on the *input* value: each workitem loads
; %valptr[tid], calls llvm.cttz.i32 with its second operand false, and the
; select yields -1 when the loaded value is 0 and the cttz result otherwise.
; The result is stored to %out (not indexed by the workitem id).
; In the expectations below, the SI, VI and GFX10 prefixes have only v_ffbl_b32
; between the load and the store (no compare/select), while the GFX10-GISEL
; prefix still has v_min_u32, v_cmp_eq_u32 and v_cndmask_b32.
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
  %val = load i32, ptr addrspace(1) %in.gep
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %val, 0
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; Same computation as v_cttz_i32_sel_eq_neg1 but with the predicate inverted:
; the compare is `icmp ne %val, 0` and the select operands are swapped, so the
; stored value is again cttz(%val) for a non-zero input and -1 for zero.
; The SI, VI, EG and GFX10 expectations are instruction-for-instruction the same
; as for the eq variant; the GFX10-GISEL expectations use v_cmp_ne_u32 with the
; v_cndmask_b32 operands swapped.
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, T0.X, literal.x, PV.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
  %val = load i32, ptr addrspace(1) %in.gep
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %val, 0
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; TODO: Should be able to eliminate select here as well.
; cttz(i32) combined with a test on the *result*: the compare is
; `icmp eq %cttz, 32` (32 being the i32 bit width) and the select yields -1 on
; equality and the cttz result otherwise. Input is %valptr[tid]; the result is
; stored to %out (not indexed by the workitem id).
; Every amdgcn prefix below still expects v_min_u32, a compare against 32 and
; v_cndmask_b32 after v_ffbl_b32, i.e. the select is not folded away here.
define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    SETE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PV.W, T0.W, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
  %val = load i32, ptr addrspace(1) %in.gep
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp eq i32 %cttz, 32
  %sel = select i1 %cmp, i32 -1, i32 %cttz
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; Inverted-predicate twin of v_cttz_i32_sel_eq_bitwidth: the compare is
; `icmp ne %cttz, 32` and the select operands are swapped, so the stored value
; is the cttz result unless it equals 32, in which case it is -1.
; As in the eq variant, all amdgcn prefixes keep the min/compare/cndmask
; sequence after v_ffbl_b32.
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_min_u32_e32 v0, 32, v0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_min_u32_e32 v0, 32, v0
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    FFBL_INT * T0.W, T0.X,
; EG-NEXT:    CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PV.W, literal.x, T0.W,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    -1(nan), 2(2.802597e-45)
;
; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
  %val = load i32, ptr addrspace(1) %in.gep
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
  %cmp = icmp ne i32 %cttz, 32
  %sel = select i1 %cmp, i32 %cttz, i32 -1
  store i32 %sel, ptr addrspace(1) %out
  ret void
}

; i8 variant of the "select -1 on zero input" pattern: each workitem loads one
; byte from %valptr + tid, calls llvm.cttz.i8, and the select yields -1 when
; the loaded byte is 0. The i8 result is stored to %out.
; The SI, VI and GFX10 expectations go straight from the ubyte load through
; v_ffbl_b32 to the byte store; the GFX10-GISEL expectations OR in 0x100 before
; v_ffbl_b32 and keep a compare plus v_cndmask_b32.
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i8_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i8_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i8_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    FFBL_INT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, s2
; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
  %val = load i8, ptr addrspace(1) %valptr.gep
  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
  %cmp = icmp eq i8 %val, 0
  %sel = select i1 %cmp, i8 -1, i8 %cttz
  store i8 %sel, ptr addrspace(1) %out
  ret void
}

; i16 variant of the "select -1 on zero input" pattern. Unlike the i8 and i32
; tests, the value is loaded directly from %valptr with no workitem-id based
; getelementptr; the select yields -1 when the loaded i16 is 0 and the
; llvm.cttz.i16 result otherwise, and the result is stored to %out.
; Only the SI expectations reduce to a bare v_ffbl_b32; the VI, GFX10 and
; GFX10-GISEL expectations OR in 0x10000 before v_ffbl_b32 and keep a compare
; plus v_cndmask_b32.
define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i16_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i16_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    v_ffbl_b32_e32 v2, v2
; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i16_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    FFBL_INT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
; GFX10-NEXT:    v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %val = load i16, ptr addrspace(1) %valptr
  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  %cmp = icmp eq i16 %val, 0
  %sel = select i1 %cmp, i16 -1, i16 %cttz
  store i16 %sel, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Non-power-of-two width (i7) variant of the "select -1 on zero input" pattern:
; each workitem loads an i7 through a workitem-id based getelementptr on
; %valptr, calls llvm.cttz.i7, and the select yields -1 when the loaded value
; is 0 and the cttz result otherwise. The i7 result is stored to %out.
; The SI, VI and GFX10 expectations are v_ffbl_b32 followed by an AND with 0x7f;
; the GFX10-GISEL expectations OR in 0x80 before v_ffbl_b32 and keep a compare
; plus v_cndmask_b32.
; NOTE(review): the FIXME preceding this function mentions a "load without
; gep", but the load here goes through %valptr.gep; only the store to %out is
; un-indexed. Confirm which function that FIXME is meant to describe.
define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
; SI-LABEL: v_cttz_i7_sel_eq_neg1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ffbl_b32_e32 v0, v0
; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_cttz_i7_sel_eq_neg1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_ubyte v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ffbl_b32_e32 v0, v0
; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_cttz_i7_sel_eq_neg1:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    FFBL_INT T0.W, T0.X,
; EG-NEXT:    AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    LSHL * T1.W, PS, literal.y,
; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
; EG-NEXT:    LSHL T0.X, PV.W, PS,
; EG-NEXT:    LSHL * T0.W, literal.x, PS,
; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT:    MOV T0.Y, 0.0,
; EG-NEXT:    MOV * T0.Z, 0.0,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
; GFX10-GISEL:       ; %bb.0:
; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x80, v0
; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
  %val = load i7, ptr addrspace(1) %valptr.gep
  %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
  %cmp = icmp eq i7 %val, 0
  %sel = select i1 %cmp, i7 -1, i7 %cttz
  store i7 %sel, ptr addrspace(1) %out
  ret void
}