; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

declare i64 @llvm.ctpop.i64(i64) nounwind readnone
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone

declare i65 @llvm.ctpop.i65(i65) nounwind readnone
declare i128 @llvm.ctpop.i128(i128) nounwind readnone

define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
; SI-LABEL: s_ctpop_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctpop_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
  %truncctpop = trunc i64 %ctpop to i32
  store i32 %truncctpop, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep, align 8
  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
  %truncctpop = trunc i64 %ctpop to i32
  store i32 %truncctpop, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind {
; SI-LABEL: v_ctpop_i64_user:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
; SI-NEXT:    v_mov_b32_e32 v1, s13
; SI-NEXT:    v_or_b32_e32 v0, s12, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_i64_user:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_or_b32_e32 v0, s4, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %val = load i64, ptr addrspace(1) %in.gep, align 8
  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
  %or = or i64 %ctpop, %s.val
  store i64 %or, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind {
; SI-LABEL: s_ctpop_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; SI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctpop_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; VI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
  %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
  store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind {
; SI-LABEL: s_ctpop_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bcnt1_i32_b64 s4, s[8:9]
; SI-NEXT:    s_bcnt1_i32_b64 s5, s[10:11]
; SI-NEXT:    s_bcnt1_i32_b64 s6, s[12:13]
; SI-NEXT:    s_bcnt1_i32_b64 s7, s[14:15]
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    v_mov_b32_e32 v2, s6
; SI-NEXT:    v_mov_b32_e32 v3, s7
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctpop_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bcnt1_i32_b64 s4, s[8:9]
; VI-NEXT:    s_bcnt1_i32_b64 s5, s[10:11]
; VI-NEXT:    s_bcnt1_i32_b64 s6, s[12:13]
; VI-NEXT:    s_bcnt1_i32_b64 s7, s[14:15]
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_mov_b32_e32 v2, s6
; VI-NEXT:    v_mov_b32_e32 v3, s7
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
  %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
  store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
; SI-NEXT:    v_bcnt_u32_b32_e32 v1, v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
; VI-NEXT:    v_bcnt_u32_b32 v1, v3, v2
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
  %val = load <2 x i64>, ptr addrspace(1) %in.gep, align 16
  %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
  %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
  store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v5, 0
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64
; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64 offset:16
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
; SI-NEXT:    v_bcnt_u32_b32_e32 v1, v3, v2
; SI-NEXT:    v_bcnt_u32_b32_e32 v2, v5, v4
; SI-NEXT:    v_bcnt_u32_b32_e32 v3, v7, v6
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_bcnt_u32_b32 v8, v2, 0
; VI-NEXT:    v_bcnt_u32_b32 v2, v1, v0
; VI-NEXT:    v_bcnt_u32_b32 v3, v3, v8
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
; VI-NEXT:    v_bcnt_u32_b32 v4, v5, v4
; VI-NEXT:    v_bcnt_u32_b32 v5, v7, v6
; VI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
  %val = load <4 x i64>, ptr addrspace(1) %in.gep, align 32
  %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
  %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
  store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
  ret void
}

define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
; SI-LABEL: ctpop_i64_in_br:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s8, 0
; SI-NEXT:    s_cbranch_scc0 .LBB7_4
; SI-NEXT:  ; %bb.1: ; %else
; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x2
; SI-NEXT:    s_mov_b64 s[2:3], 0
; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 vcc, vcc
; SI-NEXT:    s_cbranch_vccnz .LBB7_3
; SI-NEXT:  .LBB7_2: ; %if
; SI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
; SI-NEXT:    s_mov_b32 s5, 0
; SI-NEXT:  .LBB7_3: ; %endif
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB7_4:
; SI-NEXT:    ; implicit-def: $sgpr4_sgpr5
; SI-NEXT:    s_branch .LBB7_2
;
; VI-LABEL: ctpop_i64_in_br:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s8, s[4:5], 0x3c
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s8, 0
; VI-NEXT:    s_cbranch_scc0 .LBB7_4
; VI-NEXT:  ; %bb.1: ; %else
; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
; VI-NEXT:    s_cbranch_execnz .LBB7_3
; VI-NEXT:  .LBB7_2: ; %if
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
; VI-NEXT:    s_mov_b32 s5, 0
; VI-NEXT:  .LBB7_3: ; %endif
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
; VI-NEXT:  .LBB7_4:
; VI-NEXT:    ; implicit-def: $sgpr4_sgpr5
; VI-NEXT:    s_branch .LBB7_2
entry:
  %tmp0 = icmp eq i32 %cond, 0
  br i1 %tmp0, label %if, label %else

if:
  %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg)
  br label %endif

else:
  %tmp3 = getelementptr i64, ptr addrspace(1) %in, i32 1
  %tmp4 = load i64, ptr addrspace(1) %tmp3
  br label %endif

endif:
  %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
  store i64 %tmp5, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
; SI-LABEL: s_ctpop_i128:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; SI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; SI-NEXT:    s_add_i32 s0, s0, s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctpop_i128:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; VI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
; VI-NEXT:    s_add_i32 s0, s0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
  %truncctpop = trunc i128 %ctpop to i32
  store i32 %truncctpop, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind {
; SI-LABEL: s_ctpop_i65:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_load_dword s8, s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_and_b32 s0, s8, 0xff
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_bcnt1_i32_b32 s0, s0
; SI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
; SI-NEXT:    s_add_i32 s0, s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ctpop_i65:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_and_b32 s0, s8, 0xff
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_bcnt1_i32_b32 s0, s0
; VI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
; VI-NEXT:    s_add_i32 s0, s1, s0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
  %truncctpop = trunc i65 %ctpop to i32
  store i32 %truncctpop, ptr addrspace(1) %out, align 4
  ret void
}

; FIXME: Should not have extra add
define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
; SI-LABEL: v_ctpop_i128:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
; SI-NEXT:    v_bcnt_u32_b32_e32 v2, v3, v2
; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ctpop_i128:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT:    v_bcnt_u32_b32 v2, v3, v2
; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid
  %val = load i128, ptr addrspace(1) %in.gep, align 8
  %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
  %truncctpop = trunc i128 %ctpop to i32
  store i32 %truncctpop, ptr addrspace(1) %out, align 4
  ret void
}