1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s 3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s 4; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s 5; Checks the code generated for llvm.ctpop on i16 and on i16 vectors (2, 4, 8 and 16 elements) under the three llc invocations above (tahiti, tonga, cypress). 6declare i16 @llvm.ctpop.i16(i16) nounwind readnone 7declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone 8declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone 9declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone 10declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone 11 12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone 13; Scalar kernel-argument case: llvm.ctpop.i16 of the i16 argument %val, with the count stored to %out. 14define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind { 15; SI-LABEL: s_ctpop_i16: 16; SI: ; %bb.0: 17; SI-NEXT: s_load_dword s6, s[4:5], 0xb 18; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 19; SI-NEXT: s_mov_b32 s3, 0xf000 20; SI-NEXT: s_mov_b32 s2, -1 21; SI-NEXT: s_waitcnt lgkmcnt(0) 22; SI-NEXT: s_and_b32 s4, s6, 0xffff 23; SI-NEXT: s_bcnt1_i32_b32 s4, s4 24; SI-NEXT: v_mov_b32_e32 v0, s4 25; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 26; SI-NEXT: s_endpgm 27; 28; VI-LABEL: s_ctpop_i16: 29; VI: ; %bb.0: 30; VI-NEXT: s_load_dword s6, s[4:5], 0x2c 31; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 32; VI-NEXT: s_mov_b32 s3, 0xf000 33; VI-NEXT: s_mov_b32 s2, -1 34; VI-NEXT: s_waitcnt lgkmcnt(0) 35; VI-NEXT: s_and_b32 s4, s6, 0xffff 36; VI-NEXT: s_bcnt1_i32_b32 s4, s4 37; VI-NEXT: v_mov_b32_e32 v0, s4 38; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 39; VI-NEXT: s_endpgm 40; 41; EG-LABEL: s_ctpop_i16: 42; EG: ; %bb.0: 43; EG-NEXT: ALU 0, @8, KC0[], KC1[] 44; EG-NEXT: TEX 0 @6 45; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 46; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 47; EG-NEXT: CF_END 48; EG-NEXT: 
PAD 49; EG-NEXT: Fetch clause starting at 6: 50; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 51; EG-NEXT: ALU clause starting at 8: 52; EG-NEXT: MOV * T0.X, 0.0, 53; EG-NEXT: ALU clause starting at 9: 54; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 55; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 56; EG-NEXT: BCNT_INT T1.W, T0.X, 57; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 58; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 59; EG-NEXT: LSHL T0.X, PV.W, PS, 60; EG-NEXT: LSHL * T0.W, literal.x, PS, 61; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 62; EG-NEXT: MOV T0.Y, 0.0, 63; EG-NEXT: MOV * T0.Z, 0.0, 64; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 65; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 66 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 67 store i16 %ctpop, ptr addrspace(1) %out, align 4 68 ret void 69} 70 71; Loaded-value case: an i16 is loaded from %in at index workitem.id.x, llvm.ctpop.i16 is applied and the count is stored to %out. XXX - Why 0 in register? NOTE(review): it is not clear from the assertions below which 0 this question refers to - confirm or remove. 72define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 73; SI-LABEL: v_ctpop_i16: 74; SI: ; %bb.0: 75; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 76; SI-NEXT: s_mov_b32 s7, 0xf000 77; SI-NEXT: s_mov_b32 s10, 0 78; SI-NEXT: s_mov_b32 s11, s7 79; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 80; SI-NEXT: s_waitcnt lgkmcnt(0) 81; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 82; SI-NEXT: v_mov_b32_e32 v1, 0 83; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 84; SI-NEXT: s_mov_b32 s6, -1 85; SI-NEXT: s_mov_b32 s4, s0 86; SI-NEXT: s_mov_b32 s5, s1 87; SI-NEXT: s_waitcnt vmcnt(0) 88; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 89; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 90; SI-NEXT: s_endpgm 91; 92; VI-LABEL: v_ctpop_i16: 93; VI: ; %bb.0: 94; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 95; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 96; VI-NEXT: s_waitcnt lgkmcnt(0) 97; VI-NEXT: v_mov_b32_e32 v1, s3 98; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 99; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 100; VI-NEXT: flat_load_ushort v0, v[0:1] 101; VI-NEXT: s_mov_b32 s3, 0xf000 102; VI-NEXT: s_mov_b32 
s2, -1 103; VI-NEXT: s_waitcnt vmcnt(0) 104; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 105; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 106; VI-NEXT: s_endpgm 107; 108; EG-LABEL: v_ctpop_i16: 109; EG: ; %bb.0: 110; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 111; EG-NEXT: TEX 0 @6 112; EG-NEXT: ALU 11, @10, KC0[CB0:0-32], KC1[] 113; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 114; EG-NEXT: CF_END 115; EG-NEXT: PAD 116; EG-NEXT: Fetch clause starting at 6: 117; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 118; EG-NEXT: ALU clause starting at 8: 119; EG-NEXT: LSHL * T0.W, T0.X, 1, 120; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 121; EG-NEXT: ALU clause starting at 10: 122; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x, 123; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 124; EG-NEXT: BCNT_INT T1.W, T0.X, 125; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 126; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 127; EG-NEXT: LSHL T0.X, PV.W, PS, 128; EG-NEXT: LSHL * T0.W, literal.x, PS, 129; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 130; EG-NEXT: MOV T0.Y, 0.0, 131; EG-NEXT: MOV * T0.Z, 0.0, 132; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 133; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 134 %tid = call i32 @llvm.amdgcn.workitem.id.x() 135 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 136 %val = load i16, ptr addrspace(1) %in.gep, align 4 137 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 138 store i16 %ctpop, ptr addrspace(1) %out, align 4 139 ret void 140} 141; Add-chain case: two volatile i16 loads (%in0 and %in1, both indexed by workitem.id.x), llvm.ctpop.i16 of each, and the sum of the two counts stored to %out. 142define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind { 143; SI-LABEL: v_ctpop_add_chain_i16: 144; SI: ; %bb.0: 145; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 146; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 147; SI-NEXT: s_mov_b32 s11, 0xf000 148; SI-NEXT: s_mov_b32 s14, 0 149; SI-NEXT: s_mov_b32 s15, s11 150; SI-NEXT: s_waitcnt lgkmcnt(0) 151; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 152; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 153; 
SI-NEXT: v_mov_b32_e32 v1, 0 154; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 155; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 glc 156; SI-NEXT: s_waitcnt vmcnt(0) 157; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc 158; SI-NEXT: s_waitcnt vmcnt(0) 159; SI-NEXT: s_mov_b32 s10, -1 160; SI-NEXT: s_mov_b32 s8, s0 161; SI-NEXT: s_mov_b32 s9, s1 162; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 163; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 164; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 165; SI-NEXT: s_endpgm 166; 167; VI-LABEL: v_ctpop_add_chain_i16: 168; VI: ; %bb.0: 169; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 170; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 171; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 172; VI-NEXT: s_waitcnt lgkmcnt(0) 173; VI-NEXT: v_mov_b32_e32 v1, s3 174; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 175; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 176; VI-NEXT: v_mov_b32_e32 v3, s5 177; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 178; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 179; VI-NEXT: flat_load_ushort v0, v[0:1] glc 180; VI-NEXT: s_waitcnt vmcnt(0) 181; VI-NEXT: flat_load_ushort v1, v[2:3] glc 182; VI-NEXT: s_waitcnt vmcnt(0) 183; VI-NEXT: s_mov_b32 s3, 0xf000 184; VI-NEXT: s_mov_b32 s2, -1 185; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 186; VI-NEXT: v_bcnt_u32_b32 v0, v0, v1 187; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 188; VI-NEXT: s_endpgm 189; 190; EG-LABEL: v_ctpop_add_chain_i16: 191; EG: ; %bb.0: 192; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 193; EG-NEXT: TEX 0 @8 194; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 195; EG-NEXT: TEX 0 @10 196; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[] 197; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 198; EG-NEXT: CF_END 199; EG-NEXT: PAD 200; EG-NEXT: Fetch clause starting at 8: 201; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 202; EG-NEXT: Fetch clause starting at 10: 203; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 204; EG-NEXT: ALU clause starting at 12: 205; EG-NEXT: LSHL * T0.W, T0.X, 1, 206; EG-NEXT: ADD_INT 
* T0.X, KC0[2].Z, PV.W, 207; EG-NEXT: ALU clause starting at 14: 208; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W, 209; EG-NEXT: ALU clause starting at 15: 210; EG-NEXT: AND_INT T0.W, T0.X, literal.x, 211; EG-NEXT: AND_INT * T1.W, T1.X, literal.x, 212; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 213; EG-NEXT: BCNT_INT T0.Z, PS, 214; EG-NEXT: BCNT_INT T0.W, PV.W, 215; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 216; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 217; EG-NEXT: ADD_INT T0.W, PV.W, PV.Z, 218; EG-NEXT: LSHL * T1.W, PS, literal.x, 219; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 220; EG-NEXT: LSHL T0.X, PV.W, PS, 221; EG-NEXT: LSHL * T0.W, literal.x, PS, 222; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 223; EG-NEXT: MOV T0.Y, 0.0, 224; EG-NEXT: MOV * T0.Z, 0.0, 225; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 226; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 227 %tid = call i32 @llvm.amdgcn.workitem.id.x() 228 %in0.gep = getelementptr i16, ptr addrspace(1) %in0, i32 %tid 229 %in1.gep = getelementptr i16, ptr addrspace(1) %in1, i32 %tid 230 %val0 = load volatile i16, ptr addrspace(1) %in0.gep, align 4 231 %val1 = load volatile i16, ptr addrspace(1) %in1.gep, align 4 232 %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone 233 %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone 234 %add = add i16 %ctpop0, %ctpop1 235 store i16 %add, ptr addrspace(1) %out, align 4 236 ret void 237} 238; Add-with-argument case: llvm.ctpop.i16 of an i16 loaded from %in at index workitem.id.x, added to the i16 kernel argument %sval and stored to %out. 239define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind { 240; SI-LABEL: v_ctpop_add_sgpr_i16: 241; SI: ; %bb.0: 242; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 243; SI-NEXT: s_load_dword s12, s[4:5], 0xd 244; SI-NEXT: s_mov_b32 s7, 0xf000 245; SI-NEXT: s_mov_b32 s10, 0 246; SI-NEXT: s_mov_b32 s11, s7 247; SI-NEXT: s_waitcnt lgkmcnt(0) 248; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 249; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 250; SI-NEXT: v_mov_b32_e32 v1, 0 251; SI-NEXT: buffer_load_ushort v0, 
v[0:1], s[8:11], 0 addr64 252; SI-NEXT: s_mov_b32 s6, -1 253; SI-NEXT: s_mov_b32 s4, s0 254; SI-NEXT: s_mov_b32 s5, s1 255; SI-NEXT: s_waitcnt vmcnt(0) 256; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 257; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 258; SI-NEXT: s_endpgm 259; 260; VI-LABEL: v_ctpop_add_sgpr_i16: 261; VI: ; %bb.0: 262; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 263; VI-NEXT: s_load_dword s4, s[4:5], 0x34 264; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 265; VI-NEXT: s_waitcnt lgkmcnt(0) 266; VI-NEXT: v_mov_b32_e32 v1, s3 267; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 268; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 269; VI-NEXT: flat_load_ushort v0, v[0:1] 270; VI-NEXT: s_mov_b32 s3, 0xf000 271; VI-NEXT: s_mov_b32 s2, -1 272; VI-NEXT: s_waitcnt vmcnt(0) 273; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 274; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 275; VI-NEXT: s_endpgm 276; 277; EG-LABEL: v_ctpop_add_sgpr_i16: 278; EG: ; %bb.0: 279; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 280; EG-NEXT: TEX 0 @8 281; EG-NEXT: ALU 0, @14, KC0[], KC1[] 282; EG-NEXT: TEX 0 @10 283; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 284; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 285; EG-NEXT: CF_END 286; EG-NEXT: PAD 287; EG-NEXT: Fetch clause starting at 8: 288; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 289; EG-NEXT: Fetch clause starting at 10: 290; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 291; EG-NEXT: ALU clause starting at 12: 292; EG-NEXT: LSHL * T0.W, T0.X, 1, 293; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 294; EG-NEXT: ALU clause starting at 14: 295; EG-NEXT: MOV * T1.X, 0.0, 296; EG-NEXT: ALU clause starting at 15: 297; EG-NEXT: BCNT_INT T0.W, T0.X, 298; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 299; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 300; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X, 301; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 302; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 303; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 304; EG-NEXT: LSHL T0.X, PV.W, PS, 305; EG-NEXT: LSHL * T0.W, 
literal.x, PS, 306; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 307; EG-NEXT: MOV T0.Y, 0.0, 308; EG-NEXT: MOV * T0.Z, 0.0, 309; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 310; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 311 %tid = call i32 @llvm.amdgcn.workitem.id.x() 312 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 313 %val = load i16, ptr addrspace(1) %in.gep, align 4 314 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 315 %add = add i16 %ctpop, %sval 316 store i16 %add, ptr addrspace(1) %out, align 4 317 ret void 318} 319; 2-element vector case: a <2 x i16> is loaded from %in at index workitem.id.x, llvm.ctpop.v2i16 is applied and the result vector is stored to %out. 320define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 321; SI-LABEL: v_ctpop_v2i16: 322; SI: ; %bb.0: 323; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 324; SI-NEXT: s_mov_b32 s7, 0xf000 325; SI-NEXT: s_mov_b32 s10, 0 326; SI-NEXT: s_mov_b32 s11, s7 327; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 328; SI-NEXT: s_waitcnt lgkmcnt(0) 329; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 330; SI-NEXT: v_mov_b32_e32 v1, 0 331; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 332; SI-NEXT: s_mov_b32 s6, -1 333; SI-NEXT: s_mov_b32 s4, s0 334; SI-NEXT: s_mov_b32 s5, s1 335; SI-NEXT: s_waitcnt vmcnt(0) 336; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 337; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 338; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 339; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 340; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 341; SI-NEXT: v_or_b32_e32 v0, v1, v0 342; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 343; SI-NEXT: s_endpgm 344; 345; VI-LABEL: v_ctpop_v2i16: 346; VI: ; %bb.0: 347; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 348; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 349; VI-NEXT: s_waitcnt lgkmcnt(0) 350; VI-NEXT: v_mov_b32_e32 v1, s3 351; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 352; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 353; VI-NEXT: flat_load_dword v0, v[0:1] 354; VI-NEXT: s_mov_b32 s3, 0xf000 355; VI-NEXT: s_mov_b32 s2, -1 356; VI-NEXT: s_waitcnt vmcnt(0) 357; VI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v0 358; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 359; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 360; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 361; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 362; VI-NEXT: v_or_b32_e32 v0, v0, v1 363; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 364; VI-NEXT: s_endpgm 365; 366; EG-LABEL: v_ctpop_v2i16: 367; EG: ; %bb.0: 368; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 369; EG-NEXT: TEX 0 @6 370; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] 371; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1 372; EG-NEXT: CF_END 373; EG-NEXT: PAD 374; EG-NEXT: Fetch clause starting at 6: 375; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 376; EG-NEXT: ALU clause starting at 8: 377; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 378; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 379; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 380; EG-NEXT: ALU clause starting at 11: 381; EG-NEXT: LSHR * T0.W, T0.X, literal.x, 382; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 383; EG-NEXT: BCNT_INT T0.W, PV.W, 384; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 385; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 386; EG-NEXT: BCNT_INT T1.W, PS, 387; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 388; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 389; EG-NEXT: OR_INT T0.X, PV.W, PS, 390; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, 391; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 392 %tid = call i32 @llvm.amdgcn.workitem.id.x() 393 %in.gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid 394 %val = load <2 x i16>, ptr addrspace(1) %in.gep, align 8 395 %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone 396 store <2 x i16> %ctpop, ptr addrspace(1) %out, align 8 397 ret void 398} 399; 4-element vector case: a <4 x i16> is loaded from %in at index workitem.id.x, llvm.ctpop.v4i16 is applied and the result vector is stored to %out. 400define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 401; SI-LABEL: v_ctpop_v4i16: 402; SI: ; %bb.0: 403; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 404; SI-NEXT: s_mov_b32 s7, 0xf000 405; SI-NEXT: s_mov_b32 s10, 0 406; SI-NEXT: s_mov_b32 s11, s7 407; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 408; SI-NEXT: s_waitcnt lgkmcnt(0) 409; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 410; SI-NEXT: v_mov_b32_e32 v1, 0 411; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 412; SI-NEXT: s_mov_b32 s6, -1 413; SI-NEXT: s_mov_b32 s4, s0 414; SI-NEXT: s_mov_b32 s5, s1 415; SI-NEXT: s_waitcnt vmcnt(0) 416; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 417; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 418; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 419; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 420; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 421; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 422; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 423; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 424; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 425; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 426; SI-NEXT: v_or_b32_e32 v1, v3, v1 427; SI-NEXT: v_or_b32_e32 v0, v2, v0 428; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 429; SI-NEXT: s_endpgm 430; 431; VI-LABEL: v_ctpop_v4i16: 432; VI: ; %bb.0: 433; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 434; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 435; VI-NEXT: s_waitcnt lgkmcnt(0) 436; VI-NEXT: v_mov_b32_e32 v1, s3 437; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 438; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 439; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 440; VI-NEXT: s_mov_b32 s3, 0xf000 441; VI-NEXT: s_mov_b32 s2, -1 442; VI-NEXT: s_waitcnt vmcnt(0) 443; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 444; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 445; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 446; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 447; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 448; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 449; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 450; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 451; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 452; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 453; VI-NEXT: v_or_b32_e32 v1, v1, v2 454; VI-NEXT: v_or_b32_e32 v0, v0, v3 455; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 456; VI-NEXT: s_endpgm 457; 458; EG-LABEL: 
s_mov_b32 s11, s7 407; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 408; SI-NEXT: s_waitcnt lgkmcnt(0) 409; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 410; SI-NEXT: v_mov_b32_e32 v1, 0 411; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 412; SI-NEXT: s_mov_b32 s6, -1 413; SI-NEXT: s_mov_b32 s4, s0 414; SI-NEXT: s_mov_b32 s5, s1 415; SI-NEXT: s_waitcnt vmcnt(0) 416; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0 417; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 418; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 419; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 420; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 421; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 422; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 423; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 424; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 425; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 426; SI-NEXT: v_or_b32_e32 v1, v3, v1 427; SI-NEXT: v_or_b32_e32 v0, v2, v0 428; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 429; SI-NEXT: s_endpgm 430; 431; VI-LABEL: v_ctpop_v4i16: 432; VI: ; %bb.0: 433; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 434; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 435; VI-NEXT: s_waitcnt lgkmcnt(0) 436; VI-NEXT: v_mov_b32_e32 v1, s3 437; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 438; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 439; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 440; VI-NEXT: s_mov_b32 s3, 0xf000 441; VI-NEXT: s_mov_b32 s2, -1 442; VI-NEXT: s_waitcnt vmcnt(0) 443; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 444; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 445; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 446; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 447; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 448; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 449; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 450; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 451; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 452; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 453; VI-NEXT: v_or_b32_e32 v1, v1, v2 454; VI-NEXT: v_or_b32_e32 v0, v0, v3 455; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 456; VI-NEXT: s_endpgm 457; 458; EG-LABEL: 
v_ctpop_v4i16: 459; EG: ; %bb.0: 460; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] 461; EG-NEXT: TEX 0 @6 462; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[] 463; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1 464; EG-NEXT: CF_END 465; EG-NEXT: PAD 466; EG-NEXT: Fetch clause starting at 6: 467; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1 468; EG-NEXT: ALU clause starting at 8: 469; EG-NEXT: MOV T0.Y, T4.X, 470; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 471; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 472; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 473; EG-NEXT: ALU clause starting at 12: 474; EG-NEXT: AND_INT * T0.W, T8.X, literal.x, 475; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 476; EG-NEXT: BCNT_INT T0.W, PV.W, 477; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, 478; EG-NEXT: -65536(nan), 0(0.000000e+00) 479; EG-NEXT: OR_INT * T0.W, PS, PV.W, 480; EG-NEXT: MOV * T4.X, PV.W, 481; EG-NEXT: MOV T0.X, PV.X, 482; EG-NEXT: LSHR * T0.W, T8.X, literal.x, 483; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 484; EG-NEXT: BCNT_INT T0.W, PV.W, 485; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 486; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 487; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 488; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 489; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 490; EG-NEXT: MOV T4.X, PV.W, 491; EG-NEXT: MOV * T0.X, T5.X, 492; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x, 493; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 494; EG-NEXT: BCNT_INT T0.W, PV.W, 495; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 496; EG-NEXT: -65536(nan), 0(0.000000e+00) 497; EG-NEXT: OR_INT * T0.W, PS, PV.W, 498; EG-NEXT: MOV * T5.X, PV.W, 499; EG-NEXT: MOV T0.X, PV.X, 500; EG-NEXT: LSHR * T0.W, T8.Y, literal.x, 501; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 502; EG-NEXT: BCNT_INT T0.W, PV.W, 503; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 504; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 505; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 506; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 507; EG-NEXT: 
LSHR T0.X, KC0[2].Y, literal.x, 508; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W, 509; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 510; EG-NEXT: MOV T5.X, PV.Y, 511; EG-NEXT: MOV * T8.X, T4.X, 512 %tid = call i32 @llvm.amdgcn.workitem.id.x() 513 %in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid 514 %val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16 515 %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone 516 store <4 x i16> %ctpop, ptr addrspace(1) %out, align 16 517 ret void 518} 519; 8-element vector case: an <8 x i16> is loaded from %in at index workitem.id.x, llvm.ctpop.v8i16 is applied and the result vector is stored to %out. 520define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 521; SI-LABEL: v_ctpop_v8i16: 522; SI: ; %bb.0: 523; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 524; SI-NEXT: s_mov_b32 s3, 0xf000 525; SI-NEXT: s_mov_b32 s10, 0 526; SI-NEXT: s_mov_b32 s11, s3 527; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 528; SI-NEXT: s_waitcnt lgkmcnt(0) 529; SI-NEXT: s_mov_b64 s[8:9], s[6:7] 530; SI-NEXT: v_mov_b32_e32 v1, 0 531; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 532; SI-NEXT: s_mov_b32 s2, -1 533; SI-NEXT: s_mov_b32 s0, s4 534; SI-NEXT: s_mov_b32 s1, s5 535; SI-NEXT: s_waitcnt vmcnt(0) 536; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0 537; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 538; SI-NEXT: v_and_b32_e32 v5, 0xffff, v1 539; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 540; SI-NEXT: v_and_b32_e32 v6, 0xffff, v2 541; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 542; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 543; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 544; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 545; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 546; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 547; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 548; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 549; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 550; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0 551; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0 552; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 553; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 554; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
555; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 556; SI-NEXT: v_or_b32_e32 v3, v7, v3 557; SI-NEXT: v_or_b32_e32 v2, v6, v2 558; SI-NEXT: v_or_b32_e32 v1, v5, v1 559; SI-NEXT: v_or_b32_e32 v0, v4, v0 560; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 561; SI-NEXT: s_endpgm 562; 563; VI-LABEL: v_ctpop_v8i16: 564; VI: ; %bb.0: 565; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 566; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 567; VI-NEXT: s_waitcnt lgkmcnt(0) 568; VI-NEXT: v_mov_b32_e32 v1, s3 569; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 570; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 571; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 572; VI-NEXT: s_mov_b32 s3, 0xf000 573; VI-NEXT: s_mov_b32 s2, -1 574; VI-NEXT: s_waitcnt vmcnt(0) 575; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 576; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 577; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 578; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 579; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 580; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 581; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 582; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 583; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 584; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 585; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 586; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 587; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 588; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 589; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 590; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 591; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 592; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 593; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 594; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 595; VI-NEXT: v_or_b32_e32 v3, v3, v4 596; VI-NEXT: v_or_b32_e32 v2, v2, v5 597; VI-NEXT: v_or_b32_e32 v1, v1, v6 598; VI-NEXT: v_or_b32_e32 v0, v0, v7 599; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 600; VI-NEXT: s_endpgm 601; 602; EG-LABEL: v_ctpop_v8i16: 603; EG: ; %bb.0: 604; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] 605; EG-NEXT: TEX 0 @6 606; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[] 607; EG-NEXT: MEM_RAT_CACHELESS 
STORE_RAW T0.XYZW, T12.X, 1 608; EG-NEXT: CF_END 609; EG-NEXT: PAD 610; EG-NEXT: Fetch clause starting at 6: 611; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1 612; EG-NEXT: ALU clause starting at 8: 613; EG-NEXT: MOV T0.Y, T4.X, 614; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 615; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) 616; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 617; EG-NEXT: ALU clause starting at 12: 618; EG-NEXT: LSHR * T0.W, T12.X, literal.x, 619; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 620; EG-NEXT: BCNT_INT * T0.W, PV.W, 621; EG-NEXT: LSHL T0.W, PV.W, literal.x, 622; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 623; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 624; EG-NEXT: OR_INT * T0.W, PS, PV.W, 625; EG-NEXT: MOV * T4.X, PV.W, 626; EG-NEXT: MOV T0.X, PV.X, 627; EG-NEXT: AND_INT * T0.W, T12.X, literal.x, 628; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 629; EG-NEXT: BCNT_INT T0.W, PV.W, 630; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 631; EG-NEXT: -65536(nan), 0(0.000000e+00) 632; EG-NEXT: OR_INT * T0.W, PS, PV.W, 633; EG-NEXT: MOV T4.X, PV.W, 634; EG-NEXT: MOV * T0.X, T5.X, 635; EG-NEXT: LSHR * T0.W, T12.Y, literal.x, 636; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 637; EG-NEXT: BCNT_INT T0.W, PV.W, 638; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 639; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 640; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 641; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 642; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 643; EG-NEXT: MOV * T5.X, PV.W, 644; EG-NEXT: MOV T0.X, PV.X, 645; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x, 646; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 647; EG-NEXT: BCNT_INT T0.W, PV.W, 648; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 649; EG-NEXT: -65536(nan), 0(0.000000e+00) 650; EG-NEXT: OR_INT * T0.Y, PS, PV.W, 651; EG-NEXT: MOV T5.X, PV.Y, 652; EG-NEXT: MOV * T0.X, T8.X, 653; EG-NEXT: LSHR * T0.W, T12.Z, literal.x, 654; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 655; EG-NEXT: BCNT_INT T0.W, PV.W, 
656; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 657; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 658; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 659; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 660; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 661; EG-NEXT: MOV * T8.X, PV.W, 662; EG-NEXT: MOV T0.X, PV.X, 663; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x, 664; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 665; EG-NEXT: BCNT_INT T0.W, PV.W, 666; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 667; EG-NEXT: -65536(nan), 0(0.000000e+00) 668; EG-NEXT: OR_INT * T0.W, PS, PV.W, 669; EG-NEXT: MOV T8.X, PV.W, 670; EG-NEXT: MOV * T0.X, T9.X, 671; EG-NEXT: LSHR * T0.W, T12.W, literal.x, 672; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 673; EG-NEXT: BCNT_INT T0.W, PV.W, 674; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 675; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 676; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 677; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 678; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 679; EG-NEXT: MOV * T9.X, PV.W, 680; EG-NEXT: MOV T0.X, PV.X, 681; EG-NEXT: AND_INT * T0.W, T12.W, literal.x, 682; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 683; EG-NEXT: BCNT_INT T0.W, PV.W, 684; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 685; EG-NEXT: -65536(nan), 0(0.000000e+00) 686; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x, 687; EG-NEXT: OR_INT * T0.W, PS, PV.W, 688; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 689; EG-NEXT: MOV T9.X, PV.W, 690; EG-NEXT: MOV * T0.X, T4.X, 691; EG-NEXT: MOV * T0.Z, T8.X, 692 %tid = call i32 @llvm.amdgcn.workitem.id.x() 693 %in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid 694 %val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32 695 %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone 696 store <8 x i16> %ctpop, ptr addrspace(1) %out, align 32 697 ret void 698} 699 700define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 701; SI-LABEL: v_ctpop_v16i16: 702; SI: ; %bb.0: 
703; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 704; SI-NEXT: s_mov_b32 s3, 0xf000 705; SI-NEXT: s_mov_b32 s10, 0 706; SI-NEXT: s_mov_b32 s11, s3 707; SI-NEXT: v_lshlrev_b32_e32 v4, 5, v0 708; SI-NEXT: s_waitcnt lgkmcnt(0) 709; SI-NEXT: s_mov_b64 s[8:9], s[6:7] 710; SI-NEXT: v_mov_b32_e32 v5, 0 711; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16 712; SI-NEXT: buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64 713; SI-NEXT: s_mov_b32 s2, -1 714; SI-NEXT: s_mov_b32 s0, s4 715; SI-NEXT: s_mov_b32 s1, s5 716; SI-NEXT: s_waitcnt vmcnt(1) 717; SI-NEXT: v_and_b32_e32 v8, 0xffff, v0 718; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 719; SI-NEXT: v_and_b32_e32 v9, 0xffff, v1 720; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 721; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2 722; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 723; SI-NEXT: v_and_b32_e32 v11, 0xffff, v3 724; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 725; SI-NEXT: s_waitcnt vmcnt(0) 726; SI-NEXT: v_and_b32_e32 v12, 0xffff, v4 727; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 728; SI-NEXT: v_and_b32_e32 v13, 0xffff, v5 729; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 730; SI-NEXT: v_and_b32_e32 v14, 0xffff, v6 731; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 732; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7 733; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 734; SI-NEXT: v_bcnt_u32_b32_e64 v7, v7, 0 735; SI-NEXT: v_bcnt_u32_b32_e64 v6, v6, 0 736; SI-NEXT: v_bcnt_u32_b32_e64 v5, v5, 0 737; SI-NEXT: v_bcnt_u32_b32_e64 v4, v4, 0 738; SI-NEXT: v_bcnt_u32_b32_e64 v3, v3, 0 739; SI-NEXT: v_bcnt_u32_b32_e64 v2, v2, 0 740; SI-NEXT: v_bcnt_u32_b32_e64 v1, v1, 0 741; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 0 742; SI-NEXT: v_bcnt_u32_b32_e64 v15, v15, 0 743; SI-NEXT: v_bcnt_u32_b32_e64 v14, v14, 0 744; SI-NEXT: v_bcnt_u32_b32_e64 v13, v13, 0 745; SI-NEXT: v_bcnt_u32_b32_e64 v12, v12, 0 746; SI-NEXT: v_bcnt_u32_b32_e64 v11, v11, 0 747; SI-NEXT: v_bcnt_u32_b32_e64 v10, v10, 0 748; SI-NEXT: v_bcnt_u32_b32_e64 v9, v9, 0 749; SI-NEXT: v_bcnt_u32_b32_e64 v8, v8, 0 
750; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 751; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 752; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 753; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 754; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 755; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2 756; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 757; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 758; SI-NEXT: v_or_b32_e32 v3, v15, v7 759; SI-NEXT: v_or_b32_e32 v2, v14, v6 760; SI-NEXT: v_or_b32_e32 v1, v13, v5 761; SI-NEXT: v_or_b32_e32 v0, v12, v4 762; SI-NEXT: v_or_b32_e32 v7, v11, v16 763; SI-NEXT: v_or_b32_e32 v6, v10, v17 764; SI-NEXT: v_or_b32_e32 v5, v9, v18 765; SI-NEXT: v_or_b32_e32 v4, v8, v19 766; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 767; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 768; SI-NEXT: s_endpgm 769; 770; VI-LABEL: v_ctpop_v16i16: 771; VI: ; %bb.0: 772; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 773; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 774; VI-NEXT: s_waitcnt lgkmcnt(0) 775; VI-NEXT: v_mov_b32_e32 v1, s3 776; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 777; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc 778; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] 779; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4 780; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 781; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 782; VI-NEXT: s_mov_b32 s3, 0xf000 783; VI-NEXT: s_mov_b32 s2, -1 784; VI-NEXT: s_waitcnt vmcnt(1) 785; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 786; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 787; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 788; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 789; VI-NEXT: v_and_b32_e32 v3, 0xffff, v3 790; VI-NEXT: v_and_b32_e32 v2, 0xffff, v2 791; VI-NEXT: v_and_b32_e32 v1, 0xffff, v1 792; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 793; VI-NEXT: s_waitcnt vmcnt(0) 794; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 795; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v6 796; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v5 797; VI-NEXT: v_lshrrev_b32_e32 v15, 16, v4 798; VI-NEXT: v_bcnt_u32_b32 v8, v8, 0 
799; VI-NEXT: v_bcnt_u32_b32 v9, v9, 0 800; VI-NEXT: v_bcnt_u32_b32 v10, v10, 0 801; VI-NEXT: v_bcnt_u32_b32 v11, v11, 0 802; VI-NEXT: v_and_b32_e32 v7, 0xffff, v7 803; VI-NEXT: v_and_b32_e32 v6, 0xffff, v6 804; VI-NEXT: v_and_b32_e32 v5, 0xffff, v5 805; VI-NEXT: v_and_b32_e32 v4, 0xffff, v4 806; VI-NEXT: v_bcnt_u32_b32 v3, v3, 0 807; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 808; VI-NEXT: v_bcnt_u32_b32 v1, v1, 0 809; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 810; VI-NEXT: v_bcnt_u32_b32 v12, v12, 0 811; VI-NEXT: v_bcnt_u32_b32 v13, v13, 0 812; VI-NEXT: v_bcnt_u32_b32 v14, v14, 0 813; VI-NEXT: v_bcnt_u32_b32 v15, v15, 0 814; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 815; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 816; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 817; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 818; VI-NEXT: v_bcnt_u32_b32 v7, v7, 0 819; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 820; VI-NEXT: v_bcnt_u32_b32 v5, v5, 0 821; VI-NEXT: v_bcnt_u32_b32 v4, v4, 0 822; VI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 823; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 824; VI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 825; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 826; VI-NEXT: v_or_b32_e32 v3, v3, v8 827; VI-NEXT: v_or_b32_e32 v2, v2, v9 828; VI-NEXT: v_or_b32_e32 v1, v1, v10 829; VI-NEXT: v_or_b32_e32 v0, v0, v11 830; VI-NEXT: v_or_b32_e32 v7, v7, v12 831; VI-NEXT: v_or_b32_e32 v6, v6, v13 832; VI-NEXT: v_or_b32_e32 v5, v5, v14 833; VI-NEXT: v_or_b32_e32 v4, v4, v15 834; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 835; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 836; VI-NEXT: s_endpgm 837; 838; EG-LABEL: v_ctpop_v16i16: 839; EG: ; %bb.0: 840; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] 841; EG-NEXT: TEX 1 @8 842; EG-NEXT: ALU 114, @16, KC0[], KC1[] 843; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[] 844; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0 845; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1 846; EG-NEXT: CF_END 847; EG-NEXT: PAD 848; EG-NEXT: Fetch clause starting at 8: 
849; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1 850; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1 851; EG-NEXT: ALU clause starting at 12: 852; EG-NEXT: MOV T0.Y, T4.X, 853; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 854; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) 855; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 856; EG-NEXT: ALU clause starting at 16: 857; EG-NEXT: LSHR * T0.W, T20.X, literal.x, 858; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 859; EG-NEXT: BCNT_INT * T0.W, PV.W, 860; EG-NEXT: LSHL T0.W, PV.W, literal.x, 861; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, 862; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) 863; EG-NEXT: OR_INT * T0.W, PS, PV.W, 864; EG-NEXT: MOV * T4.X, PV.W, 865; EG-NEXT: MOV T0.X, PV.X, 866; EG-NEXT: AND_INT * T0.W, T20.X, literal.x, 867; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 868; EG-NEXT: BCNT_INT T0.W, PV.W, 869; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 870; EG-NEXT: -65536(nan), 0(0.000000e+00) 871; EG-NEXT: OR_INT * T0.W, PS, PV.W, 872; EG-NEXT: MOV T4.X, PV.W, 873; EG-NEXT: MOV * T0.X, T5.X, 874; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, 875; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 876; EG-NEXT: BCNT_INT T0.W, PV.W, 877; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 878; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 879; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 880; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 881; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 882; EG-NEXT: MOV * T5.X, PV.W, 883; EG-NEXT: MOV T0.X, PV.X, 884; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x, 885; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 886; EG-NEXT: BCNT_INT T0.W, PV.W, 887; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 888; EG-NEXT: -65536(nan), 0(0.000000e+00) 889; EG-NEXT: OR_INT * T0.Y, PS, PV.W, 890; EG-NEXT: MOV T5.X, PV.Y, 891; EG-NEXT: MOV * T0.X, T8.X, 892; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, 893; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 894; EG-NEXT: BCNT_INT T0.W, PV.W, 895; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 896; EG-NEXT: 
65535(9.183409e-41), 0(0.000000e+00) 897; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 898; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 899; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 900; EG-NEXT: MOV * T8.X, PV.W, 901; EG-NEXT: MOV T0.X, PV.X, 902; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x, 903; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 904; EG-NEXT: BCNT_INT T0.W, PV.W, 905; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 906; EG-NEXT: -65536(nan), 0(0.000000e+00) 907; EG-NEXT: OR_INT * T0.W, PS, PV.W, 908; EG-NEXT: MOV T8.X, PV.W, 909; EG-NEXT: MOV * T0.X, T9.X, 910; EG-NEXT: LSHR * T0.W, T20.W, literal.x, 911; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 912; EG-NEXT: BCNT_INT T0.W, PV.W, 913; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 914; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 915; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 916; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 917; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 918; EG-NEXT: MOV * T9.X, PV.W, 919; EG-NEXT: MOV T0.X, PV.X, 920; EG-NEXT: AND_INT * T0.W, T20.W, literal.x, 921; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 922; EG-NEXT: BCNT_INT T0.W, PV.W, 923; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, 924; EG-NEXT: -65536(nan), 0(0.000000e+00) 925; EG-NEXT: OR_INT * T0.W, PS, PV.W, 926; EG-NEXT: MOV T9.X, PV.W, 927; EG-NEXT: MOV * T0.X, T12.X, 928; EG-NEXT: LSHR * T1.W, T21.X, literal.x, 929; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 930; EG-NEXT: BCNT_INT T1.W, PV.W, 931; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 932; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 933; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 934; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 935; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 936; EG-NEXT: MOV * T12.X, PV.W, 937; EG-NEXT: MOV T0.X, PV.X, 938; EG-NEXT: AND_INT * T1.W, T21.X, literal.x, 939; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 940; EG-NEXT: BCNT_INT T1.W, PV.W, 941; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 942; EG-NEXT: -65536(nan), 0(0.000000e+00) 943; EG-NEXT: OR_INT * T1.W, PS, PV.W, 944; 
EG-NEXT: MOV T12.X, PV.W, 945; EG-NEXT: MOV * T0.X, T13.X, 946; EG-NEXT: LSHR * T1.W, T21.Y, literal.x, 947; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 948; EG-NEXT: BCNT_INT T1.W, PV.W, 949; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 950; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 951; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 952; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 953; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 954; EG-NEXT: MOV * T13.X, PV.W, 955; EG-NEXT: MOV T0.X, PV.X, 956; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x, 957; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 958; EG-NEXT: BCNT_INT T1.W, PV.W, 959; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 960; EG-NEXT: -65536(nan), 0(0.000000e+00) 961; EG-NEXT: OR_INT * T20.Y, PS, PV.W, 962; EG-NEXT: MOV T13.X, PV.Y, 963; EG-NEXT: MOV * T0.X, T16.X, 964; EG-NEXT: LSHR * T1.W, T21.Z, literal.x, 965; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 966; EG-NEXT: BCNT_INT T1.W, PV.W, 967; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 968; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 969; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 970; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 971; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 972; EG-NEXT: ALU clause starting at 131: 973; EG-NEXT: MOV * T16.X, T1.W, 974; EG-NEXT: MOV T0.X, PV.X, 975; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x, 976; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 977; EG-NEXT: BCNT_INT T1.W, PV.W, 978; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 979; EG-NEXT: -65536(nan), 0(0.000000e+00) 980; EG-NEXT: OR_INT * T1.W, PS, PV.W, 981; EG-NEXT: MOV T16.X, PV.W, 982; EG-NEXT: MOV * T0.X, T17.X, 983; EG-NEXT: LSHR * T1.W, T21.W, literal.x, 984; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 985; EG-NEXT: BCNT_INT T1.W, PV.W, 986; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 987; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 988; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 989; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 990; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 991; EG-NEXT: MOV * T17.X, PV.W, 992; 
EG-NEXT: MOV T0.X, PV.X, 993; EG-NEXT: AND_INT T1.W, T21.W, literal.x, 994; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y, 995; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) 996; EG-NEXT: AND_INT T0.Z, PV.X, literal.x, 997; EG-NEXT: BCNT_INT T1.W, PV.W, 998; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, 999; EG-NEXT: -65536(nan), 16(2.242078e-44) 1000; EG-NEXT: LSHR T22.X, PS, literal.x, 1001; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W, 1002; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1003; EG-NEXT: MOV T17.X, PV.W, 1004; EG-NEXT: MOV * T0.X, T4.X, 1005; EG-NEXT: MOV * T0.Z, T8.X, 1006; EG-NEXT: MOV T20.X, T12.X, 1007; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212 1008 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1009 %in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid 1010 %val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32 1011 %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone 1012 store <16 x i16> %ctpop, ptr addrspace(1) %out, align 32 1013 ret void 1014} 1015 1016define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1017; SI-LABEL: v_ctpop_i16_add_inline_constant: 1018; SI: ; %bb.0: 1019; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1020; SI-NEXT: s_mov_b32 s7, 0xf000 1021; SI-NEXT: s_mov_b32 s10, 0 1022; SI-NEXT: s_mov_b32 s11, s7 1023; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1024; SI-NEXT: s_waitcnt lgkmcnt(0) 1025; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1026; SI-NEXT: v_mov_b32_e32 v1, 0 1027; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1028; SI-NEXT: s_mov_b32 s6, -1 1029; SI-NEXT: s_mov_b32 s4, s0 1030; SI-NEXT: s_mov_b32 s5, s1 1031; SI-NEXT: s_waitcnt vmcnt(0) 1032; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 1033; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1034; SI-NEXT: s_endpgm 1035; 1036; VI-LABEL: v_ctpop_i16_add_inline_constant: 1037; VI: ; %bb.0: 1038; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1039; VI-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 1040; VI-NEXT: s_waitcnt lgkmcnt(0) 1041; VI-NEXT: v_mov_b32_e32 v1, s3 1042; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1043; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1044; VI-NEXT: flat_load_ushort v0, v[0:1] 1045; VI-NEXT: s_mov_b32 s3, 0xf000 1046; VI-NEXT: s_mov_b32 s2, -1 1047; VI-NEXT: s_waitcnt vmcnt(0) 1048; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 1049; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1050; VI-NEXT: s_endpgm 1051; 1052; EG-LABEL: v_ctpop_i16_add_inline_constant: 1053; EG: ; %bb.0: 1054; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1055; EG-NEXT: TEX 0 @6 1056; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1057; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1058; EG-NEXT: CF_END 1059; EG-NEXT: PAD 1060; EG-NEXT: Fetch clause starting at 6: 1061; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1062; EG-NEXT: ALU clause starting at 8: 1063; EG-NEXT: LSHL * T0.W, T0.X, 1, 1064; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1065; EG-NEXT: ALU clause starting at 10: 1066; EG-NEXT: BCNT_INT T0.W, T0.X, 1067; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1068; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1069; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1070; EG-NEXT: LSHL * T1.W, PS, literal.y, 1071; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) 1072; EG-NEXT: LSHL T0.X, PV.W, PS, 1073; EG-NEXT: LSHL * T0.W, literal.x, PS, 1074; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1075; EG-NEXT: MOV T0.Y, 0.0, 1076; EG-NEXT: MOV * T0.Z, 0.0, 1077; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1078; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1079 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1080 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 1081 %val = load i16, ptr addrspace(1) %in.gep, align 4 1082 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1083 %add = add i16 %ctpop, 4 1084 store i16 %add, ptr addrspace(1) %out, align 4 1085 ret void 1086} 1087 1088define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr 
addrspace(1) noalias %in) nounwind { 1089; SI-LABEL: v_ctpop_i16_add_inline_constant_inv: 1090; SI: ; %bb.0: 1091; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1092; SI-NEXT: s_mov_b32 s7, 0xf000 1093; SI-NEXT: s_mov_b32 s10, 0 1094; SI-NEXT: s_mov_b32 s11, s7 1095; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1096; SI-NEXT: s_waitcnt lgkmcnt(0) 1097; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1098; SI-NEXT: v_mov_b32_e32 v1, 0 1099; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1100; SI-NEXT: s_mov_b32 s6, -1 1101; SI-NEXT: s_mov_b32 s4, s0 1102; SI-NEXT: s_mov_b32 s5, s1 1103; SI-NEXT: s_waitcnt vmcnt(0) 1104; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, 4 1105; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1106; SI-NEXT: s_endpgm 1107; 1108; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: 1109; VI: ; %bb.0: 1110; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1111; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1112; VI-NEXT: s_waitcnt lgkmcnt(0) 1113; VI-NEXT: v_mov_b32_e32 v1, s3 1114; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1115; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1116; VI-NEXT: flat_load_ushort v0, v[0:1] 1117; VI-NEXT: s_mov_b32 s3, 0xf000 1118; VI-NEXT: s_mov_b32 s2, -1 1119; VI-NEXT: s_waitcnt vmcnt(0) 1120; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 1121; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1122; VI-NEXT: s_endpgm 1123; 1124; EG-LABEL: v_ctpop_i16_add_inline_constant_inv: 1125; EG: ; %bb.0: 1126; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1127; EG-NEXT: TEX 0 @6 1128; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1129; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1130; EG-NEXT: CF_END 1131; EG-NEXT: PAD 1132; EG-NEXT: Fetch clause starting at 6: 1133; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1134; EG-NEXT: ALU clause starting at 8: 1135; EG-NEXT: LSHL * T0.W, T0.X, 1, 1136; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1137; EG-NEXT: ALU clause starting at 10: 1138; EG-NEXT: BCNT_INT T0.W, T0.X, 1139; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1140; EG-NEXT: 3(4.203895e-45), 
0(0.000000e+00) 1141; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1142; EG-NEXT: LSHL * T1.W, PS, literal.y, 1143; EG-NEXT: 4(5.605194e-45), 3(4.203895e-45) 1144; EG-NEXT: LSHL T0.X, PV.W, PS, 1145; EG-NEXT: LSHL * T0.W, literal.x, PS, 1146; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1147; EG-NEXT: MOV T0.Y, 0.0, 1148; EG-NEXT: MOV * T0.Z, 0.0, 1149; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1150; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1151 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1152 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 1153 %val = load i16, ptr addrspace(1) %in.gep, align 4 1154 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1155 %add = add i16 4, %ctpop 1156 store i16 %add, ptr addrspace(1) %out, align 4 1157 ret void 1158} 1159 1160define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind { 1161; SI-LABEL: v_ctpop_i16_add_literal: 1162; SI: ; %bb.0: 1163; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1164; SI-NEXT: s_mov_b32 s7, 0xf000 1165; SI-NEXT: s_mov_b32 s10, 0 1166; SI-NEXT: s_mov_b32 s11, s7 1167; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1168; SI-NEXT: s_waitcnt lgkmcnt(0) 1169; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1170; SI-NEXT: v_mov_b32_e32 v1, 0 1171; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1172; SI-NEXT: s_mov_b32 s4, s0 1173; SI-NEXT: s_movk_i32 s0, 0x3e7 1174; SI-NEXT: s_mov_b32 s6, -1 1175; SI-NEXT: s_mov_b32 s5, s1 1176; SI-NEXT: s_waitcnt vmcnt(0) 1177; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s0 1178; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1179; SI-NEXT: s_endpgm 1180; 1181; VI-LABEL: v_ctpop_i16_add_literal: 1182; VI: ; %bb.0: 1183; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1184; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1185; VI-NEXT: s_movk_i32 s4, 0x3e7 1186; VI-NEXT: s_waitcnt lgkmcnt(0) 1187; VI-NEXT: v_mov_b32_e32 v1, s3 1188; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1189; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
1190; VI-NEXT: flat_load_ushort v0, v[0:1] 1191; VI-NEXT: s_mov_b32 s3, 0xf000 1192; VI-NEXT: s_mov_b32 s2, -1 1193; VI-NEXT: s_waitcnt vmcnt(0) 1194; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1195; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1196; VI-NEXT: s_endpgm 1197; 1198; EG-LABEL: v_ctpop_i16_add_literal: 1199; EG: ; %bb.0: 1200; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] 1201; EG-NEXT: TEX 0 @6 1202; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[] 1203; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1204; EG-NEXT: CF_END 1205; EG-NEXT: PAD 1206; EG-NEXT: Fetch clause starting at 6: 1207; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1208; EG-NEXT: ALU clause starting at 8: 1209; EG-NEXT: LSHL * T0.W, T0.X, 1, 1210; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1211; EG-NEXT: ALU clause starting at 10: 1212; EG-NEXT: BCNT_INT T0.W, T0.X, 1213; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1214; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1215; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, 1216; EG-NEXT: LSHL * T1.W, PS, literal.y, 1217; EG-NEXT: 999(1.399897e-42), 3(4.203895e-45) 1218; EG-NEXT: LSHL T0.X, PV.W, PS, 1219; EG-NEXT: LSHL * T0.W, literal.x, PS, 1220; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1221; EG-NEXT: MOV T0.Y, 0.0, 1222; EG-NEXT: MOV * T0.Z, 0.0, 1223; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1224; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1225 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1226 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 1227 %val = load i16, ptr addrspace(1) %in.gep, align 4 1228 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1229 %add = add i16 %ctpop, 999 1230 store i16 %add, ptr addrspace(1) %out, align 4 1231 ret void 1232} 1233 1234define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { 1235; SI-LABEL: v_ctpop_i16_add_var: 1236; SI: ; %bb.0: 1237; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1238; SI-NEXT: s_load_dword s12, s[4:5], 0xd 1239; SI-NEXT: 
s_mov_b32 s7, 0xf000 1240; SI-NEXT: s_mov_b32 s10, 0 1241; SI-NEXT: s_mov_b32 s11, s7 1242; SI-NEXT: s_waitcnt lgkmcnt(0) 1243; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1244; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1245; SI-NEXT: v_mov_b32_e32 v1, 0 1246; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1247; SI-NEXT: s_mov_b32 s6, -1 1248; SI-NEXT: s_mov_b32 s4, s0 1249; SI-NEXT: s_mov_b32 s5, s1 1250; SI-NEXT: s_waitcnt vmcnt(0) 1251; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 1252; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1253; SI-NEXT: s_endpgm 1254; 1255; VI-LABEL: v_ctpop_i16_add_var: 1256; VI: ; %bb.0: 1257; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1258; VI-NEXT: s_load_dword s4, s[4:5], 0x34 1259; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1260; VI-NEXT: s_waitcnt lgkmcnt(0) 1261; VI-NEXT: v_mov_b32_e32 v1, s3 1262; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1263; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1264; VI-NEXT: flat_load_ushort v0, v[0:1] 1265; VI-NEXT: s_mov_b32 s3, 0xf000 1266; VI-NEXT: s_mov_b32 s2, -1 1267; VI-NEXT: s_waitcnt vmcnt(0) 1268; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1269; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1270; VI-NEXT: s_endpgm 1271; 1272; EG-LABEL: v_ctpop_i16_add_var: 1273; EG: ; %bb.0: 1274; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1275; EG-NEXT: TEX 0 @8 1276; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1277; EG-NEXT: TEX 0 @10 1278; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1279; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1280; EG-NEXT: CF_END 1281; EG-NEXT: PAD 1282; EG-NEXT: Fetch clause starting at 8: 1283; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1284; EG-NEXT: Fetch clause starting at 10: 1285; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 1286; EG-NEXT: ALU clause starting at 12: 1287; EG-NEXT: LSHL * T0.W, T0.X, 1, 1288; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1289; EG-NEXT: ALU clause starting at 14: 1290; EG-NEXT: MOV * T1.X, 0.0, 1291; EG-NEXT: ALU clause starting at 15: 1292; EG-NEXT: BCNT_INT T0.W, T0.X, 1293; EG-NEXT: AND_INT * 
T1.W, KC0[2].Y, literal.x, 1294; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1295; EG-NEXT: ADD_INT * T0.W, PV.W, T1.X, 1296; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1297; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1298; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1299; EG-NEXT: LSHL T0.X, PV.W, PS, 1300; EG-NEXT: LSHL * T0.W, literal.x, PS, 1301; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1302; EG-NEXT: MOV T0.Y, 0.0, 1303; EG-NEXT: MOV * T0.Z, 0.0, 1304; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1305; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1306 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1307 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 1308 %val = load i16, ptr addrspace(1) %in.gep, align 4 1309 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1310 %add = add i16 %ctpop, %const 1311 store i16 %add, ptr addrspace(1) %out, align 4 1312 ret void 1313} 1314 1315define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind { 1316; SI-LABEL: v_ctpop_i16_add_var_inv: 1317; SI: ; %bb.0: 1318; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1319; SI-NEXT: s_load_dword s12, s[4:5], 0xd 1320; SI-NEXT: s_mov_b32 s7, 0xf000 1321; SI-NEXT: s_mov_b32 s10, 0 1322; SI-NEXT: s_mov_b32 s11, s7 1323; SI-NEXT: s_waitcnt lgkmcnt(0) 1324; SI-NEXT: s_mov_b64 s[8:9], s[2:3] 1325; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1326; SI-NEXT: v_mov_b32_e32 v1, 0 1327; SI-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64 1328; SI-NEXT: s_mov_b32 s6, -1 1329; SI-NEXT: s_mov_b32 s4, s0 1330; SI-NEXT: s_mov_b32 s5, s1 1331; SI-NEXT: s_waitcnt vmcnt(0) 1332; SI-NEXT: v_bcnt_u32_b32_e64 v0, v0, s12 1333; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 1334; SI-NEXT: s_endpgm 1335; 1336; VI-LABEL: v_ctpop_i16_add_var_inv: 1337; VI: ; %bb.0: 1338; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1339; VI-NEXT: s_load_dword s4, s[4:5], 0x34 1340; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1341; VI-NEXT: s_waitcnt 
lgkmcnt(0) 1342; VI-NEXT: v_mov_b32_e32 v1, s3 1343; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 1344; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1345; VI-NEXT: flat_load_ushort v0, v[0:1] 1346; VI-NEXT: s_mov_b32 s3, 0xf000 1347; VI-NEXT: s_mov_b32 s2, -1 1348; VI-NEXT: s_waitcnt vmcnt(0) 1349; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 1350; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1351; VI-NEXT: s_endpgm 1352; 1353; EG-LABEL: v_ctpop_i16_add_var_inv: 1354; EG: ; %bb.0: 1355; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1356; EG-NEXT: TEX 0 @8 1357; EG-NEXT: ALU 0, @14, KC0[], KC1[] 1358; EG-NEXT: TEX 0 @10 1359; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1360; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1361; EG-NEXT: CF_END 1362; EG-NEXT: PAD 1363; EG-NEXT: Fetch clause starting at 8: 1364; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1365; EG-NEXT: Fetch clause starting at 10: 1366; EG-NEXT: VTX_READ_16 T1.X, T1.X, 44, #3 1367; EG-NEXT: ALU clause starting at 12: 1368; EG-NEXT: LSHL * T0.W, T0.X, 1, 1369; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1370; EG-NEXT: ALU clause starting at 14: 1371; EG-NEXT: MOV * T1.X, 0.0, 1372; EG-NEXT: ALU clause starting at 15: 1373; EG-NEXT: BCNT_INT T0.W, T0.X, 1374; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1375; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1376; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W, 1377; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1378; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1379; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1380; EG-NEXT: LSHL T0.X, PV.W, PS, 1381; EG-NEXT: LSHL * T0.W, literal.x, PS, 1382; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1383; EG-NEXT: MOV T0.Y, 0.0, 1384; EG-NEXT: MOV * T0.Z, 0.0, 1385; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1386; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1387 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1388 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 1389 %val = load i16, ptr addrspace(1) %in.gep, align 4 1390 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind 
readnone 1391 %add = add i16 %const, %ctpop 1392 store i16 %add, ptr addrspace(1) %out, align 4 1393 ret void 1394} 1395 1396define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind { 1397; SI-LABEL: v_ctpop_i16_add_vvar_inv: 1398; SI: ; %bb.0: 1399; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1400; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1401; SI-NEXT: s_mov_b32 s11, 0xf000 1402; SI-NEXT: s_mov_b32 s14, 0 1403; SI-NEXT: s_mov_b32 s15, s11 1404; SI-NEXT: s_waitcnt lgkmcnt(0) 1405; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 1406; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 1407; SI-NEXT: v_mov_b32_e32 v1, 0 1408; SI-NEXT: s_mov_b64 s[6:7], s[14:15] 1409; SI-NEXT: buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 1410; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 1411; SI-NEXT: s_mov_b32 s10, -1 1412; SI-NEXT: s_mov_b32 s8, s0 1413; SI-NEXT: s_mov_b32 s9, s1 1414; SI-NEXT: s_waitcnt vmcnt(0) 1415; SI-NEXT: v_bcnt_u32_b32_e32 v0, v2, v0 1416; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 1417; SI-NEXT: s_endpgm 1418; 1419; VI-LABEL: v_ctpop_i16_add_vvar_inv: 1420; VI: ; %bb.0: 1421; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1422; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1423; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 1424; VI-NEXT: s_waitcnt lgkmcnt(0) 1425; VI-NEXT: v_mov_b32_e32 v1, s3 1426; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1427; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1428; VI-NEXT: flat_load_ushort v3, v[0:1] 1429; VI-NEXT: v_mov_b32_e32 v1, s5 1430; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 1431; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1432; VI-NEXT: flat_load_ushort v0, v[0:1] 1433; VI-NEXT: s_mov_b32 s3, 0xf000 1434; VI-NEXT: s_mov_b32 s2, -1 1435; VI-NEXT: s_waitcnt vmcnt(0) 1436; VI-NEXT: v_bcnt_u32_b32 v0, v3, v0 1437; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1438; VI-NEXT: s_endpgm 1439; 1440; EG-LABEL: v_ctpop_i16_add_vvar_inv: 1441; 
EG: ; %bb.0: 1442; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[] 1443; EG-NEXT: TEX 0 @8 1444; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 1445; EG-NEXT: TEX 0 @10 1446; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[] 1447; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 1448; EG-NEXT: CF_END 1449; EG-NEXT: PAD 1450; EG-NEXT: Fetch clause starting at 8: 1451; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 1452; EG-NEXT: Fetch clause starting at 10: 1453; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 1454; EG-NEXT: ALU clause starting at 12: 1455; EG-NEXT: LSHL * T0.W, T0.X, 1, 1456; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1457; EG-NEXT: ALU clause starting at 14: 1458; EG-NEXT: ADD_INT * T1.X, KC0[2].W, T0.W, 1459; EG-NEXT: ALU clause starting at 15: 1460; EG-NEXT: BCNT_INT T0.W, T0.X, 1461; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, 1462; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1463; EG-NEXT: ADD_INT * T0.W, T1.X, PV.W, 1464; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 1465; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 1466; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 1467; EG-NEXT: LSHL T0.X, PV.W, PS, 1468; EG-NEXT: LSHL * T0.W, literal.x, PS, 1469; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1470; EG-NEXT: MOV T0.Y, 0.0, 1471; EG-NEXT: MOV * T0.Z, 0.0, 1472; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1473; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1474 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1475 %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid 1476 %val = load i16, ptr addrspace(1) %in.gep, align 4 1477 %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone 1478 %gep = getelementptr i16, ptr addrspace(1) %constptr, i32 %tid 1479 %const = load i16, ptr addrspace(1) %gep, align 4 1480 %add = add i16 %const, %ctpop 1481 store i16 %add, ptr addrspace(1) %out, align 4 1482 ret void 1483} 1484 1485; FIXME: We currently disallow SALU instructions in all branches, 1486; but there are some cases when they should be allowed. 
; Kernel ctpop_i16_in_br branches on whether the i16 argument %cond is zero.
;   %if    (cond == 0): takes @llvm.ctpop.i16 of the kernel argument %ctpop_arg.
;   %else  (cond != 0): loads the i16 at element index 1 of %in.
;   %endif            : a phi picks whichever value was produced and stores it
;                       to %out.
; In the %if block the SI and VI check lines expect the population count as
; s_and_b32 with 0xffff followed by s_bcnt1_i32_b32; the EG check lines expect
; BCNT_INT.
; NOTE(review): s_bcnt1_i32_b32 is presumably the scalar (SALU) form that the
; FIXME preceding this function talks about, which would make that FIXME
; stale - confirm.
; NOTE(review): in .LBB14_4 the SI expectation is a move of 0 into v0 while the
; VI expectation is an implicit-def marker. The file header says the assertions
; are generated by utils/update_llc_test_checks.py, so regenerate them rather
; than hand-editing if this difference ever needs to change.
1487define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) { 1488; SI-LABEL: ctpop_i16_in_br: 1489; SI: ; %bb.0: ; %entry 1490; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1491; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1492; SI-NEXT: s_waitcnt lgkmcnt(0) 1493; SI-NEXT: s_lshr_b32 s4, s6, 16 1494; SI-NEXT: s_cmp_lg_u32 s4, 0 1495; SI-NEXT: s_cbranch_scc0 .LBB14_4 1496; SI-NEXT: ; %bb.1: ; %else 1497; SI-NEXT: s_mov_b32 s11, 0xf000 1498; SI-NEXT: s_mov_b32 s10, -1 1499; SI-NEXT: s_mov_b32 s8, s2 1500; SI-NEXT: s_mov_b32 s9, s3 1501; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1502; SI-NEXT: s_cbranch_execnz .LBB14_3 1503; SI-NEXT: .LBB14_2: ; %if 1504; SI-NEXT: s_and_b32 s2, s6, 0xffff 1505; SI-NEXT: s_bcnt1_i32_b32 s2, s2 1506; SI-NEXT: s_waitcnt vmcnt(0) 1507; SI-NEXT: v_mov_b32_e32 v0, s2 1508; SI-NEXT: .LBB14_3: ; %endif 1509; SI-NEXT: s_mov_b32 s3, 0xf000 1510; SI-NEXT: s_mov_b32 s2, -1 1511; SI-NEXT: s_waitcnt vmcnt(0) 1512; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1513; SI-NEXT: s_endpgm 1514; SI-NEXT: .LBB14_4: 1515; SI-NEXT: v_mov_b32_e32 v0, 0 1516; SI-NEXT: s_branch .LBB14_2 1517; 1518; VI-LABEL: ctpop_i16_in_br: 1519; VI: ; %bb.0: ; %entry 1520; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1521; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1522; VI-NEXT: s_waitcnt lgkmcnt(0) 1523; VI-NEXT: s_lshr_b32 s4, s6, 16 1524; VI-NEXT: s_cmp_lg_u32 s4, 0 1525; VI-NEXT: s_cbranch_scc0 .LBB14_4 1526; VI-NEXT: ; %bb.1: ; %else 1527; VI-NEXT: s_mov_b32 s11, 0xf000 1528; VI-NEXT: s_mov_b32 s10, -1 1529; VI-NEXT: s_mov_b32 s8, s2 1530; VI-NEXT: s_mov_b32 s9, s3 1531; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2 1532; VI-NEXT: s_cbranch_execnz .LBB14_3 1533; VI-NEXT: .LBB14_2: ; %if 1534; VI-NEXT: s_and_b32 s2, s6, 0xffff 1535; VI-NEXT: s_bcnt1_i32_b32 s2, s2 1536; VI-NEXT: s_waitcnt vmcnt(0) 1537; VI-NEXT: v_mov_b32_e32 v0, s2 1538; VI-NEXT: .LBB14_3: ; %endif 1539; VI-NEXT: s_mov_b32 s3, 
0xf000 1540; VI-NEXT: s_mov_b32 s2, -1 1541; VI-NEXT: s_waitcnt vmcnt(0) 1542; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1543; VI-NEXT: s_endpgm 1544; VI-NEXT: .LBB14_4: 1545; VI-NEXT: ; implicit-def: $vgpr0 1546; VI-NEXT: s_branch .LBB14_2 1547; 1548; EG-LABEL: ctpop_i16_in_br: 1549; EG: ; %bb.0: ; %entry 1550; EG-NEXT: ALU 0, @20, KC0[], KC1[] 1551; EG-NEXT: TEX 0 @14 1552; EG-NEXT: ALU_PUSH_BEFORE 4, @21, KC0[], KC1[] 1553; EG-NEXT: JUMP @7 POP:1 1554; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[] 1555; EG-NEXT: TEX 0 @16 1556; EG-NEXT: ALU_POP_AFTER 1, @27, KC0[], KC1[] 1557; EG-NEXT: ALU_PUSH_BEFORE 2, @29, KC0[CB0:0-32], KC1[] 1558; EG-NEXT: JUMP @11 POP:1 1559; EG-NEXT: TEX 0 @18 1560; EG-NEXT: ALU_POP_AFTER 0, @32, KC0[], KC1[] 1561; EG-NEXT: ALU 11, @33, KC0[], KC1[] 1562; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X 1563; EG-NEXT: CF_END 1564; EG-NEXT: Fetch clause starting at 14: 1565; EG-NEXT: VTX_READ_16 T2.X, T1.X, 46, #3 1566; EG-NEXT: Fetch clause starting at 16: 1567; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 1568; EG-NEXT: Fetch clause starting at 18: 1569; EG-NEXT: VTX_READ_16 T0.X, T1.X, 44, #3 1570; EG-NEXT: ALU clause starting at 20: 1571; EG-NEXT: MOV * T1.X, 0.0, 1572; EG-NEXT: ALU clause starting at 21: 1573; EG-NEXT: MOV T0.X, literal.x, 1574; EG-NEXT: MOV T1.W, literal.y, 1575; EG-NEXT: SETNE_INT * T0.W, T2.X, 0.0, 1576; EG-NEXT: 0(0.000000e+00), 1(1.401298e-45) 1577; EG-NEXT: PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1578; EG-NEXT: ALU clause starting at 26: 1579; EG-NEXT: MOV * T0.X, KC0[2].Z, 1580; EG-NEXT: ALU clause starting at 27: 1581; EG-NEXT: MOV * T1.W, literal.x, 1582; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 1583; EG-NEXT: ALU clause starting at 29: 1584; EG-NEXT: MOV T0.W, KC0[2].Y, 1585; EG-NEXT: SETE_INT * T1.W, T1.W, 0.0, 1586; EG-NEXT: PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0, 1587; EG-NEXT: ALU clause starting at 32: 1588; EG-NEXT: BCNT_INT * T0.X, T0.X, 1589; EG-NEXT: ALU clause starting at 33: 1590; 
EG-NEXT: LSHL * T1.W, T0.W, literal.x, 1591; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1592; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 1593; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, 1594; EG-NEXT: 24(3.363116e-44), 65535(9.183409e-41) 1595; EG-NEXT: LSHL T1.X, PS, PV.W, 1596; EG-NEXT: LSHL * T1.W, literal.x, PV.W, 1597; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 1598; EG-NEXT: MOV T1.Y, 0.0, 1599; EG-NEXT: MOV * T1.Z, 0.0, 1600; EG-NEXT: LSHR * T0.X, T0.W, literal.x, 1601; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1602entry: 1603 %tmp0 = icmp eq i16 %cond, 0 1604 br i1 %tmp0, label %if, label %else 1605 1606if: 1607 %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg) 1608 br label %endif 1609 1610else: 1611 %tmp3 = getelementptr i16, ptr addrspace(1) %in, i16 1 1612 %tmp4 = load i16, ptr addrspace(1) %tmp3 1613 br label %endif 1614 1615endif: 1616 %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else] 1617 store i16 %tmp5, ptr addrspace(1) %out 1618 ret void 1619} 1620