1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI 3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI 4; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG 5 6declare i32 @llvm.amdgcn.workitem.id.x() #0 7 8declare i32 @llvm.amdgcn.workgroup.id.x() #0 9 10define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 11; SI-LABEL: shl_v2i32: 12; SI: ; %bb.0: 13; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 14; SI-NEXT: s_mov_b32 s7, 0xf000 15; SI-NEXT: s_mov_b32 s6, -1 16; SI-NEXT: s_mov_b32 s10, s6 17; SI-NEXT: s_mov_b32 s11, s7 18; SI-NEXT: s_waitcnt lgkmcnt(0) 19; SI-NEXT: s_mov_b32 s8, s2 20; SI-NEXT: s_mov_b32 s9, s3 21; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 22; SI-NEXT: s_mov_b32 s4, s0 23; SI-NEXT: s_mov_b32 s5, s1 24; SI-NEXT: s_waitcnt vmcnt(0) 25; SI-NEXT: v_lshl_b32_e32 v1, v1, v3 26; SI-NEXT: v_lshl_b32_e32 v0, v0, v2 27; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 28; SI-NEXT: s_endpgm 29; 30; VI-LABEL: shl_v2i32: 31; VI: ; %bb.0: 32; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 33; VI-NEXT: s_waitcnt lgkmcnt(0) 34; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 35; VI-NEXT: s_mov_b32 s3, 0xf000 36; VI-NEXT: s_mov_b32 s2, -1 37; VI-NEXT: s_waitcnt lgkmcnt(0) 38; VI-NEXT: s_lshl_b32 s5, s5, s7 39; VI-NEXT: s_lshl_b32 s4, s4, s6 40; VI-NEXT: v_mov_b32_e32 v0, s4 41; VI-NEXT: v_mov_b32_e32 v1, s5 42; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 43; VI-NEXT: s_endpgm 44; 45; EG-LABEL: shl_v2i32: 46; EG: ; %bb.0: 47; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 48; EG-NEXT: TEX 0 @6 49; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] 50; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 51; 
EG-NEXT: CF_END 52; EG-NEXT: PAD 53; EG-NEXT: Fetch clause starting at 6: 54; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 55; EG-NEXT: ALU clause starting at 8: 56; EG-NEXT: MOV * T0.X, KC0[2].Z, 57; EG-NEXT: ALU clause starting at 9: 58; EG-NEXT: LSHL * T0.Y, T0.Y, T0.W, 59; EG-NEXT: LSHL T0.X, T0.X, T0.Z, 60; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 61; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 62 %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 63 %a = load <2 x i32>, ptr addrspace(1) %in 64 %b = load <2 x i32>, ptr addrspace(1) %b_ptr 65 %result = shl <2 x i32> %a, %b 66 store <2 x i32> %result, ptr addrspace(1) %out 67 ret void 68} 69 70define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { 71; SI-LABEL: shl_v4i32: 72; SI: ; %bb.0: 73; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 74; SI-NEXT: s_mov_b32 s7, 0xf000 75; SI-NEXT: s_mov_b32 s6, -1 76; SI-NEXT: s_mov_b32 s10, s6 77; SI-NEXT: s_mov_b32 s11, s7 78; SI-NEXT: s_waitcnt lgkmcnt(0) 79; SI-NEXT: s_mov_b32 s8, s2 80; SI-NEXT: s_mov_b32 s9, s3 81; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 82; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 83; SI-NEXT: s_mov_b32 s4, s0 84; SI-NEXT: s_mov_b32 s5, s1 85; SI-NEXT: s_waitcnt vmcnt(0) 86; SI-NEXT: v_lshl_b32_e32 v3, v3, v7 87; SI-NEXT: v_lshl_b32_e32 v2, v2, v6 88; SI-NEXT: v_lshl_b32_e32 v1, v1, v5 89; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 90; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 91; SI-NEXT: s_endpgm 92; 93; VI-LABEL: shl_v4i32: 94; VI: ; %bb.0: 95; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 96; VI-NEXT: s_waitcnt lgkmcnt(0) 97; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 98; VI-NEXT: s_mov_b32 s11, 0xf000 99; VI-NEXT: s_mov_b32 s10, -1 100; VI-NEXT: s_waitcnt lgkmcnt(0) 101; VI-NEXT: s_lshl_b32 s3, s3, s7 102; VI-NEXT: s_lshl_b32 s2, s2, s6 103; VI-NEXT: s_lshl_b32 s1, s1, s5 104; VI-NEXT: s_lshl_b32 s0, s0, s4 105; VI-NEXT: v_mov_b32_e32 v0, s0 106; VI-NEXT: v_mov_b32_e32 
v1, s1 107; VI-NEXT: v_mov_b32_e32 v2, s2 108; VI-NEXT: v_mov_b32_e32 v3, s3 109; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 110; VI-NEXT: s_endpgm 111; 112; EG-LABEL: shl_v4i32: 113; EG: ; %bb.0: 114; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 115; EG-NEXT: TEX 1 @6 116; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] 117; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 118; EG-NEXT: CF_END 119; EG-NEXT: PAD 120; EG-NEXT: Fetch clause starting at 6: 121; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 122; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 123; EG-NEXT: ALU clause starting at 10: 124; EG-NEXT: MOV * T0.X, KC0[2].Z, 125; EG-NEXT: ALU clause starting at 11: 126; EG-NEXT: LSHL * T0.W, T0.W, T1.W, 127; EG-NEXT: LSHL * T0.Z, T0.Z, T1.Z, 128; EG-NEXT: LSHL * T0.Y, T0.Y, T1.Y, 129; EG-NEXT: LSHL T0.X, T0.X, T1.X, 130; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 131; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 132 %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 133 %a = load <4 x i32>, ptr addrspace(1) %in 134 %b = load <4 x i32>, ptr addrspace(1) %b_ptr 135 %result = shl <4 x i32> %a, %b 136 store <4 x i32> %result, ptr addrspace(1) %out 137 ret void 138} 139 140define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { 141; SI-LABEL: shl_i16: 142; SI: ; %bb.0: 143; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 144; SI-NEXT: s_mov_b32 s7, 0xf000 145; SI-NEXT: s_mov_b32 s6, -1 146; SI-NEXT: s_mov_b32 s10, s6 147; SI-NEXT: s_mov_b32 s11, s7 148; SI-NEXT: s_waitcnt lgkmcnt(0) 149; SI-NEXT: s_mov_b32 s8, s2 150; SI-NEXT: s_mov_b32 s9, s3 151; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 152; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 153; SI-NEXT: s_mov_b32 s4, s0 154; SI-NEXT: s_mov_b32 s5, s1 155; SI-NEXT: s_waitcnt vmcnt(0) 156; SI-NEXT: v_lshl_b32_e32 v0, v0, v1 157; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 158; SI-NEXT: s_endpgm 159; 160; VI-LABEL: shl_i16: 161; VI: ; %bb.0: 162; VI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 163; VI-NEXT: s_mov_b32 s7, 0xf000 164; VI-NEXT: s_mov_b32 s6, -1 165; VI-NEXT: s_mov_b32 s10, s6 166; VI-NEXT: s_mov_b32 s11, s7 167; VI-NEXT: s_waitcnt lgkmcnt(0) 168; VI-NEXT: s_mov_b32 s8, s2 169; VI-NEXT: s_mov_b32 s9, s3 170; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 171; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2 172; VI-NEXT: s_mov_b32 s4, s0 173; VI-NEXT: s_mov_b32 s5, s1 174; VI-NEXT: s_waitcnt vmcnt(0) 175; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 176; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 177; VI-NEXT: s_endpgm 178; 179; EG-LABEL: shl_i16: 180; EG: ; %bb.0: 181; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 182; EG-NEXT: TEX 1 @6 183; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] 184; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 185; EG-NEXT: CF_END 186; EG-NEXT: PAD 187; EG-NEXT: Fetch clause starting at 6: 188; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 189; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 190; EG-NEXT: ALU clause starting at 10: 191; EG-NEXT: MOV * T0.X, KC0[2].Z, 192; EG-NEXT: ALU clause starting at 11: 193; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 194; EG-NEXT: LSHL * T1.W, T0.X, T1.X, 195; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 196; EG-NEXT: AND_INT T1.W, PS, literal.x, 197; EG-NEXT: LSHL * T0.W, PV.W, literal.y, 198; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 199; EG-NEXT: LSHL T0.X, PV.W, PS, 200; EG-NEXT: LSHL * T0.W, literal.x, PS, 201; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 202; EG-NEXT: MOV T0.Y, 0.0, 203; EG-NEXT: MOV * T0.Z, 0.0, 204; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 205; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 206 %b_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1 207 %a = load i16, ptr addrspace(1) %in 208 %b = load i16, ptr addrspace(1) %b_ptr 209 %result = shl i16 %a, %b 210 store i16 %result, ptr addrspace(1) %out 211 ret void 212} 213 214define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { 215; SI-LABEL: 
shl_i16_v_s: 216; SI: ; %bb.0: 217; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 218; SI-NEXT: s_load_dword s12, s[4:5], 0xd 219; SI-NEXT: s_mov_b32 s7, 0xf000 220; SI-NEXT: s_mov_b32 s6, -1 221; SI-NEXT: s_mov_b32 s10, s6 222; SI-NEXT: s_waitcnt lgkmcnt(0) 223; SI-NEXT: s_mov_b32 s8, s2 224; SI-NEXT: s_mov_b32 s9, s3 225; SI-NEXT: s_mov_b32 s11, s7 226; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 227; SI-NEXT: s_mov_b32 s4, s0 228; SI-NEXT: s_mov_b32 s5, s1 229; SI-NEXT: s_waitcnt vmcnt(0) 230; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 231; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 232; SI-NEXT: s_endpgm 233; 234; VI-LABEL: shl_i16_v_s: 235; VI: ; %bb.0: 236; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 237; VI-NEXT: s_load_dword s12, s[4:5], 0x34 238; VI-NEXT: s_mov_b32 s7, 0xf000 239; VI-NEXT: s_mov_b32 s6, -1 240; VI-NEXT: s_mov_b32 s10, s6 241; VI-NEXT: s_waitcnt lgkmcnt(0) 242; VI-NEXT: s_mov_b32 s8, s2 243; VI-NEXT: s_mov_b32 s9, s3 244; VI-NEXT: s_mov_b32 s11, s7 245; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 246; VI-NEXT: s_mov_b32 s4, s0 247; VI-NEXT: s_mov_b32 s5, s1 248; VI-NEXT: s_waitcnt vmcnt(0) 249; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 250; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 251; VI-NEXT: s_endpgm 252; 253; EG-LABEL: shl_i16_v_s: 254; EG: ; %bb.0: 255; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] 256; EG-NEXT: TEX 1 @6 257; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[] 258; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 259; EG-NEXT: CF_END 260; EG-NEXT: PAD 261; EG-NEXT: Fetch clause starting at 6: 262; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 263; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 264; EG-NEXT: ALU clause starting at 10: 265; EG-NEXT: MOV T0.X, 0.0, 266; EG-NEXT: MOV * T1.X, KC0[2].Z, 267; EG-NEXT: ALU clause starting at 12: 268; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, 269; EG-NEXT: LSHL * T1.W, T1.X, T0.X, 270; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 271; EG-NEXT: AND_INT T1.W, PS, literal.x, 272; EG-NEXT: LSHL * T0.W, PV.W, 
literal.y, 273; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 274; EG-NEXT: LSHL T0.X, PV.W, PS, 275; EG-NEXT: LSHL * T0.W, literal.x, PS, 276; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 277; EG-NEXT: MOV T0.Y, 0.0, 278; EG-NEXT: MOV * T0.Z, 0.0, 279; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 280; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 281 %a = load i16, ptr addrspace(1) %in 282 %result = shl i16 %a, %b 283 store i16 %result, ptr addrspace(1) %out 284 ret void 285} 286 287define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) { 288; SI-LABEL: shl_i16_v_compute_s: 289; SI: ; %bb.0: 290; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 291; SI-NEXT: s_load_dword s12, s[4:5], 0xd 292; SI-NEXT: s_mov_b32 s7, 0xf000 293; SI-NEXT: s_mov_b32 s6, -1 294; SI-NEXT: s_mov_b32 s10, s6 295; SI-NEXT: s_waitcnt lgkmcnt(0) 296; SI-NEXT: s_mov_b32 s8, s2 297; SI-NEXT: s_mov_b32 s9, s3 298; SI-NEXT: s_mov_b32 s11, s7 299; SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 300; SI-NEXT: s_add_i32 s12, s12, 3 301; SI-NEXT: s_mov_b32 s4, s0 302; SI-NEXT: s_mov_b32 s5, s1 303; SI-NEXT: s_waitcnt vmcnt(0) 304; SI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 305; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 306; SI-NEXT: s_endpgm 307; 308; VI-LABEL: shl_i16_v_compute_s: 309; VI: ; %bb.0: 310; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 311; VI-NEXT: s_load_dword s12, s[4:5], 0x34 312; VI-NEXT: s_mov_b32 s7, 0xf000 313; VI-NEXT: s_mov_b32 s6, -1 314; VI-NEXT: s_mov_b32 s10, s6 315; VI-NEXT: s_waitcnt lgkmcnt(0) 316; VI-NEXT: s_mov_b32 s8, s2 317; VI-NEXT: s_mov_b32 s9, s3 318; VI-NEXT: s_mov_b32 s11, s7 319; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 320; VI-NEXT: s_add_i32 s12, s12, 3 321; VI-NEXT: s_mov_b32 s4, s0 322; VI-NEXT: s_mov_b32 s5, s1 323; VI-NEXT: s_waitcnt vmcnt(0) 324; VI-NEXT: v_lshlrev_b32_e32 v0, s12, v0 325; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 326; VI-NEXT: s_endpgm 327; 328; EG-LABEL: shl_i16_v_compute_s: 329; EG: ; 
%bb.0: 330; EG-NEXT: ALU 0, @12, KC0[], KC1[] 331; EG-NEXT: TEX 0 @8 332; EG-NEXT: ALU 0, @13, KC0[CB0:0-32], KC1[] 333; EG-NEXT: TEX 0 @10 334; EG-NEXT: ALU 15, @14, KC0[CB0:0-32], KC1[] 335; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 336; EG-NEXT: CF_END 337; EG-NEXT: PAD 338; EG-NEXT: Fetch clause starting at 8: 339; EG-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 340; EG-NEXT: Fetch clause starting at 10: 341; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 342; EG-NEXT: ALU clause starting at 12: 343; EG-NEXT: MOV * T0.X, 0.0, 344; EG-NEXT: ALU clause starting at 13: 345; EG-NEXT: MOV * T1.X, KC0[2].Z, 346; EG-NEXT: ALU clause starting at 14: 347; EG-NEXT: ADD_INT * T0.W, T0.X, literal.x, 348; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 349; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 350; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 351; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 352; EG-NEXT: LSHL * T0.W, T1.X, PV.W, 353; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 354; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 355; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 356; EG-NEXT: LSHL T0.X, PV.W, PS, 357; EG-NEXT: LSHL * T0.W, literal.x, PS, 358; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 359; EG-NEXT: MOV T0.Y, 0.0, 360; EG-NEXT: MOV * T0.Z, 0.0, 361; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 362; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 363 %a = load i16, ptr addrspace(1) %in 364 %b.add = add i16 %b, 3 365 %result = shl i16 %a, %b.add 366 store i16 %result, ptr addrspace(1) %out 367 ret void 368} 369 370define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr addrspace(1) %in) { 371; SI-LABEL: shl_i16_computed_amount: 372; SI: ; %bb.0: 373; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 374; SI-NEXT: s_mov_b32 s7, 0xf000 375; SI-NEXT: s_mov_b32 s6, -1 376; SI-NEXT: s_mov_b32 s10, s6 377; SI-NEXT: s_mov_b32 s11, s7 378; SI-NEXT: s_waitcnt lgkmcnt(0) 379; SI-NEXT: s_mov_b32 s8, s2 380; SI-NEXT: s_mov_b32 s9, s3 381; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 382; SI-NEXT: 
v_mov_b32_e32 v1, 0 383; SI-NEXT: s_mov_b32 s14, 0 384; SI-NEXT: s_mov_b32 s15, s7 385; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 386; SI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc 387; SI-NEXT: s_waitcnt vmcnt(0) 388; SI-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc 389; SI-NEXT: s_waitcnt vmcnt(0) 390; SI-NEXT: s_mov_b32 s4, s0 391; SI-NEXT: s_mov_b32 s5, s1 392; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 393; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 394; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 395; SI-NEXT: s_endpgm 396; 397; VI-LABEL: shl_i16_computed_amount: 398; VI: ; %bb.0: 399; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 400; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 401; VI-NEXT: s_mov_b32 s7, 0xf000 402; VI-NEXT: s_mov_b32 s6, -1 403; VI-NEXT: s_mov_b32 s10, s6 404; VI-NEXT: s_waitcnt lgkmcnt(0) 405; VI-NEXT: v_mov_b32_e32 v1, s3 406; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 407; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 408; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 409; VI-NEXT: s_mov_b32 s8, s2 410; VI-NEXT: s_mov_b32 s9, s3 411; VI-NEXT: s_mov_b32 s11, s7 412; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 413; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc 414; VI-NEXT: s_waitcnt vmcnt(0) 415; VI-NEXT: flat_load_ushort v0, v[0:1] glc 416; VI-NEXT: s_waitcnt vmcnt(0) 417; VI-NEXT: s_mov_b32 s4, s0 418; VI-NEXT: s_mov_b32 s5, s1 419; VI-NEXT: v_add_u16_e32 v0, 3, v0 420; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2 421; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 422; VI-NEXT: s_endpgm 423; 424; EG-LABEL: shl_i16_computed_amount: 425; EG: ; %bb.0: 426; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] 427; EG-NEXT: TEX 0 @8 428; EG-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[] 429; EG-NEXT: TEX 0 @10 430; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] 431; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 432; EG-NEXT: CF_END 433; EG-NEXT: PAD 434; EG-NEXT: Fetch clause starting at 8: 435; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1 436; EG-NEXT: Fetch clause starting at 10: 
437; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 438; EG-NEXT: ALU clause starting at 12: 439; EG-NEXT: MOV * T1.X, KC0[2].Z, 440; EG-NEXT: ALU clause starting at 13: 441; EG-NEXT: LSHL * T0.W, T0.X, 1, 442; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 443; EG-NEXT: ALU clause starting at 15: 444; EG-NEXT: ADD_INT * T0.W, T0.X, literal.x, 445; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 446; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 447; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 448; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 449; EG-NEXT: LSHL * T0.W, T1.X, PV.W, 450; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 451; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 452; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) 453; EG-NEXT: LSHL T0.X, PV.W, PS, 454; EG-NEXT: LSHL * T0.W, literal.x, PS, 455; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 456; EG-NEXT: MOV T0.Y, 0.0, 457; EG-NEXT: MOV * T0.Z, 0.0, 458; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 459; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 460 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 461 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i32 %tid 462 %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid 463 %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i16 1 464 %a = load volatile i16, ptr addrspace(1) %in 465 %b = load volatile i16, ptr addrspace(1) %b_ptr 466 %b.add = add i16 %b, 3 467 %result = shl i16 %a, %b.add 468 store i16 %result, ptr addrspace(1) %out 469 ret void 470} 471 472define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) { 473; SI-LABEL: shl_i16_i_s: 474; SI: ; %bb.0: 475; SI-NEXT: s_load_dword s6, s[4:5], 0xb 476; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 477; SI-NEXT: s_mov_b32 s3, 0xf000 478; SI-NEXT: s_mov_b32 s2, -1 479; SI-NEXT: s_waitcnt lgkmcnt(0) 480; SI-NEXT: s_lshl_b32 s4, s6, 12 481; SI-NEXT: v_mov_b32_e32 v0, s4 482; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 483; SI-NEXT: s_endpgm 484; 485; VI-LABEL: shl_i16_i_s: 486; VI: ; %bb.0: 487; 
VI-NEXT: s_load_dword s6, s[4:5], 0x2c 488; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 489; VI-NEXT: s_mov_b32 s3, 0xf000 490; VI-NEXT: s_mov_b32 s2, -1 491; VI-NEXT: s_waitcnt lgkmcnt(0) 492; VI-NEXT: s_lshl_b32 s4, s6, 12 493; VI-NEXT: v_mov_b32_e32 v0, s4 494; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 495; VI-NEXT: s_endpgm 496; 497; EG-LABEL: shl_i16_i_s: 498; EG: ; %bb.0: 499; EG-NEXT: ALU 0, @8, KC0[], KC1[] 500; EG-NEXT: TEX 0 @6 501; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] 502; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X 503; EG-NEXT: CF_END 504; EG-NEXT: PAD 505; EG-NEXT: Fetch clause starting at 6: 506; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 507; EG-NEXT: ALU clause starting at 8: 508; EG-NEXT: MOV * T0.X, 0.0, 509; EG-NEXT: ALU clause starting at 9: 510; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, 511; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, 512; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45) 513; EG-NEXT: LSHL * T0.W, PV.W, literal.x, 514; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00) 515; EG-NEXT: AND_INT T0.W, PV.W, literal.x, 516; EG-NEXT: LSHL * T1.W, T1.W, literal.y, 517; EG-NEXT: 61440(8.609578e-41), 3(4.203895e-45) 518; EG-NEXT: LSHL T0.X, PV.W, PS, 519; EG-NEXT: LSHL * T0.W, literal.x, PS, 520; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 521; EG-NEXT: MOV T0.Y, 0.0, 522; EG-NEXT: MOV * T0.Z, 0.0, 523; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 524; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 525 %result = shl i16 %a, 12 526 store i16 %result, ptr addrspace(1) %out 527 ret void 528} 529 530define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { 531; SI-LABEL: shl_v2i16: 532; SI: ; %bb.0: 533; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 534; SI-NEXT: s_mov_b32 s7, 0xf000 535; SI-NEXT: s_mov_b32 s6, -1 536; SI-NEXT: s_mov_b32 s10, s6 537; SI-NEXT: s_mov_b32 s11, s7 538; SI-NEXT: s_waitcnt lgkmcnt(0) 539; SI-NEXT: s_mov_b32 s8, s2 540; SI-NEXT: s_mov_b32 s9, s3 541; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 542; SI-NEXT: 
v_mov_b32_e32 v1, 0 543; SI-NEXT: s_mov_b32 s14, 0 544; SI-NEXT: s_mov_b32 s15, s7 545; SI-NEXT: s_mov_b64 s[12:13], s[2:3] 546; SI-NEXT: buffer_load_dword v2, off, s[8:11], 0 547; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4 548; SI-NEXT: s_mov_b32 s4, s0 549; SI-NEXT: s_mov_b32 s5, s1 550; SI-NEXT: s_waitcnt vmcnt(1) 551; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 552; SI-NEXT: s_waitcnt vmcnt(0) 553; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 554; SI-NEXT: v_lshlrev_b32_e32 v0, v0, v2 555; SI-NEXT: v_lshlrev_b32_e32 v1, v3, v1 556; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 557; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 558; SI-NEXT: v_or_b32_e32 v0, v0, v1 559; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 560; SI-NEXT: s_endpgm 561; 562; VI-LABEL: shl_v2i16: 563; VI: ; %bb.0: 564; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 565; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 566; VI-NEXT: s_waitcnt lgkmcnt(0) 567; VI-NEXT: v_mov_b32_e32 v1, s3 568; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 569; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 570; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 571; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 572; VI-NEXT: flat_load_dword v0, v[0:1] 573; VI-NEXT: s_load_dword s4, s[2:3], 0x0 574; VI-NEXT: s_mov_b32 s3, 0xf000 575; VI-NEXT: s_mov_b32 s2, -1 576; VI-NEXT: s_waitcnt lgkmcnt(0) 577; VI-NEXT: s_lshr_b32 s5, s4, 16 578; VI-NEXT: v_mov_b32_e32 v1, s5 579; VI-NEXT: s_waitcnt vmcnt(0) 580; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4 581; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 582; VI-NEXT: v_or_b32_e32 v0, v2, v0 583; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 584; VI-NEXT: s_endpgm 585; 586; EG-LABEL: shl_v2i16: 587; EG: ; %bb.0: 588; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[] 589; EG-NEXT: TEX 0 @8 590; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[] 591; EG-NEXT: TEX 0 @10 592; EG-NEXT: ALU 11, @16, KC0[CB0:0-32], KC1[] 593; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1 594; 
EG-NEXT: CF_END 595; EG-NEXT: PAD 596; EG-NEXT: Fetch clause starting at 8: 597; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1 598; EG-NEXT: Fetch clause starting at 10: 599; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 600; EG-NEXT: ALU clause starting at 12: 601; EG-NEXT: LSHL * T0.W, T0.X, literal.x, 602; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 603; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 604; EG-NEXT: ALU clause starting at 15: 605; EG-NEXT: MOV * T7.X, KC0[2].Z, 606; EG-NEXT: ALU clause starting at 16: 607; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, 608; EG-NEXT: LSHR T0.W, T0.X, literal.y, 609; EG-NEXT: LSHR * T1.W, T7.X, literal.y, 610; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 611; EG-NEXT: LSHL T0.W, PS, PV.W, 612; EG-NEXT: LSHL * T1.W, T7.X, PV.Z, 613; EG-NEXT: AND_INT T1.W, PS, literal.x, 614; EG-NEXT: LSHL * T0.W, PV.W, literal.y, 615; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 616; EG-NEXT: OR_INT T0.X, PV.W, PS, 617; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, 618; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 619 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 620 %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i32 %tid 621 %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid 622 %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1 623 %a = load <2 x i16>, ptr addrspace(1) %in 624 %b = load <2 x i16>, ptr addrspace(1) %b_ptr 625 %result = shl <2 x i16> %a, %b 626 store <2 x i16> %result, ptr addrspace(1) %out 627 ret void 628} 629 630define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) { 631; SI-LABEL: shl_v4i16: 632; SI: ; %bb.0: 633; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 634; SI-NEXT: s_mov_b32 s7, 0xf000 635; SI-NEXT: s_mov_b32 s6, 0 636; SI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 637; SI-NEXT: v_mov_b32_e32 v5, 0 638; SI-NEXT: s_waitcnt lgkmcnt(0) 639; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 640; SI-NEXT: buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64 641; SI-NEXT: s_mov_b64 
s[2:3], s[6:7] 642; SI-NEXT: s_waitcnt vmcnt(0) 643; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 644; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 645; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 646; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 647; SI-NEXT: v_lshlrev_b32_e32 v1, v3, v1 648; SI-NEXT: v_lshlrev_b32_e32 v0, v2, v0 649; SI-NEXT: v_lshlrev_b32_e32 v2, v9, v7 650; SI-NEXT: v_lshlrev_b32_e32 v3, v8, v6 651; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 652; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 653; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 654; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 655; SI-NEXT: v_or_b32_e32 v1, v1, v2 656; SI-NEXT: v_or_b32_e32 v0, v0, v3 657; SI-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64 658; SI-NEXT: s_endpgm 659; 660; VI-LABEL: shl_v4i16: 661; VI: ; %bb.0: 662; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 663; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 664; VI-NEXT: s_waitcnt lgkmcnt(0) 665; VI-NEXT: v_mov_b32_e32 v1, s3 666; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 667; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 668; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 669; VI-NEXT: v_mov_b32_e32 v5, s1 670; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 671; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc 672; VI-NEXT: s_waitcnt vmcnt(0) 673; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 674; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 675; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 676; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 677; VI-NEXT: v_or_b32_e32 v1, v6, v1 678; VI-NEXT: v_or_b32_e32 v0, v3, v0 679; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] 680; VI-NEXT: s_endpgm 681; 682; EG-LABEL: shl_v4i16: 683; EG: ; %bb.0: 684; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] 685; EG-NEXT: TEX 0 @6 686; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[] 687; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1 688; EG-NEXT: CF_END 689; EG-NEXT: PAD 690; EG-NEXT: Fetch clause 
starting at 6: 691; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 692; EG-NEXT: ALU clause starting at 8: 693; EG-NEXT: MOV T0.Y, T6.X, 694; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 695; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 696; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 697; EG-NEXT: ALU clause starting at 12: 698; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x, 699; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 700; EG-NEXT: LSHL * T1.W, T10.X, PV.W, 701; EG-NEXT: AND_INT T1.W, PV.W, literal.x, 702; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y, 703; EG-NEXT: 65535(9.183409e-41), -65536(nan) 704; EG-NEXT: OR_INT * T1.W, PS, PV.W, 705; EG-NEXT: MOV * T6.X, PV.W, 706; EG-NEXT: MOV T0.X, PV.X, 707; EG-NEXT: LSHR T1.W, T10.Z, literal.x, 708; EG-NEXT: LSHR * T2.W, T10.X, literal.x, 709; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 710; EG-NEXT: LSHL T1.W, PS, PV.W, 711; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, 712; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 713; EG-NEXT: LSHL * T1.W, PV.W, literal.x, 714; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 715; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 716; EG-NEXT: MOV T6.X, PV.W, 717; EG-NEXT: MOV * T0.X, T7.X, 718; EG-NEXT: AND_INT * T1.W, T10.W, literal.x, 719; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 720; EG-NEXT: LSHL T1.W, T10.Y, PV.W, 721; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, 722; EG-NEXT: -65536(nan), 0(0.000000e+00) 723; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, 724; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) 725; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, 726; EG-NEXT: MOV * T7.X, PV.W, 727; EG-NEXT: MOV T0.X, PV.X, 728; EG-NEXT: LSHR T1.W, T10.W, literal.x, 729; EG-NEXT: LSHR * T2.W, T10.Y, literal.x, 730; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 731; EG-NEXT: LSHL * T1.W, PS, PV.W, 732; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, 733; EG-NEXT: LSHL T1.W, PV.W, literal.y, 734; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 735; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) 736; EG-NEXT: LSHR T0.X, PS, literal.x, 
737; EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W, 738; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 739; EG-NEXT: MOV T7.X, PV.Y, 740; EG-NEXT: MOV * T10.X, T6.X, 741 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 742 %gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i32 %tid 743 %gep.out = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i32 %tid 744 %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1 745 %a = load <4 x i16>, ptr addrspace(1) %gep 746 %b = load <4 x i16>, ptr addrspace(1) %b_ptr 747 %result = shl <4 x i16> %a, %b 748 store <4 x i16> %result, ptr addrspace(1) %gep.out 749 ret void 750} 751 752define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { 753; SI-LABEL: shl_i64: 754; SI: ; %bb.0: 755; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 756; SI-NEXT: s_mov_b32 s7, 0xf000 757; SI-NEXT: s_mov_b32 s6, -1 758; SI-NEXT: s_mov_b32 s10, s6 759; SI-NEXT: s_mov_b32 s11, s7 760; SI-NEXT: s_waitcnt lgkmcnt(0) 761; SI-NEXT: s_mov_b32 s8, s2 762; SI-NEXT: s_mov_b32 s9, s3 763; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 764; SI-NEXT: s_mov_b32 s4, s0 765; SI-NEXT: s_mov_b32 s5, s1 766; SI-NEXT: s_waitcnt vmcnt(0) 767; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 768; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 769; SI-NEXT: s_endpgm 770; 771; VI-LABEL: shl_i64: 772; VI: ; %bb.0: 773; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 774; VI-NEXT: s_waitcnt lgkmcnt(0) 775; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 776; VI-NEXT: s_mov_b32 s3, 0xf000 777; VI-NEXT: s_mov_b32 s2, -1 778; VI-NEXT: s_waitcnt lgkmcnt(0) 779; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 780; VI-NEXT: v_mov_b32_e32 v0, s4 781; VI-NEXT: v_mov_b32_e32 v1, s5 782; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 783; VI-NEXT: s_endpgm 784; 785; EG-LABEL: shl_i64: 786; EG: ; %bb.0: 787; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 788; EG-NEXT: TEX 0 @6 789; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 790; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW 
T0.XY, T1.X, 1 791; EG-NEXT: CF_END 792; EG-NEXT: PAD 793; EG-NEXT: Fetch clause starting at 6: 794; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 795; EG-NEXT: ALU clause starting at 8: 796; EG-NEXT: MOV * T0.X, KC0[2].Z, 797; EG-NEXT: ALU clause starting at 9: 798; EG-NEXT: LSHR T1.Y, T0.Y, 1, 799; EG-NEXT: NOT_INT T1.Z, T0.Z, 800; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, 801; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, 802; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 803; EG-NEXT: LSHL T2.Z, T0.X, PS, 804; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z, 805; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, 806; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 807; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, 808; EG-NEXT: CNDE_INT T0.X, T1.W, T2.Z, 0.0, 809; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 810; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 811 %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1 812 %a = load i64, ptr addrspace(1) %in 813 %b = load i64, ptr addrspace(1) %b_ptr 814 %result = shl i64 %a, %b 815 store i64 %result, ptr addrspace(1) %out 816 ret void 817} 818 819define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { 820; SI-LABEL: shl_v2i64: 821; SI: ; %bb.0: 822; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 823; SI-NEXT: s_mov_b32 s7, 0xf000 824; SI-NEXT: s_mov_b32 s6, -1 825; SI-NEXT: s_mov_b32 s10, s6 826; SI-NEXT: s_mov_b32 s11, s7 827; SI-NEXT: s_waitcnt lgkmcnt(0) 828; SI-NEXT: s_mov_b32 s8, s2 829; SI-NEXT: s_mov_b32 s9, s3 830; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 831; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 832; SI-NEXT: s_mov_b32 s4, s0 833; SI-NEXT: s_mov_b32 s5, s1 834; SI-NEXT: s_waitcnt vmcnt(0) 835; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 836; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 837; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 838; SI-NEXT: s_endpgm 839; 840; VI-LABEL: shl_v2i64: 841; VI: ; %bb.0: 842; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 843; VI-NEXT: 
s_waitcnt lgkmcnt(0) 844; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 845; VI-NEXT: s_mov_b32 s11, 0xf000 846; VI-NEXT: s_mov_b32 s10, -1 847; VI-NEXT: s_waitcnt lgkmcnt(0) 848; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s6 849; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 850; VI-NEXT: v_mov_b32_e32 v0, s0 851; VI-NEXT: v_mov_b32_e32 v1, s1 852; VI-NEXT: v_mov_b32_e32 v2, s2 853; VI-NEXT: v_mov_b32_e32 v3, s3 854; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 855; VI-NEXT: s_endpgm 856; 857; EG-LABEL: shl_v2i64: 858; EG: ; %bb.0: 859; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] 860; EG-NEXT: TEX 1 @6 861; EG-NEXT: ALU 23, @11, KC0[CB0:0-32], KC1[] 862; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1 863; EG-NEXT: CF_END 864; EG-NEXT: PAD 865; EG-NEXT: Fetch clause starting at 6: 866; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 867; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 868; EG-NEXT: ALU clause starting at 10: 869; EG-NEXT: MOV * T0.X, KC0[2].Z, 870; EG-NEXT: ALU clause starting at 11: 871; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, 872; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 873; EG-NEXT: LSHL T2.X, T0.Z, PV.W, 874; EG-NEXT: AND_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212 875; EG-NEXT: LSHR T2.Z, T0.W, 1, 876; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, BS:VEC_102/SCL_221 877; EG-NEXT: NOT_INT * T1.W, T1.Z, 878; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 879; EG-NEXT: BIT_ALIGN_INT T3.X, PV.Z, PV.W, PS, 880; EG-NEXT: LSHR T2.Y, T0.Y, 1, 881; EG-NEXT: NOT_INT T0.Z, T1.X, 882; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, 883; EG-NEXT: AND_INT * T1.W, T1.X, literal.x, 884; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 885; EG-NEXT: LSHL T0.Y, T0.X, PS, BS:VEC_120/SCL_212 886; EG-NEXT: AND_INT T1.Z, T1.X, literal.x, BS:VEC_201 887; EG-NEXT: BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z, 888; EG-NEXT: CNDE_INT * T2.W, T1.Y, PV.X, T2.X, 889; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 890; EG-NEXT: CNDE_INT T2.Y, PV.Z, PV.W, PV.Y, 891; EG-NEXT: CNDE_INT * T2.Z, T1.Y, T2.X, 
0.0, 892; EG-NEXT: CNDE_INT T2.X, T1.Z, T0.Y, 0.0, 893; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 894; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 895 %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1 896 %a = load <2 x i64>, ptr addrspace(1) %in 897 %b = load <2 x i64>, ptr addrspace(1) %b_ptr 898 %result = shl <2 x i64> %a, %b 899 store <2 x i64> %result, ptr addrspace(1) %out 900 ret void 901} 902 903define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { 904; SI-LABEL: shl_v4i64: 905; SI: ; %bb.0: 906; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 907; SI-NEXT: s_mov_b32 s3, 0xf000 908; SI-NEXT: s_mov_b32 s2, -1 909; SI-NEXT: s_mov_b32 s10, s2 910; SI-NEXT: s_mov_b32 s11, s3 911; SI-NEXT: s_waitcnt lgkmcnt(0) 912; SI-NEXT: s_mov_b32 s8, s6 913; SI-NEXT: s_mov_b32 s9, s7 914; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 915; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48 916; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 917; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32 918; SI-NEXT: s_mov_b32 s0, s4 919; SI-NEXT: s_mov_b32 s1, s5 920; SI-NEXT: s_waitcnt vmcnt(2) 921; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v6 922; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 923; SI-NEXT: s_waitcnt vmcnt(0) 924; SI-NEXT: v_lshl_b64 v[9:10], v[9:10], v13 925; SI-NEXT: v_lshl_b64 v[7:8], v[7:8], v11 926; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 927; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 928; SI-NEXT: s_endpgm 929; 930; VI-LABEL: shl_v4i64: 931; VI: ; %bb.0: 932; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 933; VI-NEXT: s_waitcnt lgkmcnt(0) 934; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 935; VI-NEXT: s_mov_b32 s19, 0xf000 936; VI-NEXT: s_mov_b32 s18, -1 937; VI-NEXT: s_waitcnt lgkmcnt(0) 938; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s14 939; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s12 940; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 941; VI-NEXT: s_lshl_b64 
s[0:1], s[0:1], s8 942; VI-NEXT: v_mov_b32_e32 v0, s4 943; VI-NEXT: v_mov_b32_e32 v1, s5 944; VI-NEXT: v_mov_b32_e32 v2, s6 945; VI-NEXT: v_mov_b32_e32 v3, s7 946; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 947; VI-NEXT: s_nop 0 948; VI-NEXT: v_mov_b32_e32 v0, s0 949; VI-NEXT: v_mov_b32_e32 v1, s1 950; VI-NEXT: v_mov_b32_e32 v2, s2 951; VI-NEXT: v_mov_b32_e32 v3, s3 952; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 953; VI-NEXT: s_endpgm 954; 955; EG-LABEL: shl_v4i64: 956; EG: ; %bb.0: 957; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] 958; EG-NEXT: TEX 3 @6 959; EG-NEXT: ALU 48, @15, KC0[CB0:0-32], KC1[] 960; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0 961; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1 962; EG-NEXT: CF_END 963; EG-NEXT: Fetch clause starting at 6: 964; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1 965; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 48, #1 966; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 16, #1 967; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 968; EG-NEXT: ALU clause starting at 14: 969; EG-NEXT: MOV * T0.X, KC0[2].Z, 970; EG-NEXT: ALU clause starting at 15: 971; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x, 972; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 973; EG-NEXT: LSHL * T1.W, T0.Z, PV.W, 974; EG-NEXT: AND_INT T4.X, T1.Z, literal.x, 975; EG-NEXT: LSHR T1.Y, T3.W, 1, 976; EG-NEXT: NOT_INT T4.Z, T2.Z, BS:VEC_201 977; EG-NEXT: BIT_ALIGN_INT T2.W, T3.W, T3.Z, 1, 978; EG-NEXT: AND_INT * T3.W, T2.Z, literal.y, 979; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44) 980; EG-NEXT: LSHL T5.X, T3.Z, PS, 981; EG-NEXT: AND_INT T2.Y, T2.Z, literal.x, BS:VEC_120/SCL_212 982; EG-NEXT: BIT_ALIGN_INT T2.Z, PV.Y, PV.W, PV.Z, 983; EG-NEXT: LSHR T2.W, T3.Y, 1, 984; EG-NEXT: NOT_INT * T3.W, T2.X, 985; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 986; EG-NEXT: BIT_ALIGN_INT T6.X, T3.Y, T3.X, 1, 987; EG-NEXT: AND_INT T1.Y, T2.X, literal.x, 988; EG-NEXT: LSHR T3.Z, T0.W, 1, 989; EG-NEXT: BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, 990; EG-NEXT: 
NOT_INT * T4.W, T1.Z, 991; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 992; EG-NEXT: BIT_ALIGN_INT T7.X, PV.Z, PV.W, PS, 993; EG-NEXT: LSHL T1.Y, T3.X, PV.Y, BS:VEC_120/SCL_212 994; EG-NEXT: AND_INT T0.Z, T2.X, literal.x, BS:VEC_201 995; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, PV.X, T3.W, 996; EG-NEXT: CNDE_INT * T3.W, T2.Y, T2.Z, T5.X, 997; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 998; EG-NEXT: LSHR T2.X, T0.Y, 1, 999; EG-NEXT: CNDE_INT T3.Y, PV.Z, PV.W, PV.Y, 1000; EG-NEXT: NOT_INT T1.Z, T1.X, 1001; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1, 1002; EG-NEXT: AND_INT * T2.W, T1.X, literal.x, 1003; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1004; EG-NEXT: LSHL T0.X, T0.X, PS, 1005; EG-NEXT: AND_INT T0.Y, T1.X, literal.x, BS:VEC_120/SCL_212 1006; EG-NEXT: CNDE_INT T3.Z, T2.Y, T5.X, 0.0, BS:VEC_021/SCL_122 1007; EG-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.W, PV.Z, 1008; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1009; EG-NEXT: CNDE_INT * T2.W, T4.X, T7.X, T1.W, 1010; EG-NEXT: CNDE_INT T3.X, T0.Z, T1.Y, 0.0, 1011; EG-NEXT: CNDE_INT T2.Y, T0.Y, T0.W, T0.X, 1012; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, 1013; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) 1014; EG-NEXT: LSHR T1.X, PV.W, literal.x, 1015; EG-NEXT: CNDE_INT T2.Z, T4.X, T1.W, 0.0, 1016; EG-NEXT: CNDE_INT * T2.X, T0.Y, T0.X, 0.0, 1017; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1018; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, 1019; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1020 %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1 1021 %a = load <4 x i64>, ptr addrspace(1) %in 1022 %b = load <4 x i64>, ptr addrspace(1) %b_ptr 1023 %result = shl <4 x i64> %a, %b 1024 store <4 x i64> %result, ptr addrspace(1) %out 1025 ret void 1026} 1027 1028; Make sure the i64 kernel-argument load is narrowed to a single i32 (dword) load: a shl by 32 only needs the low half of %a. 
1029define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { 1030; SI-LABEL: s_shl_32_i64: 1031; SI: ; %bb.0: 1032; SI-NEXT: s_load_dword s6, s[4:5], 0x13 1033; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1034; SI-NEXT: s_mov_b32 s3, 0xf000 1035; SI-NEXT: s_mov_b32 s2, -1 1036; SI-NEXT: v_mov_b32_e32 v0, 0 1037; SI-NEXT: s_waitcnt lgkmcnt(0) 1038; SI-NEXT: v_mov_b32_e32 v1, s6 1039; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1040; SI-NEXT: s_endpgm 1041; 1042; VI-LABEL: s_shl_32_i64: 1043; VI: ; %bb.0: 1044; VI-NEXT: s_load_dword s6, s[4:5], 0x4c 1045; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1046; VI-NEXT: s_mov_b32 s3, 0xf000 1047; VI-NEXT: s_mov_b32 s2, -1 1048; VI-NEXT: v_mov_b32_e32 v0, 0 1049; VI-NEXT: s_waitcnt lgkmcnt(0) 1050; VI-NEXT: v_mov_b32_e32 v1, s6 1051; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1052; VI-NEXT: s_endpgm 1053; 1054; EG-LABEL: s_shl_32_i64: 1055; EG: ; %bb.0: 1056; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] 1057; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1058; EG-NEXT: CF_END 1059; EG-NEXT: PAD 1060; EG-NEXT: ALU clause starting at 4: 1061; EG-NEXT: MOV * T0.Y, KC0[4].W, 1062; EG-NEXT: MOV T0.X, 0.0, 1063; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1064; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1065 %result = shl i64 %a, 32 1066 store i64 %result, ptr addrspace(1) %out 1067 ret void 1068} 1069 1070define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { 1071; SI-LABEL: v_shl_32_i64: 1072; SI: ; %bb.0: 1073; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1074; SI-NEXT: s_ashr_i32 s9, s8, 31 1075; SI-NEXT: s_mov_b32 s7, 0xf000 1076; SI-NEXT: s_mov_b32 s6, 0 1077; SI-NEXT: v_mov_b32_e32 v2, 0 1078; SI-NEXT: s_waitcnt lgkmcnt(0) 1079; SI-NEXT: s_mov_b64 s[4:5], s[2:3] 1080; SI-NEXT: s_lshl_b64 s[2:3], s[8:9], 3 1081; SI-NEXT: v_mov_b32_e32 v0, s2 1082; SI-NEXT: v_mov_b32_e32 v1, s3 1083; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 1084; 
SI-NEXT: s_mov_b64 s[2:3], s[6:7] 1085; SI-NEXT: s_waitcnt vmcnt(0) 1086; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 1087; SI-NEXT: s_endpgm 1088; 1089; VI-LABEL: v_shl_32_i64: 1090; VI: ; %bb.0: 1091; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1092; VI-NEXT: s_ashr_i32 s9, s8, 31 1093; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], 3 1094; VI-NEXT: v_mov_b32_e32 v0, 0 1095; VI-NEXT: s_waitcnt lgkmcnt(0) 1096; VI-NEXT: s_add_u32 s2, s2, s4 1097; VI-NEXT: s_addc_u32 s3, s3, s5 1098; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1099; VI-NEXT: s_add_u32 s0, s0, s4 1100; VI-NEXT: s_addc_u32 s1, s1, s5 1101; VI-NEXT: v_mov_b32_e32 v3, s1 1102; VI-NEXT: v_mov_b32_e32 v2, s0 1103; VI-NEXT: s_waitcnt lgkmcnt(0) 1104; VI-NEXT: v_mov_b32_e32 v1, s2 1105; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1106; VI-NEXT: s_endpgm 1107; 1108; EG-LABEL: v_shl_32_i64: 1109; EG: ; %bb.0: 1110; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] 1111; EG-NEXT: TEX 0 @6 1112; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] 1113; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1 1114; EG-NEXT: CF_END 1115; EG-NEXT: PAD 1116; EG-NEXT: Fetch clause starting at 6: 1117; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1118; EG-NEXT: ALU clause starting at 8: 1119; EG-NEXT: LSHL * T0.W, T1.X, literal.x, 1120; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) 1121; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, 1122; EG-NEXT: ALU clause starting at 11: 1123; EG-NEXT: MOV T1.X, 0.0, 1124; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, 1125; EG-NEXT: LSHR T2.X, PV.W, literal.x, 1126; EG-NEXT: MOV * T1.Y, T0.X, 1127; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1128 %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0 1129 %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid 1130 %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid 1131 %a = load i64, ptr addrspace(1) %gep.in 1132 %result = shl i64 %a, 32 1133 store i64 %result, ptr addrspace(1) %gep.out 1134 ret void 1135} 1136 1137define amdgpu_kernel void 
@s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) { 1138; SI-LABEL: s_shl_constant_i64: 1139; SI: ; %bb.0: 1140; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1141; SI-NEXT: s_mov_b32 s6, -1 1142; SI-NEXT: s_mov_b32 s9, 0xffff 1143; SI-NEXT: s_mov_b32 s8, s6 1144; SI-NEXT: s_mov_b32 s7, 0xf000 1145; SI-NEXT: s_waitcnt lgkmcnt(0) 1146; SI-NEXT: s_mov_b32 s4, s0 1147; SI-NEXT: s_mov_b32 s5, s1 1148; SI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 1149; SI-NEXT: v_mov_b32_e32 v0, s0 1150; SI-NEXT: v_mov_b32_e32 v1, s1 1151; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1152; SI-NEXT: s_endpgm 1153; 1154; VI-LABEL: s_shl_constant_i64: 1155; VI: ; %bb.0: 1156; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1157; VI-NEXT: s_mov_b32 s6, -1 1158; VI-NEXT: s_mov_b32 s9, 0xffff 1159; VI-NEXT: s_mov_b32 s8, s6 1160; VI-NEXT: s_mov_b32 s7, 0xf000 1161; VI-NEXT: s_waitcnt lgkmcnt(0) 1162; VI-NEXT: s_mov_b32 s4, s0 1163; VI-NEXT: s_mov_b32 s5, s1 1164; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2 1165; VI-NEXT: v_mov_b32_e32 v0, s0 1166; VI-NEXT: v_mov_b32_e32 v1, s1 1167; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1168; VI-NEXT: s_endpgm 1169; 1170; EG-LABEL: s_shl_constant_i64: 1171; EG: ; %bb.0: 1172; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] 1173; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1174; EG-NEXT: CF_END 1175; EG-NEXT: PAD 1176; EG-NEXT: ALU clause starting at 4: 1177; EG-NEXT: MOV T0.Z, literal.x, 1178; EG-NEXT: NOT_INT T0.W, KC0[2].W, 1179; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1180; EG-NEXT: -1(nan), 31(4.344025e-44) 1181; EG-NEXT: LSHL T1.Z, literal.x, PS, 1182; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W, 1183; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, 1184; EG-NEXT: -1(nan), 32767(4.591635e-41) 1185; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1186; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, 1187; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, 1188; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1189; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1190 
%shl = shl i64 281474976710655, %a 1191 store i64 %shl, ptr addrspace(1) %out, align 8 1192 ret void 1193} 1194 1195define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { 1196; SI-LABEL: v_shl_constant_i64: 1197; SI: ; %bb.0: 1198; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1199; SI-NEXT: s_mov_b32 s7, 0xf000 1200; SI-NEXT: s_mov_b32 s6, -1 1201; SI-NEXT: s_mov_b32 s10, s6 1202; SI-NEXT: s_mov_b32 s11, s7 1203; SI-NEXT: s_waitcnt lgkmcnt(0) 1204; SI-NEXT: s_mov_b32 s8, s2 1205; SI-NEXT: s_mov_b32 s9, s3 1206; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1207; SI-NEXT: s_mov_b32 s2, 0xab19b207 1208; SI-NEXT: s_movk_i32 s3, 0x11e 1209; SI-NEXT: s_mov_b32 s4, s0 1210; SI-NEXT: s_mov_b32 s5, s1 1211; SI-NEXT: s_waitcnt vmcnt(0) 1212; SI-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 1213; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1214; SI-NEXT: s_endpgm 1215; 1216; VI-LABEL: v_shl_constant_i64: 1217; VI: ; %bb.0: 1218; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1219; VI-NEXT: s_mov_b32 s7, 0xf000 1220; VI-NEXT: s_mov_b32 s6, -1 1221; VI-NEXT: s_waitcnt lgkmcnt(0) 1222; VI-NEXT: s_load_dword s2, s[2:3], 0x0 1223; VI-NEXT: s_mov_b32 s4, s0 1224; VI-NEXT: s_mov_b32 s5, s1 1225; VI-NEXT: s_mov_b32 s0, 0xab19b207 1226; VI-NEXT: s_movk_i32 s1, 0x11e 1227; VI-NEXT: s_waitcnt lgkmcnt(0) 1228; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 1229; VI-NEXT: v_mov_b32_e32 v0, s0 1230; VI-NEXT: v_mov_b32_e32 v1, s1 1231; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1232; VI-NEXT: s_endpgm 1233; 1234; EG-LABEL: v_shl_constant_i64: 1235; EG: ; %bb.0: 1236; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1237; EG-NEXT: TEX 0 @6 1238; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[] 1239; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1240; EG-NEXT: CF_END 1241; EG-NEXT: PAD 1242; EG-NEXT: Fetch clause starting at 6: 1243; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1244; EG-NEXT: ALU clause starting at 8: 1245; EG-NEXT: MOV * T0.X, KC0[2].Z, 1246; 
EG-NEXT: ALU clause starting at 9: 1247; EG-NEXT: NOT_INT T0.Z, T0.X, 1248; EG-NEXT: MOV T0.W, literal.x, 1249; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, 1250; EG-NEXT: 1435293955(1.935796e+13), 31(4.344025e-44) 1251; EG-NEXT: LSHL T1.Z, literal.x, PS, 1252; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z, 1253; EG-NEXT: AND_INT * T1.W, T0.X, literal.z, 1254; EG-NEXT: -1424379385(-5.460358e-13), 143(2.003857e-43) 1255; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1256; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, 1257; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, 1258; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1259; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1260 %a = load i64, ptr addrspace(1) %aptr, align 8 1261 %shl = shl i64 1231231234567, %a 1262 store i64 %shl, ptr addrspace(1) %out, align 8 1263 ret void 1264} 1265 1266define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { 1267; SI-LABEL: v_shl_i64_32_bit_constant: 1268; SI: ; %bb.0: 1269; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1270; SI-NEXT: s_mov_b32 s7, 0xf000 1271; SI-NEXT: s_mov_b32 s6, -1 1272; SI-NEXT: s_mov_b32 s10, s6 1273; SI-NEXT: s_mov_b32 s11, s7 1274; SI-NEXT: s_waitcnt lgkmcnt(0) 1275; SI-NEXT: s_mov_b32 s8, s2 1276; SI-NEXT: s_mov_b32 s9, s3 1277; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1278; SI-NEXT: s_mov_b64 s[2:3], 0x12d687 1279; SI-NEXT: s_mov_b32 s4, s0 1280; SI-NEXT: s_mov_b32 s5, s1 1281; SI-NEXT: s_waitcnt vmcnt(0) 1282; SI-NEXT: v_lshl_b64 v[0:1], s[2:3], v0 1283; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1284; SI-NEXT: s_endpgm 1285; 1286; VI-LABEL: v_shl_i64_32_bit_constant: 1287; VI: ; %bb.0: 1288; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1289; VI-NEXT: s_waitcnt lgkmcnt(0) 1290; VI-NEXT: s_load_dword s4, s[2:3], 0x0 1291; VI-NEXT: s_mov_b32 s3, 0xf000 1292; VI-NEXT: s_mov_b32 s2, -1 1293; VI-NEXT: s_waitcnt lgkmcnt(0) 1294; VI-NEXT: s_lshl_b64 s[4:5], 0x12d687, s4 1295; VI-NEXT: v_mov_b32_e32 v0, s4 1296; 
VI-NEXT: v_mov_b32_e32 v1, s5 1297; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1298; VI-NEXT: s_endpgm 1299; 1300; EG-LABEL: v_shl_i64_32_bit_constant: 1301; EG: ; %bb.0: 1302; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1303; EG-NEXT: TEX 0 @6 1304; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[] 1305; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1306; EG-NEXT: CF_END 1307; EG-NEXT: PAD 1308; EG-NEXT: Fetch clause starting at 6: 1309; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1310; EG-NEXT: ALU clause starting at 8: 1311; EG-NEXT: MOV * T0.X, KC0[2].Z, 1312; EG-NEXT: ALU clause starting at 9: 1313; EG-NEXT: AND_INT T0.W, T0.X, literal.x, 1314; EG-NEXT: NOT_INT * T1.W, T0.X, 1315; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1316; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, 1317; EG-NEXT: LSHL T0.W, literal.y, PV.W, 1318; EG-NEXT: AND_INT * T1.W, T0.X, literal.z, 1319; EG-NEXT: 617283(8.649977e-40), 1234567(1.729997e-39) 1320; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1321; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, 1322; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 1323; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1324; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1325 %a = load i64, ptr addrspace(1) %aptr, align 8 1326 %shl = shl i64 1234567, %a 1327 store i64 %shl, ptr addrspace(1) %out, align 8 1328 ret void 1329} 1330 1331define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { 1332; SI-LABEL: v_shl_inline_imm_64_i64: 1333; SI: ; %bb.0: 1334; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1335; SI-NEXT: s_mov_b32 s7, 0xf000 1336; SI-NEXT: s_mov_b32 s6, -1 1337; SI-NEXT: s_mov_b32 s10, s6 1338; SI-NEXT: s_mov_b32 s11, s7 1339; SI-NEXT: s_waitcnt lgkmcnt(0) 1340; SI-NEXT: s_mov_b32 s8, s2 1341; SI-NEXT: s_mov_b32 s9, s3 1342; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 1343; SI-NEXT: s_mov_b32 s4, s0 1344; SI-NEXT: s_mov_b32 s5, s1 1345; SI-NEXT: s_waitcnt vmcnt(0) 1346; SI-NEXT: v_lshl_b64 v[0:1], 64, v0 1347; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1348; SI-NEXT: s_endpgm 1349; 1350; VI-LABEL: v_shl_inline_imm_64_i64: 1351; VI: ; %bb.0: 1352; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1353; VI-NEXT: s_waitcnt lgkmcnt(0) 1354; VI-NEXT: s_load_dword s4, s[2:3], 0x0 1355; VI-NEXT: s_mov_b32 s3, 0xf000 1356; VI-NEXT: s_mov_b32 s2, -1 1357; VI-NEXT: s_waitcnt lgkmcnt(0) 1358; VI-NEXT: s_lshl_b64 s[4:5], 64, s4 1359; VI-NEXT: v_mov_b32_e32 v0, s4 1360; VI-NEXT: v_mov_b32_e32 v1, s5 1361; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1362; VI-NEXT: s_endpgm 1363; 1364; EG-LABEL: v_shl_inline_imm_64_i64: 1365; EG: ; %bb.0: 1366; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] 1367; EG-NEXT: TEX 0 @6 1368; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] 1369; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1370; EG-NEXT: CF_END 1371; EG-NEXT: PAD 1372; EG-NEXT: Fetch clause starting at 6: 1373; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 1374; EG-NEXT: ALU clause starting at 8: 1375; EG-NEXT: MOV * T0.X, KC0[2].Z, 1376; EG-NEXT: ALU clause starting at 9: 1377; EG-NEXT: AND_INT T0.W, T0.X, literal.x, 1378; EG-NEXT: NOT_INT * T1.W, T0.X, 1379; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1380; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, 1381; EG-NEXT: LSHL T0.W, literal.y, PV.W, 1382; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, 1383; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44) 1384; EG-NEXT: CNDE_INT * T0.Y, PS, PV.Z, PV.W, 1385; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 1386; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1387; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1388 %a = load i64, ptr addrspace(1) %aptr, align 8 1389 %shl = shl i64 64, %a 1390 store i64 %shl, ptr addrspace(1) %out, align 8 1391 ret void 1392} 1393 1394define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1395; SI-LABEL: s_shl_inline_imm_64_i64: 1396; SI: ; %bb.0: 1397; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1398; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 
1399; SI-NEXT: s_mov_b32 s3, 0xf000 1400; SI-NEXT: s_mov_b32 s2, -1 1401; SI-NEXT: s_waitcnt lgkmcnt(0) 1402; SI-NEXT: s_lshl_b64 s[4:5], 64, s6 1403; SI-NEXT: v_mov_b32_e32 v0, s4 1404; SI-NEXT: v_mov_b32_e32 v1, s5 1405; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1406; SI-NEXT: s_endpgm 1407; 1408; VI-LABEL: s_shl_inline_imm_64_i64: 1409; VI: ; %bb.0: 1410; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1411; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1412; VI-NEXT: s_mov_b32 s3, 0xf000 1413; VI-NEXT: s_mov_b32 s2, -1 1414; VI-NEXT: s_waitcnt lgkmcnt(0) 1415; VI-NEXT: s_lshl_b64 s[4:5], 64, s6 1416; VI-NEXT: v_mov_b32_e32 v0, s4 1417; VI-NEXT: v_mov_b32_e32 v1, s5 1418; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1419; VI-NEXT: s_endpgm 1420; 1421; EG-LABEL: s_shl_inline_imm_64_i64: 1422; EG: ; %bb.0: 1423; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[] 1424; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1425; EG-NEXT: CF_END 1426; EG-NEXT: PAD 1427; EG-NEXT: ALU clause starting at 4: 1428; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, 1429; EG-NEXT: NOT_INT * T1.W, KC0[2].W, 1430; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1431; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, 1432; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x, 1433; EG-NEXT: LSHL * T0.W, literal.y, PV.W, 1434; EG-NEXT: 32(4.484155e-44), 64(8.968310e-44) 1435; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS, 1436; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0, 1437; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1438; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1439 %shl = shl i64 64, %a 1440 store i64 %shl, ptr addrspace(1) %out, align 8 1441 ret void 1442} 1443 1444define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1445; SI-LABEL: s_shl_inline_imm_1_i64: 1446; SI: ; %bb.0: 1447; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1448; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1449; SI-NEXT: s_mov_b32 s3, 0xf000 1450; SI-NEXT: s_mov_b32 s2, -1 1451; SI-NEXT: 
s_waitcnt lgkmcnt(0) 1452; SI-NEXT: s_lshl_b64 s[4:5], 1, s6 1453; SI-NEXT: v_mov_b32_e32 v0, s4 1454; SI-NEXT: v_mov_b32_e32 v1, s5 1455; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1456; SI-NEXT: s_endpgm 1457; 1458; VI-LABEL: s_shl_inline_imm_1_i64: 1459; VI: ; %bb.0: 1460; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1461; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1462; VI-NEXT: s_mov_b32 s3, 0xf000 1463; VI-NEXT: s_mov_b32 s2, -1 1464; VI-NEXT: s_waitcnt lgkmcnt(0) 1465; VI-NEXT: s_lshl_b64 s[4:5], 1, s6 1466; VI-NEXT: v_mov_b32_e32 v0, s4 1467; VI-NEXT: v_mov_b32_e32 v1, s5 1468; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1469; VI-NEXT: s_endpgm 1470; 1471; EG-LABEL: s_shl_inline_imm_1_i64: 1472; EG: ; %bb.0: 1473; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] 1474; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1475; EG-NEXT: CF_END 1476; EG-NEXT: PAD 1477; EG-NEXT: ALU clause starting at 4: 1478; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, 1479; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y, 1480; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44) 1481; EG-NEXT: ASHR T1.W, PS, literal.x, 1482; EG-NEXT: LSHL * T0.W, 1, PV.W, 1483; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1484; EG-NEXT: AND_INT T0.Y, PV.W, PS, 1485; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x, 1486; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1487; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0, 1488; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1489; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1490 %shl = shl i64 1, %a 1491 store i64 %shl, ptr addrspace(1) %out, align 8 1492 ret void 1493} 1494 1495define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1496; SI-LABEL: s_shl_inline_imm_1_0_i64: 1497; SI: ; %bb.0: 1498; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1499; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1500; SI-NEXT: s_mov_b32 s3, 0xf000 1501; SI-NEXT: s_mov_b32 s2, -1 1502; SI-NEXT: s_waitcnt lgkmcnt(0) 1503; SI-NEXT: s_lshl_b64 s[4:5], 
1.0, s6 1504; SI-NEXT: v_mov_b32_e32 v0, s4 1505; SI-NEXT: v_mov_b32_e32 v1, s5 1506; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1507; SI-NEXT: s_endpgm 1508; 1509; VI-LABEL: s_shl_inline_imm_1_0_i64: 1510; VI: ; %bb.0: 1511; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1512; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1513; VI-NEXT: s_mov_b32 s3, 0xf000 1514; VI-NEXT: s_mov_b32 s2, -1 1515; VI-NEXT: s_waitcnt lgkmcnt(0) 1516; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s6 1517; VI-NEXT: v_mov_b32_e32 v0, s4 1518; VI-NEXT: v_mov_b32_e32 v1, s5 1519; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1520; VI-NEXT: s_endpgm 1521; 1522; EG-LABEL: s_shl_inline_imm_1_0_i64: 1523; EG: ; %bb.0: 1524; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1525; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1526; EG-NEXT: CF_END 1527; EG-NEXT: PAD 1528; EG-NEXT: ALU clause starting at 4: 1529; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1530; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1531; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1532; EG-NEXT: 536346624(1.050321e-19), 32(4.484155e-44) 1533; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1534; EG-NEXT: MOV T0.X, 0.0, 1535; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1536; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1537 %shl = shl i64 4607182418800017408, %a 1538 store i64 %shl, ptr addrspace(1) %out, align 8 1539 ret void 1540} 1541 1542define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1543; SI-LABEL: s_shl_inline_imm_neg_1_0_i64: 1544; SI: ; %bb.0: 1545; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1546; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1547; SI-NEXT: s_mov_b32 s3, 0xf000 1548; SI-NEXT: s_mov_b32 s2, -1 1549; SI-NEXT: s_waitcnt lgkmcnt(0) 1550; SI-NEXT: s_lshl_b64 s[4:5], -1.0, s6 1551; SI-NEXT: v_mov_b32_e32 v0, s4 1552; SI-NEXT: v_mov_b32_e32 v1, s5 1553; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1554; SI-NEXT: s_endpgm 1555; 1556; VI-LABEL: 
s_shl_inline_imm_neg_1_0_i64: 1557; VI: ; %bb.0: 1558; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1559; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1560; VI-NEXT: s_mov_b32 s3, 0xf000 1561; VI-NEXT: s_mov_b32 s2, -1 1562; VI-NEXT: s_waitcnt lgkmcnt(0) 1563; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s6 1564; VI-NEXT: v_mov_b32_e32 v0, s4 1565; VI-NEXT: v_mov_b32_e32 v1, s5 1566; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1567; VI-NEXT: s_endpgm 1568; 1569; EG-LABEL: s_shl_inline_imm_neg_1_0_i64: 1570; EG: ; %bb.0: 1571; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1572; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1573; EG-NEXT: CF_END 1574; EG-NEXT: PAD 1575; EG-NEXT: ALU clause starting at 4: 1576; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1577; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1578; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1579; EG-NEXT: 1610088448(3.574057e+19), 32(4.484155e-44) 1580; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1581; EG-NEXT: MOV T0.X, 0.0, 1582; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1583; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1584 %shl = shl i64 13830554455654793216, %a 1585 store i64 %shl, ptr addrspace(1) %out, align 8 1586 ret void 1587} 1588 1589define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1590; SI-LABEL: s_shl_inline_imm_0_5_i64: 1591; SI: ; %bb.0: 1592; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1593; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1594; SI-NEXT: s_mov_b32 s3, 0xf000 1595; SI-NEXT: s_mov_b32 s2, -1 1596; SI-NEXT: s_waitcnt lgkmcnt(0) 1597; SI-NEXT: s_lshl_b64 s[4:5], 0.5, s6 1598; SI-NEXT: v_mov_b32_e32 v0, s4 1599; SI-NEXT: v_mov_b32_e32 v1, s5 1600; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1601; SI-NEXT: s_endpgm 1602; 1603; VI-LABEL: s_shl_inline_imm_0_5_i64: 1604; VI: ; %bb.0: 1605; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1606; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1607; VI-NEXT: s_mov_b32 s3, 0xf000 1608; VI-NEXT: 
s_mov_b32 s2, -1 1609; VI-NEXT: s_waitcnt lgkmcnt(0) 1610; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s6 1611; VI-NEXT: v_mov_b32_e32 v0, s4 1612; VI-NEXT: v_mov_b32_e32 v1, s5 1613; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1614; VI-NEXT: s_endpgm 1615; 1616; EG-LABEL: s_shl_inline_imm_0_5_i64: 1617; EG: ; %bb.0: 1618; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1619; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1620; EG-NEXT: CF_END 1621; EG-NEXT: PAD 1622; EG-NEXT: ALU clause starting at 4: 1623; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1624; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1625; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1626; EG-NEXT: 535822336(1.016440e-19), 32(4.484155e-44) 1627; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1628; EG-NEXT: MOV T0.X, 0.0, 1629; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1630; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1631 %shl = shl i64 4602678819172646912, %a 1632 store i64 %shl, ptr addrspace(1) %out, align 8 1633 ret void 1634} 1635 1636define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1637; SI-LABEL: s_shl_inline_imm_neg_0_5_i64: 1638; SI: ; %bb.0: 1639; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1640; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1641; SI-NEXT: s_mov_b32 s3, 0xf000 1642; SI-NEXT: s_mov_b32 s2, -1 1643; SI-NEXT: s_waitcnt lgkmcnt(0) 1644; SI-NEXT: s_lshl_b64 s[4:5], -0.5, s6 1645; SI-NEXT: v_mov_b32_e32 v0, s4 1646; SI-NEXT: v_mov_b32_e32 v1, s5 1647; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1648; SI-NEXT: s_endpgm 1649; 1650; VI-LABEL: s_shl_inline_imm_neg_0_5_i64: 1651; VI: ; %bb.0: 1652; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1653; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1654; VI-NEXT: s_mov_b32 s3, 0xf000 1655; VI-NEXT: s_mov_b32 s2, -1 1656; VI-NEXT: s_waitcnt lgkmcnt(0) 1657; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s6 1658; VI-NEXT: v_mov_b32_e32 v0, s4 1659; VI-NEXT: v_mov_b32_e32 v1, s5 1660; VI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1661; VI-NEXT: s_endpgm 1662; 1663; EG-LABEL: s_shl_inline_imm_neg_0_5_i64: 1664; EG: ; %bb.0: 1665; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1666; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1667; EG-NEXT: CF_END 1668; EG-NEXT: PAD 1669; EG-NEXT: ALU clause starting at 4: 1670; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1671; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1672; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1673; EG-NEXT: 1609564160(3.458765e+19), 32(4.484155e-44) 1674; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1675; EG-NEXT: MOV T0.X, 0.0, 1676; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1677; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1678 %shl = shl i64 13826050856027422720, %a 1679 store i64 %shl, ptr addrspace(1) %out, align 8 1680 ret void 1681} 1682 1683define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1684; SI-LABEL: s_shl_inline_imm_2_0_i64: 1685; SI: ; %bb.0: 1686; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1687; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1688; SI-NEXT: s_mov_b32 s3, 0xf000 1689; SI-NEXT: s_mov_b32 s2, -1 1690; SI-NEXT: s_waitcnt lgkmcnt(0) 1691; SI-NEXT: s_lshl_b64 s[4:5], 2.0, s6 1692; SI-NEXT: v_mov_b32_e32 v0, s4 1693; SI-NEXT: v_mov_b32_e32 v1, s5 1694; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1695; SI-NEXT: s_endpgm 1696; 1697; VI-LABEL: s_shl_inline_imm_2_0_i64: 1698; VI: ; %bb.0: 1699; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1700; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1701; VI-NEXT: s_mov_b32 s3, 0xf000 1702; VI-NEXT: s_mov_b32 s2, -1 1703; VI-NEXT: s_waitcnt lgkmcnt(0) 1704; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s6 1705; VI-NEXT: v_mov_b32_e32 v0, s4 1706; VI-NEXT: v_mov_b32_e32 v1, s5 1707; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1708; VI-NEXT: s_endpgm 1709; 1710; EG-LABEL: s_shl_inline_imm_2_0_i64: 1711; EG: ; %bb.0: 1712; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1713; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1714; EG-NEXT: CF_END 1715; EG-NEXT: PAD 1716; EG-NEXT: ALU clause starting at 4: 1717; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1718; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1719; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1720; EG-NEXT: 536870912(1.084202e-19), 32(4.484155e-44) 1721; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1722; EG-NEXT: MOV T0.X, 0.0, 1723; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1724; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1725 %shl = shl i64 4611686018427387904, %a 1726 store i64 %shl, ptr addrspace(1) %out, align 8 1727 ret void 1728} 1729 1730define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1731; SI-LABEL: s_shl_inline_imm_neg_2_0_i64: 1732; SI: ; %bb.0: 1733; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1734; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1735; SI-NEXT: s_mov_b32 s3, 0xf000 1736; SI-NEXT: s_mov_b32 s2, -1 1737; SI-NEXT: s_waitcnt lgkmcnt(0) 1738; SI-NEXT: s_lshl_b64 s[4:5], -2.0, s6 1739; SI-NEXT: v_mov_b32_e32 v0, s4 1740; SI-NEXT: v_mov_b32_e32 v1, s5 1741; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1742; SI-NEXT: s_endpgm 1743; 1744; VI-LABEL: s_shl_inline_imm_neg_2_0_i64: 1745; VI: ; %bb.0: 1746; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1747; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1748; VI-NEXT: s_mov_b32 s3, 0xf000 1749; VI-NEXT: s_mov_b32 s2, -1 1750; VI-NEXT: s_waitcnt lgkmcnt(0) 1751; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s6 1752; VI-NEXT: v_mov_b32_e32 v0, s4 1753; VI-NEXT: v_mov_b32_e32 v1, s5 1754; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1755; VI-NEXT: s_endpgm 1756; 1757; EG-LABEL: s_shl_inline_imm_neg_2_0_i64: 1758; EG: ; %bb.0: 1759; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1760; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1761; EG-NEXT: CF_END 1762; EG-NEXT: PAD 1763; EG-NEXT: ALU clause starting at 4: 1764; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1765; EG-NEXT: 
BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1766; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1767; EG-NEXT: 1610612736(3.689349e+19), 32(4.484155e-44) 1768; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1769; EG-NEXT: MOV T0.X, 0.0, 1770; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1771; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1772 %shl = shl i64 13835058055282163712, %a 1773 store i64 %shl, ptr addrspace(1) %out, align 8 1774 ret void 1775} 1776 1777define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1778; SI-LABEL: s_shl_inline_imm_4_0_i64: 1779; SI: ; %bb.0: 1780; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1781; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1782; SI-NEXT: s_mov_b32 s3, 0xf000 1783; SI-NEXT: s_mov_b32 s2, -1 1784; SI-NEXT: s_waitcnt lgkmcnt(0) 1785; SI-NEXT: s_lshl_b64 s[4:5], 4.0, s6 1786; SI-NEXT: v_mov_b32_e32 v0, s4 1787; SI-NEXT: v_mov_b32_e32 v1, s5 1788; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1789; SI-NEXT: s_endpgm 1790; 1791; VI-LABEL: s_shl_inline_imm_4_0_i64: 1792; VI: ; %bb.0: 1793; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1794; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1795; VI-NEXT: s_mov_b32 s3, 0xf000 1796; VI-NEXT: s_mov_b32 s2, -1 1797; VI-NEXT: s_waitcnt lgkmcnt(0) 1798; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s6 1799; VI-NEXT: v_mov_b32_e32 v0, s4 1800; VI-NEXT: v_mov_b32_e32 v1, s5 1801; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1802; VI-NEXT: s_endpgm 1803; 1804; EG-LABEL: s_shl_inline_imm_4_0_i64: 1805; EG: ; %bb.0: 1806; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1807; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1808; EG-NEXT: CF_END 1809; EG-NEXT: PAD 1810; EG-NEXT: ALU clause starting at 4: 1811; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1812; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1813; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1814; EG-NEXT: 537395200(1.151965e-19), 32(4.484155e-44) 1815; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1816; 
EG-NEXT: MOV T0.X, 0.0, 1817; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1818; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1819 %shl = shl i64 4616189618054758400, %a 1820 store i64 %shl, ptr addrspace(1) %out, align 8 1821 ret void 1822} 1823 1824define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1825; SI-LABEL: s_shl_inline_imm_neg_4_0_i64: 1826; SI: ; %bb.0: 1827; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1828; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1829; SI-NEXT: s_mov_b32 s3, 0xf000 1830; SI-NEXT: s_mov_b32 s2, -1 1831; SI-NEXT: s_waitcnt lgkmcnt(0) 1832; SI-NEXT: s_lshl_b64 s[4:5], -4.0, s6 1833; SI-NEXT: v_mov_b32_e32 v0, s4 1834; SI-NEXT: v_mov_b32_e32 v1, s5 1835; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1836; SI-NEXT: s_endpgm 1837; 1838; VI-LABEL: s_shl_inline_imm_neg_4_0_i64: 1839; VI: ; %bb.0: 1840; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1841; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1842; VI-NEXT: s_mov_b32 s3, 0xf000 1843; VI-NEXT: s_mov_b32 s2, -1 1844; VI-NEXT: s_waitcnt lgkmcnt(0) 1845; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s6 1846; VI-NEXT: v_mov_b32_e32 v0, s4 1847; VI-NEXT: v_mov_b32_e32 v1, s5 1848; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1849; VI-NEXT: s_endpgm 1850; 1851; EG-LABEL: s_shl_inline_imm_neg_4_0_i64: 1852; EG: ; %bb.0: 1853; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 1854; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1855; EG-NEXT: CF_END 1856; EG-NEXT: PAD 1857; EG-NEXT: ALU clause starting at 4: 1858; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 1859; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W, 1860; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1861; EG-NEXT: 1611137024(3.919933e+19), 32(4.484155e-44) 1862; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 1863; EG-NEXT: MOV T0.X, 0.0, 1864; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1865; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1866 %shl = shl i64 13839561654909534208, %a 1867 store i64 %shl, ptr 
addrspace(1) %out, align 8 1868 ret void 1869} 1870 1871 1872; Test with the 64-bit integer bitpattern for a 32-bit float in the 1873; low 32-bits, which is not a valid 64-bit inline immediate. 1874define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1875; SI-LABEL: s_shl_inline_imm_f32_4_0_i64: 1876; SI: ; %bb.0: 1877; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1878; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1879; SI-NEXT: s_mov_b32 s3, 0xf000 1880; SI-NEXT: s_mov_b32 s2, -1 1881; SI-NEXT: s_waitcnt lgkmcnt(0) 1882; SI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s6 1883; SI-NEXT: v_mov_b32_e32 v0, s4 1884; SI-NEXT: v_mov_b32_e32 v1, s5 1885; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1886; SI-NEXT: s_endpgm 1887; 1888; VI-LABEL: s_shl_inline_imm_f32_4_0_i64: 1889; VI: ; %bb.0: 1890; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1891; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1892; VI-NEXT: s_mov_b32 s3, 0xf000 1893; VI-NEXT: s_mov_b32 s2, -1 1894; VI-NEXT: s_waitcnt lgkmcnt(0) 1895; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s6 1896; VI-NEXT: v_mov_b32_e32 v0, s4 1897; VI-NEXT: v_mov_b32_e32 v1, s5 1898; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1899; VI-NEXT: s_endpgm 1900; 1901; EG-LABEL: s_shl_inline_imm_f32_4_0_i64: 1902; EG: ; %bb.0: 1903; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[] 1904; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1905; EG-NEXT: CF_END 1906; EG-NEXT: PAD 1907; EG-NEXT: ALU clause starting at 4: 1908; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x, 1909; EG-NEXT: NOT_INT * T1.W, KC0[2].W, 1910; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) 1911; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS, 1912; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.y, 1913; EG-NEXT: LSHL * T0.W, literal.z, PV.W, 1914; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44) 1915; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00) 1916; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS, 1917; EG-NEXT: CNDE_INT T0.X, 
T1.W, T0.W, 0.0, 1918; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1919; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1920 %shl = shl i64 1082130432, %a 1921 store i64 %shl, ptr addrspace(1) %out, align 8 1922 ret void 1923} 1924 1925; FIXME: Copy of -1 register 1926define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1927; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: 1928; SI: ; %bb.0: 1929; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1930; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1931; SI-NEXT: s_mov_b32 s4, -4.0 1932; SI-NEXT: s_mov_b32 s5, -1 1933; SI-NEXT: s_mov_b32 s3, 0xf000 1934; SI-NEXT: s_mov_b32 s2, -1 1935; SI-NEXT: s_waitcnt lgkmcnt(0) 1936; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 1937; SI-NEXT: v_mov_b32_e32 v0, s4 1938; SI-NEXT: v_mov_b32_e32 v1, s5 1939; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1940; SI-NEXT: s_endpgm 1941; 1942; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: 1943; VI: ; %bb.0: 1944; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 1945; VI-NEXT: s_load_dword s6, s[4:5], 0x34 1946; VI-NEXT: s_mov_b32 s4, -4.0 1947; VI-NEXT: s_mov_b32 s5, -1 1948; VI-NEXT: s_mov_b32 s3, 0xf000 1949; VI-NEXT: s_mov_b32 s2, -1 1950; VI-NEXT: s_waitcnt lgkmcnt(0) 1951; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 1952; VI-NEXT: v_mov_b32_e32 v0, s4 1953; VI-NEXT: v_mov_b32_e32 v1, s5 1954; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1955; VI-NEXT: s_endpgm 1956; 1957; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64: 1958; EG: ; %bb.0: 1959; EG-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[] 1960; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 1961; EG-NEXT: CF_END 1962; EG-NEXT: PAD 1963; EG-NEXT: ALU clause starting at 4: 1964; EG-NEXT: MOV T0.Z, literal.x, 1965; EG-NEXT: NOT_INT T0.W, KC0[2].W, 1966; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 1967; EG-NEXT: -532676608(-5.534023e+19), 31(4.344025e-44) 1968; EG-NEXT: LSHL T1.Z, literal.x, PS, 1969; EG-NEXT: BIT_ALIGN_INT T0.W, literal.y, PV.Z, 
PV.W, 1970; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.z, 1971; EG-NEXT: -1065353216(-4.000000e+00), 2147483647(nan) 1972; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) 1973; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, PV.Z, 1974; EG-NEXT: CNDE_INT T0.X, T1.W, T1.Z, 0.0, 1975; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 1976; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 1977 %shl = shl i64 -1065353216, %a 1978 store i64 %shl, ptr addrspace(1) %out, align 8 1979 ret void 1980} 1981 1982define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 1983; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: 1984; SI: ; %bb.0: 1985; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 1986; SI-NEXT: s_load_dword s6, s[4:5], 0xd 1987; SI-NEXT: s_mov_b32 s4, 0 1988; SI-NEXT: s_mov_b32 s5, 4.0 1989; SI-NEXT: s_mov_b32 s3, 0xf000 1990; SI-NEXT: s_mov_b32 s2, -1 1991; SI-NEXT: s_waitcnt lgkmcnt(0) 1992; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 1993; SI-NEXT: v_mov_b32_e32 v0, s4 1994; SI-NEXT: v_mov_b32_e32 v1, s5 1995; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1996; SI-NEXT: s_endpgm 1997; 1998; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64: 1999; VI: ; %bb.0: 2000; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2001; VI-NEXT: s_load_dword s6, s[4:5], 0x34 2002; VI-NEXT: s_mov_b32 s4, 0 2003; VI-NEXT: s_mov_b32 s5, 4.0 2004; VI-NEXT: s_mov_b32 s3, 0xf000 2005; VI-NEXT: s_mov_b32 s2, -1 2006; VI-NEXT: s_waitcnt lgkmcnt(0) 2007; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 2008; VI-NEXT: v_mov_b32_e32 v0, s4 2009; VI-NEXT: v_mov_b32_e32 v1, s5 2010; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2011; VI-NEXT: s_endpgm 2012; 2013; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64: 2014; EG: ; %bb.0: 2015; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 2016; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 2017; EG-NEXT: CF_END 2018; EG-NEXT: PAD 2019; EG-NEXT: ALU clause starting at 4: 2020; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 2021; EG-NEXT: BIT_ALIGN_INT 
T0.W, literal.x, 0.0, PV.W, 2022; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 2023; EG-NEXT: 541065216(1.626303e-19), 32(4.484155e-44) 2024; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 2025; EG-NEXT: MOV T0.X, 0.0, 2026; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2027; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2028 %shl = shl i64 4647714815446351872, %a 2029 store i64 %shl, ptr addrspace(1) %out, align 8 2030 ret void 2031} 2032 2033define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { 2034; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: 2035; SI: ; %bb.0: 2036; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 2037; SI-NEXT: s_load_dword s6, s[4:5], 0xd 2038; SI-NEXT: s_mov_b32 s4, 0 2039; SI-NEXT: s_mov_b32 s5, -4.0 2040; SI-NEXT: s_mov_b32 s3, 0xf000 2041; SI-NEXT: s_mov_b32 s2, -1 2042; SI-NEXT: s_waitcnt lgkmcnt(0) 2043; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 2044; SI-NEXT: v_mov_b32_e32 v0, s4 2045; SI-NEXT: v_mov_b32_e32 v1, s5 2046; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2047; SI-NEXT: s_endpgm 2048; 2049; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: 2050; VI: ; %bb.0: 2051; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 2052; VI-NEXT: s_load_dword s6, s[4:5], 0x34 2053; VI-NEXT: s_mov_b32 s4, 0 2054; VI-NEXT: s_mov_b32 s5, -4.0 2055; VI-NEXT: s_mov_b32 s3, 0xf000 2056; VI-NEXT: s_mov_b32 s2, -1 2057; VI-NEXT: s_waitcnt lgkmcnt(0) 2058; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 2059; VI-NEXT: v_mov_b32_e32 v0, s4 2060; VI-NEXT: v_mov_b32_e32 v1, s5 2061; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2062; VI-NEXT: s_endpgm 2063; 2064; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64: 2065; EG: ; %bb.0: 2066; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] 2067; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 2068; EG-NEXT: CF_END 2069; EG-NEXT: PAD 2070; EG-NEXT: ALU clause starting at 4: 2071; EG-NEXT: NOT_INT * T0.W, KC0[2].W, 2072; EG-NEXT: BIT_ALIGN_INT T0.W, literal.x, 0.0, 
PV.W, 2073; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.y, 2074; EG-NEXT: 1614807040(5.534023e+19), 32(4.484155e-44) 2075; EG-NEXT: CNDE_INT * T0.Y, PS, PV.W, 0.0, 2076; EG-NEXT: MOV T0.X, 0.0, 2077; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, 2078; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2079 %shl = shl i64 13871086852301127680, %a 2080 store i64 %shl, ptr addrspace(1) %out, align 8 2081 ret void 2082} 2083 2084define amdgpu_kernel void @test_mul2(i32 %p) { 2085; SI-LABEL: test_mul2: 2086; SI: ; %bb.0: 2087; SI-NEXT: s_load_dword s0, s[4:5], 0x9 2088; SI-NEXT: s_mov_b32 s3, 0xf000 2089; SI-NEXT: s_mov_b32 s2, -1 2090; SI-NEXT: s_waitcnt lgkmcnt(0) 2091; SI-NEXT: s_lshl_b32 s0, s0, 1 2092; SI-NEXT: v_mov_b32_e32 v0, s0 2093; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2094; SI-NEXT: s_waitcnt vmcnt(0) 2095; SI-NEXT: s_endpgm 2096; 2097; VI-LABEL: test_mul2: 2098; VI: ; %bb.0: 2099; VI-NEXT: s_load_dword s0, s[4:5], 0x24 2100; VI-NEXT: s_mov_b32 s3, 0xf000 2101; VI-NEXT: s_mov_b32 s2, -1 2102; VI-NEXT: s_waitcnt lgkmcnt(0) 2103; VI-NEXT: s_lshl_b32 s0, s0, 1 2104; VI-NEXT: v_mov_b32_e32 v0, s0 2105; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 2106; VI-NEXT: s_waitcnt vmcnt(0) 2107; VI-NEXT: s_endpgm 2108; 2109; EG-LABEL: test_mul2: 2110; EG: ; %bb.0: 2111; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] 2112; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 2113; EG-NEXT: CF_END 2114; EG-NEXT: PAD 2115; EG-NEXT: ALU clause starting at 4: 2116; EG-NEXT: MOV T0.X, literal.x, 2117; EG-NEXT: LSHL * T1.X, KC0[2].Y, 1, 2118; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) 2119 %i = mul i32 %p, 2 2120 store volatile i32 %i, ptr addrspace(1) undef 2121 ret void 2122} 2123 2124define void @shl_or_k(ptr addrspace(1) %out, i32 %in) { 2125; SI-LABEL: shl_or_k: 2126; SI: ; %bb.0: 2127; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2128; SI-NEXT: s_mov_b32 s6, 0 2129; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v2 2130; SI-NEXT: s_mov_b32 s7, 0xf000 2131; SI-NEXT: s_mov_b32 s4, s6 2132; 
SI-NEXT: s_mov_b32 s5, s6 2133; SI-NEXT: v_or_b32_e32 v2, 4, v2 2134; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 2135; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2136; SI-NEXT: s_setpc_b64 s[30:31] 2137; 2138; VI-LABEL: shl_or_k: 2139; VI: ; %bb.0: 2140; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2141; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v2 2142; VI-NEXT: v_or_b32_e32 v2, 4, v2 2143; VI-NEXT: flat_store_dword v[0:1], v2 2144; VI-NEXT: s_waitcnt vmcnt(0) 2145; VI-NEXT: s_setpc_b64 s[30:31] 2146; 2147; EG-LABEL: shl_or_k: 2148; EG: ; %bb.0: 2149; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] 2150; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 2151; EG-NEXT: CF_END 2152; EG-NEXT: PAD 2153; EG-NEXT: ALU clause starting at 4: 2154; EG-NEXT: LSHL * T0.W, KC0[2].Z, literal.x, 2155; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2156; EG-NEXT: OR_INT T0.X, PV.W, literal.x, 2157; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, 2158; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) 2159 %tmp0 = or i32 %in, 1 2160 %tmp2 = shl i32 %tmp0, 2 2161 store i32 %tmp2, ptr addrspace(1) %out 2162 ret void 2163} 2164 2165define void @shl_or_k_two_uses(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %in) { 2166; SI-LABEL: shl_or_k_two_uses: 2167; SI: ; %bb.0: 2168; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2169; SI-NEXT: s_mov_b32 s6, 0 2170; SI-NEXT: v_or_b32_e32 v4, 1, v4 2171; SI-NEXT: s_mov_b32 s7, 0xf000 2172; SI-NEXT: s_mov_b32 s4, s6 2173; SI-NEXT: s_mov_b32 s5, s6 2174; SI-NEXT: v_lshlrev_b32_e32 v5, 2, v4 2175; SI-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 2176; SI-NEXT: buffer_store_dword v4, v[2:3], s[4:7], 0 addr64 2177; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) 2178; SI-NEXT: s_setpc_b64 s[30:31] 2179; 2180; VI-LABEL: shl_or_k_two_uses: 2181; VI: ; %bb.0: 2182; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2183; VI-NEXT: v_or_b32_e32 v4, 1, v4 2184; VI-NEXT: v_lshlrev_b32_e32 v5, 2, v4 2185; VI-NEXT: flat_store_dword v[0:1], v5 2186; VI-NEXT: 
flat_store_dword v[2:3], v4 2187; VI-NEXT: s_waitcnt vmcnt(0) 2188; VI-NEXT: s_setpc_b64 s[30:31] 2189; 2190; EG-LABEL: shl_or_k_two_uses: 2191; EG: ; %bb.0: 2192; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] 2193; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 2194; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 2195; EG-NEXT: CF_END 2196; EG-NEXT: ALU clause starting at 4: 2197; EG-NEXT: LSHR T0.X, KC0[2].Z, literal.x, 2198; EG-NEXT: OR_INT * T1.X, KC0[2].W, 1, 2199; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2200; EG-NEXT: LSHL T2.X, PS, literal.x, 2201; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, 2202; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) 2203 %tmp0 = or i32 %in, 1 2204 %tmp2 = shl i32 %tmp0, 2 2205 store i32 %tmp2, ptr addrspace(1) %out0 2206 store i32 %tmp0, ptr addrspace(1) %out1 2207 ret void 2208} 2209 2210attributes #0 = { nounwind readnone } 2211