; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s -check-prefixes=EG

declare i32 @llvm.amdgcn.workitem.id.x() #0

define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: lshr_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s4, s5
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR T0.X, T0.X, T0.Y,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr i32, ptr addrspace(1) %in, i32 1
  %a = load i32, ptr addrspace(1) %in
  %b = load i32, ptr addrspace(1) %b_ptr
  %result = lshr i32 %a, %b
  store i32 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: lshr_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v1, v1, v3
; SI-NEXT: v_lshr_b32_e32 v0, v0, v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s5, s7
; VI-NEXT: s_lshr_b32 s4, s4, s6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T0.Y, T0.Y, T0.W,
; EG-NEXT: LSHR T0.X, T0.X, T0.Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <2 x i32>, ptr addrspace(1) %in
  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
  %result = lshr <2 x i32> %a, %b
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lshr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: lshr_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v3, v3, v7
; SI-NEXT: v_lshr_b32_e32 v2, v2, v6
; SI-NEXT: v_lshr_b32_e32 v1, v1, v5
; SI-NEXT: v_lshr_b32_e32 v0, v0, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s3, s7
; VI-NEXT: s_lshr_b32 s2, s2, s6
; VI-NEXT: s_lshr_b32 s1, s1, s5
; VI-NEXT: s_lshr_b32 s0, s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHR * T0.W, T0.W, T1.W,
; EG-NEXT: LSHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT: LSHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT: LSHR T0.X, T0.X, T1.X,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <4 x i32>, ptr addrspace(1) %in
  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
  %result = lshr <4 x i32> %a, %b
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: lshr_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v2
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T1.Z, T0.Y, PV.W,
; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.Y, T1.W, T1.Z, 0.0,
  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
  %a = load i64, ptr addrspace(1) %in
  %b = load i64, ptr addrspace(1) %b_ptr
  %result = lshr i64 %a, %b
  store i64 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @lshr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: lshr_v4i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], v6
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], v13
; SI-NEXT: v_lshr_b64 v[7:8], v[7:8], v11
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: lshr_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; VI-NEXT: s_mov_b32 s19, 0xf000
; VI-NEXT: s_mov_b32 s18, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], s14
; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s12
; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], s10
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s8
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_v4i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
; EG-NEXT: ALU 34, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T3.XYZW, T0.X, 48, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T4.Z, T0.W, PV.W,
; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
; EG-NEXT: AND_INT * T3.W, T3.Z, literal.y,
; EG-NEXT: 32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT: BIT_ALIGN_INT T4.X, T0.W, T0.Z, T1.Z,
; EG-NEXT: LSHR T1.Y, T2.W, PS, BS:VEC_120/SCL_212
; EG-NEXT: AND_INT * T0.Z, T3.Z, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T0.W, T2.W, T2.Z, T3.Z,
; EG-NEXT: AND_INT * T2.W, T3.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T5.X, T1.X, literal.x,
; EG-NEXT: LSHR T3.Y, T2.Y, PS,
; EG-NEXT: CNDE_INT T2.Z, T0.Z, PV.W, T1.Y,
; EG-NEXT: BIT_ALIGN_INT T0.W, T2.Y, T2.X, T3.X,
; EG-NEXT: AND_INT * T3.W, T3.X, literal.y,
; EG-NEXT: 31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT: CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT: LSHR T4.Y, T0.Y, PV.X,
; EG-NEXT: CNDE_INT T1.Z, T1.W, T4.X, T4.Z,
; EG-NEXT: BIT_ALIGN_INT T0.W, T0.Y, T0.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT: AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T1.X, PS, PV.W, PV.Y,
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: CNDE_INT * T2.W, T0.Z, T1.Y, 0.0,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T0.X, PV.W, literal.x,
; EG-NEXT: CNDE_INT T2.Y, T3.W, T3.Y, 0.0,
; EG-NEXT: CNDE_INT T1.W, T1.W, T4.Z, 0.0, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T1.Y, T4.W, T4.Y, 0.0,
  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
  %a = load <4 x i64>, ptr addrspace(1) %in
  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
  %result = lshr <4 x i64> %a, %b
  store <4 x i64> %result, ptr addrspace(1) %out
  ret void
}

; Make sure load width gets reduced to i32 load.
define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
; SI-LABEL: s_lshr_32_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x14
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_lshr_32_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x50
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_lshr_32_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.X, KC0[5].X,
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %result = lshr i64 %a, 32
  store i64 %result, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_lshr_32_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_32_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_lshr_32_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.in
  %result = lshr i64 %a, 32
  store i64 %result, ptr addrspace(1) %gep.out
  ret void
}

attributes #0 = { nounwind readnone }