; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG

declare i32 @llvm.amdgcn.workitem.id.x() #0

; Arithmetic shift right on <2 x i32>. The shifted value is the first
; <2 x i32> element at %in and the shift amounts are the second one; the
; result is stored to %out.
define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    ASHR * T0.Y, T0.Y, T0.W,
; EG-NEXT:    ASHR T0.X, T0.X, T0.Z,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <2 x i32>, ptr addrspace(1) %in
  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
  %result = ashr <2 x i32> %a, %b
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

; Arithmetic shift right on <4 x i32>. The shifted value is the first
; <4 x i32> element at %in and the shift amounts are the second one; the
; result is stored to %out.
define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR * T0.W, T0.W, T1.W,
; EG-NEXT:    ASHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:    ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:    ASHR T0.X, T0.X, T1.X,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <4 x i32>, ptr addrspace(1) %in
  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
  %result = ashr <4 x i32> %a, %b
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; NOTE(review): the generated GCN checks for the function that follows show
; the shifts selected as scalar s_ashr_i32 after v_readfirstlane_b32, with
; only the load being a vector buffer_load. This FIXME may be partly stale;
; confirm before relying on it.
; Arithmetic shift right on <2 x i16>. The shifted value is the first
; <2 x i16> element at %in and the shift amounts are the second one; the
; result is stored to %out.
define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v0
; SI-NEXT:    v_readfirstlane_b32 s1, v1
; SI-NEXT:    s_sext_i32_i16 s2, s0
; SI-NEXT:    s_ashr_i32 s0, s0, 16
; SI-NEXT:    s_lshr_b32 s3, s1, 16
; SI-NEXT:    s_ashr_i32 s0, s0, s3
; SI-NEXT:    s_ashr_i32 s1, s2, s1
; SI-NEXT:    s_lshl_b32 s0, s0, 16
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_or_b32 s0, s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v1
; VI-NEXT:    v_readfirstlane_b32 s1, v0
; VI-NEXT:    s_ashr_i32 s2, s1, 16
; VI-NEXT:    s_sext_i32_i16 s1, s1
; VI-NEXT:    s_ashr_i32 s3, s0, 16
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_ashr_i32 s0, s1, s0
; VI-NEXT:    s_ashr_i32 s1, s2, s3
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_64 T6.XY, T6.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T6.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    LSHR * T0.W, T6.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR T0.Z, T6.Y, literal.x,
; EG-NEXT:    BFE_INT T0.W, T6.X, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, T6.Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    ASHR * T1.W, PV.Y, PV.Z,
; EG-NEXT:    LSHL T1.W, PS, literal.x,
; EG-NEXT:    AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    OR_INT T6.X, PS, PV.W,
; EG-NEXT:    LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in, i16 1
  %a = load <2 x i16>, ptr addrspace(1) %in
  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
  %result = ashr <2 x i16> %a, %b
  store <2 x i16> %result, ptr addrspace(1) %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; NOTE(review): the generated GCN checks for the function that follows show
; the shifts selected as scalar s_ashr_i32 after v_readfirstlane_b32, with
; only the load being a vector buffer_load. This FIXME may be partly stale;
; confirm before relying on it.
; Arithmetic shift right on <4 x i16>. The shifted value is the first
; <4 x i16> element at %in and the shift amounts are the second one; the
; result is stored to %out.
define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v3
; SI-NEXT:    v_readfirstlane_b32 s1, v2
; SI-NEXT:    v_readfirstlane_b32 s2, v1
; SI-NEXT:    v_readfirstlane_b32 s3, v0
; SI-NEXT:    s_sext_i32_i16 s8, s3
; SI-NEXT:    s_ashr_i32 s3, s3, 16
; SI-NEXT:    s_sext_i32_i16 s9, s2
; SI-NEXT:    s_ashr_i32 s2, s2, 16
; SI-NEXT:    s_lshr_b32 s10, s1, 16
; SI-NEXT:    s_lshr_b32 s11, s0, 16
; SI-NEXT:    s_ashr_i32 s2, s2, s11
; SI-NEXT:    s_ashr_i32 s0, s9, s0
; SI-NEXT:    s_ashr_i32 s3, s3, s10
; SI-NEXT:    s_ashr_i32 s1, s8, s1
; SI-NEXT:    s_lshl_b32 s2, s2, 16
; SI-NEXT:    s_and_b32 s0, s0, 0xffff
; SI-NEXT:    s_lshl_b32 s3, s3, 16
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_or_b32 s0, s0, s2
; SI-NEXT:    s_or_b32 s1, s1, s3
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v2
; VI-NEXT:    v_readfirstlane_b32 s1, v3
; VI-NEXT:    v_readfirstlane_b32 s2, v0
; VI-NEXT:    v_readfirstlane_b32 s3, v1
; VI-NEXT:    s_ashr_i32 s8, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    s_ashr_i32 s9, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_ashr_i32 s10, s1, 16
; VI-NEXT:    s_sext_i32_i16 s1, s1
; VI-NEXT:    s_ashr_i32 s11, s0, 16
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_ashr_i32 s0, s2, s0
; VI-NEXT:    s_ashr_i32 s2, s9, s11
; VI-NEXT:    s_ashr_i32 s1, s3, s1
; VI-NEXT:    s_ashr_i32 s3, s8, s10
; VI-NEXT:    s_lshl_b32 s3, s3, 16
; VI-NEXT:    s_and_b32 s1, s1, 0xffff
; VI-NEXT:    s_lshl_b32 s2, s2, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s1, s1, s3
; VI-NEXT:    s_or_b32 s0, s0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 48, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T9.XYZW, T9.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.Y, T6.X,
; EG-NEXT:    MOV * T9.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    BFE_INT T0.W, T9.X, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, T9.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR * T0.W, PV.W, PS,
; EG-NEXT:    AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:    AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:    OR_INT * T0.W, PS, PV.W,
; EG-NEXT:    MOV * T6.X, PV.W,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    LSHR * T0.W, T9.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR * T1.W, T9.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV T6.X, PV.W,
; EG-NEXT:    MOV T0.Y, T7.X,
; EG-NEXT:    BFE_INT T0.W, T9.Y, 0.0, literal.x,
; EG-NEXT:    AND_INT * T1.W, T9.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:    AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:    MOV * T7.X, PV.W,
; EG-NEXT:    MOV T0.Y, PV.X,
; EG-NEXT:    LSHR * T0.W, T9.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:    LSHR * T1.W, T9.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T0.W, PV.W, PS,
; EG-NEXT:    AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:    LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT:    OR_INT * T10.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:    MOV T7.X, PV.Y,
; EG-NEXT:    MOV * T10.X, T6.X,
  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1
  %a = load <4 x i16>, ptr addrspace(1) %in
  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
  %result = ashr <4 x i16> %a, %b
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}

; Sign-extends the i32 kernel argument %in to i64, shifts it right
; arithmetically by the constant 8 and stores the i64 result to %out.
define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s7, s6, 31
; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 8
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s7, s6, 31
; VI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %in.ext = sext i32 %in to i64
  %ashr = ashr i64 %in.ext, 8
  store i64 %ashr, ptr addrspace(1) %out
  ret void
}

; Arithmetic shift right on i64 with a variable amount. The shifted value is
; the first i64 at %in and the shift amount is the second one.
define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_i64_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_i64_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_i64_2:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:    AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Z, T0.Y, PV.W,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:    AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:    ASHR T0.W, T0.Y, literal.x,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT:    CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
  %a = load i64, ptr addrspace(1) %in
  %b = load i64, ptr addrspace(1) %b_ptr
  %result = ashr i64 %a, %b
  store i64 %result, ptr addrspace(1) %out
  ret void
}

; Arithmetic shift right on <2 x i64>. The shifted value is the first
; <2 x i64> element at %in and the shift amounts are the second one.
define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Y, T0.W, PV.W,
; EG-NEXT:    AND_INT T2.Z, T1.Z, literal.x,
; EG-NEXT:    BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
; EG-NEXT:    AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:    ASHR T2.Y, T0.Y, PS,
; EG-NEXT:    CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
; EG-NEXT:    BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
; EG-NEXT:    AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T0.W, T0.W, literal.x,
; EG-NEXT:    ASHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
; EG-NEXT:    LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
  %a = load <2 x i64>, ptr addrspace(1) %in
  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
  %result = ashr <2 x i64> %a, %b
  store <2 x i64> %result, ptr addrspace(1) %out
  ret void
}

; FIXME: Broken on r600
; NOTE(review): checks for the r600 run line are generated for this function
; below, so the FIXME above may be stale; confirm before relying on it.
; Arithmetic shift right on <4 x i64>. The shifted value is the first
; <4 x i64> element at %in and the shift amounts are the second one.
define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:    VTX_READ_128 T2.XYZW, T0.X, 48, #1
; EG-NEXT:    VTX_READ_128 T3.XYZW, T0.X, 0, #1
; EG-NEXT:    VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:    MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:    AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ASHR T1.Y, T0.W, literal.x,
; EG-NEXT:    ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT T1.W, T1.Z, literal.y,
; EG-NEXT:    AND_INT * T2.W, T2.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:    BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
; EG-NEXT:    ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:    AND_INT * T1.Z, T2.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
; EG-NEXT:    AND_INT * T2.W, T2.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:    ASHR T4.Y, T0.Y, PS,
; EG-NEXT:    CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
; EG-NEXT:    AND_INT * T2.W, T2.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:    CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T5.Y, T3.Y, PV.X,
; EG-NEXT:    CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:    BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:    AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:    CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:    ASHR T6.Y, T3.W, literal.x,
; EG-NEXT:    ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
; EG-NEXT:    ADD_INT T3.W, KC0[2].Y, literal.y,
; EG-NEXT:    CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
; EG-NEXT:    ASHR T3.W, T3.Y, literal.y,
; EG-NEXT:    CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT:    LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT:    CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
  %a = load <4 x i64>, ptr addrspace(1) %in
  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
  %result = ashr <4 x i64> %a, %b
  store <4 x i64> %result, ptr addrspace(1) %out
  ret void
}

; Shifts the i64 kernel argument %a right arithmetically by the constant 32,
; adds the i64 kernel argument %b and stores the sum to %out.
define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x14
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s8, 31
; SI-NEXT:    s_add_u32 s4, s8, s6
; SI-NEXT:    s_addc_u32 s5, s5, s7
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x50
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s8, 31
; VI-NEXT:    s_add_u32 s4, s8, s6
; VI-NEXT:    s_addc_u32 s5, s5, s7
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.W, PV.W, KC0[7].Z,
; EG-NEXT:    ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
; EG-NEXT:    ADD_INT * T0.Y, T0.W, PV.W,
; EG-NEXT:    ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 32
  %add = add i64 %result, %b
  store i64 %add, ptr addrspace(1) %out
  ret void
}

; Per-workitem variant: loads the i64 at %in[workitem.id.x], shifts it right
; arithmetically by the constant 32 and stores it to %out[workitem.id.x].
define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.in
  %result = ashr i64 %a, 32
  store i64 %result, ptr addrspace(1) %gep.out
  ret void
}

; Shifts the i64 kernel argument %a right arithmetically by the constant 63,
; adds the i64 kernel argument %b and stores the sum to %out.
define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x14
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s8, 31
; SI-NEXT:    s_add_u32 s4, s5, s6
; SI-NEXT:    s_addc_u32 s5, s5, s7
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x50
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s8, 31
; VI-NEXT:    s_add_u32 s4, s5, s6
; VI-NEXT:    s_addc_u32 s5, s5, s7
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:    ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    ADD_INT T1.W, PV.W, KC0[7].Z,
; EG-NEXT:    ADDC_UINT * T2.W, PV.W, KC0[7].Y,
; EG-NEXT:    ADD_INT * T0.Y, PV.W, PS,
; EG-NEXT:    ADD_INT T0.X, T0.W, KC0[7].Y,
; EG-NEXT:    LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 63
  %add = add i64 %result, %b
  store i64 %add, ptr addrspace(1) %out
  ret void
}

; Per-workitem variant: loads the i64 at %in[workitem.id.x], shifts it right
; arithmetically by the constant 63 and stores it to %out[workitem.id.x].
define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
; VI-NEXT:    v_mov_b32_e32 v3, v2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:    VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:    LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:    ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:    ASHR T0.X, T0.X, literal.x,
; EG-NEXT:    ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:    LSHR T1.X, PV.W, literal.x,
; EG-NEXT:    MOV * T0.Y, PV.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
  %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
  %a = load i64, ptr addrspace(1) %gep.in
  %result = ashr i64 %a, 63
  store i64 %result, ptr addrspace(1) %gep.out
  ret void
}

attributes #0 = { nounwind readnone }