; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

;; Scalar i32 rotate-left expressed as (x << y) | (x >> (32 - y)); should
;; select to a single v_alignbit_b32 (BIT_ALIGN_INT on R600).
define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
; R600-LABEL: rotl_i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[2].W,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.X, KC0[2].Z, KC0[2].Z, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; SI-LABEL: rotl_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sub_i32 s3, 32, s3
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v0, s2, s2, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: rotl_i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sub_i32 s3, 32, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s0
; GFX8-NEXT:    v_mov_b32_e32 v1, s1
; GFX8-NEXT:    flat_store_dword v[0:1], v2
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: rotl_i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_sub_i32 s3, 32, s3
; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: rotl_i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_sub_i32 s3, 32, s3
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %0 = shl i32 %x, %y
  %1 = sub i32 32, %y
  %2 = lshr i32 %x, %1
  %3 = or i32 %0, %2
  store i32 %3, ptr addrspace(1) %in
  ret void
}

;; <2 x i32> rotate-left; each lane should become an independent
;; v_alignbit_b32 after the (32 - y) shift-amount rewrite.
define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
; R600-LABEL: rotl_v2i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[3].Z,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].X, PV.W,
; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[3].Y,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.X, KC0[2].W, KC0[2].W, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; SI-LABEL: rotl_v2i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sub_i32 s3, 32, s3
; SI-NEXT:    s_sub_i32 s2, 32, s2
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    v_alignbit_b32 v1, s1, s1, v0
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_alignbit_b32 v0, s0, s0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: rotl_v2i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sub_i32 s2, 32, s2
; GFX8-NEXT:    s_sub_i32 s3, 32, s3
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_mov_b32_e32 v2, s2
; GFX8-NEXT:    v_alignbit_b32 v1, s1, s1, v0
; GFX8-NEXT:    v_alignbit_b32 v0, s0, s0, v2
; GFX8-NEXT:    v_mov_b32_e32 v2, s4
; GFX8-NEXT:    v_mov_b32_e32 v3, s5
; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: rotl_v2i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_sub_i32 s3, 32, s3
; GFX10-NEXT:    s_sub_i32 s2, 32, s2
; GFX10-NEXT:    v_alignbit_b32 v1, s1, s1, s3
; GFX10-NEXT:    v_alignbit_b32 v0, s0, s0, s2
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: rotl_v2i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_sub_i32 s3, 32, s3
; GFX11-NEXT:    s_sub_i32 s2, 32, s2
; GFX11-NEXT:    v_alignbit_b32 v1, s1, s1, s3
; GFX11-NEXT:    v_alignbit_b32 v0, s0, s0, s2
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT:    s_endpgm
entry:
  %0 = shl <2 x i32> %x, %y
  %1 = sub <2 x i32> <i32 32, i32 32>, %y
  %2 = lshr <2 x i32> %x, %1
  %3 = or <2 x i32> %0, %2
  store <2 x i32> %3, ptr addrspace(1) %in
  ret void
}

;; <4 x i32> rotate-left; four independent v_alignbit_b32 results stored
;; as one dwordx4.
define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
; R600-LABEL: rotl_v4i32:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    ALU clause starting at 4:
; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[5].X,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[4].X, KC0[4].X, PV.W,
; R600-NEXT:     SUB_INT * T1.W, literal.x, KC0[4].W,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.Z, KC0[3].W, KC0[3].W, PS,
; R600-NEXT:     SUB_INT * T1.W, literal.x, KC0[4].Z,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].Z, KC0[3].Z, PV.W,
; R600-NEXT:     SUB_INT * T1.W, literal.x, KC0[4].Y,
; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; R600-NEXT:     BIT_ALIGN_INT T0.X, KC0[3].Y, KC0[3].Y, PV.W,
; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; SI-LABEL: rotl_v4i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_sub_i32 s4, 32, s12
; SI-NEXT:    s_sub_i32 s5, 32, s13
; SI-NEXT:    s_sub_i32 s6, 32, s15
; SI-NEXT:    s_sub_i32 s7, 32, s14
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_alignbit_b32 v3, s11, s11, v0
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    v_alignbit_b32 v2, s10, s10, v0
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_alignbit_b32 v1, s9, s9, v0
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_alignbit_b32 v0, s8, s8, v0
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; GFX8-LABEL: rotl_v4i32:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_sub_i32 s5, 32, s15
; GFX8-NEXT:    s_sub_i32 s4, 32, s14
; GFX8-NEXT:    v_mov_b32_e32 v0, s5
; GFX8-NEXT:    s_sub_i32 s3, 32, s13
; GFX8-NEXT:    v_alignbit_b32 v3, s11, s11, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    s_sub_i32 s2, 32, s12
; GFX8-NEXT:    v_alignbit_b32 v2, s10, s10, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s3
; GFX8-NEXT:    v_alignbit_b32 v1, s9, s9, v0
; GFX8-NEXT:    v_mov_b32_e32 v0, s2
; GFX8-NEXT:    v_mov_b32_e32 v5, s1
; GFX8-NEXT:    v_alignbit_b32 v0, s8, s8, v0
; GFX8-NEXT:    v_mov_b32_e32 v4, s0
; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT:    s_endpgm
;
; GFX10-LABEL: rotl_v4i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_sub_i32 s2, 32, s12
; GFX10-NEXT:    s_sub_i32 s3, 32, s13
; GFX10-NEXT:    s_sub_i32 s4, 32, s15
; GFX10-NEXT:    s_sub_i32 s5, 32, s14
; GFX10-NEXT:    v_alignbit_b32 v3, s11, s11, s4
; GFX10-NEXT:    v_alignbit_b32 v2, s10, s10, s5
; GFX10-NEXT:    v_alignbit_b32 v1, s9, s9, s3
; GFX10-NEXT:    v_alignbit_b32 v0, s8, s8, s2
; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: rotl_v4i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_sub_i32 s2, 32, s12
; GFX11-NEXT:    s_sub_i32 s3, 32, s13
; GFX11-NEXT:    s_sub_i32 s4, 32, s15
; GFX11-NEXT:    s_sub_i32 s5, 32, s14
; GFX11-NEXT:    v_alignbit_b32 v3, s11, s11, s4
; GFX11-NEXT:    v_alignbit_b32 v2, s10, s10, s5
; GFX11-NEXT:    v_alignbit_b32 v1, s9, s9, s3
; GFX11-NEXT:    v_alignbit_b32 v0, s8, s8, s2
; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
; GFX11-NEXT:    s_endpgm
entry:
  %0 = shl <4 x i32> %x, %y
  %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
  %2 = lshr <4 x i32> %x, %1
  %3 = or <4 x i32> %0, %2
  store <4 x i32> %3, ptr addrspace(1) %in
  ret void
}

declare i16 @llvm.fshl.i16(i16, i16, i16)

;; i16 rotate-left via the llvm.fshl.i16 intrinsic with matching operands,
;; loaded/stored at fixed element offsets (sourceA[16], sourceB[24],
;; destValues[4]).
define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
; R600-LABEL: test_rotl_i16:
; R600:       ; %bb.0: ; %entry
; R600-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
; R600-NEXT:    TEX 0 @8
; R600-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
; R600-NEXT:    TEX 0 @10
; R600-NEXT:    ALU 21, @14, KC0[CB0:0-32], KC1[]
; R600-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
; R600-NEXT:    Fetch clause starting at 8:
; R600-NEXT:     VTX_READ_16 T0.X, T0.X, 48, #1
; R600-NEXT:    Fetch clause starting at 10:
; R600-NEXT:     VTX_READ_16 T1.X, T1.X, 32, #1
; R600-NEXT:    ALU clause starting at 12:
; R600-NEXT:     MOV * T0.X, KC0[2].Z,
; R600-NEXT:    ALU clause starting at 13:
; R600-NEXT:     MOV * T1.X, KC0[2].Y,
; R600-NEXT:    ALU clause starting at 14:
; R600-NEXT:     SUB_INT T0.W, 0.0, T0.X,
; R600-NEXT:     AND_INT * T1.W, T0.X, literal.x,
; R600-NEXT:    15(2.101948e-44), 0(0.000000e+00)
; R600-NEXT:     AND_INT * T0.W, PV.W, literal.x,
; R600-NEXT:    15(2.101948e-44), 0(0.000000e+00)
; R600-NEXT:     LSHR T0.Z, T1.X, PV.W,
; R600-NEXT:     LSHL T0.W, T1.X, T1.W,
; R600-NEXT:     ADD_INT * T1.W, KC0[2].W, literal.x,
; R600-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; R600-NEXT:     AND_INT T2.W, PS, literal.x,
; R600-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
; R600-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; R600-NEXT:     AND_INT T0.W, PS, literal.x,
; R600-NEXT:     LSHL * T2.W, PV.W, literal.y,
; R600-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
; R600-NEXT:     LSHL T0.X, PV.W, PS,
; R600-NEXT:     LSHL * T0.W, literal.x, PS,
; R600-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; R600-NEXT:     MOV T0.Y, 0.0,
; R600-NEXT:     MOV * T0.Z, 0.0,
; R600-NEXT:     LSHR * T1.X, T1.W, literal.x,
; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; SI-LABEL: test_rotl_i16:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s4, s6
; SI-NEXT:    s_mov_b32 s5, s6
; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:48
; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:32
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    v_and_b32_e32 v1, 15, v2
; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, v1, v0
; SI-NEXT:    v_and_b32_e32 v2, 15, v2
; SI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_short v0, v[4:5], s[4:7], 0 addr64 offset:8
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_rotl_i16:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v2, v0
; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v2
; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v2, v0
; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
; GFX8-NEXT:    flat_store_short v[0:1], v2
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_rotl_i16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    global_load_ushort v6, v[2:3], off offset:48
; GFX10-NEXT:    global_load_ushort v7, v[0:1], off offset:32
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_sub_nc_u16 v0, 0, v6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b16 v1, v6, v7
; GFX10-NEXT:    v_lshrrev_b16 v0, v0, v7
; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX10-NEXT:    global_store_short v[4:5], v0, off offset:8
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_rotl_i16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    global_load_u16 v2, v[2:3], off offset:48
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:32
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_sub_nc_u16 v1, 0, v2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b16 v2, v2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_lshrrev_b16 v0, v1, v0
; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX11-NEXT:    global_store_b16 v[4:5], v0, off offset:8
; GFX11-NEXT:    s_setpc_b64 s[30:31]
entry:
  %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
  %a = load i16, ptr addrspace(1) %arrayidx
  %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %sourceB, i64 24
  %b = load i16, ptr addrspace(1) %arrayidx2
  %c = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %b)
  %arrayidx5 = getelementptr inbounds i16, ptr addrspace(1) %destValues, i64 4
  store i16 %c, ptr addrspace(1) %arrayidx5
  ret void
}