; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

; Code generation tests for 'shl' on <2 x i16> and <4 x i16> vectors.
; Each kernel varies where the two operands come from (kernel argument,
; memory load, or constant 8). The expected assembly for every target listed
; in the run lines above is autogenerated; regenerate it with
; utils/update_llc_test_checks.py instead of editing the check lines by hand.

; Value and shift amount are both kernel arguments.
define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    s_mov_b32 s4, s0
; GFX9-NEXT:    s_mov_b32 s5, s1
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_lshr_b32 s0, s2, 16
; VI-NEXT:    s_lshr_b32 s1, s3, 16
; VI-NEXT:    s_lshl_b32 s0, s0, s1
; VI-NEXT:    s_lshl_b32 s1, s2, s3
; VI-NEXT:    s_lshl_b32 s0, s0, 16
; VI-NEXT:    s_and_b32 s1, s1, 0xffff
; VI-NEXT:    s_or_b32 s0, s1, s0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    s_lshr_b32 s0, s2, 16
; CI-NEXT:    s_lshr_b32 s1, s3, 16
; CI-NEXT:    s_lshl_b32 s0, s0, s1
; CI-NEXT:    s_lshl_b32 s1, s2, s3
; CI-NEXT:    s_lshl_b32 s0, s0, 16
; CI-NEXT:    s_and_b32 s1, s1, 0xffff
; CI-NEXT:    s_or_b32 s0, s1, s0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_shl_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_shl_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
; GFX11-NEXT:    s_mov_b32 s6, -1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
; GFX11-NEXT:    s_mov_b32 s4, s0
; GFX11-NEXT:    s_mov_b32 s5, s1
; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT:    s_endpgm
  %result = shl <2 x i16> %lhs, %rhs
  store <2 x i16> %result, ptr addrspace(1) %out
  ret void
}

; Value and shift amount are both loaded from memory: the value from the
; workitem's element of %in and the shift amount from the element after it.
define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v4, v1, v0
; VI-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v4, v0
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, v5, v4
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_shl_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
  %a = load <2 x i16>, ptr addrspace(1) %in.gep
  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; Value is loaded from memory; shift amount is the kernel argument %sgpr.
define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, s6, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s1, s4, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v4, s4, v3
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_lshr_b32 s4, s8, 16
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, s4, v3
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_v_s_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, s4, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: shl_v_s_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, s4, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = shl <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; Value is the kernel argument %sgpr; shift amount is loaded from memory.
define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s6
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s1, s4, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, s4
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_lshr_b32 s4, s8, 16
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshl_b32_e32 v3, s4, v3
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_s_v_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, s4
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: shl_s_v_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, s4
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = shl <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; Value is the constant splat of 8; shift amount is loaded from memory.
define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v4, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, 8
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshl_b32_e32 v3, 8, v3
; CI-NEXT:    v_and_b32_e32 v2, 0xfff8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: shl_imm_v_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; Value is loaded from memory; shift amount is the constant splat of 8.
define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: shl_v_imm_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; <4 x i16> variant of v_shl_v2i16: value and shift amount are both loaded
; from memory, from adjacent <4 x i16> elements of %in.
define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT:    v_mov_b32_e32 v5, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
; CI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
; CI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
; CI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
; CI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v1, v1, v2
; CI-NEXT:    v_or_b32_e32 v0, v0, v3
; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_shl_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
  %a = load <4 x i16>, ptr addrspace(1) %in.gep
  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; <4 x i16> variant of shl_v_imm_v2i16: value is loaded from memory and
; shifted by the constant splat of 8.
define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_and_b32_e32 v4, 0xff000000, v4
; VI-NEXT:    v_and_b32_e32 v0, 0xff000000, v0
; VI-NEXT:    v_or_b32_e32 v1, v1, v4
; VI-NEXT:    v_or_b32_e32 v0, v5, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_and_b32_e32 v3, 0xff00, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v4, 0xff00, v4
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v3, v4, v3
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: shl_v_imm_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; Intrinsic used by the per-workitem kernels above to index %in and %out.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }