; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

; Codegen tests for logical shift right (lshr) on <2 x i16> and <4 x i16>
; vectors, run against five AMDGPU subtargets (see the RUN lines above).
; The expected assembly below is autogenerated; regenerate it with
; utils/update_llc_test_checks.py instead of editing it by hand.

; lshr of <2 x i16> where both the value and the shift amount are kernel
; arguments.
define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_lshr_v2i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_lshr_v2i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_and_b32 s4, s2, 0xffff
; VI-NEXT:    s_lshr_b32 s2, s2, 16
; VI-NEXT:    s_lshr_b32 s5, s3, 16
; VI-NEXT:    s_lshr_b32 s2, s2, s5
; VI-NEXT:    s_lshr_b32 s3, s4, s3
; VI-NEXT:    s_lshl_b32 s2, s2, 16
; VI-NEXT:    s_or_b32 s2, s3, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_lshr_v2i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b32 s4, s0
; CI-NEXT:    s_mov_b32 s5, s1
; CI-NEXT:    s_and_b32 s0, s2, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_lshr_b32 s2, s3, 16
; CI-NEXT:    s_lshr_b32 s1, s1, s2
; CI-NEXT:    s_lshl_b32 s1, s1, 16
; CI-NEXT:    s_lshr_b32 s0, s0, s3
; CI-NEXT:    s_or_b32 s0, s0, s1
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_lshr_v2i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: s_lshr_v2i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %result = lshr <2 x i16> %lhs, %rhs
  store <2 x i16> %result, ptr addrspace(1) %out
  ret void
}

; lshr of <2 x i16> where both operands are loaded from memory. The address
; is indexed by the workitem id; the shift amount %b is the <2 x i16> element
; immediately following the value %a.
define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v4, v1, v0
; VI-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v0, v4, v0
; VI-NEXT:    flat_store_dword v[2:3], v0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
; CI-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_lshr_v2i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_lshr_v2i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
  %a = load <2 x i16>, ptr addrspace(1) %in.gep
  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
  %result = lshr <2 x i16> %a, %b
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; lshr of <2 x i16> where the value is loaded from memory (per workitem) and
; the shift amount is the kernel argument %sgpr.
define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_v_s_v2i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s6, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_s_v2i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s1, s4, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v4, s4, v3
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_lshr_b32 s4, s8, 16
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT:    v_lshrrev_b32_e32 v3, s4, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: lshr_v_s_v2i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s4, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: lshr_v_s_v2i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s4, v1
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = lshr <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; lshr of <2 x i16> with the operands swapped relative to lshr_v_s_v2i16: the
; value is the kernel argument %sgpr and the shift amount is loaded from
; memory (per workitem).
define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: lshr_s_v_v2i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, s6
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_s_v_v2i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s1, s4, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v4, v3, s4
; VI-NEXT:    v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_lshr_b32 s4, s8, 16
; CI-NEXT:    s_and_b32 s5, s8, 0xffff
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshr_b32_e32 v3, s4, v3
; CI-NEXT:    v_lshr_b32_e32 v2, s5, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: lshr_s_v_v2i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, s4
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: lshr_s_v_v2i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v1, s4
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = lshr <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; lshr of <2 x i16> where the value is the constant splat <8, 8> and the shift
; amount is loaded from memory (per workitem).
define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v4, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v2, v3, 8
; VI-NEXT:    v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshr_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshr_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: lshr_imm_v_v2i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: lshr_imm_v_v2i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; lshr of <2 x i16> where the value is loaded from memory (per workitem) and
; the shift amount is the constant splat <8, 8>.
define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_imm_v2i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: lshr_v_imm_v2i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v2i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; lshr of <4 x i16> where both operands are loaded from memory. The address
; is indexed by the workitem id; the shift amount %b is the <4 x i16> element
; immediately following the value %a.
define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; CI-NEXT:    v_mov_b32_e32 v5, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
; CI-NEXT:    v_lshrrev_b32_e32 v3, v9, v7
; CI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
; CI-NEXT:    v_lshrrev_b32_e32 v2, v8, v6
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_or_b32_e32 v1, v1, v3
; CI-NEXT:    v_or_b32_e32 v0, v0, v2
; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_lshr_v4i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: v_lshr_v4i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX11-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
  %a = load <4 x i16>, ptr addrspace(1) %in.gep
  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
  %result = lshr <4 x i16> %a, %b
  store <4 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; lshr of <4 x i16> where the value is loaded from memory (per workitem) and
; the shift amount is the constant splat <8, 8, 8, 8>.
define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9:         ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI:         ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: lshr_v_imm_v4i16:
; CI:         ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v3, 0xff00ff, v3
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: lshr_v_imm_v4i16:
; GFX10:         ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v4i16:
; GFX11:         ; %bb.0:
; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
  %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, ptr addrspace(1) %out.gep
  ret void
}

; Intrinsic called by the tests above to obtain the per-workitem index.
declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }