; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11

; Vector-operand case: both <2 x i16> inputs are volatile loads from
; workitem-indexed addresses. The difference is stored through the kernel
; argument %out itself; %gep.out is computed but never used.
; The gfx900/gfx1010/gfx1100 runs expect a single v_pk_sub_i16; the tonga run
; expects v_sub_u16 for the low half, an SDWA v_sub_u16 for the high half and
; a v_or_b32 to recombine them.
; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v1, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_sub_u16_e32 v2, v0, v1
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  %lhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %rhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
  %diff = sub <2 x i16> %lhs, %rhs
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; Scalar-operand case: both <2 x i16> inputs are plain loads through
; addrspace(4) pointer arguments, and the checks show them selected as s_load.
; The gfx900/gfx1010/gfx1100 runs still expect v_pk_sub_i16 on the loaded
; values; the tonga run expects the halves to be split with s_lshr_b32,
; subtracted with s_sub_i32 and repacked with s_and/s_lshl/s_or.
define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
; GFX9-LABEL: s_test_sub_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s10, s[8:9], 0x0
; GFX9-NEXT: s_load_dword s11, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_pk_sub_i16 v0, s11, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: s_sub_i32 s2, s2, s3
; VI-NEXT: s_sub_i32 s0, s0, s1
; VI-NEXT: s_and_b32 s1, s2, 0xffff
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v0, s4, s5
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v0, s2, s4
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %lhs = load <2 x i16>, ptr addrspace(4) %in0
  %rhs = load <2 x i16>, ptr addrspace(4) %in1
  %diff = sub <2 x i16> %lhs, %rhs
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; x - x: the loaded value is subtracted from itself. Every run expects the
; load and the subtraction to disappear, leaving only a store of constant 0.
define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_self_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %val = load <2 x i16>, ptr addrspace(4) %in0
  %diff = sub <2 x i16> %val, %val
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; Both <2 x i16> operands are passed directly as kernel arguments, so no
; explicit IR loads are involved.
; FIXME: VI should not scalarize arg access.
define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
; GFX9-LABEL: s_test_sub_v2i16_kernarg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: s_test_sub_v2i16_kernarg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: s_sub_i32 s0, s0, s1
; VI-NEXT: s_sub_i32 s1, s2, s3
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_v2i16_kernarg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3
; GFX10-NEXT: s_mov_b32 s4, s0
; GFX10-NEXT: s_mov_b32 s5, s1
; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_v2i16_kernarg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3
; GFX11-NEXT: s_mov_b32 s4, s0
; GFX11-NEXT: s_mov_b32 s5, s1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_endpgm
  %diff = sub <2 x i16> %a, %b
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; Subtract the non-inline constant <123, 456> from a volatile, workitem-indexed
; load. Packed as one dword the constant is 0x1c8007b (456 << 16 | 123), which
; is the literal the gfx900/gfx1010/gfx1100 checks feed to v_pk_sub_i16. The
; tonga checks instead add the negated halves (0xff85 == -123 and
; 0xfffffe38 == -456). %gep.out is computed but never used; the store goes to
; %out.
define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_constant:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %lhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %diff = sub <2 x i16> %lhs, <i16 123, i16 456>
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %lhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %diff = sub <2 x i16> %lhs, <i16 -845, i16 -991>
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; Subtract the splat <-1, -1>. The gfx900/gfx1010/gfx1100 checks use -1 as an
; operand of v_pk_sub_i16 directly; the tonga checks add 1 to each half
; instead. %gep.out is computed but never used; the store goes to %out.
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %lhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %diff = sub <2 x i16> %lhs, <i16 -1, i16 -1>
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; Subtract <32, 0>: only the low element is non-zero. The
; gfx900/gfx1010/gfx1100 checks pass 32 straight to v_pk_sub_i16; the tonga
; checks subtract 32 from the low half only and OR the untouched high 16 bits
; (masked with 0xffff0000) back in. %gep.out is computed but never used; the
; store goes to %out.
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %lhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %diff = sub <2 x i16> %lhs, <i16 32, i16 0>
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; The high element gives the bit pattern of a floating-point constant: the
; operand is <0, 16256>, and 16256 == 0x3f80, so the packed dword is
; 0x3f800000, i.e. 1.0 as an f32. The gfx900/gfx1010/gfx1100 checks therefore
; print the v_pk_sub_i16 operand as 1.0. The tonga checks add 0xffffc080
; (-16256) to the high half only and leave the low half unchanged. %gep.out is
; computed but never used; the store goes to %out.
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0
; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0
; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %lhs = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %diff = sub <2 x i16> %lhs, <i16 0, i16 16256>
  store <2 x i16> %diff, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
624define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { 625; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32: 626; GFX9: ; %bb.0: 627; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 628; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 629; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 630; GFX9-NEXT: s_waitcnt lgkmcnt(0) 631; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc 632; GFX9-NEXT: s_waitcnt vmcnt(0) 633; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc 634; GFX9-NEXT: s_waitcnt vmcnt(0) 635; GFX9-NEXT: s_mov_b32 s3, 0xf000 636; GFX9-NEXT: s_mov_b32 s2, -1 637; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 638; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 639; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 640; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 641; GFX9-NEXT: s_endpgm 642; 643; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32: 644; VI: ; %bb.0: 645; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 646; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 647; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 648; VI-NEXT: s_waitcnt lgkmcnt(0) 649; VI-NEXT: v_mov_b32_e32 v1, s3 650; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 651; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 652; VI-NEXT: v_mov_b32_e32 v3, s5 653; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 654; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 655; VI-NEXT: flat_load_dword v1, v[0:1] glc 656; VI-NEXT: s_waitcnt vmcnt(0) 657; VI-NEXT: flat_load_dword v2, v[2:3] glc 658; VI-NEXT: s_waitcnt vmcnt(0) 659; VI-NEXT: s_mov_b32 s3, 0xf000 660; VI-NEXT: s_mov_b32 s2, -1 661; VI-NEXT: v_sub_u16_e32 v0, v1, v2 662; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 663; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 664; VI-NEXT: s_endpgm 665; 666; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i32: 667; GFX10: ; %bb.0: 668; GFX10-NEXT: s_clause 0x1 669; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 670; GFX10-NEXT: s_load_dwordx2 
s[6:7], s[4:5], 0x34 671; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 672; GFX10-NEXT: s_waitcnt lgkmcnt(0) 673; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 674; GFX10-NEXT: s_waitcnt vmcnt(0) 675; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 676; GFX10-NEXT: s_waitcnt vmcnt(0) 677; GFX10-NEXT: s_waitcnt_depctr 0xffe3 678; GFX10-NEXT: s_mov_b32 s3, 0x31016000 679; GFX10-NEXT: s_mov_b32 s2, -1 680; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 681; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 682; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 683; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 684; GFX10-NEXT: s_endpgm 685; 686; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i32: 687; GFX11: ; %bb.0: 688; GFX11-NEXT: s_clause 0x1 689; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 690; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 691; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 692; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 693; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 694; GFX11-NEXT: s_waitcnt lgkmcnt(0) 695; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 696; GFX11-NEXT: s_waitcnt vmcnt(0) 697; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc 698; GFX11-NEXT: s_waitcnt vmcnt(0) 699; GFX11-NEXT: s_mov_b32 s3, 0x31016000 700; GFX11-NEXT: s_mov_b32 s2, -1 701; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 702; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 703; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 704; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 705; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 706; GFX11-NEXT: s_endpgm 707 %tid = call i32 @llvm.amdgcn.workitem.id.x() 708 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid 709 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid 710 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid 711 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0 712 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1 713 %add = sub <2 x i16> %a, %b 714 %ext = zext <2 
x i16> %add to <2 x i32> 715 store <2 x i32> %ext, ptr addrspace(1) %out 716 ret void 717} 718 719; FIXME: Need to handle non-uniform case for function below (load without gep). 720define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { 721; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64: 722; GFX9: ; %bb.0: 723; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 724; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 725; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 726; GFX9-NEXT: v_mov_b32_e32 v1, 0 727; GFX9-NEXT: s_waitcnt lgkmcnt(0) 728; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc 729; GFX9-NEXT: s_waitcnt vmcnt(0) 730; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc 731; GFX9-NEXT: s_waitcnt vmcnt(0) 732; GFX9-NEXT: s_mov_b32 s3, 0xf000 733; GFX9-NEXT: s_mov_b32 s2, -1 734; GFX9-NEXT: v_pk_sub_i16 v0, v2, v3 735; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16 736; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 737; GFX9-NEXT: v_mov_b32_e32 v3, v1 738; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 739; GFX9-NEXT: s_endpgm 740; 741; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64: 742; VI: ; %bb.0: 743; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 744; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 745; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 746; VI-NEXT: s_waitcnt lgkmcnt(0) 747; VI-NEXT: v_mov_b32_e32 v1, s3 748; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 749; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 750; VI-NEXT: v_mov_b32_e32 v3, s5 751; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 752; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 753; VI-NEXT: flat_load_dword v4, v[0:1] glc 754; VI-NEXT: s_waitcnt vmcnt(0) 755; VI-NEXT: flat_load_dword v2, v[2:3] glc 756; VI-NEXT: s_waitcnt vmcnt(0) 757; VI-NEXT: v_mov_b32_e32 v1, 0 758; VI-NEXT: s_mov_b32 s3, 0xf000 759; VI-NEXT: s_mov_b32 s2, -1 760; VI-NEXT: v_mov_b32_e32 v3, v1 761; VI-NEXT: v_sub_u16_e32 v0, v4, v2 762; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 763; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 764; VI-NEXT: s_endpgm 765; 766; GFX10-LABEL: v_test_sub_v2i16_zext_to_v2i64: 767; GFX10: ; %bb.0: 768; GFX10-NEXT: s_clause 0x1 769; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 770; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 771; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 772; GFX10-NEXT: s_waitcnt lgkmcnt(0) 773; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 774; GFX10-NEXT: s_waitcnt vmcnt(0) 775; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 776; GFX10-NEXT: s_waitcnt vmcnt(0) 777; GFX10-NEXT: s_waitcnt_depctr 0xffe3 778; GFX10-NEXT: s_mov_b32 s3, 0x31016000 779; GFX10-NEXT: s_mov_b32 s2, -1 780; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 781; GFX10-NEXT: v_mov_b32_e32 v1, 0 782; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16 783; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 784; GFX10-NEXT: v_mov_b32_e32 v3, v1 785; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 786; GFX10-NEXT: s_endpgm 787; 788; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: 789; GFX11: ; %bb.0: 790; GFX11-NEXT: s_clause 0x1 791; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 792; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 793; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 794; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 795; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 796; GFX11-NEXT: s_waitcnt lgkmcnt(0) 797; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 798; GFX11-NEXT: s_waitcnt vmcnt(0) 799; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc 800; GFX11-NEXT: s_waitcnt vmcnt(0) 801; GFX11-NEXT: s_mov_b32 s3, 0x31016000 802; GFX11-NEXT: s_mov_b32 s2, -1 803; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 804; GFX11-NEXT: v_mov_b32_e32 v1, 0 805; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 806; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 807; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 808; GFX11-NEXT: buffer_store_b128 v[0:3], 
off, s[0:3], 0 809; GFX11-NEXT: s_endpgm 810 %tid = call i32 @llvm.amdgcn.workitem.id.x() 811 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid 812 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid 813 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid 814 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0 815 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1 816 %add = sub <2 x i16> %a, %b 817 %ext = zext <2 x i16> %add to <2 x i64> 818 store <2 x i64> %ext, ptr addrspace(1) %out 819 ret void 820} 821 822; FIXME: Need to handle non-uniform case for function below (load without gep). NOTE(review): in the function below %gep.out is computed but never used and the store writes to %out directly, so the store address does not depend on the workitem id; presumably this is the uniform access the FIXME refers to - confirm before changing the IR, and regenerate the CHECK lines with utils/update_llc_test_checks.py if it is changed. 823define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { 824; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32: 825; GFX9: ; %bb.0: 826; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 827; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 828; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 829; GFX9-NEXT: s_waitcnt lgkmcnt(0) 830; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc 831; GFX9-NEXT: s_waitcnt vmcnt(0) 832; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc 833; GFX9-NEXT: s_waitcnt vmcnt(0) 834; GFX9-NEXT: s_mov_b32 s3, 0xf000 835; GFX9-NEXT: s_mov_b32 s2, -1 836; GFX9-NEXT: v_pk_sub_i16 v0, v1, v2 837; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 838; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 839; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 840; GFX9-NEXT: s_endpgm 841; 842; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32: 843; VI: ; %bb.0: 844; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 845; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 846; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 847; VI-NEXT: s_waitcnt lgkmcnt(0) 848; VI-NEXT: v_mov_b32_e32 v1, s3 849; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 850; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 851; VI-NEXT: v_mov_b32_e32 v3, s5 852; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 853; VI-NEXT:
v_addc_u32_e32 v3, vcc, 0, v3, vcc 854; VI-NEXT: flat_load_dword v0, v[0:1] glc 855; VI-NEXT: s_waitcnt vmcnt(0) 856; VI-NEXT: flat_load_dword v1, v[2:3] glc 857; VI-NEXT: s_waitcnt vmcnt(0) 858; VI-NEXT: s_mov_b32 s3, 0xf000 859; VI-NEXT: s_mov_b32 s2, -1 860; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 861; VI-NEXT: v_sub_u16_e32 v0, v0, v1 862; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 863; VI-NEXT: v_bfe_i32 v1, v2, 0, 16 864; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 865; VI-NEXT: s_endpgm 866; 867; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i32: 868; GFX10: ; %bb.0: 869; GFX10-NEXT: s_clause 0x1 870; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 871; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 872; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 873; GFX10-NEXT: s_waitcnt lgkmcnt(0) 874; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 875; GFX10-NEXT: s_waitcnt vmcnt(0) 876; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 877; GFX10-NEXT: s_waitcnt vmcnt(0) 878; GFX10-NEXT: s_waitcnt_depctr 0xffe3 879; GFX10-NEXT: s_mov_b32 s3, 0x31016000 880; GFX10-NEXT: s_mov_b32 s2, -1 881; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 882; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0 883; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 884; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 885; GFX10-NEXT: s_endpgm 886; 887; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i32: 888; GFX11: ; %bb.0: 889; GFX11-NEXT: s_clause 0x1 890; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 891; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 892; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 893; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 894; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 895; GFX11-NEXT: s_waitcnt lgkmcnt(0) 896; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 897; GFX11-NEXT: s_waitcnt vmcnt(0) 898; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc 899; GFX11-NEXT: s_waitcnt vmcnt(0) 900; GFX11-NEXT: s_mov_b32 s3, 0x31016000 901;
GFX11-NEXT: s_mov_b32 s2, -1 902; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 903; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 904; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 905; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 906; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 907; GFX11-NEXT: s_endpgm 908 %tid = call i32 @llvm.amdgcn.workitem.id.x() 909 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid 910 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid 911 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid 912 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0 913 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1 914 %add = sub <2 x i16> %a, %b 915 %ext = sext <2 x i16> %add to <2 x i32> 916 store <2 x i32> %ext, ptr addrspace(1) %out 917 ret void 918} 919 920; FIXME: Need to handle non-uniform case for function below (load without gep). NOTE(review): in the function below %gep.out is computed but never used and the store writes to %out directly, so the store address does not depend on the workitem id; presumably this is the uniform access the FIXME refers to - confirm before changing the IR, and regenerate the CHECK lines with utils/update_llc_test_checks.py if it is changed. Unlike the preceding sext_to_v2i32 test, the two loads here are not volatile. 921define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { 922; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64: 923; GFX9: ; %bb.0: 924; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 925; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 926; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 927; GFX9-NEXT: s_waitcnt lgkmcnt(0) 928; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 929; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 930; GFX9-NEXT: s_mov_b32 s3, 0xf000 931; GFX9-NEXT: s_mov_b32 s2, -1 932; GFX9-NEXT: s_waitcnt vmcnt(0) 933; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 934; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 935; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 936; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 937; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 938; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 939; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 940; GFX9-NEXT: s_endpgm 941; 942; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64: 943; VI: ; %bb.0: 944; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 945; VI-NEXT:
s_load_dwordx2 s[4:5], s[4:5], 0x34 946; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 947; VI-NEXT: s_waitcnt lgkmcnt(0) 948; VI-NEXT: v_mov_b32_e32 v1, s3 949; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 950; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 951; VI-NEXT: v_mov_b32_e32 v3, s5 952; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 953; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 954; VI-NEXT: flat_load_dword v0, v[0:1] 955; VI-NEXT: flat_load_dword v1, v[2:3] 956; VI-NEXT: s_mov_b32 s3, 0xf000 957; VI-NEXT: s_mov_b32 s2, -1 958; VI-NEXT: s_waitcnt vmcnt(0) 959; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 960; VI-NEXT: v_sub_u16_e32 v0, v0, v1 961; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 962; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 963; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 964; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 965; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 966; VI-NEXT: s_endpgm 967; 968; GFX10-LABEL: v_test_sub_v2i16_sext_to_v2i64: 969; GFX10: ; %bb.0: 970; GFX10-NEXT: s_clause 0x1 971; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 972; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 973; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 974; GFX10-NEXT: s_waitcnt lgkmcnt(0) 975; GFX10-NEXT: s_clause 0x1 976; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 977; GFX10-NEXT: global_load_dword v2, v0, s[6:7] 978; GFX10-NEXT: s_waitcnt_depctr 0xffe3 979; GFX10-NEXT: s_mov_b32 s3, 0x31016000 980; GFX10-NEXT: s_mov_b32 s2, -1 981; GFX10-NEXT: s_waitcnt vmcnt(0) 982; GFX10-NEXT: v_pk_sub_i16 v0, v1, v2 983; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 984; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 985; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16 986; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 987; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 988; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 989; GFX10-NEXT: s_endpgm 990; 991; GFX11-LABEL: v_test_sub_v2i16_sext_to_v2i64: 992; GFX11: ; %bb.0: 993; GFX11-NEXT: s_clause 0x1 994; GFX11-NEXT:
s_load_b128 s[0:3], s[4:5], 0x24 995; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 996; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 997; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 998; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 999; GFX11-NEXT: s_waitcnt lgkmcnt(0) 1000; GFX11-NEXT: s_clause 0x1 1001; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 1002; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] 1003; GFX11-NEXT: s_mov_b32 s3, 0x31016000 1004; GFX11-NEXT: s_mov_b32 s2, -1 1005; GFX11-NEXT: s_waitcnt vmcnt(0) 1006; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 1007; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 1008; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1009; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 1010; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 1011; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1012; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 1013; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 1014; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 1015; GFX11-NEXT: s_endpgm 1016 %tid = call i32 @llvm.amdgcn.workitem.id.x() 1017 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid 1018 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid 1019 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid 1020 %a = load <2 x i16>, ptr addrspace(1) %gep.in0 1021 %b = load <2 x i16>, ptr addrspace(1) %gep.in1 1022 %add = sub <2 x i16> %a, %b 1023 %ext = sext <2 x i16> %add to <2 x i64> 1024 store <2 x i64> %ext, ptr addrspace(1) %out 1025 ret void 1026} 1027 1028declare i32 @llvm.amdgcn.workitem.id.x() #0 1029 1030attributes #0 = { nounwind readnone } 1031attributes #1 = { nounwind }