; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s

; FIXME: Need to handle non-uniform case for function below (load without gep).
; FIXME: The v_or_b32 in the VI output should be unnecessary.
; Adds two <2 x i16> values obtained with volatile loads from %in0/%in1 indexed
; by the workitem id. The sum is stored to %out; %gep.out is computed but unused.
define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; VI-LABEL: v_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u16_e32 v3, v4, v2
; VI-NEXT: v_add_u16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_pk_add_u16 v1, v1, v2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; Adds two <2 x i16> values loaded (non-volatile) directly from the
; addrspace(4) pointers %in0 and %in1, with no workitem-id indexing.
define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0, ptr addrspace(4) %in1) #1 {
; VI-LABEL: s_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: s_load_dword s3, s[4:5], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: s_add_i32 s2, s2, s3
; VI-NEXT: s_add_i32 s0, s0, s1
; VI-NEXT: s_and_b32 s1, s2, 0xffff
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_add_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_pk_add_u16 v1, s5, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX10-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s4, s5
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %a = load <2 x i16>, ptr addrspace(4) %in0
  %b = load <2 x i16>, ptr addrspace(4) %in1
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; Adds a <2 x i16> value loaded from the addrspace(4) pointer %in0 to itself.
define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; VI-LABEL: s_test_add_self_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s0, s2, 16
; VI-NEXT: s_and_b32 s1, s2, 0xffff
; VI-NEXT: s_add_i32 s1, s1, s1
; VI-NEXT: s_add_i32 s0, s0, s0
; VI-NEXT: s_lshl_b32 s0, s0, 16
; VI-NEXT: s_and_b32 s1, s1, 0xffff
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_add_self_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_self_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_self_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %a = load <2 x i16>, ptr addrspace(4) %in0
  %add = add <2 x i16> %a, %a
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; FIXME: VI should not scalarize arg access.
; Adds two <2 x i16> values passed directly as kernel arguments.
define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 {
; VI-LABEL: s_test_add_v2i16_kernarg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: s_add_i32 s2, s2, s3
; VI-NEXT: s_add_i32 s4, s4, s5
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_lshl_b32 s3, s4, 16
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_add_v2i16_kernarg:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_pk_add_u16 v1, s2, v1
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_v2i16_kernarg:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, s2, s3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_v2i16_kernarg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, s2, s3
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; FIXME: Eliminate or with sdwa
; Adds the constant vector <123, 456> (0x7b, 0x1c8) to a volatile-loaded
; <2 x i16>. The sum is stored to %out; %gep.out is computed but unused.
define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_constant:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %add = add <2 x i16> %a, <i16 123, i16 456>
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds the constant vector <-845, -991> (0xfcb3, 0xfc21 as i16) to a
; volatile-loaded <2 x i16>. The sum is stored to %out; %gep.out is unused.
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_neg_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_neg_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_neg_constant:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_neg_constant:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %add = add <2 x i16> %a, <i16 -845, i16 -991>
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; Adds the splat constant <-1, -1> to a volatile-loaded <2 x i16>; the expected
; output below uses -1 directly as an operand. %gep.out is computed but unused.
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u16_e32 v4, -1, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %add = add <2 x i16> %a, <i16 -1, i16 -1>
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; Adds <32, 0> to a volatile-loaded <2 x i16>: only the low element changes.
; The sum is stored to %out; %gep.out is computed but unused.
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_u16_e32 v2, 32, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %add = add <2 x i16> %a, <i16 32, i16 0>
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; The high element (16256 = 0x3f80) with a zero low element packs to
; 0x3f800000, the bit pattern of the f32 value 1.0; the GFX9+ checks below
; expect it to be emitted as the fp operand 1.0.
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_fp_split:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %add = add <2 x i16> %a, <i16 0, i16 16256>
  store <2 x i16> %add, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds two volatile-loaded <2 x i16> values and zero-extends the sum to
; <2 x i32> before storing it to %out; %gep.out is computed but unused.
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v3, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u16_e32 v2, v4, v3
; VI-NEXT: v_add_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v1, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i32>
  store <2 x i32> %ext, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Adds two volatile-loaded <2 x i16> values and zero-extends the sum to
; <2 x i64> before storing it to %out; %gep.out is computed but unused.
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 {
; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v2, v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: v_add_u16_e32 v0, v6, v2
; VI-NEXT: v_add_u16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_add_v2i16_zext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v2, v3
; GFX9-NEXT: v_alignbit_b32 v2, 0, v0, 16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_zext_to_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v1, v2
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_alignbit_b32 v2, 0, v0, 16
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
; GFX11-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX11-NEXT: s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
  %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
  %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid
  %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0
  %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1
  %add = add <2 x i16> %a, %b
  %ext = zext <2 x i16> %add to <2 x i64>
  store <2 x i64> %ext, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; Kernel test: add two <2 x i16> vectors, then sign-extend the sum to <2 x i32>.
; Both inputs are read with volatile loads through GEPs indexed by
; llvm.amdgcn.workitem.id.x; in the expected output each load is followed by its
; own s_waitcnt vmcnt(0).
; NOTE(review): %gep.out is computed but never used; the store writes to %out
; directly. This looks related to the FIXME attached to this function - confirm
; before changing, since switching the store operand would alter the check lines.
; The tonga run expects two 16-bit adds (one in SDWA form) followed by v_bfe_i32
; for the sign extension; the gfx900, gfx1010 and gfx1100 runs expect a single
; v_pk_add_u16 followed by v_ashrrev_i32 / v_bfe_i32.
; The check lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
789define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { 790; VI-LABEL: v_test_add_v2i16_sext_to_v2i32: 791; VI: ; %bb.0: 792; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 793; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 794; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 795; VI-NEXT: s_waitcnt lgkmcnt(0) 796; VI-NEXT: v_mov_b32_e32 v1, s3 797; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 798; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 799; VI-NEXT: v_mov_b32_e32 v3, s5 800; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 801; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 802; VI-NEXT: flat_load_dword v4, v[0:1] glc 803; VI-NEXT: s_waitcnt vmcnt(0) 804; VI-NEXT: flat_load_dword v2, v[2:3] glc 805; VI-NEXT: s_waitcnt vmcnt(0) 806; VI-NEXT: v_mov_b32_e32 v0, s0 807; VI-NEXT: v_mov_b32_e32 v1, s1 808; VI-NEXT: v_add_u16_sdwa v3, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 809; VI-NEXT: v_add_u16_e32 v2, v4, v2 810; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 811; VI-NEXT: v_bfe_i32 v3, v3, 0, 16 812; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 813; VI-NEXT: s_endpgm 814; 815; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i32: 816; GFX9: ; %bb.0: 817; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 818; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 819; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 820; GFX9-NEXT: v_mov_b32_e32 v3, 0 821; GFX9-NEXT: s_waitcnt lgkmcnt(0) 822; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc 823; GFX9-NEXT: s_waitcnt vmcnt(0) 824; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc 825; GFX9-NEXT: s_waitcnt vmcnt(0) 826; GFX9-NEXT: v_pk_add_u16 v0, v1, v2 827; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 828; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 829; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] 830; GFX9-NEXT: s_endpgm 831; 832; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i32: 833; GFX10: ; %bb.0: 834; GFX10-NEXT: s_clause 0x1 835; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 836; 
GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 837; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 838; GFX10-NEXT: s_waitcnt lgkmcnt(0) 839; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc 840; GFX10-NEXT: s_waitcnt vmcnt(0) 841; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc 842; GFX10-NEXT: s_waitcnt vmcnt(0) 843; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 844; GFX10-NEXT: v_mov_b32_e32 v2, 0 845; GFX10-NEXT: v_ashrrev_i32_e32 v1, 16, v0 846; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 847; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 848; GFX10-NEXT: s_endpgm 849; 850; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i32: 851; GFX11: ; %bb.0: 852; GFX11-NEXT: s_clause 0x1 853; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 854; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 855; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 856; GFX11-NEXT: v_mov_b32_e32 v2, 0 857; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 858; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 859; GFX11-NEXT: s_waitcnt lgkmcnt(0) 860; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc 861; GFX11-NEXT: s_waitcnt vmcnt(0) 862; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc 863; GFX11-NEXT: s_waitcnt vmcnt(0) 864; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 865; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) 866; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v0 867; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 868; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] 869; GFX11-NEXT: s_endpgm 870 %tid = call i32 @llvm.amdgcn.workitem.id.x() 871 %gep.out = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i32 %tid 872 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid 873 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid 874 %a = load volatile <2 x i16>, ptr addrspace(1) %gep.in0 875 %b = load volatile <2 x i16>, ptr addrspace(1) %gep.in1 876 %add = add <2 x i16> %a, %b 877 %ext = sext <2 x i16> %add to <2 x i32> 878 store <2 x i32> %ext, ptr addrspace(1) %out 879 ret void 880} 881 882; FIXME: 
Need to handle non-uniform case for function below (load without gep). 883define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { 884; VI-LABEL: v_test_add_v2i16_sext_to_v2i64: 885; VI: ; %bb.0: 886; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 887; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 888; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 889; VI-NEXT: s_waitcnt lgkmcnt(0) 890; VI-NEXT: v_mov_b32_e32 v1, s3 891; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 892; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 893; VI-NEXT: v_mov_b32_e32 v3, s5 894; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 895; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 896; VI-NEXT: flat_load_dword v0, v[0:1] 897; VI-NEXT: flat_load_dword v1, v[2:3] 898; VI-NEXT: v_mov_b32_e32 v4, s0 899; VI-NEXT: v_mov_b32_e32 v5, s1 900; VI-NEXT: s_waitcnt vmcnt(0) 901; VI-NEXT: v_add_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 902; VI-NEXT: v_add_u16_e32 v0, v0, v1 903; VI-NEXT: v_bfe_i32 v0, v0, 0, 16 904; VI-NEXT: v_bfe_i32 v2, v2, 0, 16 905; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 906; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 907; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 908; VI-NEXT: s_endpgm 909; 910; GFX9-LABEL: v_test_add_v2i16_sext_to_v2i64: 911; GFX9: ; %bb.0: 912; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 913; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 914; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 915; GFX9-NEXT: v_mov_b32_e32 v4, 0 916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 917; GFX9-NEXT: global_load_dword v1, v0, s[2:3] 918; GFX9-NEXT: global_load_dword v2, v0, s[6:7] 919; GFX9-NEXT: s_waitcnt vmcnt(0) 920; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 921; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 922; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 923; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 924; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 925; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2 926; GFX9-NEXT: global_store_dwordx4 v4, 
v[0:3], s[0:1] 927; GFX9-NEXT: s_endpgm 928; 929; GFX10-LABEL: v_test_add_v2i16_sext_to_v2i64: 930; GFX10: ; %bb.0: 931; GFX10-NEXT: s_clause 0x1 932; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 933; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 934; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 935; GFX10-NEXT: v_mov_b32_e32 v4, 0 936; GFX10-NEXT: s_waitcnt lgkmcnt(0) 937; GFX10-NEXT: s_clause 0x1 938; GFX10-NEXT: global_load_dword v1, v0, s[2:3] 939; GFX10-NEXT: global_load_dword v2, v0, s[6:7] 940; GFX10-NEXT: s_waitcnt vmcnt(0) 941; GFX10-NEXT: v_pk_add_u16 v0, v1, v2 942; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 943; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16 944; GFX10-NEXT: v_bfe_i32 v2, v1, 0, 16 945; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 946; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 947; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 948; GFX10-NEXT: s_endpgm 949; 950; GFX11-LABEL: v_test_add_v2i16_sext_to_v2i64: 951; GFX11: ; %bb.0: 952; GFX11-NEXT: s_clause 0x1 953; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 954; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 955; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 956; GFX11-NEXT: v_mov_b32_e32 v4, 0 957; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 958; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 959; GFX11-NEXT: s_waitcnt lgkmcnt(0) 960; GFX11-NEXT: s_clause 0x1 961; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] 962; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] 963; GFX11-NEXT: s_waitcnt vmcnt(0) 964; GFX11-NEXT: v_pk_add_u16 v0, v1, v0 965; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 966; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 967; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16 968; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16 969; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 970; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 971; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 972; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] 973; GFX11-NEXT: s_endpgm 974 %tid = call 
i32 @llvm.amdgcn.workitem.id.x() 975 %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid 976 %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid 977 %gep.in1 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in1, i32 %tid 978 %a = load <2 x i16>, ptr addrspace(1) %gep.in0 979 %b = load <2 x i16>, ptr addrspace(1) %gep.in1 980 %add = add <2 x i16> %a, %b 981 %ext = sext <2 x i16> %add to <2 x i64> 982 store <2 x i64> %ext, ptr addrspace(1) %out 983 ret void 984} 985 986define <2 x i16> @add_inline_imm_neg1_0(<2 x i16> %x) { 987; VI-LABEL: add_inline_imm_neg1_0: 988; VI: ; %bb.0: 989; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 990; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 991; VI-NEXT: v_add_u16_e32 v0, -1, v0 992; VI-NEXT: v_or_b32_e32 v0, v0, v1 993; VI-NEXT: s_setpc_b64 s[30:31] 994; 995; GFX9-LABEL: add_inline_imm_neg1_0: 996; GFX9: ; %bb.0: 997; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 998; GFX9-NEXT: v_pk_sub_u16 v0, v0, 1 999; GFX9-NEXT: s_setpc_b64 s[30:31] 1000; 1001; GFX10-LABEL: add_inline_imm_neg1_0: 1002; GFX10: ; %bb.0: 1003; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1004; GFX10-NEXT: v_pk_sub_u16 v0, v0, 1 1005; GFX10-NEXT: s_setpc_b64 s[30:31] 1006; 1007; GFX11-LABEL: add_inline_imm_neg1_0: 1008; GFX11: ; %bb.0: 1009; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1010; GFX11-NEXT: v_pk_sub_u16 v0, v0, 1 1011; GFX11-NEXT: s_setpc_b64 s[30:31] 1012 %y = add <2 x i16> %x, <i16 -1, i16 0> 1013 ret <2 x i16> %y 1014} 1015 1016define <2 x i16> @add_inline_imm_1_0(<2 x i16> %x) { 1017; VI-LABEL: add_inline_imm_1_0: 1018; VI: ; %bb.0: 1019; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1020; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 1021; VI-NEXT: v_add_u16_e32 v0, 1, v0 1022; VI-NEXT: v_or_b32_e32 v0, v0, v1 1023; VI-NEXT: s_setpc_b64 s[30:31] 1024; 1025; GFX9-LABEL: add_inline_imm_1_0: 1026; GFX9: ; %bb.0: 1027; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1028; 
GFX9-NEXT: v_pk_add_u16 v0, v0, 1 1029; GFX9-NEXT: s_setpc_b64 s[30:31] 1030; 1031; GFX10-LABEL: add_inline_imm_1_0: 1032; GFX10: ; %bb.0: 1033; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1034; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 1035; GFX10-NEXT: s_setpc_b64 s[30:31] 1036; 1037; GFX11-LABEL: add_inline_imm_1_0: 1038; GFX11: ; %bb.0: 1039; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1040; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 1041; GFX11-NEXT: s_setpc_b64 s[30:31] 1042 %y = add <2 x i16> %x, <i16 1, i16 0> 1043 ret <2 x i16> %y 1044} 1045 1046declare i32 @llvm.amdgcn.workitem.id.x() #0 1047 1048attributes #0 = { nounwind readnone } 1049attributes #1 = { nounwind } 1050
; Notes on the definitions above (placed after them so the existing lines stay untouched):
; - @v_test_add_v2i16_sext_to_v2i64 adds two <2 x i16> vectors and sign-extends
;   the sum to <2 x i64>. It uses plain (non-volatile) loads, and in the expected
;   output a single s_waitcnt vmcnt(0) follows both loads.
;   NOTE(review): %gep.out is computed but never used; the store writes to %out
;   directly - confirm this is intentional before changing it.
; - @add_inline_imm_neg1_0 and @add_inline_imm_1_0 add the constant vectors
;   <i16 -1, i16 0> and <i16 1, i16 0> to their argument. For the gfx900, gfx1010
;   and gfx1100 runs the expected instruction is "v_pk_sub_u16 v0, v0, 1" and
;   "v_pk_add_u16 v0, v0, 1" respectively; the tonga run expects a 16-bit add on
;   the low half combined with the masked high half.
; - Attribute group #0 (nounwind readnone) is attached to the
;   llvm.amdgcn.workitem.id.x declaration; group #1 (nounwind) is attached to the
;   amdgpu_kernel definitions.