1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NODL %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9-DL %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10-DL %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10-DL %s 8 9; add(mul(S0.x, S1.x), 10; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) 11 12define amdgpu_kernel void @udot2(ptr addrspace(1) %src1, 13; GFX7-LABEL: udot2: 14; GFX7: ; %bb.0: ; %entry 15; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 16; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 17; GFX7-NEXT: s_mov_b32 s7, 0xf000 18; GFX7-NEXT: s_mov_b32 s10, 0 19; GFX7-NEXT: s_mov_b32 s11, s7 20; GFX7-NEXT: s_waitcnt lgkmcnt(0) 21; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 22; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 23; GFX7-NEXT: v_mov_b32_e32 v1, 0 24; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 25; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 26; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 27; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 28; GFX7-NEXT: s_mov_b32 s6, -1 29; GFX7-NEXT: s_waitcnt vmcnt(1) 30; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 31; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 32; GFX7-NEXT: s_waitcnt vmcnt(0) 33; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 34; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 35; GFX7-NEXT: s_waitcnt lgkmcnt(0) 36; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 37; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 38; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 39; GFX7-NEXT: s_endpgm 40; 41; GFX8-LABEL: udot2: 42; GFX8: ; %bb.0: ; %entry 43; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 44; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 
0x34 45; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 46; GFX8-NEXT: s_waitcnt lgkmcnt(0) 47; GFX8-NEXT: v_mov_b32_e32 v1, s1 48; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 49; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 50; GFX8-NEXT: flat_load_dword v3, v[0:1] 51; GFX8-NEXT: v_mov_b32_e32 v1, s3 52; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 53; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 54; GFX8-NEXT: flat_load_dword v0, v[0:1] 55; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 56; GFX8-NEXT: s_waitcnt vmcnt(1) 57; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 58; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 59; GFX8-NEXT: s_waitcnt vmcnt(0) 60; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 61; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 62; GFX8-NEXT: s_waitcnt lgkmcnt(0) 63; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 64; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 65; GFX8-NEXT: v_mov_b32_e32 v0, s4 66; GFX8-NEXT: v_mov_b32_e32 v1, s5 67; GFX8-NEXT: flat_store_dword v[0:1], v2 68; GFX8-NEXT: s_endpgm 69; 70; GFX9-NODL-LABEL: udot2: 71; GFX9-NODL: ; %bb.0: ; %entry 72; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 73; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 74; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 75; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 76; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 77; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 78; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 79; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 80; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 81; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 82; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 83; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 84; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 85; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 86; GFX9-NODL-NEXT: s_endpgm 87; 88; GFX9-DL-LABEL: udot2: 89; GFX9-DL: ; %bb.0: ; %entry 90; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 
0x24 91; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 92; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 93; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 94; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 95; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 96; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 97; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 98; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 99; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 100; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 101; GFX9-DL-NEXT: s_endpgm 102; 103; GFX10-DL-LABEL: udot2: 104; GFX10-DL: ; %bb.0: ; %entry 105; GFX10-DL-NEXT: s_clause 0x1 106; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 107; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 108; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 109; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 110; GFX10-DL-NEXT: s_clause 0x1 111; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 112; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 113; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 114; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 115; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 116; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 117; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 118; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 119; GFX10-DL-NEXT: s_endpgm 120 ptr addrspace(1) %src2, 121 ptr addrspace(1) nocapture %dst) { 122entry: 123 %idx = call i32 @llvm.amdgcn.workitem.id.x() 124 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 125 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 126 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx 127 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2 128 129 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 130 %conv = zext i16 %s1.elt1 to i32 131 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 132 %conv2 = zext i16 %s2.elt1 to i32 133 %mul1 = mul nuw i32 %conv2, %conv 134 135 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 136 %conv3 = zext i16 %s1.elt2 to i32 137 %s2.elt2 = extractelement <2 x i16> %vec2, i64 
1 138 %conv4 = zext i16 %s2.elt2 to i32 139 %mul2 = mul nuw i32 %conv4, %conv3 140 141 %s3 = load i32, ptr addrspace(1) %dst, align 4 142 %add = add i32 %mul2, %s3 143 %add6 = add i32 %add, %mul1 144 store i32 %add6, ptr addrspace(1) %dst, align 4 145 ret void 146} 147 148; TODO: Support this pattern 149; add(S3, 150; add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3) 151define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1, 152; GFX7-LABEL: udot2_MulMul: 153; GFX7: ; %bb.0: ; %entry 154; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 155; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 156; GFX7-NEXT: s_mov_b32 s7, 0xf000 157; GFX7-NEXT: s_mov_b32 s10, 0 158; GFX7-NEXT: s_mov_b32 s11, s7 159; GFX7-NEXT: s_waitcnt lgkmcnt(0) 160; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 161; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 162; GFX7-NEXT: v_mov_b32_e32 v1, 0 163; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 164; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 165; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 166; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 167; GFX7-NEXT: s_mov_b32 s6, -1 168; GFX7-NEXT: s_waitcnt vmcnt(1) 169; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 170; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 171; GFX7-NEXT: s_waitcnt vmcnt(0) 172; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 173; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 174; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v2 175; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 176; GFX7-NEXT: s_waitcnt lgkmcnt(0) 177; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 178; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 179; GFX7-NEXT: s_endpgm 180; 181; GFX8-LABEL: udot2_MulMul: 182; GFX8: ; %bb.0: ; %entry 183; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 184; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 185; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 186; GFX8-NEXT: s_waitcnt lgkmcnt(0) 187; GFX8-NEXT: v_mov_b32_e32 v1, s1 188; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 189; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 190; GFX8-NEXT: flat_load_dword v3, v[0:1] 191; GFX8-NEXT: v_mov_b32_e32 v1, s3 192; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 193; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 194; GFX8-NEXT: flat_load_dword v0, v[0:1] 195; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 196; GFX8-NEXT: s_waitcnt vmcnt(1) 197; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 198; GFX8-NEXT: s_waitcnt vmcnt(0) 199; GFX8-NEXT: v_mul_u32_u24_sdwa v1, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 200; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 201; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1 202; GFX8-NEXT: s_waitcnt lgkmcnt(0) 203; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 204; GFX8-NEXT: v_mov_b32_e32 v0, s4 205; GFX8-NEXT: v_mov_b32_e32 v1, s5 206; GFX8-NEXT: flat_store_dword v[0:1], v2 207; GFX8-NEXT: s_endpgm 208; 209; GFX9-NODL-LABEL: udot2_MulMul: 210; GFX9-NODL: ; %bb.0: ; %entry 211; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 212; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 213; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 214; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 215; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 216; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 217; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 218; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 219; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 220; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 221; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 222; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 223; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, s0 224; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 225; GFX9-NODL-NEXT: s_endpgm 226; 227; GFX9-DL-LABEL: udot2_MulMul: 228; GFX9-DL: ; %bb.0: ; %entry 229; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 230; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 231; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 
2, v0 232; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 233; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 234; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 235; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 236; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 237; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 238; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 239; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 240; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 241; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, s0 242; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 243; GFX9-DL-NEXT: s_endpgm 244; 245; GFX10-DL-LABEL: udot2_MulMul: 246; GFX10-DL: ; %bb.0: ; %entry 247; GFX10-DL-NEXT: s_clause 0x1 248; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 249; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 250; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 251; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 252; GFX10-DL-NEXT: s_clause 0x1 253; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 254; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 255; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 256; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 257; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 258; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 259; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 260; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 261; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 262; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0 263; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 264; GFX10-DL-NEXT: s_endpgm 265 ptr addrspace(1) %src2, 266 ptr addrspace(1) nocapture %dst) { 267entry: 268 %idx = call i32 @llvm.amdgcn.workitem.id.x() 269 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 270 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 271 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) 
%src2, i32 %idx 272 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2 273 274 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 275 %conv = zext i16 %s1.elt1 to i32 276 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 277 %conv2 = zext i16 %s2.elt1 to i32 278 %mul1 = mul nuw i32 %conv2, %conv 279 280 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 281 %conv3 = zext i16 %s1.elt2 to i32 282 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 283 %conv4 = zext i16 %s2.elt2 to i32 284 %mul2 = mul nuw i32 %conv4, %conv3 285 %s3 = load i32, ptr addrspace(1) %dst, align 4 286 %add = add i32 %mul2, %mul1 287 %add6 = add i32 %add, %s3 288 store i32 %add6, ptr addrspace(1) %dst, align 4 289 ret void 290} 291 292define amdgpu_kernel void @idot2(ptr addrspace(1) %src1, 293; GFX7-LABEL: idot2: 294; GFX7: ; %bb.0: ; %entry 295; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 296; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 297; GFX7-NEXT: s_mov_b32 s7, 0xf000 298; GFX7-NEXT: s_mov_b32 s10, 0 299; GFX7-NEXT: s_mov_b32 s11, s7 300; GFX7-NEXT: s_waitcnt lgkmcnt(0) 301; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 302; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 303; GFX7-NEXT: v_mov_b32_e32 v1, 0 304; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 305; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 306; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 307; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 308; GFX7-NEXT: s_mov_b32 s6, -1 309; GFX7-NEXT: s_waitcnt vmcnt(1) 310; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 311; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 312; GFX7-NEXT: s_waitcnt vmcnt(0) 313; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 314; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 315; GFX7-NEXT: s_waitcnt lgkmcnt(0) 316; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 317; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 318; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 319; GFX7-NEXT: s_endpgm 320; 321; GFX8-LABEL: idot2: 322; GFX8: ; %bb.0: ; %entry 323; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 324; GFX8-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x34 325; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 326; GFX8-NEXT: s_waitcnt lgkmcnt(0) 327; GFX8-NEXT: v_mov_b32_e32 v1, s1 328; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 329; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 330; GFX8-NEXT: flat_load_dword v3, v[0:1] 331; GFX8-NEXT: v_mov_b32_e32 v1, s3 332; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 333; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 334; GFX8-NEXT: flat_load_dword v0, v[0:1] 335; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 336; GFX8-NEXT: s_waitcnt vmcnt(1) 337; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 338; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 339; GFX8-NEXT: s_waitcnt vmcnt(0) 340; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 341; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 342; GFX8-NEXT: s_waitcnt lgkmcnt(0) 343; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 344; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 345; GFX8-NEXT: v_mov_b32_e32 v0, s4 346; GFX8-NEXT: v_mov_b32_e32 v1, s5 347; GFX8-NEXT: flat_store_dword v[0:1], v2 348; GFX8-NEXT: s_endpgm 349; 350; GFX9-NODL-LABEL: idot2: 351; GFX9-NODL: ; %bb.0: ; %entry 352; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 353; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 354; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 355; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 356; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 357; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 358; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 359; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 360; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 361; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 362; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 363; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 364; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 365; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 366; GFX9-NODL-NEXT: s_endpgm 367; 368; GFX9-DL-LABEL: 
idot2: 369; GFX9-DL: ; %bb.0: ; %entry 370; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 371; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 372; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 373; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 374; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 375; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 376; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 377; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 378; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 379; GFX9-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 380; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 381; GFX9-DL-NEXT: s_endpgm 382; 383; GFX10-DL-LABEL: idot2: 384; GFX10-DL: ; %bb.0: ; %entry 385; GFX10-DL-NEXT: s_clause 0x1 386; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 387; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 388; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 389; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 390; GFX10-DL-NEXT: s_clause 0x1 391; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 392; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 393; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 394; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 395; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 396; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 397; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0 398; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 399; GFX10-DL-NEXT: s_endpgm 400 ptr addrspace(1) %src2, 401 ptr addrspace(1) nocapture %dst) { 402entry: 403 %idx = call i32 @llvm.amdgcn.workitem.id.x() 404 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 405 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 406 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx 407 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2 408 409 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 410 %conv = sext i16 %s1.elt1 to i32 411 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 412 %conv2 = sext i16 %s2.elt1 to i32 413 %mul1 = mul nuw i32 %conv2, %conv 414 415 %s1.elt2 = extractelement <2 x i16> 
%vec1, i64 1 416 %conv3 = sext i16 %s1.elt2 to i32 417 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 418 %conv4 = sext i16 %s2.elt2 to i32 419 %mul2 = mul nuw i32 %conv4, %conv3 420 421 %s3 = load i32, ptr addrspace(1) %dst, align 4 422 %add = add i32 %mul2, %s3 423 %add6 = add i32 %add, %mul1 424 store i32 %add6, ptr addrspace(1) %dst, align 4 425 ret void 426} 427 428define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1, 429; GFX7-LABEL: idot2_MixedTypedMul: 430; GFX7: ; %bb.0: ; %entry 431; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 432; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 433; GFX7-NEXT: s_mov_b32 s7, 0xf000 434; GFX7-NEXT: s_mov_b32 s10, 0 435; GFX7-NEXT: s_mov_b32 s11, s7 436; GFX7-NEXT: s_waitcnt lgkmcnt(0) 437; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 438; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 439; GFX7-NEXT: v_mov_b32_e32 v1, 0 440; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 441; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 442; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 443; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 444; GFX7-NEXT: s_mov_b32 s6, -1 445; GFX7-NEXT: s_waitcnt vmcnt(1) 446; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 447; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 16 448; GFX7-NEXT: s_waitcnt vmcnt(0) 449; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 450; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 451; GFX7-NEXT: s_waitcnt lgkmcnt(0) 452; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 453; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v1 454; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 455; GFX7-NEXT: s_endpgm 456; 457; GFX8-LABEL: idot2_MixedTypedMul: 458; GFX8: ; %bb.0: ; %entry 459; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 460; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 461; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 462; GFX8-NEXT: s_waitcnt lgkmcnt(0) 463; GFX8-NEXT: v_mov_b32_e32 v1, s1 464; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 465; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 466; GFX8-NEXT: flat_load_dword v3, 
v[0:1] 467; GFX8-NEXT: v_mov_b32_e32 v1, s3 468; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 469; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 470; GFX8-NEXT: flat_load_dword v0, v[0:1] 471; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 472; GFX8-NEXT: s_waitcnt vmcnt(1) 473; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 474; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 475; GFX8-NEXT: s_waitcnt vmcnt(0) 476; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 477; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 478; GFX8-NEXT: s_waitcnt lgkmcnt(0) 479; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 480; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 481; GFX8-NEXT: v_mov_b32_e32 v0, s4 482; GFX8-NEXT: v_mov_b32_e32 v1, s5 483; GFX8-NEXT: flat_store_dword v[0:1], v2 484; GFX8-NEXT: s_endpgm 485; 486; GFX9-NODL-LABEL: idot2_MixedTypedMul: 487; GFX9-NODL: ; %bb.0: ; %entry 488; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 489; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 490; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 491; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 492; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 493; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 494; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 495; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 496; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 497; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 498; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 499; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 500; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 501; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 502; GFX9-NODL-NEXT: s_endpgm 503; 504; GFX9-DL-LABEL: idot2_MixedTypedMul: 505; GFX9-DL: ; %bb.0: ; %entry 506; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 507; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 508; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 509; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 510; GFX9-DL-NEXT: 
global_load_dword v1, v0, s[0:1] 511; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 512; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 513; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 514; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 515; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 516; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 517; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 518; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 519; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 520; GFX9-DL-NEXT: s_endpgm 521; 522; GFX10-DL-LABEL: idot2_MixedTypedMul: 523; GFX10-DL: ; %bb.0: ; %entry 524; GFX10-DL-NEXT: s_clause 0x1 525; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 526; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 527; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 528; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 529; GFX10-DL-NEXT: s_clause 0x1 530; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 531; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 532; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 533; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 534; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 535; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 536; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 537; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 538; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 539; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 540; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 541; GFX10-DL-NEXT: s_endpgm 542 ptr addrspace(1) %src2, 543 ptr addrspace(1) nocapture %dst) { 544entry: 545 %idx = call i32 @llvm.amdgcn.workitem.id.x() 546 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 547 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 548 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx 549 %vec2 = load 
<2 x i16>, ptr addrspace(1) %gep2 550 551 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 552 %conv = sext i16 %s1.elt1 to i32 553 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 554 %conv2 = sext i16 %s2.elt1 to i32 555 %mul1 = mul nuw i32 %conv2, %conv 556 557 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 558 %conv3 = zext i16 %s1.elt2 to i32 559 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 560 %conv4 = zext i16 %s2.elt2 to i32 561 %mul2 = mul nuw i32 %conv4, %conv3 562 563 %s3 = load i32, ptr addrspace(1) %dst, align 4 564 %add = add i32 %mul2, %s3 565 %add6 = add i32 %add, %mul1 566 store i32 %add6, ptr addrspace(1) %dst, align 4 567 ret void 568} 569 570define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1, 571; GFX7-LABEL: udot2_alt_AddOperands: 572; GFX7: ; %bb.0: ; %entry 573; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 574; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 575; GFX7-NEXT: s_mov_b32 s7, 0xf000 576; GFX7-NEXT: s_mov_b32 s10, 0 577; GFX7-NEXT: s_mov_b32 s11, s7 578; GFX7-NEXT: s_waitcnt lgkmcnt(0) 579; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 580; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 581; GFX7-NEXT: v_mov_b32_e32 v1, 0 582; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 583; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 584; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 585; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 586; GFX7-NEXT: s_mov_b32 s6, -1 587; GFX7-NEXT: s_waitcnt vmcnt(1) 588; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 589; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 590; GFX7-NEXT: s_waitcnt vmcnt(0) 591; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 592; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 593; GFX7-NEXT: s_waitcnt lgkmcnt(0) 594; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 595; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 596; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 597; GFX7-NEXT: s_endpgm 598; 599; GFX8-LABEL: udot2_alt_AddOperands: 600; GFX8: ; %bb.0: ; %entry 601; GFX8-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x24 602; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 603; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 604; GFX8-NEXT: s_waitcnt lgkmcnt(0) 605; GFX8-NEXT: v_mov_b32_e32 v1, s1 606; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 607; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 608; GFX8-NEXT: flat_load_dword v3, v[0:1] 609; GFX8-NEXT: v_mov_b32_e32 v1, s3 610; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 611; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 612; GFX8-NEXT: flat_load_dword v0, v[0:1] 613; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 614; GFX8-NEXT: s_waitcnt vmcnt(1) 615; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 616; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 617; GFX8-NEXT: s_waitcnt vmcnt(0) 618; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 619; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 620; GFX8-NEXT: s_waitcnt lgkmcnt(0) 621; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 622; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 623; GFX8-NEXT: v_mov_b32_e32 v0, s4 624; GFX8-NEXT: v_mov_b32_e32 v1, s5 625; GFX8-NEXT: flat_store_dword v[0:1], v2 626; GFX8-NEXT: s_endpgm 627; 628; GFX9-NODL-LABEL: udot2_alt_AddOperands: 629; GFX9-NODL: ; %bb.0: ; %entry 630; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 631; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 632; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 633; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 634; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 635; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 636; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 637; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 638; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 639; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 640; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 641; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 642; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 643; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 644; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 645; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 646; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 647; GFX9-NODL-NEXT: 
global_store_dword v0, v1, s[6:7] 648; GFX9-NODL-NEXT: s_endpgm 649; 650; GFX9-DL-LABEL: udot2_alt_AddOperands: 651; GFX9-DL: ; %bb.0: ; %entry 652; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 653; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 654; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 655; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 656; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 657; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 658; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 659; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 660; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 661; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 662; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 663; GFX9-DL-NEXT: s_endpgm 664; 665; GFX10-DL-LABEL: udot2_alt_AddOperands: 666; GFX10-DL: ; %bb.0: ; %entry 667; GFX10-DL-NEXT: s_clause 0x1 668; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 669; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 670; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 671; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 672; GFX10-DL-NEXT: s_clause 0x1 673; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 674; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 675; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 676; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 677; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 678; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 679; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 680; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 681; GFX10-DL-NEXT: s_endpgm 682 ptr addrspace(1) %src2, 683 ptr addrspace(1) nocapture %dst) { 684entry: 685 %idx = call i32 @llvm.amdgcn.workitem.id.x() 686 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 687 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 688 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx 689 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2 690 691 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 692 %conv = zext i16 %s1.elt1 to i32 693 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 
694 %conv2 = zext i16 %s2.elt1 to i32 695 %mul1 = mul nuw i32 %conv2, %conv 696 697 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 698 %conv3 = zext i16 %s1.elt2 to i32 699 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 700 %conv4 = zext i16 %s2.elt2 to i32 701 %mul2 = mul nuw i32 %conv4, %conv3 702 703 %s3 = load i32, ptr addrspace(1) %dst, align 4 704 %add = add i32 %s3, %mul2 705 %add6 = add i32 %mul1, %add 706 store i32 %add6, ptr addrspace(1) %dst, align 4 707 ret void 708} 709 710define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1, 711; GFX7-LABEL: idot2_MixedExt: 712; GFX7: ; %bb.0: ; %entry 713; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 714; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 715; GFX7-NEXT: s_mov_b32 s7, 0xf000 716; GFX7-NEXT: s_mov_b32 s10, 0 717; GFX7-NEXT: s_mov_b32 s11, s7 718; GFX7-NEXT: s_waitcnt lgkmcnt(0) 719; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 720; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 721; GFX7-NEXT: v_mov_b32_e32 v1, 0 722; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 723; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 724; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 725; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 726; GFX7-NEXT: s_mov_b32 s6, -1 727; GFX7-NEXT: s_waitcnt vmcnt(1) 728; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16 729; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 730; GFX7-NEXT: s_waitcnt vmcnt(0) 731; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0 732; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 733; GFX7-NEXT: s_waitcnt lgkmcnt(0) 734; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 735; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 736; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 737; GFX7-NEXT: s_endpgm 738; 739; GFX8-LABEL: idot2_MixedExt: 740; GFX8: ; %bb.0: ; %entry 741; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 742; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 743; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 744; GFX8-NEXT: s_waitcnt lgkmcnt(0) 745; GFX8-NEXT: v_mov_b32_e32 v1, s1 746; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, s0, v2 747; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 748; GFX8-NEXT: flat_load_dword v3, v[0:1] 749; GFX8-NEXT: v_mov_b32_e32 v1, s3 750; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 751; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 752; GFX8-NEXT: flat_load_dword v0, v[0:1] 753; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 754; GFX8-NEXT: s_waitcnt vmcnt(1) 755; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 756; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 757; GFX8-NEXT: s_waitcnt vmcnt(0) 758; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 759; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 760; GFX8-NEXT: s_waitcnt lgkmcnt(0) 761; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 762; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 763; GFX8-NEXT: v_mov_b32_e32 v0, s4 764; GFX8-NEXT: v_mov_b32_e32 v1, s5 765; GFX8-NEXT: flat_store_dword v[0:1], v2 766; GFX8-NEXT: s_endpgm 767; 768; GFX9-NODL-LABEL: idot2_MixedExt: 769; GFX9-NODL: ; %bb.0: ; %entry 770; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 771; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 772; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 773; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 774; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 775; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 776; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 777; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 778; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 779; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 780; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 781; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 782; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 783; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 784; GFX9-NODL-NEXT: s_endpgm 785; 786; GFX9-DL-LABEL: idot2_MixedExt: 787; GFX9-DL: ; %bb.0: ; %entry 788; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 789; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 790; 
GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 791; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 792; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 793; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 794; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 795; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 796; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 797; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 798; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 799; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 800; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 801; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 802; GFX9-DL-NEXT: s_endpgm 803; 804; GFX10-DL-LABEL: idot2_MixedExt: 805; GFX10-DL: ; %bb.0: ; %entry 806; GFX10-DL-NEXT: s_clause 0x1 807; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 808; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 809; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 810; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 811; GFX10-DL-NEXT: s_clause 0x1 812; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 813; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 814; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 815; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 816; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 817; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 818; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 819; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 820; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 821; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 822; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 823; GFX10-DL-NEXT: s_endpgm 824 ptr addrspace(1) %src2, 825 ptr addrspace(1) nocapture %dst) { 826entry: 827 %idx = call i32 @llvm.amdgcn.workitem.id.x() 828 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 829 %vec1 = load <2 x i16>, 
ptr addrspace(1) %gep1 830 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx 831 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2 832 833 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 834 %conv = sext i16 %s1.elt1 to i32 835 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0 836 %conv2 = zext i16 %s2.elt1 to i32 837 %mul1 = mul nuw i32 %conv2, %conv 838 839 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1 840 %conv3 = sext i16 %s1.elt2 to i32 841 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1 842 %conv4 = sext i16 %s2.elt2 to i32 843 %mul2 = mul nuw i32 %conv4, %conv3 844 845 %s3 = load i32, ptr addrspace(1) %dst, align 4 846 %add = add i32 %mul2, %s3 847 %add6 = add i32 %add, %mul1 848 store i32 %add6, ptr addrspace(1) %dst, align 4 849 ret void 850} 851 852define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1, 853; GFX7-LABEL: notudot2_SameVec: 854; GFX7: ; %bb.0: ; %entry 855; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 856; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 857; GFX7-NEXT: s_mov_b32 s7, 0xf000 858; GFX7-NEXT: s_mov_b32 s10, 0 859; GFX7-NEXT: s_mov_b32 s11, s7 860; GFX7-NEXT: s_waitcnt lgkmcnt(0) 861; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 862; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 863; GFX7-NEXT: v_mov_b32_e32 v1, 0 864; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 865; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 866; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 867; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 868; GFX7-NEXT: s_mov_b32 s6, -1 869; GFX7-NEXT: s_waitcnt vmcnt(1) 870; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 871; GFX7-NEXT: s_waitcnt vmcnt(0) 872; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 873; GFX7-NEXT: s_waitcnt lgkmcnt(0) 874; GFX7-NEXT: v_mad_u32_u24 v0, v0, v0, s0 875; GFX7-NEXT: v_mad_u32_u24 v0, v1, v1, v0 876; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 877; GFX7-NEXT: s_endpgm 878; 879; GFX8-LABEL: notudot2_SameVec: 880; GFX8: ; %bb.0: ; %entry 881; GFX8-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 882; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 883; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 884; GFX8-NEXT: s_waitcnt lgkmcnt(0) 885; GFX8-NEXT: v_mov_b32_e32 v1, s1 886; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 887; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 888; GFX8-NEXT: flat_load_dword v3, v[0:1] 889; GFX8-NEXT: v_mov_b32_e32 v1, s3 890; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 891; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 892; GFX8-NEXT: flat_load_dword v0, v[0:1] 893; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 894; GFX8-NEXT: s_waitcnt vmcnt(1) 895; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 896; GFX8-NEXT: s_waitcnt vmcnt(0) 897; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 898; GFX8-NEXT: s_waitcnt lgkmcnt(0) 899; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0 900; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0 901; GFX8-NEXT: v_mov_b32_e32 v0, s4 902; GFX8-NEXT: v_mov_b32_e32 v1, s5 903; GFX8-NEXT: flat_store_dword v[0:1], v2 904; GFX8-NEXT: s_endpgm 905; 906; GFX9-NODL-LABEL: notudot2_SameVec: 907; GFX9-NODL: ; %bb.0: ; %entry 908; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 909; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 910; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 911; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 912; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 913; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 914; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 915; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 916; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 917; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 918; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 919; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 920; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NODL-NEXT: v_add3_u32 v1, v2, s0, v1 922; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 923; GFX9-NODL-NEXT: s_endpgm 924; 925; GFX9-DL-LABEL: 
notudot2_SameVec: 926; GFX9-DL: ; %bb.0: ; %entry 927; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 928; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 929; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 930; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 931; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 932; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 933; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 934; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 935; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 936; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 937; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 938; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 939; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 940; GFX9-DL-NEXT: v_add3_u32 v1, v2, s0, v1 941; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 942; GFX9-DL-NEXT: s_endpgm 943; 944; GFX10-DL-LABEL: notudot2_SameVec: 945; GFX10-DL: ; %bb.0: ; %entry 946; GFX10-DL-NEXT: s_clause 0x1 947; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 948; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 949; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 950; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 951; GFX10-DL-NEXT: s_clause 0x1 952; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 953; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 954; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 955; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 956; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 957; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 958; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 959; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 960; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 961; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 962; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 963; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 964; GFX10-DL-NEXT: s_endpgm 965 ptr 
addrspace(1) %src2,
                                            ptr addrspace(1) nocapture %dst) {
entry:
  ; Body of @notudot2_SameVec. Each product below multiplies a lane by itself
  ; rather than pairing a lane of %vec1 with the matching lane of %vec2. The
  ; GFX9-DL and GFX10-DL assertions for this kernel contain two
  ; v_mul_u32_u24_sdwa plus a v_add3_u32 and no v_dot2_u32_u16.

  ; Load one <2 x i16> per work-item from %src1 and from %src2, indexed by
  ; workitem.id.x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  ; mul1 = zext(%vec1[0]) * zext(%vec1[0]).
  ; Note: %s2.elt1 is extracted from %vec1, not %vec2.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; mul2 = zext(%vec2[1]) * zext(%vec2[1]).
  ; Note: %s1.elt2 is extracted from %vec2, not %vec1.
  %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Add both products to the i32 currently stored at %dst and write the sum
  ; back to %dst.
  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1) %dst, align 4
  ret void
}

define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_v4i16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
;
GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1017; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1018; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0 1019; GFX7-NEXT: v_mad_u32_u24 v0, v3, v1, v0 1020; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 1021; GFX7-NEXT: s_endpgm 1022; 1023; GFX8-LABEL: udot2_v4i16: 1024; GFX8: ; %bb.0: ; %entry 1025; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1026; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1027; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1028; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1029; GFX8-NEXT: v_mov_b32_e32 v1, s1 1030; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1031; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1032; GFX8-NEXT: v_mov_b32_e32 v3, s3 1033; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 1034; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1035; GFX8-NEXT: flat_load_dword v0, v[0:1] 1036; GFX8-NEXT: flat_load_dword v1, v[2:3] 1037; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1038; GFX8-NEXT: s_waitcnt vmcnt(1) 1039; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 1040; GFX8-NEXT: s_waitcnt vmcnt(0) 1041; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v1 1042; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1043; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1044; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1045; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0 1046; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0 1047; GFX8-NEXT: v_mov_b32_e32 v0, s4 1048; GFX8-NEXT: v_mov_b32_e32 v1, s5 1049; GFX8-NEXT: flat_store_dword v[0:1], v2 1050; GFX8-NEXT: s_endpgm 1051; 1052; GFX9-NODL-LABEL: udot2_v4i16: 1053; GFX9-NODL: ; %bb.0: ; %entry 1054; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1055; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1056; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1057; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1058; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1059; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1060; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1061; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1062; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1063; GFX9-NODL-NEXT: 
v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1064; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1065; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1066; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1067; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1068; GFX9-NODL-NEXT: s_endpgm 1069; 1070; GFX9-DL-LABEL: udot2_v4i16: 1071; GFX9-DL: ; %bb.0: ; %entry 1072; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1073; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1074; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1075; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1076; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1077; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1078; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1079; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1080; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1081; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1082; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1083; GFX9-DL-NEXT: s_endpgm 1084; 1085; GFX10-DL-LABEL: udot2_v4i16: 1086; GFX10-DL: ; %bb.0: ; %entry 1087; GFX10-DL-NEXT: s_clause 0x1 1088; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1089; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1090; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1091; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1092; GFX10-DL-NEXT: s_clause 0x1 1093; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1094; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1095; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1096; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1097; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1098; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1099; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1100; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 1101; GFX10-DL-NEXT: s_endpgm 1102 ptr addrspace(1) %src2, 1103 ptr addrspace(1) nocapture %dst) { 1104entry: 1105 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1106 %gep1 = getelementptr <4 x i16>, ptr 
addrspace(1) %src1, i32 %idx
  ; Remainder of the @udot2_v4i16 body. The per-work-item operands are
  ; <4 x i16>, but only lanes 0 and 1 of each are consumed. The GFX9-DL and
  ; GFX10-DL assertions for this kernel load a single dword per operand and
  ; expect one v_dot2_u32_u16.
  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2

  ; mul1 = zext(%vec1[0]) * zext(%vec2[0]).
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; mul2 = zext(%vec1[1]) * zext(%vec2[1]).
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Add both products to the i32 currently stored at %dst and write the sum
  ; back to %dst.
  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1) %dst, align 4
  ret void
}

define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_v4i16_Hi:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, s0
; GFX7-NEXT:
v_mad_u32_u24 v0, v3, v1, v0 1156; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 1157; GFX7-NEXT: s_endpgm 1158; 1159; GFX8-LABEL: udot2_v4i16_Hi: 1160; GFX8: ; %bb.0: ; %entry 1161; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1162; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1163; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1164; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1165; GFX8-NEXT: v_mov_b32_e32 v1, s1 1166; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 1167; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1168; GFX8-NEXT: v_mov_b32_e32 v3, s3 1169; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v0 1170; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1171; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2 1172; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1173; GFX8-NEXT: flat_load_dword v2, v[0:1] 1174; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4 1175; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc 1176; GFX8-NEXT: flat_load_dword v0, v[0:1] 1177; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1178; GFX8-NEXT: s_waitcnt vmcnt(1) 1179; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2 1180; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1181; GFX8-NEXT: s_waitcnt vmcnt(0) 1182; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0 1183; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1184; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1185; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0 1186; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0 1187; GFX8-NEXT: v_mov_b32_e32 v0, s4 1188; GFX8-NEXT: v_mov_b32_e32 v1, s5 1189; GFX8-NEXT: flat_store_dword v[0:1], v2 1190; GFX8-NEXT: s_endpgm 1191; 1192; GFX9-NODL-LABEL: udot2_v4i16_Hi: 1193; GFX9-NODL: ; %bb.0: ; %entry 1194; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1195; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1196; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1197; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1198; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 1199; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 1200; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1201; GFX9-NODL-NEXT: 
v_mov_b32_e32 v0, 0 1202; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1203; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1204; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1205; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1206; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1207; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1208; GFX9-NODL-NEXT: s_endpgm 1209; 1210; GFX9-DL-LABEL: udot2_v4i16_Hi: 1211; GFX9-DL: ; %bb.0: ; %entry 1212; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1213; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1214; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1215; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 1217; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 1218; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1219; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1220; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1221; GFX9-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1222; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1223; GFX9-DL-NEXT: s_endpgm 1224; 1225; GFX10-DL-LABEL: udot2_v4i16_Hi: 1226; GFX10-DL: ; %bb.0: ; %entry 1227; GFX10-DL-NEXT: s_clause 0x1 1228; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1229; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1230; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 1231; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1232; GFX10-DL-NEXT: s_clause 0x1 1233; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] offset:4 1234; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] offset:4 1235; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1236; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1237; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1238; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1239; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0 1240; GFX10-DL-NEXT: global_store_dword v0, v1, s[6:7] 1241; GFX10-DL-NEXT: s_endpgm 1242 ptr addrspace(1) %src2, 1243 ptr 
addrspace(1) nocapture %dst) {
entry:
  ; Body of @udot2_v4i16_Hi. Same multiply-accumulate shape as the lane 0/1
  ; case, but taken from lanes 2 and 3 of the <4 x i16> operands. The
  ; assertions for this kernel load one dword per operand at a +4 byte offset
  ; and, on GFX9-DL and GFX10-DL, expect one v_dot2_u32_u16.

  ; Load one <4 x i16> per work-item from %src1 and from %src2, indexed by
  ; workitem.id.x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2

  ; mul1 = zext(%vec1[2]) * zext(%vec2[2]).
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; mul2 = zext(%vec1[3]) * zext(%vec2[3]).
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Add both products to the i32 currently stored at %dst and write the sum
  ; back to %dst.
  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1) %dst, align 4
  ret void
}

define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX7-LABEL: notudot2_v4i16_Even:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3]
; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11]
; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT:
v_and_b32_e32 v2, 0xffff, v2 1293; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 1294; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1295; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0 1296; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1297; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 1298; GFX7-NEXT: s_endpgm 1299; 1300; GFX8-LABEL: notudot2_v4i16_Even: 1301; GFX8: ; %bb.0: ; %entry 1302; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1303; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1304; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1305; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1306; GFX8-NEXT: v_mov_b32_e32 v1, s1 1307; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1308; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1309; GFX8-NEXT: v_mov_b32_e32 v3, s3 1310; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 1311; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1312; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1313; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1314; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1315; GFX8-NEXT: s_waitcnt vmcnt(1) 1316; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 1317; GFX8-NEXT: s_waitcnt vmcnt(0) 1318; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 1319; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 1320; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 1321; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1322; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0 1323; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 1324; GFX8-NEXT: v_mov_b32_e32 v0, s4 1325; GFX8-NEXT: v_mov_b32_e32 v1, s5 1326; GFX8-NEXT: flat_store_dword v[0:1], v2 1327; GFX8-NEXT: s_endpgm 1328; 1329; GFX9-NODL-LABEL: notudot2_v4i16_Even: 1330; GFX9-NODL: ; %bb.0: ; %entry 1331; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1332; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1333; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1334; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1335; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] 1336; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] 1337; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1338; GFX9-NODL-NEXT: 
v_mov_b32_e32 v4, 0 1339; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1340; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1341; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1342; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1343; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 1344; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7] 1345; GFX9-NODL-NEXT: s_endpgm 1346; 1347; GFX9-DL-LABEL: notudot2_v4i16_Even: 1348; GFX9-DL: ; %bb.0: ; %entry 1349; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1350; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1351; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1352; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1353; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] 1354; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] 1355; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1356; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 1357; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1358; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1359; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1360; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1361; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 1362; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7] 1363; GFX9-DL-NEXT: s_endpgm 1364; 1365; GFX10-DL-LABEL: notudot2_v4i16_Even: 1366; GFX10-DL: ; %bb.0: ; %entry 1367; GFX10-DL-NEXT: s_clause 0x1 1368; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1369; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1370; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1371; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1372; GFX10-DL-NEXT: s_clause 0x1 1373; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] 1374; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] 1375; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1376; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1377; 
GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                               ptr addrspace(1) %src2,
                                               ptr addrspace(1) nocapture %dst) {
entry:
  ; Body of @notudot2_v4i16_Even. It multiplies lanes 0 and 2 (the even lanes)
  ; of the <4 x i16> operands. The GFX9-DL and GFX10-DL assertions for this
  ; kernel expect two v_mul_u32_u24_sdwa, both selecting WORD_0 but from
  ; different registers, plus a v_add3_u32, and no v_dot2_u32_u16.

  ; Load one <4 x i16> per work-item from %src1 and from %src2, indexed by
  ; workitem.id.x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2

  ; mul1 = zext(%vec1[0]) * zext(%vec2[0]).
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; mul2 = zext(%vec1[2]) * zext(%vec2[2]).
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Add both products to the i32 currently stored at %dst and write the sum
  ; back to %dst.
  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1) %dst, align 4
  ret void
}

define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX7-LABEL: notudot2_v4i16_Middle:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0
1424; GFX7-NEXT: v_mov_b32_e32 v1, 0 1425; GFX7-NEXT: s_mov_b64 s[0:1], s[2:3] 1426; GFX7-NEXT: s_mov_b64 s[2:3], s[10:11] 1427; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 1428; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 1429; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 1430; GFX7-NEXT: s_mov_b32 s6, -1 1431; GFX7-NEXT: s_waitcnt vmcnt(1) 1432; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 1433; GFX7-NEXT: s_waitcnt vmcnt(0) 1434; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 1435; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1436; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1437; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1438; GFX7-NEXT: v_mad_u32_u24 v1, v1, v3, s0 1439; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 1440; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 1441; GFX7-NEXT: s_endpgm 1442; 1443; GFX8-LABEL: notudot2_v4i16_Middle: 1444; GFX8: ; %bb.0: ; %entry 1445; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1446; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1447; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 1448; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1449; GFX8-NEXT: v_mov_b32_e32 v1, s1 1450; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1451; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1452; GFX8-NEXT: v_mov_b32_e32 v3, s3 1453; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2 1454; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc 1455; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1456; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] 1457; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1458; GFX8-NEXT: s_waitcnt vmcnt(1) 1459; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 1460; GFX8-NEXT: s_waitcnt vmcnt(0) 1461; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v3 1462; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1463; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1464; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1465; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0 1466; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1 1467; GFX8-NEXT: v_mov_b32_e32 v0, s4 1468; GFX8-NEXT: v_mov_b32_e32 v1, s5 1469; GFX8-NEXT: flat_store_dword 
v[0:1], v2 1470; GFX8-NEXT: s_endpgm 1471; 1472; GFX9-NODL-LABEL: notudot2_v4i16_Middle: 1473; GFX9-NODL: ; %bb.0: ; %entry 1474; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1475; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1476; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1477; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1478; GFX9-NODL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] 1479; GFX9-NODL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] 1480; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1481; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0 1482; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1483; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1484; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1485; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1486; GFX9-NODL-NEXT: v_add3_u32 v0, v1, s0, v0 1487; GFX9-NODL-NEXT: global_store_dword v4, v0, s[6:7] 1488; GFX9-NODL-NEXT: s_endpgm 1489; 1490; GFX9-DL-LABEL: notudot2_v4i16_Middle: 1491; GFX9-DL: ; %bb.0: ; %entry 1492; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1493; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1494; GFX9-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0 1495; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1496; GFX9-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] 1497; GFX9-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] 1498; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1499; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0 1500; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1501; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1502; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1503; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1504; GFX9-DL-NEXT: v_add3_u32 v0, v1, s0, v0 1505; GFX9-DL-NEXT: global_store_dword v4, v0, s[6:7] 1506; GFX9-DL-NEXT: s_endpgm 1507; 1508; GFX10-DL-LABEL: notudot2_v4i16_Middle: 1509; 
GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                                 ptr addrspace(1) %src2,
                                                 ptr addrspace(1) nocapture %dst) {
entry:
  ; Body of @notudot2_v4i16_Middle. It multiplies lanes 1 and 2 (the two
  ; middle lanes) of the <4 x i16> operands. The GFX10-DL assertions above
  ; select WORD_1 from one pair of registers and WORD_0 from another, then
  ; combine the products with v_add3_u32; no v_dot2_u32_u16 is expected.

  ; Load one <4 x i16> per work-item from %src1 and from %src2, indexed by
  ; workitem.id.x.
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <4 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <4 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <4 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <4 x i16>, ptr addrspace(1) %gep2

  ; mul1 = zext(%vec1[1]) * zext(%vec2[1]).
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; mul2 = zext(%vec1[2]) * zext(%vec2[2]).
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Add both products to the i32 currently stored at %dst; the sum is stored
  ; back to %dst by the statement that follows.
  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1)
%dst, align 4
  ret void
}

; @notudot2_DiffIndex: multiplies zero-extended i16 lanes of two <2 x i16>
; loads with the lane indices crossed (lane 0 of %vec1 with lane 1 of %vec2,
; and lane 1 of %vec1 with lane 0 of %vec2), then adds both products to the
; i32 loaded from %dst and stores the sum back.
; The recorded output for every run line uses separate multiplies/mads plus an
; add; no v_dot2 instruction appears.
; NOTE(review): presumably a negative test for the dot2 combine because the
; lanes do not line up - confirm against the combine's matching rules.
define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX7-LABEL: notudot2_DiffIndex:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v1, s0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v2, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: notudot2_DiffIndex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT:
v_and_b32_e32 v1, 0xffff, v3 1602; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1603; GFX8-NEXT: s_waitcnt vmcnt(0) 1604; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0 1605; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 1606; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1607; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 1608; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 1609; GFX8-NEXT: v_mov_b32_e32 v0, s4 1610; GFX8-NEXT: v_mov_b32_e32 v1, s5 1611; GFX8-NEXT: flat_store_dword v[0:1], v2 1612; GFX8-NEXT: s_endpgm 1613; 1614; GFX9-NODL-LABEL: notudot2_DiffIndex: 1615; GFX9-NODL: ; %bb.0: ; %entry 1616; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1617; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1618; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1619; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1620; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1621; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1622; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1623; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1624; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1625; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1626; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1627; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1628; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3 1629; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1630; GFX9-NODL-NEXT: s_endpgm 1631; 1632; GFX9-DL-LABEL: notudot2_DiffIndex: 1633; GFX9-DL: ; %bb.0: ; %entry 1634; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1635; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1636; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1637; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1638; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1639; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1640; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1641; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1642; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1643; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1644; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1645; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1646; GFX9-DL-NEXT: v_add3_u32 v1, v1, s0, v3 1647; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1648; GFX9-DL-NEXT: s_endpgm 1649; 1650; GFX10-DL-LABEL: notudot2_DiffIndex: 1651; GFX10-DL: ; %bb.0: ; %entry 1652; GFX10-DL-NEXT: s_clause 0x1 1653; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1654; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1655; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1656; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1657; GFX10-DL-NEXT: s_clause 0x1 1658; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1659; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1660; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1661; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1662; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1663; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0 1664; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1 1665; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1666; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1667; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0 1668; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] 1669; GFX10-DL-NEXT: s_endpgm 1670 ptr addrspace(1) %src2, 1671 ptr addrspace(1) nocapture %dst) { 1672entry: 1673 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1674 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 1675 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 1676 %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx 1677 %vec2 = load <2 x i16>, ptr addrspace(1) %gep2 1678 1679 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0 1680 %conv = zext i16 %s1.elt1 to i32 1681 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1 1682 %conv2 = zext i16 %s2.elt1 to i32 1683 %mul1 = mul i32 
%conv2, %conv

  ; Second product: lane 1 of %vec1 is paired with lane 0 of %vec2.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1) %dst, align 4
  ret void
}

; @udot2_MultipleUses_add1: same-lane products of zero-extended i16 elements
; (lane 0 with lane 0, lane 1 with lane 1) accumulated onto the i32 loaded
; from %dst, where the partial sum %add1 (%mul2 + loaded value) is used twice:
; once to form %add2 and once more in the final %res that is stored.
; The recorded output for every run line keeps mad/mul plus add instructions;
; no v_dot2 instruction appears.
; NOTE(review): presumably the extra use of %add1 is what prevents the dot2
; fold here - confirm against the combine's use-count checks.
define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1734; GFX8-NEXT: v_mov_b32_e32 v1, s1 1735; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1736; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1737; GFX8-NEXT: flat_load_dword v3, v[0:1] 1738; GFX8-NEXT: v_mov_b32_e32 v1, s3 1739; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1740; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1741; GFX8-NEXT: flat_load_dword v0, v[0:1] 1742; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1743; GFX8-NEXT: s_waitcnt vmcnt(1) 1744; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 1745; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1746; GFX8-NEXT: s_waitcnt vmcnt(0) 1747; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 1748; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1749; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1750; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 1751; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 1752; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1753; GFX8-NEXT: v_mov_b32_e32 v0, s4 1754; GFX8-NEXT: v_mov_b32_e32 v1, s5 1755; GFX8-NEXT: flat_store_dword v[0:1], v2 1756; GFX8-NEXT: s_endpgm 1757; 1758; GFX9-NODL-LABEL: udot2_MultipleUses_add1: 1759; GFX9-NODL: ; %bb.0: ; %entry 1760; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1761; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1762; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1763; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1764; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1765; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1766; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1767; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1768; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1769; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1770; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1771; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1772; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1773; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 1774; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 1775; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1776; GFX9-NODL-NEXT: s_endpgm 1777; 1778; 
GFX9-DL-LABEL: udot2_MultipleUses_add1: 1779; GFX9-DL: ; %bb.0: ; %entry 1780; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1781; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1782; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1783; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1784; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1785; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1786; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1787; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1788; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1789; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1790; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1791; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1792; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1793; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 1794; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 1795; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1796; GFX9-DL-NEXT: s_endpgm 1797; 1798; GFX10-DL-LABEL: udot2_MultipleUses_add1: 1799; GFX10-DL: ; %bb.0: ; %entry 1800; GFX10-DL-NEXT: s_clause 0x1 1801; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1802; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1803; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1804; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1805; GFX10-DL-NEXT: s_clause 0x1 1806; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1807; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1808; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1809; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1810; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1811; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 1812; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1813; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1814; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1815; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 1816; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1817; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 1818; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 1819; 
GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add1 = add i32 %mul2, %s3
  %add2 = add i32 %add1, %mul1

  ; %add1 is consumed a second time here, in addition to its use in %add2.
  %res = add i32 %add2, %add1
  store i32 %res, ptr addrspace(1) %dst, align 4
  ret void
}

; @idot2_MultipleUses_add1: signed counterpart of the multiple-use-of-add
; case: same-lane products of sign-extended i16 elements are accumulated onto
; the i32 loaded from %dst, and the partial sum %add1 (%mul2 + loaded value)
; is used both in %add2 and in the final %res that is stored.
; The recorded output for every run line keeps mad/mul plus add instructions;
; no v_dot2 instruction appears.
; NOTE(review): presumably the extra use of %add1 is what prevents the dot2
; fold here - confirm against the combine's use-count checks.
define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-LABEL: idot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
1870; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1871; GFX7-NEXT: s_waitcnt vmcnt(0) 1872; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 1873; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 1874; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1875; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 1876; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 1877; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 1878; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 1879; GFX7-NEXT: s_endpgm 1880; 1881; GFX8-LABEL: idot2_MultipleUses_add1: 1882; GFX8: ; %bb.0: ; %entry 1883; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1884; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1885; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1886; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1887; GFX8-NEXT: v_mov_b32_e32 v1, s1 1888; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1889; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1890; GFX8-NEXT: flat_load_dword v3, v[0:1] 1891; GFX8-NEXT: v_mov_b32_e32 v1, s3 1892; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1893; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1894; GFX8-NEXT: flat_load_dword v0, v[0:1] 1895; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1896; GFX8-NEXT: s_waitcnt vmcnt(1) 1897; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 1898; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 1899; GFX8-NEXT: s_waitcnt vmcnt(0) 1900; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 1901; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 1902; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1903; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 1904; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0 1905; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 1906; GFX8-NEXT: v_mov_b32_e32 v0, s4 1907; GFX8-NEXT: v_mov_b32_e32 v1, s5 1908; GFX8-NEXT: flat_store_dword v[0:1], v2 1909; GFX8-NEXT: s_endpgm 1910; 1911; GFX9-NODL-LABEL: idot2_MultipleUses_add1: 1912; GFX9-NODL: ; %bb.0: ; %entry 1913; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1914; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1915; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1916; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1917; GFX9-NODL-NEXT: 
global_load_dword v1, v0, s[0:1] 1918; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1919; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1920; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1921; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1922; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1923; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 1924; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1925; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1926; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 1927; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 1928; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1929; GFX9-NODL-NEXT: s_endpgm 1930; 1931; GFX9-DL-LABEL: idot2_MultipleUses_add1: 1932; GFX9-DL: ; %bb.0: ; %entry 1933; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1934; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1935; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1936; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1937; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1938; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1939; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1940; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1941; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1942; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 1943; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 1944; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 1945; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1946; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 1947; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 1948; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1949; GFX9-DL-NEXT: s_endpgm 1950; 1951; GFX10-DL-LABEL: idot2_MultipleUses_add1: 1952; GFX10-DL: ; %bb.0: ; %entry 1953; GFX10-DL-NEXT: s_clause 0x1 1954; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1955; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1956; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1957; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1958; GFX10-DL-NEXT: 
s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add1 = add i32 %mul2, %s3
  %add2 = add i32 %add1, %mul1

  ; %add1 is consumed a second time here, in addition to its use in %add2.
  %res = add i32 %add2, %add1
  store i32 %res, ptr addrspace(1) %dst, align 4
  ret void
}

; @udot2_MultipleUses_mul1: same-lane products of zero-extended i16 elements
; where the lane-0 product %mul1 is used twice: it is first added to the i32
; loaded from %dst (%add0), and added again after %mul2 has been accumulated
; (%add2), so the stored value contains %mul1 two times.
; The recorded output for every run line keeps mad/mul plus add instructions;
; no v_dot2 instruction appears.
; NOTE(review): presumably the second use of %mul1 is what prevents the dot2
; fold here - confirm against the combine's use-count checks.
define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MultipleUses_mul1:
; GFX7: ;
%bb.0: ; %entry 2007; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2008; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 2009; GFX7-NEXT: s_mov_b32 s7, 0xf000 2010; GFX7-NEXT: s_mov_b32 s10, 0 2011; GFX7-NEXT: s_mov_b32 s11, s7 2012; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2013; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 2014; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2015; GFX7-NEXT: v_mov_b32_e32 v1, 0 2016; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2017; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 2018; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2019; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 2020; GFX7-NEXT: s_mov_b32 s6, -1 2021; GFX7-NEXT: s_waitcnt vmcnt(1) 2022; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 2023; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 2024; GFX7-NEXT: s_waitcnt vmcnt(0) 2025; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2026; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 2027; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX7-NEXT: v_mad_u32_u24 v4, v0, v2, s0 2029; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 2030; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 2031; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 2032; GFX7-NEXT: s_endpgm 2033; 2034; GFX8-LABEL: udot2_MultipleUses_mul1: 2035; GFX8: ; %bb.0: ; %entry 2036; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2037; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2038; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2039; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2040; GFX8-NEXT: v_mov_b32_e32 v1, s1 2041; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2042; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2043; GFX8-NEXT: flat_load_dword v3, v[0:1] 2044; GFX8-NEXT: v_mov_b32_e32 v1, s3 2045; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2046; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2047; GFX8-NEXT: flat_load_dword v0, v[0:1] 2048; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2049; GFX8-NEXT: s_waitcnt vmcnt(1) 2050; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3 2051; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2052; GFX8-NEXT: s_waitcnt vmcnt(0) 2053; 
GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 2054; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 2055; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2056; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0 2057; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 2058; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 2059; GFX8-NEXT: v_mov_b32_e32 v0, s4 2060; GFX8-NEXT: v_mov_b32_e32 v1, s5 2061; GFX8-NEXT: flat_store_dword v[0:1], v2 2062; GFX8-NEXT: s_endpgm 2063; 2064; GFX9-NODL-LABEL: udot2_MultipleUses_mul1: 2065; GFX9-NODL: ; %bb.0: ; %entry 2066; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2067; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2068; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2069; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2070; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2071; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2072; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 2073; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2074; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2075; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 2076; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2077; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 2078; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2079; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 2080; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2081; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 2082; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2 2083; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 2084; GFX9-NODL-NEXT: s_endpgm 2085; 2086; GFX9-DL-LABEL: udot2_MultipleUses_mul1: 2087; GFX9-DL: ; %bb.0: ; %entry 2088; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2089; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2090; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2091; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2092; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2093; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2094; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2095; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2096; GFX9-DL-NEXT: 
s_waitcnt vmcnt(1) 2097; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 2098; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2099; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2 2100; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2101; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v4, v3 2102; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2103; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, v3, s0 2104; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2 2105; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 2106; GFX9-DL-NEXT: s_endpgm 2107; 2108; GFX10-DL-LABEL: udot2_MultipleUses_mul1: 2109; GFX10-DL: ; %bb.0: ; %entry 2110; GFX10-DL-NEXT: s_clause 0x1 2111; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2112; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2113; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2114; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2115; GFX10-DL-NEXT: s_clause 0x1 2116; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2117; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2118; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 2119; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2120; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2121; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1 2122; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2123; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2 2124; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2125; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 2126; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2127; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 2128; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2129; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2 2130; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 2131; GFX10-DL-NEXT: s_endpgm 2132 ptr addrspace(1) %src2, 2133 ptr addrspace(1) nocapture %dst) { 2134entry: 2135 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2136 %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx 2137 %vec1 = load <2 x i16>, ptr addrspace(1) %gep1 2138 %gep2 = getelementptr 
<2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add0 = add i32 %mul1, %s3

  %add1 = add i32 %mul2, %add0
  ; %mul1 is consumed a second time here; its first use is in %add0.
  %add2 = add i32 %add1, %mul1

  store i32 %add2, ptr addrspace(1) %dst, align 4
  ret void
}

; @idot2_MultipleUses_mul1: signed counterpart of the multiple-use-of-mul
; case: same-lane products of sign-extended i16 elements where the lane-0
; product %mul1 is added to the i32 loaded from %dst (%add0) and added again
; after %mul2 has been accumulated (%add2).
; The recorded GFX7 output below uses three v_mad_i32_i24 instructions, and
; the other run lines keep mul/mad plus add; no v_dot2 instruction appears.
; NOTE(review): presumably the second use of %mul1 is what prevents the dot2
; fold here - confirm against the combine's use-count checks.
define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX7-LABEL: idot2_MultipleUses_mul1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v4, v3, v1, s0
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
;
GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 2191; GFX7-NEXT: s_endpgm 2192; 2193; GFX8-LABEL: idot2_MultipleUses_mul1: 2194; GFX8: ; %bb.0: ; %entry 2195; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2196; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 2197; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2198; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2199; GFX8-NEXT: v_mov_b32_e32 v1, s1 2200; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2201; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2202; GFX8-NEXT: flat_load_dword v3, v[0:1] 2203; GFX8-NEXT: v_mov_b32_e32 v1, s3 2204; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2205; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2206; GFX8-NEXT: flat_load_dword v0, v[0:1] 2207; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2208; GFX8-NEXT: s_waitcnt vmcnt(1) 2209; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16 2210; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3 2211; GFX8-NEXT: s_waitcnt vmcnt(0) 2212; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 2213; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 2214; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2215; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0 2216; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 2217; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 2218; GFX8-NEXT: v_mov_b32_e32 v0, s4 2219; GFX8-NEXT: v_mov_b32_e32 v1, s5 2220; GFX8-NEXT: flat_store_dword v[0:1], v2 2221; GFX8-NEXT: s_endpgm 2222; 2223; GFX9-NODL-LABEL: idot2_MultipleUses_mul1: 2224; GFX9-NODL: ; %bb.0: ; %entry 2225; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2226; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 2227; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2228; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2229; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2230; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2231; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 2232; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2233; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2234; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 2235; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2236; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16 
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v4, v3
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v3, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v2
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v2, v4, v3
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, v3, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v2
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 16
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add0 = add i32 %mul1, %s3

  %add1 = add i32 %mul2, %add0
  ; Second use of %mul1, after the one in %add0.
  %add2 = add i32 %add1, %mul1

  store i32 %add2, ptr addrspace(1) %dst, align 4
  ret void
}

; Unsigned variant in which %mul2, the product of the zero-extended element-1
; values, has two users (%add0 and %add1). None of the check blocks below
; contains a v_dot2 instruction; the sum is formed with mul/mad and add
; instructions instead.
define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_MultipleUses_mul2:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2, after the one in %add0.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, ptr addrspace(1) %dst, align 4
  ret void
}

; Signed counterpart of the kernel above. The elements are sign-extended and
; %mul2 again feeds both %add0 and %add1. None of the check blocks below
; contains a v_dot2 instruction.
define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-LABEL: idot2_MultipleUses_mul2:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i16>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = sext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add0 = add i32 %mul2, %s3

  ; Second use of %mul2, after the one in %add0.
  %add1 = add i32 %mul2, %add0
  %add2 = add i32 %add1, %mul1

  store i32 %add2, ptr addrspace(1) %dst, align 4
  ret void
}

; The multiplies and the accumulator are i16 here, with no extension to i32,
; and the result is stored back as i16. The checks show v_mad_u32_u24 for GFX7
; and 16-bit mads (v_mad_u16 or v_mad_legacy_u16) for the other prefixes, with
; no v_dot2 instruction.
define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX7-LABEL: udot2_acc16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: buffer_load_ushort v1, off, s[4:7], 0
; GFX7-NEXT: s_waitcnt vmcnt(2)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v4, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4
; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_acc16:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX9-NODL-NEXT: global_load_ushort v4, v1, s[6:7]
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
; GFX9-NODL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
; GFX9-NODL-NEXT: global_store_short v1, v0, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_acc16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX9-DL-NEXT: global_load_ushort v4, v1, s[6:7]
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v0, v5, v4
; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v2, v3, v0
; GFX9-DL-NEXT: global_store_short v1, v0, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot2_acc16:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4
; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0
; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %src1, i32 %idx
  %v1 = load <2 x i16>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %src2, i32 %idx
  %v2 = load <2 x i16>, ptr addrspace(1) %gep2

  %v1e1 = extractelement <2 x i16> %v1, i64 0
  %v2e1 = extractelement <2 x i16> %v2, i64 0
  %mul1 = mul i16 %v1e1, %v2e1

  %v1e2 = extractelement <2 x i16> %v1, i64 1
  %v2e2 = extractelement <2 x i16> %v2, i64 1
  %mul2 = mul i16 %v1e2, %v2e2

  %s2 = load i16, ptr addrspace(1) %dst, align 2
  %add1 = add i16 %mul2, %s2
  %add2 = add i16 %add1, %mul1
  store i16 %add2, ptr addrspace(1) %dst, align 2
  ret void
}

; The vector elements are i8 sign-extended to i32 rather than i16. No v_dot2
; instruction appears in the checks. The GFX9-DL and GFX10-DL blocks show a
; v_perm_b32 on each loaded value followed by v_dot4_i32_i8 or v_dot4c_i32_i8,
; while the remaining prefixes use mul/mad and add instructions.
define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX7-LABEL: notsdot2_sext8:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s10, 0
; GFX7-NEXT: s_mov_b32 s11, s7
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX7-NEXT: v_mov_b32_e32 v1, 0
; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3]
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX7-NEXT: s_mov_b32 s6, -1
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: notsdot2_sext8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: notsdot2_sext8:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-NODL-NEXT: global_load_ushort v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notsdot2_sext8:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX9-DL-NEXT: global_load_ushort v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0001
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_ushort v1, v0, s[0:1]
; GFX10-DL-NEXT: global_load_ushort v2, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7]
; GFX10-DL-NEXT: s_endpgm
                                          ptr addrspace(1) %src2,
                                          ptr addrspace(1) nocapture %dst) {
entry:
  %idx = call i32 @llvm.amdgcn.workitem.id.x()
  %gep1 = getelementptr <2 x i8>, ptr addrspace(1) %src1, i32 %idx
  %vec1 = load <2 x i8>, ptr addrspace(1) %gep1
  %gep2 = getelementptr <2 x i8>, ptr addrspace(1) %src2, i32 %idx
  %vec2 = load <2 x i8>, ptr addrspace(1) %gep2

  %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
  %conv = sext i8 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
  %conv2 = sext i8 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
  %conv3 = sext i8 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
  %conv4 = sext i8 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, ptr addrspace(1) %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, ptr addrspace(1) %dst, align 4
  ret void
}

; Intrinsic that every kernel above calls to obtain %idx, the index used to
; address %src1 and %src2.
declare i32 @llvm.amdgcn.workitem.id.x()