1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s 3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s 4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-NODL %s 5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s 6; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s 7; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s 8; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s 9 10define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1, 11; GFX7-LABEL: idot4_acc32: 12; GFX7: ; %bb.0: ; %entry 13; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 14; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 15; GFX7-NEXT: s_mov_b32 s3, 0xf000 16; GFX7-NEXT: s_mov_b32 s6, 0 17; GFX7-NEXT: s_mov_b32 s7, s3 18; GFX7-NEXT: s_waitcnt lgkmcnt(0) 19; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 20; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 21; GFX7-NEXT: v_mov_b32_e32 v1, 0 22; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 23; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 24; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 25; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 26; GFX7-NEXT: s_mov_b32 s2, -1 27; GFX7-NEXT: s_waitcnt vmcnt(1) 28; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 29; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 30; GFX7-NEXT: s_waitcnt vmcnt(0) 31; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 32; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 33; GFX7-NEXT: s_waitcnt lgkmcnt(0) 34; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, s4 35; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 36; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 37; GFX7-NEXT: v_mad_i32_i24 v1, v3, v6, v1 38; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 39; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 40; GFX7-NEXT: v_mad_i32_i24 v1, v4, v7, v1 41; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 42; 
GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 43; GFX7-NEXT: s_endpgm 44; 45; GFX8-LABEL: idot4_acc32: 46; GFX8: ; %bb.0: ; %entry 47; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 48; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 49; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 50; GFX8-NEXT: s_waitcnt lgkmcnt(0) 51; GFX8-NEXT: v_mov_b32_e32 v1, s1 52; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 53; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 54; GFX8-NEXT: flat_load_dword v3, v[0:1] 55; GFX8-NEXT: v_mov_b32_e32 v1, s3 56; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 57; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 58; GFX8-NEXT: flat_load_dword v0, v[0:1] 59; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 60; GFX8-NEXT: s_waitcnt vmcnt(1) 61; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 62; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 63; GFX8-NEXT: v_bfe_i32 v6, v3, 16, 8 64; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 65; GFX8-NEXT: s_waitcnt vmcnt(0) 66; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 67; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 68; GFX8-NEXT: s_waitcnt lgkmcnt(0) 69; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 70; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 71; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 72; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 73; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 74; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 75; GFX8-NEXT: v_mov_b32_e32 v0, s4 76; GFX8-NEXT: v_mov_b32_e32 v1, s5 77; GFX8-NEXT: flat_store_dword v[0:1], v2 78; GFX8-NEXT: s_endpgm 79; 80; GFX9-NODL-LABEL: idot4_acc32: 81; GFX9-NODL: ; %bb.0: ; %entry 82; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 83; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 84; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 85; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 86; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 87; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 88; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 89; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 90; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 91; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, 
sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 92; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 93; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 94; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 95; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 96; GFX9-NODL-NEXT: v_add3_u32 v2, v3, s0, v4 97; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 98; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 99; GFX9-NODL-NEXT: s_endpgm 100; 101; GFX9-DL-LABEL: idot4_acc32: 102; GFX9-DL: ; %bb.0: ; %entry 103; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 104; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 105; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 106; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 107; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 108; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 109; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 110; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 111; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 112; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 113; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 114; GFX9-DL-NEXT: s_endpgm 115; 116; GFX10-DL-LABEL: idot4_acc32: 117; GFX10-DL: ; %bb.0: ; %entry 118; GFX10-DL-NEXT: s_clause 0x1 119; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 120; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 121; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 122; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 123; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 124; GFX10-DL-NEXT: s_clause 0x1 125; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 126; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 127; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 128; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 129; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 130; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 131; 
GFX10-DL-NEXT: s_waitcnt vmcnt(0) 132; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 133; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 134; GFX10-DL-NEXT: s_endpgm 135; 136; GFX11-DL-LABEL: idot4_acc32: 137; GFX11-DL: ; %bb.0: ; %entry 138; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 139; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 140; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 141; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 142; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 143; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 144; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 145; GFX11-DL-NEXT: s_clause 0x1 146; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 147; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 148; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 149; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 150; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0] 151; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 152; GFX11-DL-NEXT: s_endpgm 153 ptr addrspace(1) %src2, 154 ptr addrspace(1) nocapture %dst) { 155entry: 156 %idx = call i32 @llvm.amdgcn.workitem.id.x() 157 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 158 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 159 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 160 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 161 162 %v1e0 = extractelement <4 x i8> %vec1, i64 0 163 %cv1e0 = sext i8 %v1e0 to i32 164 %v2e0 = extractelement <4 x i8> %vec2, i64 0 165 %cv2e0 = sext i8 %v2e0 to i32 166 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 167 168 %v1e1 = extractelement <4 x i8> %vec1, i64 1 169 %cv1e1 = sext i8 %v1e1 to i32 170 %v2e1 = extractelement <4 x i8> %vec2, i64 1 171 %cv2e1 = sext i8 %v2e1 to i32 172 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 173 174 %v1e2 = extractelement <4 x i8> %vec1, i64 2 175 %cv1e2 = sext i8 %v1e2 to i32 176 %v2e2 = extractelement <4 x i8> %vec2, i64 2 177 %cv2e2 = sext i8 %v2e2 to i32 178 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 179 180 %v1e3 = extractelement <4 x i8> %vec1, 
i64 3 181 %cv1e3 = sext i8 %v1e3 to i32 182 %v2e3 = extractelement <4 x i8> %vec2, i64 3 183 %cv2e3 = sext i8 %v2e3 to i32 184 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 185 186 %acc = load i32, ptr addrspace(1) %dst, align 4 187 %add1 = add i32 %mul1, %acc 188 %add2 = add i32 %add1, %mul2 189 %add3 = add i32 %add2, %mul3 190 %add4 = add i32 %add3, %mul4 191 store i32 %add4, ptr addrspace(1) %dst, align 4 192 ret void 193} 194 195define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1, 196; GFX7-LABEL: idot4_acc16: 197; GFX7: ; %bb.0: ; %entry 198; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 199; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 200; GFX7-NEXT: s_mov_b32 s3, 0xf000 201; GFX7-NEXT: s_mov_b32 s6, 0 202; GFX7-NEXT: s_mov_b32 s7, s3 203; GFX7-NEXT: s_waitcnt lgkmcnt(0) 204; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 205; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 206; GFX7-NEXT: v_mov_b32_e32 v1, 0 207; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 208; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 209; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 210; GFX7-NEXT: s_mov_b32 s2, -1 211; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 212; GFX7-NEXT: s_waitcnt vmcnt(2) 213; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 214; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 215; GFX7-NEXT: s_waitcnt vmcnt(1) 216; GFX7-NEXT: v_bfe_i32 v6, v0, 0, 8 217; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 218; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8 219; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 220; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 221; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 222; GFX7-NEXT: v_bfe_i32 v8, v0, 16, 8 223; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 224; GFX7-NEXT: s_waitcnt vmcnt(0) 225; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 226; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 227; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 228; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 229; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 230; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 231; GFX7-NEXT: v_and_b32_e32 v2, 
0xffff, v2 232; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 233; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 234; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 235; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 236; GFX7-NEXT: s_endpgm 237; 238; GFX8-LABEL: idot4_acc16: 239; GFX8: ; %bb.0: ; %entry 240; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 241; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 242; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 243; GFX8-NEXT: s_waitcnt lgkmcnt(0) 244; GFX8-NEXT: v_mov_b32_e32 v1, s1 245; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 246; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 247; GFX8-NEXT: flat_load_dword v3, v[0:1] 248; GFX8-NEXT: v_mov_b32_e32 v1, s3 249; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 250; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 251; GFX8-NEXT: flat_load_dword v2, v[0:1] 252; GFX8-NEXT: v_mov_b32_e32 v0, s4 253; GFX8-NEXT: v_mov_b32_e32 v1, s5 254; GFX8-NEXT: flat_load_ushort v4, v[0:1] 255; GFX8-NEXT: s_waitcnt vmcnt(2) 256; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8 257; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 258; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 259; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 8 260; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 261; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 262; GFX8-NEXT: s_waitcnt vmcnt(1) 263; GFX8-NEXT: v_bfe_i32 v8, v2, 0, 8 264; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v2 265; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 266; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 8 267; GFX8-NEXT: s_waitcnt vmcnt(0) 268; GFX8-NEXT: v_mad_u16 v4, v7, v8, v4 269; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 270; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 271; GFX8-NEXT: v_mad_u16 v4, v9, v10, v4 272; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 273; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 274; GFX8-NEXT: v_mad_u16 v4, v5, v6, v4 275; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 276; GFX8-NEXT: flat_store_short v[0:1], v2 277; GFX8-NEXT: s_endpgm 278; 279; GFX9-NODL-LABEL: idot4_acc16: 280; GFX9-NODL: ; %bb.0: ; %entry 281; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 
282; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 283; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 284; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 285; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 286; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 287; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 288; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] 289; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 290; GFX9-NODL-NEXT: v_bfe_i32 v6, v1, 0, 8 291; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 292; GFX9-NODL-NEXT: v_bfe_i32 v7, v2, 0, 8 293; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 294; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 295; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 296; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 297; GFX9-NODL-NEXT: v_bfe_i32 v8, v8, 0, 8 298; GFX9-NODL-NEXT: v_bfe_i32 v9, v9, 0, 8 299; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 300; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v6, v7, v3 301; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 302; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 303; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 304; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 305; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v9, v3 306; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 307; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 308; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v4, v5, v3 309; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 310; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] 311; GFX9-NODL-NEXT: s_endpgm 312; 313; GFX9-DL-LABEL: idot4_acc16: 314; GFX9-DL: ; %bb.0: ; %entry 315; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 316; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 317; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 318; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 319; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 320; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 321; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] 322; GFX9-DL-NEXT: global_load_sshort v4, v1, s[6:7] 323; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 324; GFX9-DL-NEXT: v_dot4_i32_i8 v0, v2, v3, v4 325; GFX9-DL-NEXT: 
global_store_short v1, v0, s[6:7] 326; GFX9-DL-NEXT: s_endpgm 327; 328; GFX10-DL-LABEL: idot4_acc16: 329; GFX10-DL: ; %bb.0: ; %entry 330; GFX10-DL-NEXT: s_clause 0x1 331; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 332; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 333; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 334; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 335; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 336; GFX10-DL-NEXT: s_clause 0x1 337; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 338; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 339; GFX10-DL-NEXT: global_load_sshort v4, v1, s[6:7] 340; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 341; GFX10-DL-NEXT: v_dot4c_i32_i8 v4, v2, v3 342; GFX10-DL-NEXT: global_store_short v1, v4, s[6:7] 343; GFX10-DL-NEXT: s_endpgm 344; 345; GFX11-DL-LABEL: idot4_acc16: 346; GFX11-DL: ; %bb.0: ; %entry 347; GFX11-DL-NEXT: s_clause 0x1 348; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 349; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 350; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 351; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 352; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 353; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 354; GFX11-DL-NEXT: s_clause 0x1 355; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 356; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 357; GFX11-DL-NEXT: global_load_i16 v3, v1, s[4:5] 358; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 359; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0] 360; GFX11-DL-NEXT: global_store_b16 v1, v0, s[4:5] 361; GFX11-DL-NEXT: s_endpgm 362 ptr addrspace(1) %src2, 363 ptr addrspace(1) nocapture %dst) { 364entry: 365 %idx = call i32 @llvm.amdgcn.workitem.id.x() 366 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 367 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 368 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 369 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 370 371 %v1e0 = extractelement <4 x i8> %vec1, i64 0 372 %cv1e0 = sext 
i8 %v1e0 to i16 373 %v2e0 = extractelement <4 x i8> %vec2, i64 0 374 %cv2e0 = sext i8 %v2e0 to i16 375 %mul1 = mul nsw i16 %cv1e0, %cv2e0 376 377 %v1e1 = extractelement <4 x i8> %vec1, i64 1 378 %cv1e1 = sext i8 %v1e1 to i16 379 %v2e1 = extractelement <4 x i8> %vec2, i64 1 380 %cv2e1 = sext i8 %v2e1 to i16 381 %mul2 = mul nsw i16 %cv1e1, %cv2e1 382 383 %v1e2 = extractelement <4 x i8> %vec1, i64 2 384 %cv1e2 = sext i8 %v1e2 to i16 385 %v2e2 = extractelement <4 x i8> %vec2, i64 2 386 %cv2e2 = sext i8 %v2e2 to i16 387 %mul3 = mul nsw i16 %cv1e2, %cv2e2 388 389 %v1e3 = extractelement <4 x i8> %vec1, i64 3 390 %cv1e3 = sext i8 %v1e3 to i16 391 %v2e3 = extractelement <4 x i8> %vec2, i64 3 392 %cv2e3 = sext i8 %v2e3 to i16 393 %mul4 = mul nsw i16 %cv1e3, %cv2e3 394 395 %acc = load i16, ptr addrspace(1) %dst, align 2 396 %add1 = add i16 %mul1, %acc 397 %add2 = add i16 %add1, %mul2 398 %add3 = add i16 %add2, %mul3 399 %add4 = add i16 %add3, %mul4 400 store i16 %add4, ptr addrspace(1) %dst, align 2 401 ret void 402} 403 404define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1, 405; GFX7-LABEL: idot4_acc8: 406; GFX7: ; %bb.0: ; %entry 407; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 408; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 409; GFX7-NEXT: s_mov_b32 s3, 0xf000 410; GFX7-NEXT: s_mov_b32 s6, 0 411; GFX7-NEXT: s_mov_b32 s7, s3 412; GFX7-NEXT: s_waitcnt lgkmcnt(0) 413; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 414; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 415; GFX7-NEXT: v_mov_b32_e32 v1, 0 416; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 417; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 418; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 419; GFX7-NEXT: s_mov_b32 s2, -1 420; GFX7-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 421; GFX7-NEXT: s_waitcnt vmcnt(2) 422; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v2 423; GFX7-NEXT: v_bfe_u32 v4, v2, 8, 8 424; GFX7-NEXT: s_waitcnt vmcnt(1) 425; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v0 426; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 
8 427; GFX7-NEXT: s_waitcnt vmcnt(0) 428; GFX7-NEXT: v_mad_u32_u24 v1, v3, v6, v1 429; GFX7-NEXT: v_bfe_u32 v5, v2, 16, 8 430; GFX7-NEXT: v_bfe_u32 v8, v0, 16, 8 431; GFX7-NEXT: v_mad_u32_u24 v1, v4, v7, v1 432; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v2 433; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 434; GFX7-NEXT: v_mad_u32_u24 v1, v5, v8, v1 435; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 436; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 437; GFX7-NEXT: s_endpgm 438; 439; GFX8-LABEL: idot4_acc8: 440; GFX8: ; %bb.0: ; %entry 441; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 442; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 443; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 444; GFX8-NEXT: s_waitcnt lgkmcnt(0) 445; GFX8-NEXT: v_mov_b32_e32 v1, s1 446; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 447; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 448; GFX8-NEXT: flat_load_dword v3, v[0:1] 449; GFX8-NEXT: v_mov_b32_e32 v1, s3 450; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 451; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 452; GFX8-NEXT: flat_load_dword v2, v[0:1] 453; GFX8-NEXT: v_mov_b32_e32 v0, s4 454; GFX8-NEXT: v_mov_b32_e32 v1, s5 455; GFX8-NEXT: flat_load_ubyte v4, v[0:1] 456; GFX8-NEXT: s_waitcnt vmcnt(2) 457; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 458; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 459; GFX8-NEXT: v_lshrrev_b32_e32 v9, 24, v3 460; GFX8-NEXT: s_waitcnt vmcnt(1) 461; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 462; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 463; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v2 464; GFX8-NEXT: s_waitcnt vmcnt(0) 465; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 466; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 467; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 468; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 469; GFX8-NEXT: flat_store_byte v[0:1], v2 470; GFX8-NEXT: s_endpgm 471; 472; GFX9-NODL-LABEL: idot4_acc8: 473; GFX9-NODL: ; %bb.0: ; %entry 474; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 475; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 476; GFX9-NODL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 477; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 478; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 479; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 480; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 481; GFX9-NODL-NEXT: global_load_ubyte v3, v0, s[6:7] 482; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 483; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 484; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 485; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 486; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 487; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v8, 24, v1 488; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 489; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 490; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 491; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v6, v7, v1 492; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v9, 24, v2 493; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v4, v5, v1 494; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v8, v9, v1 495; GFX9-NODL-NEXT: global_store_byte v0, v1, s[6:7] 496; GFX9-NODL-NEXT: s_endpgm 497; 498; GFX9-DL-LABEL: idot4_acc8: 499; GFX9-DL: ; %bb.0: ; %entry 500; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 501; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 502; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 503; GFX9-DL-NEXT: v_mov_b32_e32 v1, 0 504; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 505; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 506; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3] 507; GFX9-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 508; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 509; GFX9-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 510; GFX9-DL-NEXT: global_store_byte v1, v0, s[6:7] 511; GFX9-DL-NEXT: s_endpgm 512; 513; GFX10-DL-LABEL: idot4_acc8: 514; GFX10-DL: ; %bb.0: ; %entry 515; GFX10-DL-NEXT: s_clause 0x1 516; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 517; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 518; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 519; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 520; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 521; GFX10-DL-NEXT: s_clause 0x1 522; 
GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 523; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] 524; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] 525; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 526; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v2, v3, v4 527; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] 528; GFX10-DL-NEXT: s_endpgm 529; 530; GFX11-DL-LABEL: idot4_acc8: 531; GFX11-DL: ; %bb.0: ; %entry 532; GFX11-DL-NEXT: s_clause 0x1 533; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 534; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 535; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 536; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 537; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 538; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 539; GFX11-DL-NEXT: s_clause 0x1 540; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 541; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 542; GFX11-DL-NEXT: global_load_u8 v3, v1, s[4:5] 543; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 544; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3 545; GFX11-DL-NEXT: global_store_b8 v1, v0, s[4:5] 546; GFX11-DL-NEXT: s_endpgm 547 ptr addrspace(1) %src2, 548 ptr addrspace(1) nocapture %dst) { 549entry: 550 %idx = call i32 @llvm.amdgcn.workitem.id.x() 551 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 552 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 553 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 554 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 555 556 %v1e0 = extractelement <4 x i8> %vec1, i64 0 557 %v2e0 = extractelement <4 x i8> %vec2, i64 0 558 %mul1 = mul i8 %v1e0, %v2e0 559 560 %v1e1 = extractelement <4 x i8> %vec1, i64 1 561 %v2e1 = extractelement <4 x i8> %vec2, i64 1 562 %mul2 = mul i8 %v1e1, %v2e1 563 564 %v1e2 = extractelement <4 x i8> %vec1, i64 2 565 %v2e2 = extractelement <4 x i8> %vec2, i64 2 566 %mul3 = mul i8 %v1e2, %v2e2 567 568 %v1e3 = extractelement <4 x i8> %vec1, i64 3 569 %v2e3 = extractelement <4 x i8> %vec2, i64 3 570 %mul4 = mul i8 %v1e3, %v2e3 
571 572 %acc = load i8, ptr addrspace(1) %dst, align 2 573 %add1 = add i8 %mul1, %acc 574 %add2 = add i8 %add1, %mul2 575 %add3 = add i8 %add2, %mul3 576 %add4 = add nsw i8 %add3, %mul4 577 store i8 %add4, ptr addrspace(1) %dst, align 2 578 ret void 579} 580 581define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1, 582; GFX7-LABEL: idot4_multiuse_mul1: 583; GFX7: ; %bb.0: ; %entry 584; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 585; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 586; GFX7-NEXT: s_mov_b32 s3, 0xf000 587; GFX7-NEXT: s_mov_b32 s6, 0 588; GFX7-NEXT: s_mov_b32 s7, s3 589; GFX7-NEXT: s_waitcnt lgkmcnt(0) 590; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 591; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 592; GFX7-NEXT: v_mov_b32_e32 v1, 0 593; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 594; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 595; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 596; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 597; GFX7-NEXT: s_mov_b32 s2, -1 598; GFX7-NEXT: s_waitcnt vmcnt(1) 599; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 600; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 601; GFX7-NEXT: s_waitcnt vmcnt(0) 602; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 603; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 604; GFX7-NEXT: s_waitcnt lgkmcnt(0) 605; GFX7-NEXT: v_mad_i32_i24 v8, v1, v5, s4 606; GFX7-NEXT: v_mad_i32_i24 v3, v3, v6, v8 607; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 608; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 609; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, v3 610; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 611; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 612; GFX7-NEXT: v_mad_i32_i24 v1, v4, v7, v1 613; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 614; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 615; GFX7-NEXT: s_endpgm 616; 617; GFX8-LABEL: idot4_multiuse_mul1: 618; GFX8: ; %bb.0: ; %entry 619; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 620; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 621; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 622; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) 623; GFX8-NEXT: v_mov_b32_e32 v1, s1 624; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 625; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 626; GFX8-NEXT: flat_load_dword v3, v[0:1] 627; GFX8-NEXT: v_mov_b32_e32 v1, s3 628; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 629; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 630; GFX8-NEXT: flat_load_dword v0, v[0:1] 631; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 632; GFX8-NEXT: s_waitcnt vmcnt(1) 633; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 634; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 635; GFX8-NEXT: v_bfe_i32 v6, v3, 16, 8 636; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 637; GFX8-NEXT: s_waitcnt vmcnt(0) 638; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 639; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 640; GFX8-NEXT: s_waitcnt lgkmcnt(0) 641; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0 642; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8 643; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 644; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4 645; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 646; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1 647; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 648; GFX8-NEXT: v_mov_b32_e32 v0, s4 649; GFX8-NEXT: v_mov_b32_e32 v1, s5 650; GFX8-NEXT: flat_store_dword v[0:1], v2 651; GFX8-NEXT: s_endpgm 652; 653; GFX9-NODL-LABEL: idot4_multiuse_mul1: 654; GFX9-NODL: ; %bb.0: ; %entry 655; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 656; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 657; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 658; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 659; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 660; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 661; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 662; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 663; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 664; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 665; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 666; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8 667; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 
668; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 669; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 670; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v2, v3, v4 671; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 672; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 673; GFX9-NODL-NEXT: v_add3_u32 v2, v5, v3, v2 674; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 675; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 676; GFX9-NODL-NEXT: s_endpgm 677; 678; GFX9-DL-LABEL: idot4_multiuse_mul1: 679; GFX9-DL: ; %bb.0: ; %entry 680; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 681; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 682; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 683; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 684; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 685; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 686; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 687; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 688; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 689; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8 690; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 691; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 8 692; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 693; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s0 694; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 695; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 696; GFX9-DL-NEXT: s_endpgm 697; 698; GFX10-DL-LABEL: idot4_multiuse_mul1: 699; GFX10-DL: ; %bb.0: ; %entry 700; GFX10-DL-NEXT: s_clause 0x1 701; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 702; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 703; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 704; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 705; GFX10-DL-NEXT: s_clause 0x1 706; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 707; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 708; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 709; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 710; GFX10-DL-NEXT: 
s_waitcnt vmcnt(1) 711; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 712; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 713; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8 714; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 715; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s0 716; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 717; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 718; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 719; GFX10-DL-NEXT: s_endpgm 720; 721; GFX11-DL-LABEL: idot4_multiuse_mul1: 722; GFX11-DL: ; %bb.0: ; %entry 723; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 724; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 725; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 726; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 727; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 728; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 729; GFX11-DL-NEXT: s_clause 0x1 730; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 731; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 732; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 733; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 734; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 735; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 736; GFX11-DL-NEXT: v_bfe_i32 v3, v0, 0, 8 737; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 738; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) 739; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s0 740; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0 741; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] 742; GFX11-DL-NEXT: global_store_b32 v3, v0, s[4:5] 743; GFX11-DL-NEXT: s_endpgm 744 ptr addrspace(1) %src2, 745 ptr addrspace(1) nocapture %dst) { 746entry: 747 %idx = call i32 @llvm.amdgcn.workitem.id.x() 748 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 749 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 750 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 751 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 752 753 %v1e0 = extractelement <4 x i8> %vec1, i64 0 754 %cv1e0 = sext i8 %v1e0 to i32 755 %v2e0 = extractelement <4 x i8> %vec2, i64 0 756 
%cv2e0 = sext i8 %v2e0 to i32 757 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 758 759 %v1e1 = extractelement <4 x i8> %vec1, i64 1 760 %cv1e1 = sext i8 %v1e1 to i32 761 %v2e1 = extractelement <4 x i8> %vec2, i64 1 762 %cv2e1 = sext i8 %v2e1 to i32 763 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 764 765 %v1e2 = extractelement <4 x i8> %vec1, i64 2 766 %cv1e2 = sext i8 %v1e2 to i32 767 %v2e2 = extractelement <4 x i8> %vec2, i64 2 768 %cv2e2 = sext i8 %v2e2 to i32 769 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 770 771 %v1e3 = extractelement <4 x i8> %vec1, i64 3 772 %cv1e3 = sext i8 %v1e3 to i32 773 %v2e3 = extractelement <4 x i8> %vec2, i64 3 774 %cv2e3 = sext i8 %v2e3 to i32 775 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 776 777 %acc = load i32, ptr addrspace(1) %dst, align 4 778 %add = add i32 %mul1, %acc 779 %add1 = add i32 %mul2, %add 780 %add2 = add i32 %add1, %mul1 781 %add3 = add i32 %add2, %mul3 782 %add4 = add i32 %add3, %mul4 783 784 store i32 %add4, ptr addrspace(1) %dst, align 4 785 ret void 786} 787 788define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1, 789; GFX7-LABEL: idot4_acc32_vecMul: 790; GFX7: ; %bb.0: ; %entry 791; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 792; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 793; GFX7-NEXT: s_mov_b32 s3, 0xf000 794; GFX7-NEXT: s_mov_b32 s6, 0 795; GFX7-NEXT: s_mov_b32 s7, s3 796; GFX7-NEXT: s_waitcnt lgkmcnt(0) 797; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 798; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 799; GFX7-NEXT: v_mov_b32_e32 v1, 0 800; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 801; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 802; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 803; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 804; GFX7-NEXT: s_mov_b32 s2, -1 805; GFX7-NEXT: s_waitcnt vmcnt(1) 806; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2 807; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8 808; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 809; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 8 810; GFX7-NEXT: s_waitcnt vmcnt(0) 811; 
GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v0 812; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8 813; GFX7-NEXT: v_bfe_i32 v7, v0, 8, 8 814; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 8 815; GFX7-NEXT: s_waitcnt lgkmcnt(0) 816; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, s4 817; GFX7-NEXT: v_mad_i32_i24 v0, v4, v7, v0 818; GFX7-NEXT: v_mad_i32_i24 v0, v3, v6, v0 819; GFX7-NEXT: v_mad_i32_i24 v0, v1, v5, v0 820; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 821; GFX7-NEXT: s_endpgm 822; 823; GFX8-LABEL: idot4_acc32_vecMul: 824; GFX8: ; %bb.0: ; %entry 825; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 826; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 827; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 828; GFX8-NEXT: s_waitcnt lgkmcnt(0) 829; GFX8-NEXT: v_mov_b32_e32 v1, s1 830; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 831; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 832; GFX8-NEXT: flat_load_dword v3, v[0:1] 833; GFX8-NEXT: v_mov_b32_e32 v1, s3 834; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 835; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 836; GFX8-NEXT: flat_load_dword v0, v[0:1] 837; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 838; GFX8-NEXT: s_waitcnt vmcnt(1) 839; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3 840; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3 841; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8 842; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 843; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8 844; GFX8-NEXT: s_waitcnt vmcnt(0) 845; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v0 846; GFX8-NEXT: v_ashrrev_i32_e32 v6, 24, v0 847; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8 848; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8 849; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 850; GFX8-NEXT: s_waitcnt lgkmcnt(0) 851; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0 852; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0 853; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0 854; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0 855; GFX8-NEXT: v_mov_b32_e32 v0, s4 856; GFX8-NEXT: v_mov_b32_e32 v1, s5 857; GFX8-NEXT: flat_store_dword v[0:1], v2 858; GFX8-NEXT: s_endpgm 859; 860; GFX9-NODL-LABEL: idot4_acc32_vecMul: 
861; GFX9-NODL: ; %bb.0: ; %entry 862; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 863; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 864; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 865; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 866; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 867; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 868; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 869; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 870; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 871; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 872; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 873; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2 874; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 875; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v3), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 876; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 877; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 878; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 879; GFX9-NODL-NEXT: v_add3_u32 v2, v5, s0, v3 880; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 881; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 882; GFX9-NODL-NEXT: s_endpgm 883; 884; GFX9-DL-LABEL: idot4_acc32_vecMul: 885; GFX9-DL: ; %bb.0: ; %entry 886; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 887; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 888; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 889; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 890; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 891; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 892; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 893; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 894; GFX9-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 895; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, s0 896; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 897; GFX9-DL-NEXT: 
s_endpgm 898; 899; GFX10-DL-LABEL: idot4_acc32_vecMul: 900; GFX10-DL: ; %bb.0: ; %entry 901; GFX10-DL-NEXT: s_clause 0x1 902; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 903; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 904; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 905; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 906; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 907; GFX10-DL-NEXT: s_clause 0x1 908; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 909; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 910; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 911; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 912; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 913; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 914; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 915; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 916; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 917; GFX10-DL-NEXT: s_endpgm 918; 919; GFX11-DL-LABEL: idot4_acc32_vecMul: 920; GFX11-DL: ; %bb.0: ; %entry 921; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 922; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 923; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 924; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 925; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 926; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 927; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 928; GFX11-DL-NEXT: s_clause 0x1 929; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 930; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 931; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 932; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 933; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0] 934; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 935; GFX11-DL-NEXT: s_endpgm 936 ptr addrspace(1) %src2, 937 ptr addrspace(1) nocapture %dst) { 938entry: 939 %idx = call i32 @llvm.amdgcn.workitem.id.x() 940 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 941 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 942 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 943 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 
944 945 %cvec1 = sext <4 x i8> %vec1 to <4 x i32> 946 %cvec2 = sext <4 x i8> %vec2 to <4 x i32> 947 948 %mul = mul <4 x i32> %cvec1, %cvec2 949 %mul0 = extractelement <4 x i32> %mul, i64 0 950 %mul1 = extractelement <4 x i32> %mul, i64 1 951 %mul2 = extractelement <4 x i32> %mul, i64 2 952 %mul3 = extractelement <4 x i32> %mul, i64 3 953 954 %acc = load i32, ptr addrspace(1) %dst, align 4 955 %add1 = add i32 %mul0, %acc 956 %add2 = add i32 %add1, %mul1 957 %add3 = add i32 %add2, %mul2 958 %add4 = add i32 %add3, %mul3 959 960 store i32 %add4, ptr addrspace(1) %dst, align 4 961 ret void 962} 963 964define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1, 965; GFX7-LABEL: idot4_acc16_vecMul: 966; GFX7: ; %bb.0: ; %entry 967; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 968; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 969; GFX7-NEXT: s_mov_b32 s3, 0xf000 970; GFX7-NEXT: s_mov_b32 s6, 0 971; GFX7-NEXT: s_mov_b32 s7, s3 972; GFX7-NEXT: s_waitcnt lgkmcnt(0) 973; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 974; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 975; GFX7-NEXT: v_mov_b32_e32 v1, 0 976; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 977; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 978; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 979; GFX7-NEXT: s_mov_b32 s2, -1 980; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0 981; GFX7-NEXT: s_waitcnt vmcnt(2) 982; GFX7-NEXT: v_bfe_i32 v4, v2, 0, 8 983; GFX7-NEXT: v_bfe_i32 v3, v2, 16, 8 984; GFX7-NEXT: s_waitcnt vmcnt(1) 985; GFX7-NEXT: v_bfe_i32 v7, v0, 0, 8 986; GFX7-NEXT: v_ashrrev_i32_e32 v5, 24, v2 987; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 988; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 989; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8 990; GFX7-NEXT: v_ashrrev_i32_e32 v8, 24, v0 991; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 992; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 993; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 994; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 995; GFX7-NEXT: s_waitcnt vmcnt(0) 996; GFX7-NEXT: v_mad_u32_u24 
v1, v4, v7, v1 997; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 998; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 999; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 1000; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 1001; GFX7-NEXT: v_and_b32_e32 v8, 0xffff, v8 1002; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0 1003; GFX7-NEXT: v_mad_u32_u24 v0, v5, v8, v0 1004; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 1005; GFX7-NEXT: s_endpgm 1006; 1007; GFX8-LABEL: idot4_acc16_vecMul: 1008; GFX8: ; %bb.0: ; %entry 1009; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1010; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1011; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1012; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1013; GFX8-NEXT: v_mov_b32_e32 v1, s1 1014; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1015; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1016; GFX8-NEXT: flat_load_dword v3, v[0:1] 1017; GFX8-NEXT: v_mov_b32_e32 v1, s3 1018; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1019; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1020; GFX8-NEXT: flat_load_dword v2, v[0:1] 1021; GFX8-NEXT: v_mov_b32_e32 v0, s4 1022; GFX8-NEXT: v_mov_b32_e32 v1, s5 1023; GFX8-NEXT: flat_load_ushort v4, v[0:1] 1024; GFX8-NEXT: s_waitcnt vmcnt(2) 1025; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 1026; GFX8-NEXT: v_ashrrev_i16_e32 v7, 8, v3 1027; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 1028; GFX8-NEXT: v_ashrrev_i16_e32 v9, 8, v5 1029; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 1030; GFX8-NEXT: s_waitcnt vmcnt(1) 1031; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v2 1032; GFX8-NEXT: v_ashrrev_i16_e32 v8, 8, v2 1033; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8 1034; GFX8-NEXT: s_waitcnt vmcnt(0) 1035; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 1036; GFX8-NEXT: v_ashrrev_i16_e32 v10, 8, v6 1037; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 8 1038; GFX8-NEXT: v_mad_u16 v2, v7, v8, v2 1039; GFX8-NEXT: v_mad_u16 v2, v5, v6, v2 1040; GFX8-NEXT: v_mad_u16 v2, v9, v10, v2 1041; GFX8-NEXT: flat_store_short v[0:1], v2 1042; GFX8-NEXT: s_endpgm 1043; 1044; GFX9-NODL-LABEL: idot4_acc16_vecMul: 1045; 
GFX9-NODL: ; %bb.0: ; %entry 1046; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1047; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1048; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1049; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1050; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1051; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1052; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1053; GFX9-NODL-NEXT: global_load_ushort v3, v0, s[6:7] 1054; GFX9-NODL-NEXT: s_mov_b32 s0, 0x5040100 1055; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 1056; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 1057; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1058; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1059; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v6, 8, v1 1060; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 1061; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 1062; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8 1063; GFX9-NODL-NEXT: v_perm_b32 v2, v7, v2, s0 1064; GFX9-NODL-NEXT: v_perm_b32 v1, v6, v1, s0 1065; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 1066; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8 1067; GFX9-NODL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 1068; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 1069; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 1070; GFX9-NODL-NEXT: v_perm_b32 v5, v9, v5, s0 1071; GFX9-NODL-NEXT: v_perm_b32 v4, v8, v4, s0 1072; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1073; GFX9-NODL-NEXT: v_add_u16_e32 v3, v1, v3 1074; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 1075; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1076; GFX9-NODL-NEXT: v_add_u16_e32 v1, v1, v2 1077; GFX9-NODL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1078; GFX9-NODL-NEXT: global_store_short v0, v1, s[6:7] 1079; GFX9-NODL-NEXT: s_endpgm 1080; 1081; GFX9-DL-LABEL: idot4_acc16_vecMul: 1082; GFX9-DL: ; %bb.0: ; %entry 1083; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1084; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1085; 
GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1086; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1087; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1088; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1089; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1090; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] 1091; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 1092; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 1093; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 1094; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1095; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1096; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 8, v1 1097; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 1098; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 8, v2 1099; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 1100; GFX9-DL-NEXT: v_perm_b32 v2, v7, v2, s0 1101; GFX9-DL-NEXT: v_perm_b32 v1, v6, v1, s0 1102; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 8, v4 1103; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8 1104; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 8, v5 1105; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 1106; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 1107; GFX9-DL-NEXT: v_perm_b32 v5, v9, v5, s0 1108; GFX9-DL-NEXT: v_perm_b32 v4, v8, v4, s0 1109; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1110; GFX9-DL-NEXT: v_add_u16_e32 v3, v1, v3 1111; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v5 1112; GFX9-DL-NEXT: v_add_u16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1113; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v2 1114; GFX9-DL-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1115; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] 1116; GFX9-DL-NEXT: s_endpgm 1117; 1118; GFX10-DL-LABEL: idot4_acc16_vecMul: 1119; GFX10-DL: ; %bb.0: ; %entry 1120; GFX10-DL-NEXT: s_clause 0x1 1121; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1122; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1123; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1124; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1125; GFX10-DL-NEXT: s_clause 0x1 1126; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1127; 
GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1128; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1129; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] 1130; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 1131; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1 1132; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1133; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v2 1134; GFX10-DL-NEXT: v_bfe_i32 v6, v2, 0, 8 1135; GFX10-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 1136; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1137; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 1138; GFX10-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 1139; GFX10-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 1140; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v1 1141; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v2 1142; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 8 1143; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 1144; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 1145; GFX10-DL-NEXT: v_perm_b32 v2, v7, v2, 0x5040100 1146; GFX10-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 1147; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 1148; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1149; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 1150; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 1151; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 1152; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 1153; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 1154; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 1155; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] 1156; GFX10-DL-NEXT: s_endpgm 1157; 1158; GFX11-DL-LABEL: idot4_acc16_vecMul: 1159; GFX11-DL: ; %bb.0: ; %entry 1160; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1161; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1162; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1163; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 1164; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 1165; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1166; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX11-DL-NEXT: s_clause 0x1 1168; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 1169; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 1170; GFX11-DL-NEXT: global_load_u16 v3, 
v2, s[4:5] 1171; GFX11-DL-NEXT: s_waitcnt vmcnt(2) 1172; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1 1173; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1174; GFX11-DL-NEXT: v_ashrrev_i16 v5, 8, v0 1175; GFX11-DL-NEXT: v_bfe_i32 v6, v0, 0, 8 1176; GFX11-DL-NEXT: v_bfe_i32 v7, v1, 0, 8 1177; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1178; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1179; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1180; GFX11-DL-NEXT: v_perm_b32 v5, v5, v6, 0x5040100 1181; GFX11-DL-NEXT: v_perm_b32 v4, v4, v7, 0x5040100 1182; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 1183; GFX11-DL-NEXT: v_ashrrev_i16 v6, 8, v1 1184; GFX11-DL-NEXT: v_ashrrev_i16 v7, 8, v0 1185; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 1186; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 1187; GFX11-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 1188; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) 1189; GFX11-DL-NEXT: v_perm_b32 v0, v7, v0, 0x5040100 1190; GFX11-DL-NEXT: v_perm_b32 v1, v6, v1, 0x5040100 1191; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) 1192; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 1193; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1194; GFX11-DL-NEXT: v_add_nc_u16 v3, v4, v3 1195; GFX11-DL-NEXT: v_pk_mul_lo_u16 v0, v1, v0 1196; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 1197; GFX11-DL-NEXT: v_add_nc_u16 v1, v3, v5 1198; GFX11-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 1199; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 1200; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0 1201; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3 1202; GFX11-DL-NEXT: global_store_b16 v2, v0, s[4:5] 1203; GFX11-DL-NEXT: s_endpgm 1204 ptr addrspace(1) %src2, 1205 ptr addrspace(1) nocapture %dst) { 1206entry: 1207 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1208 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) 
%src1, i32 %idx 1209 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1210 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1211 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1212 1213 %cvec1 = sext <4 x i8> %vec1 to <4 x i16> 1214 %cvec2 = sext <4 x i8> %vec2 to <4 x i16> 1215 1216 %mul = mul <4 x i16> %cvec1, %cvec2 1217 %mul0 = extractelement <4 x i16> %mul, i64 0 1218 %mul1 = extractelement <4 x i16> %mul, i64 1 1219 %mul2 = extractelement <4 x i16> %mul, i64 2 1220 %mul3 = extractelement <4 x i16> %mul, i64 3 1221 1222 %acc = load i16, ptr addrspace(1) %dst, align 4 1223 %add1 = add i16 %mul0, %acc 1224 %add2 = add i16 %add1, %mul1 1225 %add3 = add i16 %add2, %mul2 1226 %add4 = add i16 %add3, %mul3 1227 1228 store i16 %add4, ptr addrspace(1) %dst, align 4 1229 ret void 1230} 1231 1232define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1, 1233; GFX7-LABEL: idot4_acc32_2ele: 1234; GFX7: ; %bb.0: ; %entry 1235; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 1236; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd 1237; GFX7-NEXT: s_mov_b32 s7, 0xf000 1238; GFX7-NEXT: s_mov_b32 s10, 0 1239; GFX7-NEXT: s_mov_b32 s11, s7 1240; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1241; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 1242; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1243; GFX7-NEXT: v_mov_b32_e32 v1, 0 1244; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 1245; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 1246; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 1247; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 1248; GFX7-NEXT: s_mov_b32 s6, -1 1249; GFX7-NEXT: s_waitcnt vmcnt(1) 1250; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 1251; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 1252; GFX7-NEXT: s_waitcnt vmcnt(0) 1253; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8 1254; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 1255; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1256; GFX7-NEXT: v_mad_i32_i24 v1, v1, v3, s0 1257; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 1258; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 1259; 
GFX7-NEXT: s_endpgm 1260; 1261; GFX8-LABEL: idot4_acc32_2ele: 1262; GFX8: ; %bb.0: ; %entry 1263; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1264; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1265; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1266; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1267; GFX8-NEXT: v_mov_b32_e32 v1, s1 1268; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1269; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1270; GFX8-NEXT: flat_load_dword v3, v[0:1] 1271; GFX8-NEXT: v_mov_b32_e32 v1, s3 1272; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1273; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1274; GFX8-NEXT: flat_load_dword v0, v[0:1] 1275; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1276; GFX8-NEXT: s_waitcnt vmcnt(1) 1277; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 1278; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8 1279; GFX8-NEXT: s_waitcnt vmcnt(0) 1280; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 1281; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8 1282; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1283; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 1284; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 1285; GFX8-NEXT: v_mov_b32_e32 v0, s4 1286; GFX8-NEXT: v_mov_b32_e32 v1, s5 1287; GFX8-NEXT: flat_store_dword v[0:1], v2 1288; GFX8-NEXT: s_endpgm 1289; 1290; GFX9-NODL-LABEL: idot4_acc32_2ele: 1291; GFX9-NODL: ; %bb.0: ; %entry 1292; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1293; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1294; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1295; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1296; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1297; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1298; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1299; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1300; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1301; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1302; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 
1303; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1304; GFX9-NODL-NEXT: v_add3_u32 v1, v3, s0, v1 1305; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1306; GFX9-NODL-NEXT: s_endpgm 1307; 1308; GFX9-DL-LABEL: idot4_acc32_2ele: 1309; GFX9-DL: ; %bb.0: ; %entry 1310; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1311; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1312; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1313; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1314; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 1315; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 1316; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1317; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0100 1318; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1319; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1320; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 1321; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1322; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 1323; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1324; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 1325; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1326; GFX9-DL-NEXT: s_endpgm 1327; 1328; GFX10-DL-LABEL: idot4_acc32_2ele: 1329; GFX10-DL: ; %bb.0: ; %entry 1330; GFX10-DL-NEXT: s_clause 0x1 1331; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1332; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1333; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1334; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 1335; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1336; GFX10-DL-NEXT: s_clause 0x1 1337; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 1338; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 1339; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1340; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1341; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1342; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100 1343; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1344; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100 1345; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1346; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 1347; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 1348; GFX10-DL-NEXT: global_store_dword v3, 
v2, s[6:7] 1349; GFX10-DL-NEXT: s_endpgm 1350; 1351; GFX11-DL-LABEL: idot4_acc32_2ele: 1352; GFX11-DL: ; %bb.0: ; %entry 1353; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1354; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1355; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1356; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 1357; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 1358; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1359; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1360; GFX11-DL-NEXT: s_clause 0x1 1361; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 1362; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 1363; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 1364; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1365; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100 1366; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1367; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100 1368; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1369; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1370; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 1371; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 1372; GFX11-DL-NEXT: s_endpgm 1373 ptr addrspace(1) %src2, 1374 ptr addrspace(1) nocapture %dst) { 1375entry: 1376 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1377 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1378 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1379 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1380 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1381 1382 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1383 %cv1e0 = sext i8 %v1e0 to i32 1384 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1385 %cv2e0 = sext i8 %v2e0 to i32 1386 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1387 1388 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1389 %cv1e1 = sext i8 %v1e1 to i32 1390 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1391 %cv2e1 = sext i8 %v2e1 to i32 1392 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1393 1394 %acc = load i32, ptr addrspace(1) %dst, align 4 1395 %add1 = add i32 %mul1, %acc 1396 %add2 = add 
i32 %add1, %mul2 1397 store i32 %add2, ptr addrspace(1) %dst, align 4 1398 ret void 1399} 1400 1401 1402define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1, 1403; GFX7-LABEL: idot4_acc32_3ele: 1404; GFX7: ; %bb.0: ; %entry 1405; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1406; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1407; GFX7-NEXT: s_mov_b32 s3, 0xf000 1408; GFX7-NEXT: s_mov_b32 s6, 0 1409; GFX7-NEXT: s_mov_b32 s7, s3 1410; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1411; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1412; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1413; GFX7-NEXT: v_mov_b32_e32 v1, 0 1414; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1415; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1416; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1417; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1418; GFX7-NEXT: s_mov_b32 s2, -1 1419; GFX7-NEXT: s_waitcnt vmcnt(1) 1420; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 1421; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 1422; GFX7-NEXT: s_waitcnt vmcnt(0) 1423; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8 1424; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 1425; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1426; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s4 1427; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 1428; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 1429; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1 1430; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 1431; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1432; GFX7-NEXT: s_endpgm 1433; 1434; GFX8-LABEL: idot4_acc32_3ele: 1435; GFX8: ; %bb.0: ; %entry 1436; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1437; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1438; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1439; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1440; GFX8-NEXT: v_mov_b32_e32 v1, s1 1441; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1442; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1443; GFX8-NEXT: flat_load_dword v3, v[0:1] 1444; GFX8-NEXT: v_mov_b32_e32 v1, s3 1445; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1446; GFX8-NEXT: v_addc_u32_e32 v1, 
vcc, 0, v1, vcc 1447; GFX8-NEXT: flat_load_dword v0, v[0:1] 1448; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1449; GFX8-NEXT: s_waitcnt vmcnt(1) 1450; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 1451; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 1452; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 1453; GFX8-NEXT: s_waitcnt vmcnt(0) 1454; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 1455; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 1456; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1457; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 1458; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 1459; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 1460; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 1461; GFX8-NEXT: v_mov_b32_e32 v0, s4 1462; GFX8-NEXT: v_mov_b32_e32 v1, s5 1463; GFX8-NEXT: flat_store_dword v[0:1], v2 1464; GFX8-NEXT: s_endpgm 1465; 1466; GFX9-NODL-LABEL: idot4_acc32_3ele: 1467; GFX9-NODL: ; %bb.0: ; %entry 1468; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1469; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1470; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1471; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1472; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1473; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1474; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1475; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1476; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1477; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 1478; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1479; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8 1480; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1481; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1482; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1483; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 1484; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 1485; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1486; GFX9-NODL-NEXT: s_endpgm 1487; 1488; GFX9-DL-LABEL: idot4_acc32_3ele: 1489; GFX9-DL: ; %bb.0: ; %entry 1490; GFX9-DL-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 1491; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1492; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1493; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1494; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 1495; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 1496; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1497; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 1498; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1499; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1500; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 1501; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1502; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 1503; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1504; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 1505; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1506; GFX9-DL-NEXT: s_endpgm 1507; 1508; GFX10-DL-LABEL: idot4_acc32_3ele: 1509; GFX10-DL: ; %bb.0: ; %entry 1510; GFX10-DL-NEXT: s_clause 0x1 1511; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1512; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1513; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1514; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 1515; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1516; GFX10-DL-NEXT: s_clause 0x1 1517; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 1518; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 1519; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1520; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1521; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1522; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 1523; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1524; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 1525; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1526; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 1527; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 1528; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] 1529; GFX10-DL-NEXT: s_endpgm 1530; 1531; GFX11-DL-LABEL: idot4_acc32_3ele: 1532; GFX11-DL: ; %bb.0: ; %entry 1533; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1534; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1535; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 
1536; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 1537; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 1538; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1539; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1540; GFX11-DL-NEXT: s_clause 0x1 1541; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 1542; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 1543; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 1544; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1545; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 1546; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1547; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 1548; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1549; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1550; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 1551; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 1552; GFX11-DL-NEXT: s_endpgm 1553 ptr addrspace(1) %src2, 1554 ptr addrspace(1) nocapture %dst) { 1555entry: 1556 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1557 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1558 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1559 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1560 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1561 1562 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1563 %cv1e0 = sext i8 %v1e0 to i32 1564 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1565 %cv2e0 = sext i8 %v2e0 to i32 1566 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1567 1568 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1569 %cv1e1 = sext i8 %v1e1 to i32 1570 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1571 %cv2e1 = sext i8 %v2e1 to i32 1572 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1573 1574 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1575 %cv1e2 = sext i8 %v1e2 to i32 1576 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1577 %cv2e2 = sext i8 %v2e2 to i32 1578 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1579 1580 %acc = load i32, ptr addrspace(1) %dst, align 4 1581 %add1 = add i32 %mul1, %acc 1582 %add2 = add i32 %add1, %mul2 1583 %add3 = add i32 %add2, %mul3 1584 
store i32 %add3, ptr addrspace(1) %dst, align 4 1585 ret void 1586} 1587 1588 1589define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1, 1590; GFX7-LABEL: idot4_acc32_3ele_permuted: 1591; GFX7: ; %bb.0: ; %entry 1592; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1593; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1594; GFX7-NEXT: s_mov_b32 s3, 0xf000 1595; GFX7-NEXT: s_mov_b32 s6, 0 1596; GFX7-NEXT: s_mov_b32 s7, s3 1597; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1598; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1599; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1600; GFX7-NEXT: v_mov_b32_e32 v1, 0 1601; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1602; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1603; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1604; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 1605; GFX7-NEXT: s_mov_b32 s2, -1 1606; GFX7-NEXT: s_waitcnt vmcnt(1) 1607; GFX7-NEXT: v_ashrrev_i32_e32 v1, 24, v2 1608; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 1609; GFX7-NEXT: s_waitcnt vmcnt(0) 1610; GFX7-NEXT: v_ashrrev_i32_e32 v4, 24, v0 1611; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 1612; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1613; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s4 1614; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 1615; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 1616; GFX7-NEXT: v_mad_i32_i24 v1, v3, v5, v1 1617; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 1618; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1619; GFX7-NEXT: s_endpgm 1620; 1621; GFX8-LABEL: idot4_acc32_3ele_permuted: 1622; GFX8: ; %bb.0: ; %entry 1623; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1624; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1625; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1626; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1627; GFX8-NEXT: v_mov_b32_e32 v1, s1 1628; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1629; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1630; GFX8-NEXT: flat_load_dword v3, v[0:1] 1631; GFX8-NEXT: v_mov_b32_e32 v1, s3 1632; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1633; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc 1634; GFX8-NEXT: flat_load_dword v0, v[0:1] 1635; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 1636; GFX8-NEXT: s_waitcnt vmcnt(1) 1637; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3 1638; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8 1639; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 1640; GFX8-NEXT: s_waitcnt vmcnt(0) 1641; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0 1642; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8 1643; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1644; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 1645; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 1646; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 1647; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 1648; GFX8-NEXT: v_mov_b32_e32 v0, s4 1649; GFX8-NEXT: v_mov_b32_e32 v1, s5 1650; GFX8-NEXT: flat_store_dword v[0:1], v2 1651; GFX8-NEXT: s_endpgm 1652; 1653; GFX9-NODL-LABEL: idot4_acc32_3ele_permuted: 1654; GFX9-NODL: ; %bb.0: ; %entry 1655; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1656; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1657; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1658; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1659; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1660; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1661; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 1662; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1663; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1664; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v3, 24, v1 1665; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1666; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v4, 24, v2 1667; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 1668; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1669; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1670; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 1671; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 1672; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1673; GFX9-NODL-NEXT: s_endpgm 1674; 1675; GFX9-DL-LABEL: 
idot4_acc32_3ele_permuted: 1676; GFX9-DL: ; %bb.0: ; %entry 1677; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1678; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1679; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1680; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1681; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 1682; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 1683; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1684; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020003 1685; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1686; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 1687; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 1688; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1689; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 1690; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1691; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 1692; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1693; GFX9-DL-NEXT: s_endpgm 1694; 1695; GFX10-DL-LABEL: idot4_acc32_3ele_permuted: 1696; GFX10-DL: ; %bb.0: ; %entry 1697; GFX10-DL-NEXT: s_clause 0x1 1698; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1699; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1700; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1701; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 1702; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1703; GFX10-DL-NEXT: s_clause 0x1 1704; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 1705; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 1706; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 1707; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 1708; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 1709; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003 1710; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1711; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003 1712; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1713; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 1714; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 1715; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] 1716; GFX10-DL-NEXT: s_endpgm 1717; 1718; GFX11-DL-LABEL: idot4_acc32_3ele_permuted: 1719; GFX11-DL: ; %bb.0: ; %entry 1720; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 
1721; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1722; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1723; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 1724; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 1725; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1726; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1727; GFX11-DL-NEXT: s_clause 0x1 1728; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 1729; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 1730; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 1731; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 1732; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003 1733; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1734; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003 1735; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1736; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 1737; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 1738; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 1739; GFX11-DL-NEXT: s_endpgm 1740 ptr addrspace(1) %src2, 1741 ptr addrspace(1) nocapture %dst) { 1742entry: 1743 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1744 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1745 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1746 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1747 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1748 1749 %v1e0 = extractelement <4 x i8> %vec1, i64 3 1750 %cv1e0 = sext i8 %v1e0 to i32 1751 %v2e0 = extractelement <4 x i8> %vec2, i64 3 1752 %cv2e0 = sext i8 %v2e0 to i32 1753 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1754 1755 %v1e1 = extractelement <4 x i8> %vec1, i64 0 1756 %cv1e1 = sext i8 %v1e1 to i32 1757 %v2e1 = extractelement <4 x i8> %vec2, i64 0 1758 %cv2e1 = sext i8 %v2e1 to i32 1759 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1760 1761 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1762 %cv1e2 = sext i8 %v1e2 to i32 1763 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1764 %cv2e2 = sext i8 %v2e2 to i32 1765 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1766 1767 %acc = load i32, ptr addrspace(1) %dst, align 4 1768 
%add1 = add i32 %mul1, %acc 1769 %add2 = add i32 %add1, %mul2 1770 %add3 = add i32 %add2, %mul3 1771 store i32 %add3, ptr addrspace(1) %dst, align 4 1772 ret void 1773} 1774 1775define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1, 1776; GFX7-LABEL: idot4_acc32_opt: 1777; GFX7: ; %bb.0: ; %entry 1778; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 1779; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 1780; GFX7-NEXT: s_mov_b32 s3, 0xf000 1781; GFX7-NEXT: s_mov_b32 s6, 0 1782; GFX7-NEXT: s_mov_b32 s7, s3 1783; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1784; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 1785; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1786; GFX7-NEXT: v_mov_b32_e32 v1, 0 1787; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 1788; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 1789; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 1790; GFX7-NEXT: s_mov_b32 s2, -1 1791; GFX7-NEXT: s_waitcnt vmcnt(1) 1792; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 1793; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 1794; GFX7-NEXT: s_waitcnt vmcnt(0) 1795; GFX7-NEXT: v_bfe_i32 v6, v0, 8, 8 1796; GFX7-NEXT: v_bfe_i32 v5, v0, 0, 8 1797; GFX7-NEXT: v_mul_i32_i24_e32 v3, v3, v6 1798; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 1799; GFX7-NEXT: v_bfe_i32 v7, v0, 16, 8 1800; GFX7-NEXT: v_mad_i32_i24 v1, v1, v5, v3 1801; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 1802; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 1803; GFX7-NEXT: v_mad_i32_i24 v1, v4, v7, v1 1804; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 1805; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 1806; GFX7-NEXT: s_endpgm 1807; 1808; GFX8-LABEL: idot4_acc32_opt: 1809; GFX8: ; %bb.0: ; %entry 1810; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1811; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 1812; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1813; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1814; GFX8-NEXT: v_mov_b32_e32 v1, s1 1815; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1816; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1817; GFX8-NEXT: flat_load_dword v3, v[0:1] 
1818; GFX8-NEXT: v_mov_b32_e32 v1, s3 1819; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1820; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1821; GFX8-NEXT: flat_load_dword v2, v[0:1] 1822; GFX8-NEXT: v_mov_b32_e32 v0, s4 1823; GFX8-NEXT: v_mov_b32_e32 v1, s5 1824; GFX8-NEXT: s_waitcnt vmcnt(1) 1825; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8 1826; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8 1827; GFX8-NEXT: s_waitcnt vmcnt(0) 1828; GFX8-NEXT: v_bfe_i32 v5, v2, 0, 8 1829; GFX8-NEXT: v_mul_i32_i24_sdwa v6, sext(v3), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1830; GFX8-NEXT: v_bfe_i32 v8, v2, 16, 8 1831; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v6 1832; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 1833; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v2 1834; GFX8-NEXT: v_mad_i32_i24 v4, v7, v8, v4 1835; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v4 1836; GFX8-NEXT: flat_store_dword v[0:1], v2 1837; GFX8-NEXT: s_endpgm 1838; 1839; GFX9-NODL-LABEL: idot4_acc32_opt: 1840; GFX9-NODL: ; %bb.0: ; %entry 1841; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1842; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1843; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1844; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 1845; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 1846; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 1847; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 1848; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 1849; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 1850; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 1851; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8 1852; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 1853; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v6, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 1854; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 1855; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, v5 1856; 
GFX9-NODL-NEXT: v_add3_u32 v1, v2, v6, v1 1857; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 1858; GFX9-NODL-NEXT: s_endpgm 1859; 1860; GFX9-DL-LABEL: idot4_acc32_opt: 1861; GFX9-DL: ; %bb.0: ; %entry 1862; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1863; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1864; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1865; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 1866; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 1867; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 1868; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 1869; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 1870; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, 0 1871; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 1872; GFX9-DL-NEXT: s_endpgm 1873; 1874; GFX10-DL-LABEL: idot4_acc32_opt: 1875; GFX10-DL: ; %bb.0: ; %entry 1876; GFX10-DL-NEXT: s_clause 0x1 1877; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 1878; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 1879; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1880; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 1881; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 1882; GFX10-DL-NEXT: s_clause 0x1 1883; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 1884; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 1885; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 1886; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 1887; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 1888; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] 1889; GFX10-DL-NEXT: s_endpgm 1890; 1891; GFX11-DL-LABEL: idot4_acc32_opt: 1892; GFX11-DL: ; %bb.0: ; %entry 1893; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 1894; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 1895; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 1896; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 1897; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 1898; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1899; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 1900; GFX11-DL-NEXT: s_clause 0x1 1901; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 1902; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 
1903; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 1904; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0] 1905; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 1906; GFX11-DL-NEXT: s_endpgm 1907 ptr addrspace(1) %src2, 1908 ptr addrspace(1) nocapture %dst) { 1909entry: 1910 %idx = call i32 @llvm.amdgcn.workitem.id.x() 1911 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 1912 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 1913 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 1914 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 1915 1916 %v1e0 = extractelement <4 x i8> %vec1, i64 0 1917 %cv1e0 = sext i8 %v1e0 to i32 1918 %v2e0 = extractelement <4 x i8> %vec2, i64 0 1919 %cv2e0 = sext i8 %v2e0 to i32 1920 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 1921 1922 %v1e1 = extractelement <4 x i8> %vec1, i64 1 1923 %cv1e1 = sext i8 %v1e1 to i32 1924 %v2e1 = extractelement <4 x i8> %vec2, i64 1 1925 %cv2e1 = sext i8 %v2e1 to i32 1926 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 1927 1928 %v1e2 = extractelement <4 x i8> %vec1, i64 2 1929 %cv1e2 = sext i8 %v1e2 to i32 1930 %v2e2 = extractelement <4 x i8> %vec2, i64 2 1931 %cv2e2 = sext i8 %v2e2 to i32 1932 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 1933 1934 %v1e3 = extractelement <4 x i8> %vec1, i64 3 1935 %cv1e3 = sext i8 %v1e3 to i32 1936 %v2e3 = extractelement <4 x i8> %vec2, i64 3 1937 %cv2e3 = sext i8 %v2e3 to i32 1938 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3 1939 1940 %add2 = add i32 %mul1, %mul2 1941 %add3 = add i32 %add2, %mul3 1942 %add4 = add i32 %add3, %mul4 1943 store i32 %add4, ptr addrspace(1) %dst, align 4 1944 ret void 1945} 1946 1947define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1, 1948; GFX7-LABEL: idot4_acc32_3src: 1949; GFX7: ; %bb.0: ; %entry 1950; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 1951; GFX7-NEXT: s_mov_b32 s11, 0xf000 1952; GFX7-NEXT: s_mov_b32 s14, 0 1953; GFX7-NEXT: s_mov_b32 s15, s11 1954; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1955; GFX7-NEXT: s_waitcnt 
lgkmcnt(0) 1956; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1] 1957; GFX7-NEXT: v_mov_b32_e32 v1, 0 1958; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 1959; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3] 1960; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 1961; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] 1962; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 1963; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0 1964; GFX7-NEXT: s_mov_b32 s10, -1 1965; GFX7-NEXT: s_mov_b32 s8, s6 1966; GFX7-NEXT: s_mov_b32 s9, s7 1967; GFX7-NEXT: s_waitcnt vmcnt(2) 1968; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 1969; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 1970; GFX7-NEXT: s_waitcnt vmcnt(1) 1971; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 1972; GFX7-NEXT: s_waitcnt lgkmcnt(0) 1973; GFX7-NEXT: v_mad_i32_i24 v1, v1, v1, s0 1974; GFX7-NEXT: v_bfe_i32 v5, v2, 16, 8 1975; GFX7-NEXT: s_waitcnt vmcnt(0) 1976; GFX7-NEXT: v_bfe_i32 v6, v0, 16, 8 1977; GFX7-NEXT: v_mad_i32_i24 v1, v4, v3, v1 1978; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 1979; GFX7-NEXT: v_ashrrev_i32_e32 v0, 24, v0 1980; GFX7-NEXT: v_mad_i32_i24 v1, v5, v6, v1 1981; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 1982; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 1983; GFX7-NEXT: s_endpgm 1984; 1985; GFX8-LABEL: idot4_acc32_3src: 1986; GFX8: ; %bb.0: ; %entry 1987; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 1988; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 1989; GFX8-NEXT: s_waitcnt lgkmcnt(0) 1990; GFX8-NEXT: v_mov_b32_e32 v1, s1 1991; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 1992; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1993; GFX8-NEXT: flat_load_dword v3, v[0:1] 1994; GFX8-NEXT: v_mov_b32_e32 v1, s3 1995; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 1996; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1997; GFX8-NEXT: flat_load_dword v4, v[0:1] 1998; GFX8-NEXT: v_mov_b32_e32 v1, s5 1999; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2000; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2001; GFX8-NEXT: flat_load_dword v0, v[0:1] 2002; 
GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 2003; GFX8-NEXT: s_waitcnt vmcnt(2) 2004; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 2005; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 2006; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2007; GFX8-NEXT: v_mad_i32_i24 v1, v1, v1, s0 2008; GFX8-NEXT: v_bfe_i32 v5, v3, 16, 8 2009; GFX8-NEXT: v_ashrrev_i32_e32 v3, 24, v3 2010; GFX8-NEXT: s_waitcnt vmcnt(1) 2011; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8 2012; GFX8-NEXT: v_mad_i32_i24 v1, v2, v4, v1 2013; GFX8-NEXT: s_waitcnt vmcnt(0) 2014; GFX8-NEXT: v_bfe_i32 v6, v0, 16, 8 2015; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0 2016; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1 2017; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 2018; GFX8-NEXT: v_mov_b32_e32 v0, s6 2019; GFX8-NEXT: v_mov_b32_e32 v1, s7 2020; GFX8-NEXT: flat_store_dword v[0:1], v2 2021; GFX8-NEXT: s_endpgm 2022; 2023; GFX9-NODL-LABEL: idot4_acc32_3src: 2024; GFX9-NODL: ; %bb.0: ; %entry 2025; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2026; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2027; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] 2029; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] 2030; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] 2031; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 2032; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2033; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 2034; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 2035; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2036; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2037; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2038; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 2039; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 2040; GFX9-NODL-NEXT: s_waitcnt 
lgkmcnt(0) 2041; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2 2042; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 2043; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] 2044; GFX9-NODL-NEXT: s_endpgm 2045; 2046; GFX9-DL-LABEL: idot4_acc32_3src: 2047; GFX9-DL: ; %bb.0: ; %entry 2048; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2049; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2050; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c 2051; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00 2052; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2053; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] 2054; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] 2055; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] 2056; GFX9-DL-NEXT: s_load_dword s1, s[14:15], 0x0 2057; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2058; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2059; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 2060; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2061; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 2062; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 2063; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2064; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v3, v1, s1 2065; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] 2066; GFX9-DL-NEXT: s_endpgm 2067; 2068; GFX10-DL-LABEL: idot4_acc32_3src: 2069; GFX10-DL: ; %bb.0: ; %entry 2070; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2071; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2072; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX10-DL-NEXT: s_clause 0x2 2074; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] 2075; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] 2076; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] 2077; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 2078; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2079; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c 2080; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2081; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c00 2082; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 2083; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 2084; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2085; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 2086; 
GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0 2087; GFX10-DL-NEXT: global_store_dword v2, v1, s[14:15] 2088; GFX10-DL-NEXT: s_endpgm 2089; 2090; GFX11-DL-LABEL: idot4_acc32_3src: 2091; GFX11-DL: ; %bb.0: ; %entry 2092; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 2093; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2094; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2095; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2096; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2097; GFX11-DL-NEXT: s_clause 0x2 2098; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 2099; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] 2100; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 2101; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 2102; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2103; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c 2104; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2105; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c00 2106; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) 2107; GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v2 2108; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 2109; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2110; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 2111; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] 2112; GFX11-DL-NEXT: s_endpgm 2113 ptr addrspace(1) %src2, 2114 ptr addrspace(1) %src3, 2115 ptr addrspace(1) nocapture %dst) { 2116entry: 2117 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2118 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2119 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2120 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2121 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2122 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 2123 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 2124 2125 %v1e0 = extractelement <4 x i8> %vec1, i64 0 2126 %cv1e0 = sext i8 %v1e0 to i32 2127 %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 2128 2129 %v1e1 = extractelement <4 x i8> %vec1, i64 1 2130 %cv1e1 = sext i8 %v1e1 to i32 
2131 %v2e1 = extractelement <4 x i8> %vec2, i64 1 2132 %cv2e1 = sext i8 %v2e1 to i32 2133 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 2134 2135 %v1e2 = extractelement <4 x i8> %vec1, i64 2 2136 %cv1e2 = sext i8 %v1e2 to i32 2137 %v3e2 = extractelement <4 x i8> %vec3, i64 2 2138 %cv3e2 = sext i8 %v3e2 to i32 2139 %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 2140 2141 %v1e3 = extractelement <4 x i8> %vec1, i64 3 2142 %cv1e3 = sext i8 %v1e3 to i32 2143 %v3e3 = extractelement <4 x i8> %vec3, i64 3 2144 %cv3e3 = sext i8 %v3e3 to i32 2145 %mul4 = mul nuw nsw i32 %cv1e3, %cv3e3 2146 2147 %acc = load i32, ptr addrspace(1) %dst, align 4 2148 %add1 = add i32 %mul1, %acc 2149 %add2 = add i32 %add1, %mul2 2150 %add3 = add i32 %add2, %mul3 2151 %add4 = add i32 %add3, %mul4 2152 store i32 %add4, ptr addrspace(1) %dst, align 4 2153 ret void 2154} 2155 2156define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1, 2157; GFX7-LABEL: idot4_acc32_3src_3ele: 2158; GFX7: ; %bb.0: ; %entry 2159; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 2160; GFX7-NEXT: s_mov_b32 s11, 0xf000 2161; GFX7-NEXT: s_mov_b32 s14, 0 2162; GFX7-NEXT: s_mov_b32 s15, s11 2163; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2164; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2165; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1] 2166; GFX7-NEXT: v_mov_b32_e32 v1, 0 2167; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 2168; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3] 2169; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 2170; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] 2171; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 2172; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0 2173; GFX7-NEXT: s_mov_b32 s10, -1 2174; GFX7-NEXT: s_mov_b32 s8, s6 2175; GFX7-NEXT: s_mov_b32 s9, s7 2176; GFX7-NEXT: s_waitcnt vmcnt(2) 2177; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 2178; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 2179; GFX7-NEXT: s_waitcnt vmcnt(1) 2180; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 2181; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2182; GFX7-NEXT: 
v_mad_i32_i24 v1, v1, v1, s0 2183; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 2184; GFX7-NEXT: s_waitcnt vmcnt(0) 2185; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 2186; GFX7-NEXT: v_mad_i32_i24 v1, v4, v3, v1 2187; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 2188; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 2189; GFX7-NEXT: s_endpgm 2190; 2191; GFX8-LABEL: idot4_acc32_3src_3ele: 2192; GFX8: ; %bb.0: ; %entry 2193; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 2194; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2195; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2196; GFX8-NEXT: v_mov_b32_e32 v1, s1 2197; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2198; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2199; GFX8-NEXT: flat_load_dword v3, v[0:1] 2200; GFX8-NEXT: v_mov_b32_e32 v1, s3 2201; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2202; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2203; GFX8-NEXT: flat_load_dword v4, v[0:1] 2204; GFX8-NEXT: v_mov_b32_e32 v1, s5 2205; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2206; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2207; GFX8-NEXT: flat_load_dword v0, v[0:1] 2208; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 2209; GFX8-NEXT: s_waitcnt vmcnt(2) 2210; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 2211; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 2212; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2213; GFX8-NEXT: v_mad_i32_i24 v1, v1, v1, s0 2214; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 2215; GFX8-NEXT: s_waitcnt vmcnt(1) 2216; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8 2217; GFX8-NEXT: v_mad_i32_i24 v1, v2, v4, v1 2218; GFX8-NEXT: s_waitcnt vmcnt(0) 2219; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 2220; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 2221; GFX8-NEXT: v_mov_b32_e32 v0, s6 2222; GFX8-NEXT: v_mov_b32_e32 v1, s7 2223; GFX8-NEXT: flat_store_dword v[0:1], v2 2224; GFX8-NEXT: s_endpgm 2225; 2226; GFX9-NODL-LABEL: idot4_acc32_3src_3ele: 2227; GFX9-NODL: ; %bb.0: ; %entry 2228; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2229; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2230; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 
2231; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] 2232; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] 2233; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] 2234; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 2235; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2236; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 2237; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 2238; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2239; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v2, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2240; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2241; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 2242; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2243; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0 2244; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 2245; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] 2246; GFX9-NODL-NEXT: s_endpgm 2247; 2248; GFX9-DL-LABEL: idot4_acc32_3src_3ele: 2249; GFX9-DL: ; %bb.0: ; %entry 2250; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2251; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2252; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c 2253; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00 2254; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100 2255; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2256; GFX9-DL-NEXT: global_load_dword v1, v0, s[10:11] 2257; GFX9-DL-NEXT: global_load_dword v2, v0, s[12:13] 2258; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9] 2259; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 2260; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2261; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2262; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0 2263; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2264; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 2265; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 2266; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 2267; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2268; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3 2269; GFX9-DL-NEXT: global_store_dword v0, v1, s[14:15] 2270; GFX9-DL-NEXT: s_endpgm 2271; 2272; 
GFX10-DL-LABEL: idot4_acc32_3src_3ele: 2273; GFX10-DL: ; %bb.0: ; %entry 2274; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2275; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2276; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2277; GFX10-DL-NEXT: s_clause 0x2 2278; GFX10-DL-NEXT: global_load_dword v1, v0, s[10:11] 2279; GFX10-DL-NEXT: global_load_dword v2, v0, s[12:13] 2280; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9] 2281; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 2282; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2283; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c 2284; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2285; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c00 2286; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0xc020100 2287; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2288; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 2289; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2290; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 2291; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 2292; GFX10-DL-NEXT: global_store_dword v3, v1, s[14:15] 2293; GFX10-DL-NEXT: s_endpgm 2294; 2295; GFX11-DL-LABEL: idot4_acc32_3src_3ele: 2296; GFX11-DL: ; %bb.0: ; %entry 2297; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 2298; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2299; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2300; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2301; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX11-DL-NEXT: s_clause 0x2 2303; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 2304; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5] 2305; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 2306; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 2307; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2308; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c 2309; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2310; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c00 2311; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 2312; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 2313; GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v2 2314; GFX11-DL-NEXT: 
v_mov_b32_e32 v2, 0 2315; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2316; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 2317; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] 2318; GFX11-DL-NEXT: s_endpgm 2319 ptr addrspace(1) %src2, 2320 ptr addrspace(1) %src3, 2321 ptr addrspace(1) nocapture %dst) { 2322entry: 2323 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2324 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2325 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2326 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2327 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2328 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 2329 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 2330 2331 %v1e0 = extractelement <4 x i8> %vec1, i64 0 2332 %cv1e0 = sext i8 %v1e0 to i32 2333 %mul1 = mul nuw nsw i32 %cv1e0, %cv1e0 2334 2335 %v1e1 = extractelement <4 x i8> %vec1, i64 1 2336 %cv1e1 = sext i8 %v1e1 to i32 2337 %v2e1 = extractelement <4 x i8> %vec2, i64 1 2338 %cv2e1 = sext i8 %v2e1 to i32 2339 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 2340 2341 %v1e2 = extractelement <4 x i8> %vec1, i64 2 2342 %cv1e2 = sext i8 %v1e2 to i32 2343 %v3e2 = extractelement <4 x i8> %vec3, i64 2 2344 %cv3e2 = sext i8 %v3e2 to i32 2345 %mul3 = mul nuw nsw i32 %cv1e2, %cv3e2 2346 2347 %acc = load i32, ptr addrspace(1) %dst, align 4 2348 %add1 = add i32 %mul1, %acc 2349 %add2 = add i32 %add1, %mul2 2350 %add3 = add i32 %add2, %mul3 2351 store i32 %add3, ptr addrspace(1) %dst, align 4 2352 ret void 2353} 2354 2355define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1, 2356; GFX7-LABEL: idot4_bad_source: 2357; GFX7: ; %bb.0: ; %entry 2358; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 2359; GFX7-NEXT: s_load_dword s12, s[4:5], 0xf 2360; GFX7-NEXT: s_mov_b32 s7, 0xf000 2361; GFX7-NEXT: s_mov_b32 s10, 0 2362; GFX7-NEXT: s_mov_b32 s11, s7 2363; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2364; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] 2365; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 2366; GFX7-NEXT: v_mov_b32_e32 v1, 0 2367; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 2368; GFX7-NEXT: s_mov_b64 s[8:9], s[2:3] 2369; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 2370; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x11 2371; GFX7-NEXT: s_sext_i32_i16 s1, s12 2372; GFX7-NEXT: s_mov_b32 s6, -1 2373; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2374; GFX7-NEXT: s_load_dword s0, s[4:5], 0x0 2375; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2376; GFX7-NEXT: v_mov_b32_e32 v1, s0 2377; GFX7-NEXT: s_waitcnt vmcnt(1) 2378; GFX7-NEXT: v_bfe_i32 v3, v2, 0, 8 2379; GFX7-NEXT: v_bfe_i32 v4, v2, 8, 8 2380; GFX7-NEXT: s_waitcnt vmcnt(0) 2381; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 2382; GFX7-NEXT: v_mad_i32_i24 v1, v3, s1, v1 2383; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 2384; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 2385; GFX7-NEXT: v_mad_i32_i24 v1, v4, v5, v1 2386; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 2387; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 2388; GFX7-NEXT: s_endpgm 2389; 2390; GFX8-LABEL: idot4_bad_source: 2391; GFX8: ; %bb.0: ; %entry 2392; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2393; GFX8-NEXT: s_load_dword s6, s[4:5], 0x3c 2394; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2395; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2396; GFX8-NEXT: v_mov_b32_e32 v1, s1 2397; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2398; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2399; GFX8-NEXT: flat_load_dword v3, v[0:1] 2400; GFX8-NEXT: v_mov_b32_e32 v1, s3 2401; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2402; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2403; GFX8-NEXT: flat_load_dword v0, v[0:1] 2404; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 2405; GFX8-NEXT: s_sext_i32_i16 s3, s6 2406; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2407; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 2408; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2409; GFX8-NEXT: v_mov_b32_e32 v1, s2 2410; GFX8-NEXT: s_waitcnt vmcnt(1) 2411; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8 2412; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 
8 2413; GFX8-NEXT: v_mad_i32_i24 v1, v2, s3, v1 2414; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 2415; GFX8-NEXT: s_waitcnt vmcnt(0) 2416; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 2417; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 2418; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 2419; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 2420; GFX8-NEXT: v_mov_b32_e32 v0, s0 2421; GFX8-NEXT: v_mov_b32_e32 v1, s1 2422; GFX8-NEXT: flat_store_dword v[0:1], v2 2423; GFX8-NEXT: s_endpgm 2424; 2425; GFX9-NODL-LABEL: idot4_bad_source: 2426; GFX9-NODL: ; %bb.0: ; %entry 2427; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2428; GFX9-NODL-NEXT: s_load_dword s6, s[4:5], 0x3c 2429; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2430; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2431; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2432; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2433; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 2434; GFX9-NODL-NEXT: s_sext_i32_i16 s3, s6 2435; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2436; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2437; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 2438; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2439; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 2440; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2441; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2442; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 2443; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2444; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2 2445; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s3, v2 2446; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1 2447; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 2448; GFX9-NODL-NEXT: s_endpgm 2449; 2450; GFX9-DL-LABEL: idot4_bad_source: 2451; GFX9-DL: ; %bb.0: ; %entry 2452; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2453; GFX9-DL-NEXT: s_load_dword s6, s[4:5], 0x3c 2454; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2455; GFX9-DL-NEXT: 
s_waitcnt lgkmcnt(0) 2456; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 2457; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 2458; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 2459; GFX9-DL-NEXT: s_mov_b32 s3, 0xc0c0201 2460; GFX9-DL-NEXT: s_sext_i32_i16 s4, s6 2461; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2462; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2463; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2464; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2465; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2 2466; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2467; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 2468; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2469; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s3 2470; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3 2471; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s3 2472; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3 2473; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 2474; GFX9-DL-NEXT: s_endpgm 2475; 2476; GFX10-DL-LABEL: idot4_bad_source: 2477; GFX10-DL: ; %bb.0: ; %entry 2478; GFX10-DL-NEXT: s_clause 0x1 2479; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2480; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x3c 2481; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2482; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2483; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2484; GFX10-DL-NEXT: s_clause 0x1 2485; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 2486; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 2487; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 2488; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 2489; GFX10-DL-NEXT: s_sext_i32_i16 s3, s6 2490; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2491; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 2492; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2493; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 2494; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2495; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201 2496; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 2497; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2498; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s3, s2 2499; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2 2500; GFX10-DL-NEXT: 
global_store_dword v3, v0, s[0:1] 2501; GFX10-DL-NEXT: s_endpgm 2502; 2503; GFX11-DL-LABEL: idot4_bad_source: 2504; GFX11-DL: ; %bb.0: ; %entry 2505; GFX11-DL-NEXT: s_clause 0x1 2506; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2507; GFX11-DL-NEXT: s_load_b32 s6, s[4:5], 0x3c 2508; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 2509; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2510; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2511; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2512; GFX11-DL-NEXT: s_clause 0x1 2513; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 2514; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 2515; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 2516; GFX11-DL-NEXT: s_sext_i32_i16 s3, s6 2517; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2518; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 2519; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2520; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 2521; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2522; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201 2523; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201 2524; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2525; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s3, s2 2526; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2527; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0] 2528; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1] 2529; GFX11-DL-NEXT: s_endpgm 2530 ptr addrspace(1) %src2, 2531 ptr addrspace(1) %src3, 2532 i16 %badsource, 2533 ptr addrspace(1) nocapture %dst) { 2534entry: 2535 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2536 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2537 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2538 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2539 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2540 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 2541 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 2542 2543 %v1e0 = extractelement <4 x i8> %vec1, i64 0 2544 %cv1e0 = sext i8 %v1e0 to i32 2545 %v2e0 = 
extractelement <4 x i8> %vec2, i64 0 2546 %other = sext i16 %badsource to i32 2547 %mul1 = mul nuw nsw i32 %cv1e0, %other 2548 2549 %v1e1 = extractelement <4 x i8> %vec1, i64 1 2550 %cv1e1 = sext i8 %v1e1 to i32 2551 %v2e1 = extractelement <4 x i8> %vec2, i64 1 2552 %cv2e1 = sext i8 %v2e1 to i32 2553 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 2554 2555 %v2e2 = extractelement <4 x i8> %vec2, i64 2 2556 %cv2e2 = sext i8 %v2e2 to i32 2557 %v1e2 = extractelement <4 x i8> %vec1, i64 2 2558 %cv1e2 = sext i8 %v1e2 to i32 2559 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 2560 2561 2562 %acc = load i32, ptr addrspace(1) %dst, align 4 2563 %mad1 = add i32 %mul1, %acc 2564 %mad2 = add i32 %mad1, %mul2 2565 %mad3 = add i32 %mad2, %mul3 2566 2567 store i32 %mad3, ptr addrspace(1) %dst, align 4 2568 ret void 2569} 2570 2571 2572define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1, 2573; GFX7-LABEL: idot4_commutative: 2574; GFX7: ; %bb.0: ; %entry 2575; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 2576; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf 2577; GFX7-NEXT: s_mov_b32 s3, 0xf000 2578; GFX7-NEXT: s_mov_b32 s6, 0 2579; GFX7-NEXT: s_mov_b32 s7, s3 2580; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2581; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 2582; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2583; GFX7-NEXT: v_mov_b32_e32 v1, 0 2584; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 2585; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 2586; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 2587; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 2588; GFX7-NEXT: s_mov_b32 s2, -1 2589; GFX7-NEXT: s_waitcnt vmcnt(1) 2590; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 2591; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 2592; GFX7-NEXT: s_waitcnt vmcnt(0) 2593; GFX7-NEXT: v_bfe_i32 v4, v0, 0, 8 2594; GFX7-NEXT: v_bfe_i32 v5, v0, 8, 8 2595; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2596; GFX7-NEXT: v_mad_i32_i24 v1, v1, v4, s4 2597; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 2598; GFX7-NEXT: v_bfe_i32 v2, v2, 16, 8 2599; GFX7-NEXT: 
v_mad_i32_i24 v1, v3, v5, v1 2600; GFX7-NEXT: v_mad_i32_i24 v0, v2, v0, v1 2601; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 2602; GFX7-NEXT: s_endpgm 2603; 2604; GFX8-LABEL: idot4_commutative: 2605; GFX8: ; %bb.0: ; %entry 2606; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2607; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c 2608; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2609; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2610; GFX8-NEXT: v_mov_b32_e32 v1, s1 2611; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2612; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2613; GFX8-NEXT: flat_load_dword v3, v[0:1] 2614; GFX8-NEXT: v_mov_b32_e32 v1, s3 2615; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2616; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2617; GFX8-NEXT: flat_load_dword v0, v[0:1] 2618; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 2619; GFX8-NEXT: s_waitcnt vmcnt(1) 2620; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 2621; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8 2622; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8 2623; GFX8-NEXT: s_waitcnt vmcnt(0) 2624; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8 2625; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8 2626; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2627; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0 2628; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 2629; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1 2630; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 2631; GFX8-NEXT: v_mov_b32_e32 v0, s4 2632; GFX8-NEXT: v_mov_b32_e32 v1, s5 2633; GFX8-NEXT: flat_store_dword v[0:1], v2 2634; GFX8-NEXT: s_endpgm 2635; 2636; GFX9-NODL-LABEL: idot4_commutative: 2637; GFX9-NODL: ; %bb.0: ; %entry 2638; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2639; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2640; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2641; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2642; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 2643; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 2644; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 2645; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2646; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 
2647; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8 2648; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2649; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 8 2650; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v5, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 2651; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 2652; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2653; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v4, s0 2654; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1 2655; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 2656; GFX9-NODL-NEXT: s_endpgm 2657; 2658; GFX9-DL-LABEL: idot4_commutative: 2659; GFX9-DL: ; %bb.0: ; %entry 2660; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2661; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2662; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2663; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2664; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3] 2665; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1] 2666; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2667; GFX9-DL-NEXT: s_mov_b32 s1, 0xc020100 2668; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2669; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2670; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1 2671; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2672; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1 2673; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2674; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s0 2675; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 2676; GFX9-DL-NEXT: s_endpgm 2677; 2678; GFX10-DL-LABEL: idot4_commutative: 2679; GFX10-DL: ; %bb.0: ; %entry 2680; GFX10-DL-NEXT: s_clause 0x1 2681; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 2682; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c 2683; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2684; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2685; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2686; GFX10-DL-NEXT: s_clause 0x1 2687; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3] 2688; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] 2689; 
GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 2690; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 2691; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2692; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100 2693; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2694; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100 2695; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2696; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0 2697; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 2698; GFX10-DL-NEXT: global_store_dword v3, v2, s[6:7] 2699; GFX10-DL-NEXT: s_endpgm 2700; 2701; GFX11-DL-LABEL: idot4_commutative: 2702; GFX11-DL: ; %bb.0: ; %entry 2703; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 2704; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2705; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x3c 2706; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 2707; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 2708; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2709; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2710; GFX11-DL-NEXT: s_clause 0x1 2711; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3] 2712; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1] 2713; GFX11-DL-NEXT: s_load_b32 s0, s[4:5], 0x0 2714; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2715; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100 2716; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2717; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100 2718; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2719; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2720; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 2721; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5] 2722; GFX11-DL-NEXT: s_endpgm 2723 ptr addrspace(1) %src2, 2724 ptr addrspace(1) %src3, 2725 ptr addrspace(1) nocapture %dst) { 2726entry: 2727 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2728 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2729 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2730 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2731 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2732 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 
%idx 2733 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 2734 2735 %v1e0 = extractelement <4 x i8> %vec1, i64 0 2736 %cv1e0 = sext i8 %v1e0 to i32 2737 %v2e0 = extractelement <4 x i8> %vec2, i64 0 2738 %cv2e0 = sext i8 %v2e0 to i32 2739 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0 2740 2741 %v1e1 = extractelement <4 x i8> %vec1, i64 1 2742 %cv1e1 = sext i8 %v1e1 to i32 2743 %v2e1 = extractelement <4 x i8> %vec2, i64 1 2744 %cv2e1 = sext i8 %v2e1 to i32 2745 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 2746 2747 %v2e2 = extractelement <4 x i8> %vec2, i64 2 2748 %cv2e2 = sext i8 %v2e2 to i32 2749 %v1e2 = extractelement <4 x i8> %vec1, i64 2 2750 %cv1e2 = sext i8 %v1e2 to i32 2751 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2 2752 2753 2754 %acc = load i32, ptr addrspace(1) %dst, align 4 2755 %mad1 = add i32 %mul1, %acc 2756 %mad2 = add i32 %mad1, %mul2 2757 %mad3 = add i32 %mad2, %mul3 2758 2759 store i32 %mad3, ptr addrspace(1) %dst, align 4 2760 ret void 2761} 2762 2763define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1, 2764; GFX7-LABEL: idot4_acc32_3src_3ele_src0: 2765; GFX7: ; %bb.0: ; %entry 2766; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 2767; GFX7-NEXT: s_mov_b32 s11, 0xf000 2768; GFX7-NEXT: s_mov_b32 s14, 0 2769; GFX7-NEXT: s_mov_b32 s15, s11 2770; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2771; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2772; GFX7-NEXT: s_mov_b64 s[12:13], s[0:1] 2773; GFX7-NEXT: v_mov_b32_e32 v1, 0 2774; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[12:15], 0 addr64 2775; GFX7-NEXT: s_mov_b64 s[12:13], s[2:3] 2776; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[12:15], 0 addr64 2777; GFX7-NEXT: s_mov_b64 s[12:13], s[4:5] 2778; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 2779; GFX7-NEXT: s_load_dword s0, s[6:7], 0x0 2780; GFX7-NEXT: s_mov_b32 s10, -1 2781; GFX7-NEXT: s_mov_b32 s8, s6 2782; GFX7-NEXT: s_mov_b32 s9, s7 2783; GFX7-NEXT: s_waitcnt vmcnt(2) 2784; GFX7-NEXT: v_bfe_i32 v1, v2, 8, 8 2785; GFX7-NEXT: s_waitcnt vmcnt(1) 2786; 
GFX7-NEXT: v_bfe_i32 v2, v3, 8, 8 2787; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2788; GFX7-NEXT: v_mad_i32_i24 v4, v2, v2, s0 2789; GFX7-NEXT: v_bfe_i32 v3, v3, 16, 8 2790; GFX7-NEXT: s_waitcnt vmcnt(0) 2791; GFX7-NEXT: v_bfe_i32 v0, v0, 16, 8 2792; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, v4 2793; GFX7-NEXT: v_mad_i32_i24 v0, v3, v0, v1 2794; GFX7-NEXT: buffer_store_dword v0, off, s[8:11], 0 2795; GFX7-NEXT: s_endpgm 2796; 2797; GFX8-LABEL: idot4_acc32_3src_3ele_src0: 2798; GFX8: ; %bb.0: ; %entry 2799; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 2800; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 2801; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2802; GFX8-NEXT: v_mov_b32_e32 v1, s1 2803; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 2804; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2805; GFX8-NEXT: flat_load_dword v3, v[0:1] 2806; GFX8-NEXT: v_mov_b32_e32 v1, s3 2807; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 2808; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2809; GFX8-NEXT: flat_load_dword v4, v[0:1] 2810; GFX8-NEXT: v_mov_b32_e32 v1, s5 2811; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 2812; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2813; GFX8-NEXT: flat_load_dword v0, v[0:1] 2814; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0 2815; GFX8-NEXT: s_waitcnt vmcnt(2) 2816; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 2817; GFX8-NEXT: s_waitcnt vmcnt(1) 2818; GFX8-NEXT: v_bfe_i32 v1, v4, 8, 8 2819; GFX8-NEXT: v_bfe_i32 v3, v4, 16, 8 2820; GFX8-NEXT: s_waitcnt lgkmcnt(0) 2821; GFX8-NEXT: v_mad_i32_i24 v4, v1, v1, s0 2822; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v4 2823; GFX8-NEXT: s_waitcnt vmcnt(0) 2824; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8 2825; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1 2826; GFX8-NEXT: v_mov_b32_e32 v0, s6 2827; GFX8-NEXT: v_mov_b32_e32 v1, s7 2828; GFX8-NEXT: flat_store_dword v[0:1], v2 2829; GFX8-NEXT: s_endpgm 2830; 2831; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0: 2832; GFX9-NODL: ; %bb.0: ; %entry 2833; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2834; GFX9-NODL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 2835; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2836; GFX9-NODL-NEXT: global_load_dword v1, v0, s[10:11] 2837; GFX9-NODL-NEXT: global_load_dword v2, v0, s[12:13] 2838; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9] 2839; GFX9-NODL-NEXT: s_load_dword s0, s[14:15], 0x0 2840; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 2841; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 2842; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 8, 8 2843; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 2844; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 2845; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 2846; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v2, sext(v3), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 2847; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 2848; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0 2849; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1 2850; GFX9-NODL-NEXT: global_store_dword v0, v1, s[14:15] 2851; GFX9-NODL-NEXT: s_endpgm 2852; 2853; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0: 2854; GFX9-DL: ; %bb.0: ; %entry 2855; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2856; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2857; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c 2858; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01 2859; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101 2860; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2861; GFX9-DL-NEXT: global_load_dword v1, v0, s[12:13] 2862; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9] 2863; GFX9-DL-NEXT: global_load_dword v3, v0, s[10:11] 2864; GFX9-DL-NEXT: s_load_dword s3, s[14:15], 0x0 2865; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 2866; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 2867; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0 2868; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 2869; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s1 2870; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2 2871; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2 2872; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 2873; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3 2874; GFX9-DL-NEXT: global_store_dword 
v0, v1, s[14:15] 2875; GFX9-DL-NEXT: s_endpgm 2876; 2877; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0: 2878; GFX10-DL: ; %bb.0: ; %entry 2879; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 2880; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2881; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2882; GFX10-DL-NEXT: s_clause 0x2 2883; GFX10-DL-NEXT: global_load_dword v1, v0, s[12:13] 2884; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9] 2885; GFX10-DL-NEXT: global_load_dword v3, v0, s[10:11] 2886; GFX10-DL-NEXT: s_load_dword s0, s[14:15], 0x0 2887; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 2888; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c 2889; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 2890; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc0c0c01 2891; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0xc020101 2892; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 2893; GFX10-DL-NEXT: v_or_b32_e32 v0, v0, v1 2894; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 2895; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0 2896; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0 2897; GFX10-DL-NEXT: global_store_dword v3, v1, s[14:15] 2898; GFX10-DL-NEXT: s_endpgm 2899; 2900; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0: 2901; GFX11-DL: ; %bb.0: ; %entry 2902; GFX11-DL-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 2903; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 2904; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 2905; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2906; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2907; GFX11-DL-NEXT: s_clause 0x2 2908; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5] 2909; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1] 2910; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 2911; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0 2912; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 2913; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c 2914; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 2915; GFX11-DL-NEXT: v_perm_b32 v2, v0, v0, 0xc0c0c01 2916; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020101 2917; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) 2918; 
GFX11-DL-NEXT: v_or_b32_e32 v1, v1, v2 2919; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0 2920; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 2921; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0] 2922; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7] 2923; GFX11-DL-NEXT: s_endpgm 2924 ptr addrspace(1) %src2, 2925 ptr addrspace(1) %src3, 2926 ptr addrspace(1) nocapture %dst) { 2927entry: 2928 %idx = call i32 @llvm.amdgcn.workitem.id.x() 2929 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 2930 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 2931 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 2932 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 2933 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 2934 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 2935 2936 %v2e0 = extractelement <4 x i8> %vec2, i64 1 2937 %cv2e0 = sext i8 %v2e0 to i32 2938 %mul1 = mul nuw nsw i32 %cv2e0, %cv2e0 2939 2940 %v1e1 = extractelement <4 x i8> %vec1, i64 1 2941 %cv1e1 = sext i8 %v1e1 to i32 2942 %v2e1 = extractelement <4 x i8> %vec2, i64 1 2943 %cv2e1 = sext i8 %v2e1 to i32 2944 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1 2945 2946 %v3e2 = extractelement <4 x i8> %vec3, i64 2 2947 %cv3e2 = sext i8 %v3e2 to i32 2948 %v2e2 = extractelement <4 x i8> %vec2, i64 2 2949 %cv2e2 = sext i8 %v2e2 to i32 2950 %mul3 = mul nuw nsw i32 %cv2e2, %cv3e2 2951 2952 2953 %acc = load i32, ptr addrspace(1) %dst, align 4 2954 %mad1 = add i32 %mul1, %acc 2955 %mad2 = add i32 %mad1, %mul2 2956 %mad3 = add i32 %mad2, %mul3 2957 2958 store i32 %mad3, ptr addrspace(1) %dst, align 4 2959 ret void 2960} 2961 2962define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1, 2963; GFX7-LABEL: idot4_4src: 2964; GFX7: ; %bb.0: ; %entry 2965; GFX7-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x9 2966; GFX7-NEXT: s_mov_b32 s3, 0xf000 2967; GFX7-NEXT: s_mov_b32 s18, 0 2968; GFX7-NEXT: s_mov_b32 s19, s3 2969; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2970; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2971; GFX7-NEXT: 
s_mov_b64 s[16:17], s[8:9] 2972; GFX7-NEXT: v_mov_b32_e32 v1, 0 2973; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[16:19], 0 addr64 2974; GFX7-NEXT: s_mov_b64 s[16:17], s[10:11] 2975; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[16:19], 0 addr64 2976; GFX7-NEXT: s_mov_b64 s[16:17], s[12:13] 2977; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[16:19], 0 addr64 2978; GFX7-NEXT: s_mov_b64 s[16:17], s[14:15] 2979; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64 2980; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x11 2981; GFX7-NEXT: s_mov_b32 s2, -1 2982; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2983; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 2984; GFX7-NEXT: s_waitcnt vmcnt(3) 2985; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 2986; GFX7-NEXT: v_bfe_i32 v2, v2, 8, 8 2987; GFX7-NEXT: s_waitcnt vmcnt(2) 2988; GFX7-NEXT: v_bfe_i32 v5, v3, 0, 8 2989; GFX7-NEXT: v_bfe_i32 v3, v3, 8, 8 2990; GFX7-NEXT: s_waitcnt lgkmcnt(0) 2991; GFX7-NEXT: v_mad_i32_i24 v1, v1, v2, s4 2992; GFX7-NEXT: s_waitcnt vmcnt(1) 2993; GFX7-NEXT: v_bfe_i32 v2, v4, 0, 8 2994; GFX7-NEXT: v_bfe_i32 v4, v4, 8, 8 2995; GFX7-NEXT: v_mad_i32_i24 v1, v5, v3, v1 2996; GFX7-NEXT: s_waitcnt vmcnt(0) 2997; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 8 2998; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8 2999; GFX7-NEXT: v_mad_i32_i24 v1, v2, v4, v1 3000; GFX7-NEXT: v_mad_i32_i24 v0, v3, v0, v1 3001; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 3002; GFX7-NEXT: s_endpgm 3003; 3004; GFX8-LABEL: idot4_4src: 3005; GFX8: ; %bb.0: ; %entry 3006; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3007; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3008; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3009; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3010; GFX8-NEXT: v_mov_b32_e32 v1, s9 3011; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2 3012; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3013; GFX8-NEXT: flat_load_dword v3, v[0:1] 3014; GFX8-NEXT: v_mov_b32_e32 v1, s11 3015; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2 3016; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3017; GFX8-NEXT: 
flat_load_dword v4, v[0:1] 3018; GFX8-NEXT: v_mov_b32_e32 v1, s13 3019; GFX8-NEXT: v_add_u32_e32 v0, vcc, s12, v2 3020; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3021; GFX8-NEXT: flat_load_dword v5, v[0:1] 3022; GFX8-NEXT: v_mov_b32_e32 v1, s15 3023; GFX8-NEXT: v_add_u32_e32 v0, vcc, s14, v2 3024; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3025; GFX8-NEXT: flat_load_dword v0, v[0:1] 3026; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 3027; GFX8-NEXT: s_waitcnt vmcnt(3) 3028; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8 3029; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8 3030; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3031; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2 3032; GFX8-NEXT: s_waitcnt vmcnt(2) 3033; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8 3034; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8 3035; GFX8-NEXT: v_mad_i32_i24 v1, v3, v4, v1 3036; GFX8-NEXT: s_waitcnt vmcnt(1) 3037; GFX8-NEXT: v_bfe_i32 v6, v5, 0, 8 3038; GFX8-NEXT: v_bfe_i32 v5, v5, 8, 8 3039; GFX8-NEXT: v_mad_i32_i24 v1, v6, v5, v1 3040; GFX8-NEXT: s_waitcnt vmcnt(0) 3041; GFX8-NEXT: v_bfe_i32 v7, v0, 0, 8 3042; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8 3043; GFX8-NEXT: v_mad_i32_i24 v2, v7, v0, v1 3044; GFX8-NEXT: v_mov_b32_e32 v0, s0 3045; GFX8-NEXT: v_mov_b32_e32 v1, s1 3046; GFX8-NEXT: flat_store_dword v[0:1], v2 3047; GFX8-NEXT: s_endpgm 3048; 3049; GFX9-NODL-LABEL: idot4_4src: 3050; GFX9-NODL: ; %bb.0: ; %entry 3051; GFX9-NODL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3052; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3053; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3054; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3055; GFX9-NODL-NEXT: global_load_dword v1, v0, s[8:9] 3056; GFX9-NODL-NEXT: global_load_dword v2, v0, s[10:11] 3057; GFX9-NODL-NEXT: global_load_dword v3, v0, s[12:13] 3058; GFX9-NODL-NEXT: global_load_dword v4, v0, s[14:15] 3059; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3060; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0 3061; GFX9-NODL-NEXT: s_waitcnt vmcnt(3) 3062; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 3063; GFX9-NODL-NEXT: s_waitcnt vmcnt(2) 3064; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v2, sext(v2), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 3065; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3066; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v3), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 3067; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3068; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1 3069; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3070; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2 3071; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4 3072; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1] 3073; GFX9-NODL-NEXT: s_endpgm 3074; 3075; GFX9-DL-LABEL: idot4_4src: 3076; GFX9-DL: ; %bb.0: ; %entry 3077; GFX9-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3078; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3079; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3080; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501 3081; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c 3082; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3083; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] 3084; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] 3085; GFX9-DL-NEXT: global_load_dword v3, v0, s[12:13] 3086; GFX9-DL-NEXT: global_load_dword v4, v0, s[14:15] 3087; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400 3088; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 3089; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c 3090; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3091; GFX9-DL-NEXT: s_waitcnt vmcnt(2) 3092; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2 3093; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4 3094; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3095; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3 3096; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5 3097; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5 3098; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1 3099; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3100; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v3, s6 
3101; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1] 3102; GFX9-DL-NEXT: s_endpgm 3103; 3104; GFX10-DL-LABEL: idot4_4src: 3105; GFX10-DL: ; %bb.0: ; %entry 3106; GFX10-DL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 3107; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3108; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x44 3109; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3110; GFX10-DL-NEXT: s_clause 0x3 3111; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] 3112; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11] 3113; GFX10-DL-NEXT: global_load_dword v3, v0, s[12:13] 3114; GFX10-DL-NEXT: global_load_dword v4, v0, s[14:15] 3115; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 3116; GFX10-DL-NEXT: s_waitcnt vmcnt(2) 3117; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501 3118; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400 3119; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3120; GFX10-DL-NEXT: v_perm_b32 v5, v4, v3, 0x5010c0c 3121; GFX10-DL-NEXT: v_perm_b32 v2, v4, v3, 0x4000c0c 3122; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 3123; GFX10-DL-NEXT: v_or_b32_e32 v0, v5, v0 3124; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1 3125; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3126; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2 3127; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0 3128; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1] 3129; GFX10-DL-NEXT: s_endpgm 3130; 3131; GFX11-DL-LABEL: idot4_4src: 3132; GFX11-DL: ; %bb.0: ; %entry 3133; GFX11-DL-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 3134; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3135; GFX11-DL-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 3136; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3137; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3138; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3139; GFX11-DL-NEXT: s_clause 0x3 3140; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9] 3141; GFX11-DL-NEXT: global_load_b32 v2, v0, s[10:11] 3142; GFX11-DL-NEXT: global_load_b32 v3, v0, s[12:13] 3143; GFX11-DL-NEXT: global_load_b32 v0, v0, s[14:15] 3144; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0 3145; 
GFX11-DL-NEXT: s_waitcnt vmcnt(2) 3146; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501 3147; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400 3148; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3149; GFX11-DL-NEXT: v_perm_b32 v5, v0, v3, 0x5010c0c 3150; GFX11-DL-NEXT: v_perm_b32 v0, v0, v3, 0x4000c0c 3151; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 3152; GFX11-DL-NEXT: v_or_b32_e32 v2, v5, v4 3153; GFX11-DL-NEXT: v_or_b32_e32 v0, v0, v1 3154; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 3155; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3156; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 3157; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0] 3158; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1] 3159; GFX11-DL-NEXT: s_endpgm 3160 ptr addrspace(1) %src2, 3161 ptr addrspace(1) %src3, 3162 ptr addrspace(1) %src4, 3163 ptr addrspace(1) nocapture %dst) { 3164entry: 3165 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3166 3167 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3168 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3169 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3170 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3171 %gep3 = getelementptr <4 x i8>, ptr addrspace(1) %src3, i32 %idx 3172 %vec3 = load <4 x i8>, ptr addrspace(1) %gep3 3173 %gep4 = getelementptr <4 x i8>, ptr addrspace(1) %src4, i32 %idx 3174 %vec4 = load <4 x i8>, ptr addrspace(1) %gep4 3175 3176 3177 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3178 %cv1e0 = sext i8 %v1e0 to i32 3179 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3180 %cv1e1 = sext i8 %v1e1 to i32 3181 %mul1 = mul nuw nsw i32 %cv1e0, %cv1e1 3182 3183 %v2e0 = extractelement <4 x i8> %vec2, i64 0 3184 %cv2e0 = sext i8 %v2e0 to i32 3185 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3186 %cv2e1 = sext i8 %v2e1 to i32 3187 %mul2 = mul nuw nsw i32 %cv2e0, %cv2e1 3188 3189 %v3e0 = extractelement <4 x i8> %vec3, i64 0 3190 %cv3e0 = sext i8 %v3e0 to i32 3191 %v3e1 = extractelement <4 x 
i8> %vec3, i64 1 3192 %cv3e1 = sext i8 %v3e1 to i32 3193 %mul3 = mul nuw nsw i32 %cv3e0, %cv3e1 3194 3195 %v4e0 = extractelement <4 x i8> %vec4, i64 0 3196 %cv4e0 = sext i8 %v4e0 to i32 3197 %v4e1 = extractelement <4 x i8> %vec4, i64 1 3198 %cv4e1 = sext i8 %v4e1 to i32 3199 %mul4 = mul nuw nsw i32 %cv4e0, %cv4e1 3200 3201 3202 %acc = load i32, ptr addrspace(1) %dst, align 4 3203 %mad1 = add i32 %mul1, %acc 3204 %mad2 = add i32 %mad1, %mul2 3205 %mad3 = add i32 %mad2, %mul3 3206 %mad4 = add i32 %mad3, %mul4 3207 3208 store i32 %mad4, ptr addrspace(1) %dst, align 4 3209 ret void 3210} 3211 
; Test kernel idot4_nonstandard_signed - mixed-signedness 4 x i8 dot product
; that is accumulated in i16 rather than i32.
;
; The kernel loads one <4 x i8> vector from %src1 and one from %src2, both
; indexed by llvm.amdgcn.workitem.id.x. Every element of the %src1 vector is
; sign-extended to i16 and every element of the %src2 vector is zero-extended
; to i16. The four products are summed in i16 starting from the constant 0,
; and the i16 sum is sign-extended to i32 and stored to %dst. Elements 1 and 2
; pass the zero-extended value as the first mul operand, elements 0 and 3 pass
; the sign-extended value first.
;
; In the assertions recorded below, none of the target prefixes contains a
; v_dot4 instruction for this kernel. The GFX7 sequence uses v_mul_u32_u24 and
; v_mad_u32_u24; the other sequences use v_mul_lo_u16, v_mad_u16 or
; v_mad_legacy_u16 variants, followed by a 16-bit v_bfe_i32 before the store.
;
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
3212define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, 3213; GFX7-LABEL: idot4_nonstandard_signed: 3214; GFX7: ; %bb.0: ; %entry 3215; GFX7-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 3216; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd 3217; GFX7-NEXT: s_mov_b32 s3, 0xf000 3218; GFX7-NEXT: s_mov_b32 s6, 0 3219; GFX7-NEXT: s_mov_b32 s7, s3 3220; GFX7-NEXT: s_waitcnt lgkmcnt(0) 3221; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9] 3222; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3223; GFX7-NEXT: v_mov_b32_e32 v1, 0 3224; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 3225; GFX7-NEXT: s_mov_b64 s[4:5], s[10:11] 3226; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 3227; GFX7-NEXT: s_mov_b32 s2, -1 3228; GFX7-NEXT: s_waitcnt vmcnt(1) 3229; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 3230; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 3231; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 3232; GFX7-NEXT: s_waitcnt vmcnt(0) 3233; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 3234; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 3235; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 3236; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 3237; GFX7-NEXT: v_mul_u32_u24_e32 v1, v1, v5 3238; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 3239; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 3240; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 3241; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 3242; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 3243; GFX7-NEXT: v_mad_u32_u24 
v1, v7, v4, v1 3244; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 3245; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1 3246; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16 3247; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 3248; GFX7-NEXT: s_endpgm 3249; 3250; GFX8-LABEL: idot4_nonstandard_signed: 3251; GFX8: ; %bb.0: ; %entry 3252; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3253; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 3254; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 3255; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 3256; GFX8-NEXT: s_waitcnt lgkmcnt(0) 3257; GFX8-NEXT: v_mov_b32_e32 v1, s1 3258; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 3259; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3260; GFX8-NEXT: flat_load_dword v3, v[0:1] 3261; GFX8-NEXT: v_mov_b32_e32 v1, s3 3262; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 3263; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3264; GFX8-NEXT: flat_load_dword v2, v[0:1] 3265; GFX8-NEXT: v_mov_b32_e32 v0, s4 3266; GFX8-NEXT: v_mov_b32_e32 v1, s5 3267; GFX8-NEXT: s_waitcnt vmcnt(1) 3268; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 3269; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 3270; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8 3271; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 3272; GFX8-NEXT: s_waitcnt vmcnt(0) 3273; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 3274; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 3275; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 3276; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3277; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 3278; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6 3279; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 3280; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6 3281; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3282; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 3283; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 16 3284; GFX8-NEXT: flat_store_dword v[0:1], v2 3285; GFX8-NEXT: s_endpgm 3286; 3287; GFX9-NODL-LABEL: idot4_nonstandard_signed: 3288; GFX9-NODL: ; %bb.0: ; %entry 
3289; GFX9-NODL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3290; GFX9-NODL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3291; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3292; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) 3293; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1] 3294; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] 3295; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff 3296; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 3297; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) 3298; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 3299; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) 3300; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 3301; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 3302; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 3303; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 3304; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 3305; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3306; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 3307; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 3308; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 3309; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 3310; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4 3311; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3312; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 3313; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 3314; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] 3315; GFX9-NODL-NEXT: s_endpgm 3316; 3317; GFX9-DL-LABEL: idot4_nonstandard_signed: 3318; GFX9-DL: ; %bb.0: ; %entry 3319; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3320; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3321; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3322; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) 3323; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] 3324; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] 3325; GFX9-DL-NEXT: s_movk_i32 s0, 0xff 3326; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 3327; GFX9-DL-NEXT: s_waitcnt vmcnt(1) 3328; GFX9-DL-NEXT: 
v_lshrrev_b32_e32 v5, 8, v1 3329; GFX9-DL-NEXT: s_waitcnt vmcnt(0) 3330; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 3331; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 3332; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 3333; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 3334; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 3335; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3336; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 3337; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 3338; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 3339; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 3340; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4 3341; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3342; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 3343; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 3344; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] 3345; GFX9-DL-NEXT: s_endpgm 3346; 3347; GFX10-DL-LABEL: idot4_nonstandard_signed: 3348; GFX10-DL: ; %bb.0: ; %entry 3349; GFX10-DL-NEXT: s_clause 0x1 3350; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 3351; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 3352; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3353; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff 3354; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) 3355; GFX10-DL-NEXT: s_clause 0x1 3356; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] 3357; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] 3358; GFX10-DL-NEXT: s_waitcnt vmcnt(1) 3359; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 3360; GFX10-DL-NEXT: s_waitcnt vmcnt(0) 3361; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 3362; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 3363; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 3364; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 3365; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 3366; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3 3367; GFX10-DL-NEXT: v_bfe_i32 v3, v4, 0, 8 3368; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v5 3369; GFX10-DL-NEXT: 
v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 3370; GFX10-DL-NEXT: v_bfe_i32 v6, v7, 0, 8 3371; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 3372; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 3373; GFX10-DL-NEXT: v_mad_u16 v0, v4, v3, v0 3374; GFX10-DL-NEXT: v_mad_u16 v0, v5, v6, v0 3375; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0 3376; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 3377; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 3378; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] 3379; GFX10-DL-NEXT: s_endpgm 3380; 3381; GFX11-DL-LABEL: idot4_nonstandard_signed: 3382; GFX11-DL: ; %bb.0: ; %entry 3383; GFX11-DL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 3384; GFX11-DL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 3385; GFX11-DL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 3386; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) 3387; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 3388; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0) 3389; GFX11-DL-NEXT: s_clause 0x1 3390; GFX11-DL-NEXT: global_load_b32 v1, v0, s[0:1] 3391; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3] 3392; GFX11-DL-NEXT: s_waitcnt vmcnt(1) 3393; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8 3394; GFX11-DL-NEXT: s_waitcnt vmcnt(0) 3395; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0 3396; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 3397; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0 3398; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 3399; GFX11-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v0 3400; GFX11-DL-NEXT: v_mul_lo_u16 v2, v2, v3 3401; GFX11-DL-NEXT: v_bfe_i32 v3, v4, 0, 8 3402; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v5 3403; GFX11-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 3404; GFX11-DL-NEXT: v_bfe_i32 v5, v6, 0, 8 3405; GFX11-DL-NEXT: v_and_b32_e32 v6, 0xff, v7 3406; GFX11-DL-NEXT: v_lshrrev_b32_e32 v0, 24, v0 3407; GFX11-DL-NEXT: v_mad_u16 v2, v4, v3, v2 3408; GFX11-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 3409; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 3410; GFX11-DL-NEXT: v_mad_u16 v2, v6, v5, v2 3411; 
GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v2 3412; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0 3413; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) 3414; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 3415; GFX11-DL-NEXT: global_store_b32 v1, v0, s[4:5] 3416; GFX11-DL-NEXT: s_endpgm 3417 ptr addrspace(1) %src2, 3418 ptr addrspace(1) nocapture %dst) { 3419entry: 3420 %idx = call i32 @llvm.amdgcn.workitem.id.x() 3421 %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %src1, i32 %idx 3422 %vec1 = load <4 x i8>, ptr addrspace(1) %gep1 3423 %gep2 = getelementptr <4 x i8>, ptr addrspace(1) %src2, i32 %idx 3424 %vec2 = load <4 x i8>, ptr addrspace(1) %gep2 3425 %v1e0 = extractelement <4 x i8> %vec1, i64 0 3426 %v1e0e = sext i8 %v1e0 to i16 3427 %v2e0 = extractelement <4 x i8> %vec2, i64 0 3428 %v2e0e = zext i8 %v2e0 to i16 3429 %mul0 = mul nsw i16 %v1e0e, %v2e0e 3430 %add0 = add i16 %mul0, 0 3431 3432 %v1e1 = extractelement <4 x i8> %vec1, i64 1 3433 %v1e1e = sext i8 %v1e1 to i16 3434 %v2e1 = extractelement <4 x i8> %vec2, i64 1 3435 %v2e1e = zext i8 %v2e1 to i16 3436 %mul1 = mul nsw i16 %v2e1e, %v1e1e 3437 %add1 = add i16 %mul1, %add0 3438 %v1e2 = extractelement <4 x i8> %vec1, i64 2 3439 %v1e2e = sext i8 %v1e2 to i16 3440 %v2e2 = extractelement <4 x i8> %vec2, i64 2 3441 %v2e2e = zext i8 %v2e2 to i16 3442 %mul2 = mul nsw i16 %v2e2e, %v1e2e 3443 %add2 = add i16 %mul2, %add1 3444 %v1e3 = extractelement <4 x i8> %vec1, i64 3 3445 %v1e3e = sext i8 %v1e3 to i16 3446 %v2e3 = extractelement <4 x i8> %vec2, i64 3 3447 %v2e3e = zext i8 %v2e3 to i16 3448 %mul3 = mul nsw i16 %v1e3e, %v2e3e 3449 %add3 = add i16 %mul3, %add2 3450 %res = sext i16 %add3 to i32 3451 store i32 %res, ptr addrspace(1) %dst, align 4 3452 ret void 3453} 3454 3455 3456declare i32 @llvm.amdgcn.workitem.id.x() 3457